[aarch64] Use force_reg instead of copy_to_mode_reg.
[official-gcc.git] / gcc / config / aarch64 / aarch64.cc
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2023 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #define INCLUDE_STRING
24 #define INCLUDE_ALGORITHM
25 #include "config.h"
26 #include "system.h"
27 #include "coretypes.h"
28 #include "backend.h"
29 #include "target.h"
30 #include "rtl.h"
31 #include "tree.h"
32 #include "memmodel.h"
33 #include "gimple.h"
34 #include "cfghooks.h"
35 #include "cfgloop.h"
36 #include "df.h"
37 #include "tm_p.h"
38 #include "stringpool.h"
39 #include "attribs.h"
40 #include "optabs.h"
41 #include "regs.h"
42 #include "emit-rtl.h"
43 #include "recog.h"
44 #include "cgraph.h"
45 #include "diagnostic.h"
46 #include "insn-attr.h"
47 #include "alias.h"
48 #include "fold-const.h"
49 #include "stor-layout.h"
50 #include "calls.h"
51 #include "varasm.h"
52 #include "output.h"
53 #include "flags.h"
54 #include "explow.h"
55 #include "expr.h"
56 #include "reload.h"
57 #include "langhooks.h"
58 #include "opts.h"
59 #include "gimplify.h"
60 #include "dwarf2.h"
61 #include "gimple-iterator.h"
62 #include "tree-vectorizer.h"
63 #include "aarch64-cost-tables.h"
64 #include "dumpfile.h"
65 #include "builtins.h"
66 #include "rtl-iter.h"
67 #include "tm-constrs.h"
68 #include "sched-int.h"
69 #include "target-globals.h"
70 #include "common/common-target.h"
71 #include "cfgrtl.h"
72 #include "selftest.h"
73 #include "selftest-rtl.h"
74 #include "rtx-vector-builder.h"
75 #include "intl.h"
76 #include "expmed.h"
77 #include "function-abi.h"
78 #include "gimple-pretty-print.h"
79 #include "tree-ssa-loop-niter.h"
80 #include "fractional-cost.h"
81 #include "rtlanal.h"
82 #include "tree-dfa.h"
83 #include "asan.h"
84 #include "aarch64-feature-deps.h"
85 #include "config/arm/aarch-common.h"
86 #include "config/arm/aarch-common-protos.h"
88 /* This file should be included last. */
89 #include "target-def.h"
91 /* Defined for convenience. */
92 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
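/* Editorial note (not in the original source): with the default LP64 ABI,
   POINTER_SIZE is 64 and BITS_PER_UNIT is 8, so POINTER_BYTES evaluates
   to 8; under -mabi=ilp32, where POINTER_SIZE is 32, it evaluates to 4.  */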
94 /* Information about a legitimate vector immediate operand. */
95 struct simd_immediate_info
97 enum insn_type { MOV, MVN, INDEX, PTRUE };
98 enum modifier_type { LSL, MSL };
100 simd_immediate_info () {}
101 simd_immediate_info (scalar_float_mode, rtx);
102 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
103 insn_type = MOV, modifier_type = LSL,
104 unsigned int = 0);
105 simd_immediate_info (scalar_mode, rtx, rtx);
106 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
108 /* The mode of the elements. */
109 scalar_mode elt_mode;
111 /* The instruction to use to move the immediate into a vector. */
112 insn_type insn;
114 union
116 /* For MOV and MVN. */
117 struct
119 /* The value of each element. */
120 rtx value;
122 /* The kind of shift modifier to use, and the number of bits to shift.
123 This is (LSL, 0) if no shift is needed. */
124 modifier_type modifier;
125 unsigned int shift;
126 } mov;
128 /* For INDEX. */
129 struct
131 /* The value of the first element and the step to be added for each
132 subsequent element. */
133 rtx base, step;
134 } index;
136 /* For PTRUE. */
137 aarch64_svpattern pattern;
138 } u;
141 /* Construct a floating-point immediate in which each element has mode
142 ELT_MODE_IN and value VALUE_IN. */
143 inline simd_immediate_info
144 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
145 : elt_mode (elt_mode_in), insn (MOV)
147 u.mov.value = value_in;
148 u.mov.modifier = LSL;
149 u.mov.shift = 0;
152 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
153 and value VALUE_IN. The other parameters are as for the structure
154 fields. */
155 inline simd_immediate_info
156 ::simd_immediate_info (scalar_int_mode elt_mode_in,
157 unsigned HOST_WIDE_INT value_in,
158 insn_type insn_in, modifier_type modifier_in,
159 unsigned int shift_in)
160 : elt_mode (elt_mode_in), insn (insn_in)
162 u.mov.value = gen_int_mode (value_in, elt_mode_in);
163 u.mov.modifier = modifier_in;
164 u.mov.shift = shift_in;
167 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
168 and where element I is equal to BASE_IN + I * STEP_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
171 : elt_mode (elt_mode_in), insn (INDEX)
173 u.index.base = base_in;
174 u.index.step = step_in;
177 /* Construct a predicate that controls elements of mode ELT_MODE_IN
178 and has PTRUE pattern PATTERN_IN. */
179 inline simd_immediate_info
180 ::simd_immediate_info (scalar_int_mode elt_mode_in,
181 aarch64_svpattern pattern_in)
182 : elt_mode (elt_mode_in), insn (PTRUE)
184 u.pattern = pattern_in;
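/* Editorial illustration (not part of the original source): routines such
   as aarch64_simd_valid_immediate fill in one of these structures to
   describe how a constant can be synthesized.  For example,

     simd_immediate_info info (SImode, 0xff, simd_immediate_info::MOV,
			       simd_immediate_info::LSL, 8);

   would describe moving 0xff, shifted left by 8, into each 32-bit
   element (a MOVI with an LSL #8 modifier).  */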
187 namespace {
189 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
190 class pure_scalable_type_info
192 public:
193 /* Represents the result of analyzing a type. All values are nonzero,
194 in the possibly forlorn hope that accidental conversions to bool
195 trigger a warning. */
196 enum analysis_result
198 /* The type does not have an ABI identity; i.e. it doesn't contain
199 at least one object whose type is a Fundamental Data Type. */
200 NO_ABI_IDENTITY = 1,
202 /* The type is definitely a Pure Scalable Type. */
203 IS_PST,
205 /* The type is definitely not a Pure Scalable Type. */
206 ISNT_PST,
208 /* It doesn't matter for PCS purposes whether the type is a Pure
209 Scalable Type or not, since the type will be handled the same
210 way regardless.
212 Specifically, this means that if the type is a Pure Scalable Type,
213 there aren't enough argument registers to hold it, and so it will
214 need to be passed or returned in memory. If the type isn't a
215 Pure Scalable Type, it's too big to be passed or returned in core
216 or SIMD&FP registers, and so again will need to go in memory. */
217 DOESNT_MATTER
220 /* Aggregates of 17 bytes or more are normally passed and returned
221 in memory, so aggregates of that size can safely be analyzed as
222 DOESNT_MATTER. We need to be able to collect enough pieces to
223 represent a PST that is smaller than that. Since predicates are
224 2 bytes in size for -msve-vector-bits=128, that means we need to be
225 able to store at least 8 pieces.
227 We also need to be able to store enough pieces to represent
228 a single vector in each vector argument register and a single
229 predicate in each predicate argument register. This means that
230 we need at least 12 pieces. */
231 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
232 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
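/* Editorial note: NUM_FP_ARG_REGS is the eight SIMD&FP argument registers
   (V0-V7, which alias Z0-Z7) and NUM_PR_ARG_REGS the four predicate
   argument registers (P0-P3), so MAX_PIECES is 12.  As a worked example of
   the 8-piece requirement above: with -msve-vector-bits=128, a PST built
   from eight svbool_t members occupies 8 * 2 = 16 bytes, which is below
   the 17-byte memory threshold and needs one piece per predicate.  */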
234 /* Describes one piece of a PST. Each piece is one of:
236 - a single Scalable Vector Type (SVT)
237 - a single Scalable Predicate Type (SPT)
238 - a PST containing 2, 3 or 4 SVTs, with no padding
240 It either represents a single built-in type or a PST formed from
241 multiple homogeneous built-in types. */
242 struct piece
244 rtx get_rtx (unsigned int, unsigned int) const;
246 /* The number of vector and predicate registers that the piece
247 occupies. One of the two is always zero. */
248 unsigned int num_zr;
249 unsigned int num_pr;
251 /* The mode of the registers described above. */
252 machine_mode mode;
254 /* If this piece is formed from multiple homogeneous built-in types,
255 this is the mode of the built-in types, otherwise it is MODE. */
256 machine_mode orig_mode;
258 /* The offset in bytes of the piece from the start of the type. */
259 poly_uint64_pod offset;
262 /* Divides types analyzed as IS_PST into individual pieces. The pieces
263 are in memory order. */
264 auto_vec<piece, MAX_PIECES> pieces;
266 unsigned int num_zr () const;
267 unsigned int num_pr () const;
269 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
271 analysis_result analyze (const_tree);
272 bool analyze_registers (const_tree);
274 private:
275 analysis_result analyze_array (const_tree);
276 analysis_result analyze_record (const_tree);
277 void add_piece (const piece &);
281 /* The current code model. */
282 enum aarch64_code_model aarch64_cmodel;
284 /* The number of 64-bit elements in an SVE vector. */
285 poly_uint16 aarch64_sve_vg;
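/* Editorial note: "vg" counts 64-bit granules, following the SVE naming
   convention.  With -msve-vector-bits=256 this is the constant 4; for
   vector-length-agnostic code it is the runtime value 2 + 2 * x,
   represented as poly_uint16 (2, 2).  */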
287 #ifdef HAVE_AS_TLS
288 #undef TARGET_HAVE_TLS
289 #define TARGET_HAVE_TLS 1
290 #endif
292 static bool aarch64_composite_type_p (const_tree, machine_mode);
293 static bool aarch64_return_in_memory_1 (const_tree);
294 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
295 const_tree,
296 machine_mode *, int *,
297 bool *, bool);
298 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
299 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
300 static void aarch64_override_options_after_change (void);
301 static bool aarch64_vector_mode_supported_p (machine_mode);
302 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
303 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
304 const_tree type,
305 int misalignment,
306 bool is_packed);
307 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
308 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
309 aarch64_addr_query_type);
311 /* The processor for which instructions should be scheduled. */
312 enum aarch64_processor aarch64_tune = cortexa53;
314 /* Mask to specify which instruction scheduling options should be used. */
315 uint64_t aarch64_tune_flags = 0;
317 /* Global flag for PC relative loads. */
318 bool aarch64_pcrelative_literal_loads;
320 /* Global flag for whether frame pointer is enabled. */
321 bool aarch64_use_frame_pointer;
323 char *accepted_branch_protection_string = NULL;
325 /* Support for command line parsing of boolean flags in the tuning
326 structures. */
327 struct aarch64_flag_desc
329 const char* name;
330 unsigned int flag;
333 #define AARCH64_FUSION_PAIR(name, internal_name) \
334 { name, AARCH64_FUSE_##internal_name },
335 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
337 { "none", AARCH64_FUSE_NOTHING },
338 #include "aarch64-fusion-pairs.def"
339 { "all", AARCH64_FUSE_ALL },
340 { NULL, AARCH64_FUSE_NOTHING }
343 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
344 { name, AARCH64_EXTRA_TUNE_##internal_name },
345 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
347 { "none", AARCH64_EXTRA_TUNE_NONE },
348 #include "aarch64-tuning-flags.def"
349 { "all", AARCH64_EXTRA_TUNE_ALL },
350 { NULL, AARCH64_EXTRA_TUNE_NONE }
353 /* Tuning parameters. */
355 static const struct cpu_addrcost_table generic_addrcost_table =
358 1, /* hi */
359 0, /* si */
360 0, /* di */
361 1, /* ti */
363 0, /* pre_modify */
364 0, /* post_modify */
365 0, /* post_modify_ld3_st3 */
366 0, /* post_modify_ld4_st4 */
367 0, /* register_offset */
368 0, /* register_sextend */
369 0, /* register_zextend */
370 0 /* imm_offset */
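/* Editorial note on reading this and the following address-cost tables:
   the hi/si/di/ti block gives the extra cost of a scaled register offset
   for 16/32/64/128-bit accesses; pre_modify and the post_modify* entries
   cover writeback addressing; register_offset and register_[sz]extend
   cover plain and extended register offsets; imm_offset covers immediate
   offsets.  A value of 0 means no extra cost over the base address cost.  */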
373 static const struct cpu_addrcost_table exynosm1_addrcost_table =
376 0, /* hi */
377 0, /* si */
378 0, /* di */
379 2, /* ti */
381 0, /* pre_modify */
382 0, /* post_modify */
383 0, /* post_modify_ld3_st3 */
384 0, /* post_modify_ld4_st4 */
385 1, /* register_offset */
386 1, /* register_sextend */
387 2, /* register_zextend */
388 0, /* imm_offset */
391 static const struct cpu_addrcost_table xgene1_addrcost_table =
394 1, /* hi */
395 0, /* si */
396 0, /* di */
397 1, /* ti */
399 1, /* pre_modify */
400 1, /* post_modify */
401 1, /* post_modify_ld3_st3 */
402 1, /* post_modify_ld4_st4 */
403 0, /* register_offset */
404 1, /* register_sextend */
405 1, /* register_zextend */
406 0, /* imm_offset */
409 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
412 1, /* hi */
413 1, /* si */
414 1, /* di */
415 2, /* ti */
417 0, /* pre_modify */
418 0, /* post_modify */
419 0, /* post_modify_ld3_st3 */
420 0, /* post_modify_ld4_st4 */
421 2, /* register_offset */
422 3, /* register_sextend */
423 3, /* register_zextend */
424 0, /* imm_offset */
427 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
430 1, /* hi */
431 1, /* si */
432 1, /* di */
433 2, /* ti */
435 0, /* pre_modify */
436 0, /* post_modify */
437 0, /* post_modify_ld3_st3 */
438 0, /* post_modify_ld4_st4 */
439 2, /* register_offset */
440 3, /* register_sextend */
441 3, /* register_zextend */
442 0, /* imm_offset */
445 static const struct cpu_addrcost_table tsv110_addrcost_table =
448 1, /* hi */
449 0, /* si */
450 0, /* di */
451 1, /* ti */
453 0, /* pre_modify */
454 0, /* post_modify */
455 0, /* post_modify_ld3_st3 */
456 0, /* post_modify_ld4_st4 */
457 0, /* register_offset */
458 1, /* register_sextend */
459 1, /* register_zextend */
460 0, /* imm_offset */
463 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
466 1, /* hi */
467 1, /* si */
468 1, /* di */
469 2, /* ti */
471 1, /* pre_modify */
472 1, /* post_modify */
473 1, /* post_modify_ld3_st3 */
474 1, /* post_modify_ld4_st4 */
475 3, /* register_offset */
476 3, /* register_sextend */
477 3, /* register_zextend */
478 2, /* imm_offset */
481 static const struct cpu_addrcost_table a64fx_addrcost_table =
484 1, /* hi */
485 1, /* si */
486 1, /* di */
487 2, /* ti */
489 0, /* pre_modify */
490 0, /* post_modify */
491 0, /* post_modify_ld3_st3 */
492 0, /* post_modify_ld4_st4 */
493 2, /* register_offset */
494 3, /* register_sextend */
495 3, /* register_zextend */
496 0, /* imm_offset */
499 static const struct cpu_addrcost_table neoversev1_addrcost_table =
502 1, /* hi */
503 0, /* si */
504 0, /* di */
505 1, /* ti */
507 0, /* pre_modify */
508 0, /* post_modify */
509 3, /* post_modify_ld3_st3 */
510 3, /* post_modify_ld4_st4 */
511 0, /* register_offset */
512 0, /* register_sextend */
513 0, /* register_zextend */
514 0 /* imm_offset */
517 static const struct cpu_addrcost_table neoversen2_addrcost_table =
520 1, /* hi */
521 0, /* si */
522 0, /* di */
523 1, /* ti */
525 0, /* pre_modify */
526 0, /* post_modify */
527 2, /* post_modify_ld3_st3 */
528 2, /* post_modify_ld4_st4 */
529 0, /* register_offset */
530 0, /* register_sextend */
531 0, /* register_zextend */
532 0 /* imm_offset */
535 static const struct cpu_addrcost_table neoversev2_addrcost_table =
538 1, /* hi */
539 0, /* si */
540 0, /* di */
541 1, /* ti */
543 0, /* pre_modify */
544 0, /* post_modify */
545 2, /* post_modify_ld3_st3 */
546 2, /* post_modify_ld4_st4 */
547 0, /* register_offset */
548 0, /* register_sextend */
549 0, /* register_zextend */
550 0 /* imm_offset */
553 static const struct cpu_regmove_cost generic_regmove_cost =
555 1, /* GP2GP */
556 /* Avoid the use of slow int<->fp moves for spilling by setting
557 their cost higher than memmov_cost. */
558 5, /* GP2FP */
559 5, /* FP2GP */
560 2 /* FP2FP */
563 static const struct cpu_regmove_cost cortexa57_regmove_cost =
565 1, /* GP2GP */
566 /* Avoid the use of slow int<->fp moves for spilling by setting
567 their cost higher than memmov_cost. */
568 5, /* GP2FP */
569 5, /* FP2GP */
570 2 /* FP2FP */
573 static const struct cpu_regmove_cost cortexa53_regmove_cost =
575 1, /* GP2GP */
576 /* Avoid the use of slow int<->fp moves for spilling by setting
577 their cost higher than memmov_cost. */
578 5, /* GP2FP */
579 5, /* FP2GP */
580 2 /* FP2FP */
583 static const struct cpu_regmove_cost exynosm1_regmove_cost =
585 1, /* GP2GP */
586 /* Avoid the use of slow int<->fp moves for spilling by setting
587 their cost higher than memmov_cost (actual, 4 and 9). */
588 9, /* GP2FP */
589 9, /* FP2GP */
590 1 /* FP2FP */
593 static const struct cpu_regmove_cost thunderx_regmove_cost =
595 2, /* GP2GP */
596 2, /* GP2FP */
597 6, /* FP2GP */
598 4 /* FP2FP */
601 static const struct cpu_regmove_cost xgene1_regmove_cost =
603 1, /* GP2GP */
604 /* Avoid the use of slow int<->fp moves for spilling by setting
605 their cost higher than memmov_cost. */
606 8, /* GP2FP */
607 8, /* FP2GP */
608 2 /* FP2FP */
611 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
613 2, /* GP2GP */
614 /* Avoid the use of int<->fp moves for spilling. */
615 6, /* GP2FP */
616 6, /* FP2GP */
617 4 /* FP2FP */
620 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
622 1, /* GP2GP */
623 /* Avoid the use of int<->fp moves for spilling. */
624 5, /* GP2FP */
625 6, /* FP2GP */
626 3, /* FP2FP */
629 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
631 1, /* GP2GP */
632 /* Avoid the use of int<->fp moves for spilling. */
633 4, /* GP2FP */
634 5, /* FP2GP */
635 4 /* FP2FP */
638 static const struct cpu_regmove_cost tsv110_regmove_cost =
640 1, /* GP2GP */
641 /* Avoid the use of slow int<->fp moves for spilling by setting
642 their cost higher than memmov_cost. */
643 2, /* GP2FP */
644 3, /* FP2GP */
645 2 /* FP2FP */
648 static const struct cpu_regmove_cost a64fx_regmove_cost =
650 1, /* GP2GP */
651 /* Avoid the use of slow int<->fp moves for spilling by setting
652 their cost higher than memmov_cost. */
653 5, /* GP2FP */
654 7, /* FP2GP */
655 2 /* FP2FP */
658 static const struct cpu_regmove_cost neoversen2_regmove_cost =
660 1, /* GP2GP */
661 /* Spilling to int<->fp instead of memory is recommended so set
662 realistic costs compared to memmov_cost. */
663 3, /* GP2FP */
664 2, /* FP2GP */
665 2 /* FP2FP */
668 static const struct cpu_regmove_cost neoversev1_regmove_cost =
670 1, /* GP2GP */
671 /* Spilling to int<->fp instead of memory is recommended so set
672 realistic costs compared to memmov_cost. */
673 3, /* GP2FP */
674 2, /* FP2GP */
675 2 /* FP2FP */
678 static const struct cpu_regmove_cost neoversev2_regmove_cost =
680 1, /* GP2GP */
681 /* Spilling to int<->fp instead of memory is recommended so set
682 realistic costs compared to memmov_cost. */
683 3, /* GP2FP */
684 2, /* FP2GP */
685 2 /* FP2FP */
688 /* Generic costs for Advanced SIMD vector operations. */
689 static const advsimd_vec_cost generic_advsimd_vector_cost =
691 1, /* int_stmt_cost */
692 1, /* fp_stmt_cost */
693 0, /* ld2_st2_permute_cost */
694 0, /* ld3_st3_permute_cost */
695 0, /* ld4_st4_permute_cost */
696 2, /* permute_cost */
697 2, /* reduc_i8_cost */
698 2, /* reduc_i16_cost */
699 2, /* reduc_i32_cost */
700 2, /* reduc_i64_cost */
701 2, /* reduc_f16_cost */
702 2, /* reduc_f32_cost */
703 2, /* reduc_f64_cost */
704 2, /* store_elt_extra_cost */
705 2, /* vec_to_scalar_cost */
706 1, /* scalar_to_vec_cost */
707 1, /* align_load_cost */
708 1, /* unalign_load_cost */
709 1, /* unalign_store_cost */
710 1 /* store_cost */
713 /* Generic costs for SVE vector operations. */
714 static const sve_vec_cost generic_sve_vector_cost =
717 1, /* int_stmt_cost */
718 1, /* fp_stmt_cost */
719 0, /* ld2_st2_permute_cost */
720 0, /* ld3_st3_permute_cost */
721 0, /* ld4_st4_permute_cost */
722 2, /* permute_cost */
723 2, /* reduc_i8_cost */
724 2, /* reduc_i16_cost */
725 2, /* reduc_i32_cost */
726 2, /* reduc_i64_cost */
727 2, /* reduc_f16_cost */
728 2, /* reduc_f32_cost */
729 2, /* reduc_f64_cost */
730 2, /* store_elt_extra_cost */
731 2, /* vec_to_scalar_cost */
732 1, /* scalar_to_vec_cost */
733 1, /* align_load_cost */
734 1, /* unalign_load_cost */
735 1, /* unalign_store_cost */
736 1 /* store_cost */
738 2, /* clast_cost */
739 2, /* fadda_f16_cost */
740 2, /* fadda_f32_cost */
741 2, /* fadda_f64_cost */
742 4, /* gather_load_x32_cost */
743 2, /* gather_load_x64_cost */
744 1 /* scatter_store_elt_cost */
747 /* Generic costs for vector insn classes. */
748 static const struct cpu_vector_cost generic_vector_cost =
750 1, /* scalar_int_stmt_cost */
751 1, /* scalar_fp_stmt_cost */
752 1, /* scalar_load_cost */
753 1, /* scalar_store_cost */
754 3, /* cond_taken_branch_cost */
755 1, /* cond_not_taken_branch_cost */
756 &generic_advsimd_vector_cost, /* advsimd */
757 &generic_sve_vector_cost, /* sve */
758 nullptr /* issue_info */
761 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
763 2, /* int_stmt_cost */
764 5, /* fp_stmt_cost */
765 0, /* ld2_st2_permute_cost */
766 0, /* ld3_st3_permute_cost */
767 0, /* ld4_st4_permute_cost */
768 3, /* permute_cost */
769 13, /* reduc_i8_cost */
770 13, /* reduc_i16_cost */
771 13, /* reduc_i32_cost */
772 13, /* reduc_i64_cost */
773 13, /* reduc_f16_cost */
774 13, /* reduc_f32_cost */
775 13, /* reduc_f64_cost */
776 13, /* store_elt_extra_cost */
777 13, /* vec_to_scalar_cost */
778 4, /* scalar_to_vec_cost */
779 6, /* align_load_cost */
780 6, /* unalign_load_cost */
781 1, /* unalign_store_cost */
782 1 /* store_cost */
785 static const sve_vec_cost a64fx_sve_vector_cost =
788 2, /* int_stmt_cost */
789 5, /* fp_stmt_cost */
790 0, /* ld2_st2_permute_cost */
791 0, /* ld3_st3_permute_cost */
792 0, /* ld4_st4_permute_cost */
793 3, /* permute_cost */
794 13, /* reduc_i8_cost */
795 13, /* reduc_i16_cost */
796 13, /* reduc_i32_cost */
797 13, /* reduc_i64_cost */
798 13, /* reduc_f16_cost */
799 13, /* reduc_f32_cost */
800 13, /* reduc_f64_cost */
801 13, /* store_elt_extra_cost */
802 13, /* vec_to_scalar_cost */
803 4, /* scalar_to_vec_cost */
804 6, /* align_load_cost */
805 6, /* unalign_load_cost */
806 1, /* unalign_store_cost */
807 1 /* store_cost */
809 13, /* clast_cost */
810 13, /* fadda_f16_cost */
811 13, /* fadda_f32_cost */
812 13, /* fadda_f64_cost */
813 64, /* gather_load_x32_cost */
814 32, /* gather_load_x64_cost */
815 1 /* scatter_store_elt_cost */
818 static const struct cpu_vector_cost a64fx_vector_cost =
820 1, /* scalar_int_stmt_cost */
821 5, /* scalar_fp_stmt_cost */
822 4, /* scalar_load_cost */
823 1, /* scalar_store_cost */
824 3, /* cond_taken_branch_cost */
825 1, /* cond_not_taken_branch_cost */
826 &a64fx_advsimd_vector_cost, /* advsimd */
827 &a64fx_sve_vector_cost, /* sve */
828 nullptr /* issue_info */
831 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
833 1, /* int_stmt_cost */
834 3, /* fp_stmt_cost */
835 0, /* ld2_st2_permute_cost */
836 0, /* ld3_st3_permute_cost */
837 0, /* ld4_st4_permute_cost */
838 2, /* permute_cost */
839 1, /* reduc_i8_cost */
840 1, /* reduc_i16_cost */
841 1, /* reduc_i32_cost */
842 1, /* reduc_i64_cost */
843 1, /* reduc_f16_cost */
844 1, /* reduc_f32_cost */
845 1, /* reduc_f64_cost */
846 1, /* store_elt_extra_cost */
847 1, /* vec_to_scalar_cost */
848 1, /* scalar_to_vec_cost */
849 1, /* align_load_cost */
850 1, /* unalign_load_cost */
851 1, /* unalign_store_cost */
852 1 /* store_cost */
855 /* QDF24XX costs for vector insn classes. */
856 static const struct cpu_vector_cost qdf24xx_vector_cost =
858 1, /* scalar_int_stmt_cost */
859 1, /* scalar_fp_stmt_cost */
860 1, /* scalar_load_cost */
861 1, /* scalar_store_cost */
862 3, /* cond_taken_branch_cost */
863 1, /* cond_not_taken_branch_cost */
864 &qdf24xx_advsimd_vector_cost, /* advsimd */
865 nullptr, /* sve */
866 nullptr /* issue_info */
870 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
872 4, /* int_stmt_cost */
873 1, /* fp_stmt_cost */
874 0, /* ld2_st2_permute_cost */
875 0, /* ld3_st3_permute_cost */
876 0, /* ld4_st4_permute_cost */
877 4, /* permute_cost */
878 2, /* reduc_i8_cost */
879 2, /* reduc_i16_cost */
880 2, /* reduc_i32_cost */
881 2, /* reduc_i64_cost */
882 2, /* reduc_f16_cost */
883 2, /* reduc_f32_cost */
884 2, /* reduc_f64_cost */
885 2, /* store_elt_extra_cost */
886 2, /* vec_to_scalar_cost */
887 2, /* scalar_to_vec_cost */
888 3, /* align_load_cost */
889 5, /* unalign_load_cost */
890 5, /* unalign_store_cost */
891 1 /* store_cost */
894 /* ThunderX costs for vector insn classes. */
895 static const struct cpu_vector_cost thunderx_vector_cost =
897 1, /* scalar_int_stmt_cost */
898 1, /* scalar_fp_stmt_cost */
899 3, /* scalar_load_cost */
900 1, /* scalar_store_cost */
901 3, /* cond_taken_branch_cost */
902 3, /* cond_not_taken_branch_cost */
903 &thunderx_advsimd_vector_cost, /* advsimd */
904 nullptr, /* sve */
905 nullptr /* issue_info */
908 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
910 2, /* int_stmt_cost */
911 2, /* fp_stmt_cost */
912 0, /* ld2_st2_permute_cost */
913 0, /* ld3_st3_permute_cost */
914 0, /* ld4_st4_permute_cost */
915 2, /* permute_cost */
916 3, /* reduc_i8_cost */
917 3, /* reduc_i16_cost */
918 3, /* reduc_i32_cost */
919 3, /* reduc_i64_cost */
920 3, /* reduc_f16_cost */
921 3, /* reduc_f32_cost */
922 3, /* reduc_f64_cost */
923 3, /* store_elt_extra_cost */
924 3, /* vec_to_scalar_cost */
925 2, /* scalar_to_vec_cost */
926 5, /* align_load_cost */
927 5, /* unalign_load_cost */
928 1, /* unalign_store_cost */
929 1 /* store_cost */
932 static const struct cpu_vector_cost tsv110_vector_cost =
934 1, /* scalar_int_stmt_cost */
935 1, /* scalar_fp_stmt_cost */
936 5, /* scalar_load_cost */
937 1, /* scalar_store_cost */
938 1, /* cond_taken_branch_cost */
939 1, /* cond_not_taken_branch_cost */
940 &tsv110_advsimd_vector_cost, /* advsimd */
941 nullptr, /* sve */
942 nullptr /* issue_info */
945 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
947 2, /* int_stmt_cost */
948 2, /* fp_stmt_cost */
949 0, /* ld2_st2_permute_cost */
950 0, /* ld3_st3_permute_cost */
951 0, /* ld4_st4_permute_cost */
952 3, /* permute_cost */
953 8, /* reduc_i8_cost */
954 8, /* reduc_i16_cost */
955 8, /* reduc_i32_cost */
956 8, /* reduc_i64_cost */
957 8, /* reduc_f16_cost */
958 8, /* reduc_f32_cost */
959 8, /* reduc_f64_cost */
960 8, /* store_elt_extra_cost */
961 8, /* vec_to_scalar_cost */
962 8, /* scalar_to_vec_cost */
963 4, /* align_load_cost */
964 4, /* unalign_load_cost */
965 1, /* unalign_store_cost */
966 1 /* store_cost */
969 /* Cortex-A57 costs for vector insn classes. */
970 static const struct cpu_vector_cost cortexa57_vector_cost =
972 1, /* scalar_int_stmt_cost */
973 1, /* scalar_fp_stmt_cost */
974 4, /* scalar_load_cost */
975 1, /* scalar_store_cost */
976 1, /* cond_taken_branch_cost */
977 1, /* cond_not_taken_branch_cost */
978 &cortexa57_advsimd_vector_cost, /* advsimd */
979 nullptr, /* sve */
980 nullptr /* issue_info */
983 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
985 3, /* int_stmt_cost */
986 3, /* fp_stmt_cost */
987 0, /* ld2_st2_permute_cost */
988 0, /* ld3_st3_permute_cost */
989 0, /* ld4_st4_permute_cost */
990 3, /* permute_cost */
991 3, /* reduc_i8_cost */
992 3, /* reduc_i16_cost */
993 3, /* reduc_i32_cost */
994 3, /* reduc_i64_cost */
995 3, /* reduc_f16_cost */
996 3, /* reduc_f32_cost */
997 3, /* reduc_f64_cost */
998 3, /* store_elt_extra_cost */
999 3, /* vec_to_scalar_cost */
1000 3, /* scalar_to_vec_cost */
1001 5, /* align_load_cost */
1002 5, /* unalign_load_cost */
1003 1, /* unalign_store_cost */
1004 1 /* store_cost */
1007 static const struct cpu_vector_cost exynosm1_vector_cost =
1009 1, /* scalar_int_stmt_cost */
1010 1, /* scalar_fp_stmt_cost */
1011 5, /* scalar_load_cost */
1012 1, /* scalar_store_cost */
1013 1, /* cond_taken_branch_cost */
1014 1, /* cond_not_taken_branch_cost */
1015 &exynosm1_advsimd_vector_cost, /* advsimd */
1016 nullptr, /* sve */
1017 nullptr /* issue_info */
1020 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
1022 2, /* int_stmt_cost */
1023 2, /* fp_stmt_cost */
1024 0, /* ld2_st2_permute_cost */
1025 0, /* ld3_st3_permute_cost */
1026 0, /* ld4_st4_permute_cost */
1027 2, /* permute_cost */
1028 4, /* reduc_i8_cost */
1029 4, /* reduc_i16_cost */
1030 4, /* reduc_i32_cost */
1031 4, /* reduc_i64_cost */
1032 4, /* reduc_f16_cost */
1033 4, /* reduc_f32_cost */
1034 4, /* reduc_f64_cost */
1035 4, /* store_elt_extra_cost */
1036 4, /* vec_to_scalar_cost */
1037 4, /* scalar_to_vec_cost */
1038 10, /* align_load_cost */
1039 10, /* unalign_load_cost */
1040 2, /* unalign_store_cost */
1041 2 /* store_cost */
1044 /* Generic costs for vector insn classes. */
1045 static const struct cpu_vector_cost xgene1_vector_cost =
1047 1, /* scalar_int_stmt_cost */
1048 1, /* scalar_fp_stmt_cost */
1049 5, /* scalar_load_cost */
1050 1, /* scalar_store_cost */
1051 2, /* cond_taken_branch_cost */
1052 1, /* cond_not_taken_branch_cost */
1053 &xgene1_advsimd_vector_cost, /* advsimd */
1054 nullptr, /* sve */
1055 nullptr /* issue_info */
1058 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
1060 4, /* int_stmt_cost */
1061 5, /* fp_stmt_cost */
1062 0, /* ld2_st2_permute_cost */
1063 0, /* ld3_st3_permute_cost */
1064 0, /* ld4_st4_permute_cost */
1065 10, /* permute_cost */
1066 6, /* reduc_i8_cost */
1067 6, /* reduc_i16_cost */
1068 6, /* reduc_i32_cost */
1069 6, /* reduc_i64_cost */
1070 6, /* reduc_f16_cost */
1071 6, /* reduc_f32_cost */
1072 6, /* reduc_f64_cost */
1073 6, /* store_elt_extra_cost */
1074 6, /* vec_to_scalar_cost */
1075 5, /* scalar_to_vec_cost */
1076 4, /* align_load_cost */
1077 4, /* unalign_load_cost */
1078 1, /* unalign_store_cost */
1079 1 /* store_cost */
1082 /* Costs for vector insn classes for Vulcan. */
1083 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1085 1, /* scalar_int_stmt_cost */
1086 6, /* scalar_fp_stmt_cost */
1087 4, /* scalar_load_cost */
1088 1, /* scalar_store_cost */
1089 2, /* cond_taken_branch_cost */
1090 1, /* cond_not_taken_branch_cost */
1091 &thunderx2t99_advsimd_vector_cost, /* advsimd */
1092 nullptr, /* sve */
1093 nullptr /* issue_info */
1096 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1098 5, /* int_stmt_cost */
1099 5, /* fp_stmt_cost */
1100 0, /* ld2_st2_permute_cost */
1101 0, /* ld3_st3_permute_cost */
1102 0, /* ld4_st4_permute_cost */
1103 10, /* permute_cost */
1104 5, /* reduc_i8_cost */
1105 5, /* reduc_i16_cost */
1106 5, /* reduc_i32_cost */
1107 5, /* reduc_i64_cost */
1108 5, /* reduc_f16_cost */
1109 5, /* reduc_f32_cost */
1110 5, /* reduc_f64_cost */
1111 5, /* store_elt_extra_cost */
1112 5, /* vec_to_scalar_cost */
1113 5, /* scalar_to_vec_cost */
1114 4, /* align_load_cost */
1115 4, /* unalign_load_cost */
1116 4, /* unalign_store_cost */
1117 4 /* store_cost */
1120 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1122 1, /* scalar_int_stmt_cost */
1123 5, /* scalar_fp_stmt_cost */
1124 4, /* scalar_load_cost */
1125 1, /* scalar_store_cost */
1126 2, /* cond_taken_branch_cost */
1127 1, /* cond_not_taken_branch_cost */
1128 &thunderx3t110_advsimd_vector_cost, /* advsimd */
1129 nullptr, /* sve */
1130 nullptr /* issue_info */
1133 static const advsimd_vec_cost ampere1_advsimd_vector_cost =
1135 1, /* int_stmt_cost */
1136 3, /* fp_stmt_cost */
1137 0, /* ld2_st2_permute_cost */
1138 0, /* ld3_st3_permute_cost */
1139 0, /* ld4_st4_permute_cost */
1140 2, /* permute_cost */
1141 12, /* reduc_i8_cost */
1142 9, /* reduc_i16_cost */
1143 6, /* reduc_i32_cost */
1144 5, /* reduc_i64_cost */
1145 9, /* reduc_f16_cost */
1146 6, /* reduc_f32_cost */
1147 5, /* reduc_f64_cost */
1148 8, /* store_elt_extra_cost */
1149 6, /* vec_to_scalar_cost */
1150 7, /* scalar_to_vec_cost */
1151 4, /* align_load_cost */
1152 4, /* unalign_load_cost */
1153 1, /* unalign_store_cost */
1154 1 /* store_cost */
1157 /* Ampere-1 costs for vector insn classes. */
1158 static const struct cpu_vector_cost ampere1_vector_cost =
1160 1, /* scalar_int_stmt_cost */
1161 3, /* scalar_fp_stmt_cost */
1162 4, /* scalar_load_cost */
1163 1, /* scalar_store_cost */
1164 1, /* cond_taken_branch_cost */
1165 1, /* cond_not_taken_branch_cost */
1166 &ampere1_advsimd_vector_cost, /* advsimd */
1167 nullptr, /* sve */
1168 nullptr /* issue_info */
1171 /* Generic costs for branch instructions. */
1172 static const struct cpu_branch_cost generic_branch_cost =
1174 1, /* Predictable. */
1175 3 /* Unpredictable. */
1178 /* Generic approximation modes. */
1179 static const cpu_approx_modes generic_approx_modes =
1181 AARCH64_APPROX_NONE, /* division */
1182 AARCH64_APPROX_NONE, /* sqrt */
1183 AARCH64_APPROX_NONE /* recip_sqrt */
1186 /* Approximation modes for Exynos M1. */
1187 static const cpu_approx_modes exynosm1_approx_modes =
1189 AARCH64_APPROX_NONE, /* division */
1190 AARCH64_APPROX_ALL, /* sqrt */
1191 AARCH64_APPROX_ALL /* recip_sqrt */
1194 /* Approximation modes for X-Gene 1. */
1195 static const cpu_approx_modes xgene1_approx_modes =
1197 AARCH64_APPROX_NONE, /* division */
1198 AARCH64_APPROX_NONE, /* sqrt */
1199 AARCH64_APPROX_ALL /* recip_sqrt */
1202 /* Generic prefetch settings (which disable prefetch). */
1203 static const cpu_prefetch_tune generic_prefetch_tune =
1205 0, /* num_slots */
1206 -1, /* l1_cache_size */
1207 -1, /* l1_cache_line_size */
1208 -1, /* l2_cache_size */
1209 true, /* prefetch_dynamic_strides */
1210 -1, /* minimum_stride */
1211 -1 /* default_opt_level */
1214 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1216 0, /* num_slots */
1217 -1, /* l1_cache_size */
1218 64, /* l1_cache_line_size */
1219 -1, /* l2_cache_size */
1220 true, /* prefetch_dynamic_strides */
1221 -1, /* minimum_stride */
1222 -1 /* default_opt_level */
1225 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1227 4, /* num_slots */
1228 32, /* l1_cache_size */
1229 64, /* l1_cache_line_size */
1230 512, /* l2_cache_size */
1231 false, /* prefetch_dynamic_strides */
1232 2048, /* minimum_stride */
1233 3 /* default_opt_level */
1236 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1238 8, /* num_slots */
1239 32, /* l1_cache_size */
1240 128, /* l1_cache_line_size */
1241 16*1024, /* l2_cache_size */
1242 true, /* prefetch_dynamic_strides */
1243 -1, /* minimum_stride */
1244 3 /* default_opt_level */
1247 static const cpu_prefetch_tune thunderx_prefetch_tune =
1249 8, /* num_slots */
1250 32, /* l1_cache_size */
1251 128, /* l1_cache_line_size */
1252 -1, /* l2_cache_size */
1253 true, /* prefetch_dynamic_strides */
1254 -1, /* minimum_stride */
1255 -1 /* default_opt_level */
1258 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1260 8, /* num_slots */
1261 32, /* l1_cache_size */
1262 64, /* l1_cache_line_size */
1263 256, /* l2_cache_size */
1264 true, /* prefetch_dynamic_strides */
1265 -1, /* minimum_stride */
1266 -1 /* default_opt_level */
1269 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1271 8, /* num_slots */
1272 32, /* l1_cache_size */
1273 64, /* l1_cache_line_size */
1274 256, /* l2_cache_size */
1275 true, /* prefetch_dynamic_strides */
1276 -1, /* minimum_stride */
1277 -1 /* default_opt_level */
1280 static const cpu_prefetch_tune tsv110_prefetch_tune =
1282 0, /* num_slots */
1283 64, /* l1_cache_size */
1284 64, /* l1_cache_line_size */
1285 512, /* l2_cache_size */
1286 true, /* prefetch_dynamic_strides */
1287 -1, /* minimum_stride */
1288 -1 /* default_opt_level */
1291 static const cpu_prefetch_tune xgene1_prefetch_tune =
1293 8, /* num_slots */
1294 32, /* l1_cache_size */
1295 64, /* l1_cache_line_size */
1296 256, /* l2_cache_size */
1297 true, /* prefetch_dynamic_strides */
1298 -1, /* minimum_stride */
1299 -1 /* default_opt_level */
1302 static const cpu_prefetch_tune a64fx_prefetch_tune =
1304 8, /* num_slots */
1305 64, /* l1_cache_size */
1306 256, /* l1_cache_line_size */
1307 32768, /* l2_cache_size */
1308 true, /* prefetch_dynamic_strides */
1309 -1, /* minimum_stride */
1310 -1 /* default_opt_level */
1313 static const cpu_prefetch_tune ampere1_prefetch_tune =
1315 0, /* num_slots */
1316 64, /* l1_cache_size */
1317 64, /* l1_cache_line_size */
1318 2048, /* l2_cache_size */
1319 true, /* prefetch_dynamic_strides */
1320 -1, /* minimum_stride */
1321 -1 /* default_opt_level */
1324 static const struct tune_params generic_tunings =
1326 &cortexa57_extra_costs,
1327 &generic_addrcost_table,
1328 &generic_regmove_cost,
1329 &generic_vector_cost,
1330 &generic_branch_cost,
1331 &generic_approx_modes,
1332 SVE_NOT_IMPLEMENTED, /* sve_width */
1333 { 4, /* load_int. */
1334 4, /* store_int. */
1335 4, /* load_fp. */
1336 4, /* store_fp. */
1337 4, /* load_pred. */
1338 4 /* store_pred. */
1339 }, /* memmov_cost. */
1340 2, /* issue_rate */
1341 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1342 "16:12", /* function_align. */
1343 "4", /* jump_align. */
1344 "8", /* loop_align. */
1345 2, /* int_reassoc_width. */
1346 4, /* fp_reassoc_width. */
1347 1, /* fma_reassoc_width. */
1348 1, /* vec_reassoc_width. */
1349 2, /* min_div_recip_mul_sf. */
1350 2, /* min_div_recip_mul_df. */
1351 0, /* max_case_values. */
1352 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1353 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1354 Neoverse V1. It does not have a noticeable effect on A64FX and should
1355 have at most a very minor effect on SVE2 cores. */
1356 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
1357 &generic_prefetch_tune
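/* Editorial note: the function_align/jump_align/loop_align strings in these
   tuning tables use the same N:M syntax as -falign-functions and friends:
   N is the requested byte alignment and M caps how much padding may be
   inserted to reach it, so "16:12" above asks for 16-byte alignment with a
   modest padding limit, while several later tables use "32:16".  */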
1360 static const struct tune_params cortexa35_tunings =
1362 &cortexa53_extra_costs,
1363 &generic_addrcost_table,
1364 &cortexa53_regmove_cost,
1365 &generic_vector_cost,
1366 &generic_branch_cost,
1367 &generic_approx_modes,
1368 SVE_NOT_IMPLEMENTED, /* sve_width */
1369 { 4, /* load_int. */
1370 4, /* store_int. */
1371 4, /* load_fp. */
1372 4, /* store_fp. */
1373 4, /* load_pred. */
1374 4 /* store_pred. */
1375 }, /* memmov_cost. */
1376 1, /* issue_rate */
1377 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1378 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1379 "16", /* function_align. */
1380 "4", /* jump_align. */
1381 "8", /* loop_align. */
1382 2, /* int_reassoc_width. */
1383 4, /* fp_reassoc_width. */
1384 1, /* fma_reassoc_width. */
1385 1, /* vec_reassoc_width. */
1386 2, /* min_div_recip_mul_sf. */
1387 2, /* min_div_recip_mul_df. */
1388 0, /* max_case_values. */
1389 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1390 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1391 &generic_prefetch_tune
1394 static const struct tune_params cortexa53_tunings =
1396 &cortexa53_extra_costs,
1397 &generic_addrcost_table,
1398 &cortexa53_regmove_cost,
1399 &generic_vector_cost,
1400 &generic_branch_cost,
1401 &generic_approx_modes,
1402 SVE_NOT_IMPLEMENTED, /* sve_width */
1403 { 4, /* load_int. */
1404 4, /* store_int. */
1405 4, /* load_fp. */
1406 4, /* store_fp. */
1407 4, /* load_pred. */
1408 4 /* store_pred. */
1409 }, /* memmov_cost. */
1410 2, /* issue_rate */
1411 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1412 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1413 "16", /* function_align. */
1414 "4", /* jump_align. */
1415 "8", /* loop_align. */
1416 2, /* int_reassoc_width. */
1417 4, /* fp_reassoc_width. */
1418 1, /* fma_reassoc_width. */
1419 1, /* vec_reassoc_width. */
1420 2, /* min_div_recip_mul_sf. */
1421 2, /* min_div_recip_mul_df. */
1422 0, /* max_case_values. */
1423 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1424 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1425 &generic_prefetch_tune
1428 static const struct tune_params cortexa57_tunings =
1430 &cortexa57_extra_costs,
1431 &generic_addrcost_table,
1432 &cortexa57_regmove_cost,
1433 &cortexa57_vector_cost,
1434 &generic_branch_cost,
1435 &generic_approx_modes,
1436 SVE_NOT_IMPLEMENTED, /* sve_width */
1437 { 4, /* load_int. */
1438 4, /* store_int. */
1439 4, /* load_fp. */
1440 4, /* store_fp. */
1441 4, /* load_pred. */
1442 4 /* store_pred. */
1443 }, /* memmov_cost. */
1444 3, /* issue_rate */
1445 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1446 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1447 "16", /* function_align. */
1448 "4", /* jump_align. */
1449 "8", /* loop_align. */
1450 2, /* int_reassoc_width. */
1451 4, /* fp_reassoc_width. */
1452 1, /* fma_reassoc_width. */
1453 1, /* vec_reassoc_width. */
1454 2, /* min_div_recip_mul_sf. */
1455 2, /* min_div_recip_mul_df. */
1456 0, /* max_case_values. */
1457 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1458 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1459 &generic_prefetch_tune
1462 static const struct tune_params cortexa72_tunings =
1464 &cortexa57_extra_costs,
1465 &generic_addrcost_table,
1466 &cortexa57_regmove_cost,
1467 &cortexa57_vector_cost,
1468 &generic_branch_cost,
1469 &generic_approx_modes,
1470 SVE_NOT_IMPLEMENTED, /* sve_width */
1471 { 4, /* load_int. */
1472 4, /* store_int. */
1473 4, /* load_fp. */
1474 4, /* store_fp. */
1475 4, /* load_pred. */
1476 4 /* store_pred. */
1477 }, /* memmov_cost. */
1478 3, /* issue_rate */
1479 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1480 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1481 "16", /* function_align. */
1482 "4", /* jump_align. */
1483 "8", /* loop_align. */
1484 2, /* int_reassoc_width. */
1485 4, /* fp_reassoc_width. */
1486 1, /* fma_reassoc_width. */
1487 1, /* vec_reassoc_width. */
1488 2, /* min_div_recip_mul_sf. */
1489 2, /* min_div_recip_mul_df. */
1490 0, /* max_case_values. */
1491 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1492 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1493 &generic_prefetch_tune
1496 static const struct tune_params cortexa73_tunings =
1498 &cortexa57_extra_costs,
1499 &generic_addrcost_table,
1500 &cortexa57_regmove_cost,
1501 &cortexa57_vector_cost,
1502 &generic_branch_cost,
1503 &generic_approx_modes,
1504 SVE_NOT_IMPLEMENTED, /* sve_width */
1505 { 4, /* load_int. */
1506 4, /* store_int. */
1507 4, /* load_fp. */
1508 4, /* store_fp. */
1509 4, /* load_pred. */
1510 4 /* store_pred. */
1511 }, /* memmov_cost. */
1512 2, /* issue_rate. */
1513 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1514 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1515 "16", /* function_align. */
1516 "4", /* jump_align. */
1517 "8", /* loop_align. */
1518 2, /* int_reassoc_width. */
1519 4, /* fp_reassoc_width. */
1520 1, /* fma_reassoc_width. */
1521 1, /* vec_reassoc_width. */
1522 2, /* min_div_recip_mul_sf. */
1523 2, /* min_div_recip_mul_df. */
1524 0, /* max_case_values. */
1525 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1526 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1527 &generic_prefetch_tune
1532 static const struct tune_params exynosm1_tunings =
1534 &exynosm1_extra_costs,
1535 &exynosm1_addrcost_table,
1536 &exynosm1_regmove_cost,
1537 &exynosm1_vector_cost,
1538 &generic_branch_cost,
1539 &exynosm1_approx_modes,
1540 SVE_NOT_IMPLEMENTED, /* sve_width */
1541 { 4, /* load_int. */
1542 4, /* store_int. */
1543 4, /* load_fp. */
1544 4, /* store_fp. */
1545 4, /* load_pred. */
1546 4 /* store_pred. */
1547 }, /* memmov_cost. */
1548 3, /* issue_rate */
1549 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1550 "4", /* function_align. */
1551 "4", /* jump_align. */
1552 "4", /* loop_align. */
1553 2, /* int_reassoc_width. */
1554 4, /* fp_reassoc_width. */
1555 1, /* fma_reassoc_width. */
1556 1, /* vec_reassoc_width. */
1557 2, /* min_div_recip_mul_sf. */
1558 2, /* min_div_recip_mul_df. */
1559 48, /* max_case_values. */
1560 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1561 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1562 &exynosm1_prefetch_tune
1565 static const struct tune_params thunderxt88_tunings =
1567 &thunderx_extra_costs,
1568 &generic_addrcost_table,
1569 &thunderx_regmove_cost,
1570 &thunderx_vector_cost,
1571 &generic_branch_cost,
1572 &generic_approx_modes,
1573 SVE_NOT_IMPLEMENTED, /* sve_width */
1574 { 6, /* load_int. */
1575 6, /* store_int. */
1576 6, /* load_fp. */
1577 6, /* store_fp. */
1578 6, /* load_pred. */
1579 6 /* store_pred. */
1580 }, /* memmov_cost. */
1581 2, /* issue_rate */
1582 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1583 "8", /* function_align. */
1584 "8", /* jump_align. */
1585 "8", /* loop_align. */
1586 2, /* int_reassoc_width. */
1587 4, /* fp_reassoc_width. */
1588 1, /* fma_reassoc_width. */
1589 1, /* vec_reassoc_width. */
1590 2, /* min_div_recip_mul_sf. */
1591 2, /* min_div_recip_mul_df. */
1592 0, /* max_case_values. */
1593 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1594 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1595 &thunderxt88_prefetch_tune
1598 static const struct tune_params thunderx_tunings =
1600 &thunderx_extra_costs,
1601 &generic_addrcost_table,
1602 &thunderx_regmove_cost,
1603 &thunderx_vector_cost,
1604 &generic_branch_cost,
1605 &generic_approx_modes,
1606 SVE_NOT_IMPLEMENTED, /* sve_width */
1607 { 6, /* load_int. */
1608 6, /* store_int. */
1609 6, /* load_fp. */
1610 6, /* store_fp. */
1611 6, /* load_pred. */
1612 6 /* store_pred. */
1613 }, /* memmov_cost. */
1614 2, /* issue_rate */
1615 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1616 "8", /* function_align. */
1617 "8", /* jump_align. */
1618 "8", /* loop_align. */
1619 2, /* int_reassoc_width. */
1620 4, /* fp_reassoc_width. */
1621 1, /* fma_reassoc_width. */
1622 1, /* vec_reassoc_width. */
1623 2, /* min_div_recip_mul_sf. */
1624 2, /* min_div_recip_mul_df. */
1625 0, /* max_case_values. */
1626 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1627 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1628 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1629 &thunderx_prefetch_tune
1632 static const struct tune_params tsv110_tunings =
1634 &tsv110_extra_costs,
1635 &tsv110_addrcost_table,
1636 &tsv110_regmove_cost,
1637 &tsv110_vector_cost,
1638 &generic_branch_cost,
1639 &generic_approx_modes,
1640 SVE_NOT_IMPLEMENTED, /* sve_width */
1641 { 4, /* load_int. */
1642 4, /* store_int. */
1643 4, /* load_fp. */
1644 4, /* store_fp. */
1645 4, /* load_pred. */
1646 4 /* store_pred. */
1647 }, /* memmov_cost. */
1648 4, /* issue_rate */
1649 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1650 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1651 "16", /* function_align. */
1652 "4", /* jump_align. */
1653 "8", /* loop_align. */
1654 2, /* int_reassoc_width. */
1655 4, /* fp_reassoc_width. */
1656 1, /* fma_reassoc_width. */
1657 1, /* vec_reassoc_width. */
1658 2, /* min_div_recip_mul_sf. */
1659 2, /* min_div_recip_mul_df. */
1660 0, /* max_case_values. */
1661 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1662 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1663 &tsv110_prefetch_tune
1666 static const struct tune_params xgene1_tunings =
1668 &xgene1_extra_costs,
1669 &xgene1_addrcost_table,
1670 &xgene1_regmove_cost,
1671 &xgene1_vector_cost,
1672 &generic_branch_cost,
1673 &xgene1_approx_modes,
1674 SVE_NOT_IMPLEMENTED, /* sve_width */
1675 { 6, /* load_int. */
1676 6, /* store_int. */
1677 6, /* load_fp. */
1678 6, /* store_fp. */
1679 6, /* load_pred. */
1680 6 /* store_pred. */
1681 }, /* memmov_cost. */
1682 4, /* issue_rate */
1683 AARCH64_FUSE_NOTHING, /* fusible_ops */
1684 "16", /* function_align. */
1685 "16", /* jump_align. */
1686 "16", /* loop_align. */
1687 2, /* int_reassoc_width. */
1688 4, /* fp_reassoc_width. */
1689 1, /* fma_reassoc_width. */
1690 1, /* vec_reassoc_width. */
1691 2, /* min_div_recip_mul_sf. */
1692 2, /* min_div_recip_mul_df. */
1693 17, /* max_case_values. */
1694 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1695 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1696 &xgene1_prefetch_tune
1699 static const struct tune_params emag_tunings =
1701 &xgene1_extra_costs,
1702 &xgene1_addrcost_table,
1703 &xgene1_regmove_cost,
1704 &xgene1_vector_cost,
1705 &generic_branch_cost,
1706 &xgene1_approx_modes,
1707 SVE_NOT_IMPLEMENTED,
1708 { 6, /* load_int. */
1709 6, /* store_int. */
1710 6, /* load_fp. */
1711 6, /* store_fp. */
1712 6, /* load_pred. */
1713 6 /* store_pred. */
1714 }, /* memmov_cost. */
1715 4, /* issue_rate */
1716 AARCH64_FUSE_NOTHING, /* fusible_ops */
1717 "16", /* function_align. */
1718 "16", /* jump_align. */
1719 "16", /* loop_align. */
1720 2, /* int_reassoc_width. */
1721 4, /* fp_reassoc_width. */
1722 1, /* fma_reassoc_width. */
1723 1, /* vec_reassoc_width. */
1724 2, /* min_div_recip_mul_sf. */
1725 2, /* min_div_recip_mul_df. */
1726 17, /* max_case_values. */
1727 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1728 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1729 &xgene1_prefetch_tune
1732 static const struct tune_params qdf24xx_tunings =
1734 &qdf24xx_extra_costs,
1735 &qdf24xx_addrcost_table,
1736 &qdf24xx_regmove_cost,
1737 &qdf24xx_vector_cost,
1738 &generic_branch_cost,
1739 &generic_approx_modes,
1740 SVE_NOT_IMPLEMENTED, /* sve_width */
1741 { 4, /* load_int. */
1742 4, /* store_int. */
1743 4, /* load_fp. */
1744 4, /* store_fp. */
1745 4, /* load_pred. */
1746 4 /* store_pred. */
1747 }, /* memmov_cost. */
1748 4, /* issue_rate */
1749 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1750 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1751 "16", /* function_align. */
1752 "8", /* jump_align. */
1753 "16", /* loop_align. */
1754 2, /* int_reassoc_width. */
1755 4, /* fp_reassoc_width. */
1756 1, /* fma_reassoc_width. */
1757 1, /* vec_reassoc_width. */
1758 2, /* min_div_recip_mul_sf. */
1759 2, /* min_div_recip_mul_df. */
1760 0, /* max_case_values. */
1761 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1762 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1763 &qdf24xx_prefetch_tune
1766 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1767 for now. */
1768 static const struct tune_params saphira_tunings =
1770 &generic_extra_costs,
1771 &generic_addrcost_table,
1772 &generic_regmove_cost,
1773 &generic_vector_cost,
1774 &generic_branch_cost,
1775 &generic_approx_modes,
1776 SVE_NOT_IMPLEMENTED, /* sve_width */
1777 { 4, /* load_int. */
1778 4, /* store_int. */
1779 4, /* load_fp. */
1780 4, /* store_fp. */
1781 4, /* load_pred. */
1782 4 /* store_pred. */
1783 }, /* memmov_cost. */
1784 4, /* issue_rate */
1785 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1786 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1787 "16", /* function_align. */
1788 "8", /* jump_align. */
1789 "16", /* loop_align. */
1790 2, /* int_reassoc_width. */
1791 4, /* fp_reassoc_width. */
1792 1, /* fma_reassoc_width. */
1793 1, /* vec_reassoc_width. */
1794 2, /* min_div_recip_mul_sf. */
1795 2, /* min_div_recip_mul_df. */
1796 0, /* max_case_values. */
1797 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1798 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1799 &generic_prefetch_tune
1802 static const struct tune_params thunderx2t99_tunings =
1804 &thunderx2t99_extra_costs,
1805 &thunderx2t99_addrcost_table,
1806 &thunderx2t99_regmove_cost,
1807 &thunderx2t99_vector_cost,
1808 &generic_branch_cost,
1809 &generic_approx_modes,
1810 SVE_NOT_IMPLEMENTED, /* sve_width */
1811 { 4, /* load_int. */
1812 4, /* store_int. */
1813 4, /* load_fp. */
1814 4, /* store_fp. */
1815 4, /* load_pred. */
1816 4 /* store_pred. */
1817 }, /* memmov_cost. */
1818 4, /* issue_rate. */
1819 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1820 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1821 "16", /* function_align. */
1822 "8", /* jump_align. */
1823 "16", /* loop_align. */
1824 3, /* int_reassoc_width. */
1825 2, /* fp_reassoc_width. */
1826 1, /* fma_reassoc_width. */
1827 2, /* vec_reassoc_width. */
1828 2, /* min_div_recip_mul_sf. */
1829 2, /* min_div_recip_mul_df. */
1830 0, /* max_case_values. */
1831 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1832 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1833 &thunderx2t99_prefetch_tune
1836 static const struct tune_params thunderx3t110_tunings =
1838 &thunderx3t110_extra_costs,
1839 &thunderx3t110_addrcost_table,
1840 &thunderx3t110_regmove_cost,
1841 &thunderx3t110_vector_cost,
1842 &generic_branch_cost,
1843 &generic_approx_modes,
1844 SVE_NOT_IMPLEMENTED, /* sve_width */
1845 { 4, /* load_int. */
1846 4, /* store_int. */
1847 4, /* load_fp. */
1848 4, /* store_fp. */
1849 4, /* load_pred. */
1850 4 /* store_pred. */
1851 }, /* memmov_cost. */
1852 6, /* issue_rate. */
1853 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1854 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1855 "16", /* function_align. */
1856 "8", /* jump_align. */
1857 "16", /* loop_align. */
1858 3, /* int_reassoc_width. */
1859 2, /* fp_reassoc_width. */
1860 1, /* fma_reassoc_width. */
1861 2, /* vec_reassoc_width. */
1862 2, /* min_div_recip_mul_sf. */
1863 2, /* min_div_recip_mul_df. */
1864 0, /* max_case_values. */
1865 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1866 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1867 &thunderx3t110_prefetch_tune
1870 static const struct tune_params neoversen1_tunings =
1872 &cortexa76_extra_costs,
1873 &generic_addrcost_table,
1874 &generic_regmove_cost,
1875 &cortexa57_vector_cost,
1876 &generic_branch_cost,
1877 &generic_approx_modes,
1878 SVE_NOT_IMPLEMENTED, /* sve_width */
1879 { 4, /* load_int. */
1880 2, /* store_int. */
1881 5, /* load_fp. */
1882 2, /* store_fp. */
1883 4, /* load_pred. */
1884 4 /* store_pred. */
1885 }, /* memmov_cost. */
1886 3, /* issue_rate */
1887 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1888 "32:16", /* function_align. */
1889 "4", /* jump_align. */
1890 "32:16", /* loop_align. */
1891 2, /* int_reassoc_width. */
1892 4, /* fp_reassoc_width. */
1893 1, /* fma_reassoc_width. */
1894 2, /* vec_reassoc_width. */
1895 2, /* min_div_recip_mul_sf. */
1896 2, /* min_div_recip_mul_df. */
1897 0, /* max_case_values. */
1898 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1899 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1900 &generic_prefetch_tune
1903 static const struct tune_params ampere1_tunings =
1905 &ampere1_extra_costs,
1906 &generic_addrcost_table,
1907 &generic_regmove_cost,
1908 &ampere1_vector_cost,
1909 &generic_branch_cost,
1910 &generic_approx_modes,
1911 SVE_NOT_IMPLEMENTED, /* sve_width */
1912 { 4, /* load_int. */
1913 4, /* store_int. */
1914 4, /* load_fp. */
1915 4, /* store_fp. */
1916 4, /* load_pred. */
1917 4 /* store_pred. */
1918 }, /* memmov_cost. */
1919 4, /* issue_rate */
1920 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1921 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1922 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1923 AARCH64_FUSE_CMP_BRANCH),
1924 /* fusible_ops */
1925 "32", /* function_align. */
1926 "4", /* jump_align. */
1927 "32:16", /* loop_align. */
1928 2, /* int_reassoc_width. */
1929 4, /* fp_reassoc_width. */
1930 1, /* fma_reassoc_width. */
1931 2, /* vec_reassoc_width. */
1932 2, /* min_div_recip_mul_sf. */
1933 2, /* min_div_recip_mul_df. */
1934 0, /* max_case_values. */
1935 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1936 (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */
1937 &ampere1_prefetch_tune
1940 static const struct tune_params ampere1a_tunings =
1942 &ampere1a_extra_costs,
1943 &generic_addrcost_table,
1944 &generic_regmove_cost,
1945 &ampere1_vector_cost,
1946 &generic_branch_cost,
1947 &generic_approx_modes,
1948 SVE_NOT_IMPLEMENTED, /* sve_width */
1949 { 4, /* load_int. */
1950 4, /* store_int. */
1951 4, /* load_fp. */
1952 4, /* store_fp. */
1953 4, /* load_pred. */
1954 4 /* store_pred. */
1955 }, /* memmov_cost. */
1956 4, /* issue_rate */
1957 (AARCH64_FUSE_ADRP_ADD | AARCH64_FUSE_AES_AESMC |
1958 AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_MOVK_MOVK |
1959 AARCH64_FUSE_ALU_BRANCH /* adds, ands, bics, ccmp, ccmn */ |
1960 AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_ALU_CBZ |
1961 AARCH64_FUSE_ADDSUB_2REG_CONST1),
1962 /* fusible_ops */
1963 "32", /* function_align. */
1964 "4", /* jump_align. */
1965 "32:16", /* loop_align. */
1966 2, /* int_reassoc_width. */
1967 4, /* fp_reassoc_width. */
1968 1, /* fma_reassoc_width. */
1969 2, /* vec_reassoc_width. */
1970 2, /* min_div_recip_mul_sf. */
1971 2, /* min_div_recip_mul_df. */
1972 0, /* max_case_values. */
1973 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1974 (AARCH64_EXTRA_TUNE_NO_LDP_COMBINE), /* tune_flags. */
1975 &ampere1_prefetch_tune
1978 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1980 2, /* int_stmt_cost */
1981 2, /* fp_stmt_cost */
1982 4, /* ld2_st2_permute_cost */
1983 4, /* ld3_st3_permute_cost */
1984 5, /* ld4_st4_permute_cost */
1985 3, /* permute_cost */
1986 4, /* reduc_i8_cost */
1987 4, /* reduc_i16_cost */
1988 2, /* reduc_i32_cost */
1989 2, /* reduc_i64_cost */
1990 6, /* reduc_f16_cost */
1991 3, /* reduc_f32_cost */
1992 2, /* reduc_f64_cost */
1993 2, /* store_elt_extra_cost */
1994 /* This value is just inherited from the Cortex-A57 table. */
1995 8, /* vec_to_scalar_cost */
1996 /* This depends very much on what the scalar value is and
1997 where it comes from. E.g. some constants take two dependent
1998 instructions or a load, while others might be moved from a GPR.
1999 4 seems to be a reasonable compromise in practice. */
2000 4, /* scalar_to_vec_cost */
2001 4, /* align_load_cost */
2002 4, /* unalign_load_cost */
2003 /* Although stores have a latency of 2 and compete for the
2004 vector pipes, in practice it's better not to model that. */
2005 1, /* unalign_store_cost */
2006 1 /* store_cost */
2009 static const sve_vec_cost neoversev1_sve_vector_cost =
2012 2, /* int_stmt_cost */
2013 2, /* fp_stmt_cost */
2014 4, /* ld2_st2_permute_cost */
2015 7, /* ld3_st3_permute_cost */
2016 8, /* ld4_st4_permute_cost */
2017 3, /* permute_cost */
2018 /* Theoretically, a reduction involving 31 scalar ADDs could
2019 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
2020 completes in 14 cycles, so give it a cost of 31 + 5. */
2021 36, /* reduc_i8_cost */
2022 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
2023 22, /* reduc_i16_cost */
2024 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
2025 14, /* reduc_i32_cost */
2026 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
2027 11, /* reduc_i64_cost */
2028 /* Theoretically, a reduction involving 15 scalar FADDs could
2029 complete in ~9 cycles and would have a cost of 30. FADDV
2030 completes in 13 cycles, so give it a cost of 30 + 4. */
2031 34, /* reduc_f16_cost */
2032 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
2033 19, /* reduc_f32_cost */
2034 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
2035 11, /* reduc_f64_cost */
2036 2, /* store_elt_extra_cost */
2037 /* This value is just inherited from the Cortex-A57 table. */
2038 8, /* vec_to_scalar_cost */
2039 /* See the comment above the Advanced SIMD versions. */
2040 4, /* scalar_to_vec_cost */
2041 4, /* align_load_cost */
2042 4, /* unalign_load_cost */
2043 /* Although stores have a latency of 2 and compete for the
2044 vector pipes, in practice it's better not to model that. */
2045 1, /* unalign_store_cost */
2046 1 /* store_cost */
2048 3, /* clast_cost */
2049 19, /* fadda_f16_cost */
2050 11, /* fadda_f32_cost */
2051 8, /* fadda_f64_cost */
2052 32, /* gather_load_x32_cost */
2053 16, /* gather_load_x64_cost */
2054 3 /* scatter_store_elt_cost */
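/* Worked example of the reduction-cost arithmetic used in the comments
   above (a restatement of those comments, not new tuning data): for
   reduc_i8_cost, a chain of 31 scalar ADDs would cost 31 and finish in
   about 9 cycles, while [SU]ADDV takes 14 cycles, i.e. 5 cycles longer,
   so the table charges 31 + 5 = 36.  The i16/i32/i64 and FP entries
   follow the same pattern of "equivalent scalar cost plus the extra
   latency of the reduction instruction".  */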
2057 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
2059 3, /* loads_stores_per_cycle */
2060 2, /* stores_per_cycle */
2061 4, /* general_ops_per_cycle */
2062 0, /* fp_simd_load_general_ops */
2063 1 /* fp_simd_store_general_ops */
2066 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
2069 3, /* loads_stores_per_cycle */
2070 2, /* stores_per_cycle */
2071 4, /* general_ops_per_cycle */
2072 0, /* fp_simd_load_general_ops */
2073 1 /* fp_simd_store_general_ops */
2075 2, /* ld2_st2_general_ops */
2076 2, /* ld3_st3_general_ops */
2077 3 /* ld4_st4_general_ops */
2080 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
2084 2, /* loads_per_cycle */
2085 2, /* stores_per_cycle */
2086 2, /* general_ops_per_cycle */
2087 0, /* fp_simd_load_general_ops */
2088 1 /* fp_simd_store_general_ops */
2090 2, /* ld2_st2_general_ops */
2091 2, /* ld3_st3_general_ops */
2092 3 /* ld4_st4_general_ops */
2094 1, /* pred_ops_per_cycle */
2095 2, /* while_pred_ops */
2096 2, /* int_cmp_pred_ops */
2097 1, /* fp_cmp_pred_ops */
2098 1, /* gather_scatter_pair_general_ops */
2099 1 /* gather_scatter_pair_pred_ops */
2102 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
2104 &neoversev1_scalar_issue_info,
2105 &neoversev1_advsimd_issue_info,
2106 &neoversev1_sve_issue_info
2109 /* Neoverse V1 costs for vector insn classes. */
2110 static const struct cpu_vector_cost neoversev1_vector_cost =
2112 1, /* scalar_int_stmt_cost */
2113 2, /* scalar_fp_stmt_cost */
2114 4, /* scalar_load_cost */
2115 1, /* scalar_store_cost */
2116 1, /* cond_taken_branch_cost */
2117 1, /* cond_not_taken_branch_cost */
2118 &neoversev1_advsimd_vector_cost, /* advsimd */
2119 &neoversev1_sve_vector_cost, /* sve */
2120 &neoversev1_vec_issue_info /* issue_info */
2123 static const struct tune_params neoversev1_tunings =
2125 &cortexa76_extra_costs,
2126 &neoversev1_addrcost_table,
2127 &neoversev1_regmove_cost,
2128 &neoversev1_vector_cost,
2129 &generic_branch_cost,
2130 &generic_approx_modes,
2131 SVE_256, /* sve_width */
2132 { 4, /* load_int. */
2133 2, /* store_int. */
2134 6, /* load_fp. */
2135 2, /* store_fp. */
2136 6, /* load_pred. */
2137 1 /* store_pred. */
2138 }, /* memmov_cost. */
2139 3, /* issue_rate */
2140 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2141 "32:16", /* function_align. */
2142 "4", /* jump_align. */
2143 "32:16", /* loop_align. */
2144 2, /* int_reassoc_width. */
2145 4, /* fp_reassoc_width. */
2146 4, /* fma_reassoc_width. */
2147 2, /* vec_reassoc_width. */
2148 2, /* min_div_recip_mul_sf. */
2149 2, /* min_div_recip_mul_df. */
2150 0, /* max_case_values. */
2151 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2152 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2153 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2154 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT
2155 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
2156 &generic_prefetch_tune
2159 static const sve_vec_cost neoverse512tvb_sve_vector_cost =
2162 2, /* int_stmt_cost */
2163 2, /* fp_stmt_cost */
2164 4, /* ld2_st2_permute_cost */
2165 5, /* ld3_st3_permute_cost */
2166 5, /* ld4_st4_permute_cost */
2167 3, /* permute_cost */
2168 /* Theoretically, a reduction involving 15 scalar ADDs could
2169 complete in ~5 cycles and would have a cost of 15. Assume that
2170 [SU]ADDV completes in 11 cycles and so give it a cost of 15 + 6. */
2171 21, /* reduc_i8_cost */
2172 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2173 13, /* reduc_i16_cost */
2174 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2175 9, /* reduc_i32_cost */
2176 /* Likewise for 1 scalar ADD (1 cycle) vs. 8: 1 + 7. */
2177 8, /* reduc_i64_cost */
2178 /* Theoretically, a reduction involving 7 scalar FADDs could
2179 complete in ~6 cycles and would have a cost of 14. Assume that
2180 FADDV completes in 8 cycles and so give it a cost of 14 + 2. */
2181 16, /* reduc_f16_cost */
2182 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2183 8, /* reduc_f32_cost */
2184 /* Likewise for 1 scalar FADD (2 cycles) vs. 4: 2 + 2. */
2185 4, /* reduc_f64_cost */
2186 2, /* store_elt_extra_cost */
2187 /* This value is just inherited from the Cortex-A57 table. */
2188 8, /* vec_to_scalar_cost */
2189 /* This depends very much on what the scalar value is and
2190 where it comes from. E.g. some constants take two dependent
2191 instructions or a load, while others might be moved from a GPR.
2192 4 seems to be a reasonable compromise in practice. */
2193 4, /* scalar_to_vec_cost */
2194 4, /* align_load_cost */
2195 4, /* unalign_load_cost */
2196 /* Although stores generally have a latency of 2 and compete for the
2197 vector pipes, in practice it's better not to model that. */
2198 1, /* unalign_store_cost */
2199 1 /* store_cost */
2201 3, /* clast_cost */
2202 10, /* fadda_f16_cost */
2203 6, /* fadda_f32_cost */
2204 4, /* fadda_f64_cost */
2205 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2206 (6 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2207 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2208 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2209 (cost 2) to that, to avoid the difference being lost in rounding.
2211 There is no easy comparison between a strided Advanced SIMD x32 load
2212 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2213 operation more than a 64-bit gather. */
2214 14, /* gather_load_x32_cost */
2215 12, /* gather_load_x64_cost */
2216 3 /* scatter_store_elt_cost */
2219 static const aarch64_sve_vec_issue_info neoverse512tvb_sve_issue_info =
2223 3, /* loads_per_cycle */
2224 2, /* stores_per_cycle */
2225 4, /* general_ops_per_cycle */
2226 0, /* fp_simd_load_general_ops */
2227 1 /* fp_simd_store_general_ops */
2229 2, /* ld2_st2_general_ops */
2230 2, /* ld3_st3_general_ops */
2231 3 /* ld4_st4_general_ops */
2233 2, /* pred_ops_per_cycle */
2234 2, /* while_pred_ops */
2235 2, /* int_cmp_pred_ops */
2236 1, /* fp_cmp_pred_ops */
2237 1, /* gather_scatter_pair_general_ops */
2238 1 /* gather_scatter_pair_pred_ops */
2241 static const aarch64_vec_issue_info neoverse512tvb_vec_issue_info =
2243 &neoversev1_scalar_issue_info,
2244 &neoversev1_advsimd_issue_info,
2245 &neoverse512tvb_sve_issue_info
2248 static const struct cpu_vector_cost neoverse512tvb_vector_cost =
2250 1, /* scalar_int_stmt_cost */
2251 2, /* scalar_fp_stmt_cost */
2252 4, /* scalar_load_cost */
2253 1, /* scalar_store_cost */
2254 1, /* cond_taken_branch_cost */
2255 1, /* cond_not_taken_branch_cost */
2256 &neoversev1_advsimd_vector_cost, /* advsimd */
2257 &neoverse512tvb_sve_vector_cost, /* sve */
2258 &neoverse512tvb_vec_issue_info /* issue_info */
2261 static const struct tune_params neoverse512tvb_tunings =
2263 &cortexa76_extra_costs,
2264 &neoversev1_addrcost_table,
2265 &neoversev1_regmove_cost,
2266 &neoverse512tvb_vector_cost,
2267 &generic_branch_cost,
2268 &generic_approx_modes,
2269 SVE_128 | SVE_256, /* sve_width */
2270 { 4, /* load_int. */
2271 2, /* store_int. */
2272 6, /* load_fp. */
2273 2, /* store_fp. */
2274 6, /* load_pred. */
2275 1 /* store_pred. */
2276 }, /* memmov_cost. */
2277 3, /* issue_rate */
2278 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2279 "32:16", /* function_align. */
2280 "4", /* jump_align. */
2281 "32:16", /* loop_align. */
2282 2, /* int_reassoc_width. */
2283 4, /* fp_reassoc_width. */
2284 4, /* fma_reassoc_width. */
2285 2, /* vec_reassoc_width. */
2286 2, /* min_div_recip_mul_sf. */
2287 2, /* min_div_recip_mul_df. */
2288 0, /* max_case_values. */
2289 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2290 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2291 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2292 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2293 &generic_prefetch_tune
2296 static const advsimd_vec_cost neoversen2_advsimd_vector_cost =
2298 2, /* int_stmt_cost */
2299 2, /* fp_stmt_cost */
2300 2, /* ld2_st2_permute_cost */
2301 2, /* ld3_st3_permute_cost */
2302 3, /* ld4_st4_permute_cost */
2303 3, /* permute_cost */
2304 4, /* reduc_i8_cost */
2305 4, /* reduc_i16_cost */
2306 2, /* reduc_i32_cost */
2307 2, /* reduc_i64_cost */
2308 6, /* reduc_f16_cost */
2309 4, /* reduc_f32_cost */
2310 2, /* reduc_f64_cost */
2311 2, /* store_elt_extra_cost */
2312 /* This value is just inherited from the Cortex-A57 table. */
2313 8, /* vec_to_scalar_cost */
2314 /* This depends very much on what the scalar value is and
2315 where it comes from. E.g. some constants take two dependent
2316 instructions or a load, while others might be moved from a GPR.
2317 4 seems to be a reasonable compromise in practice. */
2318 4, /* scalar_to_vec_cost */
2319 4, /* align_load_cost */
2320 4, /* unalign_load_cost */
2321 /* Although stores have a latency of 2 and compete for the
2322 vector pipes, in practice it's better not to model that. */
2323 1, /* unalign_store_cost */
2324 1 /* store_cost */
2327 static const sve_vec_cost neoversen2_sve_vector_cost =
2330 2, /* int_stmt_cost */
2331 2, /* fp_stmt_cost */
2332 3, /* ld2_st2_permute_cost */
2333 4, /* ld3_st3_permute_cost */
2334 4, /* ld4_st4_permute_cost */
2335 3, /* permute_cost */
2336 /* Theoretically, a reduction involving 15 scalar ADDs could
2337 complete in ~5 cycles and would have a cost of 15. [SU]ADDV
2338 completes in 11 cycles, so give it a cost of 15 + 6. */
2339 21, /* reduc_i8_cost */
2340 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 9: 7 + 6. */
2341 13, /* reduc_i16_cost */
2342 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 6. */
2343 9, /* reduc_i32_cost */
2344 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2345 2, /* reduc_i64_cost */
2346 /* Theoretically, a reduction involving 7 scalar FADDs could
2347 complete in ~8 cycles and would have a cost of 14. FADDV
2348 completes in 6 cycles, so give it a cost of 14 - 2. */
2349 12, /* reduc_f16_cost */
2350 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 4: 6 - 0. */
2351 6, /* reduc_f32_cost */
2352 /* Likewise for 1 scalar FADD (~2 cycles) vs. 2: 2 - 0. */
2353 2, /* reduc_f64_cost */
2354 2, /* store_elt_extra_cost */
2355 /* This value is just inherited from the Cortex-A57 table. */
2356 8, /* vec_to_scalar_cost */
2357 /* See the comment above the Advanced SIMD versions. */
2358 4, /* scalar_to_vec_cost */
2359 4, /* align_load_cost */
2360 4, /* unalign_load_cost */
2361 /* Although stores have a latency of 2 and compete for the
2362 vector pipes, in practice it's better not to model that. */
2363 1, /* unalign_store_cost */
2364 1 /* store_cost */
2366 3, /* clast_cost */
2367 10, /* fadda_f16_cost */
2368 6, /* fadda_f32_cost */
2369 4, /* fadda_f64_cost */
2370 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2371 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2372 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2373 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2374 (cost 2) to that, to avoid the difference being lost in rounding.
2376 There is no easy comparison between a strided Advanced SIMD x32 load
2377 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2378 operation more than a 64-bit gather. */
2379 14, /* gather_load_x32_cost */
2380 12, /* gather_load_x64_cost */
2381 3 /* scatter_store_elt_cost */
2384 static const aarch64_scalar_vec_issue_info neoversen2_scalar_issue_info =
2386 3, /* loads_stores_per_cycle */
2387 2, /* stores_per_cycle */
2388 4, /* general_ops_per_cycle */
2389 0, /* fp_simd_load_general_ops */
2390 1 /* fp_simd_store_general_ops */
2393 static const aarch64_advsimd_vec_issue_info neoversen2_advsimd_issue_info =
2396 3, /* loads_stores_per_cycle */
2397 2, /* stores_per_cycle */
2398 2, /* general_ops_per_cycle */
2399 0, /* fp_simd_load_general_ops */
2400 1 /* fp_simd_store_general_ops */
2402 2, /* ld2_st2_general_ops */
2403 2, /* ld3_st3_general_ops */
2404 3 /* ld4_st4_general_ops */
2407 static const aarch64_sve_vec_issue_info neoversen2_sve_issue_info =
2411 3, /* loads_per_cycle */
2412 2, /* stores_per_cycle */
2413 2, /* general_ops_per_cycle */
2414 0, /* fp_simd_load_general_ops */
2415 1 /* fp_simd_store_general_ops */
2417 2, /* ld2_st2_general_ops */
2418 3, /* ld3_st3_general_ops */
2419 3 /* ld4_st4_general_ops */
2421 2, /* pred_ops_per_cycle */
2422 2, /* while_pred_ops */
2423 2, /* int_cmp_pred_ops */
2424 1, /* fp_cmp_pred_ops */
2425 1, /* gather_scatter_pair_general_ops */
2426 1 /* gather_scatter_pair_pred_ops */
2429 static const aarch64_vec_issue_info neoversen2_vec_issue_info =
2431 &neoversen2_scalar_issue_info,
2432 &neoversen2_advsimd_issue_info,
2433 &neoversen2_sve_issue_info
2436 /* Neoverse N2 costs for vector insn classes. */
2437 static const struct cpu_vector_cost neoversen2_vector_cost =
2439 1, /* scalar_int_stmt_cost */
2440 2, /* scalar_fp_stmt_cost */
2441 4, /* scalar_load_cost */
2442 1, /* scalar_store_cost */
2443 1, /* cond_taken_branch_cost */
2444 1, /* cond_not_taken_branch_cost */
2445 &neoversen2_advsimd_vector_cost, /* advsimd */
2446 &neoversen2_sve_vector_cost, /* sve */
2447 &neoversen2_vec_issue_info /* issue_info */
2450 static const struct tune_params neoversen2_tunings =
2452 &cortexa76_extra_costs,
2453 &neoversen2_addrcost_table,
2454 &neoversen2_regmove_cost,
2455 &neoversen2_vector_cost,
2456 &generic_branch_cost,
2457 &generic_approx_modes,
2458 SVE_128, /* sve_width */
2459 { 4, /* load_int. */
2460 1, /* store_int. */
2461 6, /* load_fp. */
2462 2, /* store_fp. */
2463 6, /* load_pred. */
2464 1 /* store_pred. */
2465 }, /* memmov_cost. */
2466 3, /* issue_rate */
2467 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2468 "32:16", /* function_align. */
2469 "4", /* jump_align. */
2470 "32:16", /* loop_align. */
2471 2, /* int_reassoc_width. */
2472 4, /* fp_reassoc_width. */
2473 1, /* fma_reassoc_width. */
2474 2, /* vec_reassoc_width. */
2475 2, /* min_div_recip_mul_sf. */
2476 2, /* min_div_recip_mul_df. */
2477 0, /* max_case_values. */
2478 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2479 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2480 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2481 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2482 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2483 &generic_prefetch_tune
2486 static const advsimd_vec_cost neoversev2_advsimd_vector_cost =
2488 2, /* int_stmt_cost */
2489 2, /* fp_stmt_cost */
2490 2, /* ld2_st2_permute_cost */
2491 2, /* ld3_st3_permute_cost */
2492 3, /* ld4_st4_permute_cost */
2493 3, /* permute_cost */
2494 4, /* reduc_i8_cost */
2495 4, /* reduc_i16_cost */
2496 2, /* reduc_i32_cost */
2497 2, /* reduc_i64_cost */
2498 6, /* reduc_f16_cost */
2499 3, /* reduc_f32_cost */
2500 2, /* reduc_f64_cost */
2501 2, /* store_elt_extra_cost */
2502 /* This value is just inherited from the Cortex-A57 table. */
2503 8, /* vec_to_scalar_cost */
2504 /* This depends very much on what the scalar value is and
2505 where it comes from. E.g. some constants take two dependent
2506 instructions or a load, while others might be moved from a GPR.
2507 4 seems to be a reasonable compromise in practice. */
2508 4, /* scalar_to_vec_cost */
2509 4, /* align_load_cost */
2510 4, /* unalign_load_cost */
2511 /* Although stores have a latency of 2 and compete for the
2512 vector pipes, in practice it's better not to model that. */
2513 1, /* unalign_store_cost */
2514 1 /* store_cost */
2517 static const sve_vec_cost neoversev2_sve_vector_cost =
2520 2, /* int_stmt_cost */
2521 2, /* fp_stmt_cost */
2522 3, /* ld2_st2_permute_cost */
2523 3, /* ld3_st3_permute_cost */
2524 4, /* ld4_st4_permute_cost */
2525 3, /* permute_cost */
2526 /* Theoretically, a reduction involving 15 scalar ADDs could
2527 complete in ~3 cycles and would have a cost of 15. [SU]ADDV
2528 completes in 11 cycles, so give it a cost of 15 + 8. */
2529 21, /* reduc_i8_cost */
2530 /* Likewise for 7 scalar ADDs (~2 cycles) vs. 9: 7 + 7. */
2531 14, /* reduc_i16_cost */
2532 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 8: 3 + 4. */
2533 7, /* reduc_i32_cost */
2534 /* Likewise for 1 scalar ADD (~1 cycle) vs. 2: 1 + 1. */
2535 2, /* reduc_i64_cost */
2536 /* Theoretically, a reduction involving 7 scalar FADDs could
2537 complete in ~6 cycles and would have a cost of 14. FADDV
2538 completes in 8 cycles, so give it a cost of 14 + 2. */
2539 16, /* reduc_f16_cost */
2540 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 6: 6 + 2. */
2541 8, /* reduc_f32_cost */
2542 /* Likewise for 1 scalar FADD (~2 cycles) vs. 4: 2 + 2. */
2543 4, /* reduc_f64_cost */
2544 2, /* store_elt_extra_cost */
2545 /* This value is just inherited from the Cortex-A57 table. */
2546 8, /* vec_to_scalar_cost */
2547 /* See the comment above the Advanced SIMD versions. */
2548 4, /* scalar_to_vec_cost */
2549 4, /* align_load_cost */
2550 4, /* unalign_load_cost */
2551 /* Although stores have a latency of 2 and compete for the
2552 vector pipes, in practice it's better not to model that. */
2553 1, /* unalign_store_cost */
2554 1 /* store_cost */
2556 3, /* clast_cost */
2557 10, /* fadda_f16_cost */
2558 6, /* fadda_f32_cost */
2559 4, /* fadda_f64_cost */
2560 /* A strided Advanced SIMD x64 load would take two parallel FP loads
2561 (8 cycles) plus an insertion (2 cycles). Assume a 64-bit SVE gather
2562 is 1 cycle more. The Advanced SIMD version is costed as 2 scalar loads
2563 (cost 8) and a vec_construct (cost 2). Add a full vector operation
2564 (cost 2) to that, to avoid the difference being lost in rounding.
2566 There is no easy comparison between a strided Advanced SIMD x32 load
2567 and an SVE 32-bit gather, but cost an SVE 32-bit gather as 1 vector
2568 operation more than a 64-bit gather. */
2569 14, /* gather_load_x32_cost */
2570 12, /* gather_load_x64_cost */
2571 3 /* scatter_store_elt_cost */
2574 static const aarch64_scalar_vec_issue_info neoversev2_scalar_issue_info =
2576 3, /* loads_stores_per_cycle */
2577 2, /* stores_per_cycle */
2578 6, /* general_ops_per_cycle */
2579 0, /* fp_simd_load_general_ops */
2580 1 /* fp_simd_store_general_ops */
2583 static const aarch64_advsimd_vec_issue_info neoversev2_advsimd_issue_info =
2586 3, /* loads_stores_per_cycle */
2587 2, /* stores_per_cycle */
2588 4, /* general_ops_per_cycle */
2589 0, /* fp_simd_load_general_ops */
2590 1 /* fp_simd_store_general_ops */
2592 2, /* ld2_st2_general_ops */
2593 2, /* ld3_st3_general_ops */
2594 3 /* ld4_st4_general_ops */
2597 static const aarch64_sve_vec_issue_info neoversev2_sve_issue_info =
2601 3, /* loads_per_cycle */
2602 2, /* stores_per_cycle */
2603 4, /* general_ops_per_cycle */
2604 0, /* fp_simd_load_general_ops */
2605 1 /* fp_simd_store_general_ops */
2607 2, /* ld2_st2_general_ops */
2608 3, /* ld3_st3_general_ops */
2609 3 /* ld4_st4_general_ops */
2611 2, /* pred_ops_per_cycle */
2612 2, /* while_pred_ops */
2613 2, /* int_cmp_pred_ops */
2614 1, /* fp_cmp_pred_ops */
2615 1, /* gather_scatter_pair_general_ops */
2616 1 /* gather_scatter_pair_pred_ops */
2619 static const aarch64_vec_issue_info neoversev2_vec_issue_info =
2621 &neoversev2_scalar_issue_info,
2622 &neoversev2_advsimd_issue_info,
2623 &neoversev2_sve_issue_info
2626 /* Neoverse V2 (Demeter) costs for vector insn classes. */
2627 static const struct cpu_vector_cost neoversev2_vector_cost =
2629 1, /* scalar_int_stmt_cost */
2630 2, /* scalar_fp_stmt_cost */
2631 4, /* scalar_load_cost */
2632 1, /* scalar_store_cost */
2633 1, /* cond_taken_branch_cost */
2634 1, /* cond_not_taken_branch_cost */
2635 &neoversev2_advsimd_vector_cost, /* advsimd */
2636 &neoversev2_sve_vector_cost, /* sve */
2637 &neoversev2_vec_issue_info /* issue_info */
2640 static const struct tune_params neoversev2_tunings =
2642 &cortexa76_extra_costs,
2643 &neoversev2_addrcost_table,
2644 &neoversev2_regmove_cost,
2645 &neoversev2_vector_cost,
2646 &generic_branch_cost,
2647 &generic_approx_modes,
2648 SVE_128, /* sve_width */
2649 { 4, /* load_int. */
2650 2, /* store_int. */
2651 6, /* load_fp. */
2652 1, /* store_fp. */
2653 6, /* load_pred. */
2654 2 /* store_pred. */
2655 }, /* memmov_cost. */
2656 5, /* issue_rate */
2657 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2658 "32:16", /* function_align. */
2659 "4", /* jump_align. */
2660 "32:16", /* loop_align. */
2661 3, /* int_reassoc_width. */
2662 6, /* fp_reassoc_width. */
2663 4, /* fma_reassoc_width. */
2664 3, /* vec_reassoc_width. */
2665 2, /* min_div_recip_mul_sf. */
2666 2, /* min_div_recip_mul_df. */
2667 0, /* max_case_values. */
2668 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2669 (AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND
2670 | AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
2671 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
2672 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
2673 &generic_prefetch_tune
2676 static const struct tune_params a64fx_tunings =
2678 &a64fx_extra_costs,
2679 &a64fx_addrcost_table,
2680 &a64fx_regmove_cost,
2681 &a64fx_vector_cost,
2682 &generic_branch_cost,
2683 &generic_approx_modes,
2684 SVE_512, /* sve_width */
2685 { 4, /* load_int. */
2686 4, /* store_int. */
2687 4, /* load_fp. */
2688 4, /* store_fp. */
2689 4, /* load_pred. */
2690 4 /* store_pred. */
2691 }, /* memmov_cost. */
2692 7, /* issue_rate */
2693 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
2694 "32", /* function_align. */
2695 "16", /* jump_align. */
2696 "32", /* loop_align. */
2697 4, /* int_reassoc_width. */
2698 2, /* fp_reassoc_width. */
2699 1, /* fma_reassoc_width. */
2700 2, /* vec_reassoc_width. */
2701 2, /* min_div_recip_mul_sf. */
2702 2, /* min_div_recip_mul_df. */
2703 0, /* max_case_values. */
2704 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
2705 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
2706 &a64fx_prefetch_tune
2709 /* Support for fine-grained override of the tuning structures. */
2710 struct aarch64_tuning_override_function
2712 const char* name;
2713 void (*parse_override)(const char*, struct tune_params*);
2716 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
2717 static void aarch64_parse_tune_string (const char*, struct tune_params*);
2718 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
2720 static const struct aarch64_tuning_override_function
2721 aarch64_tuning_override_functions[] =
2723 { "fuse", aarch64_parse_fuse_string },
2724 { "tune", aarch64_parse_tune_string },
2725 { "sve_width", aarch64_parse_sve_width_string },
2726 { NULL, NULL }
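/* Illustrative use of the override table above (the exact -moverride
   syntax is defined by the option-parsing code later in this file, so
   this is a sketch): an option such as -moverride=sve_width=256 is split
   into name=value components, "sve_width" is matched against the table
   and aarch64_parse_sve_width_string is called with "256" and the current
   tune_params, letting developers tweak individual tuning decisions
   without defining a new tuning structure.  */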
2729 /* A processor implementing AArch64. */
2730 struct processor
2732 const char *name;
2733 aarch64_processor ident;
2734 aarch64_processor sched_core;
2735 aarch64_arch arch;
2736 aarch64_feature_flags flags;
2737 const tune_params *tune;
2740 /* Architectures implementing AArch64. */
2741 static CONSTEXPR const processor all_architectures[] =
2743 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, D, E) \
2744 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, \
2745 feature_deps::ARCH_IDENT ().enable, NULL},
2746 #include "aarch64-arches.def"
2747 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2750 /* Processor cores implementing AArch64. */
2751 static const struct processor all_cores[] =
2753 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, E, COSTS, G, H, I) \
2754 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
2755 feature_deps::cpu_##IDENT, &COSTS##_tunings},
2756 #include "aarch64-cores.def"
2757 {"generic", generic, cortexa53, AARCH64_ARCH_V8A,
2758 feature_deps::V8A ().enable, &generic_tunings},
2759 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, NULL}
2762 /* The current tuning set. */
2763 struct tune_params aarch64_tune_params = generic_tunings;
2765 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
2767 static tree
2768 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
2769 int, bool *no_add_attrs)
2771 /* Since we set fn_type_req to true, the caller should have checked
2772 this for us. */
2773 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
2774 switch ((arm_pcs) fntype_abi (*node).id ())
2776 case ARM_PCS_AAPCS64:
2777 case ARM_PCS_SIMD:
2778 return NULL_TREE;
2780 case ARM_PCS_SVE:
2781 error ("the %qE attribute cannot be applied to an SVE function type",
2782 name);
2783 *no_add_attrs = true;
2784 return NULL_TREE;
2786 case ARM_PCS_TLSDESC:
2787 case ARM_PCS_UNKNOWN:
2788 break;
2790 gcc_unreachable ();
2793 /* Table of machine attributes. */
2794 static const struct attribute_spec aarch64_attribute_table[] =
2796 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
2797 affects_type_identity, handler, exclude } */
2798 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
2799 handle_aarch64_vector_pcs_attribute, NULL },
2800 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
2801 aarch64_sve::handle_arm_sve_vector_bits_attribute,
2802 NULL },
2803 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
2804 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
2805 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
2806 { NULL, 0, 0, false, false, false, false, NULL, NULL }
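/* Rough examples of how the user-visible attributes above are spelled in
   source code (illustrative declarations, not taken from this file):

     void f (float *x) __attribute__ ((aarch64_vector_pcs));
     typedef svint32_t vec256 __attribute__ ((arm_sve_vector_bits (256)));

   The entries whose names contain spaces ("Advanced SIMD type", "SVE type",
   "SVE sizeless type") cannot be written by users; they are attached
   internally to the compiler's built-in vector types.  */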
2809 typedef enum aarch64_cond_code
2811 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2812 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2813 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2815 aarch64_cc;
2817 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
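/* The enum above lists the codes in complementary pairs (EQ/NE, CS/CC,
   MI/PL, VS/VC, HI/LS, GE/LT, GT/LE, AL/NV), so inverting a condition is
   just a matter of flipping the low bit.  For example,
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) is AARCH64_LT and
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) is AARCH64_NE.  */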
2820 /* The condition codes of the processor, and the inverse function. */
2821 static const char * const aarch64_condition_codes[] =
2823 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2824 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2827 /* The preferred condition codes for SVE conditions. */
2828 static const char *const aarch64_sve_condition_codes[] =
2830 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2831 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2834 /* Return the assembly token for svpattern value VALUE. */
2836 static const char *
2837 svpattern_token (enum aarch64_svpattern pattern)
2839 switch (pattern)
2841 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2842 AARCH64_FOR_SVPATTERN (CASE)
2843 #undef CASE
2844 case AARCH64_NUM_SVPATTERNS:
2845 break;
2847 gcc_unreachable ();
2850 /* Return the location of a piece that is known to be passed or returned
2851 in registers. FIRST_ZR is the first unused vector argument register
2852 and FIRST_PR is the first unused predicate argument register. */
2855 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2856 unsigned int first_pr) const
2858 gcc_assert (VECTOR_MODE_P (mode)
2859 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2860 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2862 if (num_zr > 0 && num_pr == 0)
2863 return gen_rtx_REG (mode, first_zr);
2865 if (num_zr == 0 && num_pr == 1)
2866 return gen_rtx_REG (mode, first_pr);
2868 gcc_unreachable ();
2871 /* Return the total number of vector registers required by the PST. */
2873 unsigned int
2874 pure_scalable_type_info::num_zr () const
2876 unsigned int res = 0;
2877 for (unsigned int i = 0; i < pieces.length (); ++i)
2878 res += pieces[i].num_zr;
2879 return res;
2882 /* Return the total number of predicate registers required by the PST. */
2884 unsigned int
2885 pure_scalable_type_info::num_pr () const
2887 unsigned int res = 0;
2888 for (unsigned int i = 0; i < pieces.length (); ++i)
2889 res += pieces[i].num_pr;
2890 return res;
2893 /* Return the location of a PST that is known to be passed or returned
2894 in registers. FIRST_ZR is the first unused vector argument register
2895 and FIRST_PR is the first unused predicate argument register. */
2898 pure_scalable_type_info::get_rtx (machine_mode mode,
2899 unsigned int first_zr,
2900 unsigned int first_pr) const
2902 /* Try to return a single REG if possible. This leads to better
2903 code generation; it isn't required for correctness. */
2904 if (mode == pieces[0].mode)
2906 gcc_assert (pieces.length () == 1);
2907 return pieces[0].get_rtx (first_zr, first_pr);
2910 /* Build up a PARALLEL that contains the individual pieces. */
2911 rtvec rtxes = rtvec_alloc (pieces.length ());
2912 for (unsigned int i = 0; i < pieces.length (); ++i)
2914 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
2915 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
2916 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
2917 first_zr += pieces[i].num_zr;
2918 first_pr += pieces[i].num_pr;
2920 return gen_rtx_PARALLEL (mode, rtxes);
2923 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
2924 in the AAPCS64. */
2926 pure_scalable_type_info::analysis_result
2927 pure_scalable_type_info::analyze (const_tree type)
2929 /* Prevent accidental reuse. */
2930 gcc_assert (pieces.is_empty ());
2932 /* No code will be generated for erroneous types, so we won't establish
2933 an ABI mapping. */
2934 if (type == error_mark_node)
2935 return NO_ABI_IDENTITY;
2937 /* Zero-sized types disappear in the language->ABI mapping. */
2938 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2939 return NO_ABI_IDENTITY;
2941 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
2942 piece p = {};
2943 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
2945 machine_mode mode = TYPE_MODE_RAW (type);
2946 gcc_assert (VECTOR_MODE_P (mode)
2947 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
2949 p.mode = p.orig_mode = mode;
2950 add_piece (p);
2951 return IS_PST;
2954 /* Check for user-defined PSTs. */
2955 if (TREE_CODE (type) == ARRAY_TYPE)
2956 return analyze_array (type);
2957 if (TREE_CODE (type) == RECORD_TYPE)
2958 return analyze_record (type);
2960 return ISNT_PST;
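/* Illustrative example of the AAPCS64 rules implemented above (the struct
   is hypothetical, not taken from this file): a type such as

     struct pst { svfloat32_t vec; svbool_t pred; };

   is analyzed as a Pure Scalable Type with one vector piece (num_zr == 1)
   and one predicate piece (num_pr == 1), so when enough registers are
   available it is passed in a Z register and a P register rather than in
   memory.  A structure that also contains, say, an int is not a PST and
   falls back to the normal rules.  */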
2963 /* Analyze a type that is known not to be passed or returned in memory.
2964 Return true if it has an ABI identity and is a Pure Scalable Type. */
2966 bool
2967 pure_scalable_type_info::analyze_registers (const_tree type)
2969 analysis_result result = analyze (type);
2970 gcc_assert (result != DOESNT_MATTER);
2971 return result == IS_PST;
2974 /* Subroutine of analyze for handling ARRAY_TYPEs. */
2976 pure_scalable_type_info::analysis_result
2977 pure_scalable_type_info::analyze_array (const_tree type)
2979 /* Analyze the element type. */
2980 pure_scalable_type_info element_info;
2981 analysis_result result = element_info.analyze (TREE_TYPE (type));
2982 if (result != IS_PST)
2983 return result;
2985 /* An array of unknown, flexible or variable length will be passed and
2986 returned by reference whatever we do. */
2987 tree nelts_minus_one = array_type_nelts (type);
2988 if (!tree_fits_uhwi_p (nelts_minus_one))
2989 return DOESNT_MATTER;
2991 /* Likewise if the array is constant-sized but too big to be interesting.
2992 The double checks against MAX_PIECES are to protect against overflow. */
2993 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
2994 if (count > MAX_PIECES)
2995 return DOESNT_MATTER;
2996 count += 1;
2997 if (count * element_info.pieces.length () > MAX_PIECES)
2998 return DOESNT_MATTER;
3000 /* The above checks should have weeded out elements of unknown size. */
3001 poly_uint64 element_bytes;
3002 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
3003 gcc_unreachable ();
3005 /* Build up the list of individual vectors and predicates. */
3006 gcc_assert (!element_info.pieces.is_empty ());
3007 for (unsigned int i = 0; i < count; ++i)
3008 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
3010 piece p = element_info.pieces[j];
3011 p.offset += i * element_bytes;
3012 add_piece (p);
3014 return IS_PST;
3017 /* Subroutine of analyze for handling RECORD_TYPEs. */
3019 pure_scalable_type_info::analysis_result
3020 pure_scalable_type_info::analyze_record (const_tree type)
3022 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3024 if (TREE_CODE (field) != FIELD_DECL)
3025 continue;
3027 /* Zero-sized fields disappear in the language->ABI mapping. */
3028 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
3029 continue;
3031 /* All fields with an ABI identity must be PSTs for the record as
3032 a whole to be a PST. If any individual field is too big to be
3033 interesting then the record is too. */
3034 pure_scalable_type_info field_info;
3035 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
3036 if (subresult == NO_ABI_IDENTITY)
3037 continue;
3038 if (subresult != IS_PST)
3039 return subresult;
3041 /* Since all previous fields are PSTs, we ought to be able to track
3042 the field offset using poly_ints. */
3043 tree bitpos = bit_position (field);
3044 gcc_assert (poly_int_tree_p (bitpos));
3046 /* For the same reason, it shouldn't be possible to create a PST field
3047 whose offset isn't byte-aligned. */
3048 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
3049 BITS_PER_UNIT);
3051 /* Punt if the record is too big to be interesting. */
3052 poly_uint64 bytepos;
3053 if (!wide_bytepos.to_uhwi (&bytepos)
3054 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
3055 return DOESNT_MATTER;
3057 /* Add the individual vectors and predicates in the field to the
3058 record's list. */
3059 gcc_assert (!field_info.pieces.is_empty ());
3060 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
3062 piece p = field_info.pieces[i];
3063 p.offset += bytepos;
3064 add_piece (p);
3067 /* Empty structures disappear in the language->ABI mapping. */
3068 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
3071 /* Add P to the list of pieces in the type. */
3073 void
3074 pure_scalable_type_info::add_piece (const piece &p)
3076 /* Try to fold the new piece into the previous one to form a
3077 single-mode PST. For example, if we see three consecutive vectors
3078 of the same mode, we can represent them using the corresponding
3079 3-tuple mode.
3081 This is purely an optimization. */
3082 if (!pieces.is_empty ())
3084 piece &prev = pieces.last ();
3085 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
3086 unsigned int nelems1, nelems2;
3087 if (prev.orig_mode == p.orig_mode
3088 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
3089 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
3090 GET_MODE_NUNITS (p.orig_mode), &nelems1)
3091 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
3092 GET_MODE_NUNITS (p.orig_mode), &nelems2)
3093 && targetm.array_mode (p.orig_mode,
3094 nelems1 + nelems2).exists (&prev.mode))
3096 prev.num_zr += p.num_zr;
3097 prev.num_pr += p.num_pr;
3098 return;
3101 pieces.quick_push (p);
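/* Example of the folding above (a hypothetical input, derived from the
   rules rather than from a particular caller): if the previous piece is a
   VNx4SImode vector at offset 0 and the new piece is another VNx4SImode
   vector at the next whole-vector offset, targetm.array_mode gives the
   2-vector tuple mode VNx8SImode, so the two pieces are merged into one
   piece with num_zr == 2 instead of being pushed separately.  */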
3104 /* Return true if at least one possible value of type TYPE includes at
3105 least one object of Pure Scalable Type, in the sense of the AAPCS64.
3107 This is a relatively expensive test for some types, so it should
3108 generally be made as late as possible. */
3110 static bool
3111 aarch64_some_values_include_pst_objects_p (const_tree type)
3113 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
3114 return false;
3116 if (aarch64_sve::builtin_type_p (type))
3117 return true;
3119 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
3120 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
3122 if (RECORD_OR_UNION_TYPE_P (type))
3123 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
3124 if (TREE_CODE (field) == FIELD_DECL
3125 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
3126 return true;
3128 return false;
3131 /* Return the descriptor of the SIMD ABI. */
3133 static const predefined_function_abi &
3134 aarch64_simd_abi (void)
3136 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
3137 if (!simd_abi.initialized_p ())
3139 HARD_REG_SET full_reg_clobbers
3140 = default_function_abi.full_reg_clobbers ();
3141 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
3142 if (FP_SIMD_SAVED_REGNUM_P (regno))
3143 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3144 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
3146 return simd_abi;
3149 /* Return the descriptor of the SVE PCS. */
3151 static const predefined_function_abi &
3152 aarch64_sve_abi (void)
3154 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
3155 if (!sve_abi.initialized_p ())
3157 HARD_REG_SET full_reg_clobbers
3158 = default_function_abi.full_reg_clobbers ();
3159 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
3160 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3161 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
3162 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
3163 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
3165 return sve_abi;
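/* In other words, a call that uses the SVE PCS is modelled as preserving
   Z8-Z23 and P4-P15 in full, and a call that uses the SIMD PCS
   (aarch64_vector_pcs) as preserving V8-V23 in full, whereas the base
   AAPCS64 only guarantees the low 64 bits of V8-V15 across calls.  */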
3168 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
3169 wraps, otherwise return X itself. */
3171 static rtx
3172 strip_salt (rtx x)
3174 rtx search = x;
3175 if (GET_CODE (search) == CONST)
3176 search = XEXP (search, 0);
3177 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
3178 x = XVECEXP (search, 0, 0);
3179 return x;
3182 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
3183 expression. */
3185 static rtx
3186 strip_offset_and_salt (rtx addr, poly_int64 *offset)
3188 return strip_salt (strip_offset (addr, offset));
3191 /* Generate code to enable conditional branches in functions over 1 MiB. */
3192 const char *
3193 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
3194 const char * branch_format)
3196 rtx_code_label * tmp_label = gen_label_rtx ();
3197 char label_buf[256];
3198 char buffer[128];
3199 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
3200 CODE_LABEL_NUMBER (tmp_label));
3201 const char *label_ptr = targetm.strip_name_encoding (label_buf);
3202 rtx dest_label = operands[pos_label];
3203 operands[pos_label] = tmp_label;
3205 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
3206 output_asm_insn (buffer, operands);
3208 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
3209 operands[pos_label] = dest_label;
3210 output_asm_insn (buffer, operands);
3211 return "";
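/* Sketch of the output, assuming the caller passes the inverted form of
   the condition in BRANCH_FORMAT (label names illustrative):

       <inverted short-range branch>  .Lskip_N
       b       <original destination>
     .Lskip_N:

   i.e. the short-range conditional branch skips over an unconditional B,
   whose +-128 MiB range covers the far destination.  */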
3214 void
3215 aarch64_err_no_fpadvsimd (machine_mode mode)
3217 if (TARGET_GENERAL_REGS_ONLY)
3218 if (FLOAT_MODE_P (mode))
3219 error ("%qs is incompatible with the use of floating-point types",
3220 "-mgeneral-regs-only");
3221 else
3222 error ("%qs is incompatible with the use of vector types",
3223 "-mgeneral-regs-only");
3224 else
3225 if (FLOAT_MODE_P (mode))
3226 error ("%qs feature modifier is incompatible with the use of"
3227 " floating-point types", "+nofp");
3228 else
3229 error ("%qs feature modifier is incompatible with the use of"
3230 " vector types", "+nofp");
3233 /* Report when we try to do something that requires SVE when SVE is disabled.
3234 This is an error of last resort and isn't very high-quality. It usually
3235 involves attempts to measure the vector length in some way. */
3236 static void
3237 aarch64_report_sve_required (void)
3239 static bool reported_p = false;
3241 /* Avoid reporting a slew of messages for a single oversight. */
3242 if (reported_p)
3243 return;
3245 error ("this operation requires the SVE ISA extension");
3246 inform (input_location, "you can enable SVE using the command-line"
3247 " option %<-march%>, or by using the %<target%>"
3248 " attribute or pragma");
3249 reported_p = true;
3252 /* Return true if REGNO is P0-P15 or one of the special FFR-related
3253 registers. */
3254 inline bool
3255 pr_or_ffr_regnum_p (unsigned int regno)
3257 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
3260 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
3261 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
3262 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
3263 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
3264 and GENERAL_REGS is lower than the memory cost (in this case the best class
3265 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
3266 cost results in bad allocations with many redundant int<->FP moves which
3267 are expensive on various cores.
3268 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
3269 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
3270 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
3271 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
3272 As a result, it is no longer inefficient to have a memory move cost that
3273 is higher than the register move cost.
3276 static reg_class_t
3277 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
3278 reg_class_t best_class)
3280 machine_mode mode;
3282 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
3283 || !reg_class_subset_p (FP_REGS, allocno_class))
3284 return allocno_class;
3286 if (!reg_class_subset_p (GENERAL_REGS, best_class)
3287 || !reg_class_subset_p (FP_REGS, best_class))
3288 return best_class;
3290 mode = PSEUDO_REGNO_MODE (regno);
3291 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
3294 static unsigned int
3295 aarch64_min_divisions_for_recip_mul (machine_mode mode)
3297 if (GET_MODE_UNIT_SIZE (mode) == 4)
3298 return aarch64_tune_params.min_div_recip_mul_sf;
3299 return aarch64_tune_params.min_div_recip_mul_df;
3302 /* Return the reassociation width of treeop OPC with mode MODE. */
3303 static int
3304 aarch64_reassociation_width (unsigned opc, machine_mode mode)
3306 if (VECTOR_MODE_P (mode))
3307 return aarch64_tune_params.vec_reassoc_width;
3308 if (INTEGRAL_MODE_P (mode))
3309 return aarch64_tune_params.int_reassoc_width;
3310 /* Reassociation reduces the number of FMAs which may result in worse
3311 performance. Use a per-CPU setting for FMA reassociation which allows
3312 narrow CPUs with few FP pipes to switch it off (value of 1), and wider
3313 CPUs with many FP pipes to enable reassociation.
3314 Since the reassociation pass doesn't understand FMA at all, assume
3315 that any FP addition might turn into FMA. */
3316 if (FLOAT_MODE_P (mode))
3317 return opc == PLUS_EXPR ? aarch64_tune_params.fma_reassoc_width
3318 : aarch64_tune_params.fp_reassoc_width;
3319 return 1;
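/* For example, with the neoversev2_tunings above, integer and vector
   modes reassociate with width 3, FP additions (which may become FMAs)
   with width 4 (fma_reassoc_width) and other FP operations with width 6,
   while ampere1's fma_reassoc_width of 1 disables reassociation of FP
   additions on that core entirely.  */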
3322 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
3323 unsigned
3324 aarch64_debugger_regno (unsigned regno)
3326 if (GP_REGNUM_P (regno))
3327 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
3328 else if (regno == SP_REGNUM)
3329 return AARCH64_DWARF_SP;
3330 else if (FP_REGNUM_P (regno))
3331 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
3332 else if (PR_REGNUM_P (regno))
3333 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
3334 else if (regno == VG_REGNUM)
3335 return AARCH64_DWARF_VG;
3337 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
3338 equivalent DWARF register. */
3339 return DWARF_FRAME_REGISTERS;
3342 /* Implement TARGET_DWARF_FRAME_REG_MODE. */
3343 static machine_mode
3344 aarch64_dwarf_frame_reg_mode (int regno)
3346 /* Predicate registers are call-clobbered in the EH ABI (which is
3347 ARM_PCS_AAPCS64), so they should not be described by CFI.
3348 Their size changes as VL changes, so any values computed by
3349 __builtin_init_dwarf_reg_size_table might not be valid for
3350 all frames. */
3351 if (PR_REGNUM_P (regno))
3352 return VOIDmode;
3353 return default_dwarf_frame_reg_mode (regno);
3356 /* If X is a CONST_DOUBLE, return its bit representation as a constant
3357 integer, otherwise return X unmodified. */
3358 static rtx
3359 aarch64_bit_representation (rtx x)
3361 if (CONST_DOUBLE_P (x))
3362 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
3363 return x;
3366 /* Return an estimate for the number of quadwords in an SVE vector. This is
3367 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
3368 static unsigned int
3369 aarch64_estimated_sve_vq ()
3371 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
3374 /* Return true if MODE is an SVE predicate mode. */
3375 static bool
3376 aarch64_sve_pred_mode_p (machine_mode mode)
3378 return (TARGET_SVE
3379 && (mode == VNx16BImode
3380 || mode == VNx8BImode
3381 || mode == VNx4BImode
3382 || mode == VNx2BImode));
3385 /* Three mutually-exclusive flags describing a vector or predicate type. */
3386 const unsigned int VEC_ADVSIMD = 1;
3387 const unsigned int VEC_SVE_DATA = 2;
3388 const unsigned int VEC_SVE_PRED = 4;
3389 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
3390 a structure of 2, 3 or 4 vectors. */
3391 const unsigned int VEC_STRUCT = 8;
3392 /* Can be used in combination with VEC_SVE_DATA to indicate that the
3393 vector has fewer significant bytes than a full SVE vector. */
3394 const unsigned int VEC_PARTIAL = 16;
3395 /* Useful combinations of the above. */
3396 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
3397 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
3399 /* Return a set of flags describing the vector properties of mode MODE.
3400 Ignore modes that are not supported by the current target. */
3401 static unsigned int
3402 aarch64_classify_vector_mode (machine_mode mode)
3404 if (aarch64_sve_pred_mode_p (mode))
3405 return VEC_SVE_PRED;
3407 /* Make the decision based on the mode's enum value rather than its
3408 properties, so that we keep the correct classification regardless
3409 of -msve-vector-bits. */
3410 switch (mode)
3412 /* Partial SVE QI vectors. */
3413 case E_VNx2QImode:
3414 case E_VNx4QImode:
3415 case E_VNx8QImode:
3416 /* Partial SVE HI vectors. */
3417 case E_VNx2HImode:
3418 case E_VNx4HImode:
3419 /* Partial SVE SI vector. */
3420 case E_VNx2SImode:
3421 /* Partial SVE HF vectors. */
3422 case E_VNx2HFmode:
3423 case E_VNx4HFmode:
3424 /* Partial SVE BF vectors. */
3425 case E_VNx2BFmode:
3426 case E_VNx4BFmode:
3427 /* Partial SVE SF vector. */
3428 case E_VNx2SFmode:
3429 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
3431 case E_VNx16QImode:
3432 case E_VNx8HImode:
3433 case E_VNx4SImode:
3434 case E_VNx2DImode:
3435 case E_VNx8BFmode:
3436 case E_VNx8HFmode:
3437 case E_VNx4SFmode:
3438 case E_VNx2DFmode:
3439 return TARGET_SVE ? VEC_SVE_DATA : 0;
3441 /* x2 SVE vectors. */
3442 case E_VNx32QImode:
3443 case E_VNx16HImode:
3444 case E_VNx8SImode:
3445 case E_VNx4DImode:
3446 case E_VNx16BFmode:
3447 case E_VNx16HFmode:
3448 case E_VNx8SFmode:
3449 case E_VNx4DFmode:
3450 /* x3 SVE vectors. */
3451 case E_VNx48QImode:
3452 case E_VNx24HImode:
3453 case E_VNx12SImode:
3454 case E_VNx6DImode:
3455 case E_VNx24BFmode:
3456 case E_VNx24HFmode:
3457 case E_VNx12SFmode:
3458 case E_VNx6DFmode:
3459 /* x4 SVE vectors. */
3460 case E_VNx64QImode:
3461 case E_VNx32HImode:
3462 case E_VNx16SImode:
3463 case E_VNx8DImode:
3464 case E_VNx32BFmode:
3465 case E_VNx32HFmode:
3466 case E_VNx16SFmode:
3467 case E_VNx8DFmode:
3468 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
3470 case E_OImode:
3471 case E_CImode:
3472 case E_XImode:
3473 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3475 /* Structures of 64-bit Advanced SIMD vectors. */
3476 case E_V2x8QImode:
3477 case E_V2x4HImode:
3478 case E_V2x2SImode:
3479 case E_V2x1DImode:
3480 case E_V2x4BFmode:
3481 case E_V2x4HFmode:
3482 case E_V2x2SFmode:
3483 case E_V2x1DFmode:
3484 case E_V3x8QImode:
3485 case E_V3x4HImode:
3486 case E_V3x2SImode:
3487 case E_V3x1DImode:
3488 case E_V3x4BFmode:
3489 case E_V3x4HFmode:
3490 case E_V3x2SFmode:
3491 case E_V3x1DFmode:
3492 case E_V4x8QImode:
3493 case E_V4x4HImode:
3494 case E_V4x2SImode:
3495 case E_V4x1DImode:
3496 case E_V4x4BFmode:
3497 case E_V4x4HFmode:
3498 case E_V4x2SFmode:
3499 case E_V4x1DFmode:
3500 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
3502 /* Structures of 128-bit Advanced SIMD vectors. */
3503 case E_V2x16QImode:
3504 case E_V2x8HImode:
3505 case E_V2x4SImode:
3506 case E_V2x2DImode:
3507 case E_V2x8BFmode:
3508 case E_V2x8HFmode:
3509 case E_V2x4SFmode:
3510 case E_V2x2DFmode:
3511 case E_V3x16QImode:
3512 case E_V3x8HImode:
3513 case E_V3x4SImode:
3514 case E_V3x2DImode:
3515 case E_V3x8BFmode:
3516 case E_V3x8HFmode:
3517 case E_V3x4SFmode:
3518 case E_V3x2DFmode:
3519 case E_V4x16QImode:
3520 case E_V4x8HImode:
3521 case E_V4x4SImode:
3522 case E_V4x2DImode:
3523 case E_V4x8BFmode:
3524 case E_V4x8HFmode:
3525 case E_V4x4SFmode:
3526 case E_V4x2DFmode:
3527 return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
3529 /* 64-bit Advanced SIMD vectors. */
3530 case E_V8QImode:
3531 case E_V4HImode:
3532 case E_V2SImode:
3533 case E_V1DImode:
3534 case E_V4HFmode:
3535 case E_V4BFmode:
3536 case E_V2SFmode:
3537 case E_V1DFmode:
3538 /* 128-bit Advanced SIMD vectors. */
3539 case E_V16QImode:
3540 case E_V8HImode:
3541 case E_V4SImode:
3542 case E_V2DImode:
3543 case E_V8HFmode:
3544 case E_V8BFmode:
3545 case E_V4SFmode:
3546 case E_V2DFmode:
3547 return TARGET_FLOAT ? VEC_ADVSIMD : 0;
3549 default:
3550 return 0;
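/* Worked examples of the classification above, assuming the relevant
   TARGET_FLOAT/TARGET_SVE support is enabled: V4SImode -> VEC_ADVSIMD;
   VNx4SImode -> VEC_SVE_DATA; VNx2SImode -> VEC_SVE_DATA | VEC_PARTIAL
   (32-bit elements stored in 64-bit containers); VNx8SImode ->
   VEC_SVE_DATA | VEC_STRUCT (an x2 tuple); V3x4SImode ->
   VEC_ADVSIMD | VEC_STRUCT.  */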
3554 /* Return true if MODE is any of the Advanced SIMD structure modes. */
3555 bool
3556 aarch64_advsimd_struct_mode_p (machine_mode mode)
3558 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3559 return (vec_flags & VEC_ADVSIMD) && (vec_flags & VEC_STRUCT);
3562 /* Return true if MODE is an Advanced SIMD D-register structure mode. */
3563 static bool
3564 aarch64_advsimd_partial_struct_mode_p (machine_mode mode)
3566 return (aarch64_classify_vector_mode (mode)
3567 == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL));
3570 /* Return true if MODE is an Advanced SIMD Q-register structure mode. */
3571 static bool
3572 aarch64_advsimd_full_struct_mode_p (machine_mode mode)
3574 return (aarch64_classify_vector_mode (mode) == (VEC_ADVSIMD | VEC_STRUCT));
3577 /* Return true if MODE is any of the data vector modes, including
3578 structure modes. */
3579 static bool
3580 aarch64_vector_data_mode_p (machine_mode mode)
3582 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
3585 /* Return true if MODE is any form of SVE mode, including predicates,
3586 vectors and structures. */
3587 bool
3588 aarch64_sve_mode_p (machine_mode mode)
3590 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
3593 /* Return true if MODE is an SVE data vector mode; either a single vector
3594 or a structure of vectors. */
3595 static bool
3596 aarch64_sve_data_mode_p (machine_mode mode)
3598 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
3601 /* Return the number of defined bytes in one constituent vector of
3602 SVE mode MODE, which has vector flags VEC_FLAGS. */
3603 static poly_int64
3604 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
3606 if (vec_flags & VEC_PARTIAL)
3607 /* A single partial vector. */
3608 return GET_MODE_SIZE (mode);
3610 if (vec_flags & VEC_SVE_DATA)
3611 /* A single vector or a tuple. */
3612 return BYTES_PER_SVE_VECTOR;
3614 /* A single predicate. */
3615 gcc_assert (vec_flags & VEC_SVE_PRED);
3616 return BYTES_PER_SVE_PRED;
3619 /* If MODE holds an array of vectors, return the number of vectors
3620 in the array, otherwise return 1. */
3622 static unsigned int
3623 aarch64_ldn_stn_vectors (machine_mode mode)
3625 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3626 if (vec_flags == (VEC_ADVSIMD | VEC_PARTIAL | VEC_STRUCT))
3627 return exact_div (GET_MODE_SIZE (mode), 8).to_constant ();
3628 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
3629 return exact_div (GET_MODE_SIZE (mode), 16).to_constant ();
3630 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
3631 return exact_div (GET_MODE_SIZE (mode),
3632 BYTES_PER_SVE_VECTOR).to_constant ();
3633 return 1;
3636 /* Given an Advanced SIMD vector mode MODE and a tuple size NELEMS, return the
3637 corresponding vector structure mode. */
3638 static opt_machine_mode
3639 aarch64_advsimd_vector_array_mode (machine_mode mode,
3640 unsigned HOST_WIDE_INT nelems)
3642 unsigned int flags = VEC_ADVSIMD | VEC_STRUCT;
3643 if (known_eq (GET_MODE_SIZE (mode), 8))
3644 flags |= VEC_PARTIAL;
3646 machine_mode struct_mode;
3647 FOR_EACH_MODE_IN_CLASS (struct_mode, GET_MODE_CLASS (mode))
3648 if (aarch64_classify_vector_mode (struct_mode) == flags
3649 && GET_MODE_INNER (struct_mode) == GET_MODE_INNER (mode)
3650 && known_eq (GET_MODE_NUNITS (struct_mode),
3651 GET_MODE_NUNITS (mode) * nelems))
3652 return struct_mode;
3653 return opt_machine_mode ();
3656 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
3658 opt_machine_mode
3659 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
3661 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
3662 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
3663 machine_mode mode;
3664 FOR_EACH_MODE_IN_CLASS (mode, mclass)
3665 if (inner_mode == GET_MODE_INNER (mode)
3666 && known_eq (nunits, GET_MODE_NUNITS (mode))
3667 && aarch64_sve_data_mode_p (mode))
3668 return mode;
3669 return opt_machine_mode ();
3672 /* Implement target hook TARGET_ARRAY_MODE. */
3673 static opt_machine_mode
3674 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
3676 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
3677 && IN_RANGE (nelems, 2, 4))
3678 return aarch64_sve_data_mode (GET_MODE_INNER (mode),
3679 GET_MODE_NUNITS (mode) * nelems);
3680 if (aarch64_classify_vector_mode (mode) == VEC_ADVSIMD
3681 && IN_RANGE (nelems, 2, 4))
3682 return aarch64_advsimd_vector_array_mode (mode, nelems);
3684 return opt_machine_mode ();
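/* For example, aarch64_array_mode (VNx4SImode, 3) yields VNx12SImode (the
   SVE x3 tuple mode listed earlier), while aarch64_array_mode (V16QImode, 2)
   goes through aarch64_advsimd_vector_array_mode and yields V2x16QImode.
   Element counts outside the range 2..4 return an empty opt_machine_mode.  */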
3687 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
3688 static bool
3689 aarch64_array_mode_supported_p (machine_mode mode,
3690 unsigned HOST_WIDE_INT nelems)
3692 if (TARGET_SIMD
3693 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
3694 || AARCH64_VALID_SIMD_DREG_MODE (mode))
3695 && (nelems >= 2 && nelems <= 4))
3696 return true;
3698 return false;
3701 /* MODE is some form of SVE vector mode. For data modes, return the number
3702 of vector register bits that each element of MODE occupies, such as 64
3703 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
3704 in a 64-bit container). For predicate modes, return the number of
3705 data bits controlled by each significant predicate bit. */
3707 static unsigned int
3708 aarch64_sve_container_bits (machine_mode mode)
3710 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3711 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
3712 ? BITS_PER_SVE_VECTOR
3713 : GET_MODE_BITSIZE (mode));
3714 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
3717 /* Return the SVE predicate mode to use for elements that have
3718 ELEM_NBYTES bytes, if such a mode exists. */
3720 opt_machine_mode
3721 aarch64_sve_pred_mode (unsigned int elem_nbytes)
3723 if (TARGET_SVE)
3725 if (elem_nbytes == 1)
3726 return VNx16BImode;
3727 if (elem_nbytes == 2)
3728 return VNx8BImode;
3729 if (elem_nbytes == 4)
3730 return VNx4BImode;
3731 if (elem_nbytes == 8)
3732 return VNx2BImode;
3734 return opt_machine_mode ();
3737 /* Return the SVE predicate mode that should be used to control
3738 SVE mode MODE. */
3740 machine_mode
3741 aarch64_sve_pred_mode (machine_mode mode)
3743 unsigned int bits = aarch64_sve_container_bits (mode);
3744 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
3747 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
3749 static opt_machine_mode
3750 aarch64_get_mask_mode (machine_mode mode)
3752 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3753 if (vec_flags & VEC_SVE_DATA)
3754 return aarch64_sve_pred_mode (mode);
3756 return default_get_mask_mode (mode);
3759 /* Return the integer element mode associated with SVE mode MODE. */
3761 static scalar_int_mode
3762 aarch64_sve_element_int_mode (machine_mode mode)
3764 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3765 ? BITS_PER_SVE_VECTOR
3766 : GET_MODE_BITSIZE (mode));
3767 unsigned int elt_bits = vector_element_size (vector_bits,
3768 GET_MODE_NUNITS (mode));
3769 return int_mode_for_size (elt_bits, 0).require ();
3772 /* Return an integer element mode that contains exactly
3773 aarch64_sve_container_bits (MODE) bits. This is wider than
3774 aarch64_sve_element_int_mode if MODE is a partial vector,
3775 otherwise it's the same. */
3777 static scalar_int_mode
3778 aarch64_sve_container_int_mode (machine_mode mode)
3780 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
3783 /* Return the integer vector mode associated with SVE mode MODE.
3784 Unlike related_int_vector_mode, this can handle the case in which
3785 MODE is a predicate (and thus has a different total size). */
3787 machine_mode
3788 aarch64_sve_int_mode (machine_mode mode)
3790 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
3791 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
3794 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
3796 static opt_machine_mode
3797 aarch64_vectorize_related_mode (machine_mode vector_mode,
3798 scalar_mode element_mode,
3799 poly_uint64 nunits)
3801 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
3803 /* If we're operating on SVE vectors, try to return an SVE mode. */
3804 poly_uint64 sve_nunits;
3805 if ((vec_flags & VEC_SVE_DATA)
3806 && multiple_p (BYTES_PER_SVE_VECTOR,
3807 GET_MODE_SIZE (element_mode), &sve_nunits))
3809 machine_mode sve_mode;
3810 if (maybe_ne (nunits, 0U))
3812 /* Try to find a full or partial SVE mode with exactly
3813 NUNITS units. */
3814 if (multiple_p (sve_nunits, nunits)
3815 && aarch64_sve_data_mode (element_mode,
3816 nunits).exists (&sve_mode))
3817 return sve_mode;
3819 else
3821 /* Take the preferred number of units from the number of bytes
3822 that fit in VECTOR_MODE. We always start by "autodetecting"
3823 a full vector mode with preferred_simd_mode, so vectors
3824 chosen here will also be full vector modes. Then
3825 autovectorize_vector_modes tries smaller starting modes
3826 and thus smaller preferred numbers of units. */
3827 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
3828 if (aarch64_sve_data_mode (element_mode,
3829 sve_nunits).exists (&sve_mode))
3830 return sve_mode;
3834 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
3835 if (TARGET_SIMD
3836 && (vec_flags & VEC_ADVSIMD)
3837 && known_eq (nunits, 0U)
3838 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3839 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3840 * GET_MODE_NUNITS (vector_mode), 128U))
3842 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3843 if (VECTOR_MODE_P (res))
3844 return res;
3847 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3850 /* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */
3852 static bool
3853 aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
3855 machine_mode mode = TYPE_MODE (type);
3856 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3857 bool sve_p = (vec_flags & VEC_ANY_SVE);
3858 bool simd_p = (vec_flags & VEC_ADVSIMD);
3860 return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
3863 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3864 prefer to use the first arithmetic operand as the else value if
3865 the else value doesn't matter, since that exactly matches the SVE
3866 destructive merging form. For ternary operations we could either
3867 pick the first operand and use FMAD-like instructions or the last
3868 operand and use FMLA-like instructions; the latter seems more
3869 natural. */
3871 static tree
3872 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3874 return nops == 3 ? ops[2] : ops[0];
3877 /* Implement TARGET_HARD_REGNO_NREGS. */
3879 static unsigned int
3880 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3882 /* ??? Logically we should only need to provide a value when
3883 HARD_REGNO_MODE_OK says that the combination is valid,
3884 but at the moment we need to handle all modes. Just ignore
3885 any runtime parts for registers that can't store them. */
3886 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3887 switch (aarch64_regno_regclass (regno))
3889 case FP_REGS:
3890 case FP_LO_REGS:
3891 case FP_LO8_REGS:
3893 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3894 if (vec_flags & VEC_SVE_DATA)
3895 return exact_div (GET_MODE_SIZE (mode),
3896 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3897 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL))
3898 return GET_MODE_SIZE (mode).to_constant () / 8;
3899 return CEIL (lowest_size, UNITS_PER_VREG);
3901 case PR_REGS:
3902 case PR_LO_REGS:
3903 case PR_HI_REGS:
3904 case FFR_REGS:
3905 case PR_AND_FFR_REGS:
3906 return 1;
3907 default:
3908 return CEIL (lowest_size, UNITS_PER_WORD);
3910 gcc_unreachable ();
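/* For instance, assuming the usual UNITS_PER_WORD of 8 and
   UNITS_PER_VREG of 16: a TImode value occupies two GP registers but
   a single FP register, while an SVE structure of two data vectors
   occupies two FP registers regardless of the runtime vector length.  */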
3913 /* Implement TARGET_HARD_REGNO_MODE_OK. */
3915 static bool
3916 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
3918 if (mode == V8DImode)
3919 return IN_RANGE (regno, R0_REGNUM, R23_REGNUM)
3920 && multiple_p (regno - R0_REGNUM, 2);
3922 if (GET_MODE_CLASS (mode) == MODE_CC)
3923 return regno == CC_REGNUM;
3925 if (regno == VG_REGNUM)
3926 /* This must have the same size as _Unwind_Word. */
3927 return mode == DImode;
3929 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3930 if (vec_flags & VEC_SVE_PRED)
3931 return pr_or_ffr_regnum_p (regno);
3933 if (pr_or_ffr_regnum_p (regno))
3934 return false;
3936 if (regno == SP_REGNUM)
3937 /* The purpose of comparing with ptr_mode is to support the
3938 global register variable associated with the stack pointer
3939 register, declared via asm ("wsp") in ILP32. */
3940 return mode == Pmode || mode == ptr_mode;
3942 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
3943 return mode == Pmode;
3945 if (GP_REGNUM_P (regno))
3947 if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
3948 return false;
3949 if (known_le (GET_MODE_SIZE (mode), 8))
3950 return true;
3951 if (known_le (GET_MODE_SIZE (mode), 16))
3952 return (regno & 1) == 0;
3954 else if (FP_REGNUM_P (regno))
3956 if (vec_flags & VEC_STRUCT)
3957 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
3958 else
3959 return !VECTOR_MODE_P (mode) || vec_flags != 0;
3962 return false;
3965 /* Return true if a function with type FNTYPE returns its value in
3966 SVE vector or predicate registers. */
3968 static bool
3969 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
3971 tree return_type = TREE_TYPE (fntype);
3973 pure_scalable_type_info pst_info;
3974 switch (pst_info.analyze (return_type))
3976 case pure_scalable_type_info::IS_PST:
3977 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
3978 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
3980 case pure_scalable_type_info::DOESNT_MATTER:
3981 gcc_assert (aarch64_return_in_memory_1 (return_type));
3982 return false;
3984 case pure_scalable_type_info::NO_ABI_IDENTITY:
3985 case pure_scalable_type_info::ISNT_PST:
3986 return false;
3988 gcc_unreachable ();
3991 /* Return true if a function with type FNTYPE takes arguments in
3992 SVE vector or predicate registers. */
3994 static bool
3995 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
3997 CUMULATIVE_ARGS args_so_far_v;
3998 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
3999 NULL_TREE, 0, true);
4000 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
4002 for (tree chain = TYPE_ARG_TYPES (fntype);
4003 chain && chain != void_list_node;
4004 chain = TREE_CHAIN (chain))
4006 tree arg_type = TREE_VALUE (chain);
4007 if (arg_type == error_mark_node)
4008 return false;
4010 function_arg_info arg (arg_type, /*named=*/true);
4011 apply_pass_by_reference_rules (&args_so_far_v, arg);
4012 pure_scalable_type_info pst_info;
4013 if (pst_info.analyze_registers (arg.type))
4015 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
4016 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
4017 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
4018 return true;
4021 targetm.calls.function_arg_advance (args_so_far, arg);
4023 return false;
4026 /* Implement TARGET_FNTYPE_ABI. */
4028 static const predefined_function_abi &
4029 aarch64_fntype_abi (const_tree fntype)
4031 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
4032 return aarch64_simd_abi ();
4034 if (aarch64_returns_value_in_sve_regs_p (fntype)
4035 || aarch64_takes_arguments_in_sve_regs_p (fntype))
4036 return aarch64_sve_abi ();
4038 return default_function_abi;
4041 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
4043 static bool
4044 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
4046 return (aarch64_sve::builtin_type_p (type1)
4047 == aarch64_sve::builtin_type_p (type2));
4050 /* Return true if we should emit CFI for register REGNO. */
4052 static bool
4053 aarch64_emit_cfi_for_reg_p (unsigned int regno)
4055 return (GP_REGNUM_P (regno)
4056 || !default_function_abi.clobbers_full_reg_p (regno));
4059 /* Return the mode we should use to save and restore register REGNO. */
4061 static machine_mode
4062 aarch64_reg_save_mode (unsigned int regno)
4064 if (GP_REGNUM_P (regno))
4065 return DImode;
4067 if (FP_REGNUM_P (regno))
4068 switch (crtl->abi->id ())
4070 case ARM_PCS_AAPCS64:
4071 /* Only the low 64 bits are saved by the base PCS. */
4072 return DFmode;
4074 case ARM_PCS_SIMD:
4075 /* The vector PCS saves the low 128 bits (which is the full
4076 register on non-SVE targets). */
4077 return TFmode;
4079 case ARM_PCS_SVE:
4080 /* Use vectors of DImode for registers that need frame
4081 information, so that the first 64 bytes of the save slot
4082 are always the equivalent of what storing D<n> would give. */
4083 if (aarch64_emit_cfi_for_reg_p (regno))
4084 return VNx2DImode;
4086 /* Use vectors of bytes otherwise, so that the layout is
4087 endian-agnostic, and so that we can use LDR and STR for
4088 big-endian targets. */
4089 return VNx16QImode;
4091 case ARM_PCS_TLSDESC:
4092 case ARM_PCS_UNKNOWN:
4093 break;
4096 if (PR_REGNUM_P (regno))
4097 /* Save the full predicate register. */
4098 return VNx16BImode;
4100 gcc_unreachable ();
4103 /* Implement TARGET_INSN_CALLEE_ABI. */
4105 const predefined_function_abi &
4106 aarch64_insn_callee_abi (const rtx_insn *insn)
4108 rtx pat = PATTERN (insn);
4109 gcc_assert (GET_CODE (pat) == PARALLEL);
4110 rtx unspec = XVECEXP (pat, 0, 1);
4111 gcc_assert (GET_CODE (unspec) == UNSPEC
4112 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
4113 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
4116 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
4117 the lower 64 bits of a 128-bit register. Tell the compiler the callee
4118 clobbers the top 64 bits when restoring the bottom 64 bits. */
4120 static bool
4121 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
4122 unsigned int regno,
4123 machine_mode mode)
4125 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
4127 poly_int64 per_register_size = GET_MODE_SIZE (mode);
4128 unsigned int nregs = hard_regno_nregs (regno, mode);
4129 if (nregs > 1)
4130 per_register_size = exact_div (per_register_size, nregs);
4131 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
4132 return maybe_gt (per_register_size, 16);
4133 return maybe_gt (per_register_size, 8);
4135 return false;
4138 /* Implement REGMODE_NATURAL_SIZE. */
4139 poly_uint64
4140 aarch64_regmode_natural_size (machine_mode mode)
4142 /* The natural size for SVE data modes is one SVE data vector,
4143 and similarly for predicates. We can't independently modify
4144 anything smaller than that. */
4145 /* ??? For now, only do this for variable-width SVE registers.
4146 Doing it for constant-sized registers breaks lower-subreg.cc. */
4147 /* ??? And once that's fixed, we should probably have similar
4148 code for Advanced SIMD. */
4149 if (!aarch64_sve_vg.is_constant ())
4151 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
4152 if (vec_flags & VEC_SVE_PRED)
4153 return BYTES_PER_SVE_PRED;
4154 if (vec_flags & VEC_SVE_DATA)
4155 return BYTES_PER_SVE_VECTOR;
4157 return UNITS_PER_WORD;
4160 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
4161 machine_mode
4162 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
4163 machine_mode mode)
4165 /* The predicate mode determines which bits are significant and
4166 which are "don't care". Decreasing the number of lanes would
4167 lose data while increasing the number of lanes would make bits
4168 unnecessarily significant. */
4169 if (PR_REGNUM_P (regno))
4170 return mode;
4171 if (known_ge (GET_MODE_SIZE (mode), 4))
4172 return mode;
4173 else
4174 return SImode;
4177 /* Return true if I's bits are consecutive ones from the MSB. */
4178 bool
4179 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
4181 return exact_log2 (-i) != HOST_WIDE_INT_M1;
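/* For example, (HOST_WIDE_INT) 0xffffffffffff0000 passes this test
   (-i == 0x10000, a power of two), as does -1, whereas
   0x00ff000000000000 and 0 do not.  */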
4184 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
4185 that strcpy from constants will be faster. */
4187 static HOST_WIDE_INT
4188 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
4190 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
4191 return MAX (align, BITS_PER_WORD);
4192 return align;
4195 /* Return true if calls to DECL should be treated as
4196 long-calls (i.e. called via a register). */
4197 static bool
4198 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
4200 return false;
4203 /* Return true if calls to symbol-ref SYM should be treated as
4204 long-calls (i.e. called via a register). */
4205 bool
4206 aarch64_is_long_call_p (rtx sym)
4208 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
4211 /* Return true if calls to symbol-ref SYM should not go through
4212 plt stubs. */
4214 bool
4215 aarch64_is_noplt_call_p (rtx sym)
4217 const_tree decl = SYMBOL_REF_DECL (sym);
4219 if (flag_pic
4220 && decl
4221 && (!flag_plt
4222 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
4223 && !targetm.binds_local_p (decl))
4224 return true;
4226 return false;
4229 /* Emit an insn that's a simple single-set. Both operands must be
4230 known to be valid. */
4231 inline static rtx_insn *
4232 emit_set_insn (rtx x, rtx y)
4234 return emit_insn (gen_rtx_SET (x, y));
4237 /* X and Y are two things to compare using CODE. Emit the compare insn and
4238 return the rtx for register 0 in the proper mode. */
4240 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
4242 machine_mode cmp_mode = GET_MODE (x);
4243 machine_mode cc_mode;
4244 rtx cc_reg;
4246 if (cmp_mode == TImode)
4248 gcc_assert (code == NE);
4250 cc_mode = CCmode;
4251 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4253 rtx x_lo = operand_subword (x, 0, 0, TImode);
4254 rtx y_lo = operand_subword (y, 0, 0, TImode);
4255 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
4257 rtx x_hi = operand_subword (x, 1, 0, TImode);
4258 rtx y_hi = operand_subword (y, 1, 0, TImode);
4259 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
4260 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
4261 GEN_INT (AARCH64_EQ)));
4263 else
4265 cc_mode = SELECT_CC_MODE (code, x, y);
4266 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4267 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
4269 return cc_reg;
4272 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
4274 static rtx
4275 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
4276 machine_mode y_mode)
4278 if (y_mode == E_QImode || y_mode == E_HImode)
4280 if (CONST_INT_P (y))
4282 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
4283 y_mode = SImode;
4285 else
4287 rtx t, cc_reg;
4288 machine_mode cc_mode;
4290 t = gen_rtx_ZERO_EXTEND (SImode, y);
4291 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
4292 cc_mode = CC_SWPmode;
4293 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
4294 emit_set_insn (cc_reg, t);
4295 return cc_reg;
4299 if (!aarch64_plus_operand (y, y_mode))
4300 y = force_reg (y_mode, y);
4302 return aarch64_gen_compare_reg (code, x, y);
4305 /* Consider the operation:
4307 OPERANDS[0] = CODE (OPERANDS[1], OPERANDS[2]) + OPERANDS[3]
4309 where:
4311 - CODE is [SU]MAX or [SU]MIN
4312 - OPERANDS[2] and OPERANDS[3] are constant integers
4313 - OPERANDS[3] is a positive or negative shifted 12-bit immediate
4314 - all operands have mode MODE
4316 Decide whether it is possible to implement the operation using:
4318 SUBS <tmp>, OPERANDS[1], -OPERANDS[3]
4320 ADDS <tmp>, OPERANDS[1], OPERANDS[3]
4322 followed by:
4324 <insn> OPERANDS[0], <tmp>, [wx]zr, <cond>
4326 where <insn> is one of CSEL, CSINV or CSINC. Return true if so.
4327 If GENERATE_P is true, also update OPERANDS as follows:
4329 OPERANDS[4] = -OPERANDS[3]
4330 OPERANDS[5] = the rtl condition representing <cond>
4331 OPERANDS[6] = <tmp>
4332 OPERANDS[7] = 0 for CSEL, -1 for CSINV or 1 for CSINC. */
4333 bool
4334 aarch64_maxmin_plus_const (rtx_code code, rtx *operands, bool generate_p)
4336 signop sgn = (code == UMAX || code == UMIN ? UNSIGNED : SIGNED);
4337 rtx dst = operands[0];
4338 rtx maxmin_op = operands[2];
4339 rtx add_op = operands[3];
4340 machine_mode mode = GET_MODE (dst);
4342 /* max (x, y) - z == (x >= y + 1 ? x : y) - z
4343 == (x >= y ? x : y) - z
4344 == (x > y ? x : y) - z
4345 == (x > y - 1 ? x : y) - z
4347 min (x, y) - z == (x <= y - 1 ? x : y) - z
4348 == (x <= y ? x : y) - z
4349 == (x < y ? x : y) - z
4350 == (x < y + 1 ? x : y) - z
4352 Check whether z is in { y - 1, y, y + 1 } and pick the form(s) for
4353 which x is compared with z. Set DIFF to y - z. Thus the supported
4354 combinations are as follows, with DIFF being the value after the ":":
4356 max (x, y) - z == x >= y + 1 ? x - (y + 1) : -1 [z == y + 1]
4357 == x >= y ? x - y : 0 [z == y]
4358 == x > y ? x - y : 0 [z == y]
4359 == x > y - 1 ? x - (y - 1) : 1 [z == y - 1]
4361 min (x, y) - z == x <= y - 1 ? x - (y - 1) : 1 [z == y - 1]
4362 == x <= y ? x - y : 0 [z == y]
4363 == x < y ? x - y : 0 [z == y]
4364 == x < y + 1 ? x - (y + 1) : -1 [z == y + 1]. */
4365 auto maxmin_val = rtx_mode_t (maxmin_op, mode);
4366 auto add_val = rtx_mode_t (add_op, mode);
4367 auto sub_val = wi::neg (add_val);
4368 auto diff = wi::sub (maxmin_val, sub_val);
4369 if (!(diff == 0
4370 || (diff == 1 && wi::gt_p (maxmin_val, sub_val, sgn))
4371 || (diff == -1 && wi::lt_p (maxmin_val, sub_val, sgn))))
4372 return false;
4374 if (!generate_p)
4375 return true;
4377 rtx_code cmp;
4378 switch (code)
4380 case SMAX:
4381 cmp = diff == 1 ? GT : GE;
4382 break;
4383 case UMAX:
4384 cmp = diff == 1 ? GTU : GEU;
4385 break;
4386 case SMIN:
4387 cmp = diff == -1 ? LT : LE;
4388 break;
4389 case UMIN:
4390 cmp = diff == -1 ? LTU : LEU;
4391 break;
4392 default:
4393 gcc_unreachable ();
4395 rtx cc = gen_rtx_REG (CCmode, CC_REGNUM);
4397 operands[4] = immed_wide_int_const (sub_val, mode);
4398 operands[5] = gen_rtx_fmt_ee (cmp, VOIDmode, cc, const0_rtx);
4399 if (can_create_pseudo_p ())
4400 operands[6] = gen_reg_rtx (mode);
4401 else
4402 operands[6] = dst;
4403 operands[7] = immed_wide_int_const (diff, mode);
4405 return true;
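/* A minimal worked example of the transformation above, with register
   names chosen arbitrarily: smax (x1, 4) - 4 has DIFF == 0 and can be
   emitted as
     subs x0, x1, #4
     csel x0, x0, xzr, ge
   while smax (x1, 4) - 3 has DIFF == 1 and would use GT with CSINC
   instead.  */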
4409 /* Build the SYMBOL_REF for __tls_get_addr. */
4411 static GTY(()) rtx tls_get_addr_libfunc;
4414 aarch64_tls_get_addr (void)
4416 if (!tls_get_addr_libfunc)
4417 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
4418 return tls_get_addr_libfunc;
4421 /* Return the TLS model to use for ADDR. */
4423 static enum tls_model
4424 tls_symbolic_operand_type (rtx addr)
4426 enum tls_model tls_kind = TLS_MODEL_NONE;
4427 poly_int64 offset;
4428 addr = strip_offset_and_salt (addr, &offset);
4429 if (SYMBOL_REF_P (addr))
4430 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
4432 return tls_kind;
4435 /* We allow lo_sum's in addresses as part of our legitimate addresses,
4436 so that combine can take care of combining addresses where
4437 necessary, but for generation purposes we generate the address
4438 as:
4439 RTL Absolute
4440 tmp = hi (symbol_ref); adrp x1, foo
4441 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo12:foo
4444 PIC TLS
4445 adrp x1, :got:foo adrp tmp, :tlsgd:foo
4446 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
4447 bl __tls_get_addr
4450 Load TLS symbol, depending on TLS mechanism and TLS access model.
4452 Global Dynamic - Traditional TLS:
4453 adrp tmp, :tlsgd:imm
4454 add dest, tmp, #:tlsgd_lo12:imm
4455 bl __tls_get_addr
4457 Global Dynamic - TLS Descriptors:
4458 adrp dest, :tlsdesc:imm
4459 ldr tmp, [dest, #:tlsdesc_lo12:imm]
4460 add dest, dest, #:tlsdesc_lo12:imm
4461 blr tmp
4462 mrs tp, tpidr_el0
4463 add dest, dest, tp
4465 Initial Exec:
4466 mrs tp, tpidr_el0
4467 adrp tmp, :gottprel:imm
4468 ldr dest, [tmp, #:gottprel_lo12:imm]
4469 add dest, dest, tp
4471 Local Exec:
4472 mrs tp, tpidr_el0
4473 add t0, tp, #:tprel_hi12:imm, lsl #12
4474 add t0, t0, #:tprel_lo12_nc:imm
4477 static void
4478 aarch64_load_symref_appropriately (rtx dest, rtx imm,
4479 enum aarch64_symbol_type type)
4481 switch (type)
4483 case SYMBOL_SMALL_ABSOLUTE:
4485 /* In ILP32, the mode of dest can be either SImode or DImode. */
4486 rtx tmp_reg = dest;
4487 machine_mode mode = GET_MODE (dest);
4489 gcc_assert (mode == Pmode || mode == ptr_mode);
4491 if (can_create_pseudo_p ())
4492 tmp_reg = gen_reg_rtx (mode);
4494 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, copy_rtx (imm)));
4495 emit_insn (gen_add_losym (dest, tmp_reg, imm));
4496 return;
4499 case SYMBOL_TINY_ABSOLUTE:
4500 emit_insn (gen_rtx_SET (dest, imm));
4501 return;
4503 case SYMBOL_SMALL_GOT_28K:
4505 machine_mode mode = GET_MODE (dest);
4506 rtx gp_rtx = pic_offset_table_rtx;
4507 rtx insn;
4508 rtx mem;
4510 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
4511 here before RTL expansion. Tree IVOPTS generates RTL patterns to
4512 decide rtx costs, in which case pic_offset_table_rtx is not
4513 initialized. In that case there is no need to generate the first
4514 adrp instruction, as the final cost for a global variable access
4515 is one instruction. */
4516 if (gp_rtx != NULL)
4518 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
4519 use the page base as the GOT base, the first page may be wasted;
4520 in the worst case there is only 28K of space for the GOT).
4522 The instruction sequence generated for accessing a global variable is:
4525 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
4527 Only one instruction is needed. But we must initialize
4528 pic_offset_table_rtx properly. We generate an initialization insn
4529 for every global access, and rely on CSE to remove all redundant ones.
4531 The final instruction sequence will look like the following
4532 for multiple global variable accesses.
4534 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
4536 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
4537 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
4538 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
4539 ... */
4541 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
4542 crtl->uses_pic_offset_table = 1;
4543 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
4545 if (mode != GET_MODE (gp_rtx))
4546 gp_rtx = gen_lowpart (mode, gp_rtx);
4550 if (mode == ptr_mode)
4552 if (mode == DImode)
4553 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
4554 else
4555 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
4557 mem = XVECEXP (SET_SRC (insn), 0, 0);
4559 else
4561 gcc_assert (mode == Pmode);
4563 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
4564 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
4567 /* The operand is expected to be a MEM. Whenever the related insn
4568 pattern changes, the code above that calculates MEM should be
4569 updated. */
4570 gcc_assert (MEM_P (mem));
4571 MEM_READONLY_P (mem) = 1;
4572 MEM_NOTRAP_P (mem) = 1;
4573 emit_insn (insn);
4574 return;
4577 case SYMBOL_SMALL_GOT_4G:
4578 emit_insn (gen_rtx_SET (dest, imm));
4579 return;
4581 case SYMBOL_SMALL_TLSGD:
4583 rtx_insn *insns;
4584 /* The return type of __tls_get_addr is the C pointer type
4585 so use ptr_mode. */
4586 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
4587 rtx tmp_reg = dest;
4589 if (GET_MODE (dest) != ptr_mode)
4590 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
4592 start_sequence ();
4593 if (ptr_mode == SImode)
4594 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
4595 else
4596 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
4597 insns = get_insns ();
4598 end_sequence ();
4600 RTL_CONST_CALL_P (insns) = 1;
4601 emit_libcall_block (insns, tmp_reg, result, imm);
4602 /* Convert back to the mode of the dest by adding a zero_extend
4603 from SImode (ptr_mode) to DImode (Pmode). */
4604 if (dest != tmp_reg)
4605 convert_move (dest, tmp_reg, true);
4606 return;
4609 case SYMBOL_SMALL_TLSDESC:
4611 machine_mode mode = GET_MODE (dest);
4612 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
4613 rtx tp;
4615 gcc_assert (mode == Pmode || mode == ptr_mode);
4617 /* In ILP32, the GOT entry is always of SImode size. Unlike
4618 the small GOT case, the dest is fixed at reg 0. */
4619 if (TARGET_ILP32)
4620 emit_insn (gen_tlsdesc_small_si (imm));
4621 else
4622 emit_insn (gen_tlsdesc_small_di (imm));
4623 tp = aarch64_load_tp (NULL);
4625 if (mode != Pmode)
4626 tp = gen_lowpart (mode, tp);
4628 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
4629 if (REG_P (dest))
4630 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4631 return;
4634 case SYMBOL_SMALL_TLSIE:
4636 /* In ILP32, the mode of dest can be either SImode or DImode,
4637 while the GOT entry is always of SImode size. The mode of
4638 dest depends on how dest is used: if dest is assigned to a
4639 pointer (e.g. in memory), it has SImode; it may have
4640 DImode if dest is dereferenced to access memory.
4641 This is why we have to handle three different tlsie_small
4642 patterns here (two patterns for ILP32). */
4643 machine_mode mode = GET_MODE (dest);
4644 rtx tmp_reg = gen_reg_rtx (mode);
4645 rtx tp = aarch64_load_tp (NULL);
4647 if (mode == ptr_mode)
4649 if (mode == DImode)
4650 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
4651 else
4653 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
4654 tp = gen_lowpart (mode, tp);
4657 else
4659 gcc_assert (mode == Pmode);
4660 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
4663 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
4664 if (REG_P (dest))
4665 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4666 return;
4669 case SYMBOL_TLSLE12:
4670 case SYMBOL_TLSLE24:
4671 case SYMBOL_TLSLE32:
4672 case SYMBOL_TLSLE48:
4674 machine_mode mode = GET_MODE (dest);
4675 rtx tp = aarch64_load_tp (NULL);
4677 if (mode != Pmode)
4678 tp = gen_lowpart (mode, tp);
4680 switch (type)
4682 case SYMBOL_TLSLE12:
4683 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
4684 (dest, tp, imm));
4685 break;
4686 case SYMBOL_TLSLE24:
4687 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
4688 (dest, tp, imm));
4689 break;
4690 case SYMBOL_TLSLE32:
4691 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
4692 (dest, imm));
4693 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4694 (dest, dest, tp));
4695 break;
4696 case SYMBOL_TLSLE48:
4697 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
4698 (dest, imm));
4699 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
4700 (dest, dest, tp));
4701 break;
4702 default:
4703 gcc_unreachable ();
4706 if (REG_P (dest))
4707 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4708 return;
4711 case SYMBOL_TINY_GOT:
4713 rtx insn;
4714 machine_mode mode = GET_MODE (dest);
4716 if (mode == ptr_mode)
4717 insn = gen_ldr_got_tiny (mode, dest, imm);
4718 else
4720 gcc_assert (mode == Pmode);
4721 insn = gen_ldr_got_tiny_sidi (dest, imm);
4724 emit_insn (insn);
4725 return;
4728 case SYMBOL_TINY_TLSIE:
4730 machine_mode mode = GET_MODE (dest);
4731 rtx tp = aarch64_load_tp (NULL);
4733 if (mode == ptr_mode)
4735 if (mode == DImode)
4736 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
4737 else
4739 tp = gen_lowpart (mode, tp);
4740 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
4743 else
4745 gcc_assert (mode == Pmode);
4746 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
4749 if (REG_P (dest))
4750 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
4751 return;
4754 default:
4755 gcc_unreachable ();
4759 /* Emit a move from SRC to DEST. Assume that the move expanders can
4760 handle all moves if !can_create_pseudo_p (). The distinction is
4761 important because, unlike emit_move_insn, the move expanders know
4762 how to force Pmode objects into the constant pool even when the
4763 constant pool address is not itself legitimate. */
4764 static rtx
4765 aarch64_emit_move (rtx dest, rtx src)
4767 return (can_create_pseudo_p ()
4768 ? emit_move_insn (dest, src)
4769 : emit_move_insn_1 (dest, src));
4772 /* Apply UNOPTAB to OP and store the result in DEST. */
4774 static void
4775 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
4777 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
4778 if (dest != tmp)
4779 emit_move_insn (dest, tmp);
4782 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
4784 static void
4785 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
4787 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
4788 OPTAB_DIRECT);
4789 if (dest != tmp)
4790 emit_move_insn (dest, tmp);
4793 /* Split a 128-bit move operation into two 64-bit move operations,
4794 taking care to handle partial overlap of register to register
4795 copies. Special cases are needed when moving between GP regs and
4796 FP regs. SRC can be a register, constant or memory; DST a register
4797 or memory. If either operand is memory it must not have any side
4798 effects. */
4799 void
4800 aarch64_split_128bit_move (rtx dst, rtx src)
4802 rtx dst_lo, dst_hi;
4803 rtx src_lo, src_hi;
4805 machine_mode mode = GET_MODE (dst);
4807 gcc_assert (mode == TImode || mode == TFmode || mode == TDmode);
4808 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
4809 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
4811 if (REG_P (dst) && REG_P (src))
4813 int src_regno = REGNO (src);
4814 int dst_regno = REGNO (dst);
4816 /* Handle FP <-> GP regs. */
4817 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
4819 src_lo = gen_lowpart (word_mode, src);
4820 src_hi = gen_highpart (word_mode, src);
4822 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
4823 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
4824 return;
4826 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
4828 dst_lo = gen_lowpart (word_mode, dst);
4829 dst_hi = gen_highpart (word_mode, dst);
4831 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
4832 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
4833 return;
4837 dst_lo = gen_lowpart (word_mode, dst);
4838 dst_hi = gen_highpart (word_mode, dst);
4839 src_lo = gen_lowpart (word_mode, src);
4840 src_hi = gen_highpart_mode (word_mode, mode, src);
4842 /* At most one pairing may overlap. */
4843 if (reg_overlap_mentioned_p (dst_lo, src_hi))
4845 aarch64_emit_move (dst_hi, src_hi);
4846 aarch64_emit_move (dst_lo, src_lo);
4848 else
4850 aarch64_emit_move (dst_lo, src_lo);
4851 aarch64_emit_move (dst_hi, src_hi);
4855 /* Return true if we should split a move from 128-bit value SRC
4856 to 128-bit register DEST. */
4858 bool
4859 aarch64_split_128bit_move_p (rtx dst, rtx src)
4861 if (FP_REGNUM_P (REGNO (dst)))
4862 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
4863 /* All moves to GPRs need to be split. */
4864 return true;
4867 /* Split a complex SIMD move. */
4869 void
4870 aarch64_split_simd_move (rtx dst, rtx src)
4872 machine_mode src_mode = GET_MODE (src);
4873 machine_mode dst_mode = GET_MODE (dst);
4875 gcc_assert (VECTOR_MODE_P (dst_mode));
4877 if (REG_P (dst) && REG_P (src))
4879 gcc_assert (VECTOR_MODE_P (src_mode));
4880 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
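/* Return true if X (of mode XMODE) is equal to the zero-extension of
   Y from mode YMODE.  */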
4884 bool
4885 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
4886 machine_mode ymode, rtx y)
4888 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
4889 gcc_assert (r != NULL);
4890 return rtx_equal_p (x, r);
4893 /* Return TARGET if it is nonnull and a register of mode MODE.
4894 Otherwise, return a fresh register of mode MODE if we can,
4895 or TARGET reinterpreted as MODE if we can't. */
4897 static rtx
4898 aarch64_target_reg (rtx target, machine_mode mode)
4900 if (target && REG_P (target) && GET_MODE (target) == mode)
4901 return target;
4902 if (!can_create_pseudo_p ())
4904 gcc_assert (target);
4905 return gen_lowpart (mode, target);
4907 return gen_reg_rtx (mode);
4910 /* Return a register that contains the constant in BUILDER, given that
4911 the constant is a legitimate move operand. Use TARGET as the register
4912 if it is nonnull and convenient. */
4914 static rtx
4915 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
4917 rtx src = builder.build ();
4918 target = aarch64_target_reg (target, GET_MODE (src));
4919 emit_insn (gen_rtx_SET (target, src));
4920 return target;
4923 static rtx
4924 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
4926 if (can_create_pseudo_p ())
4927 return force_reg (mode, value);
4928 else
4930 gcc_assert (x);
4931 aarch64_emit_move (x, value);
4932 return x;
4936 /* Return true if predicate value X is a constant in which every element
4937 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
4938 value, i.e. as a predicate in which all bits are significant. */
4940 static bool
4941 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
4943 if (!CONST_VECTOR_P (x))
4944 return false;
4946 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
4947 GET_MODE_NUNITS (GET_MODE (x)));
4948 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
4949 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
4950 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
4952 unsigned int nelts = const_vector_encoded_nelts (x);
4953 for (unsigned int i = 0; i < nelts; ++i)
4955 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
4956 if (!CONST_INT_P (elt))
4957 return false;
4959 builder.quick_push (elt);
4960 for (unsigned int j = 1; j < factor; ++j)
4961 builder.quick_push (const0_rtx);
4963 builder.finalize ();
4964 return true;
4967 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
4968 widest predicate element size it can have (that is, the largest size
4969 for which each element would still be 0 or 1). */
4971 unsigned int
4972 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
4974 /* Start with the most optimistic assumption: that we only need
4975 one bit per pattern. This is what we will use if only the first
4976 bit in each pattern is ever set. */
4977 unsigned int mask = GET_MODE_SIZE (DImode);
4978 mask |= builder.npatterns ();
4980 /* Look for set bits. */
4981 unsigned int nelts = builder.encoded_nelts ();
4982 for (unsigned int i = 1; i < nelts; ++i)
4983 if (INTVAL (builder.elt (i)) != 0)
4985 if (i & 1)
4986 return 1;
4987 mask |= i;
4989 return mask & -mask;
4992 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
4993 return that predicate mode, otherwise return opt_machine_mode (). */
4995 opt_machine_mode
4996 aarch64_ptrue_all_mode (rtx x)
4998 gcc_assert (GET_MODE (x) == VNx16BImode);
4999 if (!CONST_VECTOR_P (x)
5000 || !CONST_VECTOR_DUPLICATE_P (x)
5001 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
5002 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
5003 return opt_machine_mode ();
5005 unsigned int nelts = const_vector_encoded_nelts (x);
5006 for (unsigned int i = 1; i < nelts; ++i)
5007 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
5008 return opt_machine_mode ();
5010 return aarch64_sve_pred_mode (nelts);
5013 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
5014 that the constant would have with predicate element size ELT_SIZE
5015 (ignoring the upper bits in each element) and return:
5017 * -1 if all bits are set
5018 * N if the predicate has N leading set bits followed by all clear bits
5019 * 0 if the predicate does not have any of these forms. */
5022 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
5023 unsigned int elt_size)
5025 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
5026 followed by set bits. */
5027 if (builder.nelts_per_pattern () == 3)
5028 return 0;
5030 /* Skip over leading set bits. */
5031 unsigned int nelts = builder.encoded_nelts ();
5032 unsigned int i = 0;
5033 for (; i < nelts; i += elt_size)
5034 if (INTVAL (builder.elt (i)) == 0)
5035 break;
5036 unsigned int vl = i / elt_size;
5038 /* Check for the all-true case. */
5039 if (i == nelts)
5040 return -1;
5042 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
5043 repeating pattern of set bits followed by clear bits. */
5044 if (builder.nelts_per_pattern () != 2)
5045 return 0;
5047 /* We have a "foreground" value and a duplicated "background" value.
5048 If the background might repeat and the last set bit belongs to it,
5049 we might have set bits followed by clear bits followed by set bits. */
5050 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
5051 return 0;
5053 /* Make sure that the rest are all clear. */
5054 for (; i < nelts; i += elt_size)
5055 if (INTVAL (builder.elt (i)) != 0)
5056 return 0;
5058 return vl;
5061 /* See if there is an svpattern that encodes an SVE predicate of mode
5062 PRED_MODE in which the first VL bits are set and the rest are clear.
5063 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
5064 A VL of -1 indicates an all-true vector. */
5066 aarch64_svpattern
5067 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
5069 if (vl < 0)
5070 return AARCH64_SV_ALL;
5072 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
5073 return AARCH64_NUM_SVPATTERNS;
5075 if (vl >= 1 && vl <= 8)
5076 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
5078 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
5079 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
5081 int max_vl;
5082 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
5084 if (vl == (max_vl / 3) * 3)
5085 return AARCH64_SV_MUL3;
5086 /* These would only trigger for non-power-of-2 lengths. */
5087 if (vl == (max_vl & -4))
5088 return AARCH64_SV_MUL4;
5089 if (vl == (1 << floor_log2 (max_vl)))
5090 return AARCH64_SV_POW2;
5091 if (vl == max_vl)
5092 return AARCH64_SV_ALL;
5094 return AARCH64_NUM_SVPATTERNS;
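/* Some illustrative mappings, assuming VL does not exceed the number
   of elements in PRED_MODE: -1 gives AARCH64_SV_ALL, 1..8 map to
   AARCH64_SV_VL1..AARCH64_SV_VL8, and a power of two such as 64 maps
   to AARCH64_SV_VL64.  Other lengths are only matched when the number
   of elements is known at compile time (the MUL3, MUL4, POW2 and ALL
   cases).  */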
5097 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
5098 bits has the lowest bit set and the upper bits clear. This is the
5099 VNx16BImode equivalent of a PTRUE for controlling elements of
5100 ELT_SIZE bytes. However, because the constant is VNx16BImode,
5101 all bits are significant, even the upper zeros. */
5104 aarch64_ptrue_all (unsigned int elt_size)
5106 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
5107 builder.quick_push (const1_rtx);
5108 for (unsigned int i = 1; i < elt_size; ++i)
5109 builder.quick_push (const0_rtx);
5110 return builder.build ();
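/* For example, aarch64_ptrue_all (2) builds the VNx16BI constant
   { 1, 0, 1, 0, ... }, the all-bits-significant image of a PTRUE for
   2-byte (.H) elements.  */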
5113 /* Return an all-true predicate register of mode MODE. */
5116 aarch64_ptrue_reg (machine_mode mode)
5118 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5119 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
5120 return gen_lowpart (mode, reg);
5123 /* Return an all-false predicate register of mode MODE. */
5126 aarch64_pfalse_reg (machine_mode mode)
5128 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
5129 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
5130 return gen_lowpart (mode, reg);
5133 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
5134 for it. PRED2[0] is the predicate for the instruction whose result
5135 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
5136 for it. Return true if we can prove that the two predicates are
5137 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
5138 with PRED1[0] without changing behavior. */
5140 bool
5141 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
5143 machine_mode mode = GET_MODE (pred1[0]);
5144 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
5145 && mode == GET_MODE (pred2[0])
5146 && aarch64_sve_ptrue_flag (pred1[1], SImode)
5147 && aarch64_sve_ptrue_flag (pred2[1], SImode));
5149 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
5150 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
5151 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
5152 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
5153 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
5156 /* Emit a comparison CMP between OP1 and OP2, both of which have mode
5157 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
5158 Use TARGET as the target register if nonnull and convenient. */
5160 static rtx
5161 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
5162 machine_mode data_mode, rtx op1, rtx op2)
5164 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
5165 expand_operand ops[5];
5166 create_output_operand (&ops[0], target, pred_mode);
5167 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
5168 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
5169 create_input_operand (&ops[3], op1, data_mode);
5170 create_input_operand (&ops[4], op2, data_mode);
5171 expand_insn (icode, 5, ops);
5172 return ops[0].value;
5175 /* Use a comparison to convert integer vector SRC into MODE, which is
5176 the corresponding SVE predicate mode. Use TARGET for the result
5177 if it's nonnull and convenient. */
5180 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
5182 machine_mode src_mode = GET_MODE (src);
5183 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
5184 src, CONST0_RTX (src_mode));
5187 /* Return the assembly token for svprfop value PRFOP. */
5189 static const char *
5190 svprfop_token (enum aarch64_svprfop prfop)
5192 switch (prfop)
5194 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
5195 AARCH64_FOR_SVPRFOP (CASE)
5196 #undef CASE
5197 case AARCH64_NUM_SVPRFOPS:
5198 break;
5200 gcc_unreachable ();
5203 /* Return the assembly string for an SVE prefetch operation with
5204 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
5205 and that SUFFIX is the format for the remaining operands. */
5207 char *
5208 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
5209 const char *suffix)
5211 static char buffer[128];
5212 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
5213 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
5214 mnemonic, svprfop_token (prfop), suffix);
5215 gcc_assert (written < sizeof (buffer));
5216 return buffer;
5219 /* Check whether we can calculate the number of elements in PATTERN
5220 at compile time, given that there are NELTS_PER_VQ elements per
5221 128-bit block. Return the value if so, otherwise return -1. */
5223 HOST_WIDE_INT
5224 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
5226 unsigned int vl, const_vg;
5227 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
5228 vl = 1 + (pattern - AARCH64_SV_VL1);
5229 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
5230 vl = 16 << (pattern - AARCH64_SV_VL16);
5231 else if (aarch64_sve_vg.is_constant (&const_vg))
5233 /* There are two vector granules per quadword. */
5234 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
5235 switch (pattern)
5237 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
5238 case AARCH64_SV_MUL4: return nelts & -4;
5239 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
5240 case AARCH64_SV_ALL: return nelts;
5241 default: gcc_unreachable ();
5244 else
5245 return -1;
5247 /* There are two vector granules per quadword. */
5248 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
5249 if (known_le (vl, nelts_all))
5250 return vl;
5252 /* Requesting more elements than are available results in a PFALSE. */
5253 if (known_gt (vl, nelts_all))
5254 return 0;
5256 return -1;
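/* A couple of illustrative cases, not exhaustive: with a variable
   vector length, AARCH64_SV_VL4 with four elements per quadword folds
   to 4, since at least one 128-bit quadword is always available,
   whereas AARCH64_SV_MUL3 only folds when the vector length is fixed,
   e.g. to 6 for 256-bit vectors with four elements per quadword.  */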
5259 /* Return true if we can move VALUE into a register using a single
5260 CNT[BHWD] instruction. */
5262 static bool
5263 aarch64_sve_cnt_immediate_p (poly_int64 value)
5265 HOST_WIDE_INT factor = value.coeffs[0];
5266 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
5267 return (value.coeffs[1] == factor
5268 && IN_RANGE (factor, 2, 16 * 16)
5269 && (factor & 1) == 0
5270 && factor <= 16 * (factor & -factor));
5273 /* Likewise for rtx X. */
5275 bool
5276 aarch64_sve_cnt_immediate_p (rtx x)
5278 poly_int64 value;
5279 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
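/* As an illustration, assuming the usual encoding in which both
   coefficients give the element count per 128-bit quadword: (2, 2)
   corresponds to CNTD, (8, 8) to CNTH and (32, 32) to CNTB with
   MUL #2, whereas odd values and values above (256, 256), the
   equivalent of CNTB with MUL #16, are rejected.  */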
5282 /* Return the asm string for an instruction with a CNT-like vector size
5283 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5284 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5285 first part of the operands template (the part that comes before the
5286 vector size itself). PATTERN is the pattern to use. FACTOR is the
5287 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
5288 in each quadword. If it is zero, we can use any element size. */
5290 static char *
5291 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5292 aarch64_svpattern pattern,
5293 unsigned int factor,
5294 unsigned int nelts_per_vq)
5296 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
5298 if (nelts_per_vq == 0)
5299 /* There is some overlap in the ranges of the four CNT instructions.
5300 Here we always use the smallest possible element size, so that the
5301 multiplier is 1 wherever possible. */
5302 nelts_per_vq = factor & -factor;
5303 int shift = std::min (exact_log2 (nelts_per_vq), 4);
5304 gcc_assert (IN_RANGE (shift, 1, 4));
5305 char suffix = "dwhb"[shift - 1];
5307 factor >>= shift;
5308 unsigned int written;
5309 if (pattern == AARCH64_SV_ALL && factor == 1)
5310 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
5311 prefix, suffix, operands);
5312 else if (factor == 1)
5313 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
5314 prefix, suffix, operands, svpattern_token (pattern));
5315 else
5316 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
5317 prefix, suffix, operands, svpattern_token (pattern),
5318 factor);
5319 gcc_assert (written < sizeof (buffer));
5320 return buffer;
5323 /* Return the asm string for an instruction with a CNT-like vector size
5324 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5325 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5326 first part of the operands template (the part that comes before the
5327 vector size itself). X is the value of the vector size operand,
5328 as a polynomial integer rtx; we need to convert this into an "all"
5329 pattern with a multiplier. */
5331 char *
5332 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
5333 rtx x)
5335 poly_int64 value = rtx_to_poly_int64 (x);
5336 gcc_assert (aarch64_sve_cnt_immediate_p (value));
5337 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
5338 value.coeffs[1], 0);
5341 /* Return the asm string for an instruction with a CNT-like vector size
5342 operand (a vector pattern followed by a multiplier in the range [1, 16]).
5343 PREFIX is the mnemonic without the size suffix and OPERANDS is the
5344 first part of the operands template (the part that comes before the
5345 vector size itself). CNT_PAT[0..2] are the operands of the
5346 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
5348 char *
5349 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
5350 const char *operands, rtx *cnt_pat)
5352 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
5353 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
5354 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
5355 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
5356 factor, nelts_per_vq);
5359 /* Return true if we can add X using a single SVE INC or DEC instruction. */
5361 bool
5362 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
5364 poly_int64 value;
5365 return (poly_int_rtx_p (x, &value)
5366 && (aarch64_sve_cnt_immediate_p (value)
5367 || aarch64_sve_cnt_immediate_p (-value)));
5370 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
5371 operand 0. */
5373 char *
5374 aarch64_output_sve_scalar_inc_dec (rtx offset)
5376 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5377 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
5378 if (offset_value.coeffs[1] > 0)
5379 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
5380 offset_value.coeffs[1], 0);
5381 else
5382 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
5383 -offset_value.coeffs[1], 0);
5386 /* Return true if we can add VALUE to a register using a single ADDVL
5387 or ADDPL instruction. */
5389 static bool
5390 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
5392 HOST_WIDE_INT factor = value.coeffs[0];
5393 if (factor == 0 || value.coeffs[1] != factor)
5394 return false;
5395 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
5396 and a value of 16 is one vector width. */
5397 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
5398 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
5401 /* Likewise for rtx X. */
5403 bool
5404 aarch64_sve_addvl_addpl_immediate_p (rtx x)
5406 poly_int64 value;
5407 return (poly_int_rtx_p (x, &value)
5408 && aarch64_sve_addvl_addpl_immediate_p (value));
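/* In other words, the offset must be a multiple of 16 in the range of
   -32 to 31 vector lengths (ADDVL) or an even value in the range of
   -32 to 31 predicate lengths (ADDPL); e.g. a factor of 48 becomes
   ADDVL #3 and a factor of -6 becomes ADDPL #-3.  */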
5411 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
5412 to operand 1 and storing the result in operand 0. */
5414 char *
5415 aarch64_output_sve_addvl_addpl (rtx offset)
5417 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
5418 poly_int64 offset_value = rtx_to_poly_int64 (offset);
5419 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
5421 int factor = offset_value.coeffs[1];
5422 if ((factor & 15) == 0)
5423 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
5424 else
5425 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
5426 return buffer;
5429 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5430 instruction. If it is, store the number of elements in each vector
5431 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
5432 factor in *FACTOR_OUT (if nonnull). */
5434 bool
5435 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
5436 unsigned int *nelts_per_vq_out)
5438 rtx elt;
5439 poly_int64 value;
5441 if (!const_vec_duplicate_p (x, &elt)
5442 || !poly_int_rtx_p (elt, &value))
5443 return false;
5445 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
5446 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
5447 /* There's no vector INCB. */
5448 return false;
5450 HOST_WIDE_INT factor = value.coeffs[0];
5451 if (value.coeffs[1] != factor)
5452 return false;
5454 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
5455 if ((factor % nelts_per_vq) != 0
5456 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
5457 return false;
5459 if (factor_out)
5460 *factor_out = factor;
5461 if (nelts_per_vq_out)
5462 *nelts_per_vq_out = nelts_per_vq;
5463 return true;
5466 /* Return true if X is a valid immediate for an SVE vector INC or DEC
5467 instruction. */
5469 bool
5470 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
5472 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
5475 /* Return the asm template for an SVE vector INC or DEC instruction.
5476 OPERANDS gives the operands before the vector count and X is the
5477 value of the vector count operand itself. */
5479 char *
5480 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
5482 int factor;
5483 unsigned int nelts_per_vq;
5484 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
5485 gcc_unreachable ();
5486 if (factor < 0)
5487 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
5488 -factor, nelts_per_vq);
5489 else
5490 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
5491 factor, nelts_per_vq);
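/* For example, a duplicated (8, 8) added to a vector of 16-bit
   elements prints as "inch" plus the operands, subtracting it prints
   as "dech", and a duplicated (16, 16) with the same element size
   appends ", all, mul #2".  */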
5494 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5496 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5498 0x0000000100000001ull,
5499 0x0001000100010001ull,
5500 0x0101010101010101ull,
5501 0x1111111111111111ull,
5502 0x5555555555555555ull,
5507 /* Return true if 64-bit VAL is a valid bitmask immediate. */
5508 static bool
5509 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val)
5511 unsigned HOST_WIDE_INT tmp, mask, first_one, next_one;
5512 int bits;
5514 /* Check for a single sequence of one bits and return quickly if so.
5515 The special cases of all ones and all zeroes return false. */
5516 tmp = val + (val & -val);
5518 if (tmp == (tmp & -tmp))
5519 return (val + 1) > 1;
5521 /* Invert if the immediate doesn't start with a zero bit - this means we
5522 only need to search for sequences of one bits. */
5523 if (val & 1)
5524 val = ~val;
5526 /* Find the first set bit and set tmp to val with the first sequence of one
5527 bits removed. Return success if there is a single sequence of ones. */
5528 first_one = val & -val;
5529 tmp = val & (val + first_one);
5531 if (tmp == 0)
5532 return true;
5534 /* Find the next set bit and compute the difference in bit position. */
5535 next_one = tmp & -tmp;
5536 bits = clz_hwi (first_one) - clz_hwi (next_one);
5537 mask = val ^ tmp;
5539 /* Check the bit position difference is a power of 2, and that the first
5540 sequence of one bits fits within 'bits' bits. */
5541 if ((mask >> bits) != 0 || bits != (bits & -bits))
5542 return false;
5544 /* Check the sequence of one bits is repeated 64/bits times. */
5545 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
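/* Some sample values for the test above: 0x5555555555555555 (a 1-bit
   run repeated every 2 bits), 0x0000ffff0000ffff (a 16-bit run
   repeated every 32 bits) and 0x0003fffffffc0000 (a single run of
   ones) are all valid bitmask immediates, whereas 0, ~0 and
   0x0000000012345678 are not.  */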
5549 /* Return true if VAL is a valid bitmask immediate for MODE. */
5550 bool
5551 aarch64_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5553 if (mode == DImode)
5554 return aarch64_bitmask_imm (val);
5556 if (mode == SImode)
5557 return aarch64_bitmask_imm ((val & 0xffffffff) | (val << 32));
5559 /* Replicate small immediates to fit 64 bits. */
5560 int size = GET_MODE_UNIT_PRECISION (mode);
5561 val &= (HOST_WIDE_INT_1U << size) - 1;
5562 val *= bitmask_imm_mul[__builtin_clz (size) - 26];
5564 return aarch64_bitmask_imm (val);
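/* Illustrative example: an SImode VAL of 0x0000ffff is widened to
   0x0000ffff0000ffff before the 64-bit check, which accepts it (16 ones
   per 32-bit element).  For a narrower mode such as HImode, a value like
   0x00f0 is replicated via bitmask_imm_mul[__builtin_clz (16) - 26] into
   0x00f000f000f000f0 and then checked in the same way.  */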
5568 /* Return true if the immediate VAL can be a bitmask immediate
5569 by changing the given MASK bits in VAL to zeroes, ones or bits
5570 from the other half of VAL. Return the new immediate in VAL2. */
5571 static inline bool
5572 aarch64_check_bitmask (unsigned HOST_WIDE_INT val,
5573 unsigned HOST_WIDE_INT &val2,
5574 unsigned HOST_WIDE_INT mask)
5576 val2 = val & ~mask;
5577 if (val2 != val && aarch64_bitmask_imm (val2))
5578 return true;
5579 val2 = val | mask;
5580 if (val2 != val && aarch64_bitmask_imm (val2))
5581 return true;
5582 val = val & ~mask;
5583 val2 = val | (((val >> 32) | (val << 32)) & mask);
5584 if (val2 != val && aarch64_bitmask_imm (val2))
5585 return true;
5586 val2 = val | (((val >> 16) | (val << 48)) & mask);
5587 if (val2 != val && aarch64_bitmask_imm (val2))
5588 return true;
5589 return false;
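/* Illustrative example: for VAL = 0x1234ffffffffffff and MASK = 0xffff << 48,
   clearing the masked bits gives VAL2 = 0x0000ffffffffffff, a valid bitmask
   immediate (48 consecutive ones), so the caller can materialize VAL with a
   bitmask MOV of VAL2 followed by MOVK #0x1234, LSL #48.  */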
5593 /* Return true if VAL is a valid MOVZ immediate. */
5594 static inline bool
5595 aarch64_is_movz (unsigned HOST_WIDE_INT val)
5597 return (val >> (ctz_hwi (val) & 48)) < 65536;
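/* Illustrative examples: 0x0000000000350000 is a MOVZ immediate, since
   ctz_hwi is 16 and the value shifted right by 16 is 0x35 < 65536
   (MOVZ Xn, #0x35, LSL #16).  0x0000000000350001 is not, because its
   nonzero low 16-bit chunk forces a shift of 0 and the value is too big.  */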
5601 /* Return true if immediate VAL can be created by a 64-bit MOVZ, MOVN or bitmask MOV. */
5602 bool
5603 aarch64_is_mov_xn_imm (unsigned HOST_WIDE_INT val)
5605 return aarch64_is_movz (val) || aarch64_is_movz (~val)
5606 || aarch64_bitmask_imm (val);
5610 /* Return true if VAL is an immediate that can be created by a single
5611 MOV instruction. */
5612 bool
5613 aarch64_move_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5615 gcc_assert (mode == SImode || mode == DImode);
5617 if (val < 65536)
5618 return true;
5620 unsigned HOST_WIDE_INT mask =
5621 (val >> 32) == 0 || mode == SImode ? 0xffffffff : HOST_WIDE_INT_M1U;
5623 if (aarch64_is_movz (val & mask) || aarch64_is_movz (~val & mask))
5624 return true;
5626 val = (val & mask) | ((val << 32) & ~mask);
5627 return aarch64_bitmask_imm (val);
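/* Illustrative DImode examples: 0xffff0000ffff0000 is a single instruction,
   since it is a valid bitmask immediate (the ORR form of MOV), whereas
   0x0000000000012345 is neither a MOVZ, MOVN nor bitmask immediate and so
   needs at least two instructions (e.g. MOVZ #0x2345; MOVK #0x1, LSL #16),
   making the function return false for it.  */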
5631 static int
5632 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
5633 machine_mode mode)
5635 int i;
5636 unsigned HOST_WIDE_INT val, val2, mask;
5637 int one_match, zero_match;
5638 int num_insns;
5640 gcc_assert (mode == SImode || mode == DImode);
5642 val = INTVAL (imm);
5644 if (aarch64_move_imm (val, mode))
5646 if (generate)
5647 emit_insn (gen_rtx_SET (dest, imm));
5648 return 1;
5651 if ((val >> 32) == 0 || mode == SImode)
5653 if (generate)
5655 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
5656 if (mode == SImode)
5657 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
5658 GEN_INT ((val >> 16) & 0xffff)));
5659 else
5660 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
5661 GEN_INT ((val >> 16) & 0xffff)));
5663 return 2;
5666 /* Remaining cases are all for DImode. */
5668 mask = 0xffff;
5669 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
5670 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
5671 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
5672 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
5674 /* Try a bitmask immediate and a movk to generate the immediate
5675 in 2 instructions. */
5677 if (zero_match < 2 && one_match < 2)
5679 for (i = 0; i < 64; i += 16)
5681 if (aarch64_check_bitmask (val, val2, mask << i))
5682 break;
5684 val2 = val & ~(mask << i);
5685 if ((val2 >> 32) == 0 && aarch64_move_imm (val2, DImode))
5686 break;
5689 if (i != 64)
5691 if (generate)
5693 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5694 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5695 GEN_INT ((val >> i) & 0xffff)));
5697 return 2;
5701 /* Try a bitmask plus 2 movk to generate the immediate in 3 instructions. */
5702 if (zero_match + one_match == 0)
5704 for (i = 0; i < 48; i += 16)
5705 for (int j = i + 16; j < 64; j += 16)
5706 if (aarch64_check_bitmask (val, val2, (mask << i) | (mask << j)))
5708 if (generate)
5710 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
5711 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5712 GEN_INT ((val >> i) & 0xffff)));
5713 emit_insn (gen_insv_immdi (dest, GEN_INT (j),
5714 GEN_INT ((val >> j) & 0xffff)));
5716 return 3;
5720 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
5721 are emitted by the initial mov. If one_match > zero_match, skip set bits,
5722 otherwise skip zero bits. */
5724 num_insns = 1;
5725 mask = 0xffff;
5726 val2 = one_match > zero_match ? ~val : val;
5727 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
5729 if (generate)
5730 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
5731 ? (val | ~(mask << i))
5732 : (val & (mask << i)))));
5733 for (i += 16; i < 64; i += 16)
5735 if ((val2 & (mask << i)) == 0)
5736 continue;
5737 if (generate)
5738 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
5739 GEN_INT ((val >> i) & 0xffff)));
5740 num_insns ++;
5743 return num_insns;
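/* Illustrative example: for SImode IMM = 0x12345678, which is not a
   single-instruction immediate, the code above emits a move of #0x5678
   followed by a 16-bit insert of #0x1234 at position 16
   (MOV w0, #0x5678; MOVK w0, #0x1234, LSL #16) and returns 2.  */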
5746 /* Return whether imm is a 128-bit immediate which is simple enough to
5747 expand inline. */
5748 bool
5749 aarch64_mov128_immediate (rtx imm)
5751 if (CONST_INT_P (imm))
5752 return true;
5754 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
5756 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
5757 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
5759 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
5760 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
5764 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5765 a left shift of 0 or 12 bits. */
5766 bool
5767 aarch64_uimm12_shift (unsigned HOST_WIDE_INT val)
5769 return val < 4096 || (val & 0xfff000) == val;
5772 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
5773 that can be created with a left shift of 0 or 12. */
5774 static HOST_WIDE_INT
5775 aarch64_clamp_to_uimm12_shift (unsigned HOST_WIDE_INT val)
5777 /* Check to see if the value fits in 24 bits, as that is the maximum we can
5778 handle correctly. */
5779 gcc_assert (val < 0x1000000);
5781 if (val < 4096)
5782 return val;
5784 return val & 0xfff000;
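/* Illustrative examples: 0xabc (shift 0) and 0x123000 (shift 12) are valid
   uimm12 immediates, whereas 0x1001f is not, since its nonzero bits do not
   fit a single 12-bit field at either shift; aarch64_clamp_to_uimm12_shift
   rounds it down to 0x10000.  */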
5788 /* Test whether:
5790 X = (X & AND_VAL) | IOR_VAL;
5792 can be implemented using:
5794 MOVK X, #(IOR_VAL >> shift), LSL #shift
5796 Return the shift if so, otherwise return -1. */
5797 int
5798 aarch64_movk_shift (const wide_int_ref &and_val,
5799 const wide_int_ref &ior_val)
5801 unsigned int precision = and_val.get_precision ();
5802 unsigned HOST_WIDE_INT mask = 0xffff;
5803 for (unsigned int shift = 0; shift < precision; shift += 16)
5805 if (and_val == ~mask && (ior_val & mask) == ior_val)
5806 return shift;
5807 mask <<= 16;
5809 return -1;
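/* Illustrative example: with 64-bit operands, AND_VAL = 0xffffffff0000ffff
   and IOR_VAL = 0x0000000012340000 match at shift 16, since AND_VAL is the
   complement of 0xffff << 16 and IOR_VAL lies entirely within that field,
   so the combined operation is a single MOVK Xn, #0x1234, LSL #16.  */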
5812 /* Create a mask of ones covering the range from the lowest set bit
5813 to the highest set bit of VAL_IN. Assumed precondition: VAL_IN is not zero. */
5815 unsigned HOST_WIDE_INT
5816 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5818 int lowest_bit_set = ctz_hwi (val_in);
5819 int highest_bit_set = floor_log2 (val_in);
5820 gcc_assert (val_in != 0);
5822 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5823 (HOST_WIDE_INT_1U << lowest_bit_set));
5826 /* Create a constant in which the bits outside the range from the lowest
5827 set bit to the highest set bit of VAL_IN are set to 1. */
5829 unsigned HOST_WIDE_INT
5830 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5832 return val_in | ~aarch64_and_split_imm1 (val_in);
5835 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5837 bool
5838 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5840 scalar_int_mode int_mode;
5841 if (!is_a <scalar_int_mode> (mode, &int_mode))
5842 return false;
5844 if (aarch64_bitmask_imm (val_in, int_mode))
5845 return false;
5847 if (aarch64_move_imm (val_in, int_mode))
5848 return false;
5850 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5852 return aarch64_bitmask_imm (imm2, int_mode);
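/* Illustrative example: VAL_IN = 0x0000ff0000ff0000 is neither a bitmask
   nor a MOV immediate, but imm1 = 0x0000ffffffff0000 (ones from bit 16 to
   bit 47) and imm2 = 0xffffff0000ffffff are both bitmask immediates and
   imm1 & imm2 equals VAL_IN, so a single AND with VAL_IN can be split into
   two ANDs with imm1 and imm2.  */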
5855 /* Return the number of temporary registers that aarch64_add_offset_1
5856 would need to add OFFSET to a register. */
5858 static unsigned int
5859 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
5861 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
5864 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
5865 a non-polynomial OFFSET. MODE is the mode of the addition.
5866 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5867 be set and CFA adjustments added to the generated instructions.
5869 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5870 temporary if register allocation is already complete. This temporary
5871 register may overlap DEST but must not overlap SRC. If TEMP1 is known
5872 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
5873 the immediate again.
5875 Since this function may be used to adjust the stack pointer, we must
5876 ensure that it cannot cause transient stack deallocation (for example
5877 by first incrementing SP and then decrementing when adjusting by a
5878 large immediate). */
5880 static void
5881 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
5882 rtx src, HOST_WIDE_INT offset, rtx temp1,
5883 bool frame_related_p, bool emit_move_imm)
5885 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
5886 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
5888 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
5889 rtx_insn *insn;
5891 if (!moffset)
5893 if (!rtx_equal_p (dest, src))
5895 insn = emit_insn (gen_rtx_SET (dest, src));
5896 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5898 return;
5901 /* Single instruction adjustment. */
5902 if (aarch64_uimm12_shift (moffset))
5904 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
5905 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5906 return;
5909 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
5910 and either:
5912 a) the offset cannot be loaded by a 16-bit move or
5913 b) there is no spare register into which we can move it. */
5914 if (moffset < 0x1000000
5915 && ((!temp1 && !can_create_pseudo_p ())
5916 || !aarch64_move_imm (moffset, mode)))
5918 HOST_WIDE_INT low_off = moffset & 0xfff;
5920 low_off = offset < 0 ? -low_off : low_off;
5921 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
5922 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5923 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
5924 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5925 return;
5928 /* Emit a move immediate if required and an addition/subtraction. */
5929 if (emit_move_imm)
5931 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
5932 temp1 = aarch64_force_temporary (mode, temp1,
5933 gen_int_mode (moffset, mode));
5935 insn = emit_insn (offset < 0
5936 ? gen_sub3_insn (dest, src, temp1)
5937 : gen_add3_insn (dest, src, temp1));
5938 if (frame_related_p)
5940 RTX_FRAME_RELATED_P (insn) = frame_related_p;
5941 rtx adj = plus_constant (mode, src, offset);
5942 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
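/* Illustrative example: OFFSET = 0x123456 is below 1 << 24 but is not a
   single MOV immediate, so the addition is split into two immediate
   additions: ADD dest, src, #0x456 followed by ADD dest, dest, #0x123000,
   both of which satisfy aarch64_uimm12_shift.  */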
5946 /* Return the number of temporary registers that aarch64_add_offset
5947 would need to move OFFSET into a register or add OFFSET to a register;
5948 ADD_P is true if we want the latter rather than the former. */
5950 static unsigned int
5951 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
5953 /* This follows the same structure as aarch64_add_offset. */
5954 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
5955 return 0;
5957 unsigned int count = 0;
5958 HOST_WIDE_INT factor = offset.coeffs[1];
5959 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
5960 poly_int64 poly_offset (factor, factor);
5961 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
5962 /* Need one register for the ADDVL/ADDPL result. */
5963 count += 1;
5964 else if (factor != 0)
5966 factor = abs (factor);
5967 if (factor > 16 * (factor & -factor))
5968 /* Need one register for the CNT result and one for the multiplication
5969 factor. If necessary, the second temporary can be reused for the
5970 constant part of the offset. */
5971 return 2;
5972 /* Need one register for the CNT result (which might then
5973 be shifted). */
5974 count += 1;
5976 return count + aarch64_add_offset_1_temporaries (constant);
5979 /* If X can be represented as a poly_int64, return the number
5980 of temporaries that are required to add it to a register.
5981 Return -1 otherwise. */
5983 int
5984 aarch64_add_offset_temporaries (rtx x)
5986 poly_int64 offset;
5987 if (!poly_int_rtx_p (x, &offset))
5988 return -1;
5989 return aarch64_offset_temporaries (true, offset);
5992 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
5993 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
5994 be set and CFA adjustments added to the generated instructions.
5996 TEMP1, if nonnull, is a register of mode MODE that can be used as a
5997 temporary if register allocation is already complete. This temporary
5998 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
5999 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
6000 false to avoid emitting the immediate again.
6002 TEMP2, if nonnull, is a second temporary register that doesn't
6003 overlap either DEST or SRC.
6005 Since this function may be used to adjust the stack pointer, we must
6006 ensure that it cannot cause transient stack deallocation (for example
6007 by first incrementing SP and then decrementing when adjusting by a
6008 large immediate). */
6010 static void
6011 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6012 poly_int64 offset, rtx temp1, rtx temp2,
6013 bool frame_related_p, bool emit_move_imm = true)
6015 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
6016 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
6017 gcc_assert (temp1 == NULL_RTX
6018 || !frame_related_p
6019 || !reg_overlap_mentioned_p (temp1, dest));
6020 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
6022 /* Try using ADDVL or ADDPL to add the whole value. */
6023 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
6025 rtx offset_rtx = gen_int_mode (offset, mode);
6026 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6027 RTX_FRAME_RELATED_P (insn) = frame_related_p;
6028 return;
6031 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
6032 SVE vector register, over and above the minimum size of 128 bits.
6033 This is equivalent to half the value returned by CNTD with a
6034 vector shape of ALL. */
6035 HOST_WIDE_INT factor = offset.coeffs[1];
6036 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
6038 /* Try using ADDVL or ADDPL to add the VG-based part. */
6039 poly_int64 poly_offset (factor, factor);
6040 if (src != const0_rtx
6041 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
6043 rtx offset_rtx = gen_int_mode (poly_offset, mode);
6044 if (frame_related_p)
6046 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
6047 RTX_FRAME_RELATED_P (insn) = true;
6048 src = dest;
6050 else
6052 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
6053 src = aarch64_force_temporary (mode, temp1, addr);
6054 temp1 = temp2;
6055 temp2 = NULL_RTX;
6058 /* Otherwise use a CNT-based sequence. */
6059 else if (factor != 0)
6061 /* Use a subtraction if we have a negative factor. */
6062 rtx_code code = PLUS;
6063 if (factor < 0)
6065 factor = -factor;
6066 code = MINUS;
6069 /* Calculate CNTD * FACTOR / 2. First try to fold the division
6070 into the multiplication. */
6071 rtx val;
6072 int shift = 0;
6073 if (factor & 1)
6074 /* Use a right shift by 1. */
6075 shift = -1;
6076 else
6077 factor /= 2;
6078 HOST_WIDE_INT low_bit = factor & -factor;
6079 if (factor <= 16 * low_bit)
6081 if (factor > 16 * 8)
6083 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
6084 the value with the minimum multiplier and shift it into
6085 position. */
6086 int extra_shift = exact_log2 (low_bit);
6087 shift += extra_shift;
6088 factor >>= extra_shift;
6090 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
6092 else
6094 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
6095 directly, since that should increase the chances of being
6096 able to use a shift and add sequence. If LOW_BIT itself
6097 is out of range, just use CNTD. */
6098 if (low_bit <= 16 * 8)
6099 factor /= low_bit;
6100 else
6101 low_bit = 1;
6103 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
6104 val = aarch64_force_temporary (mode, temp1, val);
6106 if (can_create_pseudo_p ())
6108 rtx coeff1 = gen_int_mode (factor, mode);
6109 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
6111 else
6113 /* Go back to using a negative multiplication factor if we have
6114 no register from which to subtract. */
6115 if (code == MINUS && src == const0_rtx)
6117 factor = -factor;
6118 code = PLUS;
6120 rtx coeff1 = gen_int_mode (factor, mode);
6121 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
6122 val = gen_rtx_MULT (mode, val, coeff1);
6126 if (shift > 0)
6128 /* Multiply by 1 << SHIFT. */
6129 val = aarch64_force_temporary (mode, temp1, val);
6130 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
6132 else if (shift == -1)
6134 /* Divide by 2. */
6135 val = aarch64_force_temporary (mode, temp1, val);
6136 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
6139 /* Calculate SRC +/- CNTD * FACTOR / 2. */
6140 if (src != const0_rtx)
6142 val = aarch64_force_temporary (mode, temp1, val);
6143 val = gen_rtx_fmt_ee (code, mode, src, val);
6145 else if (code == MINUS)
6147 val = aarch64_force_temporary (mode, temp1, val);
6148 val = gen_rtx_NEG (mode, val);
6151 if (constant == 0 || frame_related_p)
6153 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
6154 if (frame_related_p)
6156 RTX_FRAME_RELATED_P (insn) = true;
6157 add_reg_note (insn, REG_CFA_ADJUST_CFA,
6158 gen_rtx_SET (dest, plus_constant (Pmode, src,
6159 poly_offset)));
6161 src = dest;
6162 if (constant == 0)
6163 return;
6165 else
6167 src = aarch64_force_temporary (mode, temp1, val);
6168 temp1 = temp2;
6169 temp2 = NULL_RTX;
6172 emit_move_imm = true;
6175 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
6176 frame_related_p, emit_move_imm);
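/* Illustrative example: adding the poly_int64 offset (0, 16), i.e.
   16 * (VQ - 1) bytes, decomposes into factor = 16 and constant = -16.
   The (16, 16) part is one vector length and hence an ADDVL immediate,
   so the emitted sequence is roughly ADDVL dest, src, #1 followed by
   SUB dest, dest, #16.  Factors outside the ADDVL/ADDPL range instead
   use the CNT-based multiplication and shift sequence above.  */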
6179 /* Like aarch64_add_offset, but the offset is given as an rtx rather
6180 than a poly_int64. */
6182 void
6183 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
6184 rtx offset_rtx, rtx temp1, rtx temp2)
6186 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
6187 temp1, temp2, false);
6190 /* Add DELTA to the stack pointer, marking the instructions frame-related.
6191 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
6192 if TEMP1 already contains abs (DELTA). */
6194 static inline void
6195 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
6197 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
6198 temp1, temp2, true, emit_move_imm);
6201 /* Subtract DELTA from the stack pointer, marking the instructions
6202 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
6203 if nonnull. */
6205 static inline void
6206 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
6207 bool emit_move_imm = true)
6209 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
6210 temp1, temp2, frame_related_p, emit_move_imm);
6213 /* Set DEST to (vec_series BASE STEP). */
6215 static void
6216 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
6218 machine_mode mode = GET_MODE (dest);
6219 scalar_mode inner = GET_MODE_INNER (mode);
6221 /* Each operand can be a register or an immediate in the range [-16, 15]. */
6222 if (!aarch64_sve_index_immediate_p (base))
6223 base = force_reg (inner, base);
6224 if (!aarch64_sve_index_immediate_p (step))
6225 step = force_reg (inner, step);
6227 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
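/* Illustrative example: the series with BASE = 0 and STEP = 1 maps directly
   to INDEX Zd.<T>, #0, #1, since both operands are in [-16, 15].  A step
   such as 20 is out of that range and is first forced into a scalar
   register, giving e.g. INDEX Zd.S, #0, Wm instead.  */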
6230 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
6231 register of mode MODE. Use TARGET for the result if it's nonnull
6232 and convenient.
6234 The two vector modes must have the same element mode. The behavior
6235 is to duplicate architectural lane N of SRC into architectural lanes
6236 N + I * STEP of the result. On big-endian targets, architectural
6237 lane 0 of an Advanced SIMD vector is the last element of the vector
6238 in memory layout, so for big-endian targets this operation has the
6239 effect of reversing SRC before duplicating it. Callers need to
6240 account for this. */
6242 rtx
6243 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
6245 machine_mode src_mode = GET_MODE (src);
6246 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
6247 insn_code icode = (BYTES_BIG_ENDIAN
6248 ? code_for_aarch64_vec_duplicate_vq_be (mode)
6249 : code_for_aarch64_vec_duplicate_vq_le (mode));
6251 unsigned int i = 0;
6252 expand_operand ops[3];
6253 create_output_operand (&ops[i++], target, mode);
6254 create_output_operand (&ops[i++], src, src_mode);
6255 if (BYTES_BIG_ENDIAN)
6257 /* Create a PARALLEL describing the reversal of SRC. */
6258 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
6259 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
6260 nelts_per_vq - 1, -1);
6261 create_fixed_operand (&ops[i++], sel);
6263 expand_insn (icode, i, ops);
6264 return ops[0].value;
6267 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
6268 the memory image into DEST. Return true on success. */
6270 static bool
6271 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
6273 src = force_const_mem (GET_MODE (src), src);
6274 if (!src)
6275 return false;
6277 /* Make sure that the address is legitimate. */
6278 if (!aarch64_sve_ld1rq_operand_p (src))
6280 rtx addr = force_reg (Pmode, XEXP (src, 0));
6281 src = replace_equiv_address (src, addr);
6284 machine_mode mode = GET_MODE (dest);
6285 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6286 rtx ptrue = aarch64_ptrue_reg (pred_mode);
6287 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
6288 return true;
6291 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
6292 by N "background" values. Try to move it into TARGET using:
6294 PTRUE PRED.<T>, VL<N>
6295 MOV TRUE.<T>, #<foreground>
6296 MOV FALSE.<T>, #<background>
6297 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
6299 The PTRUE is always a single instruction but the MOVs might need a
6300 longer sequence. If the background value is zero (as it often is),
6301 the sequence can sometimes collapse to a PTRUE followed by a
6302 zero-predicated move.
6304 Return the target on success, otherwise return null. */
6306 static rtx
6307 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
6309 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
6311 /* Make sure that the PTRUE is valid. */
6312 machine_mode mode = GET_MODE (src);
6313 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
6314 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6315 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
6316 == AARCH64_NUM_SVPATTERNS)
6317 return NULL_RTX;
6319 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
6320 rtx_vector_builder true_builder (mode, npatterns, 1);
6321 rtx_vector_builder false_builder (mode, npatterns, 1);
6322 for (unsigned int i = 0; i < npatterns; ++i)
6324 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6325 pred_builder.quick_push (CONST1_RTX (BImode));
6327 for (unsigned int i = 0; i < npatterns; ++i)
6329 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
6330 pred_builder.quick_push (CONST0_RTX (BImode));
6332 expand_operand ops[4];
6333 create_output_operand (&ops[0], target, mode);
6334 create_input_operand (&ops[1], true_builder.build (), mode);
6335 create_input_operand (&ops[2], false_builder.build (), mode);
6336 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
6337 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
6338 return target;
6341 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
6342 SVE data mode and isn't a legitimate constant. Use TARGET for the
6343 result if convenient.
6345 The returned register can have whatever mode seems most natural
6346 given the contents of SRC. */
6348 static rtx
6349 aarch64_expand_sve_const_vector (rtx target, rtx src)
6351 machine_mode mode = GET_MODE (src);
6352 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
6353 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
6354 scalar_mode elt_mode = GET_MODE_INNER (mode);
6355 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
6356 unsigned int container_bits = aarch64_sve_container_bits (mode);
6357 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
6359 if (nelts_per_pattern == 1
6360 && encoded_bits <= 128
6361 && container_bits != elt_bits)
6363 /* We have a partial vector mode and a constant whose full-vector
6364 equivalent would occupy a repeating 128-bit sequence. Build that
6365 full-vector equivalent instead, so that we have the option of
6366 using LD1RQ and Advanced SIMD operations. */
6367 unsigned int repeat = container_bits / elt_bits;
6368 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
6369 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
6370 for (unsigned int i = 0; i < npatterns; ++i)
6371 for (unsigned int j = 0; j < repeat; ++j)
6372 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
6373 target = aarch64_target_reg (target, full_mode);
6374 return aarch64_expand_sve_const_vector (target, builder.build ());
6377 if (nelts_per_pattern == 1 && encoded_bits == 128)
6379 /* The constant is a duplicated quadword but can't be narrowed
6380 beyond a quadword. Get the memory image of the first quadword
6381 as a 128-bit vector and try using LD1RQ to load it from memory.
6383 The effect for both endiannesses is to load memory lane N into
6384 architectural lanes N + I * STEP of the result. On big-endian
6385 targets, the layout of the 128-bit vector in an Advanced SIMD
6386 register would be different from its layout in an SVE register,
6387 but this 128-bit vector is a memory value only. */
6388 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6389 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
6390 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
6391 return target;
6394 if (nelts_per_pattern == 1 && encoded_bits < 128)
6396 /* The vector is a repeating sequence of 64 bits or fewer.
6397 See if we can load them using an Advanced SIMD move and then
6398 duplicate it to fill a vector. This is better than using a GPR
6399 move because it keeps everything in the same register file. */
6400 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
6401 rtx_vector_builder builder (vq_mode, npatterns, 1);
6402 for (unsigned int i = 0; i < npatterns; ++i)
6404 /* We want memory lane N to go into architectural lane N,
6405 so reverse for big-endian targets. The DUP .Q pattern
6406 has a compensating reverse built-in. */
6407 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
6408 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
6410 rtx vq_src = builder.build ();
6411 if (aarch64_simd_valid_immediate (vq_src, NULL))
6413 vq_src = force_reg (vq_mode, vq_src);
6414 return aarch64_expand_sve_dupq (target, mode, vq_src);
6417 /* Get an integer representation of the repeating part of Advanced
6418 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
6419 which for big-endian targets is lane-swapped wrt a normal
6420 Advanced SIMD vector. This means that for both endiannesses,
6421 memory lane N of SVE vector SRC corresponds to architectural
6422 lane N of a register holding VQ_SRC. This in turn means that
6423 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
6424 as a single 128-bit value) and thus that memory lane 0 of SRC is
6425 in the lsb of the integer. Duplicating the integer therefore
6426 ensures that memory lane N of SRC goes into architectural lane
6427 N + I * INDEX of the SVE register. */
6428 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
6429 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
6430 if (elt_value)
6432 /* Pretend that we had a vector of INT_MODE to start with. */
6433 elt_mode = int_mode;
6434 mode = aarch64_full_sve_mode (int_mode).require ();
6436 /* If the integer can be moved into a general register by a
6437 single instruction, do that and duplicate the result. */
6438 if (CONST_INT_P (elt_value)
6439 && aarch64_move_imm (INTVAL (elt_value),
6440 encoded_bits <= 32 ? SImode : DImode))
6442 elt_value = force_reg (elt_mode, elt_value);
6443 return expand_vector_broadcast (mode, elt_value);
6446 else if (npatterns == 1)
6447 /* We're duplicating a single value, but can't do better than
6448 force it to memory and load from there. This handles things
6449 like symbolic constants. */
6450 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
6452 if (elt_value)
6454 /* Load the element from memory if we can, otherwise move it into
6455 a register and use a DUP. */
6456 rtx op = force_const_mem (elt_mode, elt_value);
6457 if (!op)
6458 op = force_reg (elt_mode, elt_value);
6459 return expand_vector_broadcast (mode, op);
6463 /* Try using INDEX. */
6464 rtx base, step;
6465 if (const_vec_series_p (src, &base, &step))
6467 aarch64_expand_vec_series (target, base, step);
6468 return target;
6471 /* From here on, it's better to force the whole constant to memory
6472 if we can. */
6473 if (GET_MODE_NUNITS (mode).is_constant ())
6474 return NULL_RTX;
6476 if (nelts_per_pattern == 2)
6477 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
6478 return res;
6480 /* Expand each pattern individually. */
6481 gcc_assert (npatterns > 1);
6482 rtx_vector_builder builder;
6483 auto_vec<rtx, 16> vectors (npatterns);
6484 for (unsigned int i = 0; i < npatterns; ++i)
6486 builder.new_vector (mode, 1, nelts_per_pattern);
6487 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
6488 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
6489 vectors.quick_push (force_reg (mode, builder.build ()));
6492 /* Use permutes to interleave the separate vectors. */
6493 while (npatterns > 1)
6495 npatterns /= 2;
6496 for (unsigned int i = 0; i < npatterns; ++i)
6498 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
6499 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
6500 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
6501 vectors[i] = tmp;
6504 gcc_assert (vectors[0] == target);
6505 return target;
6508 /* Use WHILE to set a predicate register of mode MODE in which the first
6509 VL bits are set and the rest are clear. Use TARGET for the register
6510 if it's nonnull and convenient. */
6512 static rtx
6513 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
6514 unsigned int vl)
6516 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
6517 target = aarch64_target_reg (target, mode);
6518 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
6519 target, const0_rtx, limit));
6520 return target;
6523 static rtx
6524 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
6526 /* BUILDER is a constant predicate in which the index of every set bit
6527 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6528 by inverting every element at a multiple of ELT_SIZE and EORing the
6529 result with an ELT_SIZE PTRUE.
6531 Return a register that contains the constant on success, otherwise
6532 return null. Use TARGET as the register if it is nonnull and
6533 convenient. */
6535 static rtx
6536 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
6537 unsigned int elt_size)
6539 /* Invert every element at a multiple of ELT_SIZE, keeping the
6540 other bits zero. */
6541 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
6542 builder.nelts_per_pattern ());
6543 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6544 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
6545 inv_builder.quick_push (const1_rtx);
6546 else
6547 inv_builder.quick_push (const0_rtx);
6548 inv_builder.finalize ();
6550 /* See if we can load the constant cheaply. */
6551 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
6552 if (!inv)
6553 return NULL_RTX;
6555 /* EOR the result with an ELT_SIZE PTRUE. */
6556 rtx mask = aarch64_ptrue_all (elt_size);
6557 mask = force_reg (VNx16BImode, mask);
6558 inv = gen_lowpart (VNx16BImode, inv);
6559 target = aarch64_target_reg (target, VNx16BImode);
6560 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
6561 return target;
6564 /* BUILDER is a constant predicate in which the index of every set bit
6565 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
6566 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
6567 register on success, otherwise return null. Use TARGET as the register
6568 if nonnull and convenient. */
6570 static rtx
6571 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
6572 unsigned int elt_size,
6573 unsigned int permute_size)
6575 /* We're going to split the constant into two new constants A and B,
6576 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
6577 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
6579 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
6580 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
6582 where _ indicates elements that will be discarded by the permute.
6584 First calculate the ELT_SIZEs for A and B. */
6585 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
6586 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
6587 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
6588 if (INTVAL (builder.elt (i)) != 0)
6590 if (i & permute_size)
6591 b_elt_size |= i - permute_size;
6592 else
6593 a_elt_size |= i;
6595 a_elt_size &= -a_elt_size;
6596 b_elt_size &= -b_elt_size;
6598 /* Now construct the vectors themselves. */
6599 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
6600 builder.nelts_per_pattern ());
6601 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
6602 builder.nelts_per_pattern ());
6603 unsigned int nelts = builder.encoded_nelts ();
6604 for (unsigned int i = 0; i < nelts; ++i)
6605 if (i & (elt_size - 1))
6607 a_builder.quick_push (const0_rtx);
6608 b_builder.quick_push (const0_rtx);
6610 else if ((i & permute_size) == 0)
6612 /* The A and B elements are significant. */
6613 a_builder.quick_push (builder.elt (i));
6614 b_builder.quick_push (builder.elt (i + permute_size));
6616 else
6618 /* The A and B elements are going to be discarded, so pick whatever
6619 is likely to give a nice constant. We are targeting element
6620 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
6621 with the aim of each being a sequence of ones followed by
6622 a sequence of zeros. So:
6624 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
6625 duplicate the last X_ELT_SIZE element, to extend the
6626 current sequence of ones or zeros.
6628 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
6629 zero, so that the constant really does have X_ELT_SIZE and
6630 not a smaller size. */
6631 if (a_elt_size > permute_size)
6632 a_builder.quick_push (const0_rtx);
6633 else
6634 a_builder.quick_push (a_builder.elt (i - a_elt_size));
6635 if (b_elt_size > permute_size)
6636 b_builder.quick_push (const0_rtx);
6637 else
6638 b_builder.quick_push (b_builder.elt (i - b_elt_size));
6640 a_builder.finalize ();
6641 b_builder.finalize ();
6643 /* Try loading A into a register. */
6644 rtx_insn *last = get_last_insn ();
6645 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
6646 if (!a)
6647 return NULL_RTX;
6649 /* Try loading B into a register. */
6650 rtx b = a;
6651 if (a_builder != b_builder)
6653 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
6654 if (!b)
6656 delete_insns_since (last);
6657 return NULL_RTX;
6661 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
6662 operands but permutes them as though they had mode MODE. */
6663 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
6664 target = aarch64_target_reg (target, GET_MODE (a));
6665 rtx type_reg = CONST0_RTX (mode);
6666 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
6667 return target;
6670 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
6671 constant in BUILDER into an SVE predicate register. Return the register
6672 on success, otherwise return null. Use TARGET for the register if
6673 nonnull and convenient.
6675 ALLOW_RECURSE_P is true if we can use methods that would call this
6676 function recursively. */
6678 static rtx
6679 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
6680 bool allow_recurse_p)
6682 if (builder.encoded_nelts () == 1)
6683 /* A PFALSE or a PTRUE .B ALL. */
6684 return aarch64_emit_set_immediate (target, builder);
6686 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
6687 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
6689 /* If we can load the constant using PTRUE, use it as-is. */
6690 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
6691 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
6692 return aarch64_emit_set_immediate (target, builder);
6694 /* Otherwise use WHILE to set the first VL bits. */
6695 return aarch64_sve_move_pred_via_while (target, mode, vl);
6698 if (!allow_recurse_p)
6699 return NULL_RTX;
6701 /* Try inverting the vector in element size ELT_SIZE and then EORing
6702 the result with an ELT_SIZE PTRUE. */
6703 if (INTVAL (builder.elt (0)) == 0)
6704 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
6705 elt_size))
6706 return res;
6708 /* Try using TRN1 to permute two simpler constants. */
6709 for (unsigned int i = elt_size; i <= 8; i *= 2)
6710 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
6711 elt_size, i))
6712 return res;
6714 return NULL_RTX;
6717 /* Return an SVE predicate register that contains the VNx16BImode
6718 constant in BUILDER, without going through the move expanders.
6720 The returned register can have whatever mode seems most natural
6721 given the contents of BUILDER. Use TARGET for the result if
6722 convenient. */
6724 static rtx
6725 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
6727 /* Try loading the constant using pure predicate operations. */
6728 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
6729 return res;
6731 /* Try forcing the constant to memory. */
6732 if (builder.full_nelts ().is_constant ())
6733 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
6735 target = aarch64_target_reg (target, VNx16BImode);
6736 emit_move_insn (target, mem);
6737 return target;
6740 /* The last resort is to load the constant as an integer and then
6741 compare it against zero. Use -1 for set bits in order to increase
6742 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
6743 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
6744 builder.nelts_per_pattern ());
6745 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
6746 int_builder.quick_push (INTVAL (builder.elt (i))
6747 ? constm1_rtx : const0_rtx);
6748 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
6749 int_builder.build ());
6752 /* Set DEST to immediate IMM. */
6754 void
6755 aarch64_expand_mov_immediate (rtx dest, rtx imm)
6757 machine_mode mode = GET_MODE (dest);
6759 /* Check on what type of symbol it is. */
6760 scalar_int_mode int_mode;
6761 if ((SYMBOL_REF_P (imm)
6762 || LABEL_REF_P (imm)
6763 || GET_CODE (imm) == CONST
6764 || GET_CODE (imm) == CONST_POLY_INT)
6765 && is_a <scalar_int_mode> (mode, &int_mode))
6767 rtx mem;
6768 poly_int64 offset;
6769 HOST_WIDE_INT const_offset;
6770 enum aarch64_symbol_type sty;
6772 /* If we have (const (plus symbol offset)), separate out the offset
6773 before we start classifying the symbol. */
6774 rtx base = strip_offset (imm, &offset);
6776 /* We must always add an offset involving VL separately, rather than
6777 folding it into the relocation. */
6778 if (!offset.is_constant (&const_offset))
6780 if (!TARGET_SVE)
6782 aarch64_report_sve_required ();
6783 return;
6785 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
6786 emit_insn (gen_rtx_SET (dest, imm));
6787 else
6789 /* Do arithmetic on 32-bit values if the result is smaller
6790 than that. */
6791 if (partial_subreg_p (int_mode, SImode))
6793 /* It is invalid to do symbol calculations in modes
6794 narrower than SImode. */
6795 gcc_assert (base == const0_rtx);
6796 dest = gen_lowpart (SImode, dest);
6797 int_mode = SImode;
6799 if (base != const0_rtx)
6801 base = aarch64_force_temporary (int_mode, dest, base);
6802 aarch64_add_offset (int_mode, dest, base, offset,
6803 NULL_RTX, NULL_RTX, false);
6805 else
6806 aarch64_add_offset (int_mode, dest, base, offset,
6807 dest, NULL_RTX, false);
6809 return;
6812 sty = aarch64_classify_symbol (base, const_offset);
6813 switch (sty)
6815 case SYMBOL_FORCE_TO_MEM:
6816 if (int_mode != ptr_mode)
6817 imm = convert_memory_address (ptr_mode, imm);
6819 if (const_offset != 0
6820 && targetm.cannot_force_const_mem (ptr_mode, imm))
6822 gcc_assert (can_create_pseudo_p ());
6823 base = aarch64_force_temporary (int_mode, dest, base);
6824 aarch64_add_offset (int_mode, dest, base, const_offset,
6825 NULL_RTX, NULL_RTX, false);
6826 return;
6829 mem = force_const_mem (ptr_mode, imm);
6830 gcc_assert (mem);
6832 /* If we aren't generating PC relative literals, then
6833 we need to expand the literal pool access carefully.
6834 This is something that needs to be done in a number
6835 of places, so could well live as a separate function. */
6836 if (!aarch64_pcrelative_literal_loads)
6838 gcc_assert (can_create_pseudo_p ());
6839 base = gen_reg_rtx (ptr_mode);
6840 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
6841 if (ptr_mode != Pmode)
6842 base = convert_memory_address (Pmode, base);
6843 mem = gen_rtx_MEM (ptr_mode, base);
6846 if (int_mode != ptr_mode)
6847 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
6849 emit_insn (gen_rtx_SET (dest, mem));
6851 return;
6853 case SYMBOL_SMALL_TLSGD:
6854 case SYMBOL_SMALL_TLSDESC:
6855 case SYMBOL_SMALL_TLSIE:
6856 case SYMBOL_SMALL_GOT_28K:
6857 case SYMBOL_SMALL_GOT_4G:
6858 case SYMBOL_TINY_GOT:
6859 case SYMBOL_TINY_TLSIE:
6860 if (const_offset != 0)
6862 gcc_assert (can_create_pseudo_p ());
6863 base = aarch64_force_temporary (int_mode, dest, base);
6864 aarch64_add_offset (int_mode, dest, base, const_offset,
6865 NULL_RTX, NULL_RTX, false);
6866 return;
6868 /* FALLTHRU */
6870 case SYMBOL_SMALL_ABSOLUTE:
6871 case SYMBOL_TINY_ABSOLUTE:
6872 case SYMBOL_TLSLE12:
6873 case SYMBOL_TLSLE24:
6874 case SYMBOL_TLSLE32:
6875 case SYMBOL_TLSLE48:
6876 aarch64_load_symref_appropriately (dest, imm, sty);
6877 return;
6879 default:
6880 gcc_unreachable ();
6884 if (!CONST_INT_P (imm))
6886 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
6888 /* Only the low bit of each .H, .S and .D element is defined,
6889 so we can set the upper bits to whatever we like. If the
6890 predicate is all-true in MODE, prefer to set all the undefined
6891 bits as well, so that we can share a single .B predicate for
6892 all modes. */
6893 if (imm == CONSTM1_RTX (mode))
6894 imm = CONSTM1_RTX (VNx16BImode);
6896 /* All methods for constructing predicate modes wider than VNx16BI
6897 will set the upper bits of each element to zero. Expose this
6898 by moving such constants as a VNx16BI, so that all bits are
6899 significant and so that constants for different modes can be
6900 shared. The wider constant will still be available as a
6901 REG_EQUAL note. */
6902 rtx_vector_builder builder;
6903 if (aarch64_get_sve_pred_bits (builder, imm))
6905 rtx res = aarch64_expand_sve_const_pred (dest, builder);
6906 if (dest != res)
6907 emit_move_insn (dest, gen_lowpart (mode, res));
6908 return;
6912 if (GET_CODE (imm) == HIGH
6913 || aarch64_simd_valid_immediate (imm, NULL))
6915 emit_insn (gen_rtx_SET (dest, imm));
6916 return;
6919 if (CONST_VECTOR_P (imm) && aarch64_sve_data_mode_p (mode))
6920 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
6922 if (dest != res)
6923 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
6924 return;
6927 rtx mem = force_const_mem (mode, imm);
6928 gcc_assert (mem);
6929 emit_move_insn (dest, mem);
6930 return;
6933 aarch64_internal_mov_immediate (dest, imm, true, mode);
6936 /* Return the MEM rtx that provides the canary value that should be used
6937 for stack-smashing protection. MODE is the mode of the memory.
6938 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
6939 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
6940 indicates whether the caller is performing a SET or a TEST operation. */
6942 rtx
6943 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
6944 aarch64_salt_type salt_type)
6946 rtx addr;
6947 if (aarch64_stack_protector_guard == SSP_GLOBAL)
6949 gcc_assert (MEM_P (decl_rtl));
6950 addr = XEXP (decl_rtl, 0);
6951 poly_int64 offset;
6952 rtx base = strip_offset_and_salt (addr, &offset);
6953 if (!SYMBOL_REF_P (base))
6954 return decl_rtl;
6956 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
6957 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
6958 addr = gen_rtx_CONST (Pmode, addr);
6959 addr = plus_constant (Pmode, addr, offset);
6961 else
6963 /* Calculate the address from the system register. */
6964 rtx salt = GEN_INT (salt_type);
6965 addr = gen_reg_rtx (mode);
6966 if (mode == DImode)
6967 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
6968 else
6970 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
6971 addr = convert_memory_address (Pmode, addr);
6973 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
6975 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
6978 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
6979 that is known to contain PTRUE. */
6981 void
6982 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
6984 expand_operand ops[3];
6985 machine_mode mode = GET_MODE (dest);
6986 create_output_operand (&ops[0], dest, mode);
6987 create_input_operand (&ops[1], pred, GET_MODE (pred));
6988 create_input_operand (&ops[2], src, mode);
6989 temporary_volatile_ok v (true);
6990 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
6993 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
6994 operand is in memory. In this case we need to use the predicated LD1
6995 and ST1 instead of LDR and STR, both for correctness on big-endian
6996 targets and because LD1 and ST1 support a wider range of addressing modes.
6997 PRED_MODE is the mode of the predicate.
6999 See the comment at the head of aarch64-sve.md for details about the
7000 big-endian handling. */
7002 void
7003 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
7005 machine_mode mode = GET_MODE (dest);
7006 rtx ptrue = aarch64_ptrue_reg (pred_mode);
7007 if (!register_operand (src, mode)
7008 && !register_operand (dest, mode))
7010 rtx tmp = gen_reg_rtx (mode);
7011 if (MEM_P (src))
7012 aarch64_emit_sve_pred_move (tmp, ptrue, src);
7013 else
7014 emit_move_insn (tmp, src);
7015 src = tmp;
7017 aarch64_emit_sve_pred_move (dest, ptrue, src);
7020 /* Called only on big-endian targets. See whether an SVE vector move
7021 from SRC to DEST is effectively a REV[BHW] instruction, because at
7022 least one operand is a subreg of an SVE vector that has wider or
7023 narrower elements. Return true and emit the instruction if so.
7025 For example:
7027 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
7029 represents a VIEW_CONVERT between the following vectors, viewed
7030 in memory order:
7032 R2: { [0].high, [0].low, [1].high, [1].low, ... }
7033 R1: { [0], [1], [2], [3], ... }
7035 The high part of lane X in R2 should therefore correspond to lane X*2
7036 of R1, but the register representations are:
7038 msb lsb
7039 R2: ...... [1].high [1].low [0].high [0].low
7040 R1: ...... [3] [2] [1] [0]
7042 where the low part of lane X in R2 corresponds to lane X*2 in R1.
7043 We therefore need a reverse operation to swap the high and low values
7044 around.
7046 This is purely an optimization. Without it we would spill the
7047 subreg operand to the stack in one mode and reload it in the
7048 other mode, which has the same effect as the REV. */
7050 bool
7051 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
7053 gcc_assert (BYTES_BIG_ENDIAN);
7055 /* Do not try to optimize subregs that LRA has created for matched
7056 reloads. These subregs only exist as a temporary measure to make
7057 the RTL well-formed, but they are exempt from the usual
7058 TARGET_CAN_CHANGE_MODE_CLASS rules.
7060 For example, if we have:
7062 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
7064 and the constraints require R1 and R2 to be in the same register,
7065 LRA may need to create RTL such as:
7067 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
7068 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
7069 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
7071 which forces both the input and output of the original instruction
7072 to use the same hard register. But for this to work, the normal
7073 rules have to be suppressed on the subreg input, otherwise LRA
7074 would need to reload that input too, meaning that the process
7075 would never terminate. To compensate for this, the normal rules
7076 are also suppressed for the subreg output of the first move.
7077 Ignoring the special case and handling the first move normally
7078 would therefore generate wrong code: we would reverse the elements
7079 for the first subreg but not reverse them back for the second subreg. */
7080 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
7081 dest = SUBREG_REG (dest);
7082 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
7083 src = SUBREG_REG (src);
7085 /* The optimization handles two single SVE REGs with different element
7086 sizes. */
7087 if (!REG_P (dest)
7088 || !REG_P (src)
7089 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
7090 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
7091 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
7092 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
7093 return false;
7095 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
7096 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
7097 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
7098 UNSPEC_REV_SUBREG);
7099 emit_insn (gen_rtx_SET (dest, unspec));
7100 return true;
7103 /* Return a copy of X with mode MODE, without changing its other
7104 attributes. Unlike gen_lowpart, this doesn't care whether the
7105 mode change is valid. */
7107 rtx
7108 aarch64_replace_reg_mode (rtx x, machine_mode mode)
7110 if (GET_MODE (x) == mode)
7111 return x;
7113 x = shallow_copy_rtx (x);
7114 set_mode_and_regno (x, mode, REGNO (x));
7115 return x;
7118 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
7119 stored in wider integer containers. */
7121 static unsigned int
7122 aarch64_sve_rev_unspec (machine_mode mode)
7124 switch (GET_MODE_UNIT_SIZE (mode))
7126 case 1: return UNSPEC_REVB;
7127 case 2: return UNSPEC_REVH;
7128 case 4: return UNSPEC_REVW;
7130 gcc_unreachable ();
7133 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
7134 operands. */
7136 void
7137 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
7139 /* Decide which REV operation we need. The mode with wider elements
7140 determines the mode of the operands and the mode with the narrower
7141 elements determines the reverse width. */
7142 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
7143 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
7144 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
7145 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
7146 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
7148 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
7149 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
7151 /* Get the operands in the appropriate modes and emit the instruction. */
7152 ptrue = gen_lowpart (pred_mode, ptrue);
7153 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
7154 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
7155 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
7156 dest, ptrue, src));
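/* Illustrative example: for a big-endian subreg move between VNx8HI and
   VNx4SI values, the wider-element mode is the .S one and the narrower
   unit size is 2 bytes, so the code above selects UNSPEC_REVH and emits
   roughly REVH Zd.S, Pg/M, Zs.S under a .S governing predicate.  */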
7159 static bool
7160 aarch64_function_ok_for_sibcall (tree, tree exp)
7162 if (crtl->abi->id () != expr_callee_abi (exp).id ())
7163 return false;
7165 return true;
7168 /* Subroutine of aarch64_pass_by_reference for arguments that are not
7169 passed in SVE registers. */
7171 static bool
7172 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
7173 const function_arg_info &arg)
7175 HOST_WIDE_INT size;
7176 machine_mode dummymode;
7177 int nregs;
7179 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
7180 if (arg.mode == BLKmode && arg.type)
7181 size = int_size_in_bytes (arg.type);
7182 else
7183 /* No frontends can create types with variable-sized modes, so we
7184 shouldn't be asked to pass or return them. */
7185 size = GET_MODE_SIZE (arg.mode).to_constant ();
7187 /* Aggregates are passed by reference based on their size. */
7188 if (arg.aggregate_type_p ())
7189 size = int_size_in_bytes (arg.type);
7191 /* Variable sized arguments are always returned by reference. */
7192 if (size < 0)
7193 return true;
7195 /* Can this be a candidate to be passed in fp/simd register(s)? */
7196 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
7197 &dummymode, &nregs, NULL,
7198 !pcum || pcum->silent_p))
7199 return false;
7201 /* Arguments which are variable sized or larger than 2 registers are
7202 passed by reference unless they are a homogeneous floating-point
7203 aggregate. */
7204 return size > 2 * UNITS_PER_WORD;
7207 /* Implement TARGET_PASS_BY_REFERENCE. */
7209 static bool
7210 aarch64_pass_by_reference (cumulative_args_t pcum_v,
7211 const function_arg_info &arg)
7213 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7215 if (!arg.type)
7216 return aarch64_pass_by_reference_1 (pcum, arg);
7218 pure_scalable_type_info pst_info;
7219 switch (pst_info.analyze (arg.type))
7221 case pure_scalable_type_info::IS_PST:
7222 if (pcum && !pcum->silent_p && !TARGET_SVE)
7223 /* We can't gracefully recover at this point, so make this a
7224 fatal error. */
7225 fatal_error (input_location, "arguments of type %qT require"
7226 " the SVE ISA extension", arg.type);
7228 /* Variadic SVE types are passed by reference. Normal non-variadic
7229 arguments are too if we've run out of registers. */
7230 return (!arg.named
7231 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
7232 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
7234 case pure_scalable_type_info::DOESNT_MATTER:
7235 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
7236 return true;
7238 case pure_scalable_type_info::NO_ABI_IDENTITY:
7239 case pure_scalable_type_info::ISNT_PST:
7240 return aarch64_pass_by_reference_1 (pcum, arg);
7242 gcc_unreachable ();
7245 /* Return TRUE if VALTYPE is padded to its least significant bits. */
7246 static bool
7247 aarch64_return_in_msb (const_tree valtype)
7249 machine_mode dummy_mode;
7250 int dummy_int;
7252 /* Never happens in little-endian mode. */
7253 if (!BYTES_BIG_ENDIAN)
7254 return false;
7256 /* Only composite types smaller than or equal to 16 bytes can
7257 be potentially returned in registers. */
7258 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
7259 || int_size_in_bytes (valtype) <= 0
7260 || int_size_in_bytes (valtype) > 16)
7261 return false;
7263 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
7264 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
7265 is always passed/returned in the least significant bits of fp/simd
7266 register(s). */
7267 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
7268 &dummy_mode, &dummy_int, NULL,
7269 false))
7270 return false;
7272 /* Likewise pure scalable types for SVE vector and predicate registers. */
7273 pure_scalable_type_info pst_info;
7274 if (pst_info.analyze_registers (valtype))
7275 return false;
7277 return true;
7280 /* Implement TARGET_FUNCTION_VALUE.
7281 Define how to find the value returned by a function. */
7283 static rtx
7284 aarch64_function_value (const_tree type, const_tree func,
7285 bool outgoing ATTRIBUTE_UNUSED)
7287 machine_mode mode;
7288 int unsignedp;
7290 mode = TYPE_MODE (type);
7291 if (INTEGRAL_TYPE_P (type))
7292 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
7294 pure_scalable_type_info pst_info;
7295 if (type && pst_info.analyze_registers (type))
7296 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
7298 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7299 are returned in memory, not by value. */
7300 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7301 bool sve_p = (vec_flags & VEC_ANY_SVE);
7303 if (aarch64_return_in_msb (type))
7305 HOST_WIDE_INT size = int_size_in_bytes (type);
7307 if (size % UNITS_PER_WORD != 0)
7309 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
7310 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
7314 int count;
7315 machine_mode ag_mode;
7316 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
7317 NULL, false))
7319 gcc_assert (!sve_p);
7320 if (!aarch64_composite_type_p (type, mode))
7322 gcc_assert (count == 1 && mode == ag_mode);
7323 return gen_rtx_REG (mode, V0_REGNUM);
7325 else if (aarch64_advsimd_full_struct_mode_p (mode)
7326 && known_eq (GET_MODE_SIZE (ag_mode), 16))
7327 return gen_rtx_REG (mode, V0_REGNUM);
7328 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7329 && known_eq (GET_MODE_SIZE (ag_mode), 8))
7330 return gen_rtx_REG (mode, V0_REGNUM);
7331 else
7333 int i;
7334 rtx par;
7336 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
7337 for (i = 0; i < count; i++)
7339 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
7340 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
7341 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7342 XVECEXP (par, 0, i) = tmp;
7344 return par;
7347 else
7349 if (sve_p)
7351 /* Vector types can acquire a partial SVE mode using things like
7352 __attribute__((vector_size(N))), and this is potentially useful.
7353 However, the choice of mode doesn't affect the type's ABI
7354 identity, so we should treat the types as though they had
7355 the associated integer mode, just like they did before SVE
7356 was introduced.
7358 We know that the vector must be 128 bits or smaller,
7359 otherwise we'd have returned it in memory instead. */
7360 gcc_assert (type
7361 && (aarch64_some_values_include_pst_objects_p (type)
7362 || (vec_flags & VEC_PARTIAL)));
7364 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
7365 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
7366 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
7367 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
7369 return gen_rtx_REG (mode, R0_REGNUM);
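/* A small worked example of the cases above (illustrative only):
   returning

     struct rgba { float r, g, b, a; };

   is the HFA case: each member goes in its own FP register, so the
   value comes back in s0-s3, expressed here as a PARALLEL of four
   SFmode registers at byte offsets 0, 4, 8 and 12.  Returning __int128
   instead takes the final R0_REGNUM path and uses x0/x1.  */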
7373 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
7374 Return true if REGNO is the number of a hard register in which the values
7375 of called function may come back. */
7377 static bool
7378 aarch64_function_value_regno_p (const unsigned int regno)
7380 /* A maximum of 16 bytes can be returned in the general registers. Examples
7381 of 16-byte return values are: 128-bit integers and 16-byte small
7382 structures (excluding homogeneous floating-point aggregates). */
7383 if (regno == R0_REGNUM || regno == R1_REGNUM)
7384 return true;
7386 /* Up to four fp/simd registers can return a function value, e.g. a
7387 homogeneous floating-point aggregate having four members. */
7388 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
7389 return TARGET_FLOAT;
7391 if (regno >= P0_REGNUM && regno < P0_REGNUM + HA_MAX_NUM_FLDS)
7392 return TARGET_SVE;
7394 return false;
7397 /* Subroutine for aarch64_return_in_memory for types that are not returned
7398 in SVE registers. */
7400 static bool
7401 aarch64_return_in_memory_1 (const_tree type)
7403 HOST_WIDE_INT size;
7404 machine_mode ag_mode;
7405 int count;
7407 if (!AGGREGATE_TYPE_P (type)
7408 && TREE_CODE (type) != COMPLEX_TYPE
7409 && TREE_CODE (type) != VECTOR_TYPE)
7410 /* Simple scalar types are always returned in registers. */
7411 return false;
7413 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7414 &ag_mode, &count, NULL, false))
7415 return false;
7417 /* Types larger than 2 registers are returned in memory. */
7418 size = int_size_in_bytes (type);
7419 return (size < 0 || size > 2 * UNITS_PER_WORD);
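/* Illustrative examples of the size cut-off (assuming LP64 sizes):
   "struct { long a, b; }" is 16 bytes and comes back in x0/x1, while
   "struct { long a, b, c; }" is 24 bytes and is returned in memory,
   with the caller supplying the address of the result buffer (in x8
   under AAPCS64).  HFAs such as "struct { double a, b, c, d; }" are
   excluded by the check above and stay in FP registers.  */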
7422 /* Implement TARGET_RETURN_IN_MEMORY.
7424 If the type T of the result of a function is such that
7425 void func (T arg)
7426 would require that arg be passed as a value in a register (or set of
7427 registers) according to the parameter passing rules, then the result
7428 is returned in the same registers as would be used for such an
7429 argument. */
7431 static bool
7432 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
7434 pure_scalable_type_info pst_info;
7435 switch (pst_info.analyze (type))
7437 case pure_scalable_type_info::IS_PST:
7438 return (pst_info.num_zr () > NUM_FP_ARG_REGS
7439 || pst_info.num_pr () > NUM_PR_ARG_REGS);
7441 case pure_scalable_type_info::DOESNT_MATTER:
7442 gcc_assert (aarch64_return_in_memory_1 (type));
7443 return true;
7445 case pure_scalable_type_info::NO_ABI_IDENTITY:
7446 case pure_scalable_type_info::ISNT_PST:
7447 return aarch64_return_in_memory_1 (type);
7449 gcc_unreachable ();
7452 static bool
7453 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
7454 const_tree type, int *nregs)
7456 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7457 return aarch64_vfp_is_call_or_return_candidate (mode, type,
7458 &pcum->aapcs_vfp_rmode,
7459 nregs, NULL, pcum->silent_p);
7462 /* Given MODE and TYPE of a function argument, return the alignment in
7463 bits. The idea is to suppress any stronger alignment requested by
7464 the user and opt for the natural alignment (specified in AAPCS64 \S
7465 4.1). ABI_BREAK is set to the old alignment if the alignment was
7466 incorrectly calculated in versions of GCC prior to GCC-9.
7467 ABI_BREAK_PACKED is set to the old alignment if it was incorrectly
7468 calculated in versions between GCC-9 and GCC-13. This is a helper
7469 function for local use only. */
7471 static unsigned int
7472 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
7473 unsigned int *abi_break,
7474 unsigned int *abi_break_packed)
7476 *abi_break = 0;
7477 *abi_break_packed = 0;
7478 if (!type)
7479 return GET_MODE_ALIGNMENT (mode);
7481 if (integer_zerop (TYPE_SIZE (type)))
7482 return 0;
7484 gcc_assert (TYPE_MODE (type) == mode);
7486 if (!AGGREGATE_TYPE_P (type))
7488 /* The ABI alignment is the natural alignment of the type, without
7489 any attributes applied. Normally this is the alignment of the
7490 TYPE_MAIN_VARIANT, but not always; see PR108910 for a counterexample.
7491 For now we just handle the known exceptions explicitly. */
7492 type = TYPE_MAIN_VARIANT (type);
7493 if (POINTER_TYPE_P (type))
7495 gcc_assert (known_eq (POINTER_SIZE, GET_MODE_BITSIZE (mode)));
7496 return POINTER_SIZE;
7498 gcc_assert (!TYPE_USER_ALIGN (type));
7499 return TYPE_ALIGN (type);
7502 if (TREE_CODE (type) == ARRAY_TYPE)
7503 return TYPE_ALIGN (TREE_TYPE (type));
7505 unsigned int alignment = 0;
7506 unsigned int bitfield_alignment_with_packed = 0;
7507 unsigned int bitfield_alignment = 0;
7508 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
7509 if (TREE_CODE (field) == FIELD_DECL)
7511 /* Note that we explicitly consider zero-sized fields here,
7512 even though they don't map to AAPCS64 machine types.
7513 For example, in:
7515 struct __attribute__((aligned(8))) empty {};
7517 struct s {
7518 [[no_unique_address]] empty e;
7519 int x;
7522 "s" contains only one Fundamental Data Type (the int field)
7523 but gains 8-byte alignment and size thanks to "e". */
7524 alignment = std::max (alignment, DECL_ALIGN (field));
7525 if (DECL_BIT_FIELD_TYPE (field))
7527 /* Take the bit-field type's alignment into account only
7528 if the user didn't reduce this field's alignment with
7529 the packed attribute. */
7530 if (!DECL_PACKED (field))
7531 bitfield_alignment
7532 = std::max (bitfield_alignment,
7533 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7535 /* Compute the alignment even if the bit-field is
7536 packed, so that we can emit a warning in case the
7537 alignment changed between GCC versions. */
7538 bitfield_alignment_with_packed
7539 = std::max (bitfield_alignment_with_packed,
7540 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
7544 /* Emit a warning if the alignment is different when taking the
7545 'packed' attribute into account. */
7546 if (bitfield_alignment != bitfield_alignment_with_packed
7547 && bitfield_alignment_with_packed > alignment)
7548 *abi_break_packed = bitfield_alignment_with_packed;
7550 if (bitfield_alignment > alignment)
7552 *abi_break = alignment;
7553 return bitfield_alignment;
7556 return alignment;
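/* A hedged illustration of the two bit-field cases tracked above
   (intended only to show which kinds of types are affected):

     struct s1 { long long x : 8; char y; };
     struct s2 { long long x : 8; char y; } __attribute__((packed));

   For s1, the bit-field's underlying "long long" contributes its
   8-byte alignment (the GCC 9.1 behaviour; *ABI_BREAK records the
   older, smaller value).  For s2, the packed attribute is supposed to
   discard that contribution, but GCC 9 to GCC 12 still honoured it,
   which is what *ABI_BREAK_PACKED records.  */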
7559 /* Layout a function argument according to the AAPCS64 rules. The rule
7560 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
7561 mode that was originally given to us by the target hook, whereas the
7562 mode in ARG might be the result of replacing partial SVE modes with
7563 the equivalent integer mode. */
7565 static void
7566 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7568 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7569 tree type = arg.type;
7570 machine_mode mode = arg.mode;
7571 int ncrn, nvrn, nregs;
7572 bool allocate_ncrn, allocate_nvrn;
7573 HOST_WIDE_INT size;
7574 unsigned int abi_break;
7575 unsigned int abi_break_packed;
7577 /* We need to do this once per argument. */
7578 if (pcum->aapcs_arg_processed)
7579 return;
7581 bool warn_pcs_change
7582 = (warn_psabi
7583 && !pcum->silent_p
7584 && (currently_expanding_function_start
7585 || currently_expanding_gimple_stmt));
7587 /* HFAs and HVAs can have an alignment greater than 16 bytes. For example:
7589 typedef struct foo {
7590 __Int8x16_t foo[2] __attribute__((aligned(32)));
7591 } foo;
7593 is still an HVA despite its larger-than-normal alignment.
7594 However, such over-aligned HFAs and HVAs are guaranteed to have
7595 no padding.
7597 If we exclude HFAs and HVAs from the discussion below, then there
7598 are several things to note:
7600 - Both the C and AAPCS64 interpretations of a type's alignment should
7601 give a value that is no greater than the type's size.
7603 - Types bigger than 16 bytes are passed indirectly.
7605 - If an argument of type T is passed indirectly, TYPE and MODE describe
7606 a pointer to T rather than T itself.
7608 It follows that the AAPCS64 alignment of TYPE must be no greater
7609 than 16 bytes.
7611 Versions prior to GCC 9.1 ignored a bitfield's underlying type
7612 and so could calculate an alignment that was too small. If this
7613 happened for TYPE then ABI_BREAK is this older, too-small alignment.
7615 Although GCC 9.1 fixed that bug, it introduced a different one:
7616 it would consider the alignment of a bitfield's underlying type even
7617 if the field was packed (which should have the effect of overriding
7618 the alignment of the underlying type). This was fixed in GCC 13.1.
7620 As a result of this bug, GCC 9 to GCC 12 could calculate an alignment
7621 that was too big. If this happened for TYPE, ABI_BREAK_PACKED is
7622 this older, too-big alignment.
7624 Also, the fact that GCC 9 to GCC 12 considered irrelevant
7625 alignments meant they could calculate type alignments that were
7626 bigger than the type's size, contrary to the assumption above.
7627 The handling of register arguments was nevertheless (and justifiably)
7628 written to follow the assumption that the alignment can never be
7629 greater than the size. The same was not true for stack arguments;
7630 their alignment was instead handled by MIN bounds in
7631 aarch64_function_arg_boundary.
7633 The net effect is that, if GCC 9 to GCC 12 incorrectly calculated
7634 an alignment of more than 16 bytes for TYPE then:
7636 - If the argument was passed in registers, these GCC versions
7637 would treat the alignment as though it was *less than* 16 bytes.
7639 - If the argument was passed on the stack, these GCC versions
7640 would treat the alignment as though it was *equal to* 16 bytes.
7642 Both behaviors were wrong, but in different cases. */
7644 pcum->aapcs_arg_processed = true;
7646 pure_scalable_type_info pst_info;
7647 if (type && pst_info.analyze_registers (type))
7649 /* aarch64_function_arg_alignment has never had an effect on
7650 this case. */
7652 /* The PCS says that it is invalid to pass an SVE value to an
7653 unprototyped function. There is no ABI-defined location we
7654 can return in this case, so we have no real choice but to raise
7655 an error immediately, even though this is only a query function. */
7656 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
7658 gcc_assert (!pcum->silent_p);
7659 error ("SVE type %qT cannot be passed to an unprototyped function",
7660 arg.type);
7661 /* Avoid repeating the message, and avoid tripping the assert
7662 below. */
7663 pcum->pcs_variant = ARM_PCS_SVE;
7666 /* We would have converted the argument into pass-by-reference
7667 form if it didn't fit in registers. */
7668 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
7669 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
7670 gcc_assert (arg.named
7671 && pcum->pcs_variant == ARM_PCS_SVE
7672 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
7673 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
7674 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
7675 P0_REGNUM + pcum->aapcs_nprn);
7676 return;
7679 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
7680 are passed by reference, not by value. */
7681 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
7682 bool sve_p = (vec_flags & VEC_ANY_SVE);
7683 if (sve_p)
7684 /* Vector types can acquire a partial SVE mode using things like
7685 __attribute__((vector_size(N))), and this is potentially useful.
7686 However, the choice of mode doesn't affect the type's ABI
7687 identity, so we should treat the types as though they had
7688 the associated integer mode, just like they did before SVE
7689 was introduced.
7691 We know that the vector must be 128 bits or smaller,
7692 otherwise we'd have passed it in memory instead. */
7693 gcc_assert (type
7694 && (aarch64_some_values_include_pst_objects_p (type)
7695 || (vec_flags & VEC_PARTIAL)));
7697 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
7698 if (type)
7699 size = int_size_in_bytes (type);
7700 else
7701 /* No frontends can create types with variable-sized modes, so we
7702 shouldn't be asked to pass or return them. */
7703 size = GET_MODE_SIZE (mode).to_constant ();
7704 size = ROUND_UP (size, UNITS_PER_WORD);
7706 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
7707 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
7708 mode,
7709 type,
7710 &nregs);
7711 gcc_assert (!sve_p || !allocate_nvrn);
7713 unsigned int alignment
7714 = aarch64_function_arg_alignment (mode, type, &abi_break,
7715 &abi_break_packed);
7717 gcc_assert ((allocate_nvrn || alignment <= 16 * BITS_PER_UNIT)
7718 && (!alignment || abi_break < alignment)
7719 && (!abi_break_packed || alignment < abi_break_packed));
7721 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
7722 The following code thus handles passing by SIMD/FP registers first. */
7724 nvrn = pcum->aapcs_nvrn;
7726 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
7727 and homogeneous short-vector aggregates (HVA). */
7728 if (allocate_nvrn)
7730 /* aarch64_function_arg_alignment has never had an effect on
7731 this case. */
7732 if (!pcum->silent_p && !TARGET_FLOAT)
7733 aarch64_err_no_fpadvsimd (mode);
7735 if (nvrn + nregs <= NUM_FP_ARG_REGS)
7737 pcum->aapcs_nextnvrn = nvrn + nregs;
7738 if (!aarch64_composite_type_p (type, mode))
7740 gcc_assert (nregs == 1);
7741 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7743 else if (aarch64_advsimd_full_struct_mode_p (mode)
7744 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 16))
7745 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7746 else if (aarch64_advsimd_partial_struct_mode_p (mode)
7747 && known_eq (GET_MODE_SIZE (pcum->aapcs_vfp_rmode), 8))
7748 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
7749 else
7751 rtx par;
7752 int i;
7753 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7754 for (i = 0; i < nregs; i++)
7756 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
7757 V0_REGNUM + nvrn + i);
7758 rtx offset = gen_int_mode
7759 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
7760 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
7761 XVECEXP (par, 0, i) = tmp;
7763 pcum->aapcs_reg = par;
7765 return;
7767 else
7769 /* C.3 NSRN is set to 8. */
7770 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
7771 goto on_stack;
7775 ncrn = pcum->aapcs_ncrn;
7776 nregs = size / UNITS_PER_WORD;
7778 /* C6 - C9, though the sign and zero extension semantics are
7779 handled elsewhere. This is the case where the argument fits
7780 entirely in general registers. */
7781 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
7783 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
7785 /* C.8 if the argument has an alignment of 16 then the NGRN is
7786 rounded up to the next even number. */
7787 if (nregs == 2
7788 && ncrn % 2)
7790 /* Emit a warning if the alignment changed when taking the
7791 'packed' attribute into account. */
7792 if (warn_pcs_change
7793 && abi_break_packed
7794 && ((abi_break_packed == 16 * BITS_PER_UNIT)
7795 != (alignment == 16 * BITS_PER_UNIT)))
7796 inform (input_location, "parameter passing for argument of type "
7797 "%qT changed in GCC 13.1", type);
7799 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
7800 comparison is there because for > 16 * BITS_PER_UNIT
7801 alignment nregs should be > 2, and therefore the argument
7802 should be passed by reference rather than by value. */
7803 if (alignment == 16 * BITS_PER_UNIT)
7805 if (warn_pcs_change && abi_break)
7806 inform (input_location, "parameter passing for argument of type "
7807 "%qT changed in GCC 9.1", type);
7808 ++ncrn;
7809 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
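/* A concrete (but purely illustrative) instance of rule C.8: for a call
   such as f (int a, __int128 b), "a" occupies w0 and "b" needs two
   registers with 16-byte alignment, so NGRN is rounded up from 1 to 2
   and "b" is passed in x2/x3, leaving x1 unused.  */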
7813 /* If an argument with an SVE mode needs to be shifted up to the
7814 high part of the register, treat it as though it had an integer mode.
7815 Using the normal (parallel [...]) would suppress the shifting. */
7816 if (sve_p
7817 && BYTES_BIG_ENDIAN
7818 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
7819 && aarch64_pad_reg_upward (mode, type, false))
7821 mode = int_mode_for_mode (mode).require ();
7822 sve_p = false;
7825 /* NREGS can be 0 when e.g. an empty structure is to be passed.
7826 A reg is still generated for it, but the caller should be smart
7827 enough not to use it. */
7828 if (nregs == 0
7829 || (nregs == 1 && !sve_p)
7830 || GET_MODE_CLASS (mode) == MODE_INT)
7831 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
7832 else
7834 rtx par;
7835 int i;
7837 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
7838 for (i = 0; i < nregs; i++)
7840 scalar_int_mode reg_mode = word_mode;
7841 if (nregs == 1)
7842 reg_mode = int_mode_for_mode (mode).require ();
7843 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
7844 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
7845 GEN_INT (i * UNITS_PER_WORD));
7846 XVECEXP (par, 0, i) = tmp;
7848 pcum->aapcs_reg = par;
7851 pcum->aapcs_nextncrn = ncrn + nregs;
7852 return;
7855 /* C.11 */
7856 pcum->aapcs_nextncrn = NUM_ARG_REGS;
7858 /* The argument is passed on the stack; record the number of words needed
7859 for this argument and align the total size if necessary. */
7860 on_stack:
7861 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
7863 if (warn_pcs_change
7864 && abi_break_packed
7865 && ((abi_break_packed >= 16 * BITS_PER_UNIT)
7866 != (alignment >= 16 * BITS_PER_UNIT)))
7867 inform (input_location, "parameter passing for argument of type "
7868 "%qT changed in GCC 13.1", type);
7870 if (alignment == 16 * BITS_PER_UNIT)
7872 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
7873 if (pcum->aapcs_stack_size != new_size)
7875 if (warn_pcs_change && abi_break)
7876 inform (input_location, "parameter passing for argument of type "
7877 "%qT changed in GCC 9.1", type);
7878 pcum->aapcs_stack_size = new_size;
7881 return;
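/* A sketch of the stack case just above (not ABI text): if one word of
   stack arguments has already been allocated and the next argument is a
   16-byte-aligned __int128 that no longer fits in registers, the running
   aapcs_stack_size is first rounded up to an even number of words
   (matching the AAPCS64 requirement that the NSAA be realigned for such
   types) before the argument's own two words are accounted for.  */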
7884 /* Implement TARGET_FUNCTION_ARG. */
7886 static rtx
7887 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
7889 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7890 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
7891 || pcum->pcs_variant == ARM_PCS_SIMD
7892 || pcum->pcs_variant == ARM_PCS_SVE);
7894 if (arg.end_marker_p ())
7895 return gen_int_mode (pcum->pcs_variant, DImode);
7897 aarch64_layout_arg (pcum_v, arg);
7898 return pcum->aapcs_reg;
7901 void
7902 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
7903 const_tree fntype,
7904 rtx libname ATTRIBUTE_UNUSED,
7905 const_tree fndecl ATTRIBUTE_UNUSED,
7906 unsigned n_named ATTRIBUTE_UNUSED,
7907 bool silent_p)
7909 pcum->aapcs_ncrn = 0;
7910 pcum->aapcs_nvrn = 0;
7911 pcum->aapcs_nprn = 0;
7912 pcum->aapcs_nextncrn = 0;
7913 pcum->aapcs_nextnvrn = 0;
7914 pcum->aapcs_nextnprn = 0;
7915 if (fntype)
7916 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
7917 else
7918 pcum->pcs_variant = ARM_PCS_AAPCS64;
7919 pcum->aapcs_reg = NULL_RTX;
7920 pcum->aapcs_arg_processed = false;
7921 pcum->aapcs_stack_words = 0;
7922 pcum->aapcs_stack_size = 0;
7923 pcum->silent_p = silent_p;
7925 if (!silent_p
7926 && !TARGET_FLOAT
7927 && fntype && fntype != error_mark_node)
7929 const_tree type = TREE_TYPE (fntype);
7930 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
7931 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
7932 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
7933 &mode, &nregs, NULL, false))
7934 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
7937 if (!silent_p
7938 && !TARGET_SVE
7939 && pcum->pcs_variant == ARM_PCS_SVE)
7941 /* We can't gracefully recover at this point, so make this a
7942 fatal error. */
7943 if (fndecl)
7944 fatal_error (input_location, "%qE requires the SVE ISA extension",
7945 fndecl);
7946 else
7947 fatal_error (input_location, "calls to functions of type %qT require"
7948 " the SVE ISA extension", fntype);
7952 static void
7953 aarch64_function_arg_advance (cumulative_args_t pcum_v,
7954 const function_arg_info &arg)
7956 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
7957 if (pcum->pcs_variant == ARM_PCS_AAPCS64
7958 || pcum->pcs_variant == ARM_PCS_SIMD
7959 || pcum->pcs_variant == ARM_PCS_SVE)
7961 aarch64_layout_arg (pcum_v, arg);
7962 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
7963 != (pcum->aapcs_stack_words != 0));
7964 pcum->aapcs_arg_processed = false;
7965 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
7966 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
7967 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
7968 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
7969 pcum->aapcs_stack_words = 0;
7970 pcum->aapcs_reg = NULL_RTX;
7974 bool
7975 aarch64_function_arg_regno_p (unsigned regno)
7977 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
7978 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS)
7979 || (PR_REGNUM_P (regno) && regno < P0_REGNUM + NUM_PR_ARG_REGS));
7982 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
7983 PARM_BOUNDARY bits of alignment, but will be given anything up
7984 to STACK_BOUNDARY bits if the type requires it. This makes sure
7985 that both before and after the layout of each argument, the Next
7986 Stacked Argument Address (NSAA) will have a minimum alignment of
7987 8 bytes. */
7989 static unsigned int
7990 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
7992 unsigned int abi_break;
7993 unsigned int abi_break_packed;
7994 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
7995 &abi_break,
7996 &abi_break_packed);
7997 /* We rely on aarch64_layout_arg and aarch64_gimplify_va_arg_expr
7998 to emit warnings about ABI incompatibility. */
7999 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
8000 return alignment;
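/* For instance (assuming the usual PARM_BOUNDARY of 64 and
   STACK_BOUNDARY of 128 on this target): a "char" argument is still
   given a full 8-byte slot, while a type with a 32-byte user alignment
   is only guaranteed 16-byte alignment on the stack.  */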
8003 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
8005 static fixed_size_mode
8006 aarch64_get_reg_raw_mode (int regno)
8008 if (TARGET_SVE && FP_REGNUM_P (regno))
8009 /* Don't use the SVE part of the register for __builtin_apply and
8010 __builtin_return. The SVE registers aren't used by the normal PCS,
8011 so using them there would be a waste of time. The PCS extensions
8012 for SVE types are fundamentally incompatible with the
8013 __builtin_return/__builtin_apply interface. */
8014 return as_a <fixed_size_mode> (V16QImode);
8015 if (PR_REGNUM_P (regno))
8016 /* For SVE PR regs, indicate that they should be ignored for
8017 __builtin_apply/__builtin_return. */
8018 return as_a <fixed_size_mode> (VOIDmode);
8019 return default_get_reg_raw_mode (regno);
8022 /* Implement TARGET_FUNCTION_ARG_PADDING.
8024 Small aggregate types are placed in the lowest memory address.
8026 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
8028 static pad_direction
8029 aarch64_function_arg_padding (machine_mode mode, const_tree type)
8031 /* On little-endian targets, the least significant byte of every stack
8032 argument is passed at the lowest byte address of the stack slot. */
8033 if (!BYTES_BIG_ENDIAN)
8034 return PAD_UPWARD;
8036 /* Otherwise, integral, floating-point and pointer types are padded downward:
8037 the least significant byte of a stack argument is passed at the highest
8038 byte address of the stack slot. */
8039 if (type
8040 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
8041 || POINTER_TYPE_P (type))
8042 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
8043 return PAD_DOWNWARD;
8045 /* Everything else padded upward, i.e. data in first byte of stack slot. */
8046 return PAD_UPWARD;
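/* Putting the rules above together (illustrative only): on big-endian,
   a "char" passed on the stack is padded downward and so lives in the
   highest-addressed byte of its slot, whereas a small composite such as
   "struct { char c; }" is padded upward and starts at the lowest byte
   address; on little-endian everything starts at the lowest address.  */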
8049 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
8051 It specifies the padding for the last (and possibly the only)
8052 element of a block move between registers and memory. Viewing
8053 the block as if it were in memory, padding upward means that
8054 the last element is padded after its most significant byte,
8055 while padding downward means that the last element is padded
8056 on its least significant byte side.
8058 Small aggregates and small complex types are always padded
8059 upwards.
8061 We don't need to worry about homogeneous floating-point or
8062 short-vector aggregates; their move is not affected by the
8063 padding direction determined here. Regardless of endianness,
8064 each element of such an aggregate is put in the least
8065 significant bits of a fp/simd register.
8067 Return !BYTES_BIG_ENDIAN if the least significant byte of the
8068 register has useful data, and return the opposite if the most
8069 significant byte does. */
8071 bool
8072 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
8073 bool first ATTRIBUTE_UNUSED)
8076 /* Aside from pure scalable types, small composite types are always
8077 padded upward. */
8078 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
8080 HOST_WIDE_INT size;
8081 if (type)
8082 size = int_size_in_bytes (type);
8083 else
8084 /* No frontends can create types with variable-sized modes, so we
8085 shouldn't be asked to pass or return them. */
8086 size = GET_MODE_SIZE (mode).to_constant ();
8087 if (size < 2 * UNITS_PER_WORD)
8089 pure_scalable_type_info pst_info;
8090 if (pst_info.analyze_registers (type))
8091 return false;
8092 return true;
8096 /* Otherwise, use the default padding. */
8097 return !BYTES_BIG_ENDIAN;
8100 static scalar_int_mode
8101 aarch64_libgcc_cmp_return_mode (void)
8103 return SImode;
8106 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
8108 /* We use the 12-bit shifted immediate arithmetic instructions so values
8109 must be multiple of (1 << 12), i.e. 4096. */
8110 #define ARITH_FACTOR 4096
8112 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
8113 #error Cannot use simple address calculation for stack probing
8114 #endif
8116 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
8117 inclusive. These are offsets from the current stack pointer. */
8119 static void
8120 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
8122 HOST_WIDE_INT size;
8123 if (!poly_size.is_constant (&size))
8125 sorry ("stack probes for SVE frames");
8126 return;
8129 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
8131 /* See the same assertion on PROBE_INTERVAL above. */
8132 gcc_assert ((first % ARITH_FACTOR) == 0);
8134 /* See if we have a constant small number of probes to generate. If so,
8135 that's the easy case. */
8136 if (size <= PROBE_INTERVAL)
8138 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
8140 emit_set_insn (reg1,
8141 plus_constant (Pmode,
8142 stack_pointer_rtx, -(first + base)));
8143 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
8146 /* The run-time loop is made up of 8 insns in the generic case while the
8147 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
8148 else if (size <= 4 * PROBE_INTERVAL)
8150 HOST_WIDE_INT i, rem;
8152 emit_set_insn (reg1,
8153 plus_constant (Pmode,
8154 stack_pointer_rtx,
8155 -(first + PROBE_INTERVAL)));
8156 emit_stack_probe (reg1);
8158 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
8159 it exceeds SIZE. If only two probes are needed, this will not
8160 generate any code. Then probe at FIRST + SIZE. */
8161 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
8163 emit_set_insn (reg1,
8164 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
8165 emit_stack_probe (reg1);
8168 rem = size - (i - PROBE_INTERVAL);
8169 if (rem > 256)
8171 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8173 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
8174 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
8176 else
8177 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
8180 /* Otherwise, do the same as above, but in a loop. Note that we must be
8181 extra careful with variables wrapping around because we might be at
8182 the very top (or the very bottom) of the address space and we have
8183 to be able to handle this case properly; in particular, we use an
8184 equality test for the loop condition. */
8185 else
8187 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
8189 /* Step 1: round SIZE to the previous multiple of the interval. */
8191 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
8194 /* Step 2: compute initial and final value of the loop counter. */
8196 /* TEST_ADDR = SP + FIRST. */
8197 emit_set_insn (reg1,
8198 plus_constant (Pmode, stack_pointer_rtx, -first));
8200 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
8201 HOST_WIDE_INT adjustment = - (first + rounded_size);
8202 if (! aarch64_uimm12_shift (adjustment))
8204 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
8205 true, Pmode);
8206 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
8208 else
8209 emit_set_insn (reg2,
8210 plus_constant (Pmode, stack_pointer_rtx, adjustment));
8212 /* Step 3: the loop
8216 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
8217 probe at TEST_ADDR
8219 while (TEST_ADDR != LAST_ADDR)
8221 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
8222 until it is equal to ROUNDED_SIZE. */
8224 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
8227 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
8228 that SIZE is equal to ROUNDED_SIZE. */
8230 if (size != rounded_size)
8232 HOST_WIDE_INT rem = size - rounded_size;
8234 if (rem > 256)
8236 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
8238 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
8239 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
8241 else
8242 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
8246 /* Make sure nothing is scheduled before we are done. */
8247 emit_insn (gen_blockage ());
8250 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
8251 absolute addresses. */
8253 const char *
8254 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
8256 static int labelno = 0;
8257 char loop_lab[32];
8258 rtx xops[2];
8260 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
8262 /* Loop. */
8263 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
8265 HOST_WIDE_INT stack_clash_probe_interval
8266 = 1 << param_stack_clash_protection_guard_size;
8268 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
8269 xops[0] = reg1;
8270 HOST_WIDE_INT interval;
8271 if (flag_stack_clash_protection)
8272 interval = stack_clash_probe_interval;
8273 else
8274 interval = PROBE_INTERVAL;
8276 gcc_assert (aarch64_uimm12_shift (interval));
8277 xops[1] = GEN_INT (interval);
8279 output_asm_insn ("sub\t%0, %0, %1", xops);
8281 /* If doing stack clash protection then we probe up by the ABI specified
8282 amount. We do this because we're dropping full pages at a time in the
8283 loop. But if we're doing non-stack clash probing, probe at SP 0. */
8284 if (flag_stack_clash_protection)
8285 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
8286 else
8287 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
8289 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
8290 by this amount for each iteration. */
8291 output_asm_insn ("str\txzr, [%0, %1]", xops);
8293 /* Test if TEST_ADDR == LAST_ADDR. */
8294 xops[1] = reg2;
8295 output_asm_insn ("cmp\t%0, %1", xops);
8297 /* Branch. */
8298 fputs ("\tb.ne\t", asm_out_file);
8299 assemble_name_raw (asm_out_file, loop_lab);
8300 fputc ('\n', asm_out_file);
8302 return "";
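/* With the default 4096-byte PROBE_INTERVAL and no stack-clash
   protection, the loop printed above looks roughly like the following
   (x9/x10 are shown purely for illustration; the actual registers are
   whatever REG1 and REG2 hold):

   .LPSRL0:
	sub	x9, x9, #4096
	str	xzr, [x9, 0]
	cmp	x9, x10
	b.ne	.LPSRL0
*/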
8305 /* Emit the probe loop for doing stack clash probes and stack adjustments for
8306 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
8307 of GUARD_SIZE. When a probe is emitted it is done at most
8308 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
8309 at most MIN_PROBE_THRESHOLD. By the end of this function
8310 BASE = BASE - ADJUSTMENT. */
8312 const char *
8313 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
8314 rtx min_probe_threshold, rtx guard_size)
8316 /* This function is not allowed to use any instruction generation function
8317 like gen_ and friends. If you do you'll likely ICE during CFG validation,
8318 so instead emit the code you want using output_asm_insn. */
8319 gcc_assert (flag_stack_clash_protection);
8320 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
8321 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
8323 /* The minimum required allocation before the residual requires probing. */
8324 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
8326 /* Clamp the value down to the nearest value that can be used with a cmp. */
8327 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
8328 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
8330 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
8331 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
8333 static int labelno = 0;
8334 char loop_start_lab[32];
8335 char loop_end_lab[32];
8336 rtx xops[2];
8338 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
8339 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
8341 /* Emit loop start label. */
8342 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
8344 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
8345 xops[0] = adjustment;
8346 xops[1] = probe_offset_value_rtx;
8347 output_asm_insn ("cmp\t%0, %1", xops);
8349 /* Branch to end if not enough adjustment to probe. */
8350 fputs ("\tb.lt\t", asm_out_file);
8351 assemble_name_raw (asm_out_file, loop_end_lab);
8352 fputc ('\n', asm_out_file);
8354 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
8355 xops[0] = base;
8356 xops[1] = probe_offset_value_rtx;
8357 output_asm_insn ("sub\t%0, %0, %1", xops);
8359 /* Probe at BASE. */
8360 xops[1] = const0_rtx;
8361 output_asm_insn ("str\txzr, [%0, %1]", xops);
8363 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
8364 xops[0] = adjustment;
8365 xops[1] = probe_offset_value_rtx;
8366 output_asm_insn ("sub\t%0, %0, %1", xops);
8368 /* Branch to start if still more bytes to allocate. */
8369 fputs ("\tb\t", asm_out_file);
8370 assemble_name_raw (asm_out_file, loop_start_lab);
8371 fputc ('\n', asm_out_file);
8374 /* No probe needed; leave the loop. */
8374 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
8376 /* BASE = BASE - ADJUSTMENT. */
8377 xops[0] = base;
8378 xops[1] = adjustment;
8379 output_asm_insn ("sub\t%0, %0, %1", xops);
8380 return "";
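/* The printed sequence is roughly the following, with BASE and
   ADJUSTMENT standing for whichever registers the caller supplied and
   GUARD for the clamped residual_probe_guard value:

   .SVLPSPL0:
	cmp	ADJUSTMENT, GUARD
	b.lt	.SVLPEND0
	sub	BASE, BASE, GUARD
	str	xzr, [BASE, 0]
	sub	ADJUSTMENT, ADJUSTMENT, GUARD
	b	.SVLPSPL0
   .SVLPEND0:
	sub	BASE, BASE, ADJUSTMENT
*/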
8383 /* Determine whether a frame chain needs to be generated. */
8384 static bool
8385 aarch64_needs_frame_chain (void)
8387 /* Force a frame chain for EH returns so the return address is at FP+8. */
8388 if (frame_pointer_needed || crtl->calls_eh_return)
8389 return true;
8391 /* A leaf function cannot have calls or write LR. */
8392 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
8394 /* Don't use a frame chain in leaf functions if leaf frame pointers
8395 are disabled. */
8396 if (flag_omit_leaf_frame_pointer && is_leaf)
8397 return false;
8399 return aarch64_use_frame_pointer;
8402 /* Mark the registers that need to be saved by the callee and calculate
8403 the size of the callee-saved registers area and frame record (both FP
8404 and LR may be omitted). */
8405 static void
8406 aarch64_layout_frame (void)
8408 poly_int64 offset = 0;
8409 int regno, last_fp_reg = INVALID_REGNUM;
8410 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
8411 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
8412 bool frame_related_fp_reg_p = false;
8413 aarch64_frame &frame = cfun->machine->frame;
8415 frame.emit_frame_chain = aarch64_needs_frame_chain ();
8417 /* Adjust the outgoing arguments size if required. Keep it in sync with what
8418 the mid-end is doing. */
8419 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
8421 #define SLOT_NOT_REQUIRED (-2)
8422 #define SLOT_REQUIRED (-1)
8424 frame.wb_push_candidate1 = INVALID_REGNUM;
8425 frame.wb_push_candidate2 = INVALID_REGNUM;
8426 frame.spare_pred_reg = INVALID_REGNUM;
8428 /* First mark all the registers that really need to be saved... */
8429 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8430 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
8432 /* ... that includes the eh data registers (if needed)... */
8433 if (crtl->calls_eh_return)
8434 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
8435 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
8437 /* ... and any callee saved register that dataflow says is live. */
8438 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8439 if (df_regs_ever_live_p (regno)
8440 && !fixed_regs[regno]
8441 && (regno == R30_REGNUM
8442 || !crtl->abi->clobbers_full_reg_p (regno)))
8443 frame.reg_offset[regno] = SLOT_REQUIRED;
8445 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8446 if (df_regs_ever_live_p (regno)
8447 && !fixed_regs[regno]
8448 && !crtl->abi->clobbers_full_reg_p (regno))
8450 frame.reg_offset[regno] = SLOT_REQUIRED;
8451 last_fp_reg = regno;
8452 if (aarch64_emit_cfi_for_reg_p (regno))
8453 frame_related_fp_reg_p = true;
8456 /* Big-endian SVE frames need a spare predicate register in order
8457 to save Z8-Z15. Decide which register they should use. Prefer
8458 an unused argument register if possible, so that we don't force P4
8459 to be saved unnecessarily. */
8460 if (frame_related_fp_reg_p
8461 && crtl->abi->id () == ARM_PCS_SVE
8462 && BYTES_BIG_ENDIAN)
8464 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
8465 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
8466 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
8467 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
8468 break;
8469 gcc_assert (regno <= P7_REGNUM);
8470 frame.spare_pred_reg = regno;
8471 df_set_regs_ever_live (regno, true);
8474 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8475 if (df_regs_ever_live_p (regno)
8476 && !fixed_regs[regno]
8477 && !crtl->abi->clobbers_full_reg_p (regno))
8478 frame.reg_offset[regno] = SLOT_REQUIRED;
8480 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
8481 LR counts as an implicit probe which allows us to maintain the invariant
8482 described in the comment at expand_prologue. */
8483 gcc_assert (crtl->is_leaf
8484 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
8486 /* Now assign stack slots for the registers. Start with the predicate
8487 registers, since predicate LDR and STR have a relatively small
8488 offset range. These saves happen below the hard frame pointer. */
8489 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
8490 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8492 frame.reg_offset[regno] = offset;
8493 offset += BYTES_PER_SVE_PRED;
8496 if (maybe_ne (offset, 0))
8498 /* If we have any vector registers to save above the predicate registers,
8499 the offset of the vector register save slots need to be a multiple
8500 of the vector size. This lets us use the immediate forms of LDR/STR
8501 (or LD1/ST1 for big-endian).
8503 A vector register is 8 times the size of a predicate register,
8504 and we need to save a maximum of 12 predicate registers, so the
8505 first vector register will be at either #1, MUL VL or #2, MUL VL.
8507 If we don't have any vector registers to save, and we know how
8508 big the predicate save area is, we can just round it up to the
8509 next 16-byte boundary. */
8510 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
8511 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8512 else
8514 if (known_le (offset, vector_save_size))
8515 offset = vector_save_size;
8516 else if (known_le (offset, vector_save_size * 2))
8517 offset = vector_save_size * 2;
8518 else
8519 gcc_unreachable ();
8523 /* If we need to save any SVE vector registers, add them next. */
8524 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
8525 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8526 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8528 frame.reg_offset[regno] = offset;
8529 offset += vector_save_size;
8532 /* OFFSET is now the offset of the hard frame pointer from the bottom
8533 of the callee save area. */
8534 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
8535 frame.below_hard_fp_saved_regs_size = offset;
8536 if (frame.emit_frame_chain)
8538 /* FP and LR are placed in the linkage record. */
8539 frame.reg_offset[R29_REGNUM] = offset;
8540 frame.wb_push_candidate1 = R29_REGNUM;
8541 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
8542 frame.wb_push_candidate2 = R30_REGNUM;
8543 offset += 2 * UNITS_PER_WORD;
8546 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
8547 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8549 frame.reg_offset[regno] = offset;
8550 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8551 frame.wb_push_candidate1 = regno;
8552 else if (frame.wb_push_candidate2 == INVALID_REGNUM)
8553 frame.wb_push_candidate2 = regno;
8554 offset += UNITS_PER_WORD;
8557 poly_int64 max_int_offset = offset;
8558 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8559 bool has_align_gap = maybe_ne (offset, max_int_offset);
8561 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
8562 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
8564 /* If there is an alignment gap between integer and fp callee-saves,
8565 allocate the last fp register to it if possible. */
8566 if (regno == last_fp_reg
8567 && has_align_gap
8568 && known_eq (vector_save_size, 8)
8569 && multiple_p (offset, 16))
8571 frame.reg_offset[regno] = max_int_offset;
8572 break;
8575 frame.reg_offset[regno] = offset;
8576 if (frame.wb_push_candidate1 == INVALID_REGNUM)
8577 frame.wb_push_candidate1 = regno;
8578 else if (frame.wb_push_candidate2 == INVALID_REGNUM
8579 && frame.wb_push_candidate1 >= V0_REGNUM)
8580 frame.wb_push_candidate2 = regno;
8581 offset += vector_save_size;
8584 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
8586 frame.saved_regs_size = offset;
8588 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
8590 poly_int64 above_outgoing_args
8591 = aligned_upper_bound (varargs_and_saved_regs_size
8592 + get_frame_size (),
8593 STACK_BOUNDARY / BITS_PER_UNIT);
8595 frame.hard_fp_offset
8596 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
8598 /* Both these values are already aligned. */
8599 gcc_assert (multiple_p (crtl->outgoing_args_size,
8600 STACK_BOUNDARY / BITS_PER_UNIT));
8601 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
8603 frame.locals_offset = frame.saved_varargs_size;
8605 frame.initial_adjust = 0;
8606 frame.final_adjust = 0;
8607 frame.callee_adjust = 0;
8608 frame.sve_callee_adjust = 0;
8609 frame.callee_offset = 0;
8611 frame.wb_pop_candidate1 = frame.wb_push_candidate1;
8612 frame.wb_pop_candidate2 = frame.wb_push_candidate2;
8614 /* The shadow call stack is only used in functions that push the LR
8615 onto the stack and that do not opt out via the "no_sanitize"
8616 attribute with the argument "shadow-call-stack". */
8617 frame.is_scs_enabled
8618 = (!crtl->calls_eh_return
8619 && sanitize_flags_p (SANITIZE_SHADOW_CALL_STACK)
8620 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0));
8622 /* When shadow call stack is enabled, the scs_pop in the epilogue will
8623 restore x30, and we don't need to pop x30 again in the traditional
8624 way. Pop candidates record the registers that need to be popped
8625 eventually. */
8626 if (frame.is_scs_enabled)
8628 if (frame.wb_pop_candidate2 == R30_REGNUM)
8629 frame.wb_pop_candidate2 = INVALID_REGNUM;
8630 else if (frame.wb_pop_candidate1 == R30_REGNUM)
8631 frame.wb_pop_candidate1 = INVALID_REGNUM;
8634 /* If candidate2 is INVALID_REGNUM, we need to adjust max_push_offset to
8635 256 to ensure that the offset meets the requirements of emit_move_insn.
8636 Similarly, if candidate1 is INVALID_REGNUM, we need to set
8637 max_push_offset to 0, because no registers are popped at this time,
8638 so callee_adjust cannot be adjusted. */
8639 HOST_WIDE_INT max_push_offset = 0;
8640 if (frame.wb_pop_candidate2 != INVALID_REGNUM)
8641 max_push_offset = 512;
8642 else if (frame.wb_pop_candidate1 != INVALID_REGNUM)
8643 max_push_offset = 256;
8645 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
8646 HOST_WIDE_INT const_saved_regs_size;
8647 if (frame.frame_size.is_constant (&const_size)
8648 && const_size < max_push_offset
8649 && known_eq (frame.hard_fp_offset, const_size))
8651 /* Simple, small frame with no outgoing arguments:
8653 stp reg1, reg2, [sp, -frame_size]!
8654 stp reg3, reg4, [sp, 16] */
8655 frame.callee_adjust = const_size;
8657 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
8658 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
8659 && const_outgoing_args_size + const_saved_regs_size < 512
8660 /* We could handle this case even with outgoing args, provided
8661 that the number of args left us with valid offsets for all
8662 predicate and vector save slots. It's such a rare case that
8663 it hardly seems worth the effort though. */
8664 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
8665 && !(cfun->calls_alloca
8666 && frame.hard_fp_offset.is_constant (&const_fp_offset)
8667 && const_fp_offset < max_push_offset))
8669 /* Frame with small outgoing arguments:
8671 sub sp, sp, frame_size
8672 stp reg1, reg2, [sp, outgoing_args_size]
8673 stp reg3, reg4, [sp, outgoing_args_size + 16] */
8674 frame.initial_adjust = frame.frame_size;
8675 frame.callee_offset = const_outgoing_args_size;
8677 else if (saves_below_hard_fp_p
8678 && known_eq (frame.saved_regs_size,
8679 frame.below_hard_fp_saved_regs_size))
8681 /* Frame in which all saves are SVE saves:
8683 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
8684 save SVE registers relative to SP
8685 sub sp, sp, outgoing_args_size */
8686 frame.initial_adjust = (frame.hard_fp_offset
8687 + frame.below_hard_fp_saved_regs_size);
8688 frame.final_adjust = crtl->outgoing_args_size;
8690 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
8691 && const_fp_offset < max_push_offset)
8693 /* Frame with large outgoing arguments or SVE saves, but with
8694 a small local area:
8696 stp reg1, reg2, [sp, -hard_fp_offset]!
8697 stp reg3, reg4, [sp, 16]
8698 [sub sp, sp, below_hard_fp_saved_regs_size]
8699 [save SVE registers relative to SP]
8700 sub sp, sp, outgoing_args_size */
8701 frame.callee_adjust = const_fp_offset;
8702 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8703 frame.final_adjust = crtl->outgoing_args_size;
8705 else
8707 /* Frame with large local area and outgoing arguments or SVE saves,
8708 using frame pointer:
8710 sub sp, sp, hard_fp_offset
8711 stp x29, x30, [sp, 0]
8712 add x29, sp, 0
8713 stp reg3, reg4, [sp, 16]
8714 [sub sp, sp, below_hard_fp_saved_regs_size]
8715 [save SVE registers relative to SP]
8716 sub sp, sp, outgoing_args_size */
8717 frame.initial_adjust = frame.hard_fp_offset;
8718 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
8719 frame.final_adjust = crtl->outgoing_args_size;
8722 /* Make sure the individual adjustments add up to the full frame size. */
8723 gcc_assert (known_eq (frame.initial_adjust
8724 + frame.callee_adjust
8725 + frame.sve_callee_adjust
8726 + frame.final_adjust, frame.frame_size));
8728 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
8730 /* We've decided not to associate any register saves with the initial
8731 stack allocation. */
8732 frame.wb_pop_candidate1 = frame.wb_push_candidate1 = INVALID_REGNUM;
8733 frame.wb_pop_candidate2 = frame.wb_push_candidate2 = INVALID_REGNUM;
8736 frame.laid_out = true;
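/* A deliberately simple worked example of the classification above:
   a non-leaf function that needs a frame chain, has 16 bytes of locals,
   no SVE state and no outgoing arguments saves only x29/x30, so
   frame_size and hard_fp_offset are both 32 and the first ("simple,
   small frame") case applies, giving a prologue of essentially

	stp	x29, x30, [sp, -32]!

   plus the usual frame-chain setup.  */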
8739 /* Return true if the register REGNO is saved on entry to
8740 the current function. */
8742 static bool
8743 aarch64_register_saved_on_entry (int regno)
8745 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
8748 /* Return the next register, from REGNO up to LIMIT, that the callee
8749 needs to save. */
8751 static unsigned
8752 aarch64_next_callee_save (unsigned regno, unsigned limit)
8754 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
8755 regno ++;
8756 return regno;
8759 /* Push the register number REGNO of mode MODE to the stack with write-back
8760 adjusting the stack by ADJUSTMENT. */
8762 static void
8763 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
8764 HOST_WIDE_INT adjustment)
8766 rtx base_rtx = stack_pointer_rtx;
8767 rtx insn, reg, mem;
8769 reg = gen_rtx_REG (mode, regno);
8770 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
8771 plus_constant (Pmode, base_rtx, -adjustment));
8772 mem = gen_frame_mem (mode, mem);
8774 insn = emit_move_insn (mem, reg);
8775 RTX_FRAME_RELATED_P (insn) = 1;
8778 /* Generate and return an instruction to store the pair of registers
8779 REG and REG2 of mode MODE to location BASE with write-back adjusting
8780 the stack location BASE by ADJUSTMENT. */
8782 static rtx
8783 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8784 HOST_WIDE_INT adjustment)
8786 switch (mode)
8788 case E_DImode:
8789 return gen_storewb_pairdi_di (base, base, reg, reg2,
8790 GEN_INT (-adjustment),
8791 GEN_INT (UNITS_PER_WORD - adjustment));
8792 case E_DFmode:
8793 return gen_storewb_pairdf_di (base, base, reg, reg2,
8794 GEN_INT (-adjustment),
8795 GEN_INT (UNITS_PER_WORD - adjustment));
8796 case E_TFmode:
8797 return gen_storewb_pairtf_di (base, base, reg, reg2,
8798 GEN_INT (-adjustment),
8799 GEN_INT (UNITS_PER_VREG - adjustment));
8800 default:
8801 gcc_unreachable ();
8805 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
8806 stack pointer by ADJUSTMENT. */
8808 static void
8809 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
8811 rtx_insn *insn;
8812 machine_mode mode = aarch64_reg_save_mode (regno1);
8814 if (regno2 == INVALID_REGNUM)
8815 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
8817 rtx reg1 = gen_rtx_REG (mode, regno1);
8818 rtx reg2 = gen_rtx_REG (mode, regno2);
8820 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
8821 reg2, adjustment));
8822 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
8823 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
8824 RTX_FRAME_RELATED_P (insn) = 1;
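/* For example, pushing x29 and x30 with an adjustment of 32 produces a
   single write-back store pair, conceptually

	stp	x29, x30, [sp, -32]!

   with the constituent stores and the adjustment marked frame-related.  */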
8827 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
8828 adjusting it by ADJUSTMENT afterwards. */
8830 static rtx
8831 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
8832 HOST_WIDE_INT adjustment)
8834 switch (mode)
8836 case E_DImode:
8837 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
8838 GEN_INT (UNITS_PER_WORD));
8839 case E_DFmode:
8840 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
8841 GEN_INT (UNITS_PER_WORD));
8842 case E_TFmode:
8843 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
8844 GEN_INT (UNITS_PER_VREG));
8845 default:
8846 gcc_unreachable ();
8850 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
8851 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
8852 into CFI_OPS. */
8854 static void
8855 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
8856 rtx *cfi_ops)
8858 machine_mode mode = aarch64_reg_save_mode (regno1);
8859 rtx reg1 = gen_rtx_REG (mode, regno1);
8861 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
8863 if (regno2 == INVALID_REGNUM)
8865 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
8866 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
8867 emit_move_insn (reg1, gen_frame_mem (mode, mem));
8869 else
8871 rtx reg2 = gen_rtx_REG (mode, regno2);
8872 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
8873 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
8874 reg2, adjustment));
8878 /* Generate and return a store pair instruction of mode MODE to store
8879 register REG1 to MEM1 and register REG2 to MEM2. */
8881 static rtx
8882 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
8883 rtx reg2)
8885 switch (mode)
8887 case E_DImode:
8888 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
8890 case E_DFmode:
8891 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
8893 case E_TFmode:
8894 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
8896 case E_V4SImode:
8897 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
8899 case E_V16QImode:
8900 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
8902 default:
8903 gcc_unreachable ();
8907 /* Generate and return a load pair instruction of mode MODE to load register
8908 REG1 from MEM1 and register REG2 from MEM2. */
8910 static rtx
8911 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
8912 rtx mem2)
8914 switch (mode)
8916 case E_DImode:
8917 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
8919 case E_DFmode:
8920 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
8922 case E_TFmode:
8923 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
8925 case E_V4SImode:
8926 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
8928 default:
8929 gcc_unreachable ();
8933 /* Return TRUE if return address signing should be enabled for the current
8934 function, otherwise return FALSE. */
8936 bool
8937 aarch64_return_address_signing_enabled (void)
8939 /* This function should only be called after the frame is laid out. */
8940 gcc_assert (cfun->machine->frame.laid_out);
8942 /* Turn return address signing off in any function that uses
8943 __builtin_eh_return. The address passed to __builtin_eh_return
8944 is not signed so either it has to be signed (with original sp)
8945 or the code path that uses it has to avoid authenticating it.
8946 Currently eh return introduces a return-to-anywhere gadget, no
8947 matter what we do here, since it uses ret with a user-provided
8948 address. An ideal fix for that is to use an indirect branch,
8949 which can be protected with BTI j (to some extent). */
8950 if (crtl->calls_eh_return)
8951 return false;
8953 /* If signing scope is AARCH_FUNCTION_NON_LEAF, we only sign a leaf function
8954 if its LR is pushed onto stack. */
8955 return (aarch_ra_sign_scope == AARCH_FUNCTION_ALL
8956 || (aarch_ra_sign_scope == AARCH_FUNCTION_NON_LEAF
8957 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
8960 /* Only used by the arm backend. */
8961 void aarch_bti_arch_check (void)
8964 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
8965 bool
8966 aarch_bti_enabled (void)
8968 return (aarch_enable_bti == 1);
8971 /* Check if INSN is a BTI J insn. */
8972 bool
8973 aarch_bti_j_insn_p (rtx_insn *insn)
8975 if (!insn || !INSN_P (insn))
8976 return false;
8978 rtx pat = PATTERN (insn);
8979 return GET_CODE (pat) == UNSPEC_VOLATILE && XINT (pat, 1) == UNSPECV_BTI_J;
8982 /* Check if X (or any sub-rtx of X) is a PACIASP/PACIBSP instruction. */
8983 bool
8984 aarch_pac_insn_p (rtx x)
8986 if (!INSN_P (x))
8987 return false;
8989 subrtx_var_iterator::array_type array;
8990 FOR_EACH_SUBRTX_VAR (iter, array, PATTERN (x), ALL)
8992 rtx sub = *iter;
8993 if (sub && GET_CODE (sub) == UNSPEC)
8995 int unspec_val = XINT (sub, 1);
8996 switch (unspec_val)
8998 case UNSPEC_PACIASP:
8999 case UNSPEC_PACIBSP:
9000 return true;
9002 default:
9003 return false;
9005 iter.skip_subrtxes ();
9008 return false;
9011 rtx aarch_gen_bti_c (void)
9013 return gen_bti_c ();
9016 rtx aarch_gen_bti_j (void)
9018 return gen_bti_j ();
9021 /* The caller is going to use ST1D or LD1D to save or restore an SVE
9022 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
9023 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
9025 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
9026 or LD1D address
9028 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
9029 if the variable isn't already nonnull
9031 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
9032 Handle this case using a temporary base register that is suitable for
9033 all offsets in that range. Use ANCHOR_REG as this base register if it
9034 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
9036 static inline void
9037 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
9038 rtx &anchor_reg, poly_int64 &offset,
9039 rtx &ptrue)
9041 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
9043 /* This is the maximum valid offset of the anchor from the base.
9044 Lower values would be valid too. */
9045 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
9046 if (!anchor_reg)
9048 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9049 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9050 gen_int_mode (anchor_offset, Pmode)));
9052 base_rtx = anchor_reg;
9053 offset -= anchor_offset;
9055 if (!ptrue)
9057 int pred_reg = cfun->machine->frame.spare_pred_reg;
9058 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
9059 CONSTM1_RTX (VNx16BImode));
9060 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
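/* Worked example (illustrative): if OFFSET is 10 * GET_MODE_SIZE (MODE),
   it is outside the [-8, 7] * GET_MODE_SIZE (MODE) range that ST1D/LD1D
   can encode directly.  The code above then sets ANCHOR_REG to
   BASE_RTX + 16 * GET_MODE_SIZE (MODE) and rewrites the access as
   ANCHOR_REG - 6 * GET_MODE_SIZE (MODE), which is in range.  */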
9064 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
9065 is saved at BASE + OFFSET. */
9067 static void
9068 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
9069 rtx base, poly_int64 offset)
9071 rtx mem = gen_frame_mem (GET_MODE (reg),
9072 plus_constant (Pmode, base, offset));
9073 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
9076 /* Emit code to save the callee-saved registers from register number START
9077 to LIMIT to the stack at the location starting at offset START_OFFSET,
9078 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
9079 is true if the hard frame pointer has been set up. */
9081 static void
9082 aarch64_save_callee_saves (poly_int64 start_offset,
9083 unsigned start, unsigned limit, bool skip_wb,
9084 bool hard_fp_valid_p)
9086 rtx_insn *insn;
9087 unsigned regno;
9088 unsigned regno2;
9089 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
9091 for (regno = aarch64_next_callee_save (start, limit);
9092 regno <= limit;
9093 regno = aarch64_next_callee_save (regno + 1, limit))
9095 rtx reg, mem;
9096 poly_int64 offset;
9097 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9099 if (skip_wb
9100 && (regno == cfun->machine->frame.wb_push_candidate1
9101 || regno == cfun->machine->frame.wb_push_candidate2))
9102 continue;
9104 if (cfun->machine->reg_is_wrapped_separately[regno])
9105 continue;
9107 machine_mode mode = aarch64_reg_save_mode (regno);
9108 reg = gen_rtx_REG (mode, regno);
9109 offset = start_offset + cfun->machine->frame.reg_offset[regno];
9110 rtx base_rtx = stack_pointer_rtx;
9111 poly_int64 sp_offset = offset;
9113 HOST_WIDE_INT const_offset;
9114 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9115 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9116 offset, ptrue);
9117 else if (GP_REGNUM_P (regno)
9118 && (!offset.is_constant (&const_offset) || const_offset >= 512))
9120 gcc_assert (known_eq (start_offset, 0));
9121 poly_int64 fp_offset
9122 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9123 if (hard_fp_valid_p)
9124 base_rtx = hard_frame_pointer_rtx;
9125 else
9127 if (!anchor_reg)
9129 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9130 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
9131 gen_int_mode (fp_offset, Pmode)));
9133 base_rtx = anchor_reg;
9135 offset -= fp_offset;
9137 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9138 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
9140 if (!aarch64_sve_mode_p (mode)
9141 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9142 && !cfun->machine->reg_is_wrapped_separately[regno2]
9143 && known_eq (GET_MODE_SIZE (mode),
9144 cfun->machine->frame.reg_offset[regno2]
9145 - cfun->machine->frame.reg_offset[regno]))
9147 rtx reg2 = gen_rtx_REG (mode, regno2);
9148 rtx mem2;
9150 offset += GET_MODE_SIZE (mode);
9151 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9152 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
9153 reg2));
9155 /* The first part of a frame-related parallel insn is
9156 always assumed to be relevant to the frame
9157 calculations; subsequent parts are only
9158 frame-related if explicitly marked. */
9159 if (aarch64_emit_cfi_for_reg_p (regno2))
9161 if (need_cfa_note_p)
9162 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
9163 sp_offset + GET_MODE_SIZE (mode));
9164 else
9165 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
9168 regno = regno2;
9170 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9172 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
9173 need_cfa_note_p = true;
9175 else if (aarch64_sve_mode_p (mode))
9176 insn = emit_insn (gen_rtx_SET (mem, reg));
9177 else
9178 insn = emit_move_insn (mem, reg);
9180 RTX_FRAME_RELATED_P (insn) = frame_related_p;
9181 if (frame_related_p && need_cfa_note_p)
9182 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
9186 /* Emit code to restore the callee registers from register number START
9187 up to and including LIMIT. Restore from the stack offset START_OFFSET,
9188 skipping any write-back candidates if SKIP_WB is true. Write the
9189 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
9191 static void
9192 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
9193 unsigned limit, bool skip_wb, rtx *cfi_ops)
9195 unsigned regno;
9196 unsigned regno2;
9197 poly_int64 offset;
9198 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
9200 for (regno = aarch64_next_callee_save (start, limit);
9201 regno <= limit;
9202 regno = aarch64_next_callee_save (regno + 1, limit))
9204 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9205 if (cfun->machine->reg_is_wrapped_separately[regno])
9206 continue;
9208 rtx reg, mem;
9210 if (skip_wb
9211 && (regno == cfun->machine->frame.wb_pop_candidate1
9212 || regno == cfun->machine->frame.wb_pop_candidate2))
9213 continue;
9215 machine_mode mode = aarch64_reg_save_mode (regno);
9216 reg = gen_rtx_REG (mode, regno);
9217 offset = start_offset + cfun->machine->frame.reg_offset[regno];
9218 rtx base_rtx = stack_pointer_rtx;
9219 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9220 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
9221 offset, ptrue);
9222 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9224 if (!aarch64_sve_mode_p (mode)
9225 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
9226 && !cfun->machine->reg_is_wrapped_separately[regno2]
9227 && known_eq (GET_MODE_SIZE (mode),
9228 cfun->machine->frame.reg_offset[regno2]
9229 - cfun->machine->frame.reg_offset[regno]))
9231 rtx reg2 = gen_rtx_REG (mode, regno2);
9232 rtx mem2;
9234 offset += GET_MODE_SIZE (mode);
9235 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
9236 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9238 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
9239 regno = regno2;
9241 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9242 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
9243 else if (aarch64_sve_mode_p (mode))
9244 emit_insn (gen_rtx_SET (reg, mem));
9245 else
9246 emit_move_insn (reg, mem);
9247 if (frame_related_p)
9248 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
9252 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
9253 of MODE. */
9255 static inline bool
9256 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9258 HOST_WIDE_INT multiple;
9259 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9260 && IN_RANGE (multiple, -8, 7));
9263 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
9264 of MODE. */
9266 static inline bool
9267 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9269 HOST_WIDE_INT multiple;
9270 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9271 && IN_RANGE (multiple, -32, 31));
9274 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
9275 of MODE. */
9277 static inline bool
9278 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9280 HOST_WIDE_INT multiple;
9281 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9282 && IN_RANGE (multiple, 0, 63));
9285 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
9286 of MODE. */
9288 bool
9289 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9291 HOST_WIDE_INT multiple;
9292 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9293 && IN_RANGE (multiple, -64, 63));
9296 /* Return true if OFFSET is a signed 9-bit value. */
9298 bool
9299 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
9300 poly_int64 offset)
9302 HOST_WIDE_INT const_offset;
9303 return (offset.is_constant (&const_offset)
9304 && IN_RANGE (const_offset, -256, 255));
9307 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
9308 of MODE. */
9310 static inline bool
9311 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
9313 HOST_WIDE_INT multiple;
9314 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9315 && IN_RANGE (multiple, -256, 255));
9318 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
9319 of MODE. */
9321 static inline bool
9322 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
9324 HOST_WIDE_INT multiple;
9325 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
9326 && IN_RANGE (multiple, 0, 4095));
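/* Worked example (for illustration, assuming 8-byte DImode): the 7-bit
   signed scaled check above accepts multiples of 8 in [-64*8, 63*8]
   = [-512, 504], matching the LDP/STP immediate range; the 12-bit
   unsigned scaled check accepts multiples of 8 in [0, 4095*8]
   = [0, 32760], matching LDR/STR with an unsigned scaled offset; and
   the 9-bit unscaled check accepts any byte offset in [-256, 255],
   matching LDUR/STUR.  */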
9329 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
9331 static sbitmap
9332 aarch64_get_separate_components (void)
9334 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9335 bitmap_clear (components);
9337 /* The registers we need saved to the frame. */
9338 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9339 if (aarch64_register_saved_on_entry (regno))
9341 /* Punt on saves and restores that use ST1D and LD1D. We could
9342 try to be smarter, but it would involve making sure that the
9343 spare predicate register itself is safe to use at the save
9344 and restore points. Also, when a frame pointer is being used,
9345 the slots are often out of reach of ST1D and LD1D anyway. */
9346 machine_mode mode = aarch64_reg_save_mode (regno);
9347 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
9348 continue;
9350 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9352 /* If the register is saved in the first SVE save slot, we use
9353 it as a stack probe for -fstack-clash-protection. */
9354 if (flag_stack_clash_protection
9355 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
9356 && known_eq (offset, 0))
9357 continue;
9359 /* Get the offset relative to the register we'll use. */
9360 if (frame_pointer_needed)
9361 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9362 else
9363 offset += crtl->outgoing_args_size;
9365 /* Check that we can access the stack slot of the register with one
9366 direct load with no adjustments needed. */
9367 if (aarch64_sve_mode_p (mode)
9368 ? offset_9bit_signed_scaled_p (mode, offset)
9369 : offset_12bit_unsigned_scaled_p (mode, offset))
9370 bitmap_set_bit (components, regno);
9373 /* Don't mess with the hard frame pointer. */
9374 if (frame_pointer_needed)
9375 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
9377 /* If the spare predicate register used by big-endian SVE code
9378 is call-preserved, it must be saved in the main prologue
9379 before any saves that use it. */
9380 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
9381 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
9383 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9384 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9385 /* If registers have been chosen to be stored/restored with
9386 writeback, don't interfere with them, to avoid having to output explicit
9387 stack adjustment instructions. */
9388 if (reg2 != INVALID_REGNUM)
9389 bitmap_clear_bit (components, reg2);
9390 if (reg1 != INVALID_REGNUM)
9391 bitmap_clear_bit (components, reg1);
9393 bitmap_clear_bit (components, LR_REGNUM);
9394 bitmap_clear_bit (components, SP_REGNUM);
9396 return components;
9399 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
9401 static sbitmap
9402 aarch64_components_for_bb (basic_block bb)
9404 bitmap in = DF_LIVE_IN (bb);
9405 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
9406 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
9408 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
9409 bitmap_clear (components);
9411 /* Clobbered registers don't generate values in any meaningful sense,
9412 since nothing after the clobber can rely on their value. And we can't
9413 say that partially-clobbered registers are unconditionally killed,
9414 because whether they're killed or not depends on the mode of the
9415 value they're holding. Thus partially call-clobbered registers
9416 appear in neither the kill set nor the gen set.
9418 Check manually for any calls that clobber more of a register than the
9419 current function can. */
9420 function_abi_aggregator callee_abis;
9421 rtx_insn *insn;
9422 FOR_BB_INSNS (bb, insn)
9423 if (CALL_P (insn))
9424 callee_abis.note_callee_abi (insn_callee_abi (insn));
9425 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
9427 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
9428 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9429 if (!fixed_regs[regno]
9430 && !crtl->abi->clobbers_full_reg_p (regno)
9431 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
9432 || bitmap_bit_p (in, regno)
9433 || bitmap_bit_p (gen, regno)
9434 || bitmap_bit_p (kill, regno)))
9436 bitmap_set_bit (components, regno);
9438 /* If there is a callee-save at an adjacent offset, add it as well
9439 to increase the use of LDP/STP. */
9440 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9441 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
9443 if (regno2 <= LAST_SAVED_REGNUM)
9445 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9446 if (regno < regno2
9447 ? known_eq (offset + 8, offset2)
9448 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
9449 bitmap_set_bit (components, regno2);
9453 return components;
9456 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
9457 Nothing to do for aarch64. */
9459 static void
9460 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
9464 /* Return the next set bit in BMP from START onwards. Return the total number
9465 of bits in BMP if no set bit is found at or after START. */
9467 static unsigned int
9468 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
9470 unsigned int nbits = SBITMAP_SIZE (bmp);
9471 if (start == nbits)
9472 return start;
9474 gcc_assert (start < nbits);
9475 for (unsigned int i = start; i < nbits; i++)
9476 if (bitmap_bit_p (bmp, i))
9477 return i;
9479 return nbits;
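/* Usage sketch (mirrors the loops below; handle_reg is a placeholder):

     unsigned int nbits = SBITMAP_SIZE (bmp);
     for (unsigned int regno = aarch64_get_next_set_bit (bmp, 0);
	  regno != nbits;
	  regno = aarch64_get_next_set_bit (bmp, regno + 1))
       handle_reg (regno);

   The function returns SBITMAP_SIZE (bmp) once no set bit remains, which
   terminates the loop.  */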
9482 /* Do the work for aarch64_emit_prologue_components and
9483 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
9484 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
9485 for these components or the epilogue sequence. That is, it determines
9486 whether we should emit stores or loads and what kind of CFA notes to attach
9487 to the insns. Otherwise the logic for the two sequences is very
9488 similar. */
9490 static void
9491 aarch64_process_components (sbitmap components, bool prologue_p)
9493 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
9494 ? HARD_FRAME_POINTER_REGNUM
9495 : STACK_POINTER_REGNUM);
9497 unsigned last_regno = SBITMAP_SIZE (components);
9498 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
9499 rtx_insn *insn = NULL;
9501 while (regno != last_regno)
9503 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
9504 machine_mode mode = aarch64_reg_save_mode (regno);
9506 rtx reg = gen_rtx_REG (mode, regno);
9507 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
9508 if (frame_pointer_needed)
9509 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9510 else
9511 offset += crtl->outgoing_args_size;
9513 rtx addr = plus_constant (Pmode, ptr_reg, offset);
9514 rtx mem = gen_frame_mem (mode, addr);
9516 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
9517 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
9518 /* No more registers to handle after REGNO.
9519 Emit a single save/restore and exit. */
9520 if (regno2 == last_regno)
9522 insn = emit_insn (set);
9523 if (frame_related_p)
9525 RTX_FRAME_RELATED_P (insn) = 1;
9526 if (prologue_p)
9527 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9528 else
9529 add_reg_note (insn, REG_CFA_RESTORE, reg);
9531 break;
9534 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
9535 /* The next register is not of the same class or its offset is not
9536 mergeable with the current one into a pair. */
9537 if (aarch64_sve_mode_p (mode)
9538 || !satisfies_constraint_Ump (mem)
9539 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
9540 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
9541 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
9542 GET_MODE_SIZE (mode)))
9544 insn = emit_insn (set);
9545 if (frame_related_p)
9547 RTX_FRAME_RELATED_P (insn) = 1;
9548 if (prologue_p)
9549 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
9550 else
9551 add_reg_note (insn, REG_CFA_RESTORE, reg);
9554 regno = regno2;
9555 continue;
9558 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
9560 /* REGNO2 can be saved/restored in a pair with REGNO. */
9561 rtx reg2 = gen_rtx_REG (mode, regno2);
9562 if (frame_pointer_needed)
9563 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
9564 else
9565 offset2 += crtl->outgoing_args_size;
9566 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
9567 rtx mem2 = gen_frame_mem (mode, addr2);
9568 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
9569 : gen_rtx_SET (reg2, mem2);
9571 if (prologue_p)
9572 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
9573 else
9574 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
9576 if (frame_related_p || frame_related2_p)
9578 RTX_FRAME_RELATED_P (insn) = 1;
9579 if (prologue_p)
9581 if (frame_related_p)
9582 add_reg_note (insn, REG_CFA_OFFSET, set);
9583 if (frame_related2_p)
9584 add_reg_note (insn, REG_CFA_OFFSET, set2);
9586 else
9588 if (frame_related_p)
9589 add_reg_note (insn, REG_CFA_RESTORE, reg);
9590 if (frame_related2_p)
9591 add_reg_note (insn, REG_CFA_RESTORE, reg2);
9595 regno = aarch64_get_next_set_bit (components, regno2 + 1);
9599 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
9601 static void
9602 aarch64_emit_prologue_components (sbitmap components)
9604 aarch64_process_components (components, true);
9607 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
9609 static void
9610 aarch64_emit_epilogue_components (sbitmap components)
9612 aarch64_process_components (components, false);
9615 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
9617 static void
9618 aarch64_set_handled_components (sbitmap components)
9620 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
9621 if (bitmap_bit_p (components, regno))
9622 cfun->machine->reg_is_wrapped_separately[regno] = true;
9625 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
9626 determine the probe offset for alloca. */
9628 static HOST_WIDE_INT
9629 aarch64_stack_clash_protection_alloca_probe_range (void)
9631 return STACK_CLASH_CALLER_GUARD;
9635 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
9636 registers. If POLY_SIZE is not large enough to require a probe this function
9637 will only adjust the stack. When allocating the stack space
9638 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
9639 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
9640 arguments. If we are, we ensure that any allocation larger than the
9641 ABI-defined buffer is probed, so that the invariant of having a 1KB buffer is
9642 maintained.
9644 We emit barriers after each stack adjustment to prevent optimizations from
9645 breaking the invariant that we never drop the stack more than a page. This
9646 invariant is needed to make it easier to correctly handle asynchronous
9647 events: if we were to allow the stack to be dropped by more than a page
9648 and then emit multiple probes, a signal taken somewhere in between would
9649 leave the signal handler unable to know the state of the stack and unable
9650 to make any assumptions about which pages have been probed. */
9652 static void
9653 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
9654 poly_int64 poly_size,
9655 bool frame_related_p,
9656 bool final_adjustment_p)
9658 HOST_WIDE_INT guard_size
9659 = 1 << param_stack_clash_protection_guard_size;
9660 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
9661 HOST_WIDE_INT min_probe_threshold
9662 = (final_adjustment_p
9663 ? guard_used_by_caller
9664 : guard_size - guard_used_by_caller);
9665 /* When doing the final adjustment for the outgoing arguments, take into
9666 account any unprobed space there is above the current SP. There are
9667 two cases:
9669 - When saving SVE registers below the hard frame pointer, we force
9670 the lowest save to take place in the prologue before doing the final
9671 adjustment (i.e. we don't allow the save to be shrink-wrapped).
9672 This acts as a probe at SP, so there is no unprobed space.
9674 - When there are no SVE register saves, we use the store of the link
9675 register as a probe. We can't assume that LR was saved at position 0
9676 though, so treat any space below it as unprobed. */
9677 if (final_adjustment_p
9678 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
9680 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
9681 if (known_ge (lr_offset, 0))
9682 min_probe_threshold -= lr_offset.to_constant ();
9683 else
9684 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
9687 poly_int64 frame_size = cfun->machine->frame.frame_size;
9689 /* We should always have a positive probe threshold. */
9690 gcc_assert (min_probe_threshold > 0);
9692 if (flag_stack_clash_protection && !final_adjustment_p)
9694 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9695 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9696 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9698 if (known_eq (frame_size, 0))
9700 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
9702 else if (known_lt (initial_adjust + sve_callee_adjust,
9703 guard_size - guard_used_by_caller)
9704 && known_lt (final_adjust, guard_used_by_caller))
9706 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
9710 /* If SIZE is not large enough to require probing, just adjust the stack and
9711 exit. */
9712 if (known_lt (poly_size, min_probe_threshold)
9713 || !flag_stack_clash_protection)
9715 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
9716 return;
9719 HOST_WIDE_INT size;
9720 /* Handle the SVE non-constant case first. */
9721 if (!poly_size.is_constant (&size))
9723 if (dump_file)
9725 fprintf (dump_file, "Stack clash SVE prologue: ");
9726 print_dec (poly_size, dump_file);
9727 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
9730 /* First calculate the number of bytes we're actually spilling. */
9731 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
9732 poly_size, temp1, temp2, false, true);
9734 rtx_insn *insn = get_last_insn ();
9736 if (frame_related_p)
9738 /* This is done to provide unwinding information for the stack
9739 adjustments we're about to do, however to prevent the optimizers
9740 from removing the R11 move and leaving the CFA note (which would be
9741 very wrong) we tie the old and new stack pointer together.
9742 The tie will expand to nothing but the optimizers will not touch
9743 the instruction. */
9744 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
9745 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
9746 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
9748 /* We want the CFA independent of the stack pointer for the
9749 duration of the loop. */
9750 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
9751 RTX_FRAME_RELATED_P (insn) = 1;
9754 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
9755 rtx guard_const = gen_int_mode (guard_size, Pmode);
9757 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
9758 stack_pointer_rtx, temp1,
9759 probe_const, guard_const));
9761 /* Now reset the CFA register if needed. */
9762 if (frame_related_p)
9764 add_reg_note (insn, REG_CFA_DEF_CFA,
9765 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
9766 gen_int_mode (poly_size, Pmode)));
9767 RTX_FRAME_RELATED_P (insn) = 1;
9770 return;
9773 if (dump_file)
9774 fprintf (dump_file,
9775 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
9776 " bytes, probing will be required.\n", size);
9778 /* Round size to the nearest multiple of guard_size, and calculate the
9779 residual as the difference between the original size and the rounded
9780 size. */
9781 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
9782 HOST_WIDE_INT residual = size - rounded_size;
9784 /* We can handle a small number of allocations/probes inline. Otherwise
9785 punt to a loop. */
9786 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
9788 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
9790 aarch64_sub_sp (NULL, temp2, guard_size, true);
9791 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9792 guard_used_by_caller));
9793 emit_insn (gen_blockage ());
9795 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
9797 else
9799 /* Compute the ending address. */
9800 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
9801 temp1, NULL, false, true);
9802 rtx_insn *insn = get_last_insn ();
9804 /* For the initial allocation, we don't have a frame pointer
9805 set up, so we always need CFI notes. If we're doing the
9806 final allocation, then we may have a frame pointer, in which
9807 case it is the CFA, otherwise we need CFI notes.
9809 We can determine which allocation we are doing by looking at
9810 the value of FRAME_RELATED_P since the final allocations are not
9811 frame related. */
9812 if (frame_related_p)
9814 /* We want the CFA independent of the stack pointer for the
9815 duration of the loop. */
9816 add_reg_note (insn, REG_CFA_DEF_CFA,
9817 plus_constant (Pmode, temp1, rounded_size));
9818 RTX_FRAME_RELATED_P (insn) = 1;
9821 /* This allocates and probes the stack. Note that this re-uses some of
9822 the existing Ada stack protection code. However we are guaranteed not
9823 to enter the non-loop or residual branches of that code.
9825 The non-loop part won't be entered because if our allocation amount
9826 doesn't require a loop, the case above would handle it.
9828 The residual amount won't be entered because TEMP1 is a multiple of
9829 the allocation size. The residual will always be 0. As such, the only
9830 part we are actually using from that code is the loop setup. The
9831 actual probing is done in aarch64_output_probe_stack_range. */
9832 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
9833 stack_pointer_rtx, temp1));
9835 /* Now reset the CFA register if needed. */
9836 if (frame_related_p)
9838 add_reg_note (insn, REG_CFA_DEF_CFA,
9839 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
9840 RTX_FRAME_RELATED_P (insn) = 1;
9843 emit_insn (gen_blockage ());
9844 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
9847 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
9848 be probed. This maintains the requirement that each page is probed at
9849 least once. For initial probing we probe only if the allocation is
9850 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
9851 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
9852 GUARD_SIZE. This means that for any allocation large enough to
9853 trigger a probe here, we'll have at least one, and for any allocation
9854 too small for this code to emit anything, the page will already have been
9855 probed by the saving of FP/LR, either in this function or in a callee. If
9856 we don't have any callees then we won't have more stack adjustments and so
9857 are still safe. */
9858 if (residual)
9860 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
9861 /* If we're doing final adjustments, and we've done any full page
9862 allocations then any residual needs to be probed. */
9863 if (final_adjustment_p && rounded_size != 0)
9864 min_probe_threshold = 0;
9865 /* If doing a small final adjustment, we always probe at offset 0.
9866 This is done to avoid issues when LR is not at position 0 or when
9867 the final adjustment is smaller than the probing offset. */
9868 else if (final_adjustment_p && rounded_size == 0)
9869 residual_probe_offset = 0;
9871 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
9872 if (residual >= min_probe_threshold)
9874 if (dump_file)
9875 fprintf (dump_file,
9876 "Stack clash AArch64 prologue residuals: "
9877 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
9878 "\n", residual);
9880 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
9881 residual_probe_offset));
9882 emit_insn (gen_blockage ());
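/* Rough illustration (not literal compiler output): with a 64KB guard and
   a 160KB constant allocation, the inline-probe path above would emit
   something along the lines of

	sub	sp, sp, #65536
	str	xzr, [sp, 1024]
	sub	sp, sp, #65536
	str	xzr, [sp, 1024]
	sub	sp, sp, #32768		// residual, probed only if large enough

   with scheduling barriers between the adjustments.  */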
9887 /* Return 1 if the register is used by the epilogue. We need to say the
9888 return register is used, but only after epilogue generation is complete.
9889 Note that in the case of sibcalls, the values "used by the epilogue" are
9890 considered live at the start of the called function.
9892 For SIMD functions we need to return 1 for FP registers that are saved and
9893 restored by a function but are not zero in call_used_regs. If we do not do
9894 this, optimizations may remove the restore of the register. */
9896 int
9897 aarch64_epilogue_uses (int regno)
9899 if (epilogue_completed)
9901 if (regno == LR_REGNUM)
9902 return 1;
9904 return 0;
9907 /* AArch64 stack frames generated by this compiler look like:
9909 +-------------------------------+
9911 | incoming stack arguments |
9913 +-------------------------------+
9914 | | <-- incoming stack pointer (aligned)
9915 | callee-allocated save area |
9916 | for register varargs |
9918 +-------------------------------+
9919 | local variables | <-- frame_pointer_rtx
9921 +-------------------------------+
9922 | padding | \
9923 +-------------------------------+ |
9924 | callee-saved registers | | frame.saved_regs_size
9925 +-------------------------------+ |
9926 | LR' | |
9927 +-------------------------------+ |
9928 | FP' | |
9929 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
9930 | SVE vector registers | | \
9931 +-------------------------------+ | | below_hard_fp_saved_regs_size
9932 | SVE predicate registers | / /
9933 +-------------------------------+
9934 | dynamic allocation |
9935 +-------------------------------+
9936 | padding |
9937 +-------------------------------+
9938 | outgoing stack arguments | <-- arg_pointer
9940 +-------------------------------+
9941 | | <-- stack_pointer_rtx (aligned)
9943 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
9944 but leave frame_pointer_rtx and hard_frame_pointer_rtx
9945 unchanged.
9947 By default for stack-clash we assume the guard is at least 64KB, but this
9948 value is configurable to either 4KB or 64KB. We also force the guard size to
9949 be the same as the probing interval and both values are kept in sync.
9951 With those assumptions the callee can allocate up to 63KB (or 3KB depending
9952 on the guard size) of stack space without probing.
9954 When probing is needed, we emit a probe at the start of the prologue
9955 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
9957 We have to track how much space has been allocated, and the only stores
9958 to the stack that we track as implicit probes are the FP/LR stores.
9960 For outgoing arguments we probe if the size is larger than 1KB, so that
9961 the ABI-specified buffer is maintained for the next callee.
9963 The following registers are reserved during frame layout and should not be
9964 used for any other purpose:
9966 - r11: Used by stack clash protection when SVE is enabled, and also
9967 as an anchor register when saving and restoring registers
9968 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
9969 - r14 and r15: Used for speculation tracking.
9970 - r16(IP0), r17(IP1): Used by indirect tailcalls.
9971 - r30(LR), r29(FP): Used by standard frame layout.
9973 These registers must be avoided in frame layout related code unless the
9974 explicit intention is to interact with one of the features listed above. */
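/* A concrete (simplified) example of the layout above: for a function
   that needs a frame chain and saves x19, a typical prologue/epilogue
   pair looks roughly like

	stp	x29, x30, [sp, -32]!	// FP'/LR', callee_adjust = 32
	mov	x29, sp			// hard_frame_pointer_rtx
	str	x19, [sp, 16]		// callee-saved register
	...
	ldr	x19, [sp, 16]
	ldp	x29, x30, [sp], 32
	ret

   Larger or variable-sized frames add the initial, SVE and final
   adjustments described above.  */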
9976 /* Generate the prologue instructions for entry into a function.
9977 Establish the stack frame by decreasing the stack pointer with a
9978 properly calculated size and, if necessary, create a frame record
9979 filled with the values of LR and previous frame pointer. The
9980 current FP is also set up if it is in use. */
9982 void
9983 aarch64_expand_prologue (void)
9985 poly_int64 frame_size = cfun->machine->frame.frame_size;
9986 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
9987 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
9988 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
9989 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
9990 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
9991 poly_int64 below_hard_fp_saved_regs_size
9992 = cfun->machine->frame.below_hard_fp_saved_regs_size;
9993 unsigned reg1 = cfun->machine->frame.wb_push_candidate1;
9994 unsigned reg2 = cfun->machine->frame.wb_push_candidate2;
9995 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
9996 rtx_insn *insn;
9998 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
10000 /* Fold the SVE allocation into the initial allocation.
10001 We don't do this in aarch64_layout_frame to avoid pessimizing
10002 the epilogue code. */
10003 initial_adjust += sve_callee_adjust;
10004 sve_callee_adjust = 0;
10007 /* Sign return address for functions. */
10008 if (aarch64_return_address_signing_enabled ())
10010 switch (aarch_ra_sign_key)
10012 case AARCH_KEY_A:
10013 insn = emit_insn (gen_paciasp ());
10014 break;
10015 case AARCH_KEY_B:
10016 insn = emit_insn (gen_pacibsp ());
10017 break;
10018 default:
10019 gcc_unreachable ();
10021 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10022 RTX_FRAME_RELATED_P (insn) = 1;
10025 /* Push return address to shadow call stack. */
10026 if (cfun->machine->frame.is_scs_enabled)
10027 emit_insn (gen_scs_push ());
10029 if (flag_stack_usage_info)
10030 current_function_static_stack_size = constant_lower_bound (frame_size);
10032 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
10034 if (crtl->is_leaf && !cfun->calls_alloca)
10036 if (maybe_gt (frame_size, PROBE_INTERVAL)
10037 && maybe_gt (frame_size, get_stack_check_protect ()))
10038 aarch64_emit_probe_stack_range (get_stack_check_protect (),
10039 (frame_size
10040 - get_stack_check_protect ()));
10042 else if (maybe_gt (frame_size, 0))
10043 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
10046 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10047 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10049 /* In theory we should never have both an initial adjustment
10050 and a callee save adjustment. Verify that is the case since the
10051 code below does not handle it for -fstack-clash-protection. */
10052 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
10054 /* Will only probe if the initial adjustment is larger than the guard
10055 less the amount of the guard reserved for use by the caller's
10056 outgoing args. */
10057 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
10058 true, false);
10060 if (callee_adjust != 0)
10061 aarch64_push_regs (reg1, reg2, callee_adjust);
10063 /* The offset of the frame chain record (if any) from the current SP. */
10064 poly_int64 chain_offset = (initial_adjust + callee_adjust
10065 - cfun->machine->frame.hard_fp_offset);
10066 gcc_assert (known_ge (chain_offset, 0));
10068 /* The offset of the bottom of the save area from the current SP. */
10069 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
10071 if (emit_frame_chain)
10073 if (callee_adjust == 0)
10075 reg1 = R29_REGNUM;
10076 reg2 = R30_REGNUM;
10077 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
10078 false, false);
10080 else
10081 gcc_assert (known_eq (chain_offset, 0));
10082 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
10083 stack_pointer_rtx, chain_offset,
10084 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
10085 if (frame_pointer_needed && !frame_size.is_constant ())
10087 /* Variable-sized frames need to describe the save slot
10088 address using DW_CFA_expression rather than DW_CFA_offset.
10089 This means that, without taking further action, the
10090 locations of the registers that we've already saved would
10091 remain based on the stack pointer even after we redefine
10092 the CFA based on the frame pointer. We therefore need new
10093 DW_CFA_expressions to re-express the save slots with addresses
10094 based on the frame pointer. */
10095 rtx_insn *insn = get_last_insn ();
10096 gcc_assert (RTX_FRAME_RELATED_P (insn));
10098 /* Add an explicit CFA definition if this was previously
10099 implicit. */
10100 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
10102 rtx src = plus_constant (Pmode, stack_pointer_rtx,
10103 callee_offset);
10104 add_reg_note (insn, REG_CFA_ADJUST_CFA,
10105 gen_rtx_SET (hard_frame_pointer_rtx, src));
10108 /* Change the save slot expressions for the registers that
10109 we've already saved. */
10110 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
10111 hard_frame_pointer_rtx, UNITS_PER_WORD);
10112 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
10113 hard_frame_pointer_rtx, 0);
10115 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
10118 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
10119 callee_adjust != 0 || emit_frame_chain,
10120 emit_frame_chain);
10121 if (maybe_ne (sve_callee_adjust, 0))
10123 gcc_assert (!flag_stack_clash_protection
10124 || known_eq (initial_adjust, 0));
10125 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
10126 sve_callee_adjust,
10127 !frame_pointer_needed, false);
10128 saved_regs_offset += sve_callee_adjust;
10130 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
10131 false, emit_frame_chain);
10132 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
10133 callee_adjust != 0 || emit_frame_chain,
10134 emit_frame_chain);
10136 /* We may need to probe the final adjustment if it is larger than the guard
10137 that is assumed by the callee. */
10138 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
10139 !frame_pointer_needed, true);
10142 /* Return TRUE if we can use a simple_return insn.
10144 This function checks whether the callee saved stack is empty, which
10145 means no restore actions are needed. The pro_and_epilogue pass will use
10146 this to check whether the shrink-wrapping optimization is feasible. */
10148 bool
10149 aarch64_use_return_insn_p (void)
10151 if (!reload_completed)
10152 return false;
10154 if (crtl->profile)
10155 return false;
10157 return known_eq (cfun->machine->frame.frame_size, 0);
10160 /* Generate the epilogue instructions for returning from a function.
10161 This is almost exactly the reverse of the prolog sequence, except
10162 that we need to insert barriers to avoid scheduling loads that read
10163 from a deallocated stack, and we optimize the unwind records by
10164 emitting them all together if possible. */
10165 void
10166 aarch64_expand_epilogue (bool for_sibcall)
10168 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
10169 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
10170 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
10171 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
10172 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
10173 poly_int64 below_hard_fp_saved_regs_size
10174 = cfun->machine->frame.below_hard_fp_saved_regs_size;
10175 unsigned reg1 = cfun->machine->frame.wb_pop_candidate1;
10176 unsigned reg2 = cfun->machine->frame.wb_pop_candidate2;
10177 unsigned int last_gpr = (cfun->machine->frame.is_scs_enabled
10178 ? R29_REGNUM : R30_REGNUM);
10179 rtx cfi_ops = NULL;
10180 rtx_insn *insn;
10181 /* A stack clash protection prologue may not have left EP0_REGNUM or
10182 EP1_REGNUM in a usable state. The same is true for allocations
10183 with an SVE component, since we then need both temporary registers
10184 for each allocation. For stack clash we are in a usable state if
10185 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
10186 HOST_WIDE_INT guard_size
10187 = 1 << param_stack_clash_protection_guard_size;
10188 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
10190 /* We can re-use the registers when:
10192 (a) the deallocation amount is the same as the corresponding
10193 allocation amount (which is false if we combine the initial
10194 and SVE callee save allocations in the prologue); and
10196 (b) the allocation amount doesn't need a probe (which is false
10197 if the amount is guard_size - guard_used_by_caller or greater).
10199 In such situations the register should remain live with the correct
10200 value. */
10201 bool can_inherit_p = (initial_adjust.is_constant ()
10202 && final_adjust.is_constant ()
10203 && (!flag_stack_clash_protection
10204 || (known_lt (initial_adjust,
10205 guard_size - guard_used_by_caller)
10206 && known_eq (sve_callee_adjust, 0))));
10208 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
10209 bool need_barrier_p
10210 = maybe_ne (get_frame_size ()
10211 + cfun->machine->frame.saved_varargs_size, 0);
10213 /* Emit a barrier to prevent loads from a deallocated stack. */
10214 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
10215 || cfun->calls_alloca
10216 || crtl->calls_eh_return)
10218 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10219 need_barrier_p = false;
10222 /* Restore the stack pointer from the frame pointer if it may not
10223 be the same as the stack pointer. */
10224 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
10225 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
10226 if (frame_pointer_needed
10227 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
10228 /* If writeback is used when restoring callee-saves, the CFA
10229 is restored on the instruction doing the writeback. */
10230 aarch64_add_offset (Pmode, stack_pointer_rtx,
10231 hard_frame_pointer_rtx,
10232 -callee_offset - below_hard_fp_saved_regs_size,
10233 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
10234 else
10235 /* The case where we need to re-use the register here is very rare, so
10236 avoid the complicated condition and just always emit a move if the
10237 immediate doesn't fit. */
10238 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
10240 /* Restore the vector registers before the predicate registers,
10241 so that we can use P4 as a temporary for big-endian SVE frames. */
10242 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
10243 callee_adjust != 0, &cfi_ops);
10244 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
10245 false, &cfi_ops);
10246 if (maybe_ne (sve_callee_adjust, 0))
10247 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
10249 /* When shadow call stack is enabled, the scs_pop in the epilogue will
10250 restore x30, so we don't need to restore x30 again in the traditional
10251 way. */
10252 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
10253 R0_REGNUM, last_gpr,
10254 callee_adjust != 0, &cfi_ops);
10256 if (need_barrier_p)
10257 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
10259 if (callee_adjust != 0)
10260 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
10262 /* If we have no register restore information, the CFA must have been
10263 defined in terms of the stack pointer since the end of the prologue. */
10264 gcc_assert (cfi_ops || !frame_pointer_needed);
10266 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
10268 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
10269 insn = get_last_insn ();
10270 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
10271 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
10272 RTX_FRAME_RELATED_P (insn) = 1;
10273 cfi_ops = NULL;
10276 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
10277 restrict the emit_move optimization to leaf functions. */
10278 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
10279 (!can_inherit_p || !crtl->is_leaf
10280 || df_regs_ever_live_p (EP0_REGNUM)));
10282 if (cfi_ops)
10284 /* Emit delayed restores and reset the CFA to be SP. */
10285 insn = get_last_insn ();
10286 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
10287 REG_NOTES (insn) = cfi_ops;
10288 RTX_FRAME_RELATED_P (insn) = 1;
10291 /* Pop return address from shadow call stack. */
10292 if (cfun->machine->frame.is_scs_enabled)
10294 machine_mode mode = aarch64_reg_save_mode (R30_REGNUM);
10295 rtx reg = gen_rtx_REG (mode, R30_REGNUM);
10297 insn = emit_insn (gen_scs_pop ());
10298 add_reg_note (insn, REG_CFA_RESTORE, reg);
10299 RTX_FRAME_RELATED_P (insn) = 1;
10302 /* We prefer to emit the combined return/authenticate instruction RETAA,
10303 however there are two cases in which we must instead emit an explicit
10304 authentication instruction.
10306 1) Sibcalls don't return in a normal way, so if we're about to call one
10307 we must authenticate.
10309 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
10310 generating code for !TARGET_ARMV8_3 we can't use it and must
10311 explicitly authenticate. */
10313 if (aarch64_return_address_signing_enabled ()
10314 && (for_sibcall || !TARGET_ARMV8_3))
10316 switch (aarch_ra_sign_key)
10318 case AARCH_KEY_A:
10319 insn = emit_insn (gen_autiasp ());
10320 break;
10321 case AARCH_KEY_B:
10322 insn = emit_insn (gen_autibsp ());
10323 break;
10324 default:
10325 gcc_unreachable ();
10327 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
10328 RTX_FRAME_RELATED_P (insn) = 1;
10331 /* Stack adjustment for exception handler. */
10332 if (crtl->calls_eh_return && !for_sibcall)
10334 /* We need to unwind the stack by the offset computed by
10335 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
10336 to be SP; letting the CFA move during this adjustment
10337 is just as correct as retaining the CFA from the body
10338 of the function. Therefore, do nothing special. */
10339 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
10342 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
10343 if (!for_sibcall)
10344 emit_jump_insn (ret_rtx);
10347 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
10348 normally or return to a previous frame after unwinding.
10350 An EH return uses a single shared return sequence. The epilogue is
10351 exactly like a normal epilogue except that it has an extra input
10352 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
10353 that must be applied after the frame has been destroyed. An extra label
10354 is inserted before the epilogue which initializes this register to zero,
10355 and this is the entry point for a normal return.
10357 An actual EH return updates the return address, initializes the stack
10358 adjustment and jumps directly into the epilogue (bypassing the zeroing
10359 of the adjustment). Since the return address is typically saved on the
10360 stack when a function makes a call, the saved LR must be updated outside
10361 the epilogue.
10363 This poses problems as the store is generated well before the epilogue,
10364 so the offset of LR is not known yet. Also optimizations will remove the
10365 store as it appears dead, even after the epilogue is generated (as the
10366 base or offset for loading LR is different in many cases).
10368 To avoid these problems this implementation forces the frame pointer
10369 in eh_return functions so that the location of LR is fixed and known early.
10370 It also marks the store volatile, so no optimization is permitted to
10371 remove the store. */
10372 rtx
10373 aarch64_eh_return_handler_rtx (void)
10375 rtx tmp = gen_frame_mem (Pmode,
10376 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
10378 /* Mark the store volatile, so no optimization is permitted to remove it. */
10379 MEM_VOLATILE_P (tmp) = true;
10380 return tmp;
10383 /* Output code to add DELTA to the first argument, and then jump
10384 to FUNCTION. Used for C++ multiple inheritance. */
10385 static void
10386 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
10387 HOST_WIDE_INT delta,
10388 HOST_WIDE_INT vcall_offset,
10389 tree function)
10391 /* The this pointer is always in x0. Note that this differs from
10392 Arm where the this pointer may be bumped to r1 if r0 is required
10393 to return a pointer to an aggregate. On AArch64 a result value
10394 pointer will be in x8. */
10395 int this_regno = R0_REGNUM;
10396 rtx this_rtx, temp0, temp1, addr, funexp;
10397 rtx_insn *insn;
10398 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
10400 if (aarch_bti_enabled ())
10401 emit_insn (gen_bti_c());
10403 reload_completed = 1;
10404 emit_note (NOTE_INSN_PROLOGUE_END);
10406 this_rtx = gen_rtx_REG (Pmode, this_regno);
10407 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
10408 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
10410 if (vcall_offset == 0)
10411 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
10412 else
10414 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
10416 addr = this_rtx;
10417 if (delta != 0)
10419 if (delta >= -256 && delta < 256)
10420 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
10421 plus_constant (Pmode, this_rtx, delta));
10422 else
10423 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
10424 temp1, temp0, false);
10427 if (Pmode == ptr_mode)
10428 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
10429 else
10430 aarch64_emit_move (temp0,
10431 gen_rtx_ZERO_EXTEND (Pmode,
10432 gen_rtx_MEM (ptr_mode, addr)));
10434 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
10435 addr = plus_constant (Pmode, temp0, vcall_offset);
10436 else
10438 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
10439 Pmode);
10440 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
10443 if (Pmode == ptr_mode)
10444 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
10445 else
10446 aarch64_emit_move (temp1,
10447 gen_rtx_SIGN_EXTEND (Pmode,
10448 gen_rtx_MEM (ptr_mode, addr)));
10450 emit_insn (gen_add2_insn (this_rtx, temp1));
10453 /* Generate a tail call to the target function. */
10454 if (!TREE_USED (function))
10456 assemble_external (function);
10457 TREE_USED (function) = 1;
10459 funexp = XEXP (DECL_RTL (function), 0);
10460 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
10461 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
10462 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
10463 SIBLING_CALL_P (insn) = 1;
10465 insn = get_insns ();
10466 shorten_branches (insn);
10468 assemble_start_function (thunk, fnname);
10469 final_start_function (insn, file, 1);
10470 final (insn, file, 1);
10471 final_end_function ();
10472 assemble_end_function (thunk, fnname);
10474 /* Stop pretending to be a post-reload pass. */
10475 reload_completed = 0;
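/* For illustration (hypothetical output): a thunk with DELTA == 8 and
   VCALL_OFFSET == 0 reduces to adjusting the this pointer in x0 and
   tail-calling the target, roughly

	add	x0, x0, 8
	b	<function>

   with a BTI C landing pad first when branch protection is enabled.  */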
10478 static bool
10479 aarch64_tls_referenced_p (rtx x)
10481 if (!TARGET_HAVE_TLS)
10482 return false;
10483 subrtx_iterator::array_type array;
10484 FOR_EACH_SUBRTX (iter, array, x, ALL)
10486 const_rtx x = *iter;
10487 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
10488 return true;
10489 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
10490 TLS offsets, not real symbol references. */
10491 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10492 iter.skip_subrtxes ();
10494 return false;
10498 static bool
10499 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
10501 if (GET_CODE (x) == HIGH)
10502 return true;
10504 /* There's no way to calculate VL-based values using relocations. */
10505 subrtx_iterator::array_type array;
10506 FOR_EACH_SUBRTX (iter, array, x, ALL)
10507 if (GET_CODE (*iter) == CONST_POLY_INT)
10508 return true;
10510 poly_int64 offset;
10511 rtx base = strip_offset_and_salt (x, &offset);
10512 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
10514 /* We checked for POLY_INT_CST offsets above. */
10515 if (aarch64_classify_symbol (base, offset.to_constant ())
10516 != SYMBOL_FORCE_TO_MEM)
10517 return true;
10518 else
10519 /* Avoid generating a 64-bit relocation in ILP32; leave it
10520 to aarch64_expand_mov_immediate to handle it properly. */
10521 return mode != ptr_mode;
10524 return aarch64_tls_referenced_p (x);
10527 /* Implement TARGET_CASE_VALUES_THRESHOLD.
10528 The expansion for a table switch is quite expensive due to the number
10529 of instructions, the table lookup and the hard-to-predict indirect jump.
10530 When optimizing for speed with -O3 enabled, use the per-core tuning if
10531 set, otherwise use tables for >= 11 cases as a tradeoff between size and
10532 performance. When optimizing for size, use 8 for the smallest code size. */
10534 static unsigned int
10535 aarch64_case_values_threshold (void)
10537 /* Use the specified limit for the number of cases before using jump
10538 tables at higher optimization levels. */
10539 if (optimize > 2
10540 && aarch64_tune_params.max_case_values != 0)
10541 return aarch64_tune_params.max_case_values;
10542 else
10543 return optimize_size ? 8 : 11;
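/* In other words (illustrative): with the default tuning, a switch needs
   at least 11 case values to be expanded as a jump table at -O2/-O3, and
   at least 8 when optimizing for size.  */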
10546 /* Return true if register REGNO is a valid index register.
10547 STRICT_P is true if REG_OK_STRICT is in effect. */
10549 bool
10550 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
10552 if (!HARD_REGISTER_NUM_P (regno))
10554 if (!strict_p)
10555 return true;
10557 if (!reg_renumber)
10558 return false;
10560 regno = reg_renumber[regno];
10562 return GP_REGNUM_P (regno);
10565 /* Return true if register REGNO is a valid base register for mode MODE.
10566 STRICT_P is true if REG_OK_STRICT is in effect. */
10568 bool
10569 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
10571 if (!HARD_REGISTER_NUM_P (regno))
10573 if (!strict_p)
10574 return true;
10576 if (!reg_renumber)
10577 return false;
10579 regno = reg_renumber[regno];
10582 /* The fake registers will be eliminated to either the stack or
10583 hard frame pointer, both of which are usually valid base registers.
10584 Reload deals with the cases where the eliminated form isn't valid. */
10585 return (GP_REGNUM_P (regno)
10586 || regno == SP_REGNUM
10587 || regno == FRAME_POINTER_REGNUM
10588 || regno == ARG_POINTER_REGNUM);
10591 /* Return true if X is a valid base register for mode MODE.
10592 STRICT_P is true if REG_OK_STRICT is in effect. */
10594 static bool
10595 aarch64_base_register_rtx_p (rtx x, bool strict_p)
10597 if (!strict_p
10598 && SUBREG_P (x)
10599 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
10600 x = SUBREG_REG (x);
10602 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
10605 /* Return true if address offset is a valid index. If it is, fill in INFO
10606 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
10608 static bool
10609 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
10610 machine_mode mode, bool strict_p)
10612 enum aarch64_address_type type;
10613 rtx index;
10614 int shift;
10616 /* (reg:P) */
10617 if ((REG_P (x) || SUBREG_P (x))
10618 && GET_MODE (x) == Pmode)
10620 type = ADDRESS_REG_REG;
10621 index = x;
10622 shift = 0;
10624 /* (sign_extend:DI (reg:SI)) */
10625 else if ((GET_CODE (x) == SIGN_EXTEND
10626 || GET_CODE (x) == ZERO_EXTEND)
10627 && GET_MODE (x) == DImode
10628 && GET_MODE (XEXP (x, 0)) == SImode)
10630 type = (GET_CODE (x) == SIGN_EXTEND)
10631 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10632 index = XEXP (x, 0);
10633 shift = 0;
10635 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
10636 else if (GET_CODE (x) == MULT
10637 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10638 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10639 && GET_MODE (XEXP (x, 0)) == DImode
10640 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10641 && CONST_INT_P (XEXP (x, 1)))
10643 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10644 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10645 index = XEXP (XEXP (x, 0), 0);
10646 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10648 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
10649 else if (GET_CODE (x) == ASHIFT
10650 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
10651 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
10652 && GET_MODE (XEXP (x, 0)) == DImode
10653 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
10654 && CONST_INT_P (XEXP (x, 1)))
10656 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
10657 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
10658 index = XEXP (XEXP (x, 0), 0);
10659 shift = INTVAL (XEXP (x, 1));
10661 /* (and:DI (mult:DI (reg:DI) (const_int scale))
10662 (const_int 0xffffffff<<shift)) */
10663 else if (GET_CODE (x) == AND
10664 && GET_MODE (x) == DImode
10665 && GET_CODE (XEXP (x, 0)) == MULT
10666 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10667 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10668 && CONST_INT_P (XEXP (x, 1)))
10670 type = ADDRESS_REG_UXTW;
10671 index = XEXP (XEXP (x, 0), 0);
10672 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
10673 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10674 shift = -1;
10676 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
10677 (const_int 0xffffffff<<shift)) */
10678 else if (GET_CODE (x) == AND
10679 && GET_MODE (x) == DImode
10680 && GET_CODE (XEXP (x, 0)) == ASHIFT
10681 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
10682 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
10683 && CONST_INT_P (XEXP (x, 1)))
10685 type = ADDRESS_REG_UXTW;
10686 index = XEXP (XEXP (x, 0), 0);
10687 shift = INTVAL (XEXP (XEXP (x, 0), 1));
10688 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
10689 shift = -1;
10691 /* (mult:P (reg:P) (const_int scale)) */
10692 else if (GET_CODE (x) == MULT
10693 && GET_MODE (x) == Pmode
10694 && GET_MODE (XEXP (x, 0)) == Pmode
10695 && CONST_INT_P (XEXP (x, 1)))
10697 type = ADDRESS_REG_REG;
10698 index = XEXP (x, 0);
10699 shift = exact_log2 (INTVAL (XEXP (x, 1)));
10701 /* (ashift:P (reg:P) (const_int shift)) */
10702 else if (GET_CODE (x) == ASHIFT
10703 && GET_MODE (x) == Pmode
10704 && GET_MODE (XEXP (x, 0)) == Pmode
10705 && CONST_INT_P (XEXP (x, 1)))
10707 type = ADDRESS_REG_REG;
10708 index = XEXP (x, 0);
10709 shift = INTVAL (XEXP (x, 1));
10711 else
10712 return false;
10714 if (!strict_p
10715 && SUBREG_P (index)
10716 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
10717 index = SUBREG_REG (index);
10719 if (aarch64_sve_data_mode_p (mode))
10721 if (type != ADDRESS_REG_REG
10722 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
10723 return false;
10725 else
10727 if (shift != 0
10728 && !(IN_RANGE (shift, 1, 3)
10729 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
10730 return false;
10733 if (REG_P (index)
10734 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
10736 info->type = type;
10737 info->offset = index;
10738 info->shift = shift;
10739 return true;
10742 return false;
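/* Worked example (illustrative only): for a DImode access through
   [x0, w1, sxtw #3], the index part of the address reaches this function
   as (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 8)), which is
   classified as ADDRESS_REG_SXTW with shift 3; the shift is accepted
   because 1 << 3 matches the 8-byte access size.  */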
10745 /* Return true if MODE is one of the modes for which we
10746 support LDP/STP operations. */
10748 static bool
10749 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
10751 return mode == SImode || mode == DImode
10752 || mode == SFmode || mode == DFmode
10753 || mode == SDmode || mode == DDmode
10754 || (aarch64_vector_mode_supported_p (mode)
10755 && (known_eq (GET_MODE_SIZE (mode), 8)
10756 || (known_eq (GET_MODE_SIZE (mode), 16)
10757 && (aarch64_tune_params.extra_tuning_flags
10758 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
10761 /* Return true if REGNO is a virtual pointer register, or an eliminable
10762 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
10763 include stack_pointer or hard_frame_pointer. */
10764 static bool
10765 virt_or_elim_regno_p (unsigned regno)
10767 return ((regno >= FIRST_VIRTUAL_REGISTER
10768 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
10769 || regno == FRAME_POINTER_REGNUM
10770 || regno == ARG_POINTER_REGNUM);
10773 /* Return true if X is a valid address of type TYPE for machine mode MODE.
10774 If it is, fill in INFO appropriately. STRICT_P is true if
10775 REG_OK_STRICT is in effect. */
10777 bool
10778 aarch64_classify_address (struct aarch64_address_info *info,
10779 rtx x, machine_mode mode, bool strict_p,
10780 aarch64_addr_query_type type)
10782 enum rtx_code code = GET_CODE (x);
10783 rtx op0, op1;
10784 poly_int64 offset;
10786 HOST_WIDE_INT const_size;
10788 /* Whether a vector mode is partial doesn't affect address legitimacy.
10789 Partial vectors like VNx8QImode allow the same indexed addressing
10790 mode and MUL VL addressing mode as full vectors like VNx16QImode;
10791 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
10792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10793 vec_flags &= ~VEC_PARTIAL;
10795 /* On BE, we use load/store pair for all large int mode load/stores.
10796 TI/TF/TDmode may also use a load/store pair. */
10797 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
10798 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
10799 || type == ADDR_QUERY_LDP_STP_N
10800 || mode == TImode
10801 || mode == TFmode
10802 || mode == TDmode
10803 || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
10804 && advsimd_struct_p));
10805 /* For ADDR_QUERY_LDP_STP_N, the incoming mode corresponds to the full size
10806 of the memory being loaded/stored, while the mode used to validate the
10807 addressing mode is half of that (one element of the pair). */
10808 if (type == ADDR_QUERY_LDP_STP_N)
10810 if (known_eq (GET_MODE_SIZE (mode), 16))
10811 mode = DFmode;
10812 else if (known_eq (GET_MODE_SIZE (mode), 8))
10813 mode = SFmode;
10814 else
10815 return false;
10818 bool allow_reg_index_p = (!load_store_pair_p
10819 && ((vec_flags == 0
10820 && known_lt (GET_MODE_SIZE (mode), 16))
10821 || vec_flags == VEC_ADVSIMD
10822 || vec_flags & VEC_SVE_DATA));
10824 /* For SVE, only accept [Rn], [Rn, #offset, MUL VL] and [Rn, Rm, LSL #shift].
10825 The latter is not valid for SVE predicates, and that's rejected through
10826 allow_reg_index_p above. */
10827 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
10828 && (code != REG && code != PLUS))
10829 return false;
10831 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
10832 REG addressing. */
10833 if (advsimd_struct_p
10834 && TARGET_SIMD
10835 && !BYTES_BIG_ENDIAN
10836 && (code != POST_INC && code != REG))
10837 return false;
10839 gcc_checking_assert (GET_MODE (x) == VOIDmode
10840 || SCALAR_INT_MODE_P (GET_MODE (x)));
10842 switch (code)
10844 case REG:
10845 case SUBREG:
10846 info->type = ADDRESS_REG_IMM;
10847 info->base = x;
10848 info->offset = const0_rtx;
10849 info->const_offset = 0;
10850 return aarch64_base_register_rtx_p (x, strict_p);
10852 case PLUS:
10853 op0 = XEXP (x, 0);
10854 op1 = XEXP (x, 1);
10856 if (! strict_p
10857 && REG_P (op0)
10858 && virt_or_elim_regno_p (REGNO (op0))
10859 && poly_int_rtx_p (op1, &offset))
10861 info->type = ADDRESS_REG_IMM;
10862 info->base = op0;
10863 info->offset = op1;
10864 info->const_offset = offset;
10866 return true;
10869 if (maybe_ne (GET_MODE_SIZE (mode), 0)
10870 && aarch64_base_register_rtx_p (op0, strict_p)
10871 && poly_int_rtx_p (op1, &offset))
10873 info->type = ADDRESS_REG_IMM;
10874 info->base = op0;
10875 info->offset = op1;
10876 info->const_offset = offset;
10878 /* TImode, TFmode and TDmode values are allowed in both pairs of X
10879 registers and individual Q registers. The available
10880 address modes are:
10881 X,X: 7-bit signed scaled offset
10882 Q: 9-bit signed offset
10883 We conservatively require an offset representable in either mode.
10884 When performing the check for pairs of X registers i.e. LDP/STP
10885 pass down DImode since that is the natural size of the LDP/STP
10886 instruction memory accesses. */
10887 if (mode == TImode || mode == TFmode || mode == TDmode)
10888 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10889 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10890 || offset_12bit_unsigned_scaled_p (mode, offset)));
10892 if (mode == V8DImode)
10893 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10894 && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
10896 /* A 7-bit offset check because OImode will emit an ldp/stp
10897 instruction (only !TARGET_SIMD or big endian will get here).
10898 For ldp/stp instructions, the offset is scaled for the size of a
10899 single element of the pair. */
10900 if (aarch64_advsimd_partial_struct_mode_p (mode)
10901 && known_eq (GET_MODE_SIZE (mode), 16))
10902 return aarch64_offset_7bit_signed_scaled_p (DImode, offset);
10903 if (aarch64_advsimd_full_struct_mode_p (mode)
10904 && known_eq (GET_MODE_SIZE (mode), 32))
10905 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
10907 /* Three 9/12-bit offset checks because CImode will emit three
10908 ldr/str instructions (only !TARGET_SIMD or big endian will
10909 get here). */
10910 if (aarch64_advsimd_partial_struct_mode_p (mode)
10911 && known_eq (GET_MODE_SIZE (mode), 24))
10912 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10913 && (aarch64_offset_9bit_signed_unscaled_p (DImode,
10914 offset + 16)
10915 || offset_12bit_unsigned_scaled_p (DImode,
10916 offset + 16)));
10917 if (aarch64_advsimd_full_struct_mode_p (mode)
10918 && known_eq (GET_MODE_SIZE (mode), 48))
10919 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10920 && (aarch64_offset_9bit_signed_unscaled_p (TImode,
10921 offset + 32)
10922 || offset_12bit_unsigned_scaled_p (TImode,
10923 offset + 32)));
10925 /* Two 7-bit offset checks because XImode will emit two ldp/stp
10926 instructions (only big endian will get here). */
10927 if (aarch64_advsimd_partial_struct_mode_p (mode)
10928 && known_eq (GET_MODE_SIZE (mode), 32))
10929 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
10930 && aarch64_offset_7bit_signed_scaled_p (DImode,
10931 offset + 16));
10932 if (aarch64_advsimd_full_struct_mode_p (mode)
10933 && known_eq (GET_MODE_SIZE (mode), 64))
10934 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
10935 && aarch64_offset_7bit_signed_scaled_p (TImode,
10936 offset + 32));
10938 /* Make "m" use the LD1 offset range for SVE data modes, so
10939 that pre-RTL optimizers like ivopts will work to that
10940 instead of the wider LDR/STR range. */
10941 if (vec_flags == VEC_SVE_DATA)
10942 return (type == ADDR_QUERY_M
10943 ? offset_4bit_signed_scaled_p (mode, offset)
10944 : offset_9bit_signed_scaled_p (mode, offset));
10946 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
10948 poly_int64 end_offset = (offset
10949 + GET_MODE_SIZE (mode)
10950 - BYTES_PER_SVE_VECTOR);
10951 return (type == ADDR_QUERY_M
10952 ? offset_4bit_signed_scaled_p (mode, offset)
10953 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
10954 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
10955 end_offset)));
10958 if (vec_flags == VEC_SVE_PRED)
10959 return offset_9bit_signed_scaled_p (mode, offset);
10961 if (load_store_pair_p)
10962 return ((known_eq (GET_MODE_SIZE (mode), 4)
10963 || known_eq (GET_MODE_SIZE (mode), 8)
10964 || known_eq (GET_MODE_SIZE (mode), 16))
10965 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
10966 else
10967 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
10968 || offset_12bit_unsigned_scaled_p (mode, offset));
10971 if (allow_reg_index_p)
10973 /* Look for base + (scaled/extended) index register. */
10974 if (aarch64_base_register_rtx_p (op0, strict_p)
10975 && aarch64_classify_index (info, op1, mode, strict_p))
10977 info->base = op0;
10978 return true;
10980 if (aarch64_base_register_rtx_p (op1, strict_p)
10981 && aarch64_classify_index (info, op0, mode, strict_p))
10983 info->base = op1;
10984 return true;
10988 return false;
10990 case POST_INC:
10991 case POST_DEC:
10992 case PRE_INC:
10993 case PRE_DEC:
10994 info->type = ADDRESS_REG_WB;
10995 info->base = XEXP (x, 0);
10996 info->offset = NULL_RTX;
10997 return aarch64_base_register_rtx_p (info->base, strict_p);
10999 case POST_MODIFY:
11000 case PRE_MODIFY:
11001 info->type = ADDRESS_REG_WB;
11002 info->base = XEXP (x, 0);
11003 if (GET_CODE (XEXP (x, 1)) == PLUS
11004 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
11005 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
11006 && aarch64_base_register_rtx_p (info->base, strict_p))
11008 info->offset = XEXP (XEXP (x, 1), 1);
11009 info->const_offset = offset;
11011 /* TImode, TFmode and TDmode values are allowed in both pairs of X
11012 registers and individual Q registers. The available
11013 address modes are:
11014 X,X: 7-bit signed scaled offset
11015 Q: 9-bit signed offset
11016 We conservatively require an offset representable in either mode.
11018 if (mode == TImode || mode == TFmode || mode == TDmode)
11019 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
11020 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
11022 if (load_store_pair_p)
11023 return ((known_eq (GET_MODE_SIZE (mode), 4)
11024 || known_eq (GET_MODE_SIZE (mode), 8)
11025 || known_eq (GET_MODE_SIZE (mode), 16))
11026 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
11027 else
11028 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
11030 return false;
11032 case CONST:
11033 case SYMBOL_REF:
11034 case LABEL_REF:
11035 /* load literal: pc-relative constant pool entry. Only supported
11036 for SI mode or larger. */
11037 info->type = ADDRESS_SYMBOLIC;
11039 if (!load_store_pair_p
11040 && GET_MODE_SIZE (mode).is_constant (&const_size)
11041 && const_size >= 4)
11043 poly_int64 offset;
11044 rtx sym = strip_offset_and_salt (x, &offset);
11045 return ((LABEL_REF_P (sym)
11046 || (SYMBOL_REF_P (sym)
11047 && CONSTANT_POOL_ADDRESS_P (sym)
11048 && aarch64_pcrelative_literal_loads)));
11050 return false;
11052 case LO_SUM:
11053 info->type = ADDRESS_LO_SUM;
11054 info->base = XEXP (x, 0);
11055 info->offset = XEXP (x, 1);
11056 if (allow_reg_index_p
11057 && aarch64_base_register_rtx_p (info->base, strict_p))
11059 poly_int64 offset;
11060 HOST_WIDE_INT const_offset;
11061 rtx sym = strip_offset_and_salt (info->offset, &offset);
11062 if (SYMBOL_REF_P (sym)
11063 && offset.is_constant (&const_offset)
11064 && (aarch64_classify_symbol (sym, const_offset)
11065 == SYMBOL_SMALL_ABSOLUTE))
11067 /* The symbol and offset must be aligned to the access size. */
11068 unsigned int align;
11070 if (CONSTANT_POOL_ADDRESS_P (sym))
11071 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
11072 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
11074 tree exp = SYMBOL_REF_DECL (sym);
11075 align = TYPE_ALIGN (TREE_TYPE (exp));
11076 align = aarch64_constant_alignment (exp, align);
11078 else if (SYMBOL_REF_DECL (sym))
11079 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
11080 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
11081 && SYMBOL_REF_BLOCK (sym) != NULL)
11082 align = SYMBOL_REF_BLOCK (sym)->alignment;
11083 else
11084 align = BITS_PER_UNIT;
11086 poly_int64 ref_size = GET_MODE_SIZE (mode);
11087 if (known_eq (ref_size, 0))
11088 ref_size = GET_MODE_SIZE (DImode);
11090 return (multiple_p (const_offset, ref_size)
11091 && multiple_p (align / BITS_PER_UNIT, ref_size));
11094 return false;
11096 default:
11097 return false;
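/* A minimal usage sketch (illustrative only; the helper below is not part
   of the original file): classify the address [x0, #16] for a DImode
   access under strict checking.  */

static bool ATTRIBUTE_UNUSED
aarch64_example_classify_address (void)
{
  struct aarch64_address_info info;
  rtx base = gen_rtx_REG (Pmode, R0_REGNUM);
  rtx addr = plus_constant (Pmode, base, 16);
  /* Expected to succeed with info.type == ADDRESS_REG_IMM and
     info.const_offset == 16.  */
  return aarch64_classify_address (&info, addr, DImode, true, ADDR_QUERY_M);
}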
11101 /* Return true if the address X is valid for a PRFM instruction.
11102 STRICT_P is true if we should do strict checking with
11103 aarch64_classify_address. */
11105 bool
11106 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
11108 struct aarch64_address_info addr;
11110 /* PRFM accepts the same addresses as DImode... */
11111 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
11112 if (!res)
11113 return false;
11115 /* ... except writeback forms. */
11116 return addr.type != ADDRESS_REG_WB;
11119 bool
11120 aarch64_symbolic_address_p (rtx x)
11122 poly_int64 offset;
11123 x = strip_offset_and_salt (x, &offset);
11124 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
11127 /* Classify the base of symbolic expression X. */
11129 enum aarch64_symbol_type
11130 aarch64_classify_symbolic_expression (rtx x)
11132 rtx offset;
11134 split_const (x, &x, &offset);
11135 return aarch64_classify_symbol (x, INTVAL (offset));
11139 /* Return TRUE if X is a legitimate address for accessing memory in
11140 mode MODE. */
11141 static bool
11142 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
11144 struct aarch64_address_info addr;
11146 return aarch64_classify_address (&addr, x, mode, strict_p);
11149 /* Return TRUE if X is a legitimate address of type TYPE for accessing
11150 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
11151 bool
11152 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
11153 aarch64_addr_query_type type)
11155 struct aarch64_address_info addr;
11157 return aarch64_classify_address (&addr, x, mode, strict_p, type);
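/* For reference (illustrative, not exhaustive), the forms accepted above
   for a scalar DImode access include:
     [x0]                  base register
     [x0, #504]            scaled unsigned 12-bit or signed 9-bit immediate
     [x0, x1]              base + register
     [x0, x1, lsl #3]      base + scaled register
     [x0, w1, sxtw #3]     base + sign/zero-extended (and scaled) 32-bit register
     [x0, #8]! / [x0], #8  pre/post-indexed writeback
   plus PC-relative literal loads for constant-pool entries.  */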
11160 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
11162 static bool
11163 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
11164 poly_int64 orig_offset,
11165 machine_mode mode)
11167 HOST_WIDE_INT size;
11168 if (GET_MODE_SIZE (mode).is_constant (&size))
11170 HOST_WIDE_INT const_offset, second_offset;
11172 /* A general SVE offset is A * VQ + B. Remove the A component from
11173 coefficient 0 in order to get the constant B. */
11174 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
11176 /* Split an out-of-range address displacement into a base and an
11177 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
11178 range otherwise, to increase opportunities for sharing the base
11179 address between accesses of different sizes. Unaligned accesses use
11180 the signed 9-bit range; TImode/TFmode/TDmode use the intersection of
11181 the signed scaled 7-bit and signed 9-bit offset ranges. */
11182 if (mode == TImode || mode == TFmode || mode == TDmode)
11183 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
11184 else if ((const_offset & (size - 1)) != 0)
11185 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
11186 else
11187 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
11189 if (second_offset == 0 || known_eq (orig_offset, second_offset))
11190 return false;
11192 /* Split the offset into second_offset and the rest. */
11193 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11194 *offset2 = gen_int_mode (second_offset, Pmode);
11195 return true;
11197 else
11199 /* Get the mode we should use as the basis of the range. For structure
11200 modes this is the mode of one vector. */
11201 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11202 machine_mode step_mode
11203 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
11205 /* Get the "mul vl" multiplier we'd like to use. */
11206 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
11207 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
11208 if (vec_flags & VEC_SVE_DATA)
11209 /* LDR supports a 9-bit range, but the move patterns for
11210 structure modes require all vectors to be in range of the
11211 same base. The simplest way of accommodating that while still
11212 promoting reuse of anchor points between different modes is
11213 to use an 8-bit range unconditionally. */
11214 vnum = ((vnum + 128) & 255) - 128;
11215 else
11216 /* Predicates are only handled singly, so we might as well use
11217 the full range. */
11218 vnum = ((vnum + 256) & 511) - 256;
11219 if (vnum == 0)
11220 return false;
11222 /* Convert the "mul vl" multiplier into a byte offset. */
11223 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
11224 if (known_eq (second_offset, orig_offset))
11225 return false;
11227 /* Split the offset into second_offset and the rest. */
11228 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
11229 *offset2 = gen_int_mode (second_offset, Pmode);
11230 return true;
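/* Worked example (illustrative only): for an aligned DImode access at
   offset 0x10008, the constant-size path above computes
   second_offset = 0x10008 & 0x3ffc = 0x8, so the displacement is split
   into an anchor of 0x10000 (*offset1) plus a residual of 8 (*offset2)
   that fits the scaled 12-bit LDR/STR range.  */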
11234 /* Return the binary representation of floating point constant VALUE in INTVAL.
11235 If the value cannot be converted, return false without setting INTVAL.
11236 The conversion is done in the given MODE. */
11237 bool
11238 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
11241 /* We make a general exception for 0. */
11242 if (aarch64_float_const_zero_rtx_p (value))
11244 *intval = 0;
11245 return true;
11248 scalar_float_mode mode;
11249 if (!CONST_DOUBLE_P (value)
11250 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
11251 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
11252 /* Only support up to DF mode. */
11253 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
11254 return false;
11256 unsigned HOST_WIDE_INT ival = 0;
11258 long res[2];
11259 real_to_target (res,
11260 CONST_DOUBLE_REAL_VALUE (value),
11261 REAL_MODE_FORMAT (mode));
11263 if (mode == DFmode || mode == DDmode)
11265 int order = BYTES_BIG_ENDIAN ? 1 : 0;
11266 ival = zext_hwi (res[order], 32);
11267 ival |= (zext_hwi (res[1 - order], 32) << 32);
11269 else
11270 ival = zext_hwi (res[0], 32);
11272 *intval = ival;
11273 return true;
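/* For example, 1.0 in DFmode is returned as 0x3ff0000000000000 and 1.0 in
   SFmode as 0x3f800000 (zero-extended to 64 bits), matching the IEEE
   encodings of those constants.  */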
11276 /* Return TRUE if rtx X is an immediate constant that can be moved using a
11277 single MOV(+MOVK) followed by an FMOV. */
11278 bool
11279 aarch64_float_const_rtx_p (rtx x)
11281 machine_mode mode = GET_MODE (x);
11282 if (mode == VOIDmode)
11283 return false;
11285 /* Determine whether it's cheaper to write float constants as
11286 mov/movk pairs over ldr/adrp pairs. */
11287 unsigned HOST_WIDE_INT ival;
11289 if (CONST_DOUBLE_P (x)
11290 && SCALAR_FLOAT_MODE_P (mode)
11291 && aarch64_reinterpret_float_as_int (x, &ival))
11293 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8) ? DImode : SImode;
11294 int num_instr = aarch64_internal_mov_immediate
11295 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
11296 return num_instr < 3;
11299 return false;
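/* A minimal sketch (illustrative only; the helper below is not part of the
   original file): 1.0 reinterpreted as an integer is 0x3ff0000000000000,
   which a single MOVZ (#0x3ff0, lsl #48) can materialize, so moving it via
   MOV+FMOV is cheaper than an ADRP+LDR literal load and the predicate is
   expected to return true.  */

static bool ATTRIBUTE_UNUSED
aarch64_example_float_const_rtx_p (void)
{
  rtx one = const_double_from_real_value (dconst1, DFmode);
  return aarch64_float_const_rtx_p (one);
}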
11302 /* Return TRUE if rtx X is immediate constant 0.0 (but not in Decimal
11303 Floating Point). */
11304 bool
11305 aarch64_float_const_zero_rtx_p (rtx x)
11307 /* 0.0 in Decimal Floating Point cannot be represented by #0 or
11308 zr as our callers expect, so no need to check the actual
11309 value if X is of Decimal Floating Point type. */
11310 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_DECIMAL_FLOAT)
11311 return false;
11313 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
11314 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
11315 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
11318 /* Return TRUE if rtx X is immediate constant that fits in a single
11319 MOVI immediate operation. */
11320 bool
11321 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
11323 if (!TARGET_SIMD)
11324 return false;
11326 machine_mode vmode;
11327 scalar_int_mode imode;
11328 unsigned HOST_WIDE_INT ival;
11330 if (CONST_DOUBLE_P (x)
11331 && SCALAR_FLOAT_MODE_P (mode))
11333 if (!aarch64_reinterpret_float_as_int (x, &ival))
11334 return false;
11336 /* We make a general exception for 0. */
11337 if (aarch64_float_const_zero_rtx_p (x))
11338 return true;
11340 imode = int_mode_for_mode (mode).require ();
11342 else if (CONST_INT_P (x)
11343 && is_a <scalar_int_mode> (mode, &imode))
11344 ival = INTVAL (x);
11345 else
11346 return false;
11348 /* Use a 64-bit container mode for everything except DI/DF/DD mode, where we
11349 use a 128-bit vector mode. */
11350 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
11352 vmode = aarch64_simd_container_mode (imode, width);
11353 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
11355 return aarch64_simd_valid_immediate (v_op, NULL);
11359 /* Return the fixed registers used for condition codes. */
11361 static bool
11362 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
11364 *p1 = CC_REGNUM;
11365 *p2 = INVALID_REGNUM;
11366 return true;
11369 /* This function is used by the call expanders of the machine description.
11370 RESULT is the register in which the result is returned. It's NULL for
11371 "call" and "sibcall".
11372 MEM is the location of the function call.
11373 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
11374 SIBCALL indicates whether this function call is a normal call or a sibling call;
11375 a different pattern is generated accordingly. */
11377 void
11378 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
11380 rtx call, callee, tmp;
11381 rtvec vec;
11382 machine_mode mode;
11384 gcc_assert (MEM_P (mem));
11385 callee = XEXP (mem, 0);
11386 mode = GET_MODE (callee);
11387 gcc_assert (mode == Pmode);
11389 /* Decide if we should generate indirect calls by loading the
11390 address of the callee into a register before performing
11391 the branch-and-link. */
11392 if (SYMBOL_REF_P (callee)
11393 ? (aarch64_is_long_call_p (callee)
11394 || aarch64_is_noplt_call_p (callee))
11395 : !REG_P (callee))
11396 XEXP (mem, 0) = force_reg (mode, callee);
11398 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
11400 if (result != NULL_RTX)
11401 call = gen_rtx_SET (result, call);
11403 if (sibcall)
11404 tmp = ret_rtx;
11405 else
11406 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
11408 gcc_assert (CONST_INT_P (callee_abi));
11409 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
11410 UNSPEC_CALLEE_ABI);
11412 vec = gen_rtvec (3, call, callee_abi, tmp);
11413 call = gen_rtx_PARALLEL (VOIDmode, vec);
11415 aarch64_emit_call_insn (call);
11418 /* Emit call insn with PAT and do aarch64-specific handling. */
11420 void
11421 aarch64_emit_call_insn (rtx pat)
11423 rtx insn = emit_call_insn (pat);
11425 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
11426 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
11427 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
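/* IP0_REGNUM and IP1_REGNUM are the AAPCS64 intra-procedure-call scratch
   registers (x16 and x17); recording them in CALL_INSN_FUNCTION_USAGE
   models the fact that linker-generated veneers and PLT stubs are allowed
   to clobber them across a call.  */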
11430 machine_mode
11431 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
11433 machine_mode mode_x = GET_MODE (x);
11434 rtx_code code_x = GET_CODE (x);
11436 /* All floating point compares return CCFP if it is an equality
11437 comparison, and CCFPE otherwise. */
11438 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
11440 switch (code)
11442 case EQ:
11443 case NE:
11444 case UNORDERED:
11445 case ORDERED:
11446 case UNLT:
11447 case UNLE:
11448 case UNGT:
11449 case UNGE:
11450 case UNEQ:
11451 return CCFPmode;
11453 case LT:
11454 case LE:
11455 case GT:
11456 case GE:
11457 case LTGT:
11458 return CCFPEmode;
11460 default:
11461 gcc_unreachable ();
11465 /* Equality comparisons of short modes against zero can be performed
11466 using the TST instruction with the appropriate bitmask. */
11467 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
11468 && (code == EQ || code == NE)
11469 && (mode_x == HImode || mode_x == QImode))
11470 return CC_Zmode;
11472 /* Similarly, comparisons of zero_extends from shorter modes can
11473 be performed using an ANDS with an immediate mask. */
11474 if (y == const0_rtx && code_x == ZERO_EXTEND
11475 && (mode_x == SImode || mode_x == DImode)
11476 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
11477 && (code == EQ || code == NE))
11478 return CC_Zmode;
11480 /* Zero extracts support equality comparisons. */
11481 if ((mode_x == SImode || mode_x == DImode)
11482 && y == const0_rtx
11483 && (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
11484 && CONST_INT_P (XEXP (x, 2)))
11485 && (code == EQ || code == NE))
11486 return CC_Zmode;
11488 /* ANDS/BICS/TST support equality and all signed comparisons. */
11489 if ((mode_x == SImode || mode_x == DImode)
11490 && y == const0_rtx
11491 && (code_x == AND)
11492 && (code == EQ || code == NE || code == LT || code == GE
11493 || code == GT || code == LE))
11494 return CC_NZVmode;
11496 /* ADDS/SUBS correctly set N and Z flags. */
11497 if ((mode_x == SImode || mode_x == DImode)
11498 && y == const0_rtx
11499 && (code == EQ || code == NE || code == LT || code == GE)
11500 && (code_x == PLUS || code_x == MINUS || code_x == NEG))
11501 return CC_NZmode;
11503 /* A compare with a shifted operand. Because of canonicalization,
11504 the comparison will have to be swapped when we emit the assembly
11505 code. */
11506 if ((mode_x == SImode || mode_x == DImode)
11507 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
11508 && (code_x == ASHIFT || code_x == ASHIFTRT
11509 || code_x == LSHIFTRT
11510 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
11511 return CC_SWPmode;
11513 /* Similarly for a negated operand, but we can only do this for
11514 equalities. */
11515 if ((mode_x == SImode || mode_x == DImode)
11516 && (REG_P (y) || SUBREG_P (y))
11517 && (code == EQ || code == NE)
11518 && code_x == NEG)
11519 return CC_Zmode;
11521 /* A test for unsigned overflow from an addition. */
11522 if ((mode_x == DImode || mode_x == TImode)
11523 && (code == LTU || code == GEU)
11524 && code_x == PLUS
11525 && rtx_equal_p (XEXP (x, 0), y))
11526 return CC_Cmode;
11528 /* A test for unsigned overflow from an add with carry. */
11529 if ((mode_x == DImode || mode_x == TImode)
11530 && (code == LTU || code == GEU)
11531 && code_x == PLUS
11532 && CONST_SCALAR_INT_P (y)
11533 && (rtx_mode_t (y, mode_x)
11534 == (wi::shwi (1, mode_x)
11535 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
11536 return CC_ADCmode;
11538 /* A test for signed overflow. */
11539 if ((mode_x == DImode || mode_x == TImode)
11540 && code == NE
11541 && code_x == PLUS
11542 && GET_CODE (y) == SIGN_EXTEND)
11543 return CC_Vmode;
11545 /* For everything else, return CCmode. */
11546 return CCmode;
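/* Illustrative examples of the classification above: an EQ/NE test of a
   HImode register against zero selects CC_Zmode (a TST with 0xffff);
   comparing (and:SI (reg) (const_int 255)) against zero with EQ selects
   CC_NZVmode (ANDS); and comparing a shifted operand such as
   (ashift:DI (reg) (const_int 2)) against a register selects CC_SWPmode,
   so the condition is swapped when the assembly is emitted.  */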
11549 static int
11550 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
11553 aarch64_get_condition_code (rtx x)
11555 machine_mode mode = GET_MODE (XEXP (x, 0));
11556 enum rtx_code comp_code = GET_CODE (x);
11558 if (GET_MODE_CLASS (mode) != MODE_CC)
11559 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
11560 return aarch64_get_condition_code_1 (mode, comp_code);
11563 static int
11564 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
11566 switch (mode)
11568 case E_CCFPmode:
11569 case E_CCFPEmode:
11570 switch (comp_code)
11572 case GE: return AARCH64_GE;
11573 case GT: return AARCH64_GT;
11574 case LE: return AARCH64_LS;
11575 case LT: return AARCH64_MI;
11576 case NE: return AARCH64_NE;
11577 case EQ: return AARCH64_EQ;
11578 case ORDERED: return AARCH64_VC;
11579 case UNORDERED: return AARCH64_VS;
11580 case UNLT: return AARCH64_LT;
11581 case UNLE: return AARCH64_LE;
11582 case UNGT: return AARCH64_HI;
11583 case UNGE: return AARCH64_PL;
11584 default: return -1;
11586 break;
11588 case E_CCmode:
11589 switch (comp_code)
11591 case NE: return AARCH64_NE;
11592 case EQ: return AARCH64_EQ;
11593 case GE: return AARCH64_GE;
11594 case GT: return AARCH64_GT;
11595 case LE: return AARCH64_LE;
11596 case LT: return AARCH64_LT;
11597 case GEU: return AARCH64_CS;
11598 case GTU: return AARCH64_HI;
11599 case LEU: return AARCH64_LS;
11600 case LTU: return AARCH64_CC;
11601 default: return -1;
11603 break;
11605 case E_CC_SWPmode:
11606 switch (comp_code)
11608 case NE: return AARCH64_NE;
11609 case EQ: return AARCH64_EQ;
11610 case GE: return AARCH64_LE;
11611 case GT: return AARCH64_LT;
11612 case LE: return AARCH64_GE;
11613 case LT: return AARCH64_GT;
11614 case GEU: return AARCH64_LS;
11615 case GTU: return AARCH64_CC;
11616 case LEU: return AARCH64_CS;
11617 case LTU: return AARCH64_HI;
11618 default: return -1;
11620 break;
11622 case E_CC_NZCmode:
11623 switch (comp_code)
11625 case NE: return AARCH64_NE; /* = any */
11626 case EQ: return AARCH64_EQ; /* = none */
11627 case GE: return AARCH64_PL; /* = nfrst */
11628 case LT: return AARCH64_MI; /* = first */
11629 case GEU: return AARCH64_CS; /* = nlast */
11630 case GTU: return AARCH64_HI; /* = pmore */
11631 case LEU: return AARCH64_LS; /* = plast */
11632 case LTU: return AARCH64_CC; /* = last */
11633 default: return -1;
11635 break;
11637 case E_CC_NZVmode:
11638 switch (comp_code)
11640 case NE: return AARCH64_NE;
11641 case EQ: return AARCH64_EQ;
11642 case GE: return AARCH64_PL;
11643 case LT: return AARCH64_MI;
11644 case GT: return AARCH64_GT;
11645 case LE: return AARCH64_LE;
11646 default: return -1;
11648 break;
11650 case E_CC_NZmode:
11651 switch (comp_code)
11653 case NE: return AARCH64_NE;
11654 case EQ: return AARCH64_EQ;
11655 case GE: return AARCH64_PL;
11656 case LT: return AARCH64_MI;
11657 default: return -1;
11659 break;
11661 case E_CC_Zmode:
11662 switch (comp_code)
11664 case NE: return AARCH64_NE;
11665 case EQ: return AARCH64_EQ;
11666 default: return -1;
11668 break;
11670 case E_CC_Cmode:
11671 switch (comp_code)
11673 case LTU: return AARCH64_CS;
11674 case GEU: return AARCH64_CC;
11675 default: return -1;
11677 break;
11679 case E_CC_ADCmode:
11680 switch (comp_code)
11682 case GEU: return AARCH64_CS;
11683 case LTU: return AARCH64_CC;
11684 default: return -1;
11686 break;
11688 case E_CC_Vmode:
11689 switch (comp_code)
11691 case NE: return AARCH64_VS;
11692 case EQ: return AARCH64_VC;
11693 default: return -1;
11695 break;
11697 default:
11698 return -1;
11701 return -1;
11704 bool
11705 aarch64_const_vec_all_same_in_range_p (rtx x,
11706 HOST_WIDE_INT minval,
11707 HOST_WIDE_INT maxval)
11709 rtx elt;
11710 return (const_vec_duplicate_p (x, &elt)
11711 && CONST_INT_P (elt)
11712 && IN_RANGE (INTVAL (elt), minval, maxval));
11715 bool
11716 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
11718 return aarch64_const_vec_all_same_in_range_p (x, val, val);
11721 /* Return true if VEC is a constant in which every element is in the range
11722 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
11724 static bool
11725 aarch64_const_vec_all_in_range_p (rtx vec,
11726 HOST_WIDE_INT minval,
11727 HOST_WIDE_INT maxval)
11729 if (!CONST_VECTOR_P (vec)
11730 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
11731 return false;
11733 int nunits;
11734 if (!CONST_VECTOR_STEPPED_P (vec))
11735 nunits = const_vector_encoded_nelts (vec);
11736 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
11737 return false;
11739 for (int i = 0; i < nunits; i++)
11741 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
11742 if (!CONST_INT_P (vec_elem)
11743 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
11744 return false;
11746 return true;
11749 /* N Z C V. */
11750 #define AARCH64_CC_V 1
11751 #define AARCH64_CC_C (1 << 1)
11752 #define AARCH64_CC_Z (1 << 2)
11753 #define AARCH64_CC_N (1 << 3)
11755 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
11756 static const int aarch64_nzcv_codes[] =
11758 0, /* EQ, Z == 1. */
11759 AARCH64_CC_Z, /* NE, Z == 0. */
11760 0, /* CS, C == 1. */
11761 AARCH64_CC_C, /* CC, C == 0. */
11762 0, /* MI, N == 1. */
11763 AARCH64_CC_N, /* PL, N == 0. */
11764 0, /* VS, V == 1. */
11765 AARCH64_CC_V, /* VC, V == 0. */
11766 0, /* HI, C == 1 && Z == 0. */
11767 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
11768 AARCH64_CC_V, /* GE, N == V. */
11769 0, /* LT, N != V. */
11770 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
11771 0, /* LE, !(Z == 0 && N == V). */
11772 0, /* AL, Any. */
11773 0 /* NV, Any. */
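/* For example (assuming the usual AArch64 condition-code numbering, with
   AARCH64_NE == 1): aarch64_nzcv_codes[AARCH64_NE] is AARCH64_CC_Z == 4,
   so the '%k' output modifier below prints "4" for an NE condition
   operand.  */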
11776 /* Print floating-point vector immediate operand X to F, negating it
11777 first if NEGATE is true. Return true on success, false if it isn't
11778 a constant we can handle. */
11780 static bool
11781 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
11783 rtx elt;
11785 if (!const_vec_duplicate_p (x, &elt))
11786 return false;
11788 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
11789 if (negate)
11790 r = real_value_negate (&r);
11792 /* Handle the SVE single-bit immediates specially, since they have a
11793 fixed form in the assembly syntax. */
11794 if (real_equal (&r, &dconst0))
11795 asm_fprintf (f, "0.0");
11796 else if (real_equal (&r, &dconst2))
11797 asm_fprintf (f, "2.0");
11798 else if (real_equal (&r, &dconst1))
11799 asm_fprintf (f, "1.0");
11800 else if (real_equal (&r, &dconsthalf))
11801 asm_fprintf (f, "0.5");
11802 else
11804 const int buf_size = 20;
11805 char float_buf[buf_size] = {'\0'};
11806 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
11807 1, GET_MODE (elt));
11808 asm_fprintf (f, "%s", float_buf);
11811 return true;
11814 /* Return the equivalent letter for size. */
11815 static char
11816 sizetochar (int size)
11818 switch (size)
11820 case 64: return 'd';
11821 case 32: return 's';
11822 case 16: return 'h';
11823 case 8 : return 'b';
11824 default: gcc_unreachable ();
11828 /* Print operand X to file F in a target specific manner according to CODE.
11829 The acceptable formatting commands given by CODE are:
11830 'c': An integer or symbol address without a preceding #
11831 sign.
11832 'C': Take the duplicated element in a vector constant
11833 and print it in hex.
11834 'D': Take the duplicated element in a vector constant
11835 and print it as an unsigned integer, in decimal.
11836 'e': Print the sign/zero-extend size as a character 8->b,
11837 16->h, 32->w. Can also be used for masks:
11838 0xff->b, 0xffff->h, 0xffffffff->w.
11839 'I': If the operand is a duplicated vector constant,
11840 replace it with the duplicated scalar. If the
11841 operand is then a floating-point constant, replace
11842 it with the integer bit representation. Print the
11843 transformed constant as a signed decimal number.
11844 'p': Prints N such that 2^N == X (X must be a power of 2 and
11845 a const_int).
11846 'P': Print the number of non-zero bits in X (a const_int).
11847 'H': Print the higher numbered register of a pair (TImode)
11848 of regs.
11849 'm': Print a condition (eq, ne, etc).
11850 'M': Same as 'm', but invert condition.
11851 'N': Take the duplicated element in a vector constant
11852 and print the negative of it in decimal.
11853 'b/h/s/d/q': Print a scalar FP/SIMD register name.
11854 'S/T/U/V': Print a FP/SIMD register name for a register list.
11855 The register printed is the FP/SIMD register name
11856 of X + 0/1/2/3 for S/T/U/V.
11857 'R': Print a scalar Integer/FP/SIMD register name + 1.
11858 'X': Print bottom 16 bits of integer constant in hex.
11859 'w/x': Print a general register name or the zero register
11860 (32-bit or 64-bit).
11861 '0': Print a normal operand; if it's a general register,
11862 then we assume DImode.
11863 'k': Print NZCV for conditional compare instructions.
11864 'A': Output address constant representing the first
11865 argument of X, specifying a relocation offset
11866 if appropriate.
11867 'L': Output constant address specified by X
11868 with a relocation offset if appropriate.
11869 'G': Prints address of X, specifying a PC relative
11870 relocation mode if appropriate.
11871 'y': Output address of LDP or STP - this is used for
11872 some LDP/STPs which don't use a PARALLEL in their
11873 pattern (so the mode needs to be adjusted).
11874 'z': Output address of a typical LDP or STP. */
11876 static void
11877 aarch64_print_operand (FILE *f, rtx x, int code)
11879 rtx elt;
11880 switch (code)
11882 case 'c':
11883 if (CONST_INT_P (x))
11884 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
11885 else
11887 poly_int64 offset;
11888 rtx base = strip_offset_and_salt (x, &offset);
11889 if (SYMBOL_REF_P (base))
11890 output_addr_const (f, x);
11891 else
11892 output_operand_lossage ("unsupported operand for code '%c'", code);
11894 break;
11896 case 'e':
11898 x = unwrap_const_vec_duplicate (x);
11899 if (!CONST_INT_P (x))
11901 output_operand_lossage ("invalid operand for '%%%c'", code);
11902 return;
11905 HOST_WIDE_INT val = INTVAL (x);
11906 if ((val & ~7) == 8 || val == 0xff)
11907 fputc ('b', f);
11908 else if ((val & ~7) == 16 || val == 0xffff)
11909 fputc ('h', f);
11910 else if ((val & ~7) == 32 || val == 0xffffffff)
11911 fputc ('w', f);
11912 else
11914 output_operand_lossage ("invalid operand for '%%%c'", code);
11915 return;
11918 break;
11920 case 'p':
11922 int n;
11924 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
11926 output_operand_lossage ("invalid operand for '%%%c'", code);
11927 return;
11930 asm_fprintf (f, "%d", n);
11932 break;
11934 case 'P':
11935 if (!CONST_INT_P (x))
11937 output_operand_lossage ("invalid operand for '%%%c'", code);
11938 return;
11941 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
11942 break;
11944 case 'H':
11945 if (x == const0_rtx)
11947 asm_fprintf (f, "xzr");
11948 break;
11951 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
11953 output_operand_lossage ("invalid operand for '%%%c'", code);
11954 return;
11957 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
11958 break;
11960 case 'I':
11962 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
11963 if (CONST_INT_P (x))
11964 asm_fprintf (f, "%wd", INTVAL (x));
11965 else
11967 output_operand_lossage ("invalid operand for '%%%c'", code);
11968 return;
11970 break;
11973 case 'M':
11974 case 'm':
11976 int cond_code;
11977 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
11978 if (x == const_true_rtx)
11980 if (code == 'M')
11981 fputs ("nv", f);
11982 return;
11985 if (!COMPARISON_P (x))
11987 output_operand_lossage ("invalid operand for '%%%c'", code);
11988 return;
11991 cond_code = aarch64_get_condition_code (x);
11992 gcc_assert (cond_code >= 0);
11993 if (code == 'M')
11994 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
11995 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
11996 fputs (aarch64_sve_condition_codes[cond_code], f);
11997 else
11998 fputs (aarch64_condition_codes[cond_code], f);
12000 break;
12002 case 'N':
12003 if (!const_vec_duplicate_p (x, &elt))
12005 output_operand_lossage ("invalid vector constant");
12006 return;
12009 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12010 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
12011 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12012 && aarch64_print_vector_float_operand (f, x, true))
12014 else
12016 output_operand_lossage ("invalid vector constant");
12017 return;
12019 break;
12021 case 'b':
12022 case 'h':
12023 case 's':
12024 case 'd':
12025 case 'q':
12026 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12028 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12029 return;
12031 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
12032 break;
12034 case 'S':
12035 case 'T':
12036 case 'U':
12037 case 'V':
12038 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
12040 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
12041 return;
12043 asm_fprintf (f, "%c%d",
12044 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
12045 REGNO (x) - V0_REGNUM + (code - 'S'));
12046 break;
12048 case 'R':
12049 if (REG_P (x) && FP_REGNUM_P (REGNO (x))
12050 && (aarch64_advsimd_partial_struct_mode_p (GET_MODE (x))))
12051 asm_fprintf (f, "d%d", REGNO (x) - V0_REGNUM + 1);
12052 else if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
12053 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
12054 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12055 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
12056 else
12057 output_operand_lossage ("incompatible register operand for '%%%c'",
12058 code);
12059 break;
12061 case 'X':
12062 if (!CONST_INT_P (x))
12064 output_operand_lossage ("invalid operand for '%%%c'", code);
12065 return;
12067 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
12068 break;
12070 case 'C':
12072 /* Print a replicated constant in hex. */
12073 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12075 output_operand_lossage ("invalid operand for '%%%c'", code);
12076 return;
12078 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12079 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12081 break;
12083 case 'D':
12085 /* Print a replicated constant in decimal, treating it as
12086 unsigned. */
12087 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
12089 output_operand_lossage ("invalid operand for '%%%c'", code);
12090 return;
12092 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
12093 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
12095 break;
12097 case 'w':
12098 case 'x':
12099 if (x == const0_rtx
12100 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
12102 asm_fprintf (f, "%czr", code);
12103 break;
12106 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
12108 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
12109 break;
12112 if (REG_P (x) && REGNO (x) == SP_REGNUM)
12114 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
12115 break;
12118 /* Fall through */
12120 case 0:
12121 if (x == NULL)
12123 output_operand_lossage ("missing operand");
12124 return;
12127 switch (GET_CODE (x))
12129 case REG:
12130 if (aarch64_sve_data_mode_p (GET_MODE (x)))
12132 if (REG_NREGS (x) == 1)
12133 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
12134 else
12136 char suffix
12137 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
12138 asm_fprintf (f, "{z%d.%c - z%d.%c}",
12139 REGNO (x) - V0_REGNUM, suffix,
12140 END_REGNO (x) - V0_REGNUM - 1, suffix);
12143 else
12144 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
12145 break;
12147 case MEM:
12148 output_address (GET_MODE (x), XEXP (x, 0));
12149 break;
12151 case LABEL_REF:
12152 case SYMBOL_REF:
12153 output_addr_const (asm_out_file, x);
12154 break;
12156 case CONST_INT:
12157 asm_fprintf (f, "%wd", INTVAL (x));
12158 break;
12160 case CONST:
12161 if (!VECTOR_MODE_P (GET_MODE (x)))
12163 output_addr_const (asm_out_file, x);
12164 break;
12166 /* fall through */
12168 case CONST_VECTOR:
12169 if (!const_vec_duplicate_p (x, &elt))
12171 output_operand_lossage ("invalid vector constant");
12172 return;
12175 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
12176 asm_fprintf (f, "%wd", INTVAL (elt));
12177 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
12178 && aarch64_print_vector_float_operand (f, x, false))
12180 else
12182 output_operand_lossage ("invalid vector constant");
12183 return;
12185 break;
12187 case CONST_DOUBLE:
12188 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
12189 be getting CONST_DOUBLEs holding integers. */
12190 gcc_assert (GET_MODE (x) != VOIDmode);
12191 if (aarch64_float_const_zero_rtx_p (x))
12193 fputc ('0', f);
12194 break;
12196 else if (aarch64_float_const_representable_p (x))
12198 #define buf_size 20
12199 char float_buf[buf_size] = {'\0'};
12200 real_to_decimal_for_mode (float_buf,
12201 CONST_DOUBLE_REAL_VALUE (x),
12202 buf_size, buf_size,
12203 1, GET_MODE (x));
12204 asm_fprintf (asm_out_file, "%s", float_buf);
12205 break;
12206 #undef buf_size
12208 output_operand_lossage ("invalid constant");
12209 return;
12210 default:
12211 output_operand_lossage ("invalid operand");
12212 return;
12214 break;
12216 case 'A':
12217 if (GET_CODE (x) == HIGH)
12218 x = XEXP (x, 0);
12220 switch (aarch64_classify_symbolic_expression (x))
12222 case SYMBOL_SMALL_GOT_4G:
12223 asm_fprintf (asm_out_file, ":got:");
12224 break;
12226 case SYMBOL_SMALL_TLSGD:
12227 asm_fprintf (asm_out_file, ":tlsgd:");
12228 break;
12230 case SYMBOL_SMALL_TLSDESC:
12231 asm_fprintf (asm_out_file, ":tlsdesc:");
12232 break;
12234 case SYMBOL_SMALL_TLSIE:
12235 asm_fprintf (asm_out_file, ":gottprel:");
12236 break;
12238 case SYMBOL_TLSLE24:
12239 asm_fprintf (asm_out_file, ":tprel:");
12240 break;
12242 case SYMBOL_TINY_GOT:
12243 gcc_unreachable ();
12244 break;
12246 default:
12247 break;
12249 output_addr_const (asm_out_file, x);
12250 break;
12252 case 'L':
12253 switch (aarch64_classify_symbolic_expression (x))
12255 case SYMBOL_SMALL_GOT_4G:
12256 asm_fprintf (asm_out_file, ":got_lo12:");
12257 break;
12259 case SYMBOL_SMALL_TLSGD:
12260 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
12261 break;
12263 case SYMBOL_SMALL_TLSDESC:
12264 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
12265 break;
12267 case SYMBOL_SMALL_TLSIE:
12268 asm_fprintf (asm_out_file, ":gottprel_lo12:");
12269 break;
12271 case SYMBOL_TLSLE12:
12272 asm_fprintf (asm_out_file, ":tprel_lo12:");
12273 break;
12275 case SYMBOL_TLSLE24:
12276 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
12277 break;
12279 case SYMBOL_TINY_GOT:
12280 asm_fprintf (asm_out_file, ":got:");
12281 break;
12283 case SYMBOL_TINY_TLSIE:
12284 asm_fprintf (asm_out_file, ":gottprel:");
12285 break;
12287 default:
12288 break;
12290 output_addr_const (asm_out_file, x);
12291 break;
12293 case 'G':
12294 switch (aarch64_classify_symbolic_expression (x))
12296 case SYMBOL_TLSLE24:
12297 asm_fprintf (asm_out_file, ":tprel_hi12:");
12298 break;
12299 default:
12300 break;
12302 output_addr_const (asm_out_file, x);
12303 break;
12305 case 'k':
12307 HOST_WIDE_INT cond_code;
12309 if (!CONST_INT_P (x))
12311 output_operand_lossage ("invalid operand for '%%%c'", code);
12312 return;
12315 cond_code = INTVAL (x);
12316 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
12317 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
12319 break;
12321 case 'y':
12322 case 'z':
12324 machine_mode mode = GET_MODE (x);
12326 if (!MEM_P (x)
12327 || (code == 'y'
12328 && maybe_ne (GET_MODE_SIZE (mode), 8)
12329 && maybe_ne (GET_MODE_SIZE (mode), 16)))
12331 output_operand_lossage ("invalid operand for '%%%c'", code);
12332 return;
12335 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
12336 code == 'y'
12337 ? ADDR_QUERY_LDP_STP_N
12338 : ADDR_QUERY_LDP_STP))
12339 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12341 break;
12343 default:
12344 output_operand_lossage ("invalid operand prefix '%%%c'", code);
12345 return;
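/* A few illustrative expansions of the modifiers documented above:
   '%w' on (reg:SI x5) prints "w5" and on const0_rtx prints "wzr";
   '%x' on the same operands prints "x5" and "xzr";
   '%H' on a TImode value held in {x0, x1} prints "x1";
   '%X' on (const_int 0x12345678) prints "0x5678".  */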
12349 /* Print address 'x' of a memory access with mode 'mode'.
12350 'type' is the aarch64_addr_query_type context required by aarch64_classify_address,
12351 e.g. ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
12352 static bool
12353 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
12354 aarch64_addr_query_type type)
12356 struct aarch64_address_info addr;
12357 unsigned int size, vec_flags;
12359 /* Check that all addresses are Pmode - including for ILP32. */
12360 if (GET_MODE (x) != Pmode
12361 && (!CONST_INT_P (x)
12362 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
12364 output_operand_lossage ("invalid address mode");
12365 return false;
12368 if (aarch64_classify_address (&addr, x, mode, true, type))
12369 switch (addr.type)
12371 case ADDRESS_REG_IMM:
12372 if (known_eq (addr.const_offset, 0))
12374 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
12375 return true;
12378 vec_flags = aarch64_classify_vector_mode (mode);
12379 if (vec_flags & VEC_ANY_SVE)
12381 HOST_WIDE_INT vnum
12382 = exact_div (addr.const_offset,
12383 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
12384 asm_fprintf (f, "[%s, #%wd, mul vl]",
12385 reg_names[REGNO (addr.base)], vnum);
12386 return true;
12389 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
12390 INTVAL (addr.offset));
12391 return true;
12393 case ADDRESS_REG_REG:
12394 if (addr.shift == 0)
12395 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
12396 reg_names [REGNO (addr.offset)]);
12397 else
12398 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
12399 reg_names [REGNO (addr.offset)], addr.shift);
12400 return true;
12402 case ADDRESS_REG_UXTW:
12403 if (addr.shift == 0)
12404 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
12405 REGNO (addr.offset) - R0_REGNUM);
12406 else
12407 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
12408 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12409 return true;
12411 case ADDRESS_REG_SXTW:
12412 if (addr.shift == 0)
12413 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
12414 REGNO (addr.offset) - R0_REGNUM);
12415 else
12416 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
12417 REGNO (addr.offset) - R0_REGNUM, addr.shift);
12418 return true;
12420 case ADDRESS_REG_WB:
12421 /* Writeback is only supported for fixed-width modes. */
12422 size = GET_MODE_SIZE (mode).to_constant ();
12423 switch (GET_CODE (x))
12425 case PRE_INC:
12426 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
12427 return true;
12428 case POST_INC:
12429 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
12430 return true;
12431 case PRE_DEC:
12432 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
12433 return true;
12434 case POST_DEC:
12435 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
12436 return true;
12437 case PRE_MODIFY:
12438 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
12439 INTVAL (addr.offset));
12440 return true;
12441 case POST_MODIFY:
12442 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
12443 INTVAL (addr.offset));
12444 return true;
12445 default:
12446 break;
12448 break;
12450 case ADDRESS_LO_SUM:
12451 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
12452 output_addr_const (f, addr.offset);
12453 asm_fprintf (f, "]");
12454 return true;
12456 case ADDRESS_SYMBOLIC:
12457 output_addr_const (f, x);
12458 return true;
12461 return false;
12464 /* Print address 'x' of a memory access with mode 'mode'. */
12465 static void
12466 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
12468 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
12469 output_addr_const (f, x);
12472 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
12474 static bool
12475 aarch64_output_addr_const_extra (FILE *file, rtx x)
12477 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
12479 output_addr_const (file, XVECEXP (x, 0, 0));
12480 return true;
12482 return false;
12485 bool
12486 aarch64_label_mentioned_p (rtx x)
12488 const char *fmt;
12489 int i;
12491 if (LABEL_REF_P (x))
12492 return true;
12494 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
12495 referencing instruction, but they are constant offsets, not
12496 symbols. */
12497 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
12498 return false;
12500 fmt = GET_RTX_FORMAT (GET_CODE (x));
12501 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
12503 if (fmt[i] == 'E')
12505 int j;
12507 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
12508 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
12509 return 1;
12511 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
12512 return 1;
12515 return 0;
12518 /* Implement REGNO_REG_CLASS. */
12520 enum reg_class
12521 aarch64_regno_regclass (unsigned regno)
12523 if (STUB_REGNUM_P (regno))
12524 return STUB_REGS;
12526 if (GP_REGNUM_P (regno))
12527 return GENERAL_REGS;
12529 if (regno == SP_REGNUM)
12530 return STACK_REG;
12532 if (regno == FRAME_POINTER_REGNUM
12533 || regno == ARG_POINTER_REGNUM)
12534 return POINTER_REGS;
12536 if (FP_REGNUM_P (regno))
12537 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
12538 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
12540 if (PR_REGNUM_P (regno))
12541 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
12543 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
12544 return FFR_REGS;
12546 return NO_REGS;
12549 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
12550 If OFFSET is out of range, return an offset of an anchor point
12551 that is in range. Return 0 otherwise. */
12553 static HOST_WIDE_INT
12554 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
12555 machine_mode mode)
12557 /* Does it look like we'll need a 16-byte load/store-pair operation? */
12558 if (size > 16)
12559 return (offset + 0x400) & ~0x7f0;
12561 /* For offsets that aren't a multiple of the access size, the limit is
12562 -256...255. */
12563 if (offset & (size - 1))
12565 /* BLKmode typically uses LDP of X-registers. */
12566 if (mode == BLKmode)
12567 return (offset + 512) & ~0x3ff;
12568 return (offset + 0x100) & ~0x1ff;
12571 /* Small negative offsets are supported. */
12572 if (IN_RANGE (offset, -256, 0))
12573 return 0;
12575 if (mode == TImode || mode == TFmode || mode == TDmode)
12576 return (offset + 0x100) & ~0x1ff;
12578   /* Otherwise, use a 12-bit unsigned offset scaled by the access size.  */
12579 return offset & (~0xfff * size);
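/* Presumably the TARGET_LEGITIMIZE_ADDRESS hook (inferred from the
   signature): try to rewrite address X for an access of mode MODE into a
   form the machine can use, returning the (possibly unchanged) address.  */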
12582 static rtx
12583 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
12585 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
12586 where mask is selected by alignment and size of the offset.
12587 We try to pick as large a range for the offset as possible to
12588 maximize the chance of a CSE. However, for aligned addresses
12589 we limit the range to 4k so that structures with different sized
12590 elements are likely to use the same base. We need to be careful
12591 not to split a CONST for some forms of address expression, otherwise
12592 it will generate sub-optimal code. */
12594 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
12596 rtx base = XEXP (x, 0);
12597 rtx offset_rtx = XEXP (x, 1);
12598 HOST_WIDE_INT offset = INTVAL (offset_rtx);
12600 if (GET_CODE (base) == PLUS)
12602 rtx op0 = XEXP (base, 0);
12603 rtx op1 = XEXP (base, 1);
12605 /* Force any scaling into a temp for CSE. */
12606 op0 = force_reg (Pmode, op0);
12607 op1 = force_reg (Pmode, op1);
12609 /* Let the pointer register be in op0. */
12610 if (REG_POINTER (op1))
12611 std::swap (op0, op1);
12613 /* If the pointer is virtual or frame related, then we know that
12614 virtual register instantiation or register elimination is going
12615 to apply a second constant. We want the two constants folded
12616 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
12617 if (virt_or_elim_regno_p (REGNO (op0)))
12619 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
12620 NULL_RTX, true, OPTAB_DIRECT);
12621 return gen_rtx_PLUS (Pmode, base, op1);
12624 /* Otherwise, in order to encourage CSE (and thence loop strength
12625 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
12626 base = expand_binop (Pmode, add_optab, op0, op1,
12627 NULL_RTX, true, OPTAB_DIRECT);
12628 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
12631 HOST_WIDE_INT size;
12632 if (GET_MODE_SIZE (mode).is_constant (&size))
12634 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
12635 mode);
12636 if (base_offset != 0)
12638 base = plus_constant (Pmode, base, base_offset);
12639 base = force_operand (base, NULL_RTX);
12640 return plus_constant (Pmode, base, offset - base_offset);
12645 return x;
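/* Presumably the TARGET_SECONDARY_RELOAD hook (inferred from the
   signature): decide whether moving X of mode MODE to or from a register
   of class RCLASS needs help.  Either set SRI->icode to a reload pattern
   and return NO_REGS, return the class of a required scratch register, or
   return NO_REGS if nothing special is needed.  */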
12648 static reg_class_t
12649 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
12650 reg_class_t rclass,
12651 machine_mode mode,
12652 secondary_reload_info *sri)
12654 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
12655 LDR and STR. See the comment at the head of aarch64-sve.md for
12656 more details about the big-endian handling. */
12657 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12658 if (reg_class_subset_p (rclass, FP_REGS)
12659 && !((REG_P (x) && HARD_REGISTER_P (x))
12660 || aarch64_simd_valid_immediate (x, NULL))
12661 && mode != VNx16QImode
12662 && (vec_flags & VEC_SVE_DATA)
12663 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
12665 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
12666 return NO_REGS;
12669 /* If we have to disable direct literal pool loads and stores because the
12670 function is too big, then we need a scratch register. */
12671 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
12672 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
12673 || targetm.vector_mode_supported_p (GET_MODE (x)))
12674 && !aarch64_pcrelative_literal_loads)
12676 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
12677 return NO_REGS;
12680 /* Without the TARGET_SIMD instructions we cannot move a Q register
12681 to a Q register directly. We need a scratch. */
12682 if (REG_P (x)
12683 && (mode == TFmode
12684 || mode == TImode
12685 || mode == TDmode
12686 || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
12687 && mode == GET_MODE (x)
12688 && !TARGET_SIMD
12689 && FP_REGNUM_P (REGNO (x))
12690 && reg_class_subset_p (rclass, FP_REGS))
12692 sri->icode = code_for_aarch64_reload_mov (mode);
12693 return NO_REGS;
12696   /* A TFmode, TImode or TDmode memory access should be handled via FP_REGS
12697      because AArch64 has richer addressing modes for LDR/STR instructions
12698      than for LDP/STP instructions.  */
12699 if (TARGET_FLOAT && rclass == GENERAL_REGS
12700 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
12701 return FP_REGS;
12703 if (rclass == FP_REGS
12704 && (mode == TImode || mode == TFmode || mode == TDmode)
12705 && CONSTANT_P(x))
12706 return GENERAL_REGS;
12708 return NO_REGS;
12711 /* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
12713 static bool
12714 aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
12715 reg_class_t class2)
12717 if (!TARGET_SIMD
12718 && reg_classes_intersect_p (class1, FP_REGS)
12719 && reg_classes_intersect_p (class2, FP_REGS))
12721 /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
12722 so we can't easily split a move involving tuples of 128-bit
12723 vectors. Force the copy through memory instead.
12725 (Tuples of 64-bit vectors are fine.) */
12726 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12727 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12728 return true;
12730 return false;
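/* Presumably the TARGET_CAN_ELIMINATE hook: return true if register FROM
   can be eliminated in favour of register TO.  */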
12733 static bool
12734 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
12736 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
12738 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
12739 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
12740 if (frame_pointer_needed)
12741 return to == HARD_FRAME_POINTER_REGNUM;
12742 return true;
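/* Return the offset to add when eliminating register FROM in favour of
   register TO, based on the current frame layout (presumably used by
   INITIAL_ELIMINATION_OFFSET).  */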
12745 poly_int64
12746 aarch64_initial_elimination_offset (unsigned from, unsigned to)
12748 if (to == HARD_FRAME_POINTER_REGNUM)
12750 if (from == ARG_POINTER_REGNUM)
12751 return cfun->machine->frame.hard_fp_offset;
12753 if (from == FRAME_POINTER_REGNUM)
12754 return cfun->machine->frame.hard_fp_offset
12755 - cfun->machine->frame.locals_offset;
12758 if (to == STACK_POINTER_REGNUM)
12760 if (from == FRAME_POINTER_REGNUM)
12761 return cfun->machine->frame.frame_size
12762 - cfun->machine->frame.locals_offset;
12765 return cfun->machine->frame.frame_size;
12769 /* Get return address without mangling. */
12772 aarch64_return_addr_rtx (void)
12774 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
12775 /* Note: aarch64_return_address_signing_enabled only
12776 works after cfun->machine->frame.laid_out is set,
12777 so here we don't know if the return address will
12778 be signed or not. */
12779 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
12780 emit_move_insn (lr, val);
12781 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
12782 return lr;
12786 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
12787 previous frame. */
12790 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
12792 if (count != 0)
12793 return const0_rtx;
12794 return aarch64_return_addr_rtx ();
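/* Output the assembly template for a nested-function trampoline
   (presumably TARGET_ASM_TRAMPOLINE_TEMPLATE): a BTI landing pad, loads of
   the jump target and static chain from the literal slots appended below,
   an indirect branch, and a speculation barrier.  */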
12797 static void
12798 aarch64_asm_trampoline_template (FILE *f)
12800 /* Even if the current function doesn't have branch protection, some
12801 later function might, so since this template is only generated once
12802 we have to add a BTI just in case. */
12803 asm_fprintf (f, "\thint\t34 // bti c\n");
12805 if (TARGET_ILP32)
12807 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
12808 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
12810 else
12812 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
12813 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
12815 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
12817   /* We always emit a speculation barrier.
12818      This is because the same trampoline template is used for every nested
12819      function.  Since nested functions are not particularly common or
12820      performance-critical, we don't worry too much about the extra
12821      instructions this copies around.
12822      This is not yet a problem, since we have not yet implemented
12823      function-specific attributes to choose between hardening against
12824      straight-line speculation or not, but such attributes are likely to
12825      appear in the future.  */
12826 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
12828 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
12829 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
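/* Presumably the TARGET_TRAMPOLINE_INIT hook: copy the trampoline template
   into M_TRAMP, store FNDECL's address and CHAIN_VALUE in the trailing
   pointer slots, and flush the instruction cache over the trampoline.  */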
12832 static void
12833 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
12835 rtx fnaddr, mem, a_tramp;
12836 const int tramp_code_sz = 24;
12838   /* We don't need to copy the trailing D-words; we fill those in below.  */
12839 /* We create our own memory address in Pmode so that `emit_block_move` can
12840 use parts of the backend which expect Pmode addresses. */
12841 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
12842 emit_block_move (gen_rtx_MEM (BLKmode, temp),
12843 assemble_trampoline_template (),
12844 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
12845 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
12846 fnaddr = XEXP (DECL_RTL (fndecl), 0);
12847 if (GET_MODE (fnaddr) != ptr_mode)
12848 fnaddr = convert_memory_address (ptr_mode, fnaddr);
12849 emit_move_insn (mem, fnaddr);
12851 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
12852 emit_move_insn (mem, chain_value);
12854 /* XXX We should really define a "clear_cache" pattern and use
12855 gen_clear_cache(). */
12856 a_tramp = XEXP (m_tramp, 0);
12857 maybe_emit_call_builtin___clear_cache (a_tramp,
12858 plus_constant (ptr_mode,
12859 a_tramp,
12860 TRAMPOLINE_SIZE));
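/* Presumably the TARGET_CLASS_MAX_NREGS hook: return the maximum number of
   registers of class REGCLASS needed to hold a value of mode MODE.  */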
12863 static unsigned char
12864 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
12866 /* ??? Logically we should only need to provide a value when
12867 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
12868 can hold MODE, but at the moment we need to handle all modes.
12869 Just ignore any runtime parts for registers that can't store them. */
12870 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
12871 unsigned int nregs, vec_flags;
12872 switch (regclass)
12874 case STUB_REGS:
12875 case TAILCALL_ADDR_REGS:
12876 case POINTER_REGS:
12877 case GENERAL_REGS:
12878 case ALL_REGS:
12879 case POINTER_AND_FP_REGS:
12880 case FP_REGS:
12881 case FP_LO_REGS:
12882 case FP_LO8_REGS:
12883 vec_flags = aarch64_classify_vector_mode (mode);
12884 if ((vec_flags & VEC_SVE_DATA)
12885 && constant_multiple_p (GET_MODE_SIZE (mode),
12886 aarch64_vl_bytes (mode, vec_flags), &nregs))
12887 return nregs;
12888 return (vec_flags & VEC_ADVSIMD
12889 ? CEIL (lowest_size, UNITS_PER_VREG)
12890 : CEIL (lowest_size, UNITS_PER_WORD));
12891 case STACK_REG:
12892 case PR_REGS:
12893 case PR_LO_REGS:
12894 case PR_HI_REGS:
12895 case FFR_REGS:
12896 case PR_AND_FFR_REGS:
12897 return 1;
12899 case NO_REGS:
12900 return 0;
12902 default:
12903 break;
12905 gcc_unreachable ();
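/* Presumably the TARGET_PREFERRED_RELOAD_CLASS hook: return the register
   class to use when reloading X into a register of class REGCLASS.  */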
12908 static reg_class_t
12909 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
12911 if (regclass == POINTER_REGS)
12912 return GENERAL_REGS;
12914 if (regclass == STACK_REG)
12916 if (REG_P(x)
12917 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
12918 return regclass;
12920 return NO_REGS;
12923   /* Register elimination can result in a request for
12924      SP+constant->FP_REGS.  We cannot support such operations, which
12925      use SP as the source and an FP_REG as the destination, so reject
12926      them right now.  */
12927 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
12929 rtx lhs = XEXP (x, 0);
12931 /* Look through a possible SUBREG introduced by ILP32. */
12932 if (SUBREG_P (lhs))
12933 lhs = SUBREG_REG (lhs);
12935 gcc_assert (REG_P (lhs));
12936 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
12937 POINTER_REGS));
12938 return NO_REGS;
12941 return regclass;
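/* Output to F a reference to the assembler name NAME, prefixed with %U so
   that the user-label prefix is applied (presumably used by
   ASM_OUTPUT_LABELREF).  */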
12944 void
12945 aarch64_asm_output_labelref (FILE* f, const char *name)
12947 asm_fprintf (f, "%U%s", name);
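/* Presumably the TARGET_ASM_CONSTRUCTOR hook: record constructor SYMBOL,
   placing non-default priorities in a numbered .init_array.NNNNN section.  */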
12950 static void
12951 aarch64_elf_asm_constructor (rtx symbol, int priority)
12953 if (priority == DEFAULT_INIT_PRIORITY)
12954 default_ctor_section_asm_out_constructor (symbol, priority);
12955 else
12957 section *s;
12958       /* Although priority is known to be in the range [0, 65535], and so 18
12959 	 bytes would be enough, the compiler might not know that.  To avoid a
12960 	 -Wformat-truncation false positive, use a larger size.  */
12961 char buf[23];
12962 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
12963 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12964 switch_to_section (s);
12965 assemble_align (POINTER_SIZE);
12966 assemble_aligned_integer (POINTER_BYTES, symbol);
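/* Presumably the TARGET_ASM_DESTRUCTOR hook: record destructor SYMBOL,
   placing non-default priorities in a numbered .fini_array.NNNNN section.  */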
12970 static void
12971 aarch64_elf_asm_destructor (rtx symbol, int priority)
12973 if (priority == DEFAULT_INIT_PRIORITY)
12974 default_dtor_section_asm_out_destructor (symbol, priority);
12975 else
12977 section *s;
12978       /* Although priority is known to be in the range [0, 65535], and so 18
12979 	 bytes would be enough, the compiler might not know that.  To avoid a
12980 	 -Wformat-truncation false positive, use a larger size.  */
12981 char buf[23];
12982 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
12983 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
12984 switch_to_section (s);
12985 assemble_align (POINTER_SIZE);
12986 assemble_aligned_integer (POINTER_BYTES, symbol);
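/* Output the assembly for a switch-table dispatch sequence: load the table
   entry selected by the index, form the target address relative to an ADR
   of the table label, branch to it, and emit an SLS barrier.  OPERANDS are
   presumably those of the casesi dispatch pattern in aarch64.md.  */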
12990 const char*
12991 aarch64_output_casesi (rtx *operands)
12993 char buf[100];
12994 char label[100];
12995 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
12996 int index;
12997 static const char *const patterns[4][2] =
13000 "ldrb\t%w3, [%0,%w1,uxtw]",
13001 "add\t%3, %4, %w3, sxtb #2"
13004 "ldrh\t%w3, [%0,%w1,uxtw #1]",
13005 "add\t%3, %4, %w3, sxth #2"
13008 "ldr\t%w3, [%0,%w1,uxtw #2]",
13009 "add\t%3, %4, %w3, sxtw #2"
13011 /* We assume that DImode is only generated when not optimizing and
13012 that we don't really need 64-bit address offsets. That would
13013 imply an object file with 8GB of code in a single function! */
13015 "ldr\t%w3, [%0,%w1,uxtw #2]",
13016 "add\t%3, %4, %w3, sxtw #2"
13020 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
13022 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
13023 index = exact_log2 (GET_MODE_SIZE (mode));
13025 gcc_assert (index >= 0 && index <= 3);
13027   /* Need to implement table size reduction, by changing the code below.  */
13028 output_asm_insn (patterns[index][0], operands);
13029 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
13030 snprintf (buf, sizeof (buf),
13031 "adr\t%%4, %s", targetm.strip_name_encoding (label));
13032 output_asm_insn (buf, operands);
13033 output_asm_insn (patterns[index][1], operands);
13034 output_asm_insn ("br\t%3", operands);
13035 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
13036 operands);
13037 assemble_label (asm_out_file, label);
13038 return "";
13042 /* Return size in bits of an arithmetic operand which is shifted/scaled and
13043 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
13044 operator. */
13047 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
13049 if (shift >= 0 && shift <= 4)
13051 int size;
13052 for (size = 8; size <= 32; size *= 2)
13054 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
13055 if (mask == bits << shift)
13056 return size;
13059 return 0;
13062 /* Constant pools are per-function only when PC-relative literal
13063    loads are enabled or we are using the large memory
13064    model.  */
13066 static inline bool
13067 aarch64_can_use_per_function_literal_pools_p (void)
13069 return (aarch64_pcrelative_literal_loads
13070 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
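/* Presumably the TARGET_USE_BLOCKS_FOR_CONSTANT_P hook.  */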
13073 static bool
13074 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
13076 /* We can't use blocks for constants when we're using a per-function
13077 constant pool. */
13078 return !aarch64_can_use_per_function_literal_pools_p ();
13081 /* Select appropriate section for constants depending
13082 on where we place literal pools. */
13084 static section *
13085 aarch64_select_rtx_section (machine_mode mode,
13086 rtx x,
13087 unsigned HOST_WIDE_INT align)
13089 if (aarch64_can_use_per_function_literal_pools_p ())
13090 return function_section (current_function_decl);
13092 return default_elf_select_rtx_section (mode, x, align);
13095 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
13096 void
13097 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
13098 HOST_WIDE_INT offset)
13100   /* When using per-function literal pools, we must ensure that any code
13101      section is aligned to the minimal instruction length, otherwise the
13102      assembler will complain about "unaligned instructions".  */
13103 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
13104 ASM_OUTPUT_ALIGN (f, 2);
13107 /* Costs. */
13109 /* Helper function for rtx cost calculation. Strip a shift expression
13110 from X. Returns the inner operand if successful, or the original
13111 expression on failure. */
13112 static rtx
13113 aarch64_strip_shift (rtx x)
13115 rtx op = x;
13117 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
13118 we can convert both to ROR during final output. */
13119 if ((GET_CODE (op) == ASHIFT
13120 || GET_CODE (op) == ASHIFTRT
13121 || GET_CODE (op) == LSHIFTRT
13122 || GET_CODE (op) == ROTATERT
13123 || GET_CODE (op) == ROTATE)
13124 && CONST_INT_P (XEXP (op, 1)))
13125 return XEXP (op, 0);
13127 if (GET_CODE (op) == MULT
13128 && CONST_INT_P (XEXP (op, 1))
13129 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
13130 return XEXP (op, 0);
13132 return x;
13135 /* Helper function for rtx cost calculation. Strip an extend
13136 expression from X. Returns the inner operand if successful, or the
13137 original expression on failure. We deal with a number of possible
13138 canonicalization variations here. If STRIP_SHIFT is true, then
13139 we can strip off a shift also. */
13140 static rtx
13141 aarch64_strip_extend (rtx x, bool strip_shift)
13143 scalar_int_mode mode;
13144 rtx op = x;
13146 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
13147 return op;
13149 if (GET_CODE (op) == AND
13150 && GET_CODE (XEXP (op, 0)) == MULT
13151 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
13152 && CONST_INT_P (XEXP (op, 1))
13153 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
13154 INTVAL (XEXP (op, 1))) != 0)
13155 return XEXP (XEXP (op, 0), 0);
13157 /* Now handle extended register, as this may also have an optional
13158 left shift by 1..4. */
13159 if (strip_shift
13160 && GET_CODE (op) == ASHIFT
13161 && CONST_INT_P (XEXP (op, 1))
13162 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
13163 op = XEXP (op, 0);
13165 if (GET_CODE (op) == ZERO_EXTEND
13166 || GET_CODE (op) == SIGN_EXTEND)
13167 op = XEXP (op, 0);
13169 if (op != x)
13170 return op;
13172 return x;
13175 /* Helper function for rtx cost calculation. Strip extension as well as any
13176 inner VEC_SELECT high-half from X. Returns the inner vector operand if
13177 successful, or the original expression on failure. */
13178 static rtx
13179 aarch64_strip_extend_vec_half (rtx x)
13181 if (GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13183 x = XEXP (x, 0);
13184 if (GET_CODE (x) == VEC_SELECT
13185 && vec_series_highpart_p (GET_MODE (x), GET_MODE (XEXP (x, 0)),
13186 XEXP (x, 1)))
13187 x = XEXP (x, 0);
13189 return x;
13192 /* Helper function for rtx cost calculation. Strip VEC_DUPLICATE as well as
13193 any subsequent extend and VEC_SELECT from X. Returns the inner scalar
13194 operand if successful, or the original expression on failure. */
13195 static rtx
13196 aarch64_strip_duplicate_vec_elt (rtx x)
13198 if (GET_CODE (x) == VEC_DUPLICATE
13199 && is_a<scalar_mode> (GET_MODE (XEXP (x, 0))))
13201 x = XEXP (x, 0);
13202 if (GET_CODE (x) == VEC_SELECT)
13203 x = XEXP (x, 0);
13204 else if ((GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND)
13205 && GET_CODE (XEXP (x, 0)) == VEC_SELECT)
13206 x = XEXP (XEXP (x, 0), 0);
13208 return x;
13211 /* Return true iff CODE is a shift supported in combination
13212 with arithmetic instructions. */
13214 static bool
13215 aarch64_shift_p (enum rtx_code code)
13217 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
13221 /* Return true iff X is a cheap shift without a sign extend. */
13223 static bool
13224 aarch64_cheap_mult_shift_p (rtx x)
13226 rtx op0, op1;
13228 op0 = XEXP (x, 0);
13229 op1 = XEXP (x, 1);
13231 if (!(aarch64_tune_params.extra_tuning_flags
13232 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
13233 return false;
13235 if (GET_CODE (op0) == SIGN_EXTEND)
13236 return false;
13238 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
13239 && UINTVAL (op1) <= 4)
13240 return true;
13242 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
13243 return false;
13245 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
13247 if (l2 > 0 && l2 <= 4)
13248 return true;
13250 return false;
13253 /* Helper function for rtx cost calculation. Calculate the cost of
13254 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
13255 Return the calculated cost of the expression, recursing manually in to
13256 operands where needed. */
13258 static int
13259 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
13261 rtx op0, op1;
13262 const struct cpu_cost_table *extra_cost
13263 = aarch64_tune_params.insn_extra_cost;
13264 int cost = 0;
13265 bool compound_p = (outer == PLUS || outer == MINUS);
13266 machine_mode mode = GET_MODE (x);
13268 gcc_checking_assert (code == MULT);
13270 op0 = XEXP (x, 0);
13271 op1 = XEXP (x, 1);
13273 if (VECTOR_MODE_P (mode))
13275 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13276 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
13278 /* The select-operand-high-half versions of the instruction have the
13279 same cost as the three vector version - don't add the costs of the
13280 extension or selection into the costs of the multiply. */
13281 op0 = aarch64_strip_extend_vec_half (op0);
13282 op1 = aarch64_strip_extend_vec_half (op1);
13283 /* The by-element versions of the instruction have the same costs as
13284 the normal 3-vector version. We make an assumption that the input
13285 to the VEC_DUPLICATE is already on the FP & SIMD side. This means
13286 costing of a MUL by element pre RA is a bit optimistic. */
13287 op0 = aarch64_strip_duplicate_vec_elt (op0);
13288 op1 = aarch64_strip_duplicate_vec_elt (op1);
13290 cost += rtx_cost (op0, mode, MULT, 0, speed);
13291 cost += rtx_cost (op1, mode, MULT, 1, speed);
13292 if (speed)
13294 if (GET_CODE (x) == MULT)
13295 cost += extra_cost->vect.mult;
13296 /* This is to catch the SSRA costing currently flowing here. */
13297 else
13298 cost += extra_cost->vect.alu;
13300 return cost;
13303 /* Integer multiply/fma. */
13304 if (GET_MODE_CLASS (mode) == MODE_INT)
13306 /* The multiply will be canonicalized as a shift, cost it as such. */
13307 if (aarch64_shift_p (GET_CODE (x))
13308 || (CONST_INT_P (op1)
13309 && exact_log2 (INTVAL (op1)) > 0))
13311 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
13312 || GET_CODE (op0) == SIGN_EXTEND;
13313 if (speed)
13315 if (compound_p)
13317 /* If the shift is considered cheap,
13318 then don't add any cost. */
13319 if (aarch64_cheap_mult_shift_p (x))
13321 else if (REG_P (op1))
13322 /* ARITH + shift-by-register. */
13323 cost += extra_cost->alu.arith_shift_reg;
13324 else if (is_extend)
13325 /* ARITH + extended register. We don't have a cost field
13326 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
13327 cost += extra_cost->alu.extend_arith;
13328 else
13329 /* ARITH + shift-by-immediate. */
13330 cost += extra_cost->alu.arith_shift;
13332 else
13333 /* LSL (immediate). */
13334 cost += extra_cost->alu.shift;
13337 /* Strip extends as we will have costed them in the case above. */
13338 if (is_extend)
13339 op0 = aarch64_strip_extend (op0, true);
13341 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
13343 return cost;
13346       /* MNEG or [US]MNEGL.  Extract the NEG operand, mark the expression as a
13347 	 compound, and let the cases below handle it.  After all, MNEG is a
13348 	 special-case alias of MSUB.  */
13349 if (GET_CODE (op0) == NEG)
13351 op0 = XEXP (op0, 0);
13352 compound_p = true;
13355 /* Integer multiplies or FMAs have zero/sign extending variants. */
13356 if ((GET_CODE (op0) == ZERO_EXTEND
13357 && GET_CODE (op1) == ZERO_EXTEND)
13358 || (GET_CODE (op0) == SIGN_EXTEND
13359 && GET_CODE (op1) == SIGN_EXTEND))
13361 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
13362 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
13364 if (speed)
13366 if (compound_p)
13367 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
13368 cost += extra_cost->mult[0].extend_add;
13369 else
13370 /* MUL/SMULL/UMULL. */
13371 cost += extra_cost->mult[0].extend;
13374 return cost;
13377 /* This is either an integer multiply or a MADD. In both cases
13378 we want to recurse and cost the operands. */
13379 cost += rtx_cost (op0, mode, MULT, 0, speed);
13380 cost += rtx_cost (op1, mode, MULT, 1, speed);
13382 if (speed)
13384 if (compound_p)
13385 /* MADD/MSUB. */
13386 cost += extra_cost->mult[mode == DImode].add;
13387 else
13388 /* MUL. */
13389 cost += extra_cost->mult[mode == DImode].simple;
13392 return cost;
13394 else
13396 if (speed)
13398 	  /* Floating-point FMA/FMUL can also support negations of the
13399 	     operands, unless the rounding mode is upward or downward, in
13400 	     which case FNMUL is different from FMUL with operand negation.  */
13401 bool neg0 = GET_CODE (op0) == NEG;
13402 bool neg1 = GET_CODE (op1) == NEG;
13403 if (compound_p || !flag_rounding_math || (neg0 && neg1))
13405 if (neg0)
13406 op0 = XEXP (op0, 0);
13407 if (neg1)
13408 op1 = XEXP (op1, 0);
13411 if (compound_p)
13412 /* FMADD/FNMADD/FNMSUB/FMSUB. */
13413 cost += extra_cost->fp[mode == DFmode].fma;
13414 else
13415 /* FMUL/FNMUL. */
13416 cost += extra_cost->fp[mode == DFmode].mult;
13419 cost += rtx_cost (op0, mode, MULT, 0, speed);
13420 cost += rtx_cost (op1, mode, MULT, 1, speed);
13421 return cost;
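/* Return the cost of address X for an access of mode MODE, based on the
   tuning-specific address cost tables (presumably the TARGET_ADDRESS_COST
   hook, given the signature).  */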
13425 static int
13426 aarch64_address_cost (rtx x,
13427 machine_mode mode,
13428 addr_space_t as ATTRIBUTE_UNUSED,
13429 bool speed)
13431 enum rtx_code c = GET_CODE (x);
13432 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
13433 struct aarch64_address_info info;
13434 int cost = 0;
13435 info.shift = 0;
13437 if (!aarch64_classify_address (&info, x, mode, false))
13439 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
13441 /* This is a CONST or SYMBOL ref which will be split
13442 in a different way depending on the code model in use.
13443 Cost it through the generic infrastructure. */
13444 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
13445 /* Divide through by the cost of one instruction to
13446 bring it to the same units as the address costs. */
13447 cost_symbol_ref /= COSTS_N_INSNS (1);
13448 /* The cost is then the cost of preparing the address,
13449 followed by an immediate (possibly 0) offset. */
13450 return cost_symbol_ref + addr_cost->imm_offset;
13452 else
13454 /* This is most likely a jump table from a case
13455 statement. */
13456 return addr_cost->register_offset;
13460 switch (info.type)
13462 case ADDRESS_LO_SUM:
13463 case ADDRESS_SYMBOLIC:
13464 case ADDRESS_REG_IMM:
13465 cost += addr_cost->imm_offset;
13466 break;
13468 case ADDRESS_REG_WB:
13469 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
13470 cost += addr_cost->pre_modify;
13471 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
13473 unsigned int nvectors = aarch64_ldn_stn_vectors (mode);
13474 if (nvectors == 3)
13475 cost += addr_cost->post_modify_ld3_st3;
13476 else if (nvectors == 4)
13477 cost += addr_cost->post_modify_ld4_st4;
13478 else
13479 cost += addr_cost->post_modify;
13481 else
13482 gcc_unreachable ();
13484 break;
13486 case ADDRESS_REG_REG:
13487 cost += addr_cost->register_offset;
13488 break;
13490 case ADDRESS_REG_SXTW:
13491 cost += addr_cost->register_sextend;
13492 break;
13494 case ADDRESS_REG_UXTW:
13495 cost += addr_cost->register_zextend;
13496 break;
13498 default:
13499 gcc_unreachable ();
13503 if (info.shift > 0)
13505       /* For the sake of calculating the cost of the shifted register
13506 	 component, we can treat same-sized modes in the same way.  */
13507 if (known_eq (GET_MODE_BITSIZE (mode), 16))
13508 cost += addr_cost->addr_scale_costs.hi;
13509 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
13510 cost += addr_cost->addr_scale_costs.si;
13511 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
13512 cost += addr_cost->addr_scale_costs.di;
13513 else
13514 /* We can't tell, or this is a 128-bit vector. */
13515 cost += addr_cost->addr_scale_costs.ti;
13518 return cost;
13521 /* Return the cost of a branch. If SPEED_P is true then the compiler is
13522 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
13523 to be taken. */
13526 aarch64_branch_cost (bool speed_p, bool predictable_p)
13528 /* When optimizing for speed, use the cost of unpredictable branches. */
13529 const struct cpu_branch_cost *branch_costs =
13530 aarch64_tune_params.branch_costs;
13532 if (!speed_p || predictable_p)
13533 return branch_costs->predictable;
13534 else
13535 return branch_costs->unpredictable;
13538 /* Return true if X is a zero or sign extract
13539 usable in an ADD or SUB (extended register) instruction. */
13540 static bool
13541 aarch64_rtx_arith_op_extract_p (rtx x)
13543 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
13544 No shift. */
13545 if (GET_CODE (x) == SIGN_EXTEND
13546 || GET_CODE (x) == ZERO_EXTEND)
13547 return REG_P (XEXP (x, 0));
13549 return false;
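/* Return true if U is one of the FRINT* floating-point rounding unspecs.  */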
13552 static bool
13553 aarch64_frint_unspec_p (unsigned int u)
13555 switch (u)
13557 case UNSPEC_FRINTZ:
13558 case UNSPEC_FRINTP:
13559 case UNSPEC_FRINTM:
13560 case UNSPEC_FRINTA:
13561 case UNSPEC_FRINTN:
13562 case UNSPEC_FRINTX:
13563 case UNSPEC_FRINTI:
13564 return true;
13566 default:
13567 return false;
13571 /* Return true iff X is an rtx that will match an extr instruction
13572 i.e. as described in the *extr<mode>5_insn family of patterns.
13573 OP0 and OP1 will be set to the operands of the shifts involved
13574 on success and will be NULL_RTX otherwise. */
13576 static bool
13577 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
13579 rtx op0, op1;
13580 scalar_int_mode mode;
13581 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
13582 return false;
13584 *res_op0 = NULL_RTX;
13585 *res_op1 = NULL_RTX;
13587 if (GET_CODE (x) != IOR)
13588 return false;
13590 op0 = XEXP (x, 0);
13591 op1 = XEXP (x, 1);
13593 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
13594 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
13596 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
13597 if (GET_CODE (op1) == ASHIFT)
13598 std::swap (op0, op1);
13600 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
13601 return false;
13603 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
13604 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
13606 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
13607 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
13609 *res_op0 = XEXP (op0, 0);
13610 *res_op1 = XEXP (op1, 0);
13611 return true;
13615 return false;
13618 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
13619 storing it in *COST. Result is true if the total cost of the operation
13620 has now been calculated. */
13621 static bool
13622 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
13624 rtx inner;
13625 rtx comparator;
13626 enum rtx_code cmpcode;
13627 const struct cpu_cost_table *extra_cost
13628 = aarch64_tune_params.insn_extra_cost;
13630 if (COMPARISON_P (op0))
13632 inner = XEXP (op0, 0);
13633 comparator = XEXP (op0, 1);
13634 cmpcode = GET_CODE (op0);
13636 else
13638 inner = op0;
13639 comparator = const0_rtx;
13640 cmpcode = NE;
13643 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
13645 /* Conditional branch. */
13646 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13647 return true;
13648 else
13650 if (cmpcode == NE || cmpcode == EQ)
13652 if (comparator == const0_rtx)
13654 /* TBZ/TBNZ/CBZ/CBNZ. */
13655 if (GET_CODE (inner) == ZERO_EXTRACT)
13656 /* TBZ/TBNZ. */
13657 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
13658 ZERO_EXTRACT, 0, speed);
13659 else
13660 /* CBZ/CBNZ. */
13661 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
13663 return true;
13665 if (register_operand (inner, VOIDmode)
13666 && aarch64_imm24 (comparator, VOIDmode))
13668 /* SUB and SUBS. */
13669 *cost += COSTS_N_INSNS (2);
13670 if (speed)
13671 *cost += extra_cost->alu.arith * 2;
13672 return true;
13675 else if (cmpcode == LT || cmpcode == GE)
13677 /* TBZ/TBNZ. */
13678 if (comparator == const0_rtx)
13679 return true;
13683 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
13685 /* CCMP. */
13686 if (GET_CODE (op1) == COMPARE)
13688 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
13689 if (XEXP (op1, 1) == const0_rtx)
13690 *cost += 1;
13691 if (speed)
13693 machine_mode mode = GET_MODE (XEXP (op1, 0));
13695 if (GET_MODE_CLASS (mode) == MODE_INT)
13696 *cost += extra_cost->alu.arith;
13697 else
13698 *cost += extra_cost->fp[mode == DFmode].compare;
13700 return true;
13703 /* It's a conditional operation based on the status flags,
13704 so it must be some flavor of CSEL. */
13706 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
13707 if (GET_CODE (op1) == NEG
13708 || GET_CODE (op1) == NOT
13709 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
13710 op1 = XEXP (op1, 0);
13711 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
13713 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
13714 op1 = XEXP (op1, 0);
13715 op2 = XEXP (op2, 0);
13717 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
13719 inner = XEXP (op1, 0);
13720 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
13721 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
13722 op1 = XEXP (inner, 0);
13725 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
13726 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
13727 return true;
13730   /* We don't know what this is, so cost all operands.  */
13731 return false;
13734 /* Check whether X is a bitfield operation of the form shift + extend that
13735 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
13736 operand to which the bitfield operation is applied. Otherwise return
13737 NULL_RTX. */
13739 static rtx
13740 aarch64_extend_bitfield_pattern_p (rtx x)
13742 rtx_code outer_code = GET_CODE (x);
13743 machine_mode outer_mode = GET_MODE (x);
13745 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
13746 && outer_mode != SImode && outer_mode != DImode)
13747 return NULL_RTX;
13749 rtx inner = XEXP (x, 0);
13750 rtx_code inner_code = GET_CODE (inner);
13751 machine_mode inner_mode = GET_MODE (inner);
13752 rtx op = NULL_RTX;
13754 switch (inner_code)
13756 case ASHIFT:
13757 if (CONST_INT_P (XEXP (inner, 1))
13758 && (inner_mode == QImode || inner_mode == HImode))
13759 op = XEXP (inner, 0);
13760 break;
13761 case LSHIFTRT:
13762 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
13763 && (inner_mode == QImode || inner_mode == HImode))
13764 op = XEXP (inner, 0);
13765 break;
13766 case ASHIFTRT:
13767 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
13768 && (inner_mode == QImode || inner_mode == HImode))
13769 op = XEXP (inner, 0);
13770 break;
13771 default:
13772 break;
13775 return op;
13778 /* Return true if the mask and a shift amount from an RTX of the form
13779 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
13780 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
13782 bool
13783 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
13784 rtx shft_amnt)
13786 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
13787 && INTVAL (mask) > 0
13788 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
13789 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
13790 && (UINTVAL (mask)
13791 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
13794 /* Return true if the masks and a shift amount from an RTX of the form
13795 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
13796    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
13798 bool
13799 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
13800 unsigned HOST_WIDE_INT mask1,
13801 unsigned HOST_WIDE_INT shft_amnt,
13802 unsigned HOST_WIDE_INT mask2)
13804 unsigned HOST_WIDE_INT t;
13806 /* Verify that there is no overlap in what bits are set in the two masks. */
13807 if (mask1 != ~mask2)
13808 return false;
13810 /* Verify that mask2 is not all zeros or ones. */
13811 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
13812 return false;
13814 /* The shift amount should always be less than the mode size. */
13815 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
13817 /* Verify that the mask being shifted is contiguous and would be in the
13818 least significant bits after shifting by shft_amnt. */
13819 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
13820 return (t == (t & -t));
13823 /* Calculate the cost of calculating X, storing it in *COST. Result
13824 is true if the total cost of the operation has now been calculated. */
13825 static bool
13826 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
13827 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
13829 rtx op0, op1, op2;
13830 const struct cpu_cost_table *extra_cost
13831 = aarch64_tune_params.insn_extra_cost;
13832 rtx_code code = GET_CODE (x);
13833 scalar_int_mode int_mode;
13835 /* By default, assume that everything has equivalent cost to the
13836 cheapest instruction. Any additional costs are applied as a delta
13837 above this default. */
13838 *cost = COSTS_N_INSNS (1);
13840 switch (code)
13842 case SET:
13843 /* The cost depends entirely on the operands to SET. */
13844 *cost = 0;
13845 op0 = SET_DEST (x);
13846 op1 = SET_SRC (x);
13848 switch (GET_CODE (op0))
13850 case MEM:
13851 if (speed)
13853 rtx address = XEXP (op0, 0);
13854 if (VECTOR_MODE_P (mode))
13855 *cost += extra_cost->ldst.storev;
13856 else if (GET_MODE_CLASS (mode) == MODE_INT)
13857 *cost += extra_cost->ldst.store;
13858 else if (mode == SFmode || mode == SDmode)
13859 *cost += extra_cost->ldst.storef;
13860 else if (mode == DFmode || mode == DDmode)
13861 *cost += extra_cost->ldst.stored;
13863 *cost +=
13864 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13865 0, speed));
13868 *cost += rtx_cost (op1, mode, SET, 1, speed);
13869 return true;
13871 case SUBREG:
13872 if (! REG_P (SUBREG_REG (op0)))
13873 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
13875 /* Fall through. */
13876 case REG:
13877 /* The cost is one per vector-register copied. */
13878 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
13880 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
13881 *cost = COSTS_N_INSNS (nregs);
13883 /* const0_rtx is in general free, but we will use an
13884 instruction to set a register to 0. */
13885 else if (REG_P (op1) || op1 == const0_rtx)
13887 /* The cost is 1 per register copied. */
13888 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
13889 *cost = COSTS_N_INSNS (nregs);
13891 else
13892 /* Cost is just the cost of the RHS of the set. */
13893 *cost += rtx_cost (op1, mode, SET, 1, speed);
13894 return true;
13896 case ZERO_EXTRACT:
13897 case SIGN_EXTRACT:
13898 /* Bit-field insertion. Strip any redundant widening of
13899 the RHS to meet the width of the target. */
13900 if (SUBREG_P (op1))
13901 op1 = SUBREG_REG (op1);
13902 if ((GET_CODE (op1) == ZERO_EXTEND
13903 || GET_CODE (op1) == SIGN_EXTEND)
13904 && CONST_INT_P (XEXP (op0, 1))
13905 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
13906 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
13907 op1 = XEXP (op1, 0);
13909 if (CONST_INT_P (op1))
13911 /* MOV immediate is assumed to always be cheap. */
13912 *cost = COSTS_N_INSNS (1);
13914 else
13916 /* BFM. */
13917 if (speed)
13918 *cost += extra_cost->alu.bfi;
13919 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
13922 return true;
13924 default:
13925 	  /* We can't make sense of this; assume the default cost.  */
13926 *cost = COSTS_N_INSNS (1);
13927 return false;
13929 return false;
13931 case CONST_INT:
13932 /* If an instruction can incorporate a constant within the
13933 instruction, the instruction's expression avoids calling
13934 rtx_cost() on the constant. If rtx_cost() is called on a
13935 constant, then it is usually because the constant must be
13936 moved into a register by one or more instructions.
13938 The exception is constant 0, which can be expressed
13939 as XZR/WZR and is therefore free. The exception to this is
13940 if we have (set (reg) (const0_rtx)) in which case we must cost
13941 the move. However, we can catch that when we cost the SET, so
13942 we don't need to consider that here. */
13943 if (x == const0_rtx)
13944 *cost = 0;
13945 else
13947 	  /* To an approximation, the cost of building any other constant is
13948 	     proportional to the number of instructions
13949 	     required to build that constant.  This is true whether we
13950 	     are compiling for SPEED or otherwise.  */
13951 machine_mode imode = known_le (GET_MODE_SIZE (mode), 4)
13952 ? SImode : DImode;
13953 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
13954 (NULL_RTX, x, false, imode));
13956 return true;
13958 case CONST_DOUBLE:
13960 /* First determine number of instructions to do the move
13961 as an integer constant. */
13962 if (!aarch64_float_const_representable_p (x)
13963 && !aarch64_can_const_movi_rtx_p (x, mode)
13964 && aarch64_float_const_rtx_p (x))
13966 unsigned HOST_WIDE_INT ival;
13967 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
13968 gcc_assert (succeed);
13970 machine_mode imode = known_eq (GET_MODE_SIZE (mode), 8)
13971 ? DImode : SImode;
13972 int ncost = aarch64_internal_mov_immediate
13973 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
13974 *cost += COSTS_N_INSNS (ncost);
13975 return true;
13978 if (speed)
13980 /* mov[df,sf]_aarch64. */
13981 if (aarch64_float_const_representable_p (x))
13982 /* FMOV (scalar immediate). */
13983 *cost += extra_cost->fp[mode == DFmode || mode == DDmode].fpconst;
13984 else if (!aarch64_float_const_zero_rtx_p (x))
13986 /* This will be a load from memory. */
13987 if (mode == DFmode || mode == DDmode)
13988 *cost += extra_cost->ldst.loadd;
13989 else
13990 *cost += extra_cost->ldst.loadf;
13992 else
13993 	  /* Otherwise this is +0.0.  We get this using MOVI d0, #0
13994 	     or MOV v0.s[0], wzr - neither of which is modelled by the
13995 	     cost tables.  Just use the default cost.  */
14000 return true;
14002 case MEM:
14003 if (speed)
14005 /* For loads we want the base cost of a load, plus an
14006 approximation for the additional cost of the addressing
14007 mode. */
14008 rtx address = XEXP (x, 0);
14009 if (VECTOR_MODE_P (mode))
14010 *cost += extra_cost->ldst.loadv;
14011 else if (GET_MODE_CLASS (mode) == MODE_INT)
14012 *cost += extra_cost->ldst.load;
14013 else if (mode == SFmode || mode == SDmode)
14014 *cost += extra_cost->ldst.loadf;
14015 else if (mode == DFmode || mode == DDmode)
14016 *cost += extra_cost->ldst.loadd;
14018 *cost +=
14019 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14020 0, speed));
14023 return true;
14025 case NEG:
14026 op0 = XEXP (x, 0);
14028 if (VECTOR_MODE_P (mode))
14030 if (speed)
14032 /* FNEG. */
14033 *cost += extra_cost->vect.alu;
14035 return false;
14038 if (GET_MODE_CLASS (mode) == MODE_INT)
14040 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14041 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14043 /* CSETM. */
14044 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
14045 return true;
14048 /* Cost this as SUB wzr, X. */
14049 op0 = CONST0_RTX (mode);
14050 op1 = XEXP (x, 0);
14051 goto cost_minus;
14054 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14056 	  /* Support (neg(fma...)) as a single instruction only if the
14057 	     sign of zeros is unimportant.  This matches the decision
14058 	     making in aarch64.md.  */
14059 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
14061 /* FNMADD. */
14062 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14063 return true;
14065 if (GET_CODE (op0) == MULT)
14067 /* FNMUL. */
14068 *cost = rtx_cost (op0, mode, NEG, 0, speed);
14069 return true;
14071 if (speed)
14072 /* FNEG. */
14073 *cost += extra_cost->fp[mode == DFmode].neg;
14074 return false;
14077 return false;
14079 case CLRSB:
14080 case CLZ:
14081 if (speed)
14083 if (VECTOR_MODE_P (mode))
14084 *cost += extra_cost->vect.alu;
14085 else
14086 *cost += extra_cost->alu.clz;
14089 return false;
14091 case CTZ:
14092 *cost = COSTS_N_INSNS (2);
14094 if (speed)
14095 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
14096 return false;
14098 case COMPARE:
14099 op0 = XEXP (x, 0);
14100 op1 = XEXP (x, 1);
14102 if (op1 == const0_rtx
14103 && GET_CODE (op0) == AND)
14105 x = op0;
14106 mode = GET_MODE (op0);
14107 goto cost_logic;
14110 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
14112 	  /* TODO: A write to the CC flags possibly costs extra; this
14113 	     needs encoding in the cost tables.  */
14115 mode = GET_MODE (op0);
14116 /* ANDS. */
14117 if (GET_CODE (op0) == AND)
14119 x = op0;
14120 goto cost_logic;
14123 if (GET_CODE (op0) == PLUS)
14125 /* ADDS (and CMN alias). */
14126 x = op0;
14127 goto cost_plus;
14130 if (GET_CODE (op0) == MINUS)
14132 /* SUBS. */
14133 x = op0;
14134 goto cost_minus;
14137 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
14138 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
14139 && CONST_INT_P (XEXP (op0, 2)))
14141 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
14142 Handle it here directly rather than going to cost_logic
14143 since we know the immediate generated for the TST is valid
14144 so we can avoid creating an intermediate rtx for it only
14145 for costing purposes. */
14146 if (speed)
14147 *cost += extra_cost->alu.logical;
14149 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
14150 ZERO_EXTRACT, 0, speed);
14151 return true;
14154 if (GET_CODE (op1) == NEG)
14156 /* CMN. */
14157 if (speed)
14158 *cost += extra_cost->alu.arith;
14160 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
14161 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
14162 return true;
14165 /* CMP.
14167 Compare can freely swap the order of operands, and
14168 canonicalization puts the more complex operation first.
14169 But the integer MINUS logic expects the shift/extend
14170 operation in op1. */
14171 if (! (REG_P (op0)
14172 || (SUBREG_P (op0) && REG_P (SUBREG_REG (op0)))))
14174 op0 = XEXP (x, 1);
14175 op1 = XEXP (x, 0);
14177 goto cost_minus;
14180 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
14182 /* FCMP. */
14183 if (speed)
14184 *cost += extra_cost->fp[mode == DFmode].compare;
14186 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
14188 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
14189 /* FCMP supports constant 0.0 for no extra cost. */
14190 return true;
14192 return false;
14195 if (VECTOR_MODE_P (mode))
14197 /* Vector compare. */
14198 if (speed)
14199 *cost += extra_cost->vect.alu;
14201 if (aarch64_float_const_zero_rtx_p (op1))
14203 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
14204 cost. */
14205 return true;
14207 return false;
14209 return false;
14211 case MINUS:
14213 op0 = XEXP (x, 0);
14214 op1 = XEXP (x, 1);
14216 cost_minus:
14217 if (VECTOR_MODE_P (mode))
14219 /* SUBL2 and SUBW2. */
14220 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14221 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14223 /* The select-operand-high-half versions of the sub instruction
14224 have the same cost as the regular three vector version -
14225 don't add the costs of the select into the costs of the sub.
14227 op0 = aarch64_strip_extend_vec_half (op0);
14228 op1 = aarch64_strip_extend_vec_half (op1);
14232 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
14234 /* Detect valid immediates. */
14235 if ((GET_MODE_CLASS (mode) == MODE_INT
14236 || (GET_MODE_CLASS (mode) == MODE_CC
14237 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
14238 && CONST_INT_P (op1)
14239 && aarch64_uimm12_shift (INTVAL (op1)))
14241 if (speed)
14242 /* SUB(S) (immediate). */
14243 *cost += extra_cost->alu.arith;
14244 return true;
14247 /* Look for SUB (extended register). */
14248 if (is_a <scalar_int_mode> (mode)
14249 && aarch64_rtx_arith_op_extract_p (op1))
14251 if (speed)
14252 *cost += extra_cost->alu.extend_arith;
14254 op1 = aarch64_strip_extend (op1, true);
14255 *cost += rtx_cost (op1, VOIDmode,
14256 (enum rtx_code) GET_CODE (op1), 0, speed);
14257 return true;
14260 rtx new_op1 = aarch64_strip_extend (op1, false);
14262 /* Cost this as an FMA-alike operation. */
14263 if ((GET_CODE (new_op1) == MULT
14264 || aarch64_shift_p (GET_CODE (new_op1)))
14265 && code != COMPARE)
14267 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
14268 (enum rtx_code) code,
14269 speed);
14270 return true;
14273 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
14275 if (speed)
14277 if (VECTOR_MODE_P (mode))
14279 /* Vector SUB. */
14280 *cost += extra_cost->vect.alu;
14282 else if (GET_MODE_CLASS (mode) == MODE_INT)
14284 /* SUB(S). */
14285 *cost += extra_cost->alu.arith;
14287 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14289 /* FSUB. */
14290 *cost += extra_cost->fp[mode == DFmode].addsub;
14293 return true;
14296 case PLUS:
14298 rtx new_op0;
14300 op0 = XEXP (x, 0);
14301 op1 = XEXP (x, 1);
14303 cost_plus:
14304 if (VECTOR_MODE_P (mode))
14306 /* ADDL2 and ADDW2. */
14307 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
14308 if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
14310 /* The select-operand-high-half versions of the add instruction
14311 have the same cost as the regular three vector version -
14312 don't add the costs of the select into the costs of the add.
14314 op0 = aarch64_strip_extend_vec_half (op0);
14315 op1 = aarch64_strip_extend_vec_half (op1);
14319 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
14320 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
14322 /* CSINC. */
14323 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
14324 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14325 return true;
14328 if (GET_MODE_CLASS (mode) == MODE_INT
14329 && (aarch64_plus_immediate (op1, mode)
14330 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
14332 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14334 if (speed)
14336 /* ADD (immediate). */
14337 *cost += extra_cost->alu.arith;
14339 /* Some tunings prefer to not use the VL-based scalar ops.
14340 Increase the cost of the poly immediate to prevent their
14341 formation. */
14342 if (GET_CODE (op1) == CONST_POLY_INT
14343 && (aarch64_tune_params.extra_tuning_flags
14344 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
14345 *cost += COSTS_N_INSNS (1);
14347 return true;
14350 if (aarch64_pluslong_immediate (op1, mode))
14352 /* 24-bit add in 2 instructions or 12-bit shifted add. */
14353 if ((INTVAL (op1) & 0xfff) != 0)
14354 *cost += COSTS_N_INSNS (1);
14356 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
14357 return true;
14360 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
14362 /* Look for ADD (extended register). */
14363 if (is_a <scalar_int_mode> (mode)
14364 && aarch64_rtx_arith_op_extract_p (op0))
14366 if (speed)
14367 *cost += extra_cost->alu.extend_arith;
14369 op0 = aarch64_strip_extend (op0, true);
14370 *cost += rtx_cost (op0, VOIDmode,
14371 (enum rtx_code) GET_CODE (op0), 0, speed);
14372 return true;
14375 /* Strip any extend, leave shifts behind as we will
14376 cost them through mult_cost. */
14377 new_op0 = aarch64_strip_extend (op0, false);
14379 if (GET_CODE (new_op0) == MULT
14380 || aarch64_shift_p (GET_CODE (new_op0)))
14382 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
14383 speed);
14384 return true;
14387 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
14389 if (speed)
14391 if (VECTOR_MODE_P (mode))
14393 /* Vector ADD. */
14394 *cost += extra_cost->vect.alu;
14396 else if (GET_MODE_CLASS (mode) == MODE_INT)
14398 /* ADD. */
14399 *cost += extra_cost->alu.arith;
14401 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
14403 /* FADD. */
14404 *cost += extra_cost->fp[mode == DFmode].addsub;
14407 return true;
14410 case BSWAP:
14411 *cost = COSTS_N_INSNS (1);
14413 if (speed)
14415 if (VECTOR_MODE_P (mode))
14416 *cost += extra_cost->vect.alu;
14417 else
14418 *cost += extra_cost->alu.rev;
14420 return false;
14422 case IOR:
14423 if (aarch_rev16_p (x))
14425 *cost = COSTS_N_INSNS (1);
14427 if (speed)
14429 if (VECTOR_MODE_P (mode))
14430 *cost += extra_cost->vect.alu;
14431 else
14432 *cost += extra_cost->alu.rev;
14434 return true;
14437 if (aarch64_extr_rtx_p (x, &op0, &op1))
14439 *cost += rtx_cost (op0, mode, IOR, 0, speed);
14440 *cost += rtx_cost (op1, mode, IOR, 1, speed);
14441 if (speed)
14442 *cost += extra_cost->alu.shift;
14444 return true;
14446 /* Fall through. */
14447 case XOR:
14448 case AND:
14449 cost_logic:
14450 op0 = XEXP (x, 0);
14451 op1 = XEXP (x, 1);
14453 if (VECTOR_MODE_P (mode))
14455 if (speed)
14456 *cost += extra_cost->vect.alu;
14457 return true;
14460 if (code == AND
14461 && GET_CODE (op0) == MULT
14462 && CONST_INT_P (XEXP (op0, 1))
14463 && CONST_INT_P (op1)
14464 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
14465 INTVAL (op1)) != 0)
14467 /* This is a UBFM/SBFM. */
14468 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
14469 if (speed)
14470 *cost += extra_cost->alu.bfx;
14471 return true;
14474 if (is_int_mode (mode, &int_mode))
14476 if (CONST_INT_P (op1))
14478 /* We have a mask + shift version of a UBFIZ
14479 i.e. the *andim_ashift<mode>_bfiz pattern. */
14480 if (GET_CODE (op0) == ASHIFT
14481 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
14482 XEXP (op0, 1)))
14484 *cost += rtx_cost (XEXP (op0, 0), int_mode,
14485 (enum rtx_code) code, 0, speed);
14486 if (speed)
14487 *cost += extra_cost->alu.bfx;
14489 return true;
14491 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
14493 	      /* We possibly get the immediate for free; this is not
14494 		 modelled.  */
14495 *cost += rtx_cost (op0, int_mode,
14496 (enum rtx_code) code, 0, speed);
14497 if (speed)
14498 *cost += extra_cost->alu.logical;
14500 return true;
14503 else
14505 rtx new_op0 = op0;
14507 /* Handle ORN, EON, or BIC. */
14508 if (GET_CODE (op0) == NOT)
14509 op0 = XEXP (op0, 0);
14511 new_op0 = aarch64_strip_shift (op0);
14513 /* If we had a shift on op0 then this is a logical-shift-
14514 by-register/immediate operation. Otherwise, this is just
14515 a logical operation. */
14516 if (speed)
14518 if (new_op0 != op0)
14520 /* Shift by immediate. */
14521 if (CONST_INT_P (XEXP (op0, 1)))
14522 *cost += extra_cost->alu.log_shift;
14523 else
14524 *cost += extra_cost->alu.log_shift_reg;
14526 else
14527 *cost += extra_cost->alu.logical;
14530 /* In both cases we want to cost both operands. */
14531 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
14532 0, speed);
14533 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
14534 1, speed);
14536 return true;
14539 return false;
14541 case NOT:
14542 x = XEXP (x, 0);
14543 op0 = aarch64_strip_shift (x);
14545 if (VECTOR_MODE_P (mode))
14547 /* Vector NOT. */
14548 *cost += extra_cost->vect.alu;
14549 return false;
14552 /* MVN-shifted-reg. */
14553 if (op0 != x)
14555 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
14557 if (speed)
14558 *cost += extra_cost->alu.log_shift;
14560 return true;
14562 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
14563 Handle the second form here taking care that 'a' in the above can
14564 be a shift. */
14565 else if (GET_CODE (op0) == XOR)
14567 rtx newop0 = XEXP (op0, 0);
14568 rtx newop1 = XEXP (op0, 1);
14569 rtx op0_stripped = aarch64_strip_shift (newop0);
14571 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
14572 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
14574 if (speed)
14576 if (op0_stripped != newop0)
14577 *cost += extra_cost->alu.log_shift;
14578 else
14579 *cost += extra_cost->alu.logical;
14582 return true;
14584 /* MVN. */
14585 if (speed)
14586 *cost += extra_cost->alu.logical;
14588 return false;
14590 case ZERO_EXTEND:
14592 op0 = XEXP (x, 0);
14593 /* If a value is written in SI mode, then zero extended to DI
14594 mode, the operation will in general be free as a write to
14595 a 'w' register implicitly zeroes the upper bits of an 'x'
14596 register. However, if this is
14598 (set (reg) (zero_extend (reg)))
14600 we must cost the explicit register move. */
14601 if (mode == DImode
14602 && GET_MODE (op0) == SImode)
14604 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
14606 /* If OP_COST is non-zero, then the cost of the zero extend
14607 is effectively the cost of the inner operation. Otherwise
14608 we have a MOV instruction and we take the cost from the MOV
14609 itself. This is true independently of whether we are
14610 optimizing for space or time. */
14611 if (op_cost)
14612 *cost = op_cost;
14614 return true;
14616 else if (MEM_P (op0))
14618 /* All loads can zero extend to any size for free. */
14619 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
14620 return true;
14623 op0 = aarch64_extend_bitfield_pattern_p (x);
14624 if (op0)
14626 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
14627 if (speed)
14628 *cost += extra_cost->alu.bfx;
14629 return true;
14632 if (speed)
14634 if (VECTOR_MODE_P (mode))
14636 /* UMOV. */
14637 *cost += extra_cost->vect.alu;
14639 else
14641 /* We generate an AND instead of UXTB/UXTH. */
14642 *cost += extra_cost->alu.logical;
14645 return false;
14647 case SIGN_EXTEND:
14648 if (MEM_P (XEXP (x, 0)))
14650 /* LDRSH. */
14651 if (speed)
14653 rtx address = XEXP (XEXP (x, 0), 0);
14654 *cost += extra_cost->ldst.load_sign_extend;
14656 *cost +=
14657 COSTS_N_INSNS (aarch64_address_cost (address, mode,
14658 0, speed));
14660 return true;
14663 op0 = aarch64_extend_bitfield_pattern_p (x);
14664 if (op0)
14666 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
14667 if (speed)
14668 *cost += extra_cost->alu.bfx;
14669 return true;
14672 if (speed)
14674 if (VECTOR_MODE_P (mode))
14675 *cost += extra_cost->vect.alu;
14676 else
14677 *cost += extra_cost->alu.extend;
14679 return false;
14681 case ROTATE:
14682 case ROTATERT:
14683 case LSHIFTRT:
14684 case ASHIFTRT:
14685 case ASHIFT:
14686 op0 = XEXP (x, 0);
14687 op1 = XEXP (x, 1);
14689 if (CONST_INT_P (op1))
14691 if (speed)
14693 if (VECTOR_MODE_P (mode))
14695 /* Vector shift (immediate). */
14696 *cost += extra_cost->vect.alu;
14698 else
14700 /* LSL (immediate), ASR (immediate), UBFM, UBFIZ and friends.
14701 These are all aliases. */
14702 *cost += extra_cost->alu.shift;
14706 /* We can incorporate zero/sign extend for free. */
14707 if (GET_CODE (op0) == ZERO_EXTEND
14708 || GET_CODE (op0) == SIGN_EXTEND)
14709 op0 = XEXP (op0, 0);
14711 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
14712 return true;
14714 else
14716 if (VECTOR_MODE_P (mode))
14718 if (speed)
14719 /* Vector shift (register). */
14720 *cost += extra_cost->vect.alu;
14722 else
14724 if (speed)
14725 /* LSLV, ASRV. */
14726 *cost += extra_cost->alu.shift_reg;
14728 /* The register shift amount may be in a shorter mode expressed
14729 as a lowpart SUBREG. For costing purposes just look inside. */
14730 if (SUBREG_P (op1) && subreg_lowpart_p (op1))
14731 op1 = SUBREG_REG (op1);
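/* Masking the shift amount with the mode size minus one is redundant
   on AArch64, since the variable shift instructions (LSLV, ASRV and
   friends) already use the amount modulo the data size; for example
   (ashift:DI x (and:DI y (const_int 63))) should cost the same as
   (ashift:DI x y).  */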
14732 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
14733 && CONST_INT_P (XEXP (op1, 1))
14734 && known_eq (INTVAL (XEXP (op1, 1)),
14735 GET_MODE_BITSIZE (mode) - 1))
14737 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
14738 /* We already demanded XEXP (op1, 0) to be REG_P, so
14739 don't recurse into it. */
14740 return true;
14743 return false; /* All arguments need to be in registers. */
14746 case SYMBOL_REF:
14748 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
14749 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
14751 /* LDR. */
14752 if (speed)
14753 *cost += extra_cost->ldst.load;
14755 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
14756 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
14758 /* ADRP, followed by ADD. */
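/* As a sketch, a small-code-model address of SYM is expected to be
   materialized as:
     adrp x0, sym
     add  x0, x0, :lo12:sym
   hence one extra instruction over the baseline and two arithmetic
   cost units when optimizing for speed.  */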
14759 *cost += COSTS_N_INSNS (1);
14760 if (speed)
14761 *cost += 2 * extra_cost->alu.arith;
14763 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
14764 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14766 /* ADR. */
14767 if (speed)
14768 *cost += extra_cost->alu.arith;
14771 if (flag_pic)
14773 /* One extra load instruction, after accessing the GOT. */
14774 *cost += COSTS_N_INSNS (1);
14775 if (speed)
14776 *cost += extra_cost->ldst.load;
14778 return true;
14780 case HIGH:
14781 case LO_SUM:
14782 /* ADRP/ADD (immediate). */
14783 if (speed)
14784 *cost += extra_cost->alu.arith;
14785 return true;
14787 case ZERO_EXTRACT:
14788 case SIGN_EXTRACT:
14789 /* UBFX/SBFX. */
14790 if (speed)
14792 if (VECTOR_MODE_P (mode))
14793 *cost += extra_cost->vect.alu;
14794 else
14795 *cost += extra_cost->alu.bfx;
14798 /* We can trust that the immediates used will be correct (there
14799 are no by-register forms), so we need only cost op0. */
14800 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
14801 return true;
14803 case MULT:
14804 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
14805 /* aarch64_rtx_mult_cost always handles recursion to its
14806 operands. */
14807 return true;
14809 case MOD:
14810 /* We can expand signed mod by power of 2 using a NEGS, two parallel
14811 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
14812 an unconditional negate. This case should only ever be reached through
14813 the set_smod_pow2_cheap check in expmed.cc. */
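/* As an illustration of the expansion being costed (the code itself
   is generated by expmed.cc, not here), x % 4 for signed w0 might
   look like:
     negs  w1, w0
     and   w0, w0, 3
     and   w1, w1, 3
     csneg w0, w0, w1, mi
   i.e. four instructions: one NEGS, two ANDs and a CSNEG.  */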
14814 if (CONST_INT_P (XEXP (x, 1))
14815 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
14816 && (mode == SImode || mode == DImode))
14818 /* We expand to 4 instructions. Reset the baseline. */
14819 *cost = COSTS_N_INSNS (4);
14821 if (speed)
14822 *cost += 2 * extra_cost->alu.logical
14823 + 2 * extra_cost->alu.arith;
14825 return true;
14828 /* Fall-through. */
14829 case UMOD:
14830 if (speed)
14832 /* Slightly prefer UMOD over SMOD. */
14833 if (VECTOR_MODE_P (mode))
14834 *cost += extra_cost->vect.alu;
14835 else if (GET_MODE_CLASS (mode) == MODE_INT)
14836 *cost += (extra_cost->mult[mode == DImode].add
14837 + extra_cost->mult[mode == DImode].idiv
14838 + (code == MOD ? 1 : 0));
14840 return false; /* All arguments need to be in registers. */
14842 case DIV:
14843 case UDIV:
14844 case SQRT:
14845 if (speed)
14847 if (VECTOR_MODE_P (mode))
14848 *cost += extra_cost->vect.alu;
14849 else if (GET_MODE_CLASS (mode) == MODE_INT)
14850 /* There is no integer SQRT, so only DIV and UDIV can get
14851 here. */
14852 *cost += (extra_cost->mult[mode == DImode].idiv
14853 /* Slightly prefer UDIV over SDIV. */
14854 + (code == DIV ? 1 : 0));
14855 else
14856 *cost += extra_cost->fp[mode == DFmode].div;
14858 return false; /* All arguments need to be in registers. */
14860 case IF_THEN_ELSE:
14861 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
14862 XEXP (x, 2), cost, speed);
14864 case EQ:
14865 case NE:
14866 case GT:
14867 case GTU:
14868 case LT:
14869 case LTU:
14870 case GE:
14871 case GEU:
14872 case LE:
14873 case LEU:
14875 return false; /* All arguments must be in registers. */
14877 case FMA:
14878 op0 = XEXP (x, 0);
14879 op1 = XEXP (x, 1);
14880 op2 = XEXP (x, 2);
14882 if (speed)
14884 if (VECTOR_MODE_P (mode))
14885 *cost += extra_cost->vect.alu;
14886 else
14887 *cost += extra_cost->fp[mode == DFmode].fma;
14890 /* FMSUB, FNMADD, and FNMSUB are free. */
14891 if (GET_CODE (op0) == NEG)
14892 op0 = XEXP (op0, 0);
14894 if (GET_CODE (op2) == NEG)
14895 op2 = XEXP (op2, 0);
14897 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
14898 and the by-element operand as operand 0. */
14899 if (GET_CODE (op1) == NEG)
14900 op1 = XEXP (op1, 0);
14902 /* Catch vector-by-element operations. The by-element operand can
14903 either be (vec_duplicate (vec_select (x))) or just
14904 (vec_select (x)), depending on whether we are multiplying by
14905 a vector or a scalar.
14907 Canonicalization is not very good in these cases: FMA4 will put the
14908 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
14909 if (GET_CODE (op0) == VEC_DUPLICATE)
14910 op0 = XEXP (op0, 0);
14911 else if (GET_CODE (op1) == VEC_DUPLICATE)
14912 op1 = XEXP (op1, 0);
14914 if (GET_CODE (op0) == VEC_SELECT)
14915 op0 = XEXP (op0, 0);
14916 else if (GET_CODE (op1) == VEC_SELECT)
14917 op1 = XEXP (op1, 0);
14919 /* If the remaining parameters are not registers,
14920 get the cost to put them into registers. */
14921 *cost += rtx_cost (op0, mode, FMA, 0, speed);
14922 *cost += rtx_cost (op1, mode, FMA, 1, speed);
14923 *cost += rtx_cost (op2, mode, FMA, 2, speed);
14924 return true;
14926 case FLOAT:
14927 case UNSIGNED_FLOAT:
14928 if (speed)
14929 *cost += extra_cost->fp[mode == DFmode].fromint;
14930 return false;
14932 case FLOAT_EXTEND:
14933 if (speed)
14935 if (VECTOR_MODE_P (mode))
14937 /* Vector extend. */
14938 *cost += extra_cost->vect.alu;
14940 else
14941 *cost += extra_cost->fp[mode == DFmode].widen;
14943 return false;
14945 case FLOAT_TRUNCATE:
14946 if (speed)
14948 if (VECTOR_MODE_P (mode))
14950 /* Vector conversion. */
14951 *cost += extra_cost->vect.alu;
14953 else
14954 *cost += extra_cost->fp[mode == DFmode].narrow;
14956 return false;
14958 case FIX:
14959 case UNSIGNED_FIX:
14960 x = XEXP (x, 0);
14961 /* Strip the rounding part. They will all be implemented
14962 by the fcvt* family of instructions anyway. */
14963 if (GET_CODE (x) == UNSPEC)
14965 unsigned int uns_code = XINT (x, 1);
14967 if (uns_code == UNSPEC_FRINTA
14968 || uns_code == UNSPEC_FRINTM
14969 || uns_code == UNSPEC_FRINTN
14970 || uns_code == UNSPEC_FRINTP
14971 || uns_code == UNSPEC_FRINTZ)
14972 x = XVECEXP (x, 0, 0);
14975 if (speed)
14977 if (VECTOR_MODE_P (mode))
14978 *cost += extra_cost->vect.alu;
14979 else
14980 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
14983 /* We can combine fmul by a power of 2 followed by a fcvt into a single
14984 fixed-point fcvt. */
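/* For instance (an illustrative case), (fix:SI (mult:SF x 65536.0))
   can be emitted as a single "fcvtzs w0, s0, #16", so only the inner
   multiplication operand needs to be costed.  */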
14985 if (GET_CODE (x) == MULT
14986 && ((VECTOR_MODE_P (mode)
14987 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
14988 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
14990 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
14991 0, speed);
14992 return true;
14995 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
14996 return true;
14998 case ABS:
14999 if (VECTOR_MODE_P (mode))
15001 /* ABS (vector). */
15002 if (speed)
15003 *cost += extra_cost->vect.alu;
15005 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
15007 op0 = XEXP (x, 0);
15009 /* FABD, which is analogous to FADD. */
15010 if (GET_CODE (op0) == MINUS)
15012 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
15013 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
15014 if (speed)
15015 *cost += extra_cost->fp[mode == DFmode].addsub;
15017 return true;
15019 /* Simple FABS is analogous to FNEG. */
15020 if (speed)
15021 *cost += extra_cost->fp[mode == DFmode].neg;
15023 else
15025 /* Integer ABS will either be split into
15026 two arithmetic instructions or will be an ABS
15027 (scalar), which we don't model. */
15028 *cost = COSTS_N_INSNS (2);
15029 if (speed)
15030 *cost += 2 * extra_cost->alu.arith;
15032 return false;
15034 case SMAX:
15035 case SMIN:
15036 if (speed)
15038 if (VECTOR_MODE_P (mode))
15039 *cost += extra_cost->vect.alu;
15040 else
15042 /* FMAXNM/FMINNM/FMAX/FMIN.
15043 TODO: This may not be accurate for all implementations, but
15044 we do not model this in the cost tables. */
15045 *cost += extra_cost->fp[mode == DFmode].addsub;
15048 return false;
15050 case UNSPEC:
15051 /* The floating point round to integer frint* instructions. */
15052 if (aarch64_frint_unspec_p (XINT (x, 1)))
15054 if (speed)
15055 *cost += extra_cost->fp[mode == DFmode].roundint;
15057 return false;
15060 if (XINT (x, 1) == UNSPEC_RBIT)
15062 if (speed)
15063 *cost += extra_cost->alu.rev;
15065 return false;
15067 break;
15069 case TRUNCATE:
15071 /* Decompose <su>muldi3_highpart. */
15072 if (/* (truncate:DI */
15073 mode == DImode
15074 /* (lshiftrt:TI */
15075 && GET_MODE (XEXP (x, 0)) == TImode
15076 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
15077 /* (mult:TI */
15078 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
15079 /* (ANY_EXTEND:TI (reg:DI))
15080 (ANY_EXTEND:TI (reg:DI))) */
15081 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
15082 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
15083 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
15084 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
15085 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
15086 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
15087 /* (const_int 64) */
15088 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
15089 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
15091 /* UMULH/SMULH. */
15092 if (speed)
15093 *cost += extra_cost->mult[mode == DImode].extend;
15094 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
15095 mode, MULT, 0, speed);
15096 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
15097 mode, MULT, 1, speed);
15098 return true;
15100 break;
15101 case CONST_VECTOR:
15103 /* Load using MOVI/MVNI. */
15104 if (aarch64_simd_valid_immediate (x, NULL))
15105 *cost = extra_cost->vect.movi;
15106 else /* Load using constant pool. */
15107 *cost = extra_cost->ldst.load;
15108 break;
15110 case VEC_CONCAT:
15111 /* Depending on the operation, this is either a DUP or an INS.
15112 For now, keep the default costing. */
15113 break;
15114 case VEC_DUPLICATE:
15115 /* Load using a DUP. */
15116 *cost = extra_cost->vect.dup;
15117 return false;
15118 case VEC_SELECT:
15120 rtx op0 = XEXP (x, 0);
15121 *cost = rtx_cost (op0, GET_MODE (op0), VEC_SELECT, 0, speed);
15123 /* Cost a lowpart selection as free, a highpart as a DUP, anything else as an extract. */
15124 rtx op1 = XEXP (x, 1);
15125 if (vec_series_lowpart_p (mode, GET_MODE (op1), op1))
15127 else if (vec_series_highpart_p (mode, GET_MODE (op1), op1))
15128 *cost = extra_cost->vect.dup;
15129 else
15130 *cost = extra_cost->vect.extract;
15131 return true;
15133 default:
15134 break;
15137 if (dump_file
15138 && flag_aarch64_verbose_cost)
15139 fprintf (dump_file,
15140 "\nFailed to cost RTX. Assuming default cost.\n");
15142 return true;
15145 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
15146 calculated for X. This cost is stored in *COST. Returns true
15147 if the total cost of X was calculated. */
15148 static bool
15149 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
15150 int param, int *cost, bool speed)
15152 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
15154 if (dump_file
15155 && flag_aarch64_verbose_cost)
15157 print_rtl_single (dump_file, x);
15158 fprintf (dump_file, "\n%s cost: %d (%s)\n",
15159 speed ? "Hot" : "Cold",
15160 *cost, result ? "final" : "partial");
15163 return result;
15166 static int
15167 aarch64_register_move_cost (machine_mode mode,
15168 reg_class_t from_i, reg_class_t to_i)
15170 enum reg_class from = (enum reg_class) from_i;
15171 enum reg_class to = (enum reg_class) to_i;
15172 const struct cpu_regmove_cost *regmove_cost
15173 = aarch64_tune_params.regmove_cost;
15175 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
15176 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
15177 || to == STUB_REGS)
15178 to = GENERAL_REGS;
15180 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
15181 || from == STUB_REGS)
15182 from = GENERAL_REGS;
15184 /* Make RDFFR very expensive. In particular, if we know that the FFR
15185 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
15186 as a way of obtaining a PTRUE. */
15187 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15188 && hard_reg_set_subset_p (reg_class_contents[from_i],
15189 reg_class_contents[FFR_REGS]))
15190 return 80;
15192 /* Moving between GPR and stack cost is the same as GP2GP. */
15193 if ((from == GENERAL_REGS && to == STACK_REG)
15194 || (to == GENERAL_REGS && from == STACK_REG))
15195 return regmove_cost->GP2GP;
15197 /* To/From the stack register, we move via the gprs. */
15198 if (to == STACK_REG || from == STACK_REG)
15199 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
15200 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
15202 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
15203 if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
15204 && known_eq (GET_MODE_SIZE (mode), 16))
15206 /* 128-bit operations on general registers require 2 instructions. */
15207 if (from == GENERAL_REGS && to == GENERAL_REGS)
15208 return regmove_cost->GP2GP * 2;
15209 else if (from == GENERAL_REGS)
15210 return regmove_cost->GP2FP * 2;
15211 else if (to == GENERAL_REGS)
15212 return regmove_cost->FP2GP * 2;
15214 /* When AdvSIMD instructions are disabled it is not possible to move
15215 a 128-bit value directly between Q registers. This is handled in
15216 secondary reload. A general register is used as a scratch to move
15217 the upper DI value and the lower DI value is moved directly,
15218 hence the cost is the sum of three moves. */
15219 if (! TARGET_SIMD)
15220 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
15222 return regmove_cost->FP2FP;
15225 if (from == GENERAL_REGS && to == GENERAL_REGS)
15226 return regmove_cost->GP2GP;
15227 else if (from == GENERAL_REGS)
15228 return regmove_cost->GP2FP;
15229 else if (to == GENERAL_REGS)
15230 return regmove_cost->FP2GP;
15232 if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
15234 /* Needs a round-trip through memory, which can use LDP/STP for pairs.
15235 The cost must be greater than 2 units to indicate that direct
15236 moves aren't possible. */
15237 auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
15238 + aarch64_tune_params.memmov_cost.store_fp);
15239 return MIN (CEIL (per_vector, 2), 4);
15242 return regmove_cost->FP2FP;
15245 /* Implements TARGET_MEMORY_MOVE_COST. */
15246 static int
15247 aarch64_memory_move_cost (machine_mode mode, reg_class_t rclass_i, bool in)
15249 enum reg_class rclass = (enum reg_class) rclass_i;
15250 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
15251 ? reg_classes_intersect_p (rclass, PR_REGS)
15252 : reg_class_subset_p (rclass, PR_REGS))
15253 return (in
15254 ? aarch64_tune_params.memmov_cost.load_pred
15255 : aarch64_tune_params.memmov_cost.store_pred);
15257 if (VECTOR_MODE_P (mode) || FLOAT_MODE_P (mode)
15258 ? reg_classes_intersect_p (rclass, FP_REGS)
15259 : reg_class_subset_p (rclass, FP_REGS))
15260 return (in
15261 ? aarch64_tune_params.memmov_cost.load_fp
15262 : aarch64_tune_params.memmov_cost.store_fp);
15264 return (in
15265 ? aarch64_tune_params.memmov_cost.load_int
15266 : aarch64_tune_params.memmov_cost.store_int);
15269 /* Implement TARGET_INIT_BUILTINS. */
15270 static void
15271 aarch64_init_builtins ()
15273 aarch64_general_init_builtins ();
15274 aarch64_sve::init_builtins ();
15275 #ifdef SUBTARGET_INIT_BUILTINS
15276 SUBTARGET_INIT_BUILTINS;
15277 #endif
15280 /* Implement TARGET_FOLD_BUILTIN. */
15281 static tree
15282 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
15284 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15285 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15286 tree type = TREE_TYPE (TREE_TYPE (fndecl));
15287 switch (code & AARCH64_BUILTIN_CLASS)
15289 case AARCH64_BUILTIN_GENERAL:
15290 return aarch64_general_fold_builtin (subcode, type, nargs, args);
15292 case AARCH64_BUILTIN_SVE:
15293 return NULL_TREE;
15295 gcc_unreachable ();
15298 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
15299 static bool
15300 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
15302 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
15303 tree fndecl = gimple_call_fndecl (stmt);
15304 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15305 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15306 gimple *new_stmt = NULL;
15307 switch (code & AARCH64_BUILTIN_CLASS)
15309 case AARCH64_BUILTIN_GENERAL:
15310 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt, gsi);
15311 break;
15313 case AARCH64_BUILTIN_SVE:
15314 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
15315 break;
15318 if (!new_stmt)
15319 return false;
15321 gsi_replace (gsi, new_stmt, false);
15322 return true;
15325 /* Implement TARGET_EXPAND_BUILTIN. */
15326 static rtx
15327 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
15329 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
15330 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15331 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15332 switch (code & AARCH64_BUILTIN_CLASS)
15334 case AARCH64_BUILTIN_GENERAL:
15335 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
15337 case AARCH64_BUILTIN_SVE:
15338 return aarch64_sve::expand_builtin (subcode, exp, target);
15340 gcc_unreachable ();
15343 /* Implement TARGET_BUILTIN_DECL. */
15344 static tree
15345 aarch64_builtin_decl (unsigned int code, bool initialize_p)
15347 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15348 switch (code & AARCH64_BUILTIN_CLASS)
15350 case AARCH64_BUILTIN_GENERAL:
15351 return aarch64_general_builtin_decl (subcode, initialize_p);
15353 case AARCH64_BUILTIN_SVE:
15354 return aarch64_sve::builtin_decl (subcode, initialize_p);
15356 gcc_unreachable ();
15359 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
15360 to optimize 1.0/sqrt. */
15362 static bool
15363 use_rsqrt_p (machine_mode mode)
15365 return (!flag_trapping_math
15366 && flag_unsafe_math_optimizations
15367 && ((aarch64_tune_params.approx_modes->recip_sqrt
15368 & AARCH64_APPROX_MODE (mode))
15369 || flag_mrecip_low_precision_sqrt));
15372 /* Function to decide when to use the approximate reciprocal square root
15373 builtin. */
15375 static tree
15376 aarch64_builtin_reciprocal (tree fndecl)
15378 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
15380 if (!use_rsqrt_p (mode))
15381 return NULL_TREE;
15382 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
15383 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
15384 switch (code & AARCH64_BUILTIN_CLASS)
15386 case AARCH64_BUILTIN_GENERAL:
15387 return aarch64_general_builtin_rsqrt (subcode);
15389 case AARCH64_BUILTIN_SVE:
15390 return NULL_TREE;
15392 gcc_unreachable ();
15395 /* Emit code to perform the floating-point operation:
15397 DST = SRC1 * SRC2
15399 where all three operands are already known to be registers.
15400 If the operation is an SVE one, PTRUE is a suitable all-true
15401 predicate. */
15403 static void
15404 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
15406 if (ptrue)
15407 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
15408 dst, ptrue, src1, src2,
15409 gen_int_mode (SVE_RELAXED_GP, SImode)));
15410 else
15411 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
15414 /* Emit instruction sequence to compute either the approximate square root
15415 or its approximate reciprocal, depending on the flag RECP, and return
15416 whether the sequence was emitted or not. */
15418 bool
15419 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
15421 machine_mode mode = GET_MODE (dst);
15423 if (GET_MODE_INNER (mode) == HFmode)
15425 gcc_assert (!recp);
15426 return false;
15429 if (!recp)
15431 if (!(flag_mlow_precision_sqrt
15432 || (aarch64_tune_params.approx_modes->sqrt
15433 & AARCH64_APPROX_MODE (mode))))
15434 return false;
15436 if (!flag_finite_math_only
15437 || flag_trapping_math
15438 || !flag_unsafe_math_optimizations
15439 || optimize_function_for_size_p (cfun))
15440 return false;
15442 else
15443 /* Caller assumes we cannot fail. */
15444 gcc_assert (use_rsqrt_p (mode));
15446 rtx pg = NULL_RTX;
15447 if (aarch64_sve_mode_p (mode))
15448 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15449 machine_mode mmsk = (VECTOR_MODE_P (mode)
15450 ? related_int_vector_mode (mode).require ()
15451 : int_mode_for_mode (mode).require ());
15452 rtx xmsk = NULL_RTX;
15453 if (!recp)
15455 /* When calculating the approximate square root, compare the
15456 argument with 0.0 and create a mask. */
15457 rtx zero = CONST0_RTX (mode);
15458 if (pg)
15460 xmsk = gen_reg_rtx (GET_MODE (pg));
15461 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
15462 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
15463 xmsk, pg, hint, src, zero));
15465 else
15467 xmsk = gen_reg_rtx (mmsk);
15468 emit_insn (gen_rtx_SET (xmsk,
15469 gen_rtx_NEG (mmsk,
15470 gen_rtx_EQ (mmsk, src, zero))));
15474 /* Estimate the approximate reciprocal square root. */
15475 rtx xdst = gen_reg_rtx (mode);
15476 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
15478 /* Iterate over the series twice for SF and thrice for DF. */
15479 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15481 /* Optionally iterate over the series one time fewer for faster
15482 performance, at the cost of some accuracy. */
15483 if ((recp && flag_mrecip_low_precision_sqrt)
15484 || (!recp && flag_mlow_precision_sqrt))
15485 iterations--;
15487 /* Iterate over the series to calculate the approximate reciprocal square
15488 root. */
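/* Each FRSQRTE/FRSQRTS pair below performs one Newton-Raphson step
   for 1/sqrt(src): given an estimate x, FRSQRTS (a, b) computes
   (3 - a * b) / 2, so x * FRSQRTS (src, x * x) refines x.  Each step
   roughly doubles the number of accurate bits, which is why SF uses
   two steps and DF three (fewer with the low-precision flags).  */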
15489 rtx x1 = gen_reg_rtx (mode);
15490 while (iterations--)
15492 rtx x2 = gen_reg_rtx (mode);
15493 aarch64_emit_mult (x2, pg, xdst, xdst);
15495 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
15497 if (iterations > 0)
15498 aarch64_emit_mult (xdst, pg, xdst, x1);
15501 if (!recp)
15503 if (pg)
15504 /* Multiply nonzero source values by the corresponding intermediate
15505 result elements, so that the final calculation is the approximate
15506 square root rather than its reciprocal. Select a zero result for
15507 zero source values, to avoid the Inf * 0 -> NaN that we'd get
15508 otherwise. */
15509 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
15510 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
15511 else
15513 /* Qualify the approximate reciprocal square root when the
15514 argument is 0.0 by squashing the intermediate result to 0.0. */
15515 rtx xtmp = gen_reg_rtx (mmsk);
15516 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
15517 gen_rtx_SUBREG (mmsk, xdst, 0)));
15518 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
15520 /* Calculate the approximate square root. */
15521 aarch64_emit_mult (xdst, pg, xdst, src);
15525 /* Finalize the approximation. */
15526 aarch64_emit_mult (dst, pg, xdst, x1);
15528 return true;
15531 /* Emit the instruction sequence to compute the approximation for the division
15532 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
15534 bool
15535 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
15537 machine_mode mode = GET_MODE (quo);
15539 if (GET_MODE_INNER (mode) == HFmode)
15540 return false;
15542 bool use_approx_division_p = (flag_mlow_precision_div
15543 || (aarch64_tune_params.approx_modes->division
15544 & AARCH64_APPROX_MODE (mode)));
15546 if (!flag_finite_math_only
15547 || flag_trapping_math
15548 || !flag_unsafe_math_optimizations
15549 || optimize_function_for_size_p (cfun)
15550 || !use_approx_division_p)
15551 return false;
15553 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
15554 return false;
15556 rtx pg = NULL_RTX;
15557 if (aarch64_sve_mode_p (mode))
15558 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
15560 /* Estimate the approximate reciprocal. */
15561 rtx xrcp = gen_reg_rtx (mode);
15562 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
15564 /* Iterate over the series twice for SF and thrice for DF. */
15565 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
15567 /* Optionally iterate over the series fewer times for faster performance,
15568 at the cost of some accuracy. The default is 2 for DF and 1 for SF. */
15569 if (flag_mlow_precision_div)
15570 iterations = (GET_MODE_INNER (mode) == DFmode
15571 ? aarch64_double_recp_precision
15572 : aarch64_float_recp_precision);
15574 /* Iterate over the series to calculate the approximate reciprocal. */
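/* Each FRECPE/FRECPS pair below performs one Newton-Raphson step for
   1/den: given an estimate x, FRECPS (a, b) computes 2 - a * b, so
   x * FRECPS (den, x) refines x, again roughly doubling the number
   of accurate bits per step.  */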
15575 rtx xtmp = gen_reg_rtx (mode);
15576 while (iterations--)
15578 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
15580 if (iterations > 0)
15581 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
15584 if (num != CONST1_RTX (mode))
15586 /* As the approximate reciprocal of DEN is already calculated, only
15587 calculate the approximate division when NUM is not 1.0. */
15588 rtx xnum = force_reg (mode, num);
15589 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
15592 /* Finalize the approximation. */
15593 aarch64_emit_mult (quo, pg, xrcp, xtmp);
15594 return true;
15597 /* Return the number of instructions that can be issued per cycle. */
15598 static int
15599 aarch64_sched_issue_rate (void)
15601 return aarch64_tune_params.issue_rate;
15604 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
15605 static int
15606 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
15608 if (DEBUG_INSN_P (insn))
15609 return more;
15611 rtx_code code = GET_CODE (PATTERN (insn));
15612 if (code == USE || code == CLOBBER)
15613 return more;
15615 if (get_attr_type (insn) == TYPE_NO_INSN)
15616 return more;
15618 return more - 1;
15621 static int
15622 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
15624 int issue_rate = aarch64_sched_issue_rate ();
15626 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
15630 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
15631 autopref_multipass_dfa_lookahead_guard from haifa-sched.cc. It only
15632 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
15634 static int
15635 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
15636 int ready_index)
15638 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
15642 /* Vectorizer cost model target hooks. */
15644 /* If a vld1 from address ADDR should be recorded in vector_load_decls,
15645 return the decl that should be recorded. Return null otherwise. */
15646 tree
15647 aarch64_vector_load_decl (tree addr)
15649 if (TREE_CODE (addr) != ADDR_EXPR)
15650 return NULL_TREE;
15651 tree base = get_base_address (TREE_OPERAND (addr, 0));
15652 if (TREE_CODE (base) != VAR_DECL)
15653 return NULL_TREE;
15654 return base;
15657 /* Return true if STMT_INFO accesses a decl that is known to be the
15658 argument to a vld1 in the same function. */
15659 static bool
15660 aarch64_accesses_vector_load_decl_p (stmt_vec_info stmt_info)
15662 if (!cfun->machine->vector_load_decls)
15663 return false;
15664 auto dr = STMT_VINFO_DATA_REF (stmt_info);
15665 if (!dr)
15666 return false;
15667 tree decl = aarch64_vector_load_decl (DR_BASE_ADDRESS (dr));
15668 return decl && cfun->machine->vector_load_decls->contains (decl);
15671 /* Information about how the CPU would issue the scalar, Advanced SIMD
15672 or SVE version of a vector loop, using the scheme defined by the
15673 aarch64_base_vec_issue_info hierarchy of structures. */
15674 class aarch64_vec_op_count
15676 public:
15677 aarch64_vec_op_count () = default;
15678 aarch64_vec_op_count (const aarch64_vec_issue_info *, unsigned int,
15679 unsigned int = 1);
15681 unsigned int vec_flags () const { return m_vec_flags; }
15682 unsigned int vf_factor () const { return m_vf_factor; }
15684 const aarch64_base_vec_issue_info *base_issue_info () const;
15685 const aarch64_simd_vec_issue_info *simd_issue_info () const;
15686 const aarch64_sve_vec_issue_info *sve_issue_info () const;
15688 fractional_cost rename_cycles_per_iter () const;
15689 fractional_cost min_nonpred_cycles_per_iter () const;
15690 fractional_cost min_pred_cycles_per_iter () const;
15691 fractional_cost min_cycles_per_iter () const;
15693 void dump () const;
15695 /* The number of individual "general" operations. See the comments
15696 in aarch64_base_vec_issue_info for details. */
15697 unsigned int general_ops = 0;
15699 /* The number of load and store operations, under the same scheme
15700 as above. */
15701 unsigned int loads = 0;
15702 unsigned int stores = 0;
15704 /* The minimum number of cycles needed to execute all loop-carried
15705 operations, which in the vector code become associated with
15706 reductions. */
15707 unsigned int reduction_latency = 0;
15709 /* The number of individual predicate operations. See the comments
15710 in aarch64_sve_vec_issue_info for details. */
15711 unsigned int pred_ops = 0;
15713 private:
15714 /* The issue information for the core. */
15715 const aarch64_vec_issue_info *m_issue_info = nullptr;
15717 /* - If M_VEC_FLAGS is zero then this structure describes scalar code
15718 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then this structure describes
15719 Advanced SIMD code.
15720 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then this structure describes
15721 SVE code. */
15722 unsigned int m_vec_flags = 0;
15724 /* Assume that, when the code is executing on the core described
15725 by M_ISSUE_INFO, one iteration of the loop will handle M_VF_FACTOR
15726 times more data than the vectorizer anticipates.
15728 This is only ever different from 1 for SVE. It allows us to consider
15729 what would happen on a 256-bit SVE target even when the -mtune
15730 parameters say that the “likely” SVE length is 128 bits. */
15731 unsigned int m_vf_factor = 1;
15734 aarch64_vec_op_count::
15735 aarch64_vec_op_count (const aarch64_vec_issue_info *issue_info,
15736 unsigned int vec_flags, unsigned int vf_factor)
15737 : m_issue_info (issue_info),
15738 m_vec_flags (vec_flags),
15739 m_vf_factor (vf_factor)
15743 /* Return the base issue information (i.e. the parts that make sense
15744 for both scalar and vector code). Return null if we have no issue
15745 information. */
15746 const aarch64_base_vec_issue_info *
15747 aarch64_vec_op_count::base_issue_info () const
15749 if (auto *ret = simd_issue_info ())
15750 return ret;
15751 return m_issue_info->scalar;
15754 /* If the structure describes vector code and we have associated issue
15755 information, return that issue information, otherwise return null. */
15756 const aarch64_simd_vec_issue_info *
15757 aarch64_vec_op_count::simd_issue_info () const
15759 if (auto *ret = sve_issue_info ())
15760 return ret;
15761 if (m_vec_flags)
15762 return m_issue_info->advsimd;
15763 return nullptr;
15766 /* If the structure describes SVE code and we have associated issue
15767 information, return that issue information, otherwise return null. */
15768 const aarch64_sve_vec_issue_info *
15769 aarch64_vec_op_count::sve_issue_info () const
15771 if (m_vec_flags & VEC_ANY_SVE)
15772 return m_issue_info->sve;
15773 return nullptr;
15776 /* Estimate the minimum number of cycles per iteration needed to rename
15777 the instructions.
15779 ??? For now this is done inline rather than via cost tables, since it
15780 isn't clear how it should be parameterized for the general case. */
15781 fractional_cost
15782 aarch64_vec_op_count::rename_cycles_per_iter () const
15784 if (sve_issue_info () == &neoverse512tvb_sve_issue_info
15785 || sve_issue_info () == &neoversen2_sve_issue_info
15786 || sve_issue_info () == &neoversev2_sve_issue_info)
15787 /* + 1 for an addition. We've already counted a general op for each
15788 store, so we don't need to account for stores separately. The branch
15789 reads no registers and so does not need to be counted either.
15791 ??? This value is very much on the pessimistic side, but seems to work
15792 pretty well in practice. */
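/* The denominator of 5 below models an assumed rename bandwidth of
   roughly five operations per cycle for these cores; this is an
   assumption baked into the heuristic rather than a documented
   figure.  */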
15793 return { general_ops + loads + pred_ops + 1, 5 };
15795 return 0;
15798 /* Like min_cycles_per_iter, but excluding predicate operations. */
15799 fractional_cost
15800 aarch64_vec_op_count::min_nonpred_cycles_per_iter () const
15802 auto *issue_info = base_issue_info ();
15804 fractional_cost cycles = MAX (reduction_latency, 1);
15805 cycles = std::max (cycles, { stores, issue_info->stores_per_cycle });
15806 cycles = std::max (cycles, { loads + stores,
15807 issue_info->loads_stores_per_cycle });
15808 cycles = std::max (cycles, { general_ops,
15809 issue_info->general_ops_per_cycle });
15810 cycles = std::max (cycles, rename_cycles_per_iter ());
15811 return cycles;
15814 /* Like min_cycles_per_iter, but including only the predicate operations. */
15815 fractional_cost
15816 aarch64_vec_op_count::min_pred_cycles_per_iter () const
15818 if (auto *issue_info = sve_issue_info ())
15819 return { pred_ops, issue_info->pred_ops_per_cycle };
15820 return 0;
15823 /* Estimate the minimum number of cycles needed to issue the operations.
15824 This is a very simplistic model! */
15825 fractional_cost
15826 aarch64_vec_op_count::min_cycles_per_iter () const
15828 return std::max (min_nonpred_cycles_per_iter (),
15829 min_pred_cycles_per_iter ());
15832 /* Dump information about the structure. */
15833 void
15834 aarch64_vec_op_count::dump () const
15836 dump_printf_loc (MSG_NOTE, vect_location,
15837 " load operations = %d\n", loads);
15838 dump_printf_loc (MSG_NOTE, vect_location,
15839 " store operations = %d\n", stores);
15840 dump_printf_loc (MSG_NOTE, vect_location,
15841 " general operations = %d\n", general_ops);
15842 if (sve_issue_info ())
15843 dump_printf_loc (MSG_NOTE, vect_location,
15844 " predicate operations = %d\n", pred_ops);
15845 dump_printf_loc (MSG_NOTE, vect_location,
15846 " reduction latency = %d\n", reduction_latency);
15847 if (auto rcpi = rename_cycles_per_iter ())
15848 dump_printf_loc (MSG_NOTE, vect_location,
15849 " estimated cycles per iteration to rename = %f\n",
15850 rcpi.as_double ());
15851 if (auto pred_cpi = min_pred_cycles_per_iter ())
15853 dump_printf_loc (MSG_NOTE, vect_location,
15854 " estimated min cycles per iteration"
15855 " without predication = %f\n",
15856 min_nonpred_cycles_per_iter ().as_double ());
15857 dump_printf_loc (MSG_NOTE, vect_location,
15858 " estimated min cycles per iteration"
15859 " for predication = %f\n", pred_cpi.as_double ());
15861 if (auto cpi = min_cycles_per_iter ())
15862 dump_printf_loc (MSG_NOTE, vect_location,
15863 " estimated min cycles per iteration = %f\n",
15864 cpi.as_double ());
15867 /* Information about vector code that we're in the process of costing. */
15868 class aarch64_vector_costs : public vector_costs
15870 public:
15871 aarch64_vector_costs (vec_info *, bool);
15873 unsigned int add_stmt_cost (int count, vect_cost_for_stmt kind,
15874 stmt_vec_info stmt_info, slp_tree, tree vectype,
15875 int misalign,
15876 vect_cost_model_location where) override;
15877 void finish_cost (const vector_costs *) override;
15878 bool better_main_loop_than_p (const vector_costs *other) const override;
15880 private:
15881 void record_potential_advsimd_unrolling (loop_vec_info);
15882 void analyze_loop_vinfo (loop_vec_info);
15883 void count_ops (unsigned int, vect_cost_for_stmt, stmt_vec_info,
15884 aarch64_vec_op_count *);
15885 fractional_cost adjust_body_cost_sve (const aarch64_vec_op_count *,
15886 fractional_cost, unsigned int,
15887 unsigned int *, bool *);
15888 unsigned int adjust_body_cost (loop_vec_info, const aarch64_vector_costs *,
15889 unsigned int);
15890 bool prefer_unrolled_loop () const;
15891 unsigned int determine_suggested_unroll_factor ();
15893 /* True if we have performed one-time initialization based on the
15894 vec_info. */
15895 bool m_analyzed_vinfo = false;
15897 /* This loop uses an average operation that is not supported by SVE, but is
15898 supported by Advanced SIMD and SVE2. */
15899 bool m_has_avg = false;
15901 /* True if the vector body contains a store to a decl and if the
15902 function is known to have a vld1 from the same decl.
15904 In the Advanced SIMD ACLE, the recommended endian-agnostic way of
15905 initializing a vector is:
15907 float f[4] = { elts };
15908 float32x4_t x = vld1q_f32(f);
15910 We should strongly prefer vectorization of the initialization of f,
15911 so that the store to f and the load back can be optimized away,
15912 leaving a vectorization of { elts }. */
15913 bool m_stores_to_vector_load_decl = false;
15915 /* - If M_VEC_FLAGS is zero then we're costing the original scalar code.
15916 - If M_VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
15917 SIMD code.
15918 - If M_VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
15919 unsigned int m_vec_flags = 0;
15921 /* At the moment, we do not model LDP and STP in the vector and scalar costs.
15922 This means that code such as:
15924 a[0] = x;
15925 a[1] = x;
15927 will be costed as two scalar instructions and two vector instructions
15928 (a scalar_to_vec and an unaligned_store). For SLP, the vector form
15929 wins if the costs are equal, because of the fact that the vector costs
15930 include constant initializations whereas the scalar costs don't.
15931 We would therefore tend to vectorize the code above, even though
15932 the scalar version can use a single STP.
15934 We should eventually fix this and model LDP and STP in the main costs;
15935 see the comment in aarch64_sve_adjust_stmt_cost for some of the problems.
15936 Until then, we look specifically for code that does nothing more than
15937 STP-like operations. We cost them on that basis in addition to the
15938 normal latency-based costs.
15940 If the scalar or vector code could be a sequence of STPs +
15941 initialization, this variable counts the cost of the sequence,
15942 with 2 units per instruction. The variable is ~0U for other
15943 kinds of code. */
15944 unsigned int m_stp_sequence_cost = 0;
15946 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
15947 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
15948 situations, we try to predict whether an Advanced SIMD implementation
15949 of the loop could be completely unrolled and become straight-line code.
15950 If so, it is generally better to use the Advanced SIMD version rather
15951 than length-agnostic SVE, since the SVE loop would execute an unknown
15952 number of times and so could not be completely unrolled in the same way.
15954 If we're applying this heuristic, M_UNROLLED_ADVSIMD_NITERS is the
15955 number of Advanced SIMD loop iterations that would be unrolled and
15956 M_UNROLLED_ADVSIMD_STMTS estimates the total number of statements
15957 in the unrolled loop. Both values are zero if we're not applying
15958 the heuristic. */
15959 unsigned HOST_WIDE_INT m_unrolled_advsimd_niters = 0;
15960 unsigned HOST_WIDE_INT m_unrolled_advsimd_stmts = 0;
15962 /* If we're vectorizing a loop that executes a constant number of times,
15963 this variable gives the number of times that the vector loop would
15964 iterate, otherwise it is zero. */
15965 uint64_t m_num_vector_iterations = 0;
15967 /* Used only when vectorizing loops. Estimates the number and kind of
15968 operations that would be needed by one iteration of the scalar
15969 or vector loop. There is one entry for each tuning option of
15970 interest. */
15971 auto_vec<aarch64_vec_op_count, 2> m_ops;
15974 aarch64_vector_costs::aarch64_vector_costs (vec_info *vinfo,
15975 bool costing_for_scalar)
15976 : vector_costs (vinfo, costing_for_scalar),
15977 m_vec_flags (costing_for_scalar ? 0
15978 : aarch64_classify_vector_mode (vinfo->vector_mode))
15980 if (auto *issue_info = aarch64_tune_params.vec_costs->issue_info)
15982 m_ops.quick_push ({ issue_info, m_vec_flags });
15983 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
15985 unsigned int vf_factor = (m_vec_flags & VEC_ANY_SVE) ? 2 : 1;
15986 m_ops.quick_push ({ &neoversev1_vec_issue_info, m_vec_flags,
15987 vf_factor });
15992 /* Implement TARGET_VECTORIZE_CREATE_COSTS. */
15993 vector_costs *
15994 aarch64_vectorize_create_costs (vec_info *vinfo, bool costing_for_scalar)
15996 return new aarch64_vector_costs (vinfo, costing_for_scalar);
15999 /* Return true if the current CPU should use the new costs defined
16000 in GCC 11. This should be removed for GCC 12 and above, with the
16001 costs applying to all CPUs instead. */
16002 static bool
16003 aarch64_use_new_vector_costs_p ()
16005 return (aarch64_tune_params.extra_tuning_flags
16006 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
16009 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
16010 static const simd_vec_cost *
16011 aarch64_simd_vec_costs (tree vectype)
16013 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16014 if (vectype != NULL
16015 && aarch64_sve_mode_p (TYPE_MODE (vectype))
16016 && costs->sve != NULL)
16017 return costs->sve;
16018 return costs->advsimd;
16021 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
16022 static const simd_vec_cost *
16023 aarch64_simd_vec_costs_for_flags (unsigned int flags)
16025 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16026 if ((flags & VEC_ANY_SVE) && costs->sve)
16027 return costs->sve;
16028 return costs->advsimd;
16031 /* If STMT_INFO is a memory reference, return the scalar memory type,
16032 otherwise return null. */
16033 static tree
16034 aarch64_dr_type (stmt_vec_info stmt_info)
16036 if (auto dr = STMT_VINFO_DATA_REF (stmt_info))
16037 return TREE_TYPE (DR_REF (dr));
16038 return NULL_TREE;
16041 /* Decide whether to use the unrolling heuristic described above
16042 m_unrolled_advsimd_niters, updating that field if so. LOOP_VINFO
16043 describes the loop that we're vectorizing. */
16044 void
16045 aarch64_vector_costs::
16046 record_potential_advsimd_unrolling (loop_vec_info loop_vinfo)
16048 /* The heuristic only makes sense on targets that have the same
16049 vector throughput for SVE and Advanced SIMD. */
16050 if (!(aarch64_tune_params.extra_tuning_flags
16051 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
16052 return;
16054 /* We only want to apply the heuristic if LOOP_VINFO is being
16055 vectorized for SVE. */
16056 if (!(m_vec_flags & VEC_ANY_SVE))
16057 return;
16059 /* Check whether it is possible in principle to use Advanced SIMD
16060 instead. */
16061 if (aarch64_autovec_preference == 2)
16062 return;
16064 /* We don't want to apply the heuristic to outer loops, since it's
16065 harder to track two levels of unrolling. */
16066 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
16067 return;
16069 /* Only handle cases in which the number of Advanced SIMD iterations
16070 would be known at compile time but the number of SVE iterations
16071 would not. */
16072 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
16073 || aarch64_sve_vg.is_constant ())
16074 return;
16076 /* Guess how many times the Advanced SIMD loop would iterate and make
16077 sure that it is within the complete unrolling limit. Even if the
16078 number of iterations is small enough, the number of statements might
16079 not be, which is why we need to estimate the number of statements too. */
16080 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
16081 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
16082 unsigned HOST_WIDE_INT unrolled_advsimd_niters
16083 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
16084 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
16085 return;
16087 /* Record that we're applying the heuristic and should try to estimate
16088 the number of statements in the Advanced SIMD loop. */
16089 m_unrolled_advsimd_niters = unrolled_advsimd_niters;
16092 /* Do one-time initialization of the aarch64_vector_costs given that we're
16093 costing the loop vectorization described by LOOP_VINFO. */
16094 void
16095 aarch64_vector_costs::analyze_loop_vinfo (loop_vec_info loop_vinfo)
16097 /* Record the number of times that the vector loop would execute,
16098 if known. */
16099 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
16100 auto scalar_niters = max_stmt_executions_int (loop);
16101 if (scalar_niters >= 0)
16103 unsigned int vf = vect_vf_for_cost (loop_vinfo);
16104 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16105 m_num_vector_iterations = scalar_niters / vf;
16106 else
16107 m_num_vector_iterations = CEIL (scalar_niters, vf);
16110 /* Detect whether we're vectorizing for SVE and should apply the unrolling
16111 heuristic described above m_unrolled_advsimd_niters. */
16112 record_potential_advsimd_unrolling (loop_vinfo);
16114 /* Record the issue information for any SVE WHILE instructions that the
16115 loop needs. */
16116 if (!m_ops.is_empty () && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
16118 unsigned int num_masks = 0;
16119 rgroup_controls *rgm;
16120 unsigned int num_vectors_m1;
16121 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
16122 if (rgm->type)
16123 num_masks += num_vectors_m1 + 1;
16124 for (auto &ops : m_ops)
16125 if (auto *issue = ops.sve_issue_info ())
16126 ops.pred_ops += num_masks * issue->while_pred_ops;
16130 /* Implement targetm.vectorize.builtin_vectorization_cost. */
16131 static int
16132 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
16133 tree vectype,
16134 int misalign ATTRIBUTE_UNUSED)
16136 unsigned elements;
16137 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
16138 bool fp = false;
16140 if (vectype != NULL)
16141 fp = FLOAT_TYPE_P (vectype);
16143 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16145 switch (type_of_cost)
16147 case scalar_stmt:
16148 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
16150 case scalar_load:
16151 return costs->scalar_load_cost;
16153 case scalar_store:
16154 return costs->scalar_store_cost;
16156 case vector_stmt:
16157 return fp ? simd_costs->fp_stmt_cost
16158 : simd_costs->int_stmt_cost;
16160 case vector_load:
16161 return simd_costs->align_load_cost;
16163 case vector_store:
16164 return simd_costs->store_cost;
16166 case vec_to_scalar:
16167 return simd_costs->vec_to_scalar_cost;
16169 case scalar_to_vec:
16170 return simd_costs->scalar_to_vec_cost;
16172 case unaligned_load:
16173 case vector_gather_load:
16174 return simd_costs->unalign_load_cost;
16176 case unaligned_store:
16177 case vector_scatter_store:
16178 return simd_costs->unalign_store_cost;
16180 case cond_branch_taken:
16181 return costs->cond_taken_branch_cost;
16183 case cond_branch_not_taken:
16184 return costs->cond_not_taken_branch_cost;
16186 case vec_perm:
16187 return simd_costs->permute_cost;
16189 case vec_promote_demote:
16190 return fp ? simd_costs->fp_stmt_cost
16191 : simd_costs->int_stmt_cost;
16193 case vec_construct:
16194 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
16195 return elements / 2 + 1;
16197 default:
16198 gcc_unreachable ();
16202 /* Return true if an access of kind KIND for STMT_INFO represents one
16203 vector of an LD[234] or ST[234] operation. Return the total number of
16204 vectors (2, 3 or 4) if so, otherwise return a value outside that range. */
16205 static int
16206 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
16208 if ((kind == vector_load
16209 || kind == unaligned_load
16210 || kind == vector_store
16211 || kind == unaligned_store)
16212 && STMT_VINFO_DATA_REF (stmt_info))
16214 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
16215 if (stmt_info
16216 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
16217 return DR_GROUP_SIZE (stmt_info);
16219 return 0;
16222 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
16223 vectors would produce a series of LDP or STP operations. KIND is the
16224 kind of statement that STMT_INFO represents. */
16225 static bool
16226 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
16227 stmt_vec_info stmt_info)
16229 switch (kind)
16231 case vector_load:
16232 case vector_store:
16233 case unaligned_load:
16234 case unaligned_store:
16235 break;
16237 default:
16238 return false;
16241 if (aarch64_tune_params.extra_tuning_flags
16242 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16243 return false;
16245 return is_gimple_assign (stmt_info->stmt);
16248 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
16249 or multiply-subtract sequence that might be suitable for fusing into a
16250 single instruction. If VEC_FLAGS is zero, analyze the operation as
16251 a scalar one, otherwise analyze it as an operation on vectors with those
16252 VEC_* flags. */
16253 static bool
16254 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info,
16255 unsigned int vec_flags)
16257 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
16258 if (!assign)
16259 return false;
16260 tree_code code = gimple_assign_rhs_code (assign);
16261 if (code != PLUS_EXPR && code != MINUS_EXPR)
16262 return false;
16264 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
16265 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
16266 return false;
16268 for (int i = 1; i < 3; ++i)
16270 tree rhs = gimple_op (assign, i);
16271 /* ??? Should we try to check for a single use as well? */
16272 if (TREE_CODE (rhs) != SSA_NAME)
16273 continue;
16275 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
16276 if (!def_stmt_info
16277 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
16278 continue;
16279 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
16280 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
16281 continue;
16283 if (vec_flags & VEC_ADVSIMD)
16285 /* Scalar and SVE code can tie the result to any FMLA input (or none,
16286 although that requires a MOVPRFX for SVE). However, Advanced SIMD
16287 only supports MLA forms, so will require a move if the result
16288 cannot be tied to the accumulator. The most important case in
16289 which this is true is when the accumulator input is invariant. */
16290 rhs = gimple_op (assign, 3 - i);
16291 if (TREE_CODE (rhs) != SSA_NAME)
16292 return false;
16293 def_stmt_info = vinfo->lookup_def (rhs);
16294 if (!def_stmt_info
16295 || STMT_VINFO_DEF_TYPE (def_stmt_info) == vect_external_def)
16296 return false;
16299 return true;
16301 return false;
16304 /* We are considering implementing STMT_INFO using SVE. If STMT_INFO is an
16305 in-loop reduction that SVE supports directly, return its latency in cycles,
16306 otherwise return zero. SVE_COSTS specifies the latencies of the relevant
16307 instructions. */
16308 static unsigned int
16309 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
16310 stmt_vec_info stmt_info,
16311 const sve_vec_cost *sve_costs)
16313 switch (vect_reduc_type (vinfo, stmt_info))
16315 case EXTRACT_LAST_REDUCTION:
16316 return sve_costs->clast_cost;
16318 case FOLD_LEFT_REDUCTION:
16319 switch (TYPE_MODE (TREE_TYPE (gimple_get_lhs (stmt_info->stmt))))
16321 case E_HFmode:
16322 case E_BFmode:
16323 return sve_costs->fadda_f16_cost;
16325 case E_SFmode:
16326 return sve_costs->fadda_f32_cost;
16328 case E_DFmode:
16329 return sve_costs->fadda_f64_cost;
16331 default:
16332 break;
16334 break;
16337 return 0;
16340 /* STMT_INFO describes a loop-carried operation in the original scalar code
16341 that we are considering implementing as a reduction. Return one of the
16342 following values, depending on VEC_FLAGS:
16344 - If VEC_FLAGS is zero, return the loop carry latency of the original
16345 scalar operation.
16347 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
16348 Advanced SIMD implementation.
16350 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
16351 SVE implementation. */
16352 static unsigned int
16353 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
16354 unsigned int vec_flags)
16356 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
16357 const sve_vec_cost *sve_costs = nullptr;
16358 if (vec_flags & VEC_ANY_SVE)
16359 sve_costs = aarch64_tune_params.vec_costs->sve;
16361 /* If the caller is asking for the SVE latency, check for forms of reduction
16362 that only SVE can handle directly. */
16363 if (sve_costs)
16365 unsigned int latency
16366 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16367 if (latency)
16368 return latency;
16371 /* Handle scalar costs. */
16372 bool is_float = FLOAT_TYPE_P (TREE_TYPE (gimple_get_lhs (stmt_info->stmt)));
16373 if (vec_flags == 0)
16375 if (is_float)
16376 return vec_costs->scalar_fp_stmt_cost;
16377 return vec_costs->scalar_int_stmt_cost;
16380 /* Otherwise, the loop body just contains normal integer or FP operations,
16381 with a vector reduction outside the loop. */
16382 const simd_vec_cost *simd_costs
16383 = aarch64_simd_vec_costs_for_flags (vec_flags);
16384 if (is_float)
16385 return simd_costs->fp_stmt_cost;
16386 return simd_costs->int_stmt_cost;
16389 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16390 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
16391 try to subdivide the target-independent categorization provided by KIND
16392 to get a more accurate cost. */
16393 static fractional_cost
16394 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16395 stmt_vec_info stmt_info,
16396 fractional_cost stmt_cost)
16398 /* Detect an extension of a loaded value. In general, we'll be able to fuse
16399 the extension with the load. */
16400 if (kind == scalar_stmt && vect_is_extending_load (vinfo, stmt_info))
16401 return 0;
16403 return stmt_cost;
16406 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16407 for the vectorized form of STMT_INFO, which has cost kind KIND and which
16408 when vectorized would operate on vector type VECTYPE. Try to subdivide
16409 the target-independent categorization provided by KIND to get a more
16410 accurate cost. WHERE specifies where the cost associated with KIND
16411 occurs. */
16412 static fractional_cost
16413 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
16414 stmt_vec_info stmt_info, tree vectype,
16415 enum vect_cost_model_location where,
16416 fractional_cost stmt_cost)
16418 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16419 const sve_vec_cost *sve_costs = nullptr;
16420 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16421 sve_costs = aarch64_tune_params.vec_costs->sve;
16423 /* It's generally better to avoid costing inductions, since the induction
16424 will usually be hidden by other operations. This is particularly true
16425 for things like COND_REDUCTIONS. */
16426 if (is_a<gphi *> (stmt_info->stmt))
16427 return 0;
16429 /* Detect cases in which vec_to_scalar is describing the extraction of a
16430 vector element in preparation for a scalar store. The store itself is
16431 costed separately. */
16432 if (vect_is_store_elt_extraction (kind, stmt_info))
16433 return simd_costs->store_elt_extra_cost;
16435 /* Detect SVE gather loads, which are costed as a single scalar_load
16436 for each element. We therefore need to divide the full-instruction
16437 cost by the number of elements in the vector. */
16438 if (kind == scalar_load
16439 && sve_costs
16440 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16442 unsigned int nunits = vect_nunits_for_cost (vectype);
16443 if (GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype)) == 64)
16444 return { sve_costs->gather_load_x64_cost, nunits };
16445 return { sve_costs->gather_load_x32_cost, nunits };
16448 /* Detect cases in which a scalar_store is really storing one element
16449 in a scatter operation. */
16450 if (kind == scalar_store
16451 && sve_costs
16452 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16453 return sve_costs->scatter_store_elt_cost;
16455 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
16456 if (kind == vec_to_scalar
16457 && where == vect_body
16458 && sve_costs)
16460 unsigned int latency
16461 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, sve_costs);
16462 if (latency)
16463 return latency;
16466 /* Detect cases in which vec_to_scalar represents a single reduction
16467 instruction like FADDP or MAXV. */
16468 if (kind == vec_to_scalar
16469 && where == vect_epilogue
16470 && vect_is_reduction (stmt_info))
16471 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
16473 case E_QImode:
16474 return simd_costs->reduc_i8_cost;
16476 case E_HImode:
16477 return simd_costs->reduc_i16_cost;
16479 case E_SImode:
16480 return simd_costs->reduc_i32_cost;
16482 case E_DImode:
16483 return simd_costs->reduc_i64_cost;
16485 case E_HFmode:
16486 case E_BFmode:
16487 return simd_costs->reduc_f16_cost;
16489 case E_SFmode:
16490 return simd_costs->reduc_f32_cost;
16492 case E_DFmode:
16493 return simd_costs->reduc_f64_cost;
16495 default:
16496 break;
16499 /* Otherwise stick with the original categorization. */
16500 return stmt_cost;
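/* Hypothetical illustration of the gather-load costing above: an SVE
   gather of four 32-bit elements is costed as one scalar_load per
   element at gather_load_x32_cost / 4 each, so a tuning value of 12
   would give a fractional cost of 3 per element.  */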
16503 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
16504 for STMT_INFO, which has cost kind KIND and which when vectorized would
16505 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
16506 targets. */
16507 static fractional_cost
16508 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
16509 stmt_vec_info stmt_info, tree vectype,
16510 fractional_cost stmt_cost)
16512 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
16513 vector register size or number of units. Integer promotions of this
16514 type therefore map to SXT[BHW] or UXT[BHW].
16516 Most loads have extending forms that can do the sign or zero extension
16517 on the fly. Optimistically assume that a load followed by an extension
16518 will fold to this form during combine, and that the extension therefore
16519 comes for free. */
16520 if (kind == vector_stmt && vect_is_extending_load (vinfo, stmt_info))
16521 stmt_cost = 0;
16523 /* For similar reasons, vector_stmt integer truncations are a no-op,
16524 because we can just ignore the unused upper bits of the source. */
16525 if (kind == vector_stmt && vect_is_integer_truncation (stmt_info))
16526 stmt_cost = 0;
16528 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
16529 but there are no equivalent instructions for SVE. This means that
16530 (all other things being equal) 128-bit SVE needs twice as many load
16531 and store instructions as Advanced SIMD in order to process vector pairs.
16533 Also, scalar code can often use LDP and STP to access pairs of values,
16534 so it is too simplistic to say that one SVE load or store replaces
16535 VF scalar loads and stores.
16537 Ideally we would account for this in the scalar and Advanced SIMD
16538 costs by making suitable load/store pairs as cheap as a single
16539 load/store. However, that would be a very invasive change and in
16540 practice it tends to stress other parts of the cost model too much.
16541 E.g. stores of scalar constants currently count just a store,
16542 whereas stores of vector constants count a store and a vec_init.
16543 This is an artificial distinction for AArch64, where stores of
16544 nonzero scalar constants need the same kind of register invariant
16545 as vector stores.
16547 An alternative would be to double the cost of any SVE loads and stores
16548 that could be paired in Advanced SIMD (and possibly also paired in
16549 scalar code). But this tends to stress other parts of the cost model
16550 in the same way. It also means that we can fall back to Advanced SIMD
16551 even if full-loop predication would have been useful.
16553 Here we go for a more conservative version: double the costs of SVE
16554 loads and stores if one iteration of the scalar loop processes enough
16555 elements for it to use a whole number of Advanced SIMD LDP or STP
16556 instructions. This makes it very likely that the VF would be 1 for
16557 Advanced SIMD, and so no epilogue should be needed. */
16558 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
16560 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
16561 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
16562 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
16563 if (multiple_p (count * elt_bits, 256)
16564 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
16565 stmt_cost *= 2;
16568 return stmt_cost;
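/* Worked example of the LDP/STP heuristic above (sizes only, no real
   tuning data): a grouped access of four 64-bit elements covers
   4 * 64 == 256 bits per scalar iteration, i.e. a whole number of
   Advanced SIMD LDP/STP pairs, so the SVE load/store cost is doubled.
   A group of three 32-bit elements (96 bits) is not a multiple of
   256 bits and its cost is left unchanged.  */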
16571 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
16572 and which when vectorized would operate on vector type VECTYPE. Add the
16573 cost of any embedded operations. */
16574 static fractional_cost
16575 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
16576 tree vectype, fractional_cost stmt_cost)
16578 if (vectype)
16580 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
16582 /* Detect cases in which a vector load or store represents an
16583 LD[234] or ST[234] instruction. */
16584 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16586 case 2:
16587 stmt_cost += simd_costs->ld2_st2_permute_cost;
16588 break;
16590 case 3:
16591 stmt_cost += simd_costs->ld3_st3_permute_cost;
16592 break;
16594 case 4:
16595 stmt_cost += simd_costs->ld4_st4_permute_cost;
16596 break;
16599 if (kind == vector_stmt || kind == vec_to_scalar)
16600 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16602 if (FLOAT_TYPE_P (cmp_type))
16603 stmt_cost += simd_costs->fp_stmt_cost;
16604 else
16605 stmt_cost += simd_costs->int_stmt_cost;
16609 if (kind == scalar_stmt)
16610 if (tree cmp_type = vect_embedded_comparison_type (stmt_info))
16612 if (FLOAT_TYPE_P (cmp_type))
16613 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
16614 else
16615 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
16618 return stmt_cost;
16621 /* COUNT, KIND and STMT_INFO are the same as for vector_costs::add_stmt_cost
16622 and they describe an operation in the body of a vector loop. Record issue
16623 information relating to the vector operation in OPS. */
16624 void
16625 aarch64_vector_costs::count_ops (unsigned int count, vect_cost_for_stmt kind,
16626 stmt_vec_info stmt_info,
16627 aarch64_vec_op_count *ops)
16629 const aarch64_base_vec_issue_info *base_issue = ops->base_issue_info ();
16630 if (!base_issue)
16631 return;
16632 const aarch64_simd_vec_issue_info *simd_issue = ops->simd_issue_info ();
16633 const aarch64_sve_vec_issue_info *sve_issue = ops->sve_issue_info ();
16635 /* Calculate the minimum cycles per iteration imposed by a reduction
16636 operation. */
16637 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16638 && vect_is_reduction (stmt_info))
16640 unsigned int base
16641 = aarch64_in_loop_reduction_latency (m_vinfo, stmt_info, m_vec_flags);
16643 /* ??? Ideally we'd do COUNT reductions in parallel, but unfortunately
16644 that's not yet the case. */
16645 ops->reduction_latency = MAX (ops->reduction_latency, base * count);
16648 /* Assume that multiply-adds will become a single operation. */
16649 if (stmt_info && aarch64_multiply_add_p (m_vinfo, stmt_info, m_vec_flags))
16650 return;
16652 /* Count the basic operation cost associated with KIND. */
16653 switch (kind)
16655 case cond_branch_taken:
16656 case cond_branch_not_taken:
16657 case vector_gather_load:
16658 case vector_scatter_store:
16659 /* We currently don't expect these to be used in a loop body. */
16660 break;
16662 case vec_perm:
16663 case vec_promote_demote:
16664 case vec_construct:
16665 case vec_to_scalar:
16666 case scalar_to_vec:
16667 case vector_stmt:
16668 case scalar_stmt:
16669 ops->general_ops += count;
16670 break;
16672 case scalar_load:
16673 case vector_load:
16674 case unaligned_load:
16675 ops->loads += count;
16676 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16677 ops->general_ops += base_issue->fp_simd_load_general_ops * count;
16678 break;
16680 case vector_store:
16681 case unaligned_store:
16682 case scalar_store:
16683 ops->stores += count;
16684 if (m_vec_flags || FLOAT_TYPE_P (aarch64_dr_type (stmt_info)))
16685 ops->general_ops += base_issue->fp_simd_store_general_ops * count;
16686 break;
16689 /* Add any embedded comparison operations. */
16690 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
16691 && vect_embedded_comparison_type (stmt_info))
16692 ops->general_ops += count;
16694 /* COND_REDUCTIONS need two sets of VEC_COND_EXPRs, whereas so far we
16695 have only accounted for one. */
16696 if ((kind == vector_stmt || kind == vec_to_scalar)
16697 && vect_reduc_type (m_vinfo, stmt_info) == COND_REDUCTION)
16698 ops->general_ops += count;
16700 /* Count the predicate operations needed by an SVE comparison. */
16701 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
16702 if (tree type = vect_comparison_type (stmt_info))
16704 unsigned int base = (FLOAT_TYPE_P (type)
16705 ? sve_issue->fp_cmp_pred_ops
16706 : sve_issue->int_cmp_pred_ops);
16707 ops->pred_ops += base * count;
16710 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
16711 if (simd_issue)
16712 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
16714 case 2:
16715 ops->general_ops += simd_issue->ld2_st2_general_ops * count;
16716 break;
16718 case 3:
16719 ops->general_ops += simd_issue->ld3_st3_general_ops * count;
16720 break;
16722 case 4:
16723 ops->general_ops += simd_issue->ld4_st4_general_ops * count;
16724 break;
16727 /* Add any overhead associated with gather loads and scatter stores. */
16728 if (sve_issue
16729 && (kind == scalar_load || kind == scalar_store)
16730 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
16732 unsigned int pairs = CEIL (count, 2);
16733 ops->pred_ops += sve_issue->gather_scatter_pair_pred_ops * pairs;
16734 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
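/* Illustrative example of the gather/scatter accounting above: costing
   COUNT == 4 gathered scalar_loads adds 4 to ops->loads and then, with
   CEIL (4, 2) == 2 pairs, adds twice the per-pair predicate and general
   operation counts from the SVE issue info.  */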
16738 /* Return true if STMT_INFO contains a memory access and if the constant
16739 component of the memory address is aligned to SIZE bytes. */
16740 static bool
16741 aarch64_aligned_constant_offset_p (stmt_vec_info stmt_info,
16742 poly_uint64 size)
16744 if (!STMT_VINFO_DATA_REF (stmt_info))
16745 return false;
16747 if (auto first_stmt = DR_GROUP_FIRST_ELEMENT (stmt_info))
16748 stmt_info = first_stmt;
16749 tree constant_offset = DR_INIT (STMT_VINFO_DATA_REF (stmt_info));
16750 /* Needed for gathers & scatters, for example. */
16751 if (!constant_offset)
16752 return false;
16754 return multiple_p (wi::to_poly_offset (constant_offset), size);
16757 /* Check if a scalar or vector stmt could be part of a region of code
16758 that does nothing more than store values to memory, in the scalar
16759 case using STP. Return the cost of the stmt if so, counting 2 for
16760 one instruction. Return ~0U otherwise.
16762 The arguments are a subset of those passed to add_stmt_cost. */
16763 unsigned int
16764 aarch64_stp_sequence_cost (unsigned int count, vect_cost_for_stmt kind,
16765 stmt_vec_info stmt_info, tree vectype)
16767 /* Code that stores vector constants uses a vector_load to create
16768 the constant. We don't apply the heuristic to that case for two
16769 main reasons:
16771 - At the moment, STPs are only formed via peephole2, and the
16772 constant scalar moves would often come between STRs and so
16773 prevent STP formation.
16775 - The scalar code also has to load the constant somehow, and that
16776 isn't costed. */
16777 switch (kind)
16779 case scalar_to_vec:
16780 /* Count 2 insns for a GPR->SIMD dup and 1 insn for a FPR->SIMD dup. */
16781 return (FLOAT_TYPE_P (vectype) ? 2 : 4) * count;
16783 case vec_construct:
16784 if (FLOAT_TYPE_P (vectype))
16785 /* Count 1 insn for the maximum number of FP->SIMD INS
16786 instructions. */
16787 return (vect_nunits_for_cost (vectype) - 1) * 2 * count;
16789 /* Count 2 insns for a GPR->SIMD move and 2 insns for the
16790 maximum number of GPR->SIMD INS instructions. */
16791 return vect_nunits_for_cost (vectype) * 4 * count;
16793 case vector_store:
16794 case unaligned_store:
16795 /* Count 1 insn per vector if we can't form STP Q pairs. */
16796 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
16797 return count * 2;
16798 if (aarch64_tune_params.extra_tuning_flags
16799 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
16800 return count * 2;
16802 if (stmt_info)
16804 /* Assume we won't be able to use STP if the constant offset
16805 component of the address is misaligned. ??? This could be
16806 removed if we formed STP pairs earlier, rather than relying
16807 on peephole2. */
16808 auto size = GET_MODE_SIZE (TYPE_MODE (vectype));
16809 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16810 return count * 2;
16812 return CEIL (count, 2) * 2;
16814 case scalar_store:
16815 if (stmt_info && STMT_VINFO_DATA_REF (stmt_info))
16817 /* Check for a mode in which STP pairs can be formed. */
16818 auto size = GET_MODE_SIZE (TYPE_MODE (aarch64_dr_type (stmt_info)));
16819 if (maybe_ne (size, 4) && maybe_ne (size, 8))
16820 return ~0U;
16822 /* Assume we won't be able to use STP if the constant offset
16823 component of the address is misaligned. ??? This could be
16824 removed if we formed STP pairs earlier, rather than relying
16825 on peephole2. */
16826 if (!aarch64_aligned_constant_offset_p (stmt_info, size))
16827 return ~0U;
16829 return count;
16831 default:
16832 return ~0U;
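/* Purely illustrative application of the heuristic above: two 64-bit
   scalar stores with aligned constant offsets are costed as 2 in total,
   i.e. a single STP, while on the vector side each GPR->SIMD dup is
   costed as 4 (2 instructions) and each FPR->SIMD dup as 2, so scalar
   STP sequences fed from general registers tend to compare well.  */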
16836 unsigned
16837 aarch64_vector_costs::add_stmt_cost (int count, vect_cost_for_stmt kind,
16838 stmt_vec_info stmt_info, slp_tree,
16839 tree vectype, int misalign,
16840 vect_cost_model_location where)
16842 fractional_cost stmt_cost
16843 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
16845 bool in_inner_loop_p = (where == vect_body
16846 && stmt_info
16847 && stmt_in_inner_loop_p (m_vinfo, stmt_info));
16849 /* Do one-time initialization based on the vinfo. */
16850 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
16851 if (!m_analyzed_vinfo && aarch64_use_new_vector_costs_p ())
16853 if (loop_vinfo)
16854 analyze_loop_vinfo (loop_vinfo);
16856 m_analyzed_vinfo = true;
16859 /* Apply the heuristic described above m_stp_sequence_cost. */
16860 if (m_stp_sequence_cost != ~0U)
16862 uint64_t cost = aarch64_stp_sequence_cost (count, kind,
16863 stmt_info, vectype);
16864 m_stp_sequence_cost = MIN (m_stp_sequence_cost + cost, ~0U);
16867 /* Try to get a more accurate cost by looking at STMT_INFO instead
16868 of just looking at KIND. */
16869 if (stmt_info && aarch64_use_new_vector_costs_p ())
16871 /* If we scalarize a strided store, the vectorizer costs one
16872 vec_to_scalar for each element. However, we can store the first
16873 element using an FP store without a separate extract step. */
16874 if (vect_is_store_elt_extraction (kind, stmt_info))
16875 count -= 1;
16877 stmt_cost = aarch64_detect_scalar_stmt_subtype (m_vinfo, kind,
16878 stmt_info, stmt_cost);
16880 if (vectype && m_vec_flags)
16881 stmt_cost = aarch64_detect_vector_stmt_subtype (m_vinfo, kind,
16882 stmt_info, vectype,
16883 where, stmt_cost);
16886 /* Do any SVE-specific adjustments to the cost. */
16887 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
16888 stmt_cost = aarch64_sve_adjust_stmt_cost (m_vinfo, kind, stmt_info,
16889 vectype, stmt_cost);
16891 if (stmt_info && aarch64_use_new_vector_costs_p ())
16893 /* Account for any extra "embedded" costs that apply additively
16894 to the base cost calculated above. */
16895 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
16896 stmt_cost);
16898 /* If we're recording a nonzero vector loop body cost for the
16899 innermost loop, also estimate the operations that would need
16900 to be issued by all relevant implementations of the loop. */
16901 if (loop_vinfo
16902 && (m_costing_for_scalar || where == vect_body)
16903 && (!LOOP_VINFO_LOOP (loop_vinfo)->inner || in_inner_loop_p)
16904 && stmt_cost != 0)
16905 for (auto &ops : m_ops)
16906 count_ops (count, kind, stmt_info, &ops);
16908 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
16909 estimate the number of statements in the unrolled Advanced SIMD
16910 loop. For simplicity, we assume that one iteration of the
16911 Advanced SIMD loop would need the same number of statements
16912 as one iteration of the SVE loop. */
16913 if (where == vect_body && m_unrolled_advsimd_niters)
16914 m_unrolled_advsimd_stmts += count * m_unrolled_advsimd_niters;
16916 /* Detect the use of an averaging operation. */
16917 gimple *stmt = stmt_info->stmt;
16918 if (is_gimple_call (stmt)
16919 && gimple_call_internal_p (stmt))
16921 switch (gimple_call_internal_fn (stmt))
16923 case IFN_AVG_FLOOR:
16924 case IFN_AVG_CEIL:
16925 m_has_avg = true;
16926 default:
16927 break;
16932 /* If the statement stores to a decl that is known to be the argument
16933 to a vld1 in the same function, ignore the store for costing purposes.
16934 See the comment above m_stores_to_vector_load_decl for more details. */
16935 if (stmt_info
16936 && (kind == vector_store || kind == unaligned_store)
16937 && aarch64_accesses_vector_load_decl_p (stmt_info))
16939 stmt_cost = 0;
16940 m_stores_to_vector_load_decl = true;
16943 return record_stmt_cost (stmt_info, where, (count * stmt_cost).ceil ());
16946 /* Return true if (a) we're applying the Advanced SIMD vs. SVE unrolling
16947 heuristic described above m_unrolled_advsimd_niters and (b) the heuristic
16948 says that we should prefer the Advanced SIMD loop. */
16949 bool
16950 aarch64_vector_costs::prefer_unrolled_loop () const
16952 if (!m_unrolled_advsimd_stmts)
16953 return false;
16955 if (dump_enabled_p ())
16956 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
16957 " unrolled Advanced SIMD loop = "
16958 HOST_WIDE_INT_PRINT_UNSIGNED "\n",
16959 m_unrolled_advsimd_stmts);
16961 /* The balance here is tricky. On the one hand, we can't be sure whether
16962 the code is vectorizable with Advanced SIMD or not. However, even if
16963 it isn't vectorizable with Advanced SIMD, there's a possibility that
16964 the scalar code could also be unrolled. Some of the code might then
16965 benefit from SLP, or from using LDP and STP. We therefore apply
16966 the heuristic regardless of can_use_advsimd_p. */
16967 return (m_unrolled_advsimd_stmts
16968 && (m_unrolled_advsimd_stmts
16969 <= (unsigned int) param_max_completely_peeled_insns));
16972 /* Subroutine of adjust_body_cost for handling SVE. Use ISSUE_INFO to work out
16973 how fast the SVE code can be issued and compare it to the equivalent value
16974 for scalar code (SCALAR_CYCLES_PER_ITER). If COULD_USE_ADVSIMD is true,
16975 also compare it to the issue rate of Advanced SIMD code
16976 (ADVSIMD_CYCLES_PER_ITER).
16978 ORIG_BODY_COST is the cost originally passed to adjust_body_cost and
16979 *BODY_COST is the current value of the adjusted cost. *SHOULD_DISPARAGE
16980 is true if we think the loop body is too expensive. */
16982 fractional_cost
16983 aarch64_vector_costs::
16984 adjust_body_cost_sve (const aarch64_vec_op_count *ops,
16985 fractional_cost scalar_cycles_per_iter,
16986 unsigned int orig_body_cost, unsigned int *body_cost,
16987 bool *should_disparage)
16989 if (dump_enabled_p ())
16990 ops->dump ();
16992 fractional_cost sve_pred_cycles_per_iter = ops->min_pred_cycles_per_iter ();
16993 fractional_cost sve_cycles_per_iter = ops->min_cycles_per_iter ();
16995 /* If the scalar version of the loop could issue at least as
16996 quickly as the predicate parts of the SVE loop, make the SVE loop
16997 prohibitively expensive. In this case vectorization is adding an
16998 overhead that the original scalar code didn't have.
17000 This is mostly intended to detect cases in which WHILELOs dominate
17001 for very tight loops, which is something that normal latency-based
17002 costs would not model. Adding this kind of cliffedge would be
17003 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
17004 code in the caller handles that case in a more conservative way. */
17005 fractional_cost sve_estimate = sve_pred_cycles_per_iter + 1;
17006 if (scalar_cycles_per_iter < sve_estimate)
17008 unsigned int min_cost
17009 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
17010 if (*body_cost < min_cost)
17012 if (dump_enabled_p ())
17013 dump_printf_loc (MSG_NOTE, vect_location,
17014 "Increasing body cost to %d because the"
17015 " scalar code could issue within the limit"
17016 " imposed by predicate operations\n",
17017 min_cost);
17018 *body_cost = min_cost;
17019 *should_disparage = true;
17023 return sve_cycles_per_iter;
17026 unsigned int
17027 aarch64_vector_costs::determine_suggested_unroll_factor ()
17029 bool sve = m_vec_flags & VEC_ANY_SVE;
17030 /* If we are trying to unroll an Advanced SIMD main loop that contains
17031 an averaging operation that we do not support with SVE and we might use a
17032 predicated epilogue, we need to be conservative and block unrolling as
17033 this might lead to a less optimal loop for the first and only epilogue
17034 using the original loop's vectorization factor.
17035 TODO: Remove this constraint when we add support for multiple epilogue
17036 vectorization. */
17037 if (!sve && !TARGET_SVE2 && m_has_avg)
17038 return 1;
17040 unsigned int max_unroll_factor = 1;
17041 for (auto vec_ops : m_ops)
17043 aarch64_simd_vec_issue_info const *vec_issue
17044 = vec_ops.simd_issue_info ();
17045 if (!vec_issue)
17046 return 1;
17047 /* Limit the unroll factor to a value adjustable by the user; the default
17048 value is 4. */
17049 unsigned int unroll_factor = aarch64_vect_unroll_limit;
17050 unsigned int factor
17051 = vec_ops.reduction_latency > 1 ? vec_ops.reduction_latency : 1;
17052 unsigned int temp;
17054 /* Sanity check, this should never happen. */
17055 if ((vec_ops.stores + vec_ops.loads + vec_ops.general_ops) == 0)
17056 return 1;
17058 /* Check stores. */
17059 if (vec_ops.stores > 0)
17061 temp = CEIL (factor * vec_issue->stores_per_cycle,
17062 vec_ops.stores);
17063 unroll_factor = MIN (unroll_factor, temp);
17066 /* Check loads + stores. */
17067 if (vec_ops.loads > 0)
17069 temp = CEIL (factor * vec_issue->loads_stores_per_cycle,
17070 vec_ops.loads + vec_ops.stores);
17071 unroll_factor = MIN (unroll_factor, temp);
17074 /* Check general ops. */
17075 if (vec_ops.general_ops > 0)
17077 temp = CEIL (factor * vec_issue->general_ops_per_cycle,
17078 vec_ops.general_ops);
17079 unroll_factor = MIN (unroll_factor, temp);
17081 max_unroll_factor = MAX (max_unroll_factor, unroll_factor);
17084 /* Make sure unroll factor is power of 2. */
17085 return 1 << ceil_log2 (max_unroll_factor);
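/* A hypothetical example of the calculation above: with
   aarch64_vect_unroll_limit == 4, a reduction latency of 2, 2 stores
   against stores_per_cycle == 2 and 6 general ops against
   general_ops_per_cycle == 4, the bounds are CEIL (2 * 2, 2) == 2 and
   CEIL (2 * 4, 6) == 2, so the suggested unroll factor is 2 (already
   a power of two).  */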
17088 /* BODY_COST is the cost of a vector loop body. Adjust the cost as necessary
17089 and return the new cost. */
17090 unsigned int
17091 aarch64_vector_costs::
17092 adjust_body_cost (loop_vec_info loop_vinfo,
17093 const aarch64_vector_costs *scalar_costs,
17094 unsigned int body_cost)
17096 if (scalar_costs->m_ops.is_empty () || m_ops.is_empty ())
17097 return body_cost;
17099 const auto &scalar_ops = scalar_costs->m_ops[0];
17100 const auto &vector_ops = m_ops[0];
17101 unsigned int estimated_vf = vect_vf_for_cost (loop_vinfo);
17102 unsigned int orig_body_cost = body_cost;
17103 bool should_disparage = false;
17105 if (dump_enabled_p ())
17106 dump_printf_loc (MSG_NOTE, vect_location,
17107 "Original vector body cost = %d\n", body_cost);
17109 fractional_cost scalar_cycles_per_iter
17110 = scalar_ops.min_cycles_per_iter () * estimated_vf;
17112 fractional_cost vector_cycles_per_iter = vector_ops.min_cycles_per_iter ();
17114 if (dump_enabled_p ())
17116 if (IN_RANGE (m_num_vector_iterations, 0, 65536))
17117 dump_printf_loc (MSG_NOTE, vect_location,
17118 "Vector loop iterates at most %wd times\n",
17119 m_num_vector_iterations);
17120 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
17121 scalar_ops.dump ();
17122 dump_printf_loc (MSG_NOTE, vect_location,
17123 " estimated cycles per vector iteration"
17124 " (for VF %d) = %f\n",
17125 estimated_vf, scalar_cycles_per_iter.as_double ());
17128 if (vector_ops.sve_issue_info ())
17130 if (dump_enabled_p ())
17131 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
17132 vector_cycles_per_iter
17133 = adjust_body_cost_sve (&vector_ops, scalar_cycles_per_iter,
17134 orig_body_cost, &body_cost, &should_disparage);
17136 if (aarch64_tune_params.vec_costs == &neoverse512tvb_vector_cost)
17138 /* Also take Neoverse V1 tuning into account, doubling the
17139 scalar and Advanced SIMD estimates to account for the
17140 doubling in SVE vector length. */
17141 if (dump_enabled_p ())
17142 dump_printf_loc (MSG_NOTE, vect_location,
17143 "Neoverse V1 estimate:\n");
17144 auto vf_factor = m_ops[1].vf_factor ();
17145 adjust_body_cost_sve (&m_ops[1], scalar_cycles_per_iter * vf_factor,
17146 orig_body_cost, &body_cost, &should_disparage);
17149 else
17151 if (dump_enabled_p ())
17153 dump_printf_loc (MSG_NOTE, vect_location,
17154 "Vector issue estimate:\n");
17155 vector_ops.dump ();
17159 /* Decide whether to stick to latency-based costs or whether to try to
17160 take issue rates into account. */
17161 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
17162 if (m_vec_flags & VEC_ANY_SVE)
17163 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
17165 if (m_num_vector_iterations >= 1
17166 && m_num_vector_iterations < threshold)
17168 if (dump_enabled_p ())
17169 dump_printf_loc (MSG_NOTE, vect_location,
17170 "Low iteration count, so using pure latency"
17171 " costs\n");
17173 /* Increase the cost of the vector code if it looks like the scalar code
17174 could issue more quickly. These values are only rough estimates,
17175 so minor differences should only result in minor changes. */
17176 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
17178 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17179 scalar_cycles_per_iter);
17180 if (dump_enabled_p ())
17181 dump_printf_loc (MSG_NOTE, vect_location,
17182 "Increasing body cost to %d because scalar code"
17183 " would issue more quickly\n", body_cost);
17185 /* In general, it's expected that the proposed vector code would be able
17186 to issue more quickly than the original scalar code. This should
17187 already be reflected to some extent in the latency-based costs.
17189 However, the latency-based costs effectively assume that the scalar
17190 code and the vector code execute serially, which tends to underplay
17191 one important case: if the real (non-serialized) execution time of
17192 a scalar iteration is dominated by loop-carried dependencies,
17193 and if the vector code is able to reduce both the length of
17194 the loop-carried dependencies *and* the number of cycles needed
17195 to issue the code in general, we can be more confident that the
17196 vector code is an improvement, even if adding the other (non-loop-carried)
17197 latencies tends to hide this saving. We therefore reduce the cost of the
17198 vector loop body in proportion to the saving. */
17199 else if (scalar_ops.reduction_latency > vector_ops.reduction_latency
17200 && scalar_ops.reduction_latency == scalar_cycles_per_iter
17201 && scalar_cycles_per_iter > vector_cycles_per_iter
17202 && !should_disparage)
17204 body_cost = fractional_cost::scale (body_cost, vector_cycles_per_iter,
17205 scalar_cycles_per_iter);
17206 if (dump_enabled_p ())
17207 dump_printf_loc (MSG_NOTE, vect_location,
17208 "Decreasing body cost to %d to account for smaller"
17209 " reduction latency\n", body_cost);
17212 return body_cost;
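/* Illustrative numbers only: if the scalar code is estimated at 2 cycles
   per vector-equivalent iteration and the vector code at 3, the branch
   above scales the body cost by 3/2, e.g. from 20 to 30, making the
   vector loop proportionally less attractive.  */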
17215 void
17216 aarch64_vector_costs::finish_cost (const vector_costs *uncast_scalar_costs)
17218 auto *scalar_costs
17219 = static_cast<const aarch64_vector_costs *> (uncast_scalar_costs);
17220 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (m_vinfo);
17221 if (loop_vinfo
17222 && m_vec_flags
17223 && aarch64_use_new_vector_costs_p ())
17225 m_costs[vect_body] = adjust_body_cost (loop_vinfo, scalar_costs,
17226 m_costs[vect_body]);
17227 m_suggested_unroll_factor = determine_suggested_unroll_factor ();
17230 /* Apply the heuristic described above m_stp_sequence_cost. Prefer
17231 the scalar code in the event of a tie, since there is more chance
17232 of scalar code being optimized with surrounding operations.
17234 In addition, if the vector body is a simple store to a decl that
17235 is elsewhere loaded using vld1, strongly prefer the vector form,
17236 to the extent of giving the prologue a zero cost. See the comment
17237 above m_stores_to_vector_load_decl for details. */
17238 if (!loop_vinfo
17239 && scalar_costs
17240 && m_stp_sequence_cost != ~0U)
17242 if (m_stores_to_vector_load_decl)
17243 m_costs[vect_prologue] = 0;
17244 else if (m_stp_sequence_cost >= scalar_costs->m_stp_sequence_cost)
17245 m_costs[vect_body] = 2 * scalar_costs->total_cost ();
17248 vector_costs::finish_cost (scalar_costs);
17251 bool
17252 aarch64_vector_costs::
17253 better_main_loop_than_p (const vector_costs *uncast_other) const
17255 auto other = static_cast<const aarch64_vector_costs *> (uncast_other);
17257 auto this_loop_vinfo = as_a<loop_vec_info> (this->m_vinfo);
17258 auto other_loop_vinfo = as_a<loop_vec_info> (other->m_vinfo);
17260 if (dump_enabled_p ())
17261 dump_printf_loc (MSG_NOTE, vect_location,
17262 "Comparing two main loops (%s at VF %d vs %s at VF %d)\n",
17263 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17264 vect_vf_for_cost (this_loop_vinfo),
17265 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17266 vect_vf_for_cost (other_loop_vinfo));
17268 /* Apply the unrolling heuristic described above
17269 m_unrolled_advsimd_niters. */
17270 if (bool (m_unrolled_advsimd_stmts)
17271 != bool (other->m_unrolled_advsimd_stmts))
17273 bool this_prefer_unrolled = this->prefer_unrolled_loop ();
17274 bool other_prefer_unrolled = other->prefer_unrolled_loop ();
17275 if (this_prefer_unrolled != other_prefer_unrolled)
17277 if (dump_enabled_p ())
17278 dump_printf_loc (MSG_NOTE, vect_location,
17279 "Preferring Advanced SIMD loop because"
17280 " it can be unrolled\n");
17281 return other_prefer_unrolled;
17285 for (unsigned int i = 0; i < m_ops.length (); ++i)
17287 if (dump_enabled_p ())
17289 if (i)
17290 dump_printf_loc (MSG_NOTE, vect_location,
17291 "Reconsidering with subtuning %d\n", i);
17292 dump_printf_loc (MSG_NOTE, vect_location,
17293 "Issue info for %s loop:\n",
17294 GET_MODE_NAME (this_loop_vinfo->vector_mode));
17295 this->m_ops[i].dump ();
17296 dump_printf_loc (MSG_NOTE, vect_location,
17297 "Issue info for %s loop:\n",
17298 GET_MODE_NAME (other_loop_vinfo->vector_mode));
17299 other->m_ops[i].dump ();
17302 auto this_estimated_vf = (vect_vf_for_cost (this_loop_vinfo)
17303 * this->m_ops[i].vf_factor ());
17304 auto other_estimated_vf = (vect_vf_for_cost (other_loop_vinfo)
17305 * other->m_ops[i].vf_factor ());
17307 /* If it appears that one loop could process the same amount of data
17308 in fewer cycles, prefer that loop over the other one. */
17309 fractional_cost this_cost
17310 = this->m_ops[i].min_cycles_per_iter () * other_estimated_vf;
17311 fractional_cost other_cost
17312 = other->m_ops[i].min_cycles_per_iter () * this_estimated_vf;
17313 if (dump_enabled_p ())
17315 dump_printf_loc (MSG_NOTE, vect_location,
17316 "Weighted cycles per iteration of %s loop ~= %f\n",
17317 GET_MODE_NAME (this_loop_vinfo->vector_mode),
17318 this_cost.as_double ());
17319 dump_printf_loc (MSG_NOTE, vect_location,
17320 "Weighted cycles per iteration of %s loop ~= %f\n",
17321 GET_MODE_NAME (other_loop_vinfo->vector_mode),
17322 other_cost.as_double ());
17324 if (this_cost != other_cost)
17326 if (dump_enabled_p ())
17327 dump_printf_loc (MSG_NOTE, vect_location,
17328 "Preferring loop with lower cycles"
17329 " per iteration\n");
17330 return this_cost < other_cost;
17333 /* If the issue rate of SVE code is limited by predicate operations
17334 (i.e. if sve_pred_cycles_per_iter > sve_nonpred_cycles_per_iter),
17335 and if Advanced SIMD code could issue within the limit imposed
17336 by the predicate operations, the predicate operations are adding an
17337 overhead that the original code didn't have and so we should prefer
17338 the Advanced SIMD version. */
17339 auto better_pred_limit_p = [](const aarch64_vec_op_count &a,
17340 const aarch64_vec_op_count &b) -> bool
17342 if (a.pred_ops == 0
17343 && (b.min_pred_cycles_per_iter ()
17344 > b.min_nonpred_cycles_per_iter ()))
17346 if (dump_enabled_p ())
17347 dump_printf_loc (MSG_NOTE, vect_location,
17348 "Preferring Advanced SIMD loop since"
17349 " SVE loop is predicate-limited\n");
17350 return true;
17352 return false;
17354 if (better_pred_limit_p (this->m_ops[i], other->m_ops[i]))
17355 return true;
17356 if (better_pred_limit_p (other->m_ops[i], this->m_ops[i]))
17357 return false;
17360 return vector_costs::better_main_loop_than_p (other);
17363 static void initialize_aarch64_code_model (struct gcc_options *);
17365 /* Parse the TO_PARSE string and put the architecture struct that it
17366 selects into RES and the architectural features into ISA_FLAGS.
17367 Return an aarch_parse_opt_result describing the parse result.
17368 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
17369 When the TO_PARSE string contains an invalid extension,
17370 a copy of the string is created and stored to INVALID_EXTENSION. */
17372 static enum aarch_parse_opt_result
17373 aarch64_parse_arch (const char *to_parse, const struct processor **res,
17374 aarch64_feature_flags *isa_flags,
17375 std::string *invalid_extension)
17377 const char *ext;
17378 const struct processor *arch;
17379 size_t len;
17381 ext = strchr (to_parse, '+');
17383 if (ext != NULL)
17384 len = ext - to_parse;
17385 else
17386 len = strlen (to_parse);
17388 if (len == 0)
17389 return AARCH_PARSE_MISSING_ARG;
17392 /* Loop through the list of supported ARCHes to find a match. */
17393 for (arch = all_architectures; arch->name != NULL; arch++)
17395 if (strlen (arch->name) == len
17396 && strncmp (arch->name, to_parse, len) == 0)
17398 auto isa_temp = arch->flags;
17400 if (ext != NULL)
17402 /* TO_PARSE string contains at least one extension. */
17403 enum aarch_parse_opt_result ext_res
17404 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17406 if (ext_res != AARCH_PARSE_OK)
17407 return ext_res;
17409 /* Extension parsing was successful. Confirm the result
17410 arch and ISA flags. */
17411 *res = arch;
17412 *isa_flags = isa_temp;
17413 return AARCH_PARSE_OK;
17417 /* ARCH name not found in list. */
17418 return AARCH_PARSE_INVALID_ARG;
17421 /* Parse the TO_PARSE string and put the result tuning in RES and the
17422 architecture flags in ISA_FLAGS. Return an aarch_parse_opt_result
17423 describing the parse result. If there is an error parsing, RES and
17424 ISA_FLAGS are left unchanged.
17425 When the TO_PARSE string contains an invalid extension,
17426 a copy of the string is created and stored to INVALID_EXTENSION. */
17428 static enum aarch_parse_opt_result
17429 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
17430 aarch64_feature_flags *isa_flags,
17431 std::string *invalid_extension)
17433 const char *ext;
17434 const struct processor *cpu;
17435 size_t len;
17437 ext = strchr (to_parse, '+');
17439 if (ext != NULL)
17440 len = ext - to_parse;
17441 else
17442 len = strlen (to_parse);
17444 if (len == 0)
17445 return AARCH_PARSE_MISSING_ARG;
17448 /* Loop through the list of supported CPUs to find a match. */
17449 for (cpu = all_cores; cpu->name != NULL; cpu++)
17451 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
17453 auto isa_temp = cpu->flags;
17455 if (ext != NULL)
17457 /* TO_PARSE string contains at least one extension. */
17458 enum aarch_parse_opt_result ext_res
17459 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
17461 if (ext_res != AARCH_PARSE_OK)
17462 return ext_res;
17464 /* Extension parsing was successful. Confirm the result
17465 cpu and ISA flags. */
17466 *res = cpu;
17467 *isa_flags = isa_temp;
17468 return AARCH_PARSE_OK;
17472 /* CPU name not found in list. */
17473 return AARCH_PARSE_INVALID_ARG;
17476 /* Parse the TO_PARSE string and put the cpu it selects into RES.
17477 Return an aarch_parse_opt_result describing the parse result.
17478 If the parsing fails, RES does not change. */
17480 static enum aarch_parse_opt_result
17481 aarch64_parse_tune (const char *to_parse, const struct processor **res)
17483 const struct processor *cpu;
17485 /* Loop through the list of supported CPUs to find a match. */
17486 for (cpu = all_cores; cpu->name != NULL; cpu++)
17488 if (strcmp (cpu->name, to_parse) == 0)
17490 *res = cpu;
17491 return AARCH_PARSE_OK;
17495 /* CPU name not found in list. */
17496 return AARCH_PARSE_INVALID_ARG;
17499 /* Parse TOKEN, which has length LENGTH, to see if it is an option
17500 described in FLAG. If it is, return the index bit for that fusion type.
17501 If not, error (printing OPTION_NAME) and return zero. */
17503 static unsigned int
17504 aarch64_parse_one_option_token (const char *token,
17505 size_t length,
17506 const struct aarch64_flag_desc *flag,
17507 const char *option_name)
17509 for (; flag->name != NULL; flag++)
17511 if (length == strlen (flag->name)
17512 && !strncmp (flag->name, token, length))
17513 return flag->flag;
17516 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
17517 return 0;
17520 /* Parse OPTION which is a comma-separated list of flags to enable.
17521 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
17522 default state we inherit from the CPU tuning structures. OPTION_NAME
17523 gives the top-level option we are parsing in the -moverride string,
17524 for use in error messages. */
17526 static unsigned int
17527 aarch64_parse_boolean_options (const char *option,
17528 const struct aarch64_flag_desc *flags,
17529 unsigned int initial_state,
17530 const char *option_name)
17532 const char separator = '.';
17533 const char* specs = option;
17534 const char* ntoken = option;
17535 unsigned int found_flags = initial_state;
17537 while ((ntoken = strchr (specs, separator)))
17539 size_t token_length = ntoken - specs;
17540 unsigned token_ops = aarch64_parse_one_option_token (specs,
17541 token_length,
17542 flags,
17543 option_name);
17544 /* If we find "none" (or, for simplicity's sake, an error) anywhere
17545 in the token stream, reset the supported operations. So:
17547 adrp+add.cmp+branch.none.adrp+add
17549 would turn on only adrp+add fusion. */
17550 if (!token_ops)
17551 found_flags = 0;
17553 found_flags |= token_ops;
17554 specs = ++ntoken;
17558 /* The string ended with a separator or was empty; report an error. */
17558 if (!(*specs))
17560 error ("%qs string ill-formed", option_name);
17561 return 0;
17564 /* We still have one more token to parse. */
17565 size_t token_length = strlen (specs);
17566 unsigned token_ops = aarch64_parse_one_option_token (specs,
17567 token_length,
17568 flags,
17569 option_name);
17570 if (!token_ops)
17571 found_flags = 0;
17573 found_flags |= token_ops;
17574 return found_flags;
17577 /* Support for overriding instruction fusion. */
17579 static void
17580 aarch64_parse_fuse_string (const char *fuse_string,
17581 struct tune_params *tune)
17583 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
17584 aarch64_fusible_pairs,
17585 tune->fusible_ops,
17586 "fuse=");
17589 /* Support for overriding other tuning flags. */
17591 static void
17592 aarch64_parse_tune_string (const char *tune_string,
17593 struct tune_params *tune)
17595 tune->extra_tuning_flags
17596 = aarch64_parse_boolean_options (tune_string,
17597 aarch64_tuning_flags,
17598 tune->extra_tuning_flags,
17599 "tune=");
17602 /* Parse the sve_width tuning moverride string in TUNE_STRING.
17603 Accept the valid SVE vector widths allowed by
17604 aarch64_sve_vector_bits_enum and use it to override sve_width
17605 in TUNE. */
17607 static void
17608 aarch64_parse_sve_width_string (const char *tune_string,
17609 struct tune_params *tune)
17611 int width = -1;
17613 int n = sscanf (tune_string, "%d", &width);
17614 if (n == EOF)
17616 error ("invalid format for %<sve_width%>");
17617 return;
17619 switch (width)
17621 case SVE_128:
17622 case SVE_256:
17623 case SVE_512:
17624 case SVE_1024:
17625 case SVE_2048:
17626 break;
17627 default:
17628 error ("invalid %<sve_width%> value: %d", width);
17630 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
17633 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
17634 we understand. If it is, extract the option string and hand it off to
17635 the appropriate function. */
17637 void
17638 aarch64_parse_one_override_token (const char* token,
17639 size_t length,
17640 struct tune_params *tune)
17642 const struct aarch64_tuning_override_function *fn
17643 = aarch64_tuning_override_functions;
17645 const char *option_part = strchr (token, '=');
17646 if (!option_part)
17648 error ("tuning string missing in option (%s)", token);
17649 return;
17652 /* Get the length of the option name. */
17653 length = option_part - token;
17654 /* Skip the '=' to get to the option string. */
17655 option_part++;
17657 for (; fn->name != NULL; fn++)
17659 if (!strncmp (fn->name, token, length))
17661 fn->parse_override (option_part, tune);
17662 return;
17666 error ("unknown tuning option (%s)", token);
17667 return;
17670 /* Validate and clamp the TLS size for the code model selected in OPTS. */
17672 static void
17673 initialize_aarch64_tls_size (struct gcc_options *opts)
17675 if (aarch64_tls_size == 0)
17676 aarch64_tls_size = 24;
17678 switch (opts->x_aarch64_cmodel_var)
17680 case AARCH64_CMODEL_TINY:
17681 /* Both the default and maximum TLS size allowed under tiny are 1M, which
17682 needs two instructions to address, so we clamp the size to 24. */
17683 if (aarch64_tls_size > 24)
17684 aarch64_tls_size = 24;
17685 break;
17686 case AARCH64_CMODEL_SMALL:
17687 /* The maximum TLS size allowed under small is 4G. */
17688 if (aarch64_tls_size > 32)
17689 aarch64_tls_size = 32;
17690 break;
17691 case AARCH64_CMODEL_LARGE:
17692 /* The maximum TLS size allowed under large is 16E.
17693 FIXME: 16E should be 64-bit; we only support a 48-bit offset now. */
17694 if (aarch64_tls_size > 48)
17695 aarch64_tls_size = 48;
17696 break;
17697 default:
17698 gcc_unreachable ();
17701 return;
17704 /* Return the CPU corresponding to the enum CPU. */
17706 static const struct processor *
17707 aarch64_get_tune_cpu (enum aarch64_processor cpu)
17709 gcc_assert (cpu != aarch64_none);
17711 return &all_cores[cpu];
17714 /* Return the architecture corresponding to the enum ARCH. */
17716 static const struct processor *
17717 aarch64_get_arch (enum aarch64_arch arch)
17719 gcc_assert (arch != aarch64_no_arch);
17721 return &all_architectures[arch];
17724 /* Parse STRING looking for options in the format:
17725 string :: option:string
17726 option :: name=substring
17727 name :: {a-z}
17728 substring :: defined by option. */
17730 static void
17731 aarch64_parse_override_string (const char* input_string,
17732 struct tune_params* tune)
17734 const char separator = ':';
17735 size_t string_length = strlen (input_string) + 1;
17736 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
17737 char *string = string_root;
17738 strncpy (string, input_string, string_length);
17739 string[string_length - 1] = '\0';
17741 char* ntoken = string;
17743 while ((ntoken = strchr (string, separator)))
17745 size_t token_length = ntoken - string;
17746 /* NUL-terminate this substring so it can be treated as a standalone string. */
17747 *ntoken = '\0';
17748 aarch64_parse_one_override_token (string, token_length, tune);
17749 string = ++ntoken;
17752 /* One last option to parse. */
17753 aarch64_parse_one_override_token (string, strlen (string), tune);
17754 free (string_root);
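/* For example (values illustrative), an override string such as

     sve_width=256:fuse=adrp+add

   is split at each ':' and each name=value token is passed to
   aarch64_parse_one_override_token, which dispatches on the name
   ("sve_width", "fuse", ...) to the handlers above.  */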
17757 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
17758 are best for a generic target with the currently-enabled architecture
17759 extensions. */
17760 static void
17761 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
17763 /* Neoverse V1 is the only core that is known to benefit from
17764 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
17765 point enabling it for SVE2 and above. */
17766 if (TARGET_SVE2)
17767 current_tune.extra_tuning_flags
17768 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
17771 static void
17772 aarch64_override_options_after_change_1 (struct gcc_options *opts)
17774 if (accepted_branch_protection_string)
17776 opts->x_aarch64_branch_protection_string
17777 = xstrdup (accepted_branch_protection_string);
17780 /* PR 70044: We have to be careful about being called multiple times for the
17781 same function. This means all changes should be repeatable. */
17783 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
17784 Disable the frame pointer flag so the mid-end will not use a frame
17785 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
17786 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
17787 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
17788 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
17789 if (opts->x_flag_omit_frame_pointer == 0)
17790 opts->x_flag_omit_frame_pointer = 2;
17792 /* If not optimizing for size, set the default
17793 alignment to what the target wants. */
17794 if (!opts->x_optimize_size)
17796 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
17797 opts->x_str_align_loops = aarch64_tune_params.loop_align;
17798 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
17799 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
17800 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
17801 opts->x_str_align_functions = aarch64_tune_params.function_align;
17804 /* We default to no pc-relative literal loads. */
17806 aarch64_pcrelative_literal_loads = false;
17808 /* If -mpc-relative-literal-loads is set on the command line, this
17809 implies that the user asked for PC relative literal loads. */
17810 if (opts->x_pcrelative_literal_loads == 1)
17811 aarch64_pcrelative_literal_loads = true;
17813 /* In the tiny memory model it makes no sense to disallow PC relative
17814 literal pool loads. */
17815 if (aarch64_cmodel == AARCH64_CMODEL_TINY
17816 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
17817 aarch64_pcrelative_literal_loads = true;
17819 /* When enabling the lower precision Newton series for the square root, also
17820 enable it for the reciprocal square root, since the latter is an
17821 intermediary step for the former. */
17822 if (flag_mlow_precision_sqrt)
17823 flag_mrecip_low_precision_sqrt = true;
17826 /* 'Unpack' the internal tuning structs and update the options
17827 in OPTS. The caller must have set up selected_tune and selected_arch
17828 as all the other target-specific codegen decisions are
17829 derived from them. */
17831 void
17832 aarch64_override_options_internal (struct gcc_options *opts)
17834 const struct processor *tune = aarch64_get_tune_cpu (opts->x_selected_tune);
17835 aarch64_tune_flags = tune->flags;
17836 aarch64_tune = tune->sched_core;
17837 /* Make a copy of the tuning parameters attached to the core, which
17838 we may later overwrite. */
17839 aarch64_tune_params = *(tune->tune);
17840 if (tune->tune == &generic_tunings)
17841 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
17843 if (opts->x_aarch64_override_tune_string)
17844 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
17845 &aarch64_tune_params);
17847 /* This target defaults to strict volatile bitfields. */
17848 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
17849 opts->x_flag_strict_volatile_bitfields = 1;
17851 if (aarch64_stack_protector_guard == SSP_GLOBAL
17852 && opts->x_aarch64_stack_protector_guard_offset_str)
17854 error ("incompatible options %<-mstack-protector-guard=global%> and "
17855 "%<-mstack-protector-guard-offset=%s%>",
17856 aarch64_stack_protector_guard_offset_str);
17859 if (aarch64_stack_protector_guard == SSP_SYSREG
17860 && !(opts->x_aarch64_stack_protector_guard_offset_str
17861 && opts->x_aarch64_stack_protector_guard_reg_str))
17863 error ("both %<-mstack-protector-guard-offset%> and "
17864 "%<-mstack-protector-guard-reg%> must be used "
17865 "with %<-mstack-protector-guard=sysreg%>");
17868 if (opts->x_aarch64_stack_protector_guard_reg_str)
17870 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
17871 error ("specify a system register with a small string length");
17874 if (opts->x_aarch64_stack_protector_guard_offset_str)
17876 char *end;
17877 const char *str = aarch64_stack_protector_guard_offset_str;
17878 errno = 0;
17879 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
17880 if (!*str || *end || errno)
17881 error ("%qs is not a valid offset in %qs", str,
17882 "-mstack-protector-guard-offset=");
17883 aarch64_stack_protector_guard_offset = offs;
17886 if ((flag_sanitize & SANITIZE_SHADOW_CALL_STACK)
17887 && !fixed_regs[R18_REGNUM])
17888 error ("%<-fsanitize=shadow-call-stack%> requires %<-ffixed-x18%>");
17890 initialize_aarch64_code_model (opts);
17891 initialize_aarch64_tls_size (opts);
17893 int queue_depth = 0;
17894 switch (aarch64_tune_params.autoprefetcher_model)
17896 case tune_params::AUTOPREFETCHER_OFF:
17897 queue_depth = -1;
17898 break;
17899 case tune_params::AUTOPREFETCHER_WEAK:
17900 queue_depth = 0;
17901 break;
17902 case tune_params::AUTOPREFETCHER_STRONG:
17903 queue_depth = max_insn_queue_index + 1;
17904 break;
17905 default:
17906 gcc_unreachable ();
17909 /* We don't mind passing in global_options_set here as we don't use
17910 the *options_set structs anyway. */
17911 SET_OPTION_IF_UNSET (opts, &global_options_set,
17912 param_sched_autopref_queue_depth, queue_depth);
17914 /* If using Advanced SIMD only for autovectorization disable SVE vector costs
17915 comparison. */
17916 if (aarch64_autovec_preference == 1)
17917 SET_OPTION_IF_UNSET (opts, &global_options_set,
17918 aarch64_sve_compare_costs, 0);
17920 /* Set up parameters to be used in the prefetching algorithm. Do not
17921 override the defaults unless we are tuning for a core we have
17922 researched values for. */
17923 if (aarch64_tune_params.prefetch->num_slots > 0)
17924 SET_OPTION_IF_UNSET (opts, &global_options_set,
17925 param_simultaneous_prefetches,
17926 aarch64_tune_params.prefetch->num_slots);
17927 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
17928 SET_OPTION_IF_UNSET (opts, &global_options_set,
17929 param_l1_cache_size,
17930 aarch64_tune_params.prefetch->l1_cache_size);
17931 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17932 SET_OPTION_IF_UNSET (opts, &global_options_set,
17933 param_l1_cache_line_size,
17934 aarch64_tune_params.prefetch->l1_cache_line_size);
17936 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
17938 SET_OPTION_IF_UNSET (opts, &global_options_set,
17939 param_destruct_interfere_size,
17940 aarch64_tune_params.prefetch->l1_cache_line_size);
17941 SET_OPTION_IF_UNSET (opts, &global_options_set,
17942 param_construct_interfere_size,
17943 aarch64_tune_params.prefetch->l1_cache_line_size);
17945 else
17947 /* For a generic AArch64 target, cover the current range of cache line
17948 sizes. */
17949 SET_OPTION_IF_UNSET (opts, &global_options_set,
17950 param_destruct_interfere_size,
17951 256);
17952 SET_OPTION_IF_UNSET (opts, &global_options_set,
17953 param_construct_interfere_size,
17954 64);
17957 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
17958 SET_OPTION_IF_UNSET (opts, &global_options_set,
17959 param_l2_cache_size,
17960 aarch64_tune_params.prefetch->l2_cache_size);
17961 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
17962 SET_OPTION_IF_UNSET (opts, &global_options_set,
17963 param_prefetch_dynamic_strides, 0);
17964 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
17965 SET_OPTION_IF_UNSET (opts, &global_options_set,
17966 param_prefetch_minimum_stride,
17967 aarch64_tune_params.prefetch->minimum_stride);
17969 /* Use the alternative scheduling-pressure algorithm by default. */
17970 SET_OPTION_IF_UNSET (opts, &global_options_set,
17971 param_sched_pressure_algorithm,
17972 SCHED_PRESSURE_MODEL);
17974 /* Validate the guard size. */
17975 int guard_size = param_stack_clash_protection_guard_size;
17977 if (guard_size != 12 && guard_size != 16)
17978 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
17979 "size. Given value %d (%llu KB) is out of range",
17980 guard_size, (1ULL << guard_size) / 1024ULL);
17982 /* Enforce that interval is the same size as size so the mid-end does the
17983 right thing. */
17984 SET_OPTION_IF_UNSET (opts, &global_options_set,
17985 param_stack_clash_protection_probe_interval,
17986 guard_size);
17988   /* The maybe_set calls won't update the value if the user has explicitly set
17989      one, which means we need to validate that the probing interval and guard
17990      size are equal.  */
17991 int probe_interval
17992 = param_stack_clash_protection_probe_interval;
17993 if (guard_size != probe_interval)
17994 error ("stack clash guard size %<%d%> must be equal to probing interval "
17995 "%<%d%>", guard_size, probe_interval);
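  /* Worked example (illustrative): -mstack-clash-protection-guard-size=16
     gives a (1 << 16)-byte (64 KB) guard, and the probing interval defaults
     to the same value.  Explicitly passing
     --param=stack-clash-protection-probe-interval=12 alongside that guard
     size trips the error above, since the two must match.  */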
17997   /* Enable software prefetching at the specified optimization level for
17998      CPUs that have prefetch.  Lower the optimization level threshold by 1
17999      when profiling is enabled.  */
18000 if (opts->x_flag_prefetch_loop_arrays < 0
18001 && !opts->x_optimize_size
18002 && aarch64_tune_params.prefetch->default_opt_level >= 0
18003 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
18004 opts->x_flag_prefetch_loop_arrays = 1;
18006 aarch64_override_options_after_change_1 (opts);
18009 /* Print a hint with a suggestion for a core or architecture name that
18010    most closely resembles what the user passed in STR.  ARCH is true if
18011    the user is asking for an architecture name and false if they are
18012    asking for a core name.  */
18014 static void
18015 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
18017 auto_vec<const char *> candidates;
18018 const struct processor *entry = arch ? all_architectures : all_cores;
18019 for (; entry->name != NULL; entry++)
18020 candidates.safe_push (entry->name);
18022 #ifdef HAVE_LOCAL_CPU_DETECT
18023   /* Also add "native" as a possible value.  */
18024 if (arch)
18025 candidates.safe_push ("native");
18026 #endif
18028 char *s;
18029 const char *hint = candidates_list_and_hint (str, s, candidates);
18030 if (hint)
18031 inform (input_location, "valid arguments are: %s;"
18032 " did you mean %qs?", s, hint);
18033 else
18034 inform (input_location, "valid arguments are: %s", s);
18036 XDELETEVEC (s);
18039 /* Print a hint with a suggestion for a core name that most closely resembles
18040 what the user passed in STR. */
18042 inline static void
18043 aarch64_print_hint_for_core (const char *str)
18045 aarch64_print_hint_for_core_or_arch (str, false);
18048 /* Print a hint with a suggestion for an architecture name that most closely
18049 resembles what the user passed in STR. */
18051 inline static void
18052 aarch64_print_hint_for_arch (const char *str)
18054 aarch64_print_hint_for_core_or_arch (str, true);
18058 /* Print a hint with a suggestion for an extension name
18059 that most closely resembles what the user passed in STR. */
18061 void
18062 aarch64_print_hint_for_extensions (const std::string &str)
18064 auto_vec<const char *> candidates;
18065 aarch64_get_all_extension_candidates (&candidates);
18066 char *s;
18067 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
18068 if (hint)
18069 inform (input_location, "valid arguments are: %s;"
18070 " did you mean %qs?", s, hint);
18071 else
18072 inform (input_location, "valid arguments are: %s", s);
18074 XDELETEVEC (s);
18077 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
18078    specified in STR and throw errors if appropriate.  Put the results, if
18079    they are valid, in RES and ISA_FLAGS.  Return whether the option is
18080 valid. */
18082 static bool
18083 aarch64_validate_mcpu (const char *str, const struct processor **res,
18084 aarch64_feature_flags *isa_flags)
18086 std::string invalid_extension;
18087 enum aarch_parse_opt_result parse_res
18088 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18090 if (parse_res == AARCH_PARSE_OK)
18091 return true;
18093 switch (parse_res)
18095 case AARCH_PARSE_MISSING_ARG:
18096 error ("missing cpu name in %<-mcpu=%s%>", str);
18097 break;
18098 case AARCH_PARSE_INVALID_ARG:
18099 error ("unknown value %qs for %<-mcpu%>", str);
18100 aarch64_print_hint_for_core (str);
18101 /* A common user error is confusing -march and -mcpu.
18102 If the -mcpu string matches a known architecture then suggest
18103 -march=. */
18104 parse_res = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18105 if (parse_res == AARCH_PARSE_OK)
18106 inform (input_location, "did you mean %<-march=%s%>?", str);
18107 break;
18108 case AARCH_PARSE_INVALID_FEATURE:
18109 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
18110 invalid_extension.c_str (), str);
18111 aarch64_print_hint_for_extensions (invalid_extension);
18112 break;
18113 default:
18114 gcc_unreachable ();
18117 return false;
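/* For instance, a user who passes -mcpu=armv8.2-a (an architecture name
   rather than a CPU name) gets the "unknown value" error above, and because
   the string does parse as an architecture, the follow-up hint
   "did you mean -march=armv8.2-a?" is emitted.  */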
18120 /* Straight line speculation indicators. */
18121 enum aarch64_sls_hardening_type
18123 SLS_NONE = 0,
18124 SLS_RETBR = 1,
18125 SLS_BLR = 2,
18126 SLS_ALL = 3,
18128 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
18130 /* Return whether we should mitigate Straight Line Speculation for the RET
18131 and BR instructions. */
18132 bool
18133 aarch64_harden_sls_retbr_p (void)
18135 return aarch64_sls_hardening & SLS_RETBR;
18138 /* Return whether we should mitigate Straight Line Speculation for the BLR
18139 instruction. */
18140 bool
18141 aarch64_harden_sls_blr_p (void)
18143 return aarch64_sls_hardening & SLS_BLR;
18146 /* As yet we only allow setting these options globally; in the future we may
18147    allow setting them per function.  */
18148 static void
18149 aarch64_validate_sls_mitigation (const char *const_str)
18151 char *token_save = NULL;
18152 char *str = NULL;
18154 if (strcmp (const_str, "none") == 0)
18156 aarch64_sls_hardening = SLS_NONE;
18157 return;
18159 if (strcmp (const_str, "all") == 0)
18161 aarch64_sls_hardening = SLS_ALL;
18162 return;
18165 char *str_root = xstrdup (const_str);
18166 str = strtok_r (str_root, ",", &token_save);
18167 if (!str)
18168 error ("invalid argument given to %<-mharden-sls=%>");
18170 int temp = SLS_NONE;
18171 while (str)
18173 if (strcmp (str, "blr") == 0)
18174 temp |= SLS_BLR;
18175 else if (strcmp (str, "retbr") == 0)
18176 temp |= SLS_RETBR;
18177 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
18179 error ("%qs must be by itself for %<-mharden-sls=%>", str);
18180 break;
18182 else
18184 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
18185 break;
18187 str = strtok_r (NULL, ",", &token_save);
18189 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
18190 free (str_root);
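/* For example, -mharden-sls=retbr,blr sets both SLS_RETBR and SLS_BLR,
   which is equivalent to -mharden-sls=all, so both
   aarch64_harden_sls_retbr_p and aarch64_harden_sls_blr_p then return
   true.  */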
18193 /* Validate a command-line -march option. Parse the arch and extensions
18194 (if any) specified in STR and throw errors if appropriate. Put the
18195 results, if they are valid, in RES and ISA_FLAGS. Return whether the
18196 option is valid. */
18198 static bool
18199 aarch64_validate_march (const char *str, const struct processor **res,
18200 aarch64_feature_flags *isa_flags)
18202 std::string invalid_extension;
18203 enum aarch_parse_opt_result parse_res
18204 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
18206 if (parse_res == AARCH_PARSE_OK)
18207 return true;
18209 switch (parse_res)
18211 case AARCH_PARSE_MISSING_ARG:
18212 error ("missing arch name in %<-march=%s%>", str);
18213 break;
18214 case AARCH_PARSE_INVALID_ARG:
18215 error ("unknown value %qs for %<-march%>", str);
18216 aarch64_print_hint_for_arch (str);
18217 /* A common user error is confusing -march and -mcpu.
18218 If the -march string matches a known CPU suggest -mcpu. */
18219 parse_res = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
18220 if (parse_res == AARCH_PARSE_OK)
18221 inform (input_location, "did you mean %<-mcpu=%s%>?", str);
18222 break;
18223 case AARCH_PARSE_INVALID_FEATURE:
18224 error ("invalid feature modifier %qs in %<-march=%s%>",
18225 invalid_extension.c_str (), str);
18226 aarch64_print_hint_for_extensions (invalid_extension);
18227 break;
18228 default:
18229 gcc_unreachable ();
18232 return false;
18235 /* Validate a command-line -mtune option. Parse the cpu
18236 specified in STR and throw errors if appropriate. Put the
18237 result, if it is valid, in RES. Return whether the option is
18238 valid. */
18240 static bool
18241 aarch64_validate_mtune (const char *str, const struct processor **res)
18243 enum aarch_parse_opt_result parse_res
18244 = aarch64_parse_tune (str, res);
18246 if (parse_res == AARCH_PARSE_OK)
18247 return true;
18249 switch (parse_res)
18251 case AARCH_PARSE_MISSING_ARG:
18252 error ("missing cpu name in %<-mtune=%s%>", str);
18253 break;
18254 case AARCH_PARSE_INVALID_ARG:
18255 error ("unknown value %qs for %<-mtune%>", str);
18256 aarch64_print_hint_for_core (str);
18257 break;
18258 default:
18259 gcc_unreachable ();
18261 return false;
18264 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
18266 static poly_uint16
18267 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
18269 /* 128-bit SVE and Advanced SIMD modes use different register layouts
18270 on big-endian targets, so we would need to forbid subregs that convert
18271 from one to the other. By default a reinterpret sequence would then
18272 involve a store to memory in one mode and a load back in the other.
18273 Even if we optimize that sequence using reverse instructions,
18274 it would still be a significant potential overhead.
18276 For now, it seems better to generate length-agnostic code for that
18277 case instead. */
18278 if (value == SVE_SCALABLE
18279 || (value == SVE_128 && BYTES_BIG_ENDIAN))
18280 return poly_uint16 (2, 2);
18281 else
18282 return (int) value / 64;
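/* Worked example: -msve-vector-bits=256 gives VALUE == 256, so the VG
   (number of 64-bit granules) is 256 / 64 = 4.  -msve-vector-bits=scalable,
   and 128-bit SVE on big-endian targets, instead yield the length-agnostic
   poly_uint16 (2, 2).  */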
18285 /* Set the global aarch64_asm_isa_flags to FLAGS and update
18286 aarch64_isa_flags accordingly. */
18288 void
18289 aarch64_set_asm_isa_flags (aarch64_feature_flags flags)
18291 aarch64_set_asm_isa_flags (&global_options, flags);
18294 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
18295 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
18296 tuning structs. In particular it must set selected_tune and
18297 aarch64_asm_isa_flags that define the available ISA features and tuning
18298 decisions. It must also set selected_arch as this will be used to
18299 output the .arch asm tags for each function. */
18301 static void
18302 aarch64_override_options (void)
18304 aarch64_feature_flags cpu_isa = 0;
18305 aarch64_feature_flags arch_isa = 0;
18306 aarch64_set_asm_isa_flags (0);
18308 const struct processor *cpu = NULL;
18309 const struct processor *arch = NULL;
18310 const struct processor *tune = NULL;
18312 if (aarch64_harden_sls_string)
18313 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
18315 if (aarch64_branch_protection_string)
18316 aarch_validate_mbranch_protection (aarch64_branch_protection_string);
18318 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
18319 If either of -march or -mtune is given, they override their
18320 respective component of -mcpu. */
18321 if (aarch64_cpu_string)
18322 aarch64_validate_mcpu (aarch64_cpu_string, &cpu, &cpu_isa);
18324 if (aarch64_arch_string)
18325 aarch64_validate_march (aarch64_arch_string, &arch, &arch_isa);
18327 if (aarch64_tune_string)
18328 aarch64_validate_mtune (aarch64_tune_string, &tune);
18330 #ifdef SUBTARGET_OVERRIDE_OPTIONS
18331 SUBTARGET_OVERRIDE_OPTIONS;
18332 #endif
18334 if (cpu && arch)
18336 /* If both -mcpu and -march are specified, warn if they are not
18337 architecturally compatible and prefer the -march ISA flags. */
18338 if (arch->arch != cpu->arch)
18340 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
18341 aarch64_cpu_string,
18342 aarch64_arch_string);
18345 selected_arch = arch->arch;
18346 aarch64_set_asm_isa_flags (arch_isa);
18348 else if (cpu)
18350 selected_arch = cpu->arch;
18351 aarch64_set_asm_isa_flags (cpu_isa);
18353 else if (arch)
18355 cpu = &all_cores[arch->ident];
18356 selected_arch = arch->arch;
18357 aarch64_set_asm_isa_flags (arch_isa);
18359 else
18361 /* No -mcpu or -march specified, so use the default CPU. */
18362 cpu = &all_cores[TARGET_CPU_DEFAULT];
18363 selected_arch = cpu->arch;
18364 aarch64_set_asm_isa_flags (cpu->flags);
18367 selected_tune = tune ? tune->ident : cpu->ident;
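  /* As an illustration of the precedence rules above: with
     "-mcpu=neoverse-n1 -march=armv8.6-a" the ISA flags and selected_arch
     come from -march (after the conflict warning, since neoverse-n1 is an
     Armv8.2-A based core), while selected_tune still comes from the -mcpu
     core because no -mtune was given.  */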
18369 if (aarch_enable_bti == 2)
18371 #ifdef TARGET_ENABLE_BTI
18372 aarch_enable_bti = 1;
18373 #else
18374 aarch_enable_bti = 0;
18375 #endif
18378 /* Return address signing is currently not supported for ILP32 targets. For
18379 LP64 targets use the configured option in the absence of a command-line
18380 option for -mbranch-protection. */
18381 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
18383 #ifdef TARGET_ENABLE_PAC_RET
18384 aarch_ra_sign_scope = AARCH_FUNCTION_NON_LEAF;
18385 #else
18386 aarch_ra_sign_scope = AARCH_FUNCTION_NONE;
18387 #endif
18390 #ifndef HAVE_AS_MABI_OPTION
18391 /* The compiler may have been configured with 2.23.* binutils, which does
18392 not have support for ILP32. */
18393 if (TARGET_ILP32)
18394 error ("assembler does not support %<-mabi=ilp32%>");
18395 #endif
18397 /* Convert -msve-vector-bits to a VG count. */
18398 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
18400 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE && TARGET_ILP32)
18401 sorry ("return address signing is only supported for %<-mabi=lp64%>");
18403 /* The pass to insert speculation tracking runs before
18404 shrink-wrapping and the latter does not know how to update the
18405 tracking status. So disable it in this case. */
18406 if (aarch64_track_speculation)
18407 flag_shrink_wrap = 0;
18409 aarch64_override_options_internal (&global_options);
18411 /* Save these options as the default ones in case we push and pop them later
18412 while processing functions with potential target attributes. */
18413 target_option_default_node = target_option_current_node
18414 = build_target_option_node (&global_options, &global_options_set);
18417 /* Implement targetm.override_options_after_change. */
18419 static void
18420 aarch64_override_options_after_change (void)
18422 aarch64_override_options_after_change_1 (&global_options);
18425 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
18426 static char *
18427 aarch64_offload_options (void)
18429 if (TARGET_ILP32)
18430 return xstrdup ("-foffload-abi=ilp32");
18431 else
18432 return xstrdup ("-foffload-abi=lp64");
18435 static struct machine_function *
18436 aarch64_init_machine_status (void)
18438 struct machine_function *machine;
18439 machine = ggc_cleared_alloc<machine_function> ();
18440 return machine;
18443 void
18444 aarch64_init_expanders (void)
18446 init_machine_status = aarch64_init_machine_status;
18449 /* Initialize the code model from OPTS, adjusting it for PIC and diagnosing combinations that are not supported.  */
18450 static void
18451 initialize_aarch64_code_model (struct gcc_options *opts)
18453 aarch64_cmodel = opts->x_aarch64_cmodel_var;
18454 switch (opts->x_aarch64_cmodel_var)
18456 case AARCH64_CMODEL_TINY:
18457 if (opts->x_flag_pic)
18458 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
18459 break;
18460 case AARCH64_CMODEL_SMALL:
18461 if (opts->x_flag_pic)
18463 #ifdef HAVE_AS_SMALL_PIC_RELOCS
18464 aarch64_cmodel = (flag_pic == 2
18465 ? AARCH64_CMODEL_SMALL_PIC
18466 : AARCH64_CMODEL_SMALL_SPIC);
18467 #else
18468 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
18469 #endif
18471 break;
18472 case AARCH64_CMODEL_LARGE:
18473 if (opts->x_flag_pic)
18474 sorry ("code model %qs with %<-f%s%>", "large",
18475 opts->x_flag_pic > 1 ? "PIC" : "pic");
18476 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
18477 sorry ("code model %qs not supported in ilp32 mode", "large");
18478 break;
18479 case AARCH64_CMODEL_TINY_PIC:
18480 case AARCH64_CMODEL_SMALL_PIC:
18481 case AARCH64_CMODEL_SMALL_SPIC:
18482 gcc_unreachable ();
18486 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
18487 using the information saved in PTR. */
18489 static void
18490 aarch64_option_restore (struct gcc_options *opts,
18491 struct gcc_options * /* opts_set */,
18492 struct cl_target_option * /* ptr */)
18494 aarch64_override_options_internal (opts);
18497 /* Implement TARGET_OPTION_PRINT. */
18499 static void
18500 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
18502 const struct processor *cpu
18503 = aarch64_get_tune_cpu (ptr->x_selected_tune);
18504 const struct processor *arch = aarch64_get_arch (ptr->x_selected_arch);
18505 std::string extension
18506 = aarch64_get_extension_string_for_isa_flags (ptr->x_aarch64_asm_isa_flags,
18507 arch->flags);
18509 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
18510 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
18511 arch->name, extension.c_str ());
18514 static GTY(()) tree aarch64_previous_fndecl;
18516 void
18517 aarch64_reset_previous_fndecl (void)
18519 aarch64_previous_fndecl = NULL;
18522 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
18523 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
18524 make sure optab availability predicates are recomputed when necessary. */
18526 void
18527 aarch64_save_restore_target_globals (tree new_tree)
18529 if (TREE_TARGET_GLOBALS (new_tree))
18530 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
18531 else if (new_tree == target_option_default_node)
18532 restore_target_globals (&default_target_globals);
18533 else
18534 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
18537 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
18538 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
18539 of the function, if such exists. This function may be called multiple
18540 times on a single function so use aarch64_previous_fndecl to avoid
18541 setting up identical state. */
18543 static void
18544 aarch64_set_current_function (tree fndecl)
18546 if (!fndecl || fndecl == aarch64_previous_fndecl)
18547 return;
18549 tree old_tree = (aarch64_previous_fndecl
18550 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
18551 : NULL_TREE);
18553 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
18555 /* If current function has no attributes but the previous one did,
18556 use the default node. */
18557 if (!new_tree && old_tree)
18558 new_tree = target_option_default_node;
18560 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
18561 the default have been handled by aarch64_save_restore_target_globals from
18562 aarch64_pragma_target_parse. */
18563 if (old_tree == new_tree)
18564 return;
18566 aarch64_previous_fndecl = fndecl;
18568 /* First set the target options. */
18569 cl_target_option_restore (&global_options, &global_options_set,
18570 TREE_TARGET_OPTION (new_tree));
18572 aarch64_save_restore_target_globals (new_tree);
18575 /* Enum describing the various ways we can handle attributes.
18576 In many cases we can reuse the generic option handling machinery. */
18578 enum aarch64_attr_opt_type
18580 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
18581 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
18582 aarch64_attr_enum, /* Attribute sets an enum variable. */
18583 aarch64_attr_custom /* Attribute requires a custom handling function. */
18586 /* All the information needed to handle a target attribute.
18587 NAME is the name of the attribute.
18588 ATTR_TYPE specifies the type of behavior of the attribute as described
18589 in the definition of enum aarch64_attr_opt_type.
18590 ALLOW_NEG is true if the attribute supports a "no-" form.
18591    HANDLER is the function that takes the attribute string as an argument.
18592    It is needed only when ATTR_TYPE is aarch64_attr_custom.
18593 OPT_NUM is the enum specifying the option that the attribute modifies.
18594 This is needed for attributes that mirror the behavior of a command-line
18595    option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
18596 aarch64_attr_enum. */
18598 struct aarch64_attribute_info
18600 const char *name;
18601 enum aarch64_attr_opt_type attr_type;
18602 bool allow_neg;
18603 bool (*handler) (const char *);
18604 enum opt_code opt_num;
18607 /* Handle the ARCH_STR argument to the arch= target attribute. */
18609 static bool
18610 aarch64_handle_attr_arch (const char *str)
18612 const struct processor *tmp_arch = NULL;
18613 std::string invalid_extension;
18614 aarch64_feature_flags tmp_flags;
18615 enum aarch_parse_opt_result parse_res
18616 = aarch64_parse_arch (str, &tmp_arch, &tmp_flags, &invalid_extension);
18618 if (parse_res == AARCH_PARSE_OK)
18620 gcc_assert (tmp_arch);
18621 selected_arch = tmp_arch->arch;
18622 aarch64_set_asm_isa_flags (tmp_flags);
18623 return true;
18626 switch (parse_res)
18628 case AARCH_PARSE_MISSING_ARG:
18629 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
18630 break;
18631 case AARCH_PARSE_INVALID_ARG:
18632 error ("invalid name %qs in %<target(\"arch=\")%> pragma or attribute", str);
18633 aarch64_print_hint_for_arch (str);
18634 break;
18635 case AARCH_PARSE_INVALID_FEATURE:
18636 error ("invalid feature modifier %s of value %qs in "
18637 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18638 aarch64_print_hint_for_extensions (invalid_extension);
18639 break;
18640 default:
18641 gcc_unreachable ();
18644 return false;
18647 /* Handle the argument CPU_STR to the cpu= target attribute. */
18649 static bool
18650 aarch64_handle_attr_cpu (const char *str)
18652 const struct processor *tmp_cpu = NULL;
18653 std::string invalid_extension;
18654 aarch64_feature_flags tmp_flags;
18655 enum aarch_parse_opt_result parse_res
18656 = aarch64_parse_cpu (str, &tmp_cpu, &tmp_flags, &invalid_extension);
18658 if (parse_res == AARCH_PARSE_OK)
18660 gcc_assert (tmp_cpu);
18661 selected_tune = tmp_cpu->ident;
18662 selected_arch = tmp_cpu->arch;
18663 aarch64_set_asm_isa_flags (tmp_flags);
18664 return true;
18667 switch (parse_res)
18669 case AARCH_PARSE_MISSING_ARG:
18670 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
18671 break;
18672 case AARCH_PARSE_INVALID_ARG:
18673 error ("invalid name %qs in %<target(\"cpu=\")%> pragma or attribute", str);
18674 aarch64_print_hint_for_core (str);
18675 break;
18676 case AARCH_PARSE_INVALID_FEATURE:
18677 error ("invalid feature modifier %qs of value %qs in "
18678 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18679 aarch64_print_hint_for_extensions (invalid_extension);
18680 break;
18681 default:
18682 gcc_unreachable ();
18685 return false;
18688 /* Handle the argument STR to the branch-protection= attribute. */
18690 static bool
18691 aarch64_handle_attr_branch_protection (const char* str)
18693 char *err_str = (char *) xmalloc (strlen (str) + 1);
18694 enum aarch_parse_opt_result res = aarch_parse_branch_protection (str,
18695 &err_str);
18696 bool success = false;
18697 switch (res)
18699 case AARCH_PARSE_MISSING_ARG:
18700 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
18701 " attribute");
18702 break;
18703 case AARCH_PARSE_INVALID_ARG:
18704 error ("invalid protection type %qs in %<target(\"branch-protection"
18705 "=\")%> pragma or attribute", err_str);
18706 break;
18707 case AARCH_PARSE_OK:
18708 success = true;
18709 /* Fall through. */
18710 case AARCH_PARSE_INVALID_FEATURE:
18711 break;
18712 default:
18713 gcc_unreachable ();
18715 free (err_str);
18716 return success;
18719 /* Handle the argument STR to the tune= target attribute. */
18721 static bool
18722 aarch64_handle_attr_tune (const char *str)
18724 const struct processor *tmp_tune = NULL;
18725 enum aarch_parse_opt_result parse_res
18726 = aarch64_parse_tune (str, &tmp_tune);
18728 if (parse_res == AARCH_PARSE_OK)
18730 gcc_assert (tmp_tune);
18731 selected_tune = tmp_tune->ident;
18732 return true;
18735 switch (parse_res)
18737 case AARCH_PARSE_INVALID_ARG:
18738 error ("invalid name %qs in %<target(\"tune=\")%> pragma or attribute", str);
18739 aarch64_print_hint_for_core (str);
18740 break;
18741 default:
18742 gcc_unreachable ();
18745 return false;
18748 /* Parse an architecture extension target attribute string specified in STR,
18749    for example "+fp+nosimd".  Show any errors if needed.  Return TRUE
18750    if successful.  Update aarch64_asm_isa_flags to reflect the ISA features
18751    modified.  */
18753 static bool
18754 aarch64_handle_attr_isa_flags (char *str)
18756 enum aarch_parse_opt_result parse_res;
18757 auto isa_flags = aarch64_asm_isa_flags;
18759 /* We allow "+nothing" in the beginning to clear out all architectural
18760 features if the user wants to handpick specific features. */
18761 if (strncmp ("+nothing", str, 8) == 0)
18763 isa_flags = 0;
18764 str += 8;
18767 std::string invalid_extension;
18768 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
18770 if (parse_res == AARCH_PARSE_OK)
18772 aarch64_set_asm_isa_flags (isa_flags);
18773 return true;
18776 switch (parse_res)
18778 case AARCH_PARSE_MISSING_ARG:
18779 error ("missing value in %<target()%> pragma or attribute");
18780 break;
18782 case AARCH_PARSE_INVALID_FEATURE:
18783 error ("invalid feature modifier %qs of value %qs in "
18784 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
18785 break;
18787 default:
18788 gcc_unreachable ();
18791 return false;
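/* For example, the attribute string "+nothing+fp" first clears every
   architectural feature and then re-enables just "fp" (plus anything "fp"
   itself implies), whereas plain "+fp" adds "fp" on top of the current
   aarch64_asm_isa_flags.  */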
18794 /* The target attributes that we support. On top of these we also support just
18795 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
18796 handled explicitly in aarch64_process_one_target_attr. */
18798 static const struct aarch64_attribute_info aarch64_attributes[] =
18800 { "general-regs-only", aarch64_attr_mask, false, NULL,
18801 OPT_mgeneral_regs_only },
18802 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
18803 OPT_mfix_cortex_a53_835769 },
18804 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
18805 OPT_mfix_cortex_a53_843419 },
18806 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
18807 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
18808 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
18809 OPT_momit_leaf_frame_pointer },
18810 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
18811 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
18812 OPT_march_ },
18813 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
18814 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
18815 OPT_mtune_ },
18816 { "branch-protection", aarch64_attr_custom, false,
18817 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
18818 { "sign-return-address", aarch64_attr_enum, false, NULL,
18819 OPT_msign_return_address_ },
18820 { "outline-atomics", aarch64_attr_bool, true, NULL,
18821 OPT_moutline_atomics},
18822 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
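/* Illustrative use of the table above (hypothetical user code, not part of
   this file):

     __attribute__ ((target ("arch=armv8.2-a+crypto,no-omit-leaf-frame-pointer")))
     int foo (int x);

   "arch=..." is routed to aarch64_handle_attr_arch, while
   "no-omit-leaf-frame-pointer" uses the generic aarch64_attr_bool handling
   with its "no-" prefix.  */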
18825 /* Parse ARG_STR which contains the definition of one target attribute.
18826 Show appropriate errors if any or return true if the attribute is valid. */
18828 static bool
18829 aarch64_process_one_target_attr (char *arg_str)
18831 bool invert = false;
18833 size_t len = strlen (arg_str);
18835 if (len == 0)
18837 error ("malformed %<target()%> pragma or attribute");
18838 return false;
18841 char *str_to_check = (char *) alloca (len + 1);
18842 strcpy (str_to_check, arg_str);
18844 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
18845 It is easier to detect and handle it explicitly here rather than going
18846 through the machinery for the rest of the target attributes in this
18847 function. */
18848 if (*str_to_check == '+')
18849 return aarch64_handle_attr_isa_flags (str_to_check);
18851 if (len > 3 && startswith (str_to_check, "no-"))
18853 invert = true;
18854 str_to_check += 3;
18856 char *arg = strchr (str_to_check, '=');
18858 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
18859 and point ARG to "foo". */
18860 if (arg)
18862 *arg = '\0';
18863 arg++;
18865 const struct aarch64_attribute_info *p_attr;
18866 bool found = false;
18867 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
18869 /* If the names don't match up, or the user has given an argument
18870 to an attribute that doesn't accept one, or didn't give an argument
18871 to an attribute that expects one, fail to match. */
18872 if (strcmp (str_to_check, p_attr->name) != 0)
18873 continue;
18875 found = true;
18876 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
18877 || p_attr->attr_type == aarch64_attr_enum;
18879 if (attr_need_arg_p ^ (arg != NULL))
18881 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
18882 return false;
18885 /* If the name matches but the attribute does not allow "no-" versions
18886 then we can't match. */
18887 if (invert && !p_attr->allow_neg)
18889 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
18890 return false;
18893 switch (p_attr->attr_type)
18895 /* Has a custom handler registered.
18896 For example, cpu=, arch=, tune=. */
18897 case aarch64_attr_custom:
18898 gcc_assert (p_attr->handler);
18899 if (!p_attr->handler (arg))
18900 return false;
18901 break;
18903 /* Either set or unset a boolean option. */
18904 case aarch64_attr_bool:
18906 struct cl_decoded_option decoded;
18908 generate_option (p_attr->opt_num, NULL, !invert,
18909 CL_TARGET, &decoded);
18910 aarch64_handle_option (&global_options, &global_options_set,
18911 &decoded, input_location);
18912 break;
18914 /* Set or unset a bit in the target_flags. aarch64_handle_option
18915 should know what mask to apply given the option number. */
18916 case aarch64_attr_mask:
18918 struct cl_decoded_option decoded;
18919 /* We only need to specify the option number.
18920 aarch64_handle_option will know which mask to apply. */
18921 decoded.opt_index = p_attr->opt_num;
18922 decoded.value = !invert;
18923 aarch64_handle_option (&global_options, &global_options_set,
18924 &decoded, input_location);
18925 break;
18927 /* Use the option setting machinery to set an option to an enum. */
18928 case aarch64_attr_enum:
18930 gcc_assert (arg);
18931 bool valid;
18932 int value;
18933 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
18934 &value, CL_TARGET);
18935 if (valid)
18937 set_option (&global_options, NULL, p_attr->opt_num, value,
18938 NULL, DK_UNSPECIFIED, input_location,
18939 global_dc);
18941 else
18943 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
18945 break;
18947 default:
18948 gcc_unreachable ();
18952 /* If we reached here we either have found an attribute and validated
18953 it or didn't match any. If we matched an attribute but its arguments
18954 were malformed we will have returned false already. */
18955 return found;
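/* For example, processing the single attribute "no-strict-align" sets
   INVERT, strips the "no-" prefix, matches the aarch64_attr_mask entry for
   "strict-align" (which allows negation) and hands the option off to
   aarch64_handle_option with value 0, clearing the corresponding bit in
   target_flags.  */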
18958 /* Count how many times the character C appears in
18959 NULL-terminated string STR. */
18961 static unsigned int
18962 num_occurences_in_str (char c, char *str)
18964 unsigned int res = 0;
18965 while (*str != '\0')
18967 if (*str == c)
18968 res++;
18970 str++;
18973 return res;
18976 /* Parse the tree in ARGS that contains the target attribute information
18977 and update the global target options space. */
18979 bool
18980 aarch64_process_target_attr (tree args)
18982 if (TREE_CODE (args) == TREE_LIST)
18986 tree head = TREE_VALUE (args);
18987 if (head)
18989 if (!aarch64_process_target_attr (head))
18990 return false;
18992 args = TREE_CHAIN (args);
18993 } while (args);
18995 return true;
18998 if (TREE_CODE (args) != STRING_CST)
19000 error ("attribute %<target%> argument not a string");
19001 return false;
19004 size_t len = strlen (TREE_STRING_POINTER (args));
19005 char *str_to_check = (char *) alloca (len + 1);
19006 strcpy (str_to_check, TREE_STRING_POINTER (args));
19008 if (len == 0)
19010 error ("malformed %<target()%> pragma or attribute");
19011 return false;
19014   /* Used to catch empty strings between commas, e.g.
19015      attribute ((target ("attr1,,attr2"))).  */
19016 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
19018 /* Handle multiple target attributes separated by ','. */
19019 char *token = strtok_r (str_to_check, ",", &str_to_check);
19021 unsigned int num_attrs = 0;
19022 while (token)
19024 num_attrs++;
19025 if (!aarch64_process_one_target_attr (token))
19027 /* Check if token is possibly an arch extension without
19028 leading '+'. */
19029 aarch64_feature_flags isa_temp = 0;
19030 auto with_plus = std::string ("+") + token;
19031 enum aarch_parse_opt_result ext_res
19032 = aarch64_parse_extension (with_plus.c_str (), &isa_temp, nullptr);
19034 if (ext_res == AARCH_PARSE_OK)
19035 error ("arch extension %<%s%> should be prefixed by %<+%>",
19036 token);
19037 else
19038 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
19039 return false;
19042 token = strtok_r (NULL, ",", &str_to_check);
19045 if (num_attrs != num_commas + 1)
19047 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
19048 return false;
19051 return true;
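/* Worked example: for attribute ((target ("arch=armv8-a,,tune=cortex-a53")))
   strtok_r produces only two tokens because it skips the empty string, but
   NUM_COMMAS is 2, so NUM_ATTRS (2) != NUM_COMMAS + 1 (3) and the
   "malformed" error above is reported.  */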
19054 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
19055 process attribute ((target ("..."))). */
19057 static bool
19058 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
19060 struct cl_target_option cur_target;
19061 bool ret;
19062 tree old_optimize;
19063 tree new_target, new_optimize;
19064 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19066 /* If what we're processing is the current pragma string then the
19067 target option node is already stored in target_option_current_node
19068 by aarch64_pragma_target_parse in aarch64-c.cc. Use that to avoid
19069 having to re-parse the string. This is especially useful to keep
19070 arm_neon.h compile times down since that header contains a lot
19071 of intrinsics enclosed in pragmas. */
19072 if (!existing_target && args == current_target_pragma)
19074 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
19075 return true;
19077 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19079 old_optimize
19080 = build_optimization_node (&global_options, &global_options_set);
19081 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
19083 /* If the function changed the optimization levels as well as setting
19084 target options, start with the optimizations specified. */
19085 if (func_optimize && func_optimize != old_optimize)
19086 cl_optimization_restore (&global_options, &global_options_set,
19087 TREE_OPTIMIZATION (func_optimize));
19089 /* Save the current target options to restore at the end. */
19090 cl_target_option_save (&cur_target, &global_options, &global_options_set);
19092 /* If fndecl already has some target attributes applied to it, unpack
19093 them so that we add this attribute on top of them, rather than
19094 overwriting them. */
19095 if (existing_target)
19097 struct cl_target_option *existing_options
19098 = TREE_TARGET_OPTION (existing_target);
19100 if (existing_options)
19101 cl_target_option_restore (&global_options, &global_options_set,
19102 existing_options);
19104 else
19105 cl_target_option_restore (&global_options, &global_options_set,
19106 TREE_TARGET_OPTION (target_option_current_node));
19108 ret = aarch64_process_target_attr (args);
19110 /* Set up any additional state. */
19111 if (ret)
19113 aarch64_override_options_internal (&global_options);
19114 new_target = build_target_option_node (&global_options,
19115 &global_options_set);
19117 else
19118 new_target = NULL;
19120 new_optimize = build_optimization_node (&global_options,
19121 &global_options_set);
19123 if (fndecl && ret)
19125 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
19127 if (old_optimize != new_optimize)
19128 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
19131 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
19133 if (old_optimize != new_optimize)
19134 cl_optimization_restore (&global_options, &global_options_set,
19135 TREE_OPTIMIZATION (old_optimize));
19136 return ret;
19139 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
19140 tri-bool options (yes, no, don't care) and the default value is
19141    DEF, return true if inlining the callee is acceptable.  */
19143 static bool
19144 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
19145 int dont_care, int def)
19147 /* If the callee doesn't care, always allow inlining. */
19148 if (callee == dont_care)
19149 return true;
19151 /* If the caller doesn't care, always allow inlining. */
19152 if (caller == dont_care)
19153 return true;
19155 /* Otherwise, allow inlining if either the callee and caller values
19156 agree, or if the callee is using the default value. */
19157 return (callee == caller || callee == def);
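/* For instance, if the caller was built with -mfix-cortex-a53-835769
   (value 1), the callee with -mno-fix-cortex-a53-835769 (value 0) and the
   default DEF happens to be 1, none of the three checks succeed, so this
   returns false and aarch64_can_inline_p rejects the inlining.  */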
19160 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
19161 to inline CALLEE into CALLER based on target-specific info.
19162 Make sure that the caller and callee have compatible architectural
19163 features. Then go through the other possible target attributes
19164 and see if they can block inlining. Try not to reject always_inline
19165 callees unless they are incompatible architecturally. */
19167 static bool
19168 aarch64_can_inline_p (tree caller, tree callee)
19170 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
19171 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
19173 struct cl_target_option *caller_opts
19174 = TREE_TARGET_OPTION (caller_tree ? caller_tree
19175 : target_option_default_node);
19177 struct cl_target_option *callee_opts
19178 = TREE_TARGET_OPTION (callee_tree ? callee_tree
19179 : target_option_default_node);
19181 /* Callee's ISA flags should be a subset of the caller's. */
19182 if ((caller_opts->x_aarch64_asm_isa_flags
19183 & callee_opts->x_aarch64_asm_isa_flags)
19184 != callee_opts->x_aarch64_asm_isa_flags)
19185 return false;
19186 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
19187 != callee_opts->x_aarch64_isa_flags)
19188 return false;
19190 /* Allow non-strict aligned functions inlining into strict
19191 aligned ones. */
19192 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
19193 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
19194 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
19195 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
19196 return false;
19198 bool always_inline = lookup_attribute ("always_inline",
19199 DECL_ATTRIBUTES (callee));
19201 /* If the architectural features match up and the callee is always_inline
19202 then the other attributes don't matter. */
19203 if (always_inline)
19204 return true;
19206 if (caller_opts->x_aarch64_cmodel_var
19207 != callee_opts->x_aarch64_cmodel_var)
19208 return false;
19210 if (caller_opts->x_aarch64_tls_dialect
19211 != callee_opts->x_aarch64_tls_dialect)
19212 return false;
19214 /* Honour explicit requests to workaround errata. */
19215 if (!aarch64_tribools_ok_for_inlining_p (
19216 caller_opts->x_aarch64_fix_a53_err835769,
19217 callee_opts->x_aarch64_fix_a53_err835769,
19218 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
19219 return false;
19221 if (!aarch64_tribools_ok_for_inlining_p (
19222 caller_opts->x_aarch64_fix_a53_err843419,
19223 callee_opts->x_aarch64_fix_a53_err843419,
19224 2, TARGET_FIX_ERR_A53_843419))
19225 return false;
19227 /* If the user explicitly specified -momit-leaf-frame-pointer for the
19228      caller and callee and they don't match up, reject inlining.  */
19229 if (!aarch64_tribools_ok_for_inlining_p (
19230 caller_opts->x_flag_omit_leaf_frame_pointer,
19231 callee_opts->x_flag_omit_leaf_frame_pointer,
19232 2, 1))
19233 return false;
19235 /* If the callee has specific tuning overrides, respect them. */
19236 if (callee_opts->x_aarch64_override_tune_string != NULL
19237 && caller_opts->x_aarch64_override_tune_string == NULL)
19238 return false;
19240 /* If the user specified tuning override strings for the
19241 caller and callee and they don't match up, reject inlining.
19242      We just do a string compare here; we don't analyze the meaning
19243 of the string, as it would be too costly for little gain. */
19244 if (callee_opts->x_aarch64_override_tune_string
19245 && caller_opts->x_aarch64_override_tune_string
19246 && (strcmp (callee_opts->x_aarch64_override_tune_string,
19247 caller_opts->x_aarch64_override_tune_string) != 0))
19248 return false;
19250 return true;
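/* For example, a callee declared with __attribute__ ((target ("+sve")))
   cannot be inlined into a caller compiled without SVE, because the
   callee's ISA flags are not a subset of the caller's; inlining a plain
   callee into an SVE-enabled caller is fine as far as this check is
   concerned.  */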
19253 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
19254    hasn't been initialized already.  */
19256 unsigned int
19257 aarch64_tlsdesc_abi_id ()
19259 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
19260 if (!tlsdesc_abi.initialized_p ())
19262 HARD_REG_SET full_reg_clobbers;
19263 CLEAR_HARD_REG_SET (full_reg_clobbers);
19264 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
19265 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
19266 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
19267 SET_HARD_REG_BIT (full_reg_clobbers, regno);
19268 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
19270 return tlsdesc_abi.id ();
19273 /* Return true if SYMBOL_REF X binds locally. */
19275 static bool
19276 aarch64_symbol_binds_local_p (const_rtx x)
19278 return (SYMBOL_REF_DECL (x)
19279 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
19280 : SYMBOL_REF_LOCAL_P (x));
19283 /* Return true if SYMBOL_REF X is thread-local.  */
19284 static bool
19285 aarch64_tls_symbol_p (rtx x)
19287 if (! TARGET_HAVE_TLS)
19288 return false;
19290 x = strip_salt (x);
19291 if (!SYMBOL_REF_P (x))
19292 return false;
19294 return SYMBOL_REF_TLS_MODEL (x) != 0;
19297 /* Classify a TLS symbol into one of the TLS kinds. */
19298 enum aarch64_symbol_type
19299 aarch64_classify_tls_symbol (rtx x)
19301 enum tls_model tls_kind = tls_symbolic_operand_type (x);
19303 switch (tls_kind)
19305 case TLS_MODEL_GLOBAL_DYNAMIC:
19306 case TLS_MODEL_LOCAL_DYNAMIC:
19307 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
19309 case TLS_MODEL_INITIAL_EXEC:
19310 switch (aarch64_cmodel)
19312 case AARCH64_CMODEL_TINY:
19313 case AARCH64_CMODEL_TINY_PIC:
19314 return SYMBOL_TINY_TLSIE;
19315 default:
19316 return SYMBOL_SMALL_TLSIE;
19319 case TLS_MODEL_LOCAL_EXEC:
19320 if (aarch64_tls_size == 12)
19321 return SYMBOL_TLSLE12;
19322 else if (aarch64_tls_size == 24)
19323 return SYMBOL_TLSLE24;
19324 else if (aarch64_tls_size == 32)
19325 return SYMBOL_TLSLE32;
19326 else if (aarch64_tls_size == 48)
19327 return SYMBOL_TLSLE48;
19328 else
19329 gcc_unreachable ();
19331 case TLS_MODEL_EMULATED:
19332 case TLS_MODEL_NONE:
19333 return SYMBOL_FORCE_TO_MEM;
19335 default:
19336 gcc_unreachable ();
19340 /* Return the correct method for accessing X + OFFSET, where X is either
19341 a SYMBOL_REF or LABEL_REF. */
19343 enum aarch64_symbol_type
19344 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
19346 x = strip_salt (x);
19348 if (LABEL_REF_P (x))
19350 switch (aarch64_cmodel)
19352 case AARCH64_CMODEL_LARGE:
19353 return SYMBOL_FORCE_TO_MEM;
19355 case AARCH64_CMODEL_TINY_PIC:
19356 case AARCH64_CMODEL_TINY:
19357 return SYMBOL_TINY_ABSOLUTE;
19359 case AARCH64_CMODEL_SMALL_SPIC:
19360 case AARCH64_CMODEL_SMALL_PIC:
19361 case AARCH64_CMODEL_SMALL:
19362 return SYMBOL_SMALL_ABSOLUTE;
19364 default:
19365 gcc_unreachable ();
19369 if (SYMBOL_REF_P (x))
19371 if (aarch64_tls_symbol_p (x))
19372 return aarch64_classify_tls_symbol (x);
19374 switch (aarch64_cmodel)
19376 case AARCH64_CMODEL_TINY_PIC:
19377 case AARCH64_CMODEL_TINY:
19378 /* With -fPIC non-local symbols use the GOT. For orthogonality
19379 always use the GOT for extern weak symbols. */
19380 if ((flag_pic || SYMBOL_REF_WEAK (x))
19381 && !aarch64_symbol_binds_local_p (x))
19382 return SYMBOL_TINY_GOT;
19384 /* When we retrieve symbol + offset address, we have to make sure
19385 the offset does not cause overflow of the final address. But
19386 we have no way of knowing the address of symbol at compile time
19387 so we can't accurately say if the distance between the PC and
19388 	 symbol + offset is outside the addressable range of +/-1MB in the
19389 TINY code model. So we limit the maximum offset to +/-64KB and
19390 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
19391 If offset_within_block_p is true we allow larger offsets. */
19392 if (!(IN_RANGE (offset, -0x10000, 0x10000)
19393 || offset_within_block_p (x, offset)))
19394 return SYMBOL_FORCE_TO_MEM;
19396 return SYMBOL_TINY_ABSOLUTE;
19399 case AARCH64_CMODEL_SMALL_SPIC:
19400 case AARCH64_CMODEL_SMALL_PIC:
19401 case AARCH64_CMODEL_SMALL:
19402 if ((flag_pic || SYMBOL_REF_WEAK (x))
19403 && !aarch64_symbol_binds_local_p (x))
19404 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
19405 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
19407 /* Same reasoning as the tiny code model, but the offset cap here is
19408 1MB, allowing +/-3.9GB for the offset to the symbol. */
19409 if (!(IN_RANGE (offset, -0x100000, 0x100000)
19410 || offset_within_block_p (x, offset)))
19411 return SYMBOL_FORCE_TO_MEM;
19413 return SYMBOL_SMALL_ABSOLUTE;
19415 case AARCH64_CMODEL_LARGE:
19416 /* This is alright even in PIC code as the constant
19417 pool reference is always PC relative and within
19418 the same translation unit. */
19419 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
19420 return SYMBOL_SMALL_ABSOLUTE;
19421 else
19422 return SYMBOL_FORCE_TO_MEM;
19424 default:
19425 gcc_unreachable ();
19429 /* By default push everything into the constant pool. */
19430 return SYMBOL_FORCE_TO_MEM;
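/* Worked example for the small code model: for a non-TLS symbol that binds
   locally, a reference to sym + 0x800 stays SYMBOL_SMALL_ABSOLUTE, but
   sym + 0x200000 (2 MB) is outside the +/-1MB offset cap and, unless the
   offset is known to stay within sym's own block, is classified as
   SYMBOL_FORCE_TO_MEM.  */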
19433 bool
19434 aarch64_constant_address_p (rtx x)
19436 return (CONSTANT_P (x) && memory_address_p (DImode, x));
19439 bool
19440 aarch64_legitimate_pic_operand_p (rtx x)
19442 poly_int64 offset;
19443 x = strip_offset_and_salt (x, &offset);
19444 if (SYMBOL_REF_P (x))
19445 return false;
19447 return true;
19450 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
19451 that should be rematerialized rather than spilled. */
19453 static bool
19454 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
19456 /* Support CSE and rematerialization of common constants. */
19457 if (CONST_INT_P (x)
19458 || CONST_DOUBLE_P (x))
19459 return true;
19461 /* Only accept variable-length vector constants if they can be
19462 handled directly.
19464 ??? It would be possible (but complex) to handle rematerialization
19465 of other constants via secondary reloads. */
19466 if (!GET_MODE_SIZE (mode).is_constant ())
19467 return aarch64_simd_valid_immediate (x, NULL);
19469 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
19470 least be forced to memory and loaded from there. */
19471 if (CONST_VECTOR_P (x))
19472 return !targetm.cannot_force_const_mem (mode, x);
19474 /* Do not allow vector struct mode constants for Advanced SIMD.
19475 We could support 0 and -1 easily, but they need support in
19476 aarch64-simd.md. */
19477 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19478 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
19479 return false;
19481 if (GET_CODE (x) == HIGH)
19482 x = XEXP (x, 0);
19484 /* Accept polynomial constants that can be calculated by using the
19485 destination of a move as the sole temporary. Constants that
19486 require a second temporary cannot be rematerialized (they can't be
19487 forced to memory and also aren't legitimate constants). */
19488 poly_int64 offset;
19489 if (poly_int_rtx_p (x, &offset))
19490 return aarch64_offset_temporaries (false, offset) <= 1;
19492 /* If an offset is being added to something else, we need to allow the
19493 base to be moved into the destination register, meaning that there
19494 are no free temporaries for the offset. */
19495 x = strip_offset_and_salt (x, &offset);
19496 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
19497 return false;
19499 /* Do not allow const (plus (anchor_symbol, const_int)). */
19500 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
19501 return false;
19503 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
19504 so spilling them is better than rematerialization. */
19505 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
19506 return true;
19508 /* Label references are always constant. */
19509 if (LABEL_REF_P (x))
19510 return true;
19512 return false;
19516 aarch64_load_tp (rtx target)
19518 if (!target
19519 || GET_MODE (target) != Pmode
19520 || !register_operand (target, Pmode))
19521 target = gen_reg_rtx (Pmode);
19523 /* Can return in any reg. */
19524 emit_insn (gen_aarch64_load_tp_hard (target));
19525 return target;
19528 /* On AAPCS systems, this is the "struct __va_list". */
19529 static GTY(()) tree va_list_type;
19531 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
19532 Return the type to use as __builtin_va_list.
19534 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
19536 struct __va_list
19538 void *__stack;
19539 void *__gr_top;
19540 void *__vr_top;
19541 int __gr_offs;
19542 int __vr_offs;
19543 }; */
19545 static tree
19546 aarch64_build_builtin_va_list (void)
19548 tree va_list_name;
19549 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19551 /* Create the type. */
19552 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
19553 /* Give it the required name. */
19554 va_list_name = build_decl (BUILTINS_LOCATION,
19555 TYPE_DECL,
19556 get_identifier ("__va_list"),
19557 va_list_type);
19558 DECL_ARTIFICIAL (va_list_name) = 1;
19559 TYPE_NAME (va_list_type) = va_list_name;
19560 TYPE_STUB_DECL (va_list_type) = va_list_name;
19562 /* Create the fields. */
19563 f_stack = build_decl (BUILTINS_LOCATION,
19564 FIELD_DECL, get_identifier ("__stack"),
19565 ptr_type_node);
19566 f_grtop = build_decl (BUILTINS_LOCATION,
19567 FIELD_DECL, get_identifier ("__gr_top"),
19568 ptr_type_node);
19569 f_vrtop = build_decl (BUILTINS_LOCATION,
19570 FIELD_DECL, get_identifier ("__vr_top"),
19571 ptr_type_node);
19572 f_groff = build_decl (BUILTINS_LOCATION,
19573 FIELD_DECL, get_identifier ("__gr_offs"),
19574 integer_type_node);
19575 f_vroff = build_decl (BUILTINS_LOCATION,
19576 FIELD_DECL, get_identifier ("__vr_offs"),
19577 integer_type_node);
19579 /* Tell tree-stdarg pass about our internal offset fields.
19580      NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
19581      purposes, to identify whether the code is updating the va_list internal
19582      offset fields in an irregular way.  */
19583 va_list_gpr_counter_field = f_groff;
19584 va_list_fpr_counter_field = f_vroff;
19586 DECL_ARTIFICIAL (f_stack) = 1;
19587 DECL_ARTIFICIAL (f_grtop) = 1;
19588 DECL_ARTIFICIAL (f_vrtop) = 1;
19589 DECL_ARTIFICIAL (f_groff) = 1;
19590 DECL_ARTIFICIAL (f_vroff) = 1;
19592 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
19593 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
19594 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
19595 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
19596 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
19598 TYPE_FIELDS (va_list_type) = f_stack;
19599 DECL_CHAIN (f_stack) = f_grtop;
19600 DECL_CHAIN (f_grtop) = f_vrtop;
19601 DECL_CHAIN (f_vrtop) = f_groff;
19602 DECL_CHAIN (f_groff) = f_vroff;
19604 /* Compute its layout. */
19605 layout_type (va_list_type);
19607 return va_list_type;
19610 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
19611 static void
19612 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
19614 const CUMULATIVE_ARGS *cum;
19615 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19616 tree stack, grtop, vrtop, groff, vroff;
19617 tree t;
19618 int gr_save_area_size = cfun->va_list_gpr_size;
19619 int vr_save_area_size = cfun->va_list_fpr_size;
19620 int vr_offset;
19622 cum = &crtl->args.info;
19623 if (cfun->va_list_gpr_size)
19624 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
19625 cfun->va_list_gpr_size);
19626 if (cfun->va_list_fpr_size)
19627 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
19628 * UNITS_PER_VREG, cfun->va_list_fpr_size);
19630 if (!TARGET_FLOAT)
19632 gcc_assert (cum->aapcs_nvrn == 0);
19633 vr_save_area_size = 0;
19636 f_stack = TYPE_FIELDS (va_list_type_node);
19637 f_grtop = DECL_CHAIN (f_stack);
19638 f_vrtop = DECL_CHAIN (f_grtop);
19639 f_groff = DECL_CHAIN (f_vrtop);
19640 f_vroff = DECL_CHAIN (f_groff);
19642 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
19643 NULL_TREE);
19644 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
19645 NULL_TREE);
19646 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
19647 NULL_TREE);
19648 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
19649 NULL_TREE);
19650 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
19651 NULL_TREE);
19653 /* Emit code to initialize STACK, which points to the next varargs stack
19654 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
19655 by named arguments. STACK is 8-byte aligned. */
19656 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
19657 if (cum->aapcs_stack_size > 0)
19658 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
19659 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
19660 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19662 /* Emit code to initialize GRTOP, the top of the GR save area.
19663 virtual_incoming_args_rtx should have been 16 byte aligned. */
19664 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
19665 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
19666 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19668 /* Emit code to initialize VRTOP, the top of the VR save area.
19669 This address is gr_save_area_bytes below GRTOP, rounded
19670 down to the next 16-byte boundary. */
19671 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
19672 vr_offset = ROUND_UP (gr_save_area_size,
19673 STACK_BOUNDARY / BITS_PER_UNIT);
19675 if (vr_offset)
19676 t = fold_build_pointer_plus_hwi (t, -vr_offset);
19677 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
19678 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19680 /* Emit code to initialize GROFF, the offset from GRTOP of the
19681 next GPR argument. */
19682 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
19683 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
19684 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
19686   /* Likewise emit code to initialize VROFF, the offset from VRTOP
19687 of the next VR argument. */
19688 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
19689 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
19690 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
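/* Worked example (illustrative, assuming TARGET_FLOAT and the default
   full-size save areas): for "int f (int n, ...)" one GPR is used by the
   named argument, so __gr_offs is initialized to -(8 - 1) * 8 = -56 and,
   with no named FP arguments, __vr_offs to -8 * 16 = -128; __vr_top then
   sits ROUND_UP (56, 16) = 64 bytes below __gr_top.  */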
19693 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
19695 static tree
19696 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
19697 gimple_seq *post_p ATTRIBUTE_UNUSED)
19699 tree addr;
19700 bool indirect_p;
19701 bool is_ha; /* is HFA or HVA. */
19702 bool dw_align; /* double-word align. */
19703 machine_mode ag_mode = VOIDmode;
19704 int nregs;
19705 machine_mode mode;
19707 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
19708 tree stack, f_top, f_off, off, arg, roundup, on_stack;
19709 HOST_WIDE_INT size, rsize, adjust, align;
19710 tree t, u, cond1, cond2;
19712 indirect_p = pass_va_arg_by_reference (type);
19713 if (indirect_p)
19714 type = build_pointer_type (type);
19716 mode = TYPE_MODE (type);
19718 f_stack = TYPE_FIELDS (va_list_type_node);
19719 f_grtop = DECL_CHAIN (f_stack);
19720 f_vrtop = DECL_CHAIN (f_grtop);
19721 f_groff = DECL_CHAIN (f_vrtop);
19722 f_vroff = DECL_CHAIN (f_groff);
19724 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
19725 f_stack, NULL_TREE);
19726 size = int_size_in_bytes (type);
19728 unsigned int abi_break;
19729 unsigned int abi_break_packed;
19730 align
19731 = aarch64_function_arg_alignment (mode, type, &abi_break, &abi_break_packed)
19732 / BITS_PER_UNIT;
19734 dw_align = false;
19735 adjust = 0;
19736 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
19737 &is_ha, false))
19739 /* No frontends can create types with variable-sized modes, so we
19740 shouldn't be asked to pass or return them. */
19741 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
19743 /* TYPE passed in fp/simd registers. */
19744 if (!TARGET_FLOAT)
19745 aarch64_err_no_fpadvsimd (mode);
19747 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
19748 unshare_expr (valist), f_vrtop, NULL_TREE);
19749 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
19750 unshare_expr (valist), f_vroff, NULL_TREE);
19752 rsize = nregs * UNITS_PER_VREG;
19754 if (is_ha)
19756 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
19757 adjust = UNITS_PER_VREG - ag_size;
19759 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19760 && size < UNITS_PER_VREG)
19762 adjust = UNITS_PER_VREG - size;
19765 else
19767 /* TYPE passed in general registers. */
19768 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
19769 unshare_expr (valist), f_grtop, NULL_TREE);
19770 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
19771 unshare_expr (valist), f_groff, NULL_TREE);
19772 rsize = ROUND_UP (size, UNITS_PER_WORD);
19773 nregs = rsize / UNITS_PER_WORD;
19775 if (align <= 8 && abi_break_packed && warn_psabi)
19776 inform (input_location, "parameter passing for argument of type "
19777 "%qT changed in GCC 13.1", type);
19779 if (align > 8)
19781 if (abi_break && warn_psabi)
19782 inform (input_location, "parameter passing for argument of type "
19783 "%qT changed in GCC 9.1", type);
19784 dw_align = true;
19787 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19788 && size < UNITS_PER_WORD)
19790 adjust = UNITS_PER_WORD - size;
19794 /* Get a local temporary for the field value. */
19795 off = get_initialized_tmp_var (f_off, pre_p, NULL);
19797 /* Emit code to branch if off >= 0. */
19798 t = build2 (GE_EXPR, boolean_type_node, off,
19799 build_int_cst (TREE_TYPE (off), 0));
19800 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
19802 if (dw_align)
19804 /* Emit: offs = (offs + 15) & -16. */
19805 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19806 build_int_cst (TREE_TYPE (off), 15));
19807 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
19808 build_int_cst (TREE_TYPE (off), -16));
19809 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
19811 else
19812 roundup = NULL;
19814 /* Update ap.__[g|v]r_offs */
19815 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
19816 build_int_cst (TREE_TYPE (off), rsize));
19817 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
19819 /* String up. */
19820 if (roundup)
19821 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19823 /* [cond2] if (ap.__[g|v]r_offs > 0) */
19824 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
19825 build_int_cst (TREE_TYPE (f_off), 0));
19826 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
19828 /* String up: make sure the assignment happens before the use. */
19829 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
19830 COND_EXPR_ELSE (cond1) = t;
19832 /* Prepare the trees handling the argument that is passed on the stack;
19833 the top-level node will be stored in ON_STACK. */
19834 arg = get_initialized_tmp_var (stack, pre_p, NULL);
19835 if (align > 8)
19837 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
19838 t = fold_build_pointer_plus_hwi (arg, 15);
19839 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19840 build_int_cst (TREE_TYPE (t), -16));
19841 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
19843 else
19844 roundup = NULL;
19845 /* Advance ap.__stack */
19846 t = fold_build_pointer_plus_hwi (arg, size + 7);
19847 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
19848 build_int_cst (TREE_TYPE (t), -8));
19849 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
19850 /* String up roundup and advance. */
19851 if (roundup)
19852 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
19853 /* String up with arg */
19854 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
19855 /* Big-endianness related address adjustment. */
19856 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
19857 && size < UNITS_PER_WORD)
19859 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
19860 size_int (UNITS_PER_WORD - size));
19861 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
19864 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
19865 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
19867 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
19868 t = off;
19869 if (adjust)
19870 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
19871 build_int_cst (TREE_TYPE (off), adjust));
19873 t = fold_convert (sizetype, t);
19874 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
19876 if (is_ha)
19878 /* type ha; // treat as "struct {ftype field[n];}"
19879 ... [computing offs]
19880 for (i = 0; i < nregs; ++i, offs += 16)
19881 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
19882 return ha; */
19883 int i;
19884 tree tmp_ha, field_t, field_ptr_t;
19886 /* Declare a local variable. */
19887 tmp_ha = create_tmp_var_raw (type, "ha");
19888 gimple_add_tmp_var (tmp_ha);
19890 /* Establish the base type. */
19891 switch (ag_mode)
19893 case E_SFmode:
19894 field_t = float_type_node;
19895 field_ptr_t = float_ptr_type_node;
19896 break;
19897 case E_DFmode:
19898 field_t = double_type_node;
19899 field_ptr_t = double_ptr_type_node;
19900 break;
19901 case E_TFmode:
19902 field_t = long_double_type_node;
19903 field_ptr_t = long_double_ptr_type_node;
19904 break;
19905 case E_SDmode:
19906 field_t = dfloat32_type_node;
19907 field_ptr_t = build_pointer_type (dfloat32_type_node);
19908 break;
19909 case E_DDmode:
19910 field_t = dfloat64_type_node;
19911 field_ptr_t = build_pointer_type (dfloat64_type_node);
19912 break;
19913 case E_TDmode:
19914 field_t = dfloat128_type_node;
19915 field_ptr_t = build_pointer_type (dfloat128_type_node);
19916 break;
19917 case E_HFmode:
19918 field_t = aarch64_fp16_type_node;
19919 field_ptr_t = aarch64_fp16_ptr_type_node;
19920 break;
19921 case E_BFmode:
19922 field_t = bfloat16_type_node;
19923 field_ptr_t = aarch64_bf16_ptr_type_node;
19924 break;
19925 case E_V2SImode:
19926 case E_V4SImode:
19928 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
19929 field_t = build_vector_type_for_mode (innertype, ag_mode);
19930 field_ptr_t = build_pointer_type (field_t);
19932 break;
19933 default:
19934 gcc_assert (0);
19937 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
19938 TREE_ADDRESSABLE (tmp_ha) = 1;
19939 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
19940 addr = t;
19941 t = fold_convert (field_ptr_t, addr);
19942 t = build2 (MODIFY_EXPR, field_t,
19943 build1 (INDIRECT_REF, field_t, tmp_ha),
19944 build1 (INDIRECT_REF, field_t, t));
19946 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
19947 for (i = 1; i < nregs; ++i)
19949 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
19950 u = fold_convert (field_ptr_t, addr);
19951 u = build2 (MODIFY_EXPR, field_t,
19952 build2 (MEM_REF, field_t, tmp_ha,
19953 build_int_cst (field_ptr_t,
19954 (i *
19955 int_size_in_bytes (field_t)))),
19956 build1 (INDIRECT_REF, field_t, u));
19957 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
19960 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
19961 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
19964 COND_EXPR_ELSE (cond2) = t;
19965 addr = fold_convert (build_pointer_type (type), cond1);
19966 addr = build_va_arg_indirect_ref (addr);
19968 if (indirect_p)
19969 addr = build_va_arg_indirect_ref (addr);
19971 return addr;
19974 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
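/* The hook below dumps the unnamed argument registers that are still
   available to a variadic function into a register save area just
   below virtual_incoming_args_rtx.  As an illustration, if two GP
   argument registers (x6, x7) and one FP/SIMD register (q7) remain
   unnamed, the save area is laid out (higher addresses at the top) as:

	+-------------------+ <-- virtual_incoming_args_rtx (__gr_top)
	| x7                |
	| x6                |   GP save area, 8 bytes per register
	+-------------------+ <-- 16-byte aligned boundary (__vr_top)
	| q7                |   FP/SIMD save area, 16 bytes per register
	+-------------------+

   __gr_top and __vr_top in the va_list then point at the top of the
   respective areas.  */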
19976 static void
19977 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
19978 const function_arg_info &arg,
19979 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
19981 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
19982 CUMULATIVE_ARGS local_cum;
19983 int gr_saved = cfun->va_list_gpr_size;
19984 int vr_saved = cfun->va_list_fpr_size;
19986 /* The caller has advanced CUM up to, but not beyond, the last named
19987 argument. Advance a local copy of CUM past the last "real" named
19988 argument, to find out how many registers are left over. */
19989 local_cum = *cum;
19990 if (!TYPE_NO_NAMED_ARGS_STDARG_P (TREE_TYPE (current_function_decl)))
19991 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
19993 /* Find out how many registers we need to save.
19994 Honor tree-stdarg analysis results. */
19995 if (cfun->va_list_gpr_size)
19996 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
19997 cfun->va_list_gpr_size / UNITS_PER_WORD);
19998 if (cfun->va_list_fpr_size)
19999 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
20000 cfun->va_list_fpr_size / UNITS_PER_VREG);
20002 if (!TARGET_FLOAT)
20004 gcc_assert (local_cum.aapcs_nvrn == 0);
20005 vr_saved = 0;
20008 if (!no_rtl)
20010 if (gr_saved > 0)
20012 rtx ptr, mem;
20014 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
20015 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
20016 - gr_saved * UNITS_PER_WORD);
20017 mem = gen_frame_mem (BLKmode, ptr);
20018 set_mem_alias_set (mem, get_varargs_alias_set ());
20020 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
20021 mem, gr_saved);
20023 if (vr_saved > 0)
20025 /* We can't use move_block_from_reg, because it will use
20026 the wrong mode, storing D regs only. */
20027 machine_mode mode = TImode;
20028 int off, i, vr_start;
20030 /* Set OFF to the offset from virtual_incoming_args_rtx of
20031 the first vector register. The VR save area lies below
20032 the GR one, and is aligned to 16 bytes. */
20033 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
20034 STACK_BOUNDARY / BITS_PER_UNIT);
20035 off -= vr_saved * UNITS_PER_VREG;
20037 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
20038 for (i = 0; i < vr_saved; ++i)
20040 rtx ptr, mem;
20042 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
20043 mem = gen_frame_mem (mode, ptr);
20044 set_mem_alias_set (mem, get_varargs_alias_set ());
20045 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
20046 off += UNITS_PER_VREG;
20051 /* We don't save the size into *PRETEND_SIZE because we want to avoid
20052 any complication of having crtl->args.pretend_args_size changed. */
20053 cfun->machine->frame.saved_varargs_size
20054 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
20055 STACK_BOUNDARY / BITS_PER_UNIT)
20056 + vr_saved * UNITS_PER_VREG);
20059 static void
20060 aarch64_conditional_register_usage (void)
20062 int i;
20063 if (!TARGET_FLOAT)
20065 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
20067 fixed_regs[i] = 1;
20068 call_used_regs[i] = 1;
20069 CLEAR_HARD_REG_BIT (operand_reg_set, i);
20072 if (!TARGET_SVE)
20073 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
20075 fixed_regs[i] = 1;
20076 call_used_regs[i] = 1;
20079 /* Only allow the FFR and FFRT to be accessed via special patterns. */
20080 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
20081 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
20083 /* When tracking speculation, we need a couple of call-clobbered registers
20084 to track the speculation state. It would be nice to just use
20085 IP0 and IP1, but currently there are numerous places that just
20086 assume these registers are free for other uses (e.g. pointer
20087 authentication). */
20088 if (aarch64_track_speculation)
20090 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
20091 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
20092 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20093 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
20097 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
20099 bool
20100 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
20102 /* For records we're passed a FIELD_DECL, for arrays we're passed
20103 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
20104 const_tree type = TREE_TYPE (field_or_array);
20106 /* Assign BLKmode to anything that contains multiple SVE predicates.
20107 For structures, the "multiple" case is indicated by MODE being
20108 VOIDmode. */
20109 unsigned int num_zr, num_pr;
20110 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
20112 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
20113 return !simple_cst_equal (TYPE_SIZE (field_or_array),
20114 TYPE_SIZE (type));
20115 return mode == VOIDmode;
20118 return default_member_type_forces_blk (field_or_array, mode);
20121 /* Bitmasks that indicate whether earlier versions of GCC would have
20122 taken a different path through the ABI logic. This should result in
20123 a -Wpsabi warning if the earlier path led to a different ABI decision.
20125 WARN_PSABI_EMPTY_CXX17_BASE
20126 Indicates that the type includes an artificial empty C++17 base field
20127 that, prior to GCC 10.1, would prevent the type from being treated as
20128 a HFA or HVA. See PR94383 for details.
20130 WARN_PSABI_NO_UNIQUE_ADDRESS
20131 Indicates that the type includes an empty [[no_unique_address]] field
20132 that, prior to GCC 10.1, would prevent the type from being treated as
20133 a HFA or HVA. */
20134 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
20135 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
20136 const unsigned int WARN_PSABI_ZERO_WIDTH_BITFIELD = 1U << 2;
20138 /* Walk down the type tree of TYPE counting consecutive base elements.
20139 If *MODEP is VOIDmode, then set it to the first valid floating point
20140 type. If a non-floating point type is found, or if a floating point
20141 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
20142 otherwise return the count in the sub-tree.
20144 The WARN_PSABI_FLAGS argument allows the caller to check whether this
20145 function has changed its behavior relative to earlier versions of GCC.
20146 Normally the argument should be nonnull and point to a zero-initialized
20147 variable. The function then records whether the ABI decision might
20148 be affected by a known fix to the ABI logic, setting the associated
20149 WARN_PSABI_* bits if so.
20151 When the argument is instead a null pointer, the function tries to
20152 simulate the behavior of GCC before all such ABI fixes were made.
20153 This is useful to check whether the function returns something
20154 different after the ABI fixes. */
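/* For example, given

     struct quat { float w, x, y, z; };

   the walk finds four consecutive SFmode elements, so the function
   returns 4 with *MODEP set to SFmode and the type is treated as a
   homogeneous floating-point aggregate.  A structure that mixes a
   float with a double, or that contains any non-floating-point
   member, makes the function return -1.  */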
20155 static int
20156 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
20157 unsigned int *warn_psabi_flags)
20159 machine_mode mode;
20160 HOST_WIDE_INT size;
20162 if (aarch64_sve::builtin_type_p (type))
20163 return -1;
20165 switch (TREE_CODE (type))
20167 case REAL_TYPE:
20168 mode = TYPE_MODE (type);
20169 if (mode != DFmode && mode != SFmode
20170 && mode != TFmode && mode != HFmode
20171 && mode != SDmode && mode != DDmode && mode != TDmode)
20172 return -1;
20174 if (*modep == VOIDmode)
20175 *modep = mode;
20177 if (*modep == mode)
20178 return 1;
20180 break;
20182 case COMPLEX_TYPE:
20183 mode = TYPE_MODE (TREE_TYPE (type));
20184 if (mode != DFmode && mode != SFmode
20185 && mode != TFmode && mode != HFmode)
20186 return -1;
20188 if (*modep == VOIDmode)
20189 *modep = mode;
20191 if (*modep == mode)
20192 return 2;
20194 break;
20196 case VECTOR_TYPE:
20197 /* Use V2SImode and V4SImode as representatives of all 64-bit
20198 and 128-bit vector types. */
20199 size = int_size_in_bytes (type);
20200 switch (size)
20202 case 8:
20203 mode = V2SImode;
20204 break;
20205 case 16:
20206 mode = V4SImode;
20207 break;
20208 default:
20209 return -1;
20212 if (*modep == VOIDmode)
20213 *modep = mode;
20215 /* Vector modes are considered to be opaque: two vectors are
20216 equivalent for the purposes of being homogeneous aggregates
20217 if they are the same size. */
20218 if (*modep == mode)
20219 return 1;
20221 break;
20223 case ARRAY_TYPE:
20225 int count;
20226 tree index = TYPE_DOMAIN (type);
20228 /* Can't handle incomplete types nor sizes that are not
20229 fixed. */
20230 if (!COMPLETE_TYPE_P (type)
20231 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20232 return -1;
20234 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
20235 warn_psabi_flags);
20236 if (count == -1
20237 || !index
20238 || !TYPE_MAX_VALUE (index)
20239 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
20240 || !TYPE_MIN_VALUE (index)
20241 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
20242 || count < 0)
20243 return -1;
20245 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
20246 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
20248 /* There must be no padding. */
20249 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20250 count * GET_MODE_BITSIZE (*modep)))
20251 return -1;
20253 return count;
20256 case RECORD_TYPE:
20258 int count = 0;
20259 int sub_count;
20260 tree field;
20262 /* Can't handle incomplete types nor sizes that are not
20263 fixed. */
20264 if (!COMPLETE_TYPE_P (type)
20265 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20266 return -1;
20268 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20270 if (TREE_CODE (field) != FIELD_DECL)
20271 continue;
20273 if (DECL_FIELD_ABI_IGNORED (field))
20275 /* See whether this is something that earlier versions of
20276 GCC failed to ignore. */
20277 unsigned int flag;
20278 if (lookup_attribute ("no_unique_address",
20279 DECL_ATTRIBUTES (field)))
20280 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
20281 else if (cxx17_empty_base_field_p (field))
20282 flag = WARN_PSABI_EMPTY_CXX17_BASE;
20283 else
20284 /* No compatibility problem. */
20285 continue;
20287 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
20288 if (warn_psabi_flags)
20290 *warn_psabi_flags |= flag;
20291 continue;
20294 /* A zero-width bitfield may affect layout in some
20295 circumstances, but adds no members. The determination
20296 of whether or not a type is an HFA is performed after
20297 layout is complete, so if the type still looks like an
20298 HFA afterwards, it is still classed as one. This is
20299 potentially an ABI break for the hard-float ABI. */
20300 else if (DECL_BIT_FIELD (field)
20301 && integer_zerop (DECL_SIZE (field)))
20303 /* Prior to GCC-12 these fields were stripped early,
20304 hiding them from the back-end entirely and
20305 resulting in the correct behaviour for argument
20306 passing. Simulate that old behaviour without
20307 generating a warning. */
20308 if (DECL_FIELD_CXX_ZERO_WIDTH_BIT_FIELD (field))
20309 continue;
20310 if (warn_psabi_flags)
20312 *warn_psabi_flags |= WARN_PSABI_ZERO_WIDTH_BITFIELD;
20313 continue;
20317 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20318 warn_psabi_flags);
20319 if (sub_count < 0)
20320 return -1;
20321 count += sub_count;
20324 /* There must be no padding. */
20325 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20326 count * GET_MODE_BITSIZE (*modep)))
20327 return -1;
20329 return count;
20332 case UNION_TYPE:
20333 case QUAL_UNION_TYPE:
20335 /* These aren't very interesting except in a degenerate case. */
20336 int count = 0;
20337 int sub_count;
20338 tree field;
20340 /* Can't handle incomplete types nor sizes that are not
20341 fixed. */
20342 if (!COMPLETE_TYPE_P (type)
20343 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
20344 return -1;
20346 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
20348 if (TREE_CODE (field) != FIELD_DECL)
20349 continue;
20351 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
20352 warn_psabi_flags);
20353 if (sub_count < 0)
20354 return -1;
20355 count = count > sub_count ? count : sub_count;
20358 /* There must be no padding. */
20359 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
20360 count * GET_MODE_BITSIZE (*modep)))
20361 return -1;
20363 return count;
20366 default:
20367 break;
20370 return -1;
20373 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
20374 type as described in AAPCS64 \S 4.1.2.
20376 See the comment above aarch64_composite_type_p for the notes on MODE. */
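/* For example, the arm_neon.h types int32x2_t (V2SImode, 8 bytes) and
   float32x4_t (V4SFmode, 16 bytes) are short vectors in this sense,
   whereas SVE vector types such as svint32_t are not.  */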
20378 static bool
20379 aarch64_short_vector_p (const_tree type,
20380 machine_mode mode)
20382 poly_int64 size = -1;
20384 if (type && TREE_CODE (type) == VECTOR_TYPE)
20386 if (aarch64_sve::builtin_type_p (type))
20387 return false;
20388 size = int_size_in_bytes (type);
20390 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
20391 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
20393 /* The containing "else if" is too loose: it means that we look at TYPE
20394 if the type is a vector type (good), but that we otherwise ignore TYPE
20395 and look only at the mode. This is wrong because the type describes
20396 the language-level information whereas the mode is purely an internal
20397 GCC concept. We can therefore reach here for types that are not
20398 vectors in the AAPCS64 sense.
20400 We can't "fix" that for the traditional Advanced SIMD vector modes
20401 without breaking backwards compatibility. However, there's no such
20402 baggage for the structure modes, which were introduced in GCC 12. */
20403 if (aarch64_advsimd_struct_mode_p (mode))
20404 return false;
20406 /* For similar reasons, rely only on the type, not the mode, when
20407 processing SVE types. */
20408 if (type && aarch64_some_values_include_pst_objects_p (type))
20409 /* Leave later code to report an error if SVE is disabled. */
20410 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
20411 else
20412 size = GET_MODE_SIZE (mode);
20414 if (known_eq (size, 8) || known_eq (size, 16))
20416 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
20417 they are being treated as scalable AAPCS64 types. */
20418 gcc_assert (!aarch64_sve_mode_p (mode)
20419 && !aarch64_advsimd_struct_mode_p (mode));
20420 return true;
20422 return false;
20425 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
20426 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
20427 array types. The C99 floating-point complex types are also considered
20428 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
20429 types, which are GCC extensions and out of the scope of AAPCS64, are
20430 treated as composite types here as well.
20432 Note that MODE itself is not sufficient in determining whether a type
20433 is such a composite type or not. This is because
20434 stor-layout.cc:compute_record_mode may have already changed the MODE
20435 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
20436 structure with only one field may have its MODE set to the mode of the
20437 field. Also an integer mode whose size matches the size of the
20438 RECORD_TYPE type may be used to substitute the original mode
20439 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
20440 solely relied on. */
20442 static bool
20443 aarch64_composite_type_p (const_tree type,
20444 machine_mode mode)
20446 if (aarch64_short_vector_p (type, mode))
20447 return false;
20449 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
20450 return true;
20452 if (mode == BLKmode
20453 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
20454 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
20455 return true;
20457 return false;
20460 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
20461 shall be passed or returned in simd/fp register(s) (providing these
20462 parameter passing registers are available).
20464 Upon successful return, *COUNT returns the number of needed registers,
20465 *BASE_MODE returns the mode of the individual register and when IS_HA
20466 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
20467 floating-point aggregate or a homogeneous short-vector aggregate.
20469 SILENT_P is true if the function should refrain from reporting any
20470 diagnostics. This should only be used if the caller is certain that
20471 any ABI decisions would eventually come through this function with
20472 SILENT_P set to false. */
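/* For example:
     - float and double arguments are trivially candidates, with
       *COUNT == 1;
     - _Complex double is passed in two D registers, so *COUNT == 2,
       *BASE_MODE == DFmode and *IS_HA is set;
     - struct { double d[3]; } is a homogeneous floating-point
       aggregate: *COUNT == 3, *BASE_MODE == DFmode, *IS_HA is set;
     - struct { double d[5]; } exceeds HA_MAX_NUM_FLDS (4) and is not
       a candidate.  */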
20474 static bool
20475 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
20476 const_tree type,
20477 machine_mode *base_mode,
20478 int *count,
20479 bool *is_ha,
20480 bool silent_p)
20482 if (is_ha != NULL) *is_ha = false;
20484 machine_mode new_mode = VOIDmode;
20485 bool composite_p = aarch64_composite_type_p (type, mode);
20487 if ((!composite_p
20488 && (GET_MODE_CLASS (mode) == MODE_FLOAT
20489 || GET_MODE_CLASS (mode) == MODE_DECIMAL_FLOAT))
20490 || aarch64_short_vector_p (type, mode))
20492 *count = 1;
20493 new_mode = mode;
20495 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
20497 if (is_ha != NULL) *is_ha = true;
20498 *count = 2;
20499 new_mode = GET_MODE_INNER (mode);
20501 else if (type && composite_p)
20503 unsigned int warn_psabi_flags = 0;
20504 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
20505 &warn_psabi_flags);
20506 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
20508 static unsigned last_reported_type_uid;
20509 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
20510 int alt;
20511 if (!silent_p
20512 && warn_psabi
20513 && warn_psabi_flags
20514 && uid != last_reported_type_uid
20515 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
20516 != ag_count))
20518 const char *url10
20519 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
20520 const char *url12
20521 = CHANGES_ROOT_URL "gcc-12/changes.html#zero_width_bitfields";
20522 gcc_assert (alt == -1);
20523 last_reported_type_uid = uid;
20524 /* Use TYPE_MAIN_VARIANT to strip any redundant const
20525 qualification. */
20526 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
20527 inform (input_location, "parameter passing for argument of "
20528 "type %qT with %<[[no_unique_address]]%> members "
20529 "changed %{in GCC 10.1%}",
20530 TYPE_MAIN_VARIANT (type), url10);
20531 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
20532 inform (input_location, "parameter passing for argument of "
20533 "type %qT when C++17 is enabled changed to match "
20534 "C++14 %{in GCC 10.1%}",
20535 TYPE_MAIN_VARIANT (type), url10);
20536 else if (warn_psabi_flags & WARN_PSABI_ZERO_WIDTH_BITFIELD)
20537 inform (input_location, "parameter passing for argument of "
20538 "type %qT changed %{in GCC 12.1%}",
20539 TYPE_MAIN_VARIANT (type), url12);
20542 if (is_ha != NULL) *is_ha = true;
20543 *count = ag_count;
20545 else
20546 return false;
20548 else
20549 return false;
20551 gcc_assert (!aarch64_sve_mode_p (new_mode));
20552 *base_mode = new_mode;
20553 return true;
20556 /* Implement TARGET_STRUCT_VALUE_RTX. */
20558 static rtx
20559 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
20560 int incoming ATTRIBUTE_UNUSED)
20562 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
20565 /* Implements target hook vector_mode_supported_p. */
20566 static bool
20567 aarch64_vector_mode_supported_p (machine_mode mode)
20569 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
20570 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
20573 /* Return the full-width SVE vector mode for element mode MODE, if one
20574 exists. */
20575 opt_machine_mode
20576 aarch64_full_sve_mode (scalar_mode mode)
20578 switch (mode)
20580 case E_DFmode:
20581 return VNx2DFmode;
20582 case E_SFmode:
20583 return VNx4SFmode;
20584 case E_HFmode:
20585 return VNx8HFmode;
20586 case E_BFmode:
20587 return VNx8BFmode;
20588 case E_DImode:
20589 return VNx2DImode;
20590 case E_SImode:
20591 return VNx4SImode;
20592 case E_HImode:
20593 return VNx8HImode;
20594 case E_QImode:
20595 return VNx16QImode;
20596 default:
20597 return opt_machine_mode ();
20601 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
20602 if it exists. */
20603 opt_machine_mode
20604 aarch64_vq_mode (scalar_mode mode)
20606 switch (mode)
20608 case E_DFmode:
20609 return V2DFmode;
20610 case E_SFmode:
20611 return V4SFmode;
20612 case E_HFmode:
20613 return V8HFmode;
20614 case E_BFmode:
20615 return V8BFmode;
20616 case E_SImode:
20617 return V4SImode;
20618 case E_HImode:
20619 return V8HImode;
20620 case E_QImode:
20621 return V16QImode;
20622 case E_DImode:
20623 return V2DImode;
20624 default:
20625 return opt_machine_mode ();
20629 /* Return appropriate SIMD container
20630 for MODE within a vector of WIDTH bits. */
20631 static machine_mode
20632 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
20634 if (TARGET_SVE
20635 && maybe_ne (width, 128)
20636 && known_eq (width, BITS_PER_SVE_VECTOR))
20637 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20639 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
20640 if (TARGET_SIMD)
20642 if (known_eq (width, 128))
20643 return aarch64_vq_mode (mode).else_mode (word_mode);
20644 else
20645 switch (mode)
20647 case E_SFmode:
20648 return V2SFmode;
20649 case E_HFmode:
20650 return V4HFmode;
20651 case E_BFmode:
20652 return V4BFmode;
20653 case E_SImode:
20654 return V2SImode;
20655 case E_HImode:
20656 return V4HImode;
20657 case E_QImode:
20658 return V8QImode;
20659 default:
20660 break;
20663 return word_mode;
20666 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
20667 and return whether the SVE mode should be preferred over the
20668 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
20669 static bool
20670 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
20672 /* Take into account the aarch64-autovec-preference param if non-zero. */
20673 bool only_asimd_p = aarch64_autovec_preference == 1;
20674 bool only_sve_p = aarch64_autovec_preference == 2;
20676 if (only_asimd_p)
20677 return false;
20678 if (only_sve_p)
20679 return true;
20681 /* The preference in case of a tie in costs. */
20682 bool prefer_asimd = aarch64_autovec_preference == 3;
20683 bool prefer_sve = aarch64_autovec_preference == 4;
20685 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
20686 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
20687 /* If the CPU information does not have an SVE width registered, use the
20688 generic poly_int comparison that prefers SVE. If a preference is
20689 explicitly requested, avoid this path. */
20690 if (aarch64_tune_params.sve_width == SVE_SCALABLE
20691 && !prefer_asimd
20692 && !prefer_sve)
20693 return maybe_gt (nunits_sve, nunits_asimd);
20695 /* Otherwise estimate the runtime width of the modes involved. */
20696 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
20697 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
20699 /* Preferring SVE means picking it first unless the Advanced SIMD mode
20700 is clearly wider. */
20701 if (prefer_sve)
20702 return est_sve >= est_asimd;
20703 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
20704 is clearly wider. */
20705 if (prefer_asimd)
20706 return est_sve > est_asimd;
20708 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
20709 return est_sve > est_asimd;
20712 /* Return 128-bit container as the preferred SIMD mode for MODE. */
20713 static machine_mode
20714 aarch64_preferred_simd_mode (scalar_mode mode)
20716 /* Take into account explicit auto-vectorization ISA preferences through
20717 aarch64_cmp_autovec_modes. */
20718 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
20719 return aarch64_full_sve_mode (mode).else_mode (word_mode);
20720 if (TARGET_SIMD)
20721 return aarch64_vq_mode (mode).else_mode (word_mode);
20722 return word_mode;
20725 /* Return a list of possible vector sizes for the vectorizer
20726 to iterate over. */
20727 static unsigned int
20728 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
20730 static const machine_mode sve_modes[] = {
20731 /* Try using full vectors for all element types. */
20732 VNx16QImode,
20734 /* Try using 16-bit containers for 8-bit elements and full vectors
20735 for wider elements. */
20736 VNx8QImode,
20738 /* Try using 32-bit containers for 8-bit and 16-bit elements and
20739 full vectors for wider elements. */
20740 VNx4QImode,
20742 /* Try using 64-bit containers for all element types. */
20743 VNx2QImode
20746 static const machine_mode advsimd_modes[] = {
20747 /* Try using 128-bit vectors for all element types. */
20748 V16QImode,
20750 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
20751 for wider elements. */
20752 V8QImode,
20754 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
20755 for wider elements.
20757 TODO: We could support a limited form of V4QImode too, so that
20758 we use 32-bit vectors for 8-bit elements. */
20759 V4HImode,
20761 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
20762 for 64-bit elements.
20764 TODO: We could similarly support limited forms of V2QImode and V2HImode
20765 for this case. */
20766 V2SImode
20769 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
20770 This is because:
20772 - If we can't use N-byte Advanced SIMD vectors then the placement
20773 doesn't matter; we'll just continue as though the Advanced SIMD
20774 entry didn't exist.
20776 - If an SVE main loop with N bytes ends up being cheaper than an
20777 Advanced SIMD main loop with N bytes then by default we'll replace
20778 the Advanced SIMD version with the SVE one.
20780 - If an Advanced SIMD main loop with N bytes ends up being cheaper
20781 than an SVE main loop with N bytes then by default we'll try to
20782 use the SVE loop to vectorize the epilogue instead. */
20784 bool only_asimd_p = aarch64_autovec_preference == 1;
20785 bool only_sve_p = aarch64_autovec_preference == 2;
20787 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
20788 unsigned int advsimd_i = 0;
20790 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
20792 if (sve_i < ARRAY_SIZE (sve_modes)
20793 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
20794 advsimd_modes[advsimd_i]))
20795 modes->safe_push (sve_modes[sve_i++]);
20796 else
20797 modes->safe_push (advsimd_modes[advsimd_i++]);
20799 while (sve_i < ARRAY_SIZE (sve_modes))
20800 modes->safe_push (sve_modes[sve_i++]);
20802 unsigned int flags = 0;
20803 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
20804 can compare SVE against Advanced SIMD and so that we can compare
20805 multiple SVE vectorization approaches against each other. There's
20806 not really any point doing this for Advanced SIMD only, since the
20807 first mode that works should always be the best. */
20808 if (TARGET_SVE && aarch64_sve_compare_costs)
20809 flags |= VECT_COMPARE_COSTS;
20810 return flags;
20813 /* Implement TARGET_MANGLE_TYPE. */
20815 static const char *
20816 aarch64_mangle_type (const_tree type)
20818 /* The AArch64 ABI documents say that "__va_list" has to be
20819 mangled as if it is in the "std" namespace. */
20820 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
20821 return "St9__va_list";
20823 /* Half-precision floating point types. */
20824 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
20826 if (TYPE_MAIN_VARIANT (type) == float16_type_node)
20827 return NULL;
20828 if (TYPE_MODE (type) == BFmode)
20829 return "u6__bf16";
20830 else
20831 return "Dh";
20834 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
20835 builtin types. */
20836 if (TYPE_NAME (type) != NULL)
20838 const char *res;
20839 if ((res = aarch64_general_mangle_builtin_type (type))
20840 || (res = aarch64_sve::mangle_builtin_type (type)))
20841 return res;
20844 /* Use the default mangling. */
20845 return NULL;
20848 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
20850 static bool
20851 aarch64_verify_type_context (location_t loc, type_context_kind context,
20852 const_tree type, bool silent_p)
20854 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
20857 /* Find the first rtx_insn before insn that will generate an assembly
20858 instruction. */
20860 static rtx_insn *
20861 aarch64_prev_real_insn (rtx_insn *insn)
20863 if (!insn)
20864 return NULL;
20866 do
20868 insn = prev_real_insn (insn);
20870 while (insn && recog_memoized (insn) < 0);
20872 return insn;
20875 static bool
20876 is_madd_op (enum attr_type t1)
20878 unsigned int i;
20879 /* A number of these may be AArch32 only. */
20880 enum attr_type mlatypes[] = {
20881 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
20882 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
20883 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
20886 for (i = 0; i < ARRAY_SIZE (mlatypes); i++)
20888 if (t1 == mlatypes[i])
20889 return true;
20892 return false;
20895 /* Check if there is a register dependency between a load and the insn
20896 for which we hold recog_data. */
20898 static bool
20899 dep_between_memop_and_curr (rtx memop)
20901 rtx load_reg;
20902 int opno;
20904 gcc_assert (GET_CODE (memop) == SET);
20906 if (!REG_P (SET_DEST (memop)))
20907 return false;
20909 load_reg = SET_DEST (memop);
20910 for (opno = 1; opno < recog_data.n_operands; opno++)
20912 rtx operand = recog_data.operand[opno];
20913 if (REG_P (operand)
20914 && reg_overlap_mentioned_p (load_reg, operand))
20915 return true;
20918 return false;
20922 /* When working around the Cortex-A53 erratum 835769,
20923 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
20924 instruction and has a preceding memory instruction such that a NOP
20925 should be inserted between them. */
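/* For example, when compiling with -mfix-cortex-a53-835769 a sequence
   that would otherwise be emitted as

	ldr	x1, [x2]
	madd	x3, x4, x5, x6

   is instead emitted as

	ldr	x1, [x2]
	nop	// between mem op and mult-accumulate
	madd	x3, x4, x5, x6

   because the load and the 64-bit multiply-accumulate would otherwise
   be adjacent with no register dependency between them.  */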
20927 bool
20928 aarch64_madd_needs_nop (rtx_insn* insn)
20930 enum attr_type attr_type;
20931 rtx_insn *prev;
20932 rtx body;
20934 if (!TARGET_FIX_ERR_A53_835769)
20935 return false;
20937 if (!INSN_P (insn) || recog_memoized (insn) < 0)
20938 return false;
20940 attr_type = get_attr_type (insn);
20941 if (!is_madd_op (attr_type))
20942 return false;
20944 prev = aarch64_prev_real_insn (insn);
20945 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
20946 Restore recog state to INSN to avoid state corruption. */
20947 extract_constrain_insn_cached (insn);
20949 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
20950 return false;
20952 body = single_set (prev);
20954 /* If the previous insn is a memory op and there is no dependency between
20955 it and the DImode madd, emit a NOP between them. If body is NULL then we
20956 have a complex memory operation, probably a load/store pair.
20957 Be conservative for now and emit a NOP. */
20958 if (GET_MODE (recog_data.operand[0]) == DImode
20959 && (!body || !dep_between_memop_and_curr (body)))
20960 return true;
20962 return false;
20967 /* Implement FINAL_PRESCAN_INSN. */
20969 void
20970 aarch64_final_prescan_insn (rtx_insn *insn)
20972 if (aarch64_madd_needs_nop (insn))
20973 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
20977 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
20978 instruction. */
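/* For example, INDEX Z0.S, #-16, #15 uses the extreme immediate values;
   a base or step outside [-16, 15] has to be moved into a scalar
   register first.  */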
20980 bool
20981 aarch64_sve_index_immediate_p (rtx base_or_step)
20983 return (CONST_INT_P (base_or_step)
20984 && IN_RANGE (INTVAL (base_or_step), -16, 15));
20987 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
20988 when applied to mode MODE. Negate X first if NEGATE_P is true. */
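/* The instructions accept an unsigned 8-bit immediate, optionally
   shifted left by 8, so after any negation the accepted values are
   0-255 and the multiples of 256 from 256 to 65280.  */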
20990 bool
20991 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
20993 rtx elt = unwrap_const_vec_duplicate (x);
20994 if (!CONST_INT_P (elt))
20995 return false;
20997 HOST_WIDE_INT val = INTVAL (elt);
20998 if (negate_p)
20999 val = -val;
21000 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
21002 if (val & 0xff)
21003 return IN_RANGE (val, 0, 0xff);
21004 return IN_RANGE (val, 0, 0xff00);
21007 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
21008 instructions when applied to mode MODE. Negate X first if NEGATE_P
21009 is true. */
21011 bool
21012 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
21014 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
21015 return false;
21017 /* After the optional negation, the immediate must be nonnegative.
21018 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
21019 instead of SQADD Zn.B, Zn.B, #129. */
21020 rtx elt = unwrap_const_vec_duplicate (x);
21021 return negate_p == (INTVAL (elt) < 0);
21024 /* Return true if X is a valid immediate operand for an SVE logical
21025 instruction such as AND. */
21027 bool
21028 aarch64_sve_bitmask_immediate_p (rtx x)
21030 rtx elt;
21032 return (const_vec_duplicate_p (x, &elt)
21033 && CONST_INT_P (elt)
21034 && aarch64_bitmask_imm (INTVAL (elt),
21035 GET_MODE_INNER (GET_MODE (x))));
21038 /* Return true if X is a valid immediate for the SVE DUP and CPY
21039 instructions. */
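/* DUP and CPY take a signed 8-bit immediate, optionally shifted left
   by 8, so the accepted values are -128 to 127 and the multiples of
   256 from -32768 to 32512.  */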
21041 bool
21042 aarch64_sve_dup_immediate_p (rtx x)
21044 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
21045 if (!CONST_INT_P (x))
21046 return false;
21048 HOST_WIDE_INT val = INTVAL (x);
21049 if (val & 0xff)
21050 return IN_RANGE (val, -0x80, 0x7f);
21051 return IN_RANGE (val, -0x8000, 0x7f00);
21054 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
21055 SIGNED_P says whether the operand is signed rather than unsigned. */
21057 bool
21058 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
21060 x = unwrap_const_vec_duplicate (x);
21061 return (CONST_INT_P (x)
21062 && (signed_p
21063 ? IN_RANGE (INTVAL (x), -16, 15)
21064 : IN_RANGE (INTVAL (x), 0, 127)));
21067 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
21068 instruction. Negate X first if NEGATE_P is true. */
21070 bool
21071 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
21073 rtx elt;
21074 REAL_VALUE_TYPE r;
21076 if (!const_vec_duplicate_p (x, &elt)
21077 || !CONST_DOUBLE_P (elt))
21078 return false;
21080 r = *CONST_DOUBLE_REAL_VALUE (elt);
21082 if (negate_p)
21083 r = real_value_negate (&r);
21085 if (real_equal (&r, &dconst1))
21086 return true;
21087 if (real_equal (&r, &dconsthalf))
21088 return true;
21089 return false;
21092 /* Return true if X is a valid immediate operand for an SVE FMUL
21093 instruction. */
21095 bool
21096 aarch64_sve_float_mul_immediate_p (rtx x)
21098 rtx elt;
21100 return (const_vec_duplicate_p (x, &elt)
21101 && CONST_DOUBLE_P (elt)
21102 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
21103 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
21106 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
21107 for the Advanced SIMD operation described by WHICH and INSN. If INFO
21108 is nonnull, use it to describe valid immediates. */
21109 static bool
21110 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
21111 simd_immediate_info *info,
21112 enum simd_immediate_check which,
21113 simd_immediate_info::insn_type insn)
21115 /* Try a 4-byte immediate with LSL. */
21116 for (unsigned int shift = 0; shift < 32; shift += 8)
21117 if ((val32 & (0xff << shift)) == val32)
21119 if (info)
21120 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21121 simd_immediate_info::LSL, shift);
21122 return true;
21125 /* Try a 2-byte immediate with LSL. */
21126 unsigned int imm16 = val32 & 0xffff;
21127 if (imm16 == (val32 >> 16))
21128 for (unsigned int shift = 0; shift < 16; shift += 8)
21129 if ((imm16 & (0xff << shift)) == imm16)
21131 if (info)
21132 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
21133 simd_immediate_info::LSL, shift);
21134 return true;
21137 /* Try a 4-byte immediate with MSL, except for cases that MVN
21138 can handle. */
21139 if (which == AARCH64_CHECK_MOV)
21140 for (unsigned int shift = 8; shift < 24; shift += 8)
21142 unsigned int low = (1 << shift) - 1;
21143 if (((val32 & (0xff << shift)) | low) == val32)
21145 if (info)
21146 *info = simd_immediate_info (SImode, val32 >> shift, insn,
21147 simd_immediate_info::MSL, shift);
21148 return true;
21152 return false;
21155 /* Return true if replicating VAL64 is a valid immediate for the
21156 Advanced SIMD operation described by WHICH. If INFO is nonnull,
21157 use it to describe valid immediates. */
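/* For example, 0x0000004300000043 is handled as a 32-bit MOVI of 0x43,
   0x4300430043004300 as a 16-bit MOVI of 0x43 shifted left by 8, and a
   value whose individual bytes are all either 0x00 or 0xff (such as
   0x00ff00ffff0000ff) as the 64-bit byte-mask form of MOVI.  */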
21158 static bool
21159 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
21160 simd_immediate_info *info,
21161 enum simd_immediate_check which)
21163 unsigned int val32 = val64 & 0xffffffff;
21164 unsigned int val16 = val64 & 0xffff;
21165 unsigned int val8 = val64 & 0xff;
21167 if (val32 == (val64 >> 32))
21169 if ((which & AARCH64_CHECK_ORR) != 0
21170 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
21171 simd_immediate_info::MOV))
21172 return true;
21174 if ((which & AARCH64_CHECK_BIC) != 0
21175 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
21176 simd_immediate_info::MVN))
21177 return true;
21179 /* Try using a replicated byte. */
21180 if (which == AARCH64_CHECK_MOV
21181 && val16 == (val32 >> 16)
21182 && val8 == (val16 >> 8))
21184 if (info)
21185 *info = simd_immediate_info (QImode, val8);
21186 return true;
21190 /* Try using a bit-to-bytemask. */
21191 if (which == AARCH64_CHECK_MOV)
21193 unsigned int i;
21194 for (i = 0; i < 64; i += 8)
21196 unsigned char byte = (val64 >> i) & 0xff;
21197 if (byte != 0 && byte != 0xff)
21198 break;
21200 if (i == 64)
21202 if (info)
21203 *info = simd_immediate_info (DImode, val64);
21204 return true;
21207 return false;
21210 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
21211 instruction. If INFO is nonnull, use it to describe valid immediates. */
21213 static bool
21214 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
21215 simd_immediate_info *info)
21217 scalar_int_mode mode = DImode;
21218 unsigned int val32 = val64 & 0xffffffff;
21219 if (val32 == (val64 >> 32))
21221 mode = SImode;
21222 unsigned int val16 = val32 & 0xffff;
21223 if (val16 == (val32 >> 16))
21225 mode = HImode;
21226 unsigned int val8 = val16 & 0xff;
21227 if (val8 == (val16 >> 8))
21228 mode = QImode;
21231 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
21232 if (IN_RANGE (val, -0x80, 0x7f))
21234 /* DUP with no shift. */
21235 if (info)
21236 *info = simd_immediate_info (mode, val);
21237 return true;
21239 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
21241 /* DUP with LSL #8. */
21242 if (info)
21243 *info = simd_immediate_info (mode, val);
21244 return true;
21246 if (aarch64_bitmask_imm (val64, mode))
21248 /* DUPM. */
21249 if (info)
21250 *info = simd_immediate_info (mode, val);
21251 return true;
21253 return false;
21256 /* Return true if X is an UNSPEC_PTRUE constant of the form:
21258 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
21260 where PATTERN is the svpattern as a CONST_INT and where ZERO
21261 is a zero constant of the required PTRUE mode (which can have
21262 fewer elements than X's mode, if zero bits are significant).
21264 If so, and if INFO is nonnull, describe the immediate in INFO. */
21265 bool
21266 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
21268 if (GET_CODE (x) != CONST)
21269 return false;
21271 x = XEXP (x, 0);
21272 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
21273 return false;
21275 if (info)
21277 aarch64_svpattern pattern
21278 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
21279 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
21280 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
21281 *info = simd_immediate_info (int_mode, pattern);
21283 return true;
21286 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
21287 it to describe valid immediates. */
21289 static bool
21290 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
21292 if (aarch64_sve_ptrue_svpattern_p (x, info))
21293 return true;
21295 if (x == CONST0_RTX (GET_MODE (x)))
21297 if (info)
21298 *info = simd_immediate_info (DImode, 0);
21299 return true;
21302 /* Analyze the value as a VNx16BImode. This should be relatively
21303 efficient, since rtx_vector_builder has enough built-in capacity
21304 to store all VLA predicate constants without needing the heap. */
21305 rtx_vector_builder builder;
21306 if (!aarch64_get_sve_pred_bits (builder, x))
21307 return false;
21309 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
21310 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
21312 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
21313 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
21314 if (pattern != AARCH64_NUM_SVPATTERNS)
21316 if (info)
21318 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
21319 *info = simd_immediate_info (int_mode, pattern);
21321 return true;
21324 return false;
21327 /* Return true if OP is a valid SIMD immediate for the operation
21328 described by WHICH. If INFO is nonnull, use it to describe valid
21329 immediates. */
21330 bool
21331 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
21332 enum simd_immediate_check which)
21334 machine_mode mode = GET_MODE (op);
21335 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
21336 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
21337 return false;
21339 if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
21340 return false;
21342 if (vec_flags & VEC_SVE_PRED)
21343 return aarch64_sve_pred_valid_immediate (op, info);
21345 scalar_mode elt_mode = GET_MODE_INNER (mode);
21346 rtx base, step;
21347 unsigned int n_elts;
21348 if (CONST_VECTOR_P (op)
21349 && CONST_VECTOR_DUPLICATE_P (op))
21350 n_elts = CONST_VECTOR_NPATTERNS (op);
21351 else if ((vec_flags & VEC_SVE_DATA)
21352 && const_vec_series_p (op, &base, &step))
21354 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
21355 if (!aarch64_sve_index_immediate_p (base)
21356 || !aarch64_sve_index_immediate_p (step))
21357 return false;
21359 if (info)
21361 /* Get the corresponding container mode. E.g. an INDEX on V2SI
21362 should yield two integer values per 128-bit block, meaning
21363 that we need to treat it in the same way as V2DI and then
21364 ignore the upper 32 bits of each element. */
21365 elt_mode = aarch64_sve_container_int_mode (mode);
21366 *info = simd_immediate_info (elt_mode, base, step);
21368 return true;
21370 else if (CONST_VECTOR_P (op)
21371 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
21372 /* N_ELTS set above. */;
21373 else
21374 return false;
21376 scalar_float_mode elt_float_mode;
21377 if (n_elts == 1
21378 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
21380 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
21381 if (aarch64_float_const_zero_rtx_p (elt)
21382 || aarch64_float_const_representable_p (elt))
21384 if (info)
21385 *info = simd_immediate_info (elt_float_mode, elt);
21386 return true;
21390 /* If all elements in an SVE vector have the same value, we have a free
21391 choice between using the element mode and using the container mode.
21392 Using the element mode means that unused parts of the vector are
21393 duplicates of the used elements, while using the container mode means
21394 that the unused parts are an extension of the used elements. Using the
21395 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
21396 for its container mode VNx4SI while 0x00000101 isn't.
21398 If not all elements in an SVE vector have the same value, we need the
21399 transition from one element to the next to occur at container boundaries.
21400 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
21401 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
21402 scalar_int_mode elt_int_mode;
21403 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
21404 elt_int_mode = aarch64_sve_container_int_mode (mode);
21405 else
21406 elt_int_mode = int_mode_for_mode (elt_mode).require ();
21408 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
21409 if (elt_size > 8)
21410 return false;
21412 /* Expand the vector constant out into a byte vector, with the least
21413 significant byte of the register first. */
21414 auto_vec<unsigned char, 16> bytes;
21415 bytes.reserve (n_elts * elt_size);
21416 for (unsigned int i = 0; i < n_elts; i++)
21418 /* The vector is provided in gcc endian-neutral fashion.
21419 For aarch64_be Advanced SIMD, it must be laid out in the vector
21420 register in reverse order. */
21421 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
21422 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
21424 if (elt_mode != elt_int_mode)
21425 elt = gen_lowpart (elt_int_mode, elt);
21427 if (!CONST_INT_P (elt))
21428 return false;
21430 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
21431 for (unsigned int byte = 0; byte < elt_size; byte++)
21433 bytes.quick_push (elt_val & 0xff);
21434 elt_val >>= BITS_PER_UNIT;
21438 /* The immediate must repeat every eight bytes. */
21439 unsigned int nbytes = bytes.length ();
21440 for (unsigned i = 8; i < nbytes; ++i)
21441 if (bytes[i] != bytes[i - 8])
21442 return false;
21444 /* Get the repeating 8-byte value as an integer. No endian correction
21445 is needed here because bytes is already in lsb-first order. */
21446 unsigned HOST_WIDE_INT val64 = 0;
21447 for (unsigned int i = 0; i < 8; i++)
21448 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
21449 << (i * BITS_PER_UNIT));
21451 if (vec_flags & VEC_SVE_DATA)
21452 return aarch64_sve_valid_immediate (val64, info);
21453 else
21454 return aarch64_advsimd_valid_immediate (val64, info, which);
21457 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
21458 has a step in the range of INDEX. Return the index expression if so,
21459 otherwise return null. */
21460 rtx
21461 aarch64_check_zero_based_sve_index_immediate (rtx x)
21463 rtx base, step;
21464 if (const_vec_series_p (x, &base, &step)
21465 && base == const0_rtx
21466 && aarch64_sve_index_immediate_p (step))
21467 return step;
21468 return NULL_RTX;
21471 /* Check if immediate shift constants are within range. */
21472 bool
21473 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
21475 x = unwrap_const_vec_duplicate (x);
21476 if (!CONST_INT_P (x))
21477 return false;
21478 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
21479 if (left)
21480 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
21481 else
21482 return IN_RANGE (INTVAL (x), 1, bit_width);
21485 /* Return the bitmask CONST_INT to select the bits required by a zero extract
21486 operation of width WIDTH at bit position POS. */
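/* For example, WIDTH == 8 and POS == 16 give the mask 0xff0000, i.e.
   the bits selected by a zero_extract of one byte starting at bit 16.  */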
21488 rtx
21489 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
21491 gcc_assert (CONST_INT_P (width));
21492 gcc_assert (CONST_INT_P (pos));
21494 unsigned HOST_WIDE_INT mask
21495 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
21496 return GEN_INT (mask << UINTVAL (pos));
21499 bool
21500 aarch64_mov_operand_p (rtx x, machine_mode mode)
21502 if (GET_CODE (x) == HIGH
21503 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
21504 return true;
21506 if (CONST_INT_P (x))
21507 return true;
21509 if (VECTOR_MODE_P (GET_MODE (x)))
21511 /* Require predicate constants to be VNx16BI before RA, so that we
21512 force everything to have a canonical form. */
21513 if (!lra_in_progress
21514 && !reload_completed
21515 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
21516 && GET_MODE (x) != VNx16BImode)
21517 return false;
21519 return aarch64_simd_valid_immediate (x, NULL);
21522 /* Remove UNSPEC_SALT_ADDR before checking symbol reference. */
21523 x = strip_salt (x);
21525 /* GOT accesses are valid moves. */
21526 if (SYMBOL_REF_P (x)
21527 && aarch64_classify_symbolic_expression (x) == SYMBOL_SMALL_GOT_4G)
21528 return true;
21530 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
21531 return true;
21533 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
21534 return true;
21536 return aarch64_classify_symbolic_expression (x)
21537 == SYMBOL_TINY_ABSOLUTE;
21540 /* Create a 0 constant that is based on V4SI to allow CSE to optimally share
21541 the constant creation. */
21543 rtx
21544 aarch64_gen_shareable_zero (machine_mode mode)
21546 machine_mode zmode = V4SImode;
21547 rtx tmp = gen_reg_rtx (zmode);
21548 emit_move_insn (tmp, CONST0_RTX (zmode));
21549 return lowpart_subreg (mode, tmp, zmode);
21552 /* Return a const_int vector of VAL. */
21553 rtx
21554 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
21556 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
21557 return gen_const_vec_duplicate (mode, c);
21560 /* Check OP is a legal scalar immediate for the MOVI instruction. */
21562 bool
21563 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
21565 machine_mode vmode;
21567 vmode = aarch64_simd_container_mode (mode, 64);
21568 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
21569 return aarch64_simd_valid_immediate (op_v, NULL);
21572 /* Construct and return a PARALLEL RTX vector with elements numbering the
21573 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
21574 the vector - from the perspective of the architecture. This does not
21575 line up with GCC's perspective on lane numbers, so we end up with
21576 different masks depending on our target endian-ness. The diagram
21577 below may help. We must draw the distinction when building masks
21578 which select one half of the vector. An instruction selecting
21579 architectural low-lanes for a big-endian target, must be described using
21580 a mask selecting GCC high-lanes.
21582 Big-Endian Little-Endian
21584 GCC 0 1 2 3 3 2 1 0
21585 | x | x | x | x | | x | x | x | x |
21586 Architecture 3 2 1 0 3 2 1 0
21588 Low Mask: { 2, 3 } { 0, 1 }
21589 High Mask: { 0, 1 } { 2, 3 }
21591 MODE Is the mode of the vector and NUNITS is the number of units in it. */
21593 rtx
21594 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
21596 rtvec v = rtvec_alloc (nunits / 2);
21597 int high_base = nunits / 2;
21598 int low_base = 0;
21599 int base;
21600 rtx t1;
21601 int i;
21603 if (BYTES_BIG_ENDIAN)
21604 base = high ? low_base : high_base;
21605 else
21606 base = high ? high_base : low_base;
21608 for (i = 0; i < nunits / 2; i++)
21609 RTVEC_ELT (v, i) = GEN_INT (base + i);
21611 t1 = gen_rtx_PARALLEL (mode, v);
21612 return t1;
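/* Editorial sketch, not part of the original source: concrete results of the
   helper above for V4SImode (NUNITS == 4), matching the diagram in the
   function comment:

     aarch64_simd_vect_par_cnst_half (V4SImode, 4, false)
       little-endian: (parallel [0 1])     big-endian: (parallel [2 3])
     aarch64_simd_vect_par_cnst_half (V4SImode, 4, true)
       little-endian: (parallel [2 3])     big-endian: (parallel [0 1])  */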
21615 /* Check OP for validity as a PARALLEL RTX vector with elements
21616 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
21617 from the perspective of the architecture. See the diagram above
21618 aarch64_simd_vect_par_cnst_half for more details. */
21620 bool
21621 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
21622 bool high)
21624 int nelts;
21625 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
21626 return false;
21628 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
21629 HOST_WIDE_INT count_op = XVECLEN (op, 0);
21630 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
21631 int i = 0;
21633 if (count_op != count_ideal)
21634 return false;
21636 for (i = 0; i < count_ideal; i++)
21638 rtx elt_op = XVECEXP (op, 0, i);
21639 rtx elt_ideal = XVECEXP (ideal, 0, i);
21641 if (!CONST_INT_P (elt_op)
21642 || INTVAL (elt_ideal) != INTVAL (elt_op))
21643 return false;
21645 return true;
21648 /* Return a PARALLEL containing NELTS elements, with element I equal
21649 to BASE + I * STEP. */
21652 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
21654 rtvec vec = rtvec_alloc (nelts);
21655 for (unsigned int i = 0; i < nelts; ++i)
21656 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
21657 return gen_rtx_PARALLEL (VOIDmode, vec);
21660 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
21661 series with step STEP. */
21663 bool
21664 aarch64_stepped_int_parallel_p (rtx op, int step)
21666 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
21667 return false;
21669 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
21670 for (int i = 1; i < XVECLEN (op, 0); ++i)
21671 if (!CONST_INT_P (XVECEXP (op, 0, i))
21672 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
21673 return false;
21675 return true;
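/* Editorial sketch, not part of the original source: the two helpers above
   are duals of one another.  For example,

     rtx par = aarch64_gen_stepped_int_parallel (4, 1, 2);

   builds (parallel [1 3 5 7]), for which
   aarch64_stepped_int_parallel_p (par, 2) is true and
   aarch64_stepped_int_parallel_p (par, 1) is false.  */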
21678 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
21679 HIGH (exclusive). */
21680 void
21681 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
21682 const_tree exp)
21684 HOST_WIDE_INT lane;
21685 gcc_assert (CONST_INT_P (operand));
21686 lane = INTVAL (operand);
21688 if (lane < low || lane >= high)
21690 if (exp)
21691 error_at (EXPR_LOCATION (exp), "lane %wd out of range %wd - %wd",
21692 lane, low, high - 1);
21693 else
21694 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
21698 /* Perform endian correction on lane number N, which indexes a vector
21699 of mode MODE, and return the result as an SImode rtx. */
21702 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
21704 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
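/* Editorial sketch, not part of the original source, assuming ENDIAN_LANE_N
   mirrors the lane index on big-endian targets: for V4SImode,

     aarch64_endian_lane_rtx (V4SImode, 1)

   yields (const_int 1) on little-endian and (const_int 2) on big-endian, so
   the same architectural lane is addressed in both cases.  */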
21707 /* Return TRUE if OP is a valid vector addressing mode. */
21709 bool
21710 aarch64_simd_mem_operand_p (rtx op)
21712 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
21713 || REG_P (XEXP (op, 0)));
21716 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
21718 bool
21719 aarch64_sve_ld1r_operand_p (rtx op)
21721 struct aarch64_address_info addr;
21722 scalar_mode mode;
21724 return (MEM_P (op)
21725 && is_a <scalar_mode> (GET_MODE (op), &mode)
21726 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
21727 && addr.type == ADDRESS_REG_IMM
21728 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
21731 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
21732 where the size of the read data is specified by `mode` and the size of the
21733    vector elements is specified by `elem_mode`.  */
21734 bool
21735 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
21736 scalar_mode elem_mode)
21738 struct aarch64_address_info addr;
21739 if (!MEM_P (op)
21740 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
21741 return false;
21743 if (addr.type == ADDRESS_REG_IMM)
21744 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
21746 if (addr.type == ADDRESS_REG_REG)
21747 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
21749 return false;
21752 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
21753 bool
21754 aarch64_sve_ld1rq_operand_p (rtx op)
21756 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
21757 GET_MODE_INNER (GET_MODE (op)));
21760 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
21761 accessing a vector where the element size is specified by `elem_mode`. */
21762 bool
21763 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
21765 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
21768 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
21769 bool
21770 aarch64_sve_ldff1_operand_p (rtx op)
21772 if (!MEM_P (op))
21773 return false;
21775 struct aarch64_address_info addr;
21776 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
21777 return false;
21779 if (addr.type == ADDRESS_REG_IMM)
21780 return known_eq (addr.const_offset, 0);
21782 return addr.type == ADDRESS_REG_REG;
21785 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
21786 bool
21787 aarch64_sve_ldnf1_operand_p (rtx op)
21789 struct aarch64_address_info addr;
21791 return (MEM_P (op)
21792 && aarch64_classify_address (&addr, XEXP (op, 0),
21793 GET_MODE (op), false)
21794 && addr.type == ADDRESS_REG_IMM);
21797 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
21798 The conditions for STR are the same. */
21799 bool
21800 aarch64_sve_ldr_operand_p (rtx op)
21802 struct aarch64_address_info addr;
21804 return (MEM_P (op)
21805 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
21806 false, ADDR_QUERY_ANY)
21807 && addr.type == ADDRESS_REG_IMM);
21810 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
21811 addressing memory of mode MODE. */
21812 bool
21813 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
21815 struct aarch64_address_info addr;
21816 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
21817 return false;
21819 if (addr.type == ADDRESS_REG_IMM)
21820 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
21822 return addr.type == ADDRESS_REG_REG;
21825 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
21826 We need to be able to access the individual pieces, so the range
21827 is different from LD[234] and ST[234]. */
21828 bool
21829 aarch64_sve_struct_memory_operand_p (rtx op)
21831 if (!MEM_P (op))
21832 return false;
21834 machine_mode mode = GET_MODE (op);
21835 struct aarch64_address_info addr;
21836 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
21837 ADDR_QUERY_ANY)
21838 || addr.type != ADDRESS_REG_IMM)
21839 return false;
21841 poly_int64 first = addr.const_offset;
21842 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
21843 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
21844 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
21847 /* Emit a register copy from operand to operand, taking care not to
21848 early-clobber source registers in the process.
21850 COUNT is the number of components into which the copy needs to be
21851 decomposed. */
21852 void
21853 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
21854 unsigned int count)
21856 unsigned int i;
21857 int rdest = REGNO (operands[0]);
21858 int rsrc = REGNO (operands[1]);
21860 if (!reg_overlap_mentioned_p (operands[0], operands[1])
21861 || rdest < rsrc)
21862 for (i = 0; i < count; i++)
21863 emit_move_insn (gen_rtx_REG (mode, rdest + i),
21864 gen_rtx_REG (mode, rsrc + i));
21865 else
21866 for (i = 0; i < count; i++)
21867 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
21868 gen_rtx_REG (mode, rsrc + count - i - 1));
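/* Editorial sketch, not part of the original source: why the copy order above
   matters.  Moving a two-register value from V8-V9 to the overlapping pair
   V9-V10 (COUNT == 2) must copy the higher-numbered register first,

     mov v10.16b, v9.16b
     mov v9.16b, v8.16b

   otherwise the first move would clobber V9 before it is read.  Copies to a
   lower-numbered destination keep the ascending order.  */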
21871 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
21872 one of VSTRUCT modes: OI, CI, or XI. */
21874 aarch64_simd_attr_length_rglist (machine_mode mode)
21876 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
21877 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
21880 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
21881 alignment of a vector to 128 bits. SVE predicates have an alignment of
21882 16 bits. */
21883 static HOST_WIDE_INT
21884 aarch64_simd_vector_alignment (const_tree type)
21886 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
21887 be set for non-predicate vectors of booleans. Modes are the most
21888 direct way we have of identifying real SVE predicate types. */
21889 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
21890 return 16;
21891 widest_int min_size
21892 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
21893 return wi::umin (min_size, 128).to_uhwi ();
21896 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
21897 static poly_uint64
21898 aarch64_vectorize_preferred_vector_alignment (const_tree type)
21900 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
21902 /* If the length of the vector is a fixed power of 2, try to align
21903 to that length, otherwise don't try to align at all. */
21904 HOST_WIDE_INT result;
21905 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
21906 || !pow2p_hwi (result))
21907 result = TYPE_ALIGN (TREE_TYPE (type));
21908 return result;
21910 return TYPE_ALIGN (type);
21913 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
21914 static bool
21915 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
21917 if (is_packed)
21918 return false;
21920 /* For fixed-length vectors, check that the vectorizer will aim for
21921 full-vector alignment. This isn't true for generic GCC vectors
21922 that are wider than the ABI maximum of 128 bits. */
21923 poly_uint64 preferred_alignment =
21924 aarch64_vectorize_preferred_vector_alignment (type);
21925 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
21926 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
21927 preferred_alignment))
21928 return false;
21930 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
21931 return true;
21934 /* Return true if the vector misalignment factor is supported by the
21935 target. */
21936 static bool
21937 aarch64_builtin_support_vector_misalignment (machine_mode mode,
21938 const_tree type, int misalignment,
21939 bool is_packed)
21941 if (TARGET_SIMD && STRICT_ALIGNMENT)
21943 /* Return if movmisalign pattern is not supported for this mode. */
21944 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
21945 return false;
21947 /* Misalignment factor is unknown at compile time. */
21948 if (misalignment == -1)
21949 return false;
21951 return default_builtin_support_vector_misalignment (mode, type, misalignment,
21952 is_packed);
21955 /* If VALS is a vector constant that can be loaded into a register
21956 using DUP, generate instructions to do so and return an RTX to
21957 assign to the register. Otherwise return NULL_RTX. */
21958 static rtx
21959 aarch64_simd_dup_constant (rtx vals)
21961 machine_mode mode = GET_MODE (vals);
21962 machine_mode inner_mode = GET_MODE_INNER (mode);
21963 rtx x;
21965 if (!const_vec_duplicate_p (vals, &x))
21966 return NULL_RTX;
21968 /* We can load this constant by using DUP and a constant in a
21969 single ARM register. This will be cheaper than a vector
21970 load. */
21971 x = force_reg (inner_mode, x);
21972 return gen_vec_duplicate (mode, x);
21976 /* Generate code to load VALS, which is a PARALLEL containing only
21977 constants (for vec_init) or CONST_VECTOR, efficiently into a
21978 register. Returns an RTX to copy into the register, or NULL_RTX
21979 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
21980 static rtx
21981 aarch64_simd_make_constant (rtx vals)
21983 machine_mode mode = GET_MODE (vals);
21984 rtx const_dup;
21985 rtx const_vec = NULL_RTX;
21986 int n_const = 0;
21987 int i;
21989 if (CONST_VECTOR_P (vals))
21990 const_vec = vals;
21991 else if (GET_CODE (vals) == PARALLEL)
21993 /* A CONST_VECTOR must contain only CONST_INTs and
21994 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
21995 Only store valid constants in a CONST_VECTOR. */
21996 int n_elts = XVECLEN (vals, 0);
21997 for (i = 0; i < n_elts; ++i)
21999 rtx x = XVECEXP (vals, 0, i);
22000 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22001 n_const++;
22003 if (n_const == n_elts)
22004 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
22006 else
22007 gcc_unreachable ();
22009 if (const_vec != NULL_RTX
22010 && aarch64_simd_valid_immediate (const_vec, NULL))
22011 /* Load using MOVI/MVNI. */
22012 return const_vec;
22013 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
22014 /* Loaded using DUP. */
22015 return const_dup;
22016 else if (const_vec != NULL_RTX)
22017 /* Load from constant pool. We cannot take advantage of single-cycle
22018 LD1 because we need a PC-relative addressing mode. */
22019 return const_vec;
22020 else
22021 /* A PARALLEL containing something not valid inside CONST_VECTOR.
22022 We cannot construct an initializer. */
22023 return NULL_RTX;
22026 /* Expand a vector initialisation sequence, such that TARGET is
22027 initialised to contain VALS. */
22029 void
22030 aarch64_expand_vector_init (rtx target, rtx vals)
22032 machine_mode mode = GET_MODE (target);
22033 scalar_mode inner_mode = GET_MODE_INNER (mode);
22034 /* The number of vector elements. */
22035 int n_elts = XVECLEN (vals, 0);
22036 /* The number of vector elements which are not constant. */
22037 int n_var = 0;
22038 rtx any_const = NULL_RTX;
22039 /* The first element of vals. */
22040 rtx v0 = XVECEXP (vals, 0, 0);
22041 bool all_same = true;
22043 /* This is a special vec_init<M><N> where N is not an element mode but a
22044 vector mode with half the elements of M. We expect to find two entries
22045     vector mode with half the elements of M.  We expect to find two entries
22046 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
22048 machine_mode narrow_mode = GET_MODE (XVECEXP (vals, 0, 0));
22049 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode
22050 && known_eq (GET_MODE_SIZE (mode),
22051 2 * GET_MODE_SIZE (narrow_mode)));
22052 emit_insn (gen_aarch64_vec_concat (narrow_mode, target,
22053 XVECEXP (vals, 0, 0),
22054 XVECEXP (vals, 0, 1)));
22055 return;
22058 /* Count the number of variable elements to initialise. */
22059 for (int i = 0; i < n_elts; ++i)
22061 rtx x = XVECEXP (vals, 0, i);
22062 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
22063 ++n_var;
22064 else
22065 any_const = x;
22067 all_same &= rtx_equal_p (x, v0);
22070 /* No variable elements, hand off to aarch64_simd_make_constant which knows
22071 how best to handle this. */
22072 if (n_var == 0)
22074 rtx constant = aarch64_simd_make_constant (vals);
22075 if (constant != NULL_RTX)
22077 emit_move_insn (target, constant);
22078 return;
22082 /* Splat a single non-constant element if we can. */
22083 if (all_same)
22085 rtx x = force_reg (inner_mode, v0);
22086 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22087 return;
22090  /* Check for the interleaving case.
22091     For example, if the initializer is (int16x8_t) {x, y, x, y, x, y, x, y},
22092     generate the following code:
22093       dup v0.h, x
22094       dup v1.h, y
22095       zip1 v0.h, v0.h, v1.h
22096     for a "large enough" initializer.  */
22098 if (n_elts >= 8)
22100 int i;
22101 for (i = 2; i < n_elts; i++)
22102 if (!rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, i % 2)))
22103 break;
22105 if (i == n_elts)
22107 machine_mode mode = GET_MODE (target);
22108 rtx dest[2];
22110 for (int i = 0; i < 2; i++)
22112 rtx x = expand_vector_broadcast (mode, XVECEXP (vals, 0, i));
22113 dest[i] = force_reg (mode, x);
22116 rtvec v = gen_rtvec (2, dest[0], dest[1]);
22117 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22118 return;
22122 enum insn_code icode = optab_handler (vec_set_optab, mode);
22123 gcc_assert (icode != CODE_FOR_nothing);
22125 /* If there are only variable elements, try to optimize
22126 the insertion using dup for the most common element
22127 followed by insertions. */
22129 /* The algorithm will fill matches[*][0] with the earliest matching element,
22130 and matches[X][1] with the count of duplicate elements (if X is the
22131 earliest element which has duplicates). */
22133 if (n_var == n_elts && n_elts <= 16)
22135 int matches[16][2] = {0};
22136 for (int i = 0; i < n_elts; i++)
22138 for (int j = 0; j <= i; j++)
22140 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
22142 matches[i][0] = j;
22143 matches[j][1]++;
22144 break;
22148 int maxelement = 0;
22149 int maxv = 0;
22150 for (int i = 0; i < n_elts; i++)
22151 if (matches[i][1] > maxv)
22153 maxelement = i;
22154 maxv = matches[i][1];
22157 /* Create a duplicate of the most common element, unless all elements
22158 are equally useless to us, in which case just immediately set the
22159 vector register using the first element. */
22161 if (maxv == 1)
22163 /* For vectors of two 64-bit elements, we can do even better. */
22164 if (n_elts == 2
22165 && (inner_mode == E_DImode
22166 || inner_mode == E_DFmode))
22169 rtx x0 = XVECEXP (vals, 0, 0);
22170 rtx x1 = XVECEXP (vals, 0, 1);
22171 /* Combine can pick up this case, but handling it directly
22172 here leaves clearer RTL.
22174 This is load_pair_lanes<mode>, and also gives us a clean-up
22175 for store_pair_lanes<mode>. */
22176 if (memory_operand (x0, inner_mode)
22177 && memory_operand (x1, inner_mode)
22178 && aarch64_mergeable_load_pair_p (mode, x0, x1))
22180 rtx t;
22181 if (inner_mode == DFmode)
22182 t = gen_load_pair_lanesdf (target, x0, x1);
22183 else
22184 t = gen_load_pair_lanesdi (target, x0, x1);
22185 emit_insn (t);
22186 return;
22189 /* The subreg-move sequence below will move into lane zero of the
22190 vector register. For big-endian we want that position to hold
22191 the last element of VALS. */
22192 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
22193 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22194 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
22196 else
22198 rtx x = force_reg (inner_mode, XVECEXP (vals, 0, maxelement));
22199 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
22202 /* Insert the rest. */
22203 for (int i = 0; i < n_elts; i++)
22205 rtx x = XVECEXP (vals, 0, i);
22206 if (matches[i][0] == maxelement)
22207 continue;
22208 x = force_reg (inner_mode, x);
22209 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
22211 return;
22214 /* Initialise a vector which is part-variable. We want to first try
22215 to build those lanes which are constant in the most efficient way we
22216 can. */
22217 if (n_var != n_elts)
22219 rtx copy = copy_rtx (vals);
22221 /* Load constant part of vector. We really don't care what goes into the
22222 parts we will overwrite, but we're more likely to be able to load the
22223 constant efficiently if it has fewer, larger, repeating parts
22224 (see aarch64_simd_valid_immediate). */
22225 for (int i = 0; i < n_elts; i++)
22227 rtx x = XVECEXP (vals, 0, i);
22228 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22229 continue;
22230 rtx subst = any_const;
22231 for (int bit = n_elts / 2; bit > 0; bit /= 2)
22233 /* Look in the copied vector, as more elements are const. */
22234 rtx test = XVECEXP (copy, 0, i ^ bit);
22235 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
22237 subst = test;
22238 break;
22241 XVECEXP (copy, 0, i) = subst;
22243 aarch64_expand_vector_init (target, copy);
22246 /* Insert the variable lanes directly. */
22247 for (int i = 0; i < n_elts; i++)
22249 rtx x = XVECEXP (vals, 0, i);
22250 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
22251 continue;
22252 x = force_reg (inner_mode, x);
22253 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
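/* Editorial sketch, not part of the original source: two representative paths
   through aarch64_expand_vector_init for V4SImode.

     (parallel [x x x x]) - all_same: x is forced into a register and
       broadcast with DUP.
     (parallel [x 1 2 3]) - part-variable: a copy of VALS with lane 0
       replaced by a nearby constant ({2, 1, 2, 3}) is expanded first, then
       x is inserted into lane 0 via the vec_set pattern (INS).  */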
22257 /* Emit RTL corresponding to:
22258 insr TARGET, ELEM. */
22260 static void
22261 emit_insr (rtx target, rtx elem)
22263 machine_mode mode = GET_MODE (target);
22264 scalar_mode elem_mode = GET_MODE_INNER (mode);
22265 elem = force_reg (elem_mode, elem);
22267 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
22268 gcc_assert (icode != CODE_FOR_nothing);
22269 emit_insn (GEN_FCN (icode) (target, target, elem));
22272 /* Subroutine of aarch64_sve_expand_vector_init for handling
22273 trailing constants.
22274 This function works as follows:
22275 (a) Create a new vector consisting of trailing constants.
22276 (b) Initialize TARGET with the constant vector using emit_move_insn.
22277 (c) Insert remaining elements in TARGET using insr.
22278     NELTS is the total number of elements in the original vector, while
22279     NELTS_REQD is the number of elements that are actually
22280     significant.
22282     ??? The heuristic used is to do the above only if the number of constants
22283     is at least half the total number of elements.  May need fine-tuning.  */
22285 static bool
22286 aarch64_sve_expand_vector_init_handle_trailing_constants
22287 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
22289 machine_mode mode = GET_MODE (target);
22290 scalar_mode elem_mode = GET_MODE_INNER (mode);
22291 int n_trailing_constants = 0;
22293 for (int i = nelts_reqd - 1;
22294 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
22295 i--)
22296 n_trailing_constants++;
22298 if (n_trailing_constants >= nelts_reqd / 2)
22300 /* Try to use the natural pattern of BUILDER to extend the trailing
22301 constant elements to a full vector. Replace any variables in the
22302 extra elements with zeros.
22304 ??? It would be better if the builders supported "don't care"
22305 elements, with the builder filling in whichever elements
22306 give the most compact encoding. */
22307 rtx_vector_builder v (mode, nelts, 1);
22308 for (int i = 0; i < nelts; i++)
22310 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
22311 if (!valid_for_const_vector_p (elem_mode, x))
22312 x = CONST0_RTX (elem_mode);
22313 v.quick_push (x);
22315 rtx const_vec = v.build ();
22316 emit_move_insn (target, const_vec);
22318 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
22319 emit_insr (target, builder.elt (i));
22321 return true;
22324 return false;
22327 /* Subroutine of aarch64_sve_expand_vector_init.
22328 Works as follows:
22329 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
22330 (b) Skip trailing elements from BUILDER, which are the same as
22331 element NELTS_REQD - 1.
22332 (c) Insert earlier elements in reverse order in TARGET using insr. */
22334 static void
22335 aarch64_sve_expand_vector_init_insert_elems (rtx target,
22336 const rtx_vector_builder &builder,
22337 int nelts_reqd)
22339 machine_mode mode = GET_MODE (target);
22340 scalar_mode elem_mode = GET_MODE_INNER (mode);
22342 struct expand_operand ops[2];
22343 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
22344 gcc_assert (icode != CODE_FOR_nothing);
22346 create_output_operand (&ops[0], target, mode);
22347 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
22348 expand_insn (icode, 2, ops);
22350 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22351 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
22352 emit_insr (target, builder.elt (i));
22355 /* Subroutine of aarch64_sve_expand_vector_init to handle case
22356 when all trailing elements of builder are same.
22357 This works as follows:
22358 (a) Use expand_insn interface to broadcast last vector element in TARGET.
22359 (b) Insert remaining elements in TARGET using insr.
22361     ??? The heuristic used is to do the above if the number of identical
22362     trailing elements is at least 3/4 of the total number of elements,
22363     loosely based on the heuristic from mostly_zeros_p.  May need fine-tuning.  */
22365 static bool
22366 aarch64_sve_expand_vector_init_handle_trailing_same_elem
22367 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
22369 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
22370 if (ndups >= (3 * nelts_reqd) / 4)
22372 aarch64_sve_expand_vector_init_insert_elems (target, builder,
22373 nelts_reqd - ndups + 1);
22374 return true;
22377 return false;
22380 /* Initialize register TARGET from BUILDER. NELTS is the constant number
22381 of elements in BUILDER.
22383 The function tries to initialize TARGET from BUILDER if it fits one
22384 of the special cases outlined below.
22386 Failing that, the function divides BUILDER into two sub-vectors:
22387 v_even = even elements of BUILDER;
22388 v_odd = odd elements of BUILDER;
22390 and recursively calls itself with v_even and v_odd.
22392 if (recursive call succeeded for v_even or v_odd)
22393 TARGET = zip (v_even, v_odd)
22395 The function returns true if it managed to build TARGET from BUILDER
22396 with one of the special cases, false otherwise.
22398 Example: {a, 1, b, 2, c, 3, d, 4}
22400 The vector gets divided into:
22401 v_even = {a, b, c, d}
22402 v_odd = {1, 2, 3, 4}
22404     aarch64_sve_expand_vector_init(v_odd) hits case 1 and
22405     initializes tmp2 from the constant vector v_odd using emit_move_insn.
22407     aarch64_sve_expand_vector_init(v_even) fails since v_even contains
22408     4 distinct variable elements, so we construct tmp1 from v_even using insr:
22409 tmp1 = dup(d)
22410 insr tmp1, c
22411 insr tmp1, b
22412 insr tmp1, a
22414 And finally:
22415 TARGET = zip (tmp1, tmp2)
22416 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
22418 static bool
22419 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
22420 int nelts, int nelts_reqd)
22422 machine_mode mode = GET_MODE (target);
22424 /* Case 1: Vector contains trailing constants. */
22426 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22427 (target, builder, nelts, nelts_reqd))
22428 return true;
22430 /* Case 2: Vector contains leading constants. */
22432 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
22433 for (int i = 0; i < nelts_reqd; i++)
22434 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
22435 rev_builder.finalize ();
22437 if (aarch64_sve_expand_vector_init_handle_trailing_constants
22438 (target, rev_builder, nelts, nelts_reqd))
22440 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22441 return true;
22444 /* Case 3: Vector contains trailing same element. */
22446 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22447 (target, builder, nelts_reqd))
22448 return true;
22450 /* Case 4: Vector contains leading same element. */
22452 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
22453 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
22455 emit_insn (gen_aarch64_sve_rev (mode, target, target));
22456 return true;
22459 /* Avoid recursing below 4-elements.
22460 ??? The threshold 4 may need fine-tuning. */
22462 if (nelts_reqd <= 4)
22463 return false;
22465 rtx_vector_builder v_even (mode, nelts, 1);
22466 rtx_vector_builder v_odd (mode, nelts, 1);
22468 for (int i = 0; i < nelts * 2; i += 2)
22470 v_even.quick_push (builder.elt (i));
22471 v_odd.quick_push (builder.elt (i + 1));
22474 v_even.finalize ();
22475 v_odd.finalize ();
22477 rtx tmp1 = gen_reg_rtx (mode);
22478 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
22479 nelts, nelts_reqd / 2);
22481 rtx tmp2 = gen_reg_rtx (mode);
22482 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
22483 nelts, nelts_reqd / 2);
22485 if (!did_even_p && !did_odd_p)
22486 return false;
22488 /* Initialize v_even and v_odd using INSR if it didn't match any of the
22489 special cases and zip v_even, v_odd. */
22491 if (!did_even_p)
22492 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
22494 if (!did_odd_p)
22495 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
22497 rtvec v = gen_rtvec (2, tmp1, tmp2);
22498 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
22499 return true;
22502 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
22504 void
22505 aarch64_sve_expand_vector_init (rtx target, rtx vals)
22507 machine_mode mode = GET_MODE (target);
22508 int nelts = XVECLEN (vals, 0);
22510 rtx_vector_builder v (mode, nelts, 1);
22511 for (int i = 0; i < nelts; i++)
22512 v.quick_push (XVECEXP (vals, 0, i));
22513 v.finalize ();
22515   /* If neither sub-vector of v could be initialized specially,
22516 then use INSR to insert all elements from v into TARGET.
22517 ??? This might not be optimal for vectors with large
22518 initializers like 16-element or above.
22519 For nelts < 4, it probably isn't useful to handle specially. */
22521 if (nelts < 4
22522 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
22523 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
22526 /* Check whether VALUE is a vector constant in which every element
22527 is either a power of 2 or a negated power of 2. If so, return
22528 a constant vector of log2s, and flip CODE between PLUS and MINUS
22529 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
22531 static rtx
22532 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
22534 if (!CONST_VECTOR_P (value))
22535 return NULL_RTX;
22537 rtx_vector_builder builder;
22538 if (!builder.new_unary_operation (GET_MODE (value), value, false))
22539 return NULL_RTX;
22541 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
22542 /* 1 if the result of the multiplication must be negated,
22543 0 if it mustn't, or -1 if we don't yet care. */
22544 int negate = -1;
22545 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
22546 for (unsigned int i = 0; i < encoded_nelts; ++i)
22548 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
22549 if (!CONST_SCALAR_INT_P (elt))
22550 return NULL_RTX;
22551 rtx_mode_t val (elt, int_mode);
22552 wide_int pow2 = wi::neg (val);
22553 if (val != pow2)
22555 /* It matters whether we negate or not. Make that choice,
22556 and make sure that it's consistent with previous elements. */
22557 if (negate == !wi::neg_p (val))
22558 return NULL_RTX;
22559 negate = wi::neg_p (val);
22560 if (!negate)
22561 pow2 = val;
22563 /* POW2 is now the value that we want to be a power of 2. */
22564 int shift = wi::exact_log2 (pow2);
22565 if (shift < 0)
22566 return NULL_RTX;
22567 builder.quick_push (gen_int_mode (shift, int_mode));
22569 if (negate == -1)
22570 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
22571 code = PLUS;
22572 else if (negate == 1)
22573 code = code == PLUS ? MINUS : PLUS;
22574 return builder.build ();
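/* Editorial sketch, not part of the original source: worked examples for the
   conversion above, with CODE initially PLUS (i.e. a + b * c):

     c = {4, 4, 4, 4}      ->  {2, 2, 2, 2}, CODE stays PLUS,
                               so a + b * 4 becomes a + (b << 2)
     c = {-8, -8, -8, -8}  ->  {3, 3, 3, 3}, CODE becomes MINUS,
                               so a + b * -8 becomes a - (b << 3)
     c = {4, -8, 4, -8}    ->  NULL_RTX (inconsistent signs).  */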
22577 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
22578 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
22579 operands array, in the same order as for fma_optab. Return true if
22580 the function emitted all the necessary instructions, false if the caller
22581 should generate the pattern normally with the new OPERANDS array. */
22583 bool
22584 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
22586 machine_mode mode = GET_MODE (operands[0]);
22587 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
22589 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
22590 NULL_RTX, true, OPTAB_DIRECT);
22591 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
22592 operands[3], product, operands[0], true,
22593 OPTAB_DIRECT);
22594 return true;
22596 operands[2] = force_reg (mode, operands[2]);
22597 return false;
22600 /* Likewise, but for a conditional pattern. */
22602 bool
22603 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
22605 machine_mode mode = GET_MODE (operands[0]);
22606 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
22608 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
22609 NULL_RTX, true, OPTAB_DIRECT);
22610 emit_insn (gen_cond (code, mode, operands[0], operands[1],
22611 operands[4], product, operands[5]));
22612 return true;
22614 operands[3] = force_reg (mode, operands[3]);
22615 return false;
22618 static unsigned HOST_WIDE_INT
22619 aarch64_shift_truncation_mask (machine_mode mode)
22621 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
22622 return 0;
22623 return GET_MODE_UNIT_BITSIZE (mode) - 1;
22626 /* Select a format to encode pointers in exception handling data. */
22628 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
22630 int type;
22631 switch (aarch64_cmodel)
22633 case AARCH64_CMODEL_TINY:
22634 case AARCH64_CMODEL_TINY_PIC:
22635 case AARCH64_CMODEL_SMALL:
22636 case AARCH64_CMODEL_SMALL_PIC:
22637 case AARCH64_CMODEL_SMALL_SPIC:
22638 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
22639 for everything. */
22640 type = DW_EH_PE_sdata4;
22641 break;
22642 default:
22643 /* No assumptions here. 8-byte relocs required. */
22644 type = DW_EH_PE_sdata8;
22645 break;
22647 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
22650 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
22652 static void
22653 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
22655 if (TREE_CODE (decl) == FUNCTION_DECL)
22657 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
22658 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
22660 fprintf (stream, "\t.variant_pcs\t");
22661 assemble_name (stream, name);
22662 fprintf (stream, "\n");
22667 /* The last .arch and .tune assembly strings that we printed. */
22668 static std::string aarch64_last_printed_arch_string;
22669 static std::string aarch64_last_printed_tune_string;
22671 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
22672 by the function fndecl. */
22674 void
22675 aarch64_declare_function_name (FILE *stream, const char* name,
22676 tree fndecl)
22678 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
22680 struct cl_target_option *targ_options;
22681 if (target_parts)
22682 targ_options = TREE_TARGET_OPTION (target_parts);
22683 else
22684 targ_options = TREE_TARGET_OPTION (target_option_current_node);
22685 gcc_assert (targ_options);
22687 const struct processor *this_arch
22688 = aarch64_get_arch (targ_options->x_selected_arch);
22690 auto isa_flags = targ_options->x_aarch64_asm_isa_flags;
22691 std::string extension
22692 = aarch64_get_extension_string_for_isa_flags (isa_flags,
22693 this_arch->flags);
22694 /* Only update the assembler .arch string if it is distinct from the last
22695 such string we printed. */
22696 std::string to_print = this_arch->name + extension;
22697 if (to_print != aarch64_last_printed_arch_string)
22699 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
22700 aarch64_last_printed_arch_string = to_print;
22703   /* Print the cpu name we're tuning for in the comments; it might be
22704 useful to readers of the generated asm. Do it only when it changes
22705 from function to function and verbose assembly is requested. */
22706 const struct processor *this_tune
22707 = aarch64_get_tune_cpu (targ_options->x_selected_tune);
22709 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
22711 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
22712 this_tune->name);
22713 aarch64_last_printed_tune_string = this_tune->name;
22716 aarch64_asm_output_variant_pcs (stream, fndecl, name);
22718 /* Don't forget the type directive for ELF. */
22719 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
22720 ASM_OUTPUT_LABEL (stream, name);
22722 cfun->machine->label_is_assembled = true;
22725 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. */
22727 void
22728 aarch64_print_patchable_function_entry (FILE *file,
22729 unsigned HOST_WIDE_INT patch_area_size,
22730 bool record_p)
22732 if (!cfun->machine->label_is_assembled)
22734 /* Emit the patching area before the entry label, if any. */
22735 default_print_patchable_function_entry (file, patch_area_size,
22736 record_p);
22737 return;
22740 rtx pa = gen_patchable_area (GEN_INT (patch_area_size),
22741 GEN_INT (record_p));
22742 basic_block bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
22744 if (!aarch_bti_enabled ()
22745 || cgraph_node::get (cfun->decl)->only_called_directly_p ())
22747 /* Emit the patchable_area at the beginning of the function. */
22748 rtx_insn *insn = emit_insn_before (pa, BB_HEAD (bb));
22749 INSN_ADDRESSES_NEW (insn, -1);
22750 return;
22753 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
22754 if (!insn
22755 || !INSN_P (insn)
22756 || GET_CODE (PATTERN (insn)) != UNSPEC_VOLATILE
22757 || XINT (PATTERN (insn), 1) != UNSPECV_BTI_C)
22759 /* Emit a BTI_C. */
22760 insn = emit_insn_before (gen_bti_c (), BB_HEAD (bb));
22763 /* Emit the patchable_area after BTI_C. */
22764 insn = emit_insn_after (pa, insn);
22765 INSN_ADDRESSES_NEW (insn, -1);
22768 /* Output patchable area. */
22770 void
22771 aarch64_output_patchable_area (unsigned int patch_area_size, bool record_p)
22773 default_print_patchable_function_entry (asm_out_file, patch_area_size,
22774 record_p);
22777 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
22779 void
22780 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
22782 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
22783 const char *value = IDENTIFIER_POINTER (target);
22784 aarch64_asm_output_variant_pcs (stream, decl, name);
22785 ASM_OUTPUT_DEF (stream, name, value);
22788 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
22789 function symbol references. */
22791 void
22792 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
22794 default_elf_asm_output_external (stream, decl, name);
22795 aarch64_asm_output_variant_pcs (stream, decl, name);
22798 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
22799 Used to output the .cfi_b_key_frame directive when signing the current
22800 function with the B key. */
22802 void
22803 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
22805 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
22806 && aarch_ra_sign_key == AARCH_KEY_B)
22807 asm_fprintf (f, "\t.cfi_b_key_frame\n");
22810 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
22812 static void
22813 aarch64_start_file (void)
22815 struct cl_target_option *default_options
22816 = TREE_TARGET_OPTION (target_option_default_node);
22818 const struct processor *default_arch
22819 = aarch64_get_arch (default_options->x_selected_arch);
22820 auto default_isa_flags = default_options->x_aarch64_asm_isa_flags;
22821 std::string extension
22822 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
22823 default_arch->flags);
22825 aarch64_last_printed_arch_string = default_arch->name + extension;
22826 aarch64_last_printed_tune_string = "";
22827 asm_fprintf (asm_out_file, "\t.arch %s\n",
22828 aarch64_last_printed_arch_string.c_str ());
22830 default_file_start ();
22833 /* Emit load exclusive. */
22835 static void
22836 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
22837 rtx mem, rtx model_rtx)
22839 if (mode == TImode)
22840 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
22841 gen_highpart (DImode, rval),
22842 mem, model_rtx));
22843 else
22844 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
22847 /* Emit store exclusive. */
22849 static void
22850 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
22851 rtx mem, rtx rval, rtx model_rtx)
22853 if (mode == TImode)
22854 emit_insn (gen_aarch64_store_exclusive_pair
22855 (bval, mem, operand_subword (rval, 0, 0, TImode),
22856 operand_subword (rval, 1, 0, TImode), model_rtx));
22857 else
22858 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
22861 /* Emit jump insn INSN and mark it as unlikely to be taken.  */
22863 static void
22864 aarch64_emit_unlikely_jump (rtx insn)
22866 rtx_insn *jump = emit_jump_insn (insn);
22867 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
22870 /* We store the names of the various atomic helpers in a 5x5 array.
22871 Return the libcall function given MODE, MODEL and NAMES. */
22874 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
22875 const atomic_ool_names *names)
22877 memmodel model = memmodel_from_int (INTVAL (model_rtx));
22878 int mode_idx, model_idx;
22880 switch (mode)
22882 case E_QImode:
22883 mode_idx = 0;
22884 break;
22885 case E_HImode:
22886 mode_idx = 1;
22887 break;
22888 case E_SImode:
22889 mode_idx = 2;
22890 break;
22891 case E_DImode:
22892 mode_idx = 3;
22893 break;
22894 case E_TImode:
22895 mode_idx = 4;
22896 break;
22897 default:
22898 gcc_unreachable ();
22901 switch (model)
22903 case MEMMODEL_RELAXED:
22904 model_idx = 0;
22905 break;
22906 case MEMMODEL_CONSUME:
22907 case MEMMODEL_ACQUIRE:
22908 model_idx = 1;
22909 break;
22910 case MEMMODEL_RELEASE:
22911 model_idx = 2;
22912 break;
22913 case MEMMODEL_ACQ_REL:
22914 case MEMMODEL_SEQ_CST:
22915 model_idx = 3;
22916 break;
22917 case MEMMODEL_SYNC_ACQUIRE:
22918 case MEMMODEL_SYNC_RELEASE:
22919 case MEMMODEL_SYNC_SEQ_CST:
22920 model_idx = 4;
22921 break;
22922 default:
22923 gcc_unreachable ();
22926 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
22927 VISIBILITY_HIDDEN);
22930 #define DEF0(B, N) \
22931 { "__aarch64_" #B #N "_relax", \
22932 "__aarch64_" #B #N "_acq", \
22933 "__aarch64_" #B #N "_rel", \
22934 "__aarch64_" #B #N "_acq_rel", \
22935 "__aarch64_" #B #N "_sync" }
22937 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
22938 { NULL, NULL, NULL, NULL }
22939 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
22941 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
22942 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
22943 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
22944 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
22945 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
22946 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
22948 #undef DEF0
22949 #undef DEF4
22950 #undef DEF5
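/* Editorial sketch, not part of the original source: how the tables above are
   indexed.  For a 4-byte compare-and-swap with acquire semantics,

     aarch64_atomic_ool_func (SImode, GEN_INT (MEMMODEL_ACQUIRE),
			      &aarch64_ool_cas_names)

   selects mode_idx 2 and model_idx 1, i.e. the libcall "__aarch64_cas4_acq".
   The DEF4 tables leave the 16-byte row empty because only CAS has a 16-byte
   out-of-line helper.  */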
22952 /* Expand a compare and swap pattern. */
22954 void
22955 aarch64_expand_compare_and_swap (rtx operands[])
22957 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
22958 machine_mode mode, r_mode;
22960 bval = operands[0];
22961 rval = operands[1];
22962 mem = operands[2];
22963 oldval = operands[3];
22964 newval = operands[4];
22965 is_weak = operands[5];
22966 mod_s = operands[6];
22967 mod_f = operands[7];
22968 mode = GET_MODE (mem);
22970 /* Normally the succ memory model must be stronger than fail, but in the
22971 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
22972 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
22973 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
22974 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
22975 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
22977 r_mode = mode;
22978 if (mode == QImode || mode == HImode)
22980 r_mode = SImode;
22981 rval = gen_reg_rtx (r_mode);
22984 if (TARGET_LSE)
22986 /* The CAS insn requires oldval and rval overlap, but we need to
22987 have a copy of oldval saved across the operation to tell if
22988 the operation is successful. */
22989 if (reg_overlap_mentioned_p (rval, oldval))
22990 rval = copy_to_mode_reg (r_mode, oldval);
22991 else
22992 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
22994 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
22995 newval, mod_s));
22996 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
22998 else if (TARGET_OUTLINE_ATOMICS)
23000 /* Oldval must satisfy compare afterward. */
23001 if (!aarch64_plus_operand (oldval, mode))
23002 oldval = force_reg (mode, oldval);
23003 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
23004 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
23005 oldval, mode, newval, mode,
23006 XEXP (mem, 0), Pmode);
23007 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23009 else
23011 /* The oldval predicate varies by mode. Test it and force to reg. */
23012 insn_code code = code_for_aarch64_compare_and_swap (mode);
23013 if (!insn_data[code].operand[2].predicate (oldval, mode))
23014 oldval = force_reg (mode, oldval);
23016 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
23017 is_weak, mod_s, mod_f));
23018 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
23021 if (r_mode != mode)
23022 rval = gen_lowpart (mode, rval);
23023 emit_move_insn (operands[1], rval);
23025 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
23026 emit_insn (gen_rtx_SET (bval, x));
23029 /* Emit a barrier appropriate for memory model MODEL at the end of a
23030    sequence implementing an atomic operation.  */
23032 static void
23033 aarch64_emit_post_barrier (enum memmodel model)
23035 const enum memmodel base_model = memmodel_base (model);
23037 if (is_mm_sync (model)
23038 && (base_model == MEMMODEL_ACQUIRE
23039 || base_model == MEMMODEL_ACQ_REL
23040 || base_model == MEMMODEL_SEQ_CST))
23042 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
23046 /* Split a compare and swap pattern. */
23048 void
23049 aarch64_split_compare_and_swap (rtx operands[])
23051 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23052 gcc_assert (epilogue_completed);
23054 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
23055 machine_mode mode;
23056 bool is_weak;
23057 rtx_code_label *label1, *label2;
23058 enum memmodel model;
23060 rval = operands[0];
23061 mem = operands[1];
23062 oldval = operands[2];
23063 newval = operands[3];
23064 is_weak = (operands[4] != const0_rtx);
23065 model_rtx = operands[5];
23066 scratch = operands[7];
23067 mode = GET_MODE (mem);
23068 model = memmodel_from_int (INTVAL (model_rtx));
23070 /* When OLDVAL is zero and we want the strong version we can emit a tighter
23071 loop:
23072 .label1:
23073 LD[A]XR rval, [mem]
23074 CBNZ rval, .label2
23075 ST[L]XR scratch, newval, [mem]
23076 CBNZ scratch, .label1
23077 .label2:
23078 CMP rval, 0. */
23079 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
23080 oldval == const0_rtx && mode != TImode);
23082 label1 = NULL;
23083 if (!is_weak)
23085 label1 = gen_label_rtx ();
23086 emit_label (label1);
23088 label2 = gen_label_rtx ();
23090 /* The initial load can be relaxed for a __sync operation since a final
23091 barrier will be emitted to stop code hoisting. */
23092 if (is_mm_sync (model))
23093 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
23094 else
23095 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
23097 if (strong_zero_p)
23098 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
23099 else
23101 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
23102 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
23104 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23105 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
23106 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23108 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
23110 if (!is_weak)
23112 if (aarch64_track_speculation)
23114 /* Emit an explicit compare instruction, so that we can correctly
23115 track the condition codes. */
23116 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23117 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23119 else
23120 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
23122 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23123 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
23124 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23126 else
23127 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
23129 emit_label (label2);
23131   /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
23132 to set the condition flags. If this is not used it will be removed by
23133 later passes. */
23134 if (strong_zero_p)
23135 aarch64_gen_compare_reg (NE, rval, const0_rtx);
23137 /* Emit any final barrier needed for a __sync operation. */
23138 if (is_mm_sync (model))
23139 aarch64_emit_post_barrier (model);
23142 /* Split an atomic operation. */
23144 void
23145 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
23146 rtx value, rtx model_rtx, rtx cond)
23148 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
23149 gcc_assert (epilogue_completed);
23151 machine_mode mode = GET_MODE (mem);
23152 machine_mode wmode = (mode == DImode ? DImode : SImode);
23153 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
23154 const bool is_sync = is_mm_sync (model);
23155 rtx_code_label *label;
23156 rtx x;
23158 /* Split the atomic operation into a sequence. */
23159 label = gen_label_rtx ();
23160 emit_label (label);
23162 if (new_out)
23163 new_out = gen_lowpart (wmode, new_out);
23164 if (old_out)
23165 old_out = gen_lowpart (wmode, old_out);
23166 else
23167 old_out = new_out;
23168 value = simplify_gen_subreg (wmode, value, mode, 0);
23170 /* The initial load can be relaxed for a __sync operation since a final
23171 barrier will be emitted to stop code hoisting. */
23172 if (is_sync)
23173 aarch64_emit_load_exclusive (mode, old_out, mem,
23174 GEN_INT (MEMMODEL_RELAXED));
23175 else
23176 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
23178 switch (code)
23180 case SET:
23181 new_out = value;
23182 break;
23184 case NOT:
23185 x = gen_rtx_AND (wmode, old_out, value);
23186 emit_insn (gen_rtx_SET (new_out, x));
23187 x = gen_rtx_NOT (wmode, new_out);
23188 emit_insn (gen_rtx_SET (new_out, x));
23189 break;
23191 case MINUS:
23192 if (CONST_INT_P (value))
23194 value = GEN_INT (-UINTVAL (value));
23195 code = PLUS;
23197 /* Fall through. */
23199 default:
23200 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
23201 emit_insn (gen_rtx_SET (new_out, x));
23202 break;
23205 aarch64_emit_store_exclusive (mode, cond, mem,
23206 gen_lowpart (mode, new_out), model_rtx);
23208 if (aarch64_track_speculation)
23210 /* Emit an explicit compare instruction, so that we can correctly
23211 track the condition codes. */
23212 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
23213 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
23215 else
23216 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
23218 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23219 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
23220 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23222 /* Emit any final barrier needed for a __sync operation. */
23223 if (is_sync)
23224 aarch64_emit_post_barrier (model);
23227 static void
23228 aarch64_init_libfuncs (void)
23230 /* Half-precision float operations. The compiler handles all operations
23231 with NULL libfuncs by converting to SFmode. */
23233 /* Conversions. */
23234 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
23235 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
23237 /* Arithmetic. */
23238 set_optab_libfunc (add_optab, HFmode, NULL);
23239 set_optab_libfunc (sdiv_optab, HFmode, NULL);
23240 set_optab_libfunc (smul_optab, HFmode, NULL);
23241 set_optab_libfunc (neg_optab, HFmode, NULL);
23242 set_optab_libfunc (sub_optab, HFmode, NULL);
23244 /* Comparisons. */
23245 set_optab_libfunc (eq_optab, HFmode, NULL);
23246 set_optab_libfunc (ne_optab, HFmode, NULL);
23247 set_optab_libfunc (lt_optab, HFmode, NULL);
23248 set_optab_libfunc (le_optab, HFmode, NULL);
23249 set_optab_libfunc (ge_optab, HFmode, NULL);
23250 set_optab_libfunc (gt_optab, HFmode, NULL);
23251 set_optab_libfunc (unord_optab, HFmode, NULL);
23254 /* Target hook for c_mode_for_suffix. */
23255 static machine_mode
23256 aarch64_c_mode_for_suffix (char suffix)
23258 if (suffix == 'q')
23259 return TFmode;
23261 return VOIDmode;
23264 /* We can only represent floating point constants which will fit in
23265 "quarter-precision" values. These values are characterised by
23266    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
23269 (-1)^s * (n/16) * 2^r
23271 Where:
23272 's' is the sign bit.
23273 'n' is an integer in the range 16 <= n <= 31.
23274 'r' is an integer in the range -3 <= r <= 4. */
23276 /* Return true iff X can be represented by a quarter-precision
23277 floating point immediate operand X. Note, we cannot represent 0.0. */
23278 bool
23279 aarch64_float_const_representable_p (rtx x)
23281 /* This represents our current view of how many bits
23282 make up the mantissa. */
23283 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
23284 int exponent;
23285 unsigned HOST_WIDE_INT mantissa, mask;
23286 REAL_VALUE_TYPE r, m;
23287 bool fail;
23289 x = unwrap_const_vec_duplicate (x);
23290 if (!CONST_DOUBLE_P (x))
23291 return false;
23293 if (GET_MODE (x) == VOIDmode
23294 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
23295 return false;
23297 r = *CONST_DOUBLE_REAL_VALUE (x);
23299 /* We cannot represent infinities, NaNs or +/-zero. We won't
23300 know if we have +zero until we analyse the mantissa, but we
23301 can reject the other invalid values. */
23302 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
23303 || REAL_VALUE_MINUS_ZERO (r))
23304 return false;
23306 /* Extract exponent. */
23307 r = real_value_abs (&r);
23308 exponent = REAL_EXP (&r);
23310 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
23311 highest (sign) bit, with a fixed binary point at bit point_pos.
23312      The low HOST_WIDE_INT of W holds the low part of the mantissa, the
           high HOST_WIDE_INT the high part.
23313 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
23314 bits for the mantissa, this can fail (low bits will be lost). */
23315 real_ldexp (&m, &r, point_pos - exponent);
23316 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
23318 /* If the low part of the mantissa has bits set we cannot represent
23319 the value. */
23320 if (w.ulow () != 0)
23321 return false;
23322 /* We have rejected the lower HOST_WIDE_INT, so update our
23323 understanding of how many bits lie in the mantissa and
23324 look only at the high HOST_WIDE_INT. */
23325 mantissa = w.elt (1);
23326 point_pos -= HOST_BITS_PER_WIDE_INT;
23328 /* We can only represent values with a mantissa of the form 1.xxxx. */
23329 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
23330 if ((mantissa & mask) != 0)
23331 return false;
23333 /* Having filtered unrepresentable values, we may now remove all
23334 but the highest 5 bits. */
23335 mantissa >>= point_pos - 5;
23337 /* We cannot represent the value 0.0, so reject it. This is handled
23338 elsewhere. */
23339 if (mantissa == 0)
23340 return false;
23342 /* Then, as bit 4 is always set, we can mask it off, leaving
23343 the mantissa in the range [0, 15]. */
23344 mantissa &= ~(1 << 4);
23345 gcc_assert (mantissa <= 15);
23347   /* GCC internally does not use IEEE754-like encoding, where normalized
23348      significands are in the range [1, 2); GCC uses [0.5, 1) (see real.cc).
23349 Our mantissa values are shifted 4 places to the left relative to
23350 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
23351 by 5 places to correct for GCC's representation. */
23352 exponent = 5 - exponent;
23354 return (exponent >= 0 && exponent <= 7);
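/* Editorial sketch, not part of the original source: examples for the
   quarter-precision form (-1)^s * (n/16) * 2^r described above.

     2.5   == (20/16) * 2^1   - representable (n = 20, r = 1)
     0.125 == (16/16) * 2^-3  - representable (n = 16, r = -3)
     0.1                      - not representable (no exact n/16 * 2^r form)
     0.0                      - rejected; handled elsewhere.  */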
23357 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
23358 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
23359 output MOVI/MVNI, ORR or BIC immediate. */
23360 char*
23361 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
23362 enum simd_immediate_check which)
23364 bool is_valid;
23365 static char templ[40];
23366 const char *mnemonic;
23367 const char *shift_op;
23368 unsigned int lane_count = 0;
23369 char element_char;
23371 struct simd_immediate_info info;
23373 /* This will return true to show const_vector is legal for use as either
23374      an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
23375 It will also update INFO to show how the immediate should be generated.
23376 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
23377 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
23378 gcc_assert (is_valid);
23380 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23381 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
23383 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23385 gcc_assert (info.insn == simd_immediate_info::MOV
23386 && info.u.mov.shift == 0);
23387 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
23388 move immediate path. */
23389 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23390 info.u.mov.value = GEN_INT (0);
23391 else
23393 const unsigned int buf_size = 20;
23394 char float_buf[buf_size] = {'\0'};
23395 real_to_decimal_for_mode (float_buf,
23396 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23397 buf_size, buf_size, 1, info.elt_mode);
23399 if (lane_count == 1)
23400 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
23401 else
23402 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
23403 lane_count, element_char, float_buf);
23404 return templ;
23408 gcc_assert (CONST_INT_P (info.u.mov.value));
23410 if (which == AARCH64_CHECK_MOV)
23412 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
23413 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
23414 ? "msl" : "lsl");
23415 if (lane_count == 1)
23416 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
23417 mnemonic, UINTVAL (info.u.mov.value));
23418 else if (info.u.mov.shift)
23419 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23420 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
23421 element_char, UINTVAL (info.u.mov.value), shift_op,
23422 info.u.mov.shift);
23423 else
23424 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
23425 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
23426 element_char, UINTVAL (info.u.mov.value));
23428 else
23430 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
23431 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
23432 if (info.u.mov.shift)
23433 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23434 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
23435 element_char, UINTVAL (info.u.mov.value), "lsl",
23436 info.u.mov.shift);
23437 else
23438 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
23439 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
23440 element_char, UINTVAL (info.u.mov.value));
23442 return templ;
23445 char*
23446 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
23449 /* If a floating-point number was passed and we want to use it in an
23450 integer mode, do the conversion to integer. */
23451 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
23453 unsigned HOST_WIDE_INT ival;
23454 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
23455 gcc_unreachable ();
23456 immediate = gen_int_mode (ival, mode);
23459 machine_mode vmode;
23460 /* Use a 64-bit vector mode for everything except DI/DF/DD mode, where we
23461 use a 128-bit vector mode. */
23462 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
23464 vmode = aarch64_simd_container_mode (mode, width);
23465 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
23466 return aarch64_output_simd_mov_immediate (v_op, width);
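/* A hypothetical host-side sketch of the reinterpretation step above,
   which simply reuses the bit pattern of the floating-point constant as
   an integer.  The example_* helper is illustrative only (not part of
   GCC) and assumes an 8-byte IEEE double.  */
#if 0
#include <cstdint>
#include <cstring>

static uint64_t
example_float_bits_as_int (double x)
{
  uint64_t bits;
  std::memcpy (&bits, &x, sizeof (bits));  /* well-defined type pun */
  return bits;
}
#endif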
23469 /* Return the output string to use for moving immediate CONST_VECTOR
23470 into an SVE register. */
23472 char *
23473 aarch64_output_sve_mov_immediate (rtx const_vector)
23475 static char templ[40];
23476 struct simd_immediate_info info;
23477 char element_char;
23479 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
23480 gcc_assert (is_valid);
23482 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23484 machine_mode vec_mode = GET_MODE (const_vector);
23485 if (aarch64_sve_pred_mode_p (vec_mode))
23487 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
23488 if (info.insn == simd_immediate_info::MOV)
23490 gcc_assert (info.u.mov.value == const0_rtx);
23491 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
23493 else
23495 gcc_assert (info.insn == simd_immediate_info::PTRUE);
23496 unsigned int total_bytes;
23497 if (info.u.pattern == AARCH64_SV_ALL
23498 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
23499 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
23500 total_bytes / GET_MODE_SIZE (info.elt_mode));
23501 else
23502 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
23503 svpattern_token (info.u.pattern));
23505 return buf;
23508 if (info.insn == simd_immediate_info::INDEX)
23510 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
23511 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
23512 element_char, INTVAL (info.u.index.base),
23513 INTVAL (info.u.index.step));
23514 return templ;
23517 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
23519 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
23520 info.u.mov.value = GEN_INT (0);
23521 else
23523 const int buf_size = 20;
23524 char float_buf[buf_size] = {};
23525 real_to_decimal_for_mode (float_buf,
23526 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
23527 buf_size, buf_size, 1, info.elt_mode);
23529 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
23530 element_char, float_buf);
23531 return templ;
23535 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
23536 element_char, INTVAL (info.u.mov.value));
23537 return templ;
23540 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
23541 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
23542 pattern. */
23544 char *
23545 aarch64_output_sve_ptrues (rtx const_unspec)
23547 static char templ[40];
23549 struct simd_immediate_info info;
23550 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
23551 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
23553 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
23554 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
23555 svpattern_token (info.u.pattern));
23556 return templ;
23559 /* Split operands into moves from op[1] + op[2] into op[0]. */
23561 void
23562 aarch64_split_combinev16qi (rtx operands[3])
23564 unsigned int dest = REGNO (operands[0]);
23565 unsigned int src1 = REGNO (operands[1]);
23566 unsigned int src2 = REGNO (operands[2]);
23567 machine_mode halfmode = GET_MODE (operands[1]);
23568 unsigned int halfregs = REG_NREGS (operands[1]);
23569 rtx destlo, desthi;
23571 gcc_assert (halfmode == V16QImode);
23573 if (src1 == dest && src2 == dest + halfregs)
23575 /* No-op move. Can't split to nothing; emit something. */
23576 emit_note (NOTE_INSN_DELETED);
23577 return;
23580 /* Preserve register attributes for variable tracking. */
23581 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
23582 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
23583 GET_MODE_SIZE (halfmode));
23585 /* Special case of reversed high/low parts. */
23586 if (reg_overlap_mentioned_p (operands[2], destlo)
23587 && reg_overlap_mentioned_p (operands[1], desthi))
23589 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23590 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
23591 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
23593 else if (!reg_overlap_mentioned_p (operands[2], destlo))
23595 /* Try to avoid unnecessary moves if part of the result
23596 is in the right place already. */
23597 if (src1 != dest)
23598 emit_move_insn (destlo, operands[1]);
23599 if (src2 != dest + halfregs)
23600 emit_move_insn (desthi, operands[2]);
23602 else
23604 if (src2 != dest + halfregs)
23605 emit_move_insn (desthi, operands[2]);
23606 if (src1 != dest)
23607 emit_move_insn (destlo, operands[1]);
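/* The reversed high/low case above swaps two vector registers in place
   with three XORs and no scratch register.  A hypothetical scalar sketch
   of the same trick (illustrative only, not part of GCC):  */
#if 0
static void
example_xor_swap (unsigned &a, unsigned &b)
{
  a ^= b;
  b ^= a;  /* b now holds the original a.  */
  a ^= b;  /* a now holds the original b.  */
}
#endif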
23611 /* vec_perm support. */
23613 struct expand_vec_perm_d
23615 rtx target, op0, op1;
23616 vec_perm_indices perm;
23617 machine_mode vmode;
23618 machine_mode op_mode;
23619 unsigned int vec_flags;
23620 unsigned int op_vec_flags;
23621 bool one_vector_p;
23622 bool testing_p;
23625 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
23627 /* Generate a variable permutation. */
23629 static void
23630 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
23632 machine_mode vmode = GET_MODE (target);
23633 bool one_vector_p = rtx_equal_p (op0, op1);
23635 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
23636 gcc_checking_assert (GET_MODE (op0) == vmode);
23637 gcc_checking_assert (GET_MODE (op1) == vmode);
23638 gcc_checking_assert (GET_MODE (sel) == vmode);
23639 gcc_checking_assert (TARGET_SIMD);
23641 if (one_vector_p)
23643 if (vmode == V8QImode)
23645 /* Expand the argument to a V16QI mode by duplicating it. */
23646 rtx pair = gen_reg_rtx (V16QImode);
23647 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
23648 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23650 else
23652 emit_insn (gen_aarch64_qtbl1v16qi (target, op0, sel));
23655 else
23657 rtx pair;
23659 if (vmode == V8QImode)
23661 pair = gen_reg_rtx (V16QImode);
23662 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
23663 emit_insn (gen_aarch64_qtbl1v8qi (target, pair, sel));
23665 else
23667 pair = gen_reg_rtx (V2x16QImode);
23668 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
23669 emit_insn (gen_aarch64_qtbl2v16qi (target, pair, sel));
23674 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
23675 NELT is the number of elements in the vector. */
23677 void
23678 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
23679 unsigned int nelt)
23681 machine_mode vmode = GET_MODE (target);
23682 bool one_vector_p = rtx_equal_p (op0, op1);
23683 rtx mask;
23685 /* The TBL instruction does not use a modulo index, so we must take care
23686 of that ourselves. */
23687 mask = aarch64_simd_gen_const_vector_dup (vmode,
23688 one_vector_p ? nelt - 1 : 2 * nelt - 1);
23689 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
23691 /* For big-endian, we also need to reverse the index within the vector
23692 (but not which vector). */
23693 if (BYTES_BIG_ENDIAN)
23695 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
23696 if (!one_vector_p)
23697 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
23698 sel = expand_simple_binop (vmode, XOR, sel, mask,
23699 NULL, 0, OPTAB_LIB_WIDEN);
23701 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
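/* A hypothetical scalar model of the index massaging above (illustrative
   only, not part of GCC).  Indices are wrapped with an AND mask so that
   TBL's non-modulo behaviour matches vec_perm semantics, and on
   big-endian the lane index within a vector is reversed with an XOR
   while the choice of vector is preserved.  */
#if 0
static unsigned
example_tbl_index (unsigned sel, unsigned nelt, bool one_vector_p,
                   bool big_endian)
{
  unsigned idx = sel & (one_vector_p ? nelt - 1 : 2 * nelt - 1);
  if (big_endian)
    idx ^= nelt - 1;  /* reverse within the vector only */
  return idx;
}
#endif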
23704 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
23706 static void
23707 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
23709 emit_insn (gen_rtx_SET (target,
23710 gen_rtx_UNSPEC (GET_MODE (target),
23711 gen_rtvec (2, op0, op1), code)));
23714 /* Expand an SVE vec_perm with the given operands. */
23716 void
23717 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
23719 machine_mode data_mode = GET_MODE (target);
23720 machine_mode sel_mode = GET_MODE (sel);
23721 /* Enforced by the pattern condition. */
23722 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
23724 /* Note: vec_perm indices are supposed to wrap when they go beyond the
23725 size of the two value vectors, i.e. the upper bits of the indices
23726 are effectively ignored. SVE TBL instead produces 0 for any
23727 out-of-range indices, so we need to modulo all the vec_perm indices
23728 to ensure they are all in range. */
23729 rtx sel_reg = force_reg (sel_mode, sel);
23731 /* Check if the sel only references the first values vector. */
23732 if (CONST_VECTOR_P (sel)
23733 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
23735 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
23736 return;
23739 /* Check if the two values vectors are the same. */
23740 if (rtx_equal_p (op0, op1))
23742 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
23743 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23744 NULL, 0, OPTAB_DIRECT);
23745 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
23746 return;
23749 /* Run TBL on each value vector and combine the results. */
23751 rtx res0 = gen_reg_rtx (data_mode);
23752 rtx res1 = gen_reg_rtx (data_mode);
23753 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
23754 if (!CONST_VECTOR_P (sel)
23755 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
23757 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
23758 2 * nunits - 1);
23759 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
23760 NULL, 0, OPTAB_DIRECT);
23762 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
23763 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
23764 NULL, 0, OPTAB_DIRECT);
23765 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
23766 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
23767 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
23768 else
23769 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
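/* A hypothetical fixed-length scalar model of the fallback above
   (illustrative only, not part of GCC).  Because SVE TBL yields 0 for
   out-of-range indices, selecting from the concatenation of two vectors
   can be expressed as TBL (op0, sel) | TBL (op1, sel - nelt): exactly
   one of the two lookups is in range and the other contributes 0.  */
#if 0
static unsigned
example_two_vector_tbl (const unsigned *op0, const unsigned *op1,
                        unsigned nelt, unsigned sel)
{
  sel &= 2 * nelt - 1;                            /* wrap as vec_perm requires */
  unsigned from_op0 = sel < nelt ? op0[sel] : 0;
  unsigned shifted = sel - nelt;                  /* wraps for sel < nelt */
  unsigned from_op1 = shifted < nelt ? op1[shifted] : 0;
  return from_op0 | from_op1;
}
#endif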
23772 /* Recognize patterns suitable for the TRN instructions. */
23773 static bool
23774 aarch64_evpc_trn (struct expand_vec_perm_d *d)
23776 HOST_WIDE_INT odd;
23777 poly_uint64 nelt = d->perm.length ();
23778 rtx out, in0, in1;
23779 machine_mode vmode = d->vmode;
23781 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23782 return false;
23784 /* Note that these are little-endian tests.
23785 We correct for big-endian later. */
23786 if (!d->perm[0].is_constant (&odd)
23787 || (odd != 0 && odd != 1)
23788 || !d->perm.series_p (0, 2, odd, 2)
23789 || !d->perm.series_p (1, 2, nelt + odd, 2))
23790 return false;
23792 /* Success! */
23793 if (d->testing_p)
23794 return true;
23796 in0 = d->op0;
23797 in1 = d->op1;
23798 /* We don't need a big-endian lane correction for SVE; see the comment
23799 at the head of aarch64-sve.md for details. */
23800 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23802 std::swap (in0, in1);
23803 odd = !odd;
23805 out = d->target;
23807 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23808 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
23809 return true;
23812 /* Try to re-encode the PERM constant so it combines odd and even elements.
23813 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
23814 We retry with this new constant with the full suite of patterns. */
23815 static bool
23816 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
23818 expand_vec_perm_d newd;
23819 unsigned HOST_WIDE_INT nelt;
23821 if (d->vec_flags != VEC_ADVSIMD)
23822 return false;
23824 /* Get the new mode. Always twice the size of the inner
23825 and half the elements. */
23826 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
23827 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
23828 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
23829 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
23831 if (new_mode == word_mode)
23832 return false;
23834 /* to_constant is safe since this routine is specific to Advanced SIMD
23835 vectors. */
23836 nelt = d->perm.length ().to_constant ();
23838 vec_perm_builder newpermconst;
23839 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
23841 /* Convert the perm constant if we can. Require even, odd as the pairs. */
23842 for (unsigned int i = 0; i < nelt; i += 2)
23844 poly_int64 elt0 = d->perm[i];
23845 poly_int64 elt1 = d->perm[i + 1];
23846 poly_int64 newelt;
23847 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
23848 return false;
23849 newpermconst.quick_push (newelt.to_constant ());
23851 newpermconst.finalize ();
23853 newd.vmode = new_mode;
23854 newd.vec_flags = VEC_ADVSIMD;
23855 newd.op_mode = newd.vmode;
23856 newd.op_vec_flags = newd.vec_flags;
23857 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
23858 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
23859 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
23860 newd.testing_p = d->testing_p;
23861 newd.one_vector_p = d->one_vector_p;
23863 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
23864 return aarch64_expand_vec_perm_const_1 (&newd);
23867 /* Recognize patterns suitable for the UZP instructions. */
23868 static bool
23869 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
23871 HOST_WIDE_INT odd;
23872 rtx out, in0, in1;
23873 machine_mode vmode = d->vmode;
23875 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23876 return false;
23878 /* Note that these are little-endian tests.
23879 We correct for big-endian later. */
23880 if (!d->perm[0].is_constant (&odd)
23881 || (odd != 0 && odd != 1)
23882 || !d->perm.series_p (0, 1, odd, 2))
23883 return false;
23885 /* Success! */
23886 if (d->testing_p)
23887 return true;
23889 in0 = d->op0;
23890 in1 = d->op1;
23891 /* We don't need a big-endian lane correction for SVE; see the comment
23892 at the head of aarch64-sve.md for details. */
23893 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23895 std::swap (in0, in1);
23896 odd = !odd;
23898 out = d->target;
23900 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23901 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
23902 return true;
23905 /* Recognize patterns suitable for the ZIP instructions. */
23906 static bool
23907 aarch64_evpc_zip (struct expand_vec_perm_d *d)
23909 unsigned int high;
23910 poly_uint64 nelt = d->perm.length ();
23911 rtx out, in0, in1;
23912 machine_mode vmode = d->vmode;
23914 if (GET_MODE_UNIT_SIZE (vmode) > 8)
23915 return false;
23917 /* Note that these are little-endian tests.
23918 We correct for big-endian later. */
23919 poly_uint64 first = d->perm[0];
23920 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
23921 || !d->perm.series_p (0, 2, first, 1)
23922 || !d->perm.series_p (1, 2, first + nelt, 1))
23923 return false;
23924 high = maybe_ne (first, 0U);
23926 /* Success! */
23927 if (d->testing_p)
23928 return true;
23930 in0 = d->op0;
23931 in1 = d->op1;
23932 /* We don't need a big-endian lane correction for SVE; see the comment
23933 at the head of aarch64-sve.md for details. */
23934 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
23936 std::swap (in0, in1);
23937 high = !high;
23939 out = d->target;
23941 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
23942 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
23943 return true;
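/* Hypothetical generators for the index series matched by the TRN, UZP
   and ZIP recognizers above (illustrative only, not part of GCC), with
   the second input's elements numbered NELT..2*NELT-1.  For NELT == 4:
     trn odd=0  -> { 0, 4, 2, 6 }    trn odd=1  -> { 1, 5, 3, 7 }
     uzp odd=0  -> { 0, 2, 4, 6 }    uzp odd=1  -> { 1, 3, 5, 7 }
     zip high=0 -> { 0, 4, 1, 5 }    zip high=1 -> { 2, 6, 3, 7 }  */
#if 0
static unsigned
example_trn_index (unsigned i, unsigned nelt, unsigned odd)
{
  return (i & 1) ? nelt + odd + i - 1 : odd + i;
}

static unsigned
example_uzp_index (unsigned i, unsigned /* nelt */, unsigned odd)
{
  return odd + 2 * i;
}

static unsigned
example_zip_index (unsigned i, unsigned nelt, unsigned high)
{
  unsigned first = high ? nelt / 2 : 0;
  return (i & 1) ? first + nelt + (i - 1) / 2 : first + i / 2;
}
#endif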
23946 /* Recognize patterns for the EXT insn. */
23948 static bool
23949 aarch64_evpc_ext (struct expand_vec_perm_d *d)
23951 HOST_WIDE_INT location;
23952 rtx offset;
23954 /* The first element always refers to the first vector.
23955 Check if the extracted indices are increasing by one. */
23956 if (d->vec_flags == VEC_SVE_PRED
23957 || !d->perm[0].is_constant (&location)
23958 || !d->perm.series_p (0, 1, location, 1))
23959 return false;
23961 /* Success! */
23962 if (d->testing_p)
23963 return true;
23965 /* The case where (location == 0) is a no-op for both big- and little-endian,
23966 and is removed by the mid-end at optimization levels -O1 and higher.
23968 We don't need a big-endian lane correction for SVE; see the comment
23969 at the head of aarch64-sve.md for details. */
23970 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
23972 /* After setup, we want the high elements of the first vector (stored
23973 at the LSB end of the register), and the low elements of the second
23974 vector (stored at the MSB end of the register). So swap. */
23975 std::swap (d->op0, d->op1);
23976 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
23977 to_constant () is safe since this is restricted to Advanced SIMD
23978 vectors. */
23979 location = d->perm.length ().to_constant () - location;
23982 offset = GEN_INT (location);
23983 emit_set_insn (d->target,
23984 gen_rtx_UNSPEC (d->vmode,
23985 gen_rtvec (3, d->op0, d->op1, offset),
23986 UNSPEC_EXT));
23987 return true;
23990 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
23991 within each 64-bit, 32-bit or 16-bit granule. */
23993 static bool
23994 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
23996 HOST_WIDE_INT diff;
23997 unsigned int i, size, unspec;
23998 machine_mode pred_mode;
24000 if (d->vec_flags == VEC_SVE_PRED
24001 || !d->one_vector_p
24002 || !d->perm[0].is_constant (&diff)
24003 || !diff)
24004 return false;
24006 if (d->vec_flags & VEC_SVE_DATA)
24007 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
24008 else
24009 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
24010 if (size == 64)
24012 unspec = UNSPEC_REV64;
24013 pred_mode = VNx2BImode;
24015 else if (size == 32)
24017 unspec = UNSPEC_REV32;
24018 pred_mode = VNx4BImode;
24020 else if (size == 16)
24022 unspec = UNSPEC_REV16;
24023 pred_mode = VNx8BImode;
24025 else
24026 return false;
24028 unsigned int step = diff + 1;
24029 for (i = 0; i < step; ++i)
24030 if (!d->perm.series_p (i, step, diff - i, step))
24031 return false;
24033 /* Success! */
24034 if (d->testing_p)
24035 return true;
24037 if (d->vec_flags & VEC_SVE_DATA)
24039 rtx pred = aarch64_ptrue_reg (pred_mode);
24040 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
24041 d->target, pred, d->op0));
24042 return true;
24044 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
24045 emit_set_insn (d->target, src);
24046 return true;
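/* The permutation matched above reverses the elements within each
   granule of STEP == diff + 1 elements; e.g. 16-bit elements in 32-bit
   granules (REV32, STEP == 2) give { 1, 0, 3, 2, 5, 4, ... }.  A
   hypothetical index generator (illustrative only, not part of GCC):  */
#if 0
static unsigned
example_rev_local_index (unsigned i, unsigned step)
{
  return (i / step) * step + (step - 1 - i % step);
}
#endif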
24049 /* Recognize patterns for the REV insn, which reverses elements within
24050 a full vector. */
24052 static bool
24053 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
24055 poly_uint64 nelt = d->perm.length ();
24057 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
24058 return false;
24060 if (!d->perm.series_p (0, 1, nelt - 1, -1))
24061 return false;
24063 /* Success! */
24064 if (d->testing_p)
24065 return true;
24067 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
24068 emit_set_insn (d->target, src);
24069 return true;
24072 static bool
24073 aarch64_evpc_dup (struct expand_vec_perm_d *d)
24075 rtx out = d->target;
24076 rtx in0;
24077 HOST_WIDE_INT elt;
24078 machine_mode vmode = d->vmode;
24079 rtx lane;
24081 if (d->vec_flags == VEC_SVE_PRED
24082 || d->perm.encoding ().encoded_nelts () != 1
24083 || !d->perm[0].is_constant (&elt))
24084 return false;
24086 if ((d->vec_flags & VEC_SVE_DATA)
24087 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
24088 return false;
24090 /* Success! */
24091 if (d->testing_p)
24092 return true;
24094 /* The generic preparation in aarch64_expand_vec_perm_const_1
24095 swaps the operand order and the permute indices if it finds
24096 d->perm[0] to be in the second operand. Thus, we can always
24097 use d->op0 and need not do any extra arithmetic to get the
24098 correct lane number. */
24099 in0 = d->op0;
24100 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
24102 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
24103 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
24104 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
24105 return true;
24108 static bool
24109 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
24111 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
24112 machine_mode vmode = d->vmode;
24114 /* Make sure that the indices are constant. */
24115 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
24116 for (unsigned int i = 0; i < encoded_nelts; ++i)
24117 if (!d->perm[i].is_constant ())
24118 return false;
24120 if (d->testing_p)
24121 return true;
24123 /* Generic code will try constant permutation twice. Once with the
24124 original mode and again with the elements lowered to QImode.
24125 So wait and don't do the selector expansion ourselves. */
24126 if (vmode != V8QImode && vmode != V16QImode)
24127 return false;
24129 /* to_constant is safe since this routine is specific to Advanced SIMD
24130 vectors. */
24131 unsigned int nelt = d->perm.length ().to_constant ();
24132 for (unsigned int i = 0; i < nelt; ++i)
24133 /* If big-endian and two vectors we end up with a weird mixed-endian
24134 mode on NEON. Reverse the index within each word but not the word
24135 itself. to_constant is safe because we checked is_constant above. */
24136 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
24137 ? d->perm[i].to_constant () ^ (nelt - 1)
24138 : d->perm[i].to_constant ());
24140 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
24141 sel = force_reg (vmode, sel);
24143 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
24144 return true;
24147 /* Try to implement D using an SVE TBL instruction. */
24149 static bool
24150 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
24152 unsigned HOST_WIDE_INT nelt;
24154 /* Permuting two variable-length vectors could overflow the
24155 index range. */
24156 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
24157 return false;
24159 if (d->testing_p)
24160 return true;
24162 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
24163 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
24164 if (d->one_vector_p)
24165 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
24166 else
24167 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
24168 return true;
24171 /* Try to implement D using SVE dup instruction. */
24173 static bool
24174 aarch64_evpc_sve_dup (struct expand_vec_perm_d *d)
24176 if (BYTES_BIG_ENDIAN
24177 || !d->one_vector_p
24178 || d->vec_flags != VEC_SVE_DATA
24179 || d->op_vec_flags != VEC_ADVSIMD
24180 || d->perm.encoding ().nelts_per_pattern () != 1
24181 || !known_eq (d->perm.encoding ().npatterns (),
24182 GET_MODE_NUNITS (d->op_mode))
24183 || !known_eq (GET_MODE_BITSIZE (d->op_mode), 128))
24184 return false;
24186 int npatterns = d->perm.encoding ().npatterns ();
24187 for (int i = 0; i < npatterns; i++)
24188 if (!known_eq (d->perm[i], i))
24189 return false;
24191 if (d->testing_p)
24192 return true;
24194 aarch64_expand_sve_dupq (d->target, GET_MODE (d->target), d->op0);
24195 return true;
24198 /* Try to implement D using SVE SEL instruction. */
24200 static bool
24201 aarch64_evpc_sel (struct expand_vec_perm_d *d)
24203 machine_mode vmode = d->vmode;
24204 int unit_size = GET_MODE_UNIT_SIZE (vmode);
24206 if (d->vec_flags != VEC_SVE_DATA
24207 || unit_size > 8)
24208 return false;
24210 int n_patterns = d->perm.encoding ().npatterns ();
24211 poly_int64 vec_len = d->perm.length ();
24213 for (int i = 0; i < n_patterns; ++i)
24214 if (!known_eq (d->perm[i], i)
24215 && !known_eq (d->perm[i], vec_len + i))
24216 return false;
24218 for (int i = n_patterns; i < n_patterns * 2; i++)
24219 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
24220 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
24221 return false;
24223 if (d->testing_p)
24224 return true;
24226 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
24228 /* Build a predicate that is true when op0 elements should be used. */
24229 rtx_vector_builder builder (pred_mode, n_patterns, 2);
24230 for (int i = 0; i < n_patterns * 2; i++)
24232 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
24233 : CONST0_RTX (BImode);
24234 builder.quick_push (elem);
24237 rtx const_vec = builder.build ();
24238 rtx pred = force_reg (pred_mode, const_vec);
24239 /* TARGET = PRED ? OP0 : OP1. */
24240 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
24241 return true;
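/* A hypothetical fixed-length model of the blend pattern recognized
   above (illustrative only, not part of GCC).  Each lane I must come
   either from the first input (index I) or from the second
   (index NELT + I); e.g. { 0, 5, 2, 7 } is a valid blend for NELT == 4.
   The real code checks this on the variable-length encoding instead.  */
#if 0
static bool
example_blend_p (const unsigned *perm, unsigned nelt)
{
  for (unsigned i = 0; i < nelt; i++)
    if (perm[i] != i && perm[i] != nelt + i)
      return false;
  return true;
}
#endif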
24244 /* Recognize patterns suitable for the INS instructions. */
24245 static bool
24246 aarch64_evpc_ins (struct expand_vec_perm_d *d)
24248 machine_mode mode = d->vmode;
24249 unsigned HOST_WIDE_INT nelt;
24251 if (d->vec_flags != VEC_ADVSIMD)
24252 return false;
24254 /* to_constant is safe since this routine is specific to Advanced SIMD
24255 vectors. */
24256 nelt = d->perm.length ().to_constant ();
24257 rtx insv = d->op0;
24259 HOST_WIDE_INT idx = -1;
24261 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24263 HOST_WIDE_INT elt;
24264 if (!d->perm[i].is_constant (&elt))
24265 return false;
24266 if (elt == (HOST_WIDE_INT) i)
24267 continue;
24268 if (idx != -1)
24270 idx = -1;
24271 break;
24273 idx = i;
24276 if (idx == -1)
24278 insv = d->op1;
24279 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
24281 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
24282 continue;
24283 if (idx != -1)
24284 return false;
24285 idx = i;
24288 if (idx == -1)
24289 return false;
24292 if (d->testing_p)
24293 return true;
24295 gcc_assert (idx != -1);
24297 unsigned extractindex = d->perm[idx].to_constant ();
24298 rtx extractv = d->op0;
24299 if (extractindex >= nelt)
24301 extractv = d->op1;
24302 extractindex -= nelt;
24304 gcc_assert (extractindex < nelt);
24306 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
24307 expand_operand ops[5];
24308 create_output_operand (&ops[0], d->target, mode);
24309 create_input_operand (&ops[1], insv, mode);
24310 create_integer_operand (&ops[2], 1 << idx);
24311 create_input_operand (&ops[3], extractv, mode);
24312 create_integer_operand (&ops[4], extractindex);
24313 expand_insn (icode, 5, ops);
24315 return true;
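/* A hypothetical fixed-length model of the pattern recognized above
   (illustrative only, not part of GCC).  The permutation must copy one
   input through unchanged except for exactly one lane, which is inserted
   from an arbitrary element; e.g. { 0, 1, 6, 3 } for NELT == 4 inserts
   element 6 into lane 2.  */
#if 0
static bool
example_ins_p (const unsigned *perm, unsigned nelt)
{
  unsigned not_from_op0 = 0, not_from_op1 = 0;
  for (unsigned i = 0; i < nelt; i++)
    {
      if (perm[i] != i)
        not_from_op0++;
      if (perm[i] != i + nelt)
        not_from_op1++;
    }
  /* Exactly one lane deviates from passing one of the inputs through.  */
  return not_from_op0 == 1 || not_from_op1 == 1;
}
#endif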
24318 static bool
24319 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
24321 gcc_assert (d->op_mode != E_VOIDmode);
24323 /* The pattern matching functions above are written to look for a small
24324 number to begin the sequence (0, 1, N/2). If we begin with an index
24325 from the second operand, we can swap the operands. */
24326 poly_int64 nelt = d->perm.length ();
24327 if (known_ge (d->perm[0], nelt))
24329 d->perm.rotate_inputs (1);
24330 std::swap (d->op0, d->op1);
24333 if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
24334 || d->vec_flags == VEC_SVE_DATA
24335 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
24336 || d->vec_flags == VEC_SVE_PRED)
24337 && known_gt (nelt, 1))
24339 if (d->vmode == d->op_mode)
24341 if (aarch64_evpc_rev_local (d))
24342 return true;
24343 else if (aarch64_evpc_rev_global (d))
24344 return true;
24345 else if (aarch64_evpc_ext (d))
24346 return true;
24347 else if (aarch64_evpc_dup (d))
24348 return true;
24349 else if (aarch64_evpc_zip (d))
24350 return true;
24351 else if (aarch64_evpc_uzp (d))
24352 return true;
24353 else if (aarch64_evpc_trn (d))
24354 return true;
24355 else if (aarch64_evpc_sel (d))
24356 return true;
24357 else if (aarch64_evpc_ins (d))
24358 return true;
24359 else if (aarch64_evpc_reencode (d))
24360 return true;
24362 if (d->vec_flags == VEC_SVE_DATA)
24363 return aarch64_evpc_sve_tbl (d);
24364 else if (d->vec_flags == VEC_ADVSIMD)
24365 return aarch64_evpc_tbl (d);
24367 else
24369 if (aarch64_evpc_sve_dup (d))
24370 return true;
24373 return false;
24376 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
24378 static bool
24379 aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
24380 rtx target, rtx op0, rtx op1,
24381 const vec_perm_indices &sel)
24383 struct expand_vec_perm_d d;
24385 /* Check whether the mask can be applied to a single vector. */
24386 if (sel.ninputs () == 1
24387 || (op0 && rtx_equal_p (op0, op1)))
24388 d.one_vector_p = true;
24389 else if (sel.all_from_input_p (0))
24391 d.one_vector_p = true;
24392 op1 = op0;
24394 else if (sel.all_from_input_p (1))
24396 d.one_vector_p = true;
24397 op0 = op1;
24399 else
24400 d.one_vector_p = false;
24402 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
24403 sel.nelts_per_input ());
24404 d.vmode = vmode;
24405 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
24406 d.op_mode = op_mode;
24407 d.op_vec_flags = aarch64_classify_vector_mode (d.op_mode);
24408 d.target = target;
24409 d.op0 = op0 ? force_reg (op_mode, op0) : NULL_RTX;
24410 if (op0 == op1)
24411 d.op1 = d.op0;
24412 else
24413 d.op1 = op1 ? force_reg (op_mode, op1) : NULL_RTX;
24414 d.testing_p = !target;
24416 if (!d.testing_p)
24417 return aarch64_expand_vec_perm_const_1 (&d);
24419 rtx_insn *last = get_last_insn ();
24420 bool ret = aarch64_expand_vec_perm_const_1 (&d);
24421 gcc_assert (last == get_last_insn ());
24423 return ret;
24425 /* Generate a byte permute mask for a register of mode MODE,
24426 which has NUNITS units. */
24429 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
24431 /* We have to reverse each vector because we don't have
24432 a permuted load that can reverse-load according to ABI rules. */
24433 rtx mask;
24434 rtvec v = rtvec_alloc (16);
24435 unsigned int i, j;
24436 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
24438 gcc_assert (BYTES_BIG_ENDIAN);
24439 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
24441 for (i = 0; i < nunits; i++)
24442 for (j = 0; j < usize; j++)
24443 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
24444 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
24445 return force_reg (V16QImode, mask);
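/* The byte mask built above lists, for each USIZE-byte element, its
   bytes in reverse order, so a vector of 16-bit units yields the byte
   indices { 1, 0, 3, 2, 5, 4, ... }.  A hypothetical scalar sketch of
   the same loop (illustrative only, not part of GCC):  */
#if 0
static void
example_reverse_mask_bytes (unsigned char *mask, unsigned nunits,
                            unsigned usize)
{
  for (unsigned i = 0; i < nunits; i++)
    for (unsigned j = 0; j < usize; j++)
      mask[i * usize + j] = (i + 1) * usize - 1 - j;
}
#endif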
24448 /* Expand an SVE integer comparison using the SVE equivalent of:
24450 (set TARGET (CODE OP0 OP1)). */
24452 void
24453 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
24455 machine_mode pred_mode = GET_MODE (target);
24456 machine_mode data_mode = GET_MODE (op0);
24457 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
24458 op0, op1);
24459 if (!rtx_equal_p (target, res))
24460 emit_move_insn (target, res);
24463 /* Return the UNSPEC_COND_* code for comparison CODE. */
24465 static unsigned int
24466 aarch64_unspec_cond_code (rtx_code code)
24468 switch (code)
24470 case NE:
24471 return UNSPEC_COND_FCMNE;
24472 case EQ:
24473 return UNSPEC_COND_FCMEQ;
24474 case LT:
24475 return UNSPEC_COND_FCMLT;
24476 case GT:
24477 return UNSPEC_COND_FCMGT;
24478 case LE:
24479 return UNSPEC_COND_FCMLE;
24480 case GE:
24481 return UNSPEC_COND_FCMGE;
24482 case UNORDERED:
24483 return UNSPEC_COND_FCMUO;
24484 default:
24485 gcc_unreachable ();
24489 /* Emit:
24491 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24493 where <X> is the operation associated with comparison CODE.
24494 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24496 static void
24497 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
24498 bool known_ptrue_p, rtx op0, rtx op1)
24500 rtx flag = gen_int_mode (known_ptrue_p, SImode);
24501 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
24502 gen_rtvec (4, pred, flag, op0, op1),
24503 aarch64_unspec_cond_code (code));
24504 emit_set_insn (target, unspec);
24507 /* Emit the SVE equivalent of:
24509 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
24510 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
24511 (set TARGET (ior:PRED_MODE TMP1 TMP2))
24513 where <Xi> is the operation associated with comparison CODEi.
24514 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24516 static void
24517 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
24518 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
24520 machine_mode pred_mode = GET_MODE (pred);
24521 rtx tmp1 = gen_reg_rtx (pred_mode);
24522 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
24523 rtx tmp2 = gen_reg_rtx (pred_mode);
24524 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
24525 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
24528 /* Emit the SVE equivalent of:
24530 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
24531 (set TARGET (not TMP))
24533 where <X> is the operation associated with comparison CODE.
24534 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
24536 static void
24537 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
24538 bool known_ptrue_p, rtx op0, rtx op1)
24540 machine_mode pred_mode = GET_MODE (pred);
24541 rtx tmp = gen_reg_rtx (pred_mode);
24542 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
24543 aarch64_emit_unop (target, one_cmpl_optab, tmp);
24546 /* Expand an SVE floating-point comparison using the SVE equivalent of:
24548 (set TARGET (CODE OP0 OP1))
24550 If CAN_INVERT_P is true, the caller can also handle inverted results;
24551 return true if the result is in fact inverted. */
24553 bool
24554 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
24555 rtx op0, rtx op1, bool can_invert_p)
24557 machine_mode pred_mode = GET_MODE (target);
24558 machine_mode data_mode = GET_MODE (op0);
24560 rtx ptrue = aarch64_ptrue_reg (pred_mode);
24561 switch (code)
24563 case UNORDERED:
24564 /* UNORDERED has no immediate form. */
24565 op1 = force_reg (data_mode, op1);
24566 /* fall through */
24567 case LT:
24568 case LE:
24569 case GT:
24570 case GE:
24571 case EQ:
24572 case NE:
24574 /* There is native support for the comparison. */
24575 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24576 return false;
24579 case LTGT:
24580 /* This is a trapping operation (LT or GT). */
24581 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
24582 return false;
24584 case UNEQ:
24585 if (!flag_trapping_math)
24587 /* This would trap for signaling NaNs. */
24588 op1 = force_reg (data_mode, op1);
24589 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
24590 ptrue, true, op0, op1);
24591 return false;
24593 /* fall through */
24594 case UNLT:
24595 case UNLE:
24596 case UNGT:
24597 case UNGE:
24598 if (flag_trapping_math)
24600 /* Work out which elements are ordered. */
24601 rtx ordered = gen_reg_rtx (pred_mode);
24602 op1 = force_reg (data_mode, op1);
24603 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
24604 ptrue, true, op0, op1);
24606 /* Test the opposite condition for the ordered elements,
24607 then invert the result. */
24608 if (code == UNEQ)
24609 code = NE;
24610 else
24611 code = reverse_condition_maybe_unordered (code);
24612 if (can_invert_p)
24614 aarch64_emit_sve_fp_cond (target, code,
24615 ordered, false, op0, op1);
24616 return true;
24618 aarch64_emit_sve_invert_fp_cond (target, code,
24619 ordered, false, op0, op1);
24620 return false;
24622 break;
24624 case ORDERED:
24625 /* ORDERED has no immediate form. */
24626 op1 = force_reg (data_mode, op1);
24627 break;
24629 default:
24630 gcc_unreachable ();
24633 /* There is native support for the inverse comparison. */
24634 code = reverse_condition_maybe_unordered (code);
24635 if (can_invert_p)
24637 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
24638 return true;
24640 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
24641 return false;
24644 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
24645 of the data being selected and CMP_MODE is the mode of the values being
24646 compared. */
24648 void
24649 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
24650 rtx *ops)
24652 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
24653 rtx pred = gen_reg_rtx (pred_mode);
24654 if (FLOAT_MODE_P (cmp_mode))
24656 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
24657 ops[4], ops[5], true))
24658 std::swap (ops[1], ops[2]);
24660 else
24661 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
24663 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
24664 ops[1] = force_reg (data_mode, ops[1]);
24665 /* The "false" value can only be zero if the "true" value is a constant. */
24666 if (register_operand (ops[1], data_mode)
24667 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
24668 ops[2] = force_reg (data_mode, ops[2]);
24670 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
24671 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
24674 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
24675 true. However, due to issues with register allocation it is preferable
24676 to avoid tying integer scalar and FP scalar modes. Executing integer
24677 operations in general registers is better than treating them as scalar
24678 vector operations. This reduces latency and avoids redundant int<->FP
24679 moves. So tie modes if they are either the same class, or vector modes
24680 with other vector modes, vector structs or any scalar mode. */
24682 static bool
24683 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
24685 if ((aarch64_advsimd_partial_struct_mode_p (mode1)
24686 != aarch64_advsimd_partial_struct_mode_p (mode2))
24687 && maybe_gt (GET_MODE_SIZE (mode1), 8)
24688 && maybe_gt (GET_MODE_SIZE (mode2), 8))
24689 return false;
24691 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
24692 return true;
24694 /* We specifically want to allow elements of "structure" modes to
24695 be tieable to the structure. This more general condition allows
24696 other rarer situations too. The reason we don't extend this to
24697 predicate modes is that there are no predicate structure modes
24698 nor any specific instructions for extracting part of a predicate
24699 register. */
24700 if (aarch64_vector_data_mode_p (mode1)
24701 && aarch64_vector_data_mode_p (mode2))
24702 return true;
24704 /* Also allow any scalar modes with vectors. */
24705 if (aarch64_vector_mode_supported_p (mode1)
24706 || aarch64_vector_mode_supported_p (mode2))
24707 return true;
24709 return false;
24712 /* Return a new RTX holding the result of moving POINTER forward by
24713 AMOUNT bytes. */
24715 static rtx
24716 aarch64_move_pointer (rtx pointer, poly_int64 amount)
24718 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
24720 return adjust_automodify_address (pointer, GET_MODE (pointer),
24721 next, amount);
24724 /* Return a new RTX holding the result of moving POINTER forward by the
24725 size of the mode it points to. */
24727 static rtx
24728 aarch64_progress_pointer (rtx pointer)
24730 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
24733 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
24734 MODE bytes. */
24736 static void
24737 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
24738 machine_mode mode)
24740 /* Handle 256-bit memcpy separately. We do this by making two adjacent
24741 V4SImode memory copies so that we can use Q registers. */
24742 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24744 mode = V4SImode;
24745 rtx reg1 = gen_reg_rtx (mode);
24746 rtx reg2 = gen_reg_rtx (mode);
24747 /* "Cast" the pointers to the correct mode. */
24748 *src = adjust_address (*src, mode, 0);
24749 *dst = adjust_address (*dst, mode, 0);
24750 /* Emit the memcpy. */
24751 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
24752 aarch64_progress_pointer (*src)));
24753 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
24754 aarch64_progress_pointer (*dst), reg2));
24755 /* Move the pointers forward. */
24756 *src = aarch64_move_pointer (*src, 32);
24757 *dst = aarch64_move_pointer (*dst, 32);
24758 return;
24761 rtx reg = gen_reg_rtx (mode);
24763 /* "Cast" the pointers to the correct mode. */
24764 *src = adjust_address (*src, mode, 0);
24765 *dst = adjust_address (*dst, mode, 0);
24766 /* Emit the memcpy. */
24767 emit_move_insn (reg, *src);
24768 emit_move_insn (*dst, reg);
24769 /* Move the pointers forward. */
24770 *src = aarch64_progress_pointer (*src);
24771 *dst = aarch64_progress_pointer (*dst);
24774 /* Expand a cpymem using the MOPS extension. OPERANDS are taken
24775 from the cpymem pattern. Return true iff we succeeded. */
24776 static bool
24777 aarch64_expand_cpymem_mops (rtx *operands)
24779 if (!TARGET_MOPS)
24780 return false;
24782 /* All three registers are changed by the instruction, so each one
24783 must be a fresh pseudo. */
24784 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24785 rtx src_addr = copy_to_mode_reg (Pmode, XEXP (operands[1], 0));
24786 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24787 rtx src_mem = replace_equiv_address (operands[1], src_addr);
24788 rtx sz_reg = copy_to_mode_reg (DImode, operands[2]);
24789 emit_insn (gen_aarch64_cpymemdi (dst_mem, src_mem, sz_reg));
24791 return true;
24794 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
24795 we succeed, otherwise return false, indicating that a libcall to
24796 memcpy should be emitted. */
24798 bool
24799 aarch64_expand_cpymem (rtx *operands)
24801 int mode_bits;
24802 rtx dst = operands[0];
24803 rtx src = operands[1];
24804 rtx base;
24805 machine_mode cur_mode = BLKmode;
24807 /* Variable-sized memcpy can go through the MOPS expansion if available. */
24808 if (!CONST_INT_P (operands[2]))
24809 return aarch64_expand_cpymem_mops (operands);
24811 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
24813 /* Try to inline up to 256 bytes or use the MOPS threshold if available. */
24814 unsigned HOST_WIDE_INT max_copy_size
24815 = TARGET_MOPS ? aarch64_mops_memcpy_size_threshold : 256;
24817 bool size_p = optimize_function_for_size_p (cfun);
24819 /* Large constant-sized cpymem should go through MOPS when possible.
24820 It should be a win even for size optimization in the general case.
24821 For speed optimization the choice between MOPS and the SIMD sequence
24822 depends on the size of the copy, rather than number of instructions,
24823 alignment etc. */
24824 if (size > max_copy_size)
24825 return aarch64_expand_cpymem_mops (operands);
24827 int copy_bits = 256;
24829 /* Default to 256-bit LDP/STP on large copies; fall back to 128-bit chunks
24830 for small copies, when SIMD is unavailable, or when 256-bit LDP/STP is slow. */
24831 if (size <= 24
24832 || !TARGET_SIMD
24833 || (aarch64_tune_params.extra_tuning_flags
24834 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
24835 copy_bits = 128;
24837 /* Emit an inline load+store sequence and count the number of operations
24838 involved. We use a simple count of just the loads and stores emitted
24839 rather than rtx_insn count as all the pointer adjustments and reg copying
24840 in this function will get optimized away later in the pipeline. */
24841 start_sequence ();
24842 unsigned nops = 0;
24844 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
24845 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
24847 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
24848 src = adjust_automodify_address (src, VOIDmode, base, 0);
24850 /* Convert size to bits to make the rest of the code simpler. */
24851 int n = size * BITS_PER_UNIT;
24853 while (n > 0)
24855 /* Find the largest mode in which to do the copy without over-reading
24856 or over-writing. */
24857 opt_scalar_int_mode mode_iter;
24858 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
24859 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
24860 cur_mode = mode_iter.require ();
24862 gcc_assert (cur_mode != BLKmode);
24864 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
24866 /* Prefer Q-register accesses for the last bytes. */
24867 if (mode_bits == 128 && copy_bits == 256)
24868 cur_mode = V4SImode;
24870 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
24871 /* A single block copy is 1 load + 1 store. */
24872 nops += 2;
24873 n -= mode_bits;
24875 /* Emit trailing copies using overlapping unaligned accesses
24876 (when !STRICT_ALIGNMENT) - this is smaller and faster. */
24877 if (n > 0 && n < copy_bits / 2 && !STRICT_ALIGNMENT)
24879 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
24880 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
24881 gcc_assert (n_bits <= mode_bits);
24882 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
24883 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
24884 n = n_bits;
24887 rtx_insn *seq = get_insns ();
24888 end_sequence ();
24889 /* The MOPS sequence requires 3 instructions for the memory copy + 1 to
24890 move the constant size into a register. */
24891 unsigned mops_cost = 3 + 1;
24893 /* If MOPS is available at this point we don't consider the libcall as it's
24894 not a win even on code size. At this point only consider MOPS if
24895 optimizing for size. For speed optimizations we will have chosen between
24896 the two based on copy size already. */
24897 if (TARGET_MOPS)
24899 if (size_p && mops_cost < nops)
24900 return aarch64_expand_cpymem_mops (operands);
24901 emit_insn (seq);
24902 return true;
24905 /* A memcpy libcall in the worst case takes 3 instructions to prepare the
24906 arguments + 1 for the call. When MOPS is not available and we're
24907 optimizing for size, a libcall may be preferable. */
24908 unsigned libcall_cost = 4;
24909 if (size_p && libcall_cost < nops)
24910 return false;
24912 emit_insn (seq);
24913 return true;
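/* A hypothetical, simplified model of the chunking performed above for a
   constant-size copy (illustrative only, not part of GCC), ignoring the
   MOPS path and the 256-bit LDP/STP and Q-register preferences.  It
   repeatedly uses the largest power-of-two access that fits and finishes
   with one overlapping access when fewer than COPY_LIMIT_BITS / 2 bits
   remain; a 15-byte copy with a 128-bit limit becomes two 8-byte
   accesses at offsets 0 and 7.  */
#if 0
#include <cstdio>

static void
example_copy_chunks (unsigned size_bytes, unsigned copy_limit_bits)
{
  unsigned offset = 0;
  int n = size_bytes * 8;
  while (n > 0)
    {
      unsigned bits = 8;
      while (bits * 2 <= (unsigned) n && bits * 2 <= copy_limit_bits)
        bits *= 2;
      std::printf ("copy %u bytes at offset %u\n", bits / 8, offset);
      offset += bits / 8;
      n -= bits;
      if (n > 0 && (unsigned) n < copy_limit_bits / 2)
        {
          unsigned last = 8;
          while (last < (unsigned) n)
            last *= 2;                  /* smallest access that covers N */
          offset -= (last - n) / 8;     /* step back so the tail overlaps */
          n = last;
        }
    }
}
#endif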
24916 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
24917 SRC is a register we have created with the duplicated value to be set. */
24918 static void
24919 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
24920 machine_mode mode)
24922 /* If we are storing 128 bits or 256 bits, we can do that straight from
24923 the SIMD register we prepared. */
24924 if (known_eq (GET_MODE_BITSIZE (mode), 256))
24926 mode = GET_MODE (src);
24927 /* "Cast" the *dst to the correct mode. */
24928 *dst = adjust_address (*dst, mode, 0);
24929 /* Emit the memset. */
24930 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
24931 aarch64_progress_pointer (*dst), src));
24933 /* Move the pointers forward. */
24934 *dst = aarch64_move_pointer (*dst, 32);
24935 return;
24937 if (known_eq (GET_MODE_BITSIZE (mode), 128))
24939 /* "Cast" the *dst to the correct mode. */
24940 *dst = adjust_address (*dst, GET_MODE (src), 0);
24941 /* Emit the memset. */
24942 emit_move_insn (*dst, src);
24943 /* Move the pointers forward. */
24944 *dst = aarch64_move_pointer (*dst, 16);
24945 return;
24947 /* For copying less, we have to extract the right amount from src. */
24948 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
24950 /* "Cast" the *dst to the correct mode. */
24951 *dst = adjust_address (*dst, mode, 0);
24952 /* Emit the memset. */
24953 emit_move_insn (*dst, reg);
24954 /* Move the pointer forward. */
24955 *dst = aarch64_progress_pointer (*dst);
24958 /* Expand a setmem using the MOPS instructions. OPERANDS are the same
24959 as for the setmem pattern. Return true iff we succeed. */
24960 static bool
24961 aarch64_expand_setmem_mops (rtx *operands)
24963 if (!TARGET_MOPS)
24964 return false;
24966 /* The first two registers are changed by the instruction, so both
24967 of them must be a fresh pseudo. */
24968 rtx dst_addr = copy_to_mode_reg (Pmode, XEXP (operands[0], 0));
24969 rtx dst_mem = replace_equiv_address (operands[0], dst_addr);
24970 rtx sz_reg = copy_to_mode_reg (DImode, operands[1]);
24971 rtx val = operands[2];
24972 if (val != CONST0_RTX (QImode))
24973 val = force_reg (QImode, val);
24974 emit_insn (gen_aarch64_setmemdi (dst_mem, val, sz_reg));
24975 return true;
24978 /* Expand setmem, as if from a __builtin_memset. Return true if
24979 we succeed, otherwise return false. */
24981 bool
24982 aarch64_expand_setmem (rtx *operands)
24984 int n, mode_bits;
24985 unsigned HOST_WIDE_INT len;
24986 rtx dst = operands[0];
24987 rtx val = operands[2], src;
24988 rtx base;
24989 machine_mode cur_mode = BLKmode, next_mode;
24991 /* If we don't have SIMD registers or the size is variable, use the MOPS
24992 inlined sequence if possible. */
24993 if (!CONST_INT_P (operands[1]) || !TARGET_SIMD)
24994 return aarch64_expand_setmem_mops (operands);
24996 bool size_p = optimize_function_for_size_p (cfun);
24998 /* Default the maximum to 256 bytes when considering only libcall vs
24999 SIMD broadcast sequence. */
25000 unsigned max_set_size = 256;
25002 len = INTVAL (operands[1]);
25003 if (len > max_set_size && !TARGET_MOPS)
25004 return false;
25006 int cst_val = !!(CONST_INT_P (val) && (INTVAL (val) != 0));
25007 /* The MOPS sequence takes:
25008 3 instructions for the memory storing
25009 + 1 to move the constant size into a reg
25010 + 1 if VAL is a non-zero constant to move into a reg
25011 (zero constants can use XZR directly). */
25012 unsigned mops_cost = 3 + 1 + cst_val;
25013 /* A libcall to memset in the worst case takes 3 instructions to prepare
25014 the arguments + 1 for the call. */
25015 unsigned libcall_cost = 4;
25017 /* Upper bound check. For large constant-sized setmem use the MOPS sequence
25018 when available. */
25019 if (TARGET_MOPS
25020 && len >= (unsigned HOST_WIDE_INT) aarch64_mops_memset_size_threshold)
25021 return aarch64_expand_setmem_mops (operands);
25023 /* Attempt a sequence with a vector broadcast followed by stores.
25024 Count the number of operations involved to see if it's worth it
25025 against the alternatives. A simple counter simd_ops on the
25026 algorithmically-relevant operations is used rather than an rtx_insn count
25027 as all the pointer adjustments and mode reinterprets will be optimized
25028 away later. */
25029 start_sequence ();
25030 unsigned simd_ops = 0;
25032 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
25033 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
25035 /* Prepare the val using a DUP/MOVI v0.16B, val. */
25036 src = expand_vector_broadcast (V16QImode, val);
25037 src = force_reg (V16QImode, src);
25038 simd_ops++;
25039 /* Convert len to bits to make the rest of the code simpler. */
25040 n = len * BITS_PER_UNIT;
25042 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
25043 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. */
25044 const int copy_limit = (aarch64_tune_params.extra_tuning_flags
25045 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
25046 ? GET_MODE_BITSIZE (TImode) : 256;
25048 while (n > 0)
25050 /* Find the largest mode in which to do the copy without
25051 over-writing. */
25052 opt_scalar_int_mode mode_iter;
25053 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
25054 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
25055 cur_mode = mode_iter.require ();
25057 gcc_assert (cur_mode != BLKmode);
25059 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
25060 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
25061 simd_ops++;
25062 n -= mode_bits;
25064 /* Do certain trailing copies as overlapping if it's going to be
25065 cheaper, i.e. fewer instructions. For instance, for a 15-byte copy
25066 it's more efficient to do two overlapping 8-byte copies than
25067 8 + 4 + 2 + 1. Only do this when -mstrict-align is not supplied. */
25068 if (n > 0 && n < copy_limit / 2 && !STRICT_ALIGNMENT)
25070 next_mode = smallest_mode_for_size (n, MODE_INT);
25071 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
25072 gcc_assert (n_bits <= mode_bits);
25073 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
25074 n = n_bits;
25077 rtx_insn *seq = get_insns ();
25078 end_sequence ();
25080 if (size_p)
25082 /* When optimizing for size, we have 3 options: the SIMD broadcast sequence,
25083 a call to memset, or the MOPS expansion. */
25084 if (TARGET_MOPS
25085 && mops_cost <= libcall_cost
25086 && mops_cost <= simd_ops)
25087 return aarch64_expand_setmem_mops (operands);
25088 /* If MOPS is not available or not shorter, pick a libcall if the SIMD
25089 sequence is too long. */
25090 else if (libcall_cost < simd_ops)
25091 return false;
25092 emit_insn (seq);
25093 return true;
25096 /* At this point the SIMD broadcast sequence is the best choice when
25097 optimizing for speed. */
25098 emit_insn (seq);
25099 return true;
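/* A hypothetical scalar analogue of the V16QImode broadcast prepared
   above (illustrative only, not part of GCC).  Multiplying the byte
   value by 0x0101010101010101 replicates it into every byte of a 64-bit
   word; the vector expansion achieves the same effect with a DUP or
   MOVI on a Q register.  */
#if 0
#include <cstdint>

static uint64_t
example_broadcast_byte (uint8_t val)
{
  return (uint64_t) val * 0x0101010101010101ULL;
}
#endif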
25103 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
25104 SImode stores. Handle the case when the constant has identical
25105 bottom and top halves. This is beneficial when the two stores can be
25106 merged into an STP and we avoid synthesising potentially expensive
25107 immediates twice. Return true if such a split is possible. */
25109 bool
25110 aarch64_split_dimode_const_store (rtx dst, rtx src)
25112 rtx lo = gen_lowpart (SImode, src);
25113 rtx hi = gen_highpart_mode (SImode, DImode, src);
25115 bool size_p = optimize_function_for_size_p (cfun);
25117 if (!rtx_equal_p (lo, hi))
25118 return false;
25120 unsigned int orig_cost
25121 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
25122 unsigned int lo_cost
25123 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
25125 /* We want to transform:
25126 MOV x1, 49370
25127 MOVK x1, 0x140, lsl 16
25128 MOVK x1, 0xc0da, lsl 32
25129 MOVK x1, 0x140, lsl 48
25130 STR x1, [x0]
25131 into:
25132 MOV w1, 49370
25133 MOVK w1, 0x140, lsl 16
25134 STP w1, w1, [x0]
25135 So we want to perform this only when we save two instructions
25136 or more. When optimizing for size, however, accept any code size
25137 savings we can. */
25138 if (size_p && orig_cost <= lo_cost)
25139 return false;
25141 if (!size_p
25142 && (orig_cost <= lo_cost + 1))
25143 return false;
25145 rtx mem_lo = adjust_address (dst, SImode, 0);
25146 if (!aarch64_mem_pair_operand (mem_lo, SImode))
25147 return false;
25149 rtx tmp_reg = gen_reg_rtx (SImode);
25150 aarch64_expand_mov_immediate (tmp_reg, lo);
25151 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
25152 /* Don't emit an explicit store pair as this may not always be profitable.
25153 Let the sched-fusion logic decide whether to merge them. */
25154 emit_move_insn (mem_lo, tmp_reg);
25155 emit_move_insn (mem_hi, tmp_reg);
25157 return true;
25160 /* Generate RTL for a conditional branch with rtx comparison CODE in
25161 mode CC_MODE. The destination of the unlikely conditional branch
25162 is LABEL_REF. */
25164 void
25165 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
25166 rtx label_ref)
25168 rtx x;
25169 x = gen_rtx_fmt_ee (code, VOIDmode,
25170 gen_rtx_REG (cc_mode, CC_REGNUM),
25171 const0_rtx);
25173 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
25174 gen_rtx_LABEL_REF (VOIDmode, label_ref),
25175 pc_rtx);
25176 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
25179 /* Generate DImode scratch registers for 128-bit (TImode) addition.
25181 OP1 represents the TImode input operand 1
25182 OP2 represents the TImode input operand 2
25183 LOW_DEST represents the low half (DImode) of TImode operand 0
25184 LOW_IN1 represents the low half (DImode) of TImode operand 1
25185 LOW_IN2 represents the low half (DImode) of TImode operand 2
25186 HIGH_DEST represents the high half (DImode) of TImode operand 0
25187 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25188 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
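/* As an illustrative sketch only (the exact patterns live in aarch64.md),
   the TImode addition built on these scratch registers is typically emitted
   as an ADDS on the low DImode halves followed by an ADC on the high halves,
   e.g.
     adds x0, x2, x4
     adc  x1, x3, x5
   where the register choices are purely for illustration. */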
25190 void
25191 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25192 rtx *low_in1, rtx *low_in2,
25193 rtx *high_dest, rtx *high_in1,
25194 rtx *high_in2)
25196 *low_dest = gen_reg_rtx (DImode);
25197 *low_in1 = gen_lowpart (DImode, op1);
25198 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25199 subreg_lowpart_offset (DImode, TImode));
25200 *high_dest = gen_reg_rtx (DImode);
25201 *high_in1 = gen_highpart (DImode, op1);
25202 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25203 subreg_highpart_offset (DImode, TImode));
25206 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
25208 This function differs from 'aarch64_addti_scratch_regs' in that
25209 OP1 can be an immediate constant (zero). We must call
25210 subreg_highpart_offset with DImode and TImode arguments, otherwise
25211 VOIDmode will be used for the const_int, which triggers an internal
25212 error from subreg_size_highpart_offset, which does not expect a size of zero.
25214 OP1 represents the TImode input operand 1
25215 OP2 represents the TImode input operand 2
25216 LOW_DEST represents the low half (DImode) of TImode operand 0
25217 LOW_IN1 represents the low half (DImode) of TImode operand 1
25218 LOW_IN2 represents the low half (DImode) of TImode operand 2
25219 HIGH_DEST represents the high half (DImode) of TImode operand 0
25220 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25221 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
25224 void
25225 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
25226 rtx *low_in1, rtx *low_in2,
25227 rtx *high_dest, rtx *high_in1,
25228 rtx *high_in2)
25230 *low_dest = gen_reg_rtx (DImode);
25231 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
25232 subreg_lowpart_offset (DImode, TImode));
25234 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
25235 subreg_lowpart_offset (DImode, TImode));
25236 *high_dest = gen_reg_rtx (DImode);
25238 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
25239 subreg_highpart_offset (DImode, TImode));
25240 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
25241 subreg_highpart_offset (DImode, TImode));
25244 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
25246 OP0 represents the TImode destination operand 0
25247 LOW_DEST represents the low half (DImode) of TImode operand 0
25248 LOW_IN1 represents the low half (DImode) of TImode operand 1
25249 LOW_IN2 represents the low half (DImode) of TImode operand 2
25250 HIGH_DEST represents the high half (DImode) of TImode operand 0
25251 HIGH_IN1 represents the high half (DImode) of TImode operand 1
25252 HIGH_IN2 represents the high half (DImode) of TImode operand 2
25253 UNSIGNED_P is true if the operation is being performed on unsigned
25254 values. */
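/* Illustrative sketch only (the md patterns define the exact RTL): the
   subtraction is emitted as a SUBS on the low DImode halves followed by an
   SBCS on the high halves, with the overflow or borrow check keyed off the
   final flags, e.g.
     subs x0, x2, x4
     sbcs x1, x3, x5
   using example registers. */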
25255 void
25256 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
25257 rtx low_in2, rtx high_dest, rtx high_in1,
25258 rtx high_in2, bool unsigned_p)
25260 if (low_in2 == const0_rtx)
25262 low_dest = low_in1;
25263 high_in2 = force_reg (DImode, high_in2);
25264 if (unsigned_p)
25265 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
25266 else
25267 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
25269 else
25271 if (aarch64_plus_immediate (low_in2, DImode))
25272 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
25273 GEN_INT (-UINTVAL (low_in2))));
25274 else
25276 low_in2 = force_reg (DImode, low_in2);
25277 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
25279 high_in2 = force_reg (DImode, high_in2);
25281 if (unsigned_p)
25282 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
25283 else
25284 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
25287 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
25288 emit_move_insn (gen_highpart (DImode, op0), high_dest);
25292 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
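/* Note: AddressSanitizer forms a shadow address roughly as
   (addr >> 3) + <offset>, so the values below just have to place the shadow
   region in an otherwise unused part of the address space for the given ABI
   (a general note on intent, not a formal specification of the layout). */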
25294 static unsigned HOST_WIDE_INT
25295 aarch64_asan_shadow_offset (void)
25297 if (TARGET_ILP32)
25298 return (HOST_WIDE_INT_1 << 29);
25299 else
25300 return (HOST_WIDE_INT_1 << 36);
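/* Implement the TARGET_GEN_CCMP_FIRST hook (see the #define further below).
   Expand the first comparison of a conditional-compare chain: the preparation
   and compare insns are recorded in *PREP_SEQ and *GEN_SEQ, and the function
   returns an rtx describing the resulting CC-register comparison, or NULL_RTX
   if the comparison cannot be handled. */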
25303 static rtx
25304 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
25305 int code, tree treeop0, tree treeop1)
25307 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25308 rtx op0, op1;
25309 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25310 insn_code icode;
25311 struct expand_operand ops[4];
25313 start_sequence ();
25314 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25316 op_mode = GET_MODE (op0);
25317 if (op_mode == VOIDmode)
25318 op_mode = GET_MODE (op1);
25320 switch (op_mode)
25322 case E_QImode:
25323 case E_HImode:
25324 case E_SImode:
25325 cmp_mode = SImode;
25326 icode = CODE_FOR_cmpsi;
25327 break;
25329 case E_DImode:
25330 cmp_mode = DImode;
25331 icode = CODE_FOR_cmpdi;
25332 break;
25334 case E_SFmode:
25335 cmp_mode = SFmode;
25336 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25337 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
25338 break;
25340 case E_DFmode:
25341 cmp_mode = DFmode;
25342 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
25343 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
25344 break;
25346 default:
25347 end_sequence ();
25348 return NULL_RTX;
25351 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
25352 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
25353 if (!op0 || !op1)
25355 end_sequence ();
25356 return NULL_RTX;
25358 *prep_seq = get_insns ();
25359 end_sequence ();
25361 create_fixed_operand (&ops[0], op0);
25362 create_fixed_operand (&ops[1], op1);
25364 start_sequence ();
25365 if (!maybe_expand_insn (icode, 2, ops))
25367 end_sequence ();
25368 return NULL_RTX;
25370 *gen_seq = get_insns ();
25371 end_sequence ();
25373 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
25374 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
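/* Implement the TARGET_GEN_CCMP_NEXT hook (see the #define further below).
   Expand a subsequent comparison in the chain, conditional on the result of
   PREV, using a CCMP (or CCMP-reverse) pattern. BIT_CODE is AND or IOR and
   describes how this comparison combines with PREV. Returns the new
   comparison rtx, or NULL_RTX on failure. */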
25377 static rtx
25378 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
25379 int cmp_code, tree treeop0, tree treeop1, int bit_code)
25381 rtx op0, op1, target;
25382 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
25383 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
25384 insn_code icode;
25385 struct expand_operand ops[6];
25386 int aarch64_cond;
25388 push_to_sequence (*prep_seq);
25389 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
25391 op_mode = GET_MODE (op0);
25392 if (op_mode == VOIDmode)
25393 op_mode = GET_MODE (op1);
25395 switch (op_mode)
25397 case E_QImode:
25398 case E_HImode:
25399 case E_SImode:
25400 cmp_mode = SImode;
25401 break;
25403 case E_DImode:
25404 cmp_mode = DImode;
25405 break;
25407 case E_SFmode:
25408 cmp_mode = SFmode;
25409 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25410 break;
25412 case E_DFmode:
25413 cmp_mode = DFmode;
25414 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
25415 break;
25417 default:
25418 end_sequence ();
25419 return NULL_RTX;
25422 icode = code_for_ccmp (cc_mode, cmp_mode);
25424 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
25425 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
25426 if (!op0 || !op1)
25428 end_sequence ();
25429 return NULL_RTX;
25431 *prep_seq = get_insns ();
25432 end_sequence ();
25434 target = gen_rtx_REG (cc_mode, CC_REGNUM);
25435 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
25437 if (bit_code != AND)
25439 /* Treat the ccmp patterns as canonical and use them where possible,
25440 but fall back to ccmp_rev patterns if there's no other option. */
25441 rtx_code prev_code = GET_CODE (prev);
25442 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
25443 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
25444 && !(prev_code == EQ
25445 || prev_code == NE
25446 || prev_code == ORDERED
25447 || prev_code == UNORDERED))
25448 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
25449 else
25451 rtx_code code = reverse_condition (prev_code);
25452 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
25454 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
25457 create_fixed_operand (&ops[0], XEXP (prev, 0));
25458 create_fixed_operand (&ops[1], target);
25459 create_fixed_operand (&ops[2], op0);
25460 create_fixed_operand (&ops[3], op1);
25461 create_fixed_operand (&ops[4], prev);
25462 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
25464 push_to_sequence (*gen_seq);
25465 if (!maybe_expand_insn (icode, 6, ops))
25467 end_sequence ();
25468 return NULL_RTX;
25471 *gen_seq = get_insns ();
25472 end_sequence ();
25474 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
25477 #undef TARGET_GEN_CCMP_FIRST
25478 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
25480 #undef TARGET_GEN_CCMP_NEXT
25481 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
25483 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
25484 instruction fusion of some sort. */
25486 static bool
25487 aarch64_macro_fusion_p (void)
25489 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
25493 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
25494 should be kept together during scheduling. */
25496 static bool
25497 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
25499 rtx set_dest;
25500 rtx prev_set = single_set (prev);
25501 rtx curr_set = single_set (curr);
25502 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
25503 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
25505 if (!aarch64_macro_fusion_p ())
25506 return false;
25508 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
25510 /* We are trying to match:
25511 prev (mov) == (set (reg r0) (const_int imm16))
25512 curr (movk) == (set (zero_extract (reg r0)
25513 (const_int 16)
25514 (const_int 16))
25515 (const_int imm16_1)) */
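/* In assembly terms this keeps immediate-building pairs such as
     mov  w0, #0x1234
     movk w0, #0x5678, lsl #16
   adjacent, so that cores which fuse MOV/MOVK can combine them
   (illustrative operands only). */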
25517 set_dest = SET_DEST (curr_set);
25519 if (GET_CODE (set_dest) == ZERO_EXTRACT
25520 && CONST_INT_P (SET_SRC (curr_set))
25521 && CONST_INT_P (SET_SRC (prev_set))
25522 && CONST_INT_P (XEXP (set_dest, 2))
25523 && INTVAL (XEXP (set_dest, 2)) == 16
25524 && REG_P (XEXP (set_dest, 0))
25525 && REG_P (SET_DEST (prev_set))
25526 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
25528 return true;
25532 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
25535 /* We're trying to match:
25536 prev (adrp) == (set (reg r1)
25537 (high (symbol_ref ("SYM"))))
25538 curr (add) == (set (reg r0)
25539 (lo_sum (reg r1)
25540 (symbol_ref ("SYM"))))
25541 Note that r0 need not necessarily be the same as r1, especially
25542 during pre-regalloc scheduling. */
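/* I.e. keep address-forming pairs such as
     adrp x1, SYM
     add  x0, x1, :lo12:SYM
   together (illustrative registers only). */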
25544 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25545 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25547 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
25548 && REG_P (XEXP (SET_SRC (curr_set), 0))
25549 && REGNO (XEXP (SET_SRC (curr_set), 0))
25550 == REGNO (SET_DEST (prev_set))
25551 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
25552 XEXP (SET_SRC (curr_set), 1)))
25553 return true;
25557 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
25560 /* We're trying to match:
25561 prev (movk) == (set (zero_extract (reg r0)
25562 (const_int 16)
25563 (const_int 32))
25564 (const_int imm16_1))
25565 curr (movk) == (set (zero_extract (reg r0)
25566 (const_int 16)
25567 (const_int 48))
25568 (const_int imm16_2)) */
25570 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
25571 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
25572 && REG_P (XEXP (SET_DEST (prev_set), 0))
25573 && REG_P (XEXP (SET_DEST (curr_set), 0))
25574 && REGNO (XEXP (SET_DEST (prev_set), 0))
25575 == REGNO (XEXP (SET_DEST (curr_set), 0))
25576 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
25577 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
25578 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
25579 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
25580 && CONST_INT_P (SET_SRC (prev_set))
25581 && CONST_INT_P (SET_SRC (curr_set)))
25582 return true;
25585 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
25587 /* We're trying to match:
25588 prev (adrp) == (set (reg r0)
25589 (high (symbol_ref ("SYM"))))
25590 curr (ldr) == (set (reg r1)
25591 (mem (lo_sum (reg r0)
25592 (symbol_ref ("SYM")))))
25594 curr (ldr) == (set (reg r1)
25595 (zero_extend (mem
25596 (lo_sum (reg r0)
25597 (symbol_ref ("SYM")))))) */
25598 if (satisfies_constraint_Ush (SET_SRC (prev_set))
25599 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
25601 rtx curr_src = SET_SRC (curr_set);
25603 if (GET_CODE (curr_src) == ZERO_EXTEND)
25604 curr_src = XEXP (curr_src, 0);
25606 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
25607 && REG_P (XEXP (XEXP (curr_src, 0), 0))
25608 && REGNO (XEXP (XEXP (curr_src, 0), 0))
25609 == REGNO (SET_DEST (prev_set))
25610 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
25611 XEXP (SET_SRC (prev_set), 0)))
25612 return true;
25616 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
25617 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
25618 && prev_set && curr_set && any_condjump_p (curr)
25619 && GET_CODE (SET_SRC (prev_set)) == COMPARE
25620 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
25621 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
25622 return true;
25624 /* Fuse flag-setting ALU instructions and conditional branch. */
25625 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
25626 && any_condjump_p (curr))
25628 unsigned int condreg1, condreg2;
25629 rtx cc_reg_1;
25630 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
25631 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
25633 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
25634 && prev
25635 && modified_in_p (cc_reg_1, prev))
25637 enum attr_type prev_type = get_attr_type (prev);
25639 /* FIXME: this misses some cases that are considered simple arithmetic
25640 instructions for ThunderX. Simple shifts are missed here. */
25641 if (prev_type == TYPE_ALUS_SREG
25642 || prev_type == TYPE_ALUS_IMM
25643 || prev_type == TYPE_LOGICS_REG
25644 || prev_type == TYPE_LOGICS_IMM)
25645 return true;
25649 /* Fuse ALU instructions and CBZ/CBNZ. */
25650 if (prev_set
25651 && curr_set
25652 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
25653 && any_condjump_p (curr))
25655 /* We're trying to match:
25656 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
25657 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
25658 (const_int 0))
25659 (label_ref ("SYM"))
25660 (pc)) */
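/* At the assembly level this corresponds to keeping pairs such as
     add  x0, x0, x1
     cbnz x0, .Ltarget
   adjacent (illustrative operands only). */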
25661 if (SET_DEST (curr_set) == (pc_rtx)
25662 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
25663 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
25664 && REG_P (SET_DEST (prev_set))
25665 && REGNO (SET_DEST (prev_set))
25666 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
25668 /* Fuse ALU operations followed by conditional branch instruction. */
25669 switch (get_attr_type (prev))
25671 case TYPE_ALU_IMM:
25672 case TYPE_ALU_SREG:
25673 case TYPE_ADC_REG:
25674 case TYPE_ADC_IMM:
25675 case TYPE_ADCS_REG:
25676 case TYPE_ADCS_IMM:
25677 case TYPE_LOGIC_REG:
25678 case TYPE_LOGIC_IMM:
25679 case TYPE_CSEL:
25680 case TYPE_ADR:
25681 case TYPE_MOV_IMM:
25682 case TYPE_SHIFT_REG:
25683 case TYPE_SHIFT_IMM:
25684 case TYPE_BFM:
25685 case TYPE_RBIT:
25686 case TYPE_REV:
25687 case TYPE_EXTEND:
25688 return true;
25690 default:;
25695 /* Fuse A+B+1 and A-B-1 */
25696 if (simple_sets_p
25697 && aarch64_fusion_enabled_p (AARCH64_FUSE_ADDSUB_2REG_CONST1))
25699 /* We're trying to match:
25700 prev == (set (r0) (plus (r0) (r1)))
25701 curr == (set (r0) (plus (r0) (const_int 1)))
25703 prev == (set (r0) (minus (r0) (r1)))
25704 curr == (set (r0) (plus (r0) (const_int -1))) */
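/* At the assembly level this covers pairs such as
     add x0, x0, x1
     add x0, x0, #1
   and the analogous
     sub x0, x0, x1
     sub x0, x0, #1
   (illustrative operands only; the RTL above represents the second insn's
   constant as +1 or -1). */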
25706 rtx prev_src = SET_SRC (prev_set);
25707 rtx curr_src = SET_SRC (curr_set);
25709 int polarity = 1;
25710 if (GET_CODE (prev_src) == MINUS)
25711 polarity = -1;
25713 if (GET_CODE (curr_src) == PLUS
25714 && (GET_CODE (prev_src) == PLUS || GET_CODE (prev_src) == MINUS)
25715 && CONST_INT_P (XEXP (curr_src, 1))
25716 && INTVAL (XEXP (curr_src, 1)) == polarity
25717 && REG_P (XEXP (curr_src, 0))
25718 && REG_P (SET_DEST (prev_set))
25719 && REGNO (SET_DEST (prev_set)) == REGNO (XEXP (curr_src, 0)))
25720 return true;
25723 return false;
25726 /* Return true iff the instruction fusion described by OP is enabled. */
25728 bool
25729 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
25731 return (aarch64_tune_params.fusible_ops & op) != 0;
25734 /* If MEM is in the form [base+offset], extract the two parts
25735 of the address and store them in BASE and OFFSET; otherwise return false
25736 after clearing BASE and OFFSET. */
25738 bool
25739 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
25741 rtx addr;
25743 gcc_assert (MEM_P (mem));
25745 addr = XEXP (mem, 0);
25747 if (REG_P (addr))
25749 *base = addr;
25750 *offset = const0_rtx;
25751 return true;
25754 if (GET_CODE (addr) == PLUS
25755 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
25757 *base = XEXP (addr, 0);
25758 *offset = XEXP (addr, 1);
25759 return true;
25762 *base = NULL_RTX;
25763 *offset = NULL_RTX;
25765 return false;
25768 /* Types for scheduling fusion. */
25769 enum sched_fusion_type
25771 SCHED_FUSION_NONE = 0,
25772 SCHED_FUSION_LD_SIGN_EXTEND,
25773 SCHED_FUSION_LD_ZERO_EXTEND,
25774 SCHED_FUSION_LD,
25775 SCHED_FUSION_ST,
25776 SCHED_FUSION_NUM
25779 /* If INSN is a load or store whose address is in the form [base+offset],
25780 extract the two parts and store them in BASE and OFFSET. Return the
25781 scheduling fusion type of this INSN. */
25783 static enum sched_fusion_type
25784 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
25786 rtx x, dest, src;
25787 enum sched_fusion_type fusion = SCHED_FUSION_LD;
25789 gcc_assert (INSN_P (insn));
25790 x = PATTERN (insn);
25791 if (GET_CODE (x) != SET)
25792 return SCHED_FUSION_NONE;
25794 src = SET_SRC (x);
25795 dest = SET_DEST (x);
25797 machine_mode dest_mode = GET_MODE (dest);
25799 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
25800 return SCHED_FUSION_NONE;
25802 if (GET_CODE (src) == SIGN_EXTEND)
25804 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
25805 src = XEXP (src, 0);
25806 if (!MEM_P (src) || GET_MODE (src) != SImode)
25807 return SCHED_FUSION_NONE;
25809 else if (GET_CODE (src) == ZERO_EXTEND)
25811 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
25812 src = XEXP (src, 0);
25813 if (!MEM_P (src) || GET_MODE (src) != SImode)
25814 return SCHED_FUSION_NONE;
25817 if (MEM_P (src) && REG_P (dest))
25818 extract_base_offset_in_addr (src, base, offset);
25819 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
25821 fusion = SCHED_FUSION_ST;
25822 extract_base_offset_in_addr (dest, base, offset);
25824 else
25825 return SCHED_FUSION_NONE;
25827 if (*base == NULL_RTX || *offset == NULL_RTX)
25828 fusion = SCHED_FUSION_NONE;
25830 return fusion;
25833 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
25835 Currently we only support fusing ldr and str instructions, so FUSION_PRI
25836 and PRI are only calculated for these instructions. For other instructions,
25837 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
25838 types of instruction fusion can be added by returning different priorities.
25840 It's important that irrelevant instructions get the largest FUSION_PRI. */
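/* For example, two SImode loads from [x0, #8] and [x0, #12] receive the same
   FUSION_PRI (same fusion type and base register) but different PRI, with the
   smaller offset getting the higher priority so it is scheduled first (a
   worked illustration of the scheme described above). */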
25842 static void
25843 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
25844 int *fusion_pri, int *pri)
25846 int tmp, off_val;
25847 rtx base, offset;
25848 enum sched_fusion_type fusion;
25850 gcc_assert (INSN_P (insn));
25852 tmp = max_pri - 1;
25853 fusion = fusion_load_store (insn, &base, &offset);
25854 if (fusion == SCHED_FUSION_NONE)
25856 *pri = tmp;
25857 *fusion_pri = tmp;
25858 return;
25861 /* Set FUSION_PRI according to fusion type and base register. */
25862 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
25864 /* Calculate PRI. */
25865 tmp /= 2;
25867 /* INSN with smaller offset goes first. */
25868 off_val = (int)(INTVAL (offset));
25869 if (off_val >= 0)
25870 tmp -= (off_val & 0xfffff);
25871 else
25872 tmp += ((- off_val) & 0xfffff);
25874 *pri = tmp;
25875 return;
25878 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
25879 Adjust priority of sha1h instructions so they are scheduled before
25880 other SHA1 instructions. */
25882 static int
25883 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
25885 rtx x = PATTERN (insn);
25887 if (GET_CODE (x) == SET)
25889 x = SET_SRC (x);
25891 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
25892 return priority + 10;
25895 return priority;
25898 /* If REVERSED is null, return true if memory reference *MEM2 comes
25899 immediately after memory reference *MEM1. Do not change the references
25900 in this case.
25902 Otherwise, check if *MEM1 and *MEM2 are consecutive memory references and,
25903 if they are, try to make them use constant offsets from the same base
25904 register. Return true on success. When returning true, set *REVERSED
25905 to true if *MEM1 comes after *MEM2, false if *MEM1 comes before *MEM2. */
25906 static bool
25907 aarch64_check_consecutive_mems (rtx *mem1, rtx *mem2, bool *reversed)
25909 if (reversed)
25910 *reversed = false;
25912 if (GET_RTX_CLASS (GET_CODE (XEXP (*mem1, 0))) == RTX_AUTOINC
25913 || GET_RTX_CLASS (GET_CODE (XEXP (*mem2, 0))) == RTX_AUTOINC)
25914 return false;
25916 if (!MEM_SIZE_KNOWN_P (*mem1) || !MEM_SIZE_KNOWN_P (*mem2))
25917 return false;
25919 auto size1 = MEM_SIZE (*mem1);
25920 auto size2 = MEM_SIZE (*mem2);
25922 rtx base1, base2, offset1, offset2;
25923 extract_base_offset_in_addr (*mem1, &base1, &offset1);
25924 extract_base_offset_in_addr (*mem2, &base2, &offset2);
25926 /* Make sure at least one memory is in base+offset form. */
25927 if (!(base1 && offset1) && !(base2 && offset2))
25928 return false;
25930 /* If both mems already use the same base register, just check the
25931 offsets. */
25932 if (base1 && base2 && rtx_equal_p (base1, base2))
25934 if (!offset1 || !offset2)
25935 return false;
25937 if (known_eq (UINTVAL (offset1) + size1, UINTVAL (offset2)))
25938 return true;
25940 if (known_eq (UINTVAL (offset2) + size2, UINTVAL (offset1)) && reversed)
25942 *reversed = true;
25943 return true;
25946 return false;
25949 /* Otherwise, check whether the MEM_EXPRs and MEM_OFFSETs together
25950 guarantee that the values are consecutive. */
25951 if (MEM_EXPR (*mem1)
25952 && MEM_EXPR (*mem2)
25953 && MEM_OFFSET_KNOWN_P (*mem1)
25954 && MEM_OFFSET_KNOWN_P (*mem2))
25956 poly_int64 expr_offset1;
25957 poly_int64 expr_offset2;
25958 tree expr_base1 = get_addr_base_and_unit_offset (MEM_EXPR (*mem1),
25959 &expr_offset1);
25960 tree expr_base2 = get_addr_base_and_unit_offset (MEM_EXPR (*mem2),
25961 &expr_offset2);
25962 if (!expr_base1
25963 || !expr_base2
25964 || !DECL_P (expr_base1)
25965 || !operand_equal_p (expr_base1, expr_base2, OEP_ADDRESS_OF))
25966 return false;
25968 expr_offset1 += MEM_OFFSET (*mem1);
25969 expr_offset2 += MEM_OFFSET (*mem2);
25971 if (known_eq (expr_offset1 + size1, expr_offset2))
25973 else if (known_eq (expr_offset2 + size2, expr_offset1) && reversed)
25974 *reversed = true;
25975 else
25976 return false;
25978 if (reversed)
25980 if (base2)
25982 rtx addr1 = plus_constant (Pmode, XEXP (*mem2, 0),
25983 expr_offset1 - expr_offset2);
25984 *mem1 = replace_equiv_address_nv (*mem1, addr1);
25986 else
25988 rtx addr2 = plus_constant (Pmode, XEXP (*mem1, 0),
25989 expr_offset2 - expr_offset1);
25990 *mem2 = replace_equiv_address_nv (*mem2, addr2);
25993 return true;
25996 return false;
25999 /* Return true if MEM1 and MEM2 can be combined into a single access
26000 of mode MODE, with the combined access having the same address as MEM1. */
26002 bool
26003 aarch64_mergeable_load_pair_p (machine_mode mode, rtx mem1, rtx mem2)
26005 if (STRICT_ALIGNMENT && MEM_ALIGN (mem1) < GET_MODE_ALIGNMENT (mode))
26006 return false;
26007 return aarch64_check_consecutive_mems (&mem1, &mem2, nullptr);
26010 /* Given OPERANDS of consecutive load/store, check if we can merge
26011 them into ldp/stp. LOAD is true if they are load instructions.
26012 MODE is the mode of memory operands. */
26014 bool
26015 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
26016 machine_mode mode)
26018 enum reg_class rclass_1, rclass_2;
26019 rtx mem_1, mem_2, reg_1, reg_2;
26021 /* Allow the tuning structure to disable LDP instruction formation
26022 from combining instructions (e.g., in peephole2).
26023 TODO: Implement fine-grained tuning control for LDP and STP:
26024 1. control policies for load and store separately;
26025 2. support the following policies:
26026 - default (use what is in the tuning structure)
26027 - always
26028 - never
26029 - aligned (only if the compiler can prove that the
26030 load will be aligned to 2 * element_size) */
26031 if (load && (aarch64_tune_params.extra_tuning_flags
26032 & AARCH64_EXTRA_TUNE_NO_LDP_COMBINE))
26033 return false;
26035 if (load)
26037 mem_1 = operands[1];
26038 mem_2 = operands[3];
26039 reg_1 = operands[0];
26040 reg_2 = operands[2];
26041 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
26042 if (REGNO (reg_1) == REGNO (reg_2))
26043 return false;
26044 if (reg_overlap_mentioned_p (reg_1, mem_2))
26045 return false;
26047 else
26049 mem_1 = operands[0];
26050 mem_2 = operands[2];
26051 reg_1 = operands[1];
26052 reg_2 = operands[3];
26055 /* The mems cannot be volatile. */
26056 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
26057 return false;
26059 /* If we have SImode and slow unaligned ldp,
26060 check that the alignment is at least 8 bytes. */
26061 if (mode == SImode
26062 && (aarch64_tune_params.extra_tuning_flags
26063 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26064 && !optimize_size
26065 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
26066 return false;
26068 /* Check if the addresses are in the form of [base+offset]. */
26069 bool reversed = false;
26070 if (!aarch64_check_consecutive_mems (&mem_1, &mem_2, &reversed))
26071 return false;
26073 /* The operands must be of the same size. */
26074 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
26075 GET_MODE_SIZE (GET_MODE (mem_2))));
26077 /* One of the memory accesses must be a mempair operand.
26078 If it is not the first one, they need to be swapped by the
26079 peephole. */
26080 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
26081 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
26082 return false;
26084 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
26085 rclass_1 = FP_REGS;
26086 else
26087 rclass_1 = GENERAL_REGS;
26089 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
26090 rclass_2 = FP_REGS;
26091 else
26092 rclass_2 = GENERAL_REGS;
26094 /* Check if the registers are of the same class. */
26095 if (rclass_1 != rclass_2)
26096 return false;
26098 return true;
26101 /* Given OPERANDS of consecutive load/store that can be merged,
26102 swap them if they are not in ascending order. */
26103 void
26104 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
26106 int mem_op = load ? 1 : 0;
26107 bool reversed = false;
26108 if (!aarch64_check_consecutive_mems (operands + mem_op,
26109 operands + mem_op + 2, &reversed))
26110 gcc_unreachable ();
26112 if (reversed)
26114 /* Irrespective of whether this is a load or a store,
26115 we do the same swap. */
26116 std::swap (operands[0], operands[2]);
26117 std::swap (operands[1], operands[3]);
26121 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
26122 comparison between the two. */
26124 aarch64_host_wide_int_compare (const void *x, const void *y)
26126 return wi::cmps (* ((const HOST_WIDE_INT *) x),
26127 * ((const HOST_WIDE_INT *) y));
26130 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
26131 other pointing to a REG rtx containing an offset, compare the offsets
26132 of the two pairs.
26134 Return:
26136 1 iff offset (X) > offset (Y)
26137 0 iff offset (X) == offset (Y)
26138 -1 iff offset (X) < offset (Y) */
26140 aarch64_ldrstr_offset_compare (const void *x, const void *y)
26142 const rtx * operands_1 = (const rtx *) x;
26143 const rtx * operands_2 = (const rtx *) y;
26144 rtx mem_1, mem_2, base, offset_1, offset_2;
26146 if (MEM_P (operands_1[0]))
26147 mem_1 = operands_1[0];
26148 else
26149 mem_1 = operands_1[1];
26151 if (MEM_P (operands_2[0]))
26152 mem_2 = operands_2[0];
26153 else
26154 mem_2 = operands_2[1];
26156 /* Extract the offsets. */
26157 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26158 extract_base_offset_in_addr (mem_2, &base, &offset_2);
26160 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
26162 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
26165 /* Given OPERANDS of consecutive load/store, check if we can merge
26166 them into ldp/stp by adjusting the offset. LOAD is true if they
26167 are load instructions. MODE is the mode of memory operands.
26169 Given below consecutive stores:
26171 str w1, [xb, 0x100]
26172 str w1, [xb, 0x104]
26173 str w1, [xb, 0x108]
26174 str w1, [xb, 0x10c]
26176 Though the offsets are out of the range supported by stp, we can
26177 still pair them after adjusting the offset, like:
26179 add scratch, xb, 0x100
26180 stp w1, w1, [scratch]
26181 stp w1, w1, [scratch, 0x8]
26183 The peephole patterns detecting this opportunity should guarantee
26184 the scratch register is available. */
26186 bool
26187 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
26188 machine_mode mode)
26190 const int num_insns = 4;
26191 enum reg_class rclass;
26192 HOST_WIDE_INT offvals[num_insns], msize;
26193 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
26195 if (load)
26197 for (int i = 0; i < num_insns; i++)
26199 reg[i] = operands[2 * i];
26200 mem[i] = operands[2 * i + 1];
26202 gcc_assert (REG_P (reg[i]));
26205 /* Do not attempt to merge the loads if the loads clobber each other. */
26206 for (int i = 0; i < 8; i += 2)
26207 for (int j = i + 2; j < 8; j += 2)
26208 if (reg_overlap_mentioned_p (operands[i], operands[j]))
26209 return false;
26211 else
26212 for (int i = 0; i < num_insns; i++)
26214 mem[i] = operands[2 * i];
26215 reg[i] = operands[2 * i + 1];
26218 /* Skip if memory operand is by itself valid for ldp/stp. */
26219 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
26220 return false;
26222 for (int i = 0; i < num_insns; i++)
26224 /* The mems cannot be volatile. */
26225 if (MEM_VOLATILE_P (mem[i]))
26226 return false;
26228 /* Check if the addresses are in the form of [base+offset]. */
26229 extract_base_offset_in_addr (mem[i], base + i, offset + i);
26230 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
26231 return false;
26234 /* Check if the registers are of the same class. */
26235 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
26236 ? FP_REGS : GENERAL_REGS;
26238 for (int i = 1; i < num_insns; i++)
26239 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
26241 if (rclass != FP_REGS)
26242 return false;
26244 else
26246 if (rclass != GENERAL_REGS)
26247 return false;
26250 /* Only the last register in the order in which they occur
26251 may be clobbered by the load. */
26252 if (rclass == GENERAL_REGS && load)
26253 for (int i = 0; i < num_insns - 1; i++)
26254 if (reg_mentioned_p (reg[i], mem[i]))
26255 return false;
26257 /* Check if the bases are the same. */
26258 for (int i = 0; i < num_insns - 1; i++)
26259 if (!rtx_equal_p (base[i], base[i + 1]))
26260 return false;
26262 for (int i = 0; i < num_insns; i++)
26263 offvals[i] = INTVAL (offset[i]);
26265 msize = GET_MODE_SIZE (mode).to_constant ();
26267 /* Check if the offsets can be put in the right order to do a ldp/stp. */
26268 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
26269 aarch64_host_wide_int_compare);
26271 if (!(offvals[1] == offvals[0] + msize
26272 && offvals[3] == offvals[2] + msize))
26273 return false;
26275 /* Check that offsets are within range of each other. The ldp/stp
26276 instructions have 7-bit immediate offsets, so use 0x80. */
26277 if (offvals[2] - offvals[0] >= msize * 0x80)
26278 return false;
26280 /* The offsets must be aligned with respect to each other. */
26281 if (offvals[0] % msize != offvals[2] % msize)
26282 return false;
26284 /* If we have SImode and slow unaligned ldp,
26285 check that the alignment is at least 8 bytes. */
26286 if (mode == SImode
26287 && (aarch64_tune_params.extra_tuning_flags
26288 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
26289 && !optimize_size
26290 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
26291 return false;
26293 return true;
26296 /* Given OPERANDS of consecutive load/store, this function pairs them
26297 into LDP/STP after adjusting the offset. It depends on the fact
26298 that the operands can be sorted so the offsets are correct for STP.
26299 MODE is the mode of memory operands. CODE is the rtl operator
26300 which should be applied to all memory operands, it's SIGN_EXTEND,
26301 ZERO_EXTEND or UNKNOWN. */
26303 bool
26304 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
26305 machine_mode mode, RTX_CODE code)
26307 rtx base, offset_1, offset_3, t1, t2;
26308 rtx mem_1, mem_2, mem_3, mem_4;
26309 rtx temp_operands[8];
26310 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
26311 stp_off_upper_limit, stp_off_lower_limit, msize;
26313 /* We make changes on a copy as we may still bail out. */
26314 for (int i = 0; i < 8; i ++)
26315 temp_operands[i] = operands[i];
26317 /* Sort the operands. Note for cases as below:
26318 [base + 0x310] = A
26319 [base + 0x320] = B
26320 [base + 0x330] = C
26321 [base + 0x320] = D
26322 We need stable sorting, otherwise wrong data may be stored to offset 0x320.
26323 Also note that the dead store in the above case should be optimized away, but
26324 there are no guarantees here. */
26325 gcc_stablesort (temp_operands, 4, 2 * sizeof (rtx *),
26326 aarch64_ldrstr_offset_compare);
26328 /* Copy the memory operands so that if we have to bail for some
26329 reason the original addresses are unchanged. */
26330 if (load)
26332 mem_1 = copy_rtx (temp_operands[1]);
26333 mem_2 = copy_rtx (temp_operands[3]);
26334 mem_3 = copy_rtx (temp_operands[5]);
26335 mem_4 = copy_rtx (temp_operands[7]);
26337 else
26339 mem_1 = copy_rtx (temp_operands[0]);
26340 mem_2 = copy_rtx (temp_operands[2]);
26341 mem_3 = copy_rtx (temp_operands[4]);
26342 mem_4 = copy_rtx (temp_operands[6]);
26343 gcc_assert (code == UNKNOWN);
26346 extract_base_offset_in_addr (mem_1, &base, &offset_1);
26347 extract_base_offset_in_addr (mem_3, &base, &offset_3);
26348 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
26349 && offset_3 != NULL_RTX);
26351 /* Adjust offset so it can fit in LDP/STP instruction. */
26352 msize = GET_MODE_SIZE (mode).to_constant ();
26353 stp_off_upper_limit = msize * (0x40 - 1);
26354 stp_off_lower_limit = - msize * 0x40;
26356 off_val_1 = INTVAL (offset_1);
26357 off_val_3 = INTVAL (offset_3);
26359 /* The base offset is optimally halfway between the two STP/LDP offsets. */
26360 if (msize <= 4)
26361 base_off = (off_val_1 + off_val_3) / 2;
26362 else
26363 /* However, due to issues with negative LDP/STP offset generation for
26364 larger modes (DF, DD, DI and vector modes), we must not use negative
26365 addresses smaller than what 9 signed unadjusted bits can store. This
26366 provides the most range in this case. */
26367 base_off = off_val_1;
26369 /* Adjust the base so that it is aligned with the addresses but still
26370 optimal. */
26371 if (base_off % msize != off_val_1 % msize)
26372 /* Fix the offset, bearing in mind we want to make it bigger not
26373 smaller. */
26374 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26375 else if (msize <= 4)
26376 /* The negative range of LDP/STP is one larger than the positive range. */
26377 base_off += msize;
26379 /* Check if base offset is too big or too small. We can attempt to resolve
26380 this issue by setting it to the maximum value and seeing if the offsets
26381 still fit. */
26382 if (base_off >= 0x1000)
26384 base_off = 0x1000 - 1;
26385 /* We must still make sure that the base offset is aligned with respect
26386 to the address. But it may not be made any bigger. */
26387 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26390 /* Likewise for the case where the base is too small. */
26391 if (base_off <= -0x1000)
26393 base_off = -0x1000 + 1;
26394 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
26397 /* Offset of the first STP/LDP. */
26398 new_off_1 = off_val_1 - base_off;
26400 /* Offset of the second STP/LDP. */
26401 new_off_3 = off_val_3 - base_off;
26403 /* The offsets must be within the range of the LDP/STP instructions. */
26404 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
26405 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
26406 return false;
26408 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
26409 new_off_1), true);
26410 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
26411 new_off_1 + msize), true);
26412 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
26413 new_off_3), true);
26414 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
26415 new_off_3 + msize), true);
26417 if (!aarch64_mem_pair_operand (mem_1, mode)
26418 || !aarch64_mem_pair_operand (mem_3, mode))
26419 return false;
26421 if (code == ZERO_EXTEND)
26423 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
26424 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
26425 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
26426 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
26428 else if (code == SIGN_EXTEND)
26430 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
26431 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
26432 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
26433 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
26436 if (load)
26438 operands[0] = temp_operands[0];
26439 operands[1] = mem_1;
26440 operands[2] = temp_operands[2];
26441 operands[3] = mem_2;
26442 operands[4] = temp_operands[4];
26443 operands[5] = mem_3;
26444 operands[6] = temp_operands[6];
26445 operands[7] = mem_4;
26447 else
26449 operands[0] = mem_1;
26450 operands[1] = temp_operands[1];
26451 operands[2] = mem_2;
26452 operands[3] = temp_operands[3];
26453 operands[4] = mem_3;
26454 operands[5] = temp_operands[5];
26455 operands[6] = mem_4;
26456 operands[7] = temp_operands[7];
26459 /* Emit adjusting instruction. */
26460 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
26461 /* Emit ldp/stp instructions. */
26462 t1 = gen_rtx_SET (operands[0], operands[1]);
26463 t2 = gen_rtx_SET (operands[2], operands[3]);
26464 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26465 t1 = gen_rtx_SET (operands[4], operands[5]);
26466 t2 = gen_rtx_SET (operands[6], operands[7]);
26467 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
26468 return true;
26471 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
26472 it isn't worth branching around empty masked ops (including masked
26473 stores). */
26475 static bool
26476 aarch64_empty_mask_is_expensive (unsigned)
26478 return false;
26481 /* Return true if a pseudo register should be created and used to hold
26482 the GOT address for PIC code. */
26484 bool
26485 aarch64_use_pseudo_pic_reg (void)
26487 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
26490 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
26492 static int
26493 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
26495 switch (XINT (x, 1))
26497 case UNSPEC_GOTSMALLPIC:
26498 case UNSPEC_GOTSMALLPIC28K:
26499 case UNSPEC_GOTTINYPIC:
26500 return 0;
26501 default:
26502 break;
26505 return default_unspec_may_trap_p (x, flags);
26509 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
26510 return the log2 of that value. Otherwise return -1. */
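/* For example, 4.0 yields 2 and 1.0 yields 0, while 3.0, 0.5 and negative or
   non-integer values yield -1. */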
26513 aarch64_fpconst_pow_of_2 (rtx x)
26515 const REAL_VALUE_TYPE *r;
26517 if (!CONST_DOUBLE_P (x))
26518 return -1;
26520 r = CONST_DOUBLE_REAL_VALUE (x);
26522 if (REAL_VALUE_NEGATIVE (*r)
26523 || REAL_VALUE_ISNAN (*r)
26524 || REAL_VALUE_ISINF (*r)
26525 || !real_isinteger (r, DFmode))
26526 return -1;
26528 return exact_log2 (real_to_integer (r));
26531 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
26532 power of 2 (i.e. 1/2^n), return the exponent n, e.g. for x == (1/2^n)
26533 return n. Otherwise return -1. */
26536 aarch64_fpconst_pow2_recip (rtx x)
26538 REAL_VALUE_TYPE r0;
26540 if (!CONST_DOUBLE_P (x))
26541 return -1;
26543 r0 = *CONST_DOUBLE_REAL_VALUE (x);
26544 if (exact_real_inverse (DFmode, &r0)
26545 && !REAL_VALUE_NEGATIVE (r0))
26547 int ret = exact_log2 (real_to_integer (&r0));
26548 if (ret >= 1 && ret <= 32)
26549 return ret;
26551 return -1;
26554 /* If X is a vector of equal CONST_DOUBLE values and that value is
26555 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
26558 aarch64_vec_fpconst_pow_of_2 (rtx x)
26560 int nelts;
26561 if (!CONST_VECTOR_P (x)
26562 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
26563 return -1;
26565 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
26566 return -1;
26568 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
26569 if (firstval <= 0)
26570 return -1;
26572 for (int i = 1; i < nelts; i++)
26573 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
26574 return -1;
26576 return firstval;
26579 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
26580 to float.
26582 __fp16 always promotes through this hook.
26583 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
26584 through the generic excess precision logic rather than here. */
26586 static tree
26587 aarch64_promoted_type (const_tree t)
26589 if (SCALAR_FLOAT_TYPE_P (t)
26590 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
26591 return float_type_node;
26593 return NULL_TREE;
26596 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
26598 static bool
26599 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
26600 optimization_type opt_type)
26602 switch (op)
26604 case rsqrt_optab:
26605 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
26607 default:
26608 return true;
26612 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
26614 static unsigned int
26615 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
26616 int *offset)
26618 /* Polynomial invariant 1 == (VG / 2) - 1. */
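/* For example, with 256-bit SVE vectors VG (the vector length in 64-bit
   granules) is 4, so this indeterminate evaluates to (4 / 2) - 1 = 1. */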
26619 gcc_assert (i == 1);
26620 *factor = 2;
26621 *offset = 1;
26622 return AARCH64_DWARF_VG;
26625 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
26626 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
26628 static bool
26629 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
26631 return ((mode == HFmode || mode == BFmode)
26632 ? true
26633 : default_libgcc_floating_mode_supported_p (mode));
26636 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
26637 if MODE is [BH]Fmode, and punt to the generic implementation otherwise. */
26639 static bool
26640 aarch64_scalar_mode_supported_p (scalar_mode mode)
26642 if (DECIMAL_FLOAT_MODE_P (mode))
26643 return default_decimal_float_supported_p ();
26645 return ((mode == HFmode || mode == BFmode)
26646 ? true
26647 : default_scalar_mode_supported_p (mode));
26650 /* Set the value of FLT_EVAL_METHOD.
26651 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
26653 0: evaluate all operations and constants, whose semantic type has at
26654 most the range and precision of type float, to the range and
26655 precision of float; evaluate all other operations and constants to
26656 the range and precision of the semantic type;
26658 N, where _FloatN is a supported interchange floating type
26659 evaluate all operations and constants, whose semantic type has at
26660 most the range and precision of _FloatN type, to the range and
26661 precision of the _FloatN type; evaluate all other operations and
26662 constants to the range and precision of the semantic type;
26664 If we have the ARMv8.2-A extensions then we support _Float16 in native
26665 precision, so we should set this to 16. Otherwise, we support the type,
26666 but want to evaluate expressions in float precision, so set this to
26667 0. */
26669 static enum flt_eval_method
26670 aarch64_excess_precision (enum excess_precision_type type)
26672 switch (type)
26674 case EXCESS_PRECISION_TYPE_FAST:
26675 case EXCESS_PRECISION_TYPE_STANDARD:
26676 /* We can calculate either in 16-bit range and precision or
26677 32-bit range and precision. Make that decision based on whether
26678 we have native support for the ARMv8.2-A 16-bit floating-point
26679 instructions or not. */
26680 return (TARGET_FP_F16INST
26681 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
26682 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
26683 case EXCESS_PRECISION_TYPE_IMPLICIT:
26684 case EXCESS_PRECISION_TYPE_FLOAT16:
26685 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
26686 default:
26687 gcc_unreachable ();
26689 return FLT_EVAL_METHOD_UNPREDICTABLE;
26692 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
26693 scheduled for speculative execution. Reject the long-running division
26694 and square-root instructions. */
26696 static bool
26697 aarch64_sched_can_speculate_insn (rtx_insn *insn)
26699 switch (get_attr_type (insn))
26701 case TYPE_SDIV:
26702 case TYPE_UDIV:
26703 case TYPE_FDIVS:
26704 case TYPE_FDIVD:
26705 case TYPE_FSQRTS:
26706 case TYPE_FSQRTD:
26707 case TYPE_NEON_FP_SQRT_S:
26708 case TYPE_NEON_FP_SQRT_D:
26709 case TYPE_NEON_FP_SQRT_S_Q:
26710 case TYPE_NEON_FP_SQRT_D_Q:
26711 case TYPE_NEON_FP_DIV_S:
26712 case TYPE_NEON_FP_DIV_D:
26713 case TYPE_NEON_FP_DIV_S_Q:
26714 case TYPE_NEON_FP_DIV_D_Q:
26715 return false;
26716 default:
26717 return true;
26721 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
26723 static int
26724 aarch64_compute_pressure_classes (reg_class *classes)
26726 int i = 0;
26727 classes[i++] = GENERAL_REGS;
26728 classes[i++] = FP_REGS;
26729 /* PR_REGS isn't a useful pressure class because many predicate pseudo
26730 registers need to go in PR_LO_REGS at some point during their
26731 lifetime. Splitting it into two halves has the effect of making
26732 all predicates count against PR_LO_REGS, so that we try whenever
26733 possible to restrict the number of live predicates to 8. This
26734 greatly reduces the amount of spilling in certain loops. */
26735 classes[i++] = PR_LO_REGS;
26736 classes[i++] = PR_HI_REGS;
26737 return i;
26740 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
26742 static bool
26743 aarch64_can_change_mode_class (machine_mode from,
26744 machine_mode to, reg_class_t)
26746 unsigned int from_flags = aarch64_classify_vector_mode (from);
26747 unsigned int to_flags = aarch64_classify_vector_mode (to);
26749 bool from_sve_p = (from_flags & VEC_ANY_SVE);
26750 bool to_sve_p = (to_flags & VEC_ANY_SVE);
26752 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
26753 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
26755 bool from_pred_p = (from_flags & VEC_SVE_PRED);
26756 bool to_pred_p = (to_flags & VEC_SVE_PRED);
26758 bool to_partial_advsimd_struct_p = (to_flags == (VEC_ADVSIMD | VEC_STRUCT
26759 | VEC_PARTIAL));
26760 bool from_partial_advsimd_struct_p = (from_flags == (VEC_ADVSIMD | VEC_STRUCT
26761 | VEC_PARTIAL));
26763 /* Don't allow changes between predicate modes and other modes.
26764 Only predicate registers can hold predicate modes and only
26765 non-predicate registers can hold non-predicate modes, so any
26766 attempt to mix them would require a round trip through memory. */
26767 if (from_pred_p != to_pred_p)
26768 return false;
26770 /* Don't allow changes between partial SVE modes and other modes.
26771 The contents of partial SVE modes are distributed evenly across
26772 the register, whereas GCC expects them to be clustered together. */
26773 if (from_partial_sve_p != to_partial_sve_p)
26774 return false;
26776 /* Similarly reject changes between partial SVE modes that have
26777 different patterns of significant and insignificant bits. */
26778 if (from_partial_sve_p
26779 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
26780 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
26781 return false;
26783 /* If exactly one of the modes is a partial Advanced SIMD structure mode,
26784 only allow the change when neither mode is larger than 64 bits. */
26785 if ((to_partial_advsimd_struct_p ^ from_partial_advsimd_struct_p)
26786 && (known_gt (GET_MODE_SIZE (to), 8) || known_gt (GET_MODE_SIZE (from), 8)))
26787 return false;
26789 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
26791 /* Don't allow changes between SVE modes and other modes that might
26792 be bigger than 128 bits. In particular, OImode, CImode and XImode
26793 divide into 128-bit quantities while SVE modes divide into
26794 BITS_PER_SVE_VECTOR quantities. */
26795 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
26796 return false;
26797 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
26798 return false;
26801 if (BYTES_BIG_ENDIAN)
26803 /* Don't allow changes between SVE data modes and non-SVE modes.
26804 See the comment at the head of aarch64-sve.md for details. */
26805 if (from_sve_p != to_sve_p)
26806 return false;
26808 /* Don't allow changes in element size: lane 0 of the new vector
26809 would not then be lane 0 of the old vector. See the comment
26810 above aarch64_maybe_expand_sve_subreg_move for a more detailed
26811 description.
26813 In the worst case, this forces a register to be spilled in
26814 one mode and reloaded in the other, which handles the
26815 endianness correctly. */
26816 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
26817 return false;
26819 return true;
26822 /* Implement TARGET_EARLY_REMAT_MODES. */
26824 static void
26825 aarch64_select_early_remat_modes (sbitmap modes)
26827 /* SVE values are not normally live across a call, so it should be
26828 worth doing early rematerialization even in VL-specific mode. */
26829 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
26830 if (aarch64_sve_mode_p ((machine_mode) i))
26831 bitmap_set_bit (modes, i);
26834 /* Override the default target speculation_safe_value. */
26835 static rtx
26836 aarch64_speculation_safe_value (machine_mode mode,
26837 rtx result, rtx val, rtx failval)
26839 /* Maybe we should warn if falling back to hard barriers. They are
26840 likely to be noticeably more expensive than the alternative below. */
26841 if (!aarch64_track_speculation)
26842 return default_speculation_safe_value (mode, result, val, failval);
26844 if (!REG_P (val))
26845 val = copy_to_mode_reg (mode, val);
26847 if (!aarch64_reg_or_zero (failval, mode))
26848 failval = copy_to_mode_reg (mode, failval);
26850 emit_insn (gen_despeculate_copy (mode, result, val, failval));
26851 return result;
26854 /* Implement TARGET_ESTIMATED_POLY_VALUE.
26855 Look into the tuning structure for an estimate.
26856 KIND specifies the type of requested estimate: min, max or likely.
26857 For cores with a known SVE width all three estimates are the same.
26858 For generic SVE tuning we want to distinguish the maximum estimate from
26859 the minimum and likely ones.
26860 The likely estimate is the same as the minimum in that case to give a
26861 conservative behavior of auto-vectorizing with SVE when it is a win
26862 even for 128-bit SVE.
26863 When SVE width information is available VAL.coeffs[1] is multiplied by
26864 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
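/* Worked example: for a core with sve_width == 256, a poly_int64 value of
   16 + 16x (16 bytes plus 16 for each 128-bit chunk beyond the first) is
   estimated as 16 + 16 * (256 - 128) / 128 = 32. */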
26866 static HOST_WIDE_INT
26867 aarch64_estimated_poly_value (poly_int64 val,
26868 poly_value_estimate_kind kind
26869 = POLY_VALUE_LIKELY)
26871 unsigned int width_source = aarch64_tune_params.sve_width;
26873 /* If there is no core-specific information then the minimum and likely
26874 values are based on 128-bit vectors and the maximum is based on
26875 the architectural maximum of 2048 bits. */
26876 if (width_source == SVE_SCALABLE)
26877 switch (kind)
26879 case POLY_VALUE_MIN:
26880 case POLY_VALUE_LIKELY:
26881 return val.coeffs[0];
26882 case POLY_VALUE_MAX:
26883 return val.coeffs[0] + val.coeffs[1] * 15;
26886 /* Allow sve_width to be a bitmask of different VL, treating the lowest
26887 as likely. This could be made more general if future -mtune options
26888 need it to be. */
26889 if (kind == POLY_VALUE_MAX)
26890 width_source = 1 << floor_log2 (width_source);
26891 else
26892 width_source = least_bit_hwi (width_source);
26894 /* If the core provides width information, use that. */
26895 HOST_WIDE_INT over_128 = width_source - 128;
26896 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
26900 /* Return true for types that could be supported as SIMD return or
26901 argument types. */
26903 static bool
26904 supported_simd_type (tree t)
26906 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
26908 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
26909 return s == 1 || s == 2 || s == 4 || s == 8;
26911 return false;
26914 /* Return true for types that currently are supported as SIMD return
26915 or argument types. */
26917 static bool
26918 currently_supported_simd_type (tree t, tree b)
26920 if (COMPLEX_FLOAT_TYPE_P (t))
26921 return false;
26923 if (TYPE_SIZE (t) != TYPE_SIZE (b))
26924 return false;
26926 return supported_simd_type (t);
26929 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
26931 static int
26932 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
26933 struct cgraph_simd_clone *clonei,
26934 tree base_type, int num,
26935 bool explicit_p)
26937 tree t, ret_type;
26938 unsigned int elt_bits, count;
26939 unsigned HOST_WIDE_INT const_simdlen;
26940 poly_uint64 vec_bits;
26942 if (!TARGET_SIMD)
26943 return 0;
26945 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
26946 constant simdlens here. */
26947 if (maybe_ne (clonei->simdlen, 0U)
26948 && clonei->simdlen.is_constant (&const_simdlen)
26949 && (const_simdlen < 2
26950 || const_simdlen > 1024
26951 || (const_simdlen & (const_simdlen - 1)) != 0))
26953 if (explicit_p)
26954 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26955 "unsupported simdlen %wd", const_simdlen);
26956 return 0;
26959 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
26960 if (TREE_CODE (ret_type) != VOID_TYPE
26961 && !currently_supported_simd_type (ret_type, base_type))
26963 if (!explicit_p)
26965 else if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
26966 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26967 "GCC does not currently support mixed size types "
26968 "for %<simd%> functions");
26969 else if (supported_simd_type (ret_type))
26970 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26971 "GCC does not currently support return type %qT "
26972 "for %<simd%> functions", ret_type);
26973 else
26974 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26975 "unsupported return type %qT for %<simd%> functions",
26976 ret_type);
26977 return 0;
26980 int i;
26981 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
26982 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
26984 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
26985 t && t != void_list_node; t = TREE_CHAIN (t), i++)
26987 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
26989 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
26990 && !currently_supported_simd_type (arg_type, base_type))
26992 if (!explicit_p)
26994 else if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
26995 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
26996 "GCC does not currently support mixed size types "
26997 "for %<simd%> functions");
26998 else
26999 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27000 "GCC does not currently support argument type %qT "
27001 "for %<simd%> functions", arg_type);
27002 return 0;
27006 clonei->vecsize_mangle = 'n';
27007 clonei->mask_mode = VOIDmode;
27008 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
27009 if (known_eq (clonei->simdlen, 0U))
27011 count = 2;
27012 vec_bits = (num == 0 ? 64 : 128);
27013 clonei->simdlen = exact_div (vec_bits, elt_bits);
27015 else
27017 count = 1;
27018 vec_bits = clonei->simdlen * elt_bits;
27019       /* For now, SVE simdclones won't produce an illegal simdlen, so only check
27020  const simdlens here.  */
27021 if (clonei->simdlen.is_constant (&const_simdlen)
27022 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
27024 if (explicit_p)
27025 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
27026 "GCC does not currently support simdlen %wd for "
27027 "type %qT",
27028 const_simdlen, base_type);
27029 return 0;
27032 clonei->vecsize_int = vec_bits;
27033 clonei->vecsize_float = vec_bits;
27034 return count;
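/* Worked example (hypothetical user code): a declaration such as

       __attribute__ ((simd)) float do_scale (float x);

   reaches this hook with BASE_TYPE float, so elt_bits == 32.  With no
   explicit simdlen two clones are produced: NUM == 0 uses 64-bit vectors
   (simdlen 2) and NUM == 1 uses 128-bit vectors (simdlen 4).  Requesting an
   explicit simdlen of 8 (e.g. via #pragma omp declare simd simdlen(8))
   would give vec_bits == 256 and be rejected with the "GCC does not
   currently support simdlen" warning above.  */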
27037 /* Implement TARGET_SIMD_CLONE_ADJUST. */
27039 static void
27040 aarch64_simd_clone_adjust (struct cgraph_node *node)
27042 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
27043 use the correct ABI. */
27045 tree t = TREE_TYPE (node->decl);
27046 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
27047 TYPE_ATTRIBUTES (t));
27050 /* Implement TARGET_SIMD_CLONE_USABLE. */
27052 static int
27053 aarch64_simd_clone_usable (struct cgraph_node *node)
27055 switch (node->simdclone->vecsize_mangle)
27057 case 'n':
27058 if (!TARGET_SIMD)
27059 return -1;
27060 return 0;
27061 default:
27062 gcc_unreachable ();
27066 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
27068 static int
27069 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
27071 auto check_attr = [&](const char *name) {
27072 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
27073 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
27074 if (!attr1 && !attr2)
27075 return true;
27077 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
27080 if (!check_attr ("aarch64_vector_pcs"))
27081 return 0;
27082 if (!check_attr ("Advanced SIMD type"))
27083 return 0;
27084 if (!check_attr ("SVE type"))
27085 return 0;
27086 if (!check_attr ("SVE sizeless type"))
27087 return 0;
27088 return 1;
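/* For example, given the hypothetical declarations

       void f (void) __attribute__ ((aarch64_vector_pcs));
       void g (void);

   the two function types compare as incompatible here, because
   check_attr ("aarch64_vector_pcs") finds the attribute on only one of
   them; the Advanced SIMD and SVE type attributes are handled the same
   way.  */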
27091 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
27093 static const char *
27094 aarch64_get_multilib_abi_name (void)
27096 if (TARGET_BIG_END)
27097 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
27098 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
27101 /* Implement TARGET_STACK_PROTECT_GUARD.  For a
27102    global-variable-based guard, use the default; otherwise
27103    return a null tree.  */
27104 static tree
27105 aarch64_stack_protect_guard (void)
27107 if (aarch64_stack_protector_guard == SSP_GLOBAL)
27108 return default_stack_protect_guard ();
27110 return NULL_TREE;
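/* For example, with the default -mstack-protector-guard=global this returns
   the usual global guard variable via default_stack_protect_guard, whereas
   with -mstack-protector-guard=sysreg it returns NULL_TREE and the guard
   value is instead loaded from the system register selected by
   -mstack-protector-guard-reg=.  */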
27113 /* Return the diagnostic message string if the binary operation OP is
27114 not permitted on TYPE1 and TYPE2, NULL otherwise. */
27116 static const char *
27117 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
27118 const_tree type2)
27120 if (VECTOR_TYPE_P (type1)
27121 && VECTOR_TYPE_P (type2)
27122 && !TYPE_INDIVISIBLE_P (type1)
27123 && !TYPE_INDIVISIBLE_P (type2)
27124 && (aarch64_sve::builtin_type_p (type1)
27125 != aarch64_sve::builtin_type_p (type2)))
27126 return N_("cannot combine GNU and SVE vectors in a binary operation");
27128 /* Operation allowed. */
27129 return NULL;
27132 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
27133 compiler that we automatically ignore the top byte of our pointers, which
27134 allows using -fsanitize=hwaddress. */
27135 bool
27136 aarch64_can_tag_addresses ()
27138 return !TARGET_ILP32;
27141 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
27142 section at the end if needed. */
27143 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
27144 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
27145 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
27146 void
27147 aarch64_file_end_indicate_exec_stack ()
27149 file_end_indicate_exec_stack ();
27151 unsigned feature_1_and = 0;
27152 if (aarch_bti_enabled ())
27153 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
27155 if (aarch_ra_sign_scope != AARCH_FUNCTION_NONE)
27156 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
27158 if (feature_1_and)
27160 /* Generate .note.gnu.property section. */
27161 switch_to_section (get_section (".note.gnu.property",
27162 SECTION_NOTYPE, NULL));
27164 /* PT_NOTE header: namesz, descsz, type.
27165 namesz = 4 ("GNU\0")
27166 descsz = 16 (Size of the program property array)
27167 [(12 + padding) * Number of array elements]
27168 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
27169 assemble_align (POINTER_SIZE);
27170 assemble_integer (GEN_INT (4), 4, 32, 1);
27171 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
27172 assemble_integer (GEN_INT (5), 4, 32, 1);
27174 /* PT_NOTE name. */
27175 assemble_string ("GNU", 4);
27177 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
27178 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
27179 datasz = 4
27180 data = feature_1_and. */
27181 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
27182 assemble_integer (GEN_INT (4), 4, 32, 1);
27183 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
27185 /* Pad the size of the note to the required alignment. */
27186 assemble_align (POINTER_SIZE);
27189 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
27190 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
27191 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
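/* Sketch of the resulting note contents (assuming both BTI and PAC-RET are
   enabled on an LP64 target): namesz == 4, descsz == ROUND_UP (12, 8) == 16,
   type == 5 (NT_GNU_PROPERTY_TYPE_0), name == "GNU\0", followed by one
   property with pr_type == GNU_PROPERTY_AARCH64_FEATURE_1_AND,
   pr_datasz == 4 and pr_data == 0x3 (BTI | PAC), padded with four bytes to
   the 8-byte alignment the note format requires.  */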
27193 /* Helper function for straight line speculation.
27194 Return what barrier should be emitted for straight line speculation
27195 mitigation.
27196 When not mitigating against straight line speculation this function returns
27197 an empty string.
27198 When mitigating against straight line speculation, use:
27199 * SB when the v8.5-A SB extension is enabled.
27200 * DSB+ISB otherwise. */
27201 const char *
27202 aarch64_sls_barrier (int mitigation_required)
27204 return mitigation_required
27205 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
27206 : "";
27209 static GTY (()) tree aarch64_sls_shared_thunks[30];
27210 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
27211 const char *indirect_symbol_names[30] = {
27212 "__call_indirect_x0",
27213 "__call_indirect_x1",
27214 "__call_indirect_x2",
27215 "__call_indirect_x3",
27216 "__call_indirect_x4",
27217 "__call_indirect_x5",
27218 "__call_indirect_x6",
27219 "__call_indirect_x7",
27220 "__call_indirect_x8",
27221 "__call_indirect_x9",
27222 "__call_indirect_x10",
27223 "__call_indirect_x11",
27224 "__call_indirect_x12",
27225 "__call_indirect_x13",
27226 "__call_indirect_x14",
27227 "__call_indirect_x15",
27228 "", /* "__call_indirect_x16", */
27229 "", /* "__call_indirect_x17", */
27230 "__call_indirect_x18",
27231 "__call_indirect_x19",
27232 "__call_indirect_x20",
27233 "__call_indirect_x21",
27234 "__call_indirect_x22",
27235 "__call_indirect_x23",
27236 "__call_indirect_x24",
27237 "__call_indirect_x25",
27238 "__call_indirect_x26",
27239 "__call_indirect_x27",
27240 "__call_indirect_x28",
27241 "__call_indirect_x29",
27244 /* Function to create a BLR thunk. This thunk is used to mitigate straight
27245 line speculation. Instead of a simple BLR that can be speculated past,
27246 we emit a BL to this thunk, and this thunk contains a BR to the relevant
27247    register.  These thunks have the relevant speculation barriers put after
27248 their indirect branch so that speculation is blocked.
27250 We use such a thunk so the speculation barriers are kept off the
27251 architecturally executed path in order to reduce the performance overhead.
27253 When optimizing for size we use stubs shared by the linked object.
27254 When optimizing for performance we emit stubs for each function in the hope
27255 that the branch predictor can better train on jumps specific for a given
27256 function. */
27257 static rtx
27258 aarch64_sls_create_blr_label (int regnum)
27260 gcc_assert (STUB_REGNUM_P (regnum));
27261 if (optimize_function_for_size_p (cfun))
27263 /* For the thunks shared between different functions in this compilation
27264 unit we use a named symbol -- this is just for users to more easily
27265 understand the generated assembly. */
27266 aarch64_sls_shared_thunks_needed = true;
27267 const char *thunk_name = indirect_symbol_names[regnum];
27268 if (aarch64_sls_shared_thunks[regnum] == NULL)
27270 /* Build a decl representing this function stub and record it for
27271 later. We build a decl here so we can use the GCC machinery for
27272 handling sections automatically (through `get_named_section` and
27273 `make_decl_one_only`). That saves us a lot of trouble handling
27274 the specifics of different output file formats. */
27275 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
27276 get_identifier (thunk_name),
27277 build_function_type_list (void_type_node,
27278 NULL_TREE));
27279 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
27280 NULL_TREE, void_type_node);
27281 TREE_PUBLIC (decl) = 1;
27282 TREE_STATIC (decl) = 1;
27283 DECL_IGNORED_P (decl) = 1;
27284 DECL_ARTIFICIAL (decl) = 1;
27285 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
27286 resolve_unique_section (decl, 0, false);
27287 aarch64_sls_shared_thunks[regnum] = decl;
27290 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
27293 if (cfun->machine->call_via[regnum] == NULL)
27294 cfun->machine->call_via[regnum]
27295 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
27296 return cfun->machine->call_via[regnum];
27299 /* Helper function for aarch64_sls_emit_blr_function_thunks and
27300 aarch64_sls_emit_shared_blr_thunks below. */
27301 static void
27302 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
27304 /* Save in x16 and branch to that function so this transformation does
27305 not prevent jumping to `BTI c` instructions. */
27306 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
27307 asm_fprintf (out_file, "\tbr\tx16\n");
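/* As an illustration, with -mharden-sls=blr an indirect call through x1 is
   emitted as "bl __call_indirect_x1" (or a BL to a local label when
   optimizing for speed), and the stub produced by this helper is roughly:

       __call_indirect_x1:
               mov     x16, x1
               br      x16
               <speculation barrier: "sb", or "dsb sy" followed by "isb">

   so the barrier sits after the BR, off the architecturally executed
   path.  */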
27310 /* Emit all BLR stubs for this particular function.
27311 Here we emit all the BLR stubs needed for the current function. Since we
27312 emit these stubs in a consecutive block we know there will be no speculation
27313 gadgets between each stub, and hence we only emit a speculation barrier at
27314 the end of the stub sequences.
27316 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
27317 void
27318 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
27320 if (! aarch64_harden_sls_blr_p ())
27321 return;
27323 bool any_functions_emitted = false;
27324 /* We must save and restore the current function section since this assembly
27325 is emitted at the end of the function. This means it can be emitted *just
27326 after* the cold section of a function. That cold part would be emitted in
27327 a different section. That switch would trigger a `.cfi_endproc` directive
27328 to be emitted in the original section and a `.cfi_startproc` directive to
27329 be emitted in the new section. Switching to the original section without
27330      restoring would mean that the `.cfi_endproc` emitted as the function ends
27331 would happen in a different section -- leaving an unmatched
27332 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
27333 in the standard text section. */
27334 section *save_text_section = in_section;
27335 switch_to_section (function_section (current_function_decl));
27336 for (int regnum = 0; regnum < 30; ++regnum)
27338 rtx specu_label = cfun->machine->call_via[regnum];
27339 if (specu_label == NULL)
27340 continue;
27342 targetm.asm_out.print_operand (out_file, specu_label, 0);
27343 asm_fprintf (out_file, ":\n");
27344 aarch64_sls_emit_function_stub (out_file, regnum);
27345 any_functions_emitted = true;
27347 if (any_functions_emitted)
27348     /* We can use the SB barrier here if need be, since this stub will only be used
27349 by the current function, and hence for the current target. */
27350 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
27351 switch_to_section (save_text_section);
27354 /* Emit shared BLR stubs for the current compilation unit.
27355 Over the course of compiling this unit we may have converted some BLR
27356 instructions to a BL to a shared stub function. This is where we emit those
27357 stub functions.
27358 This function is for the stubs shared between different functions in this
27359 compilation unit. We share when optimizing for size instead of speed.
27361 This function is called through the TARGET_ASM_FILE_END hook. */
27362 void
27363 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
27365 if (! aarch64_sls_shared_thunks_needed)
27366 return;
27368 for (int regnum = 0; regnum < 30; ++regnum)
27370 tree decl = aarch64_sls_shared_thunks[regnum];
27371 if (!decl)
27372 continue;
27374 const char *name = indirect_symbol_names[regnum];
27375 switch_to_section (get_named_section (decl, NULL, 0));
27376 ASM_OUTPUT_ALIGN (out_file, 2);
27377 targetm.asm_out.globalize_label (out_file, name);
27378       /* This only emits a visibility directive if the compiler is configured
27379  for an assembler that can handle visibility directives.  */
27380 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
27381 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
27382 ASM_OUTPUT_LABEL (out_file, name);
27383 aarch64_sls_emit_function_stub (out_file, regnum);
27384 /* Use the most conservative target to ensure it can always be used by any
27385 function in the translation unit. */
27386 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
27387 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
27391 /* Implement TARGET_ASM_FILE_END. */
27392 void
27393 aarch64_asm_file_end ()
27395 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
27396 /* Since this function will be called for the ASM_FILE_END hook, we ensure
27397 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
27398 for FreeBSD) still gets called. */
27399 #ifdef TARGET_ASM_FILE_END
27400 TARGET_ASM_FILE_END ();
27401 #endif
27404 const char *
27405 aarch64_indirect_call_asm (rtx addr)
27407 gcc_assert (REG_P (addr));
27408 if (aarch64_harden_sls_blr_p ())
27410 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
27411 output_asm_insn ("bl\t%0", &stub_label);
27413 else
27414 output_asm_insn ("blr\t%0", &addr);
27415 return "";
27418 /* Target-specific selftests. */
27420 #if CHECKING_P
27422 namespace selftest {
27424 /* Selftest for the RTL loader.
27425 Verify that the RTL loader copes with a dump from
27426 print_rtx_function. This is essentially just a test that class
27427 function_reader can handle a real dump, but it also verifies
27428 that lookup_reg_by_dump_name correctly handles hard regs.
27429 The presence of hard reg names in the dump means that the test is
27430 target-specific, hence it is in this file. */
27432 static void
27433 aarch64_test_loading_full_dump ()
27435 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
27437 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
27439 rtx_insn *insn_1 = get_insn_by_uid (1);
27440 ASSERT_EQ (NOTE, GET_CODE (insn_1));
27442 rtx_insn *insn_15 = get_insn_by_uid (15);
27443 ASSERT_EQ (INSN, GET_CODE (insn_15));
27444 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
27446 /* Verify crtl->return_rtx. */
27447 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
27448 ASSERT_EQ (0, REGNO (crtl->return_rtx));
27449 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
27452 /* Test the fractional_cost class. */
27454 static void
27455 aarch64_test_fractional_cost ()
27457 using cf = fractional_cost;
27459 ASSERT_EQ (cf (0, 20), 0);
27461 ASSERT_EQ (cf (4, 2), 2);
27462 ASSERT_EQ (3, cf (9, 3));
27464 ASSERT_NE (cf (5, 2), 2);
27465 ASSERT_NE (3, cf (8, 3));
27467 ASSERT_EQ (cf (7, 11) + cf (15, 11), 2);
27468 ASSERT_EQ (cf (2, 3) + cf (3, 5), cf (19, 15));
27469 ASSERT_EQ (cf (2, 3) + cf (1, 6) + cf (1, 6), 1);
27471 ASSERT_EQ (cf (14, 15) - cf (4, 15), cf (2, 3));
27472 ASSERT_EQ (cf (1, 4) - cf (1, 2), 0);
27473 ASSERT_EQ (cf (3, 5) - cf (1, 10), cf (1, 2));
27474 ASSERT_EQ (cf (11, 3) - 3, cf (2, 3));
27475 ASSERT_EQ (3 - cf (7, 3), cf (2, 3));
27476 ASSERT_EQ (3 - cf (10, 3), 0);
27478 ASSERT_EQ (cf (2, 3) * 5, cf (10, 3));
27479 ASSERT_EQ (14 * cf (11, 21), cf (22, 3));
27481 ASSERT_TRUE (cf (4, 15) < cf (5, 15));
27482 ASSERT_FALSE (cf (5, 15) < cf (5, 15));
27483 ASSERT_FALSE (cf (6, 15) < cf (5, 15));
27484 ASSERT_TRUE (cf (1, 3) < cf (2, 5));
27485 ASSERT_TRUE (cf (1, 12) < cf (1, 6));
27486 ASSERT_FALSE (cf (5, 3) < cf (5, 3));
27487 ASSERT_TRUE (cf (239, 240) < 1);
27488 ASSERT_FALSE (cf (240, 240) < 1);
27489 ASSERT_FALSE (cf (241, 240) < 1);
27490 ASSERT_FALSE (2 < cf (207, 104));
27491 ASSERT_FALSE (2 < cf (208, 104));
27492 ASSERT_TRUE (2 < cf (209, 104));
27494   ASSERT_TRUE (cf (4, 15) <= cf (5, 15));
27495   ASSERT_TRUE (cf (5, 15) <= cf (5, 15));
27496   ASSERT_FALSE (cf (6, 15) <= cf (5, 15));
27497   ASSERT_TRUE (cf (1, 3) <= cf (2, 5));
27498   ASSERT_TRUE (cf (1, 12) <= cf (1, 6));
27499   ASSERT_TRUE (cf (5, 3) <= cf (5, 3));
27500   ASSERT_TRUE (cf (239, 240) <= 1);
27501   ASSERT_TRUE (cf (240, 240) <= 1);
27502   ASSERT_FALSE (cf (241, 240) <= 1);
27503   ASSERT_FALSE (2 <= cf (207, 104));
27504   ASSERT_TRUE (2 <= cf (208, 104));
27505   ASSERT_TRUE (2 <= cf (209, 104));
27507 ASSERT_FALSE (cf (4, 15) >= cf (5, 15));
27508 ASSERT_TRUE (cf (5, 15) >= cf (5, 15));
27509 ASSERT_TRUE (cf (6, 15) >= cf (5, 15));
27510 ASSERT_FALSE (cf (1, 3) >= cf (2, 5));
27511 ASSERT_FALSE (cf (1, 12) >= cf (1, 6));
27512 ASSERT_TRUE (cf (5, 3) >= cf (5, 3));
27513 ASSERT_FALSE (cf (239, 240) >= 1);
27514 ASSERT_TRUE (cf (240, 240) >= 1);
27515 ASSERT_TRUE (cf (241, 240) >= 1);
27516 ASSERT_TRUE (2 >= cf (207, 104));
27517 ASSERT_TRUE (2 >= cf (208, 104));
27518 ASSERT_FALSE (2 >= cf (209, 104));
27520 ASSERT_FALSE (cf (4, 15) > cf (5, 15));
27521 ASSERT_FALSE (cf (5, 15) > cf (5, 15));
27522 ASSERT_TRUE (cf (6, 15) > cf (5, 15));
27523 ASSERT_FALSE (cf (1, 3) > cf (2, 5));
27524 ASSERT_FALSE (cf (1, 12) > cf (1, 6));
27525 ASSERT_FALSE (cf (5, 3) > cf (5, 3));
27526 ASSERT_FALSE (cf (239, 240) > 1);
27527 ASSERT_FALSE (cf (240, 240) > 1);
27528 ASSERT_TRUE (cf (241, 240) > 1);
27529 ASSERT_TRUE (2 > cf (207, 104));
27530 ASSERT_FALSE (2 > cf (208, 104));
27531 ASSERT_FALSE (2 > cf (209, 104));
27533 ASSERT_EQ (cf (1, 2).ceil (), 1);
27534 ASSERT_EQ (cf (11, 7).ceil (), 2);
27535 ASSERT_EQ (cf (20, 1).ceil (), 20);
27536 ASSERT_EQ ((cf (0xfffffffd) + 1).ceil (), 0xfffffffe);
27537 ASSERT_EQ ((cf (0xfffffffd) + 2).ceil (), 0xffffffff);
27538 ASSERT_EQ ((cf (0xfffffffd) + 3).ceil (), 0xffffffff);
27539 ASSERT_EQ ((cf (0x7fffffff) * 2).ceil (), 0xfffffffe);
27540 ASSERT_EQ ((cf (0x80000000) * 2).ceil (), 0xffffffff);
27542 ASSERT_EQ (cf (1, 2).as_double (), 0.5);
27545 /* Run all target-specific selftests. */
27547 static void
27548 aarch64_run_selftests (void)
27550 aarch64_test_loading_full_dump ();
27551 aarch64_test_fractional_cost ();
27554 } // namespace selftest
27556 #endif /* #if CHECKING_P */
27558 #undef TARGET_STACK_PROTECT_GUARD
27559 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
27561 #undef TARGET_ADDRESS_COST
27562 #define TARGET_ADDRESS_COST aarch64_address_cost
27564 /* This hook determines whether unnamed bitfields affect the alignment
27565 of the containing structure. The hook returns true if the structure
27566 should inherit the alignment requirements of an unnamed bitfield's
27567 type. */
27568 #undef TARGET_ALIGN_ANON_BITFIELD
27569 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
27571 #undef TARGET_ASM_ALIGNED_DI_OP
27572 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
27574 #undef TARGET_ASM_ALIGNED_HI_OP
27575 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
27577 #undef TARGET_ASM_ALIGNED_SI_OP
27578 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
27580 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
27581 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
27582 hook_bool_const_tree_hwi_hwi_const_tree_true
27584 #undef TARGET_ASM_FILE_START
27585 #define TARGET_ASM_FILE_START aarch64_start_file
27587 #undef TARGET_ASM_OUTPUT_MI_THUNK
27588 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
27590 #undef TARGET_ASM_SELECT_RTX_SECTION
27591 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
27593 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
27594 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
27596 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
27597 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
27599 #undef TARGET_BUILD_BUILTIN_VA_LIST
27600 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
27602 #undef TARGET_CALLEE_COPIES
27603 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
27605 #undef TARGET_CAN_ELIMINATE
27606 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
27608 #undef TARGET_CAN_INLINE_P
27609 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
27611 #undef TARGET_CANNOT_FORCE_CONST_MEM
27612 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
27614 #undef TARGET_CASE_VALUES_THRESHOLD
27615 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
27617 #undef TARGET_CONDITIONAL_REGISTER_USAGE
27618 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
27620 #undef TARGET_MEMBER_TYPE_FORCES_BLK
27621 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
27623 /* Only the least significant bit is used for initialization guard
27624 variables. */
27625 #undef TARGET_CXX_GUARD_MASK_BIT
27626 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
27628 #undef TARGET_C_MODE_FOR_SUFFIX
27629 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
27631 #ifdef TARGET_BIG_ENDIAN_DEFAULT
27632 #undef TARGET_DEFAULT_TARGET_FLAGS
27633 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
27634 #endif
27636 #undef TARGET_CLASS_MAX_NREGS
27637 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
27639 #undef TARGET_BUILTIN_DECL
27640 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
27642 #undef TARGET_BUILTIN_RECIPROCAL
27643 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
27645 #undef TARGET_C_EXCESS_PRECISION
27646 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
27648 #undef TARGET_EXPAND_BUILTIN
27649 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
27651 #undef TARGET_EXPAND_BUILTIN_VA_START
27652 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
27654 #undef TARGET_FOLD_BUILTIN
27655 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
27657 #undef TARGET_FUNCTION_ARG
27658 #define TARGET_FUNCTION_ARG aarch64_function_arg
27660 #undef TARGET_FUNCTION_ARG_ADVANCE
27661 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
27663 #undef TARGET_FUNCTION_ARG_BOUNDARY
27664 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
27666 #undef TARGET_FUNCTION_ARG_PADDING
27667 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
27669 #undef TARGET_GET_RAW_RESULT_MODE
27670 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
27671 #undef TARGET_GET_RAW_ARG_MODE
27672 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
27674 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
27675 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
27677 #undef TARGET_FUNCTION_VALUE
27678 #define TARGET_FUNCTION_VALUE aarch64_function_value
27680 #undef TARGET_FUNCTION_VALUE_REGNO_P
27681 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
27683 #undef TARGET_GIMPLE_FOLD_BUILTIN
27684 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
27686 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
27687 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
27689 #undef TARGET_INIT_BUILTINS
27690 #define TARGET_INIT_BUILTINS aarch64_init_builtins
27692 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
27693 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
27694 aarch64_ira_change_pseudo_allocno_class
27696 #undef TARGET_LEGITIMATE_ADDRESS_P
27697 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
27699 #undef TARGET_LEGITIMATE_CONSTANT_P
27700 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
27702 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
27703 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
27704 aarch64_legitimize_address_displacement
27706 #undef TARGET_LIBGCC_CMP_RETURN_MODE
27707 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
27709 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
27710 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
27711 aarch64_libgcc_floating_mode_supported_p
27713 #undef TARGET_MANGLE_TYPE
27714 #define TARGET_MANGLE_TYPE aarch64_mangle_type
27716 #undef TARGET_INVALID_BINARY_OP
27717 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
27719 #undef TARGET_VERIFY_TYPE_CONTEXT
27720 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
27722 #undef TARGET_MEMORY_MOVE_COST
27723 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
27725 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
27726 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
27728 #undef TARGET_MUST_PASS_IN_STACK
27729 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
27731 /* This target hook should return true if accesses to volatile bitfields
27732 should use the narrowest mode possible. It should return false if these
27733 accesses should use the bitfield container type. */
27734 #undef TARGET_NARROW_VOLATILE_BITFIELD
27735 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
27737 #undef TARGET_OPTION_OVERRIDE
27738 #define TARGET_OPTION_OVERRIDE aarch64_override_options
27740 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
27741 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
27742 aarch64_override_options_after_change
27744 #undef TARGET_OFFLOAD_OPTIONS
27745 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
27747 #undef TARGET_OPTION_RESTORE
27748 #define TARGET_OPTION_RESTORE aarch64_option_restore
27750 #undef TARGET_OPTION_PRINT
27751 #define TARGET_OPTION_PRINT aarch64_option_print
27753 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
27754 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
27756 #undef TARGET_SET_CURRENT_FUNCTION
27757 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
27759 #undef TARGET_PASS_BY_REFERENCE
27760 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
27762 #undef TARGET_PREFERRED_RELOAD_CLASS
27763 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
27765 #undef TARGET_SCHED_REASSOCIATION_WIDTH
27766 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
27768 #undef TARGET_DWARF_FRAME_REG_MODE
27769 #define TARGET_DWARF_FRAME_REG_MODE aarch64_dwarf_frame_reg_mode
27771 #undef TARGET_PROMOTED_TYPE
27772 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
27774 #undef TARGET_SECONDARY_RELOAD
27775 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
27777 #undef TARGET_SECONDARY_MEMORY_NEEDED
27778 #define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
27780 #undef TARGET_SHIFT_TRUNCATION_MASK
27781 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
27783 #undef TARGET_SETUP_INCOMING_VARARGS
27784 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
27786 #undef TARGET_STRUCT_VALUE_RTX
27787 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
27789 #undef TARGET_REGISTER_MOVE_COST
27790 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
27792 #undef TARGET_RETURN_IN_MEMORY
27793 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
27795 #undef TARGET_RETURN_IN_MSB
27796 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
27798 #undef TARGET_RTX_COSTS
27799 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
27801 #undef TARGET_SCALAR_MODE_SUPPORTED_P
27802 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
27804 #undef TARGET_SCHED_ISSUE_RATE
27805 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
27807 #undef TARGET_SCHED_VARIABLE_ISSUE
27808 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
27810 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
27811 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
27812 aarch64_sched_first_cycle_multipass_dfa_lookahead
27814 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
27815 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
27816 aarch64_first_cycle_multipass_dfa_lookahead_guard
27818 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
27819 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
27820 aarch64_get_separate_components
27822 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
27823 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
27824 aarch64_components_for_bb
27826 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
27827 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
27828 aarch64_disqualify_components
27830 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
27831 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
27832 aarch64_emit_prologue_components
27834 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
27835 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
27836 aarch64_emit_epilogue_components
27838 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
27839 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
27840 aarch64_set_handled_components
27842 #undef TARGET_TRAMPOLINE_INIT
27843 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
27845 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
27846 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
27848 #undef TARGET_VECTOR_MODE_SUPPORTED_P
27849 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
27851 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
27852 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
27854 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
27855 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
27856 aarch64_builtin_support_vector_misalignment
27858 #undef TARGET_ARRAY_MODE
27859 #define TARGET_ARRAY_MODE aarch64_array_mode
27861 #undef TARGET_ARRAY_MODE_SUPPORTED_P
27862 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
27864 #undef TARGET_VECTORIZE_CREATE_COSTS
27865 #define TARGET_VECTORIZE_CREATE_COSTS aarch64_vectorize_create_costs
27867 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
27868 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
27869 aarch64_builtin_vectorization_cost
27871 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
27872 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
27874 #undef TARGET_VECTORIZE_BUILTINS
27875 #define TARGET_VECTORIZE_BUILTINS
27877 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
27878 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
27879 aarch64_autovectorize_vector_modes
27881 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
27882 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
27883 aarch64_atomic_assign_expand_fenv
27885 /* Section anchor support. */
27887 #undef TARGET_MIN_ANCHOR_OFFSET
27888 #define TARGET_MIN_ANCHOR_OFFSET -256
27890 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
27891 byte offset; we can do much more for larger data types, but have no way
27892 to determine the size of the access. We assume accesses are aligned. */
27893 #undef TARGET_MAX_ANCHOR_OFFSET
27894 #define TARGET_MAX_ANCHOR_OFFSET 4095
27896 #undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
27897 #define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
27898 aarch64_vectorize_preferred_div_as_shifts_over_mult
27900 #undef TARGET_VECTOR_ALIGNMENT
27901 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
27903 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
27904 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
27905 aarch64_vectorize_preferred_vector_alignment
27906 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
27907 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
27908 aarch64_simd_vector_alignment_reachable
27910 /* vec_perm support. */
27912 #undef TARGET_VECTORIZE_VEC_PERM_CONST
27913 #define TARGET_VECTORIZE_VEC_PERM_CONST \
27914 aarch64_vectorize_vec_perm_const
27916 #undef TARGET_VECTORIZE_RELATED_MODE
27917 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
27918 #undef TARGET_VECTORIZE_GET_MASK_MODE
27919 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
27920 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
27921 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
27922 aarch64_empty_mask_is_expensive
27923 #undef TARGET_PREFERRED_ELSE_VALUE
27924 #define TARGET_PREFERRED_ELSE_VALUE \
27925 aarch64_preferred_else_value
27927 #undef TARGET_INIT_LIBFUNCS
27928 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
27930 #undef TARGET_FIXED_CONDITION_CODE_REGS
27931 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
27933 #undef TARGET_FLAGS_REGNUM
27934 #define TARGET_FLAGS_REGNUM CC_REGNUM
27936 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
27937 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
27939 #undef TARGET_ASAN_SHADOW_OFFSET
27940 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
27942 #undef TARGET_LEGITIMIZE_ADDRESS
27943 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
27945 #undef TARGET_SCHED_CAN_SPECULATE_INSN
27946 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
27948 #undef TARGET_CAN_USE_DOLOOP_P
27949 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
27951 #undef TARGET_SCHED_ADJUST_PRIORITY
27952 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
27954 #undef TARGET_SCHED_MACRO_FUSION_P
27955 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
27957 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
27958 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
27960 #undef TARGET_SCHED_FUSION_PRIORITY
27961 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
27963 #undef TARGET_UNSPEC_MAY_TRAP_P
27964 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
27966 #undef TARGET_USE_PSEUDO_PIC_REG
27967 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
27969 #undef TARGET_PRINT_OPERAND
27970 #define TARGET_PRINT_OPERAND aarch64_print_operand
27972 #undef TARGET_PRINT_OPERAND_ADDRESS
27973 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
27975 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
27976 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
27978 #undef TARGET_OPTAB_SUPPORTED_P
27979 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
27981 #undef TARGET_OMIT_STRUCT_RETURN_REG
27982 #define TARGET_OMIT_STRUCT_RETURN_REG true
27984 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
27985 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
27986 aarch64_dwarf_poly_indeterminate_value
27988 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
27989 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
27990 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
27992 #undef TARGET_HARD_REGNO_NREGS
27993 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
27994 #undef TARGET_HARD_REGNO_MODE_OK
27995 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
27997 #undef TARGET_MODES_TIEABLE_P
27998 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
28000 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
28001 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
28002 aarch64_hard_regno_call_part_clobbered
28004 #undef TARGET_INSN_CALLEE_ABI
28005 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
28007 #undef TARGET_CONSTANT_ALIGNMENT
28008 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
28010 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
28011 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
28012 aarch64_stack_clash_protection_alloca_probe_range
28014 #undef TARGET_COMPUTE_PRESSURE_CLASSES
28015 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
28017 #undef TARGET_CAN_CHANGE_MODE_CLASS
28018 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
28020 #undef TARGET_SELECT_EARLY_REMAT_MODES
28021 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
28023 #undef TARGET_SPECULATION_SAFE_VALUE
28024 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
28026 #undef TARGET_ESTIMATED_POLY_VALUE
28027 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
28029 #undef TARGET_ATTRIBUTE_TABLE
28030 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
28032 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
28033 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
28034 aarch64_simd_clone_compute_vecsize_and_simdlen
28036 #undef TARGET_SIMD_CLONE_ADJUST
28037 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
28039 #undef TARGET_SIMD_CLONE_USABLE
28040 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
28042 #undef TARGET_COMP_TYPE_ATTRIBUTES
28043 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
28045 #undef TARGET_GET_MULTILIB_ABI_NAME
28046 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
28048 #undef TARGET_FNTYPE_ABI
28049 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
28051 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
28052 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
28054 #if CHECKING_P
28055 #undef TARGET_RUN_TARGET_SELFTESTS
28056 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
28057 #endif /* #if CHECKING_P */
28059 #undef TARGET_ASM_POST_CFI_STARTPROC
28060 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
28062 #undef TARGET_STRICT_ARGUMENT_NAMING
28063 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
28065 #undef TARGET_MD_ASM_ADJUST
28066 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
28068 #undef TARGET_ASM_FILE_END
28069 #define TARGET_ASM_FILE_END aarch64_asm_file_end
28071 #undef TARGET_ASM_FUNCTION_EPILOGUE
28072 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
28074 #undef TARGET_HAVE_SHADOW_CALL_STACK
28075 #define TARGET_HAVE_SHADOW_CALL_STACK true
28077 #undef TARGET_CONST_ANCHOR
28078 #define TARGET_CONST_ANCHOR 0x1000000
28080 struct gcc_target targetm = TARGET_INITIALIZER;
28082 #include "gt-aarch64.h"