AArch64: Cleanup aarch64_classify_symbol
gcc/config/aarch64/aarch64.c (official-gcc.git)
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2021 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
77 #include "gimple-pretty-print.h"
78 #include "tree-ssa-loop-niter.h"
80 /* This file should be included last. */
81 #include "target-def.h"
83 /* Defined for convenience. */
84 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
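/* Editorial example (not part of the upstream file): a worked instance of the
   POINTER_BYTES definition above.  For the default LP64 ABI, POINTER_SIZE is
   64 and BITS_PER_UNIT is 8, so POINTER_BYTES is 64 / 8 = 8; under -mabi=ilp32
   pointers are 32 bits wide and the value becomes 4.  The ILP32 figure is
   background knowledge rather than something derived from this file.  */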
86 /* Information about a legitimate vector immediate operand. */
87 struct simd_immediate_info
89 enum insn_type { MOV, MVN, INDEX, PTRUE };
90 enum modifier_type { LSL, MSL };
92 simd_immediate_info () {}
93 simd_immediate_info (scalar_float_mode, rtx);
94 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
95 insn_type = MOV, modifier_type = LSL,
96 unsigned int = 0);
97 simd_immediate_info (scalar_mode, rtx, rtx);
98 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
100 /* The mode of the elements. */
101 scalar_mode elt_mode;
103 /* The instruction to use to move the immediate into a vector. */
104 insn_type insn;
106 union
108 /* For MOV and MVN. */
109 struct
111 /* The value of each element. */
112 rtx value;
114 /* The kind of shift modifier to use, and the number of bits to shift.
115 This is (LSL, 0) if no shift is needed. */
116 modifier_type modifier;
117 unsigned int shift;
118 } mov;
120 /* For INDEX. */
121 struct
123 /* The value of the first element and the step to be added for each
124 subsequent element. */
125 rtx base, step;
126 } index;
128 /* For PTRUE. */
129 aarch64_svpattern pattern;
130 } u;
133 /* Construct a floating-point immediate in which each element has mode
134 ELT_MODE_IN and value VALUE_IN. */
135 inline simd_immediate_info
136 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
137 : elt_mode (elt_mode_in), insn (MOV)
139 u.mov.value = value_in;
140 u.mov.modifier = LSL;
141 u.mov.shift = 0;
144 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
145 and value VALUE_IN. The other parameters are as for the structure
146 fields. */
147 inline simd_immediate_info
148 ::simd_immediate_info (scalar_int_mode elt_mode_in,
149 unsigned HOST_WIDE_INT value_in,
150 insn_type insn_in, modifier_type modifier_in,
151 unsigned int shift_in)
152 : elt_mode (elt_mode_in), insn (insn_in)
154 u.mov.value = gen_int_mode (value_in, elt_mode_in);
155 u.mov.modifier = modifier_in;
156 u.mov.shift = shift_in;
159 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
160 and where element I is equal to BASE_IN + I * STEP_IN. */
161 inline simd_immediate_info
162 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
163 : elt_mode (elt_mode_in), insn (INDEX)
165 u.index.base = base_in;
166 u.index.step = step_in;
169 /* Construct a predicate that controls elements of mode ELT_MODE_IN
170 and has PTRUE pattern PATTERN_IN. */
171 inline simd_immediate_info
172 ::simd_immediate_info (scalar_int_mode elt_mode_in,
173 aarch64_svpattern pattern_in)
174 : elt_mode (elt_mode_in), insn (PTRUE)
176 u.pattern = pattern_in;
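/* Editorial sketch (not from the upstream sources): illustrative uses of the
   constructors above, showing how each insn_type is encoded.  The concrete
   values are made up for the example.

     // 16-bit elements, value 0xab shifted left by 8, moved with MOV:
     simd_immediate_info (HImode, 0xab, simd_immediate_info::MOV,
                          simd_immediate_info::LSL, 8);

     // An INDEX-style constant 0, 2, 4, ... (base 0, step 2):
     simd_immediate_info (SImode, const0_rtx, GEN_INT (2));

     // An SVE predicate constant described by a PTRUE pattern
     // (AARCH64_SV_ALL being the all-elements pattern):
     simd_immediate_info (QImode, AARCH64_SV_ALL);  */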
179 namespace {
181 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
182 class pure_scalable_type_info
184 public:
185 /* Represents the result of analyzing a type. All values are nonzero,
186 in the possibly forlorn hope that accidental conversions to bool
187 trigger a warning. */
188 enum analysis_result
190 /* The type does not have an ABI identity; i.e. it doesn't contain
191 at least one object whose type is a Fundamental Data Type. */
192 NO_ABI_IDENTITY = 1,
194 /* The type is definitely a Pure Scalable Type. */
195 IS_PST,
197 /* The type is definitely not a Pure Scalable Type. */
198 ISNT_PST,
200 /* It doesn't matter for PCS purposes whether the type is a Pure
201 Scalable Type or not, since the type will be handled the same
202 way regardless.
204 Specifically, this means that if the type is a Pure Scalable Type,
205 there aren't enough argument registers to hold it, and so it will
206 need to be passed or returned in memory. If the type isn't a
207 Pure Scalable Type, it's too big to be passed or returned in core
208 or SIMD&FP registers, and so again will need to go in memory. */
209 DOESNT_MATTER
212 /* Aggregates of 17 bytes or more are normally passed and returned
213 in memory, so aggregates of that size can safely be analyzed as
214 DOESNT_MATTER. We need to be able to collect enough pieces to
215 represent a PST that is smaller than that. Since predicates are
216 2 bytes in size for -msve-vector-bits=128, that means we need to be
217 able to store at least 8 pieces.
219 We also need to be able to store enough pieces to represent
220 a single vector in each vector argument register and a single
221 predicate in each predicate argument register. This means that
222 we need at least 12 pieces. */
223 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
224 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
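/* Editorial note (not part of the upstream file): the arithmetic behind
   MAX_PIECES, assuming the AAPCS64 register counts used elsewhere in this
   port: NUM_FP_ARG_REGS is 8 (vector arguments in z0-z7/v0-v7) and
   NUM_PR_ARG_REGS is 4 (predicate arguments in p0-p3), giving 8 + 4 = 12
   pieces.  The 8-predicate lower bound comes from -msve-vector-bits=128,
   where a predicate occupies 16 / 8 = 2 bytes, so a 16-byte aggregate made
   entirely of predicates splits into 16 / 2 = 8 pieces.  */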
226 /* Describes one piece of a PST. Each piece is one of:
228 - a single Scalable Vector Type (SVT)
229 - a single Scalable Predicate Type (SPT)
230 - a PST containing 2, 3 or 4 SVTs, with no padding
232 It either represents a single built-in type or a PST formed from
233 multiple homogeneous built-in types. */
234 struct piece
236 rtx get_rtx (unsigned int, unsigned int) const;
238 /* The number of vector and predicate registers that the piece
239 occupies. One of the two is always zero. */
240 unsigned int num_zr;
241 unsigned int num_pr;
243 /* The mode of the registers described above. */
244 machine_mode mode;
246 /* If this piece is formed from multiple homogeneous built-in types,
247 this is the mode of the built-in types, otherwise it is MODE. */
248 machine_mode orig_mode;
250 /* The offset in bytes of the piece from the start of the type. */
251 poly_uint64_pod offset;
254 /* Divides types analyzed as IS_PST into individual pieces. The pieces
255 are in memory order. */
256 auto_vec<piece, MAX_PIECES> pieces;
258 unsigned int num_zr () const;
259 unsigned int num_pr () const;
261 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
263 analysis_result analyze (const_tree);
264 bool analyze_registers (const_tree);
266 private:
267 analysis_result analyze_array (const_tree);
268 analysis_result analyze_record (const_tree);
269 void add_piece (const piece &);
273 /* The current code model. */
274 enum aarch64_code_model aarch64_cmodel;
276 /* The number of 64-bit elements in an SVE vector. */
277 poly_uint16 aarch64_sve_vg;
279 #ifdef HAVE_AS_TLS
280 #undef TARGET_HAVE_TLS
281 #define TARGET_HAVE_TLS 1
282 #endif
284 static bool aarch64_composite_type_p (const_tree, machine_mode);
285 static bool aarch64_return_in_memory_1 (const_tree);
286 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
287 const_tree,
288 machine_mode *, int *,
289 bool *, bool);
290 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
291 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
292 static void aarch64_override_options_after_change (void);
293 static bool aarch64_vector_mode_supported_p (machine_mode);
294 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
295 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
296 const_tree type,
297 int misalignment,
298 bool is_packed);
299 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
300 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
301 aarch64_addr_query_type);
302 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
304 /* Major revision number of the ARM Architecture implemented by the target. */
305 unsigned aarch64_architecture_version;
307 /* The processor for which instructions should be scheduled. */
308 enum aarch64_processor aarch64_tune = cortexa53;
310 /* Mask to specify which instruction scheduling options should be used. */
311 uint64_t aarch64_tune_flags = 0;
313 /* Global flag for PC relative loads. */
314 bool aarch64_pcrelative_literal_loads;
316 /* Global flag for whether frame pointer is enabled. */
317 bool aarch64_use_frame_pointer;
319 #define BRANCH_PROTECT_STR_MAX 255
320 char *accepted_branch_protection_string = NULL;
322 static enum aarch64_parse_opt_result
323 aarch64_parse_branch_protection (const char*, char**);
325 /* Support for command line parsing of boolean flags in the tuning
326 structures. */
327 struct aarch64_flag_desc
329 const char* name;
330 unsigned int flag;
333 #define AARCH64_FUSION_PAIR(name, internal_name) \
334 { name, AARCH64_FUSE_##internal_name },
335 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
337 { "none", AARCH64_FUSE_NOTHING },
338 #include "aarch64-fusion-pairs.def"
339 { "all", AARCH64_FUSE_ALL },
340 { NULL, AARCH64_FUSE_NOTHING }
343 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
344 { name, AARCH64_EXTRA_TUNE_##internal_name },
345 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
347 { "none", AARCH64_EXTRA_TUNE_NONE },
348 #include "aarch64-tuning-flags.def"
349 { "all", AARCH64_EXTRA_TUNE_ALL },
350 { NULL, AARCH64_EXTRA_TUNE_NONE }
353 /* Tuning parameters. */
355 static const struct cpu_addrcost_table generic_addrcost_table =
358 1, /* hi */
359 0, /* si */
360 0, /* di */
361 1, /* ti */
363 0, /* pre_modify */
364 0, /* post_modify */
365 0, /* post_modify_ld3_st3 */
366 0, /* post_modify_ld4_st4 */
367 0, /* register_offset */
368 0, /* register_sextend */
369 0, /* register_zextend */
370 0 /* imm_offset */
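/* Editorial sketch (an assumption about how the tables above are consumed,
   not a statement of the exact implementation): the address-cost hook adds
   the matching entry to the cost of an address, so with the generic table a
   simple register-offset address such as [x0, x1] adds register_offset (0),
   a writeback form such as [x0, #16]! adds pre_modify (0), and a 128-bit
   (TImode) access roughly adds the "ti" scaling cost (1).  The tables that
   follow raise individual entries where the corresponding addressing form is
   relatively more expensive on that core.  */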
373 static const struct cpu_addrcost_table exynosm1_addrcost_table =
376 0, /* hi */
377 0, /* si */
378 0, /* di */
379 2, /* ti */
381 0, /* pre_modify */
382 0, /* post_modify */
383 0, /* post_modify_ld3_st3 */
384 0, /* post_modify_ld4_st4 */
385 1, /* register_offset */
386 1, /* register_sextend */
387 2, /* register_zextend */
388 0, /* imm_offset */
391 static const struct cpu_addrcost_table xgene1_addrcost_table =
394 1, /* hi */
395 0, /* si */
396 0, /* di */
397 1, /* ti */
399 1, /* pre_modify */
400 1, /* post_modify */
401 1, /* post_modify_ld3_st3 */
402 1, /* post_modify_ld4_st4 */
403 0, /* register_offset */
404 1, /* register_sextend */
405 1, /* register_zextend */
406 0, /* imm_offset */
409 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
412 1, /* hi */
413 1, /* si */
414 1, /* di */
415 2, /* ti */
417 0, /* pre_modify */
418 0, /* post_modify */
419 0, /* post_modify_ld3_st3 */
420 0, /* post_modify_ld4_st4 */
421 2, /* register_offset */
422 3, /* register_sextend */
423 3, /* register_zextend */
424 0, /* imm_offset */
427 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
430 1, /* hi */
431 1, /* si */
432 1, /* di */
433 2, /* ti */
435 0, /* pre_modify */
436 0, /* post_modify */
437 0, /* post_modify_ld3_st3 */
438 0, /* post_modify_ld4_st4 */
439 2, /* register_offset */
440 3, /* register_sextend */
441 3, /* register_zextend */
442 0, /* imm_offset */
445 static const struct cpu_addrcost_table tsv110_addrcost_table =
448 1, /* hi */
449 0, /* si */
450 0, /* di */
451 1, /* ti */
453 0, /* pre_modify */
454 0, /* post_modify */
455 0, /* post_modify_ld3_st3 */
456 0, /* post_modify_ld4_st4 */
457 0, /* register_offset */
458 1, /* register_sextend */
459 1, /* register_zextend */
460 0, /* imm_offset */
463 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
466 1, /* hi */
467 1, /* si */
468 1, /* di */
469 2, /* ti */
471 1, /* pre_modify */
472 1, /* post_modify */
473 1, /* post_modify_ld3_st3 */
474 1, /* post_modify_ld4_st4 */
475 3, /* register_offset */
476 3, /* register_sextend */
477 3, /* register_zextend */
478 2, /* imm_offset */
481 static const struct cpu_addrcost_table a64fx_addrcost_table =
484 1, /* hi */
485 1, /* si */
486 1, /* di */
487 2, /* ti */
489 0, /* pre_modify */
490 0, /* post_modify */
491 0, /* post_modify_ld3_st3 */
492 0, /* post_modify_ld4_st4 */
493 2, /* register_offset */
494 3, /* register_sextend */
495 3, /* register_zextend */
496 0, /* imm_offset */
499 static const struct cpu_addrcost_table neoversev1_addrcost_table =
502 1, /* hi */
503 0, /* si */
504 0, /* di */
505 1, /* ti */
507 0, /* pre_modify */
508 0, /* post_modify */
509 3, /* post_modify_ld3_st3 */
510 3, /* post_modify_ld4_st4 */
511 0, /* register_offset */
512 0, /* register_sextend */
513 0, /* register_zextend */
514 0 /* imm_offset */
517 static const struct cpu_regmove_cost generic_regmove_cost =
519 1, /* GP2GP */
520 /* Avoid the use of slow int<->fp moves for spilling by setting
521 their cost higher than memmov_cost. */
522 5, /* GP2FP */
523 5, /* FP2GP */
524 2 /* FP2FP */
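/* Editorial note (worked comparison, not upstream text): the comment above
   relies on memmov_cost being smaller than GP2FP/FP2GP.  For the generic
   tuning below, memmov_cost is 4 while GP2FP and FP2GP are 5, so the register
   allocator sees an int<->fp copy (5) as costlier than spilling and reloading
   through memory (4) and prefers the memory path.  */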
527 static const struct cpu_regmove_cost cortexa57_regmove_cost =
529 1, /* GP2GP */
530 /* Avoid the use of slow int<->fp moves for spilling by setting
531 their cost higher than memmov_cost. */
532 5, /* GP2FP */
533 5, /* FP2GP */
534 2 /* FP2FP */
537 static const struct cpu_regmove_cost cortexa53_regmove_cost =
539 1, /* GP2GP */
540 /* Avoid the use of slow int<->fp moves for spilling by setting
541 their cost higher than memmov_cost. */
542 5, /* GP2FP */
543 5, /* FP2GP */
544 2 /* FP2FP */
547 static const struct cpu_regmove_cost exynosm1_regmove_cost =
549 1, /* GP2GP */
550 /* Avoid the use of slow int<->fp moves for spilling by setting
551 their cost higher than memmov_cost (actual, 4 and 9). */
552 9, /* GP2FP */
553 9, /* FP2GP */
554 1 /* FP2FP */
557 static const struct cpu_regmove_cost thunderx_regmove_cost =
559 2, /* GP2GP */
560 2, /* GP2FP */
561 6, /* FP2GP */
562 4 /* FP2FP */
565 static const struct cpu_regmove_cost xgene1_regmove_cost =
567 1, /* GP2GP */
568 /* Avoid the use of slow int<->fp moves for spilling by setting
569 their cost higher than memmov_cost. */
570 8, /* GP2FP */
571 8, /* FP2GP */
572 2 /* FP2FP */
575 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
577 2, /* GP2GP */
578 /* Avoid the use of int<->fp moves for spilling. */
579 6, /* GP2FP */
580 6, /* FP2GP */
581 4 /* FP2FP */
584 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
586 1, /* GP2GP */
587 /* Avoid the use of int<->fp moves for spilling. */
588 5, /* GP2FP */
589 6, /* FP2GP */
590 3, /* FP2FP */
593 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
595 1, /* GP2GP */
596 /* Avoid the use of int<->fp moves for spilling. */
597 4, /* GP2FP */
598 5, /* FP2GP */
599 4 /* FP2FP */
602 static const struct cpu_regmove_cost tsv110_regmove_cost =
604 1, /* GP2GP */
605 /* Avoid the use of slow int<->fp moves for spilling by setting
606 their cost higher than memmov_cost. */
607 2, /* GP2FP */
608 3, /* FP2GP */
609 2 /* FP2FP */
612 static const struct cpu_regmove_cost a64fx_regmove_cost =
614 1, /* GP2GP */
615 /* Avoid the use of slow int<->fp moves for spilling by setting
616 their cost higher than memmov_cost. */
617 5, /* GP2FP */
618 7, /* FP2GP */
619 2 /* FP2FP */
622 /* Generic costs for Advanced SIMD vector operations. */
623 static const advsimd_vec_cost generic_advsimd_vector_cost =
625 1, /* int_stmt_cost */
626 1, /* fp_stmt_cost */
627 0, /* ld2_st2_permute_cost */
628 0, /* ld3_st3_permute_cost */
629 0, /* ld4_st4_permute_cost */
630 2, /* permute_cost */
631 2, /* reduc_i8_cost */
632 2, /* reduc_i16_cost */
633 2, /* reduc_i32_cost */
634 2, /* reduc_i64_cost */
635 2, /* reduc_f16_cost */
636 2, /* reduc_f32_cost */
637 2, /* reduc_f64_cost */
638 2, /* store_elt_extra_cost */
639 2, /* vec_to_scalar_cost */
640 1, /* scalar_to_vec_cost */
641 1, /* align_load_cost */
642 1, /* unalign_load_cost */
643 1, /* unalign_store_cost */
644 1 /* store_cost */
647 /* Generic costs for SVE vector operations. */
648 static const sve_vec_cost generic_sve_vector_cost =
651 1, /* int_stmt_cost */
652 1, /* fp_stmt_cost */
653 0, /* ld2_st2_permute_cost */
654 0, /* ld3_st3_permute_cost */
655 0, /* ld4_st4_permute_cost */
656 2, /* permute_cost */
657 2, /* reduc_i8_cost */
658 2, /* reduc_i16_cost */
659 2, /* reduc_i32_cost */
660 2, /* reduc_i64_cost */
661 2, /* reduc_f16_cost */
662 2, /* reduc_f32_cost */
663 2, /* reduc_f64_cost */
664 2, /* store_elt_extra_cost */
665 2, /* vec_to_scalar_cost */
666 1, /* scalar_to_vec_cost */
667 1, /* align_load_cost */
668 1, /* unalign_load_cost */
669 1, /* unalign_store_cost */
670 1 /* store_cost */
672 2, /* clast_cost */
673 2, /* fadda_f16_cost */
674 2, /* fadda_f32_cost */
675 2, /* fadda_f64_cost */
676 1 /* scatter_store_elt_cost */
679 /* Generic costs for vector insn classes. */
680 static const struct cpu_vector_cost generic_vector_cost =
682 1, /* scalar_int_stmt_cost */
683 1, /* scalar_fp_stmt_cost */
684 1, /* scalar_load_cost */
685 1, /* scalar_store_cost */
686 3, /* cond_taken_branch_cost */
687 1, /* cond_not_taken_branch_cost */
688 &generic_advsimd_vector_cost, /* advsimd */
689 &generic_sve_vector_cost, /* sve */
690 nullptr /* issue_info */
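/* Editorial sketch (illustrative arithmetic only; the real vectorizer cost
   model accounts for many more factors): with the generic costs above,
   replacing four scalar integer adds by one Advanced SIMD add compares a
   scalar cost of 4 * scalar_int_stmt_cost = 4 against a vector cost of
   1 * int_stmt_cost = 1, so the vector form wins unless the surrounding
   loads, stores and lane transfers (vec_to_scalar_cost et al.) eat up the
   difference.  */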
693 static const advsimd_vec_cost a64fx_advsimd_vector_cost =
695 2, /* int_stmt_cost */
696 5, /* fp_stmt_cost */
697 0, /* ld2_st2_permute_cost */
698 0, /* ld3_st3_permute_cost */
699 0, /* ld4_st4_permute_cost */
700 3, /* permute_cost */
701 13, /* reduc_i8_cost */
702 13, /* reduc_i16_cost */
703 13, /* reduc_i32_cost */
704 13, /* reduc_i64_cost */
705 13, /* reduc_f16_cost */
706 13, /* reduc_f32_cost */
707 13, /* reduc_f64_cost */
708 13, /* store_elt_extra_cost */
709 13, /* vec_to_scalar_cost */
710 4, /* scalar_to_vec_cost */
711 6, /* align_load_cost */
712 6, /* unalign_load_cost */
713 1, /* unalign_store_cost */
714 1 /* store_cost */
717 static const sve_vec_cost a64fx_sve_vector_cost =
720 2, /* int_stmt_cost */
721 5, /* fp_stmt_cost */
722 0, /* ld2_st2_permute_cost */
723 0, /* ld3_st3_permute_cost */
724 0, /* ld4_st4_permute_cost */
725 3, /* permute_cost */
726 13, /* reduc_i8_cost */
727 13, /* reduc_i16_cost */
728 13, /* reduc_i32_cost */
729 13, /* reduc_i64_cost */
730 13, /* reduc_f16_cost */
731 13, /* reduc_f32_cost */
732 13, /* reduc_f64_cost */
733 13, /* store_elt_extra_cost */
734 13, /* vec_to_scalar_cost */
735 4, /* scalar_to_vec_cost */
736 6, /* align_load_cost */
737 6, /* unalign_load_cost */
738 1, /* unalign_store_cost */
739 1 /* store_cost */
741 13, /* clast_cost */
742 13, /* fadda_f16_cost */
743 13, /* fadda_f32_cost */
744 13, /* fadda_f64_cost */
745 1 /* scatter_store_elt_cost */
748 static const struct cpu_vector_cost a64fx_vector_cost =
750 1, /* scalar_int_stmt_cost */
751 5, /* scalar_fp_stmt_cost */
752 4, /* scalar_load_cost */
753 1, /* scalar_store_cost */
754 3, /* cond_taken_branch_cost */
755 1, /* cond_not_taken_branch_cost */
756 &a64fx_advsimd_vector_cost, /* advsimd */
757 &a64fx_sve_vector_cost, /* sve */
758 nullptr /* issue_info */
761 static const advsimd_vec_cost qdf24xx_advsimd_vector_cost =
763 1, /* int_stmt_cost */
764 3, /* fp_stmt_cost */
765 0, /* ld2_st2_permute_cost */
766 0, /* ld3_st3_permute_cost */
767 0, /* ld4_st4_permute_cost */
768 2, /* permute_cost */
769 1, /* reduc_i8_cost */
770 1, /* reduc_i16_cost */
771 1, /* reduc_i32_cost */
772 1, /* reduc_i64_cost */
773 1, /* reduc_f16_cost */
774 1, /* reduc_f32_cost */
775 1, /* reduc_f64_cost */
776 1, /* store_elt_extra_cost */
777 1, /* vec_to_scalar_cost */
778 1, /* scalar_to_vec_cost */
779 1, /* align_load_cost */
780 1, /* unalign_load_cost */
781 1, /* unalign_store_cost */
782 1 /* store_cost */
785 /* QDF24XX costs for vector insn classes. */
786 static const struct cpu_vector_cost qdf24xx_vector_cost =
788 1, /* scalar_int_stmt_cost */
789 1, /* scalar_fp_stmt_cost */
790 1, /* scalar_load_cost */
791 1, /* scalar_store_cost */
792 3, /* cond_taken_branch_cost */
793 1, /* cond_not_taken_branch_cost */
794 &qdf24xx_advsimd_vector_cost, /* advsimd */
795 nullptr, /* sve */
796 nullptr /* issue_info */
800 static const advsimd_vec_cost thunderx_advsimd_vector_cost =
802 4, /* int_stmt_cost */
803 1, /* fp_stmt_cost */
804 0, /* ld2_st2_permute_cost */
805 0, /* ld3_st3_permute_cost */
806 0, /* ld4_st4_permute_cost */
807 4, /* permute_cost */
808 2, /* reduc_i8_cost */
809 2, /* reduc_i16_cost */
810 2, /* reduc_i32_cost */
811 2, /* reduc_i64_cost */
812 2, /* reduc_f16_cost */
813 2, /* reduc_f32_cost */
814 2, /* reduc_f64_cost */
815 2, /* store_elt_extra_cost */
816 2, /* vec_to_scalar_cost */
817 2, /* scalar_to_vec_cost */
818 3, /* align_load_cost */
819 5, /* unalign_load_cost */
820 5, /* unalign_store_cost */
821 1 /* store_cost */
824 /* ThunderX costs for vector insn classes. */
825 static const struct cpu_vector_cost thunderx_vector_cost =
827 1, /* scalar_int_stmt_cost */
828 1, /* scalar_fp_stmt_cost */
829 3, /* scalar_load_cost */
830 1, /* scalar_store_cost */
831 3, /* cond_taken_branch_cost */
832 3, /* cond_not_taken_branch_cost */
833 &thunderx_advsimd_vector_cost, /* advsimd */
834 nullptr, /* sve */
835 nullptr /* issue_info */
838 static const advsimd_vec_cost tsv110_advsimd_vector_cost =
840 2, /* int_stmt_cost */
841 2, /* fp_stmt_cost */
842 0, /* ld2_st2_permute_cost */
843 0, /* ld3_st3_permute_cost */
844 0, /* ld4_st4_permute_cost */
845 2, /* permute_cost */
846 3, /* reduc_i8_cost */
847 3, /* reduc_i16_cost */
848 3, /* reduc_i32_cost */
849 3, /* reduc_i64_cost */
850 3, /* reduc_f16_cost */
851 3, /* reduc_f32_cost */
852 3, /* reduc_f64_cost */
853 3, /* store_elt_extra_cost */
854 3, /* vec_to_scalar_cost */
855 2, /* scalar_to_vec_cost */
856 5, /* align_load_cost */
857 5, /* unalign_load_cost */
858 1, /* unalign_store_cost */
859 1 /* store_cost */
862 static const struct cpu_vector_cost tsv110_vector_cost =
864 1, /* scalar_int_stmt_cost */
865 1, /* scalar_fp_stmt_cost */
866 5, /* scalar_load_cost */
867 1, /* scalar_store_cost */
868 1, /* cond_taken_branch_cost */
869 1, /* cond_not_taken_branch_cost */
870 &tsv110_advsimd_vector_cost, /* advsimd */
871 nullptr, /* sve */
872 nullptr /* issue_info */
875 static const advsimd_vec_cost cortexa57_advsimd_vector_cost =
877 2, /* int_stmt_cost */
878 2, /* fp_stmt_cost */
879 0, /* ld2_st2_permute_cost */
880 0, /* ld3_st3_permute_cost */
881 0, /* ld4_st4_permute_cost */
882 3, /* permute_cost */
883 8, /* reduc_i8_cost */
884 8, /* reduc_i16_cost */
885 8, /* reduc_i32_cost */
886 8, /* reduc_i64_cost */
887 8, /* reduc_f16_cost */
888 8, /* reduc_f32_cost */
889 8, /* reduc_f64_cost */
890 8, /* store_elt_extra_cost */
891 8, /* vec_to_scalar_cost */
892 8, /* scalar_to_vec_cost */
893 4, /* align_load_cost */
894 4, /* unalign_load_cost */
895 1, /* unalign_store_cost */
896 1 /* store_cost */
899 /* Cortex-A57 costs for vector insn classes. */
900 static const struct cpu_vector_cost cortexa57_vector_cost =
902 1, /* scalar_int_stmt_cost */
903 1, /* scalar_fp_stmt_cost */
904 4, /* scalar_load_cost */
905 1, /* scalar_store_cost */
906 1, /* cond_taken_branch_cost */
907 1, /* cond_not_taken_branch_cost */
908 &cortexa57_advsimd_vector_cost, /* advsimd */
909 nullptr, /* sve */
910 nullptr /* issue_info */
913 static const advsimd_vec_cost exynosm1_advsimd_vector_cost =
915 3, /* int_stmt_cost */
916 3, /* fp_stmt_cost */
917 0, /* ld2_st2_permute_cost */
918 0, /* ld3_st3_permute_cost */
919 0, /* ld4_st4_permute_cost */
920 3, /* permute_cost */
921 3, /* reduc_i8_cost */
922 3, /* reduc_i16_cost */
923 3, /* reduc_i32_cost */
924 3, /* reduc_i64_cost */
925 3, /* reduc_f16_cost */
926 3, /* reduc_f32_cost */
927 3, /* reduc_f64_cost */
928 3, /* store_elt_extra_cost */
929 3, /* vec_to_scalar_cost */
930 3, /* scalar_to_vec_cost */
931 5, /* align_load_cost */
932 5, /* unalign_load_cost */
933 1, /* unalign_store_cost */
934 1 /* store_cost */
937 static const struct cpu_vector_cost exynosm1_vector_cost =
939 1, /* scalar_int_stmt_cost */
940 1, /* scalar_fp_stmt_cost */
941 5, /* scalar_load_cost */
942 1, /* scalar_store_cost */
943 1, /* cond_taken_branch_cost */
944 1, /* cond_not_taken_branch_cost */
945 &exynosm1_advsimd_vector_cost, /* advsimd */
946 nullptr, /* sve */
947 nullptr /* issue_info */
950 static const advsimd_vec_cost xgene1_advsimd_vector_cost =
952 2, /* int_stmt_cost */
953 2, /* fp_stmt_cost */
954 0, /* ld2_st2_permute_cost */
955 0, /* ld3_st3_permute_cost */
956 0, /* ld4_st4_permute_cost */
957 2, /* permute_cost */
958 4, /* reduc_i8_cost */
959 4, /* reduc_i16_cost */
960 4, /* reduc_i32_cost */
961 4, /* reduc_i64_cost */
962 4, /* reduc_f16_cost */
963 4, /* reduc_f32_cost */
964 4, /* reduc_f64_cost */
965 4, /* store_elt_extra_cost */
966 4, /* vec_to_scalar_cost */
967 4, /* scalar_to_vec_cost */
968 10, /* align_load_cost */
969 10, /* unalign_load_cost */
970 2, /* unalign_store_cost */
971 2 /* store_cost */
974 /* X-Gene 1 costs for vector insn classes. */
975 static const struct cpu_vector_cost xgene1_vector_cost =
977 1, /* scalar_int_stmt_cost */
978 1, /* scalar_fp_stmt_cost */
979 5, /* scalar_load_cost */
980 1, /* scalar_store_cost */
981 2, /* cond_taken_branch_cost */
982 1, /* cond_not_taken_branch_cost */
983 &xgene1_advsimd_vector_cost, /* advsimd */
984 nullptr, /* sve */
985 nullptr /* issue_info */
988 static const advsimd_vec_cost thunderx2t99_advsimd_vector_cost =
990 4, /* int_stmt_cost */
991 5, /* fp_stmt_cost */
992 0, /* ld2_st2_permute_cost */
993 0, /* ld3_st3_permute_cost */
994 0, /* ld4_st4_permute_cost */
995 10, /* permute_cost */
996 6, /* reduc_i8_cost */
997 6, /* reduc_i16_cost */
998 6, /* reduc_i32_cost */
999 6, /* reduc_i64_cost */
1000 6, /* reduc_f16_cost */
1001 6, /* reduc_f32_cost */
1002 6, /* reduc_f64_cost */
1003 6, /* store_elt_extra_cost */
1004 6, /* vec_to_scalar_cost */
1005 5, /* scalar_to_vec_cost */
1006 4, /* align_load_cost */
1007 4, /* unalign_load_cost */
1008 1, /* unalign_store_cost */
1009 1 /* store_cost */
1012 /* Costs for vector insn classes for Vulcan (ThunderX2 T99). */
1013 static const struct cpu_vector_cost thunderx2t99_vector_cost =
1015 1, /* scalar_int_stmt_cost */
1016 6, /* scalar_fp_stmt_cost */
1017 4, /* scalar_load_cost */
1018 1, /* scalar_store_cost */
1019 2, /* cond_taken_branch_cost */
1020 1, /* cond_not_taken_branch_cost */
1021 &thunderx2t99_advsimd_vector_cost, /* advsimd */
1022 nullptr, /* sve */
1023 nullptr /* issue_info */
1026 static const advsimd_vec_cost thunderx3t110_advsimd_vector_cost =
1028 5, /* int_stmt_cost */
1029 5, /* fp_stmt_cost */
1030 0, /* ld2_st2_permute_cost */
1031 0, /* ld3_st3_permute_cost */
1032 0, /* ld4_st4_permute_cost */
1033 10, /* permute_cost */
1034 5, /* reduc_i8_cost */
1035 5, /* reduc_i16_cost */
1036 5, /* reduc_i32_cost */
1037 5, /* reduc_i64_cost */
1038 5, /* reduc_f16_cost */
1039 5, /* reduc_f32_cost */
1040 5, /* reduc_f64_cost */
1041 5, /* store_elt_extra_cost */
1042 5, /* vec_to_scalar_cost */
1043 5, /* scalar_to_vec_cost */
1044 4, /* align_load_cost */
1045 4, /* unalign_load_cost */
1046 4, /* unalign_store_cost */
1047 4 /* store_cost */
1050 static const struct cpu_vector_cost thunderx3t110_vector_cost =
1052 1, /* scalar_int_stmt_cost */
1053 5, /* scalar_fp_stmt_cost */
1054 4, /* scalar_load_cost */
1055 1, /* scalar_store_cost */
1056 2, /* cond_taken_branch_cost */
1057 1, /* cond_not_taken_branch_cost */
1058 &thunderx3t110_advsimd_vector_cost, /* advsimd */
1059 nullptr, /* sve */
1060 nullptr /* issue_info */
1064 /* Generic costs for branch instructions. */
1065 static const struct cpu_branch_cost generic_branch_cost =
1067 1, /* Predictable. */
1068 3 /* Unpredictable. */
1071 /* Generic approximation modes. */
1072 static const cpu_approx_modes generic_approx_modes =
1074 AARCH64_APPROX_NONE, /* division */
1075 AARCH64_APPROX_NONE, /* sqrt */
1076 AARCH64_APPROX_NONE /* recip_sqrt */
1079 /* Approximation modes for Exynos M1. */
1080 static const cpu_approx_modes exynosm1_approx_modes =
1082 AARCH64_APPROX_NONE, /* division */
1083 AARCH64_APPROX_ALL, /* sqrt */
1084 AARCH64_APPROX_ALL /* recip_sqrt */
1087 /* Approximation modes for X-Gene 1. */
1088 static const cpu_approx_modes xgene1_approx_modes =
1090 AARCH64_APPROX_NONE, /* division */
1091 AARCH64_APPROX_NONE, /* sqrt */
1092 AARCH64_APPROX_ALL /* recip_sqrt */
1095 /* Generic prefetch settings (which disable prefetch). */
1096 static const cpu_prefetch_tune generic_prefetch_tune =
1098 0, /* num_slots */
1099 -1, /* l1_cache_size */
1100 -1, /* l1_cache_line_size */
1101 -1, /* l2_cache_size */
1102 true, /* prefetch_dynamic_strides */
1103 -1, /* minimum_stride */
1104 -1 /* default_opt_level */
1107 static const cpu_prefetch_tune exynosm1_prefetch_tune =
1109 0, /* num_slots */
1110 -1, /* l1_cache_size */
1111 64, /* l1_cache_line_size */
1112 -1, /* l2_cache_size */
1113 true, /* prefetch_dynamic_strides */
1114 -1, /* minimum_stride */
1115 -1 /* default_opt_level */
1118 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
1120 4, /* num_slots */
1121 32, /* l1_cache_size */
1122 64, /* l1_cache_line_size */
1123 512, /* l2_cache_size */
1124 false, /* prefetch_dynamic_strides */
1125 2048, /* minimum_stride */
1126 3 /* default_opt_level */
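/* Editorial note (an assumption about how these records are used, based on
   the field names rather than on code shown here): values of -1 mean "no
   core-specific value, keep the --param default", while non-negative values
   seed the corresponding prefetch parameters.  For the qdf24xx record above
   this would roughly correspond to
     --param simultaneous-prefetches=4 --param l1-cache-size=32
     --param l1-cache-line-size=64 --param l2-cache-size=512
   with software prefetching kicking in by default from -O3
   (default_opt_level).  */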
1129 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
1131 8, /* num_slots */
1132 32, /* l1_cache_size */
1133 128, /* l1_cache_line_size */
1134 16*1024, /* l2_cache_size */
1135 true, /* prefetch_dynamic_strides */
1136 -1, /* minimum_stride */
1137 3 /* default_opt_level */
1140 static const cpu_prefetch_tune thunderx_prefetch_tune =
1142 8, /* num_slots */
1143 32, /* l1_cache_size */
1144 128, /* l1_cache_line_size */
1145 -1, /* l2_cache_size */
1146 true, /* prefetch_dynamic_strides */
1147 -1, /* minimum_stride */
1148 -1 /* default_opt_level */
1151 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
1153 8, /* num_slots */
1154 32, /* l1_cache_size */
1155 64, /* l1_cache_line_size */
1156 256, /* l2_cache_size */
1157 true, /* prefetch_dynamic_strides */
1158 -1, /* minimum_stride */
1159 -1 /* default_opt_level */
1162 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
1164 8, /* num_slots */
1165 32, /* l1_cache_size */
1166 64, /* l1_cache_line_size */
1167 256, /* l2_cache_size */
1168 true, /* prefetch_dynamic_strides */
1169 -1, /* minimum_stride */
1170 -1 /* default_opt_level */
1173 static const cpu_prefetch_tune tsv110_prefetch_tune =
1175 0, /* num_slots */
1176 64, /* l1_cache_size */
1177 64, /* l1_cache_line_size */
1178 512, /* l2_cache_size */
1179 true, /* prefetch_dynamic_strides */
1180 -1, /* minimum_stride */
1181 -1 /* default_opt_level */
1184 static const cpu_prefetch_tune xgene1_prefetch_tune =
1186 8, /* num_slots */
1187 32, /* l1_cache_size */
1188 64, /* l1_cache_line_size */
1189 256, /* l2_cache_size */
1190 true, /* prefetch_dynamic_strides */
1191 -1, /* minimum_stride */
1192 -1 /* default_opt_level */
1195 static const cpu_prefetch_tune a64fx_prefetch_tune =
1197 8, /* num_slots */
1198 64, /* l1_cache_size */
1199 256, /* l1_cache_line_size */
1200 32768, /* l2_cache_size */
1201 true, /* prefetch_dynamic_strides */
1202 -1, /* minimum_stride */
1203 -1 /* default_opt_level */
1206 static const struct tune_params generic_tunings =
1208 &cortexa57_extra_costs,
1209 &generic_addrcost_table,
1210 &generic_regmove_cost,
1211 &generic_vector_cost,
1212 &generic_branch_cost,
1213 &generic_approx_modes,
1214 SVE_NOT_IMPLEMENTED, /* sve_width */
1215 4, /* memmov_cost */
1216 2, /* issue_rate */
1217 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1218 "16:12", /* function_align. */
1219 "4", /* jump_align. */
1220 "8", /* loop_align. */
1221 2, /* int_reassoc_width. */
1222 4, /* fp_reassoc_width. */
1223 1, /* vec_reassoc_width. */
1224 2, /* min_div_recip_mul_sf. */
1225 2, /* min_div_recip_mul_df. */
1226 0, /* max_case_values. */
1227 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1228 /* Enabling AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS significantly benefits
1229 Neoverse V1. It does not have a noticeable effect on A64FX and should
1230 have at most a very minor effect on SVE2 cores. */
1231 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS), /* tune_flags. */
1232 &generic_prefetch_tune
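/* Editorial note (not upstream text): the alignment strings above use the
   same syntax as -falign-functions and friends, so "16:12" in generic_tunings
   requests 16-byte function alignment with a maximum skip of 12 bytes of
   padding, while a plain "4" for jump_align is simply 4-byte alignment with
   no skip limit.  */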
1235 static const struct tune_params cortexa35_tunings =
1237 &cortexa53_extra_costs,
1238 &generic_addrcost_table,
1239 &cortexa53_regmove_cost,
1240 &generic_vector_cost,
1241 &generic_branch_cost,
1242 &generic_approx_modes,
1243 SVE_NOT_IMPLEMENTED, /* sve_width */
1244 4, /* memmov_cost */
1245 1, /* issue_rate */
1246 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1247 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1248 "16", /* function_align. */
1249 "4", /* jump_align. */
1250 "8", /* loop_align. */
1251 2, /* int_reassoc_width. */
1252 4, /* fp_reassoc_width. */
1253 1, /* vec_reassoc_width. */
1254 2, /* min_div_recip_mul_sf. */
1255 2, /* min_div_recip_mul_df. */
1256 0, /* max_case_values. */
1257 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1258 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1259 &generic_prefetch_tune
1262 static const struct tune_params cortexa53_tunings =
1264 &cortexa53_extra_costs,
1265 &generic_addrcost_table,
1266 &cortexa53_regmove_cost,
1267 &generic_vector_cost,
1268 &generic_branch_cost,
1269 &generic_approx_modes,
1270 SVE_NOT_IMPLEMENTED, /* sve_width */
1271 4, /* memmov_cost */
1272 2, /* issue_rate */
1273 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1274 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1275 "16", /* function_align. */
1276 "4", /* jump_align. */
1277 "8", /* loop_align. */
1278 2, /* int_reassoc_width. */
1279 4, /* fp_reassoc_width. */
1280 1, /* vec_reassoc_width. */
1281 2, /* min_div_recip_mul_sf. */
1282 2, /* min_div_recip_mul_df. */
1283 0, /* max_case_values. */
1284 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1285 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1286 &generic_prefetch_tune
1289 static const struct tune_params cortexa57_tunings =
1291 &cortexa57_extra_costs,
1292 &generic_addrcost_table,
1293 &cortexa57_regmove_cost,
1294 &cortexa57_vector_cost,
1295 &generic_branch_cost,
1296 &generic_approx_modes,
1297 SVE_NOT_IMPLEMENTED, /* sve_width */
1298 4, /* memmov_cost */
1299 3, /* issue_rate */
1300 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1301 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1302 "16", /* function_align. */
1303 "4", /* jump_align. */
1304 "8", /* loop_align. */
1305 2, /* int_reassoc_width. */
1306 4, /* fp_reassoc_width. */
1307 1, /* vec_reassoc_width. */
1308 2, /* min_div_recip_mul_sf. */
1309 2, /* min_div_recip_mul_df. */
1310 0, /* max_case_values. */
1311 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1312 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
1313 &generic_prefetch_tune
1316 static const struct tune_params cortexa72_tunings =
1318 &cortexa57_extra_costs,
1319 &generic_addrcost_table,
1320 &cortexa57_regmove_cost,
1321 &cortexa57_vector_cost,
1322 &generic_branch_cost,
1323 &generic_approx_modes,
1324 SVE_NOT_IMPLEMENTED, /* sve_width */
1325 4, /* memmov_cost */
1326 3, /* issue_rate */
1327 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1328 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1329 "16", /* function_align. */
1330 "4", /* jump_align. */
1331 "8", /* loop_align. */
1332 2, /* int_reassoc_width. */
1333 4, /* fp_reassoc_width. */
1334 1, /* vec_reassoc_width. */
1335 2, /* min_div_recip_mul_sf. */
1336 2, /* min_div_recip_mul_df. */
1337 0, /* max_case_values. */
1338 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1339 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1340 &generic_prefetch_tune
1343 static const struct tune_params cortexa73_tunings =
1345 &cortexa57_extra_costs,
1346 &generic_addrcost_table,
1347 &cortexa57_regmove_cost,
1348 &cortexa57_vector_cost,
1349 &generic_branch_cost,
1350 &generic_approx_modes,
1351 SVE_NOT_IMPLEMENTED, /* sve_width */
1352 4, /* memmov_cost. */
1353 2, /* issue_rate. */
1354 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1355 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1356 "16", /* function_align. */
1357 "4", /* jump_align. */
1358 "8", /* loop_align. */
1359 2, /* int_reassoc_width. */
1360 4, /* fp_reassoc_width. */
1361 1, /* vec_reassoc_width. */
1362 2, /* min_div_recip_mul_sf. */
1363 2, /* min_div_recip_mul_df. */
1364 0, /* max_case_values. */
1365 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1366 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1367 &generic_prefetch_tune
1372 static const struct tune_params exynosm1_tunings =
1374 &exynosm1_extra_costs,
1375 &exynosm1_addrcost_table,
1376 &exynosm1_regmove_cost,
1377 &exynosm1_vector_cost,
1378 &generic_branch_cost,
1379 &exynosm1_approx_modes,
1380 SVE_NOT_IMPLEMENTED, /* sve_width */
1381 4, /* memmov_cost */
1382 3, /* issue_rate */
1383 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1384 "4", /* function_align. */
1385 "4", /* jump_align. */
1386 "4", /* loop_align. */
1387 2, /* int_reassoc_width. */
1388 4, /* fp_reassoc_width. */
1389 1, /* vec_reassoc_width. */
1390 2, /* min_div_recip_mul_sf. */
1391 2, /* min_div_recip_mul_df. */
1392 48, /* max_case_values. */
1393 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1394 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1395 &exynosm1_prefetch_tune
1398 static const struct tune_params thunderxt88_tunings =
1400 &thunderx_extra_costs,
1401 &generic_addrcost_table,
1402 &thunderx_regmove_cost,
1403 &thunderx_vector_cost,
1404 &generic_branch_cost,
1405 &generic_approx_modes,
1406 SVE_NOT_IMPLEMENTED, /* sve_width */
1407 6, /* memmov_cost */
1408 2, /* issue_rate */
1409 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1410 "8", /* function_align. */
1411 "8", /* jump_align. */
1412 "8", /* loop_align. */
1413 2, /* int_reassoc_width. */
1414 4, /* fp_reassoc_width. */
1415 1, /* vec_reassoc_width. */
1416 2, /* min_div_recip_mul_sf. */
1417 2, /* min_div_recip_mul_df. */
1418 0, /* max_case_values. */
1419 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1420 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1421 &thunderxt88_prefetch_tune
1424 static const struct tune_params thunderx_tunings =
1426 &thunderx_extra_costs,
1427 &generic_addrcost_table,
1428 &thunderx_regmove_cost,
1429 &thunderx_vector_cost,
1430 &generic_branch_cost,
1431 &generic_approx_modes,
1432 SVE_NOT_IMPLEMENTED, /* sve_width */
1433 6, /* memmov_cost */
1434 2, /* issue_rate */
1435 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1436 "8", /* function_align. */
1437 "8", /* jump_align. */
1438 "8", /* loop_align. */
1439 2, /* int_reassoc_width. */
1440 4, /* fp_reassoc_width. */
1441 1, /* vec_reassoc_width. */
1442 2, /* min_div_recip_mul_sf. */
1443 2, /* min_div_recip_mul_df. */
1444 0, /* max_case_values. */
1445 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1446 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1447 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1448 &thunderx_prefetch_tune
1451 static const struct tune_params tsv110_tunings =
1453 &tsv110_extra_costs,
1454 &tsv110_addrcost_table,
1455 &tsv110_regmove_cost,
1456 &tsv110_vector_cost,
1457 &generic_branch_cost,
1458 &generic_approx_modes,
1459 SVE_NOT_IMPLEMENTED, /* sve_width */
1460 4, /* memmov_cost */
1461 4, /* issue_rate */
1462 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1463 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1464 "16", /* function_align. */
1465 "4", /* jump_align. */
1466 "8", /* loop_align. */
1467 2, /* int_reassoc_width. */
1468 4, /* fp_reassoc_width. */
1469 1, /* vec_reassoc_width. */
1470 2, /* min_div_recip_mul_sf. */
1471 2, /* min_div_recip_mul_df. */
1472 0, /* max_case_values. */
1473 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1474 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1475 &tsv110_prefetch_tune
1478 static const struct tune_params xgene1_tunings =
1480 &xgene1_extra_costs,
1481 &xgene1_addrcost_table,
1482 &xgene1_regmove_cost,
1483 &xgene1_vector_cost,
1484 &generic_branch_cost,
1485 &xgene1_approx_modes,
1486 SVE_NOT_IMPLEMENTED, /* sve_width */
1487 6, /* memmov_cost */
1488 4, /* issue_rate */
1489 AARCH64_FUSE_NOTHING, /* fusible_ops */
1490 "16", /* function_align. */
1491 "16", /* jump_align. */
1492 "16", /* loop_align. */
1493 2, /* int_reassoc_width. */
1494 4, /* fp_reassoc_width. */
1495 1, /* vec_reassoc_width. */
1496 2, /* min_div_recip_mul_sf. */
1497 2, /* min_div_recip_mul_df. */
1498 17, /* max_case_values. */
1499 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1500 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1501 &xgene1_prefetch_tune
1504 static const struct tune_params emag_tunings =
1506 &xgene1_extra_costs,
1507 &xgene1_addrcost_table,
1508 &xgene1_regmove_cost,
1509 &xgene1_vector_cost,
1510 &generic_branch_cost,
1511 &xgene1_approx_modes,
1512 SVE_NOT_IMPLEMENTED,
1513 6, /* memmov_cost */
1514 4, /* issue_rate */
1515 AARCH64_FUSE_NOTHING, /* fusible_ops */
1516 "16", /* function_align. */
1517 "16", /* jump_align. */
1518 "16", /* loop_align. */
1519 2, /* int_reassoc_width. */
1520 4, /* fp_reassoc_width. */
1521 1, /* vec_reassoc_width. */
1522 2, /* min_div_recip_mul_sf. */
1523 2, /* min_div_recip_mul_df. */
1524 17, /* max_case_values. */
1525 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1526 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1527 &xgene1_prefetch_tune
1530 static const struct tune_params qdf24xx_tunings =
1532 &qdf24xx_extra_costs,
1533 &qdf24xx_addrcost_table,
1534 &qdf24xx_regmove_cost,
1535 &qdf24xx_vector_cost,
1536 &generic_branch_cost,
1537 &generic_approx_modes,
1538 SVE_NOT_IMPLEMENTED, /* sve_width */
1539 4, /* memmov_cost */
1540 4, /* issue_rate */
1541 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1542 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1543 "16", /* function_align. */
1544 "8", /* jump_align. */
1545 "16", /* loop_align. */
1546 2, /* int_reassoc_width. */
1547 4, /* fp_reassoc_width. */
1548 1, /* vec_reassoc_width. */
1549 2, /* min_div_recip_mul_sf. */
1550 2, /* min_div_recip_mul_df. */
1551 0, /* max_case_values. */
1552 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1553 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1554 &qdf24xx_prefetch_tune
1557 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1558 for now. */
1559 static const struct tune_params saphira_tunings =
1561 &generic_extra_costs,
1562 &generic_addrcost_table,
1563 &generic_regmove_cost,
1564 &generic_vector_cost,
1565 &generic_branch_cost,
1566 &generic_approx_modes,
1567 SVE_NOT_IMPLEMENTED, /* sve_width */
1568 4, /* memmov_cost */
1569 4, /* issue_rate */
1570 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1571 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1572 "16", /* function_align. */
1573 "8", /* jump_align. */
1574 "16", /* loop_align. */
1575 2, /* int_reassoc_width. */
1576 4, /* fp_reassoc_width. */
1577 1, /* vec_reassoc_width. */
1578 2, /* min_div_recip_mul_sf. */
1579 2, /* min_div_recip_mul_df. */
1580 0, /* max_case_values. */
1581 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1582 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1583 &generic_prefetch_tune
1586 static const struct tune_params thunderx2t99_tunings =
1588 &thunderx2t99_extra_costs,
1589 &thunderx2t99_addrcost_table,
1590 &thunderx2t99_regmove_cost,
1591 &thunderx2t99_vector_cost,
1592 &generic_branch_cost,
1593 &generic_approx_modes,
1594 SVE_NOT_IMPLEMENTED, /* sve_width */
1595 4, /* memmov_cost. */
1596 4, /* issue_rate. */
1597 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1598 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1599 "16", /* function_align. */
1600 "8", /* jump_align. */
1601 "16", /* loop_align. */
1602 3, /* int_reassoc_width. */
1603 2, /* fp_reassoc_width. */
1604 2, /* vec_reassoc_width. */
1605 2, /* min_div_recip_mul_sf. */
1606 2, /* min_div_recip_mul_df. */
1607 0, /* max_case_values. */
1608 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1609 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1610 &thunderx2t99_prefetch_tune
1613 static const struct tune_params thunderx3t110_tunings =
1615 &thunderx3t110_extra_costs,
1616 &thunderx3t110_addrcost_table,
1617 &thunderx3t110_regmove_cost,
1618 &thunderx3t110_vector_cost,
1619 &generic_branch_cost,
1620 &generic_approx_modes,
1621 SVE_NOT_IMPLEMENTED, /* sve_width */
1622 4, /* memmov_cost. */
1623 6, /* issue_rate. */
1624 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1625 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1626 "16", /* function_align. */
1627 "8", /* jump_align. */
1628 "16", /* loop_align. */
1629 3, /* int_reassoc_width. */
1630 2, /* fp_reassoc_width. */
1631 2, /* vec_reassoc_width. */
1632 2, /* min_div_recip_mul_sf. */
1633 2, /* min_div_recip_mul_df. */
1634 0, /* max_case_values. */
1635 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1636 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1637 &thunderx3t110_prefetch_tune
1640 static const struct tune_params neoversen1_tunings =
1642 &cortexa76_extra_costs,
1643 &generic_addrcost_table,
1644 &generic_regmove_cost,
1645 &cortexa57_vector_cost,
1646 &generic_branch_cost,
1647 &generic_approx_modes,
1648 SVE_NOT_IMPLEMENTED, /* sve_width */
1649 4, /* memmov_cost */
1650 3, /* issue_rate */
1651 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1652 "32:16", /* function_align. */
1653 "4", /* jump_align. */
1654 "32:16", /* loop_align. */
1655 2, /* int_reassoc_width. */
1656 4, /* fp_reassoc_width. */
1657 2, /* vec_reassoc_width. */
1658 2, /* min_div_recip_mul_sf. */
1659 2, /* min_div_recip_mul_df. */
1660 0, /* max_case_values. */
1661 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1662 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1663 &generic_prefetch_tune
1666 static const advsimd_vec_cost neoversev1_advsimd_vector_cost =
1668 2, /* int_stmt_cost */
1669 2, /* fp_stmt_cost */
1670 4, /* ld2_st2_permute_cost */
1671 4, /* ld3_st3_permute_cost */
1672 5, /* ld4_st4_permute_cost */
1673 3, /* permute_cost */
1674 4, /* reduc_i8_cost */
1675 4, /* reduc_i16_cost */
1676 2, /* reduc_i32_cost */
1677 2, /* reduc_i64_cost */
1678 6, /* reduc_f16_cost */
1679 3, /* reduc_f32_cost */
1680 2, /* reduc_f64_cost */
1681 2, /* store_elt_extra_cost */
1682 /* This value is just inherited from the Cortex-A57 table. */
1683 8, /* vec_to_scalar_cost */
1684 /* This depends very much on what the scalar value is and
1685 where it comes from. E.g. some constants take two dependent
1686 instructions or a load, while others might be moved from a GPR.
1687 4 seems to be a reasonable compromise in practice. */
1688 4, /* scalar_to_vec_cost */
1689 4, /* align_load_cost */
1690 4, /* unalign_load_cost */
1691 /* Although stores have a latency of 2 and compete for the
1692 vector pipes, in practice it's better not to model that. */
1693 1, /* unalign_store_cost */
1694 1 /* store_cost */
1697 static const sve_vec_cost neoversev1_sve_vector_cost =
1700 2, /* int_stmt_cost */
1701 2, /* fp_stmt_cost */
1702 4, /* ld2_st2_permute_cost */
1703 7, /* ld3_st3_permute_cost */
1704 8, /* ld4_st4_permute_cost */
1705 3, /* permute_cost */
1706 /* Theoretically, a reduction involving 31 scalar ADDs could
1707 complete in ~9 cycles and would have a cost of 31. [SU]ADDV
1708 completes in 14 cycles, so give it a cost of 31 + 5. */
1709 36, /* reduc_i8_cost */
1710 /* Likewise for 15 scalar ADDs (~5 cycles) vs. 12: 15 + 7. */
1711 22, /* reduc_i16_cost */
1712 /* Likewise for 7 scalar ADDs (~3 cycles) vs. 10: 7 + 7. */
1713 14, /* reduc_i32_cost */
1714 /* Likewise for 3 scalar ADDs (~2 cycles) vs. 10: 3 + 8. */
1715 11, /* reduc_i64_cost */
1716 /* Theoretically, a reduction involving 15 scalar FADDs could
1717 complete in ~9 cycles and would have a cost of 30. FADDV
1718 completes in 13 cycles, so give it a cost of 30 + 4. */
1719 34, /* reduc_f16_cost */
1720 /* Likewise for 7 scalar FADDs (~6 cycles) vs. 11: 14 + 5. */
1721 19, /* reduc_f32_cost */
1722 /* Likewise for 3 scalar FADDs (~4 cycles) vs. 9: 6 + 5. */
1723 11, /* reduc_f64_cost */
1724 2, /* store_elt_extra_cost */
1725 /* This value is just inherited from the Cortex-A57 table. */
1726 8, /* vec_to_scalar_cost */
1727 /* See the comment above the Advanced SIMD versions. */
1728 4, /* scalar_to_vec_cost */
1729 4, /* align_load_cost */
1730 4, /* unalign_load_cost */
1731 /* Although stores have a latency of 2 and compete for the
1732 vector pipes, in practice it's better not to model that. */
1733 1, /* unalign_store_cost */
1734 1 /* store_cost */
1736 3, /* clast_cost */
1737 19, /* fadda_f16_cost */
1738 11, /* fadda_f32_cost */
1739 8, /* fadda_f64_cost */
1740 3 /* scatter_store_elt_cost */
1743 static const aarch64_scalar_vec_issue_info neoversev1_scalar_issue_info =
1745 3, /* loads_stores_per_cycle */
1746 2, /* stores_per_cycle */
1747 4, /* general_ops_per_cycle */
1748 0, /* fp_simd_load_general_ops */
1749 1 /* fp_simd_store_general_ops */
1752 static const aarch64_advsimd_vec_issue_info neoversev1_advsimd_issue_info =
1755 3, /* loads_stores_per_cycle */
1756 2, /* stores_per_cycle */
1757 4, /* general_ops_per_cycle */
1758 0, /* fp_simd_load_general_ops */
1759 1 /* fp_simd_store_general_ops */
1761 2, /* ld2_st2_general_ops */
1762 2, /* ld3_st3_general_ops */
1763 3 /* ld4_st4_general_ops */
1766 static const aarch64_sve_vec_issue_info neoversev1_sve_issue_info =
1770 2, /* loads_per_cycle */
1771 2, /* stores_per_cycle */
1772 2, /* general_ops_per_cycle */
1773 0, /* fp_simd_load_general_ops */
1774 1 /* fp_simd_store_general_ops */
1776 2, /* ld2_st2_general_ops */
1777 2, /* ld3_st3_general_ops */
1778 3 /* ld4_st4_general_ops */
1780 1, /* pred_ops_per_cycle */
1781 2, /* while_pred_ops */
1782 2, /* int_cmp_pred_ops */
1783 1, /* fp_cmp_pred_ops */
1784 1, /* gather_scatter_pair_general_ops */
1785 1 /* gather_scatter_pair_pred_ops */
1788 static const aarch64_vec_issue_info neoversev1_vec_issue_info =
1790 &neoversev1_scalar_issue_info,
1791 &neoversev1_advsimd_issue_info,
1792 &neoversev1_sve_issue_info
1795 /* Neoverse V1 costs for vector insn classes. */
1796 static const struct cpu_vector_cost neoversev1_vector_cost =
1798 1, /* scalar_int_stmt_cost */
1799 2, /* scalar_fp_stmt_cost */
1800 4, /* scalar_load_cost */
1801 1, /* scalar_store_cost */
1802 1, /* cond_taken_branch_cost */
1803 1, /* cond_not_taken_branch_cost */
1804 &neoversev1_advsimd_vector_cost, /* advsimd */
1805 &neoversev1_sve_vector_cost, /* sve */
1806 &neoversev1_vec_issue_info /* issue_info */
1809 static const struct tune_params neoversev1_tunings =
1811 &cortexa76_extra_costs,
1812 &neoversev1_addrcost_table,
1813 &generic_regmove_cost,
1814 &neoversev1_vector_cost,
1815 &generic_branch_cost,
1816 &generic_approx_modes,
1817 SVE_256, /* sve_width */
1818 4, /* memmov_cost */
1819 3, /* issue_rate */
1820 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1821 "32:16", /* function_align. */
1822 "4", /* jump_align. */
1823 "32:16", /* loop_align. */
1824 2, /* int_reassoc_width. */
1825 4, /* fp_reassoc_width. */
1826 2, /* vec_reassoc_width. */
1827 2, /* min_div_recip_mul_sf. */
1828 2, /* min_div_recip_mul_df. */
1829 0, /* max_case_values. */
1830 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1831 (AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS
1832 | AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS
1833 | AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT), /* tune_flags. */
1834 &generic_prefetch_tune
1837 static const struct tune_params neoversen2_tunings =
1839 &cortexa76_extra_costs,
1840 &generic_addrcost_table,
1841 &generic_regmove_cost,
1842 &cortexa57_vector_cost,
1843 &generic_branch_cost,
1844 &generic_approx_modes,
1845 SVE_128, /* sve_width */
1846 4, /* memmov_cost */
1847 3, /* issue_rate */
1848 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1849 "32:16", /* function_align. */
1850 "4", /* jump_align. */
1851 "32:16", /* loop_align. */
1852 2, /* int_reassoc_width. */
1853 4, /* fp_reassoc_width. */
1854 2, /* vec_reassoc_width. */
1855 2, /* min_div_recip_mul_sf. */
1856 2, /* min_div_recip_mul_df. */
1857 0, /* max_case_values. */
1858 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1859 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1860 &generic_prefetch_tune
1863 static const struct tune_params a64fx_tunings =
1865 &a64fx_extra_costs,
1866 &a64fx_addrcost_table,
1867 &a64fx_regmove_cost,
1868 &a64fx_vector_cost,
1869 &generic_branch_cost,
1870 &generic_approx_modes,
1871 SVE_512, /* sve_width */
1872 4, /* memmov_cost */
1873 7, /* issue_rate */
1874 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1875 "32", /* function_align. */
1876 "16", /* jump_align. */
1877 "32", /* loop_align. */
1878 4, /* int_reassoc_width. */
1879 2, /* fp_reassoc_width. */
1880 2, /* vec_reassoc_width. */
1881 2, /* min_div_recip_mul_sf. */
1882 2, /* min_div_recip_mul_df. */
1883 0, /* max_case_values. */
1884 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1885 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1886 &a64fx_prefetch_tune
1889 /* Support for fine-grained override of the tuning structures. */
1890 struct aarch64_tuning_override_function
1892 const char* name;
1893 void (*parse_override)(const char*, struct tune_params*);
1896 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1897 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1898 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1900 static const struct aarch64_tuning_override_function
1901 aarch64_tuning_override_functions[] =
1903 { "fuse", aarch64_parse_fuse_string },
1904 { "tune", aarch64_parse_tune_string },
1905 { "sve_width", aarch64_parse_sve_width_string },
1906 { NULL, NULL }
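/* Editorial example (usage sketch, not upstream text): these entries back the
   -moverride= option, whose comma-separated tokens are dispatched by name to
   the parsers declared above, e.g.

     gcc -mcpu=neoverse-v1 -moverride=sve_width=512
     gcc -mtune=cortex-a57 -moverride=tune=rename_fma_regs,fuse=aes+aesmc

   The exact token spellings accepted by the "tune" and "fuse" parsers are
   defined elsewhere (aarch64-tuning-flags.def and aarch64-fusion-pairs.def),
   so the values shown are illustrative.  */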
1909 /* A processor implementing AArch64. */
1910 struct processor
1912 const char *const name;
1913 enum aarch64_processor ident;
1914 enum aarch64_processor sched_core;
1915 enum aarch64_arch arch;
1916 unsigned architecture_version;
1917 const uint64_t flags;
1918 const struct tune_params *const tune;
1921 /* Architectures implementing AArch64. */
1922 static const struct processor all_architectures[] =
1924 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1925 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1926 #include "aarch64-arches.def"
1927 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1930 /* Processor cores implementing AArch64. */
1931 static const struct processor all_cores[] =
1933 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1934 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1935 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1936 FLAGS, &COSTS##_tunings},
1937 #include "aarch64-cores.def"
1938 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1939 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1940 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1944 /* Target specification. These are populated by the -march, -mtune, -mcpu
1945 handling code or by target attributes. */
1946 static const struct processor *selected_arch;
1947 static const struct processor *selected_cpu;
1948 static const struct processor *selected_tune;
1950 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1952 /* The current tuning set. */
1953 struct tune_params aarch64_tune_params = generic_tunings;
1955 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1957 static tree
1958 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1959 int, bool *no_add_attrs)
1961 /* Since we set fn_type_req to true, the caller should have checked
1962 this for us. */
1963 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1964 switch ((arm_pcs) fntype_abi (*node).id ())
1966 case ARM_PCS_AAPCS64:
1967 case ARM_PCS_SIMD:
1968 return NULL_TREE;
1970 case ARM_PCS_SVE:
1971 error ("the %qE attribute cannot be applied to an SVE function type",
1972 name);
1973 *no_add_attrs = true;
1974 return NULL_TREE;
1976 case ARM_PCS_TLSDESC:
1977 case ARM_PCS_UNKNOWN:
1978 break;
1980 gcc_unreachable ();
1983 /* Table of machine attributes. */
1984 static const struct attribute_spec aarch64_attribute_table[] =
1986 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1987 affects_type_identity, handler, exclude } */
1988 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1989 handle_aarch64_vector_pcs_attribute, NULL },
1990 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
1991 aarch64_sve::handle_arm_sve_vector_bits_attribute,
1992 NULL },
1993 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
1994 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
1995 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
1996 { NULL, 0, 0, false, false, false, false, NULL, NULL }
1999 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
2001 /* An ISA extension in the co-processor and main instruction set space. */
2002 struct aarch64_option_extension
2004 const char *const name;
2005 const unsigned long flags_on;
2006 const unsigned long flags_off;
2009 typedef enum aarch64_cond_code
2011 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
2012 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
2013 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
2015 aarch64_cc;
2017 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
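/* For example, AARCH64_EQ (0) maps to AARCH64_NE (1) and AARCH64_GE (10)
   maps to AARCH64_LT (11): the enum above pairs each condition with its
   inverse in adjacent values, so flipping the low bit inverts it.  */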
2019 struct aarch64_branch_protect_type
2021 /* The type's name that the user passes to the branch-protection option
2022 string. */
2023 const char* name;
2024 /* Function to handle the protection type and set global variables.
2025 First argument is the string token corresponding with this type and the
2026 second argument is the next token in the option string.
2027 Return values:
2028 * AARCH64_PARSE_OK: Handling was successful.
2029 * AARCH64_INVALID_ARG: The type is invalid in this context and the caller
2030 should print an error.
2031 * AARCH64_INVALID_FEATURE: The type is invalid and the handler prints its
2032 own error. */
2033 enum aarch64_parse_opt_result (*handler)(char*, char*);
2034 /* A list of types that can follow this type in the option string. */
2035 const aarch64_branch_protect_type* subtypes;
2036 unsigned int num_subtypes;
2039 static enum aarch64_parse_opt_result
2040 aarch64_handle_no_branch_protection (char* str, char* rest)
2042 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
2043 aarch64_enable_bti = 0;
2044 if (rest)
2046 error ("unexpected %<%s%> after %<%s%>", rest, str);
2047 return AARCH64_PARSE_INVALID_FEATURE;
2049 return AARCH64_PARSE_OK;
2052 static enum aarch64_parse_opt_result
2053 aarch64_handle_standard_branch_protection (char* str, char* rest)
2055 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2056 aarch64_ra_sign_key = AARCH64_KEY_A;
2057 aarch64_enable_bti = 1;
2058 if (rest)
2060 error ("unexpected %<%s%> after %<%s%>", rest, str);
2061 return AARCH64_PARSE_INVALID_FEATURE;
2063 return AARCH64_PARSE_OK;
2066 static enum aarch64_parse_opt_result
2067 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
2068 char* rest ATTRIBUTE_UNUSED)
2070 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
2071 aarch64_ra_sign_key = AARCH64_KEY_A;
2072 return AARCH64_PARSE_OK;
2075 static enum aarch64_parse_opt_result
2076 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
2077 char* rest ATTRIBUTE_UNUSED)
2079 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
2080 return AARCH64_PARSE_OK;
2083 static enum aarch64_parse_opt_result
2084 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
2085 char* rest ATTRIBUTE_UNUSED)
2087 aarch64_ra_sign_key = AARCH64_KEY_B;
2088 return AARCH64_PARSE_OK;
2091 static enum aarch64_parse_opt_result
2092 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
2093 char* rest ATTRIBUTE_UNUSED)
2095 aarch64_enable_bti = 1;
2096 return AARCH64_PARSE_OK;
2099 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
2100 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
2101 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
2102 { NULL, NULL, NULL, 0 }
2105 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
2106 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
2107 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
2108 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
2109 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
2110 { "bti", aarch64_handle_bti_protection, NULL, 0 },
2111 { NULL, NULL, NULL, 0 }
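/* For example, a branch-protection string of "pac-ret+leaf+b-key" first
   runs the "pac-ret" handler (non-leaf scope, A key) and then its "leaf"
   and "b-key" subtype handlers, so it ends up with aarch64_ra_sign_scope
   == AARCH64_FUNCTION_ALL and aarch64_ra_sign_key == AARCH64_KEY_B, while
   "standard" enables non-leaf return-address signing and BTI together.  */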
2114 /* The condition codes of the processor, and the inverse function. */
2115 static const char * const aarch64_condition_codes[] =
2117 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
2118 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
2121 /* The preferred condition codes for SVE conditions. */
2122 static const char *const aarch64_sve_condition_codes[] =
2124 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
2125 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
2128 /* Return the assembly token for svpattern value VALUE. */
2130 static const char *
2131 svpattern_token (enum aarch64_svpattern pattern)
2133 switch (pattern)
2135 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
2136 AARCH64_FOR_SVPATTERN (CASE)
2137 #undef CASE
2138 case AARCH64_NUM_SVPATTERNS:
2139 break;
2141 gcc_unreachable ();
2144 /* Return the location of a piece that is known to be passed or returned
2145 in registers. FIRST_ZR is the first unused vector argument register
2146 and FIRST_PR is the first unused predicate argument register. */
2149 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
2150 unsigned int first_pr) const
2152 gcc_assert (VECTOR_MODE_P (mode)
2153 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
2154 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
2156 if (num_zr > 0 && num_pr == 0)
2157 return gen_rtx_REG (mode, first_zr);
2159 if (num_zr == 0 && num_pr == 1)
2160 return gen_rtx_REG (mode, first_pr);
2162 gcc_unreachable ();
2165 /* Return the total number of vector registers required by the PST. */
2167 unsigned int
2168 pure_scalable_type_info::num_zr () const
2170 unsigned int res = 0;
2171 for (unsigned int i = 0; i < pieces.length (); ++i)
2172 res += pieces[i].num_zr;
2173 return res;
2176 /* Return the total number of predicate registers required by the PST. */
2178 unsigned int
2179 pure_scalable_type_info::num_pr () const
2181 unsigned int res = 0;
2182 for (unsigned int i = 0; i < pieces.length (); ++i)
2183 res += pieces[i].num_pr;
2184 return res;
2187 /* Return the location of a PST that is known to be passed or returned
2188 in registers. FIRST_ZR is the first unused vector argument register
2189 and FIRST_PR is the first unused predicate argument register. */
2192 pure_scalable_type_info::get_rtx (machine_mode mode,
2193 unsigned int first_zr,
2194 unsigned int first_pr) const
2196 /* Try to return a single REG if possible. This leads to better
2197 code generation; it isn't required for correctness. */
2198 if (mode == pieces[0].mode)
2200 gcc_assert (pieces.length () == 1);
2201 return pieces[0].get_rtx (first_zr, first_pr);
2204 /* Build up a PARALLEL that contains the individual pieces. */
2205 rtvec rtxes = rtvec_alloc (pieces.length ());
2206 for (unsigned int i = 0; i < pieces.length (); ++i)
2208 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
2209 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
2210 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
2211 first_zr += pieces[i].num_zr;
2212 first_pr += pieces[i].num_pr;
2214 return gen_rtx_PARALLEL (mode, rtxes);
2217 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
2218 in the AAPCS64. */
2220 pure_scalable_type_info::analysis_result
2221 pure_scalable_type_info::analyze (const_tree type)
2223 /* Prevent accidental reuse. */
2224 gcc_assert (pieces.is_empty ());
2226 /* No code will be generated for erroneous types, so we won't establish
2227 an ABI mapping. */
2228 if (type == error_mark_node)
2229 return NO_ABI_IDENTITY;
2231 /* Zero-sized types disappear in the language->ABI mapping. */
2232 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2233 return NO_ABI_IDENTITY;
2235 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
2236 piece p = {};
2237 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
2239 machine_mode mode = TYPE_MODE_RAW (type);
2240 gcc_assert (VECTOR_MODE_P (mode)
2241 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
2243 p.mode = p.orig_mode = mode;
2244 add_piece (p);
2245 return IS_PST;
2248 /* Check for user-defined PSTs. */
2249 if (TREE_CODE (type) == ARRAY_TYPE)
2250 return analyze_array (type);
2251 if (TREE_CODE (type) == RECORD_TYPE)
2252 return analyze_record (type);
2254 return ISNT_PST;
2257 /* Analyze a type that is known not to be passed or returned in memory.
2258 Return true if it has an ABI identity and is a Pure Scalable Type. */
2260 bool
2261 pure_scalable_type_info::analyze_registers (const_tree type)
2263 analysis_result result = analyze (type);
2264 gcc_assert (result != DOESNT_MATTER);
2265 return result == IS_PST;
2268 /* Subroutine of analyze for handling ARRAY_TYPEs. */
2270 pure_scalable_type_info::analysis_result
2271 pure_scalable_type_info::analyze_array (const_tree type)
2273 /* Analyze the element type. */
2274 pure_scalable_type_info element_info;
2275 analysis_result result = element_info.analyze (TREE_TYPE (type));
2276 if (result != IS_PST)
2277 return result;
2279 /* An array of unknown, flexible or variable length will be passed and
2280 returned by reference whatever we do. */
2281 tree nelts_minus_one = array_type_nelts (type);
2282 if (!tree_fits_uhwi_p (nelts_minus_one))
2283 return DOESNT_MATTER;
2285 /* Likewise if the array is constant-sized but too big to be interesting.
2286 The double checks against MAX_PIECES are to protect against overflow. */
2287 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
2288 if (count > MAX_PIECES)
2289 return DOESNT_MATTER;
2290 count += 1;
2291 if (count * element_info.pieces.length () > MAX_PIECES)
2292 return DOESNT_MATTER;
2294 /* The above checks should have weeded out elements of unknown size. */
2295 poly_uint64 element_bytes;
2296 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
2297 gcc_unreachable ();
2299 /* Build up the list of individual vectors and predicates. */
2300 gcc_assert (!element_info.pieces.is_empty ());
2301 for (unsigned int i = 0; i < count; ++i)
2302 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
2304 piece p = element_info.pieces[j];
2305 p.offset += i * element_bytes;
2306 add_piece (p);
2308 return IS_PST;
2311 /* Subroutine of analyze for handling RECORD_TYPEs. */
2313 pure_scalable_type_info::analysis_result
2314 pure_scalable_type_info::analyze_record (const_tree type)
2316 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
2318 if (TREE_CODE (field) != FIELD_DECL)
2319 continue;
2321 /* Zero-sized fields disappear in the language->ABI mapping. */
2322 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
2323 continue;
2325 /* All fields with an ABI identity must be PSTs for the record as
2326 a whole to be a PST. If any individual field is too big to be
2327 interesting then the record is too. */
2328 pure_scalable_type_info field_info;
2329 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
2330 if (subresult == NO_ABI_IDENTITY)
2331 continue;
2332 if (subresult != IS_PST)
2333 return subresult;
2335 /* Since all previous fields are PSTs, we ought to be able to track
2336 the field offset using poly_ints. */
2337 tree bitpos = bit_position (field);
2338 gcc_assert (poly_int_tree_p (bitpos));
2340 /* For the same reason, it shouldn't be possible to create a PST field
2341 whose offset isn't byte-aligned. */
2342 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
2343 BITS_PER_UNIT);
2345 /* Punt if the record is too big to be interesting. */
2346 poly_uint64 bytepos;
2347 if (!wide_bytepos.to_uhwi (&bytepos)
2348 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
2349 return DOESNT_MATTER;
2351 /* Add the individual vectors and predicates in the field to the
2352 record's list. */
2353 gcc_assert (!field_info.pieces.is_empty ());
2354 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
2356 piece p = field_info.pieces[i];
2357 p.offset += bytepos;
2358 add_piece (p);
2361 /* Empty structures disappear in the language->ABI mapping. */
2362 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
2365 /* Add P to the list of pieces in the type. */
2367 void
2368 pure_scalable_type_info::add_piece (const piece &p)
2370 /* Try to fold the new piece into the previous one to form a
2371 single-mode PST. For example, if we see three consecutive vectors
2372 of the same mode, we can represent them using the corresponding
2373 3-tuple mode.
2375 This is purely an optimization. */
2376 if (!pieces.is_empty ())
2378 piece &prev = pieces.last ();
2379 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
2380 unsigned int nelems1, nelems2;
2381 if (prev.orig_mode == p.orig_mode
2382 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
2383 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
2384 GET_MODE_NUNITS (p.orig_mode), &nelems1)
2385 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
2386 GET_MODE_NUNITS (p.orig_mode), &nelems2)
2387 && targetm.array_mode (p.orig_mode,
2388 nelems1 + nelems2).exists (&prev.mode))
2390 prev.num_zr += p.num_zr;
2391 prev.num_pr += p.num_pr;
2392 return;
2395 pieces.quick_push (p);
2398 /* Return true if at least one possible value of type TYPE includes at
2399 least one object of Pure Scalable Type, in the sense of the AAPCS64.
2401 This is a relatively expensive test for some types, so it should
2402 generally be made as late as possible. */
2404 static bool
2405 aarch64_some_values_include_pst_objects_p (const_tree type)
2407 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
2408 return false;
2410 if (aarch64_sve::builtin_type_p (type))
2411 return true;
2413 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
2414 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
2416 if (RECORD_OR_UNION_TYPE_P (type))
2417 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
2418 if (TREE_CODE (field) == FIELD_DECL
2419 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
2420 return true;
2422 return false;
2425 /* Return the descriptor of the SIMD ABI. */
2427 static const predefined_function_abi &
2428 aarch64_simd_abi (void)
2430 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
2431 if (!simd_abi.initialized_p ())
2433 HARD_REG_SET full_reg_clobbers
2434 = default_function_abi.full_reg_clobbers ();
2435 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
2436 if (FP_SIMD_SAVED_REGNUM_P (regno))
2437 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2438 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
2440 return simd_abi;
2443 /* Return the descriptor of the SVE PCS. */
2445 static const predefined_function_abi &
2446 aarch64_sve_abi (void)
2448 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
2449 if (!sve_abi.initialized_p ())
2451 HARD_REG_SET full_reg_clobbers
2452 = default_function_abi.full_reg_clobbers ();
2453 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
2454 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2455 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
2456 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
2457 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
2459 return sve_abi;
2462 /* If X is an UNSPEC_SALT_ADDR expression, return the address that it
2463 wraps, otherwise return X itself. */
2465 static rtx
2466 strip_salt (rtx x)
2468 rtx search = x;
2469 if (GET_CODE (search) == CONST)
2470 search = XEXP (search, 0);
2471 if (GET_CODE (search) == UNSPEC && XINT (search, 1) == UNSPEC_SALT_ADDR)
2472 x = XVECEXP (search, 0, 0);
2473 return x;
2476 /* Like strip_offset, but also strip any UNSPEC_SALT_ADDR from the
2477 expression. */
2479 static rtx
2480 strip_offset_and_salt (rtx addr, poly_int64 *offset)
2482 return strip_salt (strip_offset (addr, offset));
2485 /* Generate code to enable conditional branches in functions over 1 MiB. */
2486 const char *
2487 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
2488 const char * branch_format)
2490 rtx_code_label * tmp_label = gen_label_rtx ();
2491 char label_buf[256];
2492 char buffer[128];
2493 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
2494 CODE_LABEL_NUMBER (tmp_label));
2495 const char *label_ptr = targetm.strip_name_encoding (label_buf);
2496 rtx dest_label = operands[pos_label];
2497 operands[pos_label] = tmp_label;
2499 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
2500 output_asm_insn (buffer, operands);
2502 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
2503 operands[pos_label] = dest_label;
2504 output_asm_insn (buffer, operands);
2505 return "";
2508 void
2509 aarch64_err_no_fpadvsimd (machine_mode mode)
2511 if (TARGET_GENERAL_REGS_ONLY)
2512 if (FLOAT_MODE_P (mode))
2513 error ("%qs is incompatible with the use of floating-point types",
2514 "-mgeneral-regs-only");
2515 else
2516 error ("%qs is incompatible with the use of vector types",
2517 "-mgeneral-regs-only");
2518 else
2519 if (FLOAT_MODE_P (mode))
2520 error ("%qs feature modifier is incompatible with the use of"
2521 " floating-point types", "+nofp");
2522 else
2523 error ("%qs feature modifier is incompatible with the use of"
2524 " vector types", "+nofp");
2527 /* Report when we try to do something that requires SVE when SVE is disabled.
2528 This is an error of last resort and isn't very high-quality. It usually
2529 involves attempts to measure the vector length in some way. */
2530 static void
2531 aarch64_report_sve_required (void)
2533 static bool reported_p = false;
2535 /* Avoid reporting a slew of messages for a single oversight. */
2536 if (reported_p)
2537 return;
2539 error ("this operation requires the SVE ISA extension");
2540 inform (input_location, "you can enable SVE using the command-line"
2541 " option %<-march%>, or by using the %<target%>"
2542 " attribute or pragma");
2543 reported_p = true;
2546 /* Return true if REGNO is P0-P15 or one of the special FFR-related
2547 registers. */
2548 inline bool
2549 pr_or_ffr_regnum_p (unsigned int regno)
2551 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
2554 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
2555 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
2556 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
2557 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
2558 and GENERAL_REGS is lower than the memory cost (in this case the best class
2559 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
2560 cost results in bad allocations with many redundant int<->FP moves which
2561 are expensive on various cores.
2562 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
2563 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
2564 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
2565 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
2566 The result of this is that it is no longer inefficient to have a higher
2567 memory move cost than the register move cost.
2570 static reg_class_t
2571 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
2572 reg_class_t best_class)
2574 machine_mode mode;
2576 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
2577 || !reg_class_subset_p (FP_REGS, allocno_class))
2578 return allocno_class;
2580 if (!reg_class_subset_p (GENERAL_REGS, best_class)
2581 || !reg_class_subset_p (FP_REGS, best_class))
2582 return best_class;
2584 mode = PSEUDO_REGNO_MODE (regno);
2585 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
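/* For example, a pseudo holding a DFmode value whose allocno class would
   otherwise be POINTER_AND_FP_REGS is narrowed to FP_REGS here, while a
   DImode pseudo in the same situation is narrowed to GENERAL_REGS,
   avoiding the redundant int<->FP moves described above.  */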
2588 static unsigned int
2589 aarch64_min_divisions_for_recip_mul (machine_mode mode)
2591 if (GET_MODE_UNIT_SIZE (mode) == 4)
2592 return aarch64_tune_params.min_div_recip_mul_sf;
2593 return aarch64_tune_params.min_div_recip_mul_df;
2596 /* Return the reassociation width of treeop OPC with mode MODE. */
2597 static int
2598 aarch64_reassociation_width (unsigned opc, machine_mode mode)
2600 if (VECTOR_MODE_P (mode))
2601 return aarch64_tune_params.vec_reassoc_width;
2602 if (INTEGRAL_MODE_P (mode))
2603 return aarch64_tune_params.int_reassoc_width;
2604 /* Avoid reassociating floating point addition so we emit more FMAs. */
2605 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
2606 return aarch64_tune_params.fp_reassoc_width;
2607 return 1;
2610 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
2611 unsigned
2612 aarch64_dbx_register_number (unsigned regno)
2614 if (GP_REGNUM_P (regno))
2615 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2616 else if (regno == SP_REGNUM)
2617 return AARCH64_DWARF_SP;
2618 else if (FP_REGNUM_P (regno))
2619 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
2620 else if (PR_REGNUM_P (regno))
2621 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2622 else if (regno == VG_REGNUM)
2623 return AARCH64_DWARF_VG;
2625 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2626 equivalent DWARF register. */
2627 return DWARF_FRAME_REGISTERS;
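/* For example, x5 maps to AARCH64_DWARF_R0 + 5 and v3 to AARCH64_DWARF_V0 + 3,
   while registers with no DWARF equivalent (such as the FFR) fall through
   to the DWARF_FRAME_REGISTERS sentinel.  */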
2630 /* If X is a CONST_DOUBLE, return its bit representation as a constant
2631 integer, otherwise return X unmodified. */
2632 static rtx
2633 aarch64_bit_representation (rtx x)
2635 if (CONST_DOUBLE_P (x))
2636 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2637 return x;
2640 /* Return an estimate for the number of quadwords in an SVE vector. This is
2641 equivalent to the number of Advanced SIMD vectors in an SVE vector. */
2642 static unsigned int
2643 aarch64_estimated_sve_vq ()
2645 return estimated_poly_value (BITS_PER_SVE_VECTOR) / 128;
2648 /* Return true if MODE is any of the Advanced SIMD structure modes. */
2649 static bool
2650 aarch64_advsimd_struct_mode_p (machine_mode mode)
2652 return (TARGET_SIMD
2653 && (mode == OImode || mode == CImode || mode == XImode));
2656 /* Return true if MODE is an SVE predicate mode. */
2657 static bool
2658 aarch64_sve_pred_mode_p (machine_mode mode)
2660 return (TARGET_SVE
2661 && (mode == VNx16BImode
2662 || mode == VNx8BImode
2663 || mode == VNx4BImode
2664 || mode == VNx2BImode));
2667 /* Three mutually-exclusive flags describing a vector or predicate type. */
2668 const unsigned int VEC_ADVSIMD = 1;
2669 const unsigned int VEC_SVE_DATA = 2;
2670 const unsigned int VEC_SVE_PRED = 4;
2671 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2672 a structure of 2, 3 or 4 vectors. */
2673 const unsigned int VEC_STRUCT = 8;
2674 /* Can be used in combination with VEC_SVE_DATA to indicate that the
2675 vector has fewer significant bytes than a full SVE vector. */
2676 const unsigned int VEC_PARTIAL = 16;
2677 /* Useful combinations of the above. */
2678 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
2679 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2681 /* Return a set of flags describing the vector properties of mode MODE.
2682 Ignore modes that are not supported by the current target. */
2683 static unsigned int
2684 aarch64_classify_vector_mode (machine_mode mode)
2686 if (aarch64_advsimd_struct_mode_p (mode))
2687 return VEC_ADVSIMD | VEC_STRUCT;
2689 if (aarch64_sve_pred_mode_p (mode))
2690 return VEC_SVE_PRED;
2692 /* Make the decision based on the mode's enum value rather than its
2693 properties, so that we keep the correct classification regardless
2694 of -msve-vector-bits. */
2695 switch (mode)
2697 /* Partial SVE QI vectors. */
2698 case E_VNx2QImode:
2699 case E_VNx4QImode:
2700 case E_VNx8QImode:
2701 /* Partial SVE HI vectors. */
2702 case E_VNx2HImode:
2703 case E_VNx4HImode:
2704 /* Partial SVE SI vector. */
2705 case E_VNx2SImode:
2706 /* Partial SVE HF vectors. */
2707 case E_VNx2HFmode:
2708 case E_VNx4HFmode:
2709 /* Partial SVE BF vectors. */
2710 case E_VNx2BFmode:
2711 case E_VNx4BFmode:
2712 /* Partial SVE SF vector. */
2713 case E_VNx2SFmode:
2714 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2716 case E_VNx16QImode:
2717 case E_VNx8HImode:
2718 case E_VNx4SImode:
2719 case E_VNx2DImode:
2720 case E_VNx8BFmode:
2721 case E_VNx8HFmode:
2722 case E_VNx4SFmode:
2723 case E_VNx2DFmode:
2724 return TARGET_SVE ? VEC_SVE_DATA : 0;
2726 /* x2 SVE vectors. */
2727 case E_VNx32QImode:
2728 case E_VNx16HImode:
2729 case E_VNx8SImode:
2730 case E_VNx4DImode:
2731 case E_VNx16BFmode:
2732 case E_VNx16HFmode:
2733 case E_VNx8SFmode:
2734 case E_VNx4DFmode:
2735 /* x3 SVE vectors. */
2736 case E_VNx48QImode:
2737 case E_VNx24HImode:
2738 case E_VNx12SImode:
2739 case E_VNx6DImode:
2740 case E_VNx24BFmode:
2741 case E_VNx24HFmode:
2742 case E_VNx12SFmode:
2743 case E_VNx6DFmode:
2744 /* x4 SVE vectors. */
2745 case E_VNx64QImode:
2746 case E_VNx32HImode:
2747 case E_VNx16SImode:
2748 case E_VNx8DImode:
2749 case E_VNx32BFmode:
2750 case E_VNx32HFmode:
2751 case E_VNx16SFmode:
2752 case E_VNx8DFmode:
2753 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2755 /* 64-bit Advanced SIMD vectors. */
2756 case E_V8QImode:
2757 case E_V4HImode:
2758 case E_V2SImode:
2759 /* ...E_V1DImode doesn't exist. */
2760 case E_V4HFmode:
2761 case E_V4BFmode:
2762 case E_V2SFmode:
2763 case E_V1DFmode:
2764 /* 128-bit Advanced SIMD vectors. */
2765 case E_V16QImode:
2766 case E_V8HImode:
2767 case E_V4SImode:
2768 case E_V2DImode:
2769 case E_V8HFmode:
2770 case E_V8BFmode:
2771 case E_V4SFmode:
2772 case E_V2DFmode:
2773 return TARGET_SIMD ? VEC_ADVSIMD : 0;
2775 default:
2776 return 0;
2780 /* Return true if MODE is any of the data vector modes, including
2781 structure modes. */
2782 static bool
2783 aarch64_vector_data_mode_p (machine_mode mode)
2785 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
2788 /* Return true if MODE is any form of SVE mode, including predicates,
2789 vectors and structures. */
2790 bool
2791 aarch64_sve_mode_p (machine_mode mode)
2793 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2796 /* Return true if MODE is an SVE data vector mode; either a single vector
2797 or a structure of vectors. */
2798 static bool
2799 aarch64_sve_data_mode_p (machine_mode mode)
2801 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
2804 /* Return the number of defined bytes in one constituent vector of
2805 SVE mode MODE, which has vector flags VEC_FLAGS. */
2806 static poly_int64
2807 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2809 if (vec_flags & VEC_PARTIAL)
2810 /* A single partial vector. */
2811 return GET_MODE_SIZE (mode);
2813 if (vec_flags & VEC_SVE_DATA)
2814 /* A single vector or a tuple. */
2815 return BYTES_PER_SVE_VECTOR;
2817 /* A single predicate. */
2818 gcc_assert (vec_flags & VEC_SVE_PRED);
2819 return BYTES_PER_SVE_PRED;
2822 /* Implement target hook TARGET_ARRAY_MODE. */
2823 static opt_machine_mode
2824 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2826 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2827 && IN_RANGE (nelems, 2, 4))
2828 return mode_for_vector (GET_MODE_INNER (mode),
2829 GET_MODE_NUNITS (mode) * nelems);
2831 return opt_machine_mode ();
2834 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
2835 static bool
2836 aarch64_array_mode_supported_p (machine_mode mode,
2837 unsigned HOST_WIDE_INT nelems)
2839 if (TARGET_SIMD
2840 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2841 || AARCH64_VALID_SIMD_DREG_MODE (mode))
2842 && (nelems >= 2 && nelems <= 4))
2843 return true;
2845 return false;
2848 /* MODE is some form of SVE vector mode. For data modes, return the number
2849 of vector register bits that each element of MODE occupies, such as 64
2850 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2851 in a 64-bit container). For predicate modes, return the number of
2852 data bits controlled by each significant predicate bit. */
2854 static unsigned int
2855 aarch64_sve_container_bits (machine_mode mode)
2857 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2858 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2859 ? BITS_PER_SVE_VECTOR
2860 : GET_MODE_BITSIZE (mode));
2861 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
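/* For example, this returns 64 for the partial vector mode VNx2SImode
   (each 32-bit element occupies a 64-bit container) and 32 for the
   predicate mode VNx4BImode (each significant predicate bit controls
   32 bits of data).  */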
2864 /* Return the SVE predicate mode to use for elements that have
2865 ELEM_NBYTES bytes, if such a mode exists. */
2867 opt_machine_mode
2868 aarch64_sve_pred_mode (unsigned int elem_nbytes)
2870 if (TARGET_SVE)
2872 if (elem_nbytes == 1)
2873 return VNx16BImode;
2874 if (elem_nbytes == 2)
2875 return VNx8BImode;
2876 if (elem_nbytes == 4)
2877 return VNx4BImode;
2878 if (elem_nbytes == 8)
2879 return VNx2BImode;
2881 return opt_machine_mode ();
2884 /* Return the SVE predicate mode that should be used to control
2885 SVE mode MODE. */
2887 machine_mode
2888 aarch64_sve_pred_mode (machine_mode mode)
2890 unsigned int bits = aarch64_sve_container_bits (mode);
2891 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2894 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
2896 static opt_machine_mode
2897 aarch64_get_mask_mode (machine_mode mode)
2899 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2900 if (vec_flags & VEC_SVE_DATA)
2901 return aarch64_sve_pred_mode (mode);
2903 return default_get_mask_mode (mode);
2906 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
2908 opt_machine_mode
2909 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2911 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2912 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2913 machine_mode mode;
2914 FOR_EACH_MODE_IN_CLASS (mode, mclass)
2915 if (inner_mode == GET_MODE_INNER (mode)
2916 && known_eq (nunits, GET_MODE_NUNITS (mode))
2917 && aarch64_sve_data_mode_p (mode))
2918 return mode;
2919 return opt_machine_mode ();
2922 /* Return the integer element mode associated with SVE mode MODE. */
2924 static scalar_int_mode
2925 aarch64_sve_element_int_mode (machine_mode mode)
2927 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2928 ? BITS_PER_SVE_VECTOR
2929 : GET_MODE_BITSIZE (mode));
2930 unsigned int elt_bits = vector_element_size (vector_bits,
2931 GET_MODE_NUNITS (mode));
2932 return int_mode_for_size (elt_bits, 0).require ();
2935 /* Return an integer element mode that contains exactly
2936 aarch64_sve_container_bits (MODE) bits. This is wider than
2937 aarch64_sve_element_int_mode if MODE is a partial vector,
2938 otherwise it's the same. */
2940 static scalar_int_mode
2941 aarch64_sve_container_int_mode (machine_mode mode)
2943 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2946 /* Return the integer vector mode associated with SVE mode MODE.
2947 Unlike related_int_vector_mode, this can handle the case in which
2948 MODE is a predicate (and thus has a different total size). */
2950 machine_mode
2951 aarch64_sve_int_mode (machine_mode mode)
2953 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2954 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2957 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
2959 static opt_machine_mode
2960 aarch64_vectorize_related_mode (machine_mode vector_mode,
2961 scalar_mode element_mode,
2962 poly_uint64 nunits)
2964 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2966 /* If we're operating on SVE vectors, try to return an SVE mode. */
2967 poly_uint64 sve_nunits;
2968 if ((vec_flags & VEC_SVE_DATA)
2969 && multiple_p (BYTES_PER_SVE_VECTOR,
2970 GET_MODE_SIZE (element_mode), &sve_nunits))
2972 machine_mode sve_mode;
2973 if (maybe_ne (nunits, 0U))
2975 /* Try to find a full or partial SVE mode with exactly
2976 NUNITS units. */
2977 if (multiple_p (sve_nunits, nunits)
2978 && aarch64_sve_data_mode (element_mode,
2979 nunits).exists (&sve_mode))
2980 return sve_mode;
2982 else
2984 /* Take the preferred number of units from the number of bytes
2985 that fit in VECTOR_MODE. We always start by "autodetecting"
2986 a full vector mode with preferred_simd_mode, so vectors
2987 chosen here will also be full vector modes. Then
2988 autovectorize_vector_modes tries smaller starting modes
2989 and thus smaller preferred numbers of units. */
2990 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2991 if (aarch64_sve_data_mode (element_mode,
2992 sve_nunits).exists (&sve_mode))
2993 return sve_mode;
2997 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
2998 if ((vec_flags & VEC_ADVSIMD)
2999 && known_eq (nunits, 0U)
3000 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
3001 && maybe_ge (GET_MODE_BITSIZE (element_mode)
3002 * GET_MODE_NUNITS (vector_mode), 128U))
3004 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
3005 if (VECTOR_MODE_P (res))
3006 return res;
3009 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
3012 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
3013 prefer to use the first arithmetic operand as the else value if
3014 the else value doesn't matter, since that exactly matches the SVE
3015 destructive merging form. For ternary operations we could either
3016 pick the first operand and use FMAD-like instructions or the last
3017 operand and use FMLA-like instructions; the latter seems more
3018 natural. */
3020 static tree
3021 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
3023 return nops == 3 ? ops[2] : ops[0];
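/* For example, for a predicated binary operation such as a conditional
   addition this returns ops[0], matching the destructive SVE form
   (e.g. "add z0.s, p0/m, z0.s, z1.s"), while for a fused multiply-add
   it returns the accumulator ops[2], matching the FMLA form.  */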
3026 /* Implement TARGET_HARD_REGNO_NREGS. */
3028 static unsigned int
3029 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
3031 /* ??? Logically we should only need to provide a value when
3032 HARD_REGNO_MODE_OK says that the combination is valid,
3033 but at the moment we need to handle all modes. Just ignore
3034 any runtime parts for registers that can't store them. */
3035 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
3036 switch (aarch64_regno_regclass (regno))
3038 case FP_REGS:
3039 case FP_LO_REGS:
3040 case FP_LO8_REGS:
3042 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3043 if (vec_flags & VEC_SVE_DATA)
3044 return exact_div (GET_MODE_SIZE (mode),
3045 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
3046 return CEIL (lowest_size, UNITS_PER_VREG);
3048 case PR_REGS:
3049 case PR_LO_REGS:
3050 case PR_HI_REGS:
3051 case FFR_REGS:
3052 case PR_AND_FFR_REGS:
3053 return 1;
3054 default:
3055 return CEIL (lowest_size, UNITS_PER_WORD);
3057 gcc_unreachable ();
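/* For example, a VNx2DImode value occupies a single FP register regardless
   of -msve-vector-bits, whereas TImode needs CEIL (16, 8) == 2
   general-purpose registers.  */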
3060 /* Implement TARGET_HARD_REGNO_MODE_OK. */
3062 static bool
3063 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
3065 if (GET_MODE_CLASS (mode) == MODE_CC)
3066 return regno == CC_REGNUM;
3068 if (regno == VG_REGNUM)
3069 /* This must have the same size as _Unwind_Word. */
3070 return mode == DImode;
3072 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3073 if (vec_flags & VEC_SVE_PRED)
3074 return pr_or_ffr_regnum_p (regno);
3076 if (pr_or_ffr_regnum_p (regno))
3077 return false;
3079 if (regno == SP_REGNUM)
3080 /* The purpose of comparing with ptr_mode is to support the
3081 global register variable associated with the stack pointer
3082 register via the syntax of asm ("wsp") in ILP32. */
3083 return mode == Pmode || mode == ptr_mode;
3085 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
3086 return mode == Pmode;
3088 if (GP_REGNUM_P (regno))
3090 if (vec_flags & VEC_ANY_SVE)
3091 return false;
3092 if (known_le (GET_MODE_SIZE (mode), 8))
3093 return true;
3094 if (known_le (GET_MODE_SIZE (mode), 16))
3095 return (regno & 1) == 0;
3097 else if (FP_REGNUM_P (regno))
3099 if (vec_flags & VEC_STRUCT)
3100 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
3101 else
3102 return !VECTOR_MODE_P (mode) || vec_flags != 0;
3105 return false;
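/* For example, in the general-purpose register file TImode is only allowed
   in even-numbered registers, SVE predicate modes are only allowed in
   P0-P15 and the FFR registers, and Advanced SIMD structure modes such as
   OImode must fit entirely within V0-V31.  */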
3108 /* Return true if a function with type FNTYPE returns its value in
3109 SVE vector or predicate registers. */
3111 static bool
3112 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
3114 tree return_type = TREE_TYPE (fntype);
3116 pure_scalable_type_info pst_info;
3117 switch (pst_info.analyze (return_type))
3119 case pure_scalable_type_info::IS_PST:
3120 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
3121 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
3123 case pure_scalable_type_info::DOESNT_MATTER:
3124 gcc_assert (aarch64_return_in_memory_1 (return_type));
3125 return false;
3127 case pure_scalable_type_info::NO_ABI_IDENTITY:
3128 case pure_scalable_type_info::ISNT_PST:
3129 return false;
3131 gcc_unreachable ();
3134 /* Return true if a function with type FNTYPE takes arguments in
3135 SVE vector or predicate registers. */
3137 static bool
3138 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
3140 CUMULATIVE_ARGS args_so_far_v;
3141 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
3142 NULL_TREE, 0, true);
3143 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
3145 for (tree chain = TYPE_ARG_TYPES (fntype);
3146 chain && chain != void_list_node;
3147 chain = TREE_CHAIN (chain))
3149 tree arg_type = TREE_VALUE (chain);
3150 if (arg_type == error_mark_node)
3151 return false;
3153 function_arg_info arg (arg_type, /*named=*/true);
3154 apply_pass_by_reference_rules (&args_so_far_v, arg);
3155 pure_scalable_type_info pst_info;
3156 if (pst_info.analyze_registers (arg.type))
3158 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
3159 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
3160 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
3161 return true;
3164 targetm.calls.function_arg_advance (args_so_far, arg);
3166 return false;
3169 /* Implement TARGET_FNTYPE_ABI. */
3171 static const predefined_function_abi &
3172 aarch64_fntype_abi (const_tree fntype)
3174 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
3175 return aarch64_simd_abi ();
3177 if (aarch64_returns_value_in_sve_regs_p (fntype)
3178 || aarch64_takes_arguments_in_sve_regs_p (fntype))
3179 return aarch64_sve_abi ();
3181 return default_function_abi;
3184 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
3186 static bool
3187 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
3189 return (aarch64_sve::builtin_type_p (type1)
3190 == aarch64_sve::builtin_type_p (type2));
3193 /* Return true if we should emit CFI for register REGNO. */
3195 static bool
3196 aarch64_emit_cfi_for_reg_p (unsigned int regno)
3198 return (GP_REGNUM_P (regno)
3199 || !default_function_abi.clobbers_full_reg_p (regno));
3202 /* Return the mode we should use to save and restore register REGNO. */
3204 static machine_mode
3205 aarch64_reg_save_mode (unsigned int regno)
3207 if (GP_REGNUM_P (regno))
3208 return DImode;
3210 if (FP_REGNUM_P (regno))
3211 switch (crtl->abi->id ())
3213 case ARM_PCS_AAPCS64:
3214 /* Only the low 64 bits are saved by the base PCS. */
3215 return DFmode;
3217 case ARM_PCS_SIMD:
3218 /* The vector PCS saves the low 128 bits (which is the full
3219 register on non-SVE targets). */
3220 return TFmode;
3222 case ARM_PCS_SVE:
3223 /* Use vectors of DImode for registers that need frame
3224 information, so that the first 64 bits of the save slot
3225 are always the equivalent of what storing D<n> would give. */
3226 if (aarch64_emit_cfi_for_reg_p (regno))
3227 return VNx2DImode;
3229 /* Use vectors of bytes otherwise, so that the layout is
3230 endian-agnostic, and so that we can use LDR and STR for
3231 big-endian targets. */
3232 return VNx16QImode;
3234 case ARM_PCS_TLSDESC:
3235 case ARM_PCS_UNKNOWN:
3236 break;
3239 if (PR_REGNUM_P (regno))
3240 /* Save the full predicate register. */
3241 return VNx16BImode;
3243 gcc_unreachable ();
3246 /* Implement TARGET_INSN_CALLEE_ABI. */
3248 const predefined_function_abi &
3249 aarch64_insn_callee_abi (const rtx_insn *insn)
3251 rtx pat = PATTERN (insn);
3252 gcc_assert (GET_CODE (pat) == PARALLEL);
3253 rtx unspec = XVECEXP (pat, 0, 1);
3254 gcc_assert (GET_CODE (unspec) == UNSPEC
3255 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
3256 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
3259 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
3260 the lower 64 bits of a 128-bit register. Tell the compiler the callee
3261 clobbers the top 64 bits when restoring the bottom 64 bits. */
3263 static bool
3264 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
3265 unsigned int regno,
3266 machine_mode mode)
3268 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
3270 poly_int64 per_register_size = GET_MODE_SIZE (mode);
3271 unsigned int nregs = hard_regno_nregs (regno, mode);
3272 if (nregs > 1)
3273 per_register_size = exact_div (per_register_size, nregs);
3274 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
3275 return maybe_gt (per_register_size, 16);
3276 return maybe_gt (per_register_size, 8);
3278 return false;
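/* For example, under the base AAPCS64 a call preserves only the low 64 bits
   of V8-V23, so a TFmode value held in one of those registers is reported
   as part-clobbered; under the vector PCS the full 128 bits are preserved
   and only wider SVE modes are reported as part-clobbered.  */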
3281 /* Implement REGMODE_NATURAL_SIZE. */
3282 poly_uint64
3283 aarch64_regmode_natural_size (machine_mode mode)
3285 /* The natural size for SVE data modes is one SVE data vector,
3286 and similarly for predicates. We can't independently modify
3287 anything smaller than that. */
3288 /* ??? For now, only do this for variable-width SVE registers.
3289 Doing it for constant-sized registers breaks lower-subreg.c. */
3290 /* ??? And once that's fixed, we should probably have similar
3291 code for Advanced SIMD. */
3292 if (!aarch64_sve_vg.is_constant ())
3294 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
3295 if (vec_flags & VEC_SVE_PRED)
3296 return BYTES_PER_SVE_PRED;
3297 if (vec_flags & VEC_SVE_DATA)
3298 return BYTES_PER_SVE_VECTOR;
3300 return UNITS_PER_WORD;
3303 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
3304 machine_mode
3305 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
3306 machine_mode mode)
3308 /* The predicate mode determines which bits are significant and
3309 which are "don't care". Decreasing the number of lanes would
3310 lose data while increasing the number of lanes would make bits
3311 unnecessarily significant. */
3312 if (PR_REGNUM_P (regno))
3313 return mode;
3314 if (known_ge (GET_MODE_SIZE (mode), 4))
3315 return mode;
3316 else
3317 return SImode;
3320 /* Return true if I's bits are consecutive ones from the MSB. */
3321 bool
3322 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
3324 return exact_log2 (-i) != HOST_WIDE_INT_M1;
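/* For example, i == 0xffff000000000000 gives -i == 0x0001000000000000, a
   power of two, so exact_log2 succeeds; if the set bits were not contiguous
   from the MSB, -i would have more than one bit set and exact_log2 would
   return -1 (as it does for i == 0).  */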
3327 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
3328 that strcpy from constants will be faster. */
3330 static HOST_WIDE_INT
3331 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
3333 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
3334 return MAX (align, BITS_PER_WORD);
3335 return align;
3338 /* Return true if calls to DECL should be treated as
3339 long-calls (i.e. called via a register). */
3340 static bool
3341 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
3343 return false;
3346 /* Return true if calls to symbol-ref SYM should be treated as
3347 long-calls (i.e. called via a register). */
3348 bool
3349 aarch64_is_long_call_p (rtx sym)
3351 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
3354 /* Return true if calls to symbol-ref SYM should not go through
3355 plt stubs. */
3357 bool
3358 aarch64_is_noplt_call_p (rtx sym)
3360 const_tree decl = SYMBOL_REF_DECL (sym);
3362 if (flag_pic
3363 && decl
3364 && (!flag_plt
3365 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
3366 && !targetm.binds_local_p (decl))
3367 return true;
3369 return false;
3372 /* Emit an insn that's a simple single-set. Both the operands must be
3373 known to be valid. */
3374 inline static rtx_insn *
3375 emit_set_insn (rtx x, rtx y)
3377 return emit_insn (gen_rtx_SET (x, y));
3380 /* X and Y are two things to compare using CODE. Emit the compare insn and
3381 return the rtx for register 0 in the proper mode. */
3383 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
3385 machine_mode cmp_mode = GET_MODE (x);
3386 machine_mode cc_mode;
3387 rtx cc_reg;
3389 if (cmp_mode == TImode)
3391 gcc_assert (code == NE);
3393 cc_mode = CCmode;
3394 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3396 rtx x_lo = operand_subword (x, 0, 0, TImode);
3397 rtx y_lo = operand_subword (y, 0, 0, TImode);
3398 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
3400 rtx x_hi = operand_subword (x, 1, 0, TImode);
3401 rtx y_hi = operand_subword (y, 1, 0, TImode);
3402 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
3403 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
3404 GEN_INT (AARCH64_EQ)));
3406 else
3408 cc_mode = SELECT_CC_MODE (code, x, y);
3409 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3410 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
3412 return cc_reg;
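/* For the TImode case above, the emitted sequence is roughly
     cmp  x_lo, y_lo
     ccmp x_hi, y_hi, 0, eq
   i.e. the high halves are compared only if the low halves are equal;
   otherwise the flags are forced to a "not equal" state.  */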
3415 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
3417 static rtx
3418 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
3419 machine_mode y_mode)
3421 if (y_mode == E_QImode || y_mode == E_HImode)
3423 if (CONST_INT_P (y))
3425 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
3426 y_mode = SImode;
3428 else
3430 rtx t, cc_reg;
3431 machine_mode cc_mode;
3433 t = gen_rtx_ZERO_EXTEND (SImode, y);
3434 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
3435 cc_mode = CC_SWPmode;
3436 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
3437 emit_set_insn (cc_reg, t);
3438 return cc_reg;
3442 if (!aarch64_plus_operand (y, y_mode))
3443 y = force_reg (y_mode, y);
3445 return aarch64_gen_compare_reg (code, x, y);
3448 /* Build the SYMBOL_REF for __tls_get_addr. */
3450 static GTY(()) rtx tls_get_addr_libfunc;
3453 aarch64_tls_get_addr (void)
3455 if (!tls_get_addr_libfunc)
3456 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
3457 return tls_get_addr_libfunc;
3460 /* Return the TLS model to use for ADDR. */
3462 static enum tls_model
3463 tls_symbolic_operand_type (rtx addr)
3465 enum tls_model tls_kind = TLS_MODEL_NONE;
3466 poly_int64 offset;
3467 addr = strip_offset_and_salt (addr, &offset);
3468 if (SYMBOL_REF_P (addr))
3469 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
3471 return tls_kind;
3474 /* We allow lo_sum expressions in our legitimate addresses, so that
3475 combine can take care of merging addresses where necessary; for
3476 generation purposes, however, we generate the address
3477 as follows:
3478 RTL Absolute
3479 tmp = hi (symbol_ref); adrp x1, foo
3480 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
3483 PIC TLS
3484 adrp x1, :got:foo adrp tmp, :tlsgd:foo
3485 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
3486 bl __tls_get_addr
3489 Load TLS symbol, depending on TLS mechanism and TLS access model.
3491 Global Dynamic - Traditional TLS:
3492 adrp tmp, :tlsgd:imm
3493 add dest, tmp, #:tlsgd_lo12:imm
3494 bl __tls_get_addr
3496 Global Dynamic - TLS Descriptors:
3497 adrp dest, :tlsdesc:imm
3498 ldr tmp, [dest, #:tlsdesc_lo12:imm]
3499 add dest, dest, #:tlsdesc_lo12:imm
3500 blr tmp
3501 mrs tp, tpidr_el0
3502 add dest, dest, tp
3504 Initial Exec:
3505 mrs tp, tpidr_el0
3506 adrp tmp, :gottprel:imm
3507 ldr dest, [tmp, #:gottprel_lo12:imm]
3508 add dest, dest, tp
3510 Local Exec:
3511 mrs tp, tpidr_el0
3512 add t0, tp, #:tprel_hi12:imm, lsl #12
3513 add t0, t0, #:tprel_lo12_nc:imm
3516 static void
3517 aarch64_load_symref_appropriately (rtx dest, rtx imm,
3518 enum aarch64_symbol_type type)
3520 switch (type)
3522 case SYMBOL_SMALL_ABSOLUTE:
3524 /* In ILP32, the mode of dest can be either SImode or DImode. */
3525 rtx tmp_reg = dest;
3526 machine_mode mode = GET_MODE (dest);
3528 gcc_assert (mode == Pmode || mode == ptr_mode);
3530 if (can_create_pseudo_p ())
3531 tmp_reg = gen_reg_rtx (mode);
3533 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3534 emit_insn (gen_add_losym (dest, tmp_reg, imm));
3535 return;
3538 case SYMBOL_TINY_ABSOLUTE:
3539 emit_insn (gen_rtx_SET (dest, imm));
3540 return;
3542 case SYMBOL_SMALL_GOT_28K:
3544 machine_mode mode = GET_MODE (dest);
3545 rtx gp_rtx = pic_offset_table_rtx;
3546 rtx insn;
3547 rtx mem;
3549 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
3550 here before RTL expansion. Tree IVOPTS generates RTL patterns to
3551 decide rtx costs, in which case pic_offset_table_rtx is not
3552 initialized. In that case there is no need to generate the first
3553 adrp instruction, as the final cost for a global variable access
3554 is one instruction. */
3555 if (gp_rtx != NULL)
3557 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
3558 use the page base as the GOT base, the first page may be wasted;
3559 in the worst case there is only 28K of space for the GOT).
3561 The generated instruction sequence for accessing a global variable is:
3564 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3566 Only one instruction is needed, but we must initialize
3567 pic_offset_table_rtx properly. We generate the initialization insn
3568 for every global access and let CSE remove the redundant copies.
3570 The final instruction sequence for multiple global variable
3571 accesses will look like the following:
3573 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3575 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3576 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3577 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3578 ... */
3580 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3581 crtl->uses_pic_offset_table = 1;
3582 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3584 if (mode != GET_MODE (gp_rtx))
3585 gp_rtx = gen_lowpart (mode, gp_rtx);
3589 if (mode == ptr_mode)
3591 if (mode == DImode)
3592 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3593 else
3594 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3596 mem = XVECEXP (SET_SRC (insn), 0, 0);
3598 else
3600 gcc_assert (mode == Pmode);
3602 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3603 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3606 /* The operand is expected to be a MEM. Whenever the related insn
3607 pattern changes, the code above that calculates MEM should be
3608 updated. */
3609 gcc_assert (MEM_P (mem));
3610 MEM_READONLY_P (mem) = 1;
3611 MEM_NOTRAP_P (mem) = 1;
3612 emit_insn (insn);
3613 return;
3616 case SYMBOL_SMALL_GOT_4G:
3618 /* In ILP32, the mode of dest can be either SImode or DImode,
3619 while the got entry is always of SImode size. The mode of
3620 dest depends on how dest is used: if dest is assigned to a
3621 pointer (e.g. stored in memory), it has SImode; it may have
3622 DImode if dest is dereferenced to access the memory.
3623 This is why we have to handle three different ldr_got_small
3624 patterns here (two patterns for ILP32). */
3626 rtx insn;
3627 rtx mem;
3628 rtx tmp_reg = dest;
3629 machine_mode mode = GET_MODE (dest);
3631 if (can_create_pseudo_p ())
3632 tmp_reg = gen_reg_rtx (mode);
3634 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3635 if (mode == ptr_mode)
3637 if (mode == DImode)
3638 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
3639 else
3640 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3642 mem = XVECEXP (SET_SRC (insn), 0, 0);
3644 else
3646 gcc_assert (mode == Pmode);
3648 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3649 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3652 gcc_assert (MEM_P (mem));
3653 MEM_READONLY_P (mem) = 1;
3654 MEM_NOTRAP_P (mem) = 1;
3655 emit_insn (insn);
3656 return;
3659 case SYMBOL_SMALL_TLSGD:
3661 rtx_insn *insns;
3662 /* The return type of __tls_get_addr is the C pointer type
3663 so use ptr_mode. */
3664 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3665 rtx tmp_reg = dest;
3667 if (GET_MODE (dest) != ptr_mode)
3668 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3670 start_sequence ();
3671 if (ptr_mode == SImode)
3672 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3673 else
3674 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3675 insns = get_insns ();
3676 end_sequence ();
3678 RTL_CONST_CALL_P (insns) = 1;
3679 emit_libcall_block (insns, tmp_reg, result, imm);
3680 /* Convert back to the mode of dest, adding a zero_extend
3681 from SImode (ptr_mode) to DImode (Pmode). */
3682 if (dest != tmp_reg)
3683 convert_move (dest, tmp_reg, true);
3684 return;
3687 case SYMBOL_SMALL_TLSDESC:
3689 machine_mode mode = GET_MODE (dest);
3690 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3691 rtx tp;
3693 gcc_assert (mode == Pmode || mode == ptr_mode);
3695 /* In ILP32, the got entry is always of SImode size. Unlike
3696 small GOT, the dest is fixed at reg 0. */
3697 if (TARGET_ILP32)
3698 emit_insn (gen_tlsdesc_small_si (imm));
3699 else
3700 emit_insn (gen_tlsdesc_small_di (imm));
3701 tp = aarch64_load_tp (NULL);
3703 if (mode != Pmode)
3704 tp = gen_lowpart (mode, tp);
3706 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3707 if (REG_P (dest))
3708 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3709 return;
3712 case SYMBOL_SMALL_TLSIE:
3714 /* In ILP32, the mode of dest can be either SImode or DImode,
3715 while the got entry is always of SImode size. The mode of
3716 dest depends on how dest is used: if dest is assigned to a
3717 pointer (e.g. stored in memory), it has SImode; it may have
3718 DImode if dest is dereferenced to access the memory.
3719 This is why we have to handle three different tlsie_small
3720 patterns here (two patterns for ILP32). */
3721 machine_mode mode = GET_MODE (dest);
3722 rtx tmp_reg = gen_reg_rtx (mode);
3723 rtx tp = aarch64_load_tp (NULL);
3725 if (mode == ptr_mode)
3727 if (mode == DImode)
3728 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3729 else
3731 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3732 tp = gen_lowpart (mode, tp);
3735 else
3737 gcc_assert (mode == Pmode);
3738 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3741 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3742 if (REG_P (dest))
3743 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3744 return;
3747 case SYMBOL_TLSLE12:
3748 case SYMBOL_TLSLE24:
3749 case SYMBOL_TLSLE32:
3750 case SYMBOL_TLSLE48:
3752 machine_mode mode = GET_MODE (dest);
3753 rtx tp = aarch64_load_tp (NULL);
3755 if (mode != Pmode)
3756 tp = gen_lowpart (mode, tp);
3758 switch (type)
3760 case SYMBOL_TLSLE12:
3761 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3762 (dest, tp, imm));
3763 break;
3764 case SYMBOL_TLSLE24:
3765 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3766 (dest, tp, imm));
3767 break;
3768 case SYMBOL_TLSLE32:
3769 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3770 (dest, imm));
3771 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3772 (dest, dest, tp));
3773 break;
3774 case SYMBOL_TLSLE48:
3775 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3776 (dest, imm));
3777 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3778 (dest, dest, tp));
3779 break;
3780 default:
3781 gcc_unreachable ();
3784 if (REG_P (dest))
3785 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3786 return;
3789 case SYMBOL_TINY_GOT:
3791 rtx insn;
3792 machine_mode mode = GET_MODE (dest);
3794 if (mode == ptr_mode)
3795 insn = gen_ldr_got_tiny (mode, dest, imm);
3796 else
3798 gcc_assert (mode == Pmode);
3799 insn = gen_ldr_got_tiny_sidi (dest, imm);
3802 emit_insn (insn);
3803 return;
3806 case SYMBOL_TINY_TLSIE:
3808 machine_mode mode = GET_MODE (dest);
3809 rtx tp = aarch64_load_tp (NULL);
3811 if (mode == ptr_mode)
3813 if (mode == DImode)
3814 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3815 else
3817 tp = gen_lowpart (mode, tp);
3818 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3821 else
3823 gcc_assert (mode == Pmode);
3824 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3827 if (REG_P (dest))
3828 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3829 return;
3832 default:
3833 gcc_unreachable ();
3837 /* Emit a move from SRC to DEST. Assume that the move expanders can
3838 handle all moves if !can_create_pseudo_p (). The distinction is
3839 important because, unlike emit_move_insn, the move expanders know
3840 how to force Pmode objects into the constant pool even when the
3841 constant pool address is not itself legitimate. */
3842 static rtx
3843 aarch64_emit_move (rtx dest, rtx src)
3845 return (can_create_pseudo_p ()
3846 ? emit_move_insn (dest, src)
3847 : emit_move_insn_1 (dest, src));
3850 /* Apply UNOPTAB to OP and store the result in DEST. */
3852 static void
3853 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3855 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3856 if (dest != tmp)
3857 emit_move_insn (dest, tmp);
3860 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3862 static void
3863 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3865 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3866 OPTAB_DIRECT);
3867 if (dest != tmp)
3868 emit_move_insn (dest, tmp);
3871 /* Split a 128-bit move operation into two 64-bit move operations,
3872 taking care to handle partial overlap of register to register
3873 copies. Special cases are needed when moving between GP regs and
3874 FP regs. SRC can be a register, constant or memory; DST a register
3875 or memory. If either operand is memory it must not have any side
3876 effects. */
3877 void
3878 aarch64_split_128bit_move (rtx dst, rtx src)
3880 rtx dst_lo, dst_hi;
3881 rtx src_lo, src_hi;
3883 machine_mode mode = GET_MODE (dst);
3885 gcc_assert (mode == TImode || mode == TFmode);
3886 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3887 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3889 if (REG_P (dst) && REG_P (src))
3891 int src_regno = REGNO (src);
3892 int dst_regno = REGNO (dst);
3894 /* Handle FP <-> GP regs. */
3895 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3897 src_lo = gen_lowpart (word_mode, src);
3898 src_hi = gen_highpart (word_mode, src);
3900 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3901 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3902 return;
3904 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3906 dst_lo = gen_lowpart (word_mode, dst);
3907 dst_hi = gen_highpart (word_mode, dst);
3909 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3910 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3911 return;
3915 dst_lo = gen_lowpart (word_mode, dst);
3916 dst_hi = gen_highpart (word_mode, dst);
3917 src_lo = gen_lowpart (word_mode, src);
3918 src_hi = gen_highpart_mode (word_mode, mode, src);
3920 /* At most one pairing may overlap. */
3921 if (reg_overlap_mentioned_p (dst_lo, src_hi))
3923 aarch64_emit_move (dst_hi, src_hi);
3924 aarch64_emit_move (dst_lo, src_lo);
3926 else
3928 aarch64_emit_move (dst_lo, src_lo);
3929 aarch64_emit_move (dst_hi, src_hi);
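/* For example (illustrative only, not from the original sources): on a
   little-endian target, copying a TImode value held in {x0 (low), x1 (high)}
   into {x1 (low), x2 (high)} has dst_lo == src_hi == x1, so the high halves
   are copied first (x2 <- x1, then x1 <- x0); copying the low halves first
   would overwrite the source's high half before it is read.  */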
3933 /* Return true if we should split a move from 128-bit value SRC
3934 to 128-bit register DEST. */
3936 bool
3937 aarch64_split_128bit_move_p (rtx dst, rtx src)
3939 if (FP_REGNUM_P (REGNO (dst)))
3940 return REG_P (src) && !FP_REGNUM_P (REGNO (src));
3941 /* All moves to GPRs need to be split. */
3942 return true;
3945 /* Split a complex SIMD combine. */
3947 void
3948 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3950 machine_mode src_mode = GET_MODE (src1);
3951 machine_mode dst_mode = GET_MODE (dst);
3953 gcc_assert (VECTOR_MODE_P (dst_mode));
3954 gcc_assert (register_operand (dst, dst_mode)
3955 && register_operand (src1, src_mode)
3956 && register_operand (src2, src_mode));
3958 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
3959 return;
3962 /* Split a complex SIMD move. */
3964 void
3965 aarch64_split_simd_move (rtx dst, rtx src)
3967 machine_mode src_mode = GET_MODE (src);
3968 machine_mode dst_mode = GET_MODE (dst);
3970 gcc_assert (VECTOR_MODE_P (dst_mode));
3972 if (REG_P (dst) && REG_P (src))
3974 gcc_assert (VECTOR_MODE_P (src_mode));
3975 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3979 bool
3980 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3981 machine_mode ymode, rtx y)
3983 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3984 gcc_assert (r != NULL);
3985 return rtx_equal_p (x, r);
3988 /* Return TARGET if it is nonnull and a register of mode MODE.
3989 Otherwise, return a fresh register of mode MODE if we can,
3990 or TARGET reinterpreted as MODE if we can't. */
3992 static rtx
3993 aarch64_target_reg (rtx target, machine_mode mode)
3995 if (target && REG_P (target) && GET_MODE (target) == mode)
3996 return target;
3997 if (!can_create_pseudo_p ())
3999 gcc_assert (target);
4000 return gen_lowpart (mode, target);
4002 return gen_reg_rtx (mode);
4005 /* Return a register that contains the constant in BUILDER, given that
4006 the constant is a legitimate move operand. Use TARGET as the register
4007 if it is nonnull and convenient. */
4009 static rtx
4010 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
4012 rtx src = builder.build ();
4013 target = aarch64_target_reg (target, GET_MODE (src));
4014 emit_insn (gen_rtx_SET (target, src));
4015 return target;
4018 static rtx
4019 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
4021 if (can_create_pseudo_p ())
4022 return force_reg (mode, value);
4023 else
4025 gcc_assert (x);
4026 aarch64_emit_move (x, value);
4027 return x;
4031 /* Return true if predicate value X is a constant in which every element
4032 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
4033 value, i.e. as a predicate in which all bits are significant. */
4035 static bool
4036 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
4038 if (GET_CODE (x) != CONST_VECTOR)
4039 return false;
4041 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
4042 GET_MODE_NUNITS (GET_MODE (x)));
4043 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
4044 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
4045 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
4047 unsigned int nelts = const_vector_encoded_nelts (x);
4048 for (unsigned int i = 0; i < nelts; ++i)
4050 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
4051 if (!CONST_INT_P (elt))
4052 return false;
4054 builder.quick_push (elt);
4055 for (unsigned int j = 1; j < factor; ++j)
4056 builder.quick_push (const0_rtx);
4058 builder.finalize ();
4059 return true;
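/* For example (illustrative only): a VNx4BI constant { 1, 1, 0, 0, ... }
   has a scaling FACTOR of 4, so each source element is pushed followed by
   three zeros, giving the VNx16BI encoding { 1,0,0,0, 1,0,0,0, 0,0,0,0,
   0,0,0,0, ... } in which every bit is significant.  */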
4062 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
4063 widest predicate element size it can have (that is, the largest size
4064 for which each element would still be 0 or 1). */
4066 unsigned int
4067 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
4069 /* Start with the most optimistic assumption: that we only need
4070 one bit per pattern. This is what we will use if only the first
4071 bit in each pattern is ever set. */
4072 unsigned int mask = GET_MODE_SIZE (DImode);
4073 mask |= builder.npatterns ();
4075 /* Look for set bits. */
4076 unsigned int nelts = builder.encoded_nelts ();
4077 for (unsigned int i = 1; i < nelts; ++i)
4078 if (INTVAL (builder.elt (i)) != 0)
4080 if (i & 1)
4081 return 1;
4082 mask |= i;
4084 return mask & -mask;
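/* For example (illustrative only): if the builder has 4 patterns and the
   only set bits are at indices that are multiples of 4, the result is 4
   (the constant can be viewed as a .S predicate); a set bit at index 2
   would instead limit the result to 2, and one at any odd index to 1.  */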
4087 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
4088 return that predicate mode, otherwise return opt_machine_mode (). */
4090 opt_machine_mode
4091 aarch64_ptrue_all_mode (rtx x)
4093 gcc_assert (GET_MODE (x) == VNx16BImode);
4094 if (GET_CODE (x) != CONST_VECTOR
4095 || !CONST_VECTOR_DUPLICATE_P (x)
4096 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
4097 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
4098 return opt_machine_mode ();
4100 unsigned int nelts = const_vector_encoded_nelts (x);
4101 for (unsigned int i = 1; i < nelts; ++i)
4102 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
4103 return opt_machine_mode ();
4105 return aarch64_sve_pred_mode (nelts);
4108 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
4109 that the constant would have with predicate element size ELT_SIZE
4110 (ignoring the upper bits in each element) and return:
4112 * -1 if all bits are set
4113 * N if the predicate has N leading set bits followed by all clear bits
4114 * 0 if the predicate does not have any of these forms. */
4117 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
4118 unsigned int elt_size)
4120 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
4121 followed by set bits. */
4122 if (builder.nelts_per_pattern () == 3)
4123 return 0;
4125 /* Skip over leading set bits. */
4126 unsigned int nelts = builder.encoded_nelts ();
4127 unsigned int i = 0;
4128 for (; i < nelts; i += elt_size)
4129 if (INTVAL (builder.elt (i)) == 0)
4130 break;
4131 unsigned int vl = i / elt_size;
4133 /* Check for the all-true case. */
4134 if (i == nelts)
4135 return -1;
4137 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
4138 repeating pattern of set bits followed by clear bits. */
4139 if (builder.nelts_per_pattern () != 2)
4140 return 0;
4142 /* We have a "foreground" value and a duplicated "background" value.
4143 If the background might repeat and the last set bit belongs to it,
4144 we might have set bits followed by clear bits followed by set bits. */
4145 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
4146 return 0;
4148 /* Make sure that the rest are all clear. */
4149 for (; i < nelts; i += elt_size)
4150 if (INTVAL (builder.elt (i)) != 0)
4151 return 0;
4153 return vl;
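/* For example (illustrative only): with ELT_SIZE == 2, a constant whose
   first three .H elements are set and whose remaining elements are clear
   returns 3; an all-set constant returns -1; and a constant in which set
   bits reappear after the first clear element returns 0.  */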
4156 /* See if there is an svpattern that encodes an SVE predicate of mode
4157 PRED_MODE in which the first VL bits are set and the rest are clear.
4158 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
4159 A VL of -1 indicates an all-true vector. */
4161 aarch64_svpattern
4162 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
4164 if (vl < 0)
4165 return AARCH64_SV_ALL;
4167 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
4168 return AARCH64_NUM_SVPATTERNS;
4170 if (vl >= 1 && vl <= 8)
4171 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
4173 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
4174 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
4176 int max_vl;
4177 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
4179 if (vl == (max_vl / 3) * 3)
4180 return AARCH64_SV_MUL3;
4181 /* These would only trigger for non-power-of-2 lengths. */
4182 if (vl == (max_vl & -4))
4183 return AARCH64_SV_MUL4;
4184 if (vl == (1 << floor_log2 (max_vl)))
4185 return AARCH64_SV_POW2;
4186 if (vl == max_vl)
4187 return AARCH64_SV_ALL;
4189 return AARCH64_NUM_SVPATTERNS;
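/* For example (illustrative only): VL == -1 gives AARCH64_SV_ALL,
   VL == 5 gives AARCH64_SV_VL5 and VL == 64 gives AARCH64_SV_VL64.
   With a fixed-length mode of 32 elements, VL == 30 gives
   AARCH64_SV_MUL3 ((32 / 3) * 3); most other lengths have no encoding
   and give AARCH64_NUM_SVPATTERNS.  */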
4192 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
4193 bits has the lowest bit set and the upper bits clear. This is the
4194 VNx16BImode equivalent of a PTRUE for controlling elements of
4195 ELT_SIZE bytes. However, because the constant is VNx16BImode,
4196 all bits are significant, even the upper zeros. */
4199 aarch64_ptrue_all (unsigned int elt_size)
4201 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
4202 builder.quick_push (const1_rtx);
4203 for (unsigned int i = 1; i < elt_size; ++i)
4204 builder.quick_push (const0_rtx);
4205 return builder.build ();
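/* For example (illustrative only): aarch64_ptrue_all (4) builds the
   VNx16BImode constant { 1,0,0,0, 1,0,0,0, ... }, i.e. the
   all-bits-significant form of a .S PTRUE.  */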
4208 /* Return an all-true predicate register of mode MODE. */
4211 aarch64_ptrue_reg (machine_mode mode)
4213 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
4214 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
4215 return gen_lowpart (mode, reg);
4218 /* Return an all-false predicate register of mode MODE. */
4221 aarch64_pfalse_reg (machine_mode mode)
4223 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
4224 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
4225 return gen_lowpart (mode, reg);
4228 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
4229 for it. PRED2[0] is the predicate for the instruction whose result
4230 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
4231 for it. Return true if we can prove that the two predicates are
4232 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
4233 with PRED1[0] without changing behavior. */
4235 bool
4236 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
4238 machine_mode mode = GET_MODE (pred1[0]);
4239 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
4240 && mode == GET_MODE (pred2[0])
4241 && aarch64_sve_ptrue_flag (pred1[1], SImode)
4242 && aarch64_sve_ptrue_flag (pred2[1], SImode));
4244 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
4245 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
4246 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
4247 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
4248 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
4251 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
4252 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
4253 Use TARGET as the target register if nonnull and convenient. */
4255 static rtx
4256 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
4257 machine_mode data_mode, rtx op1, rtx op2)
4259 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
4260 expand_operand ops[5];
4261 create_output_operand (&ops[0], target, pred_mode);
4262 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
4263 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
4264 create_input_operand (&ops[3], op1, data_mode);
4265 create_input_operand (&ops[4], op2, data_mode);
4266 expand_insn (icode, 5, ops);
4267 return ops[0].value;
4270 /* Use a comparison to convert integer vector SRC into MODE, which is
4271 the corresponding SVE predicate mode. Use TARGET for the result
4272 if it's nonnull and convenient. */
4275 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
4277 machine_mode src_mode = GET_MODE (src);
4278 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
4279 src, CONST0_RTX (src_mode));
4282 /* Return the assembly token for svprfop value PRFOP. */
4284 static const char *
4285 svprfop_token (enum aarch64_svprfop prfop)
4287 switch (prfop)
4289 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
4290 AARCH64_FOR_SVPRFOP (CASE)
4291 #undef CASE
4292 case AARCH64_NUM_SVPRFOPS:
4293 break;
4295 gcc_unreachable ();
4298 /* Return the assembly string for an SVE prefetch operation with
4299 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
4300 and that SUFFIX is the format for the remaining operands. */
4302 char *
4303 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
4304 const char *suffix)
4306 static char buffer[128];
4307 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
4308 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
4309 mnemonic, svprfop_token (prfop), suffix);
4310 gcc_assert (written < sizeof (buffer));
4311 return buffer;
4314 /* Check whether we can calculate the number of elements in PATTERN
4315 at compile time, given that there are NELTS_PER_VQ elements per
4316 128-bit block. Return the value if so, otherwise return -1. */
4318 HOST_WIDE_INT
4319 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
4321 unsigned int vl, const_vg;
4322 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
4323 vl = 1 + (pattern - AARCH64_SV_VL1);
4324 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
4325 vl = 16 << (pattern - AARCH64_SV_VL16);
4326 else if (aarch64_sve_vg.is_constant (&const_vg))
4328 /* There are two vector granules per quadword. */
4329 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
4330 switch (pattern)
4332 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
4333 case AARCH64_SV_MUL4: return nelts & -4;
4334 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
4335 case AARCH64_SV_ALL: return nelts;
4336 default: gcc_unreachable ();
4339 else
4340 return -1;
4342 /* There are two vector granules per quadword. */
4343 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
4344 if (known_le (vl, nelts_all))
4345 return vl;
4347 /* Requesting more elements than are available results in a PFALSE. */
4348 if (known_gt (vl, nelts_all))
4349 return 0;
4351 return -1;
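/* For example (illustrative only): PATTERN == AARCH64_SV_VL16 with
   NELTS_PER_VQ == 4 (.S elements) folds to 16 when the vector length is
   known to be 512 bits (16 elements available), folds to 0 when it is
   known to be 256 bits (only 8 elements, so the pattern yields a PFALSE),
   and cannot be folded (-1) when the length is not known at compile
   time.  */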
4354 /* Return true if we can move VALUE into a register using a single
4355 CNT[BHWD] instruction. */
4357 static bool
4358 aarch64_sve_cnt_immediate_p (poly_int64 value)
4360 HOST_WIDE_INT factor = value.coeffs[0];
4361 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
4362 return (value.coeffs[1] == factor
4363 && IN_RANGE (factor, 2, 16 * 16)
4364 && (factor & 1) == 0
4365 && factor <= 16 * (factor & -factor));
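/* For example (illustrative only): the poly_int64 values (2, 2) and
   (16, 16) are accepted (they correspond to CNTD and CNTB), as is
   (96, 96) (CNTB with MUL #6).  (3, 3) is rejected because the factor is
   odd, (34, 34) because it would need MUL #17, and a compile-time
   constant such as (16, 0) because it does not scale with VG.  */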
4368 /* Likewise for rtx X. */
4370 bool
4371 aarch64_sve_cnt_immediate_p (rtx x)
4373 poly_int64 value;
4374 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
4377 /* Return the asm string for an instruction with a CNT-like vector size
4378 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4379 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4380 first part of the operands template (the part that comes before the
4381 vector size itself). PATTERN is the pattern to use. FACTOR is the
4382 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
4383 in each quadword. If it is zero, we can use any element size. */
4385 static char *
4386 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4387 aarch64_svpattern pattern,
4388 unsigned int factor,
4389 unsigned int nelts_per_vq)
4391 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
4393 if (nelts_per_vq == 0)
4394 /* There is some overlap in the ranges of the four CNT instructions.
4395 Here we always use the smallest possible element size, so that the
4396        multiplier is 1 wherever possible.  */
4397 nelts_per_vq = factor & -factor;
4398 int shift = std::min (exact_log2 (nelts_per_vq), 4);
4399 gcc_assert (IN_RANGE (shift, 1, 4));
4400 char suffix = "dwhb"[shift - 1];
4402 factor >>= shift;
4403 unsigned int written;
4404 if (pattern == AARCH64_SV_ALL && factor == 1)
4405 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
4406 prefix, suffix, operands);
4407 else if (factor == 1)
4408 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
4409 prefix, suffix, operands, svpattern_token (pattern));
4410 else
4411 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
4412 prefix, suffix, operands, svpattern_token (pattern),
4413 factor);
4414 gcc_assert (written < sizeof (buffer));
4415 return buffer;
4418 /* Return the asm string for an instruction with a CNT-like vector size
4419 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4420 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4421 first part of the operands template (the part that comes before the
4422 vector size itself). X is the value of the vector size operand,
4423 as a polynomial integer rtx; we need to convert this into an "all"
4424 pattern with a multiplier. */
4426 char *
4427 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
4428 rtx x)
4430 poly_int64 value = rtx_to_poly_int64 (x);
4431 gcc_assert (aarch64_sve_cnt_immediate_p (value));
4432 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
4433 value.coeffs[1], 0);
4436 /* Return the asm string for an instruction with a CNT-like vector size
4437 operand (a vector pattern followed by a multiplier in the range [1, 16]).
4438 PREFIX is the mnemonic without the size suffix and OPERANDS is the
4439 first part of the operands template (the part that comes before the
4440 vector size itself). CNT_PAT[0..2] are the operands of the
4441 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
4443 char *
4444 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
4445 const char *operands, rtx *cnt_pat)
4447 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
4448 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
4449 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
4450 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
4451 factor, nelts_per_vq);
4454 /* Return true if we can add X using a single SVE INC or DEC instruction. */
4456 bool
4457 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
4459 poly_int64 value;
4460 return (poly_int_rtx_p (x, &value)
4461 && (aarch64_sve_cnt_immediate_p (value)
4462 || aarch64_sve_cnt_immediate_p (-value)));
4465 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
4466 operand 0. */
4468 char *
4469 aarch64_output_sve_scalar_inc_dec (rtx offset)
4471 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4472 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
4473 if (offset_value.coeffs[1] > 0)
4474 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
4475 offset_value.coeffs[1], 0);
4476 else
4477 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
4478 -offset_value.coeffs[1], 0);
4481 /* Return true if we can add VALUE to a register using a single ADDVL
4482 or ADDPL instruction. */
4484 static bool
4485 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
4487 HOST_WIDE_INT factor = value.coeffs[0];
4488 if (factor == 0 || value.coeffs[1] != factor)
4489 return false;
4490 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
4491 and a value of 16 is one vector width. */
4492 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
4493 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
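/* For example (illustrative only): factors of 16 and 48 are accepted as
   ADDVL #1 and ADDVL #3, and factors of 2 and 6 as ADDPL #1 and ADDPL #3.
   Odd factors are rejected, as is e.g. 100, which is neither a whole
   number of vectors nor within the ADDPL range.  */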
4496 /* Likewise for rtx X. */
4498 bool
4499 aarch64_sve_addvl_addpl_immediate_p (rtx x)
4501 poly_int64 value;
4502 return (poly_int_rtx_p (x, &value)
4503 && aarch64_sve_addvl_addpl_immediate_p (value));
4506 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
4507 to operand 1 and storing the result in operand 0. */
4509 char *
4510 aarch64_output_sve_addvl_addpl (rtx offset)
4512 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
4513 poly_int64 offset_value = rtx_to_poly_int64 (offset);
4514 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
4516 int factor = offset_value.coeffs[1];
4517 if ((factor & 15) == 0)
4518 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
4519 else
4520 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
4521 return buffer;
4524 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4525 instruction. If it is, store the number of elements in each vector
4526 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
4527 factor in *FACTOR_OUT (if nonnull). */
4529 bool
4530 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
4531 unsigned int *nelts_per_vq_out)
4533 rtx elt;
4534 poly_int64 value;
4536 if (!const_vec_duplicate_p (x, &elt)
4537 || !poly_int_rtx_p (elt, &value))
4538 return false;
4540 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
4541 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
4542 /* There's no vector INCB. */
4543 return false;
4545 HOST_WIDE_INT factor = value.coeffs[0];
4546 if (value.coeffs[1] != factor)
4547 return false;
4549 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
4550 if ((factor % nelts_per_vq) != 0
4551 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4552 return false;
4554 if (factor_out)
4555 *factor_out = factor;
4556 if (nelts_per_vq_out)
4557 *nelts_per_vq_out = nelts_per_vq;
4558 return true;
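/* For example (illustrative only): a VNx4SI duplicate of poly_int64
   (4, 4) is accepted (FACTOR == 4, NELTS_PER_VQ == 4, i.e. an INCW),
   as is a VNx8HI duplicate of (40, 40) (an INCH with MUL #5).
   VNx16QI duplicates are always rejected since there is no vector
   INCB.  */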
4561 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4562 instruction. */
4564 bool
4565 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4567 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4570 /* Return the asm template for an SVE vector INC or DEC instruction.
4571 OPERANDS gives the operands before the vector count and X is the
4572 value of the vector count operand itself. */
4574 char *
4575 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4577 int factor;
4578 unsigned int nelts_per_vq;
4579 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4580 gcc_unreachable ();
4581 if (factor < 0)
4582 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4583 -factor, nelts_per_vq);
4584 else
4585 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4586 factor, nelts_per_vq);
4589 static int
4590 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4591 scalar_int_mode mode)
4593 int i;
4594 unsigned HOST_WIDE_INT val, val2, mask;
4595 int one_match, zero_match;
4596 int num_insns;
4598 val = INTVAL (imm);
4600 if (aarch64_move_imm (val, mode))
4602 if (generate)
4603 emit_insn (gen_rtx_SET (dest, imm));
4604 return 1;
4607 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4608 (with XXXX non-zero). In that case check to see if the move can be done in
4609 a smaller mode. */
4610 val2 = val & 0xffffffff;
4611 if (mode == DImode
4612 && aarch64_move_imm (val2, SImode)
4613 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4615 if (generate)
4616 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4618 /* Check if we have to emit a second instruction by checking to see
4619 if any of the upper 32 bits of the original DI mode value is set. */
4620 if (val == val2)
4621 return 1;
4623 i = (val >> 48) ? 48 : 32;
4625 if (generate)
4626 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4627 GEN_INT ((val >> i) & 0xffff)));
4629 return 2;
4632 if ((val >> 32) == 0 || mode == SImode)
4634 if (generate)
4636 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4637 if (mode == SImode)
4638 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4639 GEN_INT ((val >> 16) & 0xffff)));
4640 else
4641 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4642 GEN_INT ((val >> 16) & 0xffff)));
4644 return 2;
4647 /* Remaining cases are all for DImode. */
4649 mask = 0xffff;
4650 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4651 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4652 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4653 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4655 if (zero_match != 2 && one_match != 2)
4657 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4658 For a 64-bit bitmask try whether changing 16 bits to all ones or
4659 zeroes creates a valid bitmask. To check any repeated bitmask,
4660 try using 16 bits from the other 32-bit half of val. */
4662 for (i = 0; i < 64; i += 16, mask <<= 16)
4664 val2 = val & ~mask;
4665 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4666 break;
4667 val2 = val | mask;
4668 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4669 break;
4670 val2 = val2 & ~mask;
4671 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4672 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4673 break;
4675 if (i != 64)
4677 if (generate)
4679 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4680 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4681 GEN_INT ((val >> i) & 0xffff)));
4683 return 2;
4687 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4688 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4689 otherwise skip zero bits. */
4691 num_insns = 1;
4692 mask = 0xffff;
4693 val2 = one_match > zero_match ? ~val : val;
4694 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4696 if (generate)
4697 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4698 ? (val | ~(mask << i))
4699 : (val & (mask << i)))));
4700 for (i += 16; i < 64; i += 16)
4702 if ((val2 & (mask << i)) == 0)
4703 continue;
4704 if (generate)
4705 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4706 GEN_INT ((val >> i) & 0xffff)));
4707 num_insns ++;
4710 return num_insns;
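/* For example (illustrative only): 0x1234000056780000 needs two
   instructions: the low 32 bits can be set with a single MOV of
   0x56780000 and the remaining nonzero 16-bit chunk is inserted with
   MOVK #0x1234, LSL #48.  A fully general constant such as
   0x1234567812345678 needs the worst case of a MOV plus three MOVKs,
   i.e. 4.  */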
4713 /* Return whether imm is a 128-bit immediate which is simple enough to
4714 expand inline. */
4715 bool
4716 aarch64_mov128_immediate (rtx imm)
4718 if (CONST_INT_P (imm))
4719 return true;
4721 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4723 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4724 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4726 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4727 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4731 /* Return the number of temporary registers that aarch64_add_offset_1
4732 would need to add OFFSET to a register. */
4734 static unsigned int
4735 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4737 return absu_hwi (offset) < 0x1000000 ? 0 : 1;
4740 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4741 a non-polynomial OFFSET. MODE is the mode of the addition.
4742 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4743 be set and CFA adjustments added to the generated instructions.
4745 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4746 temporary if register allocation is already complete. This temporary
4747 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4748 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4749 the immediate again.
4751 Since this function may be used to adjust the stack pointer, we must
4752 ensure that it cannot cause transient stack deallocation (for example
4753 by first incrementing SP and then decrementing when adjusting by a
4754 large immediate). */
4756 static void
4757 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4758 rtx src, HOST_WIDE_INT offset, rtx temp1,
4759 bool frame_related_p, bool emit_move_imm)
4761 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4762 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4764 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4765 rtx_insn *insn;
4767 if (!moffset)
4769 if (!rtx_equal_p (dest, src))
4771 insn = emit_insn (gen_rtx_SET (dest, src));
4772 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4774 return;
4777 /* Single instruction adjustment. */
4778 if (aarch64_uimm12_shift (moffset))
4780 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4781 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4782 return;
4785 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4786 and either:
4788 a) the offset cannot be loaded by a 16-bit move or
4789 b) there is no spare register into which we can move it. */
4790 if (moffset < 0x1000000
4791 && ((!temp1 && !can_create_pseudo_p ())
4792 || !aarch64_move_imm (moffset, mode)))
4794 HOST_WIDE_INT low_off = moffset & 0xfff;
4796 low_off = offset < 0 ? -low_off : low_off;
4797 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4798 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4799 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4800 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4801 return;
4804 /* Emit a move immediate if required and an addition/subtraction. */
4805 if (emit_move_imm)
4807 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4808 temp1 = aarch64_force_temporary (mode, temp1,
4809 gen_int_mode (moffset, mode));
4811 insn = emit_insn (offset < 0
4812 ? gen_sub3_insn (dest, src, temp1)
4813 : gen_add3_insn (dest, src, temp1));
4814 if (frame_related_p)
4816 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4817 rtx adj = plus_constant (mode, src, offset);
4818 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
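/* For example (illustrative only): an adjustment of 0x123456 (which is
   not a valid 16-bit move immediate) is split into two additions,
   ADD #0x456 followed by ADD #0x123000, both of which are valid 12-bit,
   optionally shifted, immediates.  */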
4822 /* Return the number of temporary registers that aarch64_add_offset
4823 would need to move OFFSET into a register or add OFFSET to a register;
4824 ADD_P is true if we want the latter rather than the former. */
4826 static unsigned int
4827 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4829 /* This follows the same structure as aarch64_add_offset. */
4830 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4831 return 0;
4833 unsigned int count = 0;
4834 HOST_WIDE_INT factor = offset.coeffs[1];
4835 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4836 poly_int64 poly_offset (factor, factor);
4837 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4838 /* Need one register for the ADDVL/ADDPL result. */
4839 count += 1;
4840 else if (factor != 0)
4842 factor = abs (factor);
4843 if (factor > 16 * (factor & -factor))
4844 /* Need one register for the CNT result and one for the multiplication
4845 factor. If necessary, the second temporary can be reused for the
4846 constant part of the offset. */
4847 return 2;
4848 /* Need one register for the CNT result (which might then
4849 be shifted). */
4850 count += 1;
4852 return count + aarch64_add_offset_1_temporaries (constant);
4855 /* If X can be represented as a poly_int64, return the number
4856 of temporaries that are required to add it to a register.
4857 Return -1 otherwise. */
4860 aarch64_add_offset_temporaries (rtx x)
4862 poly_int64 offset;
4863 if (!poly_int_rtx_p (x, &offset))
4864 return -1;
4865 return aarch64_offset_temporaries (true, offset);
4868 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4869 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4870 be set and CFA adjustments added to the generated instructions.
4872 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4873 temporary if register allocation is already complete. This temporary
4874 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4875 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4876 false to avoid emitting the immediate again.
4878 TEMP2, if nonnull, is a second temporary register that doesn't
4879    overlap either DEST or SRC.
4881 Since this function may be used to adjust the stack pointer, we must
4882 ensure that it cannot cause transient stack deallocation (for example
4883 by first incrementing SP and then decrementing when adjusting by a
4884 large immediate). */
4886 static void
4887 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4888 poly_int64 offset, rtx temp1, rtx temp2,
4889 bool frame_related_p, bool emit_move_imm = true)
4891 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4892 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4893 gcc_assert (temp1 == NULL_RTX
4894 || !frame_related_p
4895 || !reg_overlap_mentioned_p (temp1, dest));
4896 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4898 /* Try using ADDVL or ADDPL to add the whole value. */
4899 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4901 rtx offset_rtx = gen_int_mode (offset, mode);
4902 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4903 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4904 return;
4907 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4908 SVE vector register, over and above the minimum size of 128 bits.
4909 This is equivalent to half the value returned by CNTD with a
4910 vector shape of ALL. */
4911 HOST_WIDE_INT factor = offset.coeffs[1];
4912 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4914 /* Try using ADDVL or ADDPL to add the VG-based part. */
4915 poly_int64 poly_offset (factor, factor);
4916 if (src != const0_rtx
4917 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4919 rtx offset_rtx = gen_int_mode (poly_offset, mode);
4920 if (frame_related_p)
4922 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4923 RTX_FRAME_RELATED_P (insn) = true;
4924 src = dest;
4926 else
4928 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4929 src = aarch64_force_temporary (mode, temp1, addr);
4930 temp1 = temp2;
4931 temp2 = NULL_RTX;
4934 /* Otherwise use a CNT-based sequence. */
4935 else if (factor != 0)
4937 /* Use a subtraction if we have a negative factor. */
4938 rtx_code code = PLUS;
4939 if (factor < 0)
4941 factor = -factor;
4942 code = MINUS;
4945 /* Calculate CNTD * FACTOR / 2. First try to fold the division
4946 into the multiplication. */
4947 rtx val;
4948 int shift = 0;
4949 if (factor & 1)
4950 /* Use a right shift by 1. */
4951 shift = -1;
4952 else
4953 factor /= 2;
4954 HOST_WIDE_INT low_bit = factor & -factor;
4955 if (factor <= 16 * low_bit)
4957 if (factor > 16 * 8)
4959 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4960 the value with the minimum multiplier and shift it into
4961 position. */
4962 int extra_shift = exact_log2 (low_bit);
4963 shift += extra_shift;
4964 factor >>= extra_shift;
4966 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4968 else
4970 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4971 directly, since that should increase the chances of being
4972 able to use a shift and add sequence. If LOW_BIT itself
4973 is out of range, just use CNTD. */
4974 if (low_bit <= 16 * 8)
4975 factor /= low_bit;
4976 else
4977 low_bit = 1;
4979 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
4980 val = aarch64_force_temporary (mode, temp1, val);
4982 if (can_create_pseudo_p ())
4984 rtx coeff1 = gen_int_mode (factor, mode);
4985 val = expand_mult (mode, val, coeff1, NULL_RTX, true, true);
4987 else
4989 /* Go back to using a negative multiplication factor if we have
4990 no register from which to subtract. */
4991 if (code == MINUS && src == const0_rtx)
4993 factor = -factor;
4994 code = PLUS;
4996 rtx coeff1 = gen_int_mode (factor, mode);
4997 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4998 val = gen_rtx_MULT (mode, val, coeff1);
5002 if (shift > 0)
5004 /* Multiply by 1 << SHIFT. */
5005 val = aarch64_force_temporary (mode, temp1, val);
5006 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
5008 else if (shift == -1)
5010 /* Divide by 2. */
5011 val = aarch64_force_temporary (mode, temp1, val);
5012 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
5015 /* Calculate SRC +/- CNTD * FACTOR / 2. */
5016 if (src != const0_rtx)
5018 val = aarch64_force_temporary (mode, temp1, val);
5019 val = gen_rtx_fmt_ee (code, mode, src, val);
5021 else if (code == MINUS)
5023 val = aarch64_force_temporary (mode, temp1, val);
5024 val = gen_rtx_NEG (mode, val);
5027 if (constant == 0 || frame_related_p)
5029 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
5030 if (frame_related_p)
5032 RTX_FRAME_RELATED_P (insn) = true;
5033 add_reg_note (insn, REG_CFA_ADJUST_CFA,
5034 gen_rtx_SET (dest, plus_constant (Pmode, src,
5035 poly_offset)));
5037 src = dest;
5038 if (constant == 0)
5039 return;
5041 else
5043 src = aarch64_force_temporary (mode, temp1, val);
5044 temp1 = temp2;
5045 temp2 = NULL_RTX;
5048 emit_move_imm = true;
5051 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
5052 frame_related_p, emit_move_imm);
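/* For example (illustrative only): an offset of poly_int64 (16, 16)
   (one full SVE vector in bytes) is added with a single ADDVL #1, while
   poly_int64 (0, 16) is handled as ADDVL #1 for the VG-based part
   followed by a call to aarch64_add_offset_1 that subtracts the residual
   constant 16.  */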
5055 /* Like aarch64_add_offset, but the offset is given as an rtx rather
5056 than a poly_int64. */
5058 void
5059 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
5060 rtx offset_rtx, rtx temp1, rtx temp2)
5062 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
5063 temp1, temp2, false);
5066 /* Add DELTA to the stack pointer, marking the instructions frame-related.
5067 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
5068 if TEMP1 already contains abs (DELTA). */
5070 static inline void
5071 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
5073 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
5074 temp1, temp2, true, emit_move_imm);
5077 /* Subtract DELTA from the stack pointer, marking the instructions
5078 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
5079 if nonnull. */
5081 static inline void
5082 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
5083 bool emit_move_imm = true)
5085 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
5086 temp1, temp2, frame_related_p, emit_move_imm);
5089 /* Set DEST to (vec_series BASE STEP). */
5091 static void
5092 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
5094 machine_mode mode = GET_MODE (dest);
5095 scalar_mode inner = GET_MODE_INNER (mode);
5097 /* Each operand can be a register or an immediate in the range [-16, 15]. */
5098 if (!aarch64_sve_index_immediate_p (base))
5099 base = force_reg (inner, base);
5100 if (!aarch64_sve_index_immediate_p (step))
5101 step = force_reg (inner, step);
5103 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
5106 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
5107 register of mode MODE. Use TARGET for the result if it's nonnull
5108 and convenient.
5110 The two vector modes must have the same element mode. The behavior
5111 is to duplicate architectural lane N of SRC into architectural lanes
5112 N + I * STEP of the result. On big-endian targets, architectural
5113 lane 0 of an Advanced SIMD vector is the last element of the vector
5114 in memory layout, so for big-endian targets this operation has the
5115 effect of reversing SRC before duplicating it. Callers need to
5116 account for this. */
5119 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
5121 machine_mode src_mode = GET_MODE (src);
5122 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
5123 insn_code icode = (BYTES_BIG_ENDIAN
5124 ? code_for_aarch64_vec_duplicate_vq_be (mode)
5125 : code_for_aarch64_vec_duplicate_vq_le (mode));
5127 unsigned int i = 0;
5128 expand_operand ops[3];
5129 create_output_operand (&ops[i++], target, mode);
5130 create_output_operand (&ops[i++], src, src_mode);
5131 if (BYTES_BIG_ENDIAN)
5133 /* Create a PARALLEL describing the reversal of SRC. */
5134 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
5135 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
5136 nelts_per_vq - 1, -1);
5137 create_fixed_operand (&ops[i++], sel);
5139 expand_insn (icode, i, ops);
5140 return ops[0].value;
5143 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
5144 the memory image into DEST. Return true on success. */
5146 static bool
5147 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
5149 src = force_const_mem (GET_MODE (src), src);
5150 if (!src)
5151 return false;
5153 /* Make sure that the address is legitimate. */
5154 if (!aarch64_sve_ld1rq_operand_p (src))
5156 rtx addr = force_reg (Pmode, XEXP (src, 0));
5157 src = replace_equiv_address (src, addr);
5160 machine_mode mode = GET_MODE (dest);
5161 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5162 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5163 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
5164 return true;
5167 /* SRC is an SVE CONST_VECTOR that contains N "foreground" values followed
5168 by N "background" values. Try to move it into TARGET using:
5170 PTRUE PRED.<T>, VL<N>
5171 MOV TRUE.<T>, #<foreground>
5172 MOV FALSE.<T>, #<background>
5173 SEL TARGET.<T>, PRED.<T>, TRUE.<T>, FALSE.<T>
5175 The PTRUE is always a single instruction but the MOVs might need a
5176 longer sequence. If the background value is zero (as it often is),
5177 the sequence can sometimes collapse to a PTRUE followed by a
5178 zero-predicated move.
5180 Return the target on success, otherwise return null. */
5182 static rtx
5183 aarch64_expand_sve_const_vector_sel (rtx target, rtx src)
5185 gcc_assert (CONST_VECTOR_NELTS_PER_PATTERN (src) == 2);
5187 /* Make sure that the PTRUE is valid. */
5188 machine_mode mode = GET_MODE (src);
5189 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
5190 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5191 if (aarch64_svpattern_for_vl (pred_mode, npatterns)
5192 == AARCH64_NUM_SVPATTERNS)
5193 return NULL_RTX;
5195 rtx_vector_builder pred_builder (pred_mode, npatterns, 2);
5196 rtx_vector_builder true_builder (mode, npatterns, 1);
5197 rtx_vector_builder false_builder (mode, npatterns, 1);
5198 for (unsigned int i = 0; i < npatterns; ++i)
5200 true_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5201 pred_builder.quick_push (CONST1_RTX (BImode));
5203 for (unsigned int i = 0; i < npatterns; ++i)
5205 false_builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i + npatterns));
5206 pred_builder.quick_push (CONST0_RTX (BImode));
5208 expand_operand ops[4];
5209 create_output_operand (&ops[0], target, mode);
5210 create_input_operand (&ops[1], true_builder.build (), mode);
5211 create_input_operand (&ops[2], false_builder.build (), mode);
5212 create_input_operand (&ops[3], pred_builder.build (), pred_mode);
5213 expand_insn (code_for_vcond_mask (mode, mode), 4, ops);
5214 return target;
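/* For example (illustrative only, assuming a scalable vector length): a
   VNx4SI constant whose first four elements are { 1, 2, 3, 4 } and whose
   remaining elements are zero is encoded with NPATTERNS == 4 and
   NELTS_PER_PATTERN == 2, and can be emitted as a PTRUE .S, VL4, a move
   of the repeating { 1, 2, 3, 4 } foreground, a zero background and a
   SEL (or a zero-predicated move, as noted above).  */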
5217 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
5218 SVE data mode and isn't a legitimate constant. Use TARGET for the
5219 result if convenient.
5221 The returned register can have whatever mode seems most natural
5222 given the contents of SRC. */
5224 static rtx
5225 aarch64_expand_sve_const_vector (rtx target, rtx src)
5227 machine_mode mode = GET_MODE (src);
5228 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
5229 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
5230 scalar_mode elt_mode = GET_MODE_INNER (mode);
5231 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
5232 unsigned int container_bits = aarch64_sve_container_bits (mode);
5233 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
5235 if (nelts_per_pattern == 1
5236 && encoded_bits <= 128
5237 && container_bits != elt_bits)
5239 /* We have a partial vector mode and a constant whose full-vector
5240 equivalent would occupy a repeating 128-bit sequence. Build that
5241 full-vector equivalent instead, so that we have the option of
5242 using LD1RQ and Advanced SIMD operations. */
5243 unsigned int repeat = container_bits / elt_bits;
5244 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
5245 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
5246 for (unsigned int i = 0; i < npatterns; ++i)
5247 for (unsigned int j = 0; j < repeat; ++j)
5248 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
5249 target = aarch64_target_reg (target, full_mode);
5250 return aarch64_expand_sve_const_vector (target, builder.build ());
5253 if (nelts_per_pattern == 1 && encoded_bits == 128)
5255 /* The constant is a duplicated quadword but can't be narrowed
5256 beyond a quadword. Get the memory image of the first quadword
5257 as a 128-bit vector and try using LD1RQ to load it from memory.
5259 The effect for both endiannesses is to load memory lane N into
5260 architectural lanes N + I * STEP of the result. On big-endian
5261 targets, the layout of the 128-bit vector in an Advanced SIMD
5262 register would be different from its layout in an SVE register,
5263 but this 128-bit vector is a memory value only. */
5264 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5265 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
5266 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
5267 return target;
5270 if (nelts_per_pattern == 1 && encoded_bits < 128)
5272 /* The vector is a repeating sequence of 64 bits or fewer.
5273 See if we can load them using an Advanced SIMD move and then
5274 duplicate it to fill a vector. This is better than using a GPR
5275 move because it keeps everything in the same register file. */
5276 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
5277 rtx_vector_builder builder (vq_mode, npatterns, 1);
5278 for (unsigned int i = 0; i < npatterns; ++i)
5280 /* We want memory lane N to go into architectural lane N,
5281 so reverse for big-endian targets. The DUP .Q pattern
5282 has a compensating reverse built-in. */
5283 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
5284 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
5286 rtx vq_src = builder.build ();
5287 if (aarch64_simd_valid_immediate (vq_src, NULL))
5289 vq_src = force_reg (vq_mode, vq_src);
5290 return aarch64_expand_sve_dupq (target, mode, vq_src);
5293 /* Get an integer representation of the repeating part of Advanced
5294 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
5295 which for big-endian targets is lane-swapped wrt a normal
5296 Advanced SIMD vector. This means that for both endiannesses,
5297 memory lane N of SVE vector SRC corresponds to architectural
5298 lane N of a register holding VQ_SRC. This in turn means that
5299 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
5300 as a single 128-bit value) and thus that memory lane 0 of SRC is
5301 in the lsb of the integer. Duplicating the integer therefore
5302 ensures that memory lane N of SRC goes into architectural lane
5303 N + I * INDEX of the SVE register. */
5304 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
5305 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
5306 if (elt_value)
5308 /* Pretend that we had a vector of INT_MODE to start with. */
5309 elt_mode = int_mode;
5310 mode = aarch64_full_sve_mode (int_mode).require ();
5312 /* If the integer can be moved into a general register by a
5313 single instruction, do that and duplicate the result. */
5314 if (CONST_INT_P (elt_value)
5315 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
5317 elt_value = force_reg (elt_mode, elt_value);
5318 return expand_vector_broadcast (mode, elt_value);
5321 else if (npatterns == 1)
5322 /* We're duplicating a single value, but can't do better than
5323 force it to memory and load from there. This handles things
5324 like symbolic constants. */
5325 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
5327 if (elt_value)
5329 /* Load the element from memory if we can, otherwise move it into
5330 a register and use a DUP. */
5331 rtx op = force_const_mem (elt_mode, elt_value);
5332 if (!op)
5333 op = force_reg (elt_mode, elt_value);
5334 return expand_vector_broadcast (mode, op);
5338 /* Try using INDEX. */
5339 rtx base, step;
5340 if (const_vec_series_p (src, &base, &step))
5342 aarch64_expand_vec_series (target, base, step);
5343 return target;
5346 /* From here on, it's better to force the whole constant to memory
5347 if we can. */
5348 if (GET_MODE_NUNITS (mode).is_constant ())
5349 return NULL_RTX;
5351 if (nelts_per_pattern == 2)
5352 if (rtx res = aarch64_expand_sve_const_vector_sel (target, src))
5353 return res;
5355 /* Expand each pattern individually. */
5356 gcc_assert (npatterns > 1);
5357 rtx_vector_builder builder;
5358 auto_vec<rtx, 16> vectors (npatterns);
5359 for (unsigned int i = 0; i < npatterns; ++i)
5361 builder.new_vector (mode, 1, nelts_per_pattern);
5362 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
5363 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
5364 vectors.quick_push (force_reg (mode, builder.build ()));
5367 /* Use permutes to interleave the separate vectors. */
5368 while (npatterns > 1)
5370 npatterns /= 2;
5371 for (unsigned int i = 0; i < npatterns; ++i)
5373 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
5374 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
5375 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
5376 vectors[i] = tmp;
5379 gcc_assert (vectors[0] == target);
5380 return target;
5383 /* Use WHILE to set a predicate register of mode MODE in which the first
5384 VL bits are set and the rest are clear. Use TARGET for the register
5385 if it's nonnull and convenient. */
5387 static rtx
5388 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
5389 unsigned int vl)
5391 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
5392 target = aarch64_target_reg (target, mode);
5393 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
5394 target, const0_rtx, limit));
5395 return target;
5398 static rtx
5399 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
5401 /* BUILDER is a constant predicate in which the index of every set bit
5402 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5403 by inverting every element at a multiple of ELT_SIZE and EORing the
5404 result with an ELT_SIZE PTRUE.
5406 Return a register that contains the constant on success, otherwise
5407 return null. Use TARGET as the register if it is nonnull and
5408 convenient. */
5410 static rtx
5411 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
5412 unsigned int elt_size)
5414 /* Invert every element at a multiple of ELT_SIZE, keeping the
5415 other bits zero. */
5416 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
5417 builder.nelts_per_pattern ());
5418 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5419 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
5420 inv_builder.quick_push (const1_rtx);
5421 else
5422 inv_builder.quick_push (const0_rtx);
5423 inv_builder.finalize ();
5425 /* See if we can load the constant cheaply. */
5426 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
5427 if (!inv)
5428 return NULL_RTX;
5430 /* EOR the result with an ELT_SIZE PTRUE. */
5431 rtx mask = aarch64_ptrue_all (elt_size);
5432 mask = force_reg (VNx16BImode, mask);
5433 inv = gen_lowpart (VNx16BImode, inv);
5434 target = aarch64_target_reg (target, VNx16BImode);
5435 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
5436 return target;
5439 /* BUILDER is a constant predicate in which the index of every set bit
5440 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
5441 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
5442 register on success, otherwise return null. Use TARGET as the register
5443 if nonnull and convenient. */
5445 static rtx
5446 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
5447 unsigned int elt_size,
5448 unsigned int permute_size)
5450 /* We're going to split the constant into two new constants A and B,
5451 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
5452 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
5454 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
5455 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
5457 where _ indicates elements that will be discarded by the permute.
5459 First calculate the ELT_SIZEs for A and B. */
5460 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
5461 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
5462 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
5463 if (INTVAL (builder.elt (i)) != 0)
5465 if (i & permute_size)
5466 b_elt_size |= i - permute_size;
5467 else
5468 a_elt_size |= i;
5470 a_elt_size &= -a_elt_size;
5471 b_elt_size &= -b_elt_size;
5473 /* Now construct the vectors themselves. */
5474 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
5475 builder.nelts_per_pattern ());
5476 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
5477 builder.nelts_per_pattern ());
5478 unsigned int nelts = builder.encoded_nelts ();
5479 for (unsigned int i = 0; i < nelts; ++i)
5480 if (i & (elt_size - 1))
5482 a_builder.quick_push (const0_rtx);
5483 b_builder.quick_push (const0_rtx);
5485 else if ((i & permute_size) == 0)
5487 /* The A and B elements are significant. */
5488 a_builder.quick_push (builder.elt (i));
5489 b_builder.quick_push (builder.elt (i + permute_size));
5491 else
5493 /* The A and B elements are going to be discarded, so pick whatever
5494 is likely to give a nice constant. We are targeting element
5495 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
5496 with the aim of each being a sequence of ones followed by
5497 a sequence of zeros. So:
5499 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
5500 duplicate the last X_ELT_SIZE element, to extend the
5501 current sequence of ones or zeros.
5503 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
5504 zero, so that the constant really does have X_ELT_SIZE and
5505 not a smaller size. */
5506 if (a_elt_size > permute_size)
5507 a_builder.quick_push (const0_rtx);
5508 else
5509 a_builder.quick_push (a_builder.elt (i - a_elt_size));
5510 if (b_elt_size > permute_size)
5511 b_builder.quick_push (const0_rtx);
5512 else
5513 b_builder.quick_push (b_builder.elt (i - b_elt_size));
5515 a_builder.finalize ();
5516 b_builder.finalize ();
5518 /* Try loading A into a register. */
5519 rtx_insn *last = get_last_insn ();
5520 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
5521 if (!a)
5522 return NULL_RTX;
5524 /* Try loading B into a register. */
5525 rtx b = a;
5526 if (a_builder != b_builder)
5528 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
5529 if (!b)
5531 delete_insns_since (last);
5532 return NULL_RTX;
5536 /* Emit the TRN1 itself. We emit a TRN that operates on VNx16BI
5537 operands but permutes them as though they had mode MODE. */
5538 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
5539 target = aarch64_target_reg (target, GET_MODE (a));
5540 rtx type_reg = CONST0_RTX (mode);
5541 emit_insn (gen_aarch64_sve_trn1_conv (mode, target, a, b, type_reg));
5542 return target;
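/* To make the recombination above concrete: with ELT_SIZE == 1 and
   PERMUTE_SIZE == 4, A and B from the example in the function comment
   are permuted as .S values, so TRN1 interleaves their even-numbered
   4-byte blocks:

     result = { A[0..3], B[0..3], A[8..11], B[8..11], ... }

   The A blocks supply bytes 0-3 of every 8-byte group and the B blocks
   bytes 4-7, which reassembles the original constant while the "_"
   elements are discarded.  */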
5545 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
5546 constant in BUILDER into an SVE predicate register. Return the register
5547 on success, otherwise return null. Use TARGET for the register if
5548 nonnull and convenient.
5550 ALLOW_RECURSE_P is true if we can use methods that would call this
5551 function recursively. */
5553 static rtx
5554 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
5555 bool allow_recurse_p)
5557 if (builder.encoded_nelts () == 1)
5558 /* A PFALSE or a PTRUE .B ALL. */
5559 return aarch64_emit_set_immediate (target, builder);
5561 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
5562 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
5564 /* If we can load the constant using PTRUE, use it as-is. */
5565 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
5566 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
5567 return aarch64_emit_set_immediate (target, builder);
5569 /* Otherwise use WHILE to set the first VL bits. */
5570 return aarch64_sve_move_pred_via_while (target, mode, vl);
5573 if (!allow_recurse_p)
5574 return NULL_RTX;
5576 /* Try inverting the vector in element size ELT_SIZE and then EORing
5577 the result with an ELT_SIZE PTRUE. */
5578 if (INTVAL (builder.elt (0)) == 0)
5579 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
5580 elt_size))
5581 return res;
5583 /* Try using TRN1 to permute two simpler constants. */
5584 for (unsigned int i = elt_size; i <= 8; i *= 2)
5585 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
5586 elt_size, i))
5587 return res;
5589 return NULL_RTX;
5592 /* Return an SVE predicate register that contains the VNx16BImode
5593 constant in BUILDER, without going through the move expanders.
5595 The returned register can have whatever mode seems most natural
5596 given the contents of BUILDER. Use TARGET for the result if
5597 convenient. */
5599 static rtx
5600 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
5602 /* Try loading the constant using pure predicate operations. */
5603 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5604 return res;
5606 /* Try forcing the constant to memory. */
5607 if (builder.full_nelts ().is_constant ())
5608 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5610 target = aarch64_target_reg (target, VNx16BImode);
5611 emit_move_insn (target, mem);
5612 return target;
5615 /* The last resort is to load the constant as an integer and then
5616 compare it against zero. Use -1 for set bits in order to increase
5617 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
5618 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5619 builder.nelts_per_pattern ());
5620 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5621 int_builder.quick_push (INTVAL (builder.elt (i))
5622 ? constm1_rtx : const0_rtx);
5623 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5624 int_builder.build ());
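/* As an example of the last resort: a predicate that repeats
   { 1, 1, 0, 0 } becomes the byte vector { -1, -1, 0, 0, ... }, i.e. the
   32-bit value 0x0000ffff replicated across the vector.  That is a valid
   bitmask immediate, so it can be materialized with a single DUPM and
   then converted back to a predicate by comparing against zero.  */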
5627 /* Set DEST to immediate IMM. */
5629 void
5630 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5632 machine_mode mode = GET_MODE (dest);
5634 /* Check on what type of symbol it is. */
5635 scalar_int_mode int_mode;
5636 if ((SYMBOL_REF_P (imm)
5637 || LABEL_REF_P (imm)
5638 || GET_CODE (imm) == CONST
5639 || GET_CODE (imm) == CONST_POLY_INT)
5640 && is_a <scalar_int_mode> (mode, &int_mode))
5642 rtx mem;
5643 poly_int64 offset;
5644 HOST_WIDE_INT const_offset;
5645 enum aarch64_symbol_type sty;
5647 /* If we have (const (plus symbol offset)), separate out the offset
5648 before we start classifying the symbol. */
5649 rtx base = strip_offset (imm, &offset);
5651 /* We must always add an offset involving VL separately, rather than
5652 folding it into the relocation. */
5653 if (!offset.is_constant (&const_offset))
5655 if (!TARGET_SVE)
5657 aarch64_report_sve_required ();
5658 return;
5660 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5661 emit_insn (gen_rtx_SET (dest, imm));
5662 else
5664 /* Do arithmetic on 32-bit values if the result is smaller
5665 than that. */
5666 if (partial_subreg_p (int_mode, SImode))
5668 /* It is invalid to do symbol calculations in modes
5669 narrower than SImode. */
5670 gcc_assert (base == const0_rtx);
5671 dest = gen_lowpart (SImode, dest);
5672 int_mode = SImode;
5674 if (base != const0_rtx)
5676 base = aarch64_force_temporary (int_mode, dest, base);
5677 aarch64_add_offset (int_mode, dest, base, offset,
5678 NULL_RTX, NULL_RTX, false);
5680 else
5681 aarch64_add_offset (int_mode, dest, base, offset,
5682 dest, NULL_RTX, false);
5684 return;
5687 sty = aarch64_classify_symbol (base, const_offset);
5688 switch (sty)
5690 case SYMBOL_FORCE_TO_MEM:
5691 if (int_mode != ptr_mode)
5692 imm = convert_memory_address (ptr_mode, imm);
5694 if (const_offset != 0
5695 && targetm.cannot_force_const_mem (ptr_mode, imm))
5697 gcc_assert (can_create_pseudo_p ());
5698 base = aarch64_force_temporary (int_mode, dest, base);
5699 aarch64_add_offset (int_mode, dest, base, const_offset,
5700 NULL_RTX, NULL_RTX, false);
5701 return;
5704 mem = force_const_mem (ptr_mode, imm);
5705 gcc_assert (mem);
5707 /* If we aren't generating PC relative literals, then
5708 we need to expand the literal pool access carefully.
5709 This is something that needs to be done in a number
5710 of places, so could well live as a separate function. */
5711 if (!aarch64_pcrelative_literal_loads)
5713 gcc_assert (can_create_pseudo_p ());
5714 base = gen_reg_rtx (ptr_mode);
5715 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5716 if (ptr_mode != Pmode)
5717 base = convert_memory_address (Pmode, base);
5718 mem = gen_rtx_MEM (ptr_mode, base);
5721 if (int_mode != ptr_mode)
5722 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5724 emit_insn (gen_rtx_SET (dest, mem));
5726 return;
5728 case SYMBOL_SMALL_TLSGD:
5729 case SYMBOL_SMALL_TLSDESC:
5730 case SYMBOL_SMALL_TLSIE:
5731 case SYMBOL_SMALL_GOT_28K:
5732 case SYMBOL_SMALL_GOT_4G:
5733 case SYMBOL_TINY_GOT:
5734 case SYMBOL_TINY_TLSIE:
5735 if (const_offset != 0)
5737 gcc_assert (can_create_pseudo_p ());
5738 base = aarch64_force_temporary (int_mode, dest, base);
5739 aarch64_add_offset (int_mode, dest, base, const_offset,
5740 NULL_RTX, NULL_RTX, false);
5741 return;
5743 /* FALLTHRU */
5745 case SYMBOL_SMALL_ABSOLUTE:
5746 case SYMBOL_TINY_ABSOLUTE:
5747 case SYMBOL_TLSLE12:
5748 case SYMBOL_TLSLE24:
5749 case SYMBOL_TLSLE32:
5750 case SYMBOL_TLSLE48:
5751 aarch64_load_symref_appropriately (dest, imm, sty);
5752 return;
5754 default:
5755 gcc_unreachable ();
5759 if (!CONST_INT_P (imm))
5761 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5763 /* Only the low bit of each .H, .S and .D element is defined,
5764 so we can set the upper bits to whatever we like. If the
5765 predicate is all-true in MODE, prefer to set all the undefined
5766 bits as well, so that we can share a single .B predicate for
5767 all modes. */
5768 if (imm == CONSTM1_RTX (mode))
5769 imm = CONSTM1_RTX (VNx16BImode);
5771 /* All methods for constructing predicate modes wider than VNx16BI
5772 will set the upper bits of each element to zero. Expose this
5773 by moving such constants as a VNx16BI, so that all bits are
5774 significant and so that constants for different modes can be
5775 shared. The wider constant will still be available as a
5776 REG_EQUAL note. */
5777 rtx_vector_builder builder;
5778 if (aarch64_get_sve_pred_bits (builder, imm))
5780 rtx res = aarch64_expand_sve_const_pred (dest, builder);
5781 if (dest != res)
5782 emit_move_insn (dest, gen_lowpart (mode, res));
5783 return;
5787 if (GET_CODE (imm) == HIGH
5788 || aarch64_simd_valid_immediate (imm, NULL))
5790 emit_insn (gen_rtx_SET (dest, imm));
5791 return;
5794 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5795 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5797 if (dest != res)
5798 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5799 return;
5802 rtx mem = force_const_mem (mode, imm);
5803 gcc_assert (mem);
5804 emit_move_insn (dest, mem);
5805 return;
5808 aarch64_internal_mov_immediate (dest, imm, true,
5809 as_a <scalar_int_mode> (mode));
5812 /* Return the MEM rtx that provides the canary value that should be used
5813 for stack-smashing protection. MODE is the mode of the memory.
5814 For SSP_GLOBAL, DECL_RTL is the MEM rtx for the canary variable
5815 (__stack_chk_guard), otherwise it has no useful value. SALT_TYPE
5816 indicates whether the caller is performing a SET or a TEST operation. */
5818 rtx
5819 aarch64_stack_protect_canary_mem (machine_mode mode, rtx decl_rtl,
5820 aarch64_salt_type salt_type)
5822 rtx addr;
5823 if (aarch64_stack_protector_guard == SSP_GLOBAL)
5825 gcc_assert (MEM_P (decl_rtl));
5826 addr = XEXP (decl_rtl, 0);
5827 poly_int64 offset;
5828 rtx base = strip_offset_and_salt (addr, &offset);
5829 if (!SYMBOL_REF_P (base))
5830 return decl_rtl;
5832 rtvec v = gen_rtvec (2, base, GEN_INT (salt_type));
5833 addr = gen_rtx_UNSPEC (Pmode, v, UNSPEC_SALT_ADDR);
5834 addr = gen_rtx_CONST (Pmode, addr);
5835 addr = plus_constant (Pmode, addr, offset);
5837 else
5839 /* Calculate the address from the system register. */
5840 rtx salt = GEN_INT (salt_type);
5841 addr = gen_reg_rtx (mode);
5842 if (mode == DImode)
5843 emit_insn (gen_reg_stack_protect_address_di (addr, salt));
5844 else
5846 emit_insn (gen_reg_stack_protect_address_si (addr, salt));
5847 addr = convert_memory_address (Pmode, addr);
5849 addr = plus_constant (Pmode, addr, aarch64_stack_protector_guard_offset);
5851 return gen_rtx_MEM (mode, force_reg (Pmode, addr));
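/* As a rough sketch of the SSP_SYSREG path: with
   -mstack-protector-guard=sysreg -mstack-protector-guard-reg=sp_el0
   -mstack-protector-guard-offset=16, the reg_stack_protect_address
   pattern above reads sp_el0 into a scratch register (an MRS) and the
   canary is then loaded from offset 16 off that address.  The register
   name and offset are only examples; any values accepted by those
   options behave the same way.  */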
5854 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
5855 that is known to contain PTRUE. */
5857 void
5858 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5860 expand_operand ops[3];
5861 machine_mode mode = GET_MODE (dest);
5862 create_output_operand (&ops[0], dest, mode);
5863 create_input_operand (&ops[1], pred, GET_MODE (pred));
5864 create_input_operand (&ops[2], src, mode);
5865 temporary_volatile_ok v (true);
5866 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
5869 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5870 operand is in memory. In this case we need to use the predicated LD1
5871 and ST1 instead of LDR and STR, both for correctness on big-endian
5872 targets and because LD1 and ST1 support a wider range of addressing modes.
5873 PRED_MODE is the mode of the predicate.
5875 See the comment at the head of aarch64-sve.md for details about the
5876 big-endian handling. */
5878 void
5879 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5881 machine_mode mode = GET_MODE (dest);
5882 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5883 if (!register_operand (src, mode)
5884 && !register_operand (dest, mode))
5886 rtx tmp = gen_reg_rtx (mode);
5887 if (MEM_P (src))
5888 aarch64_emit_sve_pred_move (tmp, ptrue, src);
5889 else
5890 emit_move_insn (tmp, src);
5891 src = tmp;
5893 aarch64_emit_sve_pred_move (dest, ptrue, src);
5896 /* Called only on big-endian targets. See whether an SVE vector move
5897 from SRC to DEST is effectively a REV[BHW] instruction, because at
5898 least one operand is a subreg of an SVE vector that has wider or
5899 narrower elements. Return true and emit the instruction if so.
5901 For example:
5903 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5905 represents a VIEW_CONVERT between the following vectors, viewed
5906 in memory order:
5908 R2: { [0].high, [0].low, [1].high, [1].low, ... }
5909 R1: { [0], [1], [2], [3], ... }
5911 The high part of lane X in R2 should therefore correspond to lane X*2
5912 of R1, but the register representations are:
5914 msb lsb
5915 R2: ...... [1].high [1].low [0].high [0].low
5916 R1: ...... [3] [2] [1] [0]
5918 where the low part of lane X in R2 corresponds to lane X*2 in R1.
5919 We therefore need a reverse operation to swap the high and low values
5920 around.
5922 This is purely an optimization. Without it we would spill the
5923 subreg operand to the stack in one mode and reload it in the
5924 other mode, which has the same effect as the REV. */
5926 bool
5927 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5929 gcc_assert (BYTES_BIG_ENDIAN);
5931 /* Do not try to optimize subregs that LRA has created for matched
5932 reloads. These subregs only exist as a temporary measure to make
5933 the RTL well-formed, but they are exempt from the usual
5934 TARGET_CAN_CHANGE_MODE_CLASS rules.
5936 For example, if we have:
5938 (set (reg:VNx8HI R1) (foo:VNx8HI (reg:VNx4SI R2)))
5940 and the constraints require R1 and R2 to be in the same register,
5941 LRA may need to create RTL such as:
5943 (set (subreg:VNx4SI (reg:VNx8HI TMP) 0) (reg:VNx4SI R2))
5944 (set (reg:VNx8HI TMP) (foo:VNx8HI (subreg:VNx4SI (reg:VNx8HI TMP) 0)))
5945 (set (reg:VNx8HI R1) (reg:VNx8HI TMP))
5947 which forces both the input and output of the original instruction
5948 to use the same hard register. But for this to work, the normal
5949 rules have to be suppressed on the subreg input, otherwise LRA
5950 would need to reload that input too, meaning that the process
5951 would never terminate. To compensate for this, the normal rules
5952 are also suppressed for the subreg output of the first move.
5953 Ignoring the special case and handling the first move normally
5954 would therefore generate wrong code: we would reverse the elements
5955 for the first subreg but not reverse them back for the second subreg. */
5956 if (SUBREG_P (dest) && !LRA_SUBREG_P (dest))
5957 dest = SUBREG_REG (dest);
5958 if (SUBREG_P (src) && !LRA_SUBREG_P (src))
5959 src = SUBREG_REG (src);
5961 /* The optimization handles two single SVE REGs with different element
5962 sizes. */
5963 if (!REG_P (dest)
5964 || !REG_P (src)
5965 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5966 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5967 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5968 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5969 return false;
5971 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
5972 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
5973 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5974 UNSPEC_REV_SUBREG);
5975 emit_insn (gen_rtx_SET (dest, unspec));
5976 return true;
5979 /* Return a copy of X with mode MODE, without changing its other
5980 attributes. Unlike gen_lowpart, this doesn't care whether the
5981 mode change is valid. */
5983 rtx
5984 aarch64_replace_reg_mode (rtx x, machine_mode mode)
5986 if (GET_MODE (x) == mode)
5987 return x;
5989 x = shallow_copy_rtx (x);
5990 set_mode_and_regno (x, mode, REGNO (x));
5991 return x;
5994 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
5995 stored in wider integer containers. */
5997 static unsigned int
5998 aarch64_sve_rev_unspec (machine_mode mode)
6000 switch (GET_MODE_UNIT_SIZE (mode))
6002 case 1: return UNSPEC_REVB;
6003 case 2: return UNSPEC_REVH;
6004 case 4: return UNSPEC_REVW;
6006 gcc_unreachable ();
6009 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
6010 operands. */
6012 void
6013 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
6015 /* Decide which REV operation we need. The mode with wider elements
6016 determines the mode of the operands and the mode with the narrower
6017 elements determines the reverse width. */
6018 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
6019 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
6020 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
6021 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
6022 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
6024 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
6025 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
6027 /* Get the operands in the appropriate modes and emit the instruction. */
6028 ptrue = gen_lowpart (pred_mode, ptrue);
6029 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
6030 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
6031 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
6032 dest, ptrue, src));
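/* For example, splitting the big-endian move

     (set (reg:VNx8HI z0) (subreg:VNx8HI (reg:VNx16QI z1) 0))

   selects UNSPEC_REVB (the narrower elements are bytes), views both data
   registers and the PTRUE in the .H domain, and so emits the equivalent
   of

     revb    z0.h, p0/m, z1.h

   where p0 stands for whichever all-true predicate was passed in.  */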
6035 static bool
6036 aarch64_function_ok_for_sibcall (tree, tree exp)
6038 if (crtl->abi->id () != expr_callee_abi (exp).id ())
6039 return false;
6041 return true;
6044 /* Subroutine of aarch64_pass_by_reference for arguments that are not
6045 passed in SVE registers. */
6047 static bool
6048 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
6049 const function_arg_info &arg)
6051 HOST_WIDE_INT size;
6052 machine_mode dummymode;
6053 int nregs;
6055 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
6056 if (arg.mode == BLKmode && arg.type)
6057 size = int_size_in_bytes (arg.type);
6058 else
6059 /* No frontends can create types with variable-sized modes, so we
6060 shouldn't be asked to pass or return them. */
6061 size = GET_MODE_SIZE (arg.mode).to_constant ();
6063 /* Aggregates are passed by reference based on their size. */
6064 if (arg.aggregate_type_p ())
6065 size = int_size_in_bytes (arg.type);
6068 /* Variable sized arguments are always passed by reference. */
6068 if (size < 0)
6069 return true;
6071 /* Can this be a candidate to be passed in fp/simd register(s)? */
6072 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
6073 &dummymode, &nregs, NULL,
6074 !pcum || pcum->silent_p))
6075 return false;
6077 /* Arguments which are variable sized or larger than 2 registers are
6078 passed by reference unless they are a homogeneous floating-point
6079 aggregate. */
6080 return size > 2 * UNITS_PER_WORD;
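/* Some concrete consequences of the rules above: a 24-byte plain
   structure is passed by reference, a 16-byte structure is passed by
   value in a pair of general registers, and an HFA of four doubles
   (32 bytes) is not passed by reference because the candidate check
   succeeds before the size test is reached.  */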
6083 /* Implement TARGET_PASS_BY_REFERENCE. */
6085 static bool
6086 aarch64_pass_by_reference (cumulative_args_t pcum_v,
6087 const function_arg_info &arg)
6089 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6091 if (!arg.type)
6092 return aarch64_pass_by_reference_1 (pcum, arg);
6094 pure_scalable_type_info pst_info;
6095 switch (pst_info.analyze (arg.type))
6097 case pure_scalable_type_info::IS_PST:
6098 if (pcum && !pcum->silent_p && !TARGET_SVE)
6099 /* We can't gracefully recover at this point, so make this a
6100 fatal error. */
6101 fatal_error (input_location, "arguments of type %qT require"
6102 " the SVE ISA extension", arg.type);
6104 /* Variadic SVE types are passed by reference. Normal non-variadic
6105 arguments are too if we've run out of registers. */
6106 return (!arg.named
6107 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
6108 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
6110 case pure_scalable_type_info::DOESNT_MATTER:
6111 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
6112 return true;
6114 case pure_scalable_type_info::NO_ABI_IDENTITY:
6115 case pure_scalable_type_info::ISNT_PST:
6116 return aarch64_pass_by_reference_1 (pcum, arg);
6118 gcc_unreachable ();
6121 /* Return TRUE if VALTYPE is padded to its least significant bits. */
6122 static bool
6123 aarch64_return_in_msb (const_tree valtype)
6125 machine_mode dummy_mode;
6126 int dummy_int;
6128 /* Never happens in little-endian mode. */
6129 if (!BYTES_BIG_ENDIAN)
6130 return false;
6132 /* Only composite types smaller than or equal to 16 bytes can
6133 be potentially returned in registers. */
6134 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
6135 || int_size_in_bytes (valtype) <= 0
6136 || int_size_in_bytes (valtype) > 16)
6137 return false;
6139 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
6140 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
6141 is always passed/returned in the least significant bits of fp/simd
6142 register(s). */
6143 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
6144 &dummy_mode, &dummy_int, NULL,
6145 false))
6146 return false;
6148 /* Likewise pure scalable types for SVE vector and predicate registers. */
6149 pure_scalable_type_info pst_info;
6150 if (pst_info.analyze_registers (valtype))
6151 return false;
6153 return true;
6156 /* Implement TARGET_FUNCTION_VALUE.
6157 Define how to find the value returned by a function. */
6159 static rtx
6160 aarch64_function_value (const_tree type, const_tree func,
6161 bool outgoing ATTRIBUTE_UNUSED)
6163 machine_mode mode;
6164 int unsignedp;
6166 mode = TYPE_MODE (type);
6167 if (INTEGRAL_TYPE_P (type))
6168 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
6170 pure_scalable_type_info pst_info;
6171 if (type && pst_info.analyze_registers (type))
6172 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
6174 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6175 are returned in memory, not by value. */
6176 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6177 bool sve_p = (vec_flags & VEC_ANY_SVE);
6179 if (aarch64_return_in_msb (type))
6181 HOST_WIDE_INT size = int_size_in_bytes (type);
6183 if (size % UNITS_PER_WORD != 0)
6185 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
6186 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
6190 int count;
6191 machine_mode ag_mode;
6192 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
6193 NULL, false))
6195 gcc_assert (!sve_p);
6196 if (!aarch64_composite_type_p (type, mode))
6198 gcc_assert (count == 1 && mode == ag_mode);
6199 return gen_rtx_REG (mode, V0_REGNUM);
6201 else
6203 int i;
6204 rtx par;
6206 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
6207 for (i = 0; i < count; i++)
6209 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
6210 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
6211 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6212 XVECEXP (par, 0, i) = tmp;
6214 return par;
6217 else
6219 if (sve_p)
6221 /* Vector types can acquire a partial SVE mode using things like
6222 __attribute__((vector_size(N))), and this is potentially useful.
6223 However, the choice of mode doesn't affect the type's ABI
6224 identity, so we should treat the types as though they had
6225 the associated integer mode, just like they did before SVE
6226 was introduced.
6228 We know that the vector must be 128 bits or smaller,
6229 otherwise we'd have returned it in memory instead. */
6230 gcc_assert (type
6231 && (aarch64_some_values_include_pst_objects_p (type)
6232 || (vec_flags & VEC_PARTIAL)));
6234 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
6235 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
6236 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
6237 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
6239 return gen_rtx_REG (mode, R0_REGNUM);
6243 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
6244 Return true if REGNO is the number of a hard register in which the values
6245 of called function may come back. */
6247 static bool
6248 aarch64_function_value_regno_p (const unsigned int regno)
6250 /* A maximum of 16 bytes can be returned in the general registers.  Examples
6251 of 16-byte return values are: 128-bit integers and 16-byte small
6252 structures (excluding homogeneous floating-point aggregates). */
6253 if (regno == R0_REGNUM || regno == R1_REGNUM)
6254 return true;
6256 /* Up to four fp/simd registers can return a function value, e.g. a
6257 homogeneous floating-point aggregate having four members. */
6258 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
6259 return TARGET_FLOAT;
6261 return false;
6264 /* Subroutine for aarch64_return_in_memory for types that are not returned
6265 in SVE registers. */
6267 static bool
6268 aarch64_return_in_memory_1 (const_tree type)
6270 HOST_WIDE_INT size;
6271 machine_mode ag_mode;
6272 int count;
6274 if (!AGGREGATE_TYPE_P (type)
6275 && TREE_CODE (type) != COMPLEX_TYPE
6276 && TREE_CODE (type) != VECTOR_TYPE)
6277 /* Simple scalar types are always returned in registers. */
6278 return false;
6280 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6281 &ag_mode, &count, NULL, false))
6282 return false;
6284 /* Types larger than 2 registers are returned in memory. */
6285 size = int_size_in_bytes (type);
6286 return (size < 0 || size > 2 * UNITS_PER_WORD);
6289 /* Implement TARGET_RETURN_IN_MEMORY.
6291 If the type T of the result of a function is such that
6292 void func (T arg)
6293 would require that arg be passed as a value in a register (or set of
6294 registers) according to the parameter passing rules, then the result
6295 is returned in the same registers as would be used for such an
6296 argument. */
6298 static bool
6299 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
6301 pure_scalable_type_info pst_info;
6302 switch (pst_info.analyze (type))
6304 case pure_scalable_type_info::IS_PST:
6305 return (pst_info.num_zr () > NUM_FP_ARG_REGS
6306 || pst_info.num_pr () > NUM_PR_ARG_REGS);
6308 case pure_scalable_type_info::DOESNT_MATTER:
6309 gcc_assert (aarch64_return_in_memory_1 (type));
6310 return true;
6312 case pure_scalable_type_info::NO_ABI_IDENTITY:
6313 case pure_scalable_type_info::ISNT_PST:
6314 return aarch64_return_in_memory_1 (type);
6316 gcc_unreachable ();
6319 static bool
6320 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
6321 const_tree type, int *nregs)
6323 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6324 return aarch64_vfp_is_call_or_return_candidate (mode, type,
6325 &pcum->aapcs_vfp_rmode,
6326 nregs, NULL, pcum->silent_p);
6329 /* Given MODE and TYPE of a function argument, return the alignment in
6330 bits. The idea is to suppress any stronger alignment requested by
6331 the user and opt for the natural alignment (specified in AAPCS64 \S
6332 4.1). ABI_BREAK is set to the alignment that versions of GCC prior
6333 to GCC 9.1 would have used when that differs from the new value, and
6334 to zero otherwise. This is a helper function for local use only. */
6336 static unsigned int
6337 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
6338 unsigned int *abi_break)
6340 *abi_break = 0;
6341 if (!type)
6342 return GET_MODE_ALIGNMENT (mode);
6344 if (integer_zerop (TYPE_SIZE (type)))
6345 return 0;
6347 gcc_assert (TYPE_MODE (type) == mode);
6349 if (!AGGREGATE_TYPE_P (type))
6350 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
6352 if (TREE_CODE (type) == ARRAY_TYPE)
6353 return TYPE_ALIGN (TREE_TYPE (type));
6355 unsigned int alignment = 0;
6356 unsigned int bitfield_alignment = 0;
6357 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
6358 if (TREE_CODE (field) == FIELD_DECL)
6360 /* Note that we explicitly consider zero-sized fields here,
6361 even though they don't map to AAPCS64 machine types.
6362 For example, in:
6364 struct __attribute__((aligned(8))) empty {};
6366 struct s {
6367 [[no_unique_address]] empty e;
6368 int x;
6369 };
6371 "s" contains only one Fundamental Data Type (the int field)
6372 but gains 8-byte alignment and size thanks to "e". */
6373 alignment = std::max (alignment, DECL_ALIGN (field));
6374 if (DECL_BIT_FIELD_TYPE (field))
6375 bitfield_alignment
6376 = std::max (bitfield_alignment,
6377 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
6380 if (bitfield_alignment > alignment)
6382 *abi_break = alignment;
6383 return bitfield_alignment;
6386 return alignment;
6389 /* Layout a function argument according to the AAPCS64 rules. The rule
6390 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
6391 mode that was originally given to us by the target hook, whereas the
6392 mode in ARG might be the result of replacing partial SVE modes with
6393 the equivalent integer mode. */
6395 static void
6396 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6398 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6399 tree type = arg.type;
6400 machine_mode mode = arg.mode;
6401 int ncrn, nvrn, nregs;
6402 bool allocate_ncrn, allocate_nvrn;
6403 HOST_WIDE_INT size;
6404 unsigned int abi_break;
6406 /* We need to do this once per argument. */
6407 if (pcum->aapcs_arg_processed)
6408 return;
6410 pcum->aapcs_arg_processed = true;
6412 pure_scalable_type_info pst_info;
6413 if (type && pst_info.analyze_registers (type))
6415 /* The PCS says that it is invalid to pass an SVE value to an
6416 unprototyped function. There is no ABI-defined location we
6417 can return in this case, so we have no real choice but to raise
6418 an error immediately, even though this is only a query function. */
6419 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
6421 gcc_assert (!pcum->silent_p);
6422 error ("SVE type %qT cannot be passed to an unprototyped function",
6423 arg.type);
6424 /* Avoid repeating the message, and avoid tripping the assert
6425 below. */
6426 pcum->pcs_variant = ARM_PCS_SVE;
6429 /* We would have converted the argument into pass-by-reference
6430 form if it didn't fit in registers. */
6431 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
6432 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
6433 gcc_assert (arg.named
6434 && pcum->pcs_variant == ARM_PCS_SVE
6435 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
6436 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
6437 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
6438 P0_REGNUM + pcum->aapcs_nprn);
6439 return;
6442 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
6443 are passed by reference, not by value. */
6444 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6445 bool sve_p = (vec_flags & VEC_ANY_SVE);
6446 if (sve_p)
6447 /* Vector types can acquire a partial SVE mode using things like
6448 __attribute__((vector_size(N))), and this is potentially useful.
6449 However, the choice of mode doesn't affect the type's ABI
6450 identity, so we should treat the types as though they had
6451 the associated integer mode, just like they did before SVE
6452 was introduced.
6454 We know that the vector must be 128 bits or smaller,
6455 otherwise we'd have passed it in memory instead. */
6456 gcc_assert (type
6457 && (aarch64_some_values_include_pst_objects_p (type)
6458 || (vec_flags & VEC_PARTIAL)));
6460 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
6461 if (type)
6462 size = int_size_in_bytes (type);
6463 else
6464 /* No frontends can create types with variable-sized modes, so we
6465 shouldn't be asked to pass or return them. */
6466 size = GET_MODE_SIZE (mode).to_constant ();
6467 size = ROUND_UP (size, UNITS_PER_WORD);
6469 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
6470 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
6471 mode,
6472 type,
6473 &nregs);
6474 gcc_assert (!sve_p || !allocate_nvrn);
6476 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
6477 The following code thus handles passing by SIMD/FP registers first. */
6479 nvrn = pcum->aapcs_nvrn;
6481 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
6482 and homogeneous short-vector aggregates (HVA). */
6483 if (allocate_nvrn)
6485 if (!pcum->silent_p && !TARGET_FLOAT)
6486 aarch64_err_no_fpadvsimd (mode);
6488 if (nvrn + nregs <= NUM_FP_ARG_REGS)
6490 pcum->aapcs_nextnvrn = nvrn + nregs;
6491 if (!aarch64_composite_type_p (type, mode))
6493 gcc_assert (nregs == 1);
6494 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
6496 else
6498 rtx par;
6499 int i;
6500 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6501 for (i = 0; i < nregs; i++)
6503 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
6504 V0_REGNUM + nvrn + i);
6505 rtx offset = gen_int_mode
6506 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
6507 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
6508 XVECEXP (par, 0, i) = tmp;
6510 pcum->aapcs_reg = par;
6512 return;
6514 else
6516 /* C.3 NSRN is set to 8. */
6517 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
6518 goto on_stack;
6522 ncrn = pcum->aapcs_ncrn;
6523 nregs = size / UNITS_PER_WORD;
6525 /* C6 - C9, though the sign and zero extension semantics are
6526 handled elsewhere. This is the case where the argument fits
6527 entirely in general registers. */
6528 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
6530 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
6532 /* C.8 if the argument has an alignment of 16 then the NGRN is
6533 rounded up to the next even number. */
6534 if (nregs == 2
6535 && ncrn % 2
6536 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
6537 comparison is there because for > 16 * BITS_PER_UNIT
6538 alignment nregs should be > 2 and therefore it should be
6539 passed by reference rather than value. */
6540 && (aarch64_function_arg_alignment (mode, type, &abi_break)
6541 == 16 * BITS_PER_UNIT))
6543 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6544 inform (input_location, "parameter passing for argument of type "
6545 "%qT changed in GCC 9.1", type);
6546 ++ncrn;
6547 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
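/* As a worked example of C.8: a structure containing a single __int128
   has a natural alignment of 16, so if it arrives when NCRN is odd
   (say x0 is already taken) NCRN is rounded up to the next even number
   and the argument occupies x2 and x3, rather than being split across
   x1 and x2.  */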
6550 /* If an argument with an SVE mode needs to be shifted up to the
6551 high part of the register, treat it as though it had an integer mode.
6552 Using the normal (parallel [...]) would suppress the shifting. */
6553 if (sve_p
6554 && BYTES_BIG_ENDIAN
6555 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
6556 && aarch64_pad_reg_upward (mode, type, false))
6558 mode = int_mode_for_mode (mode).require ();
6559 sve_p = false;
6562 /* NREGS can be 0 when e.g. an empty structure is to be passed.
6563 A reg is still generated for it, but the caller should be smart
6564 enough not to use it. */
6565 if (nregs == 0
6566 || (nregs == 1 && !sve_p)
6567 || GET_MODE_CLASS (mode) == MODE_INT)
6568 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
6569 else
6571 rtx par;
6572 int i;
6574 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
6575 for (i = 0; i < nregs; i++)
6577 scalar_int_mode reg_mode = word_mode;
6578 if (nregs == 1)
6579 reg_mode = int_mode_for_mode (mode).require ();
6580 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
6581 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
6582 GEN_INT (i * UNITS_PER_WORD));
6583 XVECEXP (par, 0, i) = tmp;
6585 pcum->aapcs_reg = par;
6588 pcum->aapcs_nextncrn = ncrn + nregs;
6589 return;
6592 /* C.11 */
6593 pcum->aapcs_nextncrn = NUM_ARG_REGS;
6595 /* The argument is passed on stack; record the needed number of words for
6596 this argument and align the total size if necessary. */
6597 on_stack:
6598 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
6600 if (aarch64_function_arg_alignment (mode, type, &abi_break)
6601 == 16 * BITS_PER_UNIT)
6603 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
6604 if (pcum->aapcs_stack_size != new_size)
6606 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
6607 inform (input_location, "parameter passing for argument of type "
6608 "%qT changed in GCC 9.1", type);
6609 pcum->aapcs_stack_size = new_size;
6612 return;
6615 /* Implement TARGET_FUNCTION_ARG. */
6617 static rtx
6618 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
6620 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6621 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
6622 || pcum->pcs_variant == ARM_PCS_SIMD
6623 || pcum->pcs_variant == ARM_PCS_SVE);
6625 if (arg.end_marker_p ())
6626 return gen_int_mode (pcum->pcs_variant, DImode);
6628 aarch64_layout_arg (pcum_v, arg);
6629 return pcum->aapcs_reg;
6632 void
6633 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
6634 const_tree fntype,
6635 rtx libname ATTRIBUTE_UNUSED,
6636 const_tree fndecl ATTRIBUTE_UNUSED,
6637 unsigned n_named ATTRIBUTE_UNUSED,
6638 bool silent_p)
6640 pcum->aapcs_ncrn = 0;
6641 pcum->aapcs_nvrn = 0;
6642 pcum->aapcs_nprn = 0;
6643 pcum->aapcs_nextncrn = 0;
6644 pcum->aapcs_nextnvrn = 0;
6645 pcum->aapcs_nextnprn = 0;
6646 if (fntype)
6647 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
6648 else
6649 pcum->pcs_variant = ARM_PCS_AAPCS64;
6650 pcum->aapcs_reg = NULL_RTX;
6651 pcum->aapcs_arg_processed = false;
6652 pcum->aapcs_stack_words = 0;
6653 pcum->aapcs_stack_size = 0;
6654 pcum->silent_p = silent_p;
6656 if (!silent_p
6657 && !TARGET_FLOAT
6658 && fntype && fntype != error_mark_node)
6660 const_tree type = TREE_TYPE (fntype);
6661 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
6662 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
6663 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
6664 &mode, &nregs, NULL, false))
6665 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
6668 if (!silent_p
6669 && !TARGET_SVE
6670 && pcum->pcs_variant == ARM_PCS_SVE)
6672 /* We can't gracefully recover at this point, so make this a
6673 fatal error. */
6674 if (fndecl)
6675 fatal_error (input_location, "%qE requires the SVE ISA extension",
6676 fndecl);
6677 else
6678 fatal_error (input_location, "calls to functions of type %qT require"
6679 " the SVE ISA extension", fntype);
6683 static void
6684 aarch64_function_arg_advance (cumulative_args_t pcum_v,
6685 const function_arg_info &arg)
6687 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6688 if (pcum->pcs_variant == ARM_PCS_AAPCS64
6689 || pcum->pcs_variant == ARM_PCS_SIMD
6690 || pcum->pcs_variant == ARM_PCS_SVE)
6692 aarch64_layout_arg (pcum_v, arg);
6693 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
6694 != (pcum->aapcs_stack_words != 0));
6695 pcum->aapcs_arg_processed = false;
6696 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
6697 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
6698 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
6699 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
6700 pcum->aapcs_stack_words = 0;
6701 pcum->aapcs_reg = NULL_RTX;
6705 bool
6706 aarch64_function_arg_regno_p (unsigned regno)
6708 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
6709 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
6712 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
6713 PARM_BOUNDARY bits of alignment, but will be given anything up
6714 to STACK_BOUNDARY bits if the type requires it. This makes sure
6715 that both before and after the layout of each argument, the Next
6716 Stacked Argument Address (NSAA) will have a minimum alignment of
6717 8 bytes. */
6719 static unsigned int
6720 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
6722 unsigned int abi_break;
6723 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6724 &abi_break);
6725 alignment = MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
6726 if (abi_break && warn_psabi)
6728 abi_break = MIN (MAX (abi_break, PARM_BOUNDARY), STACK_BOUNDARY);
6729 if (alignment != abi_break)
6730 inform (input_location, "parameter passing for argument of type "
6731 "%qT changed in GCC 9.1", type);
6734 return alignment;
6737 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
6739 static fixed_size_mode
6740 aarch64_get_reg_raw_mode (int regno)
6742 if (TARGET_SVE && FP_REGNUM_P (regno))
6743 /* Don't use the SVE part of the register for __builtin_apply and
6744 __builtin_return. The SVE registers aren't used by the normal PCS,
6745 so using them there would be a waste of time. The PCS extensions
6746 for SVE types are fundamentally incompatible with the
6747 __builtin_return/__builtin_apply interface. */
6748 return as_a <fixed_size_mode> (V16QImode);
6749 return default_get_reg_raw_mode (regno);
6752 /* Implement TARGET_FUNCTION_ARG_PADDING.
6754 Small aggregate types are placed in the lowest memory address.
6756 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
6758 static pad_direction
6759 aarch64_function_arg_padding (machine_mode mode, const_tree type)
6761 /* On little-endian targets, the least significant byte of every stack
6762 argument is passed at the lowest byte address of the stack slot. */
6763 if (!BYTES_BIG_ENDIAN)
6764 return PAD_UPWARD;
6766 /* Otherwise, integral, floating-point and pointer types are padded downward:
6767 the least significant byte of a stack argument is passed at the highest
6768 byte address of the stack slot. */
6769 if (type
6770 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
6771 || POINTER_TYPE_P (type))
6772 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
6773 return PAD_DOWNWARD;
6775 /* Everything else padded upward, i.e. data in first byte of stack slot. */
6776 return PAD_UPWARD;
6779 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6781 It specifies padding for the last (may also be the only)
6782 element of a block move between registers and memory. If
6783 assuming the block is in the memory, padding upward means that
6784 the last element is padded after its highest significant byte,
6785 while in downward padding, the last element is padded at
6786 its least significant byte side.
6788 Small aggregates and small complex types are always padded
6789 upwards.
6791 We don't need to worry about homogeneous floating-point or
6792 short-vector aggregates; their move is not affected by the
6793 padding direction determined here. Regardless of endianness,
6794 each element of such an aggregate is put in the least
6795 significant bits of a fp/simd register.
6797 Return !BYTES_BIG_ENDIAN if the least significant byte of the
6798 register has useful data, and return the opposite if the most
6799 significant byte does. */
6801 bool
6802 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
6803 bool first ATTRIBUTE_UNUSED)
6806 /* Aside from pure scalable types, small composite types are always
6807 padded upward. */
6808 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6810 HOST_WIDE_INT size;
6811 if (type)
6812 size = int_size_in_bytes (type);
6813 else
6814 /* No frontends can create types with variable-sized modes, so we
6815 shouldn't be asked to pass or return them. */
6816 size = GET_MODE_SIZE (mode).to_constant ();
6817 if (size < 2 * UNITS_PER_WORD)
6819 pure_scalable_type_info pst_info;
6820 if (pst_info.analyze_registers (type))
6821 return false;
6822 return true;
6826 /* Otherwise, use the default padding. */
6827 return !BYTES_BIG_ENDIAN;
6830 static scalar_int_mode
6831 aarch64_libgcc_cmp_return_mode (void)
6833 return SImode;
6836 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6838 /* We use the 12-bit shifted immediate arithmetic instructions so values
6839 must be a multiple of (1 << 12), i.e. 4096. */
6840 #define ARITH_FACTOR 4096
6842 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6843 #error Cannot use simple address calculation for stack probing
6844 #endif
6846 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6847 inclusive. These are offsets from the current stack pointer. */
6849 static void
6850 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
6852 HOST_WIDE_INT size;
6853 if (!poly_size.is_constant (&size))
6855 sorry ("stack probes for SVE frames");
6856 return;
6859 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REGNUM);
6861 /* See the same assertion on PROBE_INTERVAL above. */
6862 gcc_assert ((first % ARITH_FACTOR) == 0);
6864 /* See if we have a constant small number of probes to generate. If so,
6865 that's the easy case. */
6866 if (size <= PROBE_INTERVAL)
6868 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6870 emit_set_insn (reg1,
6871 plus_constant (Pmode,
6872 stack_pointer_rtx, -(first + base)));
6873 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
6876 /* The run-time loop is made up of 8 insns in the generic case while the
6877 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
6878 else if (size <= 4 * PROBE_INTERVAL)
6880 HOST_WIDE_INT i, rem;
6882 emit_set_insn (reg1,
6883 plus_constant (Pmode,
6884 stack_pointer_rtx,
6885 -(first + PROBE_INTERVAL)));
6886 emit_stack_probe (reg1);
6888 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6889 it exceeds SIZE. If only two probes are needed, this will not
6890 generate any code. Then probe at FIRST + SIZE. */
6891 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6893 emit_set_insn (reg1,
6894 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
6895 emit_stack_probe (reg1);
6898 rem = size - (i - PROBE_INTERVAL);
6899 if (rem > 256)
6901 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6903 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6904 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
6906 else
6907 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
6910 /* Otherwise, do the same as above, but in a loop. Note that we must be
6911 extra careful with variables wrapping around because we might be at
6912 the very top (or the very bottom) of the address space and we have
6913 to be able to handle this case properly; in particular, we use an
6914 equality test for the loop condition. */
6915 else
6917 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REGNUM);
6919 /* Step 1: round SIZE to the previous multiple of the interval. */
6921 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6924 /* Step 2: compute initial and final value of the loop counter. */
6926 /* TEST_ADDR = SP + FIRST. */
6927 emit_set_insn (reg1,
6928 plus_constant (Pmode, stack_pointer_rtx, -first));
6930 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
6931 HOST_WIDE_INT adjustment = - (first + rounded_size);
6932 if (! aarch64_uimm12_shift (adjustment))
6934 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6935 true, Pmode);
6936 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6938 else
6939 emit_set_insn (reg2,
6940 plus_constant (Pmode, stack_pointer_rtx, adjustment));
6942 /* Step 3: the loop
6946 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6947 probe at TEST_ADDR
6949 while (TEST_ADDR != LAST_ADDR)
6951 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6952 until it is equal to ROUNDED_SIZE. */
6954 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
6957 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6958 that SIZE is equal to ROUNDED_SIZE. */
6960 if (size != rounded_size)
6962 HOST_WIDE_INT rem = size - rounded_size;
6964 if (rem > 256)
6966 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6968 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6969 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
6971 else
6972 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
6976 /* Make sure nothing is scheduled before we are done. */
6977 emit_insn (gen_blockage ());
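/* As a worked example with a PROBE_INTERVAL of 4096 (the usual
   default): FIRST = 0, SIZE = 10000 takes the second branch above,
   probing at offsets 4096 and 8192 below the incoming stack pointer.
   The remainder 10000 - 8192 = 1808 exceeds 256, so REG1 is dropped by
   a further ROUND_UP (1808, 4096) = 4096 bytes and the final probe is
   issued 4096 - 1808 = 2288 bytes above it, i.e. exactly at offset
   10000.  */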
6980 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
6981 absolute addresses. */
6983 const char *
6984 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6986 static int labelno = 0;
6987 char loop_lab[32];
6988 rtx xops[2];
6990 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6992 /* Loop. */
6993 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6995 HOST_WIDE_INT stack_clash_probe_interval
6996 = 1 << param_stack_clash_protection_guard_size;
6998 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
6999 xops[0] = reg1;
7000 HOST_WIDE_INT interval;
7001 if (flag_stack_clash_protection)
7002 interval = stack_clash_probe_interval;
7003 else
7004 interval = PROBE_INTERVAL;
7006 gcc_assert (aarch64_uimm12_shift (interval));
7007 xops[1] = GEN_INT (interval);
7009 output_asm_insn ("sub\t%0, %0, %1", xops);
7011 /* If doing stack clash protection then we probe up by the ABI specified
7012 amount. We do this because we're dropping full pages at a time in the
7013 loop. But if we're doing non-stack clash probing, probe at SP 0. */
7014 if (flag_stack_clash_protection)
7015 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
7016 else
7017 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
7019 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
7020 by this amount for each iteration. */
7021 output_asm_insn ("str\txzr, [%0, %1]", xops);
7023 /* Test if TEST_ADDR == LAST_ADDR. */
7024 xops[1] = reg2;
7025 output_asm_insn ("cmp\t%0, %1", xops);
7027 /* Branch. */
7028 fputs ("\tb.ne\t", asm_out_file);
7029 assemble_name_raw (asm_out_file, loop_lab);
7030 fputc ('\n', asm_out_file);
7032 return "";
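/* Without stack clash protection the loop printed above is therefore
   of the form (register numbers and the 4096-byte interval purely
   illustrative):

     .LPSRL0:
             sub     x9, x9, 4096
             str     xzr, [x9, 0]
             cmp     x9, x10
             b.ne    .LPSRL0

   i.e. drop TEST_ADDR by one PROBE_INTERVAL, touch the newly exposed
   page, and repeat until TEST_ADDR reaches LAST_ADDR.  With stack clash
   protection the interval comes from the guard-size parameter and the
   store probes at offset STACK_CLASH_CALLER_GUARD from TEST_ADDR
   instead of at offset 0.  */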
7035 /* Emit the probe loop for doing stack clash probes and stack adjustments for
7036 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
7037 of GUARD_SIZE. When a probe is emitted it is done at most
7038 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
7039 at most MIN_PROBE_THRESHOLD. By the end of this function
7040 BASE = BASE - ADJUSTMENT. */
7042 const char *
7043 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
7044 rtx min_probe_threshold, rtx guard_size)
7046 /* This function is not allowed to use any instruction generation function
7047 like gen_ and friends. If you do you'll likely ICE during CFG validation,
7048 so instead emit the code you want using output_asm_insn. */
7049 gcc_assert (flag_stack_clash_protection);
7050 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
7051 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
7053 /* The minimum required allocation before the residual requires probing. */
7054 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
7056 /* Clamp the value down to the nearest value that can be used with a cmp. */
7057 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
7058 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
7060 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
7061 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
7063 static int labelno = 0;
7064 char loop_start_lab[32];
7065 char loop_end_lab[32];
7066 rtx xops[2];
7068 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
7069 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
7071 /* Emit loop start label. */
7072 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
7074 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
7075 xops[0] = adjustment;
7076 xops[1] = probe_offset_value_rtx;
7077 output_asm_insn ("cmp\t%0, %1", xops);
7079 /* Branch to end if not enough adjustment to probe. */
7080 fputs ("\tb.lt\t", asm_out_file);
7081 assemble_name_raw (asm_out_file, loop_end_lab);
7082 fputc ('\n', asm_out_file);
7084 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
7085 xops[0] = base;
7086 xops[1] = probe_offset_value_rtx;
7087 output_asm_insn ("sub\t%0, %0, %1", xops);
7089 /* Probe at BASE. */
7090 xops[1] = const0_rtx;
7091 output_asm_insn ("str\txzr, [%0, %1]", xops);
7093 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
7094 xops[0] = adjustment;
7095 xops[1] = probe_offset_value_rtx;
7096 output_asm_insn ("sub\t%0, %0, %1", xops);
7098 /* Branch to start if still more bytes to allocate. */
7099 fputs ("\tb\t", asm_out_file);
7100 assemble_name_raw (asm_out_file, loop_start_lab);
7101 fputc ('\n', asm_out_file);
7103 /* No probe needed; leave the loop. */
7104 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
7106 /* BASE = BASE - ADJUSTMENT. */
7107 xops[0] = base;
7108 xops[1] = adjustment;
7109 output_asm_insn ("sub\t%0, %0, %1", xops);
7110 return "";
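/* Assuming x10 holds ADJUSTMENT, x11 holds BASE and the clamped
   residual guard is 4096, the code printed above is:

     .SVLPSPL0:
             cmp     x10, 4096
             b.lt    .SVLPEND0
             sub     x11, x11, 4096
             str     xzr, [x11, 0]
             sub     x10, x10, 4096
             b       .SVLPSPL0
     .SVLPEND0:
             sub     x11, x11, x10

   i.e. peel off one guard-sized chunk at a time, probing each one, and
   finally apply whatever adjustment remains without a probe.  */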
7113 /* Determine whether a frame chain needs to be generated. */
7114 static bool
7115 aarch64_needs_frame_chain (void)
7117 /* Force a frame chain for EH returns so the return address is at FP+8. */
7118 if (frame_pointer_needed || crtl->calls_eh_return)
7119 return true;
7121 /* A leaf function cannot have calls or write LR. */
7122 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
7124 /* Don't use a frame chain in leaf functions if leaf frame pointers
7125 are disabled. */
7126 if (flag_omit_leaf_frame_pointer && is_leaf)
7127 return false;
7129 return aarch64_use_frame_pointer;
7132 /* Mark the registers that need to be saved by the callee and calculate
7133 the size of the callee-saved registers area and frame record (both FP
7134 and LR may be omitted). */
7135 static void
7136 aarch64_layout_frame (void)
7138 poly_int64 offset = 0;
7139 int regno, last_fp_reg = INVALID_REGNUM;
7140 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
7141 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
7142 bool frame_related_fp_reg_p = false;
7143 aarch64_frame &frame = cfun->machine->frame;
7145 frame.emit_frame_chain = aarch64_needs_frame_chain ();
7147 /* Adjust the outgoing arguments size if required. Keep it in sync with what
7148 the mid-end is doing. */
7149 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
7151 #define SLOT_NOT_REQUIRED (-2)
7152 #define SLOT_REQUIRED (-1)
7154 frame.wb_candidate1 = INVALID_REGNUM;
7155 frame.wb_candidate2 = INVALID_REGNUM;
7156 frame.spare_pred_reg = INVALID_REGNUM;
7158 /* First mark all the registers that really need to be saved... */
7159 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7160 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
7162 /* ... that includes the eh data registers (if needed)... */
7163 if (crtl->calls_eh_return)
7164 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
7165 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
7167 /* ... and any callee saved register that dataflow says is live. */
7168 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7169 if (df_regs_ever_live_p (regno)
7170 && !fixed_regs[regno]
7171 && (regno == R30_REGNUM
7172 || !crtl->abi->clobbers_full_reg_p (regno)))
7173 frame.reg_offset[regno] = SLOT_REQUIRED;
7175 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7176 if (df_regs_ever_live_p (regno)
7177 && !fixed_regs[regno]
7178 && !crtl->abi->clobbers_full_reg_p (regno))
7180 frame.reg_offset[regno] = SLOT_REQUIRED;
7181 last_fp_reg = regno;
7182 if (aarch64_emit_cfi_for_reg_p (regno))
7183 frame_related_fp_reg_p = true;
7186 /* Big-endian SVE frames need a spare predicate register in order
7187 to save Z8-Z15. Decide which register they should use. Prefer
7188 an unused argument register if possible, so that we don't force P4
7189 to be saved unnecessarily. */
7190 if (frame_related_fp_reg_p
7191 && crtl->abi->id () == ARM_PCS_SVE
7192 && BYTES_BIG_ENDIAN)
7194 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
7195 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
7196 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
7197 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
7198 break;
7199 gcc_assert (regno <= P7_REGNUM);
7200 frame.spare_pred_reg = regno;
7201 df_set_regs_ever_live (regno, true);
7204 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7205 if (df_regs_ever_live_p (regno)
7206 && !fixed_regs[regno]
7207 && !crtl->abi->clobbers_full_reg_p (regno))
7208 frame.reg_offset[regno] = SLOT_REQUIRED;
7210 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
7211 LR counts as an implicit probe which allows us to maintain the invariant
7212 described in the comment at expand_prologue. */
7213 gcc_assert (crtl->is_leaf
7214 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
7216 /* Now assign stack slots for the registers. Start with the predicate
7217 registers, since predicate LDR and STR have a relatively small
7218 offset range. These saves happen below the hard frame pointer. */
7219 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
7220 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7222 frame.reg_offset[regno] = offset;
7223 offset += BYTES_PER_SVE_PRED;
7226 if (maybe_ne (offset, 0))
7228 /* If we have any vector registers to save above the predicate registers,
7229 the offset of the vector register save slots needs to be a multiple
7230 of the vector size. This lets us use the immediate forms of LDR/STR
7231 (or LD1/ST1 for big-endian).
7233 A vector register is 8 times the size of a predicate register,
7234 and we need to save a maximum of 12 predicate registers, so the
7235 first vector register will be at either #1, MUL VL or #2, MUL VL.
7237 If we don't have any vector registers to save, and we know how
7238 big the predicate save area is, we can just round it up to the
7239 next 16-byte boundary. */
7240 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
7241 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7242 else
7244 if (known_le (offset, vector_save_size))
7245 offset = vector_save_size;
7246 else if (known_le (offset, vector_save_size * 2))
7247 offset = vector_save_size * 2;
7248 else
7249 gcc_unreachable ();
7253 /* If we need to save any SVE vector registers, add them next. */
7254 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
7255 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7256 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7258 frame.reg_offset[regno] = offset;
7259 offset += vector_save_size;
7262 /* OFFSET is now the offset of the hard frame pointer from the bottom
7263 of the callee save area. */
7264 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
7265 frame.below_hard_fp_saved_regs_size = offset;
7266 if (frame.emit_frame_chain)
7268 /* FP and LR are placed in the linkage record. */
7269 frame.reg_offset[R29_REGNUM] = offset;
7270 frame.wb_candidate1 = R29_REGNUM;
7271 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
7272 frame.wb_candidate2 = R30_REGNUM;
7273 offset += 2 * UNITS_PER_WORD;
7276 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
7277 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7279 frame.reg_offset[regno] = offset;
7280 if (frame.wb_candidate1 == INVALID_REGNUM)
7281 frame.wb_candidate1 = regno;
7282 else if (frame.wb_candidate2 == INVALID_REGNUM)
7283 frame.wb_candidate2 = regno;
7284 offset += UNITS_PER_WORD;
7287 poly_int64 max_int_offset = offset;
7288 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7289 bool has_align_gap = maybe_ne (offset, max_int_offset);
7291 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
7292 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
7294 /* If there is an alignment gap between integer and fp callee-saves,
7295 allocate the last fp register to it if possible. */
7296 if (regno == last_fp_reg
7297 && has_align_gap
7298 && known_eq (vector_save_size, 8)
7299 && multiple_p (offset, 16))
7301 frame.reg_offset[regno] = max_int_offset;
7302 break;
7305 frame.reg_offset[regno] = offset;
7306 if (frame.wb_candidate1 == INVALID_REGNUM)
7307 frame.wb_candidate1 = regno;
7308 else if (frame.wb_candidate2 == INVALID_REGNUM
7309 && frame.wb_candidate1 >= V0_REGNUM)
7310 frame.wb_candidate2 = regno;
7311 offset += vector_save_size;
7314 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
7316 frame.saved_regs_size = offset;
7318 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
7320 poly_int64 above_outgoing_args
7321 = aligned_upper_bound (varargs_and_saved_regs_size
7322 + get_frame_size (),
7323 STACK_BOUNDARY / BITS_PER_UNIT);
7325 frame.hard_fp_offset
7326 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
7328 /* Both these values are already aligned. */
7329 gcc_assert (multiple_p (crtl->outgoing_args_size,
7330 STACK_BOUNDARY / BITS_PER_UNIT));
7331 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
7333 frame.locals_offset = frame.saved_varargs_size;
7335 frame.initial_adjust = 0;
7336 frame.final_adjust = 0;
7337 frame.callee_adjust = 0;
7338 frame.sve_callee_adjust = 0;
7339 frame.callee_offset = 0;
7341 HOST_WIDE_INT max_push_offset = 0;
7342 if (frame.wb_candidate2 != INVALID_REGNUM)
7343 max_push_offset = 512;
7344 else if (frame.wb_candidate1 != INVALID_REGNUM)
7345 max_push_offset = 256;
7347 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
7348 HOST_WIDE_INT const_saved_regs_size;
7349 if (frame.frame_size.is_constant (&const_size)
7350 && const_size < max_push_offset
7351 && known_eq (frame.hard_fp_offset, const_size))
7353 /* Simple, small frame with no outgoing arguments:
7355 stp reg1, reg2, [sp, -frame_size]!
7356 stp reg3, reg4, [sp, 16] */
7357 frame.callee_adjust = const_size;
7359 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
7360 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
7361 && const_outgoing_args_size + const_saved_regs_size < 512
7362 /* We could handle this case even with outgoing args, provided
7363 that the number of args left us with valid offsets for all
7364 predicate and vector save slots. It's such a rare case that
7365 it hardly seems worth the effort though. */
7366 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
7367 && !(cfun->calls_alloca
7368 && frame.hard_fp_offset.is_constant (&const_fp_offset)
7369 && const_fp_offset < max_push_offset))
7371 /* Frame with small outgoing arguments:
7373 sub sp, sp, frame_size
7374 stp reg1, reg2, [sp, outgoing_args_size]
7375 stp reg3, reg4, [sp, outgoing_args_size + 16] */
7376 frame.initial_adjust = frame.frame_size;
7377 frame.callee_offset = const_outgoing_args_size;
7379 else if (saves_below_hard_fp_p
7380 && known_eq (frame.saved_regs_size,
7381 frame.below_hard_fp_saved_regs_size))
7383 /* Frame in which all saves are SVE saves:
7385 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
7386 save SVE registers relative to SP
7387 sub sp, sp, outgoing_args_size */
7388 frame.initial_adjust = (frame.hard_fp_offset
7389 + frame.below_hard_fp_saved_regs_size);
7390 frame.final_adjust = crtl->outgoing_args_size;
7392 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
7393 && const_fp_offset < max_push_offset)
7395 /* Frame with large outgoing arguments or SVE saves, but with
7396 a small local area:
7398 stp reg1, reg2, [sp, -hard_fp_offset]!
7399 stp reg3, reg4, [sp, 16]
7400 [sub sp, sp, below_hard_fp_saved_regs_size]
7401 [save SVE registers relative to SP]
7402 sub sp, sp, outgoing_args_size */
7403 frame.callee_adjust = const_fp_offset;
7404 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
7405 frame.final_adjust = crtl->outgoing_args_size;
7407 else
7409 /* Frame with large local area and outgoing arguments or SVE saves,
7410 using frame pointer:
7412 sub sp, sp, hard_fp_offset
7413 stp x29, x30, [sp, 0]
7414 add x29, sp, 0
7415 stp reg3, reg4, [sp, 16]
7416 [sub sp, sp, below_hard_fp_saved_regs_size]
7417 [save SVE registers relative to SP]
7418 sub sp, sp, outgoing_args_size */
7419 frame.initial_adjust = frame.hard_fp_offset;
7420 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
7421 frame.final_adjust = crtl->outgoing_args_size;
7424 /* Make sure the individual adjustments add up to the full frame size. */
7425 gcc_assert (known_eq (frame.initial_adjust
7426 + frame.callee_adjust
7427 + frame.sve_callee_adjust
7428 + frame.final_adjust, frame.frame_size));
7430 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
7432 /* We've decided not to associate any register saves with the initial
7433 stack allocation. */
7434 frame.wb_candidate1 = INVALID_REGNUM;
7435 frame.wb_candidate2 = INVALID_REGNUM;
7438 frame.laid_out = true;
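/* Illustrative example (an added sketch, not part of the original source):
   consider a function that needs a frame chain, saves x19 and x20, and has
   16 bytes of locals with no outgoing arguments.  With 8-byte GP saves the
   layout above gives reg_offset[x29] = 0, reg_offset[x30] = 8,
   reg_offset[x19] = 16, reg_offset[x20] = 24, saved_regs_size = 32,
   hard_fp_offset = 48 and frame_size = 48.  Since frame_size is a constant
   below 512 and equal to hard_fp_offset, the first case applies and
   callee_adjust = 48, i.e. the whole frame is allocated by the writeback
   push of the frame record:

     stp x29, x30, [sp, -48]!
     stp x19, x20, [sp, 16]  */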
7441 /* Return true if the register REGNO is saved on entry to
7442 the current function. */
7444 static bool
7445 aarch64_register_saved_on_entry (int regno)
7447 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
7450 /* Return the next register from REGNO up to LIMIT that the callee
7451 needs to save. */
7453 static unsigned
7454 aarch64_next_callee_save (unsigned regno, unsigned limit)
7456 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
7457 regno ++;
7458 return regno;
7461 /* Push the register number REGNO of mode MODE to the stack with write-back,
7462 adjusting the stack by ADJUSTMENT. */
7464 static void
7465 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
7466 HOST_WIDE_INT adjustment)
7468 rtx base_rtx = stack_pointer_rtx;
7469 rtx insn, reg, mem;
7471 reg = gen_rtx_REG (mode, regno);
7472 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
7473 plus_constant (Pmode, base_rtx, -adjustment));
7474 mem = gen_frame_mem (mode, mem);
7476 insn = emit_move_insn (mem, reg);
7477 RTX_FRAME_RELATED_P (insn) = 1;
7480 /* Generate and return an instruction to store the pair of registers
7481 REG and REG2 of mode MODE to location BASE with write-back adjusting
7482 the stack location BASE by ADJUSTMENT. */
7484 static rtx
7485 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
7486 HOST_WIDE_INT adjustment)
7488 switch (mode)
7490 case E_DImode:
7491 return gen_storewb_pairdi_di (base, base, reg, reg2,
7492 GEN_INT (-adjustment),
7493 GEN_INT (UNITS_PER_WORD - adjustment));
7494 case E_DFmode:
7495 return gen_storewb_pairdf_di (base, base, reg, reg2,
7496 GEN_INT (-adjustment),
7497 GEN_INT (UNITS_PER_WORD - adjustment));
7498 case E_TFmode:
7499 return gen_storewb_pairtf_di (base, base, reg, reg2,
7500 GEN_INT (-adjustment),
7501 GEN_INT (UNITS_PER_VREG - adjustment));
7502 default:
7503 gcc_unreachable ();
7507 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
7508 stack pointer by ADJUSTMENT. */
7510 static void
7511 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
7513 rtx_insn *insn;
7514 machine_mode mode = aarch64_reg_save_mode (regno1);
7516 if (regno2 == INVALID_REGNUM)
7517 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
7519 rtx reg1 = gen_rtx_REG (mode, regno1);
7520 rtx reg2 = gen_rtx_REG (mode, regno2);
7522 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
7523 reg2, adjustment));
7524 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
7525 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7526 RTX_FRAME_RELATED_P (insn) = 1;
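/* As an added illustrative note (not from the original source), a call such
   as aarch64_push_regs (R29_REGNUM, R30_REGNUM, 16) uses DImode saves and
   expands to the writeback store pair

     stp x29, x30, [sp, -16]!

   with the insn and both stores marked frame-related so that the unwinder
   sees the stack adjustment and both save slots.  */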
7529 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
7530 adjusting it by ADJUSTMENT afterwards. */
7532 static rtx
7533 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
7534 HOST_WIDE_INT adjustment)
7536 switch (mode)
7538 case E_DImode:
7539 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
7540 GEN_INT (UNITS_PER_WORD));
7541 case E_DFmode:
7542 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
7543 GEN_INT (UNITS_PER_WORD));
7544 case E_TFmode:
7545 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
7546 GEN_INT (UNITS_PER_VREG));
7547 default:
7548 gcc_unreachable ();
7552 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
7553 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
7554 into CFI_OPS. */
7556 static void
7557 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
7558 rtx *cfi_ops)
7560 machine_mode mode = aarch64_reg_save_mode (regno1);
7561 rtx reg1 = gen_rtx_REG (mode, regno1);
7563 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
7565 if (regno2 == INVALID_REGNUM)
7567 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
7568 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
7569 emit_move_insn (reg1, gen_frame_mem (mode, mem));
7571 else
7573 rtx reg2 = gen_rtx_REG (mode, regno2);
7574 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7575 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
7576 reg2, adjustment));
7580 /* Generate and return a store pair instruction of mode MODE to store
7581 register REG1 to MEM1 and register REG2 to MEM2. */
7583 static rtx
7584 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
7585 rtx reg2)
7587 switch (mode)
7589 case E_DImode:
7590 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
7592 case E_DFmode:
7593 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
7595 case E_TFmode:
7596 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
7598 case E_V4SImode:
7599 return gen_vec_store_pairv4siv4si (mem1, reg1, mem2, reg2);
7601 case E_V16QImode:
7602 return gen_vec_store_pairv16qiv16qi (mem1, reg1, mem2, reg2);
7604 default:
7605 gcc_unreachable ();
7609 /* Generate and return a load pair instruction of mode MODE to load register
7610 REG1 from MEM1 and register REG2 from MEM2. */
7612 static rtx
7613 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
7614 rtx mem2)
7616 switch (mode)
7618 case E_DImode:
7619 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
7621 case E_DFmode:
7622 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
7624 case E_TFmode:
7625 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
7627 case E_V4SImode:
7628 return gen_load_pairv4siv4si (reg1, mem1, reg2, mem2);
7630 default:
7631 gcc_unreachable ();
7635 /* Return TRUE if return address signing should be enabled for the current
7636 function, otherwise return FALSE. */
7638 bool
7639 aarch64_return_address_signing_enabled (void)
7641 /* This function should only be called after the frame has been laid out. */
7642 gcc_assert (cfun->machine->frame.laid_out);
7644 /* Turn return address signing off in any function that uses
7645 __builtin_eh_return. The address passed to __builtin_eh_return
7646 is not signed so either it has to be signed (with original sp)
7647 or the code path that uses it has to avoid authenticating it.
7648 Currently, eh return introduces a return-to-anywhere gadget no
7649 matter what we do here, since it uses ret with a user-provided
7650 address. An ideal fix would be to use an indirect branch, which
7651 can be protected with BTI j (to some extent). */
7652 if (crtl->calls_eh_return)
7653 return false;
7655 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
7656 if its LR is pushed onto stack. */
7657 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
7658 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
7659 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
7662 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
7663 bool
7664 aarch64_bti_enabled (void)
7666 return (aarch64_enable_bti == 1);
7669 /* The caller is going to use ST1D or LD1D to save or restore an SVE
7670 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
7671 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
7673 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
7674 or LD1D address
7676 (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
7677 if the variable isn't already nonnull
7679 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
7680 Handle this case using a temporary base register that is suitable for
7681 all offsets in that range. Use ANCHOR_REG as this base register if it
7682 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
7684 static inline void
7685 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
7686 rtx &anchor_reg, poly_int64 &offset,
7687 rtx &ptrue)
7689 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
7691 /* This is the maximum valid offset of the anchor from the base.
7692 Lower values would be valid too. */
7693 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
7694 if (!anchor_reg)
7696 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7697 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7698 gen_int_mode (anchor_offset, Pmode)));
7700 base_rtx = anchor_reg;
7701 offset -= anchor_offset;
7703 if (!ptrue)
7705 int pred_reg = cfun->machine->frame.spare_pred_reg;
7706 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
7707 CONSTM1_RTX (VNx16BImode));
7708 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
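/* Worked example (added for illustration, not from the original source):
   suppose an SVE register must be saved at OFFSET = 10 * GET_MODE_SIZE (MODE).
   That is outside the [-8, 7] * VL immediate range of ST1D/LD1D, so the code
   above materializes ANCHOR_REG = BASE_RTX + 16 * GET_MODE_SIZE (MODE) and
   rewrites the access as ANCHOR_REG - 6 * GET_MODE_SIZE (MODE), which is back
   in range.  Every offset in [8, 16] * GET_MODE_SIZE (MODE) maps into
   [-8, 0] * GET_MODE_SIZE (MODE) in the same way.  */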
7712 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
7713 is saved at BASE + OFFSET. */
7715 static void
7716 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
7717 rtx base, poly_int64 offset)
7719 rtx mem = gen_frame_mem (GET_MODE (reg),
7720 plus_constant (Pmode, base, offset));
7721 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
7724 /* Emit code to save the callee-saved registers from register number START
7725 to LIMIT to the stack at the location starting at offset START_OFFSET,
7726 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
7727 is true if the hard frame pointer has been set up. */
7729 static void
7730 aarch64_save_callee_saves (poly_int64 start_offset,
7731 unsigned start, unsigned limit, bool skip_wb,
7732 bool hard_fp_valid_p)
7734 rtx_insn *insn;
7735 unsigned regno;
7736 unsigned regno2;
7737 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7739 for (regno = aarch64_next_callee_save (start, limit);
7740 regno <= limit;
7741 regno = aarch64_next_callee_save (regno + 1, limit))
7743 rtx reg, mem;
7744 poly_int64 offset;
7745 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7747 if (skip_wb
7748 && (regno == cfun->machine->frame.wb_candidate1
7749 || regno == cfun->machine->frame.wb_candidate2))
7750 continue;
7752 if (cfun->machine->reg_is_wrapped_separately[regno])
7753 continue;
7755 machine_mode mode = aarch64_reg_save_mode (regno);
7756 reg = gen_rtx_REG (mode, regno);
7757 offset = start_offset + cfun->machine->frame.reg_offset[regno];
7758 rtx base_rtx = stack_pointer_rtx;
7759 poly_int64 sp_offset = offset;
7761 HOST_WIDE_INT const_offset;
7762 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7763 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7764 offset, ptrue);
7765 else if (GP_REGNUM_P (regno)
7766 && (!offset.is_constant (&const_offset) || const_offset >= 512))
7768 gcc_assert (known_eq (start_offset, 0));
7769 poly_int64 fp_offset
7770 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7771 if (hard_fp_valid_p)
7772 base_rtx = hard_frame_pointer_rtx;
7773 else
7775 if (!anchor_reg)
7777 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7778 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7779 gen_int_mode (fp_offset, Pmode)));
7781 base_rtx = anchor_reg;
7783 offset -= fp_offset;
7785 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7786 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
7788 if (!aarch64_sve_mode_p (mode)
7789 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7790 && !cfun->machine->reg_is_wrapped_separately[regno2]
7791 && known_eq (GET_MODE_SIZE (mode),
7792 cfun->machine->frame.reg_offset[regno2]
7793 - cfun->machine->frame.reg_offset[regno]))
7795 rtx reg2 = gen_rtx_REG (mode, regno2);
7796 rtx mem2;
7798 offset += GET_MODE_SIZE (mode);
7799 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7800 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7801 reg2));
7803 /* The first part of a frame-related parallel insn is
7804 always assumed to be relevant to the frame
7805 calculations; subsequent parts are only
7806 frame-related if explicitly marked. */
7807 if (aarch64_emit_cfi_for_reg_p (regno2))
7809 if (need_cfa_note_p)
7810 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7811 sp_offset + GET_MODE_SIZE (mode));
7812 else
7813 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7816 regno = regno2;
7818 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7820 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7821 need_cfa_note_p = true;
7823 else if (aarch64_sve_mode_p (mode))
7824 insn = emit_insn (gen_rtx_SET (mem, reg));
7825 else
7826 insn = emit_move_insn (mem, reg);
7828 RTX_FRAME_RELATED_P (insn) = frame_related_p;
7829 if (frame_related_p && need_cfa_note_p)
7830 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
7834 /* Emit code to restore the callee registers from register number START
7835 up to and including LIMIT. Restore from the stack offset START_OFFSET,
7836 skipping any write-back candidates if SKIP_WB is true. Write the
7837 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
7839 static void
7840 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
7841 unsigned limit, bool skip_wb, rtx *cfi_ops)
7843 unsigned regno;
7844 unsigned regno2;
7845 poly_int64 offset;
7846 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7848 for (regno = aarch64_next_callee_save (start, limit);
7849 regno <= limit;
7850 regno = aarch64_next_callee_save (regno + 1, limit))
7852 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7853 if (cfun->machine->reg_is_wrapped_separately[regno])
7854 continue;
7856 rtx reg, mem;
7858 if (skip_wb
7859 && (regno == cfun->machine->frame.wb_candidate1
7860 || regno == cfun->machine->frame.wb_candidate2))
7861 continue;
7863 machine_mode mode = aarch64_reg_save_mode (regno);
7864 reg = gen_rtx_REG (mode, regno);
7865 offset = start_offset + cfun->machine->frame.reg_offset[regno];
7866 rtx base_rtx = stack_pointer_rtx;
7867 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7868 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7869 offset, ptrue);
7870 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7872 if (!aarch64_sve_mode_p (mode)
7873 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7874 && !cfun->machine->reg_is_wrapped_separately[regno2]
7875 && known_eq (GET_MODE_SIZE (mode),
7876 cfun->machine->frame.reg_offset[regno2]
7877 - cfun->machine->frame.reg_offset[regno]))
7879 rtx reg2 = gen_rtx_REG (mode, regno2);
7880 rtx mem2;
7882 offset += GET_MODE_SIZE (mode);
7883 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7884 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7886 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7887 regno = regno2;
7889 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7890 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7891 else if (aarch64_sve_mode_p (mode))
7892 emit_insn (gen_rtx_SET (reg, mem));
7893 else
7894 emit_move_insn (reg, mem);
7895 if (frame_related_p)
7896 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
7900 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
7901 of MODE. */
7903 static inline bool
7904 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7906 HOST_WIDE_INT multiple;
7907 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7908 && IN_RANGE (multiple, -8, 7));
7911 /* Return true if OFFSET is a signed 6-bit value multiplied by the size
7912 of MODE. */
7914 static inline bool
7915 offset_6bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7917 HOST_WIDE_INT multiple;
7918 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7919 && IN_RANGE (multiple, -32, 31));
7922 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
7923 of MODE. */
7925 static inline bool
7926 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7928 HOST_WIDE_INT multiple;
7929 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7930 && IN_RANGE (multiple, 0, 63));
7933 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
7934 of MODE. */
7936 bool
7937 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7939 HOST_WIDE_INT multiple;
7940 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7941 && IN_RANGE (multiple, -64, 63));
7944 /* Return true if OFFSET is a signed 9-bit value. */
7946 bool
7947 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7948 poly_int64 offset)
7950 HOST_WIDE_INT const_offset;
7951 return (offset.is_constant (&const_offset)
7952 && IN_RANGE (const_offset, -256, 255));
7955 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
7956 of MODE. */
7958 static inline bool
7959 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7961 HOST_WIDE_INT multiple;
7962 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7963 && IN_RANGE (multiple, -256, 255));
7966 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7967 of MODE. */
7969 static inline bool
7970 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7972 HOST_WIDE_INT multiple;
7973 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7974 && IN_RANGE (multiple, 0, 4095));
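/* Worked example (added for illustration, not from the original source):
   for DImode, whose size is 8 bytes, these predicates accept byte offsets
   that are multiples of 8 within the corresponding scaled range, e.g.

     offset_12bit_unsigned_scaled_p (DImode, 32760)      -> true  (4095 * 8)
     offset_12bit_unsigned_scaled_p (DImode, 32768)      -> false (4096 * 8)
     offset_12bit_unsigned_scaled_p (DImode, 12)         -> false (not a multiple of 8)
     aarch64_offset_7bit_signed_scaled_p (DImode, -512)  -> true  (-64 * 8)

   matching the scaled immediate ranges of LDR/STR and LDP/STP.  */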
7977 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
7979 static sbitmap
7980 aarch64_get_separate_components (void)
7982 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7983 bitmap_clear (components);
7985 /* The registers we need saved to the frame. */
7986 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7987 if (aarch64_register_saved_on_entry (regno))
7989 /* Punt on saves and restores that use ST1D and LD1D. We could
7990 try to be smarter, but it would involve making sure that the
7991 spare predicate register itself is safe to use at the save
7992 and restore points. Also, when a frame pointer is being used,
7993 the slots are often out of reach of ST1D and LD1D anyway. */
7994 machine_mode mode = aarch64_reg_save_mode (regno);
7995 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7996 continue;
7998 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8000 /* If the register is saved in the first SVE save slot, we use
8001 it as a stack probe for -fstack-clash-protection. */
8002 if (flag_stack_clash_protection
8003 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
8004 && known_eq (offset, 0))
8005 continue;
8007 /* Get the offset relative to the register we'll use. */
8008 if (frame_pointer_needed)
8009 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8010 else
8011 offset += crtl->outgoing_args_size;
8013 /* Check that we can access the stack slot of the register with one
8014 direct load with no adjustments needed. */
8015 if (aarch64_sve_mode_p (mode)
8016 ? offset_9bit_signed_scaled_p (mode, offset)
8017 : offset_12bit_unsigned_scaled_p (mode, offset))
8018 bitmap_set_bit (components, regno);
8021 /* Don't mess with the hard frame pointer. */
8022 if (frame_pointer_needed)
8023 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
8025 /* If the spare predicate register used by big-endian SVE code
8026 is call-preserved, it must be saved in the main prologue
8027 before any saves that use it. */
8028 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
8029 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
8031 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8032 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8033 /* If registers have been chosen to be stored/restored with
8034 writeback, don't interfere with them, to avoid having to output explicit
8035 stack adjustment instructions. */
8036 if (reg2 != INVALID_REGNUM)
8037 bitmap_clear_bit (components, reg2);
8038 if (reg1 != INVALID_REGNUM)
8039 bitmap_clear_bit (components, reg1);
8041 bitmap_clear_bit (components, LR_REGNUM);
8042 bitmap_clear_bit (components, SP_REGNUM);
8044 return components;
8047 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
8049 static sbitmap
8050 aarch64_components_for_bb (basic_block bb)
8052 bitmap in = DF_LIVE_IN (bb);
8053 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
8054 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
8056 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
8057 bitmap_clear (components);
8059 /* Clobbered registers don't generate values in any meaningful sense,
8060 since nothing after the clobber can rely on their value. And we can't
8061 say that partially-clobbered registers are unconditionally killed,
8062 because whether they're killed or not depends on the mode of the
8063 value they're holding. Thus partially call-clobbered registers
8064 appear in neither the kill set nor the gen set.
8066 Check manually for any calls that clobber more of a register than the
8067 current function can. */
8068 function_abi_aggregator callee_abis;
8069 rtx_insn *insn;
8070 FOR_BB_INSNS (bb, insn)
8071 if (CALL_P (insn))
8072 callee_abis.note_callee_abi (insn_callee_abi (insn));
8073 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
8075 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
8076 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8077 if (!fixed_regs[regno]
8078 && !crtl->abi->clobbers_full_reg_p (regno)
8079 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
8080 || bitmap_bit_p (in, regno)
8081 || bitmap_bit_p (gen, regno)
8082 || bitmap_bit_p (kill, regno)))
8084 bitmap_set_bit (components, regno);
8086 /* If there is a callee-save at an adjacent offset, add it too
8087 to increase the use of LDP/STP. */
8088 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8089 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
8091 if (regno2 <= LAST_SAVED_REGNUM)
8093 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8094 if (regno < regno2
8095 ? known_eq (offset + 8, offset2)
8096 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
8097 bitmap_set_bit (components, regno2);
8101 return components;
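/* Illustrative example (added, not from the original source): if x19 is
   used in the block and saved at offset 16 (a multiple of 16), the code
   above also considers x20; if x20 is saved at offset 24 (= 16 + 8), it is
   added to the component set as well so that the pair can be handled with a
   single STP/LDP.  Conversely, for a register saved at offset 24, regno2 is
   the preceding register, which is added only if its slot is at offset 16.  */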
8104 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
8105 Nothing to do for aarch64. */
8107 static void
8108 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
8112 /* Return the next set bit in BMP from START onwards. Return the total number
8113 of bits in BMP if no set bit is found at or after START. */
8115 static unsigned int
8116 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
8118 unsigned int nbits = SBITMAP_SIZE (bmp);
8119 if (start == nbits)
8120 return start;
8122 gcc_assert (start < nbits);
8123 for (unsigned int i = start; i < nbits; i++)
8124 if (bitmap_bit_p (bmp, i))
8125 return i;
8127 return nbits;
8130 /* Do the work for aarch64_emit_prologue_components and
8131 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
8132 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
8133 for these components or the epilogue sequence. That is, it determines
8134 whether we should emit stores or loads and what kind of CFA notes to attach
8135 to the insns. Otherwise the logic for the two sequences is very
8136 similar. */
8138 static void
8139 aarch64_process_components (sbitmap components, bool prologue_p)
8141 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
8142 ? HARD_FRAME_POINTER_REGNUM
8143 : STACK_POINTER_REGNUM);
8145 unsigned last_regno = SBITMAP_SIZE (components);
8146 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
8147 rtx_insn *insn = NULL;
8149 while (regno != last_regno)
8151 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
8152 machine_mode mode = aarch64_reg_save_mode (regno);
8154 rtx reg = gen_rtx_REG (mode, regno);
8155 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
8156 if (frame_pointer_needed)
8157 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8158 else
8159 offset += crtl->outgoing_args_size;
8161 rtx addr = plus_constant (Pmode, ptr_reg, offset);
8162 rtx mem = gen_frame_mem (mode, addr);
8164 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
8165 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
8166 /* No more registers to handle after REGNO.
8167 Emit a single save/restore and exit. */
8168 if (regno2 == last_regno)
8170 insn = emit_insn (set);
8171 if (frame_related_p)
8173 RTX_FRAME_RELATED_P (insn) = 1;
8174 if (prologue_p)
8175 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
8176 else
8177 add_reg_note (insn, REG_CFA_RESTORE, reg);
8179 break;
8182 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
8183 /* The next register is not of the same class or its offset is not
8184 mergeable with the current one into a pair. */
8185 if (aarch64_sve_mode_p (mode)
8186 || !satisfies_constraint_Ump (mem)
8187 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
8188 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
8189 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
8190 GET_MODE_SIZE (mode)))
8192 insn = emit_insn (set);
8193 if (frame_related_p)
8195 RTX_FRAME_RELATED_P (insn) = 1;
8196 if (prologue_p)
8197 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
8198 else
8199 add_reg_note (insn, REG_CFA_RESTORE, reg);
8202 regno = regno2;
8203 continue;
8206 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
8208 /* REGNO2 can be saved/restored in a pair with REGNO. */
8209 rtx reg2 = gen_rtx_REG (mode, regno2);
8210 if (frame_pointer_needed)
8211 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
8212 else
8213 offset2 += crtl->outgoing_args_size;
8214 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
8215 rtx mem2 = gen_frame_mem (mode, addr2);
8216 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
8217 : gen_rtx_SET (reg2, mem2);
8219 if (prologue_p)
8220 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
8221 else
8222 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
8224 if (frame_related_p || frame_related2_p)
8226 RTX_FRAME_RELATED_P (insn) = 1;
8227 if (prologue_p)
8229 if (frame_related_p)
8230 add_reg_note (insn, REG_CFA_OFFSET, set);
8231 if (frame_related2_p)
8232 add_reg_note (insn, REG_CFA_OFFSET, set2);
8234 else
8236 if (frame_related_p)
8237 add_reg_note (insn, REG_CFA_RESTORE, reg);
8238 if (frame_related2_p)
8239 add_reg_note (insn, REG_CFA_RESTORE, reg2);
8243 regno = aarch64_get_next_set_bit (components, regno2 + 1);
8247 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
8249 static void
8250 aarch64_emit_prologue_components (sbitmap components)
8252 aarch64_process_components (components, true);
8255 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
8257 static void
8258 aarch64_emit_epilogue_components (sbitmap components)
8260 aarch64_process_components (components, false);
8263 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
8265 static void
8266 aarch64_set_handled_components (sbitmap components)
8268 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
8269 if (bitmap_bit_p (components, regno))
8270 cfun->machine->reg_is_wrapped_separately[regno] = true;
8273 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
8274 determine the probe offset for alloca. */
8276 static HOST_WIDE_INT
8277 aarch64_stack_clash_protection_alloca_probe_range (void)
8279 return STACK_CLASH_CALLER_GUARD;
8283 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
8284 registers. If POLY_SIZE is not large enough to require a probe this function
8285 will only adjust the stack. When allocating the stack space
8286 FRAME_RELATED_P is then used to indicate if the allocation is frame related.
8287 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
8288 arguments. If we are, we ensure that any allocation larger than the ABI
8289 defined buffer needs a probe so that the invariant of having a 1KB buffer is
8290 maintained.
8292 We emit barriers after each stack adjustment to prevent optimizations from
8293 breaking the invariant that we never drop the stack more than a page. This
8294 invariant is needed to make it easier to correctly handle asynchronous
8295 events: e.g. if we were to allow the stack to be dropped by more than a
8296 page and then emit multiple probes to catch up, and a signal arrived
8297 somewhere in between, the signal handler would not know the state of the
8298 stack and could make no assumptions about which pages have been probed. */
8300 static void
8301 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
8302 poly_int64 poly_size,
8303 bool frame_related_p,
8304 bool final_adjustment_p)
8306 HOST_WIDE_INT guard_size
8307 = 1 << param_stack_clash_protection_guard_size;
8308 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8309 HOST_WIDE_INT min_probe_threshold
8310 = (final_adjustment_p
8311 ? guard_used_by_caller
8312 : guard_size - guard_used_by_caller);
8313 /* When doing the final adjustment for the outgoing arguments, take into
8314 account any unprobed space there is above the current SP. There are
8315 two cases:
8317 - When saving SVE registers below the hard frame pointer, we force
8318 the lowest save to take place in the prologue before doing the final
8319 adjustment (i.e. we don't allow the save to be shrink-wrapped).
8320 This acts as a probe at SP, so there is no unprobed space.
8322 - When there are no SVE register saves, we use the store of the link
8323 register as a probe. We can't assume that LR was saved at position 0
8324 though, so treat any space below it as unprobed. */
8325 if (final_adjustment_p
8326 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
8328 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
8329 if (known_ge (lr_offset, 0))
8330 min_probe_threshold -= lr_offset.to_constant ();
8331 else
8332 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
8335 poly_int64 frame_size = cfun->machine->frame.frame_size;
8337 /* We should always have a positive probe threshold. */
8338 gcc_assert (min_probe_threshold > 0);
8340 if (flag_stack_clash_protection && !final_adjustment_p)
8342 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8343 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8344 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8346 if (known_eq (frame_size, 0))
8348 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
8350 else if (known_lt (initial_adjust + sve_callee_adjust,
8351 guard_size - guard_used_by_caller)
8352 && known_lt (final_adjust, guard_used_by_caller))
8354 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
8358 /* If SIZE is not large enough to require probing, just adjust the stack and
8359 exit. */
8360 if (known_lt (poly_size, min_probe_threshold)
8361 || !flag_stack_clash_protection)
8363 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
8364 return;
8367 HOST_WIDE_INT size;
8368 /* Handle the SVE non-constant case first. */
8369 if (!poly_size.is_constant (&size))
8371 if (dump_file)
8373 fprintf (dump_file, "Stack clash SVE prologue: ");
8374 print_dec (poly_size, dump_file);
8375 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
8378 /* First calculate the amount of bytes we're actually spilling. */
8379 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
8380 poly_size, temp1, temp2, false, true);
8382 rtx_insn *insn = get_last_insn ();
8384 if (frame_related_p)
8386 /* This is done to provide unwinding information for the stack
8387 adjustments we're about to do; however, to prevent the optimizers
8388 from removing the R11 move and leaving the CFA note (which would be
8389 very wrong) we tie the old and new stack pointer together.
8390 The tie will expand to nothing but the optimizers will not touch
8391 the instruction. */
8392 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
8393 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
8394 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
8396 /* We want the CFA independent of the stack pointer for the
8397 duration of the loop. */
8398 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
8399 RTX_FRAME_RELATED_P (insn) = 1;
8402 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
8403 rtx guard_const = gen_int_mode (guard_size, Pmode);
8405 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
8406 stack_pointer_rtx, temp1,
8407 probe_const, guard_const));
8409 /* Now reset the CFA register if needed. */
8410 if (frame_related_p)
8412 add_reg_note (insn, REG_CFA_DEF_CFA,
8413 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
8414 gen_int_mode (poly_size, Pmode)));
8415 RTX_FRAME_RELATED_P (insn) = 1;
8418 return;
8421 if (dump_file)
8422 fprintf (dump_file,
8423 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
8424 " bytes, probing will be required.\n", size);
8426 /* Round SIZE down to the nearest multiple of GUARD_SIZE, and calculate the
8427 residual as the difference between the original size and the rounded
8428 size. */
8429 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
8430 HOST_WIDE_INT residual = size - rounded_size;
8432 /* We can handle a small number of allocations/probes inline. Otherwise
8433 punt to a loop. */
8434 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
8436 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
8438 aarch64_sub_sp (NULL, temp2, guard_size, true);
8439 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
8440 guard_used_by_caller));
8441 emit_insn (gen_blockage ());
8443 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
8445 else
8447 /* Compute the ending address. */
8448 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
8449 temp1, NULL, false, true);
8450 rtx_insn *insn = get_last_insn ();
8452 /* For the initial allocation, we don't have a frame pointer
8453 set up, so we always need CFI notes. If we're doing the
8454 final allocation, then we may have a frame pointer, in which
8455 case it is the CFA, otherwise we need CFI notes.
8457 We can determine which allocation we are doing by looking at
8458 the value of FRAME_RELATED_P since the final allocations are not
8459 frame related. */
8460 if (frame_related_p)
8462 /* We want the CFA independent of the stack pointer for the
8463 duration of the loop. */
8464 add_reg_note (insn, REG_CFA_DEF_CFA,
8465 plus_constant (Pmode, temp1, rounded_size));
8466 RTX_FRAME_RELATED_P (insn) = 1;
8469 /* This allocates and probes the stack. Note that this re-uses some of
8470 the existing Ada stack protection code. However, we are guaranteed not
8471 to enter the non-loop or residual branches of that code.
8473 The non-loop part won't be entered because if our allocation amount
8474 doesn't require a loop, the case above would handle it.
8476 The residual branch won't be entered because TEMP1 is a multiple of
8477 the allocation size, so the residual will always be 0. As such, the only
8478 part we are actually using from that code is the loop setup. The
8479 actual probing is done in aarch64_output_probe_stack_range. */
8480 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
8481 stack_pointer_rtx, temp1));
8483 /* Now reset the CFA register if needed. */
8484 if (frame_related_p)
8486 add_reg_note (insn, REG_CFA_DEF_CFA,
8487 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
8488 RTX_FRAME_RELATED_P (insn) = 1;
8491 emit_insn (gen_blockage ());
8492 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
8495 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
8496 be probed. This maintains the requirement that each page is probed at
8497 least once. For initial probing we probe only if the allocation is
8498 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
8499 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
8500 GUARD_SIZE. This ensures that for any allocation that is large enough to
8501 trigger a probe here, we'll have at least one, and if an allocation is not
8502 large enough for this code to emit anything for it, the page would have been
8503 probed by the saving of FP/LR, either by this function or any callees. If
8504 we don't have any callees then we won't have more stack adjustments and so
8505 are still safe. */
8506 if (residual)
8508 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
8509 /* If we're doing final adjustments, and we've done any full page
8510 allocations then any residual needs to be probed. */
8511 if (final_adjustment_p && rounded_size != 0)
8512 min_probe_threshold = 0;
8513 /* If doing a small final adjustment, we always probe at offset 0.
8514 This is done to avoid issues when LR is not at position 0 or when
8515 the final adjustment is smaller than the probing offset. */
8516 else if (final_adjustment_p && rounded_size == 0)
8517 residual_probe_offset = 0;
8519 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
8520 if (residual >= min_probe_threshold)
8522 if (dump_file)
8523 fprintf (dump_file,
8524 "Stack clash AArch64 prologue residuals: "
8525 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
8526 "\n", residual);
8528 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
8529 residual_probe_offset));
8530 emit_insn (gen_blockage ());
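/* Worked example (added for illustration, not from the original source):
   assume the default 64KiB guard and the 1KiB caller-reserved buffer
   (STACK_CLASH_CALLER_GUARD).  For the initial allocation,
   min_probe_threshold = 65536 - 1024 = 64512, so a constant allocation of
   200000 bytes needs probing: rounded_size = ROUND_DOWN (200000, 65536)
   = 196608 is allocated and probed one guard page at a time (inline for a
   small number of pages, otherwise via the probe loop), while the residual
   of 3392 bytes is below the threshold and is simply allocated, relying on
   the subsequent FP/LR (or lowest SVE) save as the implicit probe.  For the
   final (outgoing arguments) adjustment the threshold is at most the 1KiB
   buffer instead.  */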
8535 /* Return 1 if the register is used by the epilogue. We need to say the
8536 return register is used, but only after epilogue generation is complete.
8537 Note that in the case of sibcalls, the values "used by the epilogue" are
8538 considered live at the start of the called function.
8540 For SIMD functions we need to return 1 for FP registers that are saved and
8541 restored by a function but are not zero in call_used_regs. If we do not do
8542 this, optimizations may remove the restore of the register. */
8545 aarch64_epilogue_uses (int regno)
8547 if (epilogue_completed)
8549 if (regno == LR_REGNUM)
8550 return 1;
8552 return 0;
8555 /* AArch64 stack frames generated by this compiler look like:
8557 +-------------------------------+
8559 | incoming stack arguments |
8561 +-------------------------------+
8562 | | <-- incoming stack pointer (aligned)
8563 | callee-allocated save area |
8564 | for register varargs |
8566 +-------------------------------+
8567 | local variables | <-- frame_pointer_rtx
8569 +-------------------------------+
8570 | padding | \
8571 +-------------------------------+ |
8572 | callee-saved registers | | frame.saved_regs_size
8573 +-------------------------------+ |
8574 | LR' | |
8575 +-------------------------------+ |
8576 | FP' | |
8577 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
8578 | SVE vector registers | | \
8579 +-------------------------------+ | | below_hard_fp_saved_regs_size
8580 | SVE predicate registers | / /
8581 +-------------------------------+
8582 | dynamic allocation |
8583 +-------------------------------+
8584 | padding |
8585 +-------------------------------+
8586 | outgoing stack arguments | <-- arg_pointer
8588 +-------------------------------+
8589 | | <-- stack_pointer_rtx (aligned)
8591 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
8592 but leave frame_pointer_rtx and hard_frame_pointer_rtx
8593 unchanged.
8595 By default for stack-clash we assume the guard is at least 64KB, but this
8596 value is configurable to either 4KB or 64KB. We also force the guard size to
8597 be the same as the probing interval and both values are kept in sync.
8599 With those assumptions the callee can allocate up to 63KB (or 3KB depending
8600 on the guard size) of stack space without probing.
8602 When probing is needed, we emit a probe at the start of the prologue
8603 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
8605 We have to track how much space has been allocated; the only stores
8606 to the stack that we treat as implicit probes are the FP/LR stores.
8608 For outgoing arguments we probe if the size is larger than 1KB, such that
8609 the ABI specified buffer is maintained for the next callee.
8611 The following registers are reserved during frame layout and should not be
8612 used for any other purpose:
8614 - r11: Used by stack clash protection when SVE is enabled, and also
8615 as an anchor register when saving and restoring registers
8616 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
8617 - r14 and r15: Used for speculation tracking.
8618 - r16(IP0), r17(IP1): Used by indirect tailcalls.
8619 - r30(LR), r29(FP): Used by standard frame layout.
8621 These registers must be avoided in frame layout related code unless the
8622 explicit intention is to interact with one of the features listed above. */
8624 /* Generate the prologue instructions for entry into a function.
8625 Establish the stack frame by decreasing the stack pointer with a
8626 properly calculated size and, if necessary, create a frame record
8627 filled with the values of LR and previous frame pointer. The
8628 current FP is also set up if it is in use. */
8630 void
8631 aarch64_expand_prologue (void)
8633 poly_int64 frame_size = cfun->machine->frame.frame_size;
8634 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8635 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8636 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8637 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8638 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8639 poly_int64 below_hard_fp_saved_regs_size
8640 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8641 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8642 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8643 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
8644 rtx_insn *insn;
8646 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
8648 /* Fold the SVE allocation into the initial allocation.
8649 We don't do this in aarch64_layout_frame to avoid pessimizing
8650 the epilogue code. */
8651 initial_adjust += sve_callee_adjust;
8652 sve_callee_adjust = 0;
8655 /* Sign return address for functions. */
8656 if (aarch64_return_address_signing_enabled ())
8658 switch (aarch64_ra_sign_key)
8660 case AARCH64_KEY_A:
8661 insn = emit_insn (gen_paciasp ());
8662 break;
8663 case AARCH64_KEY_B:
8664 insn = emit_insn (gen_pacibsp ());
8665 break;
8666 default:
8667 gcc_unreachable ();
8669 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8670 RTX_FRAME_RELATED_P (insn) = 1;
8673 if (flag_stack_usage_info)
8674 current_function_static_stack_size = constant_lower_bound (frame_size);
8676 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
8678 if (crtl->is_leaf && !cfun->calls_alloca)
8680 if (maybe_gt (frame_size, PROBE_INTERVAL)
8681 && maybe_gt (frame_size, get_stack_check_protect ()))
8682 aarch64_emit_probe_stack_range (get_stack_check_protect (),
8683 (frame_size
8684 - get_stack_check_protect ()));
8686 else if (maybe_gt (frame_size, 0))
8687 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
8690 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8691 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8693 /* In theory we should never have both an initial adjustment
8694 and a callee save adjustment. Verify that is the case since the
8695 code below does not handle it for -fstack-clash-protection. */
8696 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
8698 /* Will only probe if the initial adjustment is larger than the guard
8699 less the amount of the guard reserved for use by the caller's
8700 outgoing args. */
8701 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
8702 true, false);
8704 if (callee_adjust != 0)
8705 aarch64_push_regs (reg1, reg2, callee_adjust);
8707 /* The offset of the frame chain record (if any) from the current SP. */
8708 poly_int64 chain_offset = (initial_adjust + callee_adjust
8709 - cfun->machine->frame.hard_fp_offset);
8710 gcc_assert (known_ge (chain_offset, 0));
8712 /* The offset of the bottom of the save area from the current SP. */
8713 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
8715 if (emit_frame_chain)
8717 if (callee_adjust == 0)
8719 reg1 = R29_REGNUM;
8720 reg2 = R30_REGNUM;
8721 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
8722 false, false);
8724 else
8725 gcc_assert (known_eq (chain_offset, 0));
8726 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
8727 stack_pointer_rtx, chain_offset,
8728 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
8729 if (frame_pointer_needed && !frame_size.is_constant ())
8731 /* Variable-sized frames need to describe the save slot
8732 address using DW_CFA_expression rather than DW_CFA_offset.
8733 This means that, without taking further action, the
8734 locations of the registers that we've already saved would
8735 remain based on the stack pointer even after we redefine
8736 the CFA based on the frame pointer. We therefore need new
8737 DW_CFA_expressions to re-express the save slots with addresses
8738 based on the frame pointer. */
8739 rtx_insn *insn = get_last_insn ();
8740 gcc_assert (RTX_FRAME_RELATED_P (insn));
8742 /* Add an explicit CFA definition if this was previously
8743 implicit. */
8744 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
8746 rtx src = plus_constant (Pmode, stack_pointer_rtx,
8747 callee_offset);
8748 add_reg_note (insn, REG_CFA_ADJUST_CFA,
8749 gen_rtx_SET (hard_frame_pointer_rtx, src));
8752 /* Change the save slot expressions for the registers that
8753 we've already saved. */
8754 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
8755 hard_frame_pointer_rtx, UNITS_PER_WORD);
8756 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
8757 hard_frame_pointer_rtx, 0);
8759 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
8762 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
8763 callee_adjust != 0 || emit_frame_chain,
8764 emit_frame_chain);
8765 if (maybe_ne (sve_callee_adjust, 0))
8767 gcc_assert (!flag_stack_clash_protection
8768 || known_eq (initial_adjust, 0));
8769 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
8770 sve_callee_adjust,
8771 !frame_pointer_needed, false);
8772 saved_regs_offset += sve_callee_adjust;
8774 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
8775 false, emit_frame_chain);
8776 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
8777 callee_adjust != 0 || emit_frame_chain,
8778 emit_frame_chain);
8780 /* We may need to probe the final adjustment if it is larger than the guard
8781 that is assumed by the callee. */
8782 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
8783 !frame_pointer_needed, true);
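/* Illustrative expansion (added, not from the original source): for the
   small-frame layout example given after aarch64_layout_frame above
   (callee_adjust == 48, a frame chain, and x19/x20 as the remaining callee
   saves), this function would emit roughly

     stp x29, x30, [sp, -48]!
     mov x29, sp
     stp x19, x20, [sp, 16]

   with no separate initial, SVE or final adjustment, since the single
   writeback push covers the whole frame.  */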
8786 /* Return TRUE if we can use a simple_return insn.
8788 This function checks whether the callee saved stack is empty, which
8789 means no restore actions are needed. The pro_and_epilogue pass will use
8790 this to check whether the shrink-wrapping optimization is feasible. */
8792 bool
8793 aarch64_use_return_insn_p (void)
8795 if (!reload_completed)
8796 return false;
8798 if (crtl->profile)
8799 return false;
8801 return known_eq (cfun->machine->frame.frame_size, 0);
8804 /* Generate the epilogue instructions for returning from a function.
8805 This is almost exactly the reverse of the prologue sequence, except
8806 that we need to insert barriers to avoid scheduling loads that read
8807 from a deallocated stack, and we optimize the unwind records by
8808 emitting them all together if possible. */
8809 void
8810 aarch64_expand_epilogue (bool for_sibcall)
8812 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8813 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8814 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8815 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8816 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8817 poly_int64 below_hard_fp_saved_regs_size
8818 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8819 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8820 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8821 rtx cfi_ops = NULL;
8822 rtx_insn *insn;
8823 /* A stack clash protection prologue may not have left EP0_REGNUM or
8824 EP1_REGNUM in a usable state. The same is true for allocations
8825 with an SVE component, since we then need both temporary registers
8826 for each allocation. For stack clash we are in a usable state if
8827 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
8828 HOST_WIDE_INT guard_size
8829 = 1 << param_stack_clash_protection_guard_size;
8830 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8832 /* We can re-use the registers when:
8834 (a) the deallocation amount is the same as the corresponding
8835 allocation amount (which is false if we combine the initial
8836 and SVE callee save allocations in the prologue); and
8838 (b) the allocation amount doesn't need a probe (which is false
8839 if the amount is guard_size - guard_used_by_caller or greater).
8841 In such situations the register should remain live with the correct
8842 value. */
8843 bool can_inherit_p = (initial_adjust.is_constant ()
8844 && final_adjust.is_constant ()
8845 && (!flag_stack_clash_protection
8846 || (known_lt (initial_adjust,
8847 guard_size - guard_used_by_caller)
8848 && known_eq (sve_callee_adjust, 0))));
8850 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
8851 bool need_barrier_p
8852 = maybe_ne (get_frame_size ()
8853 + cfun->machine->frame.saved_varargs_size, 0);
8855 /* Emit a barrier to prevent loads from a deallocated stack. */
8856 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8857 || cfun->calls_alloca
8858 || crtl->calls_eh_return)
8860 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8861 need_barrier_p = false;
8864 /* Restore the stack pointer from the frame pointer if it may not
8865 be the same as the stack pointer. */
8866 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8867 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8868 if (frame_pointer_needed
8869 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
8870 /* If writeback is used when restoring callee-saves, the CFA
8871 is restored on the instruction doing the writeback. */
8872 aarch64_add_offset (Pmode, stack_pointer_rtx,
8873 hard_frame_pointer_rtx,
8874 -callee_offset - below_hard_fp_saved_regs_size,
8875 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
8876 else
8877 /* The case where we need to re-use the register here is very rare, so
8878 avoid the complicated condition and just always emit a move if the
8879 immediate doesn't fit. */
8880 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
8882 /* Restore the vector registers before the predicate registers,
8883 so that we can use P4 as a temporary for big-endian SVE frames. */
8884 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8885 callee_adjust != 0, &cfi_ops);
8886 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8887 false, &cfi_ops);
8888 if (maybe_ne (sve_callee_adjust, 0))
8889 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8890 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8891 R0_REGNUM, R30_REGNUM,
8892 callee_adjust != 0, &cfi_ops);
8894 if (need_barrier_p)
8895 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8897 if (callee_adjust != 0)
8898 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8900 /* If we have no register restore information, the CFA must have been
8901 defined in terms of the stack pointer since the end of the prologue. */
8902 gcc_assert (cfi_ops || !frame_pointer_needed);
8904 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
8906 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
8907 insn = get_last_insn ();
8908 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8909 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
8910 RTX_FRAME_RELATED_P (insn) = 1;
8911 cfi_ops = NULL;
8914 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
8915 restrict the emit_move optimization to leaf functions. */
8916 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8917 (!can_inherit_p || !crtl->is_leaf
8918 || df_regs_ever_live_p (EP0_REGNUM)));
8920 if (cfi_ops)
8922 /* Emit delayed restores and reset the CFA to be SP. */
8923 insn = get_last_insn ();
8924 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8925 REG_NOTES (insn) = cfi_ops;
8926 RTX_FRAME_RELATED_P (insn) = 1;
8929 /* We prefer to emit the combined return/authenticate instruction RETAA;
8930 however, there are three cases in which we must instead emit an explicit
8931 authentication instruction.
8933 1) Sibcalls don't return in a normal way, so if we're about to call one
8934 we must authenticate.
8936 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8937 generating code for !TARGET_ARMV8_3 we can't use it and must
8938 explicitly authenticate.
8940 3) On an eh_return path we make extra stack adjustments to update the
8941 canonical frame address to be the exception handler's CFA. We want
8942 to authenticate using the CFA of the function which calls eh_return.
8944 if (aarch64_return_address_signing_enabled ()
8945 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
8947 switch (aarch64_ra_sign_key)
8949 case AARCH64_KEY_A:
8950 insn = emit_insn (gen_autiasp ());
8951 break;
8952 case AARCH64_KEY_B:
8953 insn = emit_insn (gen_autibsp ());
8954 break;
8955 default:
8956 gcc_unreachable ();
8958 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8959 RTX_FRAME_RELATED_P (insn) = 1;
8962 /* Stack adjustment for exception handler. */
8963 if (crtl->calls_eh_return && !for_sibcall)
8965 /* We need to unwind the stack by the offset computed by
8966 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
8967 to be SP; letting the CFA move during this adjustment
8968 is just as correct as retaining the CFA from the body
8969 of the function. Therefore, do nothing special. */
8970 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
8973 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8974 if (!for_sibcall)
8975 emit_jump_insn (ret_rtx);
8978 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
8979 normally or return to a previous frame after unwinding.
8981 An EH return uses a single shared return sequence. The epilogue is
8982 exactly like a normal epilogue except that it has an extra input
8983 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8984 that must be applied after the frame has been destroyed. An extra label
8985 is inserted before the epilogue which initializes this register to zero,
8986 and this is the entry point for a normal return.
8988 An actual EH return updates the return address, initializes the stack
8989 adjustment and jumps directly into the epilogue (bypassing the zeroing
8990 of the adjustment). Since the return address is typically saved on the
8991 stack when a function makes a call, the saved LR must be updated outside
8992 the epilogue.
8994 This poses problems as the store is generated well before the epilogue,
8995 so the offset of LR is not known yet. Also optimizations will remove the
8996 store as it appears dead, even after the epilogue is generated (as the
8997 base or offset for loading LR is different in many cases).
8999 To avoid these problems this implementation forces the frame pointer
9000 in eh_return functions so that the location of LR is fixed and known early.
9001 It also marks the store volatile, so no optimization is permitted to
9002 remove the store. */
9004 aarch64_eh_return_handler_rtx (void)
9006 rtx tmp = gen_frame_mem (Pmode,
9007 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
9009 /* Mark the store volatile, so no optimization is permitted to remove it. */
9010 MEM_VOLATILE_P (tmp) = true;
9011 return tmp;
9014 /* Output code to add DELTA to the first argument, and then jump
9015 to FUNCTION. Used for C++ multiple inheritance. */
9016 static void
9017 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
9018 HOST_WIDE_INT delta,
9019 HOST_WIDE_INT vcall_offset,
9020 tree function)
9022 /* The this pointer is always in x0. Note that this differs from
9023 Arm, where the this pointer may be bumped to r1 if r0 is required
9024 to return a pointer to an aggregate. On AArch64 a result value
9025 pointer will be in x8. */
9026 int this_regno = R0_REGNUM;
9027 rtx this_rtx, temp0, temp1, addr, funexp;
9028 rtx_insn *insn;
9029 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
9031 if (aarch64_bti_enabled ())
9032 emit_insn (gen_bti_c());
9034 reload_completed = 1;
9035 emit_note (NOTE_INSN_PROLOGUE_END);
9037 this_rtx = gen_rtx_REG (Pmode, this_regno);
9038 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
9039 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
9041 if (vcall_offset == 0)
9042 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
9043 else
9045 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
9047 addr = this_rtx;
9048 if (delta != 0)
9050 if (delta >= -256 && delta < 256)
9051 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
9052 plus_constant (Pmode, this_rtx, delta));
9053 else
9054 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
9055 temp1, temp0, false);
9058 if (Pmode == ptr_mode)
9059 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
9060 else
9061 aarch64_emit_move (temp0,
9062 gen_rtx_ZERO_EXTEND (Pmode,
9063 gen_rtx_MEM (ptr_mode, addr)));
9065 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
9066 addr = plus_constant (Pmode, temp0, vcall_offset);
9067 else
9069 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
9070 Pmode);
9071 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
9074 if (Pmode == ptr_mode)
9075 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
9076 else
9077 aarch64_emit_move (temp1,
9078 gen_rtx_SIGN_EXTEND (Pmode,
9079 gen_rtx_MEM (ptr_mode, addr)));
9081 emit_insn (gen_add2_insn (this_rtx, temp1));
9084 /* Generate a tail call to the target function. */
9085 if (!TREE_USED (function))
9087 assemble_external (function);
9088 TREE_USED (function) = 1;
9090 funexp = XEXP (DECL_RTL (function), 0);
9091 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
9092 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
9093 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
9094 SIBLING_CALL_P (insn) = 1;
9096 insn = get_insns ();
9097 shorten_branches (insn);
9099 assemble_start_function (thunk, fnname);
9100 final_start_function (insn, file, 1);
9101 final (insn, file, 1);
9102 final_end_function ();
9103 assemble_end_function (thunk, fnname);
9105 /* Stop pretending to be a post-reload pass. */
9106 reload_completed = 0;
9109 static bool
9110 aarch64_tls_referenced_p (rtx x)
9112 if (!TARGET_HAVE_TLS)
9113 return false;
9114 subrtx_iterator::array_type array;
9115 FOR_EACH_SUBRTX (iter, array, x, ALL)
9117 const_rtx x = *iter;
9118 if (SYMBOL_REF_P (x) && SYMBOL_REF_TLS_MODEL (x) != 0)
9119 return true;
9120 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
9121 TLS offsets, not real symbol references. */
9122 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
9123 iter.skip_subrtxes ();
9125 return false;
9129 /* Return true if val can be encoded as a 12-bit unsigned immediate with
9130 a left shift of 0 or 12 bits. */
9131 bool
9132 aarch64_uimm12_shift (HOST_WIDE_INT val)
9134 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
9135 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
9139 /* Returns the nearest value to VAL that will fit as a 12-bit unsigned immediate
9140 that can be created with a left shift of 0 or 12. */
9141 static HOST_WIDE_INT
9142 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
9144 /* Check to see if the value fits in 24 bits, as that is the maximum we can
9145 handle correctly. */
9146 gcc_assert ((val & 0xffffff) == val);
9148 if (((val & 0xfff) << 0) == val)
9149 return val;
9151 return val & (0xfff << 12);
9154 /* Return true if val is an immediate that can be loaded into a
9155 register by a MOVZ instruction. */
9156 static bool
9157 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
9159 if (GET_MODE_SIZE (mode) > 4)
9161 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
9162 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
9163 return 1;
9165 else
9167 /* Ignore sign extension. */
9168 val &= (HOST_WIDE_INT) 0xffffffff;
9170 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
9171 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
9174 /* Test whether:
9176 X = (X & AND_VAL) | IOR_VAL;
9178 can be implemented using:
9180 MOVK X, #(IOR_VAL >> shift), LSL #shift
9182 Return the shift if so, otherwise return -1. */
9184 aarch64_movk_shift (const wide_int_ref &and_val,
9185 const wide_int_ref &ior_val)
9187 unsigned int precision = and_val.get_precision ();
9188 unsigned HOST_WIDE_INT mask = 0xffff;
9189 for (unsigned int shift = 0; shift < precision; shift += 16)
9191 if (and_val == ~mask && (ior_val & mask) == ior_val)
9192 return shift;
9193 mask <<= 16;
9195 return -1;
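/* Illustrative sketch, not part of the original source: the same scan
   specialised to 64-bit values without the wide_int machinery.  For
   and_val == 0xffffffff0000ffff and ior_val == 0x12340000 it returns
   16, matching MOVK Xd, #0x1234, LSL #16.  */
static int
sketch_movk_shift64 (unsigned long long and_val, unsigned long long ior_val)
{
  unsigned long long mask = 0xffff;
  for (int shift = 0; shift < 64; shift += 16)
    {
      if (and_val == ~mask && (ior_val & mask) == ior_val)
        return shift;
      mask <<= 16;
    }
  return -1;
}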
9198 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
9199 64-bit (DImode) integer. */
9201 static unsigned HOST_WIDE_INT
9202 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
9204 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
9205 while (size < 64)
9207 val &= (HOST_WIDE_INT_1U << size) - 1;
9208 val |= val << size;
9209 size *= 2;
9211 return val;
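/* Illustrative sketch, not part of the original source: the same
   doubling loop written for a plain element width in bits, e.g.
   replicating the 16-bit element 0x00f0 yields 0x00f000f000f000f0.  */
static unsigned long long
sketch_replicate_bitmask (unsigned long long val, unsigned int elt_bits)
{
  while (elt_bits < 64)
    {
      val &= (1ull << elt_bits) - 1;   /* Keep one element's worth.  */
      val |= val << elt_bits;          /* Double the replicated width.  */
      elt_bits *= 2;
    }
  return val;
}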
9214 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
9216 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
9218 0x0000000100000001ull,
9219 0x0001000100010001ull,
9220 0x0101010101010101ull,
9221 0x1111111111111111ull,
9222 0x5555555555555555ull,
9226 /* Return true if val is a valid bitmask immediate. */
9228 bool
9229 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
9231 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
9232 int bits;
9234 /* Check for a single sequence of one bits and return quickly if so.
9235 The special cases of all ones and all zeroes return false. */
9236 val = aarch64_replicate_bitmask_imm (val_in, mode);
9237 tmp = val + (val & -val);
9239 if (tmp == (tmp & -tmp))
9240 return (val + 1) > 1;
9242 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
9243 if (mode == SImode)
9244 val = (val << 32) | (val & 0xffffffff);
9246 /* Invert if the immediate doesn't start with a zero bit - this means we
9247 only need to search for sequences of one bits. */
9248 if (val & 1)
9249 val = ~val;
9251 /* Find the first set bit and set tmp to val with the first sequence of one
9252 bits removed. Return success if there is a single sequence of ones. */
9253 first_one = val & -val;
9254 tmp = val & (val + first_one);
9256 if (tmp == 0)
9257 return true;
9259 /* Find the next set bit and compute the difference in bit position. */
9260 next_one = tmp & -tmp;
9261 bits = clz_hwi (first_one) - clz_hwi (next_one);
9262 mask = val ^ tmp;
9264 /* Check the bit position difference is a power of 2, and that the first
9265 sequence of one bits fits within 'bits' bits. */
9266 if ((mask >> bits) != 0 || bits != (bits & -bits))
9267 return false;
9269 /* Check the sequence of one bits is repeated 64/bits times. */
9270 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
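/* Illustrative cross-check, not part of the original source: a
   brute-force statement of the same property.  A bitmask (logical)
   immediate is a contiguous run of ones inside an element of width
   2, 4, 8, 16, 32 or 64 bits, rotated within the element and then
   replicated across the register; all-zeros and all-ones are
   excluded.  E.g. 0x00ff00ff00ff00ff is accepted, 0x1234 is not.  */
static int
sketch_is_bitmask_imm64 (unsigned long long val)
{
  for (unsigned int width = 2; width <= 64; width *= 2)
    {
      unsigned long long wmask
        = width == 64 ? ~0ull : (1ull << width) - 1;
      for (unsigned int ones = 1; ones < width; ones++)
        for (unsigned int rot = 0; rot < width; rot++)
          {
            unsigned long long elt = (1ull << ones) - 1;
            if (rot)
              elt = ((elt << rot) | (elt >> (width - rot))) & wmask;
            /* Replicate the rotated element across all 64 bits.  */
            unsigned long long rep = 0;
            for (unsigned int pos = 0; pos < 64; pos += width)
              rep |= elt << pos;
            if (rep == val)
              return 1;
          }
    }
  return 0;
}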
9273 /* Create a mask of ones covering the lowest to highest bits set in VAL_IN.
9274 Assumed precondition: VAL_IN is not zero. */
9276 unsigned HOST_WIDE_INT
9277 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
9279 int lowest_bit_set = ctz_hwi (val_in);
9280 int highest_bit_set = floor_log2 (val_in);
9281 gcc_assert (val_in != 0);
9283 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
9284 (HOST_WIDE_INT_1U << lowest_bit_set));
9287 /* Create a constant in which all bits outside the span from the lowest set
9288 bit to the highest set bit of VAL_IN are set to 1. */
9290 unsigned HOST_WIDE_INT
9291 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
9293 return val_in | ~aarch64_and_split_imm1 (val_in);
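/* Illustrative sketch, not part of the original source: the two masks
   above always satisfy imm1 & imm2 == val, so "x &= val" can be split
   into "x &= imm1; x &= imm2" when both masks happen to be valid
   bitmask immediates while val itself is not.  As above, val must be
   nonzero.  */
static void
sketch_and_split (unsigned long long val,
                  unsigned long long *imm1, unsigned long long *imm2)
{
  int low = __builtin_ctzll (val);          /* Lowest set bit.  */
  int high = 63 - __builtin_clzll (val);    /* Highest set bit.  */
  *imm1 = (2ull << high) - (1ull << low);   /* Ones from low to high.  */
  *imm2 = val | ~*imm1;                     /* val plus everything outside
                                               that span.  */
}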
9296 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
9298 bool
9299 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
9301 scalar_int_mode int_mode;
9302 if (!is_a <scalar_int_mode> (mode, &int_mode))
9303 return false;
9305 if (aarch64_bitmask_imm (val_in, int_mode))
9306 return false;
9308 if (aarch64_move_imm (val_in, int_mode))
9309 return false;
9311 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
9313 return aarch64_bitmask_imm (imm2, int_mode);
9316 /* Return true if val is an immediate that can be loaded into a
9317 register in a single instruction. */
9318 bool
9319 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
9321 scalar_int_mode int_mode;
9322 if (!is_a <scalar_int_mode> (mode, &int_mode))
9323 return false;
9325 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
9326 return 1;
9327 return aarch64_bitmask_imm (val, int_mode);
9330 static bool
9331 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
9333 if (GET_CODE (x) == HIGH)
9334 return true;
9336 /* There's no way to calculate VL-based values using relocations. */
9337 subrtx_iterator::array_type array;
9338 FOR_EACH_SUBRTX (iter, array, x, ALL)
9339 if (GET_CODE (*iter) == CONST_POLY_INT)
9340 return true;
9342 poly_int64 offset;
9343 rtx base = strip_offset_and_salt (x, &offset);
9344 if (SYMBOL_REF_P (base) || LABEL_REF_P (base))
9346 /* We checked for POLY_INT_CST offsets above. */
9347 if (aarch64_classify_symbol (base, offset.to_constant ())
9348 != SYMBOL_FORCE_TO_MEM)
9349 return true;
9350 else
9351 /* Avoid generating a 64-bit relocation in ILP32; leave
9352 to aarch64_expand_mov_immediate to handle it properly. */
9353 return mode != ptr_mode;
9356 return aarch64_tls_referenced_p (x);
9359 /* Implement TARGET_CASE_VALUES_THRESHOLD.
9360 The expansion for a table switch is quite expensive due to the number
9361 of instructions, the table lookup and the hard-to-predict indirect jump.
9362 When optimizing for speed with -O3 enabled, use the per-core tuning if
9363 set; otherwise use tables for more than 16 cases as a tradeoff between
9364 size and performance. When optimizing for size, use the default setting. */
9366 static unsigned int
9367 aarch64_case_values_threshold (void)
9369 /* Use the specified limit for the number of cases before using jump
9370 tables at higher optimization levels. */
9371 if (optimize > 2
9372 && selected_cpu->tune->max_case_values != 0)
9373 return selected_cpu->tune->max_case_values;
9374 else
9375 return optimize_size ? default_case_values_threshold () : 17;
9378 /* Return true if register REGNO is a valid index register.
9379 STRICT_P is true if REG_OK_STRICT is in effect. */
9381 bool
9382 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
9384 if (!HARD_REGISTER_NUM_P (regno))
9386 if (!strict_p)
9387 return true;
9389 if (!reg_renumber)
9390 return false;
9392 regno = reg_renumber[regno];
9394 return GP_REGNUM_P (regno);
9397 /* Return true if register REGNO is a valid base register.
9398 STRICT_P is true if REG_OK_STRICT is in effect. */
9400 bool
9401 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
9403 if (!HARD_REGISTER_NUM_P (regno))
9405 if (!strict_p)
9406 return true;
9408 if (!reg_renumber)
9409 return false;
9411 regno = reg_renumber[regno];
9414 /* The fake registers will be eliminated to either the stack or
9415 hard frame pointer, both of which are usually valid base registers.
9416 Reload deals with the cases where the eliminated form isn't valid. */
9417 return (GP_REGNUM_P (regno)
9418 || regno == SP_REGNUM
9419 || regno == FRAME_POINTER_REGNUM
9420 || regno == ARG_POINTER_REGNUM);
9423 /* Return true if X is a valid base register.
9424 STRICT_P is true if REG_OK_STRICT is in effect. */
9426 static bool
9427 aarch64_base_register_rtx_p (rtx x, bool strict_p)
9429 if (!strict_p
9430 && SUBREG_P (x)
9431 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
9432 x = SUBREG_REG (x);
9434 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
9437 /* Return true if address offset is a valid index. If it is, fill in INFO
9438 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
9440 static bool
9441 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
9442 machine_mode mode, bool strict_p)
9444 enum aarch64_address_type type;
9445 rtx index;
9446 int shift;
9448 /* (reg:P) */
9449 if ((REG_P (x) || SUBREG_P (x))
9450 && GET_MODE (x) == Pmode)
9452 type = ADDRESS_REG_REG;
9453 index = x;
9454 shift = 0;
9456 /* (sign_extend:DI (reg:SI)) */
9457 else if ((GET_CODE (x) == SIGN_EXTEND
9458 || GET_CODE (x) == ZERO_EXTEND)
9459 && GET_MODE (x) == DImode
9460 && GET_MODE (XEXP (x, 0)) == SImode)
9462 type = (GET_CODE (x) == SIGN_EXTEND)
9463 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9464 index = XEXP (x, 0);
9465 shift = 0;
9467 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
9468 else if (GET_CODE (x) == MULT
9469 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9470 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9471 && GET_MODE (XEXP (x, 0)) == DImode
9472 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9473 && CONST_INT_P (XEXP (x, 1)))
9475 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9476 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9477 index = XEXP (XEXP (x, 0), 0);
9478 shift = exact_log2 (INTVAL (XEXP (x, 1)));
9480 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
9481 else if (GET_CODE (x) == ASHIFT
9482 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
9483 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
9484 && GET_MODE (XEXP (x, 0)) == DImode
9485 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
9486 && CONST_INT_P (XEXP (x, 1)))
9488 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
9489 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
9490 index = XEXP (XEXP (x, 0), 0);
9491 shift = INTVAL (XEXP (x, 1));
9493 /* (and:DI (mult:DI (reg:DI) (const_int scale))
9494 (const_int 0xffffffff<<shift)) */
9495 else if (GET_CODE (x) == AND
9496 && GET_MODE (x) == DImode
9497 && GET_CODE (XEXP (x, 0)) == MULT
9498 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9499 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9500 && CONST_INT_P (XEXP (x, 1)))
9502 type = ADDRESS_REG_UXTW;
9503 index = XEXP (XEXP (x, 0), 0);
9504 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
9505 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9506 shift = -1;
9508 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
9509 (const_int 0xffffffff<<shift)) */
9510 else if (GET_CODE (x) == AND
9511 && GET_MODE (x) == DImode
9512 && GET_CODE (XEXP (x, 0)) == ASHIFT
9513 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
9514 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9515 && CONST_INT_P (XEXP (x, 1)))
9517 type = ADDRESS_REG_UXTW;
9518 index = XEXP (XEXP (x, 0), 0);
9519 shift = INTVAL (XEXP (XEXP (x, 0), 1));
9520 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
9521 shift = -1;
9523 /* (mult:P (reg:P) (const_int scale)) */
9524 else if (GET_CODE (x) == MULT
9525 && GET_MODE (x) == Pmode
9526 && GET_MODE (XEXP (x, 0)) == Pmode
9527 && CONST_INT_P (XEXP (x, 1)))
9529 type = ADDRESS_REG_REG;
9530 index = XEXP (x, 0);
9531 shift = exact_log2 (INTVAL (XEXP (x, 1)));
9533 /* (ashift:P (reg:P) (const_int shift)) */
9534 else if (GET_CODE (x) == ASHIFT
9535 && GET_MODE (x) == Pmode
9536 && GET_MODE (XEXP (x, 0)) == Pmode
9537 && CONST_INT_P (XEXP (x, 1)))
9539 type = ADDRESS_REG_REG;
9540 index = XEXP (x, 0);
9541 shift = INTVAL (XEXP (x, 1));
9543 else
9544 return false;
9546 if (!strict_p
9547 && SUBREG_P (index)
9548 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
9549 index = SUBREG_REG (index);
9551 if (aarch64_sve_data_mode_p (mode))
9553 if (type != ADDRESS_REG_REG
9554 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
9555 return false;
9557 else
9559 if (shift != 0
9560 && !(IN_RANGE (shift, 1, 3)
9561 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
9562 return false;
9565 if (REG_P (index)
9566 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
9568 info->type = type;
9569 info->offset = index;
9570 info->shift = shift;
9571 return true;
9574 return false;
9577 /* Return true if MODE is one of the modes for which we
9578 support LDP/STP operations. */
9580 static bool
9581 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
9583 return mode == SImode || mode == DImode
9584 || mode == SFmode || mode == DFmode
9585 || (aarch64_vector_mode_supported_p (mode)
9586 && (known_eq (GET_MODE_SIZE (mode), 8)
9587 || (known_eq (GET_MODE_SIZE (mode), 16)
9588 && (aarch64_tune_params.extra_tuning_flags
9589 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
9592 /* Return true if REGNO is a virtual pointer register, or an eliminable
9593 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
9594 include stack_pointer or hard_frame_pointer. */
9595 static bool
9596 virt_or_elim_regno_p (unsigned regno)
9598 return ((regno >= FIRST_VIRTUAL_REGISTER
9599 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
9600 || regno == FRAME_POINTER_REGNUM
9601 || regno == ARG_POINTER_REGNUM);
9604 /* Return true if X is a valid address of type TYPE for machine mode MODE.
9605 If it is, fill in INFO appropriately. STRICT_P is true if
9606 REG_OK_STRICT is in effect. */
9608 bool
9609 aarch64_classify_address (struct aarch64_address_info *info,
9610 rtx x, machine_mode mode, bool strict_p,
9611 aarch64_addr_query_type type)
9613 enum rtx_code code = GET_CODE (x);
9614 rtx op0, op1;
9615 poly_int64 offset;
9617 HOST_WIDE_INT const_size;
9619 /* Whether a vector mode is partial doesn't affect address legitimacy.
9620 Partial vectors like VNx8QImode allow the same indexed addressing
9621 mode and MUL VL addressing mode as full vectors like VNx16QImode;
9622 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
9623 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9624 vec_flags &= ~VEC_PARTIAL;
9626 /* On BE, we use load/store pair for all large int mode load/stores.
9627 TI/TFmode may also use a load/store pair. */
9628 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
9629 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
9630 || type == ADDR_QUERY_LDP_STP_N
9631 || mode == TImode
9632 || mode == TFmode
9633 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
9635 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
9636 corresponds to the actual size of the memory being loaded/stored and
9637 the mode used for the address calculation is half of that. */
9638 if (type == ADDR_QUERY_LDP_STP_N
9639 && known_eq (GET_MODE_SIZE (mode), 16))
9640 mode = DFmode;
9642 bool allow_reg_index_p = (!load_store_pair_p
9643 && (known_lt (GET_MODE_SIZE (mode), 16)
9644 || vec_flags == VEC_ADVSIMD
9645 || vec_flags & VEC_SVE_DATA));
9647 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
9648 [Rn, #offset, MUL VL]. */
9649 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
9650 && (code != REG && code != PLUS))
9651 return false;
9653 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
9654 REG addressing. */
9655 if (advsimd_struct_p
9656 && !BYTES_BIG_ENDIAN
9657 && (code != POST_INC && code != REG))
9658 return false;
9660 gcc_checking_assert (GET_MODE (x) == VOIDmode
9661 || SCALAR_INT_MODE_P (GET_MODE (x)));
9663 switch (code)
9665 case REG:
9666 case SUBREG:
9667 info->type = ADDRESS_REG_IMM;
9668 info->base = x;
9669 info->offset = const0_rtx;
9670 info->const_offset = 0;
9671 return aarch64_base_register_rtx_p (x, strict_p);
9673 case PLUS:
9674 op0 = XEXP (x, 0);
9675 op1 = XEXP (x, 1);
9677 if (! strict_p
9678 && REG_P (op0)
9679 && virt_or_elim_regno_p (REGNO (op0))
9680 && poly_int_rtx_p (op1, &offset))
9682 info->type = ADDRESS_REG_IMM;
9683 info->base = op0;
9684 info->offset = op1;
9685 info->const_offset = offset;
9687 return true;
9690 if (maybe_ne (GET_MODE_SIZE (mode), 0)
9691 && aarch64_base_register_rtx_p (op0, strict_p)
9692 && poly_int_rtx_p (op1, &offset))
9694 info->type = ADDRESS_REG_IMM;
9695 info->base = op0;
9696 info->offset = op1;
9697 info->const_offset = offset;
9699 /* TImode and TFmode values are allowed in both pairs of X
9700 registers and individual Q registers. The available
9701 address modes are:
9702 X,X: 7-bit signed scaled offset
9703 Q: 9-bit signed offset
9704 We conservatively require an offset representable in either mode.
9705 When performing the check for pairs of X registers, i.e. LDP/STP,
9706 pass down DImode since that is the natural size of the LDP/STP
9707 instruction's memory accesses. */
9708 if (mode == TImode || mode == TFmode)
9709 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
9710 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9711 || offset_12bit_unsigned_scaled_p (mode, offset)));
9713 /* A 7-bit offset check because OImode will emit an ldp/stp
9714 instruction (only big endian will get here).
9715 For ldp/stp instructions, the offset is scaled by the size of a
9716 single element of the pair. */
9717 if (mode == OImode)
9718 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9720 /* A 7-bit scaled offset check plus a 9/12-bit check on the final element,
9721 because CImode is split into three vector accesses (only big endian will get here). */
9722 if (mode == CImode)
9723 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9724 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9725 offset + 32)
9726 || offset_12bit_unsigned_scaled_p (V16QImode,
9727 offset + 32)));
9729 /* Two 7-bit offset checks because XImode will emit two ldp/stp
9730 instructions (only big endian will get here). */
9731 if (mode == XImode)
9732 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9733 && aarch64_offset_7bit_signed_scaled_p (TImode,
9734 offset + 32));
9736 /* Make "m" use the LD1 offset range for SVE data modes, so
9737 that pre-RTL optimizers like ivopts will work to that
9738 instead of the wider LDR/STR range. */
9739 if (vec_flags == VEC_SVE_DATA)
9740 return (type == ADDR_QUERY_M
9741 ? offset_4bit_signed_scaled_p (mode, offset)
9742 : offset_9bit_signed_scaled_p (mode, offset));
9744 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9746 poly_int64 end_offset = (offset
9747 + GET_MODE_SIZE (mode)
9748 - BYTES_PER_SVE_VECTOR);
9749 return (type == ADDR_QUERY_M
9750 ? offset_4bit_signed_scaled_p (mode, offset)
9751 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9752 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9753 end_offset)));
9756 if (vec_flags == VEC_SVE_PRED)
9757 return offset_9bit_signed_scaled_p (mode, offset);
9759 if (load_store_pair_p)
9760 return ((known_eq (GET_MODE_SIZE (mode), 4)
9761 || known_eq (GET_MODE_SIZE (mode), 8)
9762 || known_eq (GET_MODE_SIZE (mode), 16))
9763 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9764 else
9765 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9766 || offset_12bit_unsigned_scaled_p (mode, offset));
9769 if (allow_reg_index_p)
9771 /* Look for base + (scaled/extended) index register. */
9772 if (aarch64_base_register_rtx_p (op0, strict_p)
9773 && aarch64_classify_index (info, op1, mode, strict_p))
9775 info->base = op0;
9776 return true;
9778 if (aarch64_base_register_rtx_p (op1, strict_p)
9779 && aarch64_classify_index (info, op0, mode, strict_p))
9781 info->base = op1;
9782 return true;
9786 return false;
9788 case POST_INC:
9789 case POST_DEC:
9790 case PRE_INC:
9791 case PRE_DEC:
9792 info->type = ADDRESS_REG_WB;
9793 info->base = XEXP (x, 0);
9794 info->offset = NULL_RTX;
9795 return aarch64_base_register_rtx_p (info->base, strict_p);
9797 case POST_MODIFY:
9798 case PRE_MODIFY:
9799 info->type = ADDRESS_REG_WB;
9800 info->base = XEXP (x, 0);
9801 if (GET_CODE (XEXP (x, 1)) == PLUS
9802 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
9803 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9804 && aarch64_base_register_rtx_p (info->base, strict_p))
9806 info->offset = XEXP (XEXP (x, 1), 1);
9807 info->const_offset = offset;
9809 /* TImode and TFmode values are allowed in both pairs of X
9810 registers and individual Q registers. The available
9811 address modes are:
9812 X,X: 7-bit signed scaled offset
9813 Q: 9-bit signed offset
9814 We conservatively require an offset representable in either mode.
9816 if (mode == TImode || mode == TFmode)
9817 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
9818 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
9820 if (load_store_pair_p)
9821 return ((known_eq (GET_MODE_SIZE (mode), 4)
9822 || known_eq (GET_MODE_SIZE (mode), 8)
9823 || known_eq (GET_MODE_SIZE (mode), 16))
9824 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9825 else
9826 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
9828 return false;
9830 case CONST:
9831 case SYMBOL_REF:
9832 case LABEL_REF:
9833 /* load literal: pc-relative constant pool entry. Only supported
9834 for SI mode or larger. */
9835 info->type = ADDRESS_SYMBOLIC;
9837 if (!load_store_pair_p
9838 && GET_MODE_SIZE (mode).is_constant (&const_size)
9839 && const_size >= 4)
9841 poly_int64 offset;
9842 rtx sym = strip_offset_and_salt (x, &offset);
9843 return ((LABEL_REF_P (sym)
9844 || (SYMBOL_REF_P (sym)
9845 && CONSTANT_POOL_ADDRESS_P (sym)
9846 && aarch64_pcrelative_literal_loads)));
9848 return false;
9850 case LO_SUM:
9851 info->type = ADDRESS_LO_SUM;
9852 info->base = XEXP (x, 0);
9853 info->offset = XEXP (x, 1);
9854 if (allow_reg_index_p
9855 && aarch64_base_register_rtx_p (info->base, strict_p))
9857 poly_int64 offset;
9858 HOST_WIDE_INT const_offset;
9859 rtx sym = strip_offset_and_salt (info->offset, &offset);
9860 if (SYMBOL_REF_P (sym)
9861 && offset.is_constant (&const_offset)
9862 && (aarch64_classify_symbol (sym, const_offset)
9863 == SYMBOL_SMALL_ABSOLUTE))
9865 /* The symbol and offset must be aligned to the access size. */
9866 unsigned int align;
9868 if (CONSTANT_POOL_ADDRESS_P (sym))
9869 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9870 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9872 tree exp = SYMBOL_REF_DECL (sym);
9873 align = TYPE_ALIGN (TREE_TYPE (exp));
9874 align = aarch64_constant_alignment (exp, align);
9876 else if (SYMBOL_REF_DECL (sym))
9877 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
9878 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9879 && SYMBOL_REF_BLOCK (sym) != NULL)
9880 align = SYMBOL_REF_BLOCK (sym)->alignment;
9881 else
9882 align = BITS_PER_UNIT;
9884 poly_int64 ref_size = GET_MODE_SIZE (mode);
9885 if (known_eq (ref_size, 0))
9886 ref_size = GET_MODE_SIZE (DImode);
9888 return (multiple_p (const_offset, ref_size)
9889 && multiple_p (align / BITS_PER_UNIT, ref_size));
9892 return false;
9894 default:
9895 return false;
9899 /* Return true if the address X is valid for a PRFM instruction.
9900 STRICT_P is true if we should do strict checking with
9901 aarch64_classify_address. */
9903 bool
9904 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9906 struct aarch64_address_info addr;
9908 /* PRFM accepts the same addresses as DImode... */
9909 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9910 if (!res)
9911 return false;
9913 /* ... except writeback forms. */
9914 return addr.type != ADDRESS_REG_WB;
9917 bool
9918 aarch64_symbolic_address_p (rtx x)
9920 poly_int64 offset;
9921 x = strip_offset_and_salt (x, &offset);
9922 return SYMBOL_REF_P (x) || LABEL_REF_P (x);
9925 /* Classify the base of symbolic expression X. */
9927 enum aarch64_symbol_type
9928 aarch64_classify_symbolic_expression (rtx x)
9930 rtx offset;
9932 split_const (x, &x, &offset);
9933 return aarch64_classify_symbol (x, INTVAL (offset));
9937 /* Return TRUE if X is a legitimate address for accessing memory in
9938 mode MODE. */
9939 static bool
9940 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
9942 struct aarch64_address_info addr;
9944 return aarch64_classify_address (&addr, x, mode, strict_p);
9947 /* Return TRUE if X is a legitimate address of type TYPE for accessing
9948 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
9949 bool
9950 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9951 aarch64_addr_query_type type)
9953 struct aarch64_address_info addr;
9955 return aarch64_classify_address (&addr, x, mode, strict_p, type);
9958 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
9960 static bool
9961 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9962 poly_int64 orig_offset,
9963 machine_mode mode)
9965 HOST_WIDE_INT size;
9966 if (GET_MODE_SIZE (mode).is_constant (&size))
9968 HOST_WIDE_INT const_offset, second_offset;
9970 /* A general SVE offset is A * VQ + B. Remove the A component from
9971 coefficient 0 in order to get the constant B. */
9972 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9974 /* Split an out-of-range address displacement into a base and
9975 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
9976 range otherwise, to increase opportunities for sharing the base
9977 address between accesses of different sizes. Unaligned accesses use
9978 the signed 9-bit range; TImode/TFmode use the intersection of the
9979 signed scaled 7-bit and signed 9-bit offsets. */
9980 if (mode == TImode || mode == TFmode)
9981 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9982 else if ((const_offset & (size - 1)) != 0)
9983 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
9984 else
9985 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
9987 if (second_offset == 0 || known_eq (orig_offset, second_offset))
9988 return false;
9990 /* Split the offset into second_offset and the rest. */
9991 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9992 *offset2 = gen_int_mode (second_offset, Pmode);
9993 return true;
9995 else
9997 /* Get the mode we should use as the basis of the range. For structure
9998 modes this is the mode of one vector. */
9999 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10000 machine_mode step_mode
10001 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
10003 /* Get the "mul vl" multiplier we'd like to use. */
10004 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
10005 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
10006 if (vec_flags & VEC_SVE_DATA)
10007 /* LDR supports a 9-bit range, but the move patterns for
10008 structure modes require all vectors to be in range of the
10009 same base. The simplest way of accommodating that while still
10010 promoting reuse of anchor points between different modes is
10011 to use an 8-bit range unconditionally. */
10012 vnum = ((vnum + 128) & 255) - 128;
10013 else
10014 /* Predicates are only handled singly, so we might as well use
10015 the full range. */
10016 vnum = ((vnum + 256) & 511) - 256;
10017 if (vnum == 0)
10018 return false;
10020 /* Convert the "mul vl" multiplier into a byte offset. */
10021 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
10022 if (known_eq (second_offset, orig_offset))
10023 return false;
10025 /* Split the offset into second_offset and the rest. */
10026 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
10027 *offset2 = gen_int_mode (second_offset, Pmode);
10028 return true;
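/* Illustrative sketch, not part of the original source: the
   constant-size split above, specialised to aligned 4-byte accesses.
   An out-of-range displacement such as 0x12340 becomes an anchor part
   of 0x10000 plus an in-range part of 0x2340, the latter fitting the
   scaled unsigned 12-bit LDR/STR offset (at most 0x3ffc for 4-byte
   accesses).  */
static void
sketch_split_word_offset (long long offset,
                          long long *anchor, long long *in_range)
{
  *in_range = offset & 0x3ffc;     /* Low part reachable from the anchor.  */
  *anchor = offset - *in_range;    /* Remainder added to the base first.  */
}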
10032 /* Return the binary representation of floating-point constant VALUE in INTVAL.
10033 If the value cannot be converted, return false without setting INTVAL.
10034 The conversion is done in the mode of VALUE. */
10035 bool
10036 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
10039 /* We make a general exception for 0. */
10040 if (aarch64_float_const_zero_rtx_p (value))
10042 *intval = 0;
10043 return true;
10046 scalar_float_mode mode;
10047 if (!CONST_DOUBLE_P (value)
10048 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
10049 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
10050 /* Only support up to DF mode. */
10051 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
10052 return false;
10054 unsigned HOST_WIDE_INT ival = 0;
10056 long res[2];
10057 real_to_target (res,
10058 CONST_DOUBLE_REAL_VALUE (value),
10059 REAL_MODE_FORMAT (mode));
10061 if (mode == DFmode)
10063 int order = BYTES_BIG_ENDIAN ? 1 : 0;
10064 ival = zext_hwi (res[order], 32);
10065 ival |= (zext_hwi (res[1 - order], 32) << 32);
10067 else
10068 ival = zext_hwi (res[0], 32);
10070 *intval = ival;
10071 return true;
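/* Illustrative sketch, not part of the original source: outside the
   compiler the same reinterpretation is just a raw bit copy, assuming
   IEEE 754 doubles and a 64-bit unsigned long long.  For example 1.0
   yields 0x3ff0000000000000 and 0.5 yields 0x3fe0000000000000.  */
static unsigned long long
sketch_double_bits (double d)
{
  unsigned long long u;
  __builtin_memcpy (&u, &d, sizeof u);   /* Copy the raw bits.  */
  return u;
}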
10074 /* Return TRUE if rtx X is an immediate constant that can be moved using a
10075 single MOV(+MOVK) followed by an FMOV. */
10076 bool
10077 aarch64_float_const_rtx_p (rtx x)
10079 machine_mode mode = GET_MODE (x);
10080 if (mode == VOIDmode)
10081 return false;
10083 /* Determine whether it's cheaper to write float constants as
10084 mov/movk pairs rather than ldr/adrp pairs. */
10085 unsigned HOST_WIDE_INT ival;
10087 if (CONST_DOUBLE_P (x)
10088 && SCALAR_FLOAT_MODE_P (mode)
10089 && aarch64_reinterpret_float_as_int (x, &ival))
10091 scalar_int_mode imode = (mode == HFmode
10092 ? SImode
10093 : int_mode_for_mode (mode).require ());
10094 int num_instr = aarch64_internal_mov_immediate
10095 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
10096 return num_instr < 3;
10099 return false;
10102 /* Return TRUE if rtx X is the immediate constant 0.0. */
10103 bool
10104 aarch64_float_const_zero_rtx_p (rtx x)
10106 if (GET_MODE (x) == VOIDmode)
10107 return false;
10109 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
10110 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
10111 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
10114 /* Return TRUE if rtx X is an immediate constant that fits in a single
10115 MOVI immediate operation. */
10116 bool
10117 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
10119 if (!TARGET_SIMD)
10120 return false;
10122 machine_mode vmode;
10123 scalar_int_mode imode;
10124 unsigned HOST_WIDE_INT ival;
10126 if (CONST_DOUBLE_P (x)
10127 && SCALAR_FLOAT_MODE_P (mode))
10129 if (!aarch64_reinterpret_float_as_int (x, &ival))
10130 return false;
10132 /* We make a general exception for 0. */
10133 if (aarch64_float_const_zero_rtx_p (x))
10134 return true;
10136 imode = int_mode_for_mode (mode).require ();
10138 else if (CONST_INT_P (x)
10139 && is_a <scalar_int_mode> (mode, &imode))
10140 ival = INTVAL (x);
10141 else
10142 return false;
10144 /* Use a 64-bit container mode for everything except DI/DF mode, where we
10145 use a 128-bit vector mode. */
10146 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
10148 vmode = aarch64_simd_container_mode (imode, width);
10149 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
10151 return aarch64_simd_valid_immediate (v_op, NULL);
10155 /* Return the fixed registers used for condition codes. */
10157 static bool
10158 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
10160 *p1 = CC_REGNUM;
10161 *p2 = INVALID_REGNUM;
10162 return true;
10165 /* This function is used by the call expanders of the machine description.
10166 RESULT is the register in which the result is returned. It's NULL for
10167 "call" and "sibcall".
10168 MEM is the location of the function call.
10169 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
10170 SIBCALL indicates whether this function call is a normal call or a sibling
10171 call; a different pattern is generated accordingly. */
10173 void
10174 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
10176 rtx call, callee, tmp;
10177 rtvec vec;
10178 machine_mode mode;
10180 gcc_assert (MEM_P (mem));
10181 callee = XEXP (mem, 0);
10182 mode = GET_MODE (callee);
10183 gcc_assert (mode == Pmode);
10185 /* Decide if we should generate indirect calls by loading the
10186 address of the callee into a register before performing
10187 the branch-and-link. */
10188 if (SYMBOL_REF_P (callee)
10189 ? (aarch64_is_long_call_p (callee)
10190 || aarch64_is_noplt_call_p (callee))
10191 : !REG_P (callee))
10192 XEXP (mem, 0) = force_reg (mode, callee);
10194 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
10196 if (result != NULL_RTX)
10197 call = gen_rtx_SET (result, call);
10199 if (sibcall)
10200 tmp = ret_rtx;
10201 else
10202 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
10204 gcc_assert (CONST_INT_P (callee_abi));
10205 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
10206 UNSPEC_CALLEE_ABI);
10208 vec = gen_rtvec (3, call, callee_abi, tmp);
10209 call = gen_rtx_PARALLEL (VOIDmode, vec);
10211 aarch64_emit_call_insn (call);
10214 /* Emit call insn with PAT and do aarch64-specific handling. */
10216 void
10217 aarch64_emit_call_insn (rtx pat)
10219 rtx insn = emit_call_insn (pat);
10221 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
10222 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
10223 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
10226 machine_mode
10227 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
10229 machine_mode mode_x = GET_MODE (x);
10230 rtx_code code_x = GET_CODE (x);
10232 /* All floating point compares return CCFP if it is an equality
10233 comparison, and CCFPE otherwise. */
10234 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
10236 switch (code)
10238 case EQ:
10239 case NE:
10240 case UNORDERED:
10241 case ORDERED:
10242 case UNLT:
10243 case UNLE:
10244 case UNGT:
10245 case UNGE:
10246 case UNEQ:
10247 return CCFPmode;
10249 case LT:
10250 case LE:
10251 case GT:
10252 case GE:
10253 case LTGT:
10254 return CCFPEmode;
10256 default:
10257 gcc_unreachable ();
10261 /* Equality comparisons of short modes against zero can be performed
10262 using the TST instruction with the appropriate bitmask. */
10263 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
10264 && (code == EQ || code == NE)
10265 && (mode_x == HImode || mode_x == QImode))
10266 return CC_NZmode;
10268 /* Similarly, comparisons of zero_extends from shorter modes can
10269 be performed using an ANDS with an immediate mask. */
10270 if (y == const0_rtx && code_x == ZERO_EXTEND
10271 && (mode_x == SImode || mode_x == DImode)
10272 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
10273 && (code == EQ || code == NE))
10274 return CC_NZmode;
10276 if ((mode_x == SImode || mode_x == DImode)
10277 && y == const0_rtx
10278 && (code == EQ || code == NE || code == LT || code == GE)
10279 && (code_x == PLUS || code_x == MINUS || code_x == AND
10280 || code_x == NEG
10281 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
10282 && CONST_INT_P (XEXP (x, 2)))))
10283 return CC_NZmode;
10285 /* A compare with a shifted operand. Because of canonicalization,
10286 the comparison will have to be swapped when we emit the assembly
10287 code. */
10288 if ((mode_x == SImode || mode_x == DImode)
10289 && (REG_P (y) || SUBREG_P (y) || y == const0_rtx)
10290 && (code_x == ASHIFT || code_x == ASHIFTRT
10291 || code_x == LSHIFTRT
10292 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
10293 return CC_SWPmode;
10295 /* Similarly for a negated operand, but we can only do this for
10296 equalities. */
10297 if ((mode_x == SImode || mode_x == DImode)
10298 && (REG_P (y) || SUBREG_P (y))
10299 && (code == EQ || code == NE)
10300 && code_x == NEG)
10301 return CC_Zmode;
10303 /* A test for unsigned overflow from an addition. */
10304 if ((mode_x == DImode || mode_x == TImode)
10305 && (code == LTU || code == GEU)
10306 && code_x == PLUS
10307 && rtx_equal_p (XEXP (x, 0), y))
10308 return CC_Cmode;
10310 /* A test for unsigned overflow from an add with carry. */
10311 if ((mode_x == DImode || mode_x == TImode)
10312 && (code == LTU || code == GEU)
10313 && code_x == PLUS
10314 && CONST_SCALAR_INT_P (y)
10315 && (rtx_mode_t (y, mode_x)
10316 == (wi::shwi (1, mode_x)
10317 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
10318 return CC_ADCmode;
10320 /* A test for signed overflow. */
10321 if ((mode_x == DImode || mode_x == TImode)
10322 && code == NE
10323 && code_x == PLUS
10324 && GET_CODE (y) == SIGN_EXTEND)
10325 return CC_Vmode;
10327 /* For everything else, return CCmode. */
10328 return CCmode;
10331 static int
10332 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
10335 aarch64_get_condition_code (rtx x)
10337 machine_mode mode = GET_MODE (XEXP (x, 0));
10338 enum rtx_code comp_code = GET_CODE (x);
10340 if (GET_MODE_CLASS (mode) != MODE_CC)
10341 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
10342 return aarch64_get_condition_code_1 (mode, comp_code);
10345 static int
10346 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
10348 switch (mode)
10350 case E_CCFPmode:
10351 case E_CCFPEmode:
10352 switch (comp_code)
10354 case GE: return AARCH64_GE;
10355 case GT: return AARCH64_GT;
10356 case LE: return AARCH64_LS;
10357 case LT: return AARCH64_MI;
10358 case NE: return AARCH64_NE;
10359 case EQ: return AARCH64_EQ;
10360 case ORDERED: return AARCH64_VC;
10361 case UNORDERED: return AARCH64_VS;
10362 case UNLT: return AARCH64_LT;
10363 case UNLE: return AARCH64_LE;
10364 case UNGT: return AARCH64_HI;
10365 case UNGE: return AARCH64_PL;
10366 default: return -1;
10368 break;
10370 case E_CCmode:
10371 switch (comp_code)
10373 case NE: return AARCH64_NE;
10374 case EQ: return AARCH64_EQ;
10375 case GE: return AARCH64_GE;
10376 case GT: return AARCH64_GT;
10377 case LE: return AARCH64_LE;
10378 case LT: return AARCH64_LT;
10379 case GEU: return AARCH64_CS;
10380 case GTU: return AARCH64_HI;
10381 case LEU: return AARCH64_LS;
10382 case LTU: return AARCH64_CC;
10383 default: return -1;
10385 break;
10387 case E_CC_SWPmode:
10388 switch (comp_code)
10390 case NE: return AARCH64_NE;
10391 case EQ: return AARCH64_EQ;
10392 case GE: return AARCH64_LE;
10393 case GT: return AARCH64_LT;
10394 case LE: return AARCH64_GE;
10395 case LT: return AARCH64_GT;
10396 case GEU: return AARCH64_LS;
10397 case GTU: return AARCH64_CC;
10398 case LEU: return AARCH64_CS;
10399 case LTU: return AARCH64_HI;
10400 default: return -1;
10402 break;
10404 case E_CC_NZCmode:
10405 switch (comp_code)
10407 case NE: return AARCH64_NE; /* = any */
10408 case EQ: return AARCH64_EQ; /* = none */
10409 case GE: return AARCH64_PL; /* = nfrst */
10410 case LT: return AARCH64_MI; /* = first */
10411 case GEU: return AARCH64_CS; /* = nlast */
10412 case GTU: return AARCH64_HI; /* = pmore */
10413 case LEU: return AARCH64_LS; /* = plast */
10414 case LTU: return AARCH64_CC; /* = last */
10415 default: return -1;
10417 break;
10419 case E_CC_NZmode:
10420 switch (comp_code)
10422 case NE: return AARCH64_NE;
10423 case EQ: return AARCH64_EQ;
10424 case GE: return AARCH64_PL;
10425 case LT: return AARCH64_MI;
10426 default: return -1;
10428 break;
10430 case E_CC_Zmode:
10431 switch (comp_code)
10433 case NE: return AARCH64_NE;
10434 case EQ: return AARCH64_EQ;
10435 default: return -1;
10437 break;
10439 case E_CC_Cmode:
10440 switch (comp_code)
10442 case LTU: return AARCH64_CS;
10443 case GEU: return AARCH64_CC;
10444 default: return -1;
10446 break;
10448 case E_CC_ADCmode:
10449 switch (comp_code)
10451 case GEU: return AARCH64_CS;
10452 case LTU: return AARCH64_CC;
10453 default: return -1;
10455 break;
10457 case E_CC_Vmode:
10458 switch (comp_code)
10460 case NE: return AARCH64_VS;
10461 case EQ: return AARCH64_VC;
10462 default: return -1;
10464 break;
10466 default:
10467 return -1;
10470 return -1;
10473 bool
10474 aarch64_const_vec_all_same_in_range_p (rtx x,
10475 HOST_WIDE_INT minval,
10476 HOST_WIDE_INT maxval)
10478 rtx elt;
10479 return (const_vec_duplicate_p (x, &elt)
10480 && CONST_INT_P (elt)
10481 && IN_RANGE (INTVAL (elt), minval, maxval));
10484 bool
10485 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
10487 return aarch64_const_vec_all_same_in_range_p (x, val, val);
10490 /* Return true if VEC is a constant in which every element is in the range
10491 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
10493 static bool
10494 aarch64_const_vec_all_in_range_p (rtx vec,
10495 HOST_WIDE_INT minval,
10496 HOST_WIDE_INT maxval)
10498 if (GET_CODE (vec) != CONST_VECTOR
10499 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
10500 return false;
10502 int nunits;
10503 if (!CONST_VECTOR_STEPPED_P (vec))
10504 nunits = const_vector_encoded_nelts (vec);
10505 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
10506 return false;
10508 for (int i = 0; i < nunits; i++)
10510 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
10511 if (!CONST_INT_P (vec_elem)
10512 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
10513 return false;
10515 return true;
10518 /* N Z C V. */
10519 #define AARCH64_CC_V 1
10520 #define AARCH64_CC_C (1 << 1)
10521 #define AARCH64_CC_Z (1 << 2)
10522 #define AARCH64_CC_N (1 << 3)
10524 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
10525 static const int aarch64_nzcv_codes[] =
10527 0, /* EQ, Z == 1. */
10528 AARCH64_CC_Z, /* NE, Z == 0. */
10529 0, /* CS, C == 1. */
10530 AARCH64_CC_C, /* CC, C == 0. */
10531 0, /* MI, N == 1. */
10532 AARCH64_CC_N, /* PL, N == 0. */
10533 0, /* VS, V == 1. */
10534 AARCH64_CC_V, /* VC, V == 0. */
10535 0, /* HI, C == 1 && Z == 0. */
10536 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
10537 AARCH64_CC_V, /* GE, N == V. */
10538 0, /* LT, N != V. */
10539 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
10540 0, /* LE, !(Z == 0 && N == V). */
10541 0, /* AL, Any. */
10542 0 /* NV, Any. */
10545 /* Print floating-point vector immediate operand X to F, negating it
10546 first if NEGATE is true. Return true on success, false if it isn't
10547 a constant we can handle. */
10549 static bool
10550 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
10552 rtx elt;
10554 if (!const_vec_duplicate_p (x, &elt))
10555 return false;
10557 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
10558 if (negate)
10559 r = real_value_negate (&r);
10561 /* Handle the SVE single-bit immediates specially, since they have a
10562 fixed form in the assembly syntax. */
10563 if (real_equal (&r, &dconst0))
10564 asm_fprintf (f, "0.0");
10565 else if (real_equal (&r, &dconst2))
10566 asm_fprintf (f, "2.0");
10567 else if (real_equal (&r, &dconst1))
10568 asm_fprintf (f, "1.0");
10569 else if (real_equal (&r, &dconsthalf))
10570 asm_fprintf (f, "0.5");
10571 else
10573 const int buf_size = 20;
10574 char float_buf[buf_size] = {'\0'};
10575 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
10576 1, GET_MODE (elt));
10577 asm_fprintf (f, "%s", float_buf);
10580 return true;
10583 /* Return the equivalent letter for size. */
10584 static char
10585 sizetochar (int size)
10587 switch (size)
10589 case 64: return 'd';
10590 case 32: return 's';
10591 case 16: return 'h';
10592 case 8 : return 'b';
10593 default: gcc_unreachable ();
10597 /* Print operand X to file F in a target specific manner according to CODE.
10598 The acceptable formatting commands given by CODE are:
10599 'c': An integer or symbol address without a preceding #
10600 sign.
10601 'C': Take the duplicated element in a vector constant
10602 and print it in hex.
10603 'D': Take the duplicated element in a vector constant
10604 and print it as an unsigned integer, in decimal.
10605 'e': Print the sign/zero-extend size as a character 8->b,
10606 16->h, 32->w. Can also be used for masks:
10607 0xff->b, 0xffff->h, 0xffffffff->w.
10608 'I': If the operand is a duplicated vector constant,
10609 replace it with the duplicated scalar. If the
10610 operand is then a floating-point constant, replace
10611 it with the integer bit representation. Print the
10612 transformed constant as a signed decimal number.
10613 'p': Prints N such that 2^N == X (X must be power of 2 and
10614 const int).
10615 'P': Print the number of non-zero bits in X (a const_int).
10616 'H': Print the higher numbered register of a pair (TImode)
10617 of regs.
10618 'm': Print a condition (eq, ne, etc).
10619 'M': Same as 'm', but invert condition.
10620 'N': Take the duplicated element in a vector constant
10621 and print the negative of it in decimal.
10622 'b/h/s/d/q': Print a scalar FP/SIMD register name.
10623 'S/T/U/V': Print a FP/SIMD register name for a register list.
10624 The register printed is the FP/SIMD register name
10625 of X + 0/1/2/3 for S/T/U/V.
10626 'R': Print a scalar Integer/FP/SIMD register name + 1.
10627 'X': Print bottom 16 bits of integer constant in hex.
10628 'w/x': Print a general register name or the zero register
10629 (32-bit or 64-bit).
10630 '0': Print a normal operand; if it's a general register,
10631 then we assume DImode.
10632 'k': Print NZCV for conditional compare instructions.
10633 'A': Output address constant representing the first
10634 argument of X, specifying a relocation offset
10635 if appropriate.
10636 'L': Output constant address specified by X
10637 with a relocation offset if appropriate.
10638 'G': Prints address of X, specifying a PC relative
10639 relocation mode if appropriate.
10640 'y': Output address of LDP or STP - this is used for
10641 some LDP/STPs which don't use a PARALLEL in their
10642 pattern (so the mode needs to be adjusted).
10643 'z': Output address of a typical LDP or STP. */
10645 static void
10646 aarch64_print_operand (FILE *f, rtx x, int code)
10648 rtx elt;
10649 switch (code)
10651 case 'c':
10652 if (CONST_INT_P (x))
10653 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
10654 else
10656 poly_int64 offset;
10657 rtx base = strip_offset_and_salt (x, &offset);
10658 if (SYMBOL_REF_P (base))
10659 output_addr_const (f, x);
10660 else
10661 output_operand_lossage ("unsupported operand for code '%c'", code);
10663 break;
10665 case 'e':
10667 x = unwrap_const_vec_duplicate (x);
10668 if (!CONST_INT_P (x))
10670 output_operand_lossage ("invalid operand for '%%%c'", code);
10671 return;
10674 HOST_WIDE_INT val = INTVAL (x);
10675 if ((val & ~7) == 8 || val == 0xff)
10676 fputc ('b', f);
10677 else if ((val & ~7) == 16 || val == 0xffff)
10678 fputc ('h', f);
10679 else if ((val & ~7) == 32 || val == 0xffffffff)
10680 fputc ('w', f);
10681 else
10683 output_operand_lossage ("invalid operand for '%%%c'", code);
10684 return;
10687 break;
10689 case 'p':
10691 int n;
10693 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
10695 output_operand_lossage ("invalid operand for '%%%c'", code);
10696 return;
10699 asm_fprintf (f, "%d", n);
10701 break;
10703 case 'P':
10704 if (!CONST_INT_P (x))
10706 output_operand_lossage ("invalid operand for '%%%c'", code);
10707 return;
10710 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
10711 break;
10713 case 'H':
10714 if (x == const0_rtx)
10716 asm_fprintf (f, "xzr");
10717 break;
10720 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
10722 output_operand_lossage ("invalid operand for '%%%c'", code);
10723 return;
10726 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
10727 break;
10729 case 'I':
10731 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
10732 if (CONST_INT_P (x))
10733 asm_fprintf (f, "%wd", INTVAL (x));
10734 else
10736 output_operand_lossage ("invalid operand for '%%%c'", code);
10737 return;
10739 break;
10742 case 'M':
10743 case 'm':
10745 int cond_code;
10746 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
10747 if (x == const_true_rtx)
10749 if (code == 'M')
10750 fputs ("nv", f);
10751 return;
10754 if (!COMPARISON_P (x))
10756 output_operand_lossage ("invalid operand for '%%%c'", code);
10757 return;
10760 cond_code = aarch64_get_condition_code (x);
10761 gcc_assert (cond_code >= 0);
10762 if (code == 'M')
10763 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
10764 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
10765 fputs (aarch64_sve_condition_codes[cond_code], f);
10766 else
10767 fputs (aarch64_condition_codes[cond_code], f);
10769 break;
10771 case 'N':
10772 if (!const_vec_duplicate_p (x, &elt))
10774 output_operand_lossage ("invalid vector constant");
10775 return;
10778 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10779 asm_fprintf (f, "%wd", (HOST_WIDE_INT) -UINTVAL (elt));
10780 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10781 && aarch64_print_vector_float_operand (f, x, true))
10783 else
10785 output_operand_lossage ("invalid vector constant");
10786 return;
10788 break;
10790 case 'b':
10791 case 'h':
10792 case 's':
10793 case 'd':
10794 case 'q':
10795 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10797 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10798 return;
10800 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
10801 break;
10803 case 'S':
10804 case 'T':
10805 case 'U':
10806 case 'V':
10807 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10809 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10810 return;
10812 asm_fprintf (f, "%c%d",
10813 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10814 REGNO (x) - V0_REGNUM + (code - 'S'));
10815 break;
10817 case 'R':
10818 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10819 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10820 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10821 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10822 else
10823 output_operand_lossage ("incompatible register operand for '%%%c'",
10824 code);
10825 break;
10827 case 'X':
10828 if (!CONST_INT_P (x))
10830 output_operand_lossage ("invalid operand for '%%%c'", code);
10831 return;
10833 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
10834 break;
10836 case 'C':
10838 /* Print a replicated constant in hex. */
10839 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10841 output_operand_lossage ("invalid operand for '%%%c'", code);
10842 return;
10844 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10845 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10847 break;
10849 case 'D':
10851 /* Print a replicated constant in decimal, treating it as
10852 unsigned. */
10853 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10855 output_operand_lossage ("invalid operand for '%%%c'", code);
10856 return;
10858 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10859 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10861 break;
10863 case 'w':
10864 case 'x':
10865 if (x == const0_rtx
10866 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
10868 asm_fprintf (f, "%czr", code);
10869 break;
10872 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10874 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
10875 break;
10878 if (REG_P (x) && REGNO (x) == SP_REGNUM)
10880 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
10881 break;
10884 /* Fall through */
10886 case 0:
10887 if (x == NULL)
10889 output_operand_lossage ("missing operand");
10890 return;
10893 switch (GET_CODE (x))
10895 case REG:
10896 if (aarch64_sve_data_mode_p (GET_MODE (x)))
10898 if (REG_NREGS (x) == 1)
10899 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10900 else
10902 char suffix
10903 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10904 asm_fprintf (f, "{z%d.%c - z%d.%c}",
10905 REGNO (x) - V0_REGNUM, suffix,
10906 END_REGNO (x) - V0_REGNUM - 1, suffix);
10909 else
10910 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
10911 break;
10913 case MEM:
10914 output_address (GET_MODE (x), XEXP (x, 0));
10915 break;
10917 case LABEL_REF:
10918 case SYMBOL_REF:
10919 output_addr_const (asm_out_file, x);
10920 break;
10922 case CONST_INT:
10923 asm_fprintf (f, "%wd", INTVAL (x));
10924 break;
10926 case CONST:
10927 if (!VECTOR_MODE_P (GET_MODE (x)))
10929 output_addr_const (asm_out_file, x);
10930 break;
10932 /* fall through */
10934 case CONST_VECTOR:
10935 if (!const_vec_duplicate_p (x, &elt))
10937 output_operand_lossage ("invalid vector constant");
10938 return;
10941 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10942 asm_fprintf (f, "%wd", INTVAL (elt));
10943 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10944 && aarch64_print_vector_float_operand (f, x, false))
10946 else
10948 output_operand_lossage ("invalid vector constant");
10949 return;
10951 break;
10953 case CONST_DOUBLE:
10954 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10955 be getting CONST_DOUBLEs holding integers. */
10956 gcc_assert (GET_MODE (x) != VOIDmode);
10957 if (aarch64_float_const_zero_rtx_p (x))
10959 fputc ('0', f);
10960 break;
10962 else if (aarch64_float_const_representable_p (x))
10964 #define buf_size 20
10965 char float_buf[buf_size] = {'\0'};
10966 real_to_decimal_for_mode (float_buf,
10967 CONST_DOUBLE_REAL_VALUE (x),
10968 buf_size, buf_size,
10969 1, GET_MODE (x));
10970 asm_fprintf (asm_out_file, "%s", float_buf);
10971 break;
10972 #undef buf_size
10974 output_operand_lossage ("invalid constant");
10975 return;
10976 default:
10977 output_operand_lossage ("invalid operand");
10978 return;
10980 break;
10982 case 'A':
10983 if (GET_CODE (x) == HIGH)
10984 x = XEXP (x, 0);
10986 switch (aarch64_classify_symbolic_expression (x))
10988 case SYMBOL_SMALL_GOT_4G:
10989 asm_fprintf (asm_out_file, ":got:");
10990 break;
10992 case SYMBOL_SMALL_TLSGD:
10993 asm_fprintf (asm_out_file, ":tlsgd:");
10994 break;
10996 case SYMBOL_SMALL_TLSDESC:
10997 asm_fprintf (asm_out_file, ":tlsdesc:");
10998 break;
11000 case SYMBOL_SMALL_TLSIE:
11001 asm_fprintf (asm_out_file, ":gottprel:");
11002 break;
11004 case SYMBOL_TLSLE24:
11005 asm_fprintf (asm_out_file, ":tprel:");
11006 break;
11008 case SYMBOL_TINY_GOT:
11009 gcc_unreachable ();
11010 break;
11012 default:
11013 break;
11015 output_addr_const (asm_out_file, x);
11016 break;
11018 case 'L':
11019 switch (aarch64_classify_symbolic_expression (x))
11021 case SYMBOL_SMALL_GOT_4G:
11022 asm_fprintf (asm_out_file, ":lo12:");
11023 break;
11025 case SYMBOL_SMALL_TLSGD:
11026 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
11027 break;
11029 case SYMBOL_SMALL_TLSDESC:
11030 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
11031 break;
11033 case SYMBOL_SMALL_TLSIE:
11034 asm_fprintf (asm_out_file, ":gottprel_lo12:");
11035 break;
11037 case SYMBOL_TLSLE12:
11038 asm_fprintf (asm_out_file, ":tprel_lo12:");
11039 break;
11041 case SYMBOL_TLSLE24:
11042 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
11043 break;
11045 case SYMBOL_TINY_GOT:
11046 asm_fprintf (asm_out_file, ":got:");
11047 break;
11049 case SYMBOL_TINY_TLSIE:
11050 asm_fprintf (asm_out_file, ":gottprel:");
11051 break;
11053 default:
11054 break;
11056 output_addr_const (asm_out_file, x);
11057 break;
11059 case 'G':
11060 switch (aarch64_classify_symbolic_expression (x))
11062 case SYMBOL_TLSLE24:
11063 asm_fprintf (asm_out_file, ":tprel_hi12:");
11064 break;
11065 default:
11066 break;
11068 output_addr_const (asm_out_file, x);
11069 break;
11071 case 'k':
11073 HOST_WIDE_INT cond_code;
11075 if (!CONST_INT_P (x))
11077 output_operand_lossage ("invalid operand for '%%%c'", code);
11078 return;
11081 cond_code = INTVAL (x);
11082 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
11083 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
11085 break;
11087 case 'y':
11088 case 'z':
11090 machine_mode mode = GET_MODE (x);
11092 if (!MEM_P (x)
11093 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
11095 output_operand_lossage ("invalid operand for '%%%c'", code);
11096 return;
11099 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
11100 code == 'y'
11101 ? ADDR_QUERY_LDP_STP_N
11102 : ADDR_QUERY_LDP_STP))
11103 output_operand_lossage ("invalid operand prefix '%%%c'", code);
11105 break;
11107 default:
11108 output_operand_lossage ("invalid operand prefix '%%%c'", code);
11109 return;
11113 /* Print address 'x' of a memory access with mode 'mode'.
11114    TYPE is the aarch64_addr_query_type context passed to
11115    aarch64_classify_address (a normal memory access vs. LDP/STP etc.).  */
11116 static bool
11117 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
11118 aarch64_addr_query_type type)
11120 struct aarch64_address_info addr;
11121 unsigned int size, vec_flags;
11123 /* Check all addresses are Pmode - including ILP32. */
11124 if (GET_MODE (x) != Pmode
11125 && (!CONST_INT_P (x)
11126 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
11128 output_operand_lossage ("invalid address mode");
11129 return false;
11132 if (aarch64_classify_address (&addr, x, mode, true, type))
11133 switch (addr.type)
11135 case ADDRESS_REG_IMM:
11136 if (known_eq (addr.const_offset, 0))
11138 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
11139 return true;
11142 vec_flags = aarch64_classify_vector_mode (mode);
11143 if (vec_flags & VEC_ANY_SVE)
11145 HOST_WIDE_INT vnum
11146 = exact_div (addr.const_offset,
11147 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
11148 asm_fprintf (f, "[%s, #%wd, mul vl]",
11149 reg_names[REGNO (addr.base)], vnum);
11150 return true;
11153 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
11154 INTVAL (addr.offset));
11155 return true;
11157 case ADDRESS_REG_REG:
11158 if (addr.shift == 0)
11159 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
11160 reg_names [REGNO (addr.offset)]);
11161 else
11162 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
11163 reg_names [REGNO (addr.offset)], addr.shift);
11164 return true;
11166 case ADDRESS_REG_UXTW:
11167 if (addr.shift == 0)
11168 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
11169 REGNO (addr.offset) - R0_REGNUM);
11170 else
11171 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
11172 REGNO (addr.offset) - R0_REGNUM, addr.shift);
11173 return true;
11175 case ADDRESS_REG_SXTW:
11176 if (addr.shift == 0)
11177 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
11178 REGNO (addr.offset) - R0_REGNUM);
11179 else
11180 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
11181 REGNO (addr.offset) - R0_REGNUM, addr.shift);
11182 return true;
11184 case ADDRESS_REG_WB:
11185 /* Writeback is only supported for fixed-width modes. */
11186 size = GET_MODE_SIZE (mode).to_constant ();
11187 switch (GET_CODE (x))
11189 case PRE_INC:
11190 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
11191 return true;
11192 case POST_INC:
11193 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
11194 return true;
11195 case PRE_DEC:
11196 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
11197 return true;
11198 case POST_DEC:
11199 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
11200 return true;
11201 case PRE_MODIFY:
11202 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
11203 INTVAL (addr.offset));
11204 return true;
11205 case POST_MODIFY:
11206 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
11207 INTVAL (addr.offset));
11208 return true;
11209 default:
11210 break;
11212 break;
11214 case ADDRESS_LO_SUM:
11215 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
11216 output_addr_const (f, addr.offset);
11217 asm_fprintf (f, "]");
11218 return true;
11220 case ADDRESS_SYMBOLIC:
11221 output_addr_const (f, x);
11222 return true;
11225 return false;
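/* For illustration, the cases above produce output such as (register
   numbers are only examples):
     [x0]			ADDRESS_REG_IMM, zero offset
     [x0, 16]			ADDRESS_REG_IMM, immediate offset
     [x0, #2, mul vl]		ADDRESS_REG_IMM, SVE vector-length multiple
     [x1, x2, lsl 3]		ADDRESS_REG_REG, scaled index
     [x1, w2, sxtw 2]		ADDRESS_REG_SXTW
     [x0, 16]!  /  [x0], 16	ADDRESS_REG_WB, pre/post increment
     [x0, #:lo12:sym]		ADDRESS_LO_SUM  */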
11228 /* Print address 'x' of a memory access with mode 'mode'. */
11229 static void
11230 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
11232 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
11233 output_addr_const (f, x);
11236 /* Implement TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA. */
11238 static bool
11239 aarch64_output_addr_const_extra (FILE *file, rtx x)
11241 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SALT_ADDR)
11243 output_addr_const (file, XVECEXP (x, 0, 0));
11244 return true;
11246 return false;
11249 bool
11250 aarch64_label_mentioned_p (rtx x)
11252 const char *fmt;
11253 int i;
11255 if (LABEL_REF_P (x))
11256 return true;
11258 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
11259 referencing instruction, but they are constant offsets, not
11260 symbols. */
11261 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
11262 return false;
11264 fmt = GET_RTX_FORMAT (GET_CODE (x));
11265 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
11267 if (fmt[i] == 'E')
11269 int j;
11271 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
11272 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
11273 return 1;
11275 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
11276 return 1;
11279 return 0;
11282 /* Implement REGNO_REG_CLASS. */
11284 enum reg_class
11285 aarch64_regno_regclass (unsigned regno)
11287 if (STUB_REGNUM_P (regno))
11288 return STUB_REGS;
11290 if (GP_REGNUM_P (regno))
11291 return GENERAL_REGS;
11293 if (regno == SP_REGNUM)
11294 return STACK_REG;
11296 if (regno == FRAME_POINTER_REGNUM
11297 || regno == ARG_POINTER_REGNUM)
11298 return POINTER_REGS;
11300 if (FP_REGNUM_P (regno))
11301 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
11302 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
11304 if (PR_REGNUM_P (regno))
11305 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
11307 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
11308 return FFR_REGS;
11310 return NO_REGS;
11313 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
11314 If OFFSET is out of range, return an offset of an anchor point
11315 that is in range. Return 0 otherwise. */
11317 static HOST_WIDE_INT
11318 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
11319 machine_mode mode)
11321 /* Does it look like we'll need a 16-byte load/store-pair operation? */
11322 if (size > 16)
11323 return (offset + 0x400) & ~0x7f0;
11325 /* For offsets that aren't a multiple of the access size, the limit is
11326 -256...255. */
11327 if (offset & (size - 1))
11329 /* BLKmode typically uses LDP of X-registers. */
11330 if (mode == BLKmode)
11331 return (offset + 512) & ~0x3ff;
11332 return (offset + 0x100) & ~0x1ff;
11335 /* Small negative offsets are supported. */
11336 if (IN_RANGE (offset, -256, 0))
11337 return 0;
11339 if (mode == TImode || mode == TFmode)
11340 return (offset + 0x100) & ~0x1ff;
11342   /* Use a 12-bit offset scaled by the access size.  */
11343 return offset & (~0xfff * size);
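/* For example, for an SImode access at offset 0x13204: the offset is a
   multiple of the access size but too large for the scaled 12-bit
   immediate, so the anchor 0x13204 & -0x4000 == 0x10000 is returned;
   the residual offset 0x3204 then fits the unsigned 12-bit,
   4-byte-scaled LDR/STR range (0..0x3ffc).  */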
11346 static rtx
11347 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
11349 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
11350 where mask is selected by alignment and size of the offset.
11351 We try to pick as large a range for the offset as possible to
11352 maximize the chance of a CSE. However, for aligned addresses
11353 we limit the range to 4k so that structures with different sized
11354 elements are likely to use the same base. We need to be careful
11355 not to split a CONST for some forms of address expression, otherwise
11356 it will generate sub-optimal code. */
11358 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
11360 rtx base = XEXP (x, 0);
11361 rtx offset_rtx = XEXP (x, 1);
11362 HOST_WIDE_INT offset = INTVAL (offset_rtx);
11364 if (GET_CODE (base) == PLUS)
11366 rtx op0 = XEXP (base, 0);
11367 rtx op1 = XEXP (base, 1);
11369 /* Force any scaling into a temp for CSE. */
11370 op0 = force_reg (Pmode, op0);
11371 op1 = force_reg (Pmode, op1);
11373 /* Let the pointer register be in op0. */
11374 if (REG_POINTER (op1))
11375 std::swap (op0, op1);
11377 /* If the pointer is virtual or frame related, then we know that
11378 virtual register instantiation or register elimination is going
11379 to apply a second constant. We want the two constants folded
11380 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
11381 if (virt_or_elim_regno_p (REGNO (op0)))
11383 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
11384 NULL_RTX, true, OPTAB_DIRECT);
11385 return gen_rtx_PLUS (Pmode, base, op1);
11388 /* Otherwise, in order to encourage CSE (and thence loop strength
11389 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
11390 base = expand_binop (Pmode, add_optab, op0, op1,
11391 NULL_RTX, true, OPTAB_DIRECT);
11392 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
11395 HOST_WIDE_INT size;
11396 if (GET_MODE_SIZE (mode).is_constant (&size))
11398 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
11399 mode);
11400 if (base_offset != 0)
11402 base = plus_constant (Pmode, base, base_offset);
11403 base = force_operand (base, NULL_RTX);
11404 return plus_constant (Pmode, base, offset - base_offset);
11409 return x;
11412 static reg_class_t
11413 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
11414 reg_class_t rclass,
11415 machine_mode mode,
11416 secondary_reload_info *sri)
11418 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
11419 LDR and STR. See the comment at the head of aarch64-sve.md for
11420 more details about the big-endian handling. */
11421 if (reg_class_subset_p (rclass, FP_REGS)
11422 && !((REG_P (x) && HARD_REGISTER_P (x))
11423 || aarch64_simd_valid_immediate (x, NULL))
11424 && mode != VNx16QImode)
11426 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11427 if ((vec_flags & VEC_SVE_DATA)
11428 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
11430 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
11431 return NO_REGS;
11435 /* If we have to disable direct literal pool loads and stores because the
11436 function is too big, then we need a scratch register. */
11437 if (MEM_P (x) && SYMBOL_REF_P (x) && CONSTANT_POOL_ADDRESS_P (x)
11438 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
11439 || targetm.vector_mode_supported_p (GET_MODE (x)))
11440 && !aarch64_pcrelative_literal_loads)
11442 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
11443 return NO_REGS;
11446 /* Without the TARGET_SIMD instructions we cannot move a Q register
11447 to a Q register directly. We need a scratch. */
11448 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
11449 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
11450 && reg_class_subset_p (rclass, FP_REGS))
11452 sri->icode = code_for_aarch64_reload_mov (mode);
11453 return NO_REGS;
11456 /* A TFmode or TImode memory access should be handled via an FP_REGS
11457 because AArch64 has richer addressing modes for LDR/STR instructions
11458 than LDP/STP instructions. */
11459 if (TARGET_FLOAT && rclass == GENERAL_REGS
11460 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
11461 return FP_REGS;
11463 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
11464 return GENERAL_REGS;
11466 return NO_REGS;
11469 static bool
11470 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
11472 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
11474 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
11475 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
11476 if (frame_pointer_needed)
11477 return to == HARD_FRAME_POINTER_REGNUM;
11478 return true;
11481 poly_int64
11482 aarch64_initial_elimination_offset (unsigned from, unsigned to)
11484 if (to == HARD_FRAME_POINTER_REGNUM)
11486 if (from == ARG_POINTER_REGNUM)
11487 return cfun->machine->frame.hard_fp_offset;
11489 if (from == FRAME_POINTER_REGNUM)
11490 return cfun->machine->frame.hard_fp_offset
11491 - cfun->machine->frame.locals_offset;
11494 if (to == STACK_POINTER_REGNUM)
11496 if (from == FRAME_POINTER_REGNUM)
11497 return cfun->machine->frame.frame_size
11498 - cfun->machine->frame.locals_offset;
11501 return cfun->machine->frame.frame_size;
11505 /* Get return address without mangling. */
11508 aarch64_return_addr_rtx (void)
11510 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
11511 /* Note: aarch64_return_address_signing_enabled only
11512 works after cfun->machine->frame.laid_out is set,
11513 so here we don't know if the return address will
11514 be signed or not. */
11515 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
11516 emit_move_insn (lr, val);
11517 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
11518 return lr;
11522 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
11523 previous frame. */
11526 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
11528 if (count != 0)
11529 return const0_rtx;
11530 return aarch64_return_addr_rtx ();
11533 static void
11534 aarch64_asm_trampoline_template (FILE *f)
11536 /* Even if the current function doesn't have branch protection, some
11537 later function might, so since this template is only generated once
11538 we have to add a BTI just in case. */
11539 asm_fprintf (f, "\thint\t34 // bti c\n");
11541 if (TARGET_ILP32)
11543 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
11544 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
11546 else
11548 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
11549 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
11551 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
11553 /* We always emit a speculation barrier.
11554 This is because the same trampoline template is used for every nested
11555 function. Since nested functions are not particularly common or
11556 performant we don't worry too much about the extra instructions to copy
11557 around.
11558 This is not yet a problem, since we have not yet implemented function
11559 specific attributes to choose between hardening against straight line
11560 speculation or not, but such function specific attributes are likely to
11561 happen in the future. */
11562 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
11564 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
11565 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
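/* For LP64 the resulting trampoline layout is, roughly:
     bytes  0..23   the code above (BTI, two literal loads, BR, barriers)
     bytes 24..31   address of the nested function
     bytes 32..39   static chain value
   aarch64_trampoline_init below fills in the two pointer slots; the
   ".+20"/".+24" literal loads pick them up at run time.  ILP32 uses
   4-byte pointer slots instead.  */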
11568 static void
11569 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
11571 rtx fnaddr, mem, a_tramp;
11572 const int tramp_code_sz = 24;
11574   /* Don't need to copy the trailing D-words; we fill those in below.  */
11575 /* We create our own memory address in Pmode so that `emit_block_move` can
11576 use parts of the backend which expect Pmode addresses. */
11577 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
11578 emit_block_move (gen_rtx_MEM (BLKmode, temp),
11579 assemble_trampoline_template (),
11580 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
11581 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
11582 fnaddr = XEXP (DECL_RTL (fndecl), 0);
11583 if (GET_MODE (fnaddr) != ptr_mode)
11584 fnaddr = convert_memory_address (ptr_mode, fnaddr);
11585 emit_move_insn (mem, fnaddr);
11587 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
11588 emit_move_insn (mem, chain_value);
11590 /* XXX We should really define a "clear_cache" pattern and use
11591 gen_clear_cache(). */
11592 a_tramp = XEXP (m_tramp, 0);
11593 maybe_emit_call_builtin___clear_cache (a_tramp,
11594 plus_constant (ptr_mode,
11595 a_tramp,
11596 TRAMPOLINE_SIZE));
11599 static unsigned char
11600 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
11602 /* ??? Logically we should only need to provide a value when
11603 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
11604 can hold MODE, but at the moment we need to handle all modes.
11605 Just ignore any runtime parts for registers that can't store them. */
11606 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
11607 unsigned int nregs, vec_flags;
11608 switch (regclass)
11610 case STUB_REGS:
11611 case TAILCALL_ADDR_REGS:
11612 case POINTER_REGS:
11613 case GENERAL_REGS:
11614 case ALL_REGS:
11615 case POINTER_AND_FP_REGS:
11616 case FP_REGS:
11617 case FP_LO_REGS:
11618 case FP_LO8_REGS:
11619 vec_flags = aarch64_classify_vector_mode (mode);
11620 if ((vec_flags & VEC_SVE_DATA)
11621 && constant_multiple_p (GET_MODE_SIZE (mode),
11622 aarch64_vl_bytes (mode, vec_flags), &nregs))
11623 return nregs;
11624 return (vec_flags & VEC_ADVSIMD
11625 ? CEIL (lowest_size, UNITS_PER_VREG)
11626 : CEIL (lowest_size, UNITS_PER_WORD));
11627 case STACK_REG:
11628 case PR_REGS:
11629 case PR_LO_REGS:
11630 case PR_HI_REGS:
11631 case FFR_REGS:
11632 case PR_AND_FFR_REGS:
11633 return 1;
11635 case NO_REGS:
11636 return 0;
11638 default:
11639 break;
11641 gcc_unreachable ();
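/* For example: TImode in GENERAL_REGS needs CEIL (16, UNITS_PER_WORD) == 2
   registers; V4SImode in FP_REGS needs CEIL (16, UNITS_PER_VREG) == 1;
   an SVE data mode needs one register per vector-length chunk of its
   size, as computed by constant_multiple_p above.  */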
11644 static reg_class_t
11645 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
11647 if (regclass == POINTER_REGS)
11648 return GENERAL_REGS;
11650 if (regclass == STACK_REG)
11652 if (REG_P(x)
11653 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
11654 return regclass;
11656 return NO_REGS;
11659   /* Register elimination can result in a request for
11660      SP+constant->FP_REGS.  We cannot support such operations, which
11661      use SP as the source and an FP_REG as the destination, so reject
11662      them outright.  */
11663 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
11665 rtx lhs = XEXP (x, 0);
11667 /* Look through a possible SUBREG introduced by ILP32. */
11668 if (SUBREG_P (lhs))
11669 lhs = SUBREG_REG (lhs);
11671 gcc_assert (REG_P (lhs));
11672 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
11673 POINTER_REGS));
11674 return NO_REGS;
11677 return regclass;
11680 void
11681 aarch64_asm_output_labelref (FILE* f, const char *name)
11683 asm_fprintf (f, "%U%s", name);
11686 static void
11687 aarch64_elf_asm_constructor (rtx symbol, int priority)
11689 if (priority == DEFAULT_INIT_PRIORITY)
11690 default_ctor_section_asm_out_constructor (symbol, priority);
11691 else
11693 section *s;
11694       /* Priority is known to be in the range [0, 65535], so 18 bytes
11695	  would be enough, but the compiler might not know that.  To avoid
11696	  a -Wformat-truncation false positive, use a larger size.  */
11697 char buf[23];
11698 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
11699 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11700 switch_to_section (s);
11701 assemble_align (POINTER_SIZE);
11702 assemble_aligned_integer (POINTER_BYTES, symbol);
11706 static void
11707 aarch64_elf_asm_destructor (rtx symbol, int priority)
11709 if (priority == DEFAULT_INIT_PRIORITY)
11710 default_dtor_section_asm_out_destructor (symbol, priority);
11711 else
11713 section *s;
11714       /* Priority is known to be in the range [0, 65535], so 18 bytes
11715	  would be enough, but the compiler might not know that.  To avoid
11716	  a -Wformat-truncation false positive, use a larger size.  */
11717 char buf[23];
11718 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
11719 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11720 switch_to_section (s);
11721 assemble_align (POINTER_SIZE);
11722 assemble_aligned_integer (POINTER_BYTES, symbol);
11726 const char*
11727 aarch64_output_casesi (rtx *operands)
11729 char buf[100];
11730 char label[100];
11731 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
11732 int index;
11733 static const char *const patterns[4][2] =
11736 "ldrb\t%w3, [%0,%w1,uxtw]",
11737 "add\t%3, %4, %w3, sxtb #2"
11740 "ldrh\t%w3, [%0,%w1,uxtw #1]",
11741 "add\t%3, %4, %w3, sxth #2"
11744 "ldr\t%w3, [%0,%w1,uxtw #2]",
11745 "add\t%3, %4, %w3, sxtw #2"
11747 /* We assume that DImode is only generated when not optimizing and
11748 that we don't really need 64-bit address offsets. That would
11749 imply an object file with 8GB of code in a single function! */
11751 "ldr\t%w3, [%0,%w1,uxtw #2]",
11752 "add\t%3, %4, %w3, sxtw #2"
11756 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
11758 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
11759 index = exact_log2 (GET_MODE_SIZE (mode));
11761 gcc_assert (index >= 0 && index <= 3);
11763   /* Need to implement table size reduction by changing the code below.  */
11764 output_asm_insn (patterns[index][0], operands);
11765 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
11766 snprintf (buf, sizeof (buf),
11767 "adr\t%%4, %s", targetm.strip_name_encoding (label));
11768 output_asm_insn (buf, operands);
11769 output_asm_insn (patterns[index][1], operands);
11770 output_asm_insn ("br\t%3", operands);
11771 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
11772 operands);
11773 assemble_label (asm_out_file, label);
11774 return "";
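/* For a HImode dispatch table the emitted sequence looks roughly like:
       ldrh    w3, [x0, w1, uxtw #1]
       adr     x4, .Lrtx<N>
       add     x3, x4, w3, sxth #2
       br      x3
   .Lrtx<N>:
   with an SLS speculation barrier emitted before the label when
   return/branch hardening is enabled (register numbers illustrative).  */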
11778 /* Return size in bits of an arithmetic operand which is shifted/scaled and
11779 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
11780 operator. */
11783 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
11785 if (shift >= 0 && shift <= 3)
11787 int size;
11788 for (size = 8; size <= 32; size *= 2)
11790 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
11791 if (mask == bits << shift)
11792 return size;
11795 return 0;
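/* For example, aarch64_uxt_size (2, 0x3fc) == 8, since 0x3fc == 0xff << 2
   and so matches a UXTB combined with a left shift of 2;
   aarch64_uxt_size (0, 0xffff) == 16 (UXTH); anything else returns 0.  */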
11798 /* Constant pools are per-function only when PC-relative
11799    literal loads are enabled or we are using the large memory
11800    model.  */
11802 static inline bool
11803 aarch64_can_use_per_function_literal_pools_p (void)
11805 return (aarch64_pcrelative_literal_loads
11806 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
11809 static bool
11810 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
11812 /* We can't use blocks for constants when we're using a per-function
11813 constant pool. */
11814 return !aarch64_can_use_per_function_literal_pools_p ();
11817 /* Select appropriate section for constants depending
11818 on where we place literal pools. */
11820 static section *
11821 aarch64_select_rtx_section (machine_mode mode,
11822 rtx x,
11823 unsigned HOST_WIDE_INT align)
11825 if (aarch64_can_use_per_function_literal_pools_p ())
11826 return function_section (current_function_decl);
11828 return default_elf_select_rtx_section (mode, x, align);
11831 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
11832 void
11833 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11834 HOST_WIDE_INT offset)
11836 /* When using per-function literal pools, we must ensure that any code
11837 section is aligned to the minimal instruction length, lest we get
11838 errors from the assembler re "unaligned instructions". */
11839 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11840 ASM_OUTPUT_ALIGN (f, 2);
11843 /* Costs. */
11845 /* Helper function for rtx cost calculation. Strip a shift expression
11846 from X. Returns the inner operand if successful, or the original
11847 expression on failure. */
11848 static rtx
11849 aarch64_strip_shift (rtx x)
11851 rtx op = x;
11853 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11854 we can convert both to ROR during final output. */
11855 if ((GET_CODE (op) == ASHIFT
11856 || GET_CODE (op) == ASHIFTRT
11857 || GET_CODE (op) == LSHIFTRT
11858 || GET_CODE (op) == ROTATERT
11859 || GET_CODE (op) == ROTATE)
11860 && CONST_INT_P (XEXP (op, 1)))
11861 return XEXP (op, 0);
11863 if (GET_CODE (op) == MULT
11864 && CONST_INT_P (XEXP (op, 1))
11865 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11866 return XEXP (op, 0);
11868 return x;
11871 /* Helper function for rtx cost calculation. Strip an extend
11872 expression from X. Returns the inner operand if successful, or the
11873 original expression on failure. We deal with a number of possible
11874 canonicalization variations here. If STRIP_SHIFT is true, then
11875 we can strip off a shift also. */
11876 static rtx
11877 aarch64_strip_extend (rtx x, bool strip_shift)
11879 scalar_int_mode mode;
11880 rtx op = x;
11882 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11883 return op;
11885 if (GET_CODE (op) == AND
11886 && GET_CODE (XEXP (op, 0)) == MULT
11887 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11888 && CONST_INT_P (XEXP (op, 1))
11889 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11890 INTVAL (XEXP (op, 1))) != 0)
11891 return XEXP (XEXP (op, 0), 0);
11893 /* Now handle extended register, as this may also have an optional
11894 left shift by 1..4. */
11895 if (strip_shift
11896 && GET_CODE (op) == ASHIFT
11897 && CONST_INT_P (XEXP (op, 1))
11898 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11899 op = XEXP (op, 0);
11901 if (GET_CODE (op) == ZERO_EXTEND
11902 || GET_CODE (op) == SIGN_EXTEND)
11903 op = XEXP (op, 0);
11905 if (op != x)
11906 return op;
11908 return x;
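/* For example, (and (mult X 4) 0x3fc) is the canonical form of a
   zero-extended byte scaled by 4, so aarch64_strip_extend returns X for
   it (aarch64_uxt_size (2, 0x3fc) is nonzero); a plain (zero_extend X)
   likewise strips to X.  */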
11911 /* Return true iff CODE is a shift supported in combination
11912 with arithmetic instructions. */
11914 static bool
11915 aarch64_shift_p (enum rtx_code code)
11917 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11921 /* Return true iff X is a cheap shift without a sign extend. */
11923 static bool
11924 aarch64_cheap_mult_shift_p (rtx x)
11926 rtx op0, op1;
11928 op0 = XEXP (x, 0);
11929 op1 = XEXP (x, 1);
11931 if (!(aarch64_tune_params.extra_tuning_flags
11932 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11933 return false;
11935 if (GET_CODE (op0) == SIGN_EXTEND)
11936 return false;
11938 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11939 && UINTVAL (op1) <= 4)
11940 return true;
11942 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11943 return false;
11945 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11947 if (l2 > 0 && l2 <= 4)
11948 return true;
11950 return false;
11953 /* Helper function for rtx cost calculation. Calculate the cost of
11954 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11955 Return the calculated cost of the expression, recursing manually in to
11956 operands where needed. */
11958 static int
11959 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
11961 rtx op0, op1;
11962 const struct cpu_cost_table *extra_cost
11963 = aarch64_tune_params.insn_extra_cost;
11964 int cost = 0;
11965 bool compound_p = (outer == PLUS || outer == MINUS);
11966 machine_mode mode = GET_MODE (x);
11968 gcc_checking_assert (code == MULT);
11970 op0 = XEXP (x, 0);
11971 op1 = XEXP (x, 1);
11973 if (VECTOR_MODE_P (mode))
11975 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11976 if (vec_flags & VEC_ADVSIMD)
11978 /* The by-element versions of the instruction have the same costs as
11979 the normal 3-vector version. So don't add the costs of the
11980 duplicate into the costs of the multiply. We make an assumption
11981 that the input to the VEC_DUPLICATE is already on the FP & SIMD
11982	     side.  This means the pre-RA costing of a MUL by element is a
11983	     bit optimistic.  */
11984 if (GET_CODE (op0) == VEC_DUPLICATE)
11985 op0 = XEXP (op0, 0);
11986 else if (GET_CODE (op1) == VEC_DUPLICATE)
11987 op1 = XEXP (op1, 0);
11989 cost += rtx_cost (op0, mode, MULT, 0, speed);
11990 cost += rtx_cost (op1, mode, MULT, 1, speed);
11991 if (speed)
11993 if (GET_CODE (x) == MULT)
11994 cost += extra_cost->vect.mult;
11995 /* This is to catch the SSRA costing currently flowing here. */
11996 else
11997 cost += extra_cost->vect.alu;
11999 return cost;
12002 /* Integer multiply/fma. */
12003 if (GET_MODE_CLASS (mode) == MODE_INT)
12005 /* The multiply will be canonicalized as a shift, cost it as such. */
12006 if (aarch64_shift_p (GET_CODE (x))
12007 || (CONST_INT_P (op1)
12008 && exact_log2 (INTVAL (op1)) > 0))
12010 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
12011 || GET_CODE (op0) == SIGN_EXTEND;
12012 if (speed)
12014 if (compound_p)
12016 /* If the shift is considered cheap,
12017 then don't add any cost. */
12018 if (aarch64_cheap_mult_shift_p (x))
12020 else if (REG_P (op1))
12021 /* ARITH + shift-by-register. */
12022 cost += extra_cost->alu.arith_shift_reg;
12023 else if (is_extend)
12024 /* ARITH + extended register. We don't have a cost field
12025 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
12026 cost += extra_cost->alu.extend_arith;
12027 else
12028 /* ARITH + shift-by-immediate. */
12029 cost += extra_cost->alu.arith_shift;
12031 else
12032 /* LSL (immediate). */
12033 cost += extra_cost->alu.shift;
12036 /* Strip extends as we will have costed them in the case above. */
12037 if (is_extend)
12038 op0 = aarch64_strip_extend (op0, true);
12040 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
12042 return cost;
12045 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
12046 compound and let the below cases handle it. After all, MNEG is a
12047 special-case alias of MSUB. */
12048 if (GET_CODE (op0) == NEG)
12050 op0 = XEXP (op0, 0);
12051 compound_p = true;
12054 /* Integer multiplies or FMAs have zero/sign extending variants. */
12055 if ((GET_CODE (op0) == ZERO_EXTEND
12056 && GET_CODE (op1) == ZERO_EXTEND)
12057 || (GET_CODE (op0) == SIGN_EXTEND
12058 && GET_CODE (op1) == SIGN_EXTEND))
12060 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
12061 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
12063 if (speed)
12065 if (compound_p)
12066 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
12067 cost += extra_cost->mult[0].extend_add;
12068 else
12069 /* MUL/SMULL/UMULL. */
12070 cost += extra_cost->mult[0].extend;
12073 return cost;
12076 /* This is either an integer multiply or a MADD. In both cases
12077 we want to recurse and cost the operands. */
12078 cost += rtx_cost (op0, mode, MULT, 0, speed);
12079 cost += rtx_cost (op1, mode, MULT, 1, speed);
12081 if (speed)
12083 if (compound_p)
12084 /* MADD/MSUB. */
12085 cost += extra_cost->mult[mode == DImode].add;
12086 else
12087 /* MUL. */
12088 cost += extra_cost->mult[mode == DImode].simple;
12091 return cost;
12093 else
12095 if (speed)
12097 /* Floating-point FMA/FMUL can also support negations of the
12098	     operands, unless the rounding mode is upward or downward, in
12099	     which case FNMUL is different from FMUL with operand negation.  */
12100 bool neg0 = GET_CODE (op0) == NEG;
12101 bool neg1 = GET_CODE (op1) == NEG;
12102 if (compound_p || !flag_rounding_math || (neg0 && neg1))
12104 if (neg0)
12105 op0 = XEXP (op0, 0);
12106 if (neg1)
12107 op1 = XEXP (op1, 0);
12110 if (compound_p)
12111 /* FMADD/FNMADD/FNMSUB/FMSUB. */
12112 cost += extra_cost->fp[mode == DFmode].fma;
12113 else
12114 /* FMUL/FNMUL. */
12115 cost += extra_cost->fp[mode == DFmode].mult;
12118 cost += rtx_cost (op0, mode, MULT, 0, speed);
12119 cost += rtx_cost (op1, mode, MULT, 1, speed);
12120 return cost;
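/* For instance, (mult x 8) in DImode is costed above as a shift
   (LSL #3) because 8 is a power of two, while (plus (mult a b) c)
   reaches here with OUTER == PLUS and is costed as a single MADD
   rather than a separate MUL and ADD.  */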
12124 static int
12125 aarch64_address_cost (rtx x,
12126 machine_mode mode,
12127 addr_space_t as ATTRIBUTE_UNUSED,
12128 bool speed)
12130 enum rtx_code c = GET_CODE (x);
12131 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
12132 struct aarch64_address_info info;
12133 int cost = 0;
12134 info.shift = 0;
12136 if (!aarch64_classify_address (&info, x, mode, false))
12138 if (GET_CODE (x) == CONST || SYMBOL_REF_P (x))
12140 /* This is a CONST or SYMBOL ref which will be split
12141 in a different way depending on the code model in use.
12142 Cost it through the generic infrastructure. */
12143 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
12144 /* Divide through by the cost of one instruction to
12145 bring it to the same units as the address costs. */
12146 cost_symbol_ref /= COSTS_N_INSNS (1);
12147 /* The cost is then the cost of preparing the address,
12148 followed by an immediate (possibly 0) offset. */
12149 return cost_symbol_ref + addr_cost->imm_offset;
12151 else
12153 /* This is most likely a jump table from a case
12154 statement. */
12155 return addr_cost->register_offset;
12159 switch (info.type)
12161 case ADDRESS_LO_SUM:
12162 case ADDRESS_SYMBOLIC:
12163 case ADDRESS_REG_IMM:
12164 cost += addr_cost->imm_offset;
12165 break;
12167 case ADDRESS_REG_WB:
12168 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
12169 cost += addr_cost->pre_modify;
12170 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
12172 if (mode == CImode)
12173 cost += addr_cost->post_modify_ld3_st3;
12174 else if (mode == XImode)
12175 cost += addr_cost->post_modify_ld4_st4;
12176 else
12177 cost += addr_cost->post_modify;
12179 else
12180 gcc_unreachable ();
12182 break;
12184 case ADDRESS_REG_REG:
12185 cost += addr_cost->register_offset;
12186 break;
12188 case ADDRESS_REG_SXTW:
12189 cost += addr_cost->register_sextend;
12190 break;
12192 case ADDRESS_REG_UXTW:
12193 cost += addr_cost->register_zextend;
12194 break;
12196 default:
12197 gcc_unreachable ();
12201 if (info.shift > 0)
12203 /* For the sake of calculating the cost of the shifted register
12204 component, we can treat same sized modes in the same way. */
12205 if (known_eq (GET_MODE_BITSIZE (mode), 16))
12206 cost += addr_cost->addr_scale_costs.hi;
12207 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
12208 cost += addr_cost->addr_scale_costs.si;
12209 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
12210 cost += addr_cost->addr_scale_costs.di;
12211 else
12212 /* We can't tell, or this is a 128-bit vector. */
12213 cost += addr_cost->addr_scale_costs.ti;
12216 return cost;
12219 /* Return the cost of a branch. If SPEED_P is true then the compiler is
12220 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
12221 to be taken. */
12224 aarch64_branch_cost (bool speed_p, bool predictable_p)
12226 /* When optimizing for speed, use the cost of unpredictable branches. */
12227 const struct cpu_branch_cost *branch_costs =
12228 aarch64_tune_params.branch_costs;
12230 if (!speed_p || predictable_p)
12231 return branch_costs->predictable;
12232 else
12233 return branch_costs->unpredictable;
12236 /* Return true if X is a zero or sign extract
12237 usable in an ADD or SUB (extended register) instruction. */
12238 static bool
12239 aarch64_rtx_arith_op_extract_p (rtx x)
12241 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
12242 No shift. */
12243 if (GET_CODE (x) == SIGN_EXTEND
12244 || GET_CODE (x) == ZERO_EXTEND)
12245 return REG_P (XEXP (x, 0));
12247 return false;
12250 static bool
12251 aarch64_frint_unspec_p (unsigned int u)
12253 switch (u)
12255 case UNSPEC_FRINTZ:
12256 case UNSPEC_FRINTP:
12257 case UNSPEC_FRINTM:
12258 case UNSPEC_FRINTA:
12259 case UNSPEC_FRINTN:
12260 case UNSPEC_FRINTX:
12261 case UNSPEC_FRINTI:
12262 return true;
12264 default:
12265 return false;
12269 /* Return true iff X is an rtx that will match an extr instruction
12270 i.e. as described in the *extr<mode>5_insn family of patterns.
12271 OP0 and OP1 will be set to the operands of the shifts involved
12272 on success and will be NULL_RTX otherwise. */
12274 static bool
12275 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
12277 rtx op0, op1;
12278 scalar_int_mode mode;
12279 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
12280 return false;
12282 *res_op0 = NULL_RTX;
12283 *res_op1 = NULL_RTX;
12285 if (GET_CODE (x) != IOR)
12286 return false;
12288 op0 = XEXP (x, 0);
12289 op1 = XEXP (x, 1);
12291 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
12292 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
12294 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
12295 if (GET_CODE (op1) == ASHIFT)
12296 std::swap (op0, op1);
12298 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
12299 return false;
12301 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
12302 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
12304 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
12305 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
12307 *res_op0 = XEXP (op0, 0);
12308 *res_op1 = XEXP (op1, 0);
12309 return true;
12313 return false;
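/* For example, in DImode (ior (ashift a 48) (lshiftrt b 16)) matches:
   48 + 16 == 64, so *RES_OP0 = a and *RES_OP1 = b, corresponding to an
   EXTR with a shift amount of 16.  */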
12316 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
12317 storing it in *COST. Result is true if the total cost of the operation
12318 has now been calculated. */
12319 static bool
12320 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
12322 rtx inner;
12323 rtx comparator;
12324 enum rtx_code cmpcode;
12325 const struct cpu_cost_table *extra_cost
12326 = aarch64_tune_params.insn_extra_cost;
12328 if (COMPARISON_P (op0))
12330 inner = XEXP (op0, 0);
12331 comparator = XEXP (op0, 1);
12332 cmpcode = GET_CODE (op0);
12334 else
12336 inner = op0;
12337 comparator = const0_rtx;
12338 cmpcode = NE;
12341 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
12343 /* Conditional branch. */
12344 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
12345 return true;
12346 else
12348 if (cmpcode == NE || cmpcode == EQ)
12350 if (comparator == const0_rtx)
12352 /* TBZ/TBNZ/CBZ/CBNZ. */
12353 if (GET_CODE (inner) == ZERO_EXTRACT)
12354 /* TBZ/TBNZ. */
12355 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
12356 ZERO_EXTRACT, 0, speed);
12357 else
12358 /* CBZ/CBNZ. */
12359 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
12361 return true;
12363 if (register_operand (inner, VOIDmode)
12364 && aarch64_imm24 (comparator, VOIDmode))
12366 /* SUB and SUBS. */
12367 *cost += COSTS_N_INSNS (2);
12368 if (speed)
12369 *cost += extra_cost->alu.arith * 2;
12370 return true;
12373 else if (cmpcode == LT || cmpcode == GE)
12375 /* TBZ/TBNZ. */
12376 if (comparator == const0_rtx)
12377 return true;
12381 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
12383 /* CCMP. */
12384 if (GET_CODE (op1) == COMPARE)
12386 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
12387 if (XEXP (op1, 1) == const0_rtx)
12388 *cost += 1;
12389 if (speed)
12391 machine_mode mode = GET_MODE (XEXP (op1, 0));
12393 if (GET_MODE_CLASS (mode) == MODE_INT)
12394 *cost += extra_cost->alu.arith;
12395 else
12396 *cost += extra_cost->fp[mode == DFmode].compare;
12398 return true;
12401 /* It's a conditional operation based on the status flags,
12402 so it must be some flavor of CSEL. */
12404 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
12405 if (GET_CODE (op1) == NEG
12406 || GET_CODE (op1) == NOT
12407 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
12408 op1 = XEXP (op1, 0);
12409 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
12411 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
12412 op1 = XEXP (op1, 0);
12413 op2 = XEXP (op2, 0);
12415 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
12417 inner = XEXP (op1, 0);
12418 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
12419 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
12420 op1 = XEXP (inner, 0);
12423 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
12424 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
12425 return true;
12428 /* We don't know what this is, cost all operands. */
12429 return false;
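/* For example, (if_then_else (ne cc 0) (neg x) y) costs the same as a
   plain CSEL because the NEG is folded away above (CSNEG is handled for
   free), and a conditional branch on (eq reg 0) or (ne reg 0) is costed
   like CBZ/CBNZ, charging only the compared register.  */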
12432 /* Check whether X is a bitfield operation of the form shift + extend that
12433 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
12434 operand to which the bitfield operation is applied. Otherwise return
12435 NULL_RTX. */
12437 static rtx
12438 aarch64_extend_bitfield_pattern_p (rtx x)
12440 rtx_code outer_code = GET_CODE (x);
12441 machine_mode outer_mode = GET_MODE (x);
12443 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
12444 && outer_mode != SImode && outer_mode != DImode)
12445 return NULL_RTX;
12447 rtx inner = XEXP (x, 0);
12448 rtx_code inner_code = GET_CODE (inner);
12449 machine_mode inner_mode = GET_MODE (inner);
12450 rtx op = NULL_RTX;
12452 switch (inner_code)
12454 case ASHIFT:
12455 if (CONST_INT_P (XEXP (inner, 1))
12456 && (inner_mode == QImode || inner_mode == HImode))
12457 op = XEXP (inner, 0);
12458 break;
12459 case LSHIFTRT:
12460 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
12461 && (inner_mode == QImode || inner_mode == HImode))
12462 op = XEXP (inner, 0);
12463 break;
12464 case ASHIFTRT:
12465 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
12466 && (inner_mode == QImode || inner_mode == HImode))
12467 op = XEXP (inner, 0);
12468 break;
12469 default:
12470 break;
12473 return op;
12476 /* Return true if the mask and a shift amount from an RTX of the form
12477 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
12478 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
12480 bool
12481 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
12482 rtx shft_amnt)
12484 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
12485 && INTVAL (mask) > 0
12486 && UINTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
12487 && exact_log2 ((UINTVAL (mask) >> UINTVAL (shft_amnt)) + 1) >= 0
12488 && (UINTVAL (mask)
12489 & ((HOST_WIDE_INT_1U << UINTVAL (shft_amnt)) - 1)) == 0;
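/* For example, in SImode MASK == 0xff0 and SHFT_AMNT == 4 satisfy all of
   the above ((0xff0 >> 4) + 1 == 0x100 is a power of two and the low four
   bits of the mask are clear), so (x << 4) & 0xff0 can become a UBFIZ
   with lsb 4 and width 8.  */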
12492 /* Return true if the masks and a shift amount from an RTX of the form
12493 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
12494    a BFI instruction of mode MODE.  See the *aarch64_bfi patterns.  */
12496 bool
12497 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
12498 unsigned HOST_WIDE_INT mask1,
12499 unsigned HOST_WIDE_INT shft_amnt,
12500 unsigned HOST_WIDE_INT mask2)
12502 unsigned HOST_WIDE_INT t;
12504 /* Verify that there is no overlap in what bits are set in the two masks. */
12505 if (mask1 != ~mask2)
12506 return false;
12508 /* Verify that mask2 is not all zeros or ones. */
12509 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
12510 return false;
12512 /* The shift amount should always be less than the mode size. */
12513 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
12515 /* Verify that the mask being shifted is contiguous and would be in the
12516 least significant bits after shifting by shft_amnt. */
12517 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
12518 return (t == (t & -t));
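/* For example, with MASK2 == 0xff00, SHFT_AMNT == 8 and MASK1 == ~0xff00,
   t == 0xff00 + 0x100 == 0x10000, which is a power of two, so the
   combination is a valid BFI inserting an 8-bit field at bit 8.  */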
12521 /* Calculate the cost of calculating X, storing it in *COST. Result
12522 is true if the total cost of the operation has now been calculated. */
12523 static bool
12524 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
12525 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
12527 rtx op0, op1, op2;
12528 const struct cpu_cost_table *extra_cost
12529 = aarch64_tune_params.insn_extra_cost;
12530 int code = GET_CODE (x);
12531 scalar_int_mode int_mode;
12533 /* By default, assume that everything has equivalent cost to the
12534 cheapest instruction. Any additional costs are applied as a delta
12535 above this default. */
12536 *cost = COSTS_N_INSNS (1);
12538 switch (code)
12540 case SET:
12541 /* The cost depends entirely on the operands to SET. */
12542 *cost = 0;
12543 op0 = SET_DEST (x);
12544 op1 = SET_SRC (x);
12546 switch (GET_CODE (op0))
12548 case MEM:
12549 if (speed)
12551 rtx address = XEXP (op0, 0);
12552 if (VECTOR_MODE_P (mode))
12553 *cost += extra_cost->ldst.storev;
12554 else if (GET_MODE_CLASS (mode) == MODE_INT)
12555 *cost += extra_cost->ldst.store;
12556 else if (mode == SFmode)
12557 *cost += extra_cost->ldst.storef;
12558 else if (mode == DFmode)
12559 *cost += extra_cost->ldst.stored;
12561 *cost +=
12562 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12563 0, speed));
12566 *cost += rtx_cost (op1, mode, SET, 1, speed);
12567 return true;
12569 case SUBREG:
12570 if (! REG_P (SUBREG_REG (op0)))
12571 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
12573 /* Fall through. */
12574 case REG:
12575 /* The cost is one per vector-register copied. */
12576 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
12578 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
12579 *cost = COSTS_N_INSNS (nregs);
12581 /* const0_rtx is in general free, but we will use an
12582 instruction to set a register to 0. */
12583 else if (REG_P (op1) || op1 == const0_rtx)
12585 /* The cost is 1 per register copied. */
12586 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
12587 *cost = COSTS_N_INSNS (nregs);
12589 else
12590 /* Cost is just the cost of the RHS of the set. */
12591 *cost += rtx_cost (op1, mode, SET, 1, speed);
12592 return true;
12594 case ZERO_EXTRACT:
12595 case SIGN_EXTRACT:
12596 /* Bit-field insertion. Strip any redundant widening of
12597 the RHS to meet the width of the target. */
12598 if (GET_CODE (op1) == SUBREG)
12599 op1 = SUBREG_REG (op1);
12600 if ((GET_CODE (op1) == ZERO_EXTEND
12601 || GET_CODE (op1) == SIGN_EXTEND)
12602 && CONST_INT_P (XEXP (op0, 1))
12603 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
12604 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
12605 op1 = XEXP (op1, 0);
12607 if (CONST_INT_P (op1))
12609 /* MOV immediate is assumed to always be cheap. */
12610 *cost = COSTS_N_INSNS (1);
12612 else
12614 /* BFM. */
12615 if (speed)
12616 *cost += extra_cost->alu.bfi;
12617 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
12620 return true;
12622 default:
12623 /* We can't make sense of this, assume default cost. */
12624 *cost = COSTS_N_INSNS (1);
12625 return false;
12627 return false;
12629 case CONST_INT:
12630 /* If an instruction can incorporate a constant within the
12631 instruction, the instruction's expression avoids calling
12632 rtx_cost() on the constant. If rtx_cost() is called on a
12633 constant, then it is usually because the constant must be
12634 moved into a register by one or more instructions.
12636 The exception is constant 0, which can be expressed
12637	 as XZR/WZR and is therefore free.  The caveat is that if we
12638	 have (set (reg) (const0_rtx)), then we must cost the move.
12639	 However, we can catch that when we cost the SET, so
12640 we don't need to consider that here. */
12641 if (x == const0_rtx)
12642 *cost = 0;
12643 else
12645	  /* To an approximation, the cost of building any other constant
12646	     is proportional to the number of instructions required to
12647	     build that constant.  This is true whether we are compiling
12648	     for SPEED or otherwise.  */
12649 if (!is_a <scalar_int_mode> (mode, &int_mode))
12650 int_mode = word_mode;
12651 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
12652 (NULL_RTX, x, false, int_mode));
12654 return true;
12656 case CONST_DOUBLE:
12658 /* First determine number of instructions to do the move
12659 as an integer constant. */
12660 if (!aarch64_float_const_representable_p (x)
12661 && !aarch64_can_const_movi_rtx_p (x, mode)
12662 && aarch64_float_const_rtx_p (x))
12664 unsigned HOST_WIDE_INT ival;
12665 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
12666 gcc_assert (succeed);
12668 scalar_int_mode imode = (mode == HFmode
12669 ? SImode
12670 : int_mode_for_mode (mode).require ());
12671 int ncost = aarch64_internal_mov_immediate
12672 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
12673 *cost += COSTS_N_INSNS (ncost);
12674 return true;
12677 if (speed)
12679 /* mov[df,sf]_aarch64. */
12680 if (aarch64_float_const_representable_p (x))
12681 /* FMOV (scalar immediate). */
12682 *cost += extra_cost->fp[mode == DFmode].fpconst;
12683 else if (!aarch64_float_const_zero_rtx_p (x))
12685 /* This will be a load from memory. */
12686 if (mode == DFmode)
12687 *cost += extra_cost->ldst.loadd;
12688 else
12689 *cost += extra_cost->ldst.loadf;
12691 else
12692 /* Otherwise this is +0.0. We get this using MOVI d0, #0
12693 or MOV v0.s[0], wzr - neither of which is modeled by the
12694 cost tables. Just use the default cost. */
12699 return true;
12701 case MEM:
12702 if (speed)
12704 /* For loads we want the base cost of a load, plus an
12705 approximation for the additional cost of the addressing
12706 mode. */
12707 rtx address = XEXP (x, 0);
12708 if (VECTOR_MODE_P (mode))
12709 *cost += extra_cost->ldst.loadv;
12710 else if (GET_MODE_CLASS (mode) == MODE_INT)
12711 *cost += extra_cost->ldst.load;
12712 else if (mode == SFmode)
12713 *cost += extra_cost->ldst.loadf;
12714 else if (mode == DFmode)
12715 *cost += extra_cost->ldst.loadd;
12717 *cost +=
12718 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12719 0, speed));
12722 return true;
12724 case NEG:
12725 op0 = XEXP (x, 0);
12727 if (VECTOR_MODE_P (mode))
12729 if (speed)
12731 /* FNEG. */
12732 *cost += extra_cost->vect.alu;
12734 return false;
12737 if (GET_MODE_CLASS (mode) == MODE_INT)
12739 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12740 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12742 /* CSETM. */
12743 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
12744 return true;
12747 /* Cost this as SUB wzr, X. */
12748 op0 = CONST0_RTX (mode);
12749 op1 = XEXP (x, 0);
12750 goto cost_minus;
12753 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12755 /* Support (neg(fma...)) as a single instruction only if
12756 sign of zeros is unimportant. This matches the decision
12757 making in aarch64.md. */
12758 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
12760 /* FNMADD. */
12761 *cost = rtx_cost (op0, mode, NEG, 0, speed);
12762 return true;
12764 if (GET_CODE (op0) == MULT)
12766 /* FNMUL. */
12767 *cost = rtx_cost (op0, mode, NEG, 0, speed);
12768 return true;
12770 if (speed)
12771 /* FNEG. */
12772 *cost += extra_cost->fp[mode == DFmode].neg;
12773 return false;
12776 return false;
12778 case CLRSB:
12779 case CLZ:
12780 if (speed)
12782 if (VECTOR_MODE_P (mode))
12783 *cost += extra_cost->vect.alu;
12784 else
12785 *cost += extra_cost->alu.clz;
12788 return false;
12790 case CTZ:
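/* CTZ is expanded as RBIT followed by CLZ, hence the baseline cost of
   two instructions plus the rev and clz extra costs below. */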
12791 *cost = COSTS_N_INSNS (2);
12793 if (speed)
12794 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
12795 return false;
12797 case COMPARE:
12798 op0 = XEXP (x, 0);
12799 op1 = XEXP (x, 1);
12801 if (op1 == const0_rtx
12802 && GET_CODE (op0) == AND)
12804 x = op0;
12805 mode = GET_MODE (op0);
12806 goto cost_logic;
12809 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
12811 /* TODO: A write to the CC flags possibly costs extra; this
12812 needs encoding in the cost tables. */
12814 mode = GET_MODE (op0);
12815 /* ANDS. */
12816 if (GET_CODE (op0) == AND)
12818 x = op0;
12819 goto cost_logic;
12822 if (GET_CODE (op0) == PLUS)
12824 /* ADDS (and CMN alias). */
12825 x = op0;
12826 goto cost_plus;
12829 if (GET_CODE (op0) == MINUS)
12831 /* SUBS. */
12832 x = op0;
12833 goto cost_minus;
12836 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12837 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12838 && CONST_INT_P (XEXP (op0, 2)))
12840 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12841 Handle it here directly rather than going to cost_logic
12842 since we know the immediate generated for the TST is valid
12843 so we can avoid creating an intermediate rtx for it only
12844 for costing purposes. */
12845 if (speed)
12846 *cost += extra_cost->alu.logical;
12848 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12849 ZERO_EXTRACT, 0, speed);
12850 return true;
12853 if (GET_CODE (op1) == NEG)
12855 /* CMN. */
12856 if (speed)
12857 *cost += extra_cost->alu.arith;
12859 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12860 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
12861 return true;
12864 /* CMP.
12866 Compare can freely swap the order of operands, and
12867 canonicalization puts the more complex operation first.
12868 But the integer MINUS logic expects the shift/extend
12869 operation in op1. */
12870 if (! (REG_P (op0)
12871 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12873 op0 = XEXP (x, 1);
12874 op1 = XEXP (x, 0);
12876 goto cost_minus;
12879 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12881 /* FCMP. */
12882 if (speed)
12883 *cost += extra_cost->fp[mode == DFmode].compare;
12885 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12887 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
12888 /* FCMP supports constant 0.0 for no extra cost. */
12889 return true;
12891 return false;
12894 if (VECTOR_MODE_P (mode))
12896 /* Vector compare. */
12897 if (speed)
12898 *cost += extra_cost->vect.alu;
12900 if (aarch64_float_const_zero_rtx_p (op1))
12902 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12903 cost. */
12904 return true;
12906 return false;
12908 return false;
12910 case MINUS:
12912 op0 = XEXP (x, 0);
12913 op1 = XEXP (x, 1);
12915 cost_minus:
12916 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
12918 /* Detect valid immediates. */
12919 if ((GET_MODE_CLASS (mode) == MODE_INT
12920 || (GET_MODE_CLASS (mode) == MODE_CC
12921 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12922 && CONST_INT_P (op1)
12923 && aarch64_uimm12_shift (INTVAL (op1)))
12925 if (speed)
12926 /* SUB(S) (immediate). */
12927 *cost += extra_cost->alu.arith;
12928 return true;
12931 /* Look for SUB (extended register). */
12932 if (is_a <scalar_int_mode> (mode)
12933 && aarch64_rtx_arith_op_extract_p (op1))
12935 if (speed)
12936 *cost += extra_cost->alu.extend_arith;
12938 op1 = aarch64_strip_extend (op1, true);
12939 *cost += rtx_cost (op1, VOIDmode,
12940 (enum rtx_code) GET_CODE (op1), 0, speed);
12941 return true;
12944 rtx new_op1 = aarch64_strip_extend (op1, false);
12946 /* Cost this as an FMA-alike operation. */
12947 if ((GET_CODE (new_op1) == MULT
12948 || aarch64_shift_p (GET_CODE (new_op1)))
12949 && code != COMPARE)
12951 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12952 (enum rtx_code) code,
12953 speed);
12954 return true;
12957 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
12959 if (speed)
12961 if (VECTOR_MODE_P (mode))
12963 /* Vector SUB. */
12964 *cost += extra_cost->vect.alu;
12966 else if (GET_MODE_CLASS (mode) == MODE_INT)
12968 /* SUB(S). */
12969 *cost += extra_cost->alu.arith;
12971 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12973 /* FSUB. */
12974 *cost += extra_cost->fp[mode == DFmode].addsub;
12977 return true;
12980 case PLUS:
12982 rtx new_op0;
12984 op0 = XEXP (x, 0);
12985 op1 = XEXP (x, 1);
12987 cost_plus:
12988 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12989 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12991 /* CSINC. */
12992 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12993 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12994 return true;
12997 if (GET_MODE_CLASS (mode) == MODE_INT
12998 && (aarch64_plus_immediate (op1, mode)
12999 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
13001 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
13003 if (speed)
13005 /* ADD (immediate). */
13006 *cost += extra_cost->alu.arith;
13008 /* Some tunings prefer to not use the VL-based scalar ops.
13009 Increase the cost of the poly immediate to prevent their
13010 formation. */
13011 if (GET_CODE (op1) == CONST_POLY_INT
13012 && (aarch64_tune_params.extra_tuning_flags
13013 & AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS))
13014 *cost += COSTS_N_INSNS (1);
13016 return true;
13019 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
13021 /* Look for ADD (extended register). */
13022 if (is_a <scalar_int_mode> (mode)
13023 && aarch64_rtx_arith_op_extract_p (op0))
13025 if (speed)
13026 *cost += extra_cost->alu.extend_arith;
13028 op0 = aarch64_strip_extend (op0, true);
13029 *cost += rtx_cost (op0, VOIDmode,
13030 (enum rtx_code) GET_CODE (op0), 0, speed);
13031 return true;
13034 /* Strip any extend, leave shifts behind as we will
13035 cost them through mult_cost. */
13036 new_op0 = aarch64_strip_extend (op0, false);
13038 if (GET_CODE (new_op0) == MULT
13039 || aarch64_shift_p (GET_CODE (new_op0)))
13041 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
13042 speed);
13043 return true;
13046 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
13048 if (speed)
13050 if (VECTOR_MODE_P (mode))
13052 /* Vector ADD. */
13053 *cost += extra_cost->vect.alu;
13055 else if (GET_MODE_CLASS (mode) == MODE_INT)
13057 /* ADD. */
13058 *cost += extra_cost->alu.arith;
13060 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13062 /* FADD. */
13063 *cost += extra_cost->fp[mode == DFmode].addsub;
13066 return true;
13069 case BSWAP:
13070 *cost = COSTS_N_INSNS (1);
13072 if (speed)
13074 if (VECTOR_MODE_P (mode))
13075 *cost += extra_cost->vect.alu;
13076 else
13077 *cost += extra_cost->alu.rev;
13079 return false;
13081 case IOR:
13082 if (aarch_rev16_p (x))
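/* REV16. */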
13084 *cost = COSTS_N_INSNS (1);
13086 if (speed)
13088 if (VECTOR_MODE_P (mode))
13089 *cost += extra_cost->vect.alu;
13090 else
13091 *cost += extra_cost->alu.rev;
13093 return true;
13096 if (aarch64_extr_rtx_p (x, &op0, &op1))
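/* EXTR. */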
13098 *cost += rtx_cost (op0, mode, IOR, 0, speed);
13099 *cost += rtx_cost (op1, mode, IOR, 1, speed);
13100 if (speed)
13101 *cost += extra_cost->alu.shift;
13103 return true;
13105 /* Fall through. */
13106 case XOR:
13107 case AND:
13108 cost_logic:
13109 op0 = XEXP (x, 0);
13110 op1 = XEXP (x, 1);
13112 if (VECTOR_MODE_P (mode))
13114 if (speed)
13115 *cost += extra_cost->vect.alu;
13116 return true;
13119 if (code == AND
13120 && GET_CODE (op0) == MULT
13121 && CONST_INT_P (XEXP (op0, 1))
13122 && CONST_INT_P (op1)
13123 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
13124 INTVAL (op1)) != 0)
13126 /* This is a UBFM/SBFM. */
13127 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
13128 if (speed)
13129 *cost += extra_cost->alu.bfx;
13130 return true;
13133 if (is_int_mode (mode, &int_mode))
13135 if (CONST_INT_P (op1))
13137 /* We have a mask + shift version of a UBFIZ,
13138 i.e. the *andim_ashift<mode>_bfiz pattern. */
13139 if (GET_CODE (op0) == ASHIFT
13140 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
13141 XEXP (op0, 1)))
13143 *cost += rtx_cost (XEXP (op0, 0), int_mode,
13144 (enum rtx_code) code, 0, speed);
13145 if (speed)
13146 *cost += extra_cost->alu.bfx;
13148 return true;
13150 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
13152 /* We possibly get the immediate for free; this is not
13153 modelled. */
13154 *cost += rtx_cost (op0, int_mode,
13155 (enum rtx_code) code, 0, speed);
13156 if (speed)
13157 *cost += extra_cost->alu.logical;
13159 return true;
13162 else
13164 rtx new_op0 = op0;
13166 /* Handle ORN, EON, or BIC. */
13167 if (GET_CODE (op0) == NOT)
13168 op0 = XEXP (op0, 0);
13170 new_op0 = aarch64_strip_shift (op0);
13172 /* If we had a shift on op0 then this is a logical-shift-
13173 by-register/immediate operation. Otherwise, this is just
13174 a logical operation. */
13175 if (speed)
13177 if (new_op0 != op0)
13179 /* Shift by immediate. */
13180 if (CONST_INT_P (XEXP (op0, 1)))
13181 *cost += extra_cost->alu.log_shift;
13182 else
13183 *cost += extra_cost->alu.log_shift_reg;
13185 else
13186 *cost += extra_cost->alu.logical;
13189 /* In both cases we want to cost both operands. */
13190 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
13191 0, speed);
13192 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
13193 1, speed);
13195 return true;
13198 return false;
13200 case NOT:
13201 x = XEXP (x, 0);
13202 op0 = aarch64_strip_shift (x);
13204 if (VECTOR_MODE_P (mode))
13206 /* Vector NOT. */
13207 *cost += extra_cost->vect.alu;
13208 return false;
13211 /* MVN-shifted-reg. */
13212 if (op0 != x)
13214 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
13216 if (speed)
13217 *cost += extra_cost->alu.log_shift;
13219 return true;
13221 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
13222 Handle the second form here taking care that 'a' in the above can
13223 be a shift. */
13224 else if (GET_CODE (op0) == XOR)
13226 rtx newop0 = XEXP (op0, 0);
13227 rtx newop1 = XEXP (op0, 1);
13228 rtx op0_stripped = aarch64_strip_shift (newop0);
13230 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
13231 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
13233 if (speed)
13235 if (op0_stripped != newop0)
13236 *cost += extra_cost->alu.log_shift;
13237 else
13238 *cost += extra_cost->alu.logical;
13241 return true;
13243 /* MVN. */
13244 if (speed)
13245 *cost += extra_cost->alu.logical;
13247 return false;
13249 case ZERO_EXTEND:
13251 op0 = XEXP (x, 0);
13252 /* If a value is written in SI mode, then zero extended to DI
13253 mode, the operation will in general be free as a write to
13254 a 'w' register implicitly zeroes the upper bits of an 'x'
13255 register. However, if this is
13257 (set (reg) (zero_extend (reg)))
13259 we must cost the explicit register move. */
13260 if (mode == DImode
13261 && GET_MODE (op0) == SImode
13262 && outer == SET)
13264 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
13266 /* If OP_COST is non-zero, then the cost of the zero extend
13267 is effectively the cost of the inner operation. Otherwise
13268 we have a MOV instruction and we take the cost from the MOV
13269 itself. This is true independently of whether we are
13270 optimizing for space or time. */
13271 if (op_cost)
13272 *cost = op_cost;
13274 return true;
13276 else if (MEM_P (op0))
13278 /* All loads can zero extend to any size for free. */
13279 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
13280 return true;
13283 op0 = aarch64_extend_bitfield_pattern_p (x);
13284 if (op0)
13286 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
13287 if (speed)
13288 *cost += extra_cost->alu.bfx;
13289 return true;
13292 if (speed)
13294 if (VECTOR_MODE_P (mode))
13296 /* UMOV. */
13297 *cost += extra_cost->vect.alu;
13299 else
13301 /* We generate an AND instead of UXTB/UXTH. */
13302 *cost += extra_cost->alu.logical;
13305 return false;
13307 case SIGN_EXTEND:
13308 if (MEM_P (XEXP (x, 0)))
13310 /* LDRSH. */
13311 if (speed)
13313 rtx address = XEXP (XEXP (x, 0), 0);
13314 *cost += extra_cost->ldst.load_sign_extend;
13316 *cost +=
13317 COSTS_N_INSNS (aarch64_address_cost (address, mode,
13318 0, speed));
13320 return true;
13323 op0 = aarch64_extend_bitfield_pattern_p (x);
13324 if (op0)
13326 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
13327 if (speed)
13328 *cost += extra_cost->alu.bfx;
13329 return true;
13332 if (speed)
13334 if (VECTOR_MODE_P (mode))
13335 *cost += extra_cost->vect.alu;
13336 else
13337 *cost += extra_cost->alu.extend;
13339 return false;
13341 case ASHIFT:
13342 op0 = XEXP (x, 0);
13343 op1 = XEXP (x, 1);
13345 if (CONST_INT_P (op1))
13347 if (speed)
13349 if (VECTOR_MODE_P (mode))
13351 /* Vector shift (immediate). */
13352 *cost += extra_cost->vect.alu;
13354 else
13356 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
13357 aliases. */
13358 *cost += extra_cost->alu.shift;
13362 /* We can incorporate zero/sign extend for free. */
13363 if (GET_CODE (op0) == ZERO_EXTEND
13364 || GET_CODE (op0) == SIGN_EXTEND)
13365 op0 = XEXP (op0, 0);
13367 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
13368 return true;
13370 else
13372 if (VECTOR_MODE_P (mode))
13374 if (speed)
13375 /* Vector shift (register). */
13376 *cost += extra_cost->vect.alu;
13378 else
13380 if (speed)
13381 /* LSLV. */
13382 *cost += extra_cost->alu.shift_reg;
13384 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
13385 && CONST_INT_P (XEXP (op1, 1))
13386 && known_eq (INTVAL (XEXP (op1, 1)),
13387 GET_MODE_BITSIZE (mode) - 1))
13389 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
13390 /* We already demanded XEXP (op1, 0) to be REG_P, so
13391 don't recurse into it. */
13392 return true;
13395 return false; /* All arguments need to be in registers. */
13398 case ROTATE:
13399 case ROTATERT:
13400 case LSHIFTRT:
13401 case ASHIFTRT:
13402 op0 = XEXP (x, 0);
13403 op1 = XEXP (x, 1);
13405 if (CONST_INT_P (op1))
13407 /* ASR (immediate) and friends. */
13408 if (speed)
13410 if (VECTOR_MODE_P (mode))
13411 *cost += extra_cost->vect.alu;
13412 else
13413 *cost += extra_cost->alu.shift;
13416 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
13417 return true;
13419 else
13421 if (VECTOR_MODE_P (mode))
13423 if (speed)
13424 /* Vector shift (register). */
13425 *cost += extra_cost->vect.alu;
13427 else
13429 if (speed)
13430 /* ASR (register) and friends. */
13431 *cost += extra_cost->alu.shift_reg;
13433 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
13434 && CONST_INT_P (XEXP (op1, 1))
13435 && known_eq (INTVAL (XEXP (op1, 1)),
13436 GET_MODE_BITSIZE (mode) - 1))
13438 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
13439 /* We already demanded XEXP (op1, 0) to be REG_P, so
13440 don't recurse into it. */
13441 return true;
13444 return false; /* All arguments need to be in registers. */
13447 case SYMBOL_REF:
13449 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
13450 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
13452 /* LDR. */
13453 if (speed)
13454 *cost += extra_cost->ldst.load;
13456 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
13457 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
13459 /* ADRP, followed by ADD. */
13460 *cost += COSTS_N_INSNS (1);
13461 if (speed)
13462 *cost += 2 * extra_cost->alu.arith;
13464 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
13465 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
13467 /* ADR. */
13468 if (speed)
13469 *cost += extra_cost->alu.arith;
13472 if (flag_pic)
13474 /* One extra load instruction, after accessing the GOT. */
13475 *cost += COSTS_N_INSNS (1);
13476 if (speed)
13477 *cost += extra_cost->ldst.load;
13479 return true;
13481 case HIGH:
13482 case LO_SUM:
13483 /* ADRP/ADD (immediate). */
13484 if (speed)
13485 *cost += extra_cost->alu.arith;
13486 return true;
13488 case ZERO_EXTRACT:
13489 case SIGN_EXTRACT:
13490 /* UBFX/SBFX. */
13491 if (speed)
13493 if (VECTOR_MODE_P (mode))
13494 *cost += extra_cost->vect.alu;
13495 else
13496 *cost += extra_cost->alu.bfx;
13499 /* We can trust that the immediates used will be correct (there
13500 are no by-register forms), so we need only cost op0. */
13501 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
13502 return true;
13504 case MULT:
13505 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
13506 /* aarch64_rtx_mult_cost always handles recursion to its
13507 operands. */
13508 return true;
13510 case MOD:
13511 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
13512 ANDs and a CSNEG. Assume here that a CSNEG costs the same as an
13513 unconditional negate. This case should only ever be reached through
13514 the set_smod_pow2_cheap check in expmed.c. */
13515 if (CONST_INT_P (XEXP (x, 1))
13516 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
13517 && (mode == SImode || mode == DImode))
13519 /* We expand to 4 instructions. Reset the baseline. */
13520 *cost = COSTS_N_INSNS (4);
13522 if (speed)
13523 *cost += 2 * extra_cost->alu.logical
13524 + 2 * extra_cost->alu.arith;
13526 return true;
13529 /* Fall-through. */
13530 case UMOD:
13531 if (speed)
13533 /* Slightly prefer UMOD over SMOD. */
13534 if (VECTOR_MODE_P (mode))
13535 *cost += extra_cost->vect.alu;
13536 else if (GET_MODE_CLASS (mode) == MODE_INT)
13537 *cost += (extra_cost->mult[mode == DImode].add
13538 + extra_cost->mult[mode == DImode].idiv
13539 + (code == MOD ? 1 : 0));
13541 return false; /* All arguments need to be in registers. */
13543 case DIV:
13544 case UDIV:
13545 case SQRT:
13546 if (speed)
13548 if (VECTOR_MODE_P (mode))
13549 *cost += extra_cost->vect.alu;
13550 else if (GET_MODE_CLASS (mode) == MODE_INT)
13551 /* There is no integer SQRT, so only DIV and UDIV can get
13552 here. */
13553 *cost += (extra_cost->mult[mode == DImode].idiv
13554 /* Slightly prefer UDIV over SDIV. */
13555 + (code == DIV ? 1 : 0));
13556 else
13557 *cost += extra_cost->fp[mode == DFmode].div;
13559 return false; /* All arguments need to be in registers. */
13561 case IF_THEN_ELSE:
13562 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
13563 XEXP (x, 2), cost, speed);
13565 case EQ:
13566 case NE:
13567 case GT:
13568 case GTU:
13569 case LT:
13570 case LTU:
13571 case GE:
13572 case GEU:
13573 case LE:
13574 case LEU:
13576 return false; /* All arguments must be in registers. */
13578 case FMA:
13579 op0 = XEXP (x, 0);
13580 op1 = XEXP (x, 1);
13581 op2 = XEXP (x, 2);
13583 if (speed)
13585 if (VECTOR_MODE_P (mode))
13586 *cost += extra_cost->vect.alu;
13587 else
13588 *cost += extra_cost->fp[mode == DFmode].fma;
13591 /* FMSUB, FNMADD, and FNMSUB are free. */
13592 if (GET_CODE (op0) == NEG)
13593 op0 = XEXP (op0, 0);
13595 if (GET_CODE (op2) == NEG)
13596 op2 = XEXP (op2, 0);
13598 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
13599 and the by-element operand as operand 0. */
13600 if (GET_CODE (op1) == NEG)
13601 op1 = XEXP (op1, 0);
13603 /* Catch vector-by-element operations. The by-element operand can
13604 either be (vec_duplicate (vec_select (x))) or just
13605 (vec_select (x)), depending on whether we are multiplying by
13606 a vector or a scalar.
13608 Canonicalization is not very good in these cases: FMA4 will put the
13609 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
13610 if (GET_CODE (op0) == VEC_DUPLICATE)
13611 op0 = XEXP (op0, 0);
13612 else if (GET_CODE (op1) == VEC_DUPLICATE)
13613 op1 = XEXP (op1, 0);
13615 if (GET_CODE (op0) == VEC_SELECT)
13616 op0 = XEXP (op0, 0);
13617 else if (GET_CODE (op1) == VEC_SELECT)
13618 op1 = XEXP (op1, 0);
13620 /* If the remaining parameters are not registers,
13621 get the cost to put them into registers. */
13622 *cost += rtx_cost (op0, mode, FMA, 0, speed);
13623 *cost += rtx_cost (op1, mode, FMA, 1, speed);
13624 *cost += rtx_cost (op2, mode, FMA, 2, speed);
13625 return true;
13627 case FLOAT:
13628 case UNSIGNED_FLOAT:
13629 if (speed)
13630 *cost += extra_cost->fp[mode == DFmode].fromint;
13631 return false;
13633 case FLOAT_EXTEND:
13634 if (speed)
13636 if (VECTOR_MODE_P (mode))
13638 /* Vector widen. */
13639 *cost += extra_cost->vect.alu;
13641 else
13642 *cost += extra_cost->fp[mode == DFmode].widen;
13644 return false;
13646 case FLOAT_TRUNCATE:
13647 if (speed)
13649 if (VECTOR_MODE_P (mode))
13651 /* Vector conversion. */
13652 *cost += extra_cost->vect.alu;
13654 else
13655 *cost += extra_cost->fp[mode == DFmode].narrow;
13657 return false;
13659 case FIX:
13660 case UNSIGNED_FIX:
13661 x = XEXP (x, 0);
13662 /* Strip any rounding operation; these will all be implemented
13663 by the fcvt* family of instructions anyway. */
13664 if (GET_CODE (x) == UNSPEC)
13666 unsigned int uns_code = XINT (x, 1);
13668 if (uns_code == UNSPEC_FRINTA
13669 || uns_code == UNSPEC_FRINTM
13670 || uns_code == UNSPEC_FRINTN
13671 || uns_code == UNSPEC_FRINTP
13672 || uns_code == UNSPEC_FRINTZ)
13673 x = XVECEXP (x, 0, 0);
13676 if (speed)
13678 if (VECTOR_MODE_P (mode))
13679 *cost += extra_cost->vect.alu;
13680 else
13681 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
13684 /* We can combine fmul by a power of 2 followed by a fcvt into a single
13685 fixed-point fcvt. */
13686 if (GET_CODE (x) == MULT
13687 && ((VECTOR_MODE_P (mode)
13688 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
13689 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
13691 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
13692 0, speed);
13693 return true;
13696 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
13697 return true;
13699 case ABS:
13700 if (VECTOR_MODE_P (mode))
13702 /* ABS (vector). */
13703 if (speed)
13704 *cost += extra_cost->vect.alu;
13706 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13708 op0 = XEXP (x, 0);
13710 /* FABD, which is analogous to FADD. */
13711 if (GET_CODE (op0) == MINUS)
13713 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
13714 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
13715 if (speed)
13716 *cost += extra_cost->fp[mode == DFmode].addsub;
13718 return true;
13720 /* Simple FABS is analogous to FNEG. */
13721 if (speed)
13722 *cost += extra_cost->fp[mode == DFmode].neg;
13724 else
13726 /* Integer ABS will either be split into
13727 two arithmetic instructions, or will be an ABS
13728 (scalar), which we don't model. */
13729 *cost = COSTS_N_INSNS (2);
13730 if (speed)
13731 *cost += 2 * extra_cost->alu.arith;
13733 return false;
13735 case SMAX:
13736 case SMIN:
13737 if (speed)
13739 if (VECTOR_MODE_P (mode))
13740 *cost += extra_cost->vect.alu;
13741 else
13743 /* FMAXNM/FMINNM/FMAX/FMIN.
13744 TODO: This may not be accurate for all implementations, but
13745 we do not model this in the cost tables. */
13746 *cost += extra_cost->fp[mode == DFmode].addsub;
13749 return false;
13751 case UNSPEC:
13752 /* The floating point round to integer frint* instructions. */
13753 if (aarch64_frint_unspec_p (XINT (x, 1)))
13755 if (speed)
13756 *cost += extra_cost->fp[mode == DFmode].roundint;
13758 return false;
13761 if (XINT (x, 1) == UNSPEC_RBIT)
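/* RBIT. */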
13763 if (speed)
13764 *cost += extra_cost->alu.rev;
13766 return false;
13768 break;
13770 case TRUNCATE:
13772 /* Decompose <su>muldi3_highpart. */
13773 if (/* (truncate:DI */
13774 mode == DImode
13775 /* (lshiftrt:TI */
13776 && GET_MODE (XEXP (x, 0)) == TImode
13777 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
13778 /* (mult:TI */
13779 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13780 /* (ANY_EXTEND:TI (reg:DI))
13781 (ANY_EXTEND:TI (reg:DI))) */
13782 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
13783 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
13784 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
13785 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
13786 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
13787 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
13788 /* (const_int 64) */
13789 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13790 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
13792 /* UMULH/SMULH. */
13793 if (speed)
13794 *cost += extra_cost->mult[mode == DImode].extend;
13795 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
13796 mode, MULT, 0, speed);
13797 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
13798 mode, MULT, 1, speed);
13799 return true;
13802 /* Fall through. */
13803 default:
13804 break;
13807 if (dump_file
13808 && flag_aarch64_verbose_cost)
13809 fprintf (dump_file,
13810 "\nFailed to cost RTX. Assuming default cost.\n");
13812 return true;
13815 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
13816 calculated for X. This cost is stored in *COST. Returns true
13817 if the total cost of X was calculated. */
13818 static bool
13819 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
13820 int param, int *cost, bool speed)
13822 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
13824 if (dump_file
13825 && flag_aarch64_verbose_cost)
13827 print_rtl_single (dump_file, x);
13828 fprintf (dump_file, "\n%s cost: %d (%s)\n",
13829 speed ? "Hot" : "Cold",
13830 *cost, result ? "final" : "partial");
13833 return result;
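/* Implement TARGET_REGISTER_MOVE_COST. */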
13836 static int
13837 aarch64_register_move_cost (machine_mode mode,
13838 reg_class_t from_i, reg_class_t to_i)
13840 enum reg_class from = (enum reg_class) from_i;
13841 enum reg_class to = (enum reg_class) to_i;
13842 const struct cpu_regmove_cost *regmove_cost
13843 = aarch64_tune_params.regmove_cost;
13845 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
13846 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
13847 || to == STUB_REGS)
13848 to = GENERAL_REGS;
13850 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
13851 || from == STUB_REGS)
13852 from = GENERAL_REGS;
13854 /* Make RDFFR very expensive. In particular, if we know that the FFR
13855 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13856 as a way of obtaining a PTRUE. */
13857 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13858 && hard_reg_set_subset_p (reg_class_contents[from_i],
13859 reg_class_contents[FFR_REGS]))
13860 return 80;
13862 /* Moving between a GPR and the stack register costs the same as GP2GP. */
13863 if ((from == GENERAL_REGS && to == STACK_REG)
13864 || (to == GENERAL_REGS && from == STACK_REG))
13865 return regmove_cost->GP2GP;
13867 /* To/from the stack register, we move via the GPRs. */
13868 if (to == STACK_REG || from == STACK_REG)
13869 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13870 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13872 if (known_eq (GET_MODE_SIZE (mode), 16))
13874 /* 128-bit operations on general registers require 2 instructions. */
13875 if (from == GENERAL_REGS && to == GENERAL_REGS)
13876 return regmove_cost->GP2GP * 2;
13877 else if (from == GENERAL_REGS)
13878 return regmove_cost->GP2FP * 2;
13879 else if (to == GENERAL_REGS)
13880 return regmove_cost->FP2GP * 2;
13882 /* When AdvSIMD instructions are disabled it is not possible to move
13883 a 128-bit value directly between Q registers. This is handled in
13884 secondary reload. A general register is used as a scratch to move
13885 the upper DI value and the lower DI value is moved directly,
13886 hence the cost is the sum of three moves. */
13887 if (! TARGET_SIMD)
13888 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13890 return regmove_cost->FP2FP;
13893 if (from == GENERAL_REGS && to == GENERAL_REGS)
13894 return regmove_cost->GP2GP;
13895 else if (from == GENERAL_REGS)
13896 return regmove_cost->GP2FP;
13897 else if (to == GENERAL_REGS)
13898 return regmove_cost->FP2GP;
13900 return regmove_cost->FP2FP;
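/* Implement TARGET_MEMORY_MOVE_COST. */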
13903 static int
13904 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
13905 reg_class_t rclass ATTRIBUTE_UNUSED,
13906 bool in ATTRIBUTE_UNUSED)
13908 return aarch64_tune_params.memmov_cost;
13911 /* Implement TARGET_INIT_BUILTINS. */
13912 static void
13913 aarch64_init_builtins ()
13915 aarch64_general_init_builtins ();
13916 aarch64_sve::init_builtins ();
13917 #ifdef SUBTARGET_INIT_BUILTINS
13918 SUBTARGET_INIT_BUILTINS;
13919 #endif
13922 /* Implement TARGET_FOLD_BUILTIN. */
13923 static tree
13924 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13926 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13927 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13928 tree type = TREE_TYPE (TREE_TYPE (fndecl));
13929 switch (code & AARCH64_BUILTIN_CLASS)
13931 case AARCH64_BUILTIN_GENERAL:
13932 return aarch64_general_fold_builtin (subcode, type, nargs, args);
13934 case AARCH64_BUILTIN_SVE:
13935 return NULL_TREE;
13937 gcc_unreachable ();
13940 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
13941 static bool
13942 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13944 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13945 tree fndecl = gimple_call_fndecl (stmt);
13946 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13947 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13948 gimple *new_stmt = NULL;
13949 switch (code & AARCH64_BUILTIN_CLASS)
13951 case AARCH64_BUILTIN_GENERAL:
13952 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13953 break;
13955 case AARCH64_BUILTIN_SVE:
13956 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13957 break;
13960 if (!new_stmt)
13961 return false;
13963 gsi_replace (gsi, new_stmt, true);
13964 return true;
13967 /* Implement TARGET_EXPAND_BUILTIN. */
13968 static rtx
13969 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
13971 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13972 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13973 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13974 switch (code & AARCH64_BUILTIN_CLASS)
13976 case AARCH64_BUILTIN_GENERAL:
13977 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
13979 case AARCH64_BUILTIN_SVE:
13980 return aarch64_sve::expand_builtin (subcode, exp, target);
13982 gcc_unreachable ();
13985 /* Implement TARGET_BUILTIN_DECL. */
13986 static tree
13987 aarch64_builtin_decl (unsigned int code, bool initialize_p)
13989 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13990 switch (code & AARCH64_BUILTIN_CLASS)
13992 case AARCH64_BUILTIN_GENERAL:
13993 return aarch64_general_builtin_decl (subcode, initialize_p);
13995 case AARCH64_BUILTIN_SVE:
13996 return aarch64_sve::builtin_decl (subcode, initialize_p);
13998 gcc_unreachable ();
14001 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
14002 to optimize 1.0/sqrt. */
14004 static bool
14005 use_rsqrt_p (machine_mode mode)
14007 return (!flag_trapping_math
14008 && flag_unsafe_math_optimizations
14009 && ((aarch64_tune_params.approx_modes->recip_sqrt
14010 & AARCH64_APPROX_MODE (mode))
14011 || flag_mrecip_low_precision_sqrt));
14014 /* Function to decide when to use the approximate reciprocal square root
14015 builtin. */
14017 static tree
14018 aarch64_builtin_reciprocal (tree fndecl)
14020 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
14022 if (!use_rsqrt_p (mode))
14023 return NULL_TREE;
14024 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
14025 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
14026 switch (code & AARCH64_BUILTIN_CLASS)
14028 case AARCH64_BUILTIN_GENERAL:
14029 return aarch64_general_builtin_rsqrt (subcode);
14031 case AARCH64_BUILTIN_SVE:
14032 return NULL_TREE;
14034 gcc_unreachable ();
14037 /* Emit code to perform the floating-point operation:
14039 DST = SRC1 * SRC2
14041 where all three operands are already known to be registers.
14042 If the operation is an SVE one, PTRUE is a suitable all-true
14043 predicate. */
14045 static void
14046 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
14048 if (ptrue)
14049 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
14050 dst, ptrue, src1, src2,
14051 gen_int_mode (SVE_RELAXED_GP, SImode)));
14052 else
14053 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
14056 /* Emit instruction sequence to compute either the approximate square root
14057 or its approximate reciprocal, depending on the flag RECP, and return
14058 whether the sequence was emitted or not. */
14060 bool
14061 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
14063 machine_mode mode = GET_MODE (dst);
14065 if (GET_MODE_INNER (mode) == HFmode)
14067 gcc_assert (!recp);
14068 return false;
14071 if (!recp)
14073 if (!(flag_mlow_precision_sqrt
14074 || (aarch64_tune_params.approx_modes->sqrt
14075 & AARCH64_APPROX_MODE (mode))))
14076 return false;
14078 if (!flag_finite_math_only
14079 || flag_trapping_math
14080 || !flag_unsafe_math_optimizations
14081 || optimize_function_for_size_p (cfun))
14082 return false;
14084 else
14085 /* Caller assumes we cannot fail. */
14086 gcc_assert (use_rsqrt_p (mode));
14088 rtx pg = NULL_RTX;
14089 if (aarch64_sve_mode_p (mode))
14090 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
14091 machine_mode mmsk = (VECTOR_MODE_P (mode)
14092 ? related_int_vector_mode (mode).require ()
14093 : int_mode_for_mode (mode).require ());
14094 rtx xmsk = NULL_RTX;
14095 if (!recp)
14097 /* When calculating the approximate square root, compare the
14098 argument with 0.0 and create a mask. */
14099 rtx zero = CONST0_RTX (mode);
14100 if (pg)
14102 xmsk = gen_reg_rtx (GET_MODE (pg));
14103 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
14104 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
14105 xmsk, pg, hint, src, zero));
14107 else
14109 xmsk = gen_reg_rtx (mmsk);
14110 emit_insn (gen_rtx_SET (xmsk,
14111 gen_rtx_NEG (mmsk,
14112 gen_rtx_EQ (mmsk, src, zero))));
14116 /* Estimate the approximate reciprocal square root. */
14117 rtx xdst = gen_reg_rtx (mode);
14118 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
14120 /* Iterate over the series twice for SF and thrice for DF. */
14121 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
14123 /* Optionally do one fewer iteration for faster performance,
14124 at the cost of some accuracy. */
14125 if ((recp && flag_mrecip_low_precision_sqrt)
14126 || (!recp && flag_mlow_precision_sqrt))
14127 iterations--;
14129 /* Iterate over the series to calculate the approximate reciprocal square
14130 root. */
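/* Each step computes x1 = FRSQRTS (src, xdst * xdst), i.e.
   (3 - src * xdst * xdst) / 2, and then refines xdst = xdst * x1,
   which is one Newton-Raphson step towards 1 / sqrt (src); the final
   multiplication is done after the loop. */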
14131 rtx x1 = gen_reg_rtx (mode);
14132 while (iterations--)
14134 rtx x2 = gen_reg_rtx (mode);
14135 aarch64_emit_mult (x2, pg, xdst, xdst);
14137 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
14139 if (iterations > 0)
14140 aarch64_emit_mult (xdst, pg, xdst, x1);
14143 if (!recp)
14145 if (pg)
14146 /* Multiply nonzero source values by the corresponding intermediate
14147 result elements, so that the final calculation is the approximate
14148 square root rather than its reciprocal. Select a zero result for
14149 zero source values, to avoid the Inf * 0 -> NaN that we'd get
14150 otherwise. */
14151 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
14152 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
14153 else
14155 /* Qualify the approximate reciprocal square root when the
14156 argument is 0.0 by squashing the intermediate result to 0.0. */
14157 rtx xtmp = gen_reg_rtx (mmsk);
14158 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
14159 gen_rtx_SUBREG (mmsk, xdst, 0)));
14160 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
14162 /* Calculate the approximate square root. */
14163 aarch64_emit_mult (xdst, pg, xdst, src);
14167 /* Finalize the approximation. */
14168 aarch64_emit_mult (dst, pg, xdst, x1);
14170 return true;
14173 /* Emit the instruction sequence to compute the approximation for the division
14174 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
14176 bool
14177 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
14179 machine_mode mode = GET_MODE (quo);
14181 if (GET_MODE_INNER (mode) == HFmode)
14182 return false;
14184 bool use_approx_division_p = (flag_mlow_precision_div
14185 || (aarch64_tune_params.approx_modes->division
14186 & AARCH64_APPROX_MODE (mode)));
14188 if (!flag_finite_math_only
14189 || flag_trapping_math
14190 || !flag_unsafe_math_optimizations
14191 || optimize_function_for_size_p (cfun)
14192 || !use_approx_division_p)
14193 return false;
14195 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
14196 return false;
14198 rtx pg = NULL_RTX;
14199 if (aarch64_sve_mode_p (mode))
14200 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
14202 /* Estimate the approximate reciprocal. */
14203 rtx xrcp = gen_reg_rtx (mode);
14204 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
14206 /* Iterate over the series twice for SF and thrice for DF. */
14207 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
14209 /* Optionally do fewer iterations for faster performance, at the cost
14210 of some accuracy. The default is 2 for DF and 1 for SF. */
14211 if (flag_mlow_precision_div)
14212 iterations = (GET_MODE_INNER (mode) == DFmode
14213 ? aarch64_double_recp_precision
14214 : aarch64_float_recp_precision);
14216 /* Iterate over the series to calculate the approximate reciprocal. */
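/* Each step computes xtmp = FRECPS (xrcp, den), i.e. 2 - xrcp * den,
   and then refines xrcp = xrcp * xtmp, which is one Newton-Raphson
   step towards 1 / den; the final multiplication is done after the
   loop. */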
14217 rtx xtmp = gen_reg_rtx (mode);
14218 while (iterations--)
14220 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
14222 if (iterations > 0)
14223 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
14226 if (num != CONST1_RTX (mode))
14228 /* As the approximate reciprocal of DEN is already calculated, only
14229 calculate the approximate division when NUM is not 1.0. */
14230 rtx xnum = force_reg (mode, num);
14231 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
14234 /* Finalize the approximation. */
14235 aarch64_emit_mult (quo, pg, xrcp, xtmp);
14236 return true;
14239 /* Return the number of instructions that can be issued per cycle. */
14240 static int
14241 aarch64_sched_issue_rate (void)
14243 return aarch64_tune_params.issue_rate;
14246 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
14247 static int
14248 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
14250 if (DEBUG_INSN_P (insn))
14251 return more;
14253 rtx_code code = GET_CODE (PATTERN (insn));
14254 if (code == USE || code == CLOBBER)
14255 return more;
14257 if (get_attr_type (insn) == TYPE_NO_INSN)
14258 return more;
14260 return more - 1;
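/* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD. */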
14263 static int
14264 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
14266 int issue_rate = aarch64_sched_issue_rate ();
14268 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
14272 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
14273 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
14274 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
14276 static int
14277 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
14278 int ready_index)
14280 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
14284 /* Vectorizer cost model target hooks. */
14286 /* Information about how the CPU would issue the scalar, Advanced SIMD
14287 or SVE version of a vector loop, using the scheme defined by the
14288 aarch64_base_vec_issue_info hierarchy of structures. */
14289 struct aarch64_vec_op_count
14291 void dump () const;
14293 /* The number of individual "general" operations. See the comments
14294 in aarch64_base_vec_issue_info for details. */
14295 unsigned int general_ops = 0;
14297 /* The number of load and store operations, under the same scheme
14298 as above. */
14299 unsigned int loads = 0;
14300 unsigned int stores = 0;
14302 /* The minimum number of cycles needed to execute all loop-carried
14303 operations, which in the vector code become associated with
14304 reductions. */
14305 unsigned int reduction_latency = 0;
14308 /* Extends aarch64_vec_op_count with SVE-specific information. */
14309 struct aarch64_sve_op_count : aarch64_vec_op_count
14311 void dump () const;
14313 /* The number of individual predicate operations. See the comments
14314 in aarch64_sve_vec_issue_info for details. */
14315 unsigned int pred_ops = 0;
14318 /* Information about vector code that we're in the process of costing. */
14319 struct aarch64_vector_costs
14321 /* The normal latency-based costs for each region (prologue, body and
14322 epilogue), indexed by vect_cost_model_location. */
14323 unsigned int region[3] = {};
14325 /* True if we have performed one-time initialization based on the vec_info.
14327 This variable exists because the vec_info is not passed to the
14328 init_cost hook. We therefore have to defer initialization based on
14329 it till later. */
14330 bool analyzed_vinfo = false;
14332 /* True if we're costing a vector loop, false if we're costing block-level
14333 vectorization. */
14334 bool is_loop = false;
14336 /* True if we've seen an SVE operation that we cannot currently vectorize
14337 using Advanced SIMD. */
14338 bool saw_sve_only_op = false;
14340 /* - If VEC_FLAGS is zero then we're costing the original scalar code.
14341 - If VEC_FLAGS & VEC_ADVSIMD is nonzero then we're costing Advanced
14342 SIMD code.
14343 - If VEC_FLAGS & VEC_ANY_SVE is nonzero then we're costing SVE code. */
14344 unsigned int vec_flags = 0;
14346 /* On some CPUs, SVE and Advanced SIMD provide the same theoretical vector
14347 throughput, such as 4x128 Advanced SIMD vs. 2x256 SVE. In those
14348 situations, we try to predict whether an Advanced SIMD implementation
14349 of the loop could be completely unrolled and become straight-line code.
14350 If so, it is generally better to use the Advanced SIMD version rather
14351 than length-agnostic SVE, since the SVE loop would execute an unknown
14352 number of times and so could not be completely unrolled in the same way.
14354 If we're applying this heuristic, UNROLLED_ADVSIMD_NITERS is the
14355 number of Advanced SIMD loop iterations that would be unrolled and
14356 UNROLLED_ADVSIMD_STMTS estimates the total number of statements
14357 in the unrolled loop. Both values are zero if we're not applying
14358 the heuristic. */
14359 unsigned HOST_WIDE_INT unrolled_advsimd_niters = 0;
14360 unsigned HOST_WIDE_INT unrolled_advsimd_stmts = 0;
14362 /* If we're vectorizing a loop that executes a constant number of times,
14363 this variable gives the number of times that the vector loop would
14364 iterate, otherwise it is zero. */
14365 uint64_t num_vector_iterations = 0;
14367 /* Used only when vectorizing loops. Estimates the number and kind of scalar
14368 operations that would be needed to perform the same work as one iteration
14369 of the vector loop. */
14370 aarch64_vec_op_count scalar_ops;
14372 /* Used only when vectorizing loops. If VEC_FLAGS & VEC_ADVSIMD,
14373 this structure estimates the number and kind of operations that the
14374 vector loop would contain. If VEC_FLAGS & VEC_SVE, the structure
14375 estimates what the equivalent Advanced SIMD-only code would need in
14376 order to perform the same work as one iteration of the SVE loop. */
14377 aarch64_vec_op_count advsimd_ops;
14379 /* Used only when vectorizing loops with SVE. It estimates the number and
14380 kind of operations that the SVE loop would contain. */
14381 aarch64_sve_op_count sve_ops;
14383 /* Used to detect cases in which we end up costing the same load twice,
14384 once to account for results that are actually used and once to account
14385 for unused results. */
14386 hash_map<nofree_ptr_hash<_stmt_vec_info>, unsigned int> seen_loads;
14389 /* Implement TARGET_VECTORIZE_INIT_COST. */
14390 void *
14391 aarch64_init_cost (class loop *, bool)
14393 return new aarch64_vector_costs;
14396 /* Return true if the current CPU should use the new costs defined
14397 in GCC 11. This should be removed for GCC 12 and above, with the
14398 costs applying to all CPUs instead. */
14399 static bool
14400 aarch64_use_new_vector_costs_p ()
14402 return (aarch64_tune_params.extra_tuning_flags
14403 & AARCH64_EXTRA_TUNE_USE_NEW_VECTOR_COSTS);
14406 /* Return the appropriate SIMD costs for vectors of type VECTYPE. */
14407 static const simd_vec_cost *
14408 aarch64_simd_vec_costs (tree vectype)
14410 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
14411 if (vectype != NULL
14412 && aarch64_sve_mode_p (TYPE_MODE (vectype))
14413 && costs->sve != NULL)
14414 return costs->sve;
14415 return costs->advsimd;
14418 /* Return the appropriate SIMD costs for vectors with VEC_* flags FLAGS. */
14419 static const simd_vec_cost *
14420 aarch64_simd_vec_costs_for_flags (unsigned int flags)
14422 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
14423 if ((flags & VEC_ANY_SVE) && costs->sve)
14424 return costs->sve;
14425 return costs->advsimd;
14428 /* Decide whether to use the unrolling heuristic described above
14429 aarch64_vector_costs::unrolled_advsimd_niters, updating that
14430 field if so. LOOP_VINFO describes the loop that we're vectorizing
14431 and COSTS are the costs that we're calculating for it. */
14432 static void
14433 aarch64_record_potential_advsimd_unrolling (loop_vec_info loop_vinfo,
14434 aarch64_vector_costs *costs)
14436 /* The heuristic only makes sense on targets that have the same
14437 vector throughput for SVE and Advanced SIMD. */
14438 if (!(aarch64_tune_params.extra_tuning_flags
14439 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT))
14440 return;
14442 /* We only want to apply the heuristic if LOOP_VINFO is being
14443 vectorized for SVE. */
14444 if (!(costs->vec_flags & VEC_ANY_SVE))
14445 return;
14447 /* Check whether it is possible in principle to use Advanced SIMD
14448 instead. */
14449 if (aarch64_autovec_preference == 2)
14450 return;
14452 /* We don't want to apply the heuristic to outer loops, since it's
14453 harder to track two levels of unrolling. */
14454 if (LOOP_VINFO_LOOP (loop_vinfo)->inner)
14455 return;
14457 /* Only handle cases in which the number of Advanced SIMD iterations
14458 would be known at compile time but the number of SVE iterations
14459 would not. */
14460 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
14461 || aarch64_sve_vg.is_constant ())
14462 return;
14464 /* Guess how many times the Advanced SIMD loop would iterate and make
14465 sure that it is within the complete unrolling limit. Even if the
14466 number of iterations is small enough, the number of statements might
14467 not be, which is why we need to estimate the number of statements too. */
14468 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
14469 unsigned int advsimd_vf = CEIL (vect_vf_for_cost (loop_vinfo), estimated_vq);
14470 unsigned HOST_WIDE_INT unrolled_advsimd_niters
14471 = LOOP_VINFO_INT_NITERS (loop_vinfo) / advsimd_vf;
14472 if (unrolled_advsimd_niters > (unsigned int) param_max_completely_peel_times)
14473 return;
14475 /* Record that we're applying the heuristic and should try to estimate
14476 the number of statements in the Advanced SIMD loop. */
14477 costs->unrolled_advsimd_niters = unrolled_advsimd_niters;
14480 /* Do one-time initialization of COSTS given that we're costing the loop
14481 vectorization described by LOOP_VINFO. */
14482 static void
14483 aarch64_analyze_loop_vinfo (loop_vec_info loop_vinfo,
14484 aarch64_vector_costs *costs)
14486 costs->is_loop = true;
14488 /* Record the number of times that the vector loop would execute,
14489 if known. */
14490 class loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
14491 auto scalar_niters = max_stmt_executions_int (loop);
14492 if (scalar_niters >= 0)
14494 unsigned int vf = vect_vf_for_cost (loop_vinfo);
14495 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
14496 costs->num_vector_iterations = scalar_niters / vf;
14497 else
14498 costs->num_vector_iterations = CEIL (scalar_niters, vf);
14501 /* Detect whether we're costing the scalar code or the vector code.
14502 This is a bit hacky: it would be better if the vectorizer told
14503 us directly.
14505 If we're costing the vector code, record whether we're vectorizing
14506 for Advanced SIMD or SVE. */
14507 if (costs == LOOP_VINFO_TARGET_COST_DATA (loop_vinfo))
14508 costs->vec_flags = aarch64_classify_vector_mode (loop_vinfo->vector_mode);
14509 else
14510 costs->vec_flags = 0;
14512 /* Detect whether we're vectorizing for SVE and should
14513 apply the unrolling heuristic described above
14514 aarch64_vector_costs::unrolled_advsimd_niters. */
14515 aarch64_record_potential_advsimd_unrolling (loop_vinfo, costs);
14517 /* Record the issue information for any SVE WHILE instructions that the
14518 loop needs. */
14519 auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
14520 if (issue_info
14521 && issue_info->sve
14522 && !LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
14524 unsigned int num_masks = 0;
14525 rgroup_controls *rgm;
14526 unsigned int num_vectors_m1;
14527 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), num_vectors_m1, rgm)
14528 if (rgm->type)
14529 num_masks += num_vectors_m1 + 1;
14530 costs->sve_ops.pred_ops += num_masks * issue_info->sve->while_pred_ops;
14534 /* Do one-time initialization of COSTS given that we're costing the block
14535 vectorization described by BB_VINFO. */
14536 static void
14537 aarch64_analyze_bb_vinfo (bb_vec_info bb_vinfo, aarch64_vector_costs *costs)
14539 /* Unfortunately, there's no easy way of telling whether we're costing
14540 the vector code or the scalar code, so just assume that we're costing
14541 the vector code. */
14542 costs->vec_flags = aarch64_classify_vector_mode (bb_vinfo->vector_mode);
14545 /* Implement targetm.vectorize.builtin_vectorization_cost. */
14546 static int
14547 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
14548 tree vectype,
14549 int misalign ATTRIBUTE_UNUSED)
14551 unsigned elements;
14552 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
14553 bool fp = false;
14555 if (vectype != NULL)
14556 fp = FLOAT_TYPE_P (vectype);
14558 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
14560 switch (type_of_cost)
14562 case scalar_stmt:
14563 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
14565 case scalar_load:
14566 return costs->scalar_load_cost;
14568 case scalar_store:
14569 return costs->scalar_store_cost;
14571 case vector_stmt:
14572 return fp ? simd_costs->fp_stmt_cost
14573 : simd_costs->int_stmt_cost;
14575 case vector_load:
14576 return simd_costs->align_load_cost;
14578 case vector_store:
14579 return simd_costs->store_cost;
14581 case vec_to_scalar:
14582 return simd_costs->vec_to_scalar_cost;
14584 case scalar_to_vec:
14585 return simd_costs->scalar_to_vec_cost;
14587 case unaligned_load:
14588 case vector_gather_load:
14589 return simd_costs->unalign_load_cost;
14591 case unaligned_store:
14592 case vector_scatter_store:
14593 return simd_costs->unalign_store_cost;
14595 case cond_branch_taken:
14596 return costs->cond_taken_branch_cost;
14598 case cond_branch_not_taken:
14599 return costs->cond_not_taken_branch_cost;
14601 case vec_perm:
14602 return simd_costs->permute_cost;
14604 case vec_promote_demote:
14605 return fp ? simd_costs->fp_stmt_cost
14606 : simd_costs->int_stmt_cost;
14608 case vec_construct:
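/* Rough estimate: one statement for every two elements, plus one,
   when building a vector from scalars. */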
14609 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
14610 return elements / 2 + 1;
14612 default:
14613 gcc_unreachable ();
14617 /* Return true if STMT_INFO represents part of a reduction. */
14618 static bool
14619 aarch64_is_reduction (stmt_vec_info stmt_info)
14621 return (STMT_VINFO_REDUC_DEF (stmt_info)
14622 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)));
14625 /* If STMT_INFO describes a reduction, return the type of reduction
14626 it describes, otherwise return -1. */
14627 static int
14628 aarch64_reduc_type (vec_info *vinfo, stmt_vec_info stmt_info)
14630 if (loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo))
14631 if (STMT_VINFO_REDUC_DEF (stmt_info))
14633 stmt_vec_info reduc_info = info_for_reduction (loop_vinfo, stmt_info);
14634 return int (STMT_VINFO_REDUC_TYPE (reduc_info));
14636 return -1;
14639 /* Check whether an access of kind KIND for STMT_INFO represents one
14640 vector of an LD[234] or ST[234] operation. Return the total number of
14641 vectors (2, 3 or 4) if so, otherwise return a value outside that range. */
14642 static int
14643 aarch64_ld234_st234_vectors (vect_cost_for_stmt kind, stmt_vec_info stmt_info)
14645 if ((kind == vector_load
14646 || kind == unaligned_load
14647 || kind == vector_store
14648 || kind == unaligned_store)
14649 && STMT_VINFO_DATA_REF (stmt_info))
14651 stmt_info = DR_GROUP_FIRST_ELEMENT (stmt_info);
14652 if (stmt_info
14653 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_LOAD_STORE_LANES)
14654 return DR_GROUP_SIZE (stmt_info);
14656 return 0;
14659 /* If STMT_INFO is a COND_EXPR that includes an embedded comparison, return the
14660 scalar type of the values being compared. Return null otherwise. */
14661 static tree
14662 aarch64_embedded_comparison_type (stmt_vec_info stmt_info)
14664 if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
14665 if (gimple_assign_rhs_code (assign) == COND_EXPR)
14667 tree cond = gimple_assign_rhs1 (assign);
14668 if (COMPARISON_CLASS_P (cond))
14669 return TREE_TYPE (TREE_OPERAND (cond, 0));
14671 return NULL_TREE;
14674 /* If STMT_INFO is a comparison or contains an embedded comparison, return the
14675 scalar type of the values being compared. Return null otherwise. */
14676 static tree
14677 aarch64_comparison_type (stmt_vec_info stmt_info)
14679 if (auto *assign = dyn_cast<gassign *> (stmt_info->stmt))
14680 if (TREE_CODE_CLASS (gimple_assign_rhs_code (assign)) == tcc_comparison)
14681 return TREE_TYPE (gimple_assign_rhs1 (assign));
14682 return aarch64_embedded_comparison_type (stmt_info);
14685 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
14686 vectors would produce a series of LDP or STP operations. KIND is the
14687 kind of statement that STMT_INFO represents. */
14688 static bool
14689 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
14690 stmt_vec_info stmt_info)
14692 switch (kind)
14694 case vector_load:
14695 case vector_store:
14696 case unaligned_load:
14697 case unaligned_store:
14698 break;
14700 default:
14701 return false;
14704 if (aarch64_tune_params.extra_tuning_flags
14705 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
14706 return false;
14708 return is_gimple_assign (stmt_info->stmt);
14711 /* Return true if STMT_INFO extends the result of a load. */
14712 static bool
14713 aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info)
14715 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
14716 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
14717 return false;
14719 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
14720 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
14721 tree rhs_type = TREE_TYPE (rhs);
14722 if (!INTEGRAL_TYPE_P (lhs_type)
14723 || !INTEGRAL_TYPE_P (rhs_type)
14724 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
14725 return false;
14727 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
14728 return (def_stmt_info
14729 && STMT_VINFO_DATA_REF (def_stmt_info)
14730 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
14733 /* Return true if STMT_INFO is an integer truncation. */
14734 static bool
14735 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
14737 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
14738 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
14739 return false;
14741 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
14742 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
14743 return (INTEGRAL_TYPE_P (lhs_type)
14744 && INTEGRAL_TYPE_P (rhs_type)
14745 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
14748 /* Return true if STMT_INFO is the second part of a two-statement multiply-add
14749 or multiply-subtract sequence that might be suitable for fusing into a
14750 single instruction. */
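/* An illustrative (hypothetical) scalar sequence for the check below:

     tmp = b * c;
     res = tmp + d;

   The statement defining RES is the second part of such a sequence,
   provided TMP is an SSA name whose definition is a MULT_EXPR classified
   as vect_internal_def.  TMP, RES, B, C and D are made-up names used only
   for this example.  */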
14751 static bool
14752 aarch64_multiply_add_p (vec_info *vinfo, stmt_vec_info stmt_info)
14754 gassign *assign = dyn_cast<gassign *> (stmt_info->stmt);
14755 if (!assign)
14756 return false;
14757 tree_code code = gimple_assign_rhs_code (assign);
14758 if (code != PLUS_EXPR && code != MINUS_EXPR)
14759 return false;
14761 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (assign))
14762 || CONSTANT_CLASS_P (gimple_assign_rhs2 (assign)))
14763 return false;
14765 for (int i = 1; i < 3; ++i)
14767 tree rhs = gimple_op (assign, i);
14768 /* ??? Should we try to check for a single use as well? */
14769 if (TREE_CODE (rhs) != SSA_NAME)
14770 continue;
14772 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
14773 if (!def_stmt_info
14774 || STMT_VINFO_DEF_TYPE (def_stmt_info) != vect_internal_def)
14775 continue;
14776 gassign *rhs_assign = dyn_cast<gassign *> (def_stmt_info->stmt);
14777 if (!rhs_assign || gimple_assign_rhs_code (rhs_assign) != MULT_EXPR)
14778 continue;
14780 return true;
14782 return false;
14785 /* Return true if the vectorized form of STMT_INFO is something that is only
14786 possible when using SVE instead of Advanced SIMD. VECTYPE is the type of
14787 the vector that STMT_INFO is operating on. */
14788 static bool
14789 aarch64_sve_only_stmt_p (stmt_vec_info stmt_info, tree vectype)
14791 if (!aarch64_sve_mode_p (TYPE_MODE (vectype)))
14792 return false;
14794 if (STMT_VINFO_DATA_REF (stmt_info))
14796 /* Check for true gathers and scatters (rather than just strided accesses
14797 that we've chosen to implement using gathers and scatters). Although
14798 in principle we could use elementwise accesses for Advanced SIMD,
14799 the vectorizer doesn't yet support that. */
14800 if (STMT_VINFO_GATHER_SCATTER_P (stmt_info))
14801 return true;
14803 /* Check for masked loads and stores. */
14804 if (auto *call = dyn_cast<gcall *> (stmt_info->stmt))
14805 if (gimple_call_internal_p (call)
14806 && internal_fn_mask_index (gimple_call_internal_fn (call)) >= 0)
14807 return true;
14810 /* Check for 64-bit integer multiplications. */
14811 auto *assign = dyn_cast<gassign *> (stmt_info->stmt);
14812 if (assign
14813 && gimple_assign_rhs_code (assign) == MULT_EXPR
14814 && GET_MODE_INNER (TYPE_MODE (vectype)) == DImode
14815 && !integer_pow2p (gimple_assign_rhs2 (assign)))
14816 return true;
14818 return false;
14821 /* We are considering implementing STMT_INFO using SVE vector type VECTYPE.
14822 If STMT_INFO is an in-loop reduction that SVE supports directly, return
14823 its latency in cycles, otherwise return zero. SVE_COSTS specifies the
14824 latencies of the relevant instructions. */
14825 static unsigned int
14826 aarch64_sve_in_loop_reduction_latency (vec_info *vinfo,
14827 stmt_vec_info stmt_info,
14828 tree vectype,
14829 const sve_vec_cost *sve_costs)
14831 switch (aarch64_reduc_type (vinfo, stmt_info))
14833 case EXTRACT_LAST_REDUCTION:
14834 return sve_costs->clast_cost;
14836 case FOLD_LEFT_REDUCTION:
14837 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
14839 case E_HFmode:
14840 case E_BFmode:
14841 return sve_costs->fadda_f16_cost;
14843 case E_SFmode:
14844 return sve_costs->fadda_f32_cost;
14846 case E_DFmode:
14847 return sve_costs->fadda_f64_cost;
14849 default:
14850 break;
14852 break;
14855 return 0;
14858 /* STMT_INFO describes a loop-carried operation in the original scalar code
14859 that we are considering implementing as a reduction. Return one of the
14860 following values, depending on VEC_FLAGS:
14862 - If VEC_FLAGS is zero, return the loop carry latency of the original
14863 scalar operation.
14865 - If VEC_FLAGS & VEC_ADVSIMD, return the loop carry latency of the
14866 Advanced SIMD implementation.
14868 - If VEC_FLAGS & VEC_ANY_SVE, return the loop carry latency of the
14869 SVE implementation.
14871 VECTYPE is the type of vector that the vectorizer is considering using
14872 for STMT_INFO, which might be different from the type of vector described
14873 by VEC_FLAGS. */
14874 static unsigned int
14875 aarch64_in_loop_reduction_latency (vec_info *vinfo, stmt_vec_info stmt_info,
14876 tree vectype, unsigned int vec_flags)
14878 const cpu_vector_cost *vec_costs = aarch64_tune_params.vec_costs;
14879 const sve_vec_cost *sve_costs = nullptr;
14880 if (vec_flags & VEC_ANY_SVE)
14881 sve_costs = aarch64_tune_params.vec_costs->sve;
14883 /* If the caller is asking for the SVE latency, check for forms of reduction
14884 that only SVE can handle directly. */
14885 if (sve_costs)
14887 unsigned int latency
14888 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype,
14889 sve_costs);
14890 if (latency)
14891 return latency;
14894 /* Handle scalar costs. */
14895 if (vec_flags == 0)
14897 if (FLOAT_TYPE_P (vectype))
14898 return vec_costs->scalar_fp_stmt_cost;
14899 return vec_costs->scalar_int_stmt_cost;
14902 /* Otherwise, the loop body just contains normal integer or FP operations,
14903 with a vector reduction outside the loop. */
14904 const simd_vec_cost *simd_costs
14905 = aarch64_simd_vec_costs_for_flags (vec_flags);
14906 if (FLOAT_TYPE_P (vectype))
14907 return simd_costs->fp_stmt_cost;
14908 return simd_costs->int_stmt_cost;
14911 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
14912 for STMT_INFO, which has cost kind KIND. If this is a scalar operation,
14913 try to subdivide the target-independent categorization provided by KIND
14914 to get a more accurate cost. */
14915 static unsigned int
14916 aarch64_detect_scalar_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
14917 stmt_vec_info stmt_info,
14918 unsigned int stmt_cost)
14920 /* Detect an extension of a loaded value. In general, we'll be able to fuse
14921 the extension with the load. */
14922 if (kind == scalar_stmt && aarch64_extending_load_p (vinfo, stmt_info))
14923 return 0;
14925 return stmt_cost;
14928 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
14929 for the vectorized form of STMT_INFO, which has cost kind KIND and which
14930 when vectorized would operate on vector type VECTYPE. Try to subdivide
14931 the target-independent categorization provided by KIND to get a more
14932 accurate cost. WHERE specifies where the cost associated with KIND
14933 occurs. */
14934 static unsigned int
14935 aarch64_detect_vector_stmt_subtype (vec_info *vinfo, vect_cost_for_stmt kind,
14936 stmt_vec_info stmt_info, tree vectype,
14937 enum vect_cost_model_location where,
14938 unsigned int stmt_cost)
14940 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
14941 const sve_vec_cost *sve_costs = nullptr;
14942 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
14943 sve_costs = aarch64_tune_params.vec_costs->sve;
14945 /* It's generally better to avoid costing inductions, since the induction
14946 will usually be hidden by other operations. This is particularly true
14947 for things like COND_REDUCTIONS. */
14948 if (is_a<gphi *> (stmt_info->stmt))
14949 return 0;
14951 /* Detect cases in which vec_to_scalar is describing the extraction of a
14952 vector element in preparation for a scalar store. The store itself is
14953 costed separately. */
14954 if (kind == vec_to_scalar
14955 && STMT_VINFO_DATA_REF (stmt_info)
14956 && DR_IS_WRITE (STMT_VINFO_DATA_REF (stmt_info)))
14957 return simd_costs->store_elt_extra_cost;
14959 /* Detect cases in which a scalar_store is really storing one element
14960 in a scatter operation. */
14961 if (kind == scalar_store
14962 && sve_costs
14963 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
14964 return sve_costs->scatter_store_elt_cost;
14966 /* Detect cases in which vec_to_scalar represents an in-loop reduction. */
14967 if (kind == vec_to_scalar
14968 && where == vect_body
14969 && sve_costs)
14971 unsigned int latency
14972 = aarch64_sve_in_loop_reduction_latency (vinfo, stmt_info, vectype,
14973 sve_costs);
14974 if (latency)
14975 return latency;
14978 /* Detect cases in which vec_to_scalar represents a single reduction
14979 instruction like FADDP or MAXV. */
14980 if (kind == vec_to_scalar
14981 && where == vect_epilogue
14982 && aarch64_is_reduction (stmt_info))
14983 switch (GET_MODE_INNER (TYPE_MODE (vectype)))
14985 case E_QImode:
14986 return simd_costs->reduc_i8_cost;
14988 case E_HImode:
14989 return simd_costs->reduc_i16_cost;
14991 case E_SImode:
14992 return simd_costs->reduc_i32_cost;
14994 case E_DImode:
14995 return simd_costs->reduc_i64_cost;
14997 case E_HFmode:
14998 case E_BFmode:
14999 return simd_costs->reduc_f16_cost;
15001 case E_SFmode:
15002 return simd_costs->reduc_f32_cost;
15004 case E_DFmode:
15005 return simd_costs->reduc_f64_cost;
15007 default:
15008 break;
15011 /* Otherwise stick with the original categorization. */
15012 return stmt_cost;
15015 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
15016 for STMT_INFO, which has cost kind KIND and which when vectorized would
15017 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
15018 targets. */
15019 static unsigned int
15020 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
15021 stmt_vec_info stmt_info, tree vectype,
15022 unsigned int stmt_cost)
15024 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
15025 vector register size or number of units. Integer promotions of this
15026 type therefore map to SXT[BHW] or UXT[BHW].
15028 Most loads have extending forms that can do the sign or zero extension
15029 on the fly. Optimistically assume that a load followed by an extension
15030 will fold to this form during combine, and that the extension therefore
15031 comes for free. */
15032 if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info))
15033 stmt_cost = 0;
15035 /* For similar reasons, vector_stmt integer truncations are a no-op,
15036 because we can just ignore the unused upper bits of the source. */
15037 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
15038 stmt_cost = 0;
15040 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
15041 but there are no equivalent instructions for SVE. This means that
15042 (all other things being equal) 128-bit SVE needs twice as many load
15043 and store instructions as Advanced SIMD in order to process vector pairs.
15045 Also, scalar code can often use LDP and STP to access pairs of values,
15046 so it is too simplistic to say that one SVE load or store replaces
15047 VF scalar loads and stores.
15049 Ideally we would account for this in the scalar and Advanced SIMD
15050 costs by making suitable load/store pairs as cheap as a single
15051 load/store. However, that would be a very invasive change and in
15052 practice it tends to stress other parts of the cost model too much.
15053 E.g. stores of scalar constants currently count just a store,
15054 whereas stores of vector constants count a store and a vec_init.
15055 This is an artificial distinction for AArch64, where stores of
15056 nonzero scalar constants need the same kind of register invariant
15057 as vector stores.
15059 An alternative would be to double the cost of any SVE loads and stores
15060 that could be paired in Advanced SIMD (and possibly also paired in
15061 scalar code). But this tends to stress other parts of the cost model
15062 in the same way. It also means that we can fall back to Advanced SIMD
15063 even if full-loop predication would have been useful.
15065 Here we go for a more conservative version: double the costs of SVE
15066 loads and stores if one iteration of the scalar loop processes enough
15067 elements for it to use a whole number of Advanced SIMD LDP or STP
15068 instructions. This makes it very likely that the VF would be 1 for
15069 Advanced SIMD, and so no epilogue should be needed. */
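/* A worked example of the check below (illustrative only): a grouped
   access of four DImode elements covers 4 * 64 = 256 bits, i.e. exactly
   two Advanced SIMD Q registers and hence one LDP or STP, so
   multiple_p (count * elt_bits, 256) holds and the SVE cost is doubled.
   A group of three SImode elements covers only 96 bits and is left
   unchanged.  */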
15070 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
15072 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
15073 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
15074 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
15075 if (multiple_p (count * elt_bits, 256)
15076 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
15077 stmt_cost *= 2;
15080 return stmt_cost;
15083 /* STMT_COST is the cost calculated for STMT_INFO, which has cost kind KIND
15084 and which when vectorized would operate on vector type VECTYPE. Add the
15085 cost of any embedded operations. */
15086 static unsigned int
15087 aarch64_adjust_stmt_cost (vect_cost_for_stmt kind, stmt_vec_info stmt_info,
15088 tree vectype, unsigned int stmt_cost)
15090 if (vectype)
15092 const simd_vec_cost *simd_costs = aarch64_simd_vec_costs (vectype);
15094 /* Detect cases in which a vector load or store represents an
15095 LD[234] or ST[234] instruction. */
15096 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
15098 case 2:
15099 stmt_cost += simd_costs->ld2_st2_permute_cost;
15100 break;
15102 case 3:
15103 stmt_cost += simd_costs->ld3_st3_permute_cost;
15104 break;
15106 case 4:
15107 stmt_cost += simd_costs->ld4_st4_permute_cost;
15108 break;
15111 if (kind == vector_stmt || kind == vec_to_scalar)
15112 if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info))
15114 if (FLOAT_TYPE_P (cmp_type))
15115 stmt_cost += simd_costs->fp_stmt_cost;
15116 else
15117 stmt_cost += simd_costs->int_stmt_cost;
15121 if (kind == scalar_stmt)
15122 if (tree cmp_type = aarch64_embedded_comparison_type (stmt_info))
15124 if (FLOAT_TYPE_P (cmp_type))
15125 stmt_cost += aarch64_tune_params.vec_costs->scalar_fp_stmt_cost;
15126 else
15127 stmt_cost += aarch64_tune_params.vec_costs->scalar_int_stmt_cost;
15130 return stmt_cost;
15133 /* VINFO, COSTS, COUNT, KIND, STMT_INFO and VECTYPE are the same as for
15134 TARGET_VECTORIZE_ADD_STMT_COST and they describe an operation in the
15135 body of a vector loop. Record issue information relating to the vector
15136 operation in OPS, where OPS is one of COSTS->scalar_ops, COSTS->advsimd_ops
15137 or COSTS->sve_ops; see the comments above those variables for details.
15138 In addition:
15140 - VEC_FLAGS is zero if OPS is COSTS->scalar_ops.
15142 - VEC_FLAGS & VEC_ADVSIMD is nonzero if OPS is COSTS->advsimd_ops.
15144 - VEC_FLAGS & VEC_ANY_SVE is nonzero if OPS is COSTS->sve_ops.
15146 ISSUE_INFO provides the scalar, Advanced SIMD or SVE issue information
15147 associated with OPS and VEC_FLAGS. FACTOR says how many iterations of
15148 the loop described by VEC_FLAGS would be needed to match one iteration
15149 of the vector loop in VINFO. */
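/* For example, aarch64_add_stmt_cost below records the scalar estimate
   with FACTOR == vect_nunits_for_cost (vectype), since that many scalar
   iterations correspond to one vector iteration, while the Advanced SIMD
   estimate for an SVE loop uses FACTOR == aarch64_estimated_sve_vq ().  */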
15150 static void
15151 aarch64_count_ops (class vec_info *vinfo, aarch64_vector_costs *costs,
15152 unsigned int count, enum vect_cost_for_stmt kind,
15153 _stmt_vec_info *stmt_info, tree vectype,
15154 unsigned int vec_flags, aarch64_vec_op_count *ops,
15155 const aarch64_base_vec_issue_info *issue_info,
15156 unsigned int factor)
15158 if (!issue_info)
15159 return;
15161 const aarch64_simd_vec_issue_info *simd_issue = nullptr;
15162 if (vec_flags)
15163 simd_issue = static_cast<const aarch64_simd_vec_issue_info *> (issue_info);
15165 const aarch64_sve_vec_issue_info *sve_issue = nullptr;
15166 if (vec_flags & VEC_ANY_SVE)
15167 sve_issue = static_cast<const aarch64_sve_vec_issue_info *> (issue_info);
15169 /* Calculate the minimum cycles per iteration imposed by a reduction
15170 operation. */
15171 if ((kind == vector_stmt || kind == vec_to_scalar)
15172 && aarch64_is_reduction (stmt_info))
15174 unsigned int base
15175 = aarch64_in_loop_reduction_latency (vinfo, stmt_info, vectype,
15176 vec_flags);
15177 if (aarch64_reduc_type (vinfo, stmt_info) == FOLD_LEFT_REDUCTION)
15179 if (aarch64_sve_mode_p (TYPE_MODE (vectype)))
15181 /* When costing an SVE FADDA, the vectorizer treats vec_to_scalar
15182 as a single operation, whereas for Advanced SIMD it is a
15183 per-element one. Increase the factor accordingly, both for
15184 the reduction_latency calculation and for the op counting. */
15185 if (vec_flags & VEC_ADVSIMD)
15186 factor = vect_nunits_for_cost (vectype);
15188 else
15189 /* An Advanced SIMD fold-left reduction is the same as a
15190 scalar one and the vectorizer therefore treats vec_to_scalar
15191 as a per-element cost. There is no extra factor to apply for
15192 scalar code, either for reduction_latency or for the op
15193 counting below. */
15194 factor = 1;
15197 /* ??? Ideally for vector code we'd do COUNT * FACTOR reductions in
15198 parallel, but unfortunately that's not yet the case. */
15199 ops->reduction_latency = MAX (ops->reduction_latency,
15200 base * count * factor);
15203 /* Assume that multiply-adds will become a single operation. */
15204 if (stmt_info && aarch64_multiply_add_p (vinfo, stmt_info))
15205 return;
15207 /* When costing scalar statements in vector code, the count already
15208 includes the number of scalar elements in the vector, so we don't
15209 need to apply the factor as well. */
15210 if (kind == scalar_load || kind == scalar_store || kind == scalar_stmt)
15211 factor = 1;
15213 /* This can go negative with the load handling below. */
15214 int num_copies = count * factor;
15216 /* Count the basic operation cost associated with KIND. */
15217 switch (kind)
15219 case cond_branch_taken:
15220 case cond_branch_not_taken:
15221 case vector_gather_load:
15222 case vector_scatter_store:
15223 /* We currently don't expect these to be used in a loop body. */
15224 break;
15226 case vec_perm:
15227 case vec_promote_demote:
15228 case vec_construct:
15229 case vec_to_scalar:
15230 case scalar_to_vec:
15231 /* Assume that these operations have no overhead in the original
15232 scalar code. */
15233 if (!vec_flags)
15234 break;
15235 /* Fallthrough. */
15236 case vector_stmt:
15237 case scalar_stmt:
15238 ops->general_ops += num_copies;
15239 break;
15241 case scalar_load:
15242 case vector_load:
15243 case unaligned_load:
15244 /* When costing scalars, detect cases in which we are called twice for
15245 the same load. This happens for LD[234] operations if only some of
15246 the results are used. The first time represents the cost of loading
15247 the unused vectors, while the second time represents the cost of
15248 loading the useful parts. Only the latter should count towards the
15249 scalar costs. */
15250 if (stmt_info && !vec_flags)
15252 bool existed = false;
15253 unsigned int &prev_count
15254 = costs->seen_loads.get_or_insert (stmt_info, &existed);
15255 if (existed)
15256 num_copies -= prev_count;
15257 else
15258 prev_count = num_copies;
15260 ops->loads += num_copies;
15261 if (vec_flags || FLOAT_TYPE_P (vectype))
15262 ops->general_ops += issue_info->fp_simd_load_general_ops * num_copies;
15263 break;
15265 case vector_store:
15266 case unaligned_store:
15267 case scalar_store:
15268 ops->stores += num_copies;
15269 if (vec_flags || FLOAT_TYPE_P (vectype))
15270 ops->general_ops += issue_info->fp_simd_store_general_ops * num_copies;
15271 break;
15274 /* Add any embedded comparison operations. */
15275 if ((kind == scalar_stmt || kind == vector_stmt || kind == vec_to_scalar)
15276 && aarch64_embedded_comparison_type (stmt_info))
15277 ops->general_ops += num_copies;
15279 /* Detect COND_REDUCTIONs and things that would need to become
15280 COND_REDUCTIONs if they were implemented using Advanced SIMD.
15281 There are then two sets of VEC_COND_EXPRs, whereas so far we
15282 have only accounted for one. */
15283 if (vec_flags && (kind == vector_stmt || kind == vec_to_scalar))
15285 int reduc_type = aarch64_reduc_type (vinfo, stmt_info);
15286 if ((reduc_type == EXTRACT_LAST_REDUCTION && (vec_flags & VEC_ADVSIMD))
15287 || reduc_type == COND_REDUCTION)
15288 ops->general_ops += num_copies;
15291 /* Count the predicate operations needed by an SVE comparison. */
15292 if (sve_issue && (kind == vector_stmt || kind == vec_to_scalar))
15293 if (tree type = aarch64_comparison_type (stmt_info))
15295 unsigned int base = (FLOAT_TYPE_P (type)
15296 ? sve_issue->fp_cmp_pred_ops
15297 : sve_issue->int_cmp_pred_ops);
15298 costs->sve_ops.pred_ops += base * num_copies;
15301 /* Add any extra overhead associated with LD[234] and ST[234] operations. */
15302 if (simd_issue)
15303 switch (aarch64_ld234_st234_vectors (kind, stmt_info))
15305 case 2:
15306 ops->general_ops += simd_issue->ld2_st2_general_ops * num_copies;
15307 break;
15309 case 3:
15310 ops->general_ops += simd_issue->ld3_st3_general_ops * num_copies;
15311 break;
15313 case 4:
15314 ops->general_ops += simd_issue->ld4_st4_general_ops * num_copies;
15315 break;
15318 /* Add any overhead associated with gather loads and scatter stores. */
15319 if (sve_issue
15320 && (kind == scalar_load || kind == scalar_store)
15321 && STMT_VINFO_MEMORY_ACCESS_TYPE (stmt_info) == VMAT_GATHER_SCATTER)
15323 unsigned int pairs = CEIL (count, 2);
15324 costs->sve_ops.pred_ops
15325 += sve_issue->gather_scatter_pair_pred_ops * pairs;
15326 ops->general_ops += sve_issue->gather_scatter_pair_general_ops * pairs;
15330 /* Implement targetm.vectorize.add_stmt_cost. */
15331 static unsigned
15332 aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
15333 enum vect_cost_for_stmt kind,
15334 struct _stmt_vec_info *stmt_info, tree vectype,
15335 int misalign, enum vect_cost_model_location where)
15337 auto *costs = static_cast<aarch64_vector_costs *> (data);
15338 unsigned retval = 0;
15340 if (flag_vect_cost_model)
15342 int stmt_cost
15343 = aarch64_builtin_vectorization_cost (kind, vectype, misalign);
15345 /* Do one-time initialization based on the vinfo. */
15346 loop_vec_info loop_vinfo = dyn_cast<loop_vec_info> (vinfo);
15347 bb_vec_info bb_vinfo = dyn_cast<bb_vec_info> (vinfo);
15348 if (!costs->analyzed_vinfo && aarch64_use_new_vector_costs_p ())
15350 if (loop_vinfo)
15351 aarch64_analyze_loop_vinfo (loop_vinfo, costs);
15352 else
15353 aarch64_analyze_bb_vinfo (bb_vinfo, costs);
15354 costs->analyzed_vinfo = true;
15357 /* Try to get a more accurate cost by looking at STMT_INFO instead
15358 of just looking at KIND. */
15359 if (stmt_info && aarch64_use_new_vector_costs_p ())
15361 if (vectype && aarch64_sve_only_stmt_p (stmt_info, vectype))
15362 costs->saw_sve_only_op = true;
15364 stmt_cost = aarch64_detect_scalar_stmt_subtype
15365 (vinfo, kind, stmt_info, stmt_cost);
15367 if (vectype && costs->vec_flags)
15368 stmt_cost = aarch64_detect_vector_stmt_subtype (vinfo, kind,
15369 stmt_info, vectype,
15370 where, stmt_cost);
15373 /* Do any SVE-specific adjustments to the cost. */
15374 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
15375 stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
15376 vectype, stmt_cost);
15378 if (stmt_info && aarch64_use_new_vector_costs_p ())
15380 /* Account for any extra "embedded" costs that apply additively
15381 to the base cost calculated above. */
15382 stmt_cost = aarch64_adjust_stmt_cost (kind, stmt_info, vectype,
15383 stmt_cost);
15385 /* If we're recording a nonzero vector loop body cost, also estimate
15386 the operations that would need to be issued by all relevant
15387 implementations of the loop. */
15388 auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
15389 if (loop_vinfo
15390 && issue_info
15391 && costs->vec_flags
15392 && where == vect_body
15393 && vectype
15394 && stmt_cost != 0)
15396 /* Record estimates for the scalar code. */
15397 aarch64_count_ops (vinfo, costs, count, kind, stmt_info, vectype,
15398 0, &costs->scalar_ops, issue_info->scalar,
15399 vect_nunits_for_cost (vectype));
15401 if (aarch64_sve_mode_p (vinfo->vector_mode) && issue_info->sve)
15403 /* Record estimates for a possible Advanced SIMD version
15404 of the SVE code. */
15405 aarch64_count_ops (vinfo, costs, count, kind, stmt_info,
15406 vectype, VEC_ADVSIMD, &costs->advsimd_ops,
15407 issue_info->advsimd,
15408 aarch64_estimated_sve_vq ());
15410 /* Record estimates for the SVE code itself. */
15411 aarch64_count_ops (vinfo, costs, count, kind, stmt_info,
15412 vectype, VEC_ANY_SVE, &costs->sve_ops,
15413 issue_info->sve, 1);
15415 else
15416 /* Record estimates for the Advanced SIMD code. Treat SVE like
15417 Advanced SIMD if the CPU has no specific SVE costs. */
15418 aarch64_count_ops (vinfo, costs, count, kind, stmt_info,
15419 vectype, VEC_ADVSIMD, &costs->advsimd_ops,
15420 issue_info->advsimd, 1);
15423 /* If we're applying the SVE vs. Advanced SIMD unrolling heuristic,
15424 estimate the number of statements in the unrolled Advanced SIMD
15425 loop. For simplicity, we assume that one iteration of the
15426 Advanced SIMD loop would need the same number of statements
15427 as one iteration of the SVE loop. */
15428 if (where == vect_body && costs->unrolled_advsimd_niters)
15429 costs->unrolled_advsimd_stmts
15430 += count * costs->unrolled_advsimd_niters;
15433 /* Statements in an inner loop relative to the loop being
15434 vectorized are weighted more heavily. The value here is
15435 arbitrary and could potentially be improved with analysis. */
15436 if (where == vect_body && stmt_info
15437 && stmt_in_inner_loop_p (vinfo, stmt_info))
15439 gcc_assert (loop_vinfo);
15440 count *= LOOP_VINFO_INNER_LOOP_COST_FACTOR (loop_vinfo); /* FIXME */
15443 retval = (unsigned) (count * stmt_cost);
15444 costs->region[where] += retval;
15447 return retval;
15450 /* Dump information about the structure. */
15451 void
15452 aarch64_vec_op_count::dump () const
15454 dump_printf_loc (MSG_NOTE, vect_location,
15455 " load operations = %d\n", loads);
15456 dump_printf_loc (MSG_NOTE, vect_location,
15457 " store operations = %d\n", stores);
15458 dump_printf_loc (MSG_NOTE, vect_location,
15459 " general operations = %d\n", general_ops);
15460 dump_printf_loc (MSG_NOTE, vect_location,
15461 " reduction latency = %d\n", reduction_latency);
15464 /* Dump information about the structure. */
15465 void
15466 aarch64_sve_op_count::dump () const
15468 aarch64_vec_op_count::dump ();
15469 dump_printf_loc (MSG_NOTE, vect_location,
15470 " predicate operations = %d\n", pred_ops);
15473 /* Use ISSUE_INFO to estimate the minimum number of cycles needed to issue
15474 the operations described by OPS. This is a very simplistic model! */
15475 static unsigned int
15476 aarch64_estimate_min_cycles_per_iter
15477 (const aarch64_vec_op_count *ops,
15478 const aarch64_base_vec_issue_info *issue_info)
15480 unsigned int cycles = MAX (ops->reduction_latency, 1);
15481 cycles = MAX (cycles, CEIL (ops->stores, issue_info->stores_per_cycle));
15482 cycles = MAX (cycles, CEIL (ops->loads + ops->stores,
15483 issue_info->loads_stores_per_cycle));
15484 cycles = MAX (cycles, CEIL (ops->general_ops,
15485 issue_info->general_ops_per_cycle));
15486 return cycles;
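/* A worked example of the estimate above, using made-up counts and issue
   rates: if OPS records 4 loads, 2 stores, 6 general ops and a reduction
   latency of 2, and ISSUE_INFO allows 2 stores, 3 loads+stores and 4
   general ops per cycle, the result is
   MAX (2, CEIL (2, 2), CEIL (4 + 2, 3), CEIL (6, 4)) = 2 cycles per
   iteration.  */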
15489 /* BODY_COST is the cost of a vector loop body recorded in COSTS.
15490 Adjust the cost as necessary and return the new cost. */
15491 static unsigned int
15492 aarch64_adjust_body_cost (aarch64_vector_costs *costs, unsigned int body_cost)
15494 unsigned int orig_body_cost = body_cost;
15495 bool should_disparage = false;
15497 if (dump_enabled_p ())
15498 dump_printf_loc (MSG_NOTE, vect_location,
15499 "Original vector body cost = %d\n", body_cost);
15501 if (costs->unrolled_advsimd_stmts)
15503 if (dump_enabled_p ())
15504 dump_printf_loc (MSG_NOTE, vect_location, "Number of insns in"
15505 " unrolled Advanced SIMD loop = %d\n",
15506 costs->unrolled_advsimd_stmts);
15508 /* Apply the Advanced SIMD vs. SVE unrolling heuristic described above
15509 aarch64_vector_costs::unrolled_advsimd_niters.
15511 The balance here is tricky. On the one hand, we can't be sure whether
15512 the code is vectorizable with Advanced SIMD or not. However, even if
15513 it isn't vectorizable with Advanced SIMD, there's a possibility that
15514 the scalar code could also be unrolled. Some of the code might then
15515 benefit from SLP, or from using LDP and STP. We therefore apply
15516 the heuristic regardless of can_use_advsimd_p. */
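/* For example, on a hypothetical 256-bit SVE implementation
   (aarch64_estimated_sve_vq () == 2), the minimum cost below is
   orig_body_cost * 2 + 1.  */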
15517 if (costs->unrolled_advsimd_stmts
15518 && (costs->unrolled_advsimd_stmts
15519 <= (unsigned int) param_max_completely_peeled_insns))
15521 unsigned int estimated_vq = aarch64_estimated_sve_vq ();
15522 unsigned int min_cost = (orig_body_cost * estimated_vq) + 1;
15523 if (body_cost < min_cost)
15525 if (dump_enabled_p ())
15526 dump_printf_loc (MSG_NOTE, vect_location,
15527 "Increasing body cost to %d to account for"
15528 " unrolling\n", min_cost);
15529 body_cost = min_cost;
15530 should_disparage = true;
15535 auto *issue_info = aarch64_tune_params.vec_costs->issue_info;
15536 if (!issue_info)
15537 return body_cost;
15539 unsigned int scalar_cycles_per_iter
15540 = aarch64_estimate_min_cycles_per_iter (&costs->scalar_ops,
15541 issue_info->scalar);
15542 unsigned int advsimd_cycles_per_iter
15543 = aarch64_estimate_min_cycles_per_iter (&costs->advsimd_ops,
15544 issue_info->advsimd);
15545 bool could_use_advsimd
15546 = ((costs->vec_flags & VEC_ADVSIMD)
15547 || (aarch64_autovec_preference != 2
15548 && (aarch64_tune_params.extra_tuning_flags
15549 & AARCH64_EXTRA_TUNE_MATCHED_VECTOR_THROUGHPUT)
15550 && !costs->saw_sve_only_op));
15552 if (dump_enabled_p ())
15554 if (IN_RANGE (costs->num_vector_iterations, 0, 65536))
15555 dump_printf_loc (MSG_NOTE, vect_location,
15556 "Vector loop iterates at most %wd times\n",
15557 costs->num_vector_iterations);
15558 dump_printf_loc (MSG_NOTE, vect_location, "Scalar issue estimate:\n");
15559 costs->scalar_ops.dump ();
15560 dump_printf_loc (MSG_NOTE, vect_location,
15561 " estimated cycles per iteration = %d\n",
15562 scalar_cycles_per_iter);
15563 if (could_use_advsimd)
15565 dump_printf_loc (MSG_NOTE, vect_location,
15566 "Advanced SIMD issue estimate:\n");
15567 costs->advsimd_ops.dump ();
15568 dump_printf_loc (MSG_NOTE, vect_location,
15569 " estimated cycles per iteration = %d\n",
15570 advsimd_cycles_per_iter);
15572 else
15573 dump_printf_loc (MSG_NOTE, vect_location,
15574 "Loop could not use Advanced SIMD\n");
15577 uint64_t vector_cycles_per_iter = advsimd_cycles_per_iter;
15578 unsigned int vector_reduction_latency = costs->advsimd_ops.reduction_latency;
15579 if ((costs->vec_flags & VEC_ANY_SVE) && issue_info->sve)
15581 /* Estimate the minimum number of cycles per iteration needed to issue
15582 non-predicate operations. */
15583 unsigned int sve_cycles_per_iter
15584 = aarch64_estimate_min_cycles_per_iter (&costs->sve_ops,
15585 issue_info->sve);
15587 /* Separately estimate the minimum number of cycles per iteration needed
15588 to issue the predicate operations. */
15589 unsigned int pred_cycles_per_iter
15590 = CEIL (costs->sve_ops.pred_ops, issue_info->sve->pred_ops_per_cycle);
15592 if (dump_enabled_p ())
15594 dump_printf_loc (MSG_NOTE, vect_location, "SVE issue estimate:\n");
15595 costs->sve_ops.dump ();
15596 dump_printf_loc (MSG_NOTE, vect_location,
15597 " estimated cycles per iteration for non-predicate"
15598 " operations = %d\n", sve_cycles_per_iter);
15599 if (costs->sve_ops.pred_ops)
15600 dump_printf_loc (MSG_NOTE, vect_location, " estimated cycles per"
15601 " iteration for predicate operations = %d\n",
15602 pred_cycles_per_iter);
15605 vector_cycles_per_iter = MAX (sve_cycles_per_iter, pred_cycles_per_iter);
15606 vector_reduction_latency = costs->sve_ops.reduction_latency;
15608 /* If the scalar version of the loop could issue at least as
15609 quickly as the predicate parts of the SVE loop, make the SVE loop
15610 prohibitively expensive. In this case vectorization is adding an
15611 overhead that the original scalar code didn't have.
15613 This is mostly intended to detect cases in which WHILELOs dominate
15614 for very tight loops, which is something that normal latency-based
15615 costs would not model. Adding this kind of cliffedge would be
15616 too drastic for scalar_cycles_per_iter vs. sve_cycles_per_iter;
15617 code later in the function handles that case in a more
15618 conservative way. */
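/* Illustrative numbers: with 3 predicate operations per iteration and
   pred_ops_per_cycle == 2, pred_cycles_per_iter is CEIL (3, 2) == 2.
   If the scalar loop is also estimated at 2 cycles per iteration
   (i.e. less than 2 + 1), the SVE body cost is raised to at least
   orig_body_cost * 16 on a 128-bit SVE target, since
   estimated_poly_value (BYTES_PER_SVE_VECTOR) is 16 there.  */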
15619 uint64_t sve_estimate = pred_cycles_per_iter + 1;
15620 if (scalar_cycles_per_iter < sve_estimate)
15622 unsigned int min_cost
15623 = orig_body_cost * estimated_poly_value (BYTES_PER_SVE_VECTOR);
15624 if (body_cost < min_cost)
15626 if (dump_enabled_p ())
15627 dump_printf_loc (MSG_NOTE, vect_location,
15628 "Increasing body cost to %d because the"
15629 " scalar code could issue within the limit"
15630 " imposed by predicate operations\n",
15631 min_cost);
15632 body_cost = min_cost;
15633 should_disparage = true;
15637 /* If it appears that the Advanced SIMD version of a loop could issue
15638 more quickly than the SVE one, increase the SVE cost in proportion
15639 to the difference. The intention is to make Advanced SIMD preferable
15640 in cases where an Advanced SIMD version exists, without increasing
15641 the costs so much that SVE won't be used at all.
15643 The reasoning is similar to the scalar vs. predicate comparison above:
15644 if the issue rate of the SVE code is limited by predicate operations
15645 (i.e. if pred_cycles_per_iter > sve_cycles_per_iter), and if the
15646 Advanced SIMD code could issue within the limit imposed by the
15647 predicate operations, the predicate operations are adding an
15648 overhead that the original code didn't have and so we should prefer
15649 the Advanced SIMD version. However, if the predicate operations
15650 do not dominate in this way, we should only increase the cost of
15651 the SVE code if sve_cycles_per_iter is strictly greater than
15652 advsimd_cycles_per_iter. Given rounding effects, this should mean
15653 that Advanced SIMD is either better or at least no worse. */
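/* For example (illustrative numbers only): if sve_estimate is 5 cycles
   and advsimd_cycles_per_iter is 2, the SVE body cost is raised to at
   least orig_body_cost * CEIL (5, 2) + 1 == orig_body_cost * 3 + 1.  */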
15654 if (sve_cycles_per_iter >= pred_cycles_per_iter)
15655 sve_estimate = sve_cycles_per_iter;
15656 if (could_use_advsimd && advsimd_cycles_per_iter < sve_estimate)
15658 /* This ensures that min_cost > orig_body_cost * 2. */
15659 unsigned int min_cost
15660 = orig_body_cost * CEIL (sve_estimate, advsimd_cycles_per_iter) + 1;
15661 if (body_cost < min_cost)
15663 if (dump_enabled_p ())
15664 dump_printf_loc (MSG_NOTE, vect_location,
15665 "Increasing body cost to %d because Advanced"
15666 " SIMD code could issue as quickly\n",
15667 min_cost);
15668 body_cost = min_cost;
15669 should_disparage = true;
15674 /* Decide whether to stick to latency-based costs or whether to try to
15675 take issue rates into account. */
15676 unsigned int threshold = aarch64_loop_vect_issue_rate_niters;
15677 if (costs->vec_flags & VEC_ANY_SVE)
15678 threshold = CEIL (threshold, aarch64_estimated_sve_vq ());
15680 if (costs->num_vector_iterations >= 1
15681 && costs->num_vector_iterations < threshold)
15683 if (dump_enabled_p ())
15684 dump_printf_loc (MSG_NOTE, vect_location,
15685 "Low iteration count, so using pure latency"
15686 " costs\n");
15688 /* Increase the cost of the vector code if it looks like the scalar code
15689 could issue more quickly. These values are only rough estimates,
15690 so minor differences should only result in minor changes. */
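/* E.g. with illustrative estimates of 4 scalar and 6 vector cycles per
   iteration, the body cost is scaled by CEIL (body_cost * 6, 4),
   i.e. multiplied by roughly 1.5.  */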
15691 else if (scalar_cycles_per_iter < vector_cycles_per_iter)
15693 body_cost = CEIL (body_cost * vector_cycles_per_iter,
15694 scalar_cycles_per_iter);
15695 if (dump_enabled_p ())
15696 dump_printf_loc (MSG_NOTE, vect_location,
15697 "Increasing body cost to %d because scalar code"
15698 " would issue more quickly\n", body_cost);
15700 /* In general, it's expected that the proposed vector code would be able
15701 to issue more quickly than the original scalar code. This should
15702 already be reflected to some extent in the latency-based costs.
15704 However, the latency-based costs effectively assume that the scalar
15705 code and the vector code execute serially, which tends to underplay
15706 one important case: if the real (non-serialized) execution time of
15707 a scalar iteration is dominated by loop-carried dependencies,
15708 and if the vector code is able to reduce both the length of
15709 the loop-carried dependencies *and* the number of cycles needed
15710 to issue the code in general, we can be more confident that the
15711 vector code is an improvement, even if adding the other (non-loop-carried)
15712 latencies tends to hide this saving. We therefore reduce the cost of the
15713 vector loop body in proportion to the saving. */
15714 else if (costs->scalar_ops.reduction_latency > vector_reduction_latency
15715 && costs->scalar_ops.reduction_latency == scalar_cycles_per_iter
15716 && scalar_cycles_per_iter > vector_cycles_per_iter
15717 && !should_disparage)
15719 body_cost = CEIL (body_cost * vector_cycles_per_iter,
15720 scalar_cycles_per_iter);
15721 if (dump_enabled_p ())
15722 dump_printf_loc (MSG_NOTE, vect_location,
15723 "Decreasing body cost to %d account for smaller"
15724 " reduction latency\n", body_cost);
15727 return body_cost;
15730 /* Implement TARGET_VECTORIZE_FINISH_COST. */
15731 static void
15732 aarch64_finish_cost (void *data, unsigned *prologue_cost,
15733 unsigned *body_cost, unsigned *epilogue_cost)
15735 auto *costs = static_cast<aarch64_vector_costs *> (data);
15736 *prologue_cost = costs->region[vect_prologue];
15737 *body_cost = costs->region[vect_body];
15738 *epilogue_cost = costs->region[vect_epilogue];
15740 if (costs->is_loop
15741 && costs->vec_flags
15742 && aarch64_use_new_vector_costs_p ())
15743 *body_cost = aarch64_adjust_body_cost (costs, *body_cost);
15746 /* Implement TARGET_VECTORIZE_DESTROY_COST_DATA. */
15747 static void
15748 aarch64_destroy_cost_data (void *data)
15750 delete static_cast<aarch64_vector_costs *> (data);
15753 static void initialize_aarch64_code_model (struct gcc_options *);
15755 /* Parse the TO_PARSE string and put the architecture struct that it
15756 selects into RES and the architectural features into ISA_FLAGS.
15757 Return an aarch64_parse_opt_result describing the parse result.
15758 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
15759 When the TO_PARSE string contains an invalid extension,
15760 a copy of the string is created and stored to INVALID_EXTENSION. */
15762 static enum aarch64_parse_opt_result
15763 aarch64_parse_arch (const char *to_parse, const struct processor **res,
15764 uint64_t *isa_flags, std::string *invalid_extension)
15766 const char *ext;
15767 const struct processor *arch;
15768 size_t len;
15770 ext = strchr (to_parse, '+');
15772 if (ext != NULL)
15773 len = ext - to_parse;
15774 else
15775 len = strlen (to_parse);
15777 if (len == 0)
15778 return AARCH64_PARSE_MISSING_ARG;
15781 /* Loop through the list of supported ARCHes to find a match. */
15782 for (arch = all_architectures; arch->name != NULL; arch++)
15784 if (strlen (arch->name) == len
15785 && strncmp (arch->name, to_parse, len) == 0)
15787 uint64_t isa_temp = arch->flags;
15789 if (ext != NULL)
15791 /* TO_PARSE string contains at least one extension. */
15792 enum aarch64_parse_opt_result ext_res
15793 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
15795 if (ext_res != AARCH64_PARSE_OK)
15796 return ext_res;
15798 /* Extension parsing was successful. Confirm the result
15799 arch and ISA flags. */
15800 *res = arch;
15801 *isa_flags = isa_temp;
15802 return AARCH64_PARSE_OK;
15806 /* ARCH name not found in list. */
15807 return AARCH64_PARSE_INVALID_ARG;
15810 /* Parse the TO_PARSE string and put the result tuning in RES and the
15811 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
15812 describing the parse result. If there is an error parsing, RES and
15813 ISA_FLAGS are left unchanged.
15814 When the TO_PARSE string contains an invalid extension,
15815 a copy of the string is created and stored to INVALID_EXTENSION. */
15817 static enum aarch64_parse_opt_result
15818 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
15819 uint64_t *isa_flags, std::string *invalid_extension)
15821 const char *ext;
15822 const struct processor *cpu;
15823 size_t len;
15825 ext = strchr (to_parse, '+');
15827 if (ext != NULL)
15828 len = ext - to_parse;
15829 else
15830 len = strlen (to_parse);
15832 if (len == 0)
15833 return AARCH64_PARSE_MISSING_ARG;
15836 /* Loop through the list of supported CPUs to find a match. */
15837 for (cpu = all_cores; cpu->name != NULL; cpu++)
15839 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
15841 uint64_t isa_temp = cpu->flags;
15844 if (ext != NULL)
15846 /* TO_PARSE string contains at least one extension. */
15847 enum aarch64_parse_opt_result ext_res
15848 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
15850 if (ext_res != AARCH64_PARSE_OK)
15851 return ext_res;
15853 /* Extension parsing was successful. Confirm the result
15854 cpu and ISA flags. */
15855 *res = cpu;
15856 *isa_flags = isa_temp;
15857 return AARCH64_PARSE_OK;
15861 /* CPU name not found in list. */
15862 return AARCH64_PARSE_INVALID_ARG;
15865 /* Parse the TO_PARSE string and put the cpu it selects into RES.
15866 Return an aarch64_parse_opt_result describing the parse result.
15867 If the parsing fails, RES is left unchanged. */
15869 static enum aarch64_parse_opt_result
15870 aarch64_parse_tune (const char *to_parse, const struct processor **res)
15872 const struct processor *cpu;
15874 /* Loop through the list of supported CPUs to find a match. */
15875 for (cpu = all_cores; cpu->name != NULL; cpu++)
15877 if (strcmp (cpu->name, to_parse) == 0)
15879 *res = cpu;
15880 return AARCH64_PARSE_OK;
15884 /* CPU name not found in list. */
15885 return AARCH64_PARSE_INVALID_ARG;
15888 /* Parse TOKEN, which has length LENGTH, to see if it is an option
15889 described in FLAG. If it is, return the index bit for that fusion type.
15890 If not, error (printing OPTION_NAME) and return zero. */
15892 static unsigned int
15893 aarch64_parse_one_option_token (const char *token,
15894 size_t length,
15895 const struct aarch64_flag_desc *flag,
15896 const char *option_name)
15898 for (; flag->name != NULL; flag++)
15900 if (length == strlen (flag->name)
15901 && !strncmp (flag->name, token, length))
15902 return flag->flag;
15905 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
15906 return 0;
15909 /* Parse OPTION which is a comma-separated list of flags to enable.
15910 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
15911 default state we inherit from the CPU tuning structures. OPTION_NAME
15912 gives the top-level option we are parsing in the -moverride string,
15913 for use in error messages. */
15915 static unsigned int
15916 aarch64_parse_boolean_options (const char *option,
15917 const struct aarch64_flag_desc *flags,
15918 unsigned int initial_state,
15919 const char *option_name)
15921 const char separator = '.';
15922 const char* specs = option;
15923 const char* ntoken = option;
15924 unsigned int found_flags = initial_state;
15926 while ((ntoken = strchr (specs, separator)))
15928 size_t token_length = ntoken - specs;
15929 unsigned token_ops = aarch64_parse_one_option_token (specs,
15930 token_length,
15931 flags,
15932 option_name);
15933 /* If we find "none" (or, for simplicity's sake, an error) anywhere
15934 in the token stream, reset the supported operations. So:
15936 adrp+add.cmp+branch.none.adrp+add
15938 would have the result of turning on only adrp+add fusion. */
15939 if (!token_ops)
15940 found_flags = 0;
15942 found_flags |= token_ops;
15943 specs = ++ntoken;
15946 /* The string ended with a trailing separator, so it is ill-formed. */
15947 if (!(*specs))
15949 error ("%s string ill-formed\n", option_name);
15950 return 0;
15953 /* We still have one more token to parse. */
15954 size_t token_length = strlen (specs);
15955 unsigned token_ops = aarch64_parse_one_option_token (specs,
15956 token_length,
15957 flags,
15958 option_name);
15959 if (!token_ops)
15960 found_flags = 0;
15962 found_flags |= token_ops;
15963 return found_flags;
15966 /* Support for overriding instruction fusion. */
15968 static void
15969 aarch64_parse_fuse_string (const char *fuse_string,
15970 struct tune_params *tune)
15972 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
15973 aarch64_fusible_pairs,
15974 tune->fusible_ops,
15975 "fuse=");
15978 /* Support for overriding other tuning flags. */
15980 static void
15981 aarch64_parse_tune_string (const char *tune_string,
15982 struct tune_params *tune)
15984 tune->extra_tuning_flags
15985 = aarch64_parse_boolean_options (tune_string,
15986 aarch64_tuning_flags,
15987 tune->extra_tuning_flags,
15988 "tune=");
15991 /* Parse the sve_width tuning moverride string in TUNE_STRING.
15992 Accept the valid SVE vector widths allowed by
15993 aarch64_sve_vector_bits_enum and use it to override sve_width
15994 in TUNE. */
15996 static void
15997 aarch64_parse_sve_width_string (const char *tune_string,
15998 struct tune_params *tune)
16000 int width = -1;
16002 int n = sscanf (tune_string, "%d", &width);
16003 if (n == EOF)
16005 error ("invalid format for sve_width");
16006 return;
16008 switch (width)
16010 case SVE_128:
16011 case SVE_256:
16012 case SVE_512:
16013 case SVE_1024:
16014 case SVE_2048:
16015 break;
16016 default:
16017 error ("invalid sve_width value: %d", width);
16019 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
16022 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
16023 we understand. If it is, extract the option string and hand it off to
16024 the appropriate function. */
16026 void
16027 aarch64_parse_one_override_token (const char* token,
16028 size_t length,
16029 struct tune_params *tune)
16031 const struct aarch64_tuning_override_function *fn
16032 = aarch64_tuning_override_functions;
16034 const char *option_part = strchr (token, '=');
16035 if (!option_part)
16037 error ("tuning string missing in option (%s)", token);
16038 return;
16041 /* Get the length of the option name. */
16042 length = option_part - token;
16043 /* Skip the '=' to get to the option string. */
16044 option_part++;
16046 for (; fn->name != NULL; fn++)
16048 if (!strncmp (fn->name, token, length))
16050 fn->parse_override (option_part, tune);
16051 return;
16055 error ("unknown tuning option (%s)", token);
16056 return;
16059 /* Validate and clamp the TLS size based on the selected code model. */
16061 static void
16062 initialize_aarch64_tls_size (struct gcc_options *opts)
16064 if (aarch64_tls_size == 0)
16065 aarch64_tls_size = 24;
16067 switch (opts->x_aarch64_cmodel_var)
16069 case AARCH64_CMODEL_TINY:
16070 /* Both the default and maximum TLS size allowed under tiny are 1M, which
16071 needs two instructions to address, so we clamp the size to 24. */
16072 if (aarch64_tls_size > 24)
16073 aarch64_tls_size = 24;
16074 break;
16075 case AARCH64_CMODEL_SMALL:
16076 /* The maximum TLS size allowed under small is 4G. */
16077 if (aarch64_tls_size > 32)
16078 aarch64_tls_size = 32;
16079 break;
16080 case AARCH64_CMODEL_LARGE:
16081 /* The maximum TLS size allowed under large is 16E.
16082 FIXME: 16E would need a 64-bit offset, but we only support a 48-bit offset now. */
16083 if (aarch64_tls_size > 48)
16084 aarch64_tls_size = 48;
16085 break;
16086 default:
16087 gcc_unreachable ();
16090 return;
16093 /* Parse STRING looking for options in the format:
16094 string :: option:string
16095 option :: name=substring
16096 name :: {a-z}
16097 substring :: defined by option. */
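/* For example (a hypothetical command line, built only from options
   handled later in this file), the string passed for

     -moverride=fuse=adrp+add.cmp+branch:sve_width=256

   is split at ':' into "fuse=adrp+add.cmp+branch" and "sve_width=256",
   each of which is then handled by aarch64_parse_one_override_token.  */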
16099 static void
16100 aarch64_parse_override_string (const char* input_string,
16101 struct tune_params* tune)
16103 const char separator = ':';
16104 size_t string_length = strlen (input_string) + 1;
16105 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
16106 char *string = string_root;
16107 strncpy (string, input_string, string_length);
16108 string[string_length - 1] = '\0';
16110 char* ntoken = string;
16112 while ((ntoken = strchr (string, separator)))
16114 size_t token_length = ntoken - string;
16115 /* Make this substring look like a string. */
16116 *ntoken = '\0';
16117 aarch64_parse_one_override_token (string, token_length, tune);
16118 string = ++ntoken;
16121 /* One last option to parse. */
16122 aarch64_parse_one_override_token (string, strlen (string), tune);
16123 free (string_root);
16126 /* Adjust CURRENT_TUNE (a generic tuning struct) with settings that
16127 are best for a generic target with the currently-enabled architecture
16128 extensions. */
16129 static void
16130 aarch64_adjust_generic_arch_tuning (struct tune_params &current_tune)
16132 /* Neoverse V1 is the only core that is known to benefit from
16133 AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS. There is therefore no
16134 point enabling it for SVE2 and above. */
16135 if (TARGET_SVE2)
16136 current_tune.extra_tuning_flags
16137 &= ~AARCH64_EXTRA_TUNE_CSE_SVE_VL_CONSTANTS;
16140 static void
16141 aarch64_override_options_after_change_1 (struct gcc_options *opts)
16143 if (accepted_branch_protection_string)
16145 opts->x_aarch64_branch_protection_string
16146 = xstrdup (accepted_branch_protection_string);
16149 /* PR 70044: We have to be careful about being called multiple times for the
16150 same function. This means all changes should be repeatable. */
16152 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
16153 Disable the frame pointer flag so the mid-end will not use a frame
16154 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
16155 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
16156 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
16157 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
16158 if (opts->x_flag_omit_frame_pointer == 0)
16159 opts->x_flag_omit_frame_pointer = 2;
16161 /* If not optimizing for size, set the default
16162 alignment to what the target wants. */
16163 if (!opts->x_optimize_size)
16165 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
16166 opts->x_str_align_loops = aarch64_tune_params.loop_align;
16167 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
16168 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
16169 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
16170 opts->x_str_align_functions = aarch64_tune_params.function_align;
16173 /* We default to no pc-relative literal loads. */
16175 aarch64_pcrelative_literal_loads = false;
16177 /* If -mpc-relative-literal-loads is set on the command line, this
16178 implies that the user asked for PC relative literal loads. */
16179 if (opts->x_pcrelative_literal_loads == 1)
16180 aarch64_pcrelative_literal_loads = true;
16182 /* In the tiny memory model it makes no sense to disallow PC relative
16183 literal pool loads. */
16184 if (aarch64_cmodel == AARCH64_CMODEL_TINY
16185 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
16186 aarch64_pcrelative_literal_loads = true;
16188 /* When enabling the lower precision Newton series for the square root, also
16189 enable it for the reciprocal square root, since the latter is an
16190 intermediary step for the former. */
16191 if (flag_mlow_precision_sqrt)
16192 flag_mrecip_low_precision_sqrt = true;
16195 /* 'Unpack' the internal tuning structs and update the options
16196 in OPTS. The caller must have set up selected_tune and selected_arch
16197 as all the other target-specific codegen decisions are
16198 derived from them. */
16200 void
16201 aarch64_override_options_internal (struct gcc_options *opts)
16203 aarch64_tune_flags = selected_tune->flags;
16204 aarch64_tune = selected_tune->sched_core;
16205 /* Make a copy of the tuning parameters attached to the core, which
16206 we may later overwrite. */
16207 aarch64_tune_params = *(selected_tune->tune);
16208 aarch64_architecture_version = selected_arch->architecture_version;
16209 if (selected_tune->tune == &generic_tunings)
16210 aarch64_adjust_generic_arch_tuning (aarch64_tune_params);
16212 if (opts->x_aarch64_override_tune_string)
16213 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
16214 &aarch64_tune_params);
16216 /* This target defaults to strict volatile bitfields. */
16217 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
16218 opts->x_flag_strict_volatile_bitfields = 1;
16220 if (aarch64_stack_protector_guard == SSP_GLOBAL
16221 && opts->x_aarch64_stack_protector_guard_offset_str)
16223 error ("incompatible options %<-mstack-protector-guard=global%> and "
16224 "%<-mstack-protector-guard-offset=%s%>",
16225 aarch64_stack_protector_guard_offset_str);
16228 if (aarch64_stack_protector_guard == SSP_SYSREG
16229 && !(opts->x_aarch64_stack_protector_guard_offset_str
16230 && opts->x_aarch64_stack_protector_guard_reg_str))
16232 error ("both %<-mstack-protector-guard-offset%> and "
16233 "%<-mstack-protector-guard-reg%> must be used "
16234 "with %<-mstack-protector-guard=sysreg%>");
16237 if (opts->x_aarch64_stack_protector_guard_reg_str)
16239 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
16240 error ("specify a system register with a small string length.");
16243 if (opts->x_aarch64_stack_protector_guard_offset_str)
16245 char *end;
16246 const char *str = aarch64_stack_protector_guard_offset_str;
16247 errno = 0;
16248 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
16249 if (!*str || *end || errno)
16250 error ("%qs is not a valid offset in %qs", str,
16251 "-mstack-protector-guard-offset=");
16252 aarch64_stack_protector_guard_offset = offs;
16255 initialize_aarch64_code_model (opts);
16256 initialize_aarch64_tls_size (opts);
16258 int queue_depth = 0;
16259 switch (aarch64_tune_params.autoprefetcher_model)
16261 case tune_params::AUTOPREFETCHER_OFF:
16262 queue_depth = -1;
16263 break;
16264 case tune_params::AUTOPREFETCHER_WEAK:
16265 queue_depth = 0;
16266 break;
16267 case tune_params::AUTOPREFETCHER_STRONG:
16268 queue_depth = max_insn_queue_index + 1;
16269 break;
16270 default:
16271 gcc_unreachable ();
16274 /* We don't mind passing in global_options_set here as we don't use
16275 the *options_set structs anyway. */
16276 SET_OPTION_IF_UNSET (opts, &global_options_set,
16277 param_sched_autopref_queue_depth, queue_depth);
16279 /* If using Advanced SIMD only for autovectorization, disable the SVE vector
16280 cost comparison. */
16281 if (aarch64_autovec_preference == 1)
16282 SET_OPTION_IF_UNSET (opts, &global_options_set,
16283 aarch64_sve_compare_costs, 0);
16285 /* Set up the parameters to be used in the prefetching algorithm. Do not
16286 override the defaults unless we are tuning for a core we have
16287 researched values for. */
16288 if (aarch64_tune_params.prefetch->num_slots > 0)
16289 SET_OPTION_IF_UNSET (opts, &global_options_set,
16290 param_simultaneous_prefetches,
16291 aarch64_tune_params.prefetch->num_slots);
16292 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
16293 SET_OPTION_IF_UNSET (opts, &global_options_set,
16294 param_l1_cache_size,
16295 aarch64_tune_params.prefetch->l1_cache_size);
16296 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
16297 SET_OPTION_IF_UNSET (opts, &global_options_set,
16298 param_l1_cache_line_size,
16299 aarch64_tune_params.prefetch->l1_cache_line_size);
16300 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
16301 SET_OPTION_IF_UNSET (opts, &global_options_set,
16302 param_l2_cache_size,
16303 aarch64_tune_params.prefetch->l2_cache_size);
16304 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
16305 SET_OPTION_IF_UNSET (opts, &global_options_set,
16306 param_prefetch_dynamic_strides, 0);
16307 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
16308 SET_OPTION_IF_UNSET (opts, &global_options_set,
16309 param_prefetch_minimum_stride,
16310 aarch64_tune_params.prefetch->minimum_stride);
16312 /* Use the alternative scheduling-pressure algorithm by default. */
16313 SET_OPTION_IF_UNSET (opts, &global_options_set,
16314 param_sched_pressure_algorithm,
16315 SCHED_PRESSURE_MODEL);
16317 /* Validate the guard size. */
16318 int guard_size = param_stack_clash_protection_guard_size;
16320 if (guard_size != 12 && guard_size != 16)
16321 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
16322 "size. Given value %d (%llu KB) is out of range",
16323 guard_size, (1ULL << guard_size) / 1024ULL);
16325 /* Enforce that the probing interval is the same as the guard size so the
16326 mid-end does the right thing. */
16327 SET_OPTION_IF_UNSET (opts, &global_options_set,
16328 param_stack_clash_protection_probe_interval,
16329 guard_size);
16331 /* The maybe_set calls won't update the value if the user has explicitly set
16332 one, which means we need to validate that the probing interval and the
16333 guard size are equal. */
16334 int probe_interval
16335 = param_stack_clash_protection_probe_interval;
16336 if (guard_size != probe_interval)
16337 error ("stack clash guard size %<%d%> must be equal to probing interval "
16338 "%<%d%>", guard_size, probe_interval);
16340 /* Enable software prefetching at the specified optimization level for
16341 CPUs that have prefetch. Lower the optimization level threshold by 1
16342 when profiling is enabled. */
16343 if (opts->x_flag_prefetch_loop_arrays < 0
16344 && !opts->x_optimize_size
16345 && aarch64_tune_params.prefetch->default_opt_level >= 0
16346 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
16347 opts->x_flag_prefetch_loop_arrays = 1;
16349 if (opts->x_aarch64_arch_string == NULL)
16350 opts->x_aarch64_arch_string = selected_arch->name;
16351 if (opts->x_aarch64_cpu_string == NULL)
16352 opts->x_aarch64_cpu_string = selected_cpu->name;
16353 if (opts->x_aarch64_tune_string == NULL)
16354 opts->x_aarch64_tune_string = selected_tune->name;
16356 aarch64_override_options_after_change_1 (opts);
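/* Illustrative sketch, not part of the original source: the
   -mstack-protector-guard-offset= validation in aarch64_override_options_internal
   above relies on strtol's whole-string/errno protocol.  A standalone version
   of that check, using the hypothetical helper name parse_guard_offset:  */
#include <errno.h>
#include <stdbool.h>
#include <stdlib.h>

static bool
parse_guard_offset (const char *str, long *out)
{
  char *end;
  errno = 0;
  long offs = strtol (str, &end, 0);
  /* Reject an empty string, trailing junk, or an out-of-range value.  */
  if (!*str || *end || errno)
    return false;
  *out = offs;
  return true;
}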
16359 /* Print a hint with a suggestion for a core or architecture name that
16360 most closely resembles what the user passed in STR. ARCH is true if
16361 the user is asking for an architecture name. ARCH is false if the user
16362 is asking for a core name. */
16364 static void
16365 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
16367 auto_vec<const char *> candidates;
16368 const struct processor *entry = arch ? all_architectures : all_cores;
16369 for (; entry->name != NULL; entry++)
16370 candidates.safe_push (entry->name);
16372 #ifdef HAVE_LOCAL_CPU_DETECT
16373 /* Add also "native" as possible value. */
16374 if (arch)
16375 candidates.safe_push ("native");
16376 #endif
16378 char *s;
16379 const char *hint = candidates_list_and_hint (str, s, candidates);
16380 if (hint)
16381 inform (input_location, "valid arguments are: %s;"
16382 " did you mean %qs?", s, hint);
16383 else
16384 inform (input_location, "valid arguments are: %s", s);
16386 XDELETEVEC (s);
16389 /* Print a hint with a suggestion for a core name that most closely resembles
16390 what the user passed in STR. */
16392 inline static void
16393 aarch64_print_hint_for_core (const char *str)
16395 aarch64_print_hint_for_core_or_arch (str, false);
16398 /* Print a hint with a suggestion for an architecture name that most closely
16399 resembles what the user passed in STR. */
16401 inline static void
16402 aarch64_print_hint_for_arch (const char *str)
16404 aarch64_print_hint_for_core_or_arch (str, true);
16408 /* Print a hint with a suggestion for an extension name
16409 that most closely resembles what the user passed in STR. */
16411 void
16412 aarch64_print_hint_for_extensions (const std::string &str)
16414 auto_vec<const char *> candidates;
16415 aarch64_get_all_extension_candidates (&candidates);
16416 char *s;
16417 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
16418 if (hint)
16419 inform (input_location, "valid arguments are: %s;"
16420 " did you mean %qs?", s, hint);
16421 else
16422 inform (input_location, "valid arguments are: %s", s);
16424 XDELETEVEC (s);
16427 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
16428 specified in STR and throw errors if appropriate. Put the results, if
16429 they are valid, in RES and ISA_FLAGS. Return whether the option is
16430 valid. */
16432 static bool
16433 aarch64_validate_mcpu (const char *str, const struct processor **res,
16434 uint64_t *isa_flags)
16436 std::string invalid_extension;
16437 enum aarch64_parse_opt_result parse_res
16438 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
16440 if (parse_res == AARCH64_PARSE_OK)
16441 return true;
16443 switch (parse_res)
16445 case AARCH64_PARSE_MISSING_ARG:
16446 error ("missing cpu name in %<-mcpu=%s%>", str);
16447 break;
16448 case AARCH64_PARSE_INVALID_ARG:
16449 error ("unknown value %qs for %<-mcpu%>", str);
16450 aarch64_print_hint_for_core (str);
16451 break;
16452 case AARCH64_PARSE_INVALID_FEATURE:
16453 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
16454 invalid_extension.c_str (), str);
16455 aarch64_print_hint_for_extensions (invalid_extension);
16456 break;
16457 default:
16458 gcc_unreachable ();
16461 return false;
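/* Illustrative sketch, not part of the original source: an -mcpu= value is a
   CPU name optionally followed by '+'-separated feature modifiers, e.g.
   "cortex-a57+crypto+nofp".  split_cpu_string is a hypothetical helper that
   shows only the name/extension split performed by the real parser.  */
#include <string.h>

static void
split_cpu_string (const char *arg, char *name, size_t name_size,
                  const char **exts)
{
  const char *plus = strchr (arg, '+');
  size_t len = plus ? (size_t) (plus - arg) : strlen (arg);
  if (len >= name_size)
    len = name_size - 1;
  memcpy (name, arg, len);
  name[len] = '\0';   /* CPU name, e.g. "cortex-a57".  */
  *exts = plus;       /* Feature modifiers ("+crypto+nofp") or NULL.  */
}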
16464 /* Straight line speculation indicators. */
16465 enum aarch64_sls_hardening_type
16467 SLS_NONE = 0,
16468 SLS_RETBR = 1,
16469 SLS_BLR = 2,
16470 SLS_ALL = 3,
16472 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
16474 /* Return whether we should mitigate Straight Line Speculation for the RET
16475 and BR instructions. */
16476 bool
16477 aarch64_harden_sls_retbr_p (void)
16479 return aarch64_sls_hardening & SLS_RETBR;
16482 /* Return whether we should mitigate Straight Line Speculation for the BLR
16483 instruction. */
16484 bool
16485 aarch64_harden_sls_blr_p (void)
16487 return aarch64_sls_hardening & SLS_BLR;
16490 /* For now we only allow setting these options globally; in the future we may
16491 allow setting them per function. */
16492 static void
16493 aarch64_validate_sls_mitigation (const char *const_str)
16495 char *token_save = NULL;
16496 char *str = NULL;
16498 if (strcmp (const_str, "none") == 0)
16500 aarch64_sls_hardening = SLS_NONE;
16501 return;
16503 if (strcmp (const_str, "all") == 0)
16505 aarch64_sls_hardening = SLS_ALL;
16506 return;
16509 char *str_root = xstrdup (const_str);
16510 str = strtok_r (str_root, ",", &token_save);
16511 if (!str)
16512 error ("invalid argument given to %<-mharden-sls=%>");
16514 int temp = SLS_NONE;
16515 while (str)
16517 if (strcmp (str, "blr") == 0)
16518 temp |= SLS_BLR;
16519 else if (strcmp (str, "retbr") == 0)
16520 temp |= SLS_RETBR;
16521 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
16523 error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
16524 break;
16526 else
16528 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
16529 break;
16531 str = strtok_r (NULL, ",", &token_save);
16533 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
16534 free (str_root);
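/* Illustrative sketch, not part of the original source: the accepted tokens
   above combine into a bitmask, so "-mharden-sls=retbr,blr" is equivalent to
   SLS_RETBR | SLS_BLR, i.e. SLS_ALL.  parse_sls_list is a hypothetical helper
   mirroring the strtok_r loop; it returns -1 for an invalid token.  */
#include <stdlib.h>
#include <string.h>

enum { EX_SLS_NONE = 0, EX_SLS_RETBR = 1, EX_SLS_BLR = 2, EX_SLS_ALL = 3 };

static int
parse_sls_list (const char *arg)
{
  char *copy = strdup (arg);
  char *save = NULL;
  int mask = EX_SLS_NONE;
  for (char *tok = strtok_r (copy, ",", &save); tok;
       tok = strtok_r (NULL, ",", &save))
    {
      if (strcmp (tok, "retbr") == 0)
        mask |= EX_SLS_RETBR;
      else if (strcmp (tok, "blr") == 0)
        mask |= EX_SLS_BLR;
      else
        {
          mask = -1;   /* Invalid token.  */
          break;
        }
    }
  free (copy);
  return mask;
}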
16537 /* Parse CONST_STR for branch protection features specified in
16538 aarch64_branch_protect_types, and set any global variables required.
16539 Return the parsing result and assign the last processed token from
16540 CONST_STR to LAST_STR so that it can be used for error reporting. */
16542 static enum
16543 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
16544 char** last_str)
16546 char *str_root = xstrdup (const_str);
16547 char* token_save = NULL;
16548 char *str = strtok_r (str_root, "+", &token_save);
16549 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
16550 if (!str)
16551 res = AARCH64_PARSE_MISSING_ARG;
16552 else
16554 char *next_str = strtok_r (NULL, "+", &token_save);
16555 /* Reset the branch protection features to their defaults. */
16556 aarch64_handle_no_branch_protection (NULL, NULL);
16558 while (str && res == AARCH64_PARSE_OK)
16560 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
16561 bool found = false;
16562 /* Search for this type. */
16563 while (type && type->name && !found && res == AARCH64_PARSE_OK)
16565 if (strcmp (str, type->name) == 0)
16567 found = true;
16568 res = type->handler (str, next_str);
16569 str = next_str;
16570 next_str = strtok_r (NULL, "+", &token_save);
16572 else
16573 type++;
16575 if (found && res == AARCH64_PARSE_OK)
16577 bool found_subtype = true;
16578 /* Loop through each token until we find one that isn't a
16579 subtype. */
16580 while (found_subtype)
16582 found_subtype = false;
16583 const aarch64_branch_protect_type *subtype = type->subtypes;
16584 /* Search for the subtype. */
16585 while (str && subtype && subtype->name && !found_subtype
16586 && res == AARCH64_PARSE_OK)
16588 if (strcmp (str, subtype->name) == 0)
16590 found_subtype = true;
16591 res = subtype->handler (str, next_str);
16592 str = next_str;
16593 next_str = strtok_r (NULL, "+", &token_save);
16595 else
16596 subtype++;
16600 else if (!found)
16601 res = AARCH64_PARSE_INVALID_ARG;
16604 /* Copy the last processed token into the argument to pass it back.
16605 Used by option and attribute validation to print the offending token. */
16606 if (last_str)
16608 if (str) strcpy (*last_str, str);
16609 else *last_str = NULL;
16611 if (res == AARCH64_PARSE_OK)
16613 /* If needed, alloc the accepted string then copy in const_str.
16614 Used by override_option_after_change_1. */
16615 if (!accepted_branch_protection_string)
16616 accepted_branch_protection_string = (char *) xmalloc (
16617 BRANCH_PROTECT_STR_MAX
16618 + 1);
16619 strncpy (accepted_branch_protection_string, const_str,
16620 BRANCH_PROTECT_STR_MAX + 1);
16621 /* Forcibly null-terminate. */
16622 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
16624 return res;
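/* Illustrative sketch, not part of the original source: the parser above
   walks '+'-separated tokens, where a top-level type may be followed by its
   subtypes, e.g. "pac-ret+leaf+bti" is the type "pac-ret", its subtype
   "leaf", then the independent type "bti".  split_plus_tokens is a
   hypothetical helper showing only the tokenization step.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void
split_plus_tokens (const char *arg)
{
  char *copy = strdup (arg);
  char *save = NULL;
  for (char *tok = strtok_r (copy, "+", &save); tok;
       tok = strtok_r (NULL, "+", &save))
    printf ("token: %s\n", tok);   /* pac-ret, leaf, bti for the example.  */
  free (copy);
}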
16627 static bool
16628 aarch64_validate_mbranch_protection (const char *const_str)
16630 char *str = (char *) xmalloc (strlen (const_str) + 1);
16631 enum aarch64_parse_opt_result res =
16632 aarch64_parse_branch_protection (const_str, &str);
16633 if (res == AARCH64_PARSE_INVALID_ARG)
16634 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
16635 else if (res == AARCH64_PARSE_MISSING_ARG)
16636 error ("missing argument for %<-mbranch-protection=%>");
16637 free (str);
16638 return res == AARCH64_PARSE_OK;
16641 /* Validate a command-line -march option. Parse the arch and extensions
16642 (if any) specified in STR and throw errors if appropriate. Put the
16643 results, if they are valid, in RES and ISA_FLAGS. Return whether the
16644 option is valid. */
16646 static bool
16647 aarch64_validate_march (const char *str, const struct processor **res,
16648 uint64_t *isa_flags)
16650 std::string invalid_extension;
16651 enum aarch64_parse_opt_result parse_res
16652 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
16654 if (parse_res == AARCH64_PARSE_OK)
16655 return true;
16657 switch (parse_res)
16659 case AARCH64_PARSE_MISSING_ARG:
16660 error ("missing arch name in %<-march=%s%>", str);
16661 break;
16662 case AARCH64_PARSE_INVALID_ARG:
16663 error ("unknown value %qs for %<-march%>", str);
16664 aarch64_print_hint_for_arch (str);
16665 break;
16666 case AARCH64_PARSE_INVALID_FEATURE:
16667 error ("invalid feature modifier %qs in %<-march=%s%>",
16668 invalid_extension.c_str (), str);
16669 aarch64_print_hint_for_extensions (invalid_extension);
16670 break;
16671 default:
16672 gcc_unreachable ();
16675 return false;
16678 /* Validate a command-line -mtune option. Parse the cpu
16679 specified in STR and throw errors if appropriate. Put the
16680 result, if it is valid, in RES. Return whether the option is
16681 valid. */
16683 static bool
16684 aarch64_validate_mtune (const char *str, const struct processor **res)
16686 enum aarch64_parse_opt_result parse_res
16687 = aarch64_parse_tune (str, res);
16689 if (parse_res == AARCH64_PARSE_OK)
16690 return true;
16692 switch (parse_res)
16694 case AARCH64_PARSE_MISSING_ARG:
16695 error ("missing cpu name in %<-mtune=%s%>", str);
16696 break;
16697 case AARCH64_PARSE_INVALID_ARG:
16698 error ("unknown value %qs for %<-mtune%>", str);
16699 aarch64_print_hint_for_core (str);
16700 break;
16701 default:
16702 gcc_unreachable ();
16704 return false;
16707 /* Return the CPU corresponding to the enum CPU.
16708 If it doesn't specify a cpu, return the default. */
16710 static const struct processor *
16711 aarch64_get_tune_cpu (enum aarch64_processor cpu)
16713 if (cpu != aarch64_none)
16714 return &all_cores[cpu];
16716 /* The & 0x3f is to extract the bottom 6 bits that encode the
16717 default cpu as selected by the --with-cpu GCC configure option
16718 in config.gcc.
16719 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
16720 flags mechanism should be reworked to make it more sane. */
16721 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
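/* Illustrative sketch, not part of the original source: TARGET_CPU_DEFAULT
   packs the configure-time core index into its low 6 bits and the default
   ISA flags into the remaining bits, which is why this function masks with
   0x3f and aarch64_override_options below shifts right by 6.  The helper
   names here are hypothetical.  */

static inline unsigned int
default_cpu_index (unsigned long long target_cpu_default)
{
  return target_cpu_default & 0x3f;   /* Bottom 6 bits: core index.  */
}

static inline unsigned long long
default_cpu_flags (unsigned long long target_cpu_default)
{
  return target_cpu_default >> 6;     /* Remaining bits: default ISA flags.  */
}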
16724 /* Return the architecture corresponding to the enum ARCH.
16725 If it doesn't specify a valid architecture, return the default. */
16727 static const struct processor *
16728 aarch64_get_arch (enum aarch64_arch arch)
16730 if (arch != aarch64_no_arch)
16731 return &all_architectures[arch];
16733 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
16735 return &all_architectures[cpu->arch];
16738 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
16740 static poly_uint16
16741 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
16743 /* 128-bit SVE and Advanced SIMD modes use different register layouts
16744 on big-endian targets, so we would need to forbid subregs that convert
16745 from one to the other. By default a reinterpret sequence would then
16746 involve a store to memory in one mode and a load back in the other.
16747 Even if we optimize that sequence using reverse instructions,
16748 it would still be a significant potential overhead.
16750 For now, it seems better to generate length-agnostic code for that
16751 case instead. */
16752 if (value == SVE_SCALABLE
16753 || (value == SVE_128 && BYTES_BIG_ENDIAN))
16754 return poly_uint16 (2, 2);
16755 else
16756 return (int) value / 64;
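/* Illustrative sketch, not part of the original source: for the fixed-length
   cases the VG count is simply bits / 64, so -msve-vector-bits=256 gives a
   VG of 4 and =2048 gives 32, while SVE_SCALABLE (and 128-bit big-endian)
   keeps the length-agnostic poly_uint16 (2, 2) encoding.  sve_bits_to_vg is
   a hypothetical helper covering only the fixed-length case.  */

static inline int
sve_bits_to_vg (int bits)
{
  return bits / 64;   /* E.g. 128 -> 2, 256 -> 4, 512 -> 8, 2048 -> 32.  */
}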
16759 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
16760 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
16761 tuning structs. In particular it must set selected_tune and
16762 aarch64_isa_flags that define the available ISA features and tuning
16763 decisions. It must also set selected_arch as this will be used to
16764 output the .arch asm tags for each function. */
16766 static void
16767 aarch64_override_options (void)
16769 uint64_t cpu_isa = 0;
16770 uint64_t arch_isa = 0;
16771 aarch64_isa_flags = 0;
16773 bool valid_cpu = true;
16774 bool valid_tune = true;
16775 bool valid_arch = true;
16777 selected_cpu = NULL;
16778 selected_arch = NULL;
16779 selected_tune = NULL;
16781 if (aarch64_harden_sls_string)
16782 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
16784 if (aarch64_branch_protection_string)
16785 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
16787 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
16788 If either of -march or -mtune is given, they override their
16789 respective component of -mcpu. */
16790 if (aarch64_cpu_string)
16791 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
16792 &cpu_isa);
16794 if (aarch64_arch_string)
16795 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
16796 &arch_isa);
16798 if (aarch64_tune_string)
16799 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
16801 #ifdef SUBTARGET_OVERRIDE_OPTIONS
16802 SUBTARGET_OVERRIDE_OPTIONS;
16803 #endif
16805 /* If the user did not specify a processor, choose the default
16806 one for them. This will be the CPU set during configuration using
16807 --with-cpu, otherwise it is "generic". */
16808 if (!selected_cpu)
16810 if (selected_arch)
16812 selected_cpu = &all_cores[selected_arch->ident];
16813 aarch64_isa_flags = arch_isa;
16814 explicit_arch = selected_arch->arch;
16816 else
16818 /* Get default configure-time CPU. */
16819 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
16820 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
16823 if (selected_tune)
16824 explicit_tune_core = selected_tune->ident;
16826 /* If both -mcpu and -march are specified check that they are architecturally
16827 compatible, warn if they're not and prefer the -march ISA flags. */
16828 else if (selected_arch)
16830 if (selected_arch->arch != selected_cpu->arch)
16832 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
16833 aarch64_cpu_string,
16834 aarch64_arch_string);
16836 aarch64_isa_flags = arch_isa;
16837 explicit_arch = selected_arch->arch;
16838 explicit_tune_core = selected_tune ? selected_tune->ident
16839 : selected_cpu->ident;
16841 else
16843 /* -mcpu but no -march. */
16844 aarch64_isa_flags = cpu_isa;
16845 explicit_tune_core = selected_tune ? selected_tune->ident
16846 : selected_cpu->ident;
16847 gcc_assert (selected_cpu);
16848 selected_arch = &all_architectures[selected_cpu->arch];
16849 explicit_arch = selected_arch->arch;
16852 /* Set the arch as well, since we will need it when outputting
16853 the .arch directive in assembly. */
16854 if (!selected_arch)
16856 gcc_assert (selected_cpu);
16857 selected_arch = &all_architectures[selected_cpu->arch];
16860 if (!selected_tune)
16861 selected_tune = selected_cpu;
16863 if (aarch64_enable_bti == 2)
16865 #ifdef TARGET_ENABLE_BTI
16866 aarch64_enable_bti = 1;
16867 #else
16868 aarch64_enable_bti = 0;
16869 #endif
16872 /* Return address signing is currently not supported for ILP32 targets. For
16873 LP64 targets use the configured option in the absence of a command-line
16874 option for -mbranch-protection. */
16875 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
16877 #ifdef TARGET_ENABLE_PAC_RET
16878 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
16879 #else
16880 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
16881 #endif
16884 #ifndef HAVE_AS_MABI_OPTION
16885 /* The compiler may have been configured with 2.23.* binutils, which does
16886 not have support for ILP32. */
16887 if (TARGET_ILP32)
16888 error ("assembler does not support %<-mabi=ilp32%>");
16889 #endif
16891 /* Convert -msve-vector-bits to a VG count. */
16892 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
16894 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
16895 sorry ("return address signing is only supported for %<-mabi=lp64%>");
16897 /* Make sure we properly set up the explicit options. */
16898 if ((aarch64_cpu_string && valid_cpu)
16899 || (aarch64_tune_string && valid_tune))
16900 gcc_assert (explicit_tune_core != aarch64_none);
16902 if ((aarch64_cpu_string && valid_cpu)
16903 || (aarch64_arch_string && valid_arch))
16904 gcc_assert (explicit_arch != aarch64_no_arch);
16906 /* The pass to insert speculation tracking runs before
16907 shrink-wrapping and the latter does not know how to update the
16908 tracking status. So disable it in this case. */
16909 if (aarch64_track_speculation)
16910 flag_shrink_wrap = 0;
16912 aarch64_override_options_internal (&global_options);
16914 /* Save these options as the default ones in case we push and pop them later
16915 while processing functions with potential target attributes. */
16916 target_option_default_node = target_option_current_node
16917 = build_target_option_node (&global_options, &global_options_set);
16920 /* Implement targetm.override_options_after_change. */
16922 static void
16923 aarch64_override_options_after_change (void)
16925 aarch64_override_options_after_change_1 (&global_options);
16928 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
16929 static char *
16930 aarch64_offload_options (void)
16932 if (TARGET_ILP32)
16933 return xstrdup ("-foffload-abi=ilp32");
16934 else
16935 return xstrdup ("-foffload-abi=lp64");
16938 static struct machine_function *
16939 aarch64_init_machine_status (void)
16941 struct machine_function *machine;
16942 machine = ggc_cleared_alloc<machine_function> ();
16943 return machine;
16946 void
16947 aarch64_init_expanders (void)
16949 init_machine_status = aarch64_init_machine_status;
16952 /* A checking mechanism for the implementation of the various code models. */
16953 static void
16954 initialize_aarch64_code_model (struct gcc_options *opts)
16956 aarch64_cmodel = opts->x_aarch64_cmodel_var;
16957 switch (opts->x_aarch64_cmodel_var)
16959 case AARCH64_CMODEL_TINY:
16960 if (opts->x_flag_pic)
16961 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
16962 break;
16963 case AARCH64_CMODEL_SMALL:
16964 if (opts->x_flag_pic)
16966 #ifdef HAVE_AS_SMALL_PIC_RELOCS
16967 aarch64_cmodel = (flag_pic == 2
16968 ? AARCH64_CMODEL_SMALL_PIC
16969 : AARCH64_CMODEL_SMALL_SPIC);
16970 #else
16971 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
16972 #endif
16974 break;
16975 case AARCH64_CMODEL_LARGE:
16976 if (opts->x_flag_pic)
16977 sorry ("code model %qs with %<-f%s%>", "large",
16978 opts->x_flag_pic > 1 ? "PIC" : "pic");
16979 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
16980 sorry ("code model %qs not supported in ilp32 mode", "large");
16981 break;
16982 case AARCH64_CMODEL_TINY_PIC:
16983 case AARCH64_CMODEL_SMALL_PIC:
16984 case AARCH64_CMODEL_SMALL_SPIC:
16985 gcc_unreachable ();
16989 /* Implement TARGET_OPTION_SAVE. */
16991 static void
16992 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts,
16993 struct gcc_options */* opts_set */)
16995 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
16996 ptr->x_aarch64_branch_protection_string
16997 = opts->x_aarch64_branch_protection_string;
17000 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
17001 using the information saved in PTR. */
17003 static void
17004 aarch64_option_restore (struct gcc_options *opts,
17005 struct gcc_options */* opts_set */,
17006 struct cl_target_option *ptr)
17008 opts->x_explicit_arch = ptr->x_explicit_arch;
17009 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
17010 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
17011 if (opts->x_explicit_tune_core == aarch64_none
17012 && opts->x_explicit_arch != aarch64_no_arch)
17013 selected_tune = &all_cores[selected_arch->ident];
17014 else
17015 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
17016 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
17017 opts->x_aarch64_branch_protection_string
17018 = ptr->x_aarch64_branch_protection_string;
17019 if (opts->x_aarch64_branch_protection_string)
17021 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
17022 NULL);
17025 aarch64_override_options_internal (opts);
17028 /* Implement TARGET_OPTION_PRINT. */
17030 static void
17031 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
17033 const struct processor *cpu
17034 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
17035 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
17036 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
17037 std::string extension
17038 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
17040 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
17041 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
17042 arch->name, extension.c_str ());
17045 static GTY(()) tree aarch64_previous_fndecl;
17047 void
17048 aarch64_reset_previous_fndecl (void)
17050 aarch64_previous_fndecl = NULL;
17053 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
17054 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
17055 make sure optab availability predicates are recomputed when necessary. */
17057 void
17058 aarch64_save_restore_target_globals (tree new_tree)
17060 if (TREE_TARGET_GLOBALS (new_tree))
17061 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
17062 else if (new_tree == target_option_default_node)
17063 restore_target_globals (&default_target_globals);
17064 else
17065 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
17068 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
17069 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
17070 of the function, if such exists. This function may be called multiple
17071 times on a single function so use aarch64_previous_fndecl to avoid
17072 setting up identical state. */
17074 static void
17075 aarch64_set_current_function (tree fndecl)
17077 if (!fndecl || fndecl == aarch64_previous_fndecl)
17078 return;
17080 tree old_tree = (aarch64_previous_fndecl
17081 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
17082 : NULL_TREE);
17084 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
17086 /* If current function has no attributes but the previous one did,
17087 use the default node. */
17088 if (!new_tree && old_tree)
17089 new_tree = target_option_default_node;
17091 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
17092 the default have been handled by aarch64_save_restore_target_globals from
17093 aarch64_pragma_target_parse. */
17094 if (old_tree == new_tree)
17095 return;
17097 aarch64_previous_fndecl = fndecl;
17099 /* First set the target options. */
17100 cl_target_option_restore (&global_options, &global_options_set,
17101 TREE_TARGET_OPTION (new_tree));
17103 aarch64_save_restore_target_globals (new_tree);
17106 /* Enum describing the various ways we can handle attributes.
17107 In many cases we can reuse the generic option handling machinery. */
17109 enum aarch64_attr_opt_type
17111 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
17112 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
17113 aarch64_attr_enum, /* Attribute sets an enum variable. */
17114 aarch64_attr_custom /* Attribute requires a custom handling function. */
17117 /* All the information needed to handle a target attribute.
17118 NAME is the name of the attribute.
17119 ATTR_TYPE specifies the type of behavior of the attribute as described
17120 in the definition of enum aarch64_attr_opt_type.
17121 ALLOW_NEG is true if the attribute supports a "no-" form.
17122 HANDLER is the function that takes the attribute string as an argument.
17123 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
17124 OPT_NUM is the enum specifying the option that the attribute modifies.
17125 This is needed for attributes that mirror the behavior of a command-line
17126 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
17127 aarch64_attr_enum. */
17129 struct aarch64_attribute_info
17131 const char *name;
17132 enum aarch64_attr_opt_type attr_type;
17133 bool allow_neg;
17134 bool (*handler) (const char *);
17135 enum opt_code opt_num;
17138 /* Handle the ARCH_STR argument to the arch= target attribute. */
17140 static bool
17141 aarch64_handle_attr_arch (const char *str)
17143 const struct processor *tmp_arch = NULL;
17144 std::string invalid_extension;
17145 enum aarch64_parse_opt_result parse_res
17146 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
17148 if (parse_res == AARCH64_PARSE_OK)
17150 gcc_assert (tmp_arch);
17151 selected_arch = tmp_arch;
17152 explicit_arch = selected_arch->arch;
17153 return true;
17156 switch (parse_res)
17158 case AARCH64_PARSE_MISSING_ARG:
17159 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
17160 break;
17161 case AARCH64_PARSE_INVALID_ARG:
17162 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
17163 aarch64_print_hint_for_arch (str);
17164 break;
17165 case AARCH64_PARSE_INVALID_FEATURE:
17166 error ("invalid feature modifier %s of value (\"%s\") in "
17167 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
17168 aarch64_print_hint_for_extensions (invalid_extension);
17169 break;
17170 default:
17171 gcc_unreachable ();
17174 return false;
17177 /* Handle the argument CPU_STR to the cpu= target attribute. */
17179 static bool
17180 aarch64_handle_attr_cpu (const char *str)
17182 const struct processor *tmp_cpu = NULL;
17183 std::string invalid_extension;
17184 enum aarch64_parse_opt_result parse_res
17185 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
17187 if (parse_res == AARCH64_PARSE_OK)
17189 gcc_assert (tmp_cpu);
17190 selected_tune = tmp_cpu;
17191 explicit_tune_core = selected_tune->ident;
17193 selected_arch = &all_architectures[tmp_cpu->arch];
17194 explicit_arch = selected_arch->arch;
17195 return true;
17198 switch (parse_res)
17200 case AARCH64_PARSE_MISSING_ARG:
17201 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
17202 break;
17203 case AARCH64_PARSE_INVALID_ARG:
17204 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
17205 aarch64_print_hint_for_core (str);
17206 break;
17207 case AARCH64_PARSE_INVALID_FEATURE:
17208 error ("invalid feature modifier %s of value (\"%s\") in "
17209 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
17210 aarch64_print_hint_for_extensions (invalid_extension);
17211 break;
17212 default:
17213 gcc_unreachable ();
17216 return false;
17219 /* Handle the argument STR to the branch-protection= attribute. */
17221 static bool
17222 aarch64_handle_attr_branch_protection (const char* str)
17224 char *err_str = (char *) xmalloc (strlen (str) + 1);
17225 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
17226 &err_str);
17227 bool success = false;
17228 switch (res)
17230 case AARCH64_PARSE_MISSING_ARG:
17231 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
17232 " attribute");
17233 break;
17234 case AARCH64_PARSE_INVALID_ARG:
17235 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
17236 "=\")%> pragma or attribute", err_str);
17237 break;
17238 case AARCH64_PARSE_OK:
17239 success = true;
17240 /* Fall through. */
17241 case AARCH64_PARSE_INVALID_FEATURE:
17242 break;
17243 default:
17244 gcc_unreachable ();
17246 free (err_str);
17247 return success;
17250 /* Handle the argument STR to the tune= target attribute. */
17252 static bool
17253 aarch64_handle_attr_tune (const char *str)
17255 const struct processor *tmp_tune = NULL;
17256 enum aarch64_parse_opt_result parse_res
17257 = aarch64_parse_tune (str, &tmp_tune);
17259 if (parse_res == AARCH64_PARSE_OK)
17261 gcc_assert (tmp_tune);
17262 selected_tune = tmp_tune;
17263 explicit_tune_core = selected_tune->ident;
17264 return true;
17267 switch (parse_res)
17269 case AARCH64_PARSE_INVALID_ARG:
17270 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
17271 aarch64_print_hint_for_core (str);
17272 break;
17273 default:
17274 gcc_unreachable ();
17277 return false;
17280 /* Parse an architecture extensions target attribute string specified in STR.
17281 For example "+fp+nosimd". Show any errors if needed. Return TRUE
17282 if successful. Update aarch64_isa_flags to reflect the ISA features
17283 modified. */
17285 static bool
17286 aarch64_handle_attr_isa_flags (char *str)
17288 enum aarch64_parse_opt_result parse_res;
17289 uint64_t isa_flags = aarch64_isa_flags;
17291 /* We allow "+nothing" in the beginning to clear out all architectural
17292 features if the user wants to handpick specific features. */
17293 if (strncmp ("+nothing", str, 8) == 0)
17295 isa_flags = 0;
17296 str += 8;
17299 std::string invalid_extension;
17300 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
17302 if (parse_res == AARCH64_PARSE_OK)
17304 aarch64_isa_flags = isa_flags;
17305 return true;
17308 switch (parse_res)
17310 case AARCH64_PARSE_MISSING_ARG:
17311 error ("missing value in %<target()%> pragma or attribute");
17312 break;
17314 case AARCH64_PARSE_INVALID_FEATURE:
17315 error ("invalid feature modifier %s of value (\"%s\") in "
17316 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
17317 break;
17319 default:
17320 gcc_unreachable ();
17323 return false;
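/* Illustrative sketch, not part of the original source: a leading "+nothing"
   clears the inherited ISA flags before the remaining "+ext"/"+noext"
   modifiers are applied, so target ("+nothing+simd") enables exactly the
   SIMD feature set (plus whatever it implies).  strip_nothing_prefix is a
   hypothetical helper showing only the prefix handling above.  */
#include <string.h>

static const char *
strip_nothing_prefix (const char *str, unsigned long long *flags)
{
  if (strncmp (str, "+nothing", 8) == 0)
    {
      *flags = 0;   /* Start from an empty feature set.  */
      str += 8;
    }
  return str;       /* The remaining "+ext"/"+noext" modifiers.  */
}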
17326 /* The target attributes that we support. On top of these we also support just
17327 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
17328 handled explicitly in aarch64_process_one_target_attr. */
17330 static const struct aarch64_attribute_info aarch64_attributes[] =
17332 { "general-regs-only", aarch64_attr_mask, false, NULL,
17333 OPT_mgeneral_regs_only },
17334 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
17335 OPT_mfix_cortex_a53_835769 },
17336 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
17337 OPT_mfix_cortex_a53_843419 },
17338 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
17339 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
17340 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
17341 OPT_momit_leaf_frame_pointer },
17342 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
17343 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
17344 OPT_march_ },
17345 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
17346 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
17347 OPT_mtune_ },
17348 { "branch-protection", aarch64_attr_custom, false,
17349 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
17350 { "sign-return-address", aarch64_attr_enum, false, NULL,
17351 OPT_msign_return_address_ },
17352 { "outline-atomics", aarch64_attr_bool, true, NULL,
17353 OPT_moutline_atomics},
17354 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
17357 /* Parse ARG_STR which contains the definition of one target attribute.
17358 Show appropriate errors if any or return true if the attribute is valid. */
17360 static bool
17361 aarch64_process_one_target_attr (char *arg_str)
17363 bool invert = false;
17365 size_t len = strlen (arg_str);
17367 if (len == 0)
17369 error ("malformed %<target()%> pragma or attribute");
17370 return false;
17373 char *str_to_check = (char *) alloca (len + 1);
17374 strcpy (str_to_check, arg_str);
17376 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
17377 It is easier to detect and handle it explicitly here rather than going
17378 through the machinery for the rest of the target attributes in this
17379 function. */
17380 if (*str_to_check == '+')
17381 return aarch64_handle_attr_isa_flags (str_to_check);
17383 if (len > 3 && startswith (str_to_check, "no-"))
17385 invert = true;
17386 str_to_check += 3;
17388 char *arg = strchr (str_to_check, '=');
17390 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
17391 and point ARG to "foo". */
17392 if (arg)
17394 *arg = '\0';
17395 arg++;
17397 const struct aarch64_attribute_info *p_attr;
17398 bool found = false;
17399 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
17401 /* If the names don't match up, or the user has given an argument
17402 to an attribute that doesn't accept one, or didn't give an argument
17403 to an attribute that expects one, fail to match. */
17404 if (strcmp (str_to_check, p_attr->name) != 0)
17405 continue;
17407 found = true;
17408 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
17409 || p_attr->attr_type == aarch64_attr_enum;
17411 if (attr_need_arg_p ^ (arg != NULL))
17413 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
17414 return false;
17417 /* If the name matches but the attribute does not allow "no-" versions
17418 then we can't match. */
17419 if (invert && !p_attr->allow_neg)
17421 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
17422 return false;
17425 switch (p_attr->attr_type)
17427 /* Has a custom handler registered.
17428 For example, cpu=, arch=, tune=. */
17429 case aarch64_attr_custom:
17430 gcc_assert (p_attr->handler);
17431 if (!p_attr->handler (arg))
17432 return false;
17433 break;
17435 /* Either set or unset a boolean option. */
17436 case aarch64_attr_bool:
17438 struct cl_decoded_option decoded;
17440 generate_option (p_attr->opt_num, NULL, !invert,
17441 CL_TARGET, &decoded);
17442 aarch64_handle_option (&global_options, &global_options_set,
17443 &decoded, input_location);
17444 break;
17446 /* Set or unset a bit in the target_flags. aarch64_handle_option
17447 should know what mask to apply given the option number. */
17448 case aarch64_attr_mask:
17450 struct cl_decoded_option decoded;
17451 /* We only need to specify the option number.
17452 aarch64_handle_option will know which mask to apply. */
17453 decoded.opt_index = p_attr->opt_num;
17454 decoded.value = !invert;
17455 aarch64_handle_option (&global_options, &global_options_set,
17456 &decoded, input_location);
17457 break;
17459 /* Use the option setting machinery to set an option to an enum. */
17460 case aarch64_attr_enum:
17462 gcc_assert (arg);
17463 bool valid;
17464 int value;
17465 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
17466 &value, CL_TARGET);
17467 if (valid)
17469 set_option (&global_options, NULL, p_attr->opt_num, value,
17470 NULL, DK_UNSPECIFIED, input_location,
17471 global_dc);
17473 else
17475 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
17477 break;
17479 default:
17480 gcc_unreachable ();
17484 /* If we reached here we either have found an attribute and validated
17485 it or didn't match any. If we matched an attribute but its arguments
17486 were malformed we will have returned false already. */
17487 return found;
17490 /* Count how many times the character C appears in
17491 NULL-terminated string STR. */
17493 static unsigned int
17494 num_occurences_in_str (char c, char *str)
17496 unsigned int res = 0;
17497 while (*str != '\0')
17499 if (*str == c)
17500 res++;
17502 str++;
17505 return res;
17508 /* Parse the tree in ARGS that contains the target attribute information
17509 and update the global target options space. */
17511 bool
17512 aarch64_process_target_attr (tree args)
17514 if (TREE_CODE (args) == TREE_LIST)
17518 tree head = TREE_VALUE (args);
17519 if (head)
17521 if (!aarch64_process_target_attr (head))
17522 return false;
17524 args = TREE_CHAIN (args);
17525 } while (args);
17527 return true;
17530 if (TREE_CODE (args) != STRING_CST)
17532 error ("attribute %<target%> argument not a string");
17533 return false;
17536 size_t len = strlen (TREE_STRING_POINTER (args));
17537 char *str_to_check = (char *) alloca (len + 1);
17538 strcpy (str_to_check, TREE_STRING_POINTER (args));
17540 if (len == 0)
17542 error ("malformed %<target()%> pragma or attribute");
17543 return false;
17546 /* Used to catch empty fields between commas, i.e.
17547 attribute ((target ("attr1,,attr2"))). */
17548 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
17550 /* Handle multiple target attributes separated by ','. */
17551 char *token = strtok_r (str_to_check, ",", &str_to_check);
17553 unsigned int num_attrs = 0;
17554 while (token)
17556 num_attrs++;
17557 if (!aarch64_process_one_target_attr (token))
17559 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
17560 return false;
17563 token = strtok_r (NULL, ",", &str_to_check);
17566 if (num_attrs != num_commas + 1)
17568 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
17569 return false;
17572 return true;
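/* Illustrative sketch, not part of the original source: strtok_r silently
   skips empty fields, so the function above cross-checks the number of
   attributes parsed against the number of commas; "attr1,,attr2" has two
   commas but only two tokens, which trips the num_attrs != num_commas + 1
   test.  has_empty_field is a hypothetical helper expressing the same check
   on the original (unmodified) string.  */
#include <stdbool.h>

static bool
has_empty_field (const char *str, unsigned int tokens_seen)
{
  unsigned int commas = 0;
  for (const char *p = str; *p; p++)
    if (*p == ',')
      commas++;
  return tokens_seen != commas + 1;
}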
17575 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
17576 process attribute ((target ("..."))). */
17578 static bool
17579 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
17581 struct cl_target_option cur_target;
17582 bool ret;
17583 tree old_optimize;
17584 tree new_target, new_optimize;
17585 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
17587 /* If what we're processing is the current pragma string then the
17588 target option node is already stored in target_option_current_node
17589 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
17590 having to re-parse the string. This is especially useful to keep
17591 arm_neon.h compile times down since that header contains a lot
17592 of intrinsics enclosed in pragmas. */
17593 if (!existing_target && args == current_target_pragma)
17595 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
17596 return true;
17598 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
17600 old_optimize
17601 = build_optimization_node (&global_options, &global_options_set);
17602 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
17604 /* If the function changed the optimization levels as well as setting
17605 target options, start with the optimizations specified. */
17606 if (func_optimize && func_optimize != old_optimize)
17607 cl_optimization_restore (&global_options, &global_options_set,
17608 TREE_OPTIMIZATION (func_optimize));
17610 /* Save the current target options to restore at the end. */
17611 cl_target_option_save (&cur_target, &global_options, &global_options_set);
17613 /* If fndecl already has some target attributes applied to it, unpack
17614 them so that we add this attribute on top of them, rather than
17615 overwriting them. */
17616 if (existing_target)
17618 struct cl_target_option *existing_options
17619 = TREE_TARGET_OPTION (existing_target);
17621 if (existing_options)
17622 cl_target_option_restore (&global_options, &global_options_set,
17623 existing_options);
17625 else
17626 cl_target_option_restore (&global_options, &global_options_set,
17627 TREE_TARGET_OPTION (target_option_current_node));
17629 ret = aarch64_process_target_attr (args);
17631 /* Set up any additional state. */
17632 if (ret)
17634 aarch64_override_options_internal (&global_options);
17635 /* Initialize SIMD builtins if we haven't already.
17636 Set current_target_pragma to NULL for the duration so that
17637 the builtin initialization code doesn't try to tag the functions
17638 being built with the attributes specified by any current pragma, thus
17639 going into an infinite recursion. */
17640 if (TARGET_SIMD)
17642 tree saved_current_target_pragma = current_target_pragma;
17643 current_target_pragma = NULL;
17644 aarch64_init_simd_builtins ();
17645 current_target_pragma = saved_current_target_pragma;
17647 new_target = build_target_option_node (&global_options,
17648 &global_options_set);
17650 else
17651 new_target = NULL;
17653 new_optimize = build_optimization_node (&global_options,
17654 &global_options_set);
17656 if (fndecl && ret)
17658 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
17660 if (old_optimize != new_optimize)
17661 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
17664 cl_target_option_restore (&global_options, &global_options_set, &cur_target);
17666 if (old_optimize != new_optimize)
17667 cl_optimization_restore (&global_options, &global_options_set,
17668 TREE_OPTIMIZATION (old_optimize));
17669 return ret;
17672 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
17673 tri-bool options (yes, no, don't care) and the default value is
17674 DEF, determine whether to reject inlining. */
17676 static bool
17677 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
17678 int dont_care, int def)
17680 /* If the callee doesn't care, always allow inlining. */
17681 if (callee == dont_care)
17682 return true;
17684 /* If the caller doesn't care, always allow inlining. */
17685 if (caller == dont_care)
17686 return true;
17688 /* Otherwise, allow inlining if either the callee and caller values
17689 agree, or if the callee is using the default value. */
17690 return (callee == caller || callee == def);
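/* Illustrative sketch, not part of the original source: worked examples of
   the predicate above for the -momit-leaf-frame-pointer case, where
   dont_care == 2 and the default DEF == 1.  check_tribool_examples is a
   hypothetical wrapper used only to spell out the outcomes.  */
#include <assert.h>

static void
check_tribool_examples (void)
{
  /* Callee doesn't care: always inlinable.  */
  assert (aarch64_tribools_ok_for_inlining_p (0, 2, 2, 1));
  /* Callee explicitly uses the default value 1: inlinable.  */
  assert (aarch64_tribools_ok_for_inlining_p (0, 1, 2, 1));
  /* Callee explicitly chose a non-default value that differs from the
     caller's choice: inlining is rejected.  */
  assert (!aarch64_tribools_ok_for_inlining_p (1, 0, 2, 1));
}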
17693 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
17694 to inline CALLEE into CALLER based on target-specific info.
17695 Make sure that the caller and callee have compatible architectural
17696 features. Then go through the other possible target attributes
17697 and see if they can block inlining. Try not to reject always_inline
17698 callees unless they are incompatible architecturally. */
17700 static bool
17701 aarch64_can_inline_p (tree caller, tree callee)
17703 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
17704 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
17706 struct cl_target_option *caller_opts
17707 = TREE_TARGET_OPTION (caller_tree ? caller_tree
17708 : target_option_default_node);
17710 struct cl_target_option *callee_opts
17711 = TREE_TARGET_OPTION (callee_tree ? callee_tree
17712 : target_option_default_node);
17714 /* Callee's ISA flags should be a subset of the caller's. */
17715 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
17716 != callee_opts->x_aarch64_isa_flags)
17717 return false;
17719 /* Allow non-strict-aligned functions to be inlined into
17720 strict-aligned ones. */
17721 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
17722 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
17723 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
17724 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
17725 return false;
17727 bool always_inline = lookup_attribute ("always_inline",
17728 DECL_ATTRIBUTES (callee));
17730 /* If the architectural features match up and the callee is always_inline
17731 then the other attributes don't matter. */
17732 if (always_inline)
17733 return true;
17735 if (caller_opts->x_aarch64_cmodel_var
17736 != callee_opts->x_aarch64_cmodel_var)
17737 return false;
17739 if (caller_opts->x_aarch64_tls_dialect
17740 != callee_opts->x_aarch64_tls_dialect)
17741 return false;
17743 /* Honour explicit requests to work around errata. */
17744 if (!aarch64_tribools_ok_for_inlining_p (
17745 caller_opts->x_aarch64_fix_a53_err835769,
17746 callee_opts->x_aarch64_fix_a53_err835769,
17747 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
17748 return false;
17750 if (!aarch64_tribools_ok_for_inlining_p (
17751 caller_opts->x_aarch64_fix_a53_err843419,
17752 callee_opts->x_aarch64_fix_a53_err843419,
17753 2, TARGET_FIX_ERR_A53_843419))
17754 return false;
17756 /* If the user explicitly specified -momit-leaf-frame-pointer for the
17757 caller and callee and they don't match up, reject inlining. */
17758 if (!aarch64_tribools_ok_for_inlining_p (
17759 caller_opts->x_flag_omit_leaf_frame_pointer,
17760 callee_opts->x_flag_omit_leaf_frame_pointer,
17761 2, 1))
17762 return false;
17764 /* If the callee has specific tuning overrides, respect them. */
17765 if (callee_opts->x_aarch64_override_tune_string != NULL
17766 && caller_opts->x_aarch64_override_tune_string == NULL)
17767 return false;
17769 /* If the user specified tuning override strings for the
17770 caller and callee and they don't match up, reject inlining.
17771 We just do a string compare here, we don't analyze the meaning
17772 of the string, as it would be too costly for little gain. */
17773 if (callee_opts->x_aarch64_override_tune_string
17774 && caller_opts->x_aarch64_override_tune_string
17775 && (strcmp (callee_opts->x_aarch64_override_tune_string,
17776 caller_opts->x_aarch64_override_tune_string) != 0))
17777 return false;
17779 return true;
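/* Illustrative sketch, not part of the original source: the ISA-flag test at
   the top of the function above is the usual subset bit trick, restated here
   as a hypothetical helper.  For example, a callee built with +simd can be
   inlined into a caller built with +simd+crypto, but not vice versa.  */
#include <stdbool.h>
#include <stdint.h>

static inline bool
isa_subset_p (uint64_t caller_flags, uint64_t callee_flags)
{
  /* Every feature bit required by the callee must also be set for the
     caller.  */
  return (caller_flags & callee_flags) == callee_flags;
}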
17782 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it
17783 hasn't been initialized already. */
17785 unsigned int
17786 aarch64_tlsdesc_abi_id ()
17788 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
17789 if (!tlsdesc_abi.initialized_p ())
17791 HARD_REG_SET full_reg_clobbers;
17792 CLEAR_HARD_REG_SET (full_reg_clobbers);
17793 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
17794 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
17795 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
17796 SET_HARD_REG_BIT (full_reg_clobbers, regno);
17797 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
17799 return tlsdesc_abi.id ();
17802 /* Return true if SYMBOL_REF X binds locally. */
17804 static bool
17805 aarch64_symbol_binds_local_p (const_rtx x)
17807 return (SYMBOL_REF_DECL (x)
17808 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
17809 : SYMBOL_REF_LOCAL_P (x));
17812 /* Return true if SYMBOL_REF X is thread local. */
17813 static bool
17814 aarch64_tls_symbol_p (rtx x)
17816 if (! TARGET_HAVE_TLS)
17817 return false;
17819 x = strip_salt (x);
17820 if (!SYMBOL_REF_P (x))
17821 return false;
17823 return SYMBOL_REF_TLS_MODEL (x) != 0;
17826 /* Classify a TLS symbol into one of the TLS kinds. */
17827 enum aarch64_symbol_type
17828 aarch64_classify_tls_symbol (rtx x)
17830 enum tls_model tls_kind = tls_symbolic_operand_type (x);
17832 switch (tls_kind)
17834 case TLS_MODEL_GLOBAL_DYNAMIC:
17835 case TLS_MODEL_LOCAL_DYNAMIC:
17836 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
17838 case TLS_MODEL_INITIAL_EXEC:
17839 switch (aarch64_cmodel)
17841 case AARCH64_CMODEL_TINY:
17842 case AARCH64_CMODEL_TINY_PIC:
17843 return SYMBOL_TINY_TLSIE;
17844 default:
17845 return SYMBOL_SMALL_TLSIE;
17848 case TLS_MODEL_LOCAL_EXEC:
17849 if (aarch64_tls_size == 12)
17850 return SYMBOL_TLSLE12;
17851 else if (aarch64_tls_size == 24)
17852 return SYMBOL_TLSLE24;
17853 else if (aarch64_tls_size == 32)
17854 return SYMBOL_TLSLE32;
17855 else if (aarch64_tls_size == 48)
17856 return SYMBOL_TLSLE48;
17857 else
17858 gcc_unreachable ();
17860 case TLS_MODEL_EMULATED:
17861 case TLS_MODEL_NONE:
17862 return SYMBOL_FORCE_TO_MEM;
17864 default:
17865 gcc_unreachable ();
17869 /* Return the correct method for accessing X + OFFSET, where X is either
17870 a SYMBOL_REF or LABEL_REF. */
17872 enum aarch64_symbol_type
17873 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
17875 x = strip_salt (x);
17877 if (LABEL_REF_P (x))
17879 switch (aarch64_cmodel)
17881 case AARCH64_CMODEL_LARGE:
17882 return SYMBOL_FORCE_TO_MEM;
17884 case AARCH64_CMODEL_TINY_PIC:
17885 case AARCH64_CMODEL_TINY:
17886 return SYMBOL_TINY_ABSOLUTE;
17888 case AARCH64_CMODEL_SMALL_SPIC:
17889 case AARCH64_CMODEL_SMALL_PIC:
17890 case AARCH64_CMODEL_SMALL:
17891 return SYMBOL_SMALL_ABSOLUTE;
17893 default:
17894 gcc_unreachable ();
17898 if (SYMBOL_REF_P (x))
17900 if (aarch64_tls_symbol_p (x))
17901 return aarch64_classify_tls_symbol (x);
17903 switch (aarch64_cmodel)
17905 case AARCH64_CMODEL_TINY_PIC:
17906 case AARCH64_CMODEL_TINY:
17907 /* With -fPIC non-local symbols use the GOT. For orthogonality
17908 always use the GOT for extern weak symbols. */
17909 if ((flag_pic || SYMBOL_REF_WEAK (x))
17910 && !aarch64_symbol_binds_local_p (x))
17911 return SYMBOL_TINY_GOT;
17913 /* When we retrieve symbol + offset address, we have to make sure
17914 the offset does not cause overflow of the final address. But
17915 we have no way of knowing the address of symbol at compile time
17916 so we can't accurately say if the distance between the PC and
17917 symbol + offset is outside the addressable range of +/-1MB in the
17918 TINY code model. So we limit the maximum offset to +/-64KB and
17919 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
17920 If offset_within_block_p is true we allow larger offsets. */
17921 if (!(IN_RANGE (offset, -0x10000, 0x10000)
17922 || offset_within_block_p (x, offset)))
17923 return SYMBOL_FORCE_TO_MEM;
17925 return SYMBOL_TINY_ABSOLUTE;
17928 case AARCH64_CMODEL_SMALL_SPIC:
17929 case AARCH64_CMODEL_SMALL_PIC:
17930 case AARCH64_CMODEL_SMALL:
17931 if ((flag_pic || SYMBOL_REF_WEAK (x))
17932 && !aarch64_symbol_binds_local_p (x))
17933 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
17934 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G;
17936 /* Same reasoning as the tiny code model, but the offset cap here is
17937 1MB, allowing +/-3.9GB for the offset to the symbol. */
17938 if (!(IN_RANGE (offset, -0x100000, 0x100000)
17939 || offset_within_block_p (x, offset)))
17940 return SYMBOL_FORCE_TO_MEM;
17942 return SYMBOL_SMALL_ABSOLUTE;
17944 case AARCH64_CMODEL_LARGE:
17945 /* This is alright even in PIC code as the constant
17946 pool reference is always PC relative and within
17947 the same translation unit. */
17948 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
17949 return SYMBOL_SMALL_ABSOLUTE;
17950 else
17951 return SYMBOL_FORCE_TO_MEM;
17953 default:
17954 gcc_unreachable ();
17958 /* By default push everything into the constant pool. */
17959 return SYMBOL_FORCE_TO_MEM;
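/* Illustrative sketch, not part of the original source: the offset caps used
   above, written as a standalone predicate.  For the tiny code model a
   symbol + offset reference is kept only if the offset is within +/-64KB,
   for the small model within +/-1MB; larger offsets (unless known to stay
   within the symbol's own block) are forced to memory.
   offset_fits_code_model is a hypothetical helper.  */
#include <stdbool.h>

static bool
offset_fits_code_model (long long offset, bool tiny_model)
{
  long long cap = tiny_model ? 0x10000 : 0x100000;   /* 64KB or 1MB.  */
  return offset >= -cap && offset <= cap;
}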
17962 bool
17963 aarch64_constant_address_p (rtx x)
17965 return (CONSTANT_P (x) && memory_address_p (DImode, x));
17968 bool
17969 aarch64_legitimate_pic_operand_p (rtx x)
17971 poly_int64 offset;
17972 x = strip_offset_and_salt (x, &offset);
17973 if (SYMBOL_REF_P (x))
17974 return false;
17976 return true;
17979 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
17980 that should be rematerialized rather than spilled. */
17982 static bool
17983 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
17985 /* Support CSE and rematerialization of common constants. */
17986 if (CONST_INT_P (x)
17987 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT))
17988 return true;
17990 /* Only accept variable-length vector constants if they can be
17991 handled directly.
17993 ??? It would be possible (but complex) to handle rematerialization
17994 of other constants via secondary reloads. */
17995 if (!GET_MODE_SIZE (mode).is_constant ())
17996 return aarch64_simd_valid_immediate (x, NULL);
17998 /* Otherwise, accept any CONST_VECTOR that, if all else fails, can at
17999 least be forced to memory and loaded from there. */
18000 if (GET_CODE (x) == CONST_VECTOR)
18001 return !targetm.cannot_force_const_mem (mode, x);
18003 /* Do not allow vector struct mode constants for Advanced SIMD.
18004 We could support 0 and -1 easily, but they need support in
18005 aarch64-simd.md. */
18006 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
18007 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
18008 return false;
18010 if (GET_CODE (x) == HIGH)
18011 x = XEXP (x, 0);
18013 /* Accept polynomial constants that can be calculated by using the
18014 destination of a move as the sole temporary. Constants that
18015 require a second temporary cannot be rematerialized (they can't be
18016 forced to memory and also aren't legitimate constants). */
18017 poly_int64 offset;
18018 if (poly_int_rtx_p (x, &offset))
18019 return aarch64_offset_temporaries (false, offset) <= 1;
18021 /* If an offset is being added to something else, we need to allow the
18022 base to be moved into the destination register, meaning that there
18023 are no free temporaries for the offset. */
18024 x = strip_offset_and_salt (x, &offset);
18025 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
18026 return false;
18028 /* Do not allow const (plus (anchor_symbol, const_int)). */
18029 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
18030 return false;
18032 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
18033 so spilling them is better than rematerialization. */
18034 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
18035 return true;
18037 /* Label references are always constant. */
18038 if (LABEL_REF_P (x))
18039 return true;
18041 return false;
18044 rtx
18045 aarch64_load_tp (rtx target)
18047 if (!target
18048 || GET_MODE (target) != Pmode
18049 || !register_operand (target, Pmode))
18050 target = gen_reg_rtx (Pmode);
18052 /* Can return in any reg. */
18053 emit_insn (gen_aarch64_load_tp_hard (target));
18054 return target;
18057 /* On AAPCS systems, this is the "struct __va_list". */
18058 static GTY(()) tree va_list_type;
18060 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
18061 Return the type to use as __builtin_va_list.
18063 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
18065 struct __va_list
18067 void *__stack;
18068 void *__gr_top;
18069 void *__vr_top;
18070 int __gr_offs;
18071 int __vr_offs;
18072 }; */
18074 static tree
18075 aarch64_build_builtin_va_list (void)
18077 tree va_list_name;
18078 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
18080 /* Create the type. */
18081 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
18082 /* Give it the required name. */
18083 va_list_name = build_decl (BUILTINS_LOCATION,
18084 TYPE_DECL,
18085 get_identifier ("__va_list"),
18086 va_list_type);
18087 DECL_ARTIFICIAL (va_list_name) = 1;
18088 TYPE_NAME (va_list_type) = va_list_name;
18089 TYPE_STUB_DECL (va_list_type) = va_list_name;
18091 /* Create the fields. */
18092 f_stack = build_decl (BUILTINS_LOCATION,
18093 FIELD_DECL, get_identifier ("__stack"),
18094 ptr_type_node);
18095 f_grtop = build_decl (BUILTINS_LOCATION,
18096 FIELD_DECL, get_identifier ("__gr_top"),
18097 ptr_type_node);
18098 f_vrtop = build_decl (BUILTINS_LOCATION,
18099 FIELD_DECL, get_identifier ("__vr_top"),
18100 ptr_type_node);
18101 f_groff = build_decl (BUILTINS_LOCATION,
18102 FIELD_DECL, get_identifier ("__gr_offs"),
18103 integer_type_node);
18104 f_vroff = build_decl (BUILTINS_LOCATION,
18105 FIELD_DECL, get_identifier ("__vr_offs"),
18106 integer_type_node);
18108 /* Tell tree-stdarg pass about our internal offset fields.
18109 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
18110 purposes, to identify whether the code is updating the va_list internal
18111 offset fields in an irregular way. */
18112 va_list_gpr_counter_field = f_groff;
18113 va_list_fpr_counter_field = f_vroff;
18115 DECL_ARTIFICIAL (f_stack) = 1;
18116 DECL_ARTIFICIAL (f_grtop) = 1;
18117 DECL_ARTIFICIAL (f_vrtop) = 1;
18118 DECL_ARTIFICIAL (f_groff) = 1;
18119 DECL_ARTIFICIAL (f_vroff) = 1;
18121 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
18122 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
18123 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
18124 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
18125 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
18127 TYPE_FIELDS (va_list_type) = f_stack;
18128 DECL_CHAIN (f_stack) = f_grtop;
18129 DECL_CHAIN (f_grtop) = f_vrtop;
18130 DECL_CHAIN (f_vrtop) = f_groff;
18131 DECL_CHAIN (f_groff) = f_vroff;
18133 /* Compute its layout. */
18134 layout_type (va_list_type);
18136 return va_list_type;
18139 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
18140 static void
18141 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
18143 const CUMULATIVE_ARGS *cum;
18144 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
18145 tree stack, grtop, vrtop, groff, vroff;
18146 tree t;
18147 int gr_save_area_size = cfun->va_list_gpr_size;
18148 int vr_save_area_size = cfun->va_list_fpr_size;
18149 int vr_offset;
18151 cum = &crtl->args.info;
18152 if (cfun->va_list_gpr_size)
18153 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
18154 cfun->va_list_gpr_size);
18155 if (cfun->va_list_fpr_size)
18156 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
18157 * UNITS_PER_VREG, cfun->va_list_fpr_size);
18159 if (!TARGET_FLOAT)
18161 gcc_assert (cum->aapcs_nvrn == 0);
18162 vr_save_area_size = 0;
18165 f_stack = TYPE_FIELDS (va_list_type_node);
18166 f_grtop = DECL_CHAIN (f_stack);
18167 f_vrtop = DECL_CHAIN (f_grtop);
18168 f_groff = DECL_CHAIN (f_vrtop);
18169 f_vroff = DECL_CHAIN (f_groff);
18171 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
18172 NULL_TREE);
18173 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
18174 NULL_TREE);
18175 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
18176 NULL_TREE);
18177 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
18178 NULL_TREE);
18179 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
18180 NULL_TREE);
18182 /* Emit code to initialize STACK, which points to the next varargs stack
18183 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
18184 by named arguments. STACK is 8-byte aligned. */
18185 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
18186 if (cum->aapcs_stack_size > 0)
18187 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
18188 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
18189 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18191 /* Emit code to initialize GRTOP, the top of the GR save area.
18192 virtual_incoming_args_rtx should have been 16-byte aligned. */
18193 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
18194 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
18195 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18197 /* Emit code to initialize VRTOP, the top of the VR save area.
18198 This address is gr_save_area_bytes below GRTOP, rounded
18199 down to the next 16-byte boundary. */
18200 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
18201 vr_offset = ROUND_UP (gr_save_area_size,
18202 STACK_BOUNDARY / BITS_PER_UNIT);
18204 if (vr_offset)
18205 t = fold_build_pointer_plus_hwi (t, -vr_offset);
18206 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
18207 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18209 /* Emit code to initialize GROFF, the offset from GRTOP of the
18210 next GPR argument. */
18211 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
18212 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
18213 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
18215 /* Likewise emit code to initialize VROFF, the offset from VRTOP
18216 of the next VR argument. */
18217 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
18218 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
18219 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
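/* For illustration only: for a function such as 'void f (int n, ...)' that
   was passed one named integer argument and no named FP arguments, and
   assuming the tree-stdarg pass has not shrunk the save areas, the code
   above initializes the va_list roughly as:

     __stack   = virtual_incoming_args_rtx;        // no named stack args
     __gr_top  = virtual_incoming_args_rtx;
     __vr_top  = __gr_top - ROUND_UP (7 * 8, 16);  // 7 unused x-regs saved
     __gr_offs = -7 * 8;                           // -56
     __vr_offs = -8 * 16;                          // -128, 8 unused v-regs

   The register save areas themselves are laid out by
   aarch64_setup_incoming_varargs below.  */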
18222 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
18224 static tree
18225 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
18226 gimple_seq *post_p ATTRIBUTE_UNUSED)
18228 tree addr;
18229 bool indirect_p;
18230 bool is_ha; /* is HFA or HVA. */
18231 bool dw_align; /* double-word align. */
18232 machine_mode ag_mode = VOIDmode;
18233 int nregs;
18234 machine_mode mode;
18236 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
18237 tree stack, f_top, f_off, off, arg, roundup, on_stack;
18238 HOST_WIDE_INT size, rsize, adjust, align;
18239 tree t, u, cond1, cond2;
18241 indirect_p = pass_va_arg_by_reference (type);
18242 if (indirect_p)
18243 type = build_pointer_type (type);
18245 mode = TYPE_MODE (type);
18247 f_stack = TYPE_FIELDS (va_list_type_node);
18248 f_grtop = DECL_CHAIN (f_stack);
18249 f_vrtop = DECL_CHAIN (f_grtop);
18250 f_groff = DECL_CHAIN (f_vrtop);
18251 f_vroff = DECL_CHAIN (f_groff);
18253 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
18254 f_stack, NULL_TREE);
18255 size = int_size_in_bytes (type);
18257 unsigned int abi_break;
18258 align
18259 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
18261 dw_align = false;
18262 adjust = 0;
18263 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
18264 &is_ha, false))
18266 /* No frontends can create types with variable-sized modes, so we
18267 shouldn't be asked to pass or return them. */
18268 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
18270 /* TYPE passed in fp/simd registers. */
18271 if (!TARGET_FLOAT)
18272 aarch64_err_no_fpadvsimd (mode);
18274 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
18275 unshare_expr (valist), f_vrtop, NULL_TREE);
18276 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
18277 unshare_expr (valist), f_vroff, NULL_TREE);
18279 rsize = nregs * UNITS_PER_VREG;
18281 if (is_ha)
18283 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
18284 adjust = UNITS_PER_VREG - ag_size;
18286 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
18287 && size < UNITS_PER_VREG)
18289 adjust = UNITS_PER_VREG - size;
18292 else
18294 /* TYPE passed in general registers. */
18295 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
18296 unshare_expr (valist), f_grtop, NULL_TREE);
18297 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
18298 unshare_expr (valist), f_groff, NULL_TREE);
18299 rsize = ROUND_UP (size, UNITS_PER_WORD);
18300 nregs = rsize / UNITS_PER_WORD;
18302 if (align > 8)
18304 if (abi_break && warn_psabi)
18305 inform (input_location, "parameter passing for argument of type "
18306 "%qT changed in GCC 9.1", type);
18307 dw_align = true;
18310 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
18311 && size < UNITS_PER_WORD)
18313 adjust = UNITS_PER_WORD - size;
18317 /* Get a local temporary for the field value. */
18318 off = get_initialized_tmp_var (f_off, pre_p, NULL);
18320 /* Emit code to branch if off >= 0. */
18321 t = build2 (GE_EXPR, boolean_type_node, off,
18322 build_int_cst (TREE_TYPE (off), 0));
18323 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
18325 if (dw_align)
18327 /* Emit: offs = (offs + 15) & -16. */
18328 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
18329 build_int_cst (TREE_TYPE (off), 15));
18330 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
18331 build_int_cst (TREE_TYPE (off), -16));
18332 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
18334 else
18335 roundup = NULL;
18337 /* Update ap.__[g|v]r_offs */
18338 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
18339 build_int_cst (TREE_TYPE (off), rsize));
18340 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
18342 /* String up. */
18343 if (roundup)
18344 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
18346 /* [cond2] if (ap.__[g|v]r_offs > 0) */
18347 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
18348 build_int_cst (TREE_TYPE (f_off), 0));
18349 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
18351 /* String up: make sure the assignment happens before the use. */
18352 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
18353 COND_EXPR_ELSE (cond1) = t;
18355 /* Prepare the trees handling the argument that is passed on the stack;
18356 the top-level node will be stored in ON_STACK. */
18357 arg = get_initialized_tmp_var (stack, pre_p, NULL);
18358 if (align > 8)
18360 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
18361 t = fold_build_pointer_plus_hwi (arg, 15);
18362 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
18363 build_int_cst (TREE_TYPE (t), -16));
18364 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
18366 else
18367 roundup = NULL;
18368 /* Advance ap.__stack */
18369 t = fold_build_pointer_plus_hwi (arg, size + 7);
18370 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
18371 build_int_cst (TREE_TYPE (t), -8));
18372 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
18373 /* String up roundup and advance. */
18374 if (roundup)
18375 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
18376 /* String up with arg */
18377 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
18378 /* Big-endianness related address adjustment. */
18379 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
18380 && size < UNITS_PER_WORD)
18382 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
18383 size_int (UNITS_PER_WORD - size));
18384 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
18387 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
18388 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
18390 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
18391 t = off;
18392 if (adjust)
18393 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
18394 build_int_cst (TREE_TYPE (off), adjust));
18396 t = fold_convert (sizetype, t);
18397 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
18399 if (is_ha)
18401 /* type ha; // treat as "struct {ftype field[n];}"
18402 ... [computing offs]
18403 for (i = 0; i < nregs; ++i, offs += 16)
18404 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
18405 return ha; */
18406 int i;
18407 tree tmp_ha, field_t, field_ptr_t;
18409 /* Declare a local variable. */
18410 tmp_ha = create_tmp_var_raw (type, "ha");
18411 gimple_add_tmp_var (tmp_ha);
18413 /* Establish the base type. */
18414 switch (ag_mode)
18416 case E_SFmode:
18417 field_t = float_type_node;
18418 field_ptr_t = float_ptr_type_node;
18419 break;
18420 case E_DFmode:
18421 field_t = double_type_node;
18422 field_ptr_t = double_ptr_type_node;
18423 break;
18424 case E_TFmode:
18425 field_t = long_double_type_node;
18426 field_ptr_t = long_double_ptr_type_node;
18427 break;
18428 case E_HFmode:
18429 field_t = aarch64_fp16_type_node;
18430 field_ptr_t = aarch64_fp16_ptr_type_node;
18431 break;
18432 case E_BFmode:
18433 field_t = aarch64_bf16_type_node;
18434 field_ptr_t = aarch64_bf16_ptr_type_node;
18435 break;
18436 case E_V2SImode:
18437 case E_V4SImode:
18439 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
18440 field_t = build_vector_type_for_mode (innertype, ag_mode);
18441 field_ptr_t = build_pointer_type (field_t);
18443 break;
18444 default:
18445 gcc_assert (0);
18448 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area).  */
18449 TREE_ADDRESSABLE (tmp_ha) = 1;
18450 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
18451 addr = t;
18452 t = fold_convert (field_ptr_t, addr);
18453 t = build2 (MODIFY_EXPR, field_t,
18454 build1 (INDIRECT_REF, field_t, tmp_ha),
18455 build1 (INDIRECT_REF, field_t, t));
18457 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
18458 for (i = 1; i < nregs; ++i)
18460 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
18461 u = fold_convert (field_ptr_t, addr);
18462 u = build2 (MODIFY_EXPR, field_t,
18463 build2 (MEM_REF, field_t, tmp_ha,
18464 build_int_cst (field_ptr_t,
18465 (i *
18466 int_size_in_bytes (field_t)))),
18467 build1 (INDIRECT_REF, field_t, u));
18468 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
18471 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
18472 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
18475 COND_EXPR_ELSE (cond2) = t;
18476 addr = fold_convert (build_pointer_type (type), cond1);
18477 addr = build_va_arg_indirect_ref (addr);
18479 if (indirect_p)
18480 addr = build_va_arg_indirect_ref (addr);
18482 return addr;
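/* For illustration only, the overall GIMPLE structure built above for a
   general-register argument is roughly equivalent to:

     off = ap.__gr_offs;
     if (off >= 0)
       addr = on_stack;                  // registers already exhausted
     else
       {
         ap.__gr_offs = off + rsize;     // claim the register slot(s)
         if (ap.__gr_offs > 0)
           addr = on_stack;              // not enough registers left
         else
           addr = ap.__gr_top + off;     // plus big-endian ADJUST if needed
       }

   with an analogous form using __vr_top/__vr_offs for FP/SIMD candidates,
   plus the extra copy-out loop above when the argument is an HFA/HVA.  */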
18485 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
18487 static void
18488 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
18489 const function_arg_info &arg,
18490 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
18492 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
18493 CUMULATIVE_ARGS local_cum;
18494 int gr_saved = cfun->va_list_gpr_size;
18495 int vr_saved = cfun->va_list_fpr_size;
18497 /* The caller has advanced CUM up to, but not beyond, the last named
18498 argument. Advance a local copy of CUM past the last "real" named
18499 argument, to find out how many registers are left over. */
18500 local_cum = *cum;
18501 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
18503 /* Find out how many registers we need to save.
18504 Honor the tree-stdarg analysis results. */
18505 if (cfun->va_list_gpr_size)
18506 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
18507 cfun->va_list_gpr_size / UNITS_PER_WORD);
18508 if (cfun->va_list_fpr_size)
18509 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
18510 cfun->va_list_fpr_size / UNITS_PER_VREG);
18512 if (!TARGET_FLOAT)
18514 gcc_assert (local_cum.aapcs_nvrn == 0);
18515 vr_saved = 0;
18518 if (!no_rtl)
18520 if (gr_saved > 0)
18522 rtx ptr, mem;
18524 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
18525 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
18526 - gr_saved * UNITS_PER_WORD);
18527 mem = gen_frame_mem (BLKmode, ptr);
18528 set_mem_alias_set (mem, get_varargs_alias_set ());
18530 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
18531 mem, gr_saved);
18533 if (vr_saved > 0)
18535 /* We can't use move_block_from_reg, because it will use
18536 the wrong mode, storing D regs only. */
18537 machine_mode mode = TImode;
18538 int off, i, vr_start;
18540 /* Set OFF to the offset from virtual_incoming_args_rtx of
18541 the first vector register. The VR save area lies below
18542 the GR one, and is aligned to 16 bytes. */
18543 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
18544 STACK_BOUNDARY / BITS_PER_UNIT);
18545 off -= vr_saved * UNITS_PER_VREG;
18547 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
18548 for (i = 0; i < vr_saved; ++i)
18550 rtx ptr, mem;
18552 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
18553 mem = gen_frame_mem (mode, ptr);
18554 set_mem_alias_set (mem, get_varargs_alias_set ());
18555 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
18556 off += UNITS_PER_VREG;
18561 /* We don't save the size into *PRETEND_SIZE because we want to avoid
18562 the complications of having crtl->args.pretend_args_size change. */
18563 cfun->machine->frame.saved_varargs_size
18564 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
18565 STACK_BOUNDARY / BITS_PER_UNIT)
18566 + vr_saved * UNITS_PER_VREG);
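/* For illustration only: with GR_SAVED leftover general registers and
   VR_SAVED leftover FP/SIMD registers, the code above produces this layout
   relative to virtual_incoming_args_rtx (called INCOMING here):

     [INCOMING + aapcs_stack_size * 8, ...)             anonymous stack args
     [INCOMING - GR_SAVED * 8, INCOMING)                GP register save area
     [VR_BASE, INCOMING - ROUND_UP (GR_SAVED * 8, 16))  FP/SIMD save area

   where VR_BASE = INCOMING - ROUND_UP (GR_SAVED * 8, 16) - VR_SAVED * 16,
   matching the __gr_top/__vr_top values set up by va_start above.  */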
18569 static void
18570 aarch64_conditional_register_usage (void)
18572 int i;
18573 if (!TARGET_FLOAT)
18575 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
18577 fixed_regs[i] = 1;
18578 call_used_regs[i] = 1;
18581 if (!TARGET_SVE)
18582 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
18584 fixed_regs[i] = 1;
18585 call_used_regs[i] = 1;
18588 /* Only allow the FFR and FFRT to be accessed via special patterns. */
18589 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
18590 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
18592 /* When tracking speculation, we need a couple of call-clobbered registers
18593 to track the speculation state. It would be nice to just use
18594 IP0 and IP1, but currently there are numerous places that just
18595 assume these registers are free for other uses (e.g. pointer
18596 authentication). */
18597 if (aarch64_track_speculation)
18599 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
18600 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
18601 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
18602 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
18606 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
18608 bool
18609 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
18611 /* For records we're passed a FIELD_DECL, for arrays we're passed
18612 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
18613 const_tree type = TREE_TYPE (field_or_array);
18615 /* Assign BLKmode to anything that contains multiple SVE predicates.
18616 For structures, the "multiple" case is indicated by MODE being
18617 VOIDmode. */
18618 unsigned int num_zr, num_pr;
18619 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
18621 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
18622 return !simple_cst_equal (TYPE_SIZE (field_or_array),
18623 TYPE_SIZE (type));
18624 return mode == VOIDmode;
18627 return default_member_type_forces_blk (field_or_array, mode);
18630 /* Bitmasks that indicate whether earlier versions of GCC would have
18631 taken a different path through the ABI logic. This should result in
18632 a -Wpsabi warning if the earlier path led to a different ABI decision.
18634 WARN_PSABI_EMPTY_CXX17_BASE
18635 Indicates that the type includes an artificial empty C++17 base field
18636 that, prior to GCC 10.1, would prevent the type from being treated as
18637 a HFA or HVA. See PR94383 for details.
18639 WARN_PSABI_NO_UNIQUE_ADDRESS
18640 Indicates that the type includes an empty [[no_unique_address]] field
18641 that, prior to GCC 10.1, would prevent the type from being treated as
18642 a HFA or HVA. */
18643 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
18644 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
18646 /* Walk down the type tree of TYPE counting consecutive base elements.
18647 If *MODEP is VOIDmode, then set it to the first valid floating point
18648 type. If a non-floating point type is found, or if a floating point
18649 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
18650 otherwise return the count in the sub-tree.
18652 The WARN_PSABI_FLAGS argument allows the caller to check whether this
18653 function has changed its behavior relative to earlier versions of GCC.
18654 Normally the argument should be nonnull and point to a zero-initialized
18655 variable. The function then records whether the ABI decision might
18656 be affected by a known fix to the ABI logic, setting the associated
18657 WARN_PSABI_* bits if so.
18659 When the argument is instead a null pointer, the function tries to
18660 simulate the behavior of GCC before all such ABI fixes were made.
18661 This is useful to check whether the function returns something
18662 different after the ABI fixes. */
18663 static int
18664 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
18665 unsigned int *warn_psabi_flags)
18667 machine_mode mode;
18668 HOST_WIDE_INT size;
18670 if (aarch64_sve::builtin_type_p (type))
18671 return -1;
18673 switch (TREE_CODE (type))
18675 case REAL_TYPE:
18676 mode = TYPE_MODE (type);
18677 if (mode != DFmode && mode != SFmode
18678 && mode != TFmode && mode != HFmode)
18679 return -1;
18681 if (*modep == VOIDmode)
18682 *modep = mode;
18684 if (*modep == mode)
18685 return 1;
18687 break;
18689 case COMPLEX_TYPE:
18690 mode = TYPE_MODE (TREE_TYPE (type));
18691 if (mode != DFmode && mode != SFmode
18692 && mode != TFmode && mode != HFmode)
18693 return -1;
18695 if (*modep == VOIDmode)
18696 *modep = mode;
18698 if (*modep == mode)
18699 return 2;
18701 break;
18703 case VECTOR_TYPE:
18704 /* Use V2SImode and V4SImode as representatives of all 64-bit
18705 and 128-bit vector types. */
18706 size = int_size_in_bytes (type);
18707 switch (size)
18709 case 8:
18710 mode = V2SImode;
18711 break;
18712 case 16:
18713 mode = V4SImode;
18714 break;
18715 default:
18716 return -1;
18719 if (*modep == VOIDmode)
18720 *modep = mode;
18722 /* Vector modes are considered to be opaque: two vectors are
18723 equivalent for the purposes of being homogeneous aggregates
18724 if they are the same size. */
18725 if (*modep == mode)
18726 return 1;
18728 break;
18730 case ARRAY_TYPE:
18732 int count;
18733 tree index = TYPE_DOMAIN (type);
18735 /* Can't handle incomplete types nor sizes that are not
18736 fixed. */
18737 if (!COMPLETE_TYPE_P (type)
18738 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
18739 return -1;
18741 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
18742 warn_psabi_flags);
18743 if (count == -1
18744 || !index
18745 || !TYPE_MAX_VALUE (index)
18746 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
18747 || !TYPE_MIN_VALUE (index)
18748 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
18749 || count < 0)
18750 return -1;
18752 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
18753 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
18755 /* There must be no padding. */
18756 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
18757 count * GET_MODE_BITSIZE (*modep)))
18758 return -1;
18760 return count;
18763 case RECORD_TYPE:
18765 int count = 0;
18766 int sub_count;
18767 tree field;
18769 /* Can't handle incomplete types nor sizes that are not
18770 fixed. */
18771 if (!COMPLETE_TYPE_P (type)
18772 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
18773 return -1;
18775 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
18777 if (TREE_CODE (field) != FIELD_DECL)
18778 continue;
18780 if (DECL_FIELD_ABI_IGNORED (field))
18782 /* See whether this is something that earlier versions of
18783 GCC failed to ignore. */
18784 unsigned int flag;
18785 if (lookup_attribute ("no_unique_address",
18786 DECL_ATTRIBUTES (field)))
18787 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
18788 else if (cxx17_empty_base_field_p (field))
18789 flag = WARN_PSABI_EMPTY_CXX17_BASE;
18790 else
18791 /* No compatibility problem. */
18792 continue;
18794 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
18795 if (warn_psabi_flags)
18797 *warn_psabi_flags |= flag;
18798 continue;
18802 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
18803 warn_psabi_flags);
18804 if (sub_count < 0)
18805 return -1;
18806 count += sub_count;
18809 /* There must be no padding. */
18810 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
18811 count * GET_MODE_BITSIZE (*modep)))
18812 return -1;
18814 return count;
18817 case UNION_TYPE:
18818 case QUAL_UNION_TYPE:
18820 /* These aren't very interesting except in a degenerate case. */
18821 int count = 0;
18822 int sub_count;
18823 tree field;
18825 /* Can't handle incomplete types nor sizes that are not
18826 fixed. */
18827 if (!COMPLETE_TYPE_P (type)
18828 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
18829 return -1;
18831 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
18833 if (TREE_CODE (field) != FIELD_DECL)
18834 continue;
18836 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
18837 warn_psabi_flags);
18838 if (sub_count < 0)
18839 return -1;
18840 count = count > sub_count ? count : sub_count;
18843 /* There must be no padding. */
18844 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
18845 count * GET_MODE_BITSIZE (*modep)))
18846 return -1;
18848 return count;
18851 default:
18852 break;
18855 return -1;
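/* Some illustrative results of the walk above (AAPCS64 HFA/HVA rules);
   int32x4_t here stands for the arm_neon.h vector type:

     struct { float x, y, z; }        -> 3, *modep == SFmode   (HFA)
     struct { double d[4]; }          -> 4, *modep == DFmode   (HFA)
     _Complex double                  -> 2, *modep == DFmode
     struct { int32x4_t a, b; }       -> 2, *modep == V4SImode (HVA)
     struct { float f; double d; }    -> -1 (mixed element modes)
     struct { float f[5]; }           -> 5 (rejected later: > 4 fields)

   The last case is only rejected by the HA_MAX_NUM_FLDS check in
   aarch64_vfp_is_call_or_return_candidate, not by this function.  */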
18858 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
18859 type as described in AAPCS64 \S 4.1.2.
18861 See the comment above aarch64_composite_type_p for the notes on MODE. */
18863 static bool
18864 aarch64_short_vector_p (const_tree type,
18865 machine_mode mode)
18867 poly_int64 size = -1;
18869 if (type && TREE_CODE (type) == VECTOR_TYPE)
18871 if (aarch64_sve::builtin_type_p (type))
18872 return false;
18873 size = int_size_in_bytes (type);
18875 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
18876 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
18878 /* Rely only on the type, not the mode, when processing SVE types. */
18879 if (type && aarch64_some_values_include_pst_objects_p (type))
18880 /* Leave later code to report an error if SVE is disabled. */
18881 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
18882 else
18883 size = GET_MODE_SIZE (mode);
18885 if (known_eq (size, 8) || known_eq (size, 16))
18887 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
18888 they are being treated as scalable AAPCS64 types. */
18889 gcc_assert (!aarch64_sve_mode_p (mode));
18890 return true;
18892 return false;
18895 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
18896 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
18897 array types. The C99 floating-point complex types are also considered
18898 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
18899 types, which are GCC extensions and out of the scope of AAPCS64, are
18900 treated as composite types here as well.
18902 Note that MODE itself is not sufficient in determining whether a type
18903 is such a composite type or not. This is because
18904 stor-layout.c:compute_record_mode may have already changed the MODE
18905 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
18906 structure with only one field may have its MODE set to the mode of the
18907 field. Also an integer mode whose size matches the size of the
18908 RECORD_TYPE type may be used as a substitute for the original mode
18909 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
18910 relied on by itself. */
18912 static bool
18913 aarch64_composite_type_p (const_tree type,
18914 machine_mode mode)
18916 if (aarch64_short_vector_p (type, mode))
18917 return false;
18919 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
18920 return true;
18922 if (mode == BLKmode
18923 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
18924 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
18925 return true;
18927 return false;
18930 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
18931 shall be passed or returned in simd/fp register(s) (providing these
18932 parameter passing registers are available).
18934 Upon successful return, *COUNT returns the number of needed registers,
18935 *BASE_MODE returns the mode of the individual register and when IS_HA
18936 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
18937 floating-point aggregate or a homogeneous short-vector aggregate.
18939 SILENT_P is true if the function should refrain from reporting any
18940 diagnostics. This should only be used if the caller is certain that
18941 any ABI decisions would eventually come through this function with
18942 SILENT_P set to false. */
18944 static bool
18945 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
18946 const_tree type,
18947 machine_mode *base_mode,
18948 int *count,
18949 bool *is_ha,
18950 bool silent_p)
18952 if (is_ha != NULL) *is_ha = false;
18954 machine_mode new_mode = VOIDmode;
18955 bool composite_p = aarch64_composite_type_p (type, mode);
18957 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
18958 || aarch64_short_vector_p (type, mode))
18960 *count = 1;
18961 new_mode = mode;
18963 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
18965 if (is_ha != NULL) *is_ha = true;
18966 *count = 2;
18967 new_mode = GET_MODE_INNER (mode);
18969 else if (type && composite_p)
18971 unsigned int warn_psabi_flags = 0;
18972 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
18973 &warn_psabi_flags);
18974 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
18976 static unsigned last_reported_type_uid;
18977 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
18978 int alt;
18979 if (!silent_p
18980 && warn_psabi
18981 && warn_psabi_flags
18982 && uid != last_reported_type_uid
18983 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
18984 != ag_count))
18986 const char *url
18987 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
18988 gcc_assert (alt == -1);
18989 last_reported_type_uid = uid;
18990 /* Use TYPE_MAIN_VARIANT to strip any redundant const
18991 qualification. */
18992 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
18993 inform (input_location, "parameter passing for argument of "
18994 "type %qT with %<[[no_unique_address]]%> members "
18995 "changed %{in GCC 10.1%}",
18996 TYPE_MAIN_VARIANT (type), url);
18997 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
18998 inform (input_location, "parameter passing for argument of "
18999 "type %qT when C++17 is enabled changed to match "
19000 "C++14 %{in GCC 10.1%}",
19001 TYPE_MAIN_VARIANT (type), url);
19004 if (is_ha != NULL) *is_ha = true;
19005 *count = ag_count;
19007 else
19008 return false;
19010 else
19011 return false;
19013 gcc_assert (!aarch64_sve_mode_p (new_mode));
19014 *base_mode = new_mode;
19015 return true;
19018 /* Implement TARGET_STRUCT_VALUE_RTX. */
19020 static rtx
19021 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
19022 int incoming ATTRIBUTE_UNUSED)
19024 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
19027 /* Implement TARGET_VECTOR_MODE_SUPPORTED_P. */
19028 static bool
19029 aarch64_vector_mode_supported_p (machine_mode mode)
19031 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19032 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
19035 /* Return the full-width SVE vector mode for element mode MODE, if one
19036 exists. */
19037 opt_machine_mode
19038 aarch64_full_sve_mode (scalar_mode mode)
19040 switch (mode)
19042 case E_DFmode:
19043 return VNx2DFmode;
19044 case E_SFmode:
19045 return VNx4SFmode;
19046 case E_HFmode:
19047 return VNx8HFmode;
19048 case E_BFmode:
19049 return VNx8BFmode;
19050 case E_DImode:
19051 return VNx2DImode;
19052 case E_SImode:
19053 return VNx4SImode;
19054 case E_HImode:
19055 return VNx8HImode;
19056 case E_QImode:
19057 return VNx16QImode;
19058 default:
19059 return opt_machine_mode ();
19063 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
19064 if it exists. */
19065 opt_machine_mode
19066 aarch64_vq_mode (scalar_mode mode)
19068 switch (mode)
19070 case E_DFmode:
19071 return V2DFmode;
19072 case E_SFmode:
19073 return V4SFmode;
19074 case E_HFmode:
19075 return V8HFmode;
19076 case E_BFmode:
19077 return V8BFmode;
19078 case E_SImode:
19079 return V4SImode;
19080 case E_HImode:
19081 return V8HImode;
19082 case E_QImode:
19083 return V16QImode;
19084 case E_DImode:
19085 return V2DImode;
19086 default:
19087 return opt_machine_mode ();
19091 /* Return appropriate SIMD container
19092 for MODE within a vector of WIDTH bits. */
19093 static machine_mode
19094 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
19096 if (TARGET_SVE
19097 && maybe_ne (width, 128)
19098 && known_eq (width, BITS_PER_SVE_VECTOR))
19099 return aarch64_full_sve_mode (mode).else_mode (word_mode);
19101 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
19102 if (TARGET_SIMD)
19104 if (known_eq (width, 128))
19105 return aarch64_vq_mode (mode).else_mode (word_mode);
19106 else
19107 switch (mode)
19109 case E_SFmode:
19110 return V2SFmode;
19111 case E_HFmode:
19112 return V4HFmode;
19113 case E_BFmode:
19114 return V4BFmode;
19115 case E_SImode:
19116 return V2SImode;
19117 case E_HImode:
19118 return V4HImode;
19119 case E_QImode:
19120 return V8QImode;
19121 default:
19122 break;
19125 return word_mode;
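/* A few illustrative mappings (assuming TARGET_SIMD):

     (SImode, 128) -> V4SImode      (HFmode, 64) -> V4HFmode
     (DFmode, 128) -> V2DFmode      (QImode, 64) -> V8QImode

   With TARGET_SVE and WIDTH equal to BITS_PER_SVE_VECTOR (and not 128),
   the SVE container is chosen instead, e.g. (SImode, width) -> VNx4SImode.  */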
19128 /* Compare an SVE mode SVE_M and an Advanced SIMD mode ASIMD_M
19129 and return whether the SVE mode should be preferred over the
19130 Advanced SIMD one in aarch64_autovectorize_vector_modes. */
19131 static bool
19132 aarch64_cmp_autovec_modes (machine_mode sve_m, machine_mode asimd_m)
19134 /* Take into account the aarch64-autovec-preference param if non-zero. */
19135 bool only_asimd_p = aarch64_autovec_preference == 1;
19136 bool only_sve_p = aarch64_autovec_preference == 2;
19138 if (only_asimd_p)
19139 return false;
19140 if (only_sve_p)
19141 return true;
19143 /* The preference in case of a tie in costs. */
19144 bool prefer_asimd = aarch64_autovec_preference == 3;
19145 bool prefer_sve = aarch64_autovec_preference == 4;
19147 aarch64_sve_vector_bits_enum tune_width = aarch64_tune_params.sve_width;
19149 poly_int64 nunits_sve = GET_MODE_NUNITS (sve_m);
19150 poly_int64 nunits_asimd = GET_MODE_NUNITS (asimd_m);
19151 /* If the CPU information does not have an SVE width registered use the
19152 generic poly_int comparison that prefers SVE. If a preference is
19153 explicitly requested avoid this path. */
19154 if (tune_width == SVE_SCALABLE
19155 && !prefer_asimd
19156 && !prefer_sve)
19157 return maybe_gt (nunits_sve, nunits_asimd);
19159 /* Otherwise estimate the runtime width of the modes involved. */
19160 HOST_WIDE_INT est_sve = estimated_poly_value (nunits_sve);
19161 HOST_WIDE_INT est_asimd = estimated_poly_value (nunits_asimd);
19163 /* Preferring SVE means picking it first unless the Advanced SIMD mode
19164 is clearly wider. */
19165 if (prefer_sve)
19166 return est_sve >= est_asimd;
19167 /* Conversely, preferring Advanced SIMD means picking SVE only if SVE
19168 is clearly wider. */
19169 if (prefer_asimd)
19170 return est_sve > est_asimd;
19172 /* In the default case prefer Advanced SIMD over SVE in case of a tie. */
19173 return est_sve > est_asimd;
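/* A worked example, purely for illustration: comparing VNx4SImode (SVE)
   against V4SImode (Advanced SIMD) when the tuning target registers an SVE
   width of 128 bits and --param aarch64-autovec-preference is left at its
   default.  Both estimated widths come out as 4 elements, so
   est_sve > est_asimd is false and Advanced SIMD wins the tie.  With
   aarch64-autovec-preference=4 (prefer SVE) the same comparison returns
   est_sve >= est_asimd, i.e. true.  */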
19176 /* Return 128-bit container as the preferred SIMD mode for MODE. */
19177 static machine_mode
19178 aarch64_preferred_simd_mode (scalar_mode mode)
19180 /* Take into account explicit auto-vectorization ISA preferences through
19181 aarch64_cmp_autovec_modes. */
19182 if (TARGET_SVE && aarch64_cmp_autovec_modes (VNx16QImode, V16QImode))
19183 return aarch64_full_sve_mode (mode).else_mode (word_mode);
19184 if (TARGET_SIMD)
19185 return aarch64_vq_mode (mode).else_mode (word_mode);
19186 return word_mode;
19189 /* Return a list of possible vector sizes for the vectorizer
19190 to iterate over. */
19191 static unsigned int
19192 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
19194 static const machine_mode sve_modes[] = {
19195 /* Try using full vectors for all element types. */
19196 VNx16QImode,
19198 /* Try using 16-bit containers for 8-bit elements and full vectors
19199 for wider elements. */
19200 VNx8QImode,
19202 /* Try using 32-bit containers for 8-bit and 16-bit elements and
19203 full vectors for wider elements. */
19204 VNx4QImode,
19206 /* Try using 64-bit containers for all element types. */
19207 VNx2QImode
19210 static const machine_mode advsimd_modes[] = {
19211 /* Try using 128-bit vectors for all element types. */
19212 V16QImode,
19214 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
19215 for wider elements. */
19216 V8QImode,
19218 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
19219 for wider elements.
19221 TODO: We could support a limited form of V4QImode too, so that
19222 we use 32-bit vectors for 8-bit elements. */
19223 V4HImode,
19225 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
19226 for 64-bit elements.
19228 TODO: We could similarly support limited forms of V2QImode and V2HImode
19229 for this case. */
19230 V2SImode
19233 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
19234 This is because:
19236 - If we can't use N-byte Advanced SIMD vectors then the placement
19237 doesn't matter; we'll just continue as though the Advanced SIMD
19238 entry didn't exist.
19240 - If an SVE main loop with N bytes ends up being cheaper than an
19241 Advanced SIMD main loop with N bytes then by default we'll replace
19242 the Advanced SIMD version with the SVE one.
19244 - If an Advanced SIMD main loop with N bytes ends up being cheaper
19245 than an SVE main loop with N bytes then by default we'll try to
19246 use the SVE loop to vectorize the epilogue instead. */
19248 bool only_asimd_p = aarch64_autovec_preference == 1;
19249 bool only_sve_p = aarch64_autovec_preference == 2;
19251 unsigned int sve_i = (TARGET_SVE && !only_asimd_p) ? 0 : ARRAY_SIZE (sve_modes);
19252 unsigned int advsimd_i = 0;
19254 while (!only_sve_p && advsimd_i < ARRAY_SIZE (advsimd_modes))
19256 if (sve_i < ARRAY_SIZE (sve_modes)
19257 && aarch64_cmp_autovec_modes (sve_modes[sve_i],
19258 advsimd_modes[advsimd_i]))
19259 modes->safe_push (sve_modes[sve_i++]);
19260 else
19261 modes->safe_push (advsimd_modes[advsimd_i++]);
19263 while (sve_i < ARRAY_SIZE (sve_modes))
19264 modes->safe_push (sve_modes[sve_i++]);
19266 unsigned int flags = 0;
19267 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
19268 can compare SVE against Advanced SIMD and so that we can compare
19269 multiple SVE vectorization approaches against each other. There's
19270 not really any point doing this for Advanced SIMD only, since the
19271 first mode that works should always be the best. */
19272 if (TARGET_SVE && aarch64_sve_compare_costs)
19273 flags |= VECT_COMPARE_COSTS;
19274 return flags;
19277 /* Implement TARGET_MANGLE_TYPE. */
19279 static const char *
19280 aarch64_mangle_type (const_tree type)
19282 /* The AArch64 ABI documents say that "__va_list" has to be
19283 mangled as if it is in the "std" namespace. */
19284 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
19285 return "St9__va_list";
19287 /* Half-precision floating point types. */
19288 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
19290 if (TYPE_MODE (type) == BFmode)
19291 return "u6__bf16";
19292 else
19293 return "Dh";
19296 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
19297 builtin types. */
19298 if (TYPE_NAME (type) != NULL)
19300 const char *res;
19301 if ((res = aarch64_general_mangle_builtin_type (type))
19302 || (res = aarch64_sve::mangle_builtin_type (type)))
19303 return res;
19306 /* Use the default mangling. */
19307 return NULL;
19310 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
19312 static bool
19313 aarch64_verify_type_context (location_t loc, type_context_kind context,
19314 const_tree type, bool silent_p)
19316 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
19319 /* Find the first rtx_insn before insn that will generate an assembly
19320 instruction. */
19322 static rtx_insn *
19323 aarch64_prev_real_insn (rtx_insn *insn)
19325 if (!insn)
19326 return NULL;
19330 insn = prev_real_insn (insn);
19332 while (insn && recog_memoized (insn) < 0);
19334 return insn;
19337 static bool
19338 is_madd_op (enum attr_type t1)
19340 unsigned int i;
19341 /* A number of these may be AArch32 only. */
19342 enum attr_type mlatypes[] = {
19343 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
19344 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
19345 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
19348 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
19350 if (t1 == mlatypes[i])
19351 return true;
19354 return false;
19357 /* Check if there is a register dependency between a load and the insn
19358 for which we hold recog_data. */
19360 static bool
19361 dep_between_memop_and_curr (rtx memop)
19363 rtx load_reg;
19364 int opno;
19366 gcc_assert (GET_CODE (memop) == SET);
19368 if (!REG_P (SET_DEST (memop)))
19369 return false;
19371 load_reg = SET_DEST (memop);
19372 for (opno = 1; opno < recog_data.n_operands; opno++)
19374 rtx operand = recog_data.operand[opno];
19375 if (REG_P (operand)
19376 && reg_overlap_mentioned_p (load_reg, operand))
19377 return true;
19380 return false;
19384 /* When working around the Cortex-A53 erratum 835769,
19385 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
19386 instruction and has a preceding memory instruction such that a NOP
19387 should be inserted between them. */
19389 bool
19390 aarch64_madd_needs_nop (rtx_insn* insn)
19392 enum attr_type attr_type;
19393 rtx_insn *prev;
19394 rtx body;
19396 if (!TARGET_FIX_ERR_A53_835769)
19397 return false;
19399 if (!INSN_P (insn) || recog_memoized (insn) < 0)
19400 return false;
19402 attr_type = get_attr_type (insn);
19403 if (!is_madd_op (attr_type))
19404 return false;
19406 prev = aarch64_prev_real_insn (insn);
19407 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
19408 Restore recog state to INSN to avoid state corruption. */
19409 extract_constrain_insn_cached (insn);
19411 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
19412 return false;
19414 body = single_set (prev);
19416 /* If the previous insn is a memory op and there is no dependency between
19417 it and the DImode madd, emit a NOP between them. If body is NULL then we
19418 have a complex memory operation, probably a load/store pair.
19419 Be conservative for now and emit a NOP. */
19420 if (GET_MODE (recog_data.operand[0]) == DImode
19421 && (!body || !dep_between_memop_and_curr (body)))
19422 return true;
19424 return false;
19429 /* Implement FINAL_PRESCAN_INSN. */
19431 void
19432 aarch64_final_prescan_insn (rtx_insn *insn)
19434 if (aarch64_madd_needs_nop (insn))
19435 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
19439 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
19440 instruction. */
19442 bool
19443 aarch64_sve_index_immediate_p (rtx base_or_step)
19445 return (CONST_INT_P (base_or_step)
19446 && IN_RANGE (INTVAL (base_or_step), -16, 15));
19449 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
19450 when applied to mode MODE. Negate X first if NEGATE_P is true. */
19452 bool
19453 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
19455 rtx elt = unwrap_const_vec_duplicate (x);
19456 if (!CONST_INT_P (elt))
19457 return false;
19459 HOST_WIDE_INT val = INTVAL (elt);
19460 if (negate_p)
19461 val = -val;
19462 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
19464 if (val & 0xff)
19465 return IN_RANGE (val, 0, 0xff);
19466 return IN_RANGE (val, 0, 0xff00);
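/* Examples, for illustration: after masking to the element width, values
   0x00..0xff are accepted directly (e.g. ADD Z0.S, Z0.S, #7), values that
   are a multiple of 256 up to 0xff00 are accepted as a shifted immediate
   (e.g. 0x2300 -> ADD Z0.S, Z0.S, #35, LSL #8), while something like 0x101
   is rejected because its low byte is nonzero and it exceeds 0xff.  */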
19469 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
19470 instructions when applied to mode MODE. Negate X first if NEGATE_P
19471 is true. */
19473 bool
19474 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
19476 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
19477 return false;
19479 /* After the optional negation, the immediate must be nonnegative.
19480 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
19481 instead of SQADD Zn.B, Zn.B, #129. */
19482 rtx elt = unwrap_const_vec_duplicate (x);
19483 return negate_p == (INTVAL (elt) < 0);
19486 /* Return true if X is a valid immediate operand for an SVE logical
19487 instruction such as AND. */
19489 bool
19490 aarch64_sve_bitmask_immediate_p (rtx x)
19492 rtx elt;
19494 return (const_vec_duplicate_p (x, &elt)
19495 && CONST_INT_P (elt)
19496 && aarch64_bitmask_imm (INTVAL (elt),
19497 GET_MODE_INNER (GET_MODE (x))));
19500 /* Return true if X is a valid immediate for the SVE DUP and CPY
19501 instructions. */
19503 bool
19504 aarch64_sve_dup_immediate_p (rtx x)
19506 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
19507 if (!CONST_INT_P (x))
19508 return false;
19510 HOST_WIDE_INT val = INTVAL (x);
19511 if (val & 0xff)
19512 return IN_RANGE (val, -0x80, 0x7f);
19513 return IN_RANGE (val, -0x8000, 0x7f00);
19516 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
19517 SIGNED_P says whether the operand is signed rather than unsigned. */
19519 bool
19520 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
19522 x = unwrap_const_vec_duplicate (x);
19523 return (CONST_INT_P (x)
19524 && (signed_p
19525 ? IN_RANGE (INTVAL (x), -16, 15)
19526 : IN_RANGE (INTVAL (x), 0, 127)));
19529 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
19530 instruction. Negate X first if NEGATE_P is true. */
19532 bool
19533 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
19535 rtx elt;
19536 REAL_VALUE_TYPE r;
19538 if (!const_vec_duplicate_p (x, &elt)
19539 || !CONST_DOUBLE_P (elt))
19540 return false;
19542 r = *CONST_DOUBLE_REAL_VALUE (elt);
19544 if (negate_p)
19545 r = real_value_negate (&r);
19547 if (real_equal (&r, &dconst1))
19548 return true;
19549 if (real_equal (&r, &dconsthalf))
19550 return true;
19551 return false;
19554 /* Return true if X is a valid immediate operand for an SVE FMUL
19555 instruction. */
19557 bool
19558 aarch64_sve_float_mul_immediate_p (rtx x)
19560 rtx elt;
19562 return (const_vec_duplicate_p (x, &elt)
19563 && CONST_DOUBLE_P (elt)
19564 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
19565 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
19568 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
19569 for the Advanced SIMD operation described by WHICH and INSN. If INFO
19570 is nonnull, use it to describe valid immediates. */
19571 static bool
19572 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
19573 simd_immediate_info *info,
19574 enum simd_immediate_check which,
19575 simd_immediate_info::insn_type insn)
19577 /* Try a 4-byte immediate with LSL. */
19578 for (unsigned int shift = 0; shift < 32; shift += 8)
19579 if ((val32 & (0xff << shift)) == val32)
19581 if (info)
19582 *info = simd_immediate_info (SImode, val32 >> shift, insn,
19583 simd_immediate_info::LSL, shift);
19584 return true;
19587 /* Try a 2-byte immediate with LSL. */
19588 unsigned int imm16 = val32 & 0xffff;
19589 if (imm16 == (val32 >> 16))
19590 for (unsigned int shift = 0; shift < 16; shift += 8)
19591 if ((imm16 & (0xff << shift)) == imm16)
19593 if (info)
19594 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
19595 simd_immediate_info::LSL, shift);
19596 return true;
19599 /* Try a 4-byte immediate with MSL, except for cases that MVN
19600 can handle. */
19601 if (which == AARCH64_CHECK_MOV)
19602 for (unsigned int shift = 8; shift < 24; shift += 8)
19604 unsigned int low = (1 << shift) - 1;
19605 if (((val32 & (0xff << shift)) | low) == val32)
19607 if (info)
19608 *info = simd_immediate_info (SImode, val32 >> shift, insn,
19609 simd_immediate_info::MSL, shift);
19610 return true;
19614 return false;
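/* Examples, for illustration: VAL32 = 0x00560000 matches the 4-byte LSL
   case (MOVI Vd.4S, #0x56, LSL #16); VAL32 = 0x12001200 matches the 2-byte
   case (MOVI Vd.8H, #0x12, LSL #8); VAL32 = 0x0056ffff matches the MSL
   ("shifting ones") case, which is only tried when validating a full
   MOVI/MVNI move (AARCH64_CHECK_MOV), not ORR/BIC immediates.  */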
19617 /* Return true if replicating VAL64 is a valid immediate for the
19618 Advanced SIMD operation described by WHICH. If INFO is nonnull,
19619 use it to describe valid immediates. */
19620 static bool
19621 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
19622 simd_immediate_info *info,
19623 enum simd_immediate_check which)
19625 unsigned int val32 = val64 & 0xffffffff;
19626 unsigned int val16 = val64 & 0xffff;
19627 unsigned int val8 = val64 & 0xff;
19629 if (val32 == (val64 >> 32))
19631 if ((which & AARCH64_CHECK_ORR) != 0
19632 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
19633 simd_immediate_info::MOV))
19634 return true;
19636 if ((which & AARCH64_CHECK_BIC) != 0
19637 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
19638 simd_immediate_info::MVN))
19639 return true;
19641 /* Try using a replicated byte. */
19642 if (which == AARCH64_CHECK_MOV
19643 && val16 == (val32 >> 16)
19644 && val8 == (val16 >> 8))
19646 if (info)
19647 *info = simd_immediate_info (QImode, val8);
19648 return true;
19652 /* Try using a bit-to-bytemask. */
19653 if (which == AARCH64_CHECK_MOV)
19655 unsigned int i;
19656 for (i = 0; i < 64; i += 8)
19658 unsigned char byte = (val64 >> i) & 0xff;
19659 if (byte != 0 && byte != 0xff)
19660 break;
19662 if (i == 64)
19664 if (info)
19665 *info = simd_immediate_info (DImode, val64);
19666 return true;
19669 return false;
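/* Examples, for illustration: VAL64 = 0x4242424242424242 is accepted as a
   replicated byte (MOVI Vd.16B, #0x42); VAL64 = 0xffffffff00000000 fails
   the 32-bit checks (its two halves differ) but passes the bit-to-bytemask
   test, matching the 64-bit MOVI form; an arbitrary value such as
   0x0123456789abcdef is rejected.  */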
19672 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
19673 instruction. If INFO is nonnull, use it to describe valid immediates. */
19675 static bool
19676 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
19677 simd_immediate_info *info)
19679 scalar_int_mode mode = DImode;
19680 unsigned int val32 = val64 & 0xffffffff;
19681 if (val32 == (val64 >> 32))
19683 mode = SImode;
19684 unsigned int val16 = val32 & 0xffff;
19685 if (val16 == (val32 >> 16))
19687 mode = HImode;
19688 unsigned int val8 = val16 & 0xff;
19689 if (val8 == (val16 >> 8))
19690 mode = QImode;
19693 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
19694 if (IN_RANGE (val, -0x80, 0x7f))
19696 /* DUP with no shift. */
19697 if (info)
19698 *info = simd_immediate_info (mode, val);
19699 return true;
19701 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
19703 /* DUP with LSL #8. */
19704 if (info)
19705 *info = simd_immediate_info (mode, val);
19706 return true;
19708 if (aarch64_bitmask_imm (val64, mode))
19710 /* DUPM. */
19711 if (info)
19712 *info = simd_immediate_info (mode, val);
19713 return true;
19715 return false;
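/* Examples, for illustration: VAL64 = 0x0003000300030003 narrows to HImode
   and matches the plain DUP range (DUP Zd.H, #3); 0x1200120012001200 matches
   DUP with a shift (#0x12, LSL #8); 0x00ff00ff00ff00ff fails both DUP ranges
   but is a valid bitmask immediate, so it can be built with DUPM.  */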
19718 /* Return true if X is an UNSPEC_PTRUE constant of the form:
19720 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
19722 where PATTERN is the svpattern as a CONST_INT and where ZERO
19723 is a zero constant of the required PTRUE mode (which can have
19724 fewer elements than X's mode, if zero bits are significant).
19726 If so, and if INFO is nonnull, describe the immediate in INFO. */
19727 bool
19728 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
19730 if (GET_CODE (x) != CONST)
19731 return false;
19733 x = XEXP (x, 0);
19734 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
19735 return false;
19737 if (info)
19739 aarch64_svpattern pattern
19740 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
19741 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
19742 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
19743 *info = simd_immediate_info (int_mode, pattern);
19745 return true;
19748 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
19749 it to describe valid immediates. */
19751 static bool
19752 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
19754 if (aarch64_sve_ptrue_svpattern_p (x, info))
19755 return true;
19757 if (x == CONST0_RTX (GET_MODE (x)))
19759 if (info)
19760 *info = simd_immediate_info (DImode, 0);
19761 return true;
19764 /* Analyze the value as a VNx16BImode. This should be relatively
19765 efficient, since rtx_vector_builder has enough built-in capacity
19766 to store all VLA predicate constants without needing the heap. */
19767 rtx_vector_builder builder;
19768 if (!aarch64_get_sve_pred_bits (builder, x))
19769 return false;
19771 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
19772 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
19774 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
19775 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
19776 if (pattern != AARCH64_NUM_SVPATTERNS)
19778 if (info)
19780 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
19781 *info = simd_immediate_info (int_mode, pattern);
19783 return true;
19786 return false;
19789 /* Return true if OP is a valid SIMD immediate for the operation
19790 described by WHICH. If INFO is nonnull, use it to describe valid
19791 immediates. */
19792 bool
19793 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
19794 enum simd_immediate_check which)
19796 machine_mode mode = GET_MODE (op);
19797 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
19798 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
19799 return false;
19801 if (vec_flags & VEC_SVE_PRED)
19802 return aarch64_sve_pred_valid_immediate (op, info);
19804 scalar_mode elt_mode = GET_MODE_INNER (mode);
19805 rtx base, step;
19806 unsigned int n_elts;
19807 if (GET_CODE (op) == CONST_VECTOR
19808 && CONST_VECTOR_DUPLICATE_P (op))
19809 n_elts = CONST_VECTOR_NPATTERNS (op);
19810 else if ((vec_flags & VEC_SVE_DATA)
19811 && const_vec_series_p (op, &base, &step))
19813 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
19814 if (!aarch64_sve_index_immediate_p (base)
19815 || !aarch64_sve_index_immediate_p (step))
19816 return false;
19818 if (info)
19820 /* Get the corresponding container mode. E.g. an INDEX on V2SI
19821 should yield two integer values per 128-bit block, meaning
19822 that we need to treat it in the same way as V2DI and then
19823 ignore the upper 32 bits of each element. */
19824 elt_mode = aarch64_sve_container_int_mode (mode);
19825 *info = simd_immediate_info (elt_mode, base, step);
19827 return true;
19829 else if (GET_CODE (op) == CONST_VECTOR
19830 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
19831 /* N_ELTS set above. */;
19832 else
19833 return false;
19835 scalar_float_mode elt_float_mode;
19836 if (n_elts == 1
19837 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
19839 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
19840 if (aarch64_float_const_zero_rtx_p (elt)
19841 || aarch64_float_const_representable_p (elt))
19843 if (info)
19844 *info = simd_immediate_info (elt_float_mode, elt);
19845 return true;
19849 /* If all elements in an SVE vector have the same value, we have a free
19850 choice between using the element mode and using the container mode.
19851 Using the element mode means that unused parts of the vector are
19852 duplicates of the used elements, while using the container mode means
19853 that the unused parts are an extension of the used elements. Using the
19854 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
19855 for its container mode VNx4SI while 0x00000101 isn't.
19857 If not all elements in an SVE vector have the same value, we need the
19858 transition from one element to the next to occur at container boundaries.
19859 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
19860 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
19861 scalar_int_mode elt_int_mode;
19862 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
19863 elt_int_mode = aarch64_sve_container_int_mode (mode);
19864 else
19865 elt_int_mode = int_mode_for_mode (elt_mode).require ();
19867 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
19868 if (elt_size > 8)
19869 return false;
19871 /* Expand the vector constant out into a byte vector, with the least
19872 significant byte of the register first. */
19873 auto_vec<unsigned char, 16> bytes;
19874 bytes.reserve (n_elts * elt_size);
19875 for (unsigned int i = 0; i < n_elts; i++)
19877 /* The vector is provided in gcc endian-neutral fashion.
19878 For aarch64_be Advanced SIMD, it must be laid out in the vector
19879 register in reverse order. */
19880 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
19881 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
19883 if (elt_mode != elt_int_mode)
19884 elt = gen_lowpart (elt_int_mode, elt);
19886 if (!CONST_INT_P (elt))
19887 return false;
19889 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
19890 for (unsigned int byte = 0; byte < elt_size; byte++)
19892 bytes.quick_push (elt_val & 0xff);
19893 elt_val >>= BITS_PER_UNIT;
19897 /* The immediate must repeat every eight bytes. */
19898 unsigned int nbytes = bytes.length ();
19899 for (unsigned i = 8; i < nbytes; ++i)
19900 if (bytes[i] != bytes[i - 8])
19901 return false;
19903 /* Get the repeating 8-byte value as an integer. No endian correction
19904 is needed here because bytes is already in lsb-first order. */
19905 unsigned HOST_WIDE_INT val64 = 0;
19906 for (unsigned int i = 0; i < 8; i++)
19907 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
19908 << (i * BITS_PER_UNIT));
19910 if (vec_flags & VEC_SVE_DATA)
19911 return aarch64_sve_valid_immediate (val64, info);
19912 else
19913 return aarch64_advsimd_valid_immediate (val64, info, which);
19916 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
19917 has a step in the range of an SVE INDEX instruction. Return the step if so,
19918 otherwise return null. */
19919 rtx
19920 aarch64_check_zero_based_sve_index_immediate (rtx x)
19922 rtx base, step;
19923 if (const_vec_series_p (x, &base, &step)
19924 && base == const0_rtx
19925 && aarch64_sve_index_immediate_p (step))
19926 return step;
19927 return NULL_RTX;
19930 /* Check whether immediate shift constants are within range. */
19931 bool
19932 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
19934 x = unwrap_const_vec_duplicate (x);
19935 if (!CONST_INT_P (x))
19936 return false;
19937 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
19938 if (left)
19939 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
19940 else
19941 return IN_RANGE (INTVAL (x), 1, bit_width);
19944 /* Return the bitmask CONST_INT to select the bits required by a zero extract
19945 operation of width WIDTH at bit position POS. */
19947 rtx
19948 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
19950 gcc_assert (CONST_INT_P (width));
19951 gcc_assert (CONST_INT_P (pos));
19953 unsigned HOST_WIDE_INT mask
19954 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
19955 return GEN_INT (mask << UINTVAL (pos));
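/* Illustrative, self-contained sketch (not part of this file) of the
   mask construction above: a zero_extract of WIDTH bits at position
   POS selects ((1 << WIDTH) - 1) << POS, e.g. WIDTH 8, POS 16 gives
   0x00ff0000.  The helper name is a placeholder and WIDTH is assumed
   to be less than 64.  */

static unsigned long long
sketch_zextract_mask (unsigned int width, unsigned int pos)
{
  unsigned long long mask = ((unsigned long long) 1 << width) - 1;
  return mask << pos;
}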
19958 bool
19959 aarch64_mov_operand_p (rtx x, machine_mode mode)
19961 if (GET_CODE (x) == HIGH
19962 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
19963 return true;
19965 if (CONST_INT_P (x))
19966 return true;
19968 if (VECTOR_MODE_P (GET_MODE (x)))
19970 /* Require predicate constants to be VNx16BI before RA, so that we
19971 force everything to have a canonical form. */
19972 if (!lra_in_progress
19973 && !reload_completed
19974 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
19975 && GET_MODE (x) != VNx16BImode)
19976 return false;
19978 return aarch64_simd_valid_immediate (x, NULL);
19981 x = strip_salt (x);
19982 if (SYMBOL_REF_P (x) && mode == DImode && CONSTANT_ADDRESS_P (x))
19983 return true;
19985 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
19986 return true;
19988 return aarch64_classify_symbolic_expression (x)
19989 == SYMBOL_TINY_ABSOLUTE;
19992 /* Return a const_int vector of VAL. */
19993 rtx
19994 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
19996 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
19997 return gen_const_vec_duplicate (mode, c);
20000 /* Check OP is a legal scalar immediate for the MOVI instruction. */
20002 bool
20003 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
20005 machine_mode vmode;
20007 vmode = aarch64_simd_container_mode (mode, 64);
20008 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
20009 return aarch64_simd_valid_immediate (op_v, NULL);
20012 /* Construct and return a PARALLEL RTX vector with elements numbering the
20013 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
20014 the vector - from the perspective of the architecture. This does not
20015 line up with GCC's perspective on lane numbers, so we end up with
20016 different masks depending on our target endian-ness. The diagram
20017 below may help. We must draw the distinction when building masks
20018 which select one half of the vector. An instruction selecting
20019 architectural low-lanes for a big-endian target must be described using
20020 a mask selecting GCC high-lanes.
20022 Big-Endian Little-Endian
20024 GCC 0 1 2 3 3 2 1 0
20025 | x | x | x | x | | x | x | x | x |
20026 Architecture 3 2 1 0 3 2 1 0
20028 Low Mask: { 2, 3 } { 0, 1 }
20029 High Mask: { 0, 1 } { 2, 3 }
20031 MODE Is the mode of the vector and NUNITS is the number of units in it. */
20033 rtx
20034 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
20036 rtvec v = rtvec_alloc (nunits / 2);
20037 int high_base = nunits / 2;
20038 int low_base = 0;
20039 int base;
20040 rtx t1;
20041 int i;
20043 if (BYTES_BIG_ENDIAN)
20044 base = high ? low_base : high_base;
20045 else
20046 base = high ? high_base : low_base;
20048 for (i = 0; i < nunits / 2; i++)
20049 RTVEC_ELT (v, i) = GEN_INT (base + i);
20051 t1 = gen_rtx_PARALLEL (mode, v);
20052 return t1;
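/* Illustrative, self-contained sketch (not part of this file) of the
   lane selection above, writing plain lane indices instead of a
   PARALLEL: for a 4-element vector, the architectural high half is
   GCC lanes { 2, 3 } on little-endian but { 0, 1 } on big-endian,
   matching the diagram.  The helper name and the output array are
   placeholders.  */

static void
sketch_vect_par_cnst_half (int nunits, int high, int big_endian, int *lanes)
{
  int high_base = nunits / 2;
  int base;

  if (big_endian)
    base = high ? 0 : high_base;
  else
    base = high ? high_base : 0;

  for (int i = 0; i < nunits / 2; i++)
    lanes[i] = base + i;
}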
20055 /* Check OP for validity as a PARALLEL RTX vector with elements
20056 numbering either the high (HIGH == TRUE) or low lanes,
20057 from the perspective of the architecture. See the diagram above
20058 aarch64_simd_vect_par_cnst_half for more details. */
20060 bool
20061 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
20062 bool high)
20064 int nelts;
20065 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
20066 return false;
20068 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
20069 HOST_WIDE_INT count_op = XVECLEN (op, 0);
20070 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
20071 int i = 0;
20073 if (count_op != count_ideal)
20074 return false;
20076 for (i = 0; i < count_ideal; i++)
20078 rtx elt_op = XVECEXP (op, 0, i);
20079 rtx elt_ideal = XVECEXP (ideal, 0, i);
20081 if (!CONST_INT_P (elt_op)
20082 || INTVAL (elt_ideal) != INTVAL (elt_op))
20083 return false;
20085 return true;
20088 /* Return a PARALLEL containing NELTS elements, with element I equal
20089 to BASE + I * STEP. */
20091 rtx
20092 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
20094 rtvec vec = rtvec_alloc (nelts);
20095 for (unsigned int i = 0; i < nelts; ++i)
20096 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
20097 return gen_rtx_PARALLEL (VOIDmode, vec);
20100 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
20101 series with step STEP. */
20103 bool
20104 aarch64_stepped_int_parallel_p (rtx op, int step)
20106 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
20107 return false;
20109 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
20110 for (int i = 1; i < XVECLEN (op, 0); ++i)
20111 if (!CONST_INT_P (XVECEXP (op, 0, i))
20112 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
20113 return false;
20115 return true;
20118 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
20119 HIGH (exclusive). */
20120 void
20121 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
20122 const_tree exp)
20124 HOST_WIDE_INT lane;
20125 gcc_assert (CONST_INT_P (operand));
20126 lane = INTVAL (operand);
20128 if (lane < low || lane >= high)
20130 if (exp)
20131 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
20132 else
20133 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
20137 /* Perform endian correction on lane number N, which indexes a vector
20138 of mode MODE, and return the result as an SImode rtx. */
20140 rtx
20141 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
20143 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
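/* Illustrative, self-contained sketch (not part of this file) of the
   correction above, assuming ENDIAN_LANE_N reverses the lane index on
   big-endian targets and leaves it unchanged otherwise: lane 0 of a
   4-element vector becomes lane 3 when the target is big-endian.  The
   helper name is a placeholder.  */

static unsigned int
sketch_endian_lane (unsigned int nunits, unsigned int n, int big_endian)
{
  return big_endian ? nunits - 1 - n : n;
}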
20146 /* Return TRUE if OP is a valid vector addressing mode. */
20148 bool
20149 aarch64_simd_mem_operand_p (rtx op)
20151 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
20152 || REG_P (XEXP (op, 0)));
20155 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
20157 bool
20158 aarch64_sve_ld1r_operand_p (rtx op)
20160 struct aarch64_address_info addr;
20161 scalar_mode mode;
20163 return (MEM_P (op)
20164 && is_a <scalar_mode> (GET_MODE (op), &mode)
20165 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
20166 && addr.type == ADDRESS_REG_IMM
20167 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
20170 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
20171 where the size of the read data is specified by `mode` and the size of the
20172 vector elements are specified by `elem_mode`. */
20173 bool
20174 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
20175 scalar_mode elem_mode)
20177 struct aarch64_address_info addr;
20178 if (!MEM_P (op)
20179 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
20180 return false;
20182 if (addr.type == ADDRESS_REG_IMM)
20183 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
20185 if (addr.type == ADDRESS_REG_REG)
20186 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
20188 return false;
20191 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
20192 bool
20193 aarch64_sve_ld1rq_operand_p (rtx op)
20195 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
20196 GET_MODE_INNER (GET_MODE (op)));
20199 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
20200 accessing a vector where the element size is specified by `elem_mode`. */
20201 bool
20202 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
20204 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
20207 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
20208 bool
20209 aarch64_sve_ldff1_operand_p (rtx op)
20211 if (!MEM_P (op))
20212 return false;
20214 struct aarch64_address_info addr;
20215 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
20216 return false;
20218 if (addr.type == ADDRESS_REG_IMM)
20219 return known_eq (addr.const_offset, 0);
20221 return addr.type == ADDRESS_REG_REG;
20224 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
20225 bool
20226 aarch64_sve_ldnf1_operand_p (rtx op)
20228 struct aarch64_address_info addr;
20230 return (MEM_P (op)
20231 && aarch64_classify_address (&addr, XEXP (op, 0),
20232 GET_MODE (op), false)
20233 && addr.type == ADDRESS_REG_IMM);
20236 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
20237 The conditions for STR are the same. */
20238 bool
20239 aarch64_sve_ldr_operand_p (rtx op)
20241 struct aarch64_address_info addr;
20243 return (MEM_P (op)
20244 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
20245 false, ADDR_QUERY_ANY)
20246 && addr.type == ADDRESS_REG_IMM);
20249 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
20250 addressing memory of mode MODE. */
20251 bool
20252 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
20254 struct aarch64_address_info addr;
20255 if (!aarch64_classify_address (&addr, op, mode, false, ADDR_QUERY_ANY))
20256 return false;
20258 if (addr.type == ADDRESS_REG_IMM)
20259 return offset_6bit_signed_scaled_p (mode, addr.const_offset);
20261 return addr.type == ADDRESS_REG_REG;
20264 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
20265 We need to be able to access the individual pieces, so the range
20266 is different from LD[234] and ST[234]. */
20267 bool
20268 aarch64_sve_struct_memory_operand_p (rtx op)
20270 if (!MEM_P (op))
20271 return false;
20273 machine_mode mode = GET_MODE (op);
20274 struct aarch64_address_info addr;
20275 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
20276 ADDR_QUERY_ANY)
20277 || addr.type != ADDRESS_REG_IMM)
20278 return false;
20280 poly_int64 first = addr.const_offset;
20281 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
20282 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
20283 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
20286 /* Emit a register copy from operand to operand, taking care not to
20287 early-clobber source registers in the process.
20289 COUNT is the number of components into which the copy needs to be
20290 decomposed. */
20291 void
20292 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
20293 unsigned int count)
20295 unsigned int i;
20296 int rdest = REGNO (operands[0]);
20297 int rsrc = REGNO (operands[1]);
20299 if (!reg_overlap_mentioned_p (operands[0], operands[1])
20300 || rdest < rsrc)
20301 for (i = 0; i < count; i++)
20302 emit_move_insn (gen_rtx_REG (mode, rdest + i),
20303 gen_rtx_REG (mode, rsrc + i));
20304 else
20305 for (i = 0; i < count; i++)
20306 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
20307 gen_rtx_REG (mode, rsrc + count - i - 1));
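/* Illustrative, self-contained sketch (not part of this file) of the
   copy-direction choice above, on a plain array standing in for the
   register file: if the destination range overlaps the source range
   and starts at a higher index, a forward copy would clobber source
   elements before they are read, so the copy is done backwards.  The
   helper name is a placeholder.  */

static void
sketch_overlapping_copy (int *regs, int rdest, int rsrc, int count)
{
  if (rdest < rsrc || rdest >= rsrc + count)
    for (int i = 0; i < count; i++)
      regs[rdest + i] = regs[rsrc + i];
  else
    for (int i = 0; i < count; i++)
      regs[rdest + count - i - 1] = regs[rsrc + count - i - 1];
}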
20310 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
20311 one of VSTRUCT modes: OI, CI, or XI. */
20312 int
20313 aarch64_simd_attr_length_rglist (machine_mode mode)
20315 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
20316 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
20319 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
20320 alignment of a vector to 128 bits. SVE predicates have an alignment of
20321 16 bits. */
20322 static HOST_WIDE_INT
20323 aarch64_simd_vector_alignment (const_tree type)
20325 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
20326 be set for non-predicate vectors of booleans. Modes are the most
20327 direct way we have of identifying real SVE predicate types. */
20328 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
20329 return 16;
20330 widest_int min_size
20331 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
20332 return wi::umin (min_size, 128).to_uhwi ();
20335 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
20336 static poly_uint64
20337 aarch64_vectorize_preferred_vector_alignment (const_tree type)
20339 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
20341 /* If the length of the vector is a fixed power of 2, try to align
20342 to that length, otherwise don't try to align at all. */
20343 HOST_WIDE_INT result;
20344 if (!GET_MODE_BITSIZE (TYPE_MODE (type)).is_constant (&result)
20345 || !pow2p_hwi (result))
20346 result = TYPE_ALIGN (TREE_TYPE (type));
20347 return result;
20349 return TYPE_ALIGN (type);
20352 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
20353 static bool
20354 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
20356 if (is_packed)
20357 return false;
20359 /* For fixed-length vectors, check that the vectorizer will aim for
20360 full-vector alignment. This isn't true for generic GCC vectors
20361 that are wider than the ABI maximum of 128 bits. */
20362 poly_uint64 preferred_alignment =
20363 aarch64_vectorize_preferred_vector_alignment (type);
20364 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
20365 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
20366 preferred_alignment))
20367 return false;
20369 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
20370 return true;
20373 /* Return true if the vector misalignment factor is supported by the
20374 target. */
20375 static bool
20376 aarch64_builtin_support_vector_misalignment (machine_mode mode,
20377 const_tree type, int misalignment,
20378 bool is_packed)
20380 if (TARGET_SIMD && STRICT_ALIGNMENT)
20382 /* Return false if the movmisalign pattern is not supported for this mode. */
20383 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
20384 return false;
20386 /* Misalignment factor is unknown at compile time. */
20387 if (misalignment == -1)
20388 return false;
20390 return default_builtin_support_vector_misalignment (mode, type, misalignment,
20391 is_packed);
20394 /* If VALS is a vector constant that can be loaded into a register
20395 using DUP, generate instructions to do so and return an RTX to
20396 assign to the register. Otherwise return NULL_RTX. */
20397 static rtx
20398 aarch64_simd_dup_constant (rtx vals)
20400 machine_mode mode = GET_MODE (vals);
20401 machine_mode inner_mode = GET_MODE_INNER (mode);
20402 rtx x;
20404 if (!const_vec_duplicate_p (vals, &x))
20405 return NULL_RTX;
20407 /* We can load this constant by using DUP and a constant in a
20408 single ARM register. This will be cheaper than a vector
20409 load. */
20410 x = copy_to_mode_reg (inner_mode, x);
20411 return gen_vec_duplicate (mode, x);
20415 /* Generate code to load VALS, which is a PARALLEL containing only
20416 constants (for vec_init) or CONST_VECTOR, efficiently into a
20417 register. Returns an RTX to copy into the register, or NULL_RTX
20418 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
20419 static rtx
20420 aarch64_simd_make_constant (rtx vals)
20422 machine_mode mode = GET_MODE (vals);
20423 rtx const_dup;
20424 rtx const_vec = NULL_RTX;
20425 int n_const = 0;
20426 int i;
20428 if (GET_CODE (vals) == CONST_VECTOR)
20429 const_vec = vals;
20430 else if (GET_CODE (vals) == PARALLEL)
20432 /* A CONST_VECTOR must contain only CONST_INTs and
20433 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
20434 Only store valid constants in a CONST_VECTOR. */
20435 int n_elts = XVECLEN (vals, 0);
20436 for (i = 0; i < n_elts; ++i)
20438 rtx x = XVECEXP (vals, 0, i);
20439 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
20440 n_const++;
20442 if (n_const == n_elts)
20443 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
20445 else
20446 gcc_unreachable ();
20448 if (const_vec != NULL_RTX
20449 && aarch64_simd_valid_immediate (const_vec, NULL))
20450 /* Load using MOVI/MVNI. */
20451 return const_vec;
20452 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
20453 /* Loaded using DUP. */
20454 return const_dup;
20455 else if (const_vec != NULL_RTX)
20456 /* Load from constant pool. We cannot take advantage of single-cycle
20457 LD1 because we need a PC-relative addressing mode. */
20458 return const_vec;
20459 else
20460 /* A PARALLEL containing something not valid inside CONST_VECTOR.
20461 We cannot construct an initializer. */
20462 return NULL_RTX;
20465 /* Expand a vector initialisation sequence, such that TARGET is
20466 initialised to contain VALS. */
20468 void
20469 aarch64_expand_vector_init (rtx target, rtx vals)
20471 machine_mode mode = GET_MODE (target);
20472 scalar_mode inner_mode = GET_MODE_INNER (mode);
20473 /* The number of vector elements. */
20474 int n_elts = XVECLEN (vals, 0);
20475 /* The number of vector elements which are not constant. */
20476 int n_var = 0;
20477 rtx any_const = NULL_RTX;
20478 /* The first element of vals. */
20479 rtx v0 = XVECEXP (vals, 0, 0);
20480 bool all_same = true;
20482 /* This is a special vec_init<M><N> where N is not an element mode but a
20483 vector mode with half the elements of M. We expect to find two entries
20484 of mode N in VALS and we must put their concatenation into TARGET. */
20485 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
20487 gcc_assert (known_eq (GET_MODE_SIZE (mode),
20488 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
20489 rtx lo = XVECEXP (vals, 0, 0);
20490 rtx hi = XVECEXP (vals, 0, 1);
20491 machine_mode narrow_mode = GET_MODE (lo);
20492 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
20493 gcc_assert (narrow_mode == GET_MODE (hi));
20495 /* When we want to concatenate a half-width vector with zeroes we can
20496 use the aarch64_combinez[_be] patterns. Just make sure that the
20497 zeroes are in the right half. */
20498 if (BYTES_BIG_ENDIAN
20499 && aarch64_simd_imm_zero (lo, narrow_mode)
20500 && general_operand (hi, narrow_mode))
20501 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
20502 else if (!BYTES_BIG_ENDIAN
20503 && aarch64_simd_imm_zero (hi, narrow_mode)
20504 && general_operand (lo, narrow_mode))
20505 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
20506 else
20508 /* Else create the two half-width registers and combine them. */
20509 if (!REG_P (lo))
20510 lo = force_reg (GET_MODE (lo), lo);
20511 if (!REG_P (hi))
20512 hi = force_reg (GET_MODE (hi), hi);
20514 if (BYTES_BIG_ENDIAN)
20515 std::swap (lo, hi);
20516 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
20518 return;
20521 /* Count the number of variable elements to initialise. */
20522 for (int i = 0; i < n_elts; ++i)
20524 rtx x = XVECEXP (vals, 0, i);
20525 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
20526 ++n_var;
20527 else
20528 any_const = x;
20530 all_same &= rtx_equal_p (x, v0);
20533 /* No variable elements, hand off to aarch64_simd_make_constant which knows
20534 how best to handle this. */
20535 if (n_var == 0)
20537 rtx constant = aarch64_simd_make_constant (vals);
20538 if (constant != NULL_RTX)
20540 emit_move_insn (target, constant);
20541 return;
20545 /* Splat a single non-constant element if we can. */
20546 if (all_same)
20548 rtx x = copy_to_mode_reg (inner_mode, v0);
20549 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
20550 return;
20553 enum insn_code icode = optab_handler (vec_set_optab, mode);
20554 gcc_assert (icode != CODE_FOR_nothing);
20556 /* If there are only variable elements, try to optimize
20557 the insertion using dup for the most common element
20558 followed by insertions. */
20560 /* The algorithm will fill matches[*][0] with the earliest matching element,
20561 and matches[X][1] with the count of duplicate elements (if X is the
20562 earliest element which has duplicates). */
20564 if (n_var == n_elts && n_elts <= 16)
20566 int matches[16][2] = {0};
20567 for (int i = 0; i < n_elts; i++)
20569 for (int j = 0; j <= i; j++)
20571 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
20573 matches[i][0] = j;
20574 matches[j][1]++;
20575 break;
20579 int maxelement = 0;
20580 int maxv = 0;
20581 for (int i = 0; i < n_elts; i++)
20582 if (matches[i][1] > maxv)
20584 maxelement = i;
20585 maxv = matches[i][1];
20588 /* Create a duplicate of the most common element, unless all elements
20589 are equally useless to us, in which case just immediately set the
20590 vector register using the first element. */
20592 if (maxv == 1)
20594 /* For vectors of two 64-bit elements, we can do even better. */
20595 if (n_elts == 2
20596 && (inner_mode == E_DImode
20597 || inner_mode == E_DFmode))
20600 rtx x0 = XVECEXP (vals, 0, 0);
20601 rtx x1 = XVECEXP (vals, 0, 1);
20602 /* Combine can pick up this case, but handling it directly
20603 here leaves clearer RTL.
20605 This is load_pair_lanes<mode>, and also gives us a clean-up
20606 for store_pair_lanes<mode>. */
20607 if (memory_operand (x0, inner_mode)
20608 && memory_operand (x1, inner_mode)
20609 && !STRICT_ALIGNMENT
20610 && rtx_equal_p (XEXP (x1, 0),
20611 plus_constant (Pmode,
20612 XEXP (x0, 0),
20613 GET_MODE_SIZE (inner_mode))))
20615 rtx t;
20616 if (inner_mode == DFmode)
20617 t = gen_load_pair_lanesdf (target, x0, x1);
20618 else
20619 t = gen_load_pair_lanesdi (target, x0, x1);
20620 emit_insn (t);
20621 return;
20624 /* The subreg-move sequence below will move into lane zero of the
20625 vector register. For big-endian we want that position to hold
20626 the last element of VALS. */
20627 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
20628 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
20629 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
20631 else
20633 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
20634 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
20637 /* Insert the rest. */
20638 for (int i = 0; i < n_elts; i++)
20640 rtx x = XVECEXP (vals, 0, i);
20641 if (matches[i][0] == maxelement)
20642 continue;
20643 x = copy_to_mode_reg (inner_mode, x);
20644 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
20646 return;
20649 /* Initialise a vector which is part-variable. We want to first try
20650 to build those lanes which are constant in the most efficient way we
20651 can. */
20652 if (n_var != n_elts)
20654 rtx copy = copy_rtx (vals);
20656 /* Load constant part of vector. We really don't care what goes into the
20657 parts we will overwrite, but we're more likely to be able to load the
20658 constant efficiently if it has fewer, larger, repeating parts
20659 (see aarch64_simd_valid_immediate). */
20660 for (int i = 0; i < n_elts; i++)
20662 rtx x = XVECEXP (vals, 0, i);
20663 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
20664 continue;
20665 rtx subst = any_const;
20666 for (int bit = n_elts / 2; bit > 0; bit /= 2)
20668 /* Look in the copied vector, as more elements are const. */
20669 rtx test = XVECEXP (copy, 0, i ^ bit);
20670 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
20672 subst = test;
20673 break;
20676 XVECEXP (copy, 0, i) = subst;
20678 aarch64_expand_vector_init (target, copy);
20681 /* Insert the variable lanes directly. */
20682 for (int i = 0; i < n_elts; i++)
20684 rtx x = XVECEXP (vals, 0, i);
20685 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
20686 continue;
20687 x = copy_to_mode_reg (inner_mode, x);
20688 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
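/* Illustrative, self-contained sketch (not part of this file) of the
   duplicate-counting scheme above, on a plain integer array: each
   element is matched against the earliest equal element, the counts
   of those earliest elements are accumulated, and the index with the
   highest count is returned so it can be splatted with DUP before the
   remaining lanes are inserted.  The helper name is a placeholder;
   up to 16 elements are handled, as above.  */

static int
sketch_most_common_element (const int *vals, int n_elts)
{
  int matches_count[16] = { 0 };
  int maxelement = 0, maxv = 0;

  for (int i = 0; i < n_elts && i < 16; i++)
    for (int j = 0; j <= i; j++)
      if (vals[i] == vals[j])
        {
          /* J is the earliest element equal to element I.  */
          matches_count[j]++;
          break;
        }

  for (int i = 0; i < n_elts && i < 16; i++)
    if (matches_count[i] > maxv)
      {
        maxelement = i;
        maxv = matches_count[i];
      }
  return maxelement;
}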
20692 /* Emit RTL corresponding to:
20693 insr TARGET, ELEM. */
20695 static void
20696 emit_insr (rtx target, rtx elem)
20698 machine_mode mode = GET_MODE (target);
20699 scalar_mode elem_mode = GET_MODE_INNER (mode);
20700 elem = force_reg (elem_mode, elem);
20702 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
20703 gcc_assert (icode != CODE_FOR_nothing);
20704 emit_insn (GEN_FCN (icode) (target, target, elem));
20707 /* Subroutine of aarch64_sve_expand_vector_init for handling
20708 trailing constants.
20709 This function works as follows:
20710 (a) Create a new vector consisting of trailing constants.
20711 (b) Initialize TARGET with the constant vector using emit_move_insn.
20712 (c) Insert remaining elements in TARGET using insr.
20713 NELTS is the total number of elements in the original vector, while
20714 NELTS_REQD is the number of elements that are actually
20715 significant.
20717 ??? The heuristic used is to do the above only if the number of constants
20718 is at least half the total number of elements. May need fine-tuning. */
20720 static bool
20721 aarch64_sve_expand_vector_init_handle_trailing_constants
20722 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
20724 machine_mode mode = GET_MODE (target);
20725 scalar_mode elem_mode = GET_MODE_INNER (mode);
20726 int n_trailing_constants = 0;
20728 for (int i = nelts_reqd - 1;
20729 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
20730 i--)
20731 n_trailing_constants++;
20733 if (n_trailing_constants >= nelts_reqd / 2)
20735 /* Try to use the natural pattern of BUILDER to extend the trailing
20736 constant elements to a full vector. Replace any variables in the
20737 extra elements with zeros.
20739 ??? It would be better if the builders supported "don't care"
20740 elements, with the builder filling in whichever elements
20741 give the most compact encoding. */
20742 rtx_vector_builder v (mode, nelts, 1);
20743 for (int i = 0; i < nelts; i++)
20745 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
20746 if (!valid_for_const_vector_p (elem_mode, x))
20747 x = const0_rtx;
20748 v.quick_push (x);
20750 rtx const_vec = v.build ();
20751 emit_move_insn (target, const_vec);
20753 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
20754 emit_insr (target, builder.elt (i));
20756 return true;
20759 return false;
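/* Illustrative, self-contained sketch (not part of this file) of steps
   (a)-(c) above, on plain arrays: count the trailing constant
   elements; if they make up at least half of the required elements,
   start from a vector holding just those constants and then model
   each INSR as a shift-in at lane 0.  VALS, IS_CONST and TARGET are
   placeholders for the builder and the vector register.  */

static int
sketch_init_trailing_constants (const int *vals, const int *is_const,
                                int nelts_reqd, int *target)
{
  int n_trailing = 0;
  for (int i = nelts_reqd - 1; i >= 0 && is_const[i]; i--)
    n_trailing++;
  if (n_trailing < nelts_reqd / 2)
    return 0;

  /* (a)+(b): materialise the trailing constants at the bottom of the
     register, as the constant move would.  */
  for (int i = 0; i < nelts_reqd; i++)
    target[i] = i < n_trailing ? vals[nelts_reqd - n_trailing + i] : 0;

  /* (c): insert the remaining leading elements from the back; each
     insertion shifts the existing lanes up by one.  */
  for (int i = nelts_reqd - n_trailing - 1; i >= 0; i--)
    {
      for (int j = nelts_reqd - 1; j > 0; j--)
        target[j] = target[j - 1];
      target[0] = vals[i];
    }
  return 1;
}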
20762 /* Subroutine of aarch64_sve_expand_vector_init.
20763 Works as follows:
20764 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
20765 (b) Skip trailing elements from BUILDER, which are the same as
20766 element NELTS_REQD - 1.
20767 (c) Insert earlier elements in reverse order in TARGET using insr. */
20769 static void
20770 aarch64_sve_expand_vector_init_insert_elems (rtx target,
20771 const rtx_vector_builder &builder,
20772 int nelts_reqd)
20774 machine_mode mode = GET_MODE (target);
20775 scalar_mode elem_mode = GET_MODE_INNER (mode);
20777 struct expand_operand ops[2];
20778 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
20779 gcc_assert (icode != CODE_FOR_nothing);
20781 create_output_operand (&ops[0], target, mode);
20782 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
20783 expand_insn (icode, 2, ops);
20785 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
20786 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
20787 emit_insr (target, builder.elt (i));
20790 /* Subroutine of aarch64_sve_expand_vector_init to handle case
20791 when all trailing elements of builder are same.
20792 This works as follows:
20793 (a) Use expand_insn interface to broadcast last vector element in TARGET.
20794 (b) Insert remaining elements in TARGET using insr.
20796 ??? The heuristic used is to do the above if the number of identical trailing
20797 elements is at least 3/4 of the total number of elements, loosely based on
20798 the heuristic from mostly_zeros_p. May need fine-tuning. */
20800 static bool
20801 aarch64_sve_expand_vector_init_handle_trailing_same_elem
20802 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
20804 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
20805 if (ndups >= (3 * nelts_reqd) / 4)
20807 aarch64_sve_expand_vector_init_insert_elems (target, builder,
20808 nelts_reqd - ndups + 1);
20809 return true;
20812 return false;
20815 /* Initialize register TARGET from BUILDER. NELTS is the constant number
20816 of elements in BUILDER.
20818 The function tries to initialize TARGET from BUILDER if it fits one
20819 of the special cases outlined below.
20821 Failing that, the function divides BUILDER into two sub-vectors:
20822 v_even = even elements of BUILDER;
20823 v_odd = odd elements of BUILDER;
20825 and recursively calls itself with v_even and v_odd.
20827 if (recursive call succeeded for v_even or v_odd)
20828 TARGET = zip (v_even, v_odd)
20830 The function returns true if it managed to build TARGET from BUILDER
20831 with one of the special cases, false otherwise.
20833 Example: {a, 1, b, 2, c, 3, d, 4}
20835 The vector gets divided into:
20836 v_even = {a, b, c, d}
20837 v_odd = {1, 2, 3, 4}
20839 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
20840 initializes tmp2 from constant vector v_odd using emit_move_insn.
20842 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
20843 4 elements, so we construct tmp1 from v_even using insr:
20844 tmp1 = dup(d)
20845 insr tmp1, c
20846 insr tmp1, b
20847 insr tmp1, a
20849 And finally:
20850 TARGET = zip (tmp1, tmp2)
20851 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
20853 static bool
20854 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
20855 int nelts, int nelts_reqd)
20857 machine_mode mode = GET_MODE (target);
20859 /* Case 1: Vector contains trailing constants. */
20861 if (aarch64_sve_expand_vector_init_handle_trailing_constants
20862 (target, builder, nelts, nelts_reqd))
20863 return true;
20865 /* Case 2: Vector contains leading constants. */
20867 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
20868 for (int i = 0; i < nelts_reqd; i++)
20869 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
20870 rev_builder.finalize ();
20872 if (aarch64_sve_expand_vector_init_handle_trailing_constants
20873 (target, rev_builder, nelts, nelts_reqd))
20875 emit_insn (gen_aarch64_sve_rev (mode, target, target));
20876 return true;
20879 /* Case 3: Vector contains trailing same element. */
20881 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
20882 (target, builder, nelts_reqd))
20883 return true;
20885 /* Case 4: Vector contains leading same element. */
20887 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
20888 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
20890 emit_insn (gen_aarch64_sve_rev (mode, target, target));
20891 return true;
20894 /* Avoid recursing below 4 elements.
20895 ??? The threshold 4 may need fine-tuning. */
20897 if (nelts_reqd <= 4)
20898 return false;
20900 rtx_vector_builder v_even (mode, nelts, 1);
20901 rtx_vector_builder v_odd (mode, nelts, 1);
20903 for (int i = 0; i < nelts * 2; i += 2)
20905 v_even.quick_push (builder.elt (i));
20906 v_odd.quick_push (builder.elt (i + 1));
20909 v_even.finalize ();
20910 v_odd.finalize ();
20912 rtx tmp1 = gen_reg_rtx (mode);
20913 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
20914 nelts, nelts_reqd / 2);
20916 rtx tmp2 = gen_reg_rtx (mode);
20917 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
20918 nelts, nelts_reqd / 2);
20920 if (!did_even_p && !did_odd_p)
20921 return false;
20923 /* Initialize v_even and v_odd using INSR if it didn't match any of the
20924 special cases and zip v_even, v_odd. */
20926 if (!did_even_p)
20927 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
20929 if (!did_odd_p)
20930 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
20932 rtvec v = gen_rtvec (2, tmp1, tmp2);
20933 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
20934 return true;
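/* Illustrative, self-contained sketch (not part of this file) of the
   even/odd recursion above, on a plain array: the input is split into
   its even-indexed and odd-indexed elements, each half is built
   separately (here simply copied), and the result is re-interleaved,
   which is what the final ZIP1 achieves for the vector case.  The
   helper name and the fixed bound are placeholders.  */

static void
sketch_even_odd_zip (const int *vals, int nelts, int *out)
{
  int v_even[32], v_odd[32];
  int half = nelts / 2;

  if (nelts > 64 || (nelts & 1))
    return;

  for (int i = 0; i < half; i++)
    {
      v_even[i] = vals[2 * i];
      v_odd[i] = vals[2 * i + 1];
    }

  /* In the real code each half is initialised with its own strategy
     (constant move, DUP + INSR, ...) before being zipped together.  */
  for (int i = 0; i < half; i++)
    {
      out[2 * i] = v_even[i];
      out[2 * i + 1] = v_odd[i];
    }
}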
20937 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
20939 void
20940 aarch64_sve_expand_vector_init (rtx target, rtx vals)
20942 machine_mode mode = GET_MODE (target);
20943 int nelts = XVECLEN (vals, 0);
20945 rtx_vector_builder v (mode, nelts, 1);
20946 for (int i = 0; i < nelts; i++)
20947 v.quick_push (XVECEXP (vals, 0, i));
20948 v.finalize ();
20950 /* If neither sub-vector of v could be initialized specially,
20951 then use INSR to insert all elements from v into TARGET.
20952 ??? This might not be optimal for vectors with large
20953 initializers of 16 elements or more.
20954 For nelts < 4, it probably isn't useful to handle specially. */
20956 if (nelts < 4
20957 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
20958 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
20961 /* Check whether VALUE is a vector constant in which every element
20962 is either a power of 2 or a negated power of 2. If so, return
20963 a constant vector of log2s, and flip CODE between PLUS and MINUS
20964 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
20966 static rtx
20967 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
20969 if (GET_CODE (value) != CONST_VECTOR)
20970 return NULL_RTX;
20972 rtx_vector_builder builder;
20973 if (!builder.new_unary_operation (GET_MODE (value), value, false))
20974 return NULL_RTX;
20976 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
20977 /* 1 if the result of the multiplication must be negated,
20978 0 if it mustn't, or -1 if we don't yet care. */
20979 int negate = -1;
20980 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
20981 for (unsigned int i = 0; i < encoded_nelts; ++i)
20983 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
20984 if (!CONST_SCALAR_INT_P (elt))
20985 return NULL_RTX;
20986 rtx_mode_t val (elt, int_mode);
20987 wide_int pow2 = wi::neg (val);
20988 if (val != pow2)
20990 /* It matters whether we negate or not. Make that choice,
20991 and make sure that it's consistent with previous elements. */
20992 if (negate == !wi::neg_p (val))
20993 return NULL_RTX;
20994 negate = wi::neg_p (val);
20995 if (!negate)
20996 pow2 = val;
20998 /* POW2 is now the value that we want to be a power of 2. */
20999 int shift = wi::exact_log2 (pow2);
21000 if (shift < 0)
21001 return NULL_RTX;
21002 builder.quick_push (gen_int_mode (shift, int_mode));
21004 if (negate == -1)
21005 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
21006 code = PLUS;
21007 else if (negate == 1)
21008 code = code == PLUS ? MINUS : PLUS;
21009 return builder.build ();
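/* Illustrative, self-contained sketch (not part of this file) of the
   conversion above for a single scalar multiplier: return the shift
   amount if VALUE is a power of 2 or a negated power of 2, recording
   in *NEGATE whether the surrounding PLUS/MINUS must be flipped, or
   -1 otherwise.  In the vector code the same decision must in
   addition be consistent across all encoded elements.  The helper
   name is a placeholder.  */

static int
sketch_mult_to_shift (long long value, int *negate)
{
  unsigned long long pow2
    = value < 0 ? -(unsigned long long) value : (unsigned long long) value;

  if (pow2 == 0 || (pow2 & (pow2 - 1)) != 0)
    return -1;

  *negate = value < 0;
  int shift = 0;
  while ((pow2 >> shift) != 1)
    shift++;
  return shift;
}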
21012 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
21013 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
21014 operands array, in the same order as for fma_optab. Return true if
21015 the function emitted all the necessary instructions, false if the caller
21016 should generate the pattern normally with the new OPERANDS array. */
21018 bool
21019 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
21021 machine_mode mode = GET_MODE (operands[0]);
21022 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
21024 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
21025 NULL_RTX, true, OPTAB_DIRECT);
21026 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
21027 operands[3], product, operands[0], true,
21028 OPTAB_DIRECT);
21029 return true;
21031 operands[2] = force_reg (mode, operands[2]);
21032 return false;
21035 /* Likewise, but for a conditional pattern. */
21037 bool
21038 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
21040 machine_mode mode = GET_MODE (operands[0]);
21041 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
21043 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
21044 NULL_RTX, true, OPTAB_DIRECT);
21045 emit_insn (gen_cond (code, mode, operands[0], operands[1],
21046 operands[4], product, operands[5]));
21047 return true;
21049 operands[3] = force_reg (mode, operands[3]);
21050 return false;
21053 static unsigned HOST_WIDE_INT
21054 aarch64_shift_truncation_mask (machine_mode mode)
21056 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
21057 return 0;
21058 return GET_MODE_UNIT_BITSIZE (mode) - 1;
21061 /* Select a format to encode pointers in exception handling data. */
21062 int
21063 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
21065 int type;
21066 switch (aarch64_cmodel)
21068 case AARCH64_CMODEL_TINY:
21069 case AARCH64_CMODEL_TINY_PIC:
21070 case AARCH64_CMODEL_SMALL:
21071 case AARCH64_CMODEL_SMALL_PIC:
21072 case AARCH64_CMODEL_SMALL_SPIC:
21073 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
21074 for everything. */
21075 type = DW_EH_PE_sdata4;
21076 break;
21077 default:
21078 /* No assumptions here. 8-byte relocs required. */
21079 type = DW_EH_PE_sdata8;
21080 break;
21082 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
21085 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
21087 static void
21088 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
21090 if (TREE_CODE (decl) == FUNCTION_DECL)
21092 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
21093 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
21095 fprintf (stream, "\t.variant_pcs\t");
21096 assemble_name (stream, name);
21097 fprintf (stream, "\n");
21102 /* The last .arch and .tune assembly strings that we printed. */
21103 static std::string aarch64_last_printed_arch_string;
21104 static std::string aarch64_last_printed_tune_string;
21106 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
21107 by the function fndecl. */
21109 void
21110 aarch64_declare_function_name (FILE *stream, const char* name,
21111 tree fndecl)
21113 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
21115 struct cl_target_option *targ_options;
21116 if (target_parts)
21117 targ_options = TREE_TARGET_OPTION (target_parts);
21118 else
21119 targ_options = TREE_TARGET_OPTION (target_option_current_node);
21120 gcc_assert (targ_options);
21122 const struct processor *this_arch
21123 = aarch64_get_arch (targ_options->x_explicit_arch);
21125 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
21126 std::string extension
21127 = aarch64_get_extension_string_for_isa_flags (isa_flags,
21128 this_arch->flags);
21129 /* Only update the assembler .arch string if it is distinct from the last
21130 such string we printed. */
21131 std::string to_print = this_arch->name + extension;
21132 if (to_print != aarch64_last_printed_arch_string)
21134 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
21135 aarch64_last_printed_arch_string = to_print;
21138 /* Print the cpu name we're tuning for in the comments; it might be
21139 useful to readers of the generated asm. Do it only when it changes
21140 from function to function and verbose assembly is requested. */
21141 const struct processor *this_tune
21142 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
21144 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
21146 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
21147 this_tune->name);
21148 aarch64_last_printed_tune_string = this_tune->name;
21151 aarch64_asm_output_variant_pcs (stream, fndecl, name);
21153 /* Don't forget the type directive for ELF. */
21154 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
21155 ASM_OUTPUT_LABEL (stream, name);
21157 cfun->machine->label_is_assembled = true;
21160 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
21161 the function label and emit a BTI if necessary. */
21163 void
21164 aarch64_print_patchable_function_entry (FILE *file,
21165 unsigned HOST_WIDE_INT patch_area_size,
21166 bool record_p)
21168 if (cfun->machine->label_is_assembled
21169 && aarch64_bti_enabled ()
21170 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
21172 /* Remove the BTI that follows the patch area and insert a new BTI
21173 before the patch area right after the function label. */
21174 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
21175 if (insn
21176 && INSN_P (insn)
21177 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
21178 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
21179 delete_insn (insn);
21180 asm_fprintf (file, "\thint\t34 // bti c\n");
21183 default_print_patchable_function_entry (file, patch_area_size, record_p);
21186 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
21188 void
21189 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
21191 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
21192 const char *value = IDENTIFIER_POINTER (target);
21193 aarch64_asm_output_variant_pcs (stream, decl, name);
21194 ASM_OUTPUT_DEF (stream, name, value);
21197 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
21198 function symbol references. */
21200 void
21201 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
21203 default_elf_asm_output_external (stream, decl, name);
21204 aarch64_asm_output_variant_pcs (stream, decl, name);
21207 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
21208 Used to output the .cfi_b_key_frame directive when signing the current
21209 function with the B key. */
21211 void
21212 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
21214 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
21215 && aarch64_ra_sign_key == AARCH64_KEY_B)
21216 asm_fprintf (f, "\t.cfi_b_key_frame\n");
21219 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
21221 static void
21222 aarch64_start_file (void)
21224 struct cl_target_option *default_options
21225 = TREE_TARGET_OPTION (target_option_default_node);
21227 const struct processor *default_arch
21228 = aarch64_get_arch (default_options->x_explicit_arch);
21229 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
21230 std::string extension
21231 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
21232 default_arch->flags);
21234 aarch64_last_printed_arch_string = default_arch->name + extension;
21235 aarch64_last_printed_tune_string = "";
21236 asm_fprintf (asm_out_file, "\t.arch %s\n",
21237 aarch64_last_printed_arch_string.c_str ());
21239 default_file_start ();
21242 /* Emit load exclusive. */
21244 static void
21245 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
21246 rtx mem, rtx model_rtx)
21248 if (mode == TImode)
21249 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
21250 gen_highpart (DImode, rval),
21251 mem, model_rtx));
21252 else
21253 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
21256 /* Emit store exclusive. */
21258 static void
21259 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
21260 rtx mem, rtx rval, rtx model_rtx)
21262 if (mode == TImode)
21263 emit_insn (gen_aarch64_store_exclusive_pair
21264 (bval, mem, operand_subword (rval, 0, 0, TImode),
21265 operand_subword (rval, 1, 0, TImode), model_rtx));
21266 else
21267 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
21270 /* Mark the previous jump instruction as unlikely. */
21272 static void
21273 aarch64_emit_unlikely_jump (rtx insn)
21275 rtx_insn *jump = emit_jump_insn (insn);
21276 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
21279 /* We store the names of the various atomic helpers in a 5x4 array.
21280 Return the libcall function given MODE, MODEL and NAMES. */
21282 rtx
21283 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
21284 const atomic_ool_names *names)
21286 memmodel model = memmodel_base (INTVAL (model_rtx));
21287 int mode_idx, model_idx;
21289 switch (mode)
21291 case E_QImode:
21292 mode_idx = 0;
21293 break;
21294 case E_HImode:
21295 mode_idx = 1;
21296 break;
21297 case E_SImode:
21298 mode_idx = 2;
21299 break;
21300 case E_DImode:
21301 mode_idx = 3;
21302 break;
21303 case E_TImode:
21304 mode_idx = 4;
21305 break;
21306 default:
21307 gcc_unreachable ();
21310 switch (model)
21312 case MEMMODEL_RELAXED:
21313 model_idx = 0;
21314 break;
21315 case MEMMODEL_CONSUME:
21316 case MEMMODEL_ACQUIRE:
21317 model_idx = 1;
21318 break;
21319 case MEMMODEL_RELEASE:
21320 model_idx = 2;
21321 break;
21322 case MEMMODEL_ACQ_REL:
21323 case MEMMODEL_SEQ_CST:
21324 model_idx = 3;
21325 break;
21326 default:
21327 gcc_unreachable ();
21330 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
21331 VISIBILITY_HIDDEN);
21334 #define DEF0(B, N) \
21335 { "__aarch64_" #B #N "_relax", \
21336 "__aarch64_" #B #N "_acq", \
21337 "__aarch64_" #B #N "_rel", \
21338 "__aarch64_" #B #N "_acq_rel" }
21340 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
21341 { NULL, NULL, NULL, NULL }
21342 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
21344 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
21345 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
21346 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
21347 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
21348 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
21349 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
21351 #undef DEF0
21352 #undef DEF4
21353 #undef DEF5
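/* Illustrative, self-contained sketch (not part of this file) of the
   table lookup above, shown for the 4-byte CAS row only: the access
   size selects the row built by DEF0 and the memory model selects the
   column, so a 4-byte CAS with acquire semantics resolves to
   "__aarch64_cas4_acq".  The helper returns the name as a string; the
   real code returns a libfunc rtx.  MODEL_IDX follows the 0-3 mapping
   used in aarch64_atomic_ool_func.  */

static const char *
sketch_cas4_ool_name (int model_idx)
{
  static const char *const cas4[4]
    = { "__aarch64_cas4_relax", "__aarch64_cas4_acq",
        "__aarch64_cas4_rel", "__aarch64_cas4_acq_rel" };
  return cas4[model_idx];
}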
21355 /* Expand a compare and swap pattern. */
21357 void
21358 aarch64_expand_compare_and_swap (rtx operands[])
21360 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
21361 machine_mode mode, r_mode;
21363 bval = operands[0];
21364 rval = operands[1];
21365 mem = operands[2];
21366 oldval = operands[3];
21367 newval = operands[4];
21368 is_weak = operands[5];
21369 mod_s = operands[6];
21370 mod_f = operands[7];
21371 mode = GET_MODE (mem);
21373 /* Normally the succ memory model must be stronger than fail, but in the
21374 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
21375 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
21376 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
21377 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
21378 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
21380 r_mode = mode;
21381 if (mode == QImode || mode == HImode)
21383 r_mode = SImode;
21384 rval = gen_reg_rtx (r_mode);
21387 if (TARGET_LSE)
21389 /* The CAS insn requires oldval and rval overlap, but we need to
21390 have a copy of oldval saved across the operation to tell if
21391 the operation is successful. */
21392 if (reg_overlap_mentioned_p (rval, oldval))
21393 rval = copy_to_mode_reg (r_mode, oldval);
21394 else
21395 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
21397 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
21398 newval, mod_s));
21399 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
21401 else if (TARGET_OUTLINE_ATOMICS)
21403 /* Oldval must satisfy compare afterward. */
21404 if (!aarch64_plus_operand (oldval, mode))
21405 oldval = force_reg (mode, oldval);
21406 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
21407 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
21408 oldval, mode, newval, mode,
21409 XEXP (mem, 0), Pmode);
21410 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
21412 else
21414 /* The oldval predicate varies by mode. Test it and force to reg. */
21415 insn_code code = code_for_aarch64_compare_and_swap (mode);
21416 if (!insn_data[code].operand[2].predicate (oldval, mode))
21417 oldval = force_reg (mode, oldval);
21419 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
21420 is_weak, mod_s, mod_f));
21421 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
21424 if (r_mode != mode)
21425 rval = gen_lowpart (mode, rval);
21426 emit_move_insn (operands[1], rval);
21428 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
21429 emit_insn (gen_rtx_SET (bval, x));
21432 /* Emit a barrier that is appropriate for memory model MODEL, at the end of a
21433 sequence implementing an atomic operation. */
21435 static void
21436 aarch64_emit_post_barrier (enum memmodel model)
21438 const enum memmodel base_model = memmodel_base (model);
21440 if (is_mm_sync (model)
21441 && (base_model == MEMMODEL_ACQUIRE
21442 || base_model == MEMMODEL_ACQ_REL
21443 || base_model == MEMMODEL_SEQ_CST))
21445 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
21449 /* Split a compare and swap pattern. */
21451 void
21452 aarch64_split_compare_and_swap (rtx operands[])
21454 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
21455 gcc_assert (epilogue_completed);
21457 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
21458 machine_mode mode;
21459 bool is_weak;
21460 rtx_code_label *label1, *label2;
21461 enum memmodel model;
21463 rval = operands[0];
21464 mem = operands[1];
21465 oldval = operands[2];
21466 newval = operands[3];
21467 is_weak = (operands[4] != const0_rtx);
21468 model_rtx = operands[5];
21469 scratch = operands[7];
21470 mode = GET_MODE (mem);
21471 model = memmodel_from_int (INTVAL (model_rtx));
21473 /* When OLDVAL is zero and we want the strong version we can emit a tighter
21474 loop:
21475 .label1:
21476 LD[A]XR rval, [mem]
21477 CBNZ rval, .label2
21478 ST[L]XR scratch, newval, [mem]
21479 CBNZ scratch, .label1
21480 .label2:
21481 CMP rval, 0. */
21482 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
21483 oldval == const0_rtx && mode != TImode);
21485 label1 = NULL;
21486 if (!is_weak)
21488 label1 = gen_label_rtx ();
21489 emit_label (label1);
21491 label2 = gen_label_rtx ();
21493 /* The initial load can be relaxed for a __sync operation since a final
21494 barrier will be emitted to stop code hoisting. */
21495 if (is_mm_sync (model))
21496 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
21497 else
21498 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
21500 if (strong_zero_p)
21501 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
21502 else
21504 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
21505 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
21507 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21508 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
21509 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21511 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
21513 if (!is_weak)
21515 if (aarch64_track_speculation)
21517 /* Emit an explicit compare instruction, so that we can correctly
21518 track the condition codes. */
21519 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
21520 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
21522 else
21523 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
21525 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21526 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
21527 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21529 else
21530 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
21532 emit_label (label2);
21534 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
21535 to set the condition flags. If this is not used it will be removed by
21536 later passes. */
21537 if (strong_zero_p)
21538 aarch64_gen_compare_reg (NE, rval, const0_rtx);
21540 /* Emit any final barrier needed for a __sync operation. */
21541 if (is_mm_sync (model))
21542 aarch64_emit_post_barrier (model);
21545 /* Split an atomic operation. */
21547 void
21548 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
21549 rtx value, rtx model_rtx, rtx cond)
21551 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
21552 gcc_assert (epilogue_completed);
21554 machine_mode mode = GET_MODE (mem);
21555 machine_mode wmode = (mode == DImode ? DImode : SImode);
21556 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
21557 const bool is_sync = is_mm_sync (model);
21558 rtx_code_label *label;
21559 rtx x;
21561 /* Split the atomic operation into a sequence. */
21562 label = gen_label_rtx ();
21563 emit_label (label);
21565 if (new_out)
21566 new_out = gen_lowpart (wmode, new_out);
21567 if (old_out)
21568 old_out = gen_lowpart (wmode, old_out);
21569 else
21570 old_out = new_out;
21571 value = simplify_gen_subreg (wmode, value, mode, 0);
21573 /* The initial load can be relaxed for a __sync operation since a final
21574 barrier will be emitted to stop code hoisting. */
21575 if (is_sync)
21576 aarch64_emit_load_exclusive (mode, old_out, mem,
21577 GEN_INT (MEMMODEL_RELAXED));
21578 else
21579 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
21581 switch (code)
21583 case SET:
21584 new_out = value;
21585 break;
21587 case NOT:
21588 x = gen_rtx_AND (wmode, old_out, value);
21589 emit_insn (gen_rtx_SET (new_out, x));
21590 x = gen_rtx_NOT (wmode, new_out);
21591 emit_insn (gen_rtx_SET (new_out, x));
21592 break;
21594 case MINUS:
21595 if (CONST_INT_P (value))
21597 value = GEN_INT (-UINTVAL (value));
21598 code = PLUS;
21600 /* Fall through. */
21602 default:
21603 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
21604 emit_insn (gen_rtx_SET (new_out, x));
21605 break;
21608 aarch64_emit_store_exclusive (mode, cond, mem,
21609 gen_lowpart (mode, new_out), model_rtx);
21611 if (aarch64_track_speculation)
21613 /* Emit an explicit compare instruction, so that we can correctly
21614 track the condition codes. */
21615 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
21616 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
21618 else
21619 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
21621 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21622 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
21623 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21625 /* Emit any final barrier needed for a __sync operation. */
21626 if (is_sync)
21627 aarch64_emit_post_barrier (model);
21630 static void
21631 aarch64_init_libfuncs (void)
21633 /* Half-precision float operations. The compiler handles all operations
21634 with NULL libfuncs by converting to SFmode. */
21636 /* Conversions. */
21637 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
21638 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
21640 /* Arithmetic. */
21641 set_optab_libfunc (add_optab, HFmode, NULL);
21642 set_optab_libfunc (sdiv_optab, HFmode, NULL);
21643 set_optab_libfunc (smul_optab, HFmode, NULL);
21644 set_optab_libfunc (neg_optab, HFmode, NULL);
21645 set_optab_libfunc (sub_optab, HFmode, NULL);
21647 /* Comparisons. */
21648 set_optab_libfunc (eq_optab, HFmode, NULL);
21649 set_optab_libfunc (ne_optab, HFmode, NULL);
21650 set_optab_libfunc (lt_optab, HFmode, NULL);
21651 set_optab_libfunc (le_optab, HFmode, NULL);
21652 set_optab_libfunc (ge_optab, HFmode, NULL);
21653 set_optab_libfunc (gt_optab, HFmode, NULL);
21654 set_optab_libfunc (unord_optab, HFmode, NULL);
21657 /* Target hook for c_mode_for_suffix. */
21658 static machine_mode
21659 aarch64_c_mode_for_suffix (char suffix)
21661 if (suffix == 'q')
21662 return TFmode;
21664 return VOIDmode;
21667 /* We can only represent floating point constants which will fit in
21668 "quarter-precision" values. These values are characterised by
21669 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
21672 (-1)^s * (n/16) * 2^r
21674 Where:
21675 's' is the sign bit.
21676 'n' is an integer in the range 16 <= n <= 31.
21677 'r' is an integer in the range -3 <= r <= 4. */
21679 /* Return true iff X can be represented by a quarter-precision
21680 floating point immediate operand. Note, we cannot represent 0.0. */
21681 bool
21682 aarch64_float_const_representable_p (rtx x)
21684 /* This represents our current view of how many bits
21685 make up the mantissa. */
21686 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
21687 int exponent;
21688 unsigned HOST_WIDE_INT mantissa, mask;
21689 REAL_VALUE_TYPE r, m;
21690 bool fail;
21692 x = unwrap_const_vec_duplicate (x);
21693 if (!CONST_DOUBLE_P (x))
21694 return false;
21696 if (GET_MODE (x) == VOIDmode
21697 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
21698 return false;
21700 r = *CONST_DOUBLE_REAL_VALUE (x);
21702 /* We cannot represent infinities, NaNs or +/-zero. We won't
21703 know if we have +zero until we analyse the mantissa, but we
21704 can reject the other invalid values. */
21705 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
21706 || REAL_VALUE_MINUS_ZERO (r))
21707 return false;
21709 /* Extract exponent. */
21710 r = real_value_abs (&r);
21711 exponent = REAL_EXP (&r);
21713 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
21714 highest (sign) bit, with a fixed binary point at bit point_pos.
21715 m1 holds the low part of the mantissa, m2 the high part.
21716 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
21717 bits for the mantissa, this can fail (low bits will be lost). */
21718 real_ldexp (&m, &r, point_pos - exponent);
21719 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
21721 /* If the low part of the mantissa has bits set we cannot represent
21722 the value. */
21723 if (w.ulow () != 0)
21724 return false;
21725 /* We have rejected the lower HOST_WIDE_INT, so update our
21726 understanding of how many bits lie in the mantissa and
21727 look only at the high HOST_WIDE_INT. */
21728 mantissa = w.elt (1);
21729 point_pos -= HOST_BITS_PER_WIDE_INT;
21731 /* We can only represent values with a mantissa of the form 1.xxxx. */
21732 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
21733 if ((mantissa & mask) != 0)
21734 return false;
21736 /* Having filtered unrepresentable values, we may now remove all
21737 but the highest 5 bits. */
21738 mantissa >>= point_pos - 5;
21740 /* We cannot represent the value 0.0, so reject it. This is handled
21741 elsewhere. */
21742 if (mantissa == 0)
21743 return false;
21745 /* Then, as bit 4 is always set, we can mask it off, leaving
21746 the mantissa in the range [0, 15]. */
21747 mantissa &= ~(1 << 4);
21748 gcc_assert (mantissa <= 15);
21750 /* GCC internally does not use IEEE754-like encoding (where normalized
21751 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
21752 Our mantissa values are shifted 4 places to the left relative to
21753 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
21754 by 5 places to correct for GCC's representation. */
21755 exponent = 5 - exponent;
21757 return (exponent >= 0 && exponent <= 7);
21760 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
21761 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
21762 output MOVI/MVNI, ORR or BIC immediate. */
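/* For example (illustrative only), a V16QImode vector with every byte equal
   to 0xff is emitted as "movi\t%0.16b, 0xff", while a vector of 32-bit
   elements equal to 0x1000000 uses the shifted form
   "movi\t%0.4s, 0x1, lsl 24".  Non-zero floating-point duplicates are
   emitted via "fmov" with a decimal immediate.  */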
21763 char*
21764 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
21765 enum simd_immediate_check which)
21767 bool is_valid;
21768 static char templ[40];
21769 const char *mnemonic;
21770 const char *shift_op;
21771 unsigned int lane_count = 0;
21772 char element_char;
21774 struct simd_immediate_info info;
21776 /* This will return true to show const_vector is legal for use as either
21777 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
21778 It will also update INFO to show how the immediate should be generated.
21779 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
21780 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
21781 gcc_assert (is_valid);
21783 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
21784 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
21786 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
21788 gcc_assert (info.insn == simd_immediate_info::MOV
21789 && info.u.mov.shift == 0);
21790 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
21791 move immediate path. */
21792 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
21793 info.u.mov.value = GEN_INT (0);
21794 else
21796 const unsigned int buf_size = 20;
21797 char float_buf[buf_size] = {'\0'};
21798 real_to_decimal_for_mode (float_buf,
21799 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
21800 buf_size, buf_size, 1, info.elt_mode);
21802 if (lane_count == 1)
21803 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
21804 else
21805 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
21806 lane_count, element_char, float_buf);
21807 return templ;
21811 gcc_assert (CONST_INT_P (info.u.mov.value));
21813 if (which == AARCH64_CHECK_MOV)
21815 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
21816 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
21817 ? "msl" : "lsl");
21818 if (lane_count == 1)
21819 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
21820 mnemonic, UINTVAL (info.u.mov.value));
21821 else if (info.u.mov.shift)
21822 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
21823 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
21824 element_char, UINTVAL (info.u.mov.value), shift_op,
21825 info.u.mov.shift);
21826 else
21827 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
21828 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
21829 element_char, UINTVAL (info.u.mov.value));
21831 else
21833 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
21834 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
21835 if (info.u.mov.shift)
21836 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
21837 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
21838 element_char, UINTVAL (info.u.mov.value), "lsl",
21839 info.u.mov.shift);
21840 else
21841 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
21842 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
21843 element_char, UINTVAL (info.u.mov.value));
21845 return templ;
21848 char*
21849 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
21852 /* If a floating point number was passed and we want to use it in an
21853 integer mode, do the conversion to integer.  */
21854 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
21856 unsigned HOST_WIDE_INT ival;
21857 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
21858 gcc_unreachable ();
21859 immediate = gen_int_mode (ival, mode);
21862 machine_mode vmode;
21863 /* Use a 64-bit mode for everything except DI/DF mode, where we use
21864 a 128-bit vector mode. */
21865 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
21867 vmode = aarch64_simd_container_mode (mode, width);
21868 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
21869 return aarch64_output_simd_mov_immediate (v_op, width);
21872 /* Return the output string to use for moving immediate CONST_VECTOR
21873 into an SVE register. */
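/* For example (illustrative only), a VNx4SImode constant of all 7s comes
   out as "mov\t%0.s, #7", the stepped constant { 0, 1, 2, ... } as
   "index\t%0.s, #0, #1", and an all-true predicate as "ptrue\t%0.b, all"
   (or a "vl" form when the vector length is a compile-time constant).  */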
21875 char *
21876 aarch64_output_sve_mov_immediate (rtx const_vector)
21878 static char templ[40];
21879 struct simd_immediate_info info;
21880 char element_char;
21882 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
21883 gcc_assert (is_valid);
21885 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
21887 machine_mode vec_mode = GET_MODE (const_vector);
21888 if (aarch64_sve_pred_mode_p (vec_mode))
21890 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
21891 if (info.insn == simd_immediate_info::MOV)
21893 gcc_assert (info.u.mov.value == const0_rtx);
21894 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
21896 else
21898 gcc_assert (info.insn == simd_immediate_info::PTRUE);
21899 unsigned int total_bytes;
21900 if (info.u.pattern == AARCH64_SV_ALL
21901 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
21902 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
21903 total_bytes / GET_MODE_SIZE (info.elt_mode));
21904 else
21905 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
21906 svpattern_token (info.u.pattern));
21908 return buf;
21911 if (info.insn == simd_immediate_info::INDEX)
21913 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
21914 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
21915 element_char, INTVAL (info.u.index.base),
21916 INTVAL (info.u.index.step));
21917 return templ;
21920 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
21922 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
21923 info.u.mov.value = GEN_INT (0);
21924 else
21926 const int buf_size = 20;
21927 char float_buf[buf_size] = {};
21928 real_to_decimal_for_mode (float_buf,
21929 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
21930 buf_size, buf_size, 1, info.elt_mode);
21932 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
21933 element_char, float_buf);
21934 return templ;
21938 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
21939 element_char, INTVAL (info.u.mov.value));
21940 return templ;
21943 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
21944 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
21945 pattern. */
21947 char *
21948 aarch64_output_sve_ptrues (rtx const_unspec)
21950 static char templ[40];
21952 struct simd_immediate_info info;
21953 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
21954 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
21956 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
21957 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
21958 svpattern_token (info.u.pattern));
21959 return templ;
21962 /* Split operands into moves from op[1] + op[2] into op[0]. */
21964 void
21965 aarch64_split_combinev16qi (rtx operands[3])
21967 unsigned int dest = REGNO (operands[0]);
21968 unsigned int src1 = REGNO (operands[1]);
21969 unsigned int src2 = REGNO (operands[2]);
21970 machine_mode halfmode = GET_MODE (operands[1]);
21971 unsigned int halfregs = REG_NREGS (operands[1]);
21972 rtx destlo, desthi;
21974 gcc_assert (halfmode == V16QImode);
21976 if (src1 == dest && src2 == dest + halfregs)
21978 /* No-op move. Can't split to nothing; emit something. */
21979 emit_note (NOTE_INSN_DELETED);
21980 return;
21983 /* Preserve register attributes for variable tracking. */
21984 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
21985 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
21986 GET_MODE_SIZE (halfmode));
21988 /* Special case of reversed high/low parts. */
21989 if (reg_overlap_mentioned_p (operands[2], destlo)
21990 && reg_overlap_mentioned_p (operands[1], desthi))
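/* Exchange the two inputs in place with the three-XOR trick
   (a ^= b; b ^= a; a ^= b) so that no scratch register is needed.  */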
21992 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
21993 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
21994 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
21996 else if (!reg_overlap_mentioned_p (operands[2], destlo))
21998 /* Try to avoid unnecessary moves if part of the result
21999 is in the right place already. */
22000 if (src1 != dest)
22001 emit_move_insn (destlo, operands[1]);
22002 if (src2 != dest + halfregs)
22003 emit_move_insn (desthi, operands[2]);
22005 else
22007 if (src2 != dest + halfregs)
22008 emit_move_insn (desthi, operands[2]);
22009 if (src1 != dest)
22010 emit_move_insn (destlo, operands[1]);
22014 /* vec_perm support. */
22016 struct expand_vec_perm_d
22018 rtx target, op0, op1;
22019 vec_perm_indices perm;
22020 machine_mode vmode;
22021 unsigned int vec_flags;
22022 bool one_vector_p;
22023 bool testing_p;
22026 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
22028 /* Generate a variable permutation. */
22030 static void
22031 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
22033 machine_mode vmode = GET_MODE (target);
22034 bool one_vector_p = rtx_equal_p (op0, op1);
22036 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
22037 gcc_checking_assert (GET_MODE (op0) == vmode);
22038 gcc_checking_assert (GET_MODE (op1) == vmode);
22039 gcc_checking_assert (GET_MODE (sel) == vmode);
22040 gcc_checking_assert (TARGET_SIMD);
22042 if (one_vector_p)
22044 if (vmode == V8QImode)
22046 /* Expand the argument to a V16QI mode by duplicating it. */
22047 rtx pair = gen_reg_rtx (V16QImode);
22048 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
22049 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
22051 else
22053 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
22056 else
22058 rtx pair;
22060 if (vmode == V8QImode)
22062 pair = gen_reg_rtx (V16QImode);
22063 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
22064 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
22066 else
22068 pair = gen_reg_rtx (OImode);
22069 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
22070 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
22075 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
22076 NELT is the number of elements in the vector. */
22078 void
22079 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
22080 unsigned int nelt)
22082 machine_mode vmode = GET_MODE (target);
22083 bool one_vector_p = rtx_equal_p (op0, op1);
22084 rtx mask;
22086 /* The TBL instruction does not use a modulo index, so we must take care
22087 of that ourselves. */
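/* For example, with a single V16QImode input a selector byte of 0x11 is
   ANDed down to 0x01 so that it selects element 1, matching vec_perm's
   modulo semantics; TBL would otherwise return 0 for it.  */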
22088 mask = aarch64_simd_gen_const_vector_dup (vmode,
22089 one_vector_p ? nelt - 1 : 2 * nelt - 1);
22090 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
22092 /* For big-endian, we also need to reverse the index within the vector
22093 (but not which vector). */
22094 if (BYTES_BIG_ENDIAN)
22096 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
22097 if (!one_vector_p)
22098 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
22099 sel = expand_simple_binop (vmode, XOR, sel, mask,
22100 NULL, 0, OPTAB_LIB_WIDEN);
22102 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
22105 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
22107 static void
22108 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
22110 emit_insn (gen_rtx_SET (target,
22111 gen_rtx_UNSPEC (GET_MODE (target),
22112 gen_rtvec (2, op0, op1), code)));
22115 /* Expand an SVE vec_perm with the given operands. */
22117 void
22118 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
22120 machine_mode data_mode = GET_MODE (target);
22121 machine_mode sel_mode = GET_MODE (sel);
22122 /* Enforced by the pattern condition. */
22123 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
22125 /* Note: vec_perm indices are supposed to wrap when they go beyond the
22126 size of the two value vectors, i.e. the upper bits of the indices
22127 are effectively ignored. SVE TBL instead produces 0 for any
22128 out-of-range indices, so we need to modulo all the vec_perm indices
22129 to ensure they are all in range. */
22130 rtx sel_reg = force_reg (sel_mode, sel);
22132 /* Check if the sel only references the first values vector. */
22133 if (GET_CODE (sel) == CONST_VECTOR
22134 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
22136 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
22137 return;
22140 /* Check if the two values vectors are the same. */
22141 if (rtx_equal_p (op0, op1))
22143 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
22144 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
22145 NULL, 0, OPTAB_DIRECT);
22146 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
22147 return;
22150 /* Run a TBL on each value vector and combine the results. */
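/* Any index that refers to the second vector is out of range for the
   first TBL and therefore yields 0 there, and after subtracting NUNITS
   the indices that referred to the first vector become out of range for
   the second TBL; ORing the two partial results therefore gives the
   full permute.  */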
22152 rtx res0 = gen_reg_rtx (data_mode);
22153 rtx res1 = gen_reg_rtx (data_mode);
22154 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
22155 if (GET_CODE (sel) != CONST_VECTOR
22156 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
22158 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
22159 2 * nunits - 1);
22160 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
22161 NULL, 0, OPTAB_DIRECT);
22163 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
22164 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
22165 NULL, 0, OPTAB_DIRECT);
22166 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
22167 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
22168 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
22169 else
22170 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
22173 /* Recognize patterns suitable for the TRN instructions. */
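/* For example, for two V4SImode inputs the index vectors { 0, 4, 2, 6 }
   and { 1, 5, 3, 7 } are recognized here as TRN1 and TRN2 respectively.  */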
22174 static bool
22175 aarch64_evpc_trn (struct expand_vec_perm_d *d)
22177 HOST_WIDE_INT odd;
22178 poly_uint64 nelt = d->perm.length ();
22179 rtx out, in0, in1, x;
22180 machine_mode vmode = d->vmode;
22182 if (GET_MODE_UNIT_SIZE (vmode) > 8)
22183 return false;
22185 /* Note that these are little-endian tests.
22186 We correct for big-endian later. */
22187 if (!d->perm[0].is_constant (&odd)
22188 || (odd != 0 && odd != 1)
22189 || !d->perm.series_p (0, 2, odd, 2)
22190 || !d->perm.series_p (1, 2, nelt + odd, 2))
22191 return false;
22193 /* Success! */
22194 if (d->testing_p)
22195 return true;
22197 in0 = d->op0;
22198 in1 = d->op1;
22199 /* We don't need a big-endian lane correction for SVE; see the comment
22200 at the head of aarch64-sve.md for details. */
22201 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
22203 x = in0, in0 = in1, in1 = x;
22204 odd = !odd;
22206 out = d->target;
22208 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
22209 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
22210 return true;
22213 /* Try to re-encode the PERM constant so it combines odd and even elements.
22214 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
22215 We retry with this new constant with the full suite of patterns. */
22216 static bool
22217 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
22219 expand_vec_perm_d newd;
22220 unsigned HOST_WIDE_INT nelt;
22222 if (d->vec_flags != VEC_ADVSIMD)
22223 return false;
22225 /* Get the new mode. Always twice the size of the inner
22226 and half the elements. */
22227 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
22228 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
22229 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
22230 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
22232 if (new_mode == word_mode)
22233 return false;
22235 /* to_constant is safe since this routine is specific to Advanced SIMD
22236 vectors. */
22237 nelt = d->perm.length ().to_constant ();
22239 vec_perm_builder newpermconst;
22240 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
22242 /* Convert the perm constant if we can. Require even, odd as the pairs. */
22243 for (unsigned int i = 0; i < nelt; i += 2)
22245 poly_int64 elt0 = d->perm[i];
22246 poly_int64 elt1 = d->perm[i + 1];
22247 poly_int64 newelt;
22248 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
22249 return false;
22250 newpermconst.quick_push (newelt.to_constant ());
22252 newpermconst.finalize ();
22254 newd.vmode = new_mode;
22255 newd.vec_flags = VEC_ADVSIMD;
22256 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
22257 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
22258 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
22259 newd.testing_p = d->testing_p;
22260 newd.one_vector_p = d->one_vector_p;
22262 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
22263 return aarch64_expand_vec_perm_const_1 (&newd);
22266 /* Recognize patterns suitable for the UZP instructions. */
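/* For example, for two V4SImode inputs the index vectors { 0, 2, 4, 6 }
   and { 1, 3, 5, 7 } are recognized here as UZP1 and UZP2 respectively.  */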
22267 static bool
22268 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
22270 HOST_WIDE_INT odd;
22271 rtx out, in0, in1, x;
22272 machine_mode vmode = d->vmode;
22274 if (GET_MODE_UNIT_SIZE (vmode) > 8)
22275 return false;
22277 /* Note that these are little-endian tests.
22278 We correct for big-endian later. */
22279 if (!d->perm[0].is_constant (&odd)
22280 || (odd != 0 && odd != 1)
22281 || !d->perm.series_p (0, 1, odd, 2))
22282 return false;
22284 /* Success! */
22285 if (d->testing_p)
22286 return true;
22288 in0 = d->op0;
22289 in1 = d->op1;
22290 /* We don't need a big-endian lane correction for SVE; see the comment
22291 at the head of aarch64-sve.md for details. */
22292 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
22294 x = in0, in0 = in1, in1 = x;
22295 odd = !odd;
22297 out = d->target;
22299 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
22300 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
22301 return true;
22304 /* Recognize patterns suitable for the ZIP instructions. */
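/* For example, for two V4SImode inputs the index vectors { 0, 4, 1, 5 }
   and { 2, 6, 3, 7 } are recognized here as ZIP1 and ZIP2 respectively.  */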
22305 static bool
22306 aarch64_evpc_zip (struct expand_vec_perm_d *d)
22308 unsigned int high;
22309 poly_uint64 nelt = d->perm.length ();
22310 rtx out, in0, in1, x;
22311 machine_mode vmode = d->vmode;
22313 if (GET_MODE_UNIT_SIZE (vmode) > 8)
22314 return false;
22316 /* Note that these are little-endian tests.
22317 We correct for big-endian later. */
22318 poly_uint64 first = d->perm[0];
22319 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
22320 || !d->perm.series_p (0, 2, first, 1)
22321 || !d->perm.series_p (1, 2, first + nelt, 1))
22322 return false;
22323 high = maybe_ne (first, 0U);
22325 /* Success! */
22326 if (d->testing_p)
22327 return true;
22329 in0 = d->op0;
22330 in1 = d->op1;
22331 /* We don't need a big-endian lane correction for SVE; see the comment
22332 at the head of aarch64-sve.md for details. */
22333 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
22335 x = in0, in0 = in1, in1 = x;
22336 high = !high;
22338 out = d->target;
22340 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
22341 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
22342 return true;
22345 /* Recognize patterns for the EXT insn. */
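/* For example, for two V4SImode inputs the index vector { 1, 2, 3, 4 }
   is recognized here as an EXT with an element offset of 1.  */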
22347 static bool
22348 aarch64_evpc_ext (struct expand_vec_perm_d *d)
22350 HOST_WIDE_INT location;
22351 rtx offset;
22353 /* The first element always refers to the first vector.
22354 Check if the extracted indices are increasing by one. */
22355 if (d->vec_flags == VEC_SVE_PRED
22356 || !d->perm[0].is_constant (&location)
22357 || !d->perm.series_p (0, 1, location, 1))
22358 return false;
22360 /* Success! */
22361 if (d->testing_p)
22362 return true;
22364 /* The case where (location == 0) is a no-op for both big- and little-endian,
22365 and is removed by the mid-end at optimization levels -O1 and higher.
22367 We don't need a big-endian lane correction for SVE; see the comment
22368 at the head of aarch64-sve.md for details. */
22369 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
22371 /* After setup, we want the high elements of the first vector (stored
22372 at the LSB end of the register), and the low elements of the second
22373 vector (stored at the MSB end of the register). So swap. */
22374 std::swap (d->op0, d->op1);
22375 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
22376 to_constant () is safe since this is restricted to Advanced SIMD
22377 vectors. */
22378 location = d->perm.length ().to_constant () - location;
22381 offset = GEN_INT (location);
22382 emit_set_insn (d->target,
22383 gen_rtx_UNSPEC (d->vmode,
22384 gen_rtvec (3, d->op0, d->op1, offset),
22385 UNSPEC_EXT));
22386 return true;
22389 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
22390 within each 64-bit, 32-bit or 16-bit granule. */
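/* For example, for a V8HImode vector the index pattern
   { 3, 2, 1, 0, 7, 6, 5, 4 } reverses the 16-bit elements within each
   64-bit granule and is recognized here as REV64.  */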
22392 static bool
22393 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
22395 HOST_WIDE_INT diff;
22396 unsigned int i, size, unspec;
22397 machine_mode pred_mode;
22399 if (d->vec_flags == VEC_SVE_PRED
22400 || !d->one_vector_p
22401 || !d->perm[0].is_constant (&diff)
22402 || !diff)
22403 return false;
22405 if (d->vec_flags & VEC_SVE_DATA)
22406 size = (diff + 1) * aarch64_sve_container_bits (d->vmode);
22407 else
22408 size = (diff + 1) * GET_MODE_UNIT_BITSIZE (d->vmode);
22409 if (size == 64)
22411 unspec = UNSPEC_REV64;
22412 pred_mode = VNx2BImode;
22414 else if (size == 32)
22416 unspec = UNSPEC_REV32;
22417 pred_mode = VNx4BImode;
22419 else if (size == 16)
22421 unspec = UNSPEC_REV16;
22422 pred_mode = VNx8BImode;
22424 else
22425 return false;
22427 unsigned int step = diff + 1;
22428 for (i = 0; i < step; ++i)
22429 if (!d->perm.series_p (i, step, diff - i, step))
22430 return false;
22432 /* Success! */
22433 if (d->testing_p)
22434 return true;
22436 if (d->vec_flags & VEC_SVE_DATA)
22438 rtx pred = aarch64_ptrue_reg (pred_mode);
22439 emit_insn (gen_aarch64_sve_revbhw (d->vmode, pred_mode,
22440 d->target, pred, d->op0));
22441 return true;
22443 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
22444 emit_set_insn (d->target, src);
22445 return true;
22448 /* Recognize patterns for the REV insn, which reverses elements within
22449 a full vector. */
22451 static bool
22452 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
22454 poly_uint64 nelt = d->perm.length ();
22456 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
22457 return false;
22459 if (!d->perm.series_p (0, 1, nelt - 1, -1))
22460 return false;
22462 /* Success! */
22463 if (d->testing_p)
22464 return true;
22466 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
22467 emit_set_insn (d->target, src);
22468 return true;
22471 static bool
22472 aarch64_evpc_dup (struct expand_vec_perm_d *d)
22474 rtx out = d->target;
22475 rtx in0;
22476 HOST_WIDE_INT elt;
22477 machine_mode vmode = d->vmode;
22478 rtx lane;
22480 if (d->vec_flags == VEC_SVE_PRED
22481 || d->perm.encoding ().encoded_nelts () != 1
22482 || !d->perm[0].is_constant (&elt))
22483 return false;
22485 if ((d->vec_flags & VEC_SVE_DATA)
22486 && elt * (aarch64_sve_container_bits (vmode) / 8) >= 64)
22487 return false;
22489 /* Success! */
22490 if (d->testing_p)
22491 return true;
22493 /* The generic preparation in aarch64_expand_vec_perm_const_1
22494 swaps the operand order and the permute indices if it finds
22495 d->perm[0] to be in the second operand. Thus, we can always
22496 use d->op0 and need not do any extra arithmetic to get the
22497 correct lane number. */
22498 in0 = d->op0;
22499 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
22501 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
22502 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
22503 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
22504 return true;
22507 static bool
22508 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
22510 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
22511 machine_mode vmode = d->vmode;
22513 /* Make sure that the indices are constant. */
22514 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
22515 for (unsigned int i = 0; i < encoded_nelts; ++i)
22516 if (!d->perm[i].is_constant ())
22517 return false;
22519 if (d->testing_p)
22520 return true;
22522 /* Generic code will try constant permutation twice. Once with the
22523 original mode and again with the elements lowered to QImode.
22524 So wait and don't do the selector expansion ourselves. */
22525 if (vmode != V8QImode && vmode != V16QImode)
22526 return false;
22528 /* to_constant is safe since this routine is specific to Advanced SIMD
22529 vectors. */
22530 unsigned int nelt = d->perm.length ().to_constant ();
22531 for (unsigned int i = 0; i < nelt; ++i)
22532 /* If big-endian and two vectors, we end up with a weird mixed-endian
22533 mode on NEON. Reverse the index within each word but not the word
22534 itself. to_constant is safe because we checked is_constant above. */
22535 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
22536 ? d->perm[i].to_constant () ^ (nelt - 1)
22537 : d->perm[i].to_constant ());
22539 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
22540 sel = force_reg (vmode, sel);
22542 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
22543 return true;
22546 /* Try to implement D using an SVE TBL instruction. */
22548 static bool
22549 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
22551 unsigned HOST_WIDE_INT nelt;
22553 /* Permuting two variable-length vectors could overflow the
22554 index range. */
22555 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
22556 return false;
22558 if (d->testing_p)
22559 return true;
22561 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
22562 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
22563 if (d->one_vector_p)
22564 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
22565 else
22566 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
22567 return true;
22570 /* Try to implement D using SVE SEL instruction. */
22572 static bool
22573 aarch64_evpc_sel (struct expand_vec_perm_d *d)
22575 machine_mode vmode = d->vmode;
22576 int unit_size = GET_MODE_UNIT_SIZE (vmode);
22578 if (d->vec_flags != VEC_SVE_DATA
22579 || unit_size > 8)
22580 return false;
22582 int n_patterns = d->perm.encoding ().npatterns ();
22583 poly_int64 vec_len = d->perm.length ();
22585 for (int i = 0; i < n_patterns; ++i)
22586 if (!known_eq (d->perm[i], i)
22587 && !known_eq (d->perm[i], vec_len + i))
22588 return false;
22590 for (int i = n_patterns; i < n_patterns * 2; i++)
22591 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
22592 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
22593 return false;
22595 if (d->testing_p)
22596 return true;
22598 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
22600 /* Build a predicate that is true when op0 elements should be used. */
22601 rtx_vector_builder builder (pred_mode, n_patterns, 2);
22602 for (int i = 0; i < n_patterns * 2; i++)
22604 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
22605 : CONST0_RTX (BImode);
22606 builder.quick_push (elem);
22609 rtx const_vec = builder.build ();
22610 rtx pred = force_reg (pred_mode, const_vec);
22611 /* TARGET = PRED ? OP0 : OP1. */
22612 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
22613 return true;
22616 /* Recognize patterns suitable for the INS instructions. */
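/* For example, for two V4SImode inputs the index vector { 0, 1, 6, 3 }
   differs from the identity permutation in a single lane, and is
   implemented as an INS of element 2 of the second input into lane 2
   of (a copy of) the first.  */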
22617 static bool
22618 aarch64_evpc_ins (struct expand_vec_perm_d *d)
22620 machine_mode mode = d->vmode;
22621 unsigned HOST_WIDE_INT nelt;
22623 if (d->vec_flags != VEC_ADVSIMD)
22624 return false;
22626 /* to_constant is safe since this routine is specific to Advanced SIMD
22627 vectors. */
22628 nelt = d->perm.length ().to_constant ();
22629 rtx insv = d->op0;
22631 HOST_WIDE_INT idx = -1;
22633 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
22635 HOST_WIDE_INT elt;
22636 if (!d->perm[i].is_constant (&elt))
22637 return false;
22638 if (elt == (HOST_WIDE_INT) i)
22639 continue;
22640 if (idx != -1)
22642 idx = -1;
22643 break;
22645 idx = i;
22648 if (idx == -1)
22650 insv = d->op1;
22651 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
22653 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
22654 continue;
22655 if (idx != -1)
22656 return false;
22657 idx = i;
22660 if (idx == -1)
22661 return false;
22664 if (d->testing_p)
22665 return true;
22667 gcc_assert (idx != -1);
22669 unsigned extractindex = d->perm[idx].to_constant ();
22670 rtx extractv = d->op0;
22671 if (extractindex >= nelt)
22673 extractv = d->op1;
22674 extractindex -= nelt;
22676 gcc_assert (extractindex < nelt);
22678 emit_move_insn (d->target, insv);
22679 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
22680 expand_operand ops[5];
22681 create_output_operand (&ops[0], d->target, mode);
22682 create_input_operand (&ops[1], d->target, mode);
22683 create_integer_operand (&ops[2], 1 << idx);
22684 create_input_operand (&ops[3], extractv, mode);
22685 create_integer_operand (&ops[4], extractindex);
22686 expand_insn (icode, 5, ops);
22688 return true;
22691 static bool
22692 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
22694 /* The pattern matching functions above are written to look for a small
22695 number to begin the sequence (0, 1, N/2). If we begin with an index
22696 from the second operand, we can swap the operands. */
22697 poly_int64 nelt = d->perm.length ();
22698 if (known_ge (d->perm[0], nelt))
22700 d->perm.rotate_inputs (1);
22701 std::swap (d->op0, d->op1);
22704 if ((d->vec_flags == VEC_ADVSIMD
22705 || d->vec_flags == VEC_SVE_DATA
22706 || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
22707 || d->vec_flags == VEC_SVE_PRED)
22708 && known_gt (nelt, 1))
22710 if (aarch64_evpc_rev_local (d))
22711 return true;
22712 else if (aarch64_evpc_rev_global (d))
22713 return true;
22714 else if (aarch64_evpc_ext (d))
22715 return true;
22716 else if (aarch64_evpc_dup (d))
22717 return true;
22718 else if (aarch64_evpc_zip (d))
22719 return true;
22720 else if (aarch64_evpc_uzp (d))
22721 return true;
22722 else if (aarch64_evpc_trn (d))
22723 return true;
22724 else if (aarch64_evpc_sel (d))
22725 return true;
22726 else if (aarch64_evpc_ins (d))
22727 return true;
22728 else if (aarch64_evpc_reencode (d))
22729 return true;
22730 if (d->vec_flags == VEC_SVE_DATA)
22731 return aarch64_evpc_sve_tbl (d);
22732 else if (d->vec_flags == VEC_ADVSIMD)
22733 return aarch64_evpc_tbl (d);
22735 return false;
22738 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
22740 static bool
22741 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
22742 rtx op1, const vec_perm_indices &sel)
22744 struct expand_vec_perm_d d;
22746 /* Check whether the mask can be applied to a single vector. */
22747 if (sel.ninputs () == 1
22748 || (op0 && rtx_equal_p (op0, op1)))
22749 d.one_vector_p = true;
22750 else if (sel.all_from_input_p (0))
22752 d.one_vector_p = true;
22753 op1 = op0;
22755 else if (sel.all_from_input_p (1))
22757 d.one_vector_p = true;
22758 op0 = op1;
22760 else
22761 d.one_vector_p = false;
22763 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
22764 sel.nelts_per_input ());
22765 d.vmode = vmode;
22766 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
22767 d.target = target;
22768 d.op0 = op0 ? force_reg (vmode, op0) : NULL_RTX;
22769 if (op0 == op1)
22770 d.op1 = d.op0;
22771 else
22772 d.op1 = op1 ? force_reg (vmode, op1) : NULL_RTX;
22773 d.testing_p = !target;
22775 if (!d.testing_p)
22776 return aarch64_expand_vec_perm_const_1 (&d);
22778 rtx_insn *last = get_last_insn ();
22779 bool ret = aarch64_expand_vec_perm_const_1 (&d);
22780 gcc_assert (last == get_last_insn ());
22782 return ret;
22785 /* Generate a byte permute mask for a register of mode MODE,
22786 which has NUNITS units. */
22789 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
22791 /* We have to reverse each vector because we don't have
22792 a permuted load that can reverse-load according to ABI rules. */
22793 rtx mask;
22794 rtvec v = rtvec_alloc (16);
22795 unsigned int i, j;
22796 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
22798 gcc_assert (BYTES_BIG_ENDIAN);
22799 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
22801 for (i = 0; i < nunits; i++)
22802 for (j = 0; j < usize; j++)
22803 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
22804 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
22805 return force_reg (V16QImode, mask);
22808 /* Expand an SVE integer comparison using the SVE equivalent of:
22810 (set TARGET (CODE OP0 OP1)). */
22812 void
22813 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
22815 machine_mode pred_mode = GET_MODE (target);
22816 machine_mode data_mode = GET_MODE (op0);
22817 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
22818 op0, op1);
22819 if (!rtx_equal_p (target, res))
22820 emit_move_insn (target, res);
22823 /* Return the UNSPEC_COND_* code for comparison CODE. */
22825 static unsigned int
22826 aarch64_unspec_cond_code (rtx_code code)
22828 switch (code)
22830 case NE:
22831 return UNSPEC_COND_FCMNE;
22832 case EQ:
22833 return UNSPEC_COND_FCMEQ;
22834 case LT:
22835 return UNSPEC_COND_FCMLT;
22836 case GT:
22837 return UNSPEC_COND_FCMGT;
22838 case LE:
22839 return UNSPEC_COND_FCMLE;
22840 case GE:
22841 return UNSPEC_COND_FCMGE;
22842 case UNORDERED:
22843 return UNSPEC_COND_FCMUO;
22844 default:
22845 gcc_unreachable ();
22849 /* Emit:
22851 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
22853 where <X> is the operation associated with comparison CODE.
22854 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
22856 static void
22857 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
22858 bool known_ptrue_p, rtx op0, rtx op1)
22860 rtx flag = gen_int_mode (known_ptrue_p, SImode);
22861 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
22862 gen_rtvec (4, pred, flag, op0, op1),
22863 aarch64_unspec_cond_code (code));
22864 emit_set_insn (target, unspec);
22867 /* Emit the SVE equivalent of:
22869 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
22870 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
22871 (set TARGET (ior:PRED_MODE TMP1 TMP2))
22873 where <Xi> is the operation associated with comparison CODEi.
22874 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
22876 static void
22877 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
22878 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
22880 machine_mode pred_mode = GET_MODE (pred);
22881 rtx tmp1 = gen_reg_rtx (pred_mode);
22882 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
22883 rtx tmp2 = gen_reg_rtx (pred_mode);
22884 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
22885 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
22888 /* Emit the SVE equivalent of:
22890 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
22891 (set TARGET (not TMP))
22893 where <X> is the operation associated with comparison CODE.
22894 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
22896 static void
22897 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
22898 bool known_ptrue_p, rtx op0, rtx op1)
22900 machine_mode pred_mode = GET_MODE (pred);
22901 rtx tmp = gen_reg_rtx (pred_mode);
22902 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
22903 aarch64_emit_unop (target, one_cmpl_optab, tmp);
22906 /* Expand an SVE floating-point comparison using the SVE equivalent of:
22908 (set TARGET (CODE OP0 OP1))
22910 If CAN_INVERT_P is true, the caller can also handle inverted results;
22911 return true if the result is in fact inverted. */
22913 bool
22914 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
22915 rtx op0, rtx op1, bool can_invert_p)
22917 machine_mode pred_mode = GET_MODE (target);
22918 machine_mode data_mode = GET_MODE (op0);
22920 rtx ptrue = aarch64_ptrue_reg (pred_mode);
22921 switch (code)
22923 case UNORDERED:
22924 /* UNORDERED has no immediate form. */
22925 op1 = force_reg (data_mode, op1);
22926 /* fall through */
22927 case LT:
22928 case LE:
22929 case GT:
22930 case GE:
22931 case EQ:
22932 case NE:
22934 /* There is native support for the comparison. */
22935 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
22936 return false;
22939 case LTGT:
22940 /* This is a trapping operation (LT or GT). */
22941 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
22942 return false;
22944 case UNEQ:
22945 if (!flag_trapping_math)
22947 /* This would trap for signaling NaNs. */
22948 op1 = force_reg (data_mode, op1);
22949 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
22950 ptrue, true, op0, op1);
22951 return false;
22953 /* fall through */
22954 case UNLT:
22955 case UNLE:
22956 case UNGT:
22957 case UNGE:
22958 if (flag_trapping_math)
22960 /* Work out which elements are ordered. */
22961 rtx ordered = gen_reg_rtx (pred_mode);
22962 op1 = force_reg (data_mode, op1);
22963 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
22964 ptrue, true, op0, op1);
22966 /* Test the opposite condition for the ordered elements,
22967 then invert the result. */
22968 if (code == UNEQ)
22969 code = NE;
22970 else
22971 code = reverse_condition_maybe_unordered (code);
22972 if (can_invert_p)
22974 aarch64_emit_sve_fp_cond (target, code,
22975 ordered, false, op0, op1);
22976 return true;
22978 aarch64_emit_sve_invert_fp_cond (target, code,
22979 ordered, false, op0, op1);
22980 return false;
22982 break;
22984 case ORDERED:
22985 /* ORDERED has no immediate form. */
22986 op1 = force_reg (data_mode, op1);
22987 break;
22989 default:
22990 gcc_unreachable ();
22993 /* There is native support for the inverse comparison. */
22994 code = reverse_condition_maybe_unordered (code);
22995 if (can_invert_p)
22997 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
22998 return true;
23000 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
23001 return false;
23004 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
23005 of the data being selected and CMP_MODE is the mode of the values being
23006 compared. */
23008 void
23009 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
23010 rtx *ops)
23012 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
23013 rtx pred = gen_reg_rtx (pred_mode);
23014 if (FLOAT_MODE_P (cmp_mode))
23016 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
23017 ops[4], ops[5], true))
23018 std::swap (ops[1], ops[2]);
23020 else
23021 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
23023 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
23024 ops[1] = force_reg (data_mode, ops[1]);
23025 /* The "false" value can only be zero if the "true" value is a constant. */
23026 if (register_operand (ops[1], data_mode)
23027 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
23028 ops[2] = force_reg (data_mode, ops[2]);
23030 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
23031 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
23034 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
23035 true. However, due to issues with register allocation it is preferable
23036 to avoid tying integer scalar and FP scalar modes. Executing integer
23037 operations in general registers is better than treating them as scalar
23038 vector operations. This reduces latency and avoids redundant int<->FP
23039 moves. So tie modes if they are either the same class, or vector modes
23040 with other vector modes, vector structs or any scalar mode. */
23042 static bool
23043 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
23045 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
23046 return true;
23048 /* We specifically want to allow elements of "structure" modes to
23049 be tieable to the structure. This more general condition allows
23050 other rarer situations too. The reason we don't extend this to
23051 predicate modes is that there are no predicate structure modes
23052 nor any specific instructions for extracting part of a predicate
23053 register. */
23054 if (aarch64_vector_data_mode_p (mode1)
23055 && aarch64_vector_data_mode_p (mode2))
23056 return true;
23058 /* Also allow any scalar modes with vectors. */
23059 if (aarch64_vector_mode_supported_p (mode1)
23060 || aarch64_vector_mode_supported_p (mode2))
23061 return true;
23063 return false;
23066 /* Return a new RTX holding the result of moving POINTER forward by
23067 AMOUNT bytes. */
23069 static rtx
23070 aarch64_move_pointer (rtx pointer, poly_int64 amount)
23072 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
23074 return adjust_automodify_address (pointer, GET_MODE (pointer),
23075 next, amount);
23078 /* Return a new RTX holding the result of moving POINTER forward by the
23079 size of the mode it points to. */
23081 static rtx
23082 aarch64_progress_pointer (rtx pointer)
23084 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
23087 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
23088 MODE bytes. */
23090 static void
23091 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
23092 machine_mode mode)
23094 /* Handle 256-bit memcpy separately. We do this by making 2 adjacent memory
23095 address copies using V4SImode so that we can use Q registers. */
23096 if (known_eq (GET_MODE_BITSIZE (mode), 256))
23098 mode = V4SImode;
23099 rtx reg1 = gen_reg_rtx (mode);
23100 rtx reg2 = gen_reg_rtx (mode);
23101 /* "Cast" the pointers to the correct mode. */
23102 *src = adjust_address (*src, mode, 0);
23103 *dst = adjust_address (*dst, mode, 0);
23104 /* Emit the memcpy. */
23105 emit_insn (aarch64_gen_load_pair (mode, reg1, *src, reg2,
23106 aarch64_progress_pointer (*src)));
23107 emit_insn (aarch64_gen_store_pair (mode, *dst, reg1,
23108 aarch64_progress_pointer (*dst), reg2));
23109 /* Move the pointers forward. */
23110 *src = aarch64_move_pointer (*src, 32);
23111 *dst = aarch64_move_pointer (*dst, 32);
23112 return;
23115 rtx reg = gen_reg_rtx (mode);
23117 /* "Cast" the pointers to the correct mode. */
23118 *src = adjust_address (*src, mode, 0);
23119 *dst = adjust_address (*dst, mode, 0);
23120 /* Emit the memcpy. */
23121 emit_move_insn (reg, *src);
23122 emit_move_insn (*dst, reg);
23123 /* Move the pointers forward. */
23124 *src = aarch64_progress_pointer (*src);
23125 *dst = aarch64_progress_pointer (*dst);
23128 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
23129 we succeed, otherwise return false. */
23131 bool
23132 aarch64_expand_cpymem (rtx *operands)
23134 int mode_bits;
23135 rtx dst = operands[0];
23136 rtx src = operands[1];
23137 rtx base;
23138 machine_mode cur_mode = BLKmode;
23140 /* Only expand fixed-size copies. */
23141 if (!CONST_INT_P (operands[2]))
23142 return false;
23144 unsigned HOST_WIDE_INT size = INTVAL (operands[2]);
23146 /* Inline up to 256 bytes when optimizing for speed. */
23147 unsigned HOST_WIDE_INT max_copy_size = 256;
23149 if (optimize_function_for_size_p (cfun))
23150 max_copy_size = 128;
23152 int copy_bits = 256;
23154 /* Default to 256-bit LDP/STP on large copies; for small copies, when there is
23155 no SIMD support, or when 256-bit LDP/STP is slow, fall back to 128-bit chunks. */
23156 if (size <= 24
23157 || !TARGET_SIMD
23158 || (aarch64_tune_params.extra_tuning_flags
23159 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
23161 copy_bits = 128;
23162 max_copy_size = max_copy_size / 2;
23165 if (size > max_copy_size)
23166 return false;
23168 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
23169 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
23171 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
23172 src = adjust_automodify_address (src, VOIDmode, base, 0);
23174 /* Convert size to bits to make the rest of the code simpler. */
23175 int n = size * BITS_PER_UNIT;
23177 while (n > 0)
23179 /* Find the largest mode in which to do the copy without over-reading
23180 or over-writing. */
23181 opt_scalar_int_mode mode_iter;
23182 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
23183 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_bits))
23184 cur_mode = mode_iter.require ();
23186 gcc_assert (cur_mode != BLKmode);
23188 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
23190 /* Prefer Q-register accesses for the last bytes. */
23191 if (mode_bits == 128 && copy_bits == 256)
23192 cur_mode = V4SImode;
23194 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
23196 n -= mode_bits;
23198 /* Emit trailing copies using overlapping unaligned accesses - this is
23199 smaller and faster. */
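/* For example (illustrative only), a 15-byte copy is emitted as a DImode
   copy of bytes 0-7 followed by an overlapping DImode copy of bytes 7-14,
   instead of separate 8-, 4-, 2- and 1-byte copies.  */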
23200 if (n > 0 && n < copy_bits / 2)
23202 machine_mode next_mode = smallest_mode_for_size (n, MODE_INT);
23203 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
23204 gcc_assert (n_bits <= mode_bits);
23205 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
23206 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
23207 n = n_bits;
23211 return true;
23214 /* Like aarch64_copy_one_block_and_progress_pointers, except for memset where
23215 SRC is a register we have created with the duplicated value to be set. */
23216 static void
23217 aarch64_set_one_block_and_progress_pointer (rtx src, rtx *dst,
23218 machine_mode mode)
23220 /* If we are copying 128 bits or 256 bits, we can do that straight from
23221 the SIMD register we prepared. */
23222 if (known_eq (GET_MODE_BITSIZE (mode), 256))
23224 mode = GET_MODE (src);
23225 /* "Cast" the *dst to the correct mode. */
23226 *dst = adjust_address (*dst, mode, 0);
23227 /* Emit the memset. */
23228 emit_insn (aarch64_gen_store_pair (mode, *dst, src,
23229 aarch64_progress_pointer (*dst), src));
23231 /* Move the pointers forward. */
23232 *dst = aarch64_move_pointer (*dst, 32);
23233 return;
23235 if (known_eq (GET_MODE_BITSIZE (mode), 128))
23237 /* "Cast" the *dst to the correct mode. */
23238 *dst = adjust_address (*dst, GET_MODE (src), 0);
23239 /* Emit the memset. */
23240 emit_move_insn (*dst, src);
23241 /* Move the pointers forward. */
23242 *dst = aarch64_move_pointer (*dst, 16);
23243 return;
23245 /* For copying less, we have to extract the right amount from src. */
23246 rtx reg = lowpart_subreg (mode, src, GET_MODE (src));
23248 /* "Cast" the *dst to the correct mode. */
23249 *dst = adjust_address (*dst, mode, 0);
23250 /* Emit the memset. */
23251 emit_move_insn (*dst, reg);
23252 /* Move the pointer forward. */
23253 *dst = aarch64_progress_pointer (*dst);
23256 /* Expand setmem, as if from a __builtin_memset. Return true if
23257 we succeed, otherwise return false. */
23259 bool
23260 aarch64_expand_setmem (rtx *operands)
23262 int n, mode_bits;
23263 unsigned HOST_WIDE_INT len;
23264 rtx dst = operands[0];
23265 rtx val = operands[2], src;
23266 rtx base;
23267 machine_mode cur_mode = BLKmode, next_mode;
23269 /* We can't do anything smart if the amount to copy is not constant. */
23270 if (!CONST_INT_P (operands[1]))
23271 return false;
23273 bool speed_p = !optimize_function_for_size_p (cfun);
23275 /* Default the maximum to 256 bytes. */
23276 unsigned max_set_size = 256;
23278 /* In case we are optimizing for size or if the core does not
23279 want to use STP Q regs, lower the max_set_size. */
23280 max_set_size = (!speed_p
23281 || (aarch64_tune_params.extra_tuning_flags
23282 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
23283 ? max_set_size / 2 : max_set_size;
23285 len = INTVAL (operands[1]);
23287 /* Upper bound check. */
23288 if (len > max_set_size)
23289 return false;
23291 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
23292 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
23294 /* Prepare the val using a DUP/MOVI v0.16B, val. */
23295 src = expand_vector_broadcast (V16QImode, val);
23296 src = force_reg (V16QImode, src);
23298 /* Convert len to bits to make the rest of the code simpler. */
23299 n = len * BITS_PER_UNIT;
23301 /* Maximum amount to copy in one go. We allow 256-bit chunks based on the
23302 AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS tuning parameter. The setmem expand
23303 pattern is only turned on for TARGET_SIMD. */
23304 const int copy_limit = (speed_p
23305 && (aarch64_tune_params.extra_tuning_flags
23306 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS))
23307 ? GET_MODE_BITSIZE (TImode) : 256;
23309 while (n > 0)
23311 /* Find the largest mode in which to do the copy without
23312 over-writing. */
23313 opt_scalar_int_mode mode_iter;
23314 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
23315 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
23316 cur_mode = mode_iter.require ();
23318 gcc_assert (cur_mode != BLKmode);
23320 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
23321 aarch64_set_one_block_and_progress_pointer (src, &dst, cur_mode);
23323 n -= mode_bits;
23325 /* Do certain trailing copies as overlapping if it's going to be
23326 cheaper, i.e. fewer instructions. For instance, for a 15
23327 byte copy it's more efficient to do two overlapping 8 byte copies than
23328 copies of 8 + 4 + 2 + 1 bytes. */
23329 if (n > 0 && n < copy_limit / 2)
23331 next_mode = smallest_mode_for_size (n, MODE_INT);
23332 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
23333 gcc_assert (n_bits <= mode_bits);
23334 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
23335 n = n_bits;
23339 return true;
23343 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
23344 SImode stores. Handle the case when the constant has identical
23345 bottom and top halves. This is beneficial when the two stores can be
23346 merged into an STP and we avoid synthesising potentially expensive
23347 immediates twice. Return true if such a split is possible. */
23349 bool
23350 aarch64_split_dimode_const_store (rtx dst, rtx src)
23352 rtx lo = gen_lowpart (SImode, src);
23353 rtx hi = gen_highpart_mode (SImode, DImode, src);
23355 bool size_p = optimize_function_for_size_p (cfun);
23357 if (!rtx_equal_p (lo, hi))
23358 return false;
23360 unsigned int orig_cost
23361 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
23362 unsigned int lo_cost
23363 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
23365 /* We want to transform:
23366 MOV x1, 49370
23367 MOVK x1, 0x140, lsl 16
23368 MOVK x1, 0xc0da, lsl 32
23369 MOVK x1, 0x140, lsl 48
23370 STR x1, [x0]
23371 into:
23372 MOV w1, 49370
23373 MOVK w1, 0x140, lsl 16
23374 STP w1, w1, [x0]
23375 So we want to perform this only when we save two instructions
23376 or more. When optimizing for size, however, accept any code size
23377 savings we can. */
23378 if (size_p && orig_cost <= lo_cost)
23379 return false;
23381 if (!size_p
23382 && (orig_cost <= lo_cost + 1))
23383 return false;
23385 rtx mem_lo = adjust_address (dst, SImode, 0);
23386 if (!aarch64_mem_pair_operand (mem_lo, SImode))
23387 return false;
23389 rtx tmp_reg = gen_reg_rtx (SImode);
23390 aarch64_expand_mov_immediate (tmp_reg, lo);
23391 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
23392 /* Don't emit an explicit store pair as this may not be always profitable.
23393 Let the sched-fusion logic decide whether to merge them. */
23394 emit_move_insn (mem_lo, tmp_reg);
23395 emit_move_insn (mem_hi, tmp_reg);
23397 return true;
23400 /* Generate RTL for a conditional branch with rtx comparison CODE in
23401 mode CC_MODE. The destination of the unlikely conditional branch
23402 is LABEL_REF. */
23404 void
23405 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
23406 rtx label_ref)
23408 rtx x;
23409 x = gen_rtx_fmt_ee (code, VOIDmode,
23410 gen_rtx_REG (cc_mode, CC_REGNUM),
23411 const0_rtx);
23413 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
23414 gen_rtx_LABEL_REF (VOIDmode, label_ref),
23415 pc_rtx);
23416 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
23419 /* Generate DImode scratch registers for 128-bit (TImode) addition.
23421 OP1 represents the TImode destination operand 1
23422 OP2 represents the TImode destination operand 2
23423 LOW_DEST represents the low half (DImode) of TImode operand 0
23424 LOW_IN1 represents the low half (DImode) of TImode operand 1
23425 LOW_IN2 represents the low half (DImode) of TImode operand 2
23426 HIGH_DEST represents the high half (DImode) of TImode operand 0
23427 HIGH_IN1 represents the high half (DImode) of TImode operand 1
23428 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
23430 void
23431 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
23432 rtx *low_in1, rtx *low_in2,
23433 rtx *high_dest, rtx *high_in1,
23434 rtx *high_in2)
23436 *low_dest = gen_reg_rtx (DImode);
23437 *low_in1 = gen_lowpart (DImode, op1);
23438 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
23439 subreg_lowpart_offset (DImode, TImode));
23440 *high_dest = gen_reg_rtx (DImode);
23441 *high_in1 = gen_highpart (DImode, op1);
23442 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
23443 subreg_highpart_offset (DImode, TImode));
23446 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
23448 This function differs from 'aarch64_addti_scratch_regs' in that
23449 OP1 can be an immediate constant (zero). We must call
23450 subreg_highpart_offset with DImode and TImode arguments, otherwise
23451 VOIDmode will be used for the const_int which generates an internal
23452 error from subreg_size_highpart_offset which does not expect a size of zero.
23454 OP1 represents the TImode destination operand 1
23455 OP2 represents the TImode destination operand 2
23456 LOW_DEST represents the low half (DImode) of TImode operand 0
23457 LOW_IN1 represents the low half (DImode) of TImode operand 1
23458 LOW_IN2 represents the low half (DImode) of TImode operand 2
23459 HIGH_DEST represents the high half (DImode) of TImode operand 0
23460 HIGH_IN1 represents the high half (DImode) of TImode operand 1
23461 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
23464 void
23465 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
23466 rtx *low_in1, rtx *low_in2,
23467 rtx *high_dest, rtx *high_in1,
23468 rtx *high_in2)
23470 *low_dest = gen_reg_rtx (DImode);
23471 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
23472 subreg_lowpart_offset (DImode, TImode));
23474 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
23475 subreg_lowpart_offset (DImode, TImode));
23476 *high_dest = gen_reg_rtx (DImode);
23478 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
23479 subreg_highpart_offset (DImode, TImode));
23480 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
23481 subreg_highpart_offset (DImode, TImode));
23484 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
23486 OP0 represents the TImode destination operand 0
23487 LOW_DEST represents the low half (DImode) of TImode operand 0
23488 LOW_IN1 represents the low half (DImode) of TImode operand 1
23489 LOW_IN2 represents the low half (DImode) of TImode operand 2
23490 HIGH_DEST represents the high half (DImode) of TImode operand 0
23491 HIGH_IN1 represents the high half (DImode) of TImode operand 1
23492 HIGH_IN2 represents the high half (DImode) of TImode operand 2
23493 UNSIGNED_P is true if the operation is being performed on unsigned
23494 values. */
23495 void
23496 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
23497 rtx low_in2, rtx high_dest, rtx high_in1,
23498 rtx high_in2, bool unsigned_p)
23500 if (low_in2 == const0_rtx)
23502 low_dest = low_in1;
23503 high_in2 = force_reg (DImode, high_in2);
23504 if (unsigned_p)
23505 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
23506 else
23507 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
23509 else
23511 if (aarch64_plus_immediate (low_in2, DImode))
23512 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
23513 GEN_INT (-UINTVAL (low_in2))));
23514 else
23516 low_in2 = force_reg (DImode, low_in2);
23517 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
23519 high_in2 = force_reg (DImode, high_in2);
23521 if (unsigned_p)
23522 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
23523 else
23524 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
23527 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
23528 emit_move_insn (gen_highpart (DImode, op0), high_dest);
23532 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
23534 static unsigned HOST_WIDE_INT
23535 aarch64_asan_shadow_offset (void)
23537 if (TARGET_ILP32)
23538 return (HOST_WIDE_INT_1 << 29);
23539 else
23540 return (HOST_WIDE_INT_1 << 36);
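/* Illustrative note (not part of the hook contract): with libsanitizer's
   default shadow scale of 3, a shadow address is computed roughly as
   (addr >> 3) + offset, so the values above place the shadow region at
   0x20000000 for ILP32 and at 0x1000000000 for LP64.  */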
23543 static rtx
23544 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
23545 int code, tree treeop0, tree treeop1)
23547 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
23548 rtx op0, op1;
23549 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
23550 insn_code icode;
23551 struct expand_operand ops[4];
23553 start_sequence ();
23554 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
23556 op_mode = GET_MODE (op0);
23557 if (op_mode == VOIDmode)
23558 op_mode = GET_MODE (op1);
23560 switch (op_mode)
23562 case E_QImode:
23563 case E_HImode:
23564 case E_SImode:
23565 cmp_mode = SImode;
23566 icode = CODE_FOR_cmpsi;
23567 break;
23569 case E_DImode:
23570 cmp_mode = DImode;
23571 icode = CODE_FOR_cmpdi;
23572 break;
23574 case E_SFmode:
23575 cmp_mode = SFmode;
23576 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
23577 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
23578 break;
23580 case E_DFmode:
23581 cmp_mode = DFmode;
23582 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
23583 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
23584 break;
23586 default:
23587 end_sequence ();
23588 return NULL_RTX;
23591 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
23592 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
23593 if (!op0 || !op1)
23595 end_sequence ();
23596 return NULL_RTX;
23598 *prep_seq = get_insns ();
23599 end_sequence ();
23601 create_fixed_operand (&ops[0], op0);
23602 create_fixed_operand (&ops[1], op1);
23604 start_sequence ();
23605 if (!maybe_expand_insn (icode, 2, ops))
23607 end_sequence ();
23608 return NULL_RTX;
23610 *gen_seq = get_insns ();
23611 end_sequence ();
23613 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
23614 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
23617 static rtx
23618 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
23619 int cmp_code, tree treeop0, tree treeop1, int bit_code)
23621 rtx op0, op1, target;
23622 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
23623 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
23624 insn_code icode;
23625 struct expand_operand ops[6];
23626 int aarch64_cond;
23628 push_to_sequence (*prep_seq);
23629 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
23631 op_mode = GET_MODE (op0);
23632 if (op_mode == VOIDmode)
23633 op_mode = GET_MODE (op1);
23635 switch (op_mode)
23637 case E_QImode:
23638 case E_HImode:
23639 case E_SImode:
23640 cmp_mode = SImode;
23641 break;
23643 case E_DImode:
23644 cmp_mode = DImode;
23645 break;
23647 case E_SFmode:
23648 cmp_mode = SFmode;
23649 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
23650 break;
23652 case E_DFmode:
23653 cmp_mode = DFmode;
23654 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
23655 break;
23657 default:
23658 end_sequence ();
23659 return NULL_RTX;
23662 icode = code_for_ccmp (cc_mode, cmp_mode);
23664 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
23665 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
23666 if (!op0 || !op1)
23668 end_sequence ();
23669 return NULL_RTX;
23671 *prep_seq = get_insns ();
23672 end_sequence ();
23674 target = gen_rtx_REG (cc_mode, CC_REGNUM);
23675 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
23677 if (bit_code != AND)
23679 /* Treat the ccmp patterns as canonical and use them where possible,
23680 but fall back to ccmp_rev patterns if there's no other option. */
23681 rtx_code prev_code = GET_CODE (prev);
23682 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
23683 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
23684 && !(prev_code == EQ
23685 || prev_code == NE
23686 || prev_code == ORDERED
23687 || prev_code == UNORDERED))
23688 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
23689 else
23691 rtx_code code = reverse_condition (prev_code);
23692 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
23694 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
23697 create_fixed_operand (&ops[0], XEXP (prev, 0));
23698 create_fixed_operand (&ops[1], target);
23699 create_fixed_operand (&ops[2], op0);
23700 create_fixed_operand (&ops[3], op1);
23701 create_fixed_operand (&ops[4], prev);
23702 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
23704 push_to_sequence (*gen_seq);
23705 if (!maybe_expand_insn (icode, 6, ops))
23707 end_sequence ();
23708 return NULL_RTX;
23711 *gen_seq = get_insns ();
23712 end_sequence ();
23714 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
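/* Rough illustration of what the two hooks above cooperate to produce
   (register numbers and the exact NZCV immediate are made up): a condition
   such as "a == 0 && b > 5" can be expanded as

     cmp  w0, #0
     ccmp w1, #5, #<nzcv>, eq
     b.gt <label>

   where the CCMP performs the second comparison only if the first one was
   EQ, and otherwise sets the flags to <nzcv> so that the final branch is
   not taken.  */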
23717 #undef TARGET_GEN_CCMP_FIRST
23718 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
23720 #undef TARGET_GEN_CCMP_NEXT
23721 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
23723 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
23724 instruction fusion of some sort. */
23726 static bool
23727 aarch64_macro_fusion_p (void)
23729 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
23733 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
23734 should be kept together during scheduling. */
23736 static bool
23737 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
23739 rtx set_dest;
23740 rtx prev_set = single_set (prev);
23741 rtx curr_set = single_set (curr);
23742 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
23743 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
23745 if (!aarch64_macro_fusion_p ())
23746 return false;
23748 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
23750 /* We are trying to match:
23751 prev (mov) == (set (reg r0) (const_int imm16))
23752 curr (movk) == (set (zero_extract (reg r0)
23753 (const_int 16)
23754 (const_int 16))
23755 (const_int imm16_1)) */
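/* Illustrative assembly for this fusion (immediates are arbitrary):

     mov  x0, #0x1234
     movk x0, #0x5678, lsl #16  */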
23757 set_dest = SET_DEST (curr_set);
23759 if (GET_CODE (set_dest) == ZERO_EXTRACT
23760 && CONST_INT_P (SET_SRC (curr_set))
23761 && CONST_INT_P (SET_SRC (prev_set))
23762 && CONST_INT_P (XEXP (set_dest, 2))
23763 && INTVAL (XEXP (set_dest, 2)) == 16
23764 && REG_P (XEXP (set_dest, 0))
23765 && REG_P (SET_DEST (prev_set))
23766 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
23768 return true;
23772 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
23775 /* We're trying to match:
23776 prev (adrp) == (set (reg r1)
23777 (high (symbol_ref ("SYM"))))
23778 curr (add) == (set (reg r0)
23779 (lo_sum (reg r1)
23780 (symbol_ref ("SYM"))))
23781 Note that r0 need not necessarily be the same as r1, especially
23782 during pre-regalloc scheduling. */
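/* Illustrative assembly for this fusion:

     adrp x1, sym
     add  x0, x1, :lo12:sym  */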
23784 if (satisfies_constraint_Ush (SET_SRC (prev_set))
23785 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
23787 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
23788 && REG_P (XEXP (SET_SRC (curr_set), 0))
23789 && REGNO (XEXP (SET_SRC (curr_set), 0))
23790 == REGNO (SET_DEST (prev_set))
23791 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
23792 XEXP (SET_SRC (curr_set), 1)))
23793 return true;
23797 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
23800 /* We're trying to match:
23801 prev (movk) == (set (zero_extract (reg r0)
23802 (const_int 16)
23803 (const_int 32))
23804 (const_int imm16_1))
23805 curr (movk) == (set (zero_extract (reg r0)
23806 (const_int 16)
23807 (const_int 48))
23808 (const_int imm16_2)) */
23810 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
23811 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
23812 && REG_P (XEXP (SET_DEST (prev_set), 0))
23813 && REG_P (XEXP (SET_DEST (curr_set), 0))
23814 && REGNO (XEXP (SET_DEST (prev_set), 0))
23815 == REGNO (XEXP (SET_DEST (curr_set), 0))
23816 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
23817 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
23818 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
23819 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
23820 && CONST_INT_P (SET_SRC (prev_set))
23821 && CONST_INT_P (SET_SRC (curr_set)))
23822 return true;
23825 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
23827 /* We're trying to match:
23828 prev (adrp) == (set (reg r0)
23829 (high (symbol_ref ("SYM"))))
23830 curr (ldr) == (set (reg r1)
23831 (mem (lo_sum (reg r0)
23832 (symbol_ref ("SYM")))))
23834 curr (ldr) == (set (reg r1)
23835 (zero_extend (mem
23836 (lo_sum (reg r0)
23837 (symbol_ref ("SYM")))))) */
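/* Illustrative assembly for this fusion:

     adrp x0, sym
     ldr  w1, [x0, :lo12:sym]  */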
23838 if (satisfies_constraint_Ush (SET_SRC (prev_set))
23839 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
23841 rtx curr_src = SET_SRC (curr_set);
23843 if (GET_CODE (curr_src) == ZERO_EXTEND)
23844 curr_src = XEXP (curr_src, 0);
23846 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
23847 && REG_P (XEXP (XEXP (curr_src, 0), 0))
23848 && REGNO (XEXP (XEXP (curr_src, 0), 0))
23849 == REGNO (SET_DEST (prev_set))
23850 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
23851 XEXP (SET_SRC (prev_set), 0)))
23852 return true;
23856 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
23857 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
23858 && prev_set && curr_set && any_condjump_p (curr)
23859 && GET_CODE (SET_SRC (prev_set)) == COMPARE
23860 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
23861 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
23862 return true;
23864 /* Fuse flag-setting ALU instructions and conditional branch. */
23865 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
23866 && any_condjump_p (curr))
23868 unsigned int condreg1, condreg2;
23869 rtx cc_reg_1;
23870 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
23871 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
23873 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
23874 && prev
23875 && modified_in_p (cc_reg_1, prev))
23877 enum attr_type prev_type = get_attr_type (prev);
23879 /* FIXME: this misses some instructions that ThunderX considers simple
23880 arithmetic; in particular, simple shifts are missed here. */
23881 if (prev_type == TYPE_ALUS_SREG
23882 || prev_type == TYPE_ALUS_IMM
23883 || prev_type == TYPE_LOGICS_REG
23884 || prev_type == TYPE_LOGICS_IMM)
23885 return true;
23889 /* Fuse ALU instructions and CBZ/CBNZ. */
23890 if (prev_set
23891 && curr_set
23892 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
23893 && any_condjump_p (curr))
23895 /* We're trying to match:
23896 prev (alu_insn) == (set (r0) (plus (r0) (r1/imm)))
23897 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
23898 (const_int 0))
23899 (label_ref ("SYM"))
23900 (pc)) */
23901 if (SET_DEST (curr_set) == (pc_rtx)
23902 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
23903 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
23904 && REG_P (SET_DEST (prev_set))
23905 && REGNO (SET_DEST (prev_set))
23906 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
23908 /* Fuse ALU operations followed by conditional branch instruction. */
23909 switch (get_attr_type (prev))
23911 case TYPE_ALU_IMM:
23912 case TYPE_ALU_SREG:
23913 case TYPE_ADC_REG:
23914 case TYPE_ADC_IMM:
23915 case TYPE_ADCS_REG:
23916 case TYPE_ADCS_IMM:
23917 case TYPE_LOGIC_REG:
23918 case TYPE_LOGIC_IMM:
23919 case TYPE_CSEL:
23920 case TYPE_ADR:
23921 case TYPE_MOV_IMM:
23922 case TYPE_SHIFT_REG:
23923 case TYPE_SHIFT_IMM:
23924 case TYPE_BFM:
23925 case TYPE_RBIT:
23926 case TYPE_REV:
23927 case TYPE_EXTEND:
23928 return true;
23930 default:;
23935 return false;
23938 /* Return true iff the instruction fusion described by OP is enabled. */
23940 bool
23941 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
23943 return (aarch64_tune_params.fusible_ops & op) != 0;
23946 /* If MEM's address is in the form [base+offset], extract the two parts
23947 into BASE and OFFSET and return true, otherwise return false
23948 after clearing BASE and OFFSET. */
23950 bool
23951 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
23953 rtx addr;
23955 gcc_assert (MEM_P (mem));
23957 addr = XEXP (mem, 0);
23959 if (REG_P (addr))
23961 *base = addr;
23962 *offset = const0_rtx;
23963 return true;
23966 if (GET_CODE (addr) == PLUS
23967 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
23969 *base = XEXP (addr, 0);
23970 *offset = XEXP (addr, 1);
23971 return true;
23974 *base = NULL_RTX;
23975 *offset = NULL_RTX;
23977 return false;
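/* For example (illustrative), (mem (plus (reg x1) (const_int 8))) yields
   BASE = (reg x1) and OFFSET = (const_int 8), while (mem (reg x1)) yields
   BASE = (reg x1) and OFFSET = (const_int 0).  */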
23980 /* Types for scheduling fusion. */
23981 enum sched_fusion_type
23983 SCHED_FUSION_NONE = 0,
23984 SCHED_FUSION_LD_SIGN_EXTEND,
23985 SCHED_FUSION_LD_ZERO_EXTEND,
23986 SCHED_FUSION_LD,
23987 SCHED_FUSION_ST,
23988 SCHED_FUSION_NUM
23991 /* If INSN is a load or store whose address is in the form [base+offset],
23992 extract the two parts into BASE and OFFSET. Return the scheduling
23993 fusion type of this INSN. */
23995 static enum sched_fusion_type
23996 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
23998 rtx x, dest, src;
23999 enum sched_fusion_type fusion = SCHED_FUSION_LD;
24001 gcc_assert (INSN_P (insn));
24002 x = PATTERN (insn);
24003 if (GET_CODE (x) != SET)
24004 return SCHED_FUSION_NONE;
24006 src = SET_SRC (x);
24007 dest = SET_DEST (x);
24009 machine_mode dest_mode = GET_MODE (dest);
24011 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
24012 return SCHED_FUSION_NONE;
24014 if (GET_CODE (src) == SIGN_EXTEND)
24016 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
24017 src = XEXP (src, 0);
24018 if (!MEM_P (src) || GET_MODE (src) != SImode)
24019 return SCHED_FUSION_NONE;
24021 else if (GET_CODE (src) == ZERO_EXTEND)
24023 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
24024 src = XEXP (src, 0);
24025 if (!MEM_P (src) || GET_MODE (src) != SImode)
24026 return SCHED_FUSION_NONE;
24029 if (MEM_P (src) && REG_P (dest))
24030 extract_base_offset_in_addr (src, base, offset);
24031 else if (MEM_P (dest) && (REG_P (src) || src == const0_rtx))
24033 fusion = SCHED_FUSION_ST;
24034 extract_base_offset_in_addr (dest, base, offset);
24036 else
24037 return SCHED_FUSION_NONE;
24039 if (*base == NULL_RTX || *offset == NULL_RTX)
24040 fusion = SCHED_FUSION_NONE;
24042 return fusion;
24045 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
24047 Currently we only support fusing ldr and str instructions, so FUSION_PRI
24048 and PRI are only calculated for these instructions. For other instructions,
24049 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
24050 types of instruction fusion can be added by returning different priorities.
24052 It's important that irrelevant instructions get the largest FUSION_PRI. */
24054 static void
24055 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
24056 int *fusion_pri, int *pri)
24058 int tmp, off_val;
24059 rtx base, offset;
24060 enum sched_fusion_type fusion;
24062 gcc_assert (INSN_P (insn));
24064 tmp = max_pri - 1;
24065 fusion = fusion_load_store (insn, &base, &offset);
24066 if (fusion == SCHED_FUSION_NONE)
24068 *pri = tmp;
24069 *fusion_pri = tmp;
24070 return;
24073 /* Set FUSION_PRI according to fusion type and base register. */
24074 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
24076 /* Calculate PRI. */
24077 tmp /= 2;
24079 /* INSN with smaller offset goes first. */
24080 off_val = (int)(INTVAL (offset));
24081 if (off_val >= 0)
24082 tmp -= (off_val & 0xfffff);
24083 else
24084 tmp += ((- off_val) & 0xfffff);
24086 *pri = tmp;
24087 return;
24090 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
24091 Adjust priority of sha1h instructions so they are scheduled before
24092 other SHA1 instructions. */
24094 static int
24095 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
24097 rtx x = PATTERN (insn);
24099 if (GET_CODE (x) == SET)
24101 x = SET_SRC (x);
24103 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
24104 return priority + 10;
24107 return priority;
24110 /* Given OPERANDS of consecutive load/store, check if we can merge
24111 them into ldp/stp. LOAD is true if they are load instructions.
24112 MODE is the mode of memory operands. */
24114 bool
24115 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
24116 machine_mode mode)
24118 HOST_WIDE_INT offval_1, offval_2, msize;
24119 enum reg_class rclass_1, rclass_2;
24120 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
24122 if (load)
24124 mem_1 = operands[1];
24125 mem_2 = operands[3];
24126 reg_1 = operands[0];
24127 reg_2 = operands[2];
24128 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
24129 if (REGNO (reg_1) == REGNO (reg_2))
24130 return false;
24132 else
24134 mem_1 = operands[0];
24135 mem_2 = operands[2];
24136 reg_1 = operands[1];
24137 reg_2 = operands[3];
24140 /* The mems cannot be volatile. */
24141 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
24142 return false;
24144 /* If we have SImode and slow unaligned ldp,
24145 check that the alignment is at least 8 bytes. */
24146 if (mode == SImode
24147 && (aarch64_tune_params.extra_tuning_flags
24148 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
24149 && !optimize_size
24150 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
24151 return false;
24153 /* Check if the addresses are in the form of [base+offset]. */
24154 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
24155 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
24156 return false;
24157 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
24158 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
24159 return false;
24161 /* Check if the bases are the same. */
24162 if (!rtx_equal_p (base_1, base_2))
24163 return false;
24165 /* The operands must be of the same size. */
24166 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
24167 GET_MODE_SIZE (GET_MODE (mem_2))));
24169 offval_1 = INTVAL (offset_1);
24170 offval_2 = INTVAL (offset_2);
24171 /* We should only be trying this for fixed-sized modes. There is no
24172 SVE LDP/STP instruction. */
24173 msize = GET_MODE_SIZE (mode).to_constant ();
24174 /* Check if the offsets are consecutive. */
24175 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
24176 return false;
24178 /* Check if the addresses are clobbered by load. */
24179 if (load)
24181 if (reg_mentioned_p (reg_1, mem_1))
24182 return false;
24184 /* In increasing order, the last load can clobber the address. */
24185 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
24186 return false;
24189 /* One of the memory accesses must be a mempair operand.
24190 If it is not the first one, they need to be swapped by the
24191 peephole. */
24192 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
24193 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
24194 return false;
24196 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
24197 rclass_1 = FP_REGS;
24198 else
24199 rclass_1 = GENERAL_REGS;
24201 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
24202 rclass_2 = FP_REGS;
24203 else
24204 rclass_2 = GENERAL_REGS;
24206 /* Check if the registers are of the same class. */
24207 if (rclass_1 != rclass_2)
24208 return false;
24210 return true;
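/* For example (illustrative), the pair

     ldr w0, [x2]
     ldr w1, [x2, 4]

   satisfies the checks above and can be emitted as "ldp w0, w1, [x2]".  */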
24213 /* Given OPERANDS of consecutive load/store that can be merged,
24214 swap them if they are not in ascending order. */
24215 void
24216 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
24218 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
24219 HOST_WIDE_INT offval_1, offval_2;
24221 if (load)
24223 mem_1 = operands[1];
24224 mem_2 = operands[3];
24226 else
24228 mem_1 = operands[0];
24229 mem_2 = operands[2];
24232 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
24233 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
24235 offval_1 = INTVAL (offset_1);
24236 offval_2 = INTVAL (offset_2);
24238 if (offval_1 > offval_2)
24240 /* Irrespective of whether this is a load or a store,
24241 we do the same swap. */
24242 std::swap (operands[0], operands[2]);
24243 std::swap (operands[1], operands[3]);
24247 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
24248 comparison between the two. */
24250 aarch64_host_wide_int_compare (const void *x, const void *y)
24252 return wi::cmps (* ((const HOST_WIDE_INT *) x),
24253 * ((const HOST_WIDE_INT *) y));
24256 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
24257 other pointing to a REG rtx containing an offset, compare the offsets
24258 of the two pairs.
24260 Return:
24262 1 iff offset (X) > offset (Y)
24263 0 iff offset (X) == offset (Y)
24264 -1 iff offset (X) < offset (Y) */
24266 aarch64_ldrstr_offset_compare (const void *x, const void *y)
24268 const rtx * operands_1 = (const rtx *) x;
24269 const rtx * operands_2 = (const rtx *) y;
24270 rtx mem_1, mem_2, base, offset_1, offset_2;
24272 if (MEM_P (operands_1[0]))
24273 mem_1 = operands_1[0];
24274 else
24275 mem_1 = operands_1[1];
24277 if (MEM_P (operands_2[0]))
24278 mem_2 = operands_2[0];
24279 else
24280 mem_2 = operands_2[1];
24282 /* Extract the offsets. */
24283 extract_base_offset_in_addr (mem_1, &base, &offset_1);
24284 extract_base_offset_in_addr (mem_2, &base, &offset_2);
24286 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
24288 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
24291 /* Given OPERANDS of consecutive load/store, check if we can merge
24292 them into ldp/stp by adjusting the offset. LOAD is true if they
24293 are load instructions. MODE is the mode of memory operands.
24297 Given the following consecutive stores:
24297 str w1, [xb, 0x100]
24298 str w1, [xb, 0x104]
24299 str w1, [xb, 0x108]
24300 str w1, [xb, 0x10c]
24302 Though the offsets are out of the range supported by stp, we can
24303 still pair them after adjusting the offset, like:
24305 add scratch, xb, 0x100
24306 stp w1, w1, [scratch]
24307 stp w1, w1, [scratch, 0x8]
24309 The peephole patterns detecting this opportunity should guarantee
24310 the scratch register is available. */
24312 bool
24313 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
24314 machine_mode mode)
24316 const int num_insns = 4;
24317 enum reg_class rclass;
24318 HOST_WIDE_INT offvals[num_insns], msize;
24319 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
24321 if (load)
24323 for (int i = 0; i < num_insns; i++)
24325 reg[i] = operands[2 * i];
24326 mem[i] = operands[2 * i + 1];
24328 gcc_assert (REG_P (reg[i]));
24331 /* Do not attempt to merge the loads if the loads clobber each other. */
24332 for (int i = 0; i < 8; i += 2)
24333 for (int j = i + 2; j < 8; j += 2)
24334 if (reg_overlap_mentioned_p (operands[i], operands[j]))
24335 return false;
24337 else
24338 for (int i = 0; i < num_insns; i++)
24340 mem[i] = operands[2 * i];
24341 reg[i] = operands[2 * i + 1];
24344 /* Skip if memory operand is by itself valid for ldp/stp. */
24345 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
24346 return false;
24348 for (int i = 0; i < num_insns; i++)
24350 /* The mems cannot be volatile. */
24351 if (MEM_VOLATILE_P (mem[i]))
24352 return false;
24354 /* Check if the addresses are in the form of [base+offset]. */
24355 extract_base_offset_in_addr (mem[i], base + i, offset + i);
24356 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
24357 return false;
24360 /* Check if the registers are of the same class. */
24361 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
24362 ? FP_REGS : GENERAL_REGS;
24364 for (int i = 1; i < num_insns; i++)
24365 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
24367 if (rclass != FP_REGS)
24368 return false;
24370 else
24372 if (rclass != GENERAL_REGS)
24373 return false;
24376 /* Only the last register in the order in which they occur
24377 may be clobbered by the load. */
24378 if (rclass == GENERAL_REGS && load)
24379 for (int i = 0; i < num_insns - 1; i++)
24380 if (reg_mentioned_p (reg[i], mem[i]))
24381 return false;
24383 /* Check if the bases are the same. */
24384 for (int i = 0; i < num_insns - 1; i++)
24385 if (!rtx_equal_p (base[i], base[i + 1]))
24386 return false;
24388 for (int i = 0; i < num_insns; i++)
24389 offvals[i] = INTVAL (offset[i]);
24391 msize = GET_MODE_SIZE (mode).to_constant ();
24393 /* Check if the offsets can be put in the right order to do a ldp/stp. */
24394 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
24395 aarch64_host_wide_int_compare);
24397 if (!(offvals[1] == offvals[0] + msize
24398 && offvals[3] == offvals[2] + msize))
24399 return false;
24401 /* Check that the offsets are within range of each other. The ldp/stp
24402 instructions have 7-bit immediate offsets, so use 0x80. */
24403 if (offvals[2] - offvals[0] >= msize * 0x80)
24404 return false;
24406 /* The offsets must be aligned with respect to each other. */
24407 if (offvals[0] % msize != offvals[2] % msize)
24408 return false;
24410 /* If we have SImode and slow unaligned ldp,
24411 check that the alignment is at least 8 bytes. */
24412 if (mode == SImode
24413 && (aarch64_tune_params.extra_tuning_flags
24414 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
24415 && !optimize_size
24416 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
24417 return false;
24419 return true;
24422 /* Given OPERANDS of consecutive load/store, this function pairs them
24423 into LDP/STP after adjusting the offset. It depends on the fact
24424 that the operands can be sorted so the offsets are correct for STP.
24425 MODE is the mode of memory operands. CODE is the rtl operator
24426 which should be applied to all memory operands; it is SIGN_EXTEND,
24427 ZERO_EXTEND or UNKNOWN. */
24429 bool
24430 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
24431 machine_mode mode, RTX_CODE code)
24433 rtx base, offset_1, offset_3, t1, t2;
24434 rtx mem_1, mem_2, mem_3, mem_4;
24435 rtx temp_operands[8];
24436 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
24437 stp_off_upper_limit, stp_off_lower_limit, msize;
24439 /* We make changes on a copy as we may still bail out. */
24440 for (int i = 0; i < 8; i ++)
24441 temp_operands[i] = operands[i];
24443 /* Sort the operands. */
24444 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
24446 /* Copy the memory operands so that if we have to bail for some
24447 reason the original addresses are unchanged. */
24448 if (load)
24450 mem_1 = copy_rtx (temp_operands[1]);
24451 mem_2 = copy_rtx (temp_operands[3]);
24452 mem_3 = copy_rtx (temp_operands[5]);
24453 mem_4 = copy_rtx (temp_operands[7]);
24455 else
24457 mem_1 = copy_rtx (temp_operands[0]);
24458 mem_2 = copy_rtx (temp_operands[2]);
24459 mem_3 = copy_rtx (temp_operands[4]);
24460 mem_4 = copy_rtx (temp_operands[6]);
24461 gcc_assert (code == UNKNOWN);
24464 extract_base_offset_in_addr (mem_1, &base, &offset_1);
24465 extract_base_offset_in_addr (mem_3, &base, &offset_3);
24466 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
24467 && offset_3 != NULL_RTX);
24469 /* Adjust offset so it can fit in LDP/STP instruction. */
24470 msize = GET_MODE_SIZE (mode).to_constant();
24471 stp_off_upper_limit = msize * (0x40 - 1);
24472 stp_off_lower_limit = - msize * 0x40;
24474 off_val_1 = INTVAL (offset_1);
24475 off_val_3 = INTVAL (offset_3);
24477 /* The base offset is optimally half way between the two STP/LDP offsets. */
24478 if (msize <= 4)
24479 base_off = (off_val_1 + off_val_3) / 2;
24480 else
24481 /* However, due to issues with negative LDP/STP offset generation for
24482 larger modes (DF, DI and vector modes), we must not use addresses
24483 more negative than 9 signed unadjusted bits can store. This
24484 provides the most range in this case. */
24485 base_off = off_val_1;
24487 /* Adjust the base so that it is aligned with the addresses but still
24488 optimal. */
24489 if (base_off % msize != off_val_1 % msize)
24490 /* Fix the offset, bearing in mind we want to make it bigger not
24491 smaller. */
24492 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
24493 else if (msize <= 4)
24494 /* The negative range of LDP/STP is one larger than the positive range. */
24495 base_off += msize;
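/* Worked example (illustrative): for SImode (msize == 4) with
   off_val_1 == 0x100 and off_val_3 == 0x108, base_off starts at 0x104,
   is already aligned with off_val_1, and is then bumped to 0x108, giving
   new_off_1 == -8 and new_off_3 == 0, both well within the LDP/STP
   offset range of [-0x100, 0xfc].  */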
24497 /* Check if base offset is too big or too small. We can attempt to resolve
24498 this issue by setting it to the maximum value and seeing if the offsets
24499 still fit. */
24500 if (base_off >= 0x1000)
24502 base_off = 0x1000 - 1;
24503 /* We must still make sure that the base offset is aligned with respect
24504 to the address. But it may not be made any bigger. */
24505 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
24508 /* Likewise for the case where the base is too small. */
24509 if (base_off <= -0x1000)
24511 base_off = -0x1000 + 1;
24512 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
24515 /* Offset of the first STP/LDP. */
24516 new_off_1 = off_val_1 - base_off;
24518 /* Offset of the second STP/LDP. */
24519 new_off_3 = off_val_3 - base_off;
24521 /* The offsets must be within the range of the LDP/STP instructions. */
24522 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
24523 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
24524 return false;
24526 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
24527 new_off_1), true);
24528 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
24529 new_off_1 + msize), true);
24530 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
24531 new_off_3), true);
24532 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
24533 new_off_3 + msize), true);
24535 if (!aarch64_mem_pair_operand (mem_1, mode)
24536 || !aarch64_mem_pair_operand (mem_3, mode))
24537 return false;
24539 if (code == ZERO_EXTEND)
24541 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
24542 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
24543 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
24544 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
24546 else if (code == SIGN_EXTEND)
24548 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
24549 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
24550 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
24551 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
24554 if (load)
24556 operands[0] = temp_operands[0];
24557 operands[1] = mem_1;
24558 operands[2] = temp_operands[2];
24559 operands[3] = mem_2;
24560 operands[4] = temp_operands[4];
24561 operands[5] = mem_3;
24562 operands[6] = temp_operands[6];
24563 operands[7] = mem_4;
24565 else
24567 operands[0] = mem_1;
24568 operands[1] = temp_operands[1];
24569 operands[2] = mem_2;
24570 operands[3] = temp_operands[3];
24571 operands[4] = mem_3;
24572 operands[5] = temp_operands[5];
24573 operands[6] = mem_4;
24574 operands[7] = temp_operands[7];
24577 /* Emit adjusting instruction. */
24578 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
24579 /* Emit ldp/stp instructions. */
24580 t1 = gen_rtx_SET (operands[0], operands[1]);
24581 t2 = gen_rtx_SET (operands[2], operands[3]);
24582 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
24583 t1 = gen_rtx_SET (operands[4], operands[5]);
24584 t2 = gen_rtx_SET (operands[6], operands[7]);
24585 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
24586 return true;
24589 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
24590 it isn't worth branching around empty masked ops (including masked
24591 stores). */
24593 static bool
24594 aarch64_empty_mask_is_expensive (unsigned)
24596 return false;
24599 /* Return 1 if a pseudo register should be created and used to hold
24600 the GOT address for PIC code. */
24602 bool
24603 aarch64_use_pseudo_pic_reg (void)
24605 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
24608 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
24610 static int
24611 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
24613 switch (XINT (x, 1))
24615 case UNSPEC_GOTSMALLPIC:
24616 case UNSPEC_GOTSMALLPIC28K:
24617 case UNSPEC_GOTTINYPIC:
24618 return 0;
24619 default:
24620 break;
24623 return default_unspec_may_trap_p (x, flags);
24627 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
24628 return the log2 of that value. Otherwise return -1. */
24631 aarch64_fpconst_pow_of_2 (rtx x)
24633 const REAL_VALUE_TYPE *r;
24635 if (!CONST_DOUBLE_P (x))
24636 return -1;
24638 r = CONST_DOUBLE_REAL_VALUE (x);
24640 if (REAL_VALUE_NEGATIVE (*r)
24641 || REAL_VALUE_ISNAN (*r)
24642 || REAL_VALUE_ISINF (*r)
24643 || !real_isinteger (r, DFmode))
24644 return -1;
24646 return exact_log2 (real_to_integer (r));
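/* For example (illustrative): 1.0 -> 0 and 8.0 -> 3, while 0.5, 3.0 and
   -4.0 all return -1.  */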
24649 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
24650 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for
24651 x == 1/2^n return n. Otherwise return -1. */
24654 aarch64_fpconst_pow2_recip (rtx x)
24656 REAL_VALUE_TYPE r0;
24658 if (!CONST_DOUBLE_P (x))
24659 return -1;
24661 r0 = *CONST_DOUBLE_REAL_VALUE (x);
24662 if (exact_real_inverse (DFmode, &r0)
24663 && !REAL_VALUE_NEGATIVE (r0))
24665 int ret = exact_log2 (real_to_integer (&r0));
24666 if (ret >= 1 && ret <= 32)
24667 return ret;
24669 return -1;
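/* For example (illustrative): 0.5 -> 1, 0.25 -> 2 and 0.125 -> 3, while
   1.0 (n == 0) and values below 2^-32 return -1.  */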
24672 /* If X is a vector of equal CONST_DOUBLE values and that value is
24673 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
24676 aarch64_vec_fpconst_pow_of_2 (rtx x)
24678 int nelts;
24679 if (GET_CODE (x) != CONST_VECTOR
24680 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
24681 return -1;
24683 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
24684 return -1;
24686 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
24687 if (firstval <= 0)
24688 return -1;
24690 for (int i = 1; i < nelts; i++)
24691 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
24692 return -1;
24694 return firstval;
24697 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
24698 to float.
24700 __fp16 always promotes through this hook.
24701 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
24702 through the generic excess precision logic rather than here. */
24704 static tree
24705 aarch64_promoted_type (const_tree t)
24707 if (SCALAR_FLOAT_TYPE_P (t)
24708 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
24709 return float_type_node;
24711 return NULL_TREE;
24714 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
24716 static bool
24717 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
24718 optimization_type opt_type)
24720 switch (op)
24722 case rsqrt_optab:
24723 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
24725 default:
24726 return true;
24730 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
24732 static unsigned int
24733 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
24734 int *offset)
24736 /* Polynomial invariant 1 == (VG / 2) - 1. */
24737 gcc_assert (i == 1);
24738 *factor = 2;
24739 *offset = 1;
24740 return AARCH64_DWARF_VG;
24743 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
24744 if MODE is HFmode, and punt to the generic implementation otherwise. */
24746 static bool
24747 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
24749 return (mode == HFmode
24750 ? true
24751 : default_libgcc_floating_mode_supported_p (mode));
24754 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
24755 if MODE is HFmode, and punt to the generic implementation otherwise. */
24757 static bool
24758 aarch64_scalar_mode_supported_p (scalar_mode mode)
24760 return (mode == HFmode
24761 ? true
24762 : default_scalar_mode_supported_p (mode));
24765 /* Set the value of FLT_EVAL_METHOD.
24766 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
24768 0: evaluate all operations and constants, whose semantic type has at
24769 most the range and precision of type float, to the range and
24770 precision of float; evaluate all other operations and constants to
24771 the range and precision of the semantic type;
24773 N, where _FloatN is a supported interchange floating type
24774 evaluate all operations and constants, whose semantic type has at
24775 most the range and precision of _FloatN type, to the range and
24776 precision of the _FloatN type; evaluate all other operations and
24777 constants to the range and precision of the semantic type;
24779 If we have the ARMv8.2-A extensions then we support _Float16 in native
24780 precision, so we should set this to 16. Otherwise, we support the type,
24781 but want to evaluate expressions in float precision, so set this to
24782 0. */
24784 static enum flt_eval_method
24785 aarch64_excess_precision (enum excess_precision_type type)
24787 switch (type)
24789 case EXCESS_PRECISION_TYPE_FAST:
24790 case EXCESS_PRECISION_TYPE_STANDARD:
24791 /* We can calculate either in 16-bit range and precision or
24792 32-bit range and precision. Make that decision based on whether
24793 we have native support for the ARMv8.2-A 16-bit floating-point
24794 instructions or not. */
24795 return (TARGET_FP_F16INST
24796 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
24797 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
24798 case EXCESS_PRECISION_TYPE_IMPLICIT:
24799 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
24800 default:
24801 gcc_unreachable ();
24803 return FLT_EVAL_METHOD_UNPREDICTABLE;
24806 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
24807 scheduled for speculative execution. Reject the long-running division
24808 and square-root instructions. */
24810 static bool
24811 aarch64_sched_can_speculate_insn (rtx_insn *insn)
24813 switch (get_attr_type (insn))
24815 case TYPE_SDIV:
24816 case TYPE_UDIV:
24817 case TYPE_FDIVS:
24818 case TYPE_FDIVD:
24819 case TYPE_FSQRTS:
24820 case TYPE_FSQRTD:
24821 case TYPE_NEON_FP_SQRT_S:
24822 case TYPE_NEON_FP_SQRT_D:
24823 case TYPE_NEON_FP_SQRT_S_Q:
24824 case TYPE_NEON_FP_SQRT_D_Q:
24825 case TYPE_NEON_FP_DIV_S:
24826 case TYPE_NEON_FP_DIV_D:
24827 case TYPE_NEON_FP_DIV_S_Q:
24828 case TYPE_NEON_FP_DIV_D_Q:
24829 return false;
24830 default:
24831 return true;
24835 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
24837 static int
24838 aarch64_compute_pressure_classes (reg_class *classes)
24840 int i = 0;
24841 classes[i++] = GENERAL_REGS;
24842 classes[i++] = FP_REGS;
24843 /* PR_REGS isn't a useful pressure class because many predicate pseudo
24844 registers need to go in PR_LO_REGS at some point during their
24845 lifetime. Splitting it into two halves has the effect of making
24846 all predicates count against PR_LO_REGS, so that we try whenever
24847 possible to restrict the number of live predicates to 8. This
24848 greatly reduces the amount of spilling in certain loops. */
24849 classes[i++] = PR_LO_REGS;
24850 classes[i++] = PR_HI_REGS;
24851 return i;
24854 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
24856 static bool
24857 aarch64_can_change_mode_class (machine_mode from,
24858 machine_mode to, reg_class_t)
24860 unsigned int from_flags = aarch64_classify_vector_mode (from);
24861 unsigned int to_flags = aarch64_classify_vector_mode (to);
24863 bool from_sve_p = (from_flags & VEC_ANY_SVE);
24864 bool to_sve_p = (to_flags & VEC_ANY_SVE);
24866 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
24867 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
24869 bool from_pred_p = (from_flags & VEC_SVE_PRED);
24870 bool to_pred_p = (to_flags & VEC_SVE_PRED);
24872 /* Don't allow changes between predicate modes and other modes.
24873 Only predicate registers can hold predicate modes and only
24874 non-predicate registers can hold non-predicate modes, so any
24875 attempt to mix them would require a round trip through memory. */
24876 if (from_pred_p != to_pred_p)
24877 return false;
24879 /* Don't allow changes between partial SVE modes and other modes.
24880 The contents of partial SVE modes are distributed evenly across
24881 the register, whereas GCC expects them to be clustered together. */
24882 if (from_partial_sve_p != to_partial_sve_p)
24883 return false;
24885 /* Similarly reject changes between partial SVE modes that have
24886 different patterns of significant and insignificant bits. */
24887 if (from_partial_sve_p
24888 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
24889 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
24890 return false;
24892 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
24894 /* Don't allow changes between SVE modes and other modes that might
24895 be bigger than 128 bits. In particular, OImode, CImode and XImode
24896 divide into 128-bit quantities while SVE modes divide into
24897 BITS_PER_SVE_VECTOR quantities. */
24898 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
24899 return false;
24900 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
24901 return false;
24904 if (BYTES_BIG_ENDIAN)
24906 /* Don't allow changes between SVE data modes and non-SVE modes.
24907 See the comment at the head of aarch64-sve.md for details. */
24908 if (from_sve_p != to_sve_p)
24909 return false;
24911 /* Don't allow changes in element size: lane 0 of the new vector
24912 would not then be lane 0 of the old vector. See the comment
24913 above aarch64_maybe_expand_sve_subreg_move for a more detailed
24914 description.
24916 In the worst case, this forces a register to be spilled in
24917 one mode and reloaded in the other, which handles the
24918 endianness correctly. */
24919 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
24920 return false;
24922 return true;
24925 /* Implement TARGET_EARLY_REMAT_MODES. */
24927 static void
24928 aarch64_select_early_remat_modes (sbitmap modes)
24930 /* SVE values are not normally live across a call, so it should be
24931 worth doing early rematerialization even in VL-specific mode. */
24932 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
24933 if (aarch64_sve_mode_p ((machine_mode) i))
24934 bitmap_set_bit (modes, i);
24937 /* Override the default target speculation_safe_value. */
24938 static rtx
24939 aarch64_speculation_safe_value (machine_mode mode,
24940 rtx result, rtx val, rtx failval)
24942 /* Maybe we should warn if falling back to hard barriers. They are
24943 likely to be noticeably more expensive than the alternative below. */
24944 if (!aarch64_track_speculation)
24945 return default_speculation_safe_value (mode, result, val, failval);
24947 if (!REG_P (val))
24948 val = copy_to_mode_reg (mode, val);
24950 if (!aarch64_reg_or_zero (failval, mode))
24951 failval = copy_to_mode_reg (mode, failval);
24953 emit_insn (gen_despeculate_copy (mode, result, val, failval));
24954 return result;
24957 /* Implement TARGET_ESTIMATED_POLY_VALUE.
24958 Look into the tuning structure for an estimate.
24959 KIND specifies the type of requested estimate: min, max or likely.
24960 For cores with a known SVE width all three estimates are the same.
24961 For generic SVE tuning we want to distinguish the maximum estimate from
24962 the minimum and likely ones.
24963 The likely estimate is the same as the minimum in that case, giving the
24964 conservative behavior of auto-vectorizing with SVE only when it is a win
24965 even for 128-bit SVE.
24966 When SVE width information is available VAL.coeffs[1] is multiplied by
24967 the number of VQ chunks over the initial Advanced SIMD 128 bits. */
24969 static HOST_WIDE_INT
24970 aarch64_estimated_poly_value (poly_int64 val,
24971 poly_value_estimate_kind kind
24972 = POLY_VALUE_LIKELY)
24974 enum aarch64_sve_vector_bits_enum width_source
24975 = aarch64_tune_params.sve_width;
24977 /* If there is no core-specific information then the minimum and likely
24978 values are based on 128-bit vectors and the maximum is based on
24979 the architectural maximum of 2048 bits. */
24980 if (width_source == SVE_SCALABLE)
24981 switch (kind)
24983 case POLY_VALUE_MIN:
24984 case POLY_VALUE_LIKELY:
24985 return val.coeffs[0];
24986 case POLY_VALUE_MAX:
24987 return val.coeffs[0] + val.coeffs[1] * 15;
24990 /* If the core provides width information, use that. */
24991 HOST_WIDE_INT over_128 = width_source - 128;
24992 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
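/* Worked example (illustrative): the number of bytes in an SVE vector is
   the poly_int 16 + 16x.  With generic tuning (SVE_SCALABLE) the minimum
   and likely estimates are 16 bytes and the maximum is 16 + 16 * 15 == 256
   bytes (2048 bits).  For a core tuned with sve_width == 256 the estimate
   is 16 + 16 * 128 / 128 == 32 bytes.  */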
24996 /* Return true for types that could be supported as SIMD return or
24997 argument types. */
24999 static bool
25000 supported_simd_type (tree t)
25002 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
25004 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
25005 return s == 1 || s == 2 || s == 4 || s == 8;
25007 return false;
25010 /* Return true for types that currently are supported as SIMD return
25011 or argument types. */
25013 static bool
25014 currently_supported_simd_type (tree t, tree b)
25016 if (COMPLEX_FLOAT_TYPE_P (t))
25017 return false;
25019 if (TYPE_SIZE (t) != TYPE_SIZE (b))
25020 return false;
25022 return supported_simd_type (t);
25025 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
25027 static int
25028 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
25029 struct cgraph_simd_clone *clonei,
25030 tree base_type, int num)
25032 tree t, ret_type;
25033 unsigned int elt_bits, count;
25034 unsigned HOST_WIDE_INT const_simdlen;
25035 poly_uint64 vec_bits;
25037 if (!TARGET_SIMD)
25038 return 0;
25040 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
25041 constant simdlens here. */
25042 if (maybe_ne (clonei->simdlen, 0U)
25043 && clonei->simdlen.is_constant (&const_simdlen)
25044 && (const_simdlen < 2
25045 || const_simdlen > 1024
25046 || (const_simdlen & (const_simdlen - 1)) != 0))
25048 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25049 "unsupported simdlen %wd", const_simdlen);
25050 return 0;
25053 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
25054 if (TREE_CODE (ret_type) != VOID_TYPE
25055 && !currently_supported_simd_type (ret_type, base_type))
25057 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
25058 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25059 "GCC does not currently support mixed size types "
25060 "for %<simd%> functions");
25061 else if (supported_simd_type (ret_type))
25062 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25063 "GCC does not currently support return type %qT "
25064 "for %<simd%> functions", ret_type);
25065 else
25066 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25067 "unsupported return type %qT for %<simd%> functions",
25068 ret_type);
25069 return 0;
25072 int i;
25073 tree type_arg_types = TYPE_ARG_TYPES (TREE_TYPE (node->decl));
25074 bool decl_arg_p = (node->definition || type_arg_types == NULL_TREE);
25076 for (t = (decl_arg_p ? DECL_ARGUMENTS (node->decl) : type_arg_types), i = 0;
25077 t && t != void_list_node; t = TREE_CHAIN (t), i++)
25079 tree arg_type = decl_arg_p ? TREE_TYPE (t) : TREE_VALUE (t);
25081 if (clonei->args[i].arg_type != SIMD_CLONE_ARG_TYPE_UNIFORM
25082 && !currently_supported_simd_type (arg_type, base_type))
25084 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
25085 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25086 "GCC does not currently support mixed size types "
25087 "for %<simd%> functions");
25088 else
25089 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25090 "GCC does not currently support argument type %qT "
25091 "for %<simd%> functions", arg_type);
25092 return 0;
25096 clonei->vecsize_mangle = 'n';
25097 clonei->mask_mode = VOIDmode;
25098 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
25099 if (known_eq (clonei->simdlen, 0U))
25101 count = 2;
25102 vec_bits = (num == 0 ? 64 : 128);
25103 clonei->simdlen = exact_div (vec_bits, elt_bits);
25105 else
25107 count = 1;
25108 vec_bits = clonei->simdlen * elt_bits;
25109 /* For now, SVE simdclones won't produce an illegal simdlen, so only check
25110 constant simdlens here. */
25111 if (clonei->simdlen.is_constant (&const_simdlen)
25112 && maybe_ne (vec_bits, 64U) && maybe_ne (vec_bits, 128U))
25114 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
25115 "GCC does not currently support simdlen %wd for type %qT",
25116 const_simdlen, base_type);
25117 return 0;
25120 clonei->vecsize_int = vec_bits;
25121 clonei->vecsize_float = vec_bits;
25122 return count;
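/* For example (illustrative), a "simd" function with a 32-bit base type and
   no explicit simdlen gets two Advanced SIMD clones from the hook above:
   one with simdlen 2 (64-bit vectors) and one with simdlen 4 (128-bit
   vectors).  */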
25125 /* Implement TARGET_SIMD_CLONE_ADJUST. */
25127 static void
25128 aarch64_simd_clone_adjust (struct cgraph_node *node)
25130 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
25131 use the correct ABI. */
25133 tree t = TREE_TYPE (node->decl);
25134 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
25135 TYPE_ATTRIBUTES (t));
25138 /* Implement TARGET_SIMD_CLONE_USABLE. */
25140 static int
25141 aarch64_simd_clone_usable (struct cgraph_node *node)
25143 switch (node->simdclone->vecsize_mangle)
25145 case 'n':
25146 if (!TARGET_SIMD)
25147 return -1;
25148 return 0;
25149 default:
25150 gcc_unreachable ();
25154 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
25156 static int
25157 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
25159 auto check_attr = [&](const char *name) {
25160 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
25161 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
25162 if (!attr1 && !attr2)
25163 return true;
25165 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
25168 if (!check_attr ("aarch64_vector_pcs"))
25169 return 0;
25170 if (!check_attr ("Advanced SIMD type"))
25171 return 0;
25172 if (!check_attr ("SVE type"))
25173 return 0;
25174 if (!check_attr ("SVE sizeless type"))
25175 return 0;
25176 return 1;
25179 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
25181 static const char *
25182 aarch64_get_multilib_abi_name (void)
25184 if (TARGET_BIG_END)
25185 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
25186 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
25189 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
25190 global variable based guard use the default else
25191 return a null tree. */
25192 static tree
25193 aarch64_stack_protect_guard (void)
25195 if (aarch64_stack_protector_guard == SSP_GLOBAL)
25196 return default_stack_protect_guard ();
25198 return NULL_TREE;
25201 /* Return the diagnostic message string if conversion from FROMTYPE to
25202 TOTYPE is not allowed, NULL otherwise. */
25204 static const char *
25205 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
25207 if (element_mode (fromtype) != element_mode (totype))
25209 /* Do not allow conversions to/from BFmode scalar types. */
25210 if (TYPE_MODE (fromtype) == BFmode)
25211 return N_("invalid conversion from type %<bfloat16_t%>");
25212 if (TYPE_MODE (totype) == BFmode)
25213 return N_("invalid conversion to type %<bfloat16_t%>");
25216 /* Conversion allowed. */
25217 return NULL;
25220 /* Return the diagnostic message string if the unary operation OP is
25221 not permitted on TYPE, NULL otherwise. */
25223 static const char *
25224 aarch64_invalid_unary_op (int op, const_tree type)
25226 /* Reject all single-operand operations on BFmode except for &. */
25227 if (element_mode (type) == BFmode && op != ADDR_EXPR)
25228 return N_("operation not permitted on type %<bfloat16_t%>");
25230 /* Operation allowed. */
25231 return NULL;
25234 /* Return the diagnostic message string if the binary operation OP is
25235 not permitted on TYPE1 and TYPE2, NULL otherwise. */
25237 static const char *
25238 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
25239 const_tree type2)
25241 /* Reject all 2-operand operations on BFmode. */
25242 if (element_mode (type1) == BFmode
25243 || element_mode (type2) == BFmode)
25244 return N_("operation not permitted on type %<bfloat16_t%>");
25246 if (VECTOR_TYPE_P (type1)
25247 && VECTOR_TYPE_P (type2)
25248 && !TYPE_INDIVISIBLE_P (type1)
25249 && !TYPE_INDIVISIBLE_P (type2)
25250 && (aarch64_sve::builtin_type_p (type1)
25251 != aarch64_sve::builtin_type_p (type2)))
25252 return N_("cannot combine GNU and SVE vectors in a binary operation");
25254 /* Operation allowed. */
25255 return NULL;
25258 /* Implement TARGET_MEMTAG_CAN_TAG_ADDRESSES. Here we tell the rest of the
25259 compiler that we automatically ignore the top byte of our pointers, which
25260 allows using -fsanitize=hwaddress. */
25261 bool
25262 aarch64_can_tag_addresses ()
25264 return !TARGET_ILP32;
25267 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
25268 section at the end if needed. */
25269 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
25270 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
25271 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
25272 void
25273 aarch64_file_end_indicate_exec_stack ()
25275 file_end_indicate_exec_stack ();
25277 unsigned feature_1_and = 0;
25278 if (aarch64_bti_enabled ())
25279 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
25281 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
25282 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
25284 if (feature_1_and)
25286 /* Generate .note.gnu.property section. */
25287 switch_to_section (get_section (".note.gnu.property",
25288 SECTION_NOTYPE, NULL));
25290 /* PT_NOTE header: namesz, descsz, type.
25291 namesz = 4 ("GNU\0")
25292 descsz = 16 (Size of the program property array)
25293 [(12 + padding) * Number of array elements]
25294 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
25295 assemble_align (POINTER_SIZE);
25296 assemble_integer (GEN_INT (4), 4, 32, 1);
25297 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
25298 assemble_integer (GEN_INT (5), 4, 32, 1);
25300 /* PT_NOTE name. */
25301 assemble_string ("GNU", 4);
25303 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
25304 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
25305 datasz = 4
25306 data = feature_1_and. */
25307 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
25308 assemble_integer (GEN_INT (4), 4, 32, 1);
25309 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
25311 /* Pad the size of the note to the required alignment. */
25312 assemble_align (POINTER_SIZE);
25315 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
25316 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
25317 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
25319 /* Helper function for straight line speculation.
25320 Return what barrier should be emitted for straight line speculation
25321 mitigation.
25322 When not mitigating against straight line speculation this function returns
25323 an empty string.
25324 When mitigating against straight line speculation, use:
25325 * SB when the v8.5-A SB extension is enabled.
25326 * DSB+ISB otherwise. */
25327 const char *
25328 aarch64_sls_barrier (int mitigation_required)
25330 return mitigation_required
25331 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
25332 : "";
25335 static GTY (()) tree aarch64_sls_shared_thunks[30];
25336 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
25337 const char *indirect_symbol_names[30] = {
25338 "__call_indirect_x0",
25339 "__call_indirect_x1",
25340 "__call_indirect_x2",
25341 "__call_indirect_x3",
25342 "__call_indirect_x4",
25343 "__call_indirect_x5",
25344 "__call_indirect_x6",
25345 "__call_indirect_x7",
25346 "__call_indirect_x8",
25347 "__call_indirect_x9",
25348 "__call_indirect_x10",
25349 "__call_indirect_x11",
25350 "__call_indirect_x12",
25351 "__call_indirect_x13",
25352 "__call_indirect_x14",
25353 "__call_indirect_x15",
25354 "", /* "__call_indirect_x16", */
25355 "", /* "__call_indirect_x17", */
25356 "__call_indirect_x18",
25357 "__call_indirect_x19",
25358 "__call_indirect_x20",
25359 "__call_indirect_x21",
25360 "__call_indirect_x22",
25361 "__call_indirect_x23",
25362 "__call_indirect_x24",
25363 "__call_indirect_x25",
25364 "__call_indirect_x26",
25365 "__call_indirect_x27",
25366 "__call_indirect_x28",
25367 "__call_indirect_x29",
25370 /* Function to create a BLR thunk. This thunk is used to mitigate straight
25371 line speculation. Instead of a simple BLR that can be speculated past,
25372 we emit a BL to this thunk, and this thunk contains a BR to the relevant
25373 register. These thunks have the relevant speculation barriers put after
25374 their indirect branch so that speculation is blocked.
25376 We use such a thunk so the speculation barriers are kept off the
25377 architecturally executed path in order to reduce the performance overhead.
25379 When optimizing for size we use stubs shared by the linked object.
25380 When optimizing for performance we emit stubs for each function in the hope
25381 that the branch predictor can better train on jumps specific for a given
25382 function. */
25383 static rtx
25384 aarch64_sls_create_blr_label (int regnum)
25385 {
25386 gcc_assert (STUB_REGNUM_P (regnum));
25387 if (optimize_function_for_size_p (cfun))
25388 {
25389 /* For the thunks shared between different functions in this compilation
25390 unit we use a named symbol -- this is just for users to more easily
25391 understand the generated assembly. */
25392 aarch64_sls_shared_thunks_needed = true;
25393 const char *thunk_name = indirect_symbol_names[regnum];
25394 if (aarch64_sls_shared_thunks[regnum] == NULL)
25395 {
25396 /* Build a decl representing this function stub and record it for
25397 later. We build a decl here so we can use the GCC machinery for
25398 handling sections automatically (through `get_named_section` and
25399 `make_decl_one_only`). That saves us a lot of trouble handling
25400 the specifics of different output file formats. */
25401 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
25402 get_identifier (thunk_name),
25403 build_function_type_list (void_type_node,
25404 NULL_TREE));
25405 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
25406 NULL_TREE, void_type_node);
25407 TREE_PUBLIC (decl) = 1;
25408 TREE_STATIC (decl) = 1;
25409 DECL_IGNORED_P (decl) = 1;
25410 DECL_ARTIFICIAL (decl) = 1;
25411 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
25412 resolve_unique_section (decl, 0, false);
25413 aarch64_sls_shared_thunks[regnum] = decl;
25414 }
25416 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
25417 }
25419 if (cfun->machine->call_via[regnum] == NULL)
25420 cfun->machine->call_via[regnum]
25421 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
25422 return cfun->machine->call_via[regnum];
25423 }
25425 /* Helper function for aarch64_sls_emit_blr_function_thunks and
25426 aarch64_sls_emit_shared_blr_thunks below. */
25427 static void
25428 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
25429 {
25430 /* Save in x16 and branch to that function so this transformation does
25431 not prevent jumping to `BTI c` instructions. */
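/* (BTI permits an indirect branch through x16 or x17 to target a "BTI c"
   landing pad, so routing the branch through x16 keeps the stub compatible
   with callees built with branch protection.)  */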
25432 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
25433 asm_fprintf (out_file, "\tbr\tx16\n");
25434 }
25436 /* Emit all BLR stubs for this particular function.
25437 Here we emit all the BLR stubs needed for the current function. Since we
25438 emit these stubs in a consecutive block we know there will be no speculation
25439 gadgets between each stub, and hence we only emit a speculation barrier at
25440 the end of the stub sequences.
25442 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
25443 void
25444 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
25445 {
25446 if (! aarch64_harden_sls_blr_p ())
25447 return;
25449 bool any_functions_emitted = false;
25450 /* We must save and restore the current function section since this assembly
25451 is emitted at the end of the function. This means it can be emitted *just
25452 after* the cold section of a function. That cold part would be emitted in
25453 a different section. That switch would trigger a `.cfi_endproc` directive
25454 to be emitted in the original section and a `.cfi_startproc` directive to
25455 be emitted in the new section. Switching to the original section without
25456 restoring would mean that the `.cfi_endproc` emitted as a function ends
25457 would happen in a different section -- leaving an unmatched
25458 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
25459 in the standard text section. */
25460 section *save_text_section = in_section;
25461 switch_to_section (function_section (current_function_decl));
25462 for (int regnum = 0; regnum < 30; ++regnum)
25463 {
25464 rtx specu_label = cfun->machine->call_via[regnum];
25465 if (specu_label == NULL)
25466 continue;
25468 targetm.asm_out.print_operand (out_file, specu_label, 0);
25469 asm_fprintf (out_file, ":\n");
25470 aarch64_sls_emit_function_stub (out_file, regnum);
25471 any_functions_emitted = true;
25472 }
25473 if (any_functions_emitted)
25474 /* Can use the SB if need be here, since this stub will only be used
25475 by the current function, and hence for the current target. */
25476 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
25477 switch_to_section (save_text_section);
25478 }
25480 /* Emit shared BLR stubs for the current compilation unit.
25481 Over the course of compiling this unit we may have converted some BLR
25482 instructions to a BL to a shared stub function. This is where we emit those
25483 stub functions.
25484 This function is for the stubs shared between different functions in this
25485 compilation unit. We share when optimizing for size instead of speed.
25487 This function is called through the TARGET_ASM_FILE_END hook. */
25488 void
25489 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
25490 {
25491 if (! aarch64_sls_shared_thunks_needed)
25492 return;
25494 for (int regnum = 0; regnum < 30; ++regnum)
25495 {
25496 tree decl = aarch64_sls_shared_thunks[regnum];
25497 if (!decl)
25498 continue;
25500 const char *name = indirect_symbol_names[regnum];
25501 switch_to_section (get_named_section (decl, NULL, 0));
25502 ASM_OUTPUT_ALIGN (out_file, 2);
25503 targetm.asm_out.globalize_label (out_file, name);
25504 /* Only emits if the compiler is configured for an assembler that can
25505 handle visibility directives. */
25506 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
25507 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
25508 ASM_OUTPUT_LABEL (out_file, name);
25509 aarch64_sls_emit_function_stub (out_file, regnum);
25510 /* Use the most conservative target to ensure it can always be used by any
25511 function in the translation unit. */
25512 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
25513 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
25514 }
25515 }
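/* For example, the shared stub emitted for calls through x1 looks roughly
   like this (placed in its own COMDAT section, made global and hidden as
   above):

   __call_indirect_x1:
	mov	x16, x1
	br	x16
	dsb	sy
	isb
*/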
25517 /* Implement TARGET_ASM_FILE_END. */
25518 void
25519 aarch64_asm_file_end ()
25520 {
25521 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
25522 /* Since this function will be called for the ASM_FILE_END hook, we ensure
25523 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
25524 for FreeBSD) still gets called. */
25525 #ifdef TARGET_ASM_FILE_END
25526 TARGET_ASM_FILE_END ();
25527 #endif
25528 }
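/* Output the assembly for an indirect call through register ADDR, as used by
   the call output templates in aarch64.md.  When mitigating SLS for BLR this
   emits a BL to the per-register stub created by aarch64_sls_create_blr_label
   above, so that the indirect branch is followed by a speculation barrier
   inside the stub; otherwise it emits a plain BLR.  */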
25530 const char *
25531 aarch64_indirect_call_asm (rtx addr)
25532 {
25533 gcc_assert (REG_P (addr));
25534 if (aarch64_harden_sls_blr_p ())
25535 {
25536 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
25537 output_asm_insn ("bl\t%0", &stub_label);
25538 }
25539 else
25540 output_asm_insn ("blr\t%0", &addr);
25541 return "";
25542 }
25544 /* Target-specific selftests. */
25546 #if CHECKING_P
25548 namespace selftest {
25550 /* Selftest for the RTL loader.
25551 Verify that the RTL loader copes with a dump from
25552 print_rtx_function. This is essentially just a test that class
25553 function_reader can handle a real dump, but it also verifies
25554 that lookup_reg_by_dump_name correctly handles hard regs.
25555 The presence of hard reg names in the dump means that the test is
25556 target-specific, hence it is in this file. */
25558 static void
25559 aarch64_test_loading_full_dump ()
25560 {
25561 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
25563 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
25565 rtx_insn *insn_1 = get_insn_by_uid (1);
25566 ASSERT_EQ (NOTE, GET_CODE (insn_1));
25568 rtx_insn *insn_15 = get_insn_by_uid (15);
25569 ASSERT_EQ (INSN, GET_CODE (insn_15));
25570 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
25572 /* Verify crtl->return_rtx. */
25573 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
25574 ASSERT_EQ (0, REGNO (crtl->return_rtx));
25575 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
25576 }
25578 /* Run all target-specific selftests. */
25580 static void
25581 aarch64_run_selftests (void)
25582 {
25583 aarch64_test_loading_full_dump ();
25584 }
25586 } // namespace selftest
25588 #endif /* #if CHECKING_P */
25590 #undef TARGET_STACK_PROTECT_GUARD
25591 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
25593 #undef TARGET_ADDRESS_COST
25594 #define TARGET_ADDRESS_COST aarch64_address_cost
25596 /* This hook determines whether unnamed bitfields affect the alignment
25597 of the containing structure. The hook returns true if the structure
25598 should inherit the alignment requirements of an unnamed bitfield's
25599 type. */
25600 #undef TARGET_ALIGN_ANON_BITFIELD
25601 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
25603 #undef TARGET_ASM_ALIGNED_DI_OP
25604 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
25606 #undef TARGET_ASM_ALIGNED_HI_OP
25607 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
25609 #undef TARGET_ASM_ALIGNED_SI_OP
25610 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
25612 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
25613 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
25614 hook_bool_const_tree_hwi_hwi_const_tree_true
25616 #undef TARGET_ASM_FILE_START
25617 #define TARGET_ASM_FILE_START aarch64_start_file
25619 #undef TARGET_ASM_OUTPUT_MI_THUNK
25620 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
25622 #undef TARGET_ASM_SELECT_RTX_SECTION
25623 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
25625 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
25626 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
25628 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
25629 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
25631 #undef TARGET_BUILD_BUILTIN_VA_LIST
25632 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
25634 #undef TARGET_CALLEE_COPIES
25635 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
25637 #undef TARGET_CAN_ELIMINATE
25638 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
25640 #undef TARGET_CAN_INLINE_P
25641 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
25643 #undef TARGET_CANNOT_FORCE_CONST_MEM
25644 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
25646 #undef TARGET_CASE_VALUES_THRESHOLD
25647 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
25649 #undef TARGET_CONDITIONAL_REGISTER_USAGE
25650 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
25652 #undef TARGET_MEMBER_TYPE_FORCES_BLK
25653 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
25655 /* Only the least significant bit is used for initialization guard
25656 variables. */
25657 #undef TARGET_CXX_GUARD_MASK_BIT
25658 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
25660 #undef TARGET_C_MODE_FOR_SUFFIX
25661 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
25663 #ifdef TARGET_BIG_ENDIAN_DEFAULT
25664 #undef TARGET_DEFAULT_TARGET_FLAGS
25665 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
25666 #endif
25668 #undef TARGET_CLASS_MAX_NREGS
25669 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
25671 #undef TARGET_BUILTIN_DECL
25672 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
25674 #undef TARGET_BUILTIN_RECIPROCAL
25675 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
25677 #undef TARGET_C_EXCESS_PRECISION
25678 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
25680 #undef TARGET_EXPAND_BUILTIN
25681 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
25683 #undef TARGET_EXPAND_BUILTIN_VA_START
25684 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
25686 #undef TARGET_FOLD_BUILTIN
25687 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
25689 #undef TARGET_FUNCTION_ARG
25690 #define TARGET_FUNCTION_ARG aarch64_function_arg
25692 #undef TARGET_FUNCTION_ARG_ADVANCE
25693 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
25695 #undef TARGET_FUNCTION_ARG_BOUNDARY
25696 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
25698 #undef TARGET_FUNCTION_ARG_PADDING
25699 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
25701 #undef TARGET_GET_RAW_RESULT_MODE
25702 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
25703 #undef TARGET_GET_RAW_ARG_MODE
25704 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
25706 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
25707 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
25709 #undef TARGET_FUNCTION_VALUE
25710 #define TARGET_FUNCTION_VALUE aarch64_function_value
25712 #undef TARGET_FUNCTION_VALUE_REGNO_P
25713 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
25715 #undef TARGET_GIMPLE_FOLD_BUILTIN
25716 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
25718 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
25719 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
25721 #undef TARGET_INIT_BUILTINS
25722 #define TARGET_INIT_BUILTINS aarch64_init_builtins
25724 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
25725 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
25726 aarch64_ira_change_pseudo_allocno_class
25728 #undef TARGET_LEGITIMATE_ADDRESS_P
25729 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
25731 #undef TARGET_LEGITIMATE_CONSTANT_P
25732 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
25734 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
25735 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
25736 aarch64_legitimize_address_displacement
25738 #undef TARGET_LIBGCC_CMP_RETURN_MODE
25739 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
25741 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
25742 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
25743 aarch64_libgcc_floating_mode_supported_p
25745 #undef TARGET_MANGLE_TYPE
25746 #define TARGET_MANGLE_TYPE aarch64_mangle_type
25748 #undef TARGET_INVALID_CONVERSION
25749 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
25751 #undef TARGET_INVALID_UNARY_OP
25752 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
25754 #undef TARGET_INVALID_BINARY_OP
25755 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
25757 #undef TARGET_VERIFY_TYPE_CONTEXT
25758 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
25760 #undef TARGET_MEMORY_MOVE_COST
25761 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
25763 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
25764 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
25766 #undef TARGET_MUST_PASS_IN_STACK
25767 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
25769 /* This target hook should return true if accesses to volatile bitfields
25770 should use the narrowest mode possible. It should return false if these
25771 accesses should use the bitfield container type. */
25772 #undef TARGET_NARROW_VOLATILE_BITFIELD
25773 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
25775 #undef TARGET_OPTION_OVERRIDE
25776 #define TARGET_OPTION_OVERRIDE aarch64_override_options
25778 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
25779 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
25780 aarch64_override_options_after_change
25782 #undef TARGET_OFFLOAD_OPTIONS
25783 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
25785 #undef TARGET_OPTION_SAVE
25786 #define TARGET_OPTION_SAVE aarch64_option_save
25788 #undef TARGET_OPTION_RESTORE
25789 #define TARGET_OPTION_RESTORE aarch64_option_restore
25791 #undef TARGET_OPTION_PRINT
25792 #define TARGET_OPTION_PRINT aarch64_option_print
25794 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
25795 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
25797 #undef TARGET_SET_CURRENT_FUNCTION
25798 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
25800 #undef TARGET_PASS_BY_REFERENCE
25801 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
25803 #undef TARGET_PREFERRED_RELOAD_CLASS
25804 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
25806 #undef TARGET_SCHED_REASSOCIATION_WIDTH
25807 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
25809 #undef TARGET_PROMOTED_TYPE
25810 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
25812 #undef TARGET_SECONDARY_RELOAD
25813 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
25815 #undef TARGET_SHIFT_TRUNCATION_MASK
25816 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
25818 #undef TARGET_SETUP_INCOMING_VARARGS
25819 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
25821 #undef TARGET_STRUCT_VALUE_RTX
25822 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
25824 #undef TARGET_REGISTER_MOVE_COST
25825 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
25827 #undef TARGET_RETURN_IN_MEMORY
25828 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
25830 #undef TARGET_RETURN_IN_MSB
25831 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
25833 #undef TARGET_RTX_COSTS
25834 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
25836 #undef TARGET_SCALAR_MODE_SUPPORTED_P
25837 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
25839 #undef TARGET_SCHED_ISSUE_RATE
25840 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
25842 #undef TARGET_SCHED_VARIABLE_ISSUE
25843 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
25845 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
25846 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
25847 aarch64_sched_first_cycle_multipass_dfa_lookahead
25849 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
25850 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
25851 aarch64_first_cycle_multipass_dfa_lookahead_guard
25853 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
25854 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
25855 aarch64_get_separate_components
25857 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
25858 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
25859 aarch64_components_for_bb
25861 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
25862 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
25863 aarch64_disqualify_components
25865 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
25866 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
25867 aarch64_emit_prologue_components
25869 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
25870 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
25871 aarch64_emit_epilogue_components
25873 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
25874 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
25875 aarch64_set_handled_components
25877 #undef TARGET_TRAMPOLINE_INIT
25878 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
25880 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
25881 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
25883 #undef TARGET_VECTOR_MODE_SUPPORTED_P
25884 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
25886 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
25887 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
25889 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
25890 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
25891 aarch64_builtin_support_vector_misalignment
25893 #undef TARGET_ARRAY_MODE
25894 #define TARGET_ARRAY_MODE aarch64_array_mode
25896 #undef TARGET_ARRAY_MODE_SUPPORTED_P
25897 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
25899 #undef TARGET_VECTORIZE_INIT_COST
25900 #define TARGET_VECTORIZE_INIT_COST aarch64_init_cost
25902 #undef TARGET_VECTORIZE_ADD_STMT_COST
25903 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
25905 #undef TARGET_VECTORIZE_FINISH_COST
25906 #define TARGET_VECTORIZE_FINISH_COST aarch64_finish_cost
25908 #undef TARGET_VECTORIZE_DESTROY_COST_DATA
25909 #define TARGET_VECTORIZE_DESTROY_COST_DATA aarch64_destroy_cost_data
25911 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
25912 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
25913 aarch64_builtin_vectorization_cost
25915 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
25916 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
25918 #undef TARGET_VECTORIZE_BUILTINS
25919 #define TARGET_VECTORIZE_BUILTINS
25921 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
25922 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
25923 aarch64_builtin_vectorized_function
25925 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
25926 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
25927 aarch64_autovectorize_vector_modes
25929 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
25930 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
25931 aarch64_atomic_assign_expand_fenv
25933 /* Section anchor support. */
25935 #undef TARGET_MIN_ANCHOR_OFFSET
25936 #define TARGET_MIN_ANCHOR_OFFSET -256
25938 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
25939 byte offset; we can do much more for larger data types, but have no way
25940 to determine the size of the access. We assume accesses are aligned. */
25941 #undef TARGET_MAX_ANCHOR_OFFSET
25942 #define TARGET_MAX_ANCHOR_OFFSET 4095
25944 #undef TARGET_VECTOR_ALIGNMENT
25945 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
25947 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
25948 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
25949 aarch64_vectorize_preferred_vector_alignment
25950 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
25951 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
25952 aarch64_simd_vector_alignment_reachable
25954 /* vec_perm support. */
25956 #undef TARGET_VECTORIZE_VEC_PERM_CONST
25957 #define TARGET_VECTORIZE_VEC_PERM_CONST \
25958 aarch64_vectorize_vec_perm_const
25960 #undef TARGET_VECTORIZE_RELATED_MODE
25961 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
25962 #undef TARGET_VECTORIZE_GET_MASK_MODE
25963 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
25964 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
25965 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
25966 aarch64_empty_mask_is_expensive
25967 #undef TARGET_PREFERRED_ELSE_VALUE
25968 #define TARGET_PREFERRED_ELSE_VALUE \
25969 aarch64_preferred_else_value
25971 #undef TARGET_INIT_LIBFUNCS
25972 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
25974 #undef TARGET_FIXED_CONDITION_CODE_REGS
25975 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
25977 #undef TARGET_FLAGS_REGNUM
25978 #define TARGET_FLAGS_REGNUM CC_REGNUM
25980 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
25981 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
25983 #undef TARGET_ASAN_SHADOW_OFFSET
25984 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
25986 #undef TARGET_LEGITIMIZE_ADDRESS
25987 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
25989 #undef TARGET_SCHED_CAN_SPECULATE_INSN
25990 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
25992 #undef TARGET_CAN_USE_DOLOOP_P
25993 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
25995 #undef TARGET_SCHED_ADJUST_PRIORITY
25996 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
25998 #undef TARGET_SCHED_MACRO_FUSION_P
25999 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
26001 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
26002 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
26004 #undef TARGET_SCHED_FUSION_PRIORITY
26005 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
26007 #undef TARGET_UNSPEC_MAY_TRAP_P
26008 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
26010 #undef TARGET_USE_PSEUDO_PIC_REG
26011 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
26013 #undef TARGET_PRINT_OPERAND
26014 #define TARGET_PRINT_OPERAND aarch64_print_operand
26016 #undef TARGET_PRINT_OPERAND_ADDRESS
26017 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
26019 #undef TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA
26020 #define TARGET_ASM_OUTPUT_ADDR_CONST_EXTRA aarch64_output_addr_const_extra
26022 #undef TARGET_OPTAB_SUPPORTED_P
26023 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
26025 #undef TARGET_OMIT_STRUCT_RETURN_REG
26026 #define TARGET_OMIT_STRUCT_RETURN_REG true
26028 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
26029 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
26030 aarch64_dwarf_poly_indeterminate_value
26032 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
26033 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
26034 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
26036 #undef TARGET_HARD_REGNO_NREGS
26037 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
26038 #undef TARGET_HARD_REGNO_MODE_OK
26039 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
26041 #undef TARGET_MODES_TIEABLE_P
26042 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
26044 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
26045 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
26046 aarch64_hard_regno_call_part_clobbered
26048 #undef TARGET_INSN_CALLEE_ABI
26049 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
26051 #undef TARGET_CONSTANT_ALIGNMENT
26052 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
26054 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
26055 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
26056 aarch64_stack_clash_protection_alloca_probe_range
26058 #undef TARGET_COMPUTE_PRESSURE_CLASSES
26059 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
26061 #undef TARGET_CAN_CHANGE_MODE_CLASS
26062 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
26064 #undef TARGET_SELECT_EARLY_REMAT_MODES
26065 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
26067 #undef TARGET_SPECULATION_SAFE_VALUE
26068 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
26070 #undef TARGET_ESTIMATED_POLY_VALUE
26071 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
26073 #undef TARGET_ATTRIBUTE_TABLE
26074 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
26076 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
26077 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
26078 aarch64_simd_clone_compute_vecsize_and_simdlen
26080 #undef TARGET_SIMD_CLONE_ADJUST
26081 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
26083 #undef TARGET_SIMD_CLONE_USABLE
26084 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
26086 #undef TARGET_COMP_TYPE_ATTRIBUTES
26087 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
26089 #undef TARGET_GET_MULTILIB_ABI_NAME
26090 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
26092 #undef TARGET_FNTYPE_ABI
26093 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
26095 #undef TARGET_MEMTAG_CAN_TAG_ADDRESSES
26096 #define TARGET_MEMTAG_CAN_TAG_ADDRESSES aarch64_can_tag_addresses
26098 #if CHECKING_P
26099 #undef TARGET_RUN_TARGET_SELFTESTS
26100 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
26101 #endif /* #if CHECKING_P */
26103 #undef TARGET_ASM_POST_CFI_STARTPROC
26104 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
26106 #undef TARGET_STRICT_ARGUMENT_NAMING
26107 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
26109 #undef TARGET_MD_ASM_ADJUST
26110 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
26112 #undef TARGET_ASM_FILE_END
26113 #define TARGET_ASM_FILE_END aarch64_asm_file_end
26115 #undef TARGET_ASM_FUNCTION_EPILOGUE
26116 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
26118 struct gcc_target targetm = TARGET_INITIALIZER;
26120 #include "gt-aarch64.h"