Define TARGET_OFFLOAD_OPTIONS for AArch64
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2020 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "cgraph.h"
44 #include "diagnostic.h"
45 #include "insn-attr.h"
46 #include "alias.h"
47 #include "fold-const.h"
48 #include "stor-layout.h"
49 #include "calls.h"
50 #include "varasm.h"
51 #include "output.h"
52 #include "flags.h"
53 #include "explow.h"
54 #include "expr.h"
55 #include "reload.h"
56 #include "langhooks.h"
57 #include "opts.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
74 #include "intl.h"
75 #include "expmed.h"
76 #include "function-abi.h"
78 /* This file should be included last. */
79 #include "target-def.h"
81 /* Defined for convenience. */
82 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
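/* As a worked example (assuming the default LP64 ABI): POINTER_SIZE is 64
   and BITS_PER_UNIT is 8, so POINTER_BYTES evaluates to 64 / 8 == 8;
   under -mabi=ilp32 it would instead be 32 / 8 == 4.  */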
84 /* Information about a legitimate vector immediate operand. */
85 struct simd_immediate_info
87 enum insn_type { MOV, MVN, INDEX, PTRUE };
88 enum modifier_type { LSL, MSL };
90 simd_immediate_info () {}
91 simd_immediate_info (scalar_float_mode, rtx);
92 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
93 insn_type = MOV, modifier_type = LSL,
94 unsigned int = 0);
95 simd_immediate_info (scalar_mode, rtx, rtx);
96 simd_immediate_info (scalar_int_mode, aarch64_svpattern);
98 /* The mode of the elements. */
99 scalar_mode elt_mode;
101 /* The instruction to use to move the immediate into a vector. */
102 insn_type insn;
104 union
106 /* For MOV and MVN. */
107 struct
109 /* The value of each element. */
110 rtx value;
112 /* The kind of shift modifier to use, and the number of bits to shift.
113 This is (LSL, 0) if no shift is needed. */
114 modifier_type modifier;
115 unsigned int shift;
116 } mov;
118 /* For INDEX. */
119 struct
121 /* The value of the first element and the step to be added for each
122 subsequent element. */
123 rtx base, step;
124 } index;
126 /* For PTRUE. */
127 aarch64_svpattern pattern;
128 } u;
131 /* Construct a floating-point immediate in which each element has mode
132 ELT_MODE_IN and value VALUE_IN. */
133 inline simd_immediate_info
134 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
135 : elt_mode (elt_mode_in), insn (MOV)
137 u.mov.value = value_in;
138 u.mov.modifier = LSL;
139 u.mov.shift = 0;
142 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
143 and value VALUE_IN. The other parameters are as for the structure
144 fields. */
145 inline simd_immediate_info
146 ::simd_immediate_info (scalar_int_mode elt_mode_in,
147 unsigned HOST_WIDE_INT value_in,
148 insn_type insn_in, modifier_type modifier_in,
149 unsigned int shift_in)
150 : elt_mode (elt_mode_in), insn (insn_in)
152 u.mov.value = gen_int_mode (value_in, elt_mode_in);
153 u.mov.modifier = modifier_in;
154 u.mov.shift = shift_in;
157 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
158 and where element I is equal to BASE_IN + I * STEP_IN. */
159 inline simd_immediate_info
160 ::simd_immediate_info (scalar_mode elt_mode_in, rtx base_in, rtx step_in)
161 : elt_mode (elt_mode_in), insn (INDEX)
163 u.index.base = base_in;
164 u.index.step = step_in;
167 /* Construct a predicate that controls elements of mode ELT_MODE_IN
168 and has PTRUE pattern PATTERN_IN. */
169 inline simd_immediate_info
170 ::simd_immediate_info (scalar_int_mode elt_mode_in,
171 aarch64_svpattern pattern_in)
172 : elt_mode (elt_mode_in), insn (PTRUE)
174 u.pattern = pattern_in;
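/* A minimal usage sketch of the constructors above (the particular
   immediates and variable names are illustrative, not taken from this
   file): a MOV-style immediate of 0x55 replicated across byte elements,
   and an SVE INDEX immediate whose elements are 0, 2, 4, ...  */
#if 0
simd_immediate_info byte_imm (QImode, 0x55);
simd_immediate_info index_imm (SImode, const0_rtx,
                               gen_int_mode (2, SImode));
#endif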
177 namespace {
179 /* Describes types that map to Pure Scalable Types (PSTs) in the AAPCS64. */
180 class pure_scalable_type_info
182 public:
183 /* Represents the result of analyzing a type. All values are nonzero,
184 in the possibly forlorn hope that accidental conversions to bool
185 trigger a warning. */
186 enum analysis_result
188 /* The type does not have an ABI identity; i.e. it doesn't contain
189 at least one object whose type is a Fundamental Data Type. */
190 NO_ABI_IDENTITY = 1,
192 /* The type is definitely a Pure Scalable Type. */
193 IS_PST,
195 /* The type is definitely not a Pure Scalable Type. */
196 ISNT_PST,
198 /* It doesn't matter for PCS purposes whether the type is a Pure
199 Scalable Type or not, since the type will be handled the same
200 way regardless.
202 Specifically, this means that if the type is a Pure Scalable Type,
203 there aren't enough argument registers to hold it, and so it will
204 need to be passed or returned in memory. If the type isn't a
205 Pure Scalable Type, it's too big to be passed or returned in core
206 or SIMD&FP registers, and so again will need to go in memory. */
207 DOESNT_MATTER
210 /* Aggregates of 17 bytes or more are normally passed and returned
211 in memory, so aggregates of that size can safely be analyzed as
212 DOESNT_MATTER. We need to be able to collect enough pieces to
213 represent a PST that is smaller than that. Since predicates are
214 2 bytes in size for -msve-vector-bits=128, that means we need to be
215 able to store at least 8 pieces.
217 We also need to be able to store enough pieces to represent
218 a single vector in each vector argument register and a single
219 predicate in each predicate argument register. This means that
220 we need at least 12 pieces. */
221 static const unsigned int MAX_PIECES = NUM_FP_ARG_REGS + NUM_PR_ARG_REGS;
222 #if __cplusplus >= 201103L
223 static_assert (MAX_PIECES >= 8, "Need to store at least 8 predicates");
224 #endif
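/* Working the arithmetic above through with the AAPCS64 argument-register
   counts (NUM_FP_ARG_REGS == 8, i.e. Z0-Z7, and NUM_PR_ARG_REGS == 4,
   i.e. P0-P3), MAX_PIECES is 8 + 4 == 12.  That satisfies both
   requirements: it is at least the 8 two-byte predicates that fit in a
   16-byte aggregate for -msve-vector-bits=128, and at least one piece per
   vector and predicate argument register.  */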
226 /* Describes one piece of a PST. Each piece is one of:
228 - a single Scalable Vector Type (SVT)
229 - a single Scalable Predicate Type (SPT)
230 - a PST containing 2, 3 or 4 SVTs, with no padding
232 It either represents a single built-in type or a PST formed from
233 multiple homogeneous built-in types. */
234 struct piece
236 rtx get_rtx (unsigned int, unsigned int) const;
238 /* The number of vector and predicate registers that the piece
239 occupies. One of the two is always zero. */
240 unsigned int num_zr;
241 unsigned int num_pr;
243 /* The mode of the registers described above. */
244 machine_mode mode;
246 /* If this piece is formed from multiple homogeneous built-in types,
247 this is the mode of the built-in types, otherwise it is MODE. */
248 machine_mode orig_mode;
250 /* The offset in bytes of the piece from the start of the type. */
251 poly_uint64_pod offset;
254 /* Divides types analyzed as IS_PST into individual pieces. The pieces
255 are in memory order. */
256 auto_vec<piece, MAX_PIECES> pieces;
258 unsigned int num_zr () const;
259 unsigned int num_pr () const;
261 rtx get_rtx (machine_mode mode, unsigned int, unsigned int) const;
263 analysis_result analyze (const_tree);
264 bool analyze_registers (const_tree);
266 private:
267 analysis_result analyze_array (const_tree);
268 analysis_result analyze_record (const_tree);
269 void add_piece (const piece &);
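/* A hypothetical example (the type and field names are illustrative, not
   from this file): for a GNU C aggregate such as

     struct pst_example { svfloat32_t x; svfloat32_t y; svbool_t pg; };

   analyze () should return IS_PST, with the two vectors contributing
   num_zr () == 2 and the predicate contributing num_pr () == 1, so the
   whole object can be passed in two Z argument registers and one P
   argument register.  */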
273 /* The current code model. */
274 enum aarch64_code_model aarch64_cmodel;
276 /* The number of 64-bit elements in an SVE vector. */
277 poly_uint16 aarch64_sve_vg;
279 #ifdef HAVE_AS_TLS
280 #undef TARGET_HAVE_TLS
281 #define TARGET_HAVE_TLS 1
282 #endif
284 static bool aarch64_composite_type_p (const_tree, machine_mode);
285 static bool aarch64_return_in_memory_1 (const_tree);
286 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
287 const_tree,
288 machine_mode *, int *,
289 bool *, bool);
290 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
291 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
292 static void aarch64_override_options_after_change (void);
293 static bool aarch64_vector_mode_supported_p (machine_mode);
294 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
295 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
296 const_tree type,
297 int misalignment,
298 bool is_packed);
299 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
300 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
301 aarch64_addr_query_type);
302 static HOST_WIDE_INT aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val);
304 /* Major revision number of the ARM Architecture implemented by the target. */
305 unsigned aarch64_architecture_version;
307 /* The processor for which instructions should be scheduled. */
308 enum aarch64_processor aarch64_tune = cortexa53;
310 /* Mask to specify which instruction scheduling options should be used. */
311 uint64_t aarch64_tune_flags = 0;
313 /* Global flag for PC relative loads. */
314 bool aarch64_pcrelative_literal_loads;
316 /* Global flag for whether frame pointer is enabled. */
317 bool aarch64_use_frame_pointer;
319 #define BRANCH_PROTECT_STR_MAX 255
320 char *accepted_branch_protection_string = NULL;
322 static enum aarch64_parse_opt_result
323 aarch64_parse_branch_protection (const char*, char**);
325 /* Support for command line parsing of boolean flags in the tuning
326 structures. */
327 struct aarch64_flag_desc
329 const char* name;
330 unsigned int flag;
333 #define AARCH64_FUSION_PAIR(name, internal_name) \
334 { name, AARCH64_FUSE_##internal_name },
335 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
337 { "none", AARCH64_FUSE_NOTHING },
338 #include "aarch64-fusion-pairs.def"
339 { "all", AARCH64_FUSE_ALL },
340 { NULL, AARCH64_FUSE_NOTHING }
343 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
344 { name, AARCH64_EXTRA_TUNE_##internal_name },
345 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
347 { "none", AARCH64_EXTRA_TUNE_NONE },
348 #include "aarch64-tuning-flags.def"
349 { "all", AARCH64_EXTRA_TUNE_ALL },
350 { NULL, AARCH64_EXTRA_TUNE_NONE }
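/* For illustration, assuming aarch64-fusion-pairs.def contains an entry
   such as AARCH64_FUSION_PAIR ("adrp+add", ADRP_ADD) (the authoritative
   spellings live in the .def files), the includes above expand to table
   rows of the form

     { "adrp+add", AARCH64_FUSE_ADRP_ADD },

   bracketed by the explicit "none"/"all"/NULL sentinel rows.  These names
   are what -moverride=fuse=... and -moverride=tune=... are matched
   against.  */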
353 /* Tuning parameters. */
355 static const struct cpu_addrcost_table generic_addrcost_table =
358 1, /* hi */
359 0, /* si */
360 0, /* di */
361 1, /* ti */
363 0, /* pre_modify */
364 0, /* post_modify */
365 0, /* register_offset */
366 0, /* register_sextend */
367 0, /* register_zextend */
368 0 /* imm_offset */
371 static const struct cpu_addrcost_table exynosm1_addrcost_table =
374 0, /* hi */
375 0, /* si */
376 0, /* di */
377 2, /* ti */
379 0, /* pre_modify */
380 0, /* post_modify */
381 1, /* register_offset */
382 1, /* register_sextend */
383 2, /* register_zextend */
384 0, /* imm_offset */
387 static const struct cpu_addrcost_table xgene1_addrcost_table =
390 1, /* hi */
391 0, /* si */
392 0, /* di */
393 1, /* ti */
395 1, /* pre_modify */
396 1, /* post_modify */
397 0, /* register_offset */
398 1, /* register_sextend */
399 1, /* register_zextend */
400 0, /* imm_offset */
403 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
406 1, /* hi */
407 1, /* si */
408 1, /* di */
409 2, /* ti */
411 0, /* pre_modify */
412 0, /* post_modify */
413 2, /* register_offset */
414 3, /* register_sextend */
415 3, /* register_zextend */
416 0, /* imm_offset */
419 static const struct cpu_addrcost_table thunderx3t110_addrcost_table =
422 1, /* hi */
423 1, /* si */
424 1, /* di */
425 2, /* ti */
427 0, /* pre_modify */
428 0, /* post_modify */
429 2, /* register_offset */
430 3, /* register_sextend */
431 3, /* register_zextend */
432 0, /* imm_offset */
435 static const struct cpu_addrcost_table tsv110_addrcost_table =
438 1, /* hi */
439 0, /* si */
440 0, /* di */
441 1, /* ti */
443 0, /* pre_modify */
444 0, /* post_modify */
445 0, /* register_offset */
446 1, /* register_sextend */
447 1, /* register_zextend */
448 0, /* imm_offset */
451 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
454 1, /* hi */
455 1, /* si */
456 1, /* di */
457 2, /* ti */
459 1, /* pre_modify */
460 1, /* post_modify */
461 3, /* register_offset */
462 3, /* register_sextend */
463 3, /* register_zextend */
464 2, /* imm_offset */
467 static const struct cpu_regmove_cost generic_regmove_cost =
469 1, /* GP2GP */
470 /* Avoid the use of slow int<->fp moves for spilling by setting
471 their cost higher than memmov_cost. */
472 5, /* GP2FP */
473 5, /* FP2GP */
474 2 /* FP2FP */
477 static const struct cpu_regmove_cost cortexa57_regmove_cost =
479 1, /* GP2GP */
480 /* Avoid the use of slow int<->fp moves for spilling by setting
481 their cost higher than memmov_cost. */
482 5, /* GP2FP */
483 5, /* FP2GP */
484 2 /* FP2FP */
487 static const struct cpu_regmove_cost cortexa53_regmove_cost =
489 1, /* GP2GP */
490 /* Avoid the use of slow int<->fp moves for spilling by setting
491 their cost higher than memmov_cost. */
492 5, /* GP2FP */
493 5, /* FP2GP */
494 2 /* FP2FP */
497 static const struct cpu_regmove_cost exynosm1_regmove_cost =
499 1, /* GP2GP */
500 /* Avoid the use of slow int<->fp moves for spilling by setting
501 their cost higher than memmov_cost (actual, 4 and 9). */
502 9, /* GP2FP */
503 9, /* FP2GP */
504 1 /* FP2FP */
507 static const struct cpu_regmove_cost thunderx_regmove_cost =
509 2, /* GP2GP */
510 2, /* GP2FP */
511 6, /* FP2GP */
512 4 /* FP2FP */
515 static const struct cpu_regmove_cost xgene1_regmove_cost =
517 1, /* GP2GP */
518 /* Avoid the use of slow int<->fp moves for spilling by setting
519 their cost higher than memmov_cost. */
520 8, /* GP2FP */
521 8, /* FP2GP */
522 2 /* FP2FP */
525 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
527 2, /* GP2GP */
528 /* Avoid the use of int<->fp moves for spilling. */
529 6, /* GP2FP */
530 6, /* FP2GP */
531 4 /* FP2FP */
534 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
536 1, /* GP2GP */
537 /* Avoid the use of int<->fp moves for spilling. */
538 5, /* GP2FP */
539 6, /* FP2GP */
540 3, /* FP2FP */
543 static const struct cpu_regmove_cost thunderx3t110_regmove_cost =
545 1, /* GP2GP */
546 /* Avoid the use of int<->fp moves for spilling. */
547 4, /* GP2FP */
548 5, /* FP2GP */
549 4 /* FP2FP */
552 static const struct cpu_regmove_cost tsv110_regmove_cost =
554 1, /* GP2GP */
555 /* Avoid the use of slow int<->fp moves for spilling by setting
556 their cost higher than memmov_cost. */
557 2, /* GP2FP */
558 3, /* FP2GP */
559 2 /* FP2FP */
562 /* Generic costs for vector insn classes. */
563 static const struct cpu_vector_cost generic_vector_cost =
565 1, /* scalar_int_stmt_cost */
566 1, /* scalar_fp_stmt_cost */
567 1, /* scalar_load_cost */
568 1, /* scalar_store_cost */
569 1, /* vec_int_stmt_cost */
570 1, /* vec_fp_stmt_cost */
571 2, /* vec_permute_cost */
572 2, /* vec_to_scalar_cost */
573 1, /* scalar_to_vec_cost */
574 1, /* vec_align_load_cost */
575 1, /* vec_unalign_load_cost */
576 1, /* vec_unalign_store_cost */
577 1, /* vec_store_cost */
578 3, /* cond_taken_branch_cost */
579 1 /* cond_not_taken_branch_cost */
582 /* QDF24XX costs for vector insn classes. */
583 static const struct cpu_vector_cost qdf24xx_vector_cost =
585 1, /* scalar_int_stmt_cost */
586 1, /* scalar_fp_stmt_cost */
587 1, /* scalar_load_cost */
588 1, /* scalar_store_cost */
589 1, /* vec_int_stmt_cost */
590 3, /* vec_fp_stmt_cost */
591 2, /* vec_permute_cost */
592 1, /* vec_to_scalar_cost */
593 1, /* scalar_to_vec_cost */
594 1, /* vec_align_load_cost */
595 1, /* vec_unalign_load_cost */
596 1, /* vec_unalign_store_cost */
597 1, /* vec_store_cost */
598 3, /* cond_taken_branch_cost */
599 1 /* cond_not_taken_branch_cost */
602 /* ThunderX costs for vector insn classes. */
603 static const struct cpu_vector_cost thunderx_vector_cost =
605 1, /* scalar_int_stmt_cost */
606 1, /* scalar_fp_stmt_cost */
607 3, /* scalar_load_cost */
608 1, /* scalar_store_cost */
609 4, /* vec_int_stmt_cost */
610 1, /* vec_fp_stmt_cost */
611 4, /* vec_permute_cost */
612 2, /* vec_to_scalar_cost */
613 2, /* scalar_to_vec_cost */
614 3, /* vec_align_load_cost */
615 5, /* vec_unalign_load_cost */
616 5, /* vec_unalign_store_cost */
617 1, /* vec_store_cost */
618 3, /* cond_taken_branch_cost */
619 3 /* cond_not_taken_branch_cost */
622 static const struct cpu_vector_cost tsv110_vector_cost =
624 1, /* scalar_int_stmt_cost */
625 1, /* scalar_fp_stmt_cost */
626 5, /* scalar_load_cost */
627 1, /* scalar_store_cost */
628 2, /* vec_int_stmt_cost */
629 2, /* vec_fp_stmt_cost */
630 2, /* vec_permute_cost */
631 3, /* vec_to_scalar_cost */
632 2, /* scalar_to_vec_cost */
633 5, /* vec_align_load_cost */
634 5, /* vec_unalign_load_cost */
635 1, /* vec_unalign_store_cost */
636 1, /* vec_store_cost */
637 1, /* cond_taken_branch_cost */
638 1 /* cond_not_taken_branch_cost */
641 /* Cortex-A57 costs for vector insn classes. */
642 static const struct cpu_vector_cost cortexa57_vector_cost =
644 1, /* scalar_int_stmt_cost */
645 1, /* scalar_fp_stmt_cost */
646 4, /* scalar_load_cost */
647 1, /* scalar_store_cost */
648 2, /* vec_int_stmt_cost */
649 2, /* vec_fp_stmt_cost */
650 3, /* vec_permute_cost */
651 8, /* vec_to_scalar_cost */
652 8, /* scalar_to_vec_cost */
653 4, /* vec_align_load_cost */
654 4, /* vec_unalign_load_cost */
655 1, /* vec_unalign_store_cost */
656 1, /* vec_store_cost */
657 1, /* cond_taken_branch_cost */
658 1 /* cond_not_taken_branch_cost */
661 static const struct cpu_vector_cost exynosm1_vector_cost =
663 1, /* scalar_int_stmt_cost */
664 1, /* scalar_fp_stmt_cost */
665 5, /* scalar_load_cost */
666 1, /* scalar_store_cost */
667 3, /* vec_int_stmt_cost */
668 3, /* vec_fp_stmt_cost */
669 3, /* vec_permute_cost */
670 3, /* vec_to_scalar_cost */
671 3, /* scalar_to_vec_cost */
672 5, /* vec_align_load_cost */
673 5, /* vec_unalign_load_cost */
674 1, /* vec_unalign_store_cost */
675 1, /* vec_store_cost */
676 1, /* cond_taken_branch_cost */
677 1 /* cond_not_taken_branch_cost */
680 /* X-Gene 1 costs for vector insn classes. */
681 static const struct cpu_vector_cost xgene1_vector_cost =
683 1, /* scalar_int_stmt_cost */
684 1, /* scalar_fp_stmt_cost */
685 5, /* scalar_load_cost */
686 1, /* scalar_store_cost */
687 2, /* vec_int_stmt_cost */
688 2, /* vec_fp_stmt_cost */
689 2, /* vec_permute_cost */
690 4, /* vec_to_scalar_cost */
691 4, /* scalar_to_vec_cost */
692 10, /* vec_align_load_cost */
693 10, /* vec_unalign_load_cost */
694 2, /* vec_unalign_store_cost */
695 2, /* vec_store_cost */
696 2, /* cond_taken_branch_cost */
697 1 /* cond_not_taken_branch_cost */
700 /* Costs for vector insn classes for Vulcan. */
701 static const struct cpu_vector_cost thunderx2t99_vector_cost =
703 1, /* scalar_int_stmt_cost */
704 6, /* scalar_fp_stmt_cost */
705 4, /* scalar_load_cost */
706 1, /* scalar_store_cost */
707 4, /* vec_int_stmt_cost */
708 5, /* vec_fp_stmt_cost */
709 10, /* vec_permute_cost */
710 6, /* vec_to_scalar_cost */
711 5, /* scalar_to_vec_cost */
712 4, /* vec_align_load_cost */
713 4, /* vec_unalign_load_cost */
714 1, /* vec_unalign_store_cost */
715 1, /* vec_store_cost */
716 2, /* cond_taken_branch_cost */
717 1 /* cond_not_taken_branch_cost */
720 static const struct cpu_vector_cost thunderx3t110_vector_cost =
722 1, /* scalar_int_stmt_cost */
723 5, /* scalar_fp_stmt_cost */
724 4, /* scalar_load_cost */
725 1, /* scalar_store_cost */
726 5, /* vec_int_stmt_cost */
727 5, /* vec_fp_stmt_cost */
728 10, /* vec_permute_cost */
729 5, /* vec_to_scalar_cost */
730 5, /* scalar_to_vec_cost */
731 4, /* vec_align_load_cost */
732 4, /* vec_unalign_load_cost */
733 4, /* vec_unalign_store_cost */
734 4, /* vec_store_cost */
735 2, /* cond_taken_branch_cost */
736 1 /* cond_not_taken_branch_cost */
740 /* Generic costs for branch instructions. */
741 static const struct cpu_branch_cost generic_branch_cost =
743 1, /* Predictable. */
744 3 /* Unpredictable. */
747 /* Generic approximation modes. */
748 static const cpu_approx_modes generic_approx_modes =
750 AARCH64_APPROX_NONE, /* division */
751 AARCH64_APPROX_NONE, /* sqrt */
752 AARCH64_APPROX_NONE /* recip_sqrt */
755 /* Approximation modes for Exynos M1. */
756 static const cpu_approx_modes exynosm1_approx_modes =
758 AARCH64_APPROX_NONE, /* division */
759 AARCH64_APPROX_ALL, /* sqrt */
760 AARCH64_APPROX_ALL /* recip_sqrt */
763 /* Approximation modes for X-Gene 1. */
764 static const cpu_approx_modes xgene1_approx_modes =
766 AARCH64_APPROX_NONE, /* division */
767 AARCH64_APPROX_NONE, /* sqrt */
768 AARCH64_APPROX_ALL /* recip_sqrt */
771 /* Generic prefetch settings (which disable prefetch). */
772 static const cpu_prefetch_tune generic_prefetch_tune =
774 0, /* num_slots */
775 -1, /* l1_cache_size */
776 -1, /* l1_cache_line_size */
777 -1, /* l2_cache_size */
778 true, /* prefetch_dynamic_strides */
779 -1, /* minimum_stride */
780 -1 /* default_opt_level */
783 static const cpu_prefetch_tune exynosm1_prefetch_tune =
785 0, /* num_slots */
786 -1, /* l1_cache_size */
787 64, /* l1_cache_line_size */
788 -1, /* l2_cache_size */
789 true, /* prefetch_dynamic_strides */
790 -1, /* minimum_stride */
791 -1 /* default_opt_level */
794 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
796 4, /* num_slots */
797 32, /* l1_cache_size */
798 64, /* l1_cache_line_size */
799 512, /* l2_cache_size */
800 false, /* prefetch_dynamic_strides */
801 2048, /* minimum_stride */
802 3 /* default_opt_level */
805 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
807 8, /* num_slots */
808 32, /* l1_cache_size */
809 128, /* l1_cache_line_size */
810 16*1024, /* l2_cache_size */
811 true, /* prefetch_dynamic_strides */
812 -1, /* minimum_stride */
813 3 /* default_opt_level */
816 static const cpu_prefetch_tune thunderx_prefetch_tune =
818 8, /* num_slots */
819 32, /* l1_cache_size */
820 128, /* l1_cache_line_size */
821 -1, /* l2_cache_size */
822 true, /* prefetch_dynamic_strides */
823 -1, /* minimum_stride */
824 -1 /* default_opt_level */
827 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
829 8, /* num_slots */
830 32, /* l1_cache_size */
831 64, /* l1_cache_line_size */
832 256, /* l2_cache_size */
833 true, /* prefetch_dynamic_strides */
834 -1, /* minimum_stride */
835 -1 /* default_opt_level */
838 static const cpu_prefetch_tune thunderx3t110_prefetch_tune =
840 8, /* num_slots */
841 32, /* l1_cache_size */
842 64, /* l1_cache_line_size */
843 256, /* l2_cache_size */
844 true, /* prefetch_dynamic_strides */
845 -1, /* minimum_stride */
846 -1 /* default_opt_level */
849 static const cpu_prefetch_tune tsv110_prefetch_tune =
851 0, /* num_slots */
852 64, /* l1_cache_size */
853 64, /* l1_cache_line_size */
854 512, /* l2_cache_size */
855 true, /* prefetch_dynamic_strides */
856 -1, /* minimum_stride */
857 -1 /* default_opt_level */
860 static const cpu_prefetch_tune xgene1_prefetch_tune =
862 8, /* num_slots */
863 32, /* l1_cache_size */
864 64, /* l1_cache_line_size */
865 256, /* l2_cache_size */
866 true, /* prefetch_dynamic_strides */
867 -1, /* minimum_stride */
868 -1 /* default_opt_level */
871 static const struct tune_params generic_tunings =
873 &cortexa57_extra_costs,
874 &generic_addrcost_table,
875 &generic_regmove_cost,
876 &generic_vector_cost,
877 &generic_branch_cost,
878 &generic_approx_modes,
879 SVE_NOT_IMPLEMENTED, /* sve_width */
880 4, /* memmov_cost */
881 2, /* issue_rate */
882 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
883 "16:12", /* function_align. */
884 "4", /* jump_align. */
885 "8", /* loop_align. */
886 2, /* int_reassoc_width. */
887 4, /* fp_reassoc_width. */
888 1, /* vec_reassoc_width. */
889 2, /* min_div_recip_mul_sf. */
890 2, /* min_div_recip_mul_df. */
891 0, /* max_case_values. */
892 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
893 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
894 &generic_prefetch_tune
897 static const struct tune_params cortexa35_tunings =
899 &cortexa53_extra_costs,
900 &generic_addrcost_table,
901 &cortexa53_regmove_cost,
902 &generic_vector_cost,
903 &generic_branch_cost,
904 &generic_approx_modes,
905 SVE_NOT_IMPLEMENTED, /* sve_width */
906 4, /* memmov_cost */
907 1, /* issue_rate */
908 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
909 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
910 "16", /* function_align. */
911 "4", /* jump_align. */
912 "8", /* loop_align. */
913 2, /* int_reassoc_width. */
914 4, /* fp_reassoc_width. */
915 1, /* vec_reassoc_width. */
916 2, /* min_div_recip_mul_sf. */
917 2, /* min_div_recip_mul_df. */
918 0, /* max_case_values. */
919 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
920 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
921 &generic_prefetch_tune
924 static const struct tune_params cortexa53_tunings =
926 &cortexa53_extra_costs,
927 &generic_addrcost_table,
928 &cortexa53_regmove_cost,
929 &generic_vector_cost,
930 &generic_branch_cost,
931 &generic_approx_modes,
932 SVE_NOT_IMPLEMENTED, /* sve_width */
933 4, /* memmov_cost */
934 2, /* issue_rate */
935 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
936 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
937 "16", /* function_align. */
938 "4", /* jump_align. */
939 "8", /* loop_align. */
940 2, /* int_reassoc_width. */
941 4, /* fp_reassoc_width. */
942 1, /* vec_reassoc_width. */
943 2, /* min_div_recip_mul_sf. */
944 2, /* min_div_recip_mul_df. */
945 0, /* max_case_values. */
946 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
947 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
948 &generic_prefetch_tune
951 static const struct tune_params cortexa57_tunings =
953 &cortexa57_extra_costs,
954 &generic_addrcost_table,
955 &cortexa57_regmove_cost,
956 &cortexa57_vector_cost,
957 &generic_branch_cost,
958 &generic_approx_modes,
959 SVE_NOT_IMPLEMENTED, /* sve_width */
960 4, /* memmov_cost */
961 3, /* issue_rate */
962 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
963 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
964 "16", /* function_align. */
965 "4", /* jump_align. */
966 "8", /* loop_align. */
967 2, /* int_reassoc_width. */
968 4, /* fp_reassoc_width. */
969 1, /* vec_reassoc_width. */
970 2, /* min_div_recip_mul_sf. */
971 2, /* min_div_recip_mul_df. */
972 0, /* max_case_values. */
973 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
974 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
975 &generic_prefetch_tune
978 static const struct tune_params cortexa72_tunings =
980 &cortexa57_extra_costs,
981 &generic_addrcost_table,
982 &cortexa57_regmove_cost,
983 &cortexa57_vector_cost,
984 &generic_branch_cost,
985 &generic_approx_modes,
986 SVE_NOT_IMPLEMENTED, /* sve_width */
987 4, /* memmov_cost */
988 3, /* issue_rate */
989 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
990 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
991 "16", /* function_align. */
992 "4", /* jump_align. */
993 "8", /* loop_align. */
994 2, /* int_reassoc_width. */
995 4, /* fp_reassoc_width. */
996 1, /* vec_reassoc_width. */
997 2, /* min_div_recip_mul_sf. */
998 2, /* min_div_recip_mul_df. */
999 0, /* max_case_values. */
1000 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1001 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1002 &generic_prefetch_tune
1005 static const struct tune_params cortexa73_tunings =
1007 &cortexa57_extra_costs,
1008 &generic_addrcost_table,
1009 &cortexa57_regmove_cost,
1010 &cortexa57_vector_cost,
1011 &generic_branch_cost,
1012 &generic_approx_modes,
1013 SVE_NOT_IMPLEMENTED, /* sve_width */
1014 4, /* memmov_cost. */
1015 2, /* issue_rate. */
1016 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1017 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
1018 "16", /* function_align. */
1019 "4", /* jump_align. */
1020 "8", /* loop_align. */
1021 2, /* int_reassoc_width. */
1022 4, /* fp_reassoc_width. */
1023 1, /* vec_reassoc_width. */
1024 2, /* min_div_recip_mul_sf. */
1025 2, /* min_div_recip_mul_df. */
1026 0, /* max_case_values. */
1027 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1028 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1029 &generic_prefetch_tune
1034 static const struct tune_params exynosm1_tunings =
1036 &exynosm1_extra_costs,
1037 &exynosm1_addrcost_table,
1038 &exynosm1_regmove_cost,
1039 &exynosm1_vector_cost,
1040 &generic_branch_cost,
1041 &exynosm1_approx_modes,
1042 SVE_NOT_IMPLEMENTED, /* sve_width */
1043 4, /* memmov_cost */
1044 3, /* issue_rate */
1045 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
1046 "4", /* function_align. */
1047 "4", /* jump_align. */
1048 "4", /* loop_align. */
1049 2, /* int_reassoc_width. */
1050 4, /* fp_reassoc_width. */
1051 1, /* vec_reassoc_width. */
1052 2, /* min_div_recip_mul_sf. */
1053 2, /* min_div_recip_mul_df. */
1054 48, /* max_case_values. */
1055 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1056 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1057 &exynosm1_prefetch_tune
1060 static const struct tune_params thunderxt88_tunings =
1062 &thunderx_extra_costs,
1063 &generic_addrcost_table,
1064 &thunderx_regmove_cost,
1065 &thunderx_vector_cost,
1066 &generic_branch_cost,
1067 &generic_approx_modes,
1068 SVE_NOT_IMPLEMENTED, /* sve_width */
1069 6, /* memmov_cost */
1070 2, /* issue_rate */
1071 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1072 "8", /* function_align. */
1073 "8", /* jump_align. */
1074 "8", /* loop_align. */
1075 2, /* int_reassoc_width. */
1076 4, /* fp_reassoc_width. */
1077 1, /* vec_reassoc_width. */
1078 2, /* min_div_recip_mul_sf. */
1079 2, /* min_div_recip_mul_df. */
1080 0, /* max_case_values. */
1081 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1082 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
1083 &thunderxt88_prefetch_tune
1086 static const struct tune_params thunderx_tunings =
1088 &thunderx_extra_costs,
1089 &generic_addrcost_table,
1090 &thunderx_regmove_cost,
1091 &thunderx_vector_cost,
1092 &generic_branch_cost,
1093 &generic_approx_modes,
1094 SVE_NOT_IMPLEMENTED, /* sve_width */
1095 6, /* memmov_cost */
1096 2, /* issue_rate */
1097 AARCH64_FUSE_ALU_BRANCH, /* fusible_ops */
1098 "8", /* function_align. */
1099 "8", /* jump_align. */
1100 "8", /* loop_align. */
1101 2, /* int_reassoc_width. */
1102 4, /* fp_reassoc_width. */
1103 1, /* vec_reassoc_width. */
1104 2, /* min_div_recip_mul_sf. */
1105 2, /* min_div_recip_mul_df. */
1106 0, /* max_case_values. */
1107 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1108 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
1109 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
1110 &thunderx_prefetch_tune
1113 static const struct tune_params tsv110_tunings =
1115 &tsv110_extra_costs,
1116 &tsv110_addrcost_table,
1117 &tsv110_regmove_cost,
1118 &tsv110_vector_cost,
1119 &generic_branch_cost,
1120 &generic_approx_modes,
1121 SVE_NOT_IMPLEMENTED, /* sve_width */
1122 4, /* memmov_cost */
1123 4, /* issue_rate */
1124 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_ALU_BRANCH
1125 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1126 "16", /* function_align. */
1127 "4", /* jump_align. */
1128 "8", /* loop_align. */
1129 2, /* int_reassoc_width. */
1130 4, /* fp_reassoc_width. */
1131 1, /* vec_reassoc_width. */
1132 2, /* min_div_recip_mul_sf. */
1133 2, /* min_div_recip_mul_df. */
1134 0, /* max_case_values. */
1135 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1136 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1137 &tsv110_prefetch_tune
1140 static const struct tune_params xgene1_tunings =
1142 &xgene1_extra_costs,
1143 &xgene1_addrcost_table,
1144 &xgene1_regmove_cost,
1145 &xgene1_vector_cost,
1146 &generic_branch_cost,
1147 &xgene1_approx_modes,
1148 SVE_NOT_IMPLEMENTED, /* sve_width */
1149 6, /* memmov_cost */
1150 4, /* issue_rate */
1151 AARCH64_FUSE_NOTHING, /* fusible_ops */
1152 "16", /* function_align. */
1153 "16", /* jump_align. */
1154 "16", /* loop_align. */
1155 2, /* int_reassoc_width. */
1156 4, /* fp_reassoc_width. */
1157 1, /* vec_reassoc_width. */
1158 2, /* min_div_recip_mul_sf. */
1159 2, /* min_div_recip_mul_df. */
1160 17, /* max_case_values. */
1161 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1162 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1163 &xgene1_prefetch_tune
1166 static const struct tune_params emag_tunings =
1168 &xgene1_extra_costs,
1169 &xgene1_addrcost_table,
1170 &xgene1_regmove_cost,
1171 &xgene1_vector_cost,
1172 &generic_branch_cost,
1173 &xgene1_approx_modes,
1174 SVE_NOT_IMPLEMENTED,
1175 6, /* memmov_cost */
1176 4, /* issue_rate */
1177 AARCH64_FUSE_NOTHING, /* fusible_ops */
1178 "16", /* function_align. */
1179 "16", /* jump_align. */
1180 "16", /* loop_align. */
1181 2, /* int_reassoc_width. */
1182 4, /* fp_reassoc_width. */
1183 1, /* vec_reassoc_width. */
1184 2, /* min_div_recip_mul_sf. */
1185 2, /* min_div_recip_mul_df. */
1186 17, /* max_case_values. */
1187 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
1188 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
1189 &xgene1_prefetch_tune
1192 static const struct tune_params qdf24xx_tunings =
1194 &qdf24xx_extra_costs,
1195 &qdf24xx_addrcost_table,
1196 &qdf24xx_regmove_cost,
1197 &qdf24xx_vector_cost,
1198 &generic_branch_cost,
1199 &generic_approx_modes,
1200 SVE_NOT_IMPLEMENTED, /* sve_width */
1201 4, /* memmov_cost */
1202 4, /* issue_rate */
1203 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1204 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1205 "16", /* function_align. */
1206 "8", /* jump_align. */
1207 "16", /* loop_align. */
1208 2, /* int_reassoc_width. */
1209 4, /* fp_reassoc_width. */
1210 1, /* vec_reassoc_width. */
1211 2, /* min_div_recip_mul_sf. */
1212 2, /* min_div_recip_mul_df. */
1213 0, /* max_case_values. */
1214 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1215 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
1216 &qdf24xx_prefetch_tune
1219 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
1220 for now. */
1221 static const struct tune_params saphira_tunings =
1223 &generic_extra_costs,
1224 &generic_addrcost_table,
1225 &generic_regmove_cost,
1226 &generic_vector_cost,
1227 &generic_branch_cost,
1228 &generic_approx_modes,
1229 SVE_NOT_IMPLEMENTED, /* sve_width */
1230 4, /* memmov_cost */
1231 4, /* issue_rate */
1232 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
1233 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
1234 "16", /* function_align. */
1235 "8", /* jump_align. */
1236 "16", /* loop_align. */
1237 2, /* int_reassoc_width. */
1238 4, /* fp_reassoc_width. */
1239 1, /* vec_reassoc_width. */
1240 2, /* min_div_recip_mul_sf. */
1241 2, /* min_div_recip_mul_df. */
1242 0, /* max_case_values. */
1243 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1244 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1245 &generic_prefetch_tune
1248 static const struct tune_params thunderx2t99_tunings =
1250 &thunderx2t99_extra_costs,
1251 &thunderx2t99_addrcost_table,
1252 &thunderx2t99_regmove_cost,
1253 &thunderx2t99_vector_cost,
1254 &generic_branch_cost,
1255 &generic_approx_modes,
1256 SVE_NOT_IMPLEMENTED, /* sve_width */
1257 4, /* memmov_cost. */
1258 4, /* issue_rate. */
1259 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1260 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1261 "16", /* function_align. */
1262 "8", /* jump_align. */
1263 "16", /* loop_align. */
1264 3, /* int_reassoc_width. */
1265 2, /* fp_reassoc_width. */
1266 2, /* vec_reassoc_width. */
1267 2, /* min_div_recip_mul_sf. */
1268 2, /* min_div_recip_mul_df. */
1269 0, /* max_case_values. */
1270 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1271 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1272 &thunderx2t99_prefetch_tune
1275 static const struct tune_params thunderx3t110_tunings =
1277 &thunderx3t110_extra_costs,
1278 &thunderx3t110_addrcost_table,
1279 &thunderx3t110_regmove_cost,
1280 &thunderx3t110_vector_cost,
1281 &generic_branch_cost,
1282 &generic_approx_modes,
1283 SVE_NOT_IMPLEMENTED, /* sve_width */
1284 4, /* memmov_cost. */
1285 6, /* issue_rate. */
1286 (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
1287 | AARCH64_FUSE_ALU_CBZ), /* fusible_ops */
1288 "16", /* function_align. */
1289 "8", /* jump_align. */
1290 "16", /* loop_align. */
1291 3, /* int_reassoc_width. */
1292 2, /* fp_reassoc_width. */
1293 2, /* vec_reassoc_width. */
1294 2, /* min_div_recip_mul_sf. */
1295 2, /* min_div_recip_mul_df. */
1296 0, /* max_case_values. */
1297 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1298 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1299 &thunderx3t110_prefetch_tune
1302 static const struct tune_params neoversen1_tunings =
1304 &cortexa57_extra_costs,
1305 &generic_addrcost_table,
1306 &generic_regmove_cost,
1307 &cortexa57_vector_cost,
1308 &generic_branch_cost,
1309 &generic_approx_modes,
1310 SVE_NOT_IMPLEMENTED, /* sve_width */
1311 4, /* memmov_cost */
1312 3, /* issue_rate */
1313 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH), /* fusible_ops */
1314 "32:16", /* function_align. */
1315 "4", /* jump_align. */
1316 "32:16", /* loop_align. */
1317 2, /* int_reassoc_width. */
1318 4, /* fp_reassoc_width. */
1319 2, /* vec_reassoc_width. */
1320 2, /* min_div_recip_mul_sf. */
1321 2, /* min_div_recip_mul_df. */
1322 0, /* max_case_values. */
1323 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1324 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1325 &generic_prefetch_tune
1328 /* Support for fine-grained override of the tuning structures. */
1329 struct aarch64_tuning_override_function
1331 const char* name;
1332 void (*parse_override)(const char*, struct tune_params*);
1335 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1336 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1337 static void aarch64_parse_sve_width_string (const char*, struct tune_params*);
1339 static const struct aarch64_tuning_override_function
1340 aarch64_tuning_override_functions[] =
1342 { "fuse", aarch64_parse_fuse_string },
1343 { "tune", aarch64_parse_tune_string },
1344 { "sve_width", aarch64_parse_sve_width_string },
1345 { NULL, NULL }
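/* For example (an illustrative command line; the accepted value spellings
   come from the .def files and the parsers declared above), a -moverride
   string is split on commas and each "name=value" pair is dispatched
   through this table:

     -moverride=sve_width=256,tune=...

   so "sve_width" reaches aarch64_parse_sve_width_string and "tune"
   reaches aarch64_parse_tune_string.  */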
1348 /* A processor implementing AArch64. */
1349 struct processor
1351 const char *const name;
1352 enum aarch64_processor ident;
1353 enum aarch64_processor sched_core;
1354 enum aarch64_arch arch;
1355 unsigned architecture_version;
1356 const uint64_t flags;
1357 const struct tune_params *const tune;
1360 /* Architectures implementing AArch64. */
1361 static const struct processor all_architectures[] =
1363 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1364 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1365 #include "aarch64-arches.def"
1366 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1369 /* Processor cores implementing AArch64. */
1370 static const struct processor all_cores[] =
1372 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1373 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1374 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1375 FLAGS, &COSTS##_tunings},
1376 #include "aarch64-cores.def"
1377 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1378 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1379 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1383 /* Target specification. These are populated by the -march, -mtune, -mcpu
1384 handling code or by target attributes. */
1385 static const struct processor *selected_arch;
1386 static const struct processor *selected_cpu;
1387 static const struct processor *selected_tune;
1389 enum aarch64_key_type aarch64_ra_sign_key = AARCH64_KEY_A;
1391 /* The current tuning set. */
1392 struct tune_params aarch64_tune_params = generic_tunings;
1394 /* Check whether an 'aarch64_vector_pcs' attribute is valid. */
1396 static tree
1397 handle_aarch64_vector_pcs_attribute (tree *node, tree name, tree,
1398 int, bool *no_add_attrs)
1400 /* Since we set fn_type_req to true, the caller should have checked
1401 this for us. */
1402 gcc_assert (FUNC_OR_METHOD_TYPE_P (*node));
1403 switch ((arm_pcs) fntype_abi (*node).id ())
1405 case ARM_PCS_AAPCS64:
1406 case ARM_PCS_SIMD:
1407 return NULL_TREE;
1409 case ARM_PCS_SVE:
1410 error ("the %qE attribute cannot be applied to an SVE function type",
1411 name);
1412 *no_add_attrs = true;
1413 return NULL_TREE;
1415 case ARM_PCS_TLSDESC:
1416 case ARM_PCS_UNKNOWN:
1417 break;
1419 gcc_unreachable ();
1422 /* Table of machine attributes. */
1423 static const struct attribute_spec aarch64_attribute_table[] =
1425 /* { name, min_len, max_len, decl_req, type_req, fn_type_req,
1426 affects_type_identity, handler, exclude } */
1427 { "aarch64_vector_pcs", 0, 0, false, true, true, true,
1428 handle_aarch64_vector_pcs_attribute, NULL },
1429 { "arm_sve_vector_bits", 1, 1, false, true, false, true,
1430 aarch64_sve::handle_arm_sve_vector_bits_attribute,
1431 NULL },
1432 { "Advanced SIMD type", 1, 1, false, true, false, true, NULL, NULL },
1433 { "SVE type", 3, 3, false, true, false, true, NULL, NULL },
1434 { "SVE sizeless type", 0, 0, false, true, false, true, NULL, NULL },
1435 { NULL, 0, 0, false, false, false, false, NULL, NULL }
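#if 0
/* A sketch of how the "arm_sve_vector_bits" attribute registered above is
   spelled in user code (purely illustrative; requires arm_sve.h and a
   matching -msve-vector-bits setting).  */
#include <arm_sve.h>
typedef svint32_t fixed_svint32_t __attribute__ ((arm_sve_vector_bits (256)));
#endif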
1438 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1440 /* An ISA extension in the co-processor and main instruction set space. */
1441 struct aarch64_option_extension
1443 const char *const name;
1444 const unsigned long flags_on;
1445 const unsigned long flags_off;
1448 typedef enum aarch64_cond_code
1450 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1451 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1452 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1454 aarch64_cc;
1456 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
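/* The enumeration above pairs each condition with its inverse in adjacent
   even/odd values, so flipping bit 0 inverts the condition.  For example,
   AARCH64_INVERSE_CONDITION_CODE (AARCH64_EQ) == AARCH64_NE (0 ^ 1 == 1)
   and AARCH64_INVERSE_CONDITION_CODE (AARCH64_GE) == AARCH64_LT
   (10 ^ 1 == 11).  */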
1458 struct aarch64_branch_protect_type
1460 /* The type's name that the user passes to the branch-protection option
1461 string. */
1462 const char* name;
1463 /* Function to handle the protection type and set global variables.
1464 First argument is the string token corresponding with this type and the
1465 second argument is the next token in the option string.
1466 Return values:
1467 * AARCH64_PARSE_OK: Handling was successful.
1468 * AARCH64_PARSE_INVALID_ARG: The type is invalid in this context and the caller
1469 should print an error.
1470 * AARCH64_PARSE_INVALID_FEATURE: The type is invalid and the handler prints its
1471 own error. */
1472 enum aarch64_parse_opt_result (*handler)(char*, char*);
1473 /* A list of types that can follow this type in the option string. */
1474 const aarch64_branch_protect_type* subtypes;
1475 unsigned int num_subtypes;
1478 static enum aarch64_parse_opt_result
1479 aarch64_handle_no_branch_protection (char* str, char* rest)
1481 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
1482 aarch64_enable_bti = 0;
1483 if (rest)
1485 error ("unexpected %<%s%> after %<%s%>", rest, str);
1486 return AARCH64_PARSE_INVALID_FEATURE;
1488 return AARCH64_PARSE_OK;
1491 static enum aarch64_parse_opt_result
1492 aarch64_handle_standard_branch_protection (char* str, char* rest)
1494 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1495 aarch64_ra_sign_key = AARCH64_KEY_A;
1496 aarch64_enable_bti = 1;
1497 if (rest)
1499 error ("unexpected %<%s%> after %<%s%>", rest, str);
1500 return AARCH64_PARSE_INVALID_FEATURE;
1502 return AARCH64_PARSE_OK;
1505 static enum aarch64_parse_opt_result
1506 aarch64_handle_pac_ret_protection (char* str ATTRIBUTE_UNUSED,
1507 char* rest ATTRIBUTE_UNUSED)
1509 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
1510 aarch64_ra_sign_key = AARCH64_KEY_A;
1511 return AARCH64_PARSE_OK;
1514 static enum aarch64_parse_opt_result
1515 aarch64_handle_pac_ret_leaf (char* str ATTRIBUTE_UNUSED,
1516 char* rest ATTRIBUTE_UNUSED)
1518 aarch64_ra_sign_scope = AARCH64_FUNCTION_ALL;
1519 return AARCH64_PARSE_OK;
1522 static enum aarch64_parse_opt_result
1523 aarch64_handle_pac_ret_b_key (char* str ATTRIBUTE_UNUSED,
1524 char* rest ATTRIBUTE_UNUSED)
1526 aarch64_ra_sign_key = AARCH64_KEY_B;
1527 return AARCH64_PARSE_OK;
1530 static enum aarch64_parse_opt_result
1531 aarch64_handle_bti_protection (char* str ATTRIBUTE_UNUSED,
1532 char* rest ATTRIBUTE_UNUSED)
1534 aarch64_enable_bti = 1;
1535 return AARCH64_PARSE_OK;
1538 static const struct aarch64_branch_protect_type aarch64_pac_ret_subtypes[] = {
1539 { "leaf", aarch64_handle_pac_ret_leaf, NULL, 0 },
1540 { "b-key", aarch64_handle_pac_ret_b_key, NULL, 0 },
1541 { NULL, NULL, NULL, 0 }
1544 static const struct aarch64_branch_protect_type aarch64_branch_protect_types[] = {
1545 { "none", aarch64_handle_no_branch_protection, NULL, 0 },
1546 { "standard", aarch64_handle_standard_branch_protection, NULL, 0 },
1547 { "pac-ret", aarch64_handle_pac_ret_protection, aarch64_pac_ret_subtypes,
1548 ARRAY_SIZE (aarch64_pac_ret_subtypes) },
1549 { "bti", aarch64_handle_bti_protection, NULL, 0 },
1550 { NULL, NULL, NULL, 0 }
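/* Putting the handlers and tables above together, an (illustrative)
   option such as

     -mbranch-protection=pac-ret+leaf+b-key

   first runs aarch64_handle_pac_ret_protection (non-leaf scope, A key),
   then the "leaf" subtype handler (widen to all functions), then the
   "b-key" subtype handler (switch to the B key), while
   -mbranch-protection=standard enables non-leaf pac-ret with the A key
   plus BTI in one step.  */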
1553 /* The condition codes of the processor, and the inverse function. */
1554 static const char * const aarch64_condition_codes[] =
1556 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1557 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1560 /* The preferred condition codes for SVE conditions. */
1561 static const char *const aarch64_sve_condition_codes[] =
1563 "none", "any", "nlast", "last", "first", "nfrst", "vs", "vc",
1564 "pmore", "plast", "tcont", "tstop", "gt", "le", "al", "nv"
1567 /* Return the assembly token for svpattern value VALUE. */
1569 static const char *
1570 svpattern_token (enum aarch64_svpattern pattern)
1572 switch (pattern)
1574 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
1575 AARCH64_FOR_SVPATTERN (CASE)
1576 #undef CASE
1577 case AARCH64_NUM_SVPATTERNS:
1578 break;
1580 gcc_unreachable ();
1583 /* Return the location of a piece that is known to be passed or returned
1584 in registers. FIRST_ZR is the first unused vector argument register
1585 and FIRST_PR is the first unused predicate argument register. */
1588 pure_scalable_type_info::piece::get_rtx (unsigned int first_zr,
1589 unsigned int first_pr) const
1591 gcc_assert (VECTOR_MODE_P (mode)
1592 && first_zr + num_zr <= V0_REGNUM + NUM_FP_ARG_REGS
1593 && first_pr + num_pr <= P0_REGNUM + NUM_PR_ARG_REGS);
1595 if (num_zr > 0 && num_pr == 0)
1596 return gen_rtx_REG (mode, first_zr);
1598 if (num_zr == 0 && num_pr == 1)
1599 return gen_rtx_REG (mode, first_pr);
1601 gcc_unreachable ();
1604 /* Return the total number of vector registers required by the PST. */
1606 unsigned int
1607 pure_scalable_type_info::num_zr () const
1609 unsigned int res = 0;
1610 for (unsigned int i = 0; i < pieces.length (); ++i)
1611 res += pieces[i].num_zr;
1612 return res;
1615 /* Return the total number of predicate registers required by the PST. */
1617 unsigned int
1618 pure_scalable_type_info::num_pr () const
1620 unsigned int res = 0;
1621 for (unsigned int i = 0; i < pieces.length (); ++i)
1622 res += pieces[i].num_pr;
1623 return res;
1626 /* Return the location of a PST that is known to be passed or returned
1627 in registers. FIRST_ZR is the first unused vector argument register
1628 and FIRST_PR is the first unused predicate argument register. */
1631 pure_scalable_type_info::get_rtx (machine_mode mode,
1632 unsigned int first_zr,
1633 unsigned int first_pr) const
1635 /* Try to return a single REG if possible. This leads to better
1636 code generation; it isn't required for correctness. */
1637 if (mode == pieces[0].mode)
1639 gcc_assert (pieces.length () == 1);
1640 return pieces[0].get_rtx (first_zr, first_pr);
1643 /* Build up a PARALLEL that contains the individual pieces. */
1644 rtvec rtxes = rtvec_alloc (pieces.length ());
1645 for (unsigned int i = 0; i < pieces.length (); ++i)
1647 rtx reg = pieces[i].get_rtx (first_zr, first_pr);
1648 rtx offset = gen_int_mode (pieces[i].offset, Pmode);
1649 RTVEC_ELT (rtxes, i) = gen_rtx_EXPR_LIST (VOIDmode, reg, offset);
1650 first_zr += pieces[i].num_zr;
1651 first_pr += pieces[i].num_pr;
1653 return gen_rtx_PARALLEL (mode, rtxes);
1656 /* Analyze whether TYPE is a Pure Scalable Type according to the rules
1657 in the AAPCS64. */
1659 pure_scalable_type_info::analysis_result
1660 pure_scalable_type_info::analyze (const_tree type)
1662 /* Prevent accidental reuse. */
1663 gcc_assert (pieces.is_empty ());
1665 /* No code will be generated for erroneous types, so we won't establish
1666 an ABI mapping. */
1667 if (type == error_mark_node)
1668 return NO_ABI_IDENTITY;
1670 /* Zero-sized types disappear in the language->ABI mapping. */
1671 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1672 return NO_ABI_IDENTITY;
1674 /* Check for SVTs, SPTs, and built-in tuple types that map to PSTs. */
1675 piece p = {};
1676 if (aarch64_sve::builtin_type_p (type, &p.num_zr, &p.num_pr))
1678 machine_mode mode = TYPE_MODE_RAW (type);
1679 gcc_assert (VECTOR_MODE_P (mode)
1680 && (!TARGET_SVE || aarch64_sve_mode_p (mode)));
1682 p.mode = p.orig_mode = mode;
1683 add_piece (p);
1684 return IS_PST;
1687 /* Check for user-defined PSTs. */
1688 if (TREE_CODE (type) == ARRAY_TYPE)
1689 return analyze_array (type);
1690 if (TREE_CODE (type) == RECORD_TYPE)
1691 return analyze_record (type);
1693 return ISNT_PST;
1696 /* Analyze a type that is known not to be passed or returned in memory.
1697 Return true if it has an ABI identity and is a Pure Scalable Type. */
1699 bool
1700 pure_scalable_type_info::analyze_registers (const_tree type)
1702 analysis_result result = analyze (type);
1703 gcc_assert (result != DOESNT_MATTER);
1704 return result == IS_PST;
1707 /* Subroutine of analyze for handling ARRAY_TYPEs. */
1709 pure_scalable_type_info::analysis_result
1710 pure_scalable_type_info::analyze_array (const_tree type)
1712 /* Analyze the element type. */
1713 pure_scalable_type_info element_info;
1714 analysis_result result = element_info.analyze (TREE_TYPE (type));
1715 if (result != IS_PST)
1716 return result;
1718 /* An array of unknown, flexible or variable length will be passed and
1719 returned by reference whatever we do. */
1720 tree nelts_minus_one = array_type_nelts (type);
1721 if (!tree_fits_uhwi_p (nelts_minus_one))
1722 return DOESNT_MATTER;
1724 /* Likewise if the array is constant-sized but too big to be interesting.
1725 The double checks against MAX_PIECES are to protect against overflow. */
1726 unsigned HOST_WIDE_INT count = tree_to_uhwi (nelts_minus_one);
1727 if (count > MAX_PIECES)
1728 return DOESNT_MATTER;
1729 count += 1;
1730 if (count * element_info.pieces.length () > MAX_PIECES)
1731 return DOESNT_MATTER;
1733 /* The above checks should have weeded out elements of unknown size. */
1734 poly_uint64 element_bytes;
1735 if (!poly_int_tree_p (TYPE_SIZE_UNIT (TREE_TYPE (type)), &element_bytes))
1736 gcc_unreachable ();
1738 /* Build up the list of individual vectors and predicates. */
1739 gcc_assert (!element_info.pieces.is_empty ());
1740 for (unsigned int i = 0; i < count; ++i)
1741 for (unsigned int j = 0; j < element_info.pieces.length (); ++j)
1743 piece p = element_info.pieces[j];
1744 p.offset += i * element_bytes;
1745 add_piece (p);
1747 return IS_PST;
1750 /* Subroutine of analyze for handling RECORD_TYPEs. */
1752 pure_scalable_type_info::analysis_result
1753 pure_scalable_type_info::analyze_record (const_tree type)
1755 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1757 if (TREE_CODE (field) != FIELD_DECL)
1758 continue;
1760 /* Zero-sized fields disappear in the language->ABI mapping. */
1761 if (DECL_SIZE (field) && integer_zerop (DECL_SIZE (field)))
1762 continue;
1764 /* All fields with an ABI identity must be PSTs for the record as
1765 a whole to be a PST. If any individual field is too big to be
1766 interesting then the record is too. */
1767 pure_scalable_type_info field_info;
1768 analysis_result subresult = field_info.analyze (TREE_TYPE (field));
1769 if (subresult == NO_ABI_IDENTITY)
1770 continue;
1771 if (subresult != IS_PST)
1772 return subresult;
1774 /* Since all previous fields are PSTs, we ought to be able to track
1775 the field offset using poly_ints. */
1776 tree bitpos = bit_position (field);
1777 gcc_assert (poly_int_tree_p (bitpos));
1779 /* For the same reason, it shouldn't be possible to create a PST field
1780 whose offset isn't byte-aligned. */
1781 poly_widest_int wide_bytepos = exact_div (wi::to_poly_widest (bitpos),
1782 BITS_PER_UNIT);
1784 /* Punt if the record is too big to be interesting. */
1785 poly_uint64 bytepos;
1786 if (!wide_bytepos.to_uhwi (&bytepos)
1787 || pieces.length () + field_info.pieces.length () > MAX_PIECES)
1788 return DOESNT_MATTER;
1790 /* Add the individual vectors and predicates in the field to the
1791 record's list. */
1792 gcc_assert (!field_info.pieces.is_empty ());
1793 for (unsigned int i = 0; i < field_info.pieces.length (); ++i)
1795 piece p = field_info.pieces[i];
1796 p.offset += bytepos;
1797 add_piece (p);
1800 /* Empty structures disappear in the language->ABI mapping. */
1801 return pieces.is_empty () ? NO_ABI_IDENTITY : IS_PST;
1804 /* Add P to the list of pieces in the type. */
1806 void
1807 pure_scalable_type_info::add_piece (const piece &p)
1809 /* Try to fold the new piece into the previous one to form a
1810 single-mode PST. For example, if we see three consecutive vectors
1811 of the same mode, we can represent them using the corresponding
1812 3-tuple mode.
1814 This is purely an optimization. */
1815 if (!pieces.is_empty ())
1817 piece &prev = pieces.last ();
1818 gcc_assert (VECTOR_MODE_P (p.mode) && VECTOR_MODE_P (prev.mode));
1819 unsigned int nelems1, nelems2;
1820 if (prev.orig_mode == p.orig_mode
1821 && known_eq (prev.offset + GET_MODE_SIZE (prev.mode), p.offset)
1822 && constant_multiple_p (GET_MODE_NUNITS (prev.mode),
1823 GET_MODE_NUNITS (p.orig_mode), &nelems1)
1824 && constant_multiple_p (GET_MODE_NUNITS (p.mode),
1825 GET_MODE_NUNITS (p.orig_mode), &nelems2)
1826 && targetm.array_mode (p.orig_mode,
1827 nelems1 + nelems2).exists (&prev.mode))
1829 prev.num_zr += p.num_zr;
1830 prev.num_pr += p.num_pr;
1831 return;
1834 pieces.quick_push (p);
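/* Concretely (an illustrative case, not from this file): two adjacent
   svint32_t pieces, each of mode VNx4SI, whose offsets differ by exactly
   GET_MODE_SIZE (VNx4SImode), satisfy the checks above and are merged
   into a single piece whose mode is the corresponding 2-tuple mode
   (VNx8SI), with num_zr == 2.  */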
1837 /* Return true if at least one possible value of type TYPE includes at
1838 least one object of Pure Scalable Type, in the sense of the AAPCS64.
1840 This is a relatively expensive test for some types, so it should
1841 generally be made as late as possible. */
1843 static bool
1844 aarch64_some_values_include_pst_objects_p (const_tree type)
1846 if (TYPE_SIZE (type) && integer_zerop (TYPE_SIZE (type)))
1847 return false;
1849 if (aarch64_sve::builtin_type_p (type))
1850 return true;
1852 if (TREE_CODE (type) == ARRAY_TYPE || TREE_CODE (type) == COMPLEX_TYPE)
1853 return aarch64_some_values_include_pst_objects_p (TREE_TYPE (type));
1855 if (RECORD_OR_UNION_TYPE_P (type))
1856 for (tree field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
1857 if (TREE_CODE (field) == FIELD_DECL
1858 && aarch64_some_values_include_pst_objects_p (TREE_TYPE (field)))
1859 return true;
1861 return false;
1864 /* Return the descriptor of the SIMD ABI. */
1866 static const predefined_function_abi &
1867 aarch64_simd_abi (void)
1869 predefined_function_abi &simd_abi = function_abis[ARM_PCS_SIMD];
1870 if (!simd_abi.initialized_p ())
1872 HARD_REG_SET full_reg_clobbers
1873 = default_function_abi.full_reg_clobbers ();
1874 for (int regno = 0; regno < FIRST_PSEUDO_REGISTER; regno++)
1875 if (FP_SIMD_SAVED_REGNUM_P (regno))
1876 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1877 simd_abi.initialize (ARM_PCS_SIMD, full_reg_clobbers);
1879 return simd_abi;
1882 /* Return the descriptor of the SVE PCS. */
1884 static const predefined_function_abi &
1885 aarch64_sve_abi (void)
1887 predefined_function_abi &sve_abi = function_abis[ARM_PCS_SVE];
1888 if (!sve_abi.initialized_p ())
1890 HARD_REG_SET full_reg_clobbers
1891 = default_function_abi.full_reg_clobbers ();
1892 for (int regno = V8_REGNUM; regno <= V23_REGNUM; ++regno)
1893 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1894 for (int regno = P4_REGNUM; regno <= P15_REGNUM; ++regno)
1895 CLEAR_HARD_REG_BIT (full_reg_clobbers, regno);
1896 sve_abi.initialize (ARM_PCS_SVE, full_reg_clobbers);
1898 return sve_abi;
1901 /* Generate code to enable conditional branches in functions over 1 MiB. */
1902 const char *
1903 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1904 const char * branch_format)
1906 rtx_code_label * tmp_label = gen_label_rtx ();
1907 char label_buf[256];
1908 char buffer[128];
1909 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1910 CODE_LABEL_NUMBER (tmp_label));
1911 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1912 rtx dest_label = operands[pos_label];
1913 operands[pos_label] = tmp_label;
1915 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1916 output_asm_insn (buffer, operands);
1918 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1919 operands[pos_label] = dest_label;
1920 output_asm_insn (buffer, operands);
1921 return "";
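/* For instance, the code above emits the short-range branch supplied in
   BRANCH_FORMAT with a fresh internal label as its target, then an
   unconditional "b" to the real destination, then the internal label:

       <branch_format><local label>
       b       <destination>
     <local label>:

   so the caller is expected to supply the inverse of the original
   condition in BRANCH_FORMAT.  */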
1924 void
1925 aarch64_err_no_fpadvsimd (machine_mode mode)
1927 if (TARGET_GENERAL_REGS_ONLY)
1928 if (FLOAT_MODE_P (mode))
1929 error ("%qs is incompatible with the use of floating-point types",
1930 "-mgeneral-regs-only");
1931 else
1932 error ("%qs is incompatible with the use of vector types",
1933 "-mgeneral-regs-only");
1934 else
1935 if (FLOAT_MODE_P (mode))
1936 error ("%qs feature modifier is incompatible with the use of"
1937 " floating-point types", "+nofp");
1938 else
1939 error ("%qs feature modifier is incompatible with the use of"
1940 " vector types", "+nofp");
1943 /* Report when we try to do something that requires SVE when SVE is disabled.
1944 This is an error of last resort and isn't very high-quality. It usually
1945 involves attempts to measure the vector length in some way. */
1946 static void
1947 aarch64_report_sve_required (void)
1949 static bool reported_p = false;
1951 /* Avoid reporting a slew of messages for a single oversight. */
1952 if (reported_p)
1953 return;
1955 error ("this operation requires the SVE ISA extension");
1956 inform (input_location, "you can enable SVE using the command-line"
1957 " option %<-march%>, or by using the %<target%>"
1958 " attribute or pragma");
1959 reported_p = true;
1962 /* Return true if REGNO is P0-P15 or one of the special FFR-related
1963 registers. */
1964 inline bool
1965 pr_or_ffr_regnum_p (unsigned int regno)
1967 return PR_REGNUM_P (regno) || regno == FFR_REGNUM || regno == FFRT_REGNUM;
1970 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1971 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1972 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1973 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1974 and GENERAL_REGS is lower than the memory cost (in this case the best class
1975 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1976 cost results in bad allocations with many redundant int<->FP moves which
1977 are expensive on various cores.
1978 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1979 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1980 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1981 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1982 The result of this is that it is no longer inefficient to have a higher
1983 memory move cost than the register move cost.
1986 static reg_class_t
1987 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1988 reg_class_t best_class)
1990 machine_mode mode;
1992 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1993 || !reg_class_subset_p (FP_REGS, allocno_class))
1994 return allocno_class;
1996 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1997 || !reg_class_subset_p (FP_REGS, best_class))
1998 return best_class;
2000 mode = PSEUDO_REGNO_MODE (regno);
2001 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
2004 static unsigned int
2005 aarch64_min_divisions_for_recip_mul (machine_mode mode)
2007 if (GET_MODE_UNIT_SIZE (mode) == 4)
2008 return aarch64_tune_params.min_div_recip_mul_sf;
2009 return aarch64_tune_params.min_div_recip_mul_df;
2012 /* Return the reassociation width of treeop OPC with mode MODE. */
2013 static int
2014 aarch64_reassociation_width (unsigned opc, machine_mode mode)
2016 if (VECTOR_MODE_P (mode))
2017 return aarch64_tune_params.vec_reassoc_width;
2018 if (INTEGRAL_MODE_P (mode))
2019 return aarch64_tune_params.int_reassoc_width;
2020 /* Avoid reassociating floating point addition so we emit more FMAs. */
2021 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
2022 return aarch64_tune_params.fp_reassoc_width;
2023 return 1;
2026 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
2027 unsigned
2028 aarch64_dbx_register_number (unsigned regno)
2030 if (GP_REGNUM_P (regno))
2031 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
2032 else if (regno == SP_REGNUM)
2033 return AARCH64_DWARF_SP;
2034 else if (FP_REGNUM_P (regno))
2035 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
2036 else if (PR_REGNUM_P (regno))
2037 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
2038 else if (regno == VG_REGNUM)
2039 return AARCH64_DWARF_VG;
2041 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
2042 equivalent DWARF register. */
2043 return DWARF_FRAME_REGISTERS;
2046 /* If X is a CONST_DOUBLE, return its bit representation as a constant
2047 integer, otherwise return X unmodified. */
2048 static rtx
2049 aarch64_bit_representation (rtx x)
2051 if (CONST_DOUBLE_P (x))
2052 x = gen_lowpart (int_mode_for_mode (GET_MODE (x)).require (), x);
2053 return x;
2056 /* Return true if MODE is any of the Advanced SIMD structure modes. */
2057 static bool
2058 aarch64_advsimd_struct_mode_p (machine_mode mode)
2060 return (TARGET_SIMD
2061 && (mode == OImode || mode == CImode || mode == XImode));
2064 /* Return true if MODE is an SVE predicate mode. */
2065 static bool
2066 aarch64_sve_pred_mode_p (machine_mode mode)
2068 return (TARGET_SVE
2069 && (mode == VNx16BImode
2070 || mode == VNx8BImode
2071 || mode == VNx4BImode
2072 || mode == VNx2BImode));
2075 /* Three mutually-exclusive flags describing a vector or predicate type. */
2076 const unsigned int VEC_ADVSIMD = 1;
2077 const unsigned int VEC_SVE_DATA = 2;
2078 const unsigned int VEC_SVE_PRED = 4;
2079 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
2080 a structure of 2, 3 or 4 vectors. */
2081 const unsigned int VEC_STRUCT = 8;
2082 /* Can be used in combination with VEC_SVE_DATA to indicate that the
2083 vector has fewer significant bytes than a full SVE vector. */
2084 const unsigned int VEC_PARTIAL = 16;
2085 /* Useful combinations of the above. */
2086 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
2087 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
2089 /* Return a set of flags describing the vector properties of mode MODE.
2090 Ignore modes that are not supported by the current target. */
2091 static unsigned int
2092 aarch64_classify_vector_mode (machine_mode mode)
2094 if (aarch64_advsimd_struct_mode_p (mode))
2095 return VEC_ADVSIMD | VEC_STRUCT;
2097 if (aarch64_sve_pred_mode_p (mode))
2098 return VEC_SVE_PRED;
2100 /* Make the decision based on the mode's enum value rather than its
2101 properties, so that we keep the correct classification regardless
2102 of -msve-vector-bits. */
2103 switch (mode)
2105 /* Partial SVE QI vectors. */
2106 case E_VNx2QImode:
2107 case E_VNx4QImode:
2108 case E_VNx8QImode:
2109 /* Partial SVE HI vectors. */
2110 case E_VNx2HImode:
2111 case E_VNx4HImode:
2112 /* Partial SVE SI vector. */
2113 case E_VNx2SImode:
2114 /* Partial SVE HF vectors. */
2115 case E_VNx2HFmode:
2116 case E_VNx4HFmode:
2117 /* Partial SVE SF vector. */
2118 case E_VNx2SFmode:
2119 return TARGET_SVE ? VEC_SVE_DATA | VEC_PARTIAL : 0;
2121 case E_VNx16QImode:
2122 case E_VNx8HImode:
2123 case E_VNx4SImode:
2124 case E_VNx2DImode:
2125 case E_VNx8BFmode:
2126 case E_VNx8HFmode:
2127 case E_VNx4SFmode:
2128 case E_VNx2DFmode:
2129 return TARGET_SVE ? VEC_SVE_DATA : 0;
2131 /* x2 SVE vectors. */
2132 case E_VNx32QImode:
2133 case E_VNx16HImode:
2134 case E_VNx8SImode:
2135 case E_VNx4DImode:
2136 case E_VNx16BFmode:
2137 case E_VNx16HFmode:
2138 case E_VNx8SFmode:
2139 case E_VNx4DFmode:
2140 /* x3 SVE vectors. */
2141 case E_VNx48QImode:
2142 case E_VNx24HImode:
2143 case E_VNx12SImode:
2144 case E_VNx6DImode:
2145 case E_VNx24BFmode:
2146 case E_VNx24HFmode:
2147 case E_VNx12SFmode:
2148 case E_VNx6DFmode:
2149 /* x4 SVE vectors. */
2150 case E_VNx64QImode:
2151 case E_VNx32HImode:
2152 case E_VNx16SImode:
2153 case E_VNx8DImode:
2154 case E_VNx32BFmode:
2155 case E_VNx32HFmode:
2156 case E_VNx16SFmode:
2157 case E_VNx8DFmode:
2158 return TARGET_SVE ? VEC_SVE_DATA | VEC_STRUCT : 0;
2160 /* 64-bit Advanced SIMD vectors. */
2161 case E_V8QImode:
2162 case E_V4HImode:
2163 case E_V2SImode:
2164 /* ...E_V1DImode doesn't exist. */
2165 case E_V4HFmode:
2166 case E_V4BFmode:
2167 case E_V2SFmode:
2168 case E_V1DFmode:
2169 /* 128-bit Advanced SIMD vectors. */
2170 case E_V16QImode:
2171 case E_V8HImode:
2172 case E_V4SImode:
2173 case E_V2DImode:
2174 case E_V8HFmode:
2175 case E_V8BFmode:
2176 case E_V4SFmode:
2177 case E_V2DFmode:
2178 return TARGET_SIMD ? VEC_ADVSIMD : 0;
2180 default:
2181 return 0;
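/* Some sample classifications, following the switch above:
     V4SImode    -> VEC_ADVSIMD
     OImode      -> VEC_ADVSIMD | VEC_STRUCT
     VNx4SImode  -> VEC_SVE_DATA
     VNx2SImode  -> VEC_SVE_DATA | VEC_PARTIAL
     VNx8SImode  -> VEC_SVE_DATA | VEC_STRUCT
     VNx4BImode  -> VEC_SVE_PRED
   in each case assuming that the relevant target feature is enabled;
   otherwise the result is 0.  */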
2185 /* Return true if MODE is any of the data vector modes, including
2186 structure modes. */
2187 static bool
2188 aarch64_vector_data_mode_p (machine_mode mode)
2190 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
2193 /* Return true if MODE is any form of SVE mode, including predicates,
2194 vectors and structures. */
2195 bool
2196 aarch64_sve_mode_p (machine_mode mode)
2198 return aarch64_classify_vector_mode (mode) & VEC_ANY_SVE;
2201 /* Return true if MODE is an SVE data vector mode; either a single vector
2202 or a structure of vectors. */
2203 static bool
2204 aarch64_sve_data_mode_p (machine_mode mode)
2206 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
2209 /* Return the number of defined bytes in one constituent vector of
2210 SVE mode MODE, which has vector flags VEC_FLAGS. */
2211 static poly_int64
2212 aarch64_vl_bytes (machine_mode mode, unsigned int vec_flags)
2214 if (vec_flags & VEC_PARTIAL)
2215 /* A single partial vector. */
2216 return GET_MODE_SIZE (mode);
2218 if (vec_flags & VEC_SVE_DATA)
2219 /* A single vector or a tuple. */
2220 return BYTES_PER_SVE_VECTOR;
2222 /* A single predicate. */
2223 gcc_assert (vec_flags & VEC_SVE_PRED);
2224 return BYTES_PER_SVE_PRED;
2227 /* Implement target hook TARGET_ARRAY_MODE. */
2228 static opt_machine_mode
2229 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
2231 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
2232 && IN_RANGE (nelems, 2, 4))
2233 return mode_for_vector (GET_MODE_INNER (mode),
2234 GET_MODE_NUNITS (mode) * nelems);
2236 return opt_machine_mode ();
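/* For example, with SVE enabled, aarch64_array_mode (VNx4SImode, 3)
   should return VNx12SImode, while requests for non-SVE modes or for
   element counts outside [2, 4] fall through to the empty
   opt_machine_mode.  */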
2239 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
2240 static bool
2241 aarch64_array_mode_supported_p (machine_mode mode,
2242 unsigned HOST_WIDE_INT nelems)
2244 if (TARGET_SIMD
2245 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
2246 || AARCH64_VALID_SIMD_DREG_MODE (mode))
2247 && (nelems >= 2 && nelems <= 4))
2248 return true;
2250 return false;
2253 /* MODE is some form of SVE vector mode. For data modes, return the number
2254 of vector register bits that each element of MODE occupies, such as 64
2255 for both VNx2DImode and VNx2SImode (where each 32-bit value is stored
2256 in a 64-bit container). For predicate modes, return the number of
2257 data bits controlled by each significant predicate bit. */
2259 static unsigned int
2260 aarch64_sve_container_bits (machine_mode mode)
2262 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2263 poly_uint64 vector_bits = (vec_flags & (VEC_PARTIAL | VEC_SVE_PRED)
2264 ? BITS_PER_SVE_VECTOR
2265 : GET_MODE_BITSIZE (mode));
2266 return vector_element_size (vector_bits, GET_MODE_NUNITS (mode));
2269 /* Return the SVE predicate mode to use for elements that have
2270 ELEM_NBYTES bytes, if such a mode exists. */
2272 opt_machine_mode
2273 aarch64_sve_pred_mode (unsigned int elem_nbytes)
2275 if (TARGET_SVE)
2277 if (elem_nbytes == 1)
2278 return VNx16BImode;
2279 if (elem_nbytes == 2)
2280 return VNx8BImode;
2281 if (elem_nbytes == 4)
2282 return VNx4BImode;
2283 if (elem_nbytes == 8)
2284 return VNx2BImode;
2286 return opt_machine_mode ();
2289 /* Return the SVE predicate mode that should be used to control
2290 SVE mode MODE. */
2292 machine_mode
2293 aarch64_sve_pred_mode (machine_mode mode)
2295 unsigned int bits = aarch64_sve_container_bits (mode);
2296 return aarch64_sve_pred_mode (bits / BITS_PER_UNIT).require ();
2299 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
2301 static opt_machine_mode
2302 aarch64_get_mask_mode (machine_mode mode)
2304 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2305 if (vec_flags & VEC_SVE_DATA)
2306 return aarch64_sve_pred_mode (mode);
2308 return default_get_mask_mode (mode);
2311 /* Return the SVE vector mode that has NUNITS elements of mode INNER_MODE. */
2313 opt_machine_mode
2314 aarch64_sve_data_mode (scalar_mode inner_mode, poly_uint64 nunits)
2316 enum mode_class mclass = (is_a <scalar_float_mode> (inner_mode)
2317 ? MODE_VECTOR_FLOAT : MODE_VECTOR_INT);
2318 machine_mode mode;
2319 FOR_EACH_MODE_IN_CLASS (mode, mclass)
2320 if (inner_mode == GET_MODE_INNER (mode)
2321 && known_eq (nunits, GET_MODE_NUNITS (mode))
2322 && aarch64_sve_data_mode_p (mode))
2323 return mode;
2324 return opt_machine_mode ();
2327 /* Return the integer element mode associated with SVE mode MODE. */
2329 static scalar_int_mode
2330 aarch64_sve_element_int_mode (machine_mode mode)
2332 poly_uint64 vector_bits = (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
2333 ? BITS_PER_SVE_VECTOR
2334 : GET_MODE_BITSIZE (mode));
2335 unsigned int elt_bits = vector_element_size (vector_bits,
2336 GET_MODE_NUNITS (mode));
2337 return int_mode_for_size (elt_bits, 0).require ();
2340 /* Return an integer element mode that contains exactly
2341 aarch64_sve_container_bits (MODE) bits. This is wider than
2342 aarch64_sve_element_int_mode if MODE is a partial vector,
2343 otherwise it's the same. */
2345 static scalar_int_mode
2346 aarch64_sve_container_int_mode (machine_mode mode)
2348 return int_mode_for_size (aarch64_sve_container_bits (mode), 0).require ();
2351 /* Return the integer vector mode associated with SVE mode MODE.
2352 Unlike related_int_vector_mode, this can handle the case in which
2353 MODE is a predicate (and thus has a different total size). */
2355 machine_mode
2356 aarch64_sve_int_mode (machine_mode mode)
2358 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
2359 return aarch64_sve_data_mode (int_mode, GET_MODE_NUNITS (mode)).require ();
2362 /* Implement TARGET_VECTORIZE_RELATED_MODE. */
2364 static opt_machine_mode
2365 aarch64_vectorize_related_mode (machine_mode vector_mode,
2366 scalar_mode element_mode,
2367 poly_uint64 nunits)
2369 unsigned int vec_flags = aarch64_classify_vector_mode (vector_mode);
2371 /* If we're operating on SVE vectors, try to return an SVE mode. */
2372 poly_uint64 sve_nunits;
2373 if ((vec_flags & VEC_SVE_DATA)
2374 && multiple_p (BYTES_PER_SVE_VECTOR,
2375 GET_MODE_SIZE (element_mode), &sve_nunits))
2377 machine_mode sve_mode;
2378 if (maybe_ne (nunits, 0U))
2380 /* Try to find a full or partial SVE mode with exactly
2381 NUNITS units. */
2382 if (multiple_p (sve_nunits, nunits)
2383 && aarch64_sve_data_mode (element_mode,
2384 nunits).exists (&sve_mode))
2385 return sve_mode;
2387 else
2389 /* Take the preferred number of units from the number of bytes
2390 that fit in VECTOR_MODE. We always start by "autodetecting"
2391 a full vector mode with preferred_simd_mode, so vectors
2392 chosen here will also be full vector modes. Then
2393 autovectorize_vector_modes tries smaller starting modes
2394 and thus smaller preferred numbers of units. */
2395 sve_nunits = ordered_min (sve_nunits, GET_MODE_SIZE (vector_mode));
2396 if (aarch64_sve_data_mode (element_mode,
2397 sve_nunits).exists (&sve_mode))
2398 return sve_mode;
2402 /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
2403 if ((vec_flags & VEC_ADVSIMD)
2404 && known_eq (nunits, 0U)
2405 && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
2406 && maybe_ge (GET_MODE_BITSIZE (element_mode)
2407 * GET_MODE_NUNITS (vector_mode), 128U))
2409 machine_mode res = aarch64_simd_container_mode (element_mode, 128);
2410 if (VECTOR_MODE_P (res))
2411 return res;
2414 return default_vectorize_related_mode (vector_mode, element_mode, nunits);
2417 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
2418 prefer to use the first arithmetic operand as the else value if
2419 the else value doesn't matter, since that exactly matches the SVE
2420 destructive merging form. For ternary operations we could either
2421 pick the first operand and use FMAD-like instructions or the last
2422 operand and use FMLA-like instructions; the latter seems more
2423 natural. */
2425 static tree
2426 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
2428 return nops == 3 ? ops[2] : ops[0];
2431 /* Implement TARGET_HARD_REGNO_NREGS. */
2433 static unsigned int
2434 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
2436 /* ??? Logically we should only need to provide a value when
2437 HARD_REGNO_MODE_OK says that the combination is valid,
2438 but at the moment we need to handle all modes. Just ignore
2439 any runtime parts for registers that can't store them. */
2440 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
2441 switch (aarch64_regno_regclass (regno))
2443 case FP_REGS:
2444 case FP_LO_REGS:
2445 case FP_LO8_REGS:
2447 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2448 if (vec_flags & VEC_SVE_DATA)
2449 return exact_div (GET_MODE_SIZE (mode),
2450 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
2451 return CEIL (lowest_size, UNITS_PER_VREG);
2453 case PR_REGS:
2454 case PR_LO_REGS:
2455 case PR_HI_REGS:
2456 case FFR_REGS:
2457 case PR_AND_FFR_REGS:
2458 return 1;
2459 default:
2460 return CEIL (lowest_size, UNITS_PER_WORD);
2462 gcc_unreachable ();
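/* Some sample values for the above: TImode in a general register needs
   CEIL (16, 8) == 2 registers; VNx4SImode needs a single FP register
   whatever the runtime vector length; and the x2 tuple mode VNx8SImode
   needs exactly two FP registers.  */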
2465 /* Implement TARGET_HARD_REGNO_MODE_OK. */
2467 static bool
2468 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
2470 if (GET_MODE_CLASS (mode) == MODE_CC)
2471 return regno == CC_REGNUM;
2473 if (regno == VG_REGNUM)
2474 /* This must have the same size as _Unwind_Word. */
2475 return mode == DImode;
2477 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2478 if (vec_flags & VEC_SVE_PRED)
2479 return pr_or_ffr_regnum_p (regno);
2481 if (pr_or_ffr_regnum_p (regno))
2482 return false;
2484 if (regno == SP_REGNUM)
2485 /* The purpose of comparing with ptr_mode is to support the
2486 global register variable associated with the stack pointer
2487 register via the syntax of asm ("wsp") in ILP32. */
2488 return mode == Pmode || mode == ptr_mode;
2490 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
2491 return mode == Pmode;
2493 if (GP_REGNUM_P (regno))
2495 if (vec_flags & VEC_ANY_SVE)
2496 return false;
2497 if (known_le (GET_MODE_SIZE (mode), 8))
2498 return true;
2499 if (known_le (GET_MODE_SIZE (mode), 16))
2500 return (regno & 1) == 0;
2502 else if (FP_REGNUM_P (regno))
2504 if (vec_flags & VEC_STRUCT)
2505 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
2506 else
2507 return !VECTOR_MODE_P (mode) || vec_flags != 0;
2510 return false;
2513 /* Return true if a function with type FNTYPE returns its value in
2514 SVE vector or predicate registers. */
2516 static bool
2517 aarch64_returns_value_in_sve_regs_p (const_tree fntype)
2519 tree return_type = TREE_TYPE (fntype);
2521 pure_scalable_type_info pst_info;
2522 switch (pst_info.analyze (return_type))
2524 case pure_scalable_type_info::IS_PST:
2525 return (pst_info.num_zr () <= NUM_FP_ARG_REGS
2526 && pst_info.num_pr () <= NUM_PR_ARG_REGS);
2528 case pure_scalable_type_info::DOESNT_MATTER:
2529 gcc_assert (aarch64_return_in_memory_1 (return_type));
2530 return false;
2532 case pure_scalable_type_info::NO_ABI_IDENTITY:
2533 case pure_scalable_type_info::ISNT_PST:
2534 return false;
2536 gcc_unreachable ();
2539 /* Return true if a function with type FNTYPE takes arguments in
2540 SVE vector or predicate registers. */
2542 static bool
2543 aarch64_takes_arguments_in_sve_regs_p (const_tree fntype)
2545 CUMULATIVE_ARGS args_so_far_v;
2546 aarch64_init_cumulative_args (&args_so_far_v, NULL_TREE, NULL_RTX,
2547 NULL_TREE, 0, true);
2548 cumulative_args_t args_so_far = pack_cumulative_args (&args_so_far_v);
2550 for (tree chain = TYPE_ARG_TYPES (fntype);
2551 chain && chain != void_list_node;
2552 chain = TREE_CHAIN (chain))
2554 tree arg_type = TREE_VALUE (chain);
2555 if (arg_type == error_mark_node)
2556 return false;
2558 function_arg_info arg (arg_type, /*named=*/true);
2559 apply_pass_by_reference_rules (&args_so_far_v, arg);
2560 pure_scalable_type_info pst_info;
2561 if (pst_info.analyze_registers (arg.type))
2563 unsigned int end_zr = args_so_far_v.aapcs_nvrn + pst_info.num_zr ();
2564 unsigned int end_pr = args_so_far_v.aapcs_nprn + pst_info.num_pr ();
2565 gcc_assert (end_zr <= NUM_FP_ARG_REGS && end_pr <= NUM_PR_ARG_REGS);
2566 return true;
2569 targetm.calls.function_arg_advance (args_so_far, arg);
2571 return false;
2574 /* Implement TARGET_FNTYPE_ABI. */
2576 static const predefined_function_abi &
2577 aarch64_fntype_abi (const_tree fntype)
2579 if (lookup_attribute ("aarch64_vector_pcs", TYPE_ATTRIBUTES (fntype)))
2580 return aarch64_simd_abi ();
2582 if (aarch64_returns_value_in_sve_regs_p (fntype)
2583 || aarch64_takes_arguments_in_sve_regs_p (fntype))
2584 return aarch64_sve_abi ();
2586 return default_function_abi;
2589 /* Implement TARGET_COMPATIBLE_VECTOR_TYPES_P. */
2591 static bool
2592 aarch64_compatible_vector_types_p (const_tree type1, const_tree type2)
2594 return (aarch64_sve::builtin_type_p (type1)
2595 == aarch64_sve::builtin_type_p (type2));
2598 /* Return true if we should emit CFI for register REGNO. */
2600 static bool
2601 aarch64_emit_cfi_for_reg_p (unsigned int regno)
2603 return (GP_REGNUM_P (regno)
2604 || !default_function_abi.clobbers_full_reg_p (regno));
2607 /* Return the mode we should use to save and restore register REGNO. */
2609 static machine_mode
2610 aarch64_reg_save_mode (unsigned int regno)
2612 if (GP_REGNUM_P (regno))
2613 return DImode;
2615 if (FP_REGNUM_P (regno))
2616 switch (crtl->abi->id ())
2618 case ARM_PCS_AAPCS64:
2619 /* Only the low 64 bits are saved by the base PCS. */
2620 return DFmode;
2622 case ARM_PCS_SIMD:
2623 /* The vector PCS saves the low 128 bits (which is the full
2624 register on non-SVE targets). */
2625 return TFmode;
2627 case ARM_PCS_SVE:
2628 /* Use vectors of DImode for registers that need frame
2629 information, so that the first 64 bytes of the save slot
2630 are always the equivalent of what storing D<n> would give. */
2631 if (aarch64_emit_cfi_for_reg_p (regno))
2632 return VNx2DImode;
2634 /* Use vectors of bytes otherwise, so that the layout is
2635 endian-agnostic, and so that we can use LDR and STR for
2636 big-endian targets. */
2637 return VNx16QImode;
2639 case ARM_PCS_TLSDESC:
2640 case ARM_PCS_UNKNOWN:
2641 break;
2644 if (PR_REGNUM_P (regno))
2645 /* Save the full predicate register. */
2646 return VNx16BImode;
2648 gcc_unreachable ();
2651 /* Implement TARGET_INSN_CALLEE_ABI. */
2653 const predefined_function_abi &
2654 aarch64_insn_callee_abi (const rtx_insn *insn)
2656 rtx pat = PATTERN (insn);
2657 gcc_assert (GET_CODE (pat) == PARALLEL);
2658 rtx unspec = XVECEXP (pat, 0, 1);
2659 gcc_assert (GET_CODE (unspec) == UNSPEC
2660 && XINT (unspec, 1) == UNSPEC_CALLEE_ABI);
2661 return function_abis[INTVAL (XVECEXP (unspec, 0, 0))];
2664 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
2665 the lower 64 bits of a 128-bit register. Tell the compiler the callee
2666 clobbers the top 64 bits when restoring the bottom 64 bits. */
2668 static bool
2669 aarch64_hard_regno_call_part_clobbered (unsigned int abi_id,
2670 unsigned int regno,
2671 machine_mode mode)
2673 if (FP_REGNUM_P (regno) && abi_id != ARM_PCS_SVE)
2675 poly_int64 per_register_size = GET_MODE_SIZE (mode);
2676 unsigned int nregs = hard_regno_nregs (regno, mode);
2677 if (nregs > 1)
2678 per_register_size = exact_div (per_register_size, nregs);
2679 if (abi_id == ARM_PCS_SIMD || abi_id == ARM_PCS_TLSDESC)
2680 return maybe_gt (per_register_size, 16);
2681 return maybe_gt (per_register_size, 8);
2683 return false;
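/* Concretely: under the base PCS, a DFmode value in V8 survives a call
   (it fits in the 64 bits that are preserved), whereas a V4SFmode value
   in V8 is treated as partially clobbered because its upper 64 bits are
   not preserved.  For ARM_PCS_SVE the hook returns false, since a
   callee that preserves one of the saved Z registers preserves all of
   it.  */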
2686 /* Implement REGMODE_NATURAL_SIZE. */
2687 poly_uint64
2688 aarch64_regmode_natural_size (machine_mode mode)
2690 /* The natural size for SVE data modes is one SVE data vector,
2691 and similarly for predicates. We can't independently modify
2692 anything smaller than that. */
2693 /* ??? For now, only do this for variable-width SVE registers.
2694 Doing it for constant-sized registers breaks lower-subreg.c. */
2695 /* ??? And once that's fixed, we should probably have similar
2696 code for Advanced SIMD. */
2697 if (!aarch64_sve_vg.is_constant ())
2699 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
2700 if (vec_flags & VEC_SVE_PRED)
2701 return BYTES_PER_SVE_PRED;
2702 if (vec_flags & VEC_SVE_DATA)
2703 return BYTES_PER_SVE_VECTOR;
2705 return UNITS_PER_WORD;
2708 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
2709 machine_mode
2710 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
2711 machine_mode mode)
2713 /* The predicate mode determines which bits are significant and
2714 which are "don't care". Decreasing the number of lanes would
2715 lose data while increasing the number of lanes would make bits
2716 unnecessarily significant. */
2717 if (PR_REGNUM_P (regno))
2718 return mode;
2719 if (known_ge (GET_MODE_SIZE (mode), 4))
2720 return mode;
2721 else
2722 return SImode;
2725 /* Return true if I's bits are consecutive ones from the MSB. */
2726 bool
2727 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
2729 return exact_log2 (-i) != HOST_WIDE_INT_M1;
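/* For example (assuming a 64-bit HOST_WIDE_INT): a value whose set bits
   are exactly the top N bits negates to a single power of two, so

     0xffffffffffff0000 -> -i == 1 << 16, accepted
     0xff00000000000000 -> -i == 1 << 56, accepted
     0xffffffffffff0001 -> -i == 0xffff, not a power of two, rejected.  */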
2732 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
2733 that strcpy from constants will be faster. */
2735 static HOST_WIDE_INT
2736 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
2738 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
2739 return MAX (align, BITS_PER_WORD);
2740 return align;
2743 /* Return true if calls to DECL should be treated as
2744 long-calls (i.e. called via a register). */
2745 static bool
2746 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
2748 return false;
2751 /* Return true if calls to symbol-ref SYM should be treated as
2752 long-calls (i.e. called via a register). */
2753 bool
2754 aarch64_is_long_call_p (rtx sym)
2756 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
2759 /* Return true if calls to symbol-ref SYM should not go through
2760 plt stubs. */
2762 bool
2763 aarch64_is_noplt_call_p (rtx sym)
2765 const_tree decl = SYMBOL_REF_DECL (sym);
2767 if (flag_pic
2768 && decl
2769 && (!flag_plt
2770 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
2771 && !targetm.binds_local_p (decl))
2772 return true;
2774 return false;
2777 /* Return true if the offsets to a zero/sign-extract operation
2778 represent an expression that matches an extend operation. The
2779 operands represent the parameters from
2781 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
2782 bool
2783 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
2784 rtx extract_imm)
2786 HOST_WIDE_INT mult_val, extract_val;
2788 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
2789 return false;
2791 mult_val = INTVAL (mult_imm);
2792 extract_val = INTVAL (extract_imm);
2794 if (extract_val > 8
2795 && extract_val < GET_MODE_BITSIZE (mode)
2796 && exact_log2 (extract_val & ~7) > 0
2797 && (extract_val & 7) <= 4
2798 && mult_val == (1 << (extract_val & 7)))
2799 return true;
2801 return false;
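/* A concrete case that passes the checks above, for DImode: an
   EXTRACT_IMM of 34 with a MULT_IMM of 4, i.e. extracting the low 34
   bits of (reg * 4).  That should be equivalent to extending the low 32
   bits of the register and shifting the result left by 2.  */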
2804 /* Emit an insn that's a simple single-set. Both the operands must be
2805 known to be valid. */
2806 inline static rtx_insn *
2807 emit_set_insn (rtx x, rtx y)
2809 return emit_insn (gen_rtx_SET (x, y));
2812 /* X and Y are two things to compare using CODE. Emit the compare insn and
2813 return the rtx for register 0 in the proper mode. */
2815 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
2817 machine_mode cmp_mode = GET_MODE (x);
2818 machine_mode cc_mode;
2819 rtx cc_reg;
2821 if (cmp_mode == TImode)
2823 gcc_assert (code == NE);
2825 cc_mode = CCmode;
2826 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2828 rtx x_lo = operand_subword (x, 0, 0, TImode);
2829 rtx y_lo = operand_subword (y, 0, 0, TImode);
2830 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x_lo, y_lo));
2832 rtx x_hi = operand_subword (x, 1, 0, TImode);
2833 rtx y_hi = operand_subword (y, 1, 0, TImode);
2834 emit_insn (gen_ccmpccdi (cc_reg, cc_reg, x_hi, y_hi,
2835 gen_rtx_EQ (cc_mode, cc_reg, const0_rtx),
2836 GEN_INT (AARCH64_EQ)));
2838 else
2840 cc_mode = SELECT_CC_MODE (code, x, y);
2841 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2842 emit_set_insn (cc_reg, gen_rtx_COMPARE (cc_mode, x, y));
2844 return cc_reg;
2847 /* Similarly, but maybe zero-extend Y if Y_MODE < SImode. */
2849 static rtx
2850 aarch64_gen_compare_reg_maybe_ze (RTX_CODE code, rtx x, rtx y,
2851 machine_mode y_mode)
2853 if (y_mode == E_QImode || y_mode == E_HImode)
2855 if (CONST_INT_P (y))
2857 y = GEN_INT (INTVAL (y) & GET_MODE_MASK (y_mode));
2858 y_mode = SImode;
2860 else
2862 rtx t, cc_reg;
2863 machine_mode cc_mode;
2865 t = gen_rtx_ZERO_EXTEND (SImode, y);
2866 t = gen_rtx_COMPARE (CC_SWPmode, t, x);
2867 cc_mode = CC_SWPmode;
2868 cc_reg = gen_rtx_REG (cc_mode, CC_REGNUM);
2869 emit_set_insn (cc_reg, t);
2870 return cc_reg;
2874 if (!aarch64_plus_operand (y, y_mode))
2875 y = force_reg (y_mode, y);
2877 return aarch64_gen_compare_reg (code, x, y);
2880 /* Build the SYMBOL_REF for __tls_get_addr. */
2882 static GTY(()) rtx tls_get_addr_libfunc;
2885 aarch64_tls_get_addr (void)
2887 if (!tls_get_addr_libfunc)
2888 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
2889 return tls_get_addr_libfunc;
2892 /* Return the TLS model to use for ADDR. */
2894 static enum tls_model
2895 tls_symbolic_operand_type (rtx addr)
2897 enum tls_model tls_kind = TLS_MODEL_NONE;
2898 if (GET_CODE (addr) == CONST)
2900 poly_int64 addend;
2901 rtx sym = strip_offset (addr, &addend);
2902 if (GET_CODE (sym) == SYMBOL_REF)
2903 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
2905 else if (GET_CODE (addr) == SYMBOL_REF)
2906 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
2908 return tls_kind;
2911 /* We'll allow lo_sums in our legitimate addresses
2912 so that combine can take care of combining addresses where
2913 necessary, but for generation purposes, we'll generate the address
2914 as:
2915 RTL Absolute
2916 tmp = hi (symbol_ref); adrp x1, foo
2917 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
2920 PIC TLS
2921 adrp x1, :got:foo adrp tmp, :tlsgd:foo
2922 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
2923 bl __tls_get_addr
2926 Load TLS symbol, depending on TLS mechanism and TLS access model.
2928 Global Dynamic - Traditional TLS:
2929 adrp tmp, :tlsgd:imm
2930 add dest, tmp, #:tlsgd_lo12:imm
2931 bl __tls_get_addr
2933 Global Dynamic - TLS Descriptors:
2934 adrp dest, :tlsdesc:imm
2935 ldr tmp, [dest, #:tlsdesc_lo12:imm]
2936 add dest, dest, #:tlsdesc_lo12:imm
2937 blr tmp
2938 mrs tp, tpidr_el0
2939 add dest, dest, tp
2941 Initial Exec:
2942 mrs tp, tpidr_el0
2943 adrp tmp, :gottprel:imm
2944 ldr dest, [tmp, #:gottprel_lo12:imm]
2945 add dest, dest, tp
2947 Local Exec:
2948 mrs tp, tpidr_el0
2949 add t0, tp, #:tprel_hi12:imm, lsl #12
2950 add t0, t0, #:tprel_lo12_nc:imm
2953 static void
2954 aarch64_load_symref_appropriately (rtx dest, rtx imm,
2955 enum aarch64_symbol_type type)
2957 switch (type)
2959 case SYMBOL_SMALL_ABSOLUTE:
2961 /* In ILP32, the mode of dest can be either SImode or DImode. */
2962 rtx tmp_reg = dest;
2963 machine_mode mode = GET_MODE (dest);
2965 gcc_assert (mode == Pmode || mode == ptr_mode);
2967 if (can_create_pseudo_p ())
2968 tmp_reg = gen_reg_rtx (mode);
2970 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
2971 emit_insn (gen_add_losym (dest, tmp_reg, imm));
2972 return;
2975 case SYMBOL_TINY_ABSOLUTE:
2976 emit_insn (gen_rtx_SET (dest, imm));
2977 return;
2979 case SYMBOL_SMALL_GOT_28K:
2981 machine_mode mode = GET_MODE (dest);
2982 rtx gp_rtx = pic_offset_table_rtx;
2983 rtx insn;
2984 rtx mem;
2986 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
2987 here before rtl expansion. Tree IVOPTs will generate an rtl pattern
2988 to compute rtx costs, in which case pic_offset_table_rtx is not
2989 initialized. In that case there is no need to generate the first adrp
2990 instruction, as the final cost for a global variable access is
2991 one instruction.
2992 if (gp_rtx != NULL)
2994 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
2995 are using the page base as the GOT base, the first page may be wasted;
2996 in the worst scenario there is only 28K of space for the GOT).
2998 The generated instruction sequence for accessing a global variable is:
3001 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
3003 Only one instruction is needed. But we must initialize
3004 pic_offset_table_rtx properly. We generate an initialization insn for
3005 every global access, and allow CSE to remove all redundant ones.
3007 The final instruction sequence will look like the following
3008 for multiple global variable accesses.
3010 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
3012 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
3013 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
3014 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
3015 ... */
3017 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
3018 crtl->uses_pic_offset_table = 1;
3019 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
3021 if (mode != GET_MODE (gp_rtx))
3022 gp_rtx = gen_lowpart (mode, gp_rtx);
3026 if (mode == ptr_mode)
3028 if (mode == DImode)
3029 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
3030 else
3031 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
3033 mem = XVECEXP (SET_SRC (insn), 0, 0);
3035 else
3037 gcc_assert (mode == Pmode);
3039 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
3040 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3043 /* The operand is expected to be a MEM. Whenever the related insn
3044 pattern changes, the code above which calculates MEM should be
3045 updated. */
3046 gcc_assert (GET_CODE (mem) == MEM);
3047 MEM_READONLY_P (mem) = 1;
3048 MEM_NOTRAP_P (mem) = 1;
3049 emit_insn (insn);
3050 return;
3053 case SYMBOL_SMALL_GOT_4G:
3055 /* In ILP32, the mode of dest can be either SImode or DImode,
3056 while the got entry is always of SImode size. The mode of
3057 dest depends on how dest is used: if dest is assigned to a
3058 pointer (e.g. stored in memory), it has SImode; it may have
3059 DImode if dest is dereferenced to access memory.
3060 This is why we have to handle three different ldr_got_small
3061 patterns here (two patterns for ILP32). */
3063 rtx insn;
3064 rtx mem;
3065 rtx tmp_reg = dest;
3066 machine_mode mode = GET_MODE (dest);
3068 if (can_create_pseudo_p ())
3069 tmp_reg = gen_reg_rtx (mode);
3071 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
3072 if (mode == ptr_mode)
3074 if (mode == DImode)
3075 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
3076 else
3077 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
3079 mem = XVECEXP (SET_SRC (insn), 0, 0);
3081 else
3083 gcc_assert (mode == Pmode);
3085 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
3086 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
3089 gcc_assert (GET_CODE (mem) == MEM);
3090 MEM_READONLY_P (mem) = 1;
3091 MEM_NOTRAP_P (mem) = 1;
3092 emit_insn (insn);
3093 return;
3096 case SYMBOL_SMALL_TLSGD:
3098 rtx_insn *insns;
3099 /* The return type of __tls_get_addr is the C pointer type
3100 so use ptr_mode. */
3101 rtx result = gen_rtx_REG (ptr_mode, R0_REGNUM);
3102 rtx tmp_reg = dest;
3104 if (GET_MODE (dest) != ptr_mode)
3105 tmp_reg = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : result;
3107 start_sequence ();
3108 if (ptr_mode == SImode)
3109 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
3110 else
3111 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
3112 insns = get_insns ();
3113 end_sequence ();
3115 RTL_CONST_CALL_P (insns) = 1;
3116 emit_libcall_block (insns, tmp_reg, result, imm);
3117 /* Convert back to the mode of the dest adding a zero_extend
3118 from SImode (ptr_mode) to DImode (Pmode). */
3119 if (dest != tmp_reg)
3120 convert_move (dest, tmp_reg, true);
3121 return;
3124 case SYMBOL_SMALL_TLSDESC:
3126 machine_mode mode = GET_MODE (dest);
3127 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
3128 rtx tp;
3130 gcc_assert (mode == Pmode || mode == ptr_mode);
3132 /* In ILP32, the got entry is always of SImode size. Unlike
3133 small GOT, the dest is fixed at reg 0. */
3134 if (TARGET_ILP32)
3135 emit_insn (gen_tlsdesc_small_si (imm));
3136 else
3137 emit_insn (gen_tlsdesc_small_di (imm));
3138 tp = aarch64_load_tp (NULL);
3140 if (mode != Pmode)
3141 tp = gen_lowpart (mode, tp);
3143 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
3144 if (REG_P (dest))
3145 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3146 return;
3149 case SYMBOL_SMALL_TLSIE:
3151 /* In ILP32, the mode of dest can be either SImode or DImode,
3152 while the got entry is always of SImode size. The mode of
3153 dest depends on how dest is used: if dest is assigned to a
3154 pointer (e.g. stored in memory), it has SImode; it may have
3155 DImode if dest is dereferenced to access memory.
3156 This is why we have to handle three different tlsie_small
3157 patterns here (two patterns for ILP32). */
3158 machine_mode mode = GET_MODE (dest);
3159 rtx tmp_reg = gen_reg_rtx (mode);
3160 rtx tp = aarch64_load_tp (NULL);
3162 if (mode == ptr_mode)
3164 if (mode == DImode)
3165 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
3166 else
3168 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
3169 tp = gen_lowpart (mode, tp);
3172 else
3174 gcc_assert (mode == Pmode);
3175 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
3178 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
3179 if (REG_P (dest))
3180 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3181 return;
3184 case SYMBOL_TLSLE12:
3185 case SYMBOL_TLSLE24:
3186 case SYMBOL_TLSLE32:
3187 case SYMBOL_TLSLE48:
3189 machine_mode mode = GET_MODE (dest);
3190 rtx tp = aarch64_load_tp (NULL);
3192 if (mode != Pmode)
3193 tp = gen_lowpart (mode, tp);
3195 switch (type)
3197 case SYMBOL_TLSLE12:
3198 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
3199 (dest, tp, imm));
3200 break;
3201 case SYMBOL_TLSLE24:
3202 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
3203 (dest, tp, imm));
3204 break;
3205 case SYMBOL_TLSLE32:
3206 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
3207 (dest, imm));
3208 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3209 (dest, dest, tp));
3210 break;
3211 case SYMBOL_TLSLE48:
3212 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
3213 (dest, imm));
3214 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
3215 (dest, dest, tp));
3216 break;
3217 default:
3218 gcc_unreachable ();
3221 if (REG_P (dest))
3222 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3223 return;
3226 case SYMBOL_TINY_GOT:
3228 rtx insn;
3229 machine_mode mode = GET_MODE (dest);
3231 if (mode == ptr_mode)
3232 insn = gen_ldr_got_tiny (mode, dest, imm);
3233 else
3235 gcc_assert (mode == Pmode);
3236 insn = gen_ldr_got_tiny_sidi (dest, imm);
3239 emit_insn (insn);
3240 return;
3243 case SYMBOL_TINY_TLSIE:
3245 machine_mode mode = GET_MODE (dest);
3246 rtx tp = aarch64_load_tp (NULL);
3248 if (mode == ptr_mode)
3250 if (mode == DImode)
3251 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
3252 else
3254 tp = gen_lowpart (mode, tp);
3255 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
3258 else
3260 gcc_assert (mode == Pmode);
3261 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
3264 if (REG_P (dest))
3265 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
3266 return;
3269 default:
3270 gcc_unreachable ();
3274 /* Emit a move from SRC to DEST. Assume that the move expanders can
3275 handle all moves if !can_create_pseudo_p (). The distinction is
3276 important because, unlike emit_move_insn, the move expanders know
3277 how to force Pmode objects into the constant pool even when the
3278 constant pool address is not itself legitimate. */
3279 static rtx
3280 aarch64_emit_move (rtx dest, rtx src)
3282 return (can_create_pseudo_p ()
3283 ? emit_move_insn (dest, src)
3284 : emit_move_insn_1 (dest, src));
3287 /* Apply UNOPTAB to OP and store the result in DEST. */
3289 static void
3290 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
3292 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
3293 if (dest != tmp)
3294 emit_move_insn (dest, tmp);
3297 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
3299 static void
3300 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
3302 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
3303 OPTAB_DIRECT);
3304 if (dest != tmp)
3305 emit_move_insn (dest, tmp);
3308 /* Split a 128-bit move operation into two 64-bit move operations,
3309 taking care to handle partial overlap of register to register
3310 copies. Special cases are needed when moving between GP regs and
3311 FP regs. SRC can be a register, constant or memory; DST a register
3312 or memory. If either operand is memory it must not have any side
3313 effects. */
3314 void
3315 aarch64_split_128bit_move (rtx dst, rtx src)
3317 rtx dst_lo, dst_hi;
3318 rtx src_lo, src_hi;
3320 machine_mode mode = GET_MODE (dst);
3322 gcc_assert (mode == TImode || mode == TFmode);
3323 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
3324 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
3326 if (REG_P (dst) && REG_P (src))
3328 int src_regno = REGNO (src);
3329 int dst_regno = REGNO (dst);
3331 /* Handle FP <-> GP regs. */
3332 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
3334 src_lo = gen_lowpart (word_mode, src);
3335 src_hi = gen_highpart (word_mode, src);
3337 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
3338 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
3339 return;
3341 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
3343 dst_lo = gen_lowpart (word_mode, dst);
3344 dst_hi = gen_highpart (word_mode, dst);
3346 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
3347 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
3348 return;
3352 dst_lo = gen_lowpart (word_mode, dst);
3353 dst_hi = gen_highpart (word_mode, dst);
3354 src_lo = gen_lowpart (word_mode, src);
3355 src_hi = gen_highpart_mode (word_mode, mode, src);
3357 /* At most one pairing may overlap. */
3358 if (reg_overlap_mentioned_p (dst_lo, src_hi))
3360 aarch64_emit_move (dst_hi, src_hi);
3361 aarch64_emit_move (dst_lo, src_lo);
3363 else
3365 aarch64_emit_move (dst_lo, src_lo);
3366 aarch64_emit_move (dst_hi, src_hi);
3370 bool
3371 aarch64_split_128bit_move_p (rtx dst, rtx src)
3373 return (! REG_P (src)
3374 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
3377 /* Split a complex SIMD combine. */
3379 void
3380 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
3382 machine_mode src_mode = GET_MODE (src1);
3383 machine_mode dst_mode = GET_MODE (dst);
3385 gcc_assert (VECTOR_MODE_P (dst_mode));
3386 gcc_assert (register_operand (dst, dst_mode)
3387 && register_operand (src1, src_mode)
3388 && register_operand (src2, src_mode));
3390 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
3391 return;
3394 /* Split a complex SIMD move. */
3396 void
3397 aarch64_split_simd_move (rtx dst, rtx src)
3399 machine_mode src_mode = GET_MODE (src);
3400 machine_mode dst_mode = GET_MODE (dst);
3402 gcc_assert (VECTOR_MODE_P (dst_mode));
3404 if (REG_P (dst) && REG_P (src))
3406 gcc_assert (VECTOR_MODE_P (src_mode));
3407 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
3411 bool
3412 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
3413 machine_mode ymode, rtx y)
3415 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
3416 gcc_assert (r != NULL);
3417 return rtx_equal_p (x, r);
3420 /* Return TARGET if it is nonnull and a register of mode MODE.
3421 Otherwise, return a fresh register of mode MODE if we can,
3422 or TARGET reinterpreted as MODE if we can't. */
3424 static rtx
3425 aarch64_target_reg (rtx target, machine_mode mode)
3427 if (target && REG_P (target) && GET_MODE (target) == mode)
3428 return target;
3429 if (!can_create_pseudo_p ())
3431 gcc_assert (target);
3432 return gen_lowpart (mode, target);
3434 return gen_reg_rtx (mode);
3437 /* Return a register that contains the constant in BUILDER, given that
3438 the constant is a legitimate move operand. Use TARGET as the register
3439 if it is nonnull and convenient. */
3441 static rtx
3442 aarch64_emit_set_immediate (rtx target, rtx_vector_builder &builder)
3444 rtx src = builder.build ();
3445 target = aarch64_target_reg (target, GET_MODE (src));
3446 emit_insn (gen_rtx_SET (target, src));
3447 return target;
3450 static rtx
3451 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
3453 if (can_create_pseudo_p ())
3454 return force_reg (mode, value);
3455 else
3457 gcc_assert (x);
3458 aarch64_emit_move (x, value);
3459 return x;
3463 /* Return true if predicate value X is a constant in which every element
3464 is a CONST_INT. When returning true, describe X in BUILDER as a VNx16BI
3465 value, i.e. as a predicate in which all bits are significant. */
3467 static bool
3468 aarch64_get_sve_pred_bits (rtx_vector_builder &builder, rtx x)
3470 if (GET_CODE (x) != CONST_VECTOR)
3471 return false;
3473 unsigned int factor = vector_element_size (GET_MODE_NUNITS (VNx16BImode),
3474 GET_MODE_NUNITS (GET_MODE (x)));
3475 unsigned int npatterns = CONST_VECTOR_NPATTERNS (x) * factor;
3476 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x);
3477 builder.new_vector (VNx16BImode, npatterns, nelts_per_pattern);
3479 unsigned int nelts = const_vector_encoded_nelts (x);
3480 for (unsigned int i = 0; i < nelts; ++i)
3482 rtx elt = CONST_VECTOR_ENCODED_ELT (x, i);
3483 if (!CONST_INT_P (elt))
3484 return false;
3486 builder.quick_push (elt);
3487 for (unsigned int j = 1; j < factor; ++j)
3488 builder.quick_push (const0_rtx);
3490 builder.finalize ();
3491 return true;
3494 /* BUILDER contains a predicate constant of mode VNx16BI. Return the
3495 widest predicate element size it can have (that is, the largest size
3496 for which each element would still be 0 or 1). */
3498 unsigned int
3499 aarch64_widest_sve_pred_elt_size (rtx_vector_builder &builder)
3501 /* Start with the most optimistic assumption: that we only need
3502 one bit per pattern. This is what we will use if only the first
3503 bit in each pattern is ever set. */
3504 unsigned int mask = GET_MODE_SIZE (DImode);
3505 mask |= builder.npatterns ();
3507 /* Look for set bits. */
3508 unsigned int nelts = builder.encoded_nelts ();
3509 for (unsigned int i = 1; i < nelts; ++i)
3510 if (INTVAL (builder.elt (i)) != 0)
3512 if (i & 1)
3513 return 1;
3514 mask |= i;
3516 return mask & -mask;
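/* Worked example: with two patterns and set bits only at even indices,
   the mask ends up as 8 | 2 | <even indices>, so mask & -mask is 2 and
   the widest usable element size is 2 bytes.  Any set bit at an odd
   index forces the early return of 1.  */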
3519 /* If VNx16BImode rtx X is a canonical PTRUE for a predicate mode,
3520 return that predicate mode, otherwise return opt_machine_mode (). */
3522 opt_machine_mode
3523 aarch64_ptrue_all_mode (rtx x)
3525 gcc_assert (GET_MODE (x) == VNx16BImode);
3526 if (GET_CODE (x) != CONST_VECTOR
3527 || !CONST_VECTOR_DUPLICATE_P (x)
3528 || !CONST_INT_P (CONST_VECTOR_ENCODED_ELT (x, 0))
3529 || INTVAL (CONST_VECTOR_ENCODED_ELT (x, 0)) == 0)
3530 return opt_machine_mode ();
3532 unsigned int nelts = const_vector_encoded_nelts (x);
3533 for (unsigned int i = 1; i < nelts; ++i)
3534 if (CONST_VECTOR_ENCODED_ELT (x, i) != const0_rtx)
3535 return opt_machine_mode ();
3537 return aarch64_sve_pred_mode (nelts);
3540 /* BUILDER is a predicate constant of mode VNx16BI. Consider the value
3541 that the constant would have with predicate element size ELT_SIZE
3542 (ignoring the upper bits in each element) and return:
3544 * -1 if all bits are set
3545 * N if the predicate has N leading set bits followed by all clear bits
3546 * 0 if the predicate does not have any of these forms. */
3549 aarch64_partial_ptrue_length (rtx_vector_builder &builder,
3550 unsigned int elt_size)
3552 /* If nelts_per_pattern is 3, we have set bits followed by clear bits
3553 followed by set bits. */
3554 if (builder.nelts_per_pattern () == 3)
3555 return 0;
3557 /* Skip over leading set bits. */
3558 unsigned int nelts = builder.encoded_nelts ();
3559 unsigned int i = 0;
3560 for (; i < nelts; i += elt_size)
3561 if (INTVAL (builder.elt (i)) == 0)
3562 break;
3563 unsigned int vl = i / elt_size;
3565 /* Check for the all-true case. */
3566 if (i == nelts)
3567 return -1;
3569 /* If nelts_per_pattern is 1, then either VL is zero, or we have a
3570 repeating pattern of set bits followed by clear bits. */
3571 if (builder.nelts_per_pattern () != 2)
3572 return 0;
3574 /* We have a "foreground" value and a duplicated "background" value.
3575 If the background might repeat and the last set bit belongs to it,
3576 we might have set bits followed by clear bits followed by set bits. */
3577 if (i > builder.npatterns () && maybe_ne (nelts, builder.full_nelts ()))
3578 return 0;
3580 /* Make sure that the rest are all clear. */
3581 for (; i < nelts; i += elt_size)
3582 if (INTVAL (builder.elt (i)) != 0)
3583 return 0;
3585 return vl;
3588 /* See if there is an svpattern that encodes an SVE predicate of mode
3589 PRED_MODE in which the first VL bits are set and the rest are clear.
3590 Return the pattern if so, otherwise return AARCH64_NUM_SVPATTERNS.
3591 A VL of -1 indicates an all-true vector. */
3593 aarch64_svpattern
3594 aarch64_svpattern_for_vl (machine_mode pred_mode, int vl)
3596 if (vl < 0)
3597 return AARCH64_SV_ALL;
3599 if (maybe_gt (vl, GET_MODE_NUNITS (pred_mode)))
3600 return AARCH64_NUM_SVPATTERNS;
3602 if (vl >= 1 && vl <= 8)
3603 return aarch64_svpattern (AARCH64_SV_VL1 + (vl - 1));
3605 if (vl >= 16 && vl <= 256 && pow2p_hwi (vl))
3606 return aarch64_svpattern (AARCH64_SV_VL16 + (exact_log2 (vl) - 4));
3608 int max_vl;
3609 if (GET_MODE_NUNITS (pred_mode).is_constant (&max_vl))
3611 if (vl == (max_vl / 3) * 3)
3612 return AARCH64_SV_MUL3;
3613 /* These would only trigger for non-power-of-2 lengths. */
3614 if (vl == (max_vl & -4))
3615 return AARCH64_SV_MUL4;
3616 if (vl == (1 << floor_log2 (max_vl)))
3617 return AARCH64_SV_POW2;
3618 if (vl == max_vl)
3619 return AARCH64_SV_ALL;
3621 return AARCH64_NUM_SVPATTERNS;
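/* Examples: a VL of 1..8 maps to AARCH64_SV_VL1..AARCH64_SV_VL8, a VL
   of 32 maps to AARCH64_SV_VL32, and a VL of -1 (all-true) maps to
   AARCH64_SV_ALL.  Something like VL == 12 is only representable when
   the number of elements in PRED_MODE is a compile-time constant and
   happens to match one of the MUL3, MUL4, POW2 or ALL cases above.  */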
3624 /* Return a VNx16BImode constant in which every sequence of ELT_SIZE
3625 bits has the lowest bit set and the upper bits clear. This is the
3626 VNx16BImode equivalent of a PTRUE for controlling elements of
3627 ELT_SIZE bytes. However, because the constant is VNx16BImode,
3628 all bits are significant, even the upper zeros. */
3631 aarch64_ptrue_all (unsigned int elt_size)
3633 rtx_vector_builder builder (VNx16BImode, elt_size, 1);
3634 builder.quick_push (const1_rtx);
3635 for (unsigned int i = 1; i < elt_size; ++i)
3636 builder.quick_push (const0_rtx);
3637 return builder.build ();
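/* For example, aarch64_ptrue_all (2) builds the VNx16BImode constant
   { 1, 0, 1, 0, ... }, which should match the register contents left by
   a PTRUE with a .H size suffix, but with every bit of the VNx16BImode
   value significant.  */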
3640 /* Return an all-true predicate register of mode MODE. */
3643 aarch64_ptrue_reg (machine_mode mode)
3645 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3646 rtx reg = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3647 return gen_lowpart (mode, reg);
3650 /* Return an all-false predicate register of mode MODE. */
3653 aarch64_pfalse_reg (machine_mode mode)
3655 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL);
3656 rtx reg = force_reg (VNx16BImode, CONST0_RTX (VNx16BImode));
3657 return gen_lowpart (mode, reg);
3660 /* Return true if predicate PRED1[0] is true whenever predicate PRED2 is
3661 true, or alternatively if we know that the operation predicated by
3662 PRED1[0] is safe to perform whenever PRED2 is true. PRED1[1] is an
3663 aarch64_sve_gp_strictness operand that describes the operation
3664 predicated by PRED1[0]. */
3666 bool
3667 aarch64_sve_pred_dominates_p (rtx *pred1, rtx pred2)
3669 machine_mode mode = GET_MODE (pred2);
3670 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3671 && mode == GET_MODE (pred1[0])
3672 && aarch64_sve_gp_strictness (pred1[1], SImode));
3673 return (pred1[0] == CONSTM1_RTX (mode)
3674 || INTVAL (pred1[1]) == SVE_RELAXED_GP
3675 || rtx_equal_p (pred1[0], pred2));
3678 /* PRED1[0] is a PTEST predicate and PRED1[1] is an aarch64_sve_ptrue_flag
3679 for it. PRED2[0] is the predicate for the instruction whose result
3680 is tested by the PTEST and PRED2[1] is again an aarch64_sve_ptrue_flag
3681 for it. Return true if we can prove that the two predicates are
3682 equivalent for PTEST purposes; that is, if we can replace PRED2[0]
3683 with PRED1[0] without changing behavior. */
3685 bool
3686 aarch64_sve_same_pred_for_ptest_p (rtx *pred1, rtx *pred2)
3688 machine_mode mode = GET_MODE (pred1[0]);
3689 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
3690 && mode == GET_MODE (pred2[0])
3691 && aarch64_sve_ptrue_flag (pred1[1], SImode)
3692 && aarch64_sve_ptrue_flag (pred2[1], SImode));
3694 bool ptrue1_p = (pred1[0] == CONSTM1_RTX (mode)
3695 || INTVAL (pred1[1]) == SVE_KNOWN_PTRUE);
3696 bool ptrue2_p = (pred2[0] == CONSTM1_RTX (mode)
3697 || INTVAL (pred2[1]) == SVE_KNOWN_PTRUE);
3698 return (ptrue1_p && ptrue2_p) || rtx_equal_p (pred1[0], pred2[0]);
3701 /* Emit a comparison CMP between OP0 and OP1, both of which have mode
3702 DATA_MODE, and return the result in a predicate of mode PRED_MODE.
3703 Use TARGET as the target register if nonnull and convenient. */
3705 static rtx
3706 aarch64_sve_emit_int_cmp (rtx target, machine_mode pred_mode, rtx_code cmp,
3707 machine_mode data_mode, rtx op1, rtx op2)
3709 insn_code icode = code_for_aarch64_pred_cmp (cmp, data_mode);
3710 expand_operand ops[5];
3711 create_output_operand (&ops[0], target, pred_mode);
3712 create_input_operand (&ops[1], CONSTM1_RTX (pred_mode), pred_mode);
3713 create_integer_operand (&ops[2], SVE_KNOWN_PTRUE);
3714 create_input_operand (&ops[3], op1, data_mode);
3715 create_input_operand (&ops[4], op2, data_mode);
3716 expand_insn (icode, 5, ops);
3717 return ops[0].value;
3720 /* Use a comparison to convert integer vector SRC into MODE, which is
3721 the corresponding SVE predicate mode. Use TARGET for the result
3722 if it's nonnull and convenient. */
3725 aarch64_convert_sve_data_to_pred (rtx target, machine_mode mode, rtx src)
3727 machine_mode src_mode = GET_MODE (src);
3728 return aarch64_sve_emit_int_cmp (target, mode, NE, src_mode,
3729 src, CONST0_RTX (src_mode));
3732 /* Return the assembly token for svprfop value PRFOP. */
3734 static const char *
3735 svprfop_token (enum aarch64_svprfop prfop)
3737 switch (prfop)
3739 #define CASE(UPPER, LOWER, VALUE) case AARCH64_SV_##UPPER: return #LOWER;
3740 AARCH64_FOR_SVPRFOP (CASE)
3741 #undef CASE
3742 case AARCH64_NUM_SVPRFOPS:
3743 break;
3745 gcc_unreachable ();
3748 /* Return the assembly string for an SVE prefetch operation with
3749 mnemonic MNEMONIC, given that PRFOP_RTX is the prefetch operation
3750 and that SUFFIX is the format for the remaining operands. */
3752 char *
3753 aarch64_output_sve_prefetch (const char *mnemonic, rtx prfop_rtx,
3754 const char *suffix)
3756 static char buffer[128];
3757 aarch64_svprfop prfop = (aarch64_svprfop) INTVAL (prfop_rtx);
3758 unsigned int written = snprintf (buffer, sizeof (buffer), "%s\t%s, %s",
3759 mnemonic, svprfop_token (prfop), suffix);
3760 gcc_assert (written < sizeof (buffer));
3761 return buffer;
3764 /* Check whether we can calculate the number of elements in PATTERN
3765 at compile time, given that there are NELTS_PER_VQ elements per
3766 128-bit block. Return the value if so, otherwise return -1. */
3768 HOST_WIDE_INT
3769 aarch64_fold_sve_cnt_pat (aarch64_svpattern pattern, unsigned int nelts_per_vq)
3771 unsigned int vl, const_vg;
3772 if (pattern >= AARCH64_SV_VL1 && pattern <= AARCH64_SV_VL8)
3773 vl = 1 + (pattern - AARCH64_SV_VL1);
3774 else if (pattern >= AARCH64_SV_VL16 && pattern <= AARCH64_SV_VL256)
3775 vl = 16 << (pattern - AARCH64_SV_VL16);
3776 else if (aarch64_sve_vg.is_constant (&const_vg))
3778 /* There are two vector granules per quadword. */
3779 unsigned int nelts = (const_vg / 2) * nelts_per_vq;
3780 switch (pattern)
3782 case AARCH64_SV_POW2: return 1 << floor_log2 (nelts);
3783 case AARCH64_SV_MUL4: return nelts & -4;
3784 case AARCH64_SV_MUL3: return (nelts / 3) * 3;
3785 case AARCH64_SV_ALL: return nelts;
3786 default: gcc_unreachable ();
3789 else
3790 return -1;
3792 /* There are two vector granules per quadword. */
3793 poly_uint64 nelts_all = exact_div (aarch64_sve_vg, 2) * nelts_per_vq;
3794 if (known_le (vl, nelts_all))
3795 return vl;
3797 /* Requesting more elements than are available results in a PFALSE. */
3798 if (known_gt (vl, nelts_all))
3799 return 0;
3801 return -1;
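/* As an illustration of the folding in aarch64_fold_sve_cnt_pat above
   (assuming, for concreteness, -msve-vector-bits=256 so that
   aarch64_sve_vg is the constant 4):

     nelts_per_vq == 4 (.S elements) gives nelts == (4 / 2) * 4 == 8, so
       POW2 -> 8, MUL4 -> 8, MUL3 -> 6, ALL -> 8
     AARCH64_SV_VL3  -> 3  (3 <= 8, so the pattern is satisfiable)
     AARCH64_SV_VL16 -> 0  (16 > 8, so the pattern yields PFALSE)

   When the vector length is not known at compile time, only the explicit
   VL patterns that fit in the minimum 128-bit vector can be folded;
   e.g. with nelts_per_vq == 2, VL2 folds to 2 but VL3 returns -1.  */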
3804 /* Return true if we can move VALUE into a register using a single
3805 CNT[BHWD] instruction. */
3807 static bool
3808 aarch64_sve_cnt_immediate_p (poly_int64 value)
3810 HOST_WIDE_INT factor = value.coeffs[0];
3811 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
3812 return (value.coeffs[1] == factor
3813 && IN_RANGE (factor, 2, 16 * 16)
3814 && (factor & 1) == 0
3815 && factor <= 16 * (factor & -factor));
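/* Some examples of the rule above (illustrative only):

     poly_int64 (2, 2)    -> true   (a single CNTD)
     poly_int64 (48, 48)  -> true   (CNTB ..., ALL, MUL #3)
     poly_int64 (17, 17)  -> false  (odd coefficient)
     poly_int64 (34, 34)  -> false  (would need CNTD ..., MUL #17, but
                                     the multiplier is limited to 16)
     poly_int64 (4, 0)    -> false  (not a VL-scaled quantity)  */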
3818 /* Likewise for rtx X. */
3820 bool
3821 aarch64_sve_cnt_immediate_p (rtx x)
3823 poly_int64 value;
3824 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
3827 /* Return the asm string for an instruction with a CNT-like vector size
3828 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3829 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3830 first part of the operands template (the part that comes before the
3831 vector size itself). PATTERN is the pattern to use. FACTOR is the
3832 number of quadwords. NELTS_PER_VQ, if nonzero, is the number of elements
3833 in each quadword. If it is zero, we can use any element size. */
3835 static char *
3836 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3837 aarch64_svpattern pattern,
3838 unsigned int factor,
3839 unsigned int nelts_per_vq)
3841 static char buffer[sizeof ("sqincd\t%x0, %w0, vl256, mul #16")];
3843 if (nelts_per_vq == 0)
3844 /* There is some overlap in the ranges of the four CNT instructions.
3845 Here we always use the smallest possible element size, so that the
3846 multiplier is 1 wherever possible. */
3847 nelts_per_vq = factor & -factor;
3848 int shift = std::min (exact_log2 (nelts_per_vq), 4);
3849 gcc_assert (IN_RANGE (shift, 1, 4));
3850 char suffix = "dwhb"[shift - 1];
3852 factor >>= shift;
3853 unsigned int written;
3854 if (pattern == AARCH64_SV_ALL && factor == 1)
3855 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
3856 prefix, suffix, operands);
3857 else if (factor == 1)
3858 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s",
3859 prefix, suffix, operands, svpattern_token (pattern));
3860 else
3861 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, %s, mul #%d",
3862 prefix, suffix, operands, svpattern_token (pattern),
3863 factor);
3864 gcc_assert (written < sizeof (buffer));
3865 return buffer;
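/* Some example expansions of aarch64_output_sve_cnt_immediate above,
   for illustration only, using PREFIX "cnt" and OPERANDS "%x0":

     pattern ALL, factor 8,  nelts_per_vq 0  ->  "cnth\t%x0"
     pattern ALL, factor 48, nelts_per_vq 0  ->  "cntb\t%x0, all, mul #3"
     pattern VL3, factor 4,  nelts_per_vq 4  ->  "cntw\t%x0, vl3"  */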
3868 /* Return the asm string for an instruction with a CNT-like vector size
3869 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3870 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3871 first part of the operands template (the part that comes before the
3872 vector size itself). X is the value of the vector size operand,
3873 as a polynomial integer rtx; we need to convert this into an "all"
3874 pattern with a multiplier. */
3876 char *
3877 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
3878 rtx x)
3880 poly_int64 value = rtx_to_poly_int64 (x);
3881 gcc_assert (aarch64_sve_cnt_immediate_p (value));
3882 return aarch64_output_sve_cnt_immediate (prefix, operands, AARCH64_SV_ALL,
3883 value.coeffs[1], 0);
3886 /* Return the asm string for an instruction with a CNT-like vector size
3887 operand (a vector pattern followed by a multiplier in the range [1, 16]).
3888 PREFIX is the mnemonic without the size suffix and OPERANDS is the
3889 first part of the operands template (the part that comes before the
3890 vector size itself). CNT_PAT[0..2] are the operands of the
3891 UNSPEC_SVE_CNT_PAT; see aarch64_sve_cnt_pat for details. */
3893 char *
3894 aarch64_output_sve_cnt_pat_immediate (const char *prefix,
3895 const char *operands, rtx *cnt_pat)
3897 aarch64_svpattern pattern = (aarch64_svpattern) INTVAL (cnt_pat[0]);
3898 unsigned int nelts_per_vq = INTVAL (cnt_pat[1]);
3899 unsigned int factor = INTVAL (cnt_pat[2]) * nelts_per_vq;
3900 return aarch64_output_sve_cnt_immediate (prefix, operands, pattern,
3901 factor, nelts_per_vq);
3904 /* Return true if we can add X using a single SVE INC or DEC instruction. */
3906 bool
3907 aarch64_sve_scalar_inc_dec_immediate_p (rtx x)
3909 poly_int64 value;
3910 return (poly_int_rtx_p (x, &value)
3911 && (aarch64_sve_cnt_immediate_p (value)
3912 || aarch64_sve_cnt_immediate_p (-value)));
3915 /* Return the asm string for adding SVE INC/DEC immediate OFFSET to
3916 operand 0. */
3918 char *
3919 aarch64_output_sve_scalar_inc_dec (rtx offset)
3921 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3922 gcc_assert (offset_value.coeffs[0] == offset_value.coeffs[1]);
3923 if (offset_value.coeffs[1] > 0)
3924 return aarch64_output_sve_cnt_immediate ("inc", "%x0", AARCH64_SV_ALL,
3925 offset_value.coeffs[1], 0);
3926 else
3927 return aarch64_output_sve_cnt_immediate ("dec", "%x0", AARCH64_SV_ALL,
3928 -offset_value.coeffs[1], 0);
3931 /* Return true if we can add VALUE to a register using a single ADDVL
3932 or ADDPL instruction. */
3934 static bool
3935 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
3937 HOST_WIDE_INT factor = value.coeffs[0];
3938 if (factor == 0 || value.coeffs[1] != factor)
3939 return false;
3940 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
3941 and a value of 16 is one vector width. */
3942 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
3943 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
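/* In other words, the offset must be a whole number of vector lengths in
   the range [-32, 31] (ADDVL) or a whole number of predicate lengths in
   the same range (ADDPL).  For example (illustrative only):

     poly_int64 (32, 32)  -> true   (ADDVL #2)
     poly_int64 (6, 6)    -> true   (ADDPL #3)
     poly_int64 (66, 66)  -> false  (not a multiple of 16 and outside the
                                     ADDPL range, so a CNT-based sequence
                                     is needed instead)  */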
3946 /* Likewise for rtx X. */
3948 bool
3949 aarch64_sve_addvl_addpl_immediate_p (rtx x)
3951 poly_int64 value;
3952 return (poly_int_rtx_p (x, &value)
3953 && aarch64_sve_addvl_addpl_immediate_p (value));
3956 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET
3957 to operand 1 and storing the result in operand 0. */
3959 char *
3960 aarch64_output_sve_addvl_addpl (rtx offset)
3962 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
3963 poly_int64 offset_value = rtx_to_poly_int64 (offset);
3964 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
3966 int factor = offset_value.coeffs[1];
3967 if ((factor & 15) == 0)
3968 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
3969 else
3970 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
3971 return buffer;
3974 /* Return true if X is a valid immediate for an SVE vector INC or DEC
3975 instruction. If it is, store the number of elements in each vector
3976 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
3977 factor in *FACTOR_OUT (if nonnull). */
3979 bool
3980 aarch64_sve_vector_inc_dec_immediate_p (rtx x, int *factor_out,
3981 unsigned int *nelts_per_vq_out)
3983 rtx elt;
3984 poly_int64 value;
3986 if (!const_vec_duplicate_p (x, &elt)
3987 || !poly_int_rtx_p (elt, &value))
3988 return false;
3990 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
3991 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
3992 /* There's no vector INCB. */
3993 return false;
3995 HOST_WIDE_INT factor = value.coeffs[0];
3996 if (value.coeffs[1] != factor)
3997 return false;
3999 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
4000 if ((factor % nelts_per_vq) != 0
4001 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
4002 return false;
4004 if (factor_out)
4005 *factor_out = factor;
4006 if (nelts_per_vq_out)
4007 *nelts_per_vq_out = nelts_per_vq;
4008 return true;
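/* For example (illustrative only): a VNx4SI constant in which every
   element is poly_int64 (8, 8) has nelts_per_vq == 4 and factor == 8,
   i.e. two full vectors of .S elements, and can be handled by
   "incw ..., all, mul #2".  A VNx16QI constant of poly_int64 (16, 16)
   elements is rejected because there is no .B form of the instruction,
   even though the value itself is exactly one vector of bytes.  */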
4011 /* Return true if X is a valid immediate for an SVE vector INC or DEC
4012 instruction. */
4014 bool
4015 aarch64_sve_vector_inc_dec_immediate_p (rtx x)
4017 return aarch64_sve_vector_inc_dec_immediate_p (x, NULL, NULL);
4020 /* Return the asm template for an SVE vector INC or DEC instruction.
4021 OPERANDS gives the operands before the vector count and X is the
4022 value of the vector count operand itself. */
4024 char *
4025 aarch64_output_sve_vector_inc_dec (const char *operands, rtx x)
4027 int factor;
4028 unsigned int nelts_per_vq;
4029 if (!aarch64_sve_vector_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
4030 gcc_unreachable ();
4031 if (factor < 0)
4032 return aarch64_output_sve_cnt_immediate ("dec", operands, AARCH64_SV_ALL,
4033 -factor, nelts_per_vq);
4034 else
4035 return aarch64_output_sve_cnt_immediate ("inc", operands, AARCH64_SV_ALL,
4036 factor, nelts_per_vq);
4039 static int
4040 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
4041 scalar_int_mode mode)
4043 int i;
4044 unsigned HOST_WIDE_INT val, val2, mask;
4045 int one_match, zero_match;
4046 int num_insns;
4048 val = INTVAL (imm);
4050 if (aarch64_move_imm (val, mode))
4052 if (generate)
4053 emit_insn (gen_rtx_SET (dest, imm));
4054 return 1;
4057 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
4058 (with XXXX non-zero). In that case check to see if the move can be done in
4059 a smaller mode. */
4060 val2 = val & 0xffffffff;
4061 if (mode == DImode
4062 && aarch64_move_imm (val2, SImode)
4063 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
4065 if (generate)
4066 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4068 /* Check if we have to emit a second instruction by checking to see
4069 if any of the upper 32 bits of the original DI mode value is set. */
4070 if (val == val2)
4071 return 1;
4073 i = (val >> 48) ? 48 : 32;
4075 if (generate)
4076 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4077 GEN_INT ((val >> i) & 0xffff)));
4079 return 2;
4082 if ((val >> 32) == 0 || mode == SImode)
4084 if (generate)
4086 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
4087 if (mode == SImode)
4088 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
4089 GEN_INT ((val >> 16) & 0xffff)));
4090 else
4091 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
4092 GEN_INT ((val >> 16) & 0xffff)));
4094 return 2;
4097 /* Remaining cases are all for DImode. */
4099 mask = 0xffff;
4100 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
4101 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
4102 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
4103 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
4105 if (zero_match != 2 && one_match != 2)
4107 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
4108 For a 64-bit bitmask try whether changing 16 bits to all ones or
4109 zeroes creates a valid bitmask. To check any repeated bitmask,
4110 try using 16 bits from the other 32-bit half of val. */
4112 for (i = 0; i < 64; i += 16, mask <<= 16)
4114 val2 = val & ~mask;
4115 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4116 break;
4117 val2 = val | mask;
4118 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4119 break;
4120 val2 = val2 & ~mask;
4121 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
4122 if (val2 != val && aarch64_bitmask_imm (val2, mode))
4123 break;
4125 if (i != 64)
4127 if (generate)
4129 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
4130 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4131 GEN_INT ((val >> i) & 0xffff)));
4133 return 2;
4137 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
4138 are emitted by the initial mov. If one_match > zero_match, skip set bits,
4139 otherwise skip zero bits. */
4141 num_insns = 1;
4142 mask = 0xffff;
4143 val2 = one_match > zero_match ? ~val : val;
4144 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
4146 if (generate)
4147 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
4148 ? (val | ~(mask << i))
4149 : (val & (mask << i)))));
4150 for (i += 16; i < 64; i += 16)
4152 if ((val2 & (mask << i)) == 0)
4153 continue;
4154 if (generate)
4155 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
4156 GEN_INT ((val >> i) & 0xffff)));
4157 num_insns ++;
4160 return num_insns;
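/* Two worked examples of aarch64_internal_mov_immediate above
   (illustrative only):

     0x0000cafe00001234: the low 32 bits are a valid 32-bit immediate
     and bits [63:48] are zero, so this becomes

       mov  dest, #0x1234
       movk dest, #0xcafe, lsl #32        (2 instructions)

     0x123456789abcdef0: no 16-bit chunk is 0x0000 or 0xffff and no
     bitmask trick applies, so this needs the full sequence

       mov  dest, #0xdef0
       movk dest, #0x9abc, lsl #16
       movk dest, #0x5678, lsl #32
       movk dest, #0x1234, lsl #48        (4 instructions)  */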
4163 /* Return whether imm is a 128-bit immediate which is simple enough to
4164 expand inline. */
4165 bool
4166 aarch64_mov128_immediate (rtx imm)
4168 if (GET_CODE (imm) == CONST_INT)
4169 return true;
4171 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
4173 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
4174 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
4176 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
4177 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
4181 /* Return the number of temporary registers that aarch64_add_offset_1
4182 would need to add OFFSET to a register. */
4184 static unsigned int
4185 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
4187 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
4190 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
4191 a non-polynomial OFFSET. MODE is the mode of the addition.
4192 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4193 be set and CFA adjustments added to the generated instructions.
4195 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4196 temporary if register allocation is already complete. This temporary
4197 register may overlap DEST but must not overlap SRC. If TEMP1 is known
4198 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
4199 the immediate again.
4201 Since this function may be used to adjust the stack pointer, we must
4202 ensure that it cannot cause transient stack deallocation (for example
4203 by first incrementing SP and then decrementing when adjusting by a
4204 large immediate). */
4206 static void
4207 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
4208 rtx src, HOST_WIDE_INT offset, rtx temp1,
4209 bool frame_related_p, bool emit_move_imm)
4211 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4212 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4214 unsigned HOST_WIDE_INT moffset = absu_hwi (offset);
4215 rtx_insn *insn;
4217 if (!moffset)
4219 if (!rtx_equal_p (dest, src))
4221 insn = emit_insn (gen_rtx_SET (dest, src));
4222 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4224 return;
4227 /* Single instruction adjustment. */
4228 if (aarch64_uimm12_shift (moffset))
4230 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
4231 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4232 return;
4235 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
4236 and either:
4238 a) the offset cannot be loaded by a 16-bit move or
4239 b) there is no spare register into which we can move it. */
4240 if (moffset < 0x1000000
4241 && ((!temp1 && !can_create_pseudo_p ())
4242 || !aarch64_move_imm (moffset, mode)))
4244 HOST_WIDE_INT low_off = moffset & 0xfff;
4246 low_off = offset < 0 ? -low_off : low_off;
4247 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
4248 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4249 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
4250 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4251 return;
4254 /* Emit a move immediate if required and an addition/subtraction. */
4255 if (emit_move_imm)
4257 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
4258 temp1 = aarch64_force_temporary (mode, temp1,
4259 gen_int_mode (moffset, mode));
4261 insn = emit_insn (offset < 0
4262 ? gen_sub3_insn (dest, src, temp1)
4263 : gen_add3_insn (dest, src, temp1));
4264 if (frame_related_p)
4266 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4267 rtx adj = plus_constant (mode, src, offset);
4268 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
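/* For example (illustrative only): adding 0x123456 cannot be done with a
   single ADD or a single MOV immediate, but fits the two-addition case
   above:

     add  dest, src, #0x456
     add  dest, dest, #0x123000

   whereas adding 0x1234567 (>= 0x1000000) needs a temporary:

     mov  temp1, #0x4567
     movk temp1, #0x123, lsl #16
     add  dest, src, temp1  */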
4272 /* Return the number of temporary registers that aarch64_add_offset
4273 would need to move OFFSET into a register or add OFFSET to a register;
4274 ADD_P is true if we want the latter rather than the former. */
4276 static unsigned int
4277 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
4279 /* This follows the same structure as aarch64_add_offset. */
4280 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
4281 return 0;
4283 unsigned int count = 0;
4284 HOST_WIDE_INT factor = offset.coeffs[1];
4285 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4286 poly_int64 poly_offset (factor, factor);
4287 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4288 /* Need one register for the ADDVL/ADDPL result. */
4289 count += 1;
4290 else if (factor != 0)
4292 factor = abs (factor);
4293 if (factor > 16 * (factor & -factor))
4294 /* Need one register for the CNT result and one for the multiplication
4295 factor. If necessary, the second temporary can be reused for the
4296 constant part of the offset. */
4297 return 2;
4298 /* Need one register for the CNT result (which might then
4299 be shifted). */
4300 count += 1;
4302 return count + aarch64_add_offset_1_temporaries (constant);
4305 /* If X can be represented as a poly_int64, return the number
4306 of temporaries that are required to add it to a register.
4307 Return -1 otherwise. */
4309 int
4310 aarch64_add_offset_temporaries (rtx x)
4312 poly_int64 offset;
4313 if (!poly_int_rtx_p (x, &offset))
4314 return -1;
4315 return aarch64_offset_temporaries (true, offset);
4318 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
4319 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
4320 be set and CFA adjustments added to the generated instructions.
4322 TEMP1, if nonnull, is a register of mode MODE that can be used as a
4323 temporary if register allocation is already complete. This temporary
4324 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
4325 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
4326 false to avoid emitting the immediate again.
4328 TEMP2, if nonnull, is a second temporary register that doesn't
4329 overlap either DEST or SRC.
4331 Since this function may be used to adjust the stack pointer, we must
4332 ensure that it cannot cause transient stack deallocation (for example
4333 by first incrementing SP and then decrementing when adjusting by a
4334 large immediate). */
4336 static void
4337 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4338 poly_int64 offset, rtx temp1, rtx temp2,
4339 bool frame_related_p, bool emit_move_imm = true)
4341 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
4342 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
4343 gcc_assert (temp1 == NULL_RTX
4344 || !frame_related_p
4345 || !reg_overlap_mentioned_p (temp1, dest));
4346 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
4348 /* Try using ADDVL or ADDPL to add the whole value. */
4349 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
4351 rtx offset_rtx = gen_int_mode (offset, mode);
4352 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4353 RTX_FRAME_RELATED_P (insn) = frame_related_p;
4354 return;
4357 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
4358 SVE vector register, over and above the minimum size of 128 bits.
4359 This is equivalent to half the value returned by CNTD with a
4360 vector shape of ALL. */
4361 HOST_WIDE_INT factor = offset.coeffs[1];
4362 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
4364 /* Try using ADDVL or ADDPL to add the VG-based part. */
4365 poly_int64 poly_offset (factor, factor);
4366 if (src != const0_rtx
4367 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
4369 rtx offset_rtx = gen_int_mode (poly_offset, mode);
4370 if (frame_related_p)
4372 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
4373 RTX_FRAME_RELATED_P (insn) = true;
4374 src = dest;
4376 else
4378 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
4379 src = aarch64_force_temporary (mode, temp1, addr);
4380 temp1 = temp2;
4381 temp2 = NULL_RTX;
4384 /* Otherwise use a CNT-based sequence. */
4385 else if (factor != 0)
4387 /* Use a subtraction if we have a negative factor. */
4388 rtx_code code = PLUS;
4389 if (factor < 0)
4391 factor = -factor;
4392 code = MINUS;
4395 /* Calculate CNTD * FACTOR / 2. First try to fold the division
4396 into the multiplication. */
4397 rtx val;
4398 int shift = 0;
4399 if (factor & 1)
4400 /* Use a right shift by 1. */
4401 shift = -1;
4402 else
4403 factor /= 2;
4404 HOST_WIDE_INT low_bit = factor & -factor;
4405 if (factor <= 16 * low_bit)
4407 if (factor > 16 * 8)
4409 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
4410 the value with the minimum multiplier and shift it into
4411 position. */
4412 int extra_shift = exact_log2 (low_bit);
4413 shift += extra_shift;
4414 factor >>= extra_shift;
4416 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
4418 else
4420 /* Base the factor on LOW_BIT if we can calculate LOW_BIT
4421 directly, since that should increase the chances of being
4422 able to use a shift and add sequence. If LOW_BIT itself
4423 is out of range, just use CNTD. */
4424 if (low_bit <= 16 * 8)
4425 factor /= low_bit;
4426 else
4427 low_bit = 1;
4429 val = gen_int_mode (poly_int64 (low_bit * 2, low_bit * 2), mode);
4430 val = aarch64_force_temporary (mode, temp1, val);
4432 if (can_create_pseudo_p ())
4434 rtx coeff1 = gen_int_mode (factor, mode);
4435 val = expand_mult (mode, val, coeff1, NULL_RTX, false, true);
4437 else
4439 /* Go back to using a negative multiplication factor if we have
4440 no register from which to subtract. */
4441 if (code == MINUS && src == const0_rtx)
4443 factor = -factor;
4444 code = PLUS;
4446 rtx coeff1 = gen_int_mode (factor, mode);
4447 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
4448 val = gen_rtx_MULT (mode, val, coeff1);
4452 if (shift > 0)
4454 /* Multiply by 1 << SHIFT. */
4455 val = aarch64_force_temporary (mode, temp1, val);
4456 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
4458 else if (shift == -1)
4460 /* Divide by 2. */
4461 val = aarch64_force_temporary (mode, temp1, val);
4462 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
4465 /* Calculate SRC +/- CNTD * FACTOR / 2. */
4466 if (src != const0_rtx)
4468 val = aarch64_force_temporary (mode, temp1, val);
4469 val = gen_rtx_fmt_ee (code, mode, src, val);
4471 else if (code == MINUS)
4473 val = aarch64_force_temporary (mode, temp1, val);
4474 val = gen_rtx_NEG (mode, val);
4477 if (constant == 0 || frame_related_p)
4479 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
4480 if (frame_related_p)
4482 RTX_FRAME_RELATED_P (insn) = true;
4483 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4484 gen_rtx_SET (dest, plus_constant (Pmode, src,
4485 poly_offset)));
4487 src = dest;
4488 if (constant == 0)
4489 return;
4491 else
4493 src = aarch64_force_temporary (mode, temp1, val);
4494 temp1 = temp2;
4495 temp2 = NULL_RTX;
4498 emit_move_imm = true;
4501 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
4502 frame_related_p, emit_move_imm);
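/* A worked example of the decomposition above (illustrative only, and
   glossing over the exact choice of temporaries): adding "two vector
   lengths plus 4 bytes", i.e. poly_int64 (36, 32), gives factor == 32
   and constant == 4, so for a nonzero SRC this becomes roughly:

     addvl dest, src, #2
     add   dest, dest, #4

   If the VG-based part is not an ADDVL/ADDPL immediate (say factor == 200),
   it is instead built from a CNT instruction, scaled by a multiplication
   and/or shift, before the constant part is added.  */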
4505 /* Like aarch64_add_offset, but the offset is given as an rtx rather
4506 than a poly_int64. */
4508 void
4509 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
4510 rtx offset_rtx, rtx temp1, rtx temp2)
4512 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
4513 temp1, temp2, false);
4516 /* Add DELTA to the stack pointer, marking the instructions frame-related.
4517 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
4518 if TEMP1 already contains abs (DELTA). */
4520 static inline void
4521 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
4523 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
4524 temp1, temp2, true, emit_move_imm);
4527 /* Subtract DELTA from the stack pointer, marking the instructions
4528 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
4529 if nonnull. */
4531 static inline void
4532 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p,
4533 bool emit_move_imm = true)
4535 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
4536 temp1, temp2, frame_related_p, emit_move_imm);
4539 /* Set DEST to (vec_series BASE STEP). */
4541 static void
4542 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
4544 machine_mode mode = GET_MODE (dest);
4545 scalar_mode inner = GET_MODE_INNER (mode);
4547 /* Each operand can be a register or an immediate in the range [-16, 15]. */
4548 if (!aarch64_sve_index_immediate_p (base))
4549 base = force_reg (inner, base);
4550 if (!aarch64_sve_index_immediate_p (step))
4551 step = force_reg (inner, step);
4553 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
4556 /* Duplicate 128-bit Advanced SIMD vector SRC so that it fills an SVE
4557 register of mode MODE. Use TARGET for the result if it's nonnull
4558 and convenient.
4560 The two vector modes must have the same element mode. The behavior
4561 is to duplicate architectural lane N of SRC into architectural lanes
4562 N + I * STEP of the result. On big-endian targets, architectural
4563 lane 0 of an Advanced SIMD vector is the last element of the vector
4564 in memory layout, so for big-endian targets this operation has the
4565 effect of reversing SRC before duplicating it. Callers need to
4566 account for this. */
4568 rtx
4569 aarch64_expand_sve_dupq (rtx target, machine_mode mode, rtx src)
4571 machine_mode src_mode = GET_MODE (src);
4572 gcc_assert (GET_MODE_INNER (mode) == GET_MODE_INNER (src_mode));
4573 insn_code icode = (BYTES_BIG_ENDIAN
4574 ? code_for_aarch64_vec_duplicate_vq_be (mode)
4575 : code_for_aarch64_vec_duplicate_vq_le (mode));
4577 unsigned int i = 0;
4578 expand_operand ops[3];
4579 create_output_operand (&ops[i++], target, mode);
4580 create_input_operand (&ops[i++], src, src_mode);
4581 if (BYTES_BIG_ENDIAN)
4583 /* Create a PARALLEL describing the reversal of SRC. */
4584 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (mode);
4585 rtx sel = aarch64_gen_stepped_int_parallel (nelts_per_vq,
4586 nelts_per_vq - 1, -1);
4587 create_fixed_operand (&ops[i++], sel);
4589 expand_insn (icode, i, ops);
4590 return ops[0].value;
4593 /* Try to force 128-bit vector value SRC into memory and use LD1RQ to fetch
4594 the memory image into DEST. Return true on success. */
4596 static bool
4597 aarch64_expand_sve_ld1rq (rtx dest, rtx src)
4599 src = force_const_mem (GET_MODE (src), src);
4600 if (!src)
4601 return false;
4603 /* Make sure that the address is legitimate. */
4604 if (!aarch64_sve_ld1rq_operand_p (src))
4606 rtx addr = force_reg (Pmode, XEXP (src, 0));
4607 src = replace_equiv_address (src, addr);
4610 machine_mode mode = GET_MODE (dest);
4611 machine_mode pred_mode = aarch64_sve_pred_mode (mode);
4612 rtx ptrue = aarch64_ptrue_reg (pred_mode);
4613 emit_insn (gen_aarch64_sve_ld1rq (mode, dest, src, ptrue));
4614 return true;
4617 /* Return a register containing CONST_VECTOR SRC, given that SRC has an
4618 SVE data mode and isn't a legitimate constant. Use TARGET for the
4619 result if convenient.
4621 The returned register can have whatever mode seems most natural
4622 given the contents of SRC. */
4624 static rtx
4625 aarch64_expand_sve_const_vector (rtx target, rtx src)
4627 machine_mode mode = GET_MODE (src);
4628 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
4629 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
4630 scalar_mode elt_mode = GET_MODE_INNER (mode);
4631 unsigned int elt_bits = GET_MODE_BITSIZE (elt_mode);
4632 unsigned int container_bits = aarch64_sve_container_bits (mode);
4633 unsigned int encoded_bits = npatterns * nelts_per_pattern * container_bits;
4635 if (nelts_per_pattern == 1
4636 && encoded_bits <= 128
4637 && container_bits != elt_bits)
4639 /* We have a partial vector mode and a constant whose full-vector
4640 equivalent would occupy a repeating 128-bit sequence. Build that
4641 full-vector equivalent instead, so that we have the option of
4642 using LD1RQ and Advanced SIMD operations. */
4643 unsigned int repeat = container_bits / elt_bits;
4644 machine_mode full_mode = aarch64_full_sve_mode (elt_mode).require ();
4645 rtx_vector_builder builder (full_mode, npatterns * repeat, 1);
4646 for (unsigned int i = 0; i < npatterns; ++i)
4647 for (unsigned int j = 0; j < repeat; ++j)
4648 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, i));
4649 target = aarch64_target_reg (target, full_mode);
4650 return aarch64_expand_sve_const_vector (target, builder.build ());
4653 if (nelts_per_pattern == 1 && encoded_bits == 128)
4655 /* The constant is a duplicated quadword but can't be narrowed
4656 beyond a quadword. Get the memory image of the first quadword
4657 as a 128-bit vector and try using LD1RQ to load it from memory.
4659 The effect for both endiannesses is to load memory lane N into
4660 architectural lanes N + I * STEP of the result. On big-endian
4661 targets, the layout of the 128-bit vector in an Advanced SIMD
4662 register would be different from its layout in an SVE register,
4663 but this 128-bit vector is a memory value only. */
4664 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4665 rtx vq_value = simplify_gen_subreg (vq_mode, src, mode, 0);
4666 if (vq_value && aarch64_expand_sve_ld1rq (target, vq_value))
4667 return target;
4670 if (nelts_per_pattern == 1 && encoded_bits < 128)
4672 /* The vector is a repeating sequence of 64 bits or fewer.
4673 See if we can load them using an Advanced SIMD move and then
4674 duplicate it to fill a vector. This is better than using a GPR
4675 move because it keeps everything in the same register file. */
4676 machine_mode vq_mode = aarch64_vq_mode (elt_mode).require ();
4677 rtx_vector_builder builder (vq_mode, npatterns, 1);
4678 for (unsigned int i = 0; i < npatterns; ++i)
4680 /* We want memory lane N to go into architectural lane N,
4681 so reverse for big-endian targets. The DUP .Q pattern
4682 has a compensating reverse built-in. */
4683 unsigned int srci = BYTES_BIG_ENDIAN ? npatterns - i - 1 : i;
4684 builder.quick_push (CONST_VECTOR_ENCODED_ELT (src, srci));
4686 rtx vq_src = builder.build ();
4687 if (aarch64_simd_valid_immediate (vq_src, NULL))
4689 vq_src = force_reg (vq_mode, vq_src);
4690 return aarch64_expand_sve_dupq (target, mode, vq_src);
4693 /* Get an integer representation of the repeating part of Advanced
4694 SIMD vector VQ_SRC. This preserves the endianness of VQ_SRC,
4695 which for big-endian targets is lane-swapped wrt a normal
4696 Advanced SIMD vector. This means that for both endiannesses,
4697 memory lane N of SVE vector SRC corresponds to architectural
4698 lane N of a register holding VQ_SRC. This in turn means that
4699 memory lane 0 of SVE vector SRC is in the lsb of VQ_SRC (viewed
4700 as a single 128-bit value) and thus that memory lane 0 of SRC is
4701 in the lsb of the integer. Duplicating the integer therefore
4702 ensures that memory lane N of SRC goes into architectural lane
4703 N + I * STEP of the SVE register. */
4704 scalar_mode int_mode = int_mode_for_size (encoded_bits, 0).require ();
4705 rtx elt_value = simplify_gen_subreg (int_mode, vq_src, vq_mode, 0);
4706 if (elt_value)
4708 /* Pretend that we had a vector of INT_MODE to start with. */
4709 elt_mode = int_mode;
4710 mode = aarch64_full_sve_mode (int_mode).require ();
4712 /* If the integer can be moved into a general register by a
4713 single instruction, do that and duplicate the result. */
4714 if (CONST_INT_P (elt_value)
4715 && aarch64_move_imm (INTVAL (elt_value), elt_mode))
4717 elt_value = force_reg (elt_mode, elt_value);
4718 return expand_vector_broadcast (mode, elt_value);
4721 else if (npatterns == 1)
4722 /* We're duplicating a single value, but can't do better than
4723 force it to memory and load from there. This handles things
4724 like symbolic constants. */
4725 elt_value = CONST_VECTOR_ENCODED_ELT (src, 0);
4727 if (elt_value)
4729 /* Load the element from memory if we can, otherwise move it into
4730 a register and use a DUP. */
4731 rtx op = force_const_mem (elt_mode, elt_value);
4732 if (!op)
4733 op = force_reg (elt_mode, elt_value);
4734 return expand_vector_broadcast (mode, op);
4738 /* Try using INDEX. */
4739 rtx base, step;
4740 if (const_vec_series_p (src, &base, &step))
4742 aarch64_expand_vec_series (target, base, step);
4743 return target;
4746 /* From here on, it's better to force the whole constant to memory
4747 if we can. */
4748 if (GET_MODE_NUNITS (mode).is_constant ())
4749 return NULL_RTX;
4751 /* Expand each pattern individually. */
4752 gcc_assert (npatterns > 1);
4753 rtx_vector_builder builder;
4754 auto_vec<rtx, 16> vectors (npatterns);
4755 for (unsigned int i = 0; i < npatterns; ++i)
4757 builder.new_vector (mode, 1, nelts_per_pattern);
4758 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
4759 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
4760 vectors.quick_push (force_reg (mode, builder.build ()));
4763 /* Use permutes to interleave the separate vectors. */
4764 while (npatterns > 1)
4766 npatterns /= 2;
4767 for (unsigned int i = 0; i < npatterns; ++i)
4769 rtx tmp = (npatterns == 1 ? target : gen_reg_rtx (mode));
4770 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
4771 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
4772 vectors[i] = tmp;
4775 gcc_assert (vectors[0] == target);
4776 return target;
4779 /* Use WHILE to set a predicate register of mode MODE in which the first
4780 VL bits are set and the rest are clear. Use TARGET for the register
4781 if it's nonnull and convenient. */
4783 static rtx
4784 aarch64_sve_move_pred_via_while (rtx target, machine_mode mode,
4785 unsigned int vl)
4787 rtx limit = force_reg (DImode, gen_int_mode (vl, DImode));
4788 target = aarch64_target_reg (target, mode);
4789 emit_insn (gen_while (UNSPEC_WHILELO, DImode, mode,
4790 target, const0_rtx, limit));
4791 return target;
4794 static rtx
4795 aarch64_expand_sve_const_pred_1 (rtx, rtx_vector_builder &, bool);
4797 /* BUILDER is a constant predicate in which the index of every set bit
4798 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4799 by inverting every element at a multiple of ELT_SIZE and EORing the
4800 result with an ELT_SIZE PTRUE.
4802 Return a register that contains the constant on success, otherwise
4803 return null. Use TARGET as the register if it is nonnull and
4804 convenient. */
4806 static rtx
4807 aarch64_expand_sve_const_pred_eor (rtx target, rtx_vector_builder &builder,
4808 unsigned int elt_size)
4810 /* Invert every element at a multiple of ELT_SIZE, keeping the
4811 other bits zero. */
4812 rtx_vector_builder inv_builder (VNx16BImode, builder.npatterns (),
4813 builder.nelts_per_pattern ());
4814 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
4815 if ((i & (elt_size - 1)) == 0 && INTVAL (builder.elt (i)) == 0)
4816 inv_builder.quick_push (const1_rtx);
4817 else
4818 inv_builder.quick_push (const0_rtx);
4819 inv_builder.finalize ();
4821 /* See if we can load the constant cheaply. */
4822 rtx inv = aarch64_expand_sve_const_pred_1 (NULL_RTX, inv_builder, false);
4823 if (!inv)
4824 return NULL_RTX;
4826 /* EOR the result with an ELT_SIZE PTRUE. */
4827 rtx mask = aarch64_ptrue_all (elt_size);
4828 mask = force_reg (VNx16BImode, mask);
4829 inv = gen_lowpart (VNx16BImode, inv);
4830 target = aarch64_target_reg (target, VNx16BImode);
4831 emit_insn (gen_aarch64_pred_z (XOR, VNx16BImode, target, mask, inv, mask));
4832 return target;
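/* For example (illustrative only): a .H predicate in which every element
   except the first is true has as its inverse { 1, 0, 0, ... }, which is
   just a VL1 constant.  The function can therefore emit something like:

     ptrue  pA.d, vl1                  ; the inverted constant
     ptrue  pB.h, all                  ; the ELT_SIZE PTRUE
     eor    pD.b, pB/z, pA.b, pB.b     ; the required predicate

   where pA, pB and pD stand for whatever predicate registers are chosen.  */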
4835 /* BUILDER is a constant predicate in which the index of every set bit
4836 is a multiple of ELT_SIZE (which is <= 8). Try to load the constant
4837 using a TRN1 of size PERMUTE_SIZE, which is >= ELT_SIZE. Return the
4838 register on success, otherwise return null. Use TARGET as the register
4839 if nonnull and convenient. */
4841 static rtx
4842 aarch64_expand_sve_const_pred_trn (rtx target, rtx_vector_builder &builder,
4843 unsigned int elt_size,
4844 unsigned int permute_size)
4846 /* We're going to split the constant into two new constants A and B,
4847 with element I of BUILDER going into A if (I & PERMUTE_SIZE) == 0
4848 and into B otherwise. E.g. for PERMUTE_SIZE == 4 && ELT_SIZE == 1:
4850 A: { 0, 1, 2, 3, _, _, _, _, 8, 9, 10, 11, _, _, _, _ }
4851 B: { 4, 5, 6, 7, _, _, _, _, 12, 13, 14, 15, _, _, _, _ }
4853 where _ indicates elements that will be discarded by the permute.
4855 First calculate the ELT_SIZEs for A and B. */
4856 unsigned int a_elt_size = GET_MODE_SIZE (DImode);
4857 unsigned int b_elt_size = GET_MODE_SIZE (DImode);
4858 for (unsigned int i = 0; i < builder.encoded_nelts (); i += elt_size)
4859 if (INTVAL (builder.elt (i)) != 0)
4861 if (i & permute_size)
4862 b_elt_size |= i - permute_size;
4863 else
4864 a_elt_size |= i;
4866 a_elt_size &= -a_elt_size;
4867 b_elt_size &= -b_elt_size;
4869 /* Now construct the vectors themselves. */
4870 rtx_vector_builder a_builder (VNx16BImode, builder.npatterns (),
4871 builder.nelts_per_pattern ());
4872 rtx_vector_builder b_builder (VNx16BImode, builder.npatterns (),
4873 builder.nelts_per_pattern ());
4874 unsigned int nelts = builder.encoded_nelts ();
4875 for (unsigned int i = 0; i < nelts; ++i)
4876 if (i & (elt_size - 1))
4878 a_builder.quick_push (const0_rtx);
4879 b_builder.quick_push (const0_rtx);
4881 else if ((i & permute_size) == 0)
4883 /* The A and B elements are significant. */
4884 a_builder.quick_push (builder.elt (i));
4885 b_builder.quick_push (builder.elt (i + permute_size));
4887 else
4889 /* The A and B elements are going to be discarded, so pick whatever
4890 is likely to give a nice constant. We are targeting element
4891 sizes A_ELT_SIZE and B_ELT_SIZE for A and B respectively,
4892 with the aim of each being a sequence of ones followed by
4893 a sequence of zeros. So:
4895 * if X_ELT_SIZE <= PERMUTE_SIZE, the best approach is to
4896 duplicate the last X_ELT_SIZE element, to extend the
4897 current sequence of ones or zeros.
4899 * if X_ELT_SIZE > PERMUTE_SIZE, the best approach is to add a
4900 zero, so that the constant really does have X_ELT_SIZE and
4901 not a smaller size. */
4902 if (a_elt_size > permute_size)
4903 a_builder.quick_push (const0_rtx);
4904 else
4905 a_builder.quick_push (a_builder.elt (i - a_elt_size));
4906 if (b_elt_size > permute_size)
4907 b_builder.quick_push (const0_rtx);
4908 else
4909 b_builder.quick_push (b_builder.elt (i - b_elt_size));
4911 a_builder.finalize ();
4912 b_builder.finalize ();
4914 /* Try loading A into a register. */
4915 rtx_insn *last = get_last_insn ();
4916 rtx a = aarch64_expand_sve_const_pred_1 (NULL_RTX, a_builder, false);
4917 if (!a)
4918 return NULL_RTX;
4920 /* Try loading B into a register. */
4921 rtx b = a;
4922 if (a_builder != b_builder)
4924 b = aarch64_expand_sve_const_pred_1 (NULL_RTX, b_builder, false);
4925 if (!b)
4927 delete_insns_since (last);
4928 return NULL_RTX;
4932 /* Emit the TRN1 itself. */
4933 machine_mode mode = aarch64_sve_pred_mode (permute_size).require ();
4934 target = aarch64_target_reg (target, mode);
4935 emit_insn (gen_aarch64_sve (UNSPEC_TRN1, mode, target,
4936 gen_lowpart (mode, a),
4937 gen_lowpart (mode, b)));
4938 return target;
4941 /* Subroutine of aarch64_expand_sve_const_pred. Try to load the VNx16BI
4942 constant in BUILDER into an SVE predicate register. Return the register
4943 on success, otherwise return null. Use TARGET for the register if
4944 nonnull and convenient.
4946 ALLOW_RECURSE_P is true if we can use methods that would call this
4947 function recursively. */
4949 static rtx
4950 aarch64_expand_sve_const_pred_1 (rtx target, rtx_vector_builder &builder,
4951 bool allow_recurse_p)
4953 if (builder.encoded_nelts () == 1)
4954 /* A PFALSE or a PTRUE .B ALL. */
4955 return aarch64_emit_set_immediate (target, builder);
4957 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
4958 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
4960 /* If we can load the constant using PTRUE, use it as-is. */
4961 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
4962 if (aarch64_svpattern_for_vl (mode, vl) != AARCH64_NUM_SVPATTERNS)
4963 return aarch64_emit_set_immediate (target, builder);
4965 /* Otherwise use WHILE to set the first VL bits. */
4966 return aarch64_sve_move_pred_via_while (target, mode, vl);
4969 if (!allow_recurse_p)
4970 return NULL_RTX;
4972 /* Try inverting the vector in element size ELT_SIZE and then EORing
4973 the result with an ELT_SIZE PTRUE. */
4974 if (INTVAL (builder.elt (0)) == 0)
4975 if (rtx res = aarch64_expand_sve_const_pred_eor (target, builder,
4976 elt_size))
4977 return res;
4979 /* Try using TRN1 to permute two simpler constants. */
4980 for (unsigned int i = elt_size; i <= 8; i *= 2)
4981 if (rtx res = aarch64_expand_sve_const_pred_trn (target, builder,
4982 elt_size, i))
4983 return res;
4985 return NULL_RTX;
4988 /* Return an SVE predicate register that contains the VNx16BImode
4989 constant in BUILDER, without going through the move expanders.
4991 The returned register can have whatever mode seems most natural
4992 given the contents of BUILDER. Use TARGET for the result if
4993 convenient. */
4995 static rtx
4996 aarch64_expand_sve_const_pred (rtx target, rtx_vector_builder &builder)
4998 /* Try loading the constant using pure predicate operations. */
4999 if (rtx res = aarch64_expand_sve_const_pred_1 (target, builder, true))
5000 return res;
5002 /* Try forcing the constant to memory. */
5003 if (builder.full_nelts ().is_constant ())
5004 if (rtx mem = force_const_mem (VNx16BImode, builder.build ()))
5006 target = aarch64_target_reg (target, VNx16BImode);
5007 emit_move_insn (target, mem);
5008 return target;
5011 /* The last resort is to load the constant as an integer and then
5012 compare it against zero. Use -1 for set bits in order to increase
5013 the chances of using SVE DUPM or an Advanced SIMD byte mask. */
5014 rtx_vector_builder int_builder (VNx16QImode, builder.npatterns (),
5015 builder.nelts_per_pattern ());
5016 for (unsigned int i = 0; i < builder.encoded_nelts (); ++i)
5017 int_builder.quick_push (INTVAL (builder.elt (i))
5018 ? constm1_rtx : const0_rtx);
5019 return aarch64_convert_sve_data_to_pred (target, VNx16BImode,
5020 int_builder.build ());
5023 /* Set DEST to immediate IMM. */
5025 void
5026 aarch64_expand_mov_immediate (rtx dest, rtx imm)
5028 machine_mode mode = GET_MODE (dest);
5030 /* Check on what type of symbol it is. */
5031 scalar_int_mode int_mode;
5032 if ((GET_CODE (imm) == SYMBOL_REF
5033 || GET_CODE (imm) == LABEL_REF
5034 || GET_CODE (imm) == CONST
5035 || GET_CODE (imm) == CONST_POLY_INT)
5036 && is_a <scalar_int_mode> (mode, &int_mode))
5038 rtx mem;
5039 poly_int64 offset;
5040 HOST_WIDE_INT const_offset;
5041 enum aarch64_symbol_type sty;
5043 /* If we have (const (plus symbol offset)), separate out the offset
5044 before we start classifying the symbol. */
5045 rtx base = strip_offset (imm, &offset);
5047 /* We must always add an offset involving VL separately, rather than
5048 folding it into the relocation. */
5049 if (!offset.is_constant (&const_offset))
5051 if (!TARGET_SVE)
5053 aarch64_report_sve_required ();
5054 return;
5056 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
5057 emit_insn (gen_rtx_SET (dest, imm));
5058 else
5060 /* Do arithmetic on 32-bit values if the result is smaller
5061 than that. */
5062 if (partial_subreg_p (int_mode, SImode))
5064 /* It is invalid to do symbol calculations in modes
5065 narrower than SImode. */
5066 gcc_assert (base == const0_rtx);
5067 dest = gen_lowpart (SImode, dest);
5068 int_mode = SImode;
5070 if (base != const0_rtx)
5072 base = aarch64_force_temporary (int_mode, dest, base);
5073 aarch64_add_offset (int_mode, dest, base, offset,
5074 NULL_RTX, NULL_RTX, false);
5076 else
5077 aarch64_add_offset (int_mode, dest, base, offset,
5078 dest, NULL_RTX, false);
5080 return;
5083 sty = aarch64_classify_symbol (base, const_offset);
5084 switch (sty)
5086 case SYMBOL_FORCE_TO_MEM:
5087 if (const_offset != 0
5088 && targetm.cannot_force_const_mem (int_mode, imm))
5090 gcc_assert (can_create_pseudo_p ());
5091 base = aarch64_force_temporary (int_mode, dest, base);
5092 aarch64_add_offset (int_mode, dest, base, const_offset,
5093 NULL_RTX, NULL_RTX, false);
5094 return;
5097 mem = force_const_mem (ptr_mode, imm);
5098 gcc_assert (mem);
5100 /* If we aren't generating PC relative literals, then
5101 we need to expand the literal pool access carefully.
5102 This is something that needs to be done in a number
5103 of places, so could well live as a separate function. */
5104 if (!aarch64_pcrelative_literal_loads)
5106 gcc_assert (can_create_pseudo_p ());
5107 base = gen_reg_rtx (ptr_mode);
5108 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
5109 if (ptr_mode != Pmode)
5110 base = convert_memory_address (Pmode, base);
5111 mem = gen_rtx_MEM (ptr_mode, base);
5114 if (int_mode != ptr_mode)
5115 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
5117 emit_insn (gen_rtx_SET (dest, mem));
5119 return;
5121 case SYMBOL_SMALL_TLSGD:
5122 case SYMBOL_SMALL_TLSDESC:
5123 case SYMBOL_SMALL_TLSIE:
5124 case SYMBOL_SMALL_GOT_28K:
5125 case SYMBOL_SMALL_GOT_4G:
5126 case SYMBOL_TINY_GOT:
5127 case SYMBOL_TINY_TLSIE:
5128 if (const_offset != 0)
5130 gcc_assert (can_create_pseudo_p ());
5131 base = aarch64_force_temporary (int_mode, dest, base);
5132 aarch64_add_offset (int_mode, dest, base, const_offset,
5133 NULL_RTX, NULL_RTX, false);
5134 return;
5136 /* FALLTHRU */
5138 case SYMBOL_SMALL_ABSOLUTE:
5139 case SYMBOL_TINY_ABSOLUTE:
5140 case SYMBOL_TLSLE12:
5141 case SYMBOL_TLSLE24:
5142 case SYMBOL_TLSLE32:
5143 case SYMBOL_TLSLE48:
5144 aarch64_load_symref_appropriately (dest, imm, sty);
5145 return;
5147 default:
5148 gcc_unreachable ();
5152 if (!CONST_INT_P (imm))
5154 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL)
5156 /* Only the low bit of each .H, .S and .D element is defined,
5157 so we can set the upper bits to whatever we like. If the
5158 predicate is all-true in MODE, prefer to set all the undefined
5159 bits as well, so that we can share a single .B predicate for
5160 all modes. */
5161 if (imm == CONSTM1_RTX (mode))
5162 imm = CONSTM1_RTX (VNx16BImode);
5164 /* All methods for constructing predicate modes wider than VNx16BI
5165 will set the upper bits of each element to zero. Expose this
5166 by moving such constants as a VNx16BI, so that all bits are
5167 significant and so that constants for different modes can be
5168 shared. The wider constant will still be available as a
5169 REG_EQUAL note. */
5170 rtx_vector_builder builder;
5171 if (aarch64_get_sve_pred_bits (builder, imm))
5173 rtx res = aarch64_expand_sve_const_pred (dest, builder);
5174 if (dest != res)
5175 emit_move_insn (dest, gen_lowpart (mode, res));
5176 return;
5180 if (GET_CODE (imm) == HIGH
5181 || aarch64_simd_valid_immediate (imm, NULL))
5183 emit_insn (gen_rtx_SET (dest, imm));
5184 return;
5187 if (GET_CODE (imm) == CONST_VECTOR && aarch64_sve_data_mode_p (mode))
5188 if (rtx res = aarch64_expand_sve_const_vector (dest, imm))
5190 if (dest != res)
5191 emit_insn (gen_aarch64_sve_reinterpret (mode, dest, res));
5192 return;
5195 rtx mem = force_const_mem (mode, imm);
5196 gcc_assert (mem);
5197 emit_move_insn (dest, mem);
5198 return;
5201 aarch64_internal_mov_immediate (dest, imm, true,
5202 as_a <scalar_int_mode> (mode));
5205 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
5206 that is known to contain PTRUE. */
5208 void
5209 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
5211 expand_operand ops[3];
5212 machine_mode mode = GET_MODE (dest);
5213 create_output_operand (&ops[0], dest, mode);
5214 create_input_operand (&ops[1], pred, GET_MODE (pred));
5215 create_input_operand (&ops[2], src, mode);
5216 temporary_volatile_ok v (true);
5217 expand_insn (code_for_aarch64_pred_mov (mode), 3, ops);
5220 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
5221 operand is in memory. In this case we need to use the predicated LD1
5222 and ST1 instead of LDR and STR, both for correctness on big-endian
5223 targets and because LD1 and ST1 support a wider range of addressing modes.
5224 PRED_MODE is the mode of the predicate.
5226 See the comment at the head of aarch64-sve.md for details about the
5227 big-endian handling. */
5229 void
5230 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
5232 machine_mode mode = GET_MODE (dest);
5233 rtx ptrue = aarch64_ptrue_reg (pred_mode);
5234 if (!register_operand (src, mode)
5235 && !register_operand (dest, mode))
5237 rtx tmp = gen_reg_rtx (mode);
5238 if (MEM_P (src))
5239 aarch64_emit_sve_pred_move (tmp, ptrue, src);
5240 else
5241 emit_move_insn (tmp, src);
5242 src = tmp;
5244 aarch64_emit_sve_pred_move (dest, ptrue, src);
5247 /* Called only on big-endian targets. See whether an SVE vector move
5248 from SRC to DEST is effectively a REV[BHW] instruction, because at
5249 least one operand is a subreg of an SVE vector that has wider or
5250 narrower elements. Return true and emit the instruction if so.
5252 For example:
5254 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
5256 represents a VIEW_CONVERT between the following vectors, viewed
5257 in memory order:
5259 R2: { [0].high, [0].low, [1].high, [1].low, ... }
5260 R1: { [0], [1], [2], [3], ... }
5262 The high part of lane X in R2 should therefore correspond to lane X*2
5263 of R1, but the register representations are:
5265 msb lsb
5266 R2: ...... [1].high [1].low [0].high [0].low
5267 R1: ...... [3] [2] [1] [0]
5269 where the low part of lane X in R2 corresponds to lane X*2 in R1.
5270 We therefore need a reverse operation to swap the high and low values
5271 around.
5273 This is purely an optimization. Without it we would spill the
5274 subreg operand to the stack in one mode and reload it in the
5275 other mode, which has the same effect as the REV. */
5277 bool
5278 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
5280 gcc_assert (BYTES_BIG_ENDIAN);
5281 if (GET_CODE (dest) == SUBREG)
5282 dest = SUBREG_REG (dest);
5283 if (GET_CODE (src) == SUBREG)
5284 src = SUBREG_REG (src);
5286 /* The optimization handles two single SVE REGs with different element
5287 sizes. */
5288 if (!REG_P (dest)
5289 || !REG_P (src)
5290 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
5291 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
5292 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
5293 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
5294 return false;
5296 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
5297 rtx ptrue = aarch64_ptrue_reg (VNx16BImode);
5298 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
5299 UNSPEC_REV_SUBREG);
5300 emit_insn (gen_rtx_SET (dest, unspec));
5301 return true;
5304 /* Return a copy of X with mode MODE, without changing its other
5305 attributes. Unlike gen_lowpart, this doesn't care whether the
5306 mode change is valid. */
5308 rtx
5309 aarch64_replace_reg_mode (rtx x, machine_mode mode)
5311 if (GET_MODE (x) == mode)
5312 return x;
5314 x = shallow_copy_rtx (x);
5315 set_mode_and_regno (x, mode, REGNO (x));
5316 return x;
5319 /* Return the SVE REV[BHW] unspec for reversing quantities of mode MODE
5320 stored in wider integer containers. */
5322 static unsigned int
5323 aarch64_sve_rev_unspec (machine_mode mode)
5325 switch (GET_MODE_UNIT_SIZE (mode))
5327 case 1: return UNSPEC_REVB;
5328 case 2: return UNSPEC_REVH;
5329 case 4: return UNSPEC_REVW;
5331 gcc_unreachable ();
5334 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
5335 operands. */
5337 void
5338 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
5340 /* Decide which REV operation we need. The mode with wider elements
5341 determines the mode of the operands and the mode with the narrower
5342 elements determines the reverse width. */
5343 machine_mode mode_with_wider_elts = aarch64_sve_int_mode (GET_MODE (dest));
5344 machine_mode mode_with_narrower_elts = aarch64_sve_int_mode (GET_MODE (src));
5345 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
5346 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
5347 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
5349 unsigned int unspec = aarch64_sve_rev_unspec (mode_with_narrower_elts);
5350 machine_mode pred_mode = aarch64_sve_pred_mode (mode_with_wider_elts);
5352 /* Get the operands in the appropriate modes and emit the instruction. */
5353 ptrue = gen_lowpart (pred_mode, ptrue);
5354 dest = aarch64_replace_reg_mode (dest, mode_with_wider_elts);
5355 src = aarch64_replace_reg_mode (src, mode_with_wider_elts);
5356 emit_insn (gen_aarch64_pred (unspec, mode_with_wider_elts,
5357 dest, ptrue, src));
5360 static bool
5361 aarch64_function_ok_for_sibcall (tree, tree exp)
5363 if (crtl->abi->id () != expr_callee_abi (exp).id ())
5364 return false;
5366 return true;
5369 /* Subroutine of aarch64_pass_by_reference for arguments that are not
5370 passed in SVE registers. */
5372 static bool
5373 aarch64_pass_by_reference_1 (CUMULATIVE_ARGS *pcum,
5374 const function_arg_info &arg)
5376 HOST_WIDE_INT size;
5377 machine_mode dummymode;
5378 int nregs;
5380 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
5381 if (arg.mode == BLKmode && arg.type)
5382 size = int_size_in_bytes (arg.type);
5383 else
5384 /* No frontends can create types with variable-sized modes, so we
5385 shouldn't be asked to pass or return them. */
5386 size = GET_MODE_SIZE (arg.mode).to_constant ();
5388 /* Aggregates are passed by reference based on their size. */
5389 if (arg.aggregate_type_p ())
5390 size = int_size_in_bytes (arg.type);
5392 /* Variable-sized arguments are always passed by reference. */
5393 if (size < 0)
5394 return true;
5396 /* Can this be a candidate to be passed in fp/simd register(s)? */
5397 if (aarch64_vfp_is_call_or_return_candidate (arg.mode, arg.type,
5398 &dummymode, &nregs, NULL,
5399 !pcum || pcum->silent_p))
5400 return false;
5402 /* Arguments which are variable sized or larger than 2 registers are
5403 passed by reference unless they are a homogeneous floating-point
5404 aggregate. */
5405 return size > 2 * UNITS_PER_WORD;
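/* For instance (illustrative only): a plain 24-byte structure is passed
   by reference (24 > 2 * UNITS_PER_WORD), a 16-byte structure is passed
   by value in two registers, and a structure of four doubles, although
   32 bytes in size, is a homogeneous floating-point aggregate and is
   therefore a candidate for being passed by value in four FP/SIMD
   registers.  */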
5408 /* Implement TARGET_PASS_BY_REFERENCE. */
5410 static bool
5411 aarch64_pass_by_reference (cumulative_args_t pcum_v,
5412 const function_arg_info &arg)
5414 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5416 if (!arg.type)
5417 return aarch64_pass_by_reference_1 (pcum, arg);
5419 pure_scalable_type_info pst_info;
5420 switch (pst_info.analyze (arg.type))
5422 case pure_scalable_type_info::IS_PST:
5423 if (pcum && !pcum->silent_p && !TARGET_SVE)
5424 /* We can't gracefully recover at this point, so make this a
5425 fatal error. */
5426 fatal_error (input_location, "arguments of type %qT require"
5427 " the SVE ISA extension", arg.type);
5429 /* Variadic SVE types are passed by reference. Normal non-variadic
5430 arguments are too if we've run out of registers. */
5431 return (!arg.named
5432 || pcum->aapcs_nvrn + pst_info.num_zr () > NUM_FP_ARG_REGS
5433 || pcum->aapcs_nprn + pst_info.num_pr () > NUM_PR_ARG_REGS);
5435 case pure_scalable_type_info::DOESNT_MATTER:
5436 gcc_assert (aarch64_pass_by_reference_1 (pcum, arg));
5437 return true;
5439 case pure_scalable_type_info::NO_ABI_IDENTITY:
5440 case pure_scalable_type_info::ISNT_PST:
5441 return aarch64_pass_by_reference_1 (pcum, arg);
5443 gcc_unreachable ();
5446 /* Return TRUE if VALTYPE is padded to its least significant bits. */
5447 static bool
5448 aarch64_return_in_msb (const_tree valtype)
5450 machine_mode dummy_mode;
5451 int dummy_int;
5453 /* Never happens in little-endian mode. */
5454 if (!BYTES_BIG_ENDIAN)
5455 return false;
5457 /* Only composite types smaller than or equal to 16 bytes can
5458 be potentially returned in registers. */
5459 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
5460 || int_size_in_bytes (valtype) <= 0
5461 || int_size_in_bytes (valtype) > 16)
5462 return false;
5464 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
5465 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
5466 is always passed/returned in the least significant bits of fp/simd
5467 register(s). */
5468 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
5469 &dummy_mode, &dummy_int, NULL,
5470 false))
5471 return false;
5473 /* Likewise pure scalable types for SVE vector and predicate registers. */
5474 pure_scalable_type_info pst_info;
5475 if (pst_info.analyze_registers (valtype))
5476 return false;
5478   return true;
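/* Illustrative sketch, not part of the original source: on a big-endian
   target a small composite is returned with its memory image in x0, so
   for a hypothetical

     struct rgb { unsigned char r, g, b; };     // 3 bytes

   the payload occupies the most significant bytes of the register and
   the tail padding sits in the least significant bits, which is what
   this predicate reports.  HFAs/HVAs and SVE pure scalable types are
   filtered out above because they always live in the least significant
   bits of fp/simd or SVE registers.  */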
5481 /* Implement TARGET_FUNCTION_VALUE.
5482 Define how to find the value returned by a function. */
5484 static rtx
5485 aarch64_function_value (const_tree type, const_tree func,
5486 bool outgoing ATTRIBUTE_UNUSED)
5488 machine_mode mode;
5489 int unsignedp;
5491 mode = TYPE_MODE (type);
5492 if (INTEGRAL_TYPE_P (type))
5493 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
5495 pure_scalable_type_info pst_info;
5496 if (type && pst_info.analyze_registers (type))
5497 return pst_info.get_rtx (mode, V0_REGNUM, P0_REGNUM);
5499 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5500 are returned in memory, not by value. */
5501 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5502 bool sve_p = (vec_flags & VEC_ANY_SVE);
5504 if (aarch64_return_in_msb (type))
5506 HOST_WIDE_INT size = int_size_in_bytes (type);
5508 if (size % UNITS_PER_WORD != 0)
5510 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
5511 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
5515 int count;
5516 machine_mode ag_mode;
5517 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &count,
5518 NULL, false))
5520 gcc_assert (!sve_p);
5521 if (!aarch64_composite_type_p (type, mode))
5523 gcc_assert (count == 1 && mode == ag_mode);
5524 return gen_rtx_REG (mode, V0_REGNUM);
5526 else
5528 int i;
5529 rtx par;
5531 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
5532 for (i = 0; i < count; i++)
5534 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
5535 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
5536 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5537 XVECEXP (par, 0, i) = tmp;
5539 return par;
5542 else
5544 if (sve_p)
5546 /* Vector types can acquire a partial SVE mode using things like
5547 __attribute__((vector_size(N))), and this is potentially useful.
5548 However, the choice of mode doesn't affect the type's ABI
5549 identity, so we should treat the types as though they had
5550 the associated integer mode, just like they did before SVE
5551 was introduced.
5553 We know that the vector must be 128 bits or smaller,
5554 otherwise we'd have returned it in memory instead. */
5555 gcc_assert (type
5556 && (aarch64_some_values_include_pst_objects_p (type)
5557 || (vec_flags & VEC_PARTIAL)));
5559 scalar_int_mode int_mode = int_mode_for_mode (mode).require ();
5560 rtx reg = gen_rtx_REG (int_mode, R0_REGNUM);
5561 rtx pair = gen_rtx_EXPR_LIST (VOIDmode, reg, const0_rtx);
5562 return gen_rtx_PARALLEL (mode, gen_rtvec (1, pair));
5564 return gen_rtx_REG (mode, R0_REGNUM);
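/* Illustrative sketch, not part of the original source; the declarations
   are hypothetical:

     struct hfa2 { double x, y; };   // HFA with two members
     __int128 wide;                  // 16-byte integer

   hfa2 is returned through the composite branch above as a
   (parallel ...) spanning v0 and v1, with element offsets 0 and 8,
   while the 128-bit integer falls through to the final
   gen_rtx_REG (TImode, R0_REGNUM), i.e. the x0/x1 pair.  */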
5568 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
5569    Return true if REGNO is the number of a hard register in which the value
5570    of a called function may come back.  */
5572 static bool
5573 aarch64_function_value_regno_p (const unsigned int regno)
5575 /* Maximum of 16 bytes can be returned in the general registers. Examples
5576 of 16-byte return values are: 128-bit integers and 16-byte small
5577 structures (excluding homogeneous floating-point aggregates). */
5578 if (regno == R0_REGNUM || regno == R1_REGNUM)
5579 return true;
5581 /* Up to four fp/simd registers can return a function value, e.g. a
5582 homogeneous floating-point aggregate having four members. */
5583 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
5584 return TARGET_FLOAT;
5586 return false;
5589 /* Subroutine for aarch64_return_in_memory for types that are not returned
5590 in SVE registers. */
5592 static bool
5593 aarch64_return_in_memory_1 (const_tree type)
5595 HOST_WIDE_INT size;
5596 machine_mode ag_mode;
5597 int count;
5599 if (!AGGREGATE_TYPE_P (type)
5600 && TREE_CODE (type) != COMPLEX_TYPE
5601 && TREE_CODE (type) != VECTOR_TYPE)
5602     /* Simple scalar types are always returned in registers.  */
5603 return false;
5605 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5606 &ag_mode, &count, NULL, false))
5607 return false;
5609   /* Types larger than 2 registers are returned in memory.  */
5610 size = int_size_in_bytes (type);
5611 return (size < 0 || size > 2 * UNITS_PER_WORD);
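/* Illustrative sketch, not part of the original source: a hypothetical

     struct triple { void *a, *b, *c; };        // 24 bytes

   is neither scalar nor a fp/simd candidate and exceeds
   2 * UNITS_PER_WORD, so it is returned in memory (under AAPCS64 the
   caller supplies the result address in x8).  A two-pointer struct of
   16 bytes stays in x0/x1.  */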
5614 /* Implement TARGET_RETURN_IN_MEMORY.
5616 If the type T of the result of a function is such that
5617 void func (T arg)
5618 would require that arg be passed as a value in a register (or set of
5619 registers) according to the parameter passing rules, then the result
5620 is returned in the same registers as would be used for such an
5621 argument. */
5623 static bool
5624 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
5626 pure_scalable_type_info pst_info;
5627 switch (pst_info.analyze (type))
5629 case pure_scalable_type_info::IS_PST:
5630 return (pst_info.num_zr () > NUM_FP_ARG_REGS
5631 || pst_info.num_pr () > NUM_PR_ARG_REGS);
5633 case pure_scalable_type_info::DOESNT_MATTER:
5634 gcc_assert (aarch64_return_in_memory_1 (type));
5635 return true;
5637 case pure_scalable_type_info::NO_ABI_IDENTITY:
5638 case pure_scalable_type_info::ISNT_PST:
5639 return aarch64_return_in_memory_1 (type);
5641 gcc_unreachable ();
5644 static bool
5645 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
5646 const_tree type, int *nregs)
5648 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5649 return aarch64_vfp_is_call_or_return_candidate (mode, type,
5650 &pcum->aapcs_vfp_rmode,
5651 nregs, NULL, pcum->silent_p);
5654 /* Given MODE and TYPE of a function argument, return the alignment in
5655 bits. The idea is to suppress any stronger alignment requested by
5656 the user and opt for the natural alignment (specified in AAPCS64 \S
5657 4.1). ABI_BREAK is set to true if the alignment was incorrectly
5658 calculated in versions of GCC prior to GCC-9. This is a helper
5659 function for local use only. */
5661 static unsigned int
5662 aarch64_function_arg_alignment (machine_mode mode, const_tree type,
5663 bool *abi_break)
5665 *abi_break = false;
5666 if (!type)
5667 return GET_MODE_ALIGNMENT (mode);
5669 if (integer_zerop (TYPE_SIZE (type)))
5670 return 0;
5672 gcc_assert (TYPE_MODE (type) == mode);
5674 if (!AGGREGATE_TYPE_P (type))
5675 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
5677 if (TREE_CODE (type) == ARRAY_TYPE)
5678 return TYPE_ALIGN (TREE_TYPE (type));
5680 unsigned int alignment = 0;
5681 unsigned int bitfield_alignment = 0;
5682 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
5683 if (TREE_CODE (field) == FIELD_DECL)
5685 /* Note that we explicitly consider zero-sized fields here,
5686 even though they don't map to AAPCS64 machine types.
5687 For example, in:
5689 struct __attribute__((aligned(8))) empty {};
5691 struct s {
5692 [[no_unique_address]] empty e;
5693 int x;
5696 "s" contains only one Fundamental Data Type (the int field)
5697 but gains 8-byte alignment and size thanks to "e". */
5698 alignment = std::max (alignment, DECL_ALIGN (field));
5699 if (DECL_BIT_FIELD_TYPE (field))
5700 bitfield_alignment
5701 = std::max (bitfield_alignment,
5702 TYPE_ALIGN (DECL_BIT_FIELD_TYPE (field)));
5705 if (bitfield_alignment > alignment)
5707 *abi_break = true;
5708 return bitfield_alignment;
5711 return alignment;
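/* Illustrative sketch, not part of the original source; both types are
   hypothetical:

     struct pair16 { __int128 a; };                        // aggregate
     typedef int aligned_int __attribute__((aligned(16))); // scalar

   The field walk above reports 128-bit alignment for pair16, which is
   exactly what rule C.8 in aarch64_layout_arg looks for.  The
   over-aligned scalar, by contrast, still reports its natural 32-bit
   alignment because only TYPE_MAIN_VARIANT is consulted for
   non-aggregates, matching the intent described in the comment before
   this function.  */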
5714 /* Layout a function argument according to the AAPCS64 rules. The rule
5715 numbers refer to the rule numbers in the AAPCS64. ORIG_MODE is the
5716 mode that was originally given to us by the target hook, whereas the
5717 mode in ARG might be the result of replacing partial SVE modes with
5718 the equivalent integer mode. */
5720 static void
5721 aarch64_layout_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5723 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5724 tree type = arg.type;
5725 machine_mode mode = arg.mode;
5726 int ncrn, nvrn, nregs;
5727 bool allocate_ncrn, allocate_nvrn;
5728 HOST_WIDE_INT size;
5729 bool abi_break;
5731 /* We need to do this once per argument. */
5732 if (pcum->aapcs_arg_processed)
5733 return;
5735 pcum->aapcs_arg_processed = true;
5737 pure_scalable_type_info pst_info;
5738 if (type && pst_info.analyze_registers (type))
5740 /* The PCS says that it is invalid to pass an SVE value to an
5741 unprototyped function. There is no ABI-defined location we
5742 can return in this case, so we have no real choice but to raise
5743 an error immediately, even though this is only a query function. */
5744 if (arg.named && pcum->pcs_variant != ARM_PCS_SVE)
5746 gcc_assert (!pcum->silent_p);
5747 error ("SVE type %qT cannot be passed to an unprototyped function",
5748 arg.type);
5749 /* Avoid repeating the message, and avoid tripping the assert
5750 below. */
5751 pcum->pcs_variant = ARM_PCS_SVE;
5754 /* We would have converted the argument into pass-by-reference
5755 form if it didn't fit in registers. */
5756 pcum->aapcs_nextnvrn = pcum->aapcs_nvrn + pst_info.num_zr ();
5757 pcum->aapcs_nextnprn = pcum->aapcs_nprn + pst_info.num_pr ();
5758 gcc_assert (arg.named
5759 && pcum->pcs_variant == ARM_PCS_SVE
5760 && pcum->aapcs_nextnvrn <= NUM_FP_ARG_REGS
5761 && pcum->aapcs_nextnprn <= NUM_PR_ARG_REGS);
5762 pcum->aapcs_reg = pst_info.get_rtx (mode, V0_REGNUM + pcum->aapcs_nvrn,
5763 P0_REGNUM + pcum->aapcs_nprn);
5764 return;
5767 /* Generic vectors that map to full SVE modes with -msve-vector-bits=N
5768 are passed by reference, not by value. */
5769 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5770 bool sve_p = (vec_flags & VEC_ANY_SVE);
5771 if (sve_p)
5772 /* Vector types can acquire a partial SVE mode using things like
5773 __attribute__((vector_size(N))), and this is potentially useful.
5774 However, the choice of mode doesn't affect the type's ABI
5775 identity, so we should treat the types as though they had
5776 the associated integer mode, just like they did before SVE
5777 was introduced.
5779 We know that the vector must be 128 bits or smaller,
5780 otherwise we'd have passed it in memory instead. */
5781 gcc_assert (type
5782 && (aarch64_some_values_include_pst_objects_p (type)
5783 || (vec_flags & VEC_PARTIAL)));
5785 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
5786 if (type)
5787 size = int_size_in_bytes (type);
5788 else
5789 /* No frontends can create types with variable-sized modes, so we
5790 shouldn't be asked to pass or return them. */
5791 size = GET_MODE_SIZE (mode).to_constant ();
5792 size = ROUND_UP (size, UNITS_PER_WORD);
5794 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
5795 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
5796 mode,
5797 type,
5798 &nregs);
5799 gcc_assert (!sve_p || !allocate_nvrn);
5801 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
5802 The following code thus handles passing by SIMD/FP registers first. */
5804 nvrn = pcum->aapcs_nvrn;
5806   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFAs)
5807      and homogeneous short-vector aggregates (HVAs).  */
5808 if (allocate_nvrn)
5810 if (!pcum->silent_p && !TARGET_FLOAT)
5811 aarch64_err_no_fpadvsimd (mode);
5813 if (nvrn + nregs <= NUM_FP_ARG_REGS)
5815 pcum->aapcs_nextnvrn = nvrn + nregs;
5816 if (!aarch64_composite_type_p (type, mode))
5818 gcc_assert (nregs == 1);
5819 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
5821 else
5823 rtx par;
5824 int i;
5825 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5826 for (i = 0; i < nregs; i++)
5828 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
5829 V0_REGNUM + nvrn + i);
5830 rtx offset = gen_int_mode
5831 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
5832 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
5833 XVECEXP (par, 0, i) = tmp;
5835 pcum->aapcs_reg = par;
5837 return;
5839 else
5841 /* C.3 NSRN is set to 8. */
5842 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
5843 goto on_stack;
5847 ncrn = pcum->aapcs_ncrn;
5848 nregs = size / UNITS_PER_WORD;
5850   /* C6 - C9, though the sign and zero extension semantics are
5851      handled elsewhere.  This is the case where the argument fits
5852      entirely in general registers.  */
5853 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
5855 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
5857 /* C.8 if the argument has an alignment of 16 then the NGRN is
5858 rounded up to the next even number. */
5859 if (nregs == 2
5860 && ncrn % 2
5861 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
5862 comparison is there because for > 16 * BITS_PER_UNIT
5863 alignment nregs should be > 2 and therefore it should be
5864 passed by reference rather than value. */
5865 && (aarch64_function_arg_alignment (mode, type, &abi_break)
5866 == 16 * BITS_PER_UNIT))
5868 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5869 inform (input_location, "parameter passing for argument of type "
5870 "%qT changed in GCC 9.1", type);
5871 ++ncrn;
5872 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
5875 /* If an argument with an SVE mode needs to be shifted up to the
5876 high part of the register, treat it as though it had an integer mode.
5877 Using the normal (parallel [...]) would suppress the shifting. */
5878 if (sve_p
5879 && BYTES_BIG_ENDIAN
5880 && maybe_ne (GET_MODE_SIZE (mode), nregs * UNITS_PER_WORD)
5881 && aarch64_pad_reg_upward (mode, type, false))
5883 mode = int_mode_for_mode (mode).require ();
5884 sve_p = false;
5887 /* NREGS can be 0 when e.g. an empty structure is to be passed.
5888 A reg is still generated for it, but the caller should be smart
5889 enough not to use it. */
5890 if (nregs == 0
5891 || (nregs == 1 && !sve_p)
5892 || GET_MODE_CLASS (mode) == MODE_INT)
5893 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
5894 else
5896 rtx par;
5897 int i;
5899 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
5900 for (i = 0; i < nregs; i++)
5902 scalar_int_mode reg_mode = word_mode;
5903 if (nregs == 1)
5904 reg_mode = int_mode_for_mode (mode).require ();
5905 rtx tmp = gen_rtx_REG (reg_mode, R0_REGNUM + ncrn + i);
5906 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
5907 GEN_INT (i * UNITS_PER_WORD));
5908 XVECEXP (par, 0, i) = tmp;
5910 pcum->aapcs_reg = par;
5913 pcum->aapcs_nextncrn = ncrn + nregs;
5914 return;
5917 /* C.11 */
5918 pcum->aapcs_nextncrn = NUM_ARG_REGS;
5920 /* The argument is passed on stack; record the needed number of words for
5921 this argument and align the total size if necessary. */
5922 on_stack:
5923 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
5925 if (aarch64_function_arg_alignment (mode, type, &abi_break)
5926 == 16 * BITS_PER_UNIT)
5928 int new_size = ROUND_UP (pcum->aapcs_stack_size, 16 / UNITS_PER_WORD);
5929 if (pcum->aapcs_stack_size != new_size)
5931 if (abi_break && warn_psabi && currently_expanding_gimple_stmt)
5932 inform (input_location, "parameter passing for argument of type "
5933 "%qT changed in GCC 9.1", type);
5934 pcum->aapcs_stack_size = new_size;
5937 return;
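/* Illustrative sketch, not part of the original source: for a
   hypothetical prototype

     void f (int a, __int128 b);

   "a" is allocated w0 (NGRN becomes 1); "b" needs two registers and has
   16-byte alignment, so the C.8 handling above rounds NGRN up to the
   next even number and "b" is passed in x2/x3, leaving x1 unused.  */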
5940 /* Implement TARGET_FUNCTION_ARG. */
5942 static rtx
5943 aarch64_function_arg (cumulative_args_t pcum_v, const function_arg_info &arg)
5945 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
5946 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64
5947 || pcum->pcs_variant == ARM_PCS_SIMD
5948 || pcum->pcs_variant == ARM_PCS_SVE);
5950 if (arg.end_marker_p ())
5951 return gen_int_mode (pcum->pcs_variant, DImode);
5953 aarch64_layout_arg (pcum_v, arg);
5954 return pcum->aapcs_reg;
5957 void
5958 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
5959 const_tree fntype,
5960 rtx libname ATTRIBUTE_UNUSED,
5961 const_tree fndecl ATTRIBUTE_UNUSED,
5962 unsigned n_named ATTRIBUTE_UNUSED,
5963 bool silent_p)
5965 pcum->aapcs_ncrn = 0;
5966 pcum->aapcs_nvrn = 0;
5967 pcum->aapcs_nprn = 0;
5968 pcum->aapcs_nextncrn = 0;
5969 pcum->aapcs_nextnvrn = 0;
5970 pcum->aapcs_nextnprn = 0;
5971 if (fntype)
5972 pcum->pcs_variant = (arm_pcs) fntype_abi (fntype).id ();
5973 else
5974 pcum->pcs_variant = ARM_PCS_AAPCS64;
5975 pcum->aapcs_reg = NULL_RTX;
5976 pcum->aapcs_arg_processed = false;
5977 pcum->aapcs_stack_words = 0;
5978 pcum->aapcs_stack_size = 0;
5979 pcum->silent_p = silent_p;
5981 if (!silent_p
5982 && !TARGET_FLOAT
5983 && fndecl && TREE_PUBLIC (fndecl)
5984 && fntype && fntype != error_mark_node)
5986 const_tree type = TREE_TYPE (fntype);
5987 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
5988 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
5989 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
5990 &mode, &nregs, NULL, false))
5991 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
5994 if (!silent_p
5995 && !TARGET_SVE
5996 && pcum->pcs_variant == ARM_PCS_SVE)
5998 /* We can't gracefully recover at this point, so make this a
5999 fatal error. */
6000 if (fndecl)
6001 fatal_error (input_location, "%qE requires the SVE ISA extension",
6002 fndecl);
6003 else
6004 fatal_error (input_location, "calls to functions of type %qT require"
6005 " the SVE ISA extension", fntype);
6009 static void
6010 aarch64_function_arg_advance (cumulative_args_t pcum_v,
6011 const function_arg_info &arg)
6013 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
6014 if (pcum->pcs_variant == ARM_PCS_AAPCS64
6015 || pcum->pcs_variant == ARM_PCS_SIMD
6016 || pcum->pcs_variant == ARM_PCS_SVE)
6018 aarch64_layout_arg (pcum_v, arg);
6019 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
6020 != (pcum->aapcs_stack_words != 0));
6021 pcum->aapcs_arg_processed = false;
6022 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
6023 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
6024 pcum->aapcs_nprn = pcum->aapcs_nextnprn;
6025 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
6026 pcum->aapcs_stack_words = 0;
6027 pcum->aapcs_reg = NULL_RTX;
6031 bool
6032 aarch64_function_arg_regno_p (unsigned regno)
6034 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
6035 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
6038 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
6039 PARM_BOUNDARY bits of alignment, but will be given anything up
6040 to STACK_BOUNDARY bits if the type requires it. This makes sure
6041 that both before and after the layout of each argument, the Next
6042 Stacked Argument Address (NSAA) will have a minimum alignment of
6043 8 bytes. */
6045 static unsigned int
6046 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
6048 bool abi_break;
6049 unsigned int alignment = aarch64_function_arg_alignment (mode, type,
6050 &abi_break);
6051   if (abi_break && warn_psabi)
6052 inform (input_location, "parameter passing for argument of type "
6053 "%qT changed in GCC 9.1", type);
6055 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
6058 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
6060 static fixed_size_mode
6061 aarch64_get_reg_raw_mode (int regno)
6063 if (TARGET_SVE && FP_REGNUM_P (regno))
6064 /* Don't use the SVE part of the register for __builtin_apply and
6065 __builtin_return. The SVE registers aren't used by the normal PCS,
6066 so using them there would be a waste of time. The PCS extensions
6067 for SVE types are fundamentally incompatible with the
6068 __builtin_return/__builtin_apply interface. */
6069 return as_a <fixed_size_mode> (V16QImode);
6070 return default_get_reg_raw_mode (regno);
6073 /* Implement TARGET_FUNCTION_ARG_PADDING.
6075    Small aggregate types are placed at the lowest memory address.
6077 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
6079 static pad_direction
6080 aarch64_function_arg_padding (machine_mode mode, const_tree type)
6082 /* On little-endian targets, the least significant byte of every stack
6083 argument is passed at the lowest byte address of the stack slot. */
6084 if (!BYTES_BIG_ENDIAN)
6085 return PAD_UPWARD;
6087 /* Otherwise, integral, floating-point and pointer types are padded downward:
6088 the least significant byte of a stack argument is passed at the highest
6089 byte address of the stack slot. */
6090 if (type
6091 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
6092 || POINTER_TYPE_P (type))
6093 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
6094 return PAD_DOWNWARD;
6096   /* Everything else is padded upward, i.e. the data goes in the first byte
6096      of the stack slot.  */
6097 return PAD_UPWARD;
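/* Illustrative sketch, not part of the original source: a hypothetical
   1-byte scalar argument that ends up on the stack occupies the lowest
   byte of its 8-byte slot on little-endian (PAD_UPWARD), but the
   highest-addressed byte of the slot on big-endian (PAD_DOWNWARD), as
   described in the comments above.  */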
6100 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
6102    It specifies the padding for the last (which may also be the only)
6103    element of a block move between registers and memory.  Assuming
6104    the block is in memory, padding upward means that the last element
6105    is padded after its most significant byte, while with downward
6106    padding the last element is padded on its least significant byte
6107    side.
6109 Small aggregates and small complex types are always padded
6110 upwards.
6112 We don't need to worry about homogeneous floating-point or
6113 short-vector aggregates; their move is not affected by the
6114 padding direction determined here. Regardless of endianness,
6115 each element of such an aggregate is put in the least
6116 significant bits of a fp/simd register.
6118 Return !BYTES_BIG_ENDIAN if the least significant byte of the
6119 register has useful data, and return the opposite if the most
6120 significant byte does. */
6122 bool
6123 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
6124 bool first ATTRIBUTE_UNUSED)
6127 /* Aside from pure scalable types, small composite types are always
6128 padded upward. */
6129 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
6131 HOST_WIDE_INT size;
6132 if (type)
6133 size = int_size_in_bytes (type);
6134 else
6135 /* No frontends can create types with variable-sized modes, so we
6136 shouldn't be asked to pass or return them. */
6137 size = GET_MODE_SIZE (mode).to_constant ();
6138 if (size < 2 * UNITS_PER_WORD)
6140 pure_scalable_type_info pst_info;
6141 if (pst_info.analyze_registers (type))
6142 return false;
6143 return true;
6147 /* Otherwise, use the default padding. */
6148 return !BYTES_BIG_ENDIAN;
6151 static scalar_int_mode
6152 aarch64_libgcc_cmp_return_mode (void)
6154 return SImode;
6157 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
6159 /* We use the 12-bit shifted immediate arithmetic instructions so values
6160 must be multiple of (1 << 12), i.e. 4096. */
6161 #define ARITH_FACTOR 4096
6163 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
6164 #error Cannot use simple address calculation for stack probing
6165 #endif
6167 /* The pair of scratch registers used for stack probing. */
6168 #define PROBE_STACK_FIRST_REG R9_REGNUM
6169 #define PROBE_STACK_SECOND_REG R10_REGNUM
6171 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
6172 inclusive. These are offsets from the current stack pointer. */
6174 static void
6175 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
6177 HOST_WIDE_INT size;
6178 if (!poly_size.is_constant (&size))
6180 sorry ("stack probes for SVE frames");
6181 return;
6184 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
6186 /* See the same assertion on PROBE_INTERVAL above. */
6187 gcc_assert ((first % ARITH_FACTOR) == 0);
6189 /* See if we have a constant small number of probes to generate. If so,
6190 that's the easy case. */
6191 if (size <= PROBE_INTERVAL)
6193 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
6195 emit_set_insn (reg1,
6196 plus_constant (Pmode,
6197 stack_pointer_rtx, -(first + base)));
6198 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
6201 /* The run-time loop is made up of 8 insns in the generic case while the
6202      compile-time loop is made up of 4+2*(n-2) insns for n intervals.  */
6203 else if (size <= 4 * PROBE_INTERVAL)
6205 HOST_WIDE_INT i, rem;
6207 emit_set_insn (reg1,
6208 plus_constant (Pmode,
6209 stack_pointer_rtx,
6210 -(first + PROBE_INTERVAL)));
6211 emit_stack_probe (reg1);
6213 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
6214 it exceeds SIZE. If only two probes are needed, this will not
6215 generate any code. Then probe at FIRST + SIZE. */
6216 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
6218 emit_set_insn (reg1,
6219 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
6220 emit_stack_probe (reg1);
6223 rem = size - (i - PROBE_INTERVAL);
6224 if (rem > 256)
6226 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6228 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
6229 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
6231 else
6232 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
6235 /* Otherwise, do the same as above, but in a loop. Note that we must be
6236 extra careful with variables wrapping around because we might be at
6237 the very top (or the very bottom) of the address space and we have
6238 to be able to handle this case properly; in particular, we use an
6239 equality test for the loop condition. */
6240 else
6242 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
6244 /* Step 1: round SIZE to the previous multiple of the interval. */
6246 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
6249 /* Step 2: compute initial and final value of the loop counter. */
6251 /* TEST_ADDR = SP + FIRST. */
6252 emit_set_insn (reg1,
6253 plus_constant (Pmode, stack_pointer_rtx, -first));
6255 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
6256 HOST_WIDE_INT adjustment = - (first + rounded_size);
6257 if (! aarch64_uimm12_shift (adjustment))
6259 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
6260 true, Pmode);
6261 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
6263 else
6264 emit_set_insn (reg2,
6265 plus_constant (Pmode, stack_pointer_rtx, adjustment));
6267 /* Step 3: the loop
6271 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
6272 probe at TEST_ADDR
6274 while (TEST_ADDR != LAST_ADDR)
6276 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
6277 until it is equal to ROUNDED_SIZE. */
6279 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
6282 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
6283 that SIZE is equal to ROUNDED_SIZE. */
6285 if (size != rounded_size)
6287 HOST_WIDE_INT rem = size - rounded_size;
6289 if (rem > 256)
6291 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
6293 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
6294 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
6296 else
6297 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
6301 /* Make sure nothing is scheduled before we are done. */
6302 emit_insn (gen_blockage ());
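/* Illustrative sketch, not part of the original source: for the
   hypothetical values first == 4096 and poly_size == 4096 the simple
   case above emits roughly

     sub  x9, sp, #8192       // reg1 = sp - (first + base)
     str  xzr, [x9]           // probe at reg1 + base - size

   larger constant sizes fall into the unrolled sequence or the
   probe_stack_range loop that follow.  */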
6305 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
6306 absolute addresses. */
6308 const char *
6309 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
6311 static int labelno = 0;
6312 char loop_lab[32];
6313 rtx xops[2];
6315 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
6317 /* Loop. */
6318 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
6320 HOST_WIDE_INT stack_clash_probe_interval
6321 = 1 << param_stack_clash_protection_guard_size;
6323 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
6324 xops[0] = reg1;
6325 HOST_WIDE_INT interval;
6326 if (flag_stack_clash_protection)
6327 interval = stack_clash_probe_interval;
6328 else
6329 interval = PROBE_INTERVAL;
6331 gcc_assert (aarch64_uimm12_shift (interval));
6332 xops[1] = GEN_INT (interval);
6334 output_asm_insn ("sub\t%0, %0, %1", xops);
6336 /* If doing stack clash protection then we probe up by the ABI specified
6337 amount. We do this because we're dropping full pages at a time in the
6338 loop. But if we're doing non-stack clash probing, probe at SP 0. */
6339 if (flag_stack_clash_protection)
6340 xops[1] = GEN_INT (STACK_CLASH_CALLER_GUARD);
6341 else
6342 xops[1] = CONST0_RTX (GET_MODE (xops[1]));
6344 /* Probe at TEST_ADDR. If we're inside the loop it is always safe to probe
6345 by this amount for each iteration. */
6346 output_asm_insn ("str\txzr, [%0, %1]", xops);
6348 /* Test if TEST_ADDR == LAST_ADDR. */
6349 xops[1] = reg2;
6350 output_asm_insn ("cmp\t%0, %1", xops);
6352 /* Branch. */
6353 fputs ("\tb.ne\t", asm_out_file);
6354 assemble_name_raw (asm_out_file, loop_lab);
6355 fputc ('\n', asm_out_file);
6357 return "";
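/* Illustrative sketch, not part of the original source: without stack
   clash protection and with the default 4 KiB PROBE_INTERVAL, the
   template above assembles to something like

   .LPSRL0:
     sub  x9, x9, #4096
     str  xzr, [x9, 0]
     cmp  x9, x10
     b.ne .LPSRL0

   where x9/x10 stand for the PROBE_STACK_FIRST/SECOND registers set up
   by aarch64_emit_probe_stack_range.  */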
6360 /* Emit the probe loop for doing stack clash probes and stack adjustments for
6361 SVE. This emits probes from BASE to BASE - ADJUSTMENT based on a guard size
6362 of GUARD_SIZE. When a probe is emitted it is done at most
6363 MIN_PROBE_THRESHOLD bytes from the current BASE at an interval of
6364 at most MIN_PROBE_THRESHOLD. By the end of this function
6365 BASE = BASE - ADJUSTMENT. */
6367 const char *
6368 aarch64_output_probe_sve_stack_clash (rtx base, rtx adjustment,
6369 rtx min_probe_threshold, rtx guard_size)
6371 /* This function is not allowed to use any instruction generation function
6372 like gen_ and friends. If you do you'll likely ICE during CFG validation,
6373 so instead emit the code you want using output_asm_insn. */
6374 gcc_assert (flag_stack_clash_protection);
6375 gcc_assert (CONST_INT_P (min_probe_threshold) && CONST_INT_P (guard_size));
6376 gcc_assert (INTVAL (guard_size) > INTVAL (min_probe_threshold));
6378 /* The minimum required allocation before the residual requires probing. */
6379 HOST_WIDE_INT residual_probe_guard = INTVAL (min_probe_threshold);
6381 /* Clamp the value down to the nearest value that can be used with a cmp. */
6382 residual_probe_guard = aarch64_clamp_to_uimm12_shift (residual_probe_guard);
6383 rtx probe_offset_value_rtx = gen_int_mode (residual_probe_guard, Pmode);
6385 gcc_assert (INTVAL (min_probe_threshold) >= residual_probe_guard);
6386 gcc_assert (aarch64_uimm12_shift (residual_probe_guard));
6388 static int labelno = 0;
6389 char loop_start_lab[32];
6390 char loop_end_lab[32];
6391 rtx xops[2];
6393 ASM_GENERATE_INTERNAL_LABEL (loop_start_lab, "SVLPSPL", labelno);
6394 ASM_GENERATE_INTERNAL_LABEL (loop_end_lab, "SVLPEND", labelno++);
6396 /* Emit loop start label. */
6397 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_start_lab);
6399 /* ADJUSTMENT < RESIDUAL_PROBE_GUARD. */
6400 xops[0] = adjustment;
6401 xops[1] = probe_offset_value_rtx;
6402 output_asm_insn ("cmp\t%0, %1", xops);
6404 /* Branch to end if not enough adjustment to probe. */
6405 fputs ("\tb.lt\t", asm_out_file);
6406 assemble_name_raw (asm_out_file, loop_end_lab);
6407 fputc ('\n', asm_out_file);
6409 /* BASE = BASE - RESIDUAL_PROBE_GUARD. */
6410 xops[0] = base;
6411 xops[1] = probe_offset_value_rtx;
6412 output_asm_insn ("sub\t%0, %0, %1", xops);
6414 /* Probe at BASE. */
6415 xops[1] = const0_rtx;
6416 output_asm_insn ("str\txzr, [%0, %1]", xops);
6418 /* ADJUSTMENT = ADJUSTMENT - RESIDUAL_PROBE_GUARD. */
6419 xops[0] = adjustment;
6420 xops[1] = probe_offset_value_rtx;
6421 output_asm_insn ("sub\t%0, %0, %1", xops);
6423 /* Branch to start if still more bytes to allocate. */
6424 fputs ("\tb\t", asm_out_file);
6425 assemble_name_raw (asm_out_file, loop_start_lab);
6426 fputc ('\n', asm_out_file);
6428   /* No probe needed for the residual allocation; exit the loop.  */
6429 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_end_lab);
6431 /* BASE = BASE - ADJUSTMENT. */
6432 xops[0] = base;
6433 xops[1] = adjustment;
6434 output_asm_insn ("sub\t%0, %0, %1", xops);
6435 return "";
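/* Illustrative sketch, not part of the original source: with BASE in
   x10, ADJUSTMENT in x11 and a hypothetical 64 KiB residual probe
   guard, the output_asm_insn calls above print a loop of the form

   .SVLPSPL0:
     cmp  x11, 65536
     b.lt .SVLPEND0
     sub  x10, x10, 65536
     str  xzr, [x10, 0]
     sub  x11, x11, 65536
     b    .SVLPSPL0
   .SVLPEND0:
     sub  x10, x10, x11

   so the stack drops in guard-sized steps, each one probed, and the
   unprobed residual is applied after the loop exits.  */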
6438 /* Determine whether a frame chain needs to be generated. */
6439 static bool
6440 aarch64_needs_frame_chain (void)
6442 /* Force a frame chain for EH returns so the return address is at FP+8. */
6443 if (frame_pointer_needed || crtl->calls_eh_return)
6444 return true;
6446 /* A leaf function cannot have calls or write LR. */
6447 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
6449 /* Don't use a frame chain in leaf functions if leaf frame pointers
6450 are disabled. */
6451 if (flag_omit_leaf_frame_pointer && is_leaf)
6452 return false;
6454 return aarch64_use_frame_pointer;
6457 /* Mark the registers that need to be saved by the callee and calculate
6458 the size of the callee-saved registers area and frame record (both FP
6459 and LR may be omitted). */
6460 static void
6461 aarch64_layout_frame (void)
6463 poly_int64 offset = 0;
6464 int regno, last_fp_reg = INVALID_REGNUM;
6465 machine_mode vector_save_mode = aarch64_reg_save_mode (V8_REGNUM);
6466 poly_int64 vector_save_size = GET_MODE_SIZE (vector_save_mode);
6467 bool frame_related_fp_reg_p = false;
6468 aarch64_frame &frame = cfun->machine->frame;
6470 frame.emit_frame_chain = aarch64_needs_frame_chain ();
6472 /* Adjust the outgoing arguments size if required. Keep it in sync with what
6473 the mid-end is doing. */
6474 crtl->outgoing_args_size = STACK_DYNAMIC_OFFSET (cfun);
6476 #define SLOT_NOT_REQUIRED (-2)
6477 #define SLOT_REQUIRED (-1)
6479 frame.wb_candidate1 = INVALID_REGNUM;
6480 frame.wb_candidate2 = INVALID_REGNUM;
6481 frame.spare_pred_reg = INVALID_REGNUM;
6483 /* First mark all the registers that really need to be saved... */
6484 for (regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
6485 frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
6487 /* ... that includes the eh data registers (if needed)... */
6488 if (crtl->calls_eh_return)
6489 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
6490 frame.reg_offset[EH_RETURN_DATA_REGNO (regno)] = SLOT_REQUIRED;
6492 /* ... and any callee saved register that dataflow says is live. */
6493 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6494 if (df_regs_ever_live_p (regno)
6495 && !fixed_regs[regno]
6496 && (regno == R30_REGNUM
6497 || !crtl->abi->clobbers_full_reg_p (regno)))
6498 frame.reg_offset[regno] = SLOT_REQUIRED;
6500 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6501 if (df_regs_ever_live_p (regno)
6502 && !fixed_regs[regno]
6503 && !crtl->abi->clobbers_full_reg_p (regno))
6505 frame.reg_offset[regno] = SLOT_REQUIRED;
6506 last_fp_reg = regno;
6507 if (aarch64_emit_cfi_for_reg_p (regno))
6508 frame_related_fp_reg_p = true;
6511 /* Big-endian SVE frames need a spare predicate register in order
6512 to save Z8-Z15. Decide which register they should use. Prefer
6513 an unused argument register if possible, so that we don't force P4
6514 to be saved unnecessarily. */
6515 if (frame_related_fp_reg_p
6516 && crtl->abi->id () == ARM_PCS_SVE
6517 && BYTES_BIG_ENDIAN)
6519 bitmap live1 = df_get_live_out (ENTRY_BLOCK_PTR_FOR_FN (cfun));
6520 bitmap live2 = df_get_live_in (EXIT_BLOCK_PTR_FOR_FN (cfun));
6521 for (regno = P0_REGNUM; regno <= P7_REGNUM; regno++)
6522 if (!bitmap_bit_p (live1, regno) && !bitmap_bit_p (live2, regno))
6523 break;
6524 gcc_assert (regno <= P7_REGNUM);
6525 frame.spare_pred_reg = regno;
6526 df_set_regs_ever_live (regno, true);
6529 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6530 if (df_regs_ever_live_p (regno)
6531 && !fixed_regs[regno]
6532 && !crtl->abi->clobbers_full_reg_p (regno))
6533 frame.reg_offset[regno] = SLOT_REQUIRED;
6535 /* With stack-clash, LR must be saved in non-leaf functions. The saving of
6536 LR counts as an implicit probe which allows us to maintain the invariant
6537 described in the comment at expand_prologue. */
6538 gcc_assert (crtl->is_leaf
6539 || maybe_ne (frame.reg_offset[R30_REGNUM], SLOT_NOT_REQUIRED));
6541 /* Now assign stack slots for the registers. Start with the predicate
6542 registers, since predicate LDR and STR have a relatively small
6543 offset range. These saves happen below the hard frame pointer. */
6544 for (regno = P0_REGNUM; regno <= P15_REGNUM; regno++)
6545 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6547 frame.reg_offset[regno] = offset;
6548 offset += BYTES_PER_SVE_PRED;
6551 if (maybe_ne (offset, 0))
6553 /* If we have any vector registers to save above the predicate registers,
6554 the offset of the vector register save slots need to be a multiple
6555 of the vector size. This lets us use the immediate forms of LDR/STR
6556 (or LD1/ST1 for big-endian).
6558 A vector register is 8 times the size of a predicate register,
6559 and we need to save a maximum of 12 predicate registers, so the
6560 first vector register will be at either #1, MUL VL or #2, MUL VL.
6562 If we don't have any vector registers to save, and we know how
6563 big the predicate save area is, we can just round it up to the
6564 next 16-byte boundary. */
6565 if (last_fp_reg == (int) INVALID_REGNUM && offset.is_constant ())
6566 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6567 else
6569 if (known_le (offset, vector_save_size))
6570 offset = vector_save_size;
6571 else if (known_le (offset, vector_save_size * 2))
6572 offset = vector_save_size * 2;
6573 else
6574 gcc_unreachable ();
6578 /* If we need to save any SVE vector registers, add them next. */
6579 if (last_fp_reg != (int) INVALID_REGNUM && crtl->abi->id () == ARM_PCS_SVE)
6580 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6581 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6583 frame.reg_offset[regno] = offset;
6584 offset += vector_save_size;
6587 /* OFFSET is now the offset of the hard frame pointer from the bottom
6588 of the callee save area. */
6589 bool saves_below_hard_fp_p = maybe_ne (offset, 0);
6590 frame.below_hard_fp_saved_regs_size = offset;
6591 if (frame.emit_frame_chain)
6593 /* FP and LR are placed in the linkage record. */
6594 frame.reg_offset[R29_REGNUM] = offset;
6595 frame.wb_candidate1 = R29_REGNUM;
6596 frame.reg_offset[R30_REGNUM] = offset + UNITS_PER_WORD;
6597 frame.wb_candidate2 = R30_REGNUM;
6598 offset += 2 * UNITS_PER_WORD;
6601 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
6602 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6604 frame.reg_offset[regno] = offset;
6605 if (frame.wb_candidate1 == INVALID_REGNUM)
6606 frame.wb_candidate1 = regno;
6607 else if (frame.wb_candidate2 == INVALID_REGNUM)
6608 frame.wb_candidate2 = regno;
6609 offset += UNITS_PER_WORD;
6612 poly_int64 max_int_offset = offset;
6613 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6614 bool has_align_gap = maybe_ne (offset, max_int_offset);
6616 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
6617 if (known_eq (frame.reg_offset[regno], SLOT_REQUIRED))
6619 /* If there is an alignment gap between integer and fp callee-saves,
6620 allocate the last fp register to it if possible. */
6621 if (regno == last_fp_reg
6622 && has_align_gap
6623 && known_eq (vector_save_size, 8)
6624 && multiple_p (offset, 16))
6626 frame.reg_offset[regno] = max_int_offset;
6627 break;
6630 frame.reg_offset[regno] = offset;
6631 if (frame.wb_candidate1 == INVALID_REGNUM)
6632 frame.wb_candidate1 = regno;
6633 else if (frame.wb_candidate2 == INVALID_REGNUM
6634 && frame.wb_candidate1 >= V0_REGNUM)
6635 frame.wb_candidate2 = regno;
6636 offset += vector_save_size;
6639 offset = aligned_upper_bound (offset, STACK_BOUNDARY / BITS_PER_UNIT);
6641 frame.saved_regs_size = offset;
6643 poly_int64 varargs_and_saved_regs_size = offset + frame.saved_varargs_size;
6645 poly_int64 above_outgoing_args
6646 = aligned_upper_bound (varargs_and_saved_regs_size
6647 + get_frame_size (),
6648 STACK_BOUNDARY / BITS_PER_UNIT);
6650 frame.hard_fp_offset
6651 = above_outgoing_args - frame.below_hard_fp_saved_regs_size;
6653 /* Both these values are already aligned. */
6654 gcc_assert (multiple_p (crtl->outgoing_args_size,
6655 STACK_BOUNDARY / BITS_PER_UNIT));
6656 frame.frame_size = above_outgoing_args + crtl->outgoing_args_size;
6658 frame.locals_offset = frame.saved_varargs_size;
6660 frame.initial_adjust = 0;
6661 frame.final_adjust = 0;
6662 frame.callee_adjust = 0;
6663 frame.sve_callee_adjust = 0;
6664 frame.callee_offset = 0;
6666 HOST_WIDE_INT max_push_offset = 0;
6667 if (frame.wb_candidate2 != INVALID_REGNUM)
6668 max_push_offset = 512;
6669 else if (frame.wb_candidate1 != INVALID_REGNUM)
6670 max_push_offset = 256;
6672 HOST_WIDE_INT const_size, const_outgoing_args_size, const_fp_offset;
6673 HOST_WIDE_INT const_saved_regs_size;
6674 if (frame.frame_size.is_constant (&const_size)
6675 && const_size < max_push_offset
6676 && known_eq (frame.hard_fp_offset, const_size))
6678 /* Simple, small frame with no outgoing arguments:
6680 stp reg1, reg2, [sp, -frame_size]!
6681 stp reg3, reg4, [sp, 16] */
6682 frame.callee_adjust = const_size;
6684 else if (crtl->outgoing_args_size.is_constant (&const_outgoing_args_size)
6685 && frame.saved_regs_size.is_constant (&const_saved_regs_size)
6686 && const_outgoing_args_size + const_saved_regs_size < 512
6687 /* We could handle this case even with outgoing args, provided
6688 that the number of args left us with valid offsets for all
6689 predicate and vector save slots. It's such a rare case that
6690 it hardly seems worth the effort though. */
6691 && (!saves_below_hard_fp_p || const_outgoing_args_size == 0)
6692 && !(cfun->calls_alloca
6693 && frame.hard_fp_offset.is_constant (&const_fp_offset)
6694 && const_fp_offset < max_push_offset))
6696 /* Frame with small outgoing arguments:
6698 sub sp, sp, frame_size
6699 stp reg1, reg2, [sp, outgoing_args_size]
6700 stp reg3, reg4, [sp, outgoing_args_size + 16] */
6701 frame.initial_adjust = frame.frame_size;
6702 frame.callee_offset = const_outgoing_args_size;
6704 else if (saves_below_hard_fp_p
6705 && known_eq (frame.saved_regs_size,
6706 frame.below_hard_fp_saved_regs_size))
6708 /* Frame in which all saves are SVE saves:
6710 sub sp, sp, hard_fp_offset + below_hard_fp_saved_regs_size
6711 save SVE registers relative to SP
6712 sub sp, sp, outgoing_args_size */
6713 frame.initial_adjust = (frame.hard_fp_offset
6714 + frame.below_hard_fp_saved_regs_size);
6715 frame.final_adjust = crtl->outgoing_args_size;
6717 else if (frame.hard_fp_offset.is_constant (&const_fp_offset)
6718 && const_fp_offset < max_push_offset)
6720 /* Frame with large outgoing arguments or SVE saves, but with
6721 a small local area:
6723 stp reg1, reg2, [sp, -hard_fp_offset]!
6724 stp reg3, reg4, [sp, 16]
6725 [sub sp, sp, below_hard_fp_saved_regs_size]
6726 [save SVE registers relative to SP]
6727 sub sp, sp, outgoing_args_size */
6728 frame.callee_adjust = const_fp_offset;
6729 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6730 frame.final_adjust = crtl->outgoing_args_size;
6732 else
6734 /* Frame with large local area and outgoing arguments or SVE saves,
6735 using frame pointer:
6737 sub sp, sp, hard_fp_offset
6738 stp x29, x30, [sp, 0]
6739 add x29, sp, 0
6740 stp reg3, reg4, [sp, 16]
6741 [sub sp, sp, below_hard_fp_saved_regs_size]
6742 [save SVE registers relative to SP]
6743 sub sp, sp, outgoing_args_size */
6744 frame.initial_adjust = frame.hard_fp_offset;
6745 frame.sve_callee_adjust = frame.below_hard_fp_saved_regs_size;
6746 frame.final_adjust = crtl->outgoing_args_size;
6749 /* Make sure the individual adjustments add up to the full frame size. */
6750 gcc_assert (known_eq (frame.initial_adjust
6751 + frame.callee_adjust
6752 + frame.sve_callee_adjust
6753 + frame.final_adjust, frame.frame_size));
6755 if (!frame.emit_frame_chain && frame.callee_adjust == 0)
6757 /* We've decided not to associate any register saves with the initial
6758 stack allocation. */
6759 frame.wb_candidate1 = INVALID_REGNUM;
6760 frame.wb_candidate2 = INVALID_REGNUM;
6763 frame.laid_out = true;
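/* Illustrative sketch, not part of the original source: for a
   hypothetical non-leaf function that needs a frame chain, saves
   x19/x20, and has no locals or outgoing arguments, the code above
   produces reg_offset[R29] = 0, reg_offset[R30] = 8,
   reg_offset[R19] = 16, reg_offset[R20] = 24 and
   frame_size = hard_fp_offset = 32, so the first ("simple, small
   frame") case selects callee_adjust = 32 and the prologue becomes
   roughly

     stp  x29, x30, [sp, -32]!
     mov  x29, sp
     stp  x19, x20, [sp, 16]  */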
6766 /* Return true if the register REGNO is saved on entry to
6767 the current function. */
6769 static bool
6770 aarch64_register_saved_on_entry (int regno)
6772 return known_ge (cfun->machine->frame.reg_offset[regno], 0);
6775 /* Return the next register, from REGNO up to and including LIMIT, that
6776    the callee needs to save.  */
6778 static unsigned
6779 aarch64_next_callee_save (unsigned regno, unsigned limit)
6781 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
6782 regno ++;
6783 return regno;
6786 /* Push register number REGNO of mode MODE to the stack, using write-back
6787    to adjust the stack pointer by ADJUSTMENT.  */
6789 static void
6790 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
6791 HOST_WIDE_INT adjustment)
6793 rtx base_rtx = stack_pointer_rtx;
6794 rtx insn, reg, mem;
6796 reg = gen_rtx_REG (mode, regno);
6797 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
6798 plus_constant (Pmode, base_rtx, -adjustment));
6799 mem = gen_frame_mem (mode, mem);
6801 insn = emit_move_insn (mem, reg);
6802 RTX_FRAME_RELATED_P (insn) = 1;
6805 /* Generate and return an instruction to store the pair of registers
6806 REG and REG2 of mode MODE to location BASE with write-back adjusting
6807 the stack location BASE by ADJUSTMENT. */
6809 static rtx
6810 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6811 HOST_WIDE_INT adjustment)
6813 switch (mode)
6815 case E_DImode:
6816 return gen_storewb_pairdi_di (base, base, reg, reg2,
6817 GEN_INT (-adjustment),
6818 GEN_INT (UNITS_PER_WORD - adjustment));
6819 case E_DFmode:
6820 return gen_storewb_pairdf_di (base, base, reg, reg2,
6821 GEN_INT (-adjustment),
6822 GEN_INT (UNITS_PER_WORD - adjustment));
6823 case E_TFmode:
6824 return gen_storewb_pairtf_di (base, base, reg, reg2,
6825 GEN_INT (-adjustment),
6826 GEN_INT (UNITS_PER_VREG - adjustment));
6827 default:
6828 gcc_unreachable ();
6832 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
6833 stack pointer by ADJUSTMENT. */
6835 static void
6836 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
6838 rtx_insn *insn;
6839 machine_mode mode = aarch64_reg_save_mode (regno1);
6841 if (regno2 == INVALID_REGNUM)
6842 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
6844 rtx reg1 = gen_rtx_REG (mode, regno1);
6845 rtx reg2 = gen_rtx_REG (mode, regno2);
6847 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
6848 reg2, adjustment));
6849 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
6850 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
6851 RTX_FRAME_RELATED_P (insn) = 1;
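/* Illustrative sketch, not part of the original source: pushing the
   frame record with regno1 == R29_REGNUM, regno2 == R30_REGNUM and
   adjustment == 32 goes through the E_DImode case of
   aarch64_gen_storewb_pair and assembles to roughly

     stp  x29, x30, [sp, -32]!

   with both register stores marked frame-related for the CFI notes.  */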
6854 /* Load the pair of registers REG and REG2 of mode MODE from stack location
6855    BASE, adjusting BASE by ADJUSTMENT afterwards.  */
6857 static rtx
6858 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
6859 HOST_WIDE_INT adjustment)
6861 switch (mode)
6863 case E_DImode:
6864 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
6865 GEN_INT (UNITS_PER_WORD));
6866 case E_DFmode:
6867 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
6868 GEN_INT (UNITS_PER_WORD));
6869 case E_TFmode:
6870 return gen_loadwb_pairtf_di (base, base, reg, reg2, GEN_INT (adjustment),
6871 GEN_INT (UNITS_PER_VREG));
6872 default:
6873 gcc_unreachable ();
6877 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
6878 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
6879 into CFI_OPS. */
6881 static void
6882 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
6883 rtx *cfi_ops)
6885 machine_mode mode = aarch64_reg_save_mode (regno1);
6886 rtx reg1 = gen_rtx_REG (mode, regno1);
6888 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
6890 if (regno2 == INVALID_REGNUM)
6892 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
6893 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
6894 emit_move_insn (reg1, gen_frame_mem (mode, mem));
6896 else
6898 rtx reg2 = gen_rtx_REG (mode, regno2);
6899 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
6900 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
6901 reg2, adjustment));
6905 /* Generate and return a store pair instruction of mode MODE to store
6906 register REG1 to MEM1 and register REG2 to MEM2. */
6908 static rtx
6909 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
6910 rtx reg2)
6912 switch (mode)
6914 case E_DImode:
6915 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
6917 case E_DFmode:
6918 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
6920 case E_TFmode:
6921 return gen_store_pair_dw_tftf (mem1, reg1, mem2, reg2);
6923 default:
6924 gcc_unreachable ();
6928 /* Generate and return a load pair instruction of mode MODE to load register
6929 REG1 from MEM1 and register REG2 from MEM2. */
6931 static rtx
6932 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
6933 rtx mem2)
6935 switch (mode)
6937 case E_DImode:
6938 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
6940 case E_DFmode:
6941 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
6943 case E_TFmode:
6944 return gen_load_pair_dw_tftf (reg1, mem1, reg2, mem2);
6946 default:
6947 gcc_unreachable ();
6951 /* Return TRUE if return address signing should be enabled for the current
6952 function, otherwise return FALSE. */
6954 bool
6955 aarch64_return_address_signing_enabled (void)
6957   /* This function should only be called after the frame is laid out.  */
6958 gcc_assert (cfun->machine->frame.laid_out);
6960 /* Turn return address signing off in any function that uses
6961 __builtin_eh_return. The address passed to __builtin_eh_return
6962 is not signed so either it has to be signed (with original sp)
6963 or the code path that uses it has to avoid authenticating it.
6964 Currently eh return introduces a return to anywhere gadget, no
6965 matter what we do here since it uses ret with user provided
6966 address. An ideal fix for that is to use indirect branch which
6967 can be protected with BTI j (to some extent). */
6968 if (crtl->calls_eh_return)
6969 return false;
6971 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
6972 if its LR is pushed onto stack. */
6973 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
6974 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
6975 && known_ge (cfun->machine->frame.reg_offset[LR_REGNUM], 0)));
6978 /* Return TRUE if Branch Target Identification Mechanism is enabled. */
6979 bool
6980 aarch64_bti_enabled (void)
6982 return (aarch64_enable_bti == 1);
6985 /* The caller is going to use ST1D or LD1D to save or restore an SVE
6986 register in mode MODE at BASE_RTX + OFFSET, where OFFSET is in
6987 the range [1, 16] * GET_MODE_SIZE (MODE). Prepare for this by:
6989 (1) updating BASE_RTX + OFFSET so that it is a legitimate ST1D
6990 or LD1D address
6992    (2) setting PTRUE to a valid predicate register for the ST1D or LD1D,
6993 if the variable isn't already nonnull
6995 (1) is needed when OFFSET is in the range [8, 16] * GET_MODE_SIZE (MODE).
6996 Handle this case using a temporary base register that is suitable for
6997 all offsets in that range. Use ANCHOR_REG as this base register if it
6998 is nonnull, otherwise create a new register and store it in ANCHOR_REG. */
7000 static inline void
7001 aarch64_adjust_sve_callee_save_base (machine_mode mode, rtx &base_rtx,
7002 rtx &anchor_reg, poly_int64 &offset,
7003 rtx &ptrue)
7005 if (maybe_ge (offset, 8 * GET_MODE_SIZE (mode)))
7007 /* This is the maximum valid offset of the anchor from the base.
7008 Lower values would be valid too. */
7009 poly_int64 anchor_offset = 16 * GET_MODE_SIZE (mode);
7010 if (!anchor_reg)
7012 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7013 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7014 gen_int_mode (anchor_offset, Pmode)));
7016 base_rtx = anchor_reg;
7017 offset -= anchor_offset;
7019 if (!ptrue)
7021 int pred_reg = cfun->machine->frame.spare_pred_reg;
7022 emit_move_insn (gen_rtx_REG (VNx16BImode, pred_reg),
7023 CONSTM1_RTX (VNx16BImode));
7024 ptrue = gen_rtx_REG (VNx2BImode, pred_reg);
7028 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
7029 is saved at BASE + OFFSET. */
7031 static void
7032 aarch64_add_cfa_expression (rtx_insn *insn, rtx reg,
7033 rtx base, poly_int64 offset)
7035 rtx mem = gen_frame_mem (GET_MODE (reg),
7036 plus_constant (Pmode, base, offset));
7037 add_reg_note (insn, REG_CFA_EXPRESSION, gen_rtx_SET (mem, reg));
7040 /* Emit code to save the callee-saved registers from register number START
7041 to LIMIT to the stack at the location starting at offset START_OFFSET,
7042 skipping any write-back candidates if SKIP_WB is true. HARD_FP_VALID_P
7043 is true if the hard frame pointer has been set up. */
7045 static void
7046 aarch64_save_callee_saves (poly_int64 start_offset,
7047 unsigned start, unsigned limit, bool skip_wb,
7048 bool hard_fp_valid_p)
7050 rtx_insn *insn;
7051 unsigned regno;
7052 unsigned regno2;
7053 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7055 for (regno = aarch64_next_callee_save (start, limit);
7056 regno <= limit;
7057 regno = aarch64_next_callee_save (regno + 1, limit))
7059 rtx reg, mem;
7060 poly_int64 offset;
7061 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7063 if (skip_wb
7064 && (regno == cfun->machine->frame.wb_candidate1
7065 || regno == cfun->machine->frame.wb_candidate2))
7066 continue;
7068 if (cfun->machine->reg_is_wrapped_separately[regno])
7069 continue;
7071 machine_mode mode = aarch64_reg_save_mode (regno);
7072 reg = gen_rtx_REG (mode, regno);
7073 offset = start_offset + cfun->machine->frame.reg_offset[regno];
7074 rtx base_rtx = stack_pointer_rtx;
7075 poly_int64 sp_offset = offset;
7077 HOST_WIDE_INT const_offset;
7078 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7079 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7080 offset, ptrue);
7081 else if (GP_REGNUM_P (regno)
7082 && (!offset.is_constant (&const_offset) || const_offset >= 512))
7084 gcc_assert (known_eq (start_offset, 0));
7085 poly_int64 fp_offset
7086 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7087 if (hard_fp_valid_p)
7088 base_rtx = hard_frame_pointer_rtx;
7089 else
7091 if (!anchor_reg)
7093 anchor_reg = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7094 emit_insn (gen_add3_insn (anchor_reg, base_rtx,
7095 gen_int_mode (fp_offset, Pmode)));
7097 base_rtx = anchor_reg;
7099 offset -= fp_offset;
7101 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7102 bool need_cfa_note_p = (base_rtx != stack_pointer_rtx);
7104 if (!aarch64_sve_mode_p (mode)
7105 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7106 && !cfun->machine->reg_is_wrapped_separately[regno2]
7107 && known_eq (GET_MODE_SIZE (mode),
7108 cfun->machine->frame.reg_offset[regno2]
7109 - cfun->machine->frame.reg_offset[regno]))
7111 rtx reg2 = gen_rtx_REG (mode, regno2);
7112 rtx mem2;
7114 offset += GET_MODE_SIZE (mode);
7115 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7116 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
7117 reg2));
7119 /* The first part of a frame-related parallel insn is
7120 always assumed to be relevant to the frame
7121        calculations; subsequent parts are only
7122 frame-related if explicitly marked. */
7123 if (aarch64_emit_cfi_for_reg_p (regno2))
7125 if (need_cfa_note_p)
7126 aarch64_add_cfa_expression (insn, reg2, stack_pointer_rtx,
7127 sp_offset + GET_MODE_SIZE (mode));
7128 else
7129 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
7132 regno = regno2;
7134 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7136 insn = emit_insn (gen_aarch64_pred_mov (mode, mem, ptrue, reg));
7137 need_cfa_note_p = true;
7139 else if (aarch64_sve_mode_p (mode))
7140 insn = emit_insn (gen_rtx_SET (mem, reg));
7141 else
7142 insn = emit_move_insn (mem, reg);
7144 RTX_FRAME_RELATED_P (insn) = frame_related_p;
7145 if (frame_related_p && need_cfa_note_p)
7146 aarch64_add_cfa_expression (insn, reg, stack_pointer_rtx, sp_offset);
7150 /* Emit code to restore the callee registers from register number START
7151 up to and including LIMIT. Restore from the stack offset START_OFFSET,
7152 skipping any write-back candidates if SKIP_WB is true. Write the
7153 appropriate REG_CFA_RESTORE notes into CFI_OPS. */
7155 static void
7156 aarch64_restore_callee_saves (poly_int64 start_offset, unsigned start,
7157 unsigned limit, bool skip_wb, rtx *cfi_ops)
7159 unsigned regno;
7160 unsigned regno2;
7161 poly_int64 offset;
7162 rtx anchor_reg = NULL_RTX, ptrue = NULL_RTX;
7164 for (regno = aarch64_next_callee_save (start, limit);
7165 regno <= limit;
7166 regno = aarch64_next_callee_save (regno + 1, limit))
7168 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7169 if (cfun->machine->reg_is_wrapped_separately[regno])
7170 continue;
7172 rtx reg, mem;
7174 if (skip_wb
7175 && (regno == cfun->machine->frame.wb_candidate1
7176 || regno == cfun->machine->frame.wb_candidate2))
7177 continue;
7179 machine_mode mode = aarch64_reg_save_mode (regno);
7180 reg = gen_rtx_REG (mode, regno);
7181 offset = start_offset + cfun->machine->frame.reg_offset[regno];
7182 rtx base_rtx = stack_pointer_rtx;
7183 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7184 aarch64_adjust_sve_callee_save_base (mode, base_rtx, anchor_reg,
7185 offset, ptrue);
7186 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7188 if (!aarch64_sve_mode_p (mode)
7189 && (regno2 = aarch64_next_callee_save (regno + 1, limit)) <= limit
7190 && !cfun->machine->reg_is_wrapped_separately[regno2]
7191 && known_eq (GET_MODE_SIZE (mode),
7192 cfun->machine->frame.reg_offset[regno2]
7193 - cfun->machine->frame.reg_offset[regno]))
7195 rtx reg2 = gen_rtx_REG (mode, regno2);
7196 rtx mem2;
7198 offset += GET_MODE_SIZE (mode);
7199 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
7200 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7202 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
7203 regno = regno2;
7205 else if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7206 emit_insn (gen_aarch64_pred_mov (mode, reg, ptrue, mem));
7207 else if (aarch64_sve_mode_p (mode))
7208 emit_insn (gen_rtx_SET (reg, mem));
7209 else
7210 emit_move_insn (reg, mem);
7211 if (frame_related_p)
7212 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
7216 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
7217 of MODE. */
7219 static inline bool
7220 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7222 HOST_WIDE_INT multiple;
7223 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7224 && IN_RANGE (multiple, -8, 7));
7227 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
7228 of MODE. */
7230 static inline bool
7231 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7233 HOST_WIDE_INT multiple;
7234 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7235 && IN_RANGE (multiple, 0, 63));
7238 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
7239 of MODE. */
7241 bool
7242 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7244 HOST_WIDE_INT multiple;
7245 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7246 && IN_RANGE (multiple, -64, 63));
7249 /* Return true if OFFSET is a signed 9-bit value. */
7251 bool
7252 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
7253 poly_int64 offset)
7255 HOST_WIDE_INT const_offset;
7256 return (offset.is_constant (&const_offset)
7257 && IN_RANGE (const_offset, -256, 255));
7260 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
7261 of MODE. */
7263 static inline bool
7264 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
7266 HOST_WIDE_INT multiple;
7267 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7268 && IN_RANGE (multiple, -256, 255));
7271 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
7272 of MODE. */
7274 static inline bool
7275 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
7277 HOST_WIDE_INT multiple;
7278 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
7279 && IN_RANGE (multiple, 0, 4095));
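/* Illustrative sketch (not part of the build): the scaled-offset checks
   above all follow the same pattern -- the byte offset must be an exact
   multiple of the mode size and the quotient must fit in the named
   signed or unsigned field.  A plain-integer version of the 7-bit
   signed case, for constant offsets only (the function name and the use
   of long are illustrative assumptions):  */

#include <stdbool.h>

static bool
offset_7bit_signed_scaled_example (long offset, long mode_size)
{
  /* Reject offsets that are not a multiple of the access size.  */
  if (offset % mode_size != 0)
    return false;
  long multiple = offset / mode_size;
  /* The scaled value must fit in a signed 7-bit field.  */
  return multiple >= -64 && multiple <= 63;
}

/* For an 8-byte access (DImode) this accepts offsets in [-512, 504]:
   offset_7bit_signed_scaled_example (504, 8) -> true
   offset_7bit_signed_scaled_example (512, 8) -> false (multiple is 64)
   offset_7bit_signed_scaled_example (-4, 8)  -> false (not a multiple)  */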
7282 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
7284 static sbitmap
7285 aarch64_get_separate_components (void)
7287 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7288 bitmap_clear (components);
7290 /* The registers we need saved to the frame. */
7291 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7292 if (aarch64_register_saved_on_entry (regno))
7294 /* Punt on saves and restores that use ST1D and LD1D. We could
7295 try to be smarter, but it would involve making sure that the
7296 spare predicate register itself is safe to use at the save
7297 and restore points. Also, when a frame pointer is being used,
7298 the slots are often out of reach of ST1D and LD1D anyway. */
7299 machine_mode mode = aarch64_reg_save_mode (regno);
7300 if (mode == VNx2DImode && BYTES_BIG_ENDIAN)
7301 continue;
7303 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7305 /* If the register is saved in the first SVE save slot, we use
7306 it as a stack probe for -fstack-clash-protection. */
7307 if (flag_stack_clash_protection
7308 && maybe_ne (cfun->machine->frame.below_hard_fp_saved_regs_size, 0)
7309 && known_eq (offset, 0))
7310 continue;
7312 /* Get the offset relative to the register we'll use. */
7313 if (frame_pointer_needed)
7314 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7315 else
7316 offset += crtl->outgoing_args_size;
7318 /* Check that we can access the stack slot of the register with one
7319 direct load with no adjustments needed. */
7320 if (aarch64_sve_mode_p (mode)
7321 ? offset_9bit_signed_scaled_p (mode, offset)
7322 : offset_12bit_unsigned_scaled_p (mode, offset))
7323 bitmap_set_bit (components, regno);
7326 /* Don't mess with the hard frame pointer. */
7327 if (frame_pointer_needed)
7328 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
7330 /* If the spare predicate register used by big-endian SVE code
7331 is call-preserved, it must be saved in the main prologue
7332 before any saves that use it. */
7333 if (cfun->machine->frame.spare_pred_reg != INVALID_REGNUM)
7334 bitmap_clear_bit (components, cfun->machine->frame.spare_pred_reg);
7336 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7337 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7338 /* If registers have been chosen to be stored/restored with
7339 writeback, don't interfere with them, to avoid having to output explicit
7340 stack adjustment instructions. */
7341 if (reg2 != INVALID_REGNUM)
7342 bitmap_clear_bit (components, reg2);
7343 if (reg1 != INVALID_REGNUM)
7344 bitmap_clear_bit (components, reg1);
7346 bitmap_clear_bit (components, LR_REGNUM);
7347 bitmap_clear_bit (components, SP_REGNUM);
7349 return components;
7352 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
7354 static sbitmap
7355 aarch64_components_for_bb (basic_block bb)
7357 bitmap in = DF_LIVE_IN (bb);
7358 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
7359 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
7361 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
7362 bitmap_clear (components);
7364 /* Clobbered registers don't generate values in any meaningful sense,
7365 since nothing after the clobber can rely on their value. And we can't
7366 say that partially-clobbered registers are unconditionally killed,
7367 because whether they're killed or not depends on the mode of the
7368 value they're holding. Thus partially call-clobbered registers
7369 appear in neither the kill set nor the gen set.
7371 Check manually for any calls that clobber more of a register than the
7372 current function can. */
7373 function_abi_aggregator callee_abis;
7374 rtx_insn *insn;
7375 FOR_BB_INSNS (bb, insn)
7376 if (CALL_P (insn))
7377 callee_abis.note_callee_abi (insn_callee_abi (insn));
7378 HARD_REG_SET extra_caller_saves = callee_abis.caller_save_regs (*crtl->abi);
7380 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
7381 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7382 if (!fixed_regs[regno]
7383 && !crtl->abi->clobbers_full_reg_p (regno)
7384 && (TEST_HARD_REG_BIT (extra_caller_saves, regno)
7385 || bitmap_bit_p (in, regno)
7386 || bitmap_bit_p (gen, regno)
7387 || bitmap_bit_p (kill, regno)))
7389 bitmap_set_bit (components, regno);
7391 /* If there is a callee-save at an adjacent offset, add it as well
7392 to increase the use of LDP/STP. */
7393 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7394 unsigned regno2 = multiple_p (offset, 16) ? regno + 1 : regno - 1;
7396 if (regno2 <= LAST_SAVED_REGNUM)
7398 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7399 if (regno < regno2
7400 ? known_eq (offset + 8, offset2)
7401 : multiple_p (offset2, 16) && known_eq (offset2 + 8, offset))
7402 bitmap_set_bit (components, regno2);
7406 return components;
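/* Illustrative sketch (not part of the build): the pairing rule used
   just above to pull an adjacent callee-save into the same component so
   that it can be stored/loaded with STP/LDP.  A slot at a 16-byte
   aligned offset pairs with the next register at offset + 8; a slot
   8 bytes past a 16-byte boundary pairs with the previous register.
   The function name is an illustrative assumption.  */

#include <stdbool.h>

static bool
forms_ldp_pair_example (long offset, long offset2)
{
  if (offset % 16 == 0)
    /* Partner is the following register, expected at offset + 8.  */
    return offset2 == offset + 8;
  /* Otherwise the partner is the preceding register, which must sit on
     a 16-byte boundary exactly 8 bytes below this slot.  */
  return offset2 % 16 == 0 && offset == offset2 + 8;
}

/* forms_ldp_pair_example (16, 24) -> true
   forms_ldp_pair_example (24, 16) -> true
   forms_ldp_pair_example (24, 32) -> false  */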
7409 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
7410 Nothing to do for aarch64. */
7412 static void
7413 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
7417 /* Return the next set bit in BMP from START onwards. Return the total number
7418 of bits in BMP if no set bit is found at or after START. */
7420 static unsigned int
7421 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
7423 unsigned int nbits = SBITMAP_SIZE (bmp);
7424 if (start == nbits)
7425 return start;
7427 gcc_assert (start < nbits);
7428 for (unsigned int i = start; i < nbits; i++)
7429 if (bitmap_bit_p (bmp, i))
7430 return i;
7432 return nbits;
7435 /* Do the work for aarch64_emit_prologue_components and
7436 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
7437 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
7438 for these components or the epilogue sequence. That is, it determines
7439 whether we should emit stores or loads and what kind of CFA notes to attach
7440 to the insns. Otherwise the logic for the two sequences is very
7441 similar. */
7443 static void
7444 aarch64_process_components (sbitmap components, bool prologue_p)
7446 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
7447 ? HARD_FRAME_POINTER_REGNUM
7448 : STACK_POINTER_REGNUM);
7450 unsigned last_regno = SBITMAP_SIZE (components);
7451 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
7452 rtx_insn *insn = NULL;
7454 while (regno != last_regno)
7456 bool frame_related_p = aarch64_emit_cfi_for_reg_p (regno);
7457 machine_mode mode = aarch64_reg_save_mode (regno);
7459 rtx reg = gen_rtx_REG (mode, regno);
7460 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
7461 if (frame_pointer_needed)
7462 offset -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7463 else
7464 offset += crtl->outgoing_args_size;
7466 rtx addr = plus_constant (Pmode, ptr_reg, offset);
7467 rtx mem = gen_frame_mem (mode, addr);
7469 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
7470 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
7471 /* No more registers to handle after REGNO.
7472 Emit a single save/restore and exit. */
7473 if (regno2 == last_regno)
7475 insn = emit_insn (set);
7476 if (frame_related_p)
7478 RTX_FRAME_RELATED_P (insn) = 1;
7479 if (prologue_p)
7480 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7481 else
7482 add_reg_note (insn, REG_CFA_RESTORE, reg);
7484 break;
7487 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
7488 /* The next register is not of the same class or its offset is not
7489 mergeable with the current one into a pair. */
7490 if (aarch64_sve_mode_p (mode)
7491 || !satisfies_constraint_Ump (mem)
7492 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
7493 || (crtl->abi->id () == ARM_PCS_SIMD && FP_REGNUM_P (regno))
7494 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
7495 GET_MODE_SIZE (mode)))
7497 insn = emit_insn (set);
7498 if (frame_related_p)
7500 RTX_FRAME_RELATED_P (insn) = 1;
7501 if (prologue_p)
7502 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
7503 else
7504 add_reg_note (insn, REG_CFA_RESTORE, reg);
7507 regno = regno2;
7508 continue;
7511 bool frame_related2_p = aarch64_emit_cfi_for_reg_p (regno2);
7513 /* REGNO2 can be saved/restored in a pair with REGNO. */
7514 rtx reg2 = gen_rtx_REG (mode, regno2);
7515 if (frame_pointer_needed)
7516 offset2 -= cfun->machine->frame.below_hard_fp_saved_regs_size;
7517 else
7518 offset2 += crtl->outgoing_args_size;
7519 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
7520 rtx mem2 = gen_frame_mem (mode, addr2);
7521 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
7522 : gen_rtx_SET (reg2, mem2);
7524 if (prologue_p)
7525 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
7526 else
7527 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
7529 if (frame_related_p || frame_related2_p)
7531 RTX_FRAME_RELATED_P (insn) = 1;
7532 if (prologue_p)
7534 if (frame_related_p)
7535 add_reg_note (insn, REG_CFA_OFFSET, set);
7536 if (frame_related2_p)
7537 add_reg_note (insn, REG_CFA_OFFSET, set2);
7539 else
7541 if (frame_related_p)
7542 add_reg_note (insn, REG_CFA_RESTORE, reg);
7543 if (frame_related2_p)
7544 add_reg_note (insn, REG_CFA_RESTORE, reg2);
7548 regno = aarch64_get_next_set_bit (components, regno2 + 1);
7552 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
7554 static void
7555 aarch64_emit_prologue_components (sbitmap components)
7557 aarch64_process_components (components, true);
7560 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
7562 static void
7563 aarch64_emit_epilogue_components (sbitmap components)
7565 aarch64_process_components (components, false);
7568 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
7570 static void
7571 aarch64_set_handled_components (sbitmap components)
7573 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
7574 if (bitmap_bit_p (components, regno))
7575 cfun->machine->reg_is_wrapped_separately[regno] = true;
7578 /* On AArch64 we have an ABI-defined safe buffer. This constant is used to
7579 determine the probe offset for alloca. */
7581 static HOST_WIDE_INT
7582 aarch64_stack_clash_protection_alloca_probe_range (void)
7584 return STACK_CLASH_CALLER_GUARD;
7588 /* Allocate POLY_SIZE bytes of stack space using TEMP1 and TEMP2 as scratch
7589 registers. If POLY_SIZE is not large enough to require a probe this function
7590 will only adjust the stack. When allocating the stack space,
7591 FRAME_RELATED_P indicates whether the allocation is frame related.
7592 FINAL_ADJUSTMENT_P indicates whether we are allocating the outgoing
7593 arguments. If we are, then we ensure that any allocation larger than the
7594 ABI-defined buffer is probed, so that the invariant of having a 1KB buffer
7595 is maintained.
7597 We emit barriers after each stack adjustment to prevent optimizations from
7598 breaking the invariant that we never drop the stack more than a page. This
7599 invariant is needed to make it easier to correctly handle asynchronous
7600 events, e.g. if we were to allow the stack to be dropped by more than a page
7601 and then issue multiple probes, and a signal arrived somewhere in between,
7602 the signal handler would not know the state of the stack and could make no
7603 assumptions about which pages have been probed. */
7605 static void
7606 aarch64_allocate_and_probe_stack_space (rtx temp1, rtx temp2,
7607 poly_int64 poly_size,
7608 bool frame_related_p,
7609 bool final_adjustment_p)
7611 HOST_WIDE_INT guard_size
7612 = 1 << param_stack_clash_protection_guard_size;
7613 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
7614 HOST_WIDE_INT min_probe_threshold
7615 = (final_adjustment_p
7616 ? guard_used_by_caller
7617 : guard_size - guard_used_by_caller);
7618 /* When doing the final adjustment for the outgoing arguments, take into
7619 account any unprobed space there is above the current SP. There are
7620 two cases:
7622 - When saving SVE registers below the hard frame pointer, we force
7623 the lowest save to take place in the prologue before doing the final
7624 adjustment (i.e. we don't allow the save to be shrink-wrapped).
7625 This acts as a probe at SP, so there is no unprobed space.
7627 - When there are no SVE register saves, we use the store of the link
7628 register as a probe. We can't assume that LR was saved at position 0
7629 though, so treat any space below it as unprobed. */
7630 if (final_adjustment_p
7631 && known_eq (cfun->machine->frame.below_hard_fp_saved_regs_size, 0))
7633 poly_int64 lr_offset = cfun->machine->frame.reg_offset[LR_REGNUM];
7634 if (known_ge (lr_offset, 0))
7635 min_probe_threshold -= lr_offset.to_constant ();
7636 else
7637 gcc_assert (!flag_stack_clash_protection || known_eq (poly_size, 0));
7640 poly_int64 frame_size = cfun->machine->frame.frame_size;
7642 /* We should always have a positive probe threshold. */
7643 gcc_assert (min_probe_threshold > 0);
7645 if (flag_stack_clash_protection && !final_adjustment_p)
7647 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7648 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7649 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7651 if (known_eq (frame_size, 0))
7653 dump_stack_clash_frame_info (NO_PROBE_NO_FRAME, false);
7655 else if (known_lt (initial_adjust + sve_callee_adjust,
7656 guard_size - guard_used_by_caller)
7657 && known_lt (final_adjust, guard_used_by_caller))
7659 dump_stack_clash_frame_info (NO_PROBE_SMALL_FRAME, true);
7663 /* If SIZE is not large enough to require probing, just adjust the stack and
7664 exit. */
7665 if (known_lt (poly_size, min_probe_threshold)
7666 || !flag_stack_clash_protection)
7668 aarch64_sub_sp (temp1, temp2, poly_size, frame_related_p);
7669 return;
7672 HOST_WIDE_INT size;
7673 /* Handle the SVE non-constant case first. */
7674 if (!poly_size.is_constant (&size))
7676 if (dump_file)
7678 fprintf (dump_file, "Stack clash SVE prologue: ");
7679 print_dec (poly_size, dump_file);
7680 fprintf (dump_file, " bytes, dynamic probing will be required.\n");
7683 /* First calculate the amount of bytes we're actually spilling. */
7684 aarch64_add_offset (Pmode, temp1, CONST0_RTX (Pmode),
7685 poly_size, temp1, temp2, false, true);
7687 rtx_insn *insn = get_last_insn ();
7689 if (frame_related_p)
7691 /* This is done to provide unwinding information for the stack
7692 adjustments we're about to do; however, to prevent the optimizers
7693 from removing the R11 move and leaving the CFA note (which would be
7694 very wrong), we tie the old and new stack pointers together.
7695 The tie will expand to nothing, but the optimizers will not touch
7696 the instruction. */
7697 rtx stack_ptr_copy = gen_rtx_REG (Pmode, STACK_CLASH_SVE_CFA_REGNUM);
7698 emit_move_insn (stack_ptr_copy, stack_pointer_rtx);
7699 emit_insn (gen_stack_tie (stack_ptr_copy, stack_pointer_rtx));
7701 /* We want the CFA independent of the stack pointer for the
7702 duration of the loop. */
7703 add_reg_note (insn, REG_CFA_DEF_CFA, stack_ptr_copy);
7704 RTX_FRAME_RELATED_P (insn) = 1;
7707 rtx probe_const = gen_int_mode (min_probe_threshold, Pmode);
7708 rtx guard_const = gen_int_mode (guard_size, Pmode);
7710 insn = emit_insn (gen_probe_sve_stack_clash (Pmode, stack_pointer_rtx,
7711 stack_pointer_rtx, temp1,
7712 probe_const, guard_const));
7714 /* Now reset the CFA register if needed. */
7715 if (frame_related_p)
7717 add_reg_note (insn, REG_CFA_DEF_CFA,
7718 gen_rtx_PLUS (Pmode, stack_pointer_rtx,
7719 gen_int_mode (poly_size, Pmode)));
7720 RTX_FRAME_RELATED_P (insn) = 1;
7723 return;
7726 if (dump_file)
7727 fprintf (dump_file,
7728 "Stack clash AArch64 prologue: " HOST_WIDE_INT_PRINT_DEC
7729 " bytes, probing will be required.\n", size);
7731 /* Round size down to a multiple of guard_size, and calculate the
7732 residual as the difference between the original size and the rounded
7733 size. */
7734 HOST_WIDE_INT rounded_size = ROUND_DOWN (size, guard_size);
7735 HOST_WIDE_INT residual = size - rounded_size;
7737 /* We can handle a small number of allocations/probes inline. Otherwise
7738 punt to a loop. */
7739 if (rounded_size <= STACK_CLASH_MAX_UNROLL_PAGES * guard_size)
7741 for (HOST_WIDE_INT i = 0; i < rounded_size; i += guard_size)
7743 aarch64_sub_sp (NULL, temp2, guard_size, true);
7744 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7745 guard_used_by_caller));
7746 emit_insn (gen_blockage ());
7748 dump_stack_clash_frame_info (PROBE_INLINE, size != rounded_size);
7750 else
7752 /* Compute the ending address. */
7753 aarch64_add_offset (Pmode, temp1, stack_pointer_rtx, -rounded_size,
7754 temp1, NULL, false, true);
7755 rtx_insn *insn = get_last_insn ();
7757 /* For the initial allocation, we don't have a frame pointer
7758 set up, so we always need CFI notes. If we're doing the
7759 final allocation, then we may have a frame pointer, in which
7760 case it is the CFA, otherwise we need CFI notes.
7762 We can determine which allocation we are doing by looking at
7763 the value of FRAME_RELATED_P since the final allocations are not
7764 frame related. */
7765 if (frame_related_p)
7767 /* We want the CFA independent of the stack pointer for the
7768 duration of the loop. */
7769 add_reg_note (insn, REG_CFA_DEF_CFA,
7770 plus_constant (Pmode, temp1, rounded_size));
7771 RTX_FRAME_RELATED_P (insn) = 1;
7774 /* This allocates and probes the stack. Note that this re-uses some of
7775 the existing Ada stack protection code. However we are guaranteed not
7776 to enter the non loop or residual branches of that code.
7778 The non-loop part won't be entered because if our allocation amount
7779 doesn't require a loop, the case above would handle it.
7781 The residual branch won't be entered because TEMP1 is a multiple of
7782 the allocation size, so the residual will always be 0. As such, the only
7783 part we are actually using from that code is the loop setup. The
7784 actual probing is done in aarch64_output_probe_stack_range. */
7785 insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
7786 stack_pointer_rtx, temp1));
7788 /* Now reset the CFA register if needed. */
7789 if (frame_related_p)
7791 add_reg_note (insn, REG_CFA_DEF_CFA,
7792 plus_constant (Pmode, stack_pointer_rtx, rounded_size));
7793 RTX_FRAME_RELATED_P (insn) = 1;
7796 emit_insn (gen_blockage ());
7797 dump_stack_clash_frame_info (PROBE_LOOP, size != rounded_size);
7800 /* Handle any residuals. Residuals of at least MIN_PROBE_THRESHOLD have to
7801 be probed. This maintains the requirement that each page is probed at
7802 least once. For initial probing we probe only if the allocation is
7803 more than GUARD_SIZE - buffer, and for the outgoing arguments we probe
7804 if the amount is larger than buffer. GUARD_SIZE - buffer + buffer ==
7805 GUARD_SIZE. This ensures that any allocation that is large enough to
7806 trigger a probe here gets at least one, and if an allocation is not large
7807 enough for this code to emit anything for it, the page would have been
7808 probed by the saving of FP/LR either by this function or any callees. If
7809 we don't have any callees then we won't have more stack adjustments and so
7810 are still safe. */
7811 if (residual)
7813 HOST_WIDE_INT residual_probe_offset = guard_used_by_caller;
7814 /* If we're doing final adjustments, and we've done any full page
7815 allocations then any residual needs to be probed. */
7816 if (final_adjustment_p && rounded_size != 0)
7817 min_probe_threshold = 0;
7818 /* If doing a small final adjustment, we always probe at offset 0.
7819 This is done to avoid issues when LR is not at position 0 or when
7820 the final adjustment is smaller than the probing offset. */
7821 else if (final_adjustment_p && rounded_size == 0)
7822 residual_probe_offset = 0;
7824 aarch64_sub_sp (temp1, temp2, residual, frame_related_p);
7825 if (residual >= min_probe_threshold)
7827 if (dump_file)
7828 fprintf (dump_file,
7829 "Stack clash AArch64 prologue residuals: "
7830 HOST_WIDE_INT_PRINT_DEC " bytes, probing will be required."
7831 "\n", residual);
7833 emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
7834 residual_probe_offset));
7835 emit_insn (gen_blockage ());
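/* Illustrative sketch (not part of the build): the thresholds and the
   rounding used above, with plain integers and the default 64KB guard
   plus the 1KB STACK_CLASH_CALLER_GUARD buffer (the concrete numbers
   and the function name are illustrative assumptions).  */

static void
stack_clash_thresholds_example (void)
{
  long guard_size = 64 * 1024;
  long guard_used_by_caller = 1024;

  /* A non-final allocation needs probing only above 63KB ...  */
  long initial_threshold = guard_size - guard_used_by_caller;   /* 64512 */
  /* ... while the final (outgoing-args) allocation probes above 1KB.  */
  long final_threshold = guard_used_by_caller;                  /* 1024 */

  /* A 150KB constant-sized allocation splits into whole guard-sized
     pages plus a residual, as ROUND_DOWN does above.  */
  long size = 150 * 1024;                        /* 153600 */
  long rounded_size = size - size % guard_size;  /* 131072: two full pages */
  long residual = size - rounded_size;           /* 22528: probed separately */

  (void) initial_threshold;
  (void) final_threshold;
  (void) residual;
}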
7840 /* Return 1 if the register is used by the epilogue. We need to say the
7841 return register is used, but only after epilogue generation is complete.
7842 Note that in the case of sibcalls, the values "used by the epilogue" are
7843 considered live at the start of the called function.
7845 For SIMD functions we need to return 1 for FP registers that are saved and
7846 restored by a function but are not zero in call_used_regs. If we do not do
7847 this, optimizations may remove the restore of the register. */
7850 aarch64_epilogue_uses (int regno)
7852 if (epilogue_completed)
7854 if (regno == LR_REGNUM)
7855 return 1;
7857 return 0;
7860 /* AArch64 stack frames generated by this compiler look like:
7862 +-------------------------------+
7864 | incoming stack arguments |
7866 +-------------------------------+
7867 | | <-- incoming stack pointer (aligned)
7868 | callee-allocated save area |
7869 | for register varargs |
7871 +-------------------------------+
7872 | local variables | <-- frame_pointer_rtx
7874 +-------------------------------+
7875 | padding | \
7876 +-------------------------------+ |
7877 | callee-saved registers | | frame.saved_regs_size
7878 +-------------------------------+ |
7879 | LR' | |
7880 +-------------------------------+ |
7881 | FP' | |
7882 +-------------------------------+ |<- hard_frame_pointer_rtx (aligned)
7883 | SVE vector registers | | \
7884 +-------------------------------+ | | below_hard_fp_saved_regs_size
7885 | SVE predicate registers | / /
7886 +-------------------------------+
7887 | dynamic allocation |
7888 +-------------------------------+
7889 | padding |
7890 +-------------------------------+
7891 | outgoing stack arguments | <-- arg_pointer
7893 +-------------------------------+
7894 | | <-- stack_pointer_rtx (aligned)
7896 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
7897 but leave frame_pointer_rtx and hard_frame_pointer_rtx
7898 unchanged.
7900 By default for stack-clash we assume the guard is at least 64KB, but this
7901 value is configurable to either 4KB or 64KB. We also force the guard size to
7902 be the same as the probing interval and both values are kept in sync.
7904 With those assumptions the callee can allocate up to 63KB (or 3KB depending
7905 on the guard size) of stack space without probing.
7907 When probing is needed, we emit a probe at the start of the prologue
7908 and every PARAM_STACK_CLASH_PROTECTION_GUARD_SIZE bytes thereafter.
7910 We have to track how much space has been allocated and the only stores
7911 to the stack we track as implicit probes are the FP/LR stores.
7913 For outgoing arguments we probe if the size is larger than 1KB, such that
7914 the ABI specified buffer is maintained for the next callee.
7916 The following registers are reserved during frame layout and should not be
7917 used for any other purpose:
7919 - r11: Used by stack clash protection when SVE is enabled, and also
7920 as an anchor register when saving and restoring registers
7921 - r12(EP0) and r13(EP1): Used as temporaries for stack adjustment.
7922 - r14 and r15: Used for speculation tracking.
7923 - r16(IP0), r17(IP1): Used by indirect tailcalls.
7924 - r30(LR), r29(FP): Used by standard frame layout.
7926 These registers must be avoided in frame layout related code unless the
7927 explicit intention is to interact with one of the features listed above. */
7929 /* Generate the prologue instructions for entry into a function.
7930 Establish the stack frame by decreasing the stack pointer with a
7931 properly calculated size and, if necessary, create a frame record
7932 filled with the values of LR and previous frame pointer. The
7933 current FP is also set up if it is in use. */
7935 void
7936 aarch64_expand_prologue (void)
7938 poly_int64 frame_size = cfun->machine->frame.frame_size;
7939 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
7940 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
7941 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
7942 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
7943 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
7944 poly_int64 below_hard_fp_saved_regs_size
7945 = cfun->machine->frame.below_hard_fp_saved_regs_size;
7946 unsigned reg1 = cfun->machine->frame.wb_candidate1;
7947 unsigned reg2 = cfun->machine->frame.wb_candidate2;
7948 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
7949 rtx_insn *insn;
7951 if (flag_stack_clash_protection && known_eq (callee_adjust, 0))
7953 /* Fold the SVE allocation into the initial allocation.
7954 We don't do this in aarch64_layout_arg to avoid pessimizing
7955 the epilogue code. */
7956 initial_adjust += sve_callee_adjust;
7957 sve_callee_adjust = 0;
7960 /* Sign return address for functions. */
7961 if (aarch64_return_address_signing_enabled ())
7963 switch (aarch64_ra_sign_key)
7965 case AARCH64_KEY_A:
7966 insn = emit_insn (gen_paciasp ());
7967 break;
7968 case AARCH64_KEY_B:
7969 insn = emit_insn (gen_pacibsp ());
7970 break;
7971 default:
7972 gcc_unreachable ();
7974 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
7975 RTX_FRAME_RELATED_P (insn) = 1;
7978 if (flag_stack_usage_info)
7979 current_function_static_stack_size = constant_lower_bound (frame_size);
7981 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
7983 if (crtl->is_leaf && !cfun->calls_alloca)
7985 if (maybe_gt (frame_size, PROBE_INTERVAL)
7986 && maybe_gt (frame_size, get_stack_check_protect ()))
7987 aarch64_emit_probe_stack_range (get_stack_check_protect (),
7988 (frame_size
7989 - get_stack_check_protect ()));
7991 else if (maybe_gt (frame_size, 0))
7992 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
7995 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
7996 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
7998 /* In theory we should never have both an initial adjustment
7999 and a callee save adjustment. Verify that is the case since the
8000 code below does not handle it for -fstack-clash-protection. */
8001 gcc_assert (known_eq (initial_adjust, 0) || callee_adjust == 0);
8003 /* Will only probe if the initial adjustment is larger than the guard
8004 less the amount of the guard reserved for use by the caller's
8005 outgoing args. */
8006 aarch64_allocate_and_probe_stack_space (tmp0_rtx, tmp1_rtx, initial_adjust,
8007 true, false);
8009 if (callee_adjust != 0)
8010 aarch64_push_regs (reg1, reg2, callee_adjust);
8012 /* The offset of the frame chain record (if any) from the current SP. */
8013 poly_int64 chain_offset = (initial_adjust + callee_adjust
8014 - cfun->machine->frame.hard_fp_offset);
8015 gcc_assert (known_ge (chain_offset, 0));
8017 /* The offset of the bottom of the save area from the current SP. */
8018 poly_int64 saved_regs_offset = chain_offset - below_hard_fp_saved_regs_size;
8020 if (emit_frame_chain)
8022 if (callee_adjust == 0)
8024 reg1 = R29_REGNUM;
8025 reg2 = R30_REGNUM;
8026 aarch64_save_callee_saves (saved_regs_offset, reg1, reg2,
8027 false, false);
8029 else
8030 gcc_assert (known_eq (chain_offset, 0));
8031 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
8032 stack_pointer_rtx, chain_offset,
8033 tmp1_rtx, tmp0_rtx, frame_pointer_needed);
8034 if (frame_pointer_needed && !frame_size.is_constant ())
8036 /* Variable-sized frames need to describe the save slot
8037 address using DW_CFA_expression rather than DW_CFA_offset.
8038 This means that, without taking further action, the
8039 locations of the registers that we've already saved would
8040 remain based on the stack pointer even after we redefine
8041 the CFA based on the frame pointer. We therefore need new
8042 DW_CFA_expressions to re-express the save slots with addresses
8043 based on the frame pointer. */
8044 rtx_insn *insn = get_last_insn ();
8045 gcc_assert (RTX_FRAME_RELATED_P (insn));
8047 /* Add an explicit CFA definition if this was previously
8048 implicit. */
8049 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
8051 rtx src = plus_constant (Pmode, stack_pointer_rtx,
8052 callee_offset);
8053 add_reg_note (insn, REG_CFA_ADJUST_CFA,
8054 gen_rtx_SET (hard_frame_pointer_rtx, src));
8057 /* Change the save slot expressions for the registers that
8058 we've already saved. */
8059 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg2],
8060 hard_frame_pointer_rtx, UNITS_PER_WORD);
8061 aarch64_add_cfa_expression (insn, regno_reg_rtx[reg1],
8062 hard_frame_pointer_rtx, 0);
8064 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
8067 aarch64_save_callee_saves (saved_regs_offset, R0_REGNUM, R30_REGNUM,
8068 callee_adjust != 0 || emit_frame_chain,
8069 emit_frame_chain);
8070 if (maybe_ne (sve_callee_adjust, 0))
8072 gcc_assert (!flag_stack_clash_protection
8073 || known_eq (initial_adjust, 0));
8074 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx,
8075 sve_callee_adjust,
8076 !frame_pointer_needed, false);
8077 saved_regs_offset += sve_callee_adjust;
8079 aarch64_save_callee_saves (saved_regs_offset, P0_REGNUM, P15_REGNUM,
8080 false, emit_frame_chain);
8081 aarch64_save_callee_saves (saved_regs_offset, V0_REGNUM, V31_REGNUM,
8082 callee_adjust != 0 || emit_frame_chain,
8083 emit_frame_chain);
8085 /* We may need to probe the final adjustment if it is larger than the guard
8086 that is assumed by the callee. */
8087 aarch64_allocate_and_probe_stack_space (tmp1_rtx, tmp0_rtx, final_adjust,
8088 !frame_pointer_needed, true);
8091 /* Return TRUE if we can use a simple_return insn.
8093 This function checks whether the callee saved stack is empty, which
8094 means no restore actions are needed. The pro_and_epilogue pass will use
8095 this to check whether the shrink-wrapping optimization is feasible. */
8097 bool
8098 aarch64_use_return_insn_p (void)
8100 if (!reload_completed)
8101 return false;
8103 if (crtl->profile)
8104 return false;
8106 return known_eq (cfun->machine->frame.frame_size, 0);
8109 /* Generate the epilogue instructions for returning from a function.
8110 This is almost exactly the reverse of the prolog sequence, except
8111 that we need to insert barriers to avoid scheduling loads that read
8112 from a deallocated stack, and we optimize the unwind records by
8113 emitting them all together if possible. */
8114 void
8115 aarch64_expand_epilogue (bool for_sibcall)
8117 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
8118 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
8119 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
8120 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
8121 poly_int64 sve_callee_adjust = cfun->machine->frame.sve_callee_adjust;
8122 poly_int64 below_hard_fp_saved_regs_size
8123 = cfun->machine->frame.below_hard_fp_saved_regs_size;
8124 unsigned reg1 = cfun->machine->frame.wb_candidate1;
8125 unsigned reg2 = cfun->machine->frame.wb_candidate2;
8126 rtx cfi_ops = NULL;
8127 rtx_insn *insn;
8128 /* A stack clash protection prologue may not have left EP0_REGNUM or
8129 EP1_REGNUM in a usable state. The same is true for allocations
8130 with an SVE component, since we then need both temporary registers
8131 for each allocation. For stack clash we are in a usable state if
8132 the adjustment is less than GUARD_SIZE - GUARD_USED_BY_CALLER. */
8133 HOST_WIDE_INT guard_size
8134 = 1 << param_stack_clash_protection_guard_size;
8135 HOST_WIDE_INT guard_used_by_caller = STACK_CLASH_CALLER_GUARD;
8137 /* We can re-use the registers when:
8139 (a) the deallocation amount is the same as the corresponding
8140 allocation amount (which is false if we combine the initial
8141 and SVE callee save allocations in the prologue); and
8143 (b) the allocation amount doesn't need a probe (which is false
8144 if the amount is guard_size - guard_used_by_caller or greater).
8146 In such situations the register should remain live with the correct
8147 value. */
8148 bool can_inherit_p = (initial_adjust.is_constant ()
8149 && final_adjust.is_constant ()
8150 && (!flag_stack_clash_protection
8151 || (known_lt (initial_adjust,
8152 guard_size - guard_used_by_caller)
8153 && known_eq (sve_callee_adjust, 0))));
8155 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
8156 bool need_barrier_p
8157 = maybe_ne (get_frame_size ()
8158 + cfun->machine->frame.saved_varargs_size, 0);
8160 /* Emit a barrier to prevent loads from a deallocated stack. */
8161 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
8162 || cfun->calls_alloca
8163 || crtl->calls_eh_return)
8165 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8166 need_barrier_p = false;
8169 /* Restore the stack pointer from the frame pointer if it may not
8170 be the same as the stack pointer. */
8171 rtx tmp0_rtx = gen_rtx_REG (Pmode, EP0_REGNUM);
8172 rtx tmp1_rtx = gen_rtx_REG (Pmode, EP1_REGNUM);
8173 if (frame_pointer_needed
8174 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
8175 /* If writeback is used when restoring callee-saves, the CFA
8176 is restored on the instruction doing the writeback. */
8177 aarch64_add_offset (Pmode, stack_pointer_rtx,
8178 hard_frame_pointer_rtx,
8179 -callee_offset - below_hard_fp_saved_regs_size,
8180 tmp1_rtx, tmp0_rtx, callee_adjust == 0);
8181 else
8182 /* The case where we need to re-use the register here is very rare, so
8183 avoid the complicated condition and just always emit a move if the
8184 immediate doesn't fit. */
8185 aarch64_add_sp (tmp1_rtx, tmp0_rtx, final_adjust, true);
8187 /* Restore the vector registers before the predicate registers,
8188 so that we can use P4 as a temporary for big-endian SVE frames. */
8189 aarch64_restore_callee_saves (callee_offset, V0_REGNUM, V31_REGNUM,
8190 callee_adjust != 0, &cfi_ops);
8191 aarch64_restore_callee_saves (callee_offset, P0_REGNUM, P15_REGNUM,
8192 false, &cfi_ops);
8193 if (maybe_ne (sve_callee_adjust, 0))
8194 aarch64_add_sp (NULL_RTX, NULL_RTX, sve_callee_adjust, true);
8195 aarch64_restore_callee_saves (callee_offset - sve_callee_adjust,
8196 R0_REGNUM, R30_REGNUM,
8197 callee_adjust != 0, &cfi_ops);
8199 if (need_barrier_p)
8200 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
8202 if (callee_adjust != 0)
8203 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
8205 /* If we have no register restore information, the CFA must have been
8206 defined in terms of the stack pointer since the end of the prologue. */
8207 gcc_assert (cfi_ops || !frame_pointer_needed);
8209 if (cfi_ops && (callee_adjust != 0 || maybe_gt (initial_adjust, 65536)))
8211 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
8212 insn = get_last_insn ();
8213 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
8214 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
8215 RTX_FRAME_RELATED_P (insn) = 1;
8216 cfi_ops = NULL;
8219 /* Liveness of EP0_REGNUM cannot be trusted across function calls either, so
8220 restrict the emit_move optimization to leaf functions. */
8221 aarch64_add_sp (tmp0_rtx, tmp1_rtx, initial_adjust,
8222 (!can_inherit_p || !crtl->is_leaf
8223 || df_regs_ever_live_p (EP0_REGNUM)));
8225 if (cfi_ops)
8227 /* Emit delayed restores and reset the CFA to be SP. */
8228 insn = get_last_insn ();
8229 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
8230 REG_NOTES (insn) = cfi_ops;
8231 RTX_FRAME_RELATED_P (insn) = 1;
8234 /* We prefer to emit the combined return/authenticate instruction RETAA;
8235 however, there are three cases in which we must instead emit an explicit
8236 authentication instruction.
8238 1) Sibcalls don't return in a normal way, so if we're about to call one
8239 we must authenticate.
8241 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
8242 generating code for !TARGET_ARMV8_3 we can't use it and must
8243 explicitly authenticate.
8245 3) On an eh_return path we make extra stack adjustments to update the
8246 canonical frame address to be the exception handler's CFA. We want
8247 to authenticate using the CFA of the function which calls eh_return.
8249 if (aarch64_return_address_signing_enabled ()
8250 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
8252 switch (aarch64_ra_sign_key)
8254 case AARCH64_KEY_A:
8255 insn = emit_insn (gen_autiasp ());
8256 break;
8257 case AARCH64_KEY_B:
8258 insn = emit_insn (gen_autibsp ());
8259 break;
8260 default:
8261 gcc_unreachable ();
8263 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
8264 RTX_FRAME_RELATED_P (insn) = 1;
8267 /* Stack adjustment for exception handler. */
8268 if (crtl->calls_eh_return && !for_sibcall)
8270 /* We need to unwind the stack by the offset computed by
8271 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
8272 to be SP; letting the CFA move during this adjustment
8273 is just as correct as retaining the CFA from the body
8274 of the function. Therefore, do nothing special. */
8275 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
8278 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
8279 if (!for_sibcall)
8280 emit_jump_insn (ret_rtx);
8283 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
8284 normally or return to a previous frame after unwinding.
8286 An EH return uses a single shared return sequence. The epilogue is
8287 exactly like a normal epilogue except that it has an extra input
8288 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
8289 that must be applied after the frame has been destroyed. An extra label
8290 is inserted before the epilogue which initializes this register to zero,
8291 and this is the entry point for a normal return.
8293 An actual EH return updates the return address, initializes the stack
8294 adjustment and jumps directly into the epilogue (bypassing the zeroing
8295 of the adjustment). Since the return address is typically saved on the
8296 stack when a function makes a call, the saved LR must be updated outside
8297 the epilogue.
8299 This poses problems as the store is generated well before the epilogue,
8300 so the offset of LR is not known yet. Also optimizations will remove the
8301 store as it appears dead, even after the epilogue is generated (as the
8302 base or offset for loading LR is different in many cases).
8304 To avoid these problems this implementation forces the frame pointer
8305 in eh_return functions so that the location of LR is fixed and known early.
8306 It also marks the store volatile, so no optimization is permitted to
8307 remove the store. */
8309 aarch64_eh_return_handler_rtx (void)
8311 rtx tmp = gen_frame_mem (Pmode,
8312 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
8314 /* Mark the store volatile, so no optimization is permitted to remove it. */
8315 MEM_VOLATILE_P (tmp) = true;
8316 return tmp;
8319 /* Output code to add DELTA to the first argument, and then jump
8320 to FUNCTION. Used for C++ multiple inheritance. */
8321 static void
8322 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
8323 HOST_WIDE_INT delta,
8324 HOST_WIDE_INT vcall_offset,
8325 tree function)
8327 /* The this pointer is always in x0. Note that this differs from
8328 Arm where the this pointer may be bumped to r1 if r0 is required
8329 to return a pointer to an aggregate. On AArch64 a result value
8330 pointer will be in x8. */
8331 int this_regno = R0_REGNUM;
8332 rtx this_rtx, temp0, temp1, addr, funexp;
8333 rtx_insn *insn;
8334 const char *fnname = IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (thunk));
8336 if (aarch64_bti_enabled ())
8337 emit_insn (gen_bti_c());
8339 reload_completed = 1;
8340 emit_note (NOTE_INSN_PROLOGUE_END);
8342 this_rtx = gen_rtx_REG (Pmode, this_regno);
8343 temp0 = gen_rtx_REG (Pmode, EP0_REGNUM);
8344 temp1 = gen_rtx_REG (Pmode, EP1_REGNUM);
8346 if (vcall_offset == 0)
8347 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
8348 else
8350 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
8352 addr = this_rtx;
8353 if (delta != 0)
8355 if (delta >= -256 && delta < 256)
8356 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
8357 plus_constant (Pmode, this_rtx, delta));
8358 else
8359 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
8360 temp1, temp0, false);
8363 if (Pmode == ptr_mode)
8364 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
8365 else
8366 aarch64_emit_move (temp0,
8367 gen_rtx_ZERO_EXTEND (Pmode,
8368 gen_rtx_MEM (ptr_mode, addr)));
8370 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
8371 addr = plus_constant (Pmode, temp0, vcall_offset);
8372 else
8374 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
8375 Pmode);
8376 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
8379 if (Pmode == ptr_mode)
8380 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
8381 else
8382 aarch64_emit_move (temp1,
8383 gen_rtx_SIGN_EXTEND (Pmode,
8384 gen_rtx_MEM (ptr_mode, addr)));
8386 emit_insn (gen_add2_insn (this_rtx, temp1));
8389 /* Generate a tail call to the target function. */
8390 if (!TREE_USED (function))
8392 assemble_external (function);
8393 TREE_USED (function) = 1;
8395 funexp = XEXP (DECL_RTL (function), 0);
8396 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
8397 rtx callee_abi = gen_int_mode (fndecl_abi (function).id (), DImode);
8398 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, callee_abi));
8399 SIBLING_CALL_P (insn) = 1;
8401 insn = get_insns ();
8402 shorten_branches (insn);
8404 assemble_start_function (thunk, fnname);
8405 final_start_function (insn, file, 1);
8406 final (insn, file, 1);
8407 final_end_function ();
8408 assemble_end_function (thunk, fnname);
8410 /* Stop pretending to be a post-reload pass. */
8411 reload_completed = 0;
8414 static bool
8415 aarch64_tls_referenced_p (rtx x)
8417 if (!TARGET_HAVE_TLS)
8418 return false;
8419 subrtx_iterator::array_type array;
8420 FOR_EACH_SUBRTX (iter, array, x, ALL)
8422 const_rtx x = *iter;
8423 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
8424 return true;
8425 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
8426 TLS offsets, not real symbol references. */
8427 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
8428 iter.skip_subrtxes ();
8430 return false;
8434 /* Return true if val can be encoded as a 12-bit unsigned immediate with
8435 a left shift of 0 or 12 bits. */
8436 bool
8437 aarch64_uimm12_shift (HOST_WIDE_INT val)
8439 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
8440 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
8444 /* Return the nearest value to VAL that will fit as a 12-bit unsigned immediate
8445 that can be created with a left shift of 0 or 12. */
8446 static HOST_WIDE_INT
8447 aarch64_clamp_to_uimm12_shift (HOST_WIDE_INT val)
8449 /* Check to see if the value fits in 24 bits, as that is the maximum we can
8450 handle correctly. */
8451 gcc_assert ((val & 0xffffff) == val);
8453 if (((val & 0xfff) << 0) == val)
8454 return val;
8456 return val & (0xfff << 12);
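/* Illustrative sketch (not part of the build): the clamping above on
   concrete values.  A value that does not fit in 12 bits is rounded
   down to the closest value expressible as a 12-bit immediate shifted
   left by 12 (the function name is an illustrative assumption).  */

static long
clamp_to_uimm12_shift_example (long val)
{
  /* Values that already fit in 12 bits are returned unchanged.  */
  if ((val & 0xfff) == val)
    return val;
  /* Otherwise keep only the upper 12-bit field (bits 12..23).  */
  return val & (0xfff << 12);
}

/* clamp_to_uimm12_shift_example (0xabc)    -> 0xabc
   clamp_to_uimm12_shift_example (0x123456) -> 0x123000  */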
8459 /* Return true if val is an immediate that can be loaded into a
8460 register by a MOVZ instruction. */
8461 static bool
8462 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
8464 if (GET_MODE_SIZE (mode) > 4)
8466 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
8467 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
8468 return 1;
8470 else
8472 /* Ignore sign extension. */
8473 val &= (HOST_WIDE_INT) 0xffffffff;
8475 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
8476 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
8479 /* Test whether:
8481 X = (X & AND_VAL) | IOR_VAL;
8483 can be implemented using:
8485 MOVK X, #(IOR_VAL >> shift), LSL #shift
8487 Return the shift if so, otherwise return -1. */
8489 aarch64_movk_shift (const wide_int_ref &and_val,
8490 const wide_int_ref &ior_val)
8492 unsigned int precision = and_val.get_precision ();
8493 unsigned HOST_WIDE_INT mask = 0xffff;
8494 for (unsigned int shift = 0; shift < precision; shift += 16)
8496 if (and_val == ~mask && (ior_val & mask) == ior_val)
8497 return shift;
8498 mask <<= 16;
8500 return -1;
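/* Illustrative sketch (not part of the build): a 64-bit-only version of
   the MOVK test above, using unsigned long long in place of wide_int
   (the function name is an illustrative assumption).  */

static int
movk_shift_example (unsigned long long and_val, unsigned long long ior_val)
{
  unsigned long long mask = 0xffffULL;
  for (int shift = 0; shift < 64; shift += 16)
    {
      /* The AND must clear exactly one 16-bit field and the IOR value
	 must lie entirely within that field.  */
      if (and_val == ~mask && (ior_val & mask) == ior_val)
	return shift;
      mask <<= 16;
    }
  return -1;
}

/* movk_shift_example (0xffffffff0000ffffULL, 0x12340000ULL) -> 16,
   i.e. the insertion can be done with MOVK X, #0x1234, LSL #16.  */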
8503 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
8504 64-bit (DImode) integer. */
8506 static unsigned HOST_WIDE_INT
8507 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
8509 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
8510 while (size < 64)
8512 val &= (HOST_WIDE_INT_1U << size) - 1;
8513 val |= val << size;
8514 size *= 2;
8516 return val;
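/* Illustrative sketch (not part of the build): the replication step
   above, specialised to a starting element width in bits rather than a
   machine mode (the function name is an illustrative assumption).  */

static unsigned long long
replicate_bitmask_example (unsigned long long val, unsigned int size)
{
  while (size < 64)
    {
      /* Mask to the current element width, then duplicate it into the
	 adjacent element, doubling the width each iteration.  */
      val &= (1ULL << size) - 1;
      val |= val << size;
      size *= 2;
    }
  return val;
}

/* replicate_bitmask_example (0x3c, 8) -> 0x3c3c3c3c3c3c3c3c, the 64-bit
   pattern that the bitmask-immediate test below then inspects.  */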
8519 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
8521 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
8523 0x0000000100000001ull,
8524 0x0001000100010001ull,
8525 0x0101010101010101ull,
8526 0x1111111111111111ull,
8527 0x5555555555555555ull,
8531 /* Return true if val is a valid bitmask immediate. */
8533 bool
8534 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
8536 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
8537 int bits;
8539 /* Check for a single sequence of one bits and return quickly if so.
8540 The special cases of all ones and all zeroes return false. */
8541 val = aarch64_replicate_bitmask_imm (val_in, mode);
8542 tmp = val + (val & -val);
8544 if (tmp == (tmp & -tmp))
8545 return (val + 1) > 1;
8547 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
8548 if (mode == SImode)
8549 val = (val << 32) | (val & 0xffffffff);
8551 /* Invert if the immediate doesn't start with a zero bit - this means we
8552 only need to search for sequences of one bits. */
8553 if (val & 1)
8554 val = ~val;
8556 /* Find the first set bit and set tmp to val with the first sequence of one
8557 bits removed. Return success if there is a single sequence of ones. */
8558 first_one = val & -val;
8559 tmp = val & (val + first_one);
8561 if (tmp == 0)
8562 return true;
8564 /* Find the next set bit and compute the difference in bit position. */
8565 next_one = tmp & -tmp;
8566 bits = clz_hwi (first_one) - clz_hwi (next_one);
8567 mask = val ^ tmp;
8569 /* Check the bit position difference is a power of 2, and that the first
8570 sequence of one bits fits within 'bits' bits. */
8571 if ((mask >> bits) != 0 || bits != (bits & -bits))
8572 return false;
8574 /* Check the sequence of one bits is repeated 64/bits times. */
8575 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
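/* Illustrative sketch (not part of the build): the quick single-run test
   used at the top of the function above, on plain 64-bit values.
   Adding the lowest set bit to VAL collapses one contiguous run of ones,
   so the sum is zero or a power of two exactly when VAL contains a
   single such run (the function name is an illustrative assumption).  */

#include <stdbool.h>

static bool
single_run_of_ones_example (unsigned long long val)
{
  unsigned long long tmp = val + (val & -val);
  if (tmp != (tmp & -tmp))
    /* More than one run of ones; the full repeated-pattern check in
       aarch64_bitmask_imm would be needed from here on.  */
    return false;
  /* Reject the special cases of all zeros and all ones.  */
  return val + 1 > 1;
}

/* single_run_of_ones_example (0x0ff0) -> true
   single_run_of_ones_example (0x0f0f) -> false
   single_run_of_ones_example (0)      -> false  */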
8578 /* Create a mask of ones covering the lowest to highest bits set in VAL_IN.
8579 Assumed precondition: VAL_IN is not zero. */
8581 unsigned HOST_WIDE_INT
8582 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
8584 int lowest_bit_set = ctz_hwi (val_in);
8585 int highest_bit_set = floor_log2 (val_in);
8586 gcc_assert (val_in != 0);
8588 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
8589 (HOST_WIDE_INT_1U << lowest_bit_set));
8592 /* Create a constant where the bits outside of the span from the lowest
8593 to the highest set bit are set to 1. */
8595 unsigned HOST_WIDE_INT
8596 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
8598 return val_in | ~aarch64_and_split_imm1 (val_in);
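/* Illustrative sketch (not part of the build): how the two helpers above
   decompose an AND immediate into two bitmask immediates, using the
   sample value 0x00ff00ff (the function name is an illustrative
   assumption).  */

#include <stdbool.h>

static bool
and_split_example (void)
{
  unsigned long long val  = 0x00ff00ffULL;
  /* imm1 covers bit 0 (lowest set) up to bit 23 (highest set).  */
  unsigned long long imm1 = 0x00ffffffULL;
  /* imm2 turns every bit outside that span back on.  */
  unsigned long long imm2 = val | ~imm1;       /* 0xffffffffffff00ffULL */
  /* imm1 & imm2 reproduces VAL, so (x & val) can be rewritten as
     (x & imm1) & imm2, two ANDs with valid bitmask immediates.  */
  return (imm1 & imm2) == val;                 /* true */
}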
8601 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
8603 bool
8604 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
8606 scalar_int_mode int_mode;
8607 if (!is_a <scalar_int_mode> (mode, &int_mode))
8608 return false;
8610 if (aarch64_bitmask_imm (val_in, int_mode))
8611 return false;
8613 if (aarch64_move_imm (val_in, int_mode))
8614 return false;
8616 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
8618 return aarch64_bitmask_imm (imm2, int_mode);
8621 /* Return true if val is an immediate that can be loaded into a
8622 register in a single instruction. */
8623 bool
8624 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
8626 scalar_int_mode int_mode;
8627 if (!is_a <scalar_int_mode> (mode, &int_mode))
8628 return false;
8630 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
8631 return 1;
8632 return aarch64_bitmask_imm (val, int_mode);
8635 static bool
8636 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
8638 rtx base, offset;
8640 if (GET_CODE (x) == HIGH)
8641 return true;
8643 /* There's no way to calculate VL-based values using relocations. */
8644 subrtx_iterator::array_type array;
8645 FOR_EACH_SUBRTX (iter, array, x, ALL)
8646 if (GET_CODE (*iter) == CONST_POLY_INT)
8647 return true;
8649 split_const (x, &base, &offset);
8650 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
8652 if (aarch64_classify_symbol (base, INTVAL (offset))
8653 != SYMBOL_FORCE_TO_MEM)
8654 return true;
8655 else
8656 /* Avoid generating a 64-bit relocation in ILP32; leave it
8657 to aarch64_expand_mov_immediate to handle it properly. */
8658 return mode != ptr_mode;
8661 return aarch64_tls_referenced_p (x);
8664 /* Implement TARGET_CASE_VALUES_THRESHOLD.
8665 The expansion for a table switch is quite expensive due to the number
8666 of instructions, the table lookup and the hard-to-predict indirect jump.
8667 When optimizing for speed with -O3 enabled, use the per-core tuning if
8668 set; otherwise use tables for > 16 cases as a tradeoff between size and
8669 performance. When optimizing for size, use the default setting. */
8671 static unsigned int
8672 aarch64_case_values_threshold (void)
8674 /* Use the specified limit for the number of cases before using jump
8675 tables at higher optimization levels. */
8676 if (optimize > 2
8677 && selected_cpu->tune->max_case_values != 0)
8678 return selected_cpu->tune->max_case_values;
8679 else
8680 return optimize_size ? default_case_values_threshold () : 17;
8683 /* Return true if register REGNO is a valid index register.
8684 STRICT_P is true if REG_OK_STRICT is in effect. */
8686 bool
8687 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
8689 if (!HARD_REGISTER_NUM_P (regno))
8691 if (!strict_p)
8692 return true;
8694 if (!reg_renumber)
8695 return false;
8697 regno = reg_renumber[regno];
8699 return GP_REGNUM_P (regno);
8702 /* Return true if register REGNO is a valid base register for mode MODE.
8703 STRICT_P is true if REG_OK_STRICT is in effect. */
8705 bool
8706 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
8708 if (!HARD_REGISTER_NUM_P (regno))
8710 if (!strict_p)
8711 return true;
8713 if (!reg_renumber)
8714 return false;
8716 regno = reg_renumber[regno];
8719 /* The fake registers will be eliminated to either the stack or
8720 hard frame pointer, both of which are usually valid base registers.
8721 Reload deals with the cases where the eliminated form isn't valid. */
8722 return (GP_REGNUM_P (regno)
8723 || regno == SP_REGNUM
8724 || regno == FRAME_POINTER_REGNUM
8725 || regno == ARG_POINTER_REGNUM);
8728 /* Return true if X is a valid base register for mode MODE.
8729 STRICT_P is true if REG_OK_STRICT is in effect. */
8731 static bool
8732 aarch64_base_register_rtx_p (rtx x, bool strict_p)
8734 if (!strict_p
8735 && GET_CODE (x) == SUBREG
8736 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
8737 x = SUBREG_REG (x);
8739 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
8742 /* Return true if address offset is a valid index. If it is, fill in INFO
8743 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
8745 static bool
8746 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
8747 machine_mode mode, bool strict_p)
8749 enum aarch64_address_type type;
8750 rtx index;
8751 int shift;
8753 /* (reg:P) */
8754 if ((REG_P (x) || GET_CODE (x) == SUBREG)
8755 && GET_MODE (x) == Pmode)
8757 type = ADDRESS_REG_REG;
8758 index = x;
8759 shift = 0;
8761 /* (sign_extend:DI (reg:SI)) */
8762 else if ((GET_CODE (x) == SIGN_EXTEND
8763 || GET_CODE (x) == ZERO_EXTEND)
8764 && GET_MODE (x) == DImode
8765 && GET_MODE (XEXP (x, 0)) == SImode)
8767 type = (GET_CODE (x) == SIGN_EXTEND)
8768 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8769 index = XEXP (x, 0);
8770 shift = 0;
8772 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
8773 else if (GET_CODE (x) == MULT
8774 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8775 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8776 && GET_MODE (XEXP (x, 0)) == DImode
8777 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8778 && CONST_INT_P (XEXP (x, 1)))
8780 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8781 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8782 index = XEXP (XEXP (x, 0), 0);
8783 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8785 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
8786 else if (GET_CODE (x) == ASHIFT
8787 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
8788 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
8789 && GET_MODE (XEXP (x, 0)) == DImode
8790 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
8791 && CONST_INT_P (XEXP (x, 1)))
8793 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
8794 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8795 index = XEXP (XEXP (x, 0), 0);
8796 shift = INTVAL (XEXP (x, 1));
8798 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
8799 else if ((GET_CODE (x) == SIGN_EXTRACT
8800 || GET_CODE (x) == ZERO_EXTRACT)
8801 && GET_MODE (x) == DImode
8802 && GET_CODE (XEXP (x, 0)) == MULT
8803 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8804 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8806 type = (GET_CODE (x) == SIGN_EXTRACT)
8807 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8808 index = XEXP (XEXP (x, 0), 0);
8809 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8810 if (INTVAL (XEXP (x, 1)) != 32 + shift
8811 || INTVAL (XEXP (x, 2)) != 0)
8812 shift = -1;
8814 /* (and:DI (mult:DI (reg:DI) (const_int scale))
8815 (const_int 0xffffffff<<shift)) */
8816 else if (GET_CODE (x) == AND
8817 && GET_MODE (x) == DImode
8818 && GET_CODE (XEXP (x, 0)) == MULT
8819 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8820 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8821 && CONST_INT_P (XEXP (x, 1)))
8823 type = ADDRESS_REG_UXTW;
8824 index = XEXP (XEXP (x, 0), 0);
8825 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
8826 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8827 shift = -1;
8829 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
8830 else if ((GET_CODE (x) == SIGN_EXTRACT
8831 || GET_CODE (x) == ZERO_EXTRACT)
8832 && GET_MODE (x) == DImode
8833 && GET_CODE (XEXP (x, 0)) == ASHIFT
8834 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8835 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
8837 type = (GET_CODE (x) == SIGN_EXTRACT)
8838 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
8839 index = XEXP (XEXP (x, 0), 0);
8840 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8841 if (INTVAL (XEXP (x, 1)) != 32 + shift
8842 || INTVAL (XEXP (x, 2)) != 0)
8843 shift = -1;
8845 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
8846 (const_int 0xffffffff<<shift)) */
8847 else if (GET_CODE (x) == AND
8848 && GET_MODE (x) == DImode
8849 && GET_CODE (XEXP (x, 0)) == ASHIFT
8850 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
8851 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
8852 && CONST_INT_P (XEXP (x, 1)))
8854 type = ADDRESS_REG_UXTW;
8855 index = XEXP (XEXP (x, 0), 0);
8856 shift = INTVAL (XEXP (XEXP (x, 0), 1));
8857 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
8858 shift = -1;
8860 /* (mult:P (reg:P) (const_int scale)) */
8861 else if (GET_CODE (x) == MULT
8862 && GET_MODE (x) == Pmode
8863 && GET_MODE (XEXP (x, 0)) == Pmode
8864 && CONST_INT_P (XEXP (x, 1)))
8866 type = ADDRESS_REG_REG;
8867 index = XEXP (x, 0);
8868 shift = exact_log2 (INTVAL (XEXP (x, 1)));
8870 /* (ashift:P (reg:P) (const_int shift)) */
8871 else if (GET_CODE (x) == ASHIFT
8872 && GET_MODE (x) == Pmode
8873 && GET_MODE (XEXP (x, 0)) == Pmode
8874 && CONST_INT_P (XEXP (x, 1)))
8876 type = ADDRESS_REG_REG;
8877 index = XEXP (x, 0);
8878 shift = INTVAL (XEXP (x, 1));
8880 else
8881 return false;
8883 if (!strict_p
8884 && GET_CODE (index) == SUBREG
8885 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
8886 index = SUBREG_REG (index);
8888 if (aarch64_sve_data_mode_p (mode))
8890 if (type != ADDRESS_REG_REG
8891 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
8892 return false;
8894 else
8896 if (shift != 0
8897 && !(IN_RANGE (shift, 1, 3)
8898 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
8899 return false;
8902 if (REG_P (index)
8903 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
8905 info->type = type;
8906 info->offset = index;
8907 info->shift = shift;
8908 return true;
8911 return false;
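/* Worked example (editor's illustration, not part of the original source;
   assumes the usual LP64 configuration where Pmode == DImode):

     (mult:DI (reg:DI x1) (const_int 8)), MODE == DImode
       -> ADDRESS_REG_REG with shift == exact_log2 (8) == 3, accepted
          because 1 << 3 matches the 8-byte access size; printed later
          as "[x0, x1, lsl 3]" once a base register is known.

     (ashift:DI (sign_extend:DI (reg:SI w1)) (const_int 2)), MODE == SImode
       -> ADDRESS_REG_SXTW with shift == 2, i.e. the "[x0, w1, sxtw 2]"
          form. */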
8914 /* Return true if MODE is one of the modes for which we
8915 support LDP/STP operations. */
8917 static bool
8918 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
8920 return mode == SImode || mode == DImode
8921 || mode == SFmode || mode == DFmode
8922 || (aarch64_vector_mode_supported_p (mode)
8923 && (known_eq (GET_MODE_SIZE (mode), 8)
8924 || (known_eq (GET_MODE_SIZE (mode), 16)
8925 && (aarch64_tune_params.extra_tuning_flags
8926 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
8929 /* Return true if REGNO is a virtual pointer register, or an eliminable
8930 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
8931 include stack_pointer or hard_frame_pointer. */
8932 static bool
8933 virt_or_elim_regno_p (unsigned regno)
8935 return ((regno >= FIRST_VIRTUAL_REGISTER
8936 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
8937 || regno == FRAME_POINTER_REGNUM
8938 || regno == ARG_POINTER_REGNUM);
8941 /* Return true if X is a valid address of type TYPE for machine mode MODE.
8942 If it is, fill in INFO appropriately. STRICT_P is true if
8943 REG_OK_STRICT is in effect. */
8945 bool
8946 aarch64_classify_address (struct aarch64_address_info *info,
8947 rtx x, machine_mode mode, bool strict_p,
8948 aarch64_addr_query_type type)
8950 enum rtx_code code = GET_CODE (x);
8951 rtx op0, op1;
8952 poly_int64 offset;
8954 HOST_WIDE_INT const_size;
8956 /* Whether a vector mode is partial doesn't affect address legitimacy.
8957 Partial vectors like VNx8QImode allow the same indexed addressing
8958 mode and MUL VL addressing mode as full vectors like VNx16QImode;
8959 in both cases, MUL VL counts multiples of GET_MODE_SIZE. */
8960 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
8961 vec_flags &= ~VEC_PARTIAL;
8963 /* On BE, we use load/store pair for all large int mode load/stores.
8964 TI/TFmode may also use a load/store pair. */
8965 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
8966 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
8967 || type == ADDR_QUERY_LDP_STP_N
8968 || mode == TImode
8969 || mode == TFmode
8970 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
8972 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
8973 corresponds to the actual size of the memory being loaded/stored and
8974 the mode used to check the address is half of that size. */
8975 if (type == ADDR_QUERY_LDP_STP_N
8976 && known_eq (GET_MODE_SIZE (mode), 16))
8977 mode = DFmode;
8979 bool allow_reg_index_p = (!load_store_pair_p
8980 && (known_lt (GET_MODE_SIZE (mode), 16)
8981 || vec_flags == VEC_ADVSIMD
8982 || vec_flags & VEC_SVE_DATA));
8984 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
8985 [Rn, #offset, MUL VL]. */
8986 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
8987 && (code != REG && code != PLUS))
8988 return false;
8990 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
8991 REG addressing. */
8992 if (advsimd_struct_p
8993 && !BYTES_BIG_ENDIAN
8994 && (code != POST_INC && code != REG))
8995 return false;
8997 gcc_checking_assert (GET_MODE (x) == VOIDmode
8998 || SCALAR_INT_MODE_P (GET_MODE (x)));
9000 switch (code)
9002 case REG:
9003 case SUBREG:
9004 info->type = ADDRESS_REG_IMM;
9005 info->base = x;
9006 info->offset = const0_rtx;
9007 info->const_offset = 0;
9008 return aarch64_base_register_rtx_p (x, strict_p);
9010 case PLUS:
9011 op0 = XEXP (x, 0);
9012 op1 = XEXP (x, 1);
9014 if (! strict_p
9015 && REG_P (op0)
9016 && virt_or_elim_regno_p (REGNO (op0))
9017 && poly_int_rtx_p (op1, &offset))
9019 info->type = ADDRESS_REG_IMM;
9020 info->base = op0;
9021 info->offset = op1;
9022 info->const_offset = offset;
9024 return true;
9027 if (maybe_ne (GET_MODE_SIZE (mode), 0)
9028 && aarch64_base_register_rtx_p (op0, strict_p)
9029 && poly_int_rtx_p (op1, &offset))
9031 info->type = ADDRESS_REG_IMM;
9032 info->base = op0;
9033 info->offset = op1;
9034 info->const_offset = offset;
9036 /* TImode and TFmode values are allowed in both pairs of X
9037 registers and individual Q registers. The available
9038 address modes are:
9039 X,X: 7-bit signed scaled offset
9040 Q: 9-bit signed offset
9041 We conservatively require an offset representable in either mode.
9042 When performing the check for pairs of X registers i.e. LDP/STP
9043 pass down DImode since that is the natural size of the LDP/STP
9044 instruction memory accesses. */
9045 if (mode == TImode || mode == TFmode)
9046 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
9047 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9048 || offset_12bit_unsigned_scaled_p (mode, offset)));
9050 /* A 7-bit offset check because OImode will emit an ldp/stp
9051 instruction (only big endian will get here).
9052 For ldp/stp instructions, the offset is scaled for the size of a
9053 single element of the pair. */
9054 if (mode == OImode)
9055 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
9057 /* Three 9/12-bit offset checks because CImode will emit three
9058 ldr/str instructions (only big endian will get here). */
9059 if (mode == CImode)
9060 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9061 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
9062 offset + 32)
9063 || offset_12bit_unsigned_scaled_p (V16QImode,
9064 offset + 32)));
9066 /* Two 7-bit offset checks because XImode will emit two ldp/stp
9067 instructions (only big endian will get here). */
9068 if (mode == XImode)
9069 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
9070 && aarch64_offset_7bit_signed_scaled_p (TImode,
9071 offset + 32));
9073 /* Make "m" use the LD1 offset range for SVE data modes, so
9074 that pre-RTL optimizers like ivopts will work to that
9075 instead of the wider LDR/STR range. */
9076 if (vec_flags == VEC_SVE_DATA)
9077 return (type == ADDR_QUERY_M
9078 ? offset_4bit_signed_scaled_p (mode, offset)
9079 : offset_9bit_signed_scaled_p (mode, offset));
9081 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
9083 poly_int64 end_offset = (offset
9084 + GET_MODE_SIZE (mode)
9085 - BYTES_PER_SVE_VECTOR);
9086 return (type == ADDR_QUERY_M
9087 ? offset_4bit_signed_scaled_p (mode, offset)
9088 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
9089 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
9090 end_offset)));
9093 if (vec_flags == VEC_SVE_PRED)
9094 return offset_9bit_signed_scaled_p (mode, offset);
9096 if (load_store_pair_p)
9097 return ((known_eq (GET_MODE_SIZE (mode), 4)
9098 || known_eq (GET_MODE_SIZE (mode), 8)
9099 || known_eq (GET_MODE_SIZE (mode), 16))
9100 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9101 else
9102 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
9103 || offset_12bit_unsigned_scaled_p (mode, offset));
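/* Worked example (editor's illustration, not part of the original source):
   for a DImode access the checks above accept, depending on the query type,
     ldp x0, x1, [x2, 504]     7-bit signed scaled range: -512..504, step 8
     ldr x0, [x1, -256]        9-bit signed unscaled range: -256..255
     ldr x0, [x1, 32760]       12-bit unsigned scaled range: 0..32760, step 8 */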
9106 if (allow_reg_index_p)
9108 /* Look for base + (scaled/extended) index register. */
9109 if (aarch64_base_register_rtx_p (op0, strict_p)
9110 && aarch64_classify_index (info, op1, mode, strict_p))
9112 info->base = op0;
9113 return true;
9115 if (aarch64_base_register_rtx_p (op1, strict_p)
9116 && aarch64_classify_index (info, op0, mode, strict_p))
9118 info->base = op1;
9119 return true;
9123 return false;
9125 case POST_INC:
9126 case POST_DEC:
9127 case PRE_INC:
9128 case PRE_DEC:
9129 info->type = ADDRESS_REG_WB;
9130 info->base = XEXP (x, 0);
9131 info->offset = NULL_RTX;
9132 return aarch64_base_register_rtx_p (info->base, strict_p);
9134 case POST_MODIFY:
9135 case PRE_MODIFY:
9136 info->type = ADDRESS_REG_WB;
9137 info->base = XEXP (x, 0);
9138 if (GET_CODE (XEXP (x, 1)) == PLUS
9139 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
9140 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
9141 && aarch64_base_register_rtx_p (info->base, strict_p))
9143 info->offset = XEXP (XEXP (x, 1), 1);
9144 info->const_offset = offset;
9146 /* TImode and TFmode values are allowed in both pairs of X
9147 registers and individual Q registers. The available
9148 address modes are:
9149 X,X: 7-bit signed scaled offset
9150 Q: 9-bit signed offset
9151 We conservatively require an offset representable in either mode. */
9153 if (mode == TImode || mode == TFmode)
9154 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
9155 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
9157 if (load_store_pair_p)
9158 return ((known_eq (GET_MODE_SIZE (mode), 4)
9159 || known_eq (GET_MODE_SIZE (mode), 8)
9160 || known_eq (GET_MODE_SIZE (mode), 16))
9161 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
9162 else
9163 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
9165 return false;
9167 case CONST:
9168 case SYMBOL_REF:
9169 case LABEL_REF:
9170 /* load literal: pc-relative constant pool entry. Only supported
9171 for SI mode or larger. */
9172 info->type = ADDRESS_SYMBOLIC;
9174 if (!load_store_pair_p
9175 && GET_MODE_SIZE (mode).is_constant (&const_size)
9176 && const_size >= 4)
9178 rtx sym, addend;
9180 split_const (x, &sym, &addend);
9181 return ((GET_CODE (sym) == LABEL_REF
9182 || (GET_CODE (sym) == SYMBOL_REF
9183 && CONSTANT_POOL_ADDRESS_P (sym)
9184 && aarch64_pcrelative_literal_loads)));
9186 return false;
9188 case LO_SUM:
9189 info->type = ADDRESS_LO_SUM;
9190 info->base = XEXP (x, 0);
9191 info->offset = XEXP (x, 1);
9192 if (allow_reg_index_p
9193 && aarch64_base_register_rtx_p (info->base, strict_p))
9195 rtx sym, offs;
9196 split_const (info->offset, &sym, &offs);
9197 if (GET_CODE (sym) == SYMBOL_REF
9198 && (aarch64_classify_symbol (sym, INTVAL (offs))
9199 == SYMBOL_SMALL_ABSOLUTE))
9201 /* The symbol and offset must be aligned to the access size. */
9202 unsigned int align;
9204 if (CONSTANT_POOL_ADDRESS_P (sym))
9205 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
9206 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
9208 tree exp = SYMBOL_REF_DECL (sym);
9209 align = TYPE_ALIGN (TREE_TYPE (exp));
9210 align = aarch64_constant_alignment (exp, align);
9212 else if (SYMBOL_REF_DECL (sym))
9213 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
9214 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
9215 && SYMBOL_REF_BLOCK (sym) != NULL)
9216 align = SYMBOL_REF_BLOCK (sym)->alignment;
9217 else
9218 align = BITS_PER_UNIT;
9220 poly_int64 ref_size = GET_MODE_SIZE (mode);
9221 if (known_eq (ref_size, 0))
9222 ref_size = GET_MODE_SIZE (DImode);
9224 return (multiple_p (INTVAL (offs), ref_size)
9225 && multiple_p (align / BITS_PER_UNIT, ref_size));
9228 return false;
9230 default:
9231 return false;
9235 /* Return true if the address X is valid for a PRFM instruction.
9236 STRICT_P is true if we should do strict checking with
9237 aarch64_classify_address. */
9239 bool
9240 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
9242 struct aarch64_address_info addr;
9244 /* PRFM accepts the same addresses as DImode... */
9245 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
9246 if (!res)
9247 return false;
9249 /* ... except writeback forms. */
9250 return addr.type != ADDRESS_REG_WB;
9253 bool
9254 aarch64_symbolic_address_p (rtx x)
9256 rtx offset;
9258 split_const (x, &x, &offset);
9259 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
9262 /* Classify the base of symbolic expression X. */
9264 enum aarch64_symbol_type
9265 aarch64_classify_symbolic_expression (rtx x)
9267 rtx offset;
9269 split_const (x, &x, &offset);
9270 return aarch64_classify_symbol (x, INTVAL (offset));
9274 /* Return TRUE if X is a legitimate address for accessing memory in
9275 mode MODE. */
9276 static bool
9277 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
9279 struct aarch64_address_info addr;
9281 return aarch64_classify_address (&addr, x, mode, strict_p);
9284 /* Return TRUE if X is a legitimate address of type TYPE for accessing
9285 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
9286 bool
9287 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
9288 aarch64_addr_query_type type)
9290 struct aarch64_address_info addr;
9292 return aarch64_classify_address (&addr, x, mode, strict_p, type);
9295 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
9297 static bool
9298 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
9299 poly_int64 orig_offset,
9300 machine_mode mode)
9302 HOST_WIDE_INT size;
9303 if (GET_MODE_SIZE (mode).is_constant (&size))
9305 HOST_WIDE_INT const_offset, second_offset;
9307 /* A general SVE offset is A * VQ + B. Remove the A component from
9308 coefficient 0 in order to get the constant B. */
9309 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
9311 /* Split an out-of-range address displacement into a base and
9312 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
9313 range otherwise, to increase opportunities for sharing the base
9314 address between accesses of different sizes. Unaligned accesses use
9315 the signed 9-bit range; TImode/TFmode use the intersection of the
9316 signed scaled 7-bit and signed 9-bit ranges. */
9317 if (mode == TImode || mode == TFmode)
9318 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
9319 else if ((const_offset & (size - 1)) != 0)
9320 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
9321 else
9322 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
9324 if (second_offset == 0 || known_eq (orig_offset, second_offset))
9325 return false;
9327 /* Split the offset into second_offset and the rest. */
9328 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9329 *offset2 = gen_int_mode (second_offset, Pmode);
9330 return true;
9332 else
9334 /* Get the mode we should use as the basis of the range. For structure
9335 modes this is the mode of one vector. */
9336 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
9337 machine_mode step_mode
9338 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
9340 /* Get the "mul vl" multiplier we'd like to use. */
9341 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
9342 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
9343 if (vec_flags & VEC_SVE_DATA)
9344 /* LDR supports a 9-bit range, but the move patterns for
9345 structure modes require all vectors to be in range of the
9346 same base. The simplest way of accommodating that while still
9347 promoting reuse of anchor points between different modes is
9348 to use an 8-bit range unconditionally. */
9349 vnum = ((vnum + 128) & 255) - 128;
9350 else
9351 /* Predicates are only handled singly, so we might as well use
9352 the full range. */
9353 vnum = ((vnum + 256) & 511) - 256;
9354 if (vnum == 0)
9355 return false;
9357 /* Convert the "mul vl" multiplier into a byte offset. */
9358 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
9359 if (known_eq (second_offset, orig_offset))
9360 return false;
9362 /* Split the offset into second_offset and the rest. */
9363 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
9364 *offset2 = gen_int_mode (second_offset, Pmode);
9365 return true;
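/* Worked example (editor's illustration, not part of the original source):
   on the constant-size path above, a DImode access at offset 0x10008 is
   8-byte aligned and wider than 2 bytes, so
     second_offset = 0x10008 & 0x3ffc = 0x8,
   giving *OFFSET1 = 0x10000 and *OFFSET2 = 0x8.  The base plus 0x10000 can
   then be shared between neighbouring accesses, while #0x8 fits the scaled
   12-bit LDR/STR range. */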
9369 /* Return the binary representation of floating point constant VALUE in INTVAL.
9370 If the value cannot be converted, return false without setting INTVAL.
9371 The conversion is done in the given MODE. */
9372 bool
9373 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
9376 /* We make a general exception for 0. */
9377 if (aarch64_float_const_zero_rtx_p (value))
9379 *intval = 0;
9380 return true;
9383 scalar_float_mode mode;
9384 if (GET_CODE (value) != CONST_DOUBLE
9385 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
9386 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
9387 /* Only support up to DF mode. */
9388 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
9389 return false;
9391 unsigned HOST_WIDE_INT ival = 0;
9393 long res[2];
9394 real_to_target (res,
9395 CONST_DOUBLE_REAL_VALUE (value),
9396 REAL_MODE_FORMAT (mode));
9398 if (mode == DFmode)
9400 int order = BYTES_BIG_ENDIAN ? 1 : 0;
9401 ival = zext_hwi (res[order], 32);
9402 ival |= (zext_hwi (res[1 - order], 32) << 32);
9404 else
9405 ival = zext_hwi (res[0], 32);
9407 *intval = ival;
9408 return true;
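/* Worked example (editor's illustration, not part of the original source):
   for (const_double:DF 1.0) the function stores 0x3ff0000000000000 in
   *INTVAL, and for (const_double:SF 1.0) it stores 0x3f800000.  A rough
   host-side analogue, assuming IEEE-754 doubles, is

     double d = 1.0;
     unsigned long long bits;
     memcpy (&bits, &d, sizeof bits);    -- bits == 0x3ff0000000000000

   which is the integer that a MOV/MOVK sequence would materialise before
   an FMOV into the FP register. */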
9411 /* Return TRUE if rtx X is an immediate constant that can be moved using a
9412 single MOV(+MOVK) followed by an FMOV. */
9413 bool
9414 aarch64_float_const_rtx_p (rtx x)
9416 machine_mode mode = GET_MODE (x);
9417 if (mode == VOIDmode)
9418 return false;
9420 /* Determine whether it's cheaper to write float constants as
9421 mov/movk pairs over ldr/adrp pairs. */
9422 unsigned HOST_WIDE_INT ival;
9424 if (GET_CODE (x) == CONST_DOUBLE
9425 && SCALAR_FLOAT_MODE_P (mode)
9426 && aarch64_reinterpret_float_as_int (x, &ival))
9428 scalar_int_mode imode = (mode == HFmode
9429 ? SImode
9430 : int_mode_for_mode (mode).require ());
9431 int num_instr = aarch64_internal_mov_immediate
9432 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
9433 return num_instr < 3;
9436 return false;
9439 /* Return TRUE if rtx X is immediate constant 0.0 */
9440 bool
9441 aarch64_float_const_zero_rtx_p (rtx x)
9443 if (GET_MODE (x) == VOIDmode)
9444 return false;
9446 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
9447 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
9448 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
9451 /* Return TRUE if rtx X is immediate constant that fits in a single
9452 MOVI immediate operation. */
9453 bool
9454 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
9456 if (!TARGET_SIMD)
9457 return false;
9459 machine_mode vmode;
9460 scalar_int_mode imode;
9461 unsigned HOST_WIDE_INT ival;
9463 if (GET_CODE (x) == CONST_DOUBLE
9464 && SCALAR_FLOAT_MODE_P (mode))
9466 if (!aarch64_reinterpret_float_as_int (x, &ival))
9467 return false;
9469 /* We make a general exception for 0. */
9470 if (aarch64_float_const_zero_rtx_p (x))
9471 return true;
9473 imode = int_mode_for_mode (mode).require ();
9475 else if (GET_CODE (x) == CONST_INT
9476 && is_a <scalar_int_mode> (mode, &imode))
9477 ival = INTVAL (x);
9478 else
9479 return false;
9481 /* Use a 64-bit container mode for everything except DI/DF mode, where
9482 we use a 128-bit vector mode. */
9483 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
9485 vmode = aarch64_simd_container_mode (imode, width);
9486 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
9488 return aarch64_simd_valid_immediate (v_op, NULL);
9492 /* Return the fixed registers used for condition codes. */
9494 static bool
9495 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
9497 *p1 = CC_REGNUM;
9498 *p2 = INVALID_REGNUM;
9499 return true;
9502 /* This function is used by the call expanders of the machine description.
9503 RESULT is the register in which the result is returned. It's NULL for
9504 "call" and "sibcall".
9505 MEM is the location of the function call.
9506 CALLEE_ABI is a const_int that gives the arm_pcs of the callee.
9507 SIBCALL indicates whether this function call is normal call or sibling call.
9508 It will generate different pattern accordingly. */
9510 void
9511 aarch64_expand_call (rtx result, rtx mem, rtx callee_abi, bool sibcall)
9513 rtx call, callee, tmp;
9514 rtvec vec;
9515 machine_mode mode;
9517 gcc_assert (MEM_P (mem));
9518 callee = XEXP (mem, 0);
9519 mode = GET_MODE (callee);
9520 gcc_assert (mode == Pmode);
9522 /* Decide if we should generate indirect calls by loading the
9523 address of the callee into a register before performing
9524 the branch-and-link. */
9525 if (SYMBOL_REF_P (callee)
9526 ? (aarch64_is_long_call_p (callee)
9527 || aarch64_is_noplt_call_p (callee))
9528 : !REG_P (callee))
9529 XEXP (mem, 0) = force_reg (mode, callee);
9531 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
9533 if (result != NULL_RTX)
9534 call = gen_rtx_SET (result, call);
9536 if (sibcall)
9537 tmp = ret_rtx;
9538 else
9539 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
9541 gcc_assert (CONST_INT_P (callee_abi));
9542 callee_abi = gen_rtx_UNSPEC (DImode, gen_rtvec (1, callee_abi),
9543 UNSPEC_CALLEE_ABI);
9545 vec = gen_rtvec (3, call, callee_abi, tmp);
9546 call = gen_rtx_PARALLEL (VOIDmode, vec);
9548 aarch64_emit_call_insn (call);
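/* Illustration (editor's note, not part of the original source): for a
   normal call to "foo" with no result and the base PCS (callee_abi 0),
   the PARALLEL built above has roughly the shape

     (parallel [(call (mem (symbol_ref "foo")) (const_int 0))
                (unspec:DI [(const_int 0)] UNSPEC_CALLEE_ABI)
                (clobber (reg:DI LR_REGNUM))])

   while a sibling call replaces the CLOBBER with ret_rtx. */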
9551 /* Emit call insn with PAT and do aarch64-specific handling. */
9553 void
9554 aarch64_emit_call_insn (rtx pat)
9556 rtx insn = emit_call_insn (pat);
9558 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
9559 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
9560 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
9563 machine_mode
9564 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
9566 machine_mode mode_x = GET_MODE (x);
9567 rtx_code code_x = GET_CODE (x);
9569 /* All floating point compares return CCFP if it is an equality
9570 comparison, and CCFPE otherwise. */
9571 if (GET_MODE_CLASS (mode_x) == MODE_FLOAT)
9573 switch (code)
9575 case EQ:
9576 case NE:
9577 case UNORDERED:
9578 case ORDERED:
9579 case UNLT:
9580 case UNLE:
9581 case UNGT:
9582 case UNGE:
9583 case UNEQ:
9584 return CCFPmode;
9586 case LT:
9587 case LE:
9588 case GT:
9589 case GE:
9590 case LTGT:
9591 return CCFPEmode;
9593 default:
9594 gcc_unreachable ();
9598 /* Equality comparisons of short modes against zero can be performed
9599 using the TST instruction with the appropriate bitmask. */
9600 if (y == const0_rtx && (REG_P (x) || SUBREG_P (x))
9601 && (code == EQ || code == NE)
9602 && (mode_x == HImode || mode_x == QImode))
9603 return CC_NZmode;
9605 /* Similarly, comparisons of zero_extends from shorter modes can
9606 be performed using an ANDS with an immediate mask. */
9607 if (y == const0_rtx && code_x == ZERO_EXTEND
9608 && (mode_x == SImode || mode_x == DImode)
9609 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
9610 && (code == EQ || code == NE))
9611 return CC_NZmode;
9613 if ((mode_x == SImode || mode_x == DImode)
9614 && y == const0_rtx
9615 && (code == EQ || code == NE || code == LT || code == GE)
9616 && (code_x == PLUS || code_x == MINUS || code_x == AND
9617 || code_x == NEG
9618 || (code_x == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
9619 && CONST_INT_P (XEXP (x, 2)))))
9620 return CC_NZmode;
9622 /* A compare with a shifted operand. Because of canonicalization,
9623 the comparison will have to be swapped when we emit the assembly
9624 code. */
9625 if ((mode_x == SImode || mode_x == DImode)
9626 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
9627 && (code_x == ASHIFT || code_x == ASHIFTRT
9628 || code_x == LSHIFTRT
9629 || code_x == ZERO_EXTEND || code_x == SIGN_EXTEND))
9630 return CC_SWPmode;
9632 /* Similarly for a negated operand, but we can only do this for
9633 equalities. */
9634 if ((mode_x == SImode || mode_x == DImode)
9635 && (REG_P (y) || GET_CODE (y) == SUBREG)
9636 && (code == EQ || code == NE)
9637 && code_x == NEG)
9638 return CC_Zmode;
9640 /* A test for unsigned overflow from an addition. */
9641 if ((mode_x == DImode || mode_x == TImode)
9642 && (code == LTU || code == GEU)
9643 && code_x == PLUS
9644 && rtx_equal_p (XEXP (x, 0), y))
9645 return CC_Cmode;
9647 /* A test for unsigned overflow from an add with carry. */
9648 if ((mode_x == DImode || mode_x == TImode)
9649 && (code == LTU || code == GEU)
9650 && code_x == PLUS
9651 && CONST_SCALAR_INT_P (y)
9652 && (rtx_mode_t (y, mode_x)
9653 == (wi::shwi (1, mode_x)
9654 << (GET_MODE_BITSIZE (mode_x).to_constant () / 2))))
9655 return CC_ADCmode;
9657 /* A test for signed overflow. */
9658 if ((mode_x == DImode || mode_x == TImode)
9659 && code == NE
9660 && code_x == PLUS
9661 && GET_CODE (y) == SIGN_EXTEND)
9662 return CC_Vmode;
9664 /* For everything else, return CCmode. */
9665 return CCmode;
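/* Worked examples (editor's illustration, not part of the original source):

     (compare (ashift:SI (reg w0) (const_int 2)) (reg w1))
       -> CC_SWPmode: the shifted operand must become the second operand
          of the emitted "cmp w1, w0, lsl 2", so the condition is swapped
          when the assembly is printed.

     LTU on (compare (plus:DI (reg x0) (reg x1)) (reg x0))
       -> CC_Cmode: the sum is below the first addend (unsigned) exactly
          when the addition carried out, i.e. unsigned overflow. */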
9668 static int
9669 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
9671 int
9672 aarch64_get_condition_code (rtx x)
9674 machine_mode mode = GET_MODE (XEXP (x, 0));
9675 enum rtx_code comp_code = GET_CODE (x);
9677 if (GET_MODE_CLASS (mode) != MODE_CC)
9678 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
9679 return aarch64_get_condition_code_1 (mode, comp_code);
9682 static int
9683 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
9685 switch (mode)
9687 case E_CCFPmode:
9688 case E_CCFPEmode:
9689 switch (comp_code)
9691 case GE: return AARCH64_GE;
9692 case GT: return AARCH64_GT;
9693 case LE: return AARCH64_LS;
9694 case LT: return AARCH64_MI;
9695 case NE: return AARCH64_NE;
9696 case EQ: return AARCH64_EQ;
9697 case ORDERED: return AARCH64_VC;
9698 case UNORDERED: return AARCH64_VS;
9699 case UNLT: return AARCH64_LT;
9700 case UNLE: return AARCH64_LE;
9701 case UNGT: return AARCH64_HI;
9702 case UNGE: return AARCH64_PL;
9703 default: return -1;
9705 break;
9707 case E_CCmode:
9708 switch (comp_code)
9710 case NE: return AARCH64_NE;
9711 case EQ: return AARCH64_EQ;
9712 case GE: return AARCH64_GE;
9713 case GT: return AARCH64_GT;
9714 case LE: return AARCH64_LE;
9715 case LT: return AARCH64_LT;
9716 case GEU: return AARCH64_CS;
9717 case GTU: return AARCH64_HI;
9718 case LEU: return AARCH64_LS;
9719 case LTU: return AARCH64_CC;
9720 default: return -1;
9722 break;
9724 case E_CC_SWPmode:
9725 switch (comp_code)
9727 case NE: return AARCH64_NE;
9728 case EQ: return AARCH64_EQ;
9729 case GE: return AARCH64_LE;
9730 case GT: return AARCH64_LT;
9731 case LE: return AARCH64_GE;
9732 case LT: return AARCH64_GT;
9733 case GEU: return AARCH64_LS;
9734 case GTU: return AARCH64_CC;
9735 case LEU: return AARCH64_CS;
9736 case LTU: return AARCH64_HI;
9737 default: return -1;
9739 break;
9741 case E_CC_NZCmode:
9742 switch (comp_code)
9744 case NE: return AARCH64_NE; /* = any */
9745 case EQ: return AARCH64_EQ; /* = none */
9746 case GE: return AARCH64_PL; /* = nfrst */
9747 case LT: return AARCH64_MI; /* = first */
9748 case GEU: return AARCH64_CS; /* = nlast */
9749 case GTU: return AARCH64_HI; /* = pmore */
9750 case LEU: return AARCH64_LS; /* = plast */
9751 case LTU: return AARCH64_CC; /* = last */
9752 default: return -1;
9754 break;
9756 case E_CC_NZmode:
9757 switch (comp_code)
9759 case NE: return AARCH64_NE;
9760 case EQ: return AARCH64_EQ;
9761 case GE: return AARCH64_PL;
9762 case LT: return AARCH64_MI;
9763 default: return -1;
9765 break;
9767 case E_CC_Zmode:
9768 switch (comp_code)
9770 case NE: return AARCH64_NE;
9771 case EQ: return AARCH64_EQ;
9772 default: return -1;
9774 break;
9776 case E_CC_Cmode:
9777 switch (comp_code)
9779 case LTU: return AARCH64_CS;
9780 case GEU: return AARCH64_CC;
9781 default: return -1;
9783 break;
9785 case E_CC_ADCmode:
9786 switch (comp_code)
9788 case GEU: return AARCH64_CS;
9789 case LTU: return AARCH64_CC;
9790 default: return -1;
9792 break;
9794 case E_CC_Vmode:
9795 switch (comp_code)
9797 case NE: return AARCH64_VS;
9798 case EQ: return AARCH64_VC;
9799 default: return -1;
9801 break;
9803 default:
9804 return -1;
9807 return -1;
9810 bool
9811 aarch64_const_vec_all_same_in_range_p (rtx x,
9812 HOST_WIDE_INT minval,
9813 HOST_WIDE_INT maxval)
9815 rtx elt;
9816 return (const_vec_duplicate_p (x, &elt)
9817 && CONST_INT_P (elt)
9818 && IN_RANGE (INTVAL (elt), minval, maxval));
9821 bool
9822 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
9824 return aarch64_const_vec_all_same_in_range_p (x, val, val);
9827 /* Return true if VEC is a constant in which every element is in the range
9828 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
9830 static bool
9831 aarch64_const_vec_all_in_range_p (rtx vec,
9832 HOST_WIDE_INT minval,
9833 HOST_WIDE_INT maxval)
9835 if (GET_CODE (vec) != CONST_VECTOR
9836 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
9837 return false;
9839 int nunits;
9840 if (!CONST_VECTOR_STEPPED_P (vec))
9841 nunits = const_vector_encoded_nelts (vec);
9842 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
9843 return false;
9845 for (int i = 0; i < nunits; i++)
9847 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
9848 if (!CONST_INT_P (vec_elem)
9849 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
9850 return false;
9852 return true;
9855 /* N Z C V. */
9856 #define AARCH64_CC_V 1
9857 #define AARCH64_CC_C (1 << 1)
9858 #define AARCH64_CC_Z (1 << 2)
9859 #define AARCH64_CC_N (1 << 3)
9861 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
9862 static const int aarch64_nzcv_codes[] =
9864 0, /* EQ, Z == 1. */
9865 AARCH64_CC_Z, /* NE, Z == 0. */
9866 0, /* CS, C == 1. */
9867 AARCH64_CC_C, /* CC, C == 0. */
9868 0, /* MI, N == 1. */
9869 AARCH64_CC_N, /* PL, N == 0. */
9870 0, /* VS, V == 1. */
9871 AARCH64_CC_V, /* VC, V == 0. */
9872 0, /* HI, C == 1 && Z == 0. */
9873 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
9874 AARCH64_CC_V, /* GE, N == V. */
9875 0, /* LT, N != V. */
9876 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
9877 0, /* LE, !(Z == 0 && N == V). */
9878 0, /* AL, Any. */
9879 0 /* NV, Any. */
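/* Editor's note (illustration, not part of the original source; assumes the
   usual EQ = 0, NE = 1 encoding of AARCH64_COND_CODE): the 'k' operand code
   below prints these entries as the NZCV immediate of a CCMP, e.g.
   aarch64_nzcv_codes[AARCH64_NE] == AARCH64_CC_Z == 4, so "%k" on an
   operand equal to AARCH64_NE prints "4". */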
9882 /* Print floating-point vector immediate operand X to F, negating it
9883 first if NEGATE is true. Return true on success, false if it isn't
9884 a constant we can handle. */
9886 static bool
9887 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
9889 rtx elt;
9891 if (!const_vec_duplicate_p (x, &elt))
9892 return false;
9894 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
9895 if (negate)
9896 r = real_value_negate (&r);
9898 /* Handle the SVE single-bit immediates specially, since they have a
9899 fixed form in the assembly syntax. */
9900 if (real_equal (&r, &dconst0))
9901 asm_fprintf (f, "0.0");
9902 else if (real_equal (&r, &dconst2))
9903 asm_fprintf (f, "2.0");
9904 else if (real_equal (&r, &dconst1))
9905 asm_fprintf (f, "1.0");
9906 else if (real_equal (&r, &dconsthalf))
9907 asm_fprintf (f, "0.5");
9908 else
9910 const int buf_size = 20;
9911 char float_buf[buf_size] = {'\0'};
9912 real_to_decimal_for_mode (float_buf, &r, buf_size, buf_size,
9913 1, GET_MODE (elt));
9914 asm_fprintf (f, "%s", float_buf);
9917 return true;
9920 /* Return the equivalent letter for size. */
9921 static char
9922 sizetochar (int size)
9924 switch (size)
9926 case 64: return 'd';
9927 case 32: return 's';
9928 case 16: return 'h';
9929 case 8 : return 'b';
9930 default: gcc_unreachable ();
9934 /* Print operand X to file F in a target specific manner according to CODE.
9935 The acceptable formatting commands given by CODE are:
9936 'c': An integer or symbol address without a preceding #
9937 sign.
9938 'C': Take the duplicated element in a vector constant
9939 and print it in hex.
9940 'D': Take the duplicated element in a vector constant
9941 and print it as an unsigned integer, in decimal.
9942 'e': Print the sign/zero-extend size as a character 8->b,
9943 16->h, 32->w. Can also be used for masks:
9944 0xff->b, 0xffff->h, 0xffffffff->w.
9945 'I': If the operand is a duplicated vector constant,
9946 replace it with the duplicated scalar. If the
9947 operand is then a floating-point constant, replace
9948 it with the integer bit representation. Print the
9949 transformed constant as a signed decimal number.
9950 'p': Prints N such that 2^N == X (X must be power of 2 and
9951 const int).
9952 'P': Print the number of non-zero bits in X (a const_int).
9953 'H': Print the higher numbered register of a pair (TImode)
9954 of regs.
9955 'm': Print a condition (eq, ne, etc).
9956 'M': Same as 'm', but invert condition.
9957 'N': Take the duplicated element in a vector constant
9958 and print the negative of it in decimal.
9959 'b/h/s/d/q': Print a scalar FP/SIMD register name.
9960 'S/T/U/V': Print a FP/SIMD register name for a register list.
9961 The register printed is the FP/SIMD register name
9962 of X + 0/1/2/3 for S/T/U/V.
9963 'R': Print a scalar Integer/FP/SIMD register name + 1.
9964 'X': Print bottom 16 bits of integer constant in hex.
9965 'w/x': Print a general register name or the zero register
9966 (32-bit or 64-bit).
9967 '0': Print a normal operand, if it's a general register,
9968 then we assume DImode.
9969 'k': Print NZCV for conditional compare instructions.
9970 'A': Output address constant representing the first
9971 argument of X, specifying a relocation offset
9972 if appropriate.
9973 'L': Output constant address specified by X
9974 with a relocation offset if appropriate.
9975 'G': Prints address of X, specifying a PC relative
9976 relocation mode if appropriate.
9977 'y': Output address of LDP or STP - this is used for
9978 some LDP/STPs which don't use a PARALLEL in their
9979 pattern (so the mode needs to be adjusted).
9980 'z': Output address of a typical LDP or STP. */
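/* Examples (editor's illustration, not part of the original source): with
   operand 0 the hard register x3 in SImode and operand 1 (const_int 16),
   "%w0" prints "w3", "%x0" prints "x3", "%p1" prints "4" (exact_log2 (16))
   and "%P1" prints "1" (one bit set). */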
9982 static void
9983 aarch64_print_operand (FILE *f, rtx x, int code)
9985 rtx elt;
9986 switch (code)
9988 case 'c':
9989 switch (GET_CODE (x))
9991 case CONST_INT:
9992 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
9993 break;
9995 case SYMBOL_REF:
9996 output_addr_const (f, x);
9997 break;
9999 case CONST:
10000 if (GET_CODE (XEXP (x, 0)) == PLUS
10001 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
10003 output_addr_const (f, x);
10004 break;
10006 /* Fall through. */
10008 default:
10009 output_operand_lossage ("unsupported operand for code '%c'", code);
10011 break;
10013 case 'e':
10015 x = unwrap_const_vec_duplicate (x);
10016 if (!CONST_INT_P (x))
10018 output_operand_lossage ("invalid operand for '%%%c'", code);
10019 return;
10022 HOST_WIDE_INT val = INTVAL (x);
10023 if ((val & ~7) == 8 || val == 0xff)
10024 fputc ('b', f);
10025 else if ((val & ~7) == 16 || val == 0xffff)
10026 fputc ('h', f);
10027 else if ((val & ~7) == 32 || val == 0xffffffff)
10028 fputc ('w', f);
10029 else
10031 output_operand_lossage ("invalid operand for '%%%c'", code);
10032 return;
10035 break;
10037 case 'p':
10039 int n;
10041 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
10043 output_operand_lossage ("invalid operand for '%%%c'", code);
10044 return;
10047 asm_fprintf (f, "%d", n);
10049 break;
10051 case 'P':
10052 if (!CONST_INT_P (x))
10054 output_operand_lossage ("invalid operand for '%%%c'", code);
10055 return;
10058 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
10059 break;
10061 case 'H':
10062 if (x == const0_rtx)
10064 asm_fprintf (f, "xzr");
10065 break;
10068 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
10070 output_operand_lossage ("invalid operand for '%%%c'", code);
10071 return;
10074 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
10075 break;
10077 case 'I':
10079 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
10080 if (CONST_INT_P (x))
10081 asm_fprintf (f, "%wd", INTVAL (x));
10082 else
10084 output_operand_lossage ("invalid operand for '%%%c'", code);
10085 return;
10087 break;
10090 case 'M':
10091 case 'm':
10093 int cond_code;
10094 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
10095 if (x == const_true_rtx)
10097 if (code == 'M')
10098 fputs ("nv", f);
10099 return;
10102 if (!COMPARISON_P (x))
10104 output_operand_lossage ("invalid operand for '%%%c'", code);
10105 return;
10108 cond_code = aarch64_get_condition_code (x);
10109 gcc_assert (cond_code >= 0);
10110 if (code == 'M')
10111 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
10112 if (GET_MODE (XEXP (x, 0)) == CC_NZCmode)
10113 fputs (aarch64_sve_condition_codes[cond_code], f);
10114 else
10115 fputs (aarch64_condition_codes[cond_code], f);
10117 break;
10119 case 'N':
10120 if (!const_vec_duplicate_p (x, &elt))
10122 output_operand_lossage ("invalid vector constant");
10123 return;
10126 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10127 asm_fprintf (f, "%wd", -INTVAL (elt));
10128 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10129 && aarch64_print_vector_float_operand (f, x, true))
10131 else
10133 output_operand_lossage ("invalid vector constant");
10134 return;
10136 break;
10138 case 'b':
10139 case 'h':
10140 case 's':
10141 case 'd':
10142 case 'q':
10143 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10145 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10146 return;
10148 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
10149 break;
10151 case 'S':
10152 case 'T':
10153 case 'U':
10154 case 'V':
10155 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
10157 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
10158 return;
10160 asm_fprintf (f, "%c%d",
10161 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
10162 REGNO (x) - V0_REGNUM + (code - 'S'));
10163 break;
10165 case 'R':
10166 if (REG_P (x) && FP_REGNUM_P (REGNO (x)))
10167 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
10168 else if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10169 asm_fprintf (f, "x%d", REGNO (x) - R0_REGNUM + 1);
10170 else
10171 output_operand_lossage ("incompatible register operand for '%%%c'",
10172 code);
10173 break;
10175 case 'X':
10176 if (!CONST_INT_P (x))
10178 output_operand_lossage ("invalid operand for '%%%c'", code);
10179 return;
10181 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
10182 break;
10184 case 'C':
10186 /* Print a replicated constant in hex. */
10187 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10189 output_operand_lossage ("invalid operand for '%%%c'", code);
10190 return;
10192 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10193 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10195 break;
10197 case 'D':
10199 /* Print a replicated constant in decimal, treating it as
10200 unsigned. */
10201 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
10203 output_operand_lossage ("invalid operand for '%%%c'", code);
10204 return;
10206 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
10207 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
10209 break;
10211 case 'w':
10212 case 'x':
10213 if (x == const0_rtx
10214 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
10216 asm_fprintf (f, "%czr", code);
10217 break;
10220 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
10222 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
10223 break;
10226 if (REG_P (x) && REGNO (x) == SP_REGNUM)
10228 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
10229 break;
10232 /* Fall through */
10234 case 0:
10235 if (x == NULL)
10237 output_operand_lossage ("missing operand");
10238 return;
10241 switch (GET_CODE (x))
10243 case REG:
10244 if (aarch64_sve_data_mode_p (GET_MODE (x)))
10246 if (REG_NREGS (x) == 1)
10247 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
10248 else
10250 char suffix
10251 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
10252 asm_fprintf (f, "{z%d.%c - z%d.%c}",
10253 REGNO (x) - V0_REGNUM, suffix,
10254 END_REGNO (x) - V0_REGNUM - 1, suffix);
10257 else
10258 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
10259 break;
10261 case MEM:
10262 output_address (GET_MODE (x), XEXP (x, 0));
10263 break;
10265 case LABEL_REF:
10266 case SYMBOL_REF:
10267 output_addr_const (asm_out_file, x);
10268 break;
10270 case CONST_INT:
10271 asm_fprintf (f, "%wd", INTVAL (x));
10272 break;
10274 case CONST:
10275 if (!VECTOR_MODE_P (GET_MODE (x)))
10277 output_addr_const (asm_out_file, x);
10278 break;
10280 /* fall through */
10282 case CONST_VECTOR:
10283 if (!const_vec_duplicate_p (x, &elt))
10285 output_operand_lossage ("invalid vector constant");
10286 return;
10289 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
10290 asm_fprintf (f, "%wd", INTVAL (elt));
10291 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10292 && aarch64_print_vector_float_operand (f, x, false))
10294 else
10296 output_operand_lossage ("invalid vector constant");
10297 return;
10299 break;
10301 case CONST_DOUBLE:
10302 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
10303 be getting CONST_DOUBLEs holding integers. */
10304 gcc_assert (GET_MODE (x) != VOIDmode);
10305 if (aarch64_float_const_zero_rtx_p (x))
10307 fputc ('0', f);
10308 break;
10310 else if (aarch64_float_const_representable_p (x))
10312 #define buf_size 20
10313 char float_buf[buf_size] = {'\0'};
10314 real_to_decimal_for_mode (float_buf,
10315 CONST_DOUBLE_REAL_VALUE (x),
10316 buf_size, buf_size,
10317 1, GET_MODE (x));
10318 asm_fprintf (asm_out_file, "%s", float_buf);
10319 break;
10320 #undef buf_size
10322 output_operand_lossage ("invalid constant");
10323 return;
10324 default:
10325 output_operand_lossage ("invalid operand");
10326 return;
10328 break;
10330 case 'A':
10331 if (GET_CODE (x) == HIGH)
10332 x = XEXP (x, 0);
10334 switch (aarch64_classify_symbolic_expression (x))
10336 case SYMBOL_SMALL_GOT_4G:
10337 asm_fprintf (asm_out_file, ":got:");
10338 break;
10340 case SYMBOL_SMALL_TLSGD:
10341 asm_fprintf (asm_out_file, ":tlsgd:");
10342 break;
10344 case SYMBOL_SMALL_TLSDESC:
10345 asm_fprintf (asm_out_file, ":tlsdesc:");
10346 break;
10348 case SYMBOL_SMALL_TLSIE:
10349 asm_fprintf (asm_out_file, ":gottprel:");
10350 break;
10352 case SYMBOL_TLSLE24:
10353 asm_fprintf (asm_out_file, ":tprel:");
10354 break;
10356 case SYMBOL_TINY_GOT:
10357 gcc_unreachable ();
10358 break;
10360 default:
10361 break;
10363 output_addr_const (asm_out_file, x);
10364 break;
10366 case 'L':
10367 switch (aarch64_classify_symbolic_expression (x))
10369 case SYMBOL_SMALL_GOT_4G:
10370 asm_fprintf (asm_out_file, ":lo12:");
10371 break;
10373 case SYMBOL_SMALL_TLSGD:
10374 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
10375 break;
10377 case SYMBOL_SMALL_TLSDESC:
10378 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
10379 break;
10381 case SYMBOL_SMALL_TLSIE:
10382 asm_fprintf (asm_out_file, ":gottprel_lo12:");
10383 break;
10385 case SYMBOL_TLSLE12:
10386 asm_fprintf (asm_out_file, ":tprel_lo12:");
10387 break;
10389 case SYMBOL_TLSLE24:
10390 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
10391 break;
10393 case SYMBOL_TINY_GOT:
10394 asm_fprintf (asm_out_file, ":got:");
10395 break;
10397 case SYMBOL_TINY_TLSIE:
10398 asm_fprintf (asm_out_file, ":gottprel:");
10399 break;
10401 default:
10402 break;
10404 output_addr_const (asm_out_file, x);
10405 break;
10407 case 'G':
10408 switch (aarch64_classify_symbolic_expression (x))
10410 case SYMBOL_TLSLE24:
10411 asm_fprintf (asm_out_file, ":tprel_hi12:");
10412 break;
10413 default:
10414 break;
10416 output_addr_const (asm_out_file, x);
10417 break;
10419 case 'k':
10421 HOST_WIDE_INT cond_code;
10423 if (!CONST_INT_P (x))
10425 output_operand_lossage ("invalid operand for '%%%c'", code);
10426 return;
10429 cond_code = INTVAL (x);
10430 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
10431 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
10433 break;
10435 case 'y':
10436 case 'z':
10438 machine_mode mode = GET_MODE (x);
10440 if (GET_CODE (x) != MEM
10441 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
10443 output_operand_lossage ("invalid operand for '%%%c'", code);
10444 return;
10447 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
10448 code == 'y'
10449 ? ADDR_QUERY_LDP_STP_N
10450 : ADDR_QUERY_LDP_STP))
10451 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10453 break;
10455 default:
10456 output_operand_lossage ("invalid operand prefix '%%%c'", code);
10457 return;
10461 /* Print address 'x' of a memory access with mode 'mode'.
10462 'type' is the aarch64_addr_query_type context required by
10463 aarch64_classify_address, e.g. ADDR_QUERY_ANY or ADDR_QUERY_LDP_STP. */
10464 static bool
10465 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
10466 aarch64_addr_query_type type)
10468 struct aarch64_address_info addr;
10469 unsigned int size, vec_flags;
10471 /* Check all addresses are Pmode - including ILP32. */
10472 if (GET_MODE (x) != Pmode
10473 && (!CONST_INT_P (x)
10474 || trunc_int_for_mode (INTVAL (x), Pmode) != INTVAL (x)))
10476 output_operand_lossage ("invalid address mode");
10477 return false;
10480 if (aarch64_classify_address (&addr, x, mode, true, type))
10481 switch (addr.type)
10483 case ADDRESS_REG_IMM:
10484 if (known_eq (addr.const_offset, 0))
10486 asm_fprintf (f, "[%s]", reg_names[REGNO (addr.base)]);
10487 return true;
10490 vec_flags = aarch64_classify_vector_mode (mode);
10491 if (vec_flags & VEC_ANY_SVE)
10493 HOST_WIDE_INT vnum
10494 = exact_div (addr.const_offset,
10495 aarch64_vl_bytes (mode, vec_flags)).to_constant ();
10496 asm_fprintf (f, "[%s, #%wd, mul vl]",
10497 reg_names[REGNO (addr.base)], vnum);
10498 return true;
10501 asm_fprintf (f, "[%s, %wd]", reg_names[REGNO (addr.base)],
10502 INTVAL (addr.offset));
10503 return true;
10505 case ADDRESS_REG_REG:
10506 if (addr.shift == 0)
10507 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
10508 reg_names [REGNO (addr.offset)]);
10509 else
10510 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
10511 reg_names [REGNO (addr.offset)], addr.shift);
10512 return true;
10514 case ADDRESS_REG_UXTW:
10515 if (addr.shift == 0)
10516 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
10517 REGNO (addr.offset) - R0_REGNUM);
10518 else
10519 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
10520 REGNO (addr.offset) - R0_REGNUM, addr.shift);
10521 return true;
10523 case ADDRESS_REG_SXTW:
10524 if (addr.shift == 0)
10525 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
10526 REGNO (addr.offset) - R0_REGNUM);
10527 else
10528 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
10529 REGNO (addr.offset) - R0_REGNUM, addr.shift);
10530 return true;
10532 case ADDRESS_REG_WB:
10533 /* Writeback is only supported for fixed-width modes. */
10534 size = GET_MODE_SIZE (mode).to_constant ();
10535 switch (GET_CODE (x))
10537 case PRE_INC:
10538 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
10539 return true;
10540 case POST_INC:
10541 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
10542 return true;
10543 case PRE_DEC:
10544 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
10545 return true;
10546 case POST_DEC:
10547 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
10548 return true;
10549 case PRE_MODIFY:
10550 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
10551 INTVAL (addr.offset));
10552 return true;
10553 case POST_MODIFY:
10554 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
10555 INTVAL (addr.offset));
10556 return true;
10557 default:
10558 break;
10560 break;
10562 case ADDRESS_LO_SUM:
10563 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
10564 output_addr_const (f, addr.offset);
10565 asm_fprintf (f, "]");
10566 return true;
10568 case ADDRESS_SYMBOLIC:
10569 output_addr_const (f, x);
10570 return true;
10573 return false;
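/* Sample outputs (editor's illustration, not part of the original source):
     ADDRESS_REG_IMM, base x0, offset 16              -> "[x0, 16]"
     ADDRESS_REG_IMM, SVE data mode, one VL ahead     -> "[x0, #1, mul vl]"
     ADDRESS_REG_REG, base x0, index x1, shift 3      -> "[x0, x1, lsl 3]"
     ADDRESS_REG_WB, PRE_INC of a 16-byte mode        -> "[x0, 16]!" */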
10576 /* Print address 'x' of a memory access with mode 'mode'. */
10577 static void
10578 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
10580 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
10581 output_addr_const (f, x);
10584 bool
10585 aarch64_label_mentioned_p (rtx x)
10587 const char *fmt;
10588 int i;
10590 if (GET_CODE (x) == LABEL_REF)
10591 return true;
10593 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
10594 referencing instruction, but they are constant offsets, not
10595 symbols. */
10596 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
10597 return false;
10599 fmt = GET_RTX_FORMAT (GET_CODE (x));
10600 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
10602 if (fmt[i] == 'E')
10604 int j;
10606 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
10607 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
10608 return 1;
10610 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
10611 return 1;
10614 return 0;
10617 /* Implement REGNO_REG_CLASS. */
10619 enum reg_class
10620 aarch64_regno_regclass (unsigned regno)
10622 if (STUB_REGNUM_P (regno))
10623 return STUB_REGS;
10625 if (GP_REGNUM_P (regno))
10626 return GENERAL_REGS;
10628 if (regno == SP_REGNUM)
10629 return STACK_REG;
10631 if (regno == FRAME_POINTER_REGNUM
10632 || regno == ARG_POINTER_REGNUM)
10633 return POINTER_REGS;
10635 if (FP_REGNUM_P (regno))
10636 return (FP_LO8_REGNUM_P (regno) ? FP_LO8_REGS
10637 : FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS);
10639 if (PR_REGNUM_P (regno))
10640 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
10642 if (regno == FFR_REGNUM || regno == FFRT_REGNUM)
10643 return FFR_REGS;
10645 return NO_REGS;
10648 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
10649 If OFFSET is out of range, return an offset of an anchor point
10650 that is in range. Return 0 otherwise. */
10652 static HOST_WIDE_INT
10653 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
10654 machine_mode mode)
10656 /* Does it look like we'll need a 16-byte load/store-pair operation? */
10657 if (size > 16)
10658 return (offset + 0x400) & ~0x7f0;
10660 /* For offsets that aren't a multiple of the access size, the limit is
10661 -256...255. */
10662 if (offset & (size - 1))
10664 /* BLKmode typically uses LDP of X-registers. */
10665 if (mode == BLKmode)
10666 return (offset + 512) & ~0x3ff;
10667 return (offset + 0x100) & ~0x1ff;
10670 /* Small negative offsets are supported. */
10671 if (IN_RANGE (offset, -256, 0))
10672 return 0;
10674 if (mode == TImode || mode == TFmode)
10675 return (offset + 0x100) & ~0x1ff;
10677 /* Otherwise use the 12-bit offset range, scaled by the access size. */
10678 return offset & (~0xfff * size);
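/* Worked examples (editor's illustration, not part of the original source),
   both for DImode so SIZE == 8:
     OFFSET == 0x12345 (misaligned): (0x12345 + 0x100) & ~0x1ff == 0x12400,
       so the anchor lands at 0x12400 and the residual -0xbb fits the signed
       9-bit unscaled range.
     OFFSET == 0x12340 (aligned): 0x12340 & (~0xfff * 8) == 0x10000, leaving
       a residual of 0x2340 that fits the scaled 12-bit range. */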
10681 static rtx
10682 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
10684 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
10685 where mask is selected by alignment and size of the offset.
10686 We try to pick as large a range for the offset as possible to
10687 maximize the chance of a CSE. However, for aligned addresses
10688 we limit the range to 4k so that structures with different sized
10689 elements are likely to use the same base. We need to be careful
10690 not to split a CONST for some forms of address expression, otherwise
10691 it will generate sub-optimal code. */
10693 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
10695 rtx base = XEXP (x, 0);
10696 rtx offset_rtx = XEXP (x, 1);
10697 HOST_WIDE_INT offset = INTVAL (offset_rtx);
10699 if (GET_CODE (base) == PLUS)
10701 rtx op0 = XEXP (base, 0);
10702 rtx op1 = XEXP (base, 1);
10704 /* Force any scaling into a temp for CSE. */
10705 op0 = force_reg (Pmode, op0);
10706 op1 = force_reg (Pmode, op1);
10708 /* Let the pointer register be in op0. */
10709 if (REG_POINTER (op1))
10710 std::swap (op0, op1);
10712 /* If the pointer is virtual or frame related, then we know that
10713 virtual register instantiation or register elimination is going
10714 to apply a second constant. We want the two constants folded
10715 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
10716 if (virt_or_elim_regno_p (REGNO (op0)))
10718 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
10719 NULL_RTX, true, OPTAB_DIRECT);
10720 return gen_rtx_PLUS (Pmode, base, op1);
10723 /* Otherwise, in order to encourage CSE (and thence loop strength
10724 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
10725 base = expand_binop (Pmode, add_optab, op0, op1,
10726 NULL_RTX, true, OPTAB_DIRECT);
10727 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
10730 HOST_WIDE_INT size;
10731 if (GET_MODE_SIZE (mode).is_constant (&size))
10733 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
10734 mode);
10735 if (base_offset != 0)
10737 base = plus_constant (Pmode, base, base_offset);
10738 base = force_operand (base, NULL_RTX);
10739 return plus_constant (Pmode, base, offset - base_offset);
10744 return x;
10747 static reg_class_t
10748 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
10749 reg_class_t rclass,
10750 machine_mode mode,
10751 secondary_reload_info *sri)
10753 /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
10754 LDR and STR. See the comment at the head of aarch64-sve.md for
10755 more details about the big-endian handling. */
10756 if (reg_class_subset_p (rclass, FP_REGS)
10757 && !((REG_P (x) && HARD_REGISTER_P (x))
10758 || aarch64_simd_valid_immediate (x, NULL))
10759 && mode != VNx16QImode)
10761 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
10762 if ((vec_flags & VEC_SVE_DATA)
10763 && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
10765 sri->icode = CODE_FOR_aarch64_sve_reload_mem;
10766 return NO_REGS;
10770 /* If we have to disable direct literal pool loads and stores because the
10771 function is too big, then we need a scratch register. */
10772 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
10773 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
10774 || targetm.vector_mode_supported_p (GET_MODE (x)))
10775 && !aarch64_pcrelative_literal_loads)
10777 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
10778 return NO_REGS;
10781 /* Without the TARGET_SIMD instructions we cannot move a Q register
10782 to a Q register directly. We need a scratch. */
10783 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
10784 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
10785 && reg_class_subset_p (rclass, FP_REGS))
10787 sri->icode = code_for_aarch64_reload_mov (mode);
10788 return NO_REGS;
10791 /* A TFmode or TImode memory access should be handled via FP_REGS,
10792 because AArch64 has richer addressing modes for LDR/STR instructions
10793 than for LDP/STP instructions. */
10794 if (TARGET_FLOAT && rclass == GENERAL_REGS
10795 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
10796 return FP_REGS;
10798 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
10799 return GENERAL_REGS;
10801 return NO_REGS;
10804 static bool
10805 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
10807 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
10809 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
10810 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
10811 if (frame_pointer_needed)
10812 return to == HARD_FRAME_POINTER_REGNUM;
10813 return true;
10816 poly_int64
10817 aarch64_initial_elimination_offset (unsigned from, unsigned to)
10819 if (to == HARD_FRAME_POINTER_REGNUM)
10821 if (from == ARG_POINTER_REGNUM)
10822 return cfun->machine->frame.hard_fp_offset;
10824 if (from == FRAME_POINTER_REGNUM)
10825 return cfun->machine->frame.hard_fp_offset
10826 - cfun->machine->frame.locals_offset;
10829 if (to == STACK_POINTER_REGNUM)
10831 if (from == FRAME_POINTER_REGNUM)
10832 return cfun->machine->frame.frame_size
10833 - cfun->machine->frame.locals_offset;
10836 return cfun->machine->frame.frame_size;
10840 /* Get return address without mangling. */
10843 aarch64_return_addr_rtx (void)
10845 rtx val = get_hard_reg_initial_val (Pmode, LR_REGNUM);
10846 /* Note: aarch64_return_address_signing_enabled only
10847 works after cfun->machine->frame.laid_out is set,
10848 so here we don't know if the return address will
10849 be signed or not. */
10850 rtx lr = gen_rtx_REG (Pmode, LR_REGNUM);
10851 emit_move_insn (lr, val);
10852 emit_insn (GEN_FCN (CODE_FOR_xpaclri) ());
10853 return lr;
10857 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
10858 previous frame. */
10861 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
10863 if (count != 0)
10864 return const0_rtx;
10865 return aarch64_return_addr_rtx ();
10868 static void
10869 aarch64_asm_trampoline_template (FILE *f)
10871 /* Even if the current function doesn't have branch protection, some
10872 later function might, so since this template is only generated once
10873 we have to add a BTI just in case. */
10874 asm_fprintf (f, "\thint\t34 // bti c\n");
10876 if (TARGET_ILP32)
10878 asm_fprintf (f, "\tldr\tw%d, .+20\n", IP1_REGNUM - R0_REGNUM);
10879 asm_fprintf (f, "\tldr\tw%d, .+20\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
10881 else
10883 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [IP1_REGNUM]);
10884 asm_fprintf (f, "\tldr\t%s, .+24\n", reg_names [STATIC_CHAIN_REGNUM]);
10886 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
10888 /* We always emit a speculation barrier.
10889 This is because the same trampoline template is used for every nested
10890 function. Since nested functions are not particularly common or
10891 performance-critical, we don't worry too much about the extra
10892 instructions that get copied around.
10893 This is not yet a problem, since we have not yet implemented function
10894 specific attributes to choose between hardening against straight line
10895 speculation or not, but such function specific attributes are likely to
10896 happen in the future. */
10897 asm_fprintf (f, "\tdsb\tsy\n\tisb\n");
10899 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
10900 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
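/* A minimal sketch of the trampoline this template produces (LP64,
   assuming x17 for IP1 and x18 for the static chain register):
     offset  0:  hint 34 (bti c)
     offset  4:  ldr x17, .+20    // target address, loaded from offset 24
     offset  8:  ldr x18, .+24    // static chain, loaded from offset 32
     offset 12:  br  x17
     offset 16:  dsb sy
     offset 20:  isb
     offset 24:  <target function address>  (filled in by the init hook)
     offset 32:  <static chain value>
   Under ILP32 the data slots are 4 bytes wide, at offsets 24 and 28,
   which is why both loads in that variant use ".+20".  */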
10903 static void
10904 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
10906 rtx fnaddr, mem, a_tramp;
10907 const int tramp_code_sz = 24;
10909 /* Don't need to copy the trailing D-words; we fill those in below. */
10910 /* We create our own memory address in Pmode so that `emit_block_move` can
10911 use parts of the backend which expect Pmode addresses. */
10912 rtx temp = convert_memory_address (Pmode, XEXP (m_tramp, 0));
10913 emit_block_move (gen_rtx_MEM (BLKmode, temp),
10914 assemble_trampoline_template (),
10915 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
10916 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
10917 fnaddr = XEXP (DECL_RTL (fndecl), 0);
10918 if (GET_MODE (fnaddr) != ptr_mode)
10919 fnaddr = convert_memory_address (ptr_mode, fnaddr);
10920 emit_move_insn (mem, fnaddr);
10922 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
10923 emit_move_insn (mem, chain_value);
10925 /* XXX We should really define a "clear_cache" pattern and use
10926 gen_clear_cache(). */
10927 a_tramp = XEXP (m_tramp, 0);
10928 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
10929 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
10930 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
10931 ptr_mode);
10934 static unsigned char
10935 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
10937 /* ??? Logically we should only need to provide a value when
10938 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
10939 can hold MODE, but at the moment we need to handle all modes.
10940 Just ignore any runtime parts for registers that can't store them. */
10941 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
10942 unsigned int nregs, vec_flags;
10943 switch (regclass)
10945 case STUB_REGS:
10946 case TAILCALL_ADDR_REGS:
10947 case POINTER_REGS:
10948 case GENERAL_REGS:
10949 case ALL_REGS:
10950 case POINTER_AND_FP_REGS:
10951 case FP_REGS:
10952 case FP_LO_REGS:
10953 case FP_LO8_REGS:
10954 vec_flags = aarch64_classify_vector_mode (mode);
10955 if ((vec_flags & VEC_SVE_DATA)
10956 && constant_multiple_p (GET_MODE_SIZE (mode),
10957 aarch64_vl_bytes (mode, vec_flags), &nregs))
10958 return nregs;
10959 return (vec_flags & VEC_ADVSIMD
10960 ? CEIL (lowest_size, UNITS_PER_VREG)
10961 : CEIL (lowest_size, UNITS_PER_WORD));
10962 case STACK_REG:
10963 case PR_REGS:
10964 case PR_LO_REGS:
10965 case PR_HI_REGS:
10966 case FFR_REGS:
10967 case PR_AND_FFR_REGS:
10968 return 1;
10970 case NO_REGS:
10971 return 0;
10973 default:
10974 break;
10976 gcc_unreachable ();
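/* A few illustrative results, assuming the usual 8-byte words and
   16-byte vector registers: TImode in GENERAL_REGS needs
   CEIL (16, 8) = 2 registers; V4SImode in FP_REGS needs
   CEIL (16, 16) = 1; an SVE data mode such as VNx4SImode occupies
   exactly one Z register whatever the runtime vector length; and any
   mode in PR_REGS or FFR_REGS reports 1.  */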
10979 static reg_class_t
10980 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
10982 if (regclass == POINTER_REGS)
10983 return GENERAL_REGS;
10985 if (regclass == STACK_REG)
10987 if (REG_P(x)
10988 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
10989 return regclass;
10991 return NO_REGS;
10994 /* Register elimination can result in a request for
10995 SP+constant->FP_REGS. We cannot support such operations, which
10996 use SP as the source and an FP_REG as the destination, so reject
10997 them outright here. */
10998 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
11000 rtx lhs = XEXP (x, 0);
11002 /* Look through a possible SUBREG introduced by ILP32. */
11003 if (GET_CODE (lhs) == SUBREG)
11004 lhs = SUBREG_REG (lhs);
11006 gcc_assert (REG_P (lhs));
11007 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
11008 POINTER_REGS));
11009 return NO_REGS;
11012 return regclass;
11015 void
11016 aarch64_asm_output_labelref (FILE* f, const char *name)
11018 asm_fprintf (f, "%U%s", name);
11021 static void
11022 aarch64_elf_asm_constructor (rtx symbol, int priority)
11024 if (priority == DEFAULT_INIT_PRIORITY)
11025 default_ctor_section_asm_out_constructor (symbol, priority);
11026 else
11028 section *s;
11029 /* While priority is known to be in range [0, 65535], so 18 bytes
11030 would be enough, the compiler might not know that. To avoid
11031 -Wformat-truncation false positive, use a larger size. */
11032 char buf[23];
11033 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
11034 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11035 switch_to_section (s);
11036 assemble_align (POINTER_SIZE);
11037 assemble_aligned_integer (POINTER_BYTES, symbol);
11041 static void
11042 aarch64_elf_asm_destructor (rtx symbol, int priority)
11044 if (priority == DEFAULT_INIT_PRIORITY)
11045 default_dtor_section_asm_out_destructor (symbol, priority);
11046 else
11048 section *s;
11049 /* While priority is known to be in range [0, 65535], so 18 bytes
11050 would be enough, the compiler might not know that. To avoid
11051 -Wformat-truncation false positive, use a larger size. */
11052 char buf[23];
11053 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
11054 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
11055 switch_to_section (s);
11056 assemble_align (POINTER_SIZE);
11057 assemble_aligned_integer (POINTER_BYTES, symbol);
11061 const char*
11062 aarch64_output_casesi (rtx *operands)
11064 char buf[100];
11065 char label[100];
11066 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
11067 int index;
11068 static const char *const patterns[4][2] =
11071 "ldrb\t%w3, [%0,%w1,uxtw]",
11072 "add\t%3, %4, %w3, sxtb #2"
11075 "ldrh\t%w3, [%0,%w1,uxtw #1]",
11076 "add\t%3, %4, %w3, sxth #2"
11079 "ldr\t%w3, [%0,%w1,uxtw #2]",
11080 "add\t%3, %4, %w3, sxtw #2"
11082 /* We assume that DImode is only generated when not optimizing and
11083 that we don't really need 64-bit address offsets. That would
11084 imply an object file with 8GB of code in a single function! */
11086 "ldr\t%w3, [%0,%w1,uxtw #2]",
11087 "add\t%3, %4, %w3, sxtw #2"
11091 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
11093 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
11094 index = exact_log2 (GET_MODE_SIZE (mode));
11096 gcc_assert (index >= 0 && index <= 3);
11098 /* Need to implement table size reduction, by changing the code below. */
11099 output_asm_insn (patterns[index][0], operands);
11100 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
11101 snprintf (buf, sizeof (buf),
11102 "adr\t%%4, %s", targetm.strip_name_encoding (label));
11103 output_asm_insn (buf, operands);
11104 output_asm_insn (patterns[index][1], operands);
11105 output_asm_insn ("br\t%3", operands);
11106 output_asm_insn (aarch64_sls_barrier (aarch64_harden_sls_retbr_p ()),
11107 operands);
11108 assemble_label (asm_out_file, label);
11109 return "";
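/* For instance, with example operand registers x0/w1/x3/x4 and a HImode
   dispatch table, the emitted sequence is roughly:
       ldrh  w3, [x0, w1, uxtw #1]   // load the table entry
       adr   x4, .LrtxN              // address of the table itself
       add   x3, x4, w3, sxth #2     // entries are byte offsets scaled down by 4
       br    x3
       <speculation barrier, when SLS hardening is enabled>
   .LrtxN:
   with the ADDR_DIFF_VEC itself following the label.  */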
11113 /* Return size in bits of an arithmetic operand which is shifted/scaled and
11114 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
11115 operator. */
11118 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
11120 if (shift >= 0 && shift <= 3)
11122 int size;
11123 for (size = 8; size <= 32; size *= 2)
11125 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
11126 if (mask == bits << shift)
11127 return size;
11130 return 0;
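/* For example, aarch64_uxt_size (0, 0xff) == 8 and
   aarch64_uxt_size (2, 0x3fc) == 8 (0xff shifted left by 2), both
   suitable for a UXTB-style extend, while aarch64_uxt_size (4, 0xff0) == 0
   because the shift amount is out of range.  */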
11133 /* Constant pools are per-function only when PC-relative
11134 literal loads are enabled or we are using the large memory
11135 model. */
11137 static inline bool
11138 aarch64_can_use_per_function_literal_pools_p (void)
11140 return (aarch64_pcrelative_literal_loads
11141 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
11144 static bool
11145 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
11147 /* We can't use blocks for constants when we're using a per-function
11148 constant pool. */
11149 return !aarch64_can_use_per_function_literal_pools_p ();
11152 /* Select appropriate section for constants depending
11153 on where we place literal pools. */
11155 static section *
11156 aarch64_select_rtx_section (machine_mode mode,
11157 rtx x,
11158 unsigned HOST_WIDE_INT align)
11160 if (aarch64_can_use_per_function_literal_pools_p ())
11161 return function_section (current_function_decl);
11163 return default_elf_select_rtx_section (mode, x, align);
11166 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
11167 void
11168 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
11169 HOST_WIDE_INT offset)
11171 /* When using per-function literal pools, we must ensure that any code
11172 section is aligned to the minimal instruction length, lest we get
11173 errors from the assembler about "unaligned instructions". */
11174 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
11175 ASM_OUTPUT_ALIGN (f, 2);
11178 /* Costs. */
11180 /* Helper function for rtx cost calculation. Strip a shift expression
11181 from X. Returns the inner operand if successful, or the original
11182 expression on failure. */
11183 static rtx
11184 aarch64_strip_shift (rtx x)
11186 rtx op = x;
11188 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
11189 we can convert both to ROR during final output. */
11190 if ((GET_CODE (op) == ASHIFT
11191 || GET_CODE (op) == ASHIFTRT
11192 || GET_CODE (op) == LSHIFTRT
11193 || GET_CODE (op) == ROTATERT
11194 || GET_CODE (op) == ROTATE)
11195 && CONST_INT_P (XEXP (op, 1)))
11196 return XEXP (op, 0);
11198 if (GET_CODE (op) == MULT
11199 && CONST_INT_P (XEXP (op, 1))
11200 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
11201 return XEXP (op, 0);
11203 return x;
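/* For instance, aarch64_strip_shift returns the inner register for
   (ashift (reg) (const_int 3)) and for (mult (reg) (const_int 8)) (a
   power-of-two multiply), but returns the original rtx unchanged for
   (mult (reg) (const_int 12)), which cannot be expressed as a shift.  */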
11206 /* Helper function for rtx cost calculation. Strip an extend
11207 expression from X. Returns the inner operand if successful, or the
11208 original expression on failure. We deal with a number of possible
11209 canonicalization variations here. If STRIP_SHIFT is true, then
11210 we can strip off a shift also. */
11211 static rtx
11212 aarch64_strip_extend (rtx x, bool strip_shift)
11214 scalar_int_mode mode;
11215 rtx op = x;
11217 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
11218 return op;
11220 /* Zero and sign extraction of a widened value. */
11221 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
11222 && XEXP (op, 2) == const0_rtx
11223 && GET_CODE (XEXP (op, 0)) == MULT
11224 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
11225 XEXP (op, 1)))
11226 return XEXP (XEXP (op, 0), 0);
11228 /* It can also be represented (for zero-extend) as an AND with an
11229 immediate. */
11230 if (GET_CODE (op) == AND
11231 && GET_CODE (XEXP (op, 0)) == MULT
11232 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
11233 && CONST_INT_P (XEXP (op, 1))
11234 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
11235 INTVAL (XEXP (op, 1))) != 0)
11236 return XEXP (XEXP (op, 0), 0);
11238 /* Now handle extended register, as this may also have an optional
11239 left shift by 1..4. */
11240 if (strip_shift
11241 && GET_CODE (op) == ASHIFT
11242 && CONST_INT_P (XEXP (op, 1))
11243 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
11244 op = XEXP (op, 0);
11246 if (GET_CODE (op) == ZERO_EXTEND
11247 || GET_CODE (op) == SIGN_EXTEND)
11248 op = XEXP (op, 0);
11250 if (op != x)
11251 return op;
11253 return x;
11256 /* Return true iff CODE is a shift supported in combination
11257 with arithmetic instructions. */
11259 static bool
11260 aarch64_shift_p (enum rtx_code code)
11262 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
11266 /* Return true iff X is a cheap shift without a sign extend. */
11268 static bool
11269 aarch64_cheap_mult_shift_p (rtx x)
11271 rtx op0, op1;
11273 op0 = XEXP (x, 0);
11274 op1 = XEXP (x, 1);
11276 if (!(aarch64_tune_params.extra_tuning_flags
11277 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
11278 return false;
11280 if (GET_CODE (op0) == SIGN_EXTEND)
11281 return false;
11283 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
11284 && UINTVAL (op1) <= 4)
11285 return true;
11287 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
11288 return false;
11290 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
11292 if (l2 > 0 && l2 <= 4)
11293 return true;
11295 return false;
11298 /* Helper function for rtx cost calculation. Calculate the cost of
11299 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
11300 Return the calculated cost of the expression, recursing manually in to
11301 operands where needed. */
11303 static int
11304 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
11306 rtx op0, op1;
11307 const struct cpu_cost_table *extra_cost
11308 = aarch64_tune_params.insn_extra_cost;
11309 int cost = 0;
11310 bool compound_p = (outer == PLUS || outer == MINUS);
11311 machine_mode mode = GET_MODE (x);
11313 gcc_checking_assert (code == MULT);
11315 op0 = XEXP (x, 0);
11316 op1 = XEXP (x, 1);
11318 if (VECTOR_MODE_P (mode))
11320 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11321 mode = GET_MODE_INNER (mode);
11322 if (vec_flags & VEC_ADVSIMD)
11324 /* The by-element versions of the instruction have the same costs as
11325 the normal 3-vector version. So don't add the costs of the
11326 duplicate into the costs of the multiply. We make an assumption
11327 that the input to the VEC_DUPLICATE is already on the FP & SIMD
11328 side. This means costing of a MUL by element pre RA is a bit
11329 optimistic. */
11330 if (GET_CODE (op0) == VEC_DUPLICATE)
11331 op0 = XEXP (op0, 0);
11332 else if (GET_CODE (op1) == VEC_DUPLICATE)
11333 op1 = XEXP (op1, 0);
11337 /* Integer multiply/fma. */
11338 if (GET_MODE_CLASS (mode) == MODE_INT)
11340 /* The multiply will be canonicalized as a shift, cost it as such. */
11341 if (aarch64_shift_p (GET_CODE (x))
11342 || (CONST_INT_P (op1)
11343 && exact_log2 (INTVAL (op1)) > 0))
11345 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
11346 || GET_CODE (op0) == SIGN_EXTEND;
11347 if (speed)
11349 if (compound_p)
11351 /* If the shift is considered cheap,
11352 then don't add any cost. */
11353 if (aarch64_cheap_mult_shift_p (x))
11355 else if (REG_P (op1))
11356 /* ARITH + shift-by-register. */
11357 cost += extra_cost->alu.arith_shift_reg;
11358 else if (is_extend)
11359 /* ARITH + extended register. We don't have a cost field
11360 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
11361 cost += extra_cost->alu.extend_arith;
11362 else
11363 /* ARITH + shift-by-immediate. */
11364 cost += extra_cost->alu.arith_shift;
11366 else
11367 /* LSL (immediate). */
11368 cost += extra_cost->alu.shift;
11371 /* Strip extends as we will have costed them in the case above. */
11372 if (is_extend)
11373 op0 = aarch64_strip_extend (op0, true);
11375 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
11377 return cost;
11380 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
11381 compound and let the below cases handle it. After all, MNEG is a
11382 special-case alias of MSUB. */
11383 if (GET_CODE (op0) == NEG)
11385 op0 = XEXP (op0, 0);
11386 compound_p = true;
11389 /* Integer multiplies or FMAs have zero/sign extending variants. */
11390 if ((GET_CODE (op0) == ZERO_EXTEND
11391 && GET_CODE (op1) == ZERO_EXTEND)
11392 || (GET_CODE (op0) == SIGN_EXTEND
11393 && GET_CODE (op1) == SIGN_EXTEND))
11395 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
11396 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
11398 if (speed)
11400 if (compound_p)
11401 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
11402 cost += extra_cost->mult[0].extend_add;
11403 else
11404 /* MUL/SMULL/UMULL. */
11405 cost += extra_cost->mult[0].extend;
11408 return cost;
11411 /* This is either an integer multiply or a MADD. In both cases
11412 we want to recurse and cost the operands. */
11413 cost += rtx_cost (op0, mode, MULT, 0, speed);
11414 cost += rtx_cost (op1, mode, MULT, 1, speed);
11416 if (speed)
11418 if (compound_p)
11419 /* MADD/MSUB. */
11420 cost += extra_cost->mult[mode == DImode].add;
11421 else
11422 /* MUL. */
11423 cost += extra_cost->mult[mode == DImode].simple;
11426 return cost;
11428 else
11430 if (speed)
11432 /* Floating-point FMA/FMUL can also support negations of the
11433 operands, unless the rounding mode is upward or downward in
11434 which case FNMUL is different than FMUL with operand negation. */
11435 bool neg0 = GET_CODE (op0) == NEG;
11436 bool neg1 = GET_CODE (op1) == NEG;
11437 if (compound_p || !flag_rounding_math || (neg0 && neg1))
11439 if (neg0)
11440 op0 = XEXP (op0, 0);
11441 if (neg1)
11442 op1 = XEXP (op1, 0);
11445 if (compound_p)
11446 /* FMADD/FNMADD/FNMSUB/FMSUB. */
11447 cost += extra_cost->fp[mode == DFmode].fma;
11448 else
11449 /* FMUL/FNMUL. */
11450 cost += extra_cost->fp[mode == DFmode].mult;
11453 cost += rtx_cost (op0, mode, MULT, 0, speed);
11454 cost += rtx_cost (op1, mode, MULT, 1, speed);
11455 return cost;
11459 static int
11460 aarch64_address_cost (rtx x,
11461 machine_mode mode,
11462 addr_space_t as ATTRIBUTE_UNUSED,
11463 bool speed)
11465 enum rtx_code c = GET_CODE (x);
11466 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
11467 struct aarch64_address_info info;
11468 int cost = 0;
11469 info.shift = 0;
11471 if (!aarch64_classify_address (&info, x, mode, false))
11473 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
11475 /* This is a CONST or SYMBOL ref which will be split
11476 in a different way depending on the code model in use.
11477 Cost it through the generic infrastructure. */
11478 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
11479 /* Divide through by the cost of one instruction to
11480 bring it to the same units as the address costs. */
11481 cost_symbol_ref /= COSTS_N_INSNS (1);
11482 /* The cost is then the cost of preparing the address,
11483 followed by an immediate (possibly 0) offset. */
11484 return cost_symbol_ref + addr_cost->imm_offset;
11486 else
11488 /* This is most likely a jump table from a case
11489 statement. */
11490 return addr_cost->register_offset;
11494 switch (info.type)
11496 case ADDRESS_LO_SUM:
11497 case ADDRESS_SYMBOLIC:
11498 case ADDRESS_REG_IMM:
11499 cost += addr_cost->imm_offset;
11500 break;
11502 case ADDRESS_REG_WB:
11503 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
11504 cost += addr_cost->pre_modify;
11505 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
11506 cost += addr_cost->post_modify;
11507 else
11508 gcc_unreachable ();
11510 break;
11512 case ADDRESS_REG_REG:
11513 cost += addr_cost->register_offset;
11514 break;
11516 case ADDRESS_REG_SXTW:
11517 cost += addr_cost->register_sextend;
11518 break;
11520 case ADDRESS_REG_UXTW:
11521 cost += addr_cost->register_zextend;
11522 break;
11524 default:
11525 gcc_unreachable ();
11529 if (info.shift > 0)
11531 /* For the sake of calculating the cost of the shifted register
11532 component, we can treat same sized modes in the same way. */
11533 if (known_eq (GET_MODE_BITSIZE (mode), 16))
11534 cost += addr_cost->addr_scale_costs.hi;
11535 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
11536 cost += addr_cost->addr_scale_costs.si;
11537 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
11538 cost += addr_cost->addr_scale_costs.di;
11539 else
11540 /* We can't tell, or this is a 128-bit vector. */
11541 cost += addr_cost->addr_scale_costs.ti;
11544 return cost;
11547 /* Return the cost of a branch. If SPEED_P is true then the compiler is
11548 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
11549 to be taken. */
11552 aarch64_branch_cost (bool speed_p, bool predictable_p)
11554 /* When optimizing for speed, use the cost of unpredictable branches. */
11555 const struct cpu_branch_cost *branch_costs =
11556 aarch64_tune_params.branch_costs;
11558 if (!speed_p || predictable_p)
11559 return branch_costs->predictable;
11560 else
11561 return branch_costs->unpredictable;
11564 /* Return true if the RTX X in mode MODE is a zero or sign extract
11565 usable in an ADD or SUB (extended register) instruction. */
11566 static bool
11567 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
11569 /* Catch add with a sign extract.
11570 This is add_<optab><mode>_multp2. */
11571 if (GET_CODE (x) == SIGN_EXTRACT
11572 || GET_CODE (x) == ZERO_EXTRACT)
11574 rtx op0 = XEXP (x, 0);
11575 rtx op1 = XEXP (x, 1);
11576 rtx op2 = XEXP (x, 2);
11578 if (GET_CODE (op0) == MULT
11579 && CONST_INT_P (op1)
11580 && op2 == const0_rtx
11581 && CONST_INT_P (XEXP (op0, 1))
11582 && aarch64_is_extend_from_extract (mode,
11583 XEXP (op0, 1),
11584 op1))
11586 return true;
11589 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
11590 No shift. */
11591 else if (GET_CODE (x) == SIGN_EXTEND
11592 || GET_CODE (x) == ZERO_EXTEND)
11593 return REG_P (XEXP (x, 0));
11595 return false;
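/* Roughly, this accepts the operand of an instruction such as
   "add x0, x1, w2, sxtw": either a plain (sign_extend (reg)) or
   (zero_extend (reg)), or the extract-of-multiply form that combine
   uses to represent an extend folded with a left shift.  */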
11598 static bool
11599 aarch64_frint_unspec_p (unsigned int u)
11601 switch (u)
11603 case UNSPEC_FRINTZ:
11604 case UNSPEC_FRINTP:
11605 case UNSPEC_FRINTM:
11606 case UNSPEC_FRINTA:
11607 case UNSPEC_FRINTN:
11608 case UNSPEC_FRINTX:
11609 case UNSPEC_FRINTI:
11610 return true;
11612 default:
11613 return false;
11617 /* Return true iff X is an rtx that will match an extr instruction
11618 i.e. as described in the *extr<mode>5_insn family of patterns.
11619 RES_OP0 and RES_OP1 will be set to the operands of the shifts involved
11620 on success and will be NULL_RTX otherwise. */
11622 static bool
11623 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
11625 rtx op0, op1;
11626 scalar_int_mode mode;
11627 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
11628 return false;
11630 *res_op0 = NULL_RTX;
11631 *res_op1 = NULL_RTX;
11633 if (GET_CODE (x) != IOR)
11634 return false;
11636 op0 = XEXP (x, 0);
11637 op1 = XEXP (x, 1);
11639 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
11640 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
11642 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
11643 if (GET_CODE (op1) == ASHIFT)
11644 std::swap (op0, op1);
11646 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
11647 return false;
11649 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
11650 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
11652 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
11653 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
11655 *res_op0 = XEXP (op0, 0);
11656 *res_op1 = XEXP (op1, 0);
11657 return true;
11661 return false;
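/* For example, in DImode (ior (ashift (reg x) (const_int 48))
   (lshiftrt (reg y) (const_int 16))) matches: the shift amounts sum to
   64, *RES_OP0 becomes x and *RES_OP1 becomes y, corresponding to an
   EXTR with immediate 16.  */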
11664 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
11665 storing it in *COST. Result is true if the total cost of the operation
11666 has now been calculated. */
11667 static bool
11668 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
11670 rtx inner;
11671 rtx comparator;
11672 enum rtx_code cmpcode;
11673 const struct cpu_cost_table *extra_cost
11674 = aarch64_tune_params.insn_extra_cost;
11676 if (COMPARISON_P (op0))
11678 inner = XEXP (op0, 0);
11679 comparator = XEXP (op0, 1);
11680 cmpcode = GET_CODE (op0);
11682 else
11684 inner = op0;
11685 comparator = const0_rtx;
11686 cmpcode = NE;
11689 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
11691 /* Conditional branch. */
11692 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11693 return true;
11694 else
11696 if (cmpcode == NE || cmpcode == EQ)
11698 if (comparator == const0_rtx)
11700 /* TBZ/TBNZ/CBZ/CBNZ. */
11701 if (GET_CODE (inner) == ZERO_EXTRACT)
11702 /* TBZ/TBNZ. */
11703 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
11704 ZERO_EXTRACT, 0, speed);
11705 else
11706 /* CBZ/CBNZ. */
11707 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
11709 return true;
11711 if (register_operand (inner, VOIDmode)
11712 && aarch64_imm24 (comparator, VOIDmode))
11714 /* SUB and SUBS. */
11715 *cost += COSTS_N_INSNS (2);
11716 if (speed)
11717 *cost += extra_cost->alu.arith * 2;
11718 return true;
11721 else if (cmpcode == LT || cmpcode == GE)
11723 /* TBZ/TBNZ. */
11724 if (comparator == const0_rtx)
11725 return true;
11729 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
11731 /* CCMP. */
11732 if (GET_CODE (op1) == COMPARE)
11734 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
11735 if (XEXP (op1, 1) == const0_rtx)
11736 *cost += 1;
11737 if (speed)
11739 machine_mode mode = GET_MODE (XEXP (op1, 0));
11740 const struct cpu_cost_table *extra_cost
11741 = aarch64_tune_params.insn_extra_cost;
11743 if (GET_MODE_CLASS (mode) == MODE_INT)
11744 *cost += extra_cost->alu.arith;
11745 else
11746 *cost += extra_cost->fp[mode == DFmode].compare;
11748 return true;
11751 /* It's a conditional operation based on the status flags,
11752 so it must be some flavor of CSEL. */
11754 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
11755 if (GET_CODE (op1) == NEG
11756 || GET_CODE (op1) == NOT
11757 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
11758 op1 = XEXP (op1, 0);
11759 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
11761 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
11762 op1 = XEXP (op1, 0);
11763 op2 = XEXP (op2, 0);
11765 else if (GET_CODE (op1) == ZERO_EXTEND && op2 == const0_rtx)
11767 inner = XEXP (op1, 0);
11768 if (GET_CODE (inner) == NEG || GET_CODE (inner) == NOT)
11769 /* CSINV/NEG with zero extend + const 0 (*csinv3_uxtw_insn3). */
11770 op1 = XEXP (inner, 0);
11773 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
11774 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
11775 return true;
11778 /* We don't know what this is, so cost all operands. */
11779 return false;
11782 /* Check whether X is a bitfield operation of the form shift + extend that
11783 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
11784 operand to which the bitfield operation is applied. Otherwise return
11785 NULL_RTX. */
11787 static rtx
11788 aarch64_extend_bitfield_pattern_p (rtx x)
11790 rtx_code outer_code = GET_CODE (x);
11791 machine_mode outer_mode = GET_MODE (x);
11793 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
11794 && outer_mode != SImode && outer_mode != DImode)
11795 return NULL_RTX;
11797 rtx inner = XEXP (x, 0);
11798 rtx_code inner_code = GET_CODE (inner);
11799 machine_mode inner_mode = GET_MODE (inner);
11800 rtx op = NULL_RTX;
11802 switch (inner_code)
11804 case ASHIFT:
11805 if (CONST_INT_P (XEXP (inner, 1))
11806 && (inner_mode == QImode || inner_mode == HImode))
11807 op = XEXP (inner, 0);
11808 break;
11809 case LSHIFTRT:
11810 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
11811 && (inner_mode == QImode || inner_mode == HImode))
11812 op = XEXP (inner, 0);
11813 break;
11814 case ASHIFTRT:
11815 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
11816 && (inner_mode == QImode || inner_mode == HImode))
11817 op = XEXP (inner, 0);
11818 break;
11819 default:
11820 break;
11823 return op;
11826 /* Return true if the mask and a shift amount from an RTX of the form
11827 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
11828 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
11830 bool
11831 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
11832 rtx shft_amnt)
11834 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
11835 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
11836 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
11837 && (INTVAL (mask)
11838 & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
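/* For example, in SImode a mask of 0xff0 with a shift amount of 4 is
   accepted: shifting the mask right by 4 gives the contiguous value 0xff
   and no mask bits fall below the shift, so the combination maps onto a
   UBFIZ with lsb 4 and width 8.  */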
11841 /* Return true if the masks and a shift amount from an RTX of the form
11842 ((x & MASK1) | ((y << SHIFT_AMNT) & MASK2)) are valid to combine into
11843 a BFI instruction of mode MODE. See the *aarch64_bfi patterns. */
11845 bool
11846 aarch64_masks_and_shift_for_bfi_p (scalar_int_mode mode,
11847 unsigned HOST_WIDE_INT mask1,
11848 unsigned HOST_WIDE_INT shft_amnt,
11849 unsigned HOST_WIDE_INT mask2)
11851 unsigned HOST_WIDE_INT t;
11853 /* Verify that there is no overlap in what bits are set in the two masks. */
11854 if (mask1 != ~mask2)
11855 return false;
11857 /* Verify that mask2 is not all zeros or ones. */
11858 if (mask2 == 0 || mask2 == HOST_WIDE_INT_M1U)
11859 return false;
11861 /* The shift amount should always be less than the mode size. */
11862 gcc_assert (shft_amnt < GET_MODE_BITSIZE (mode));
11864 /* Verify that the mask being shifted is contiguous and would be in the
11865 least significant bits after shifting by shft_amnt. */
11866 t = mask2 + (HOST_WIDE_INT_1U << shft_amnt);
11867 return (t == (t & -t));
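/* As an illustration in DImode: mask2 == 0xff00, shft_amnt == 8 and
   mask1 == ~0xff00 pass all of the checks (0xff00 + 0x100 is a power of
   two), describing a BFI that inserts the low 8 bits of one register
   into bits 8..15 of the other.  */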
11870 /* Calculate the cost of calculating X, storing it in *COST. Result
11871 is true if the total cost of the operation has now been calculated. */
11872 static bool
11873 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
11874 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
11876 rtx op0, op1, op2;
11877 const struct cpu_cost_table *extra_cost
11878 = aarch64_tune_params.insn_extra_cost;
11879 int code = GET_CODE (x);
11880 scalar_int_mode int_mode;
11882 /* By default, assume that everything has equivalent cost to the
11883 cheapest instruction. Any additional costs are applied as a delta
11884 above this default. */
11885 *cost = COSTS_N_INSNS (1);
11887 switch (code)
11889 case SET:
11890 /* The cost depends entirely on the operands to SET. */
11891 *cost = 0;
11892 op0 = SET_DEST (x);
11893 op1 = SET_SRC (x);
11895 switch (GET_CODE (op0))
11897 case MEM:
11898 if (speed)
11900 rtx address = XEXP (op0, 0);
11901 if (VECTOR_MODE_P (mode))
11902 *cost += extra_cost->ldst.storev;
11903 else if (GET_MODE_CLASS (mode) == MODE_INT)
11904 *cost += extra_cost->ldst.store;
11905 else if (mode == SFmode)
11906 *cost += extra_cost->ldst.storef;
11907 else if (mode == DFmode)
11908 *cost += extra_cost->ldst.stored;
11910 *cost +=
11911 COSTS_N_INSNS (aarch64_address_cost (address, mode,
11912 0, speed));
11915 *cost += rtx_cost (op1, mode, SET, 1, speed);
11916 return true;
11918 case SUBREG:
11919 if (! REG_P (SUBREG_REG (op0)))
11920 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
11922 /* Fall through. */
11923 case REG:
11924 /* The cost is one per vector-register copied. */
11925 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
11927 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
11928 *cost = COSTS_N_INSNS (nregs);
11930 /* const0_rtx is in general free, but we will use an
11931 instruction to set a register to 0. */
11932 else if (REG_P (op1) || op1 == const0_rtx)
11934 /* The cost is 1 per register copied. */
11935 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
11936 *cost = COSTS_N_INSNS (nregs);
11938 else
11939 /* Cost is just the cost of the RHS of the set. */
11940 *cost += rtx_cost (op1, mode, SET, 1, speed);
11941 return true;
11943 case ZERO_EXTRACT:
11944 case SIGN_EXTRACT:
11945 /* Bit-field insertion. Strip any redundant widening of
11946 the RHS to meet the width of the target. */
11947 if (GET_CODE (op1) == SUBREG)
11948 op1 = SUBREG_REG (op1);
11949 if ((GET_CODE (op1) == ZERO_EXTEND
11950 || GET_CODE (op1) == SIGN_EXTEND)
11951 && CONST_INT_P (XEXP (op0, 1))
11952 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
11953 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
11954 op1 = XEXP (op1, 0);
11956 if (CONST_INT_P (op1))
11958 /* MOV immediate is assumed to always be cheap. */
11959 *cost = COSTS_N_INSNS (1);
11961 else
11963 /* BFM. */
11964 if (speed)
11965 *cost += extra_cost->alu.bfi;
11966 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
11969 return true;
11971 default:
11972 /* We can't make sense of this; assume the default cost. */
11973 *cost = COSTS_N_INSNS (1);
11974 return false;
11976 return false;
11978 case CONST_INT:
11979 /* If an instruction can incorporate a constant within the
11980 instruction, the instruction's expression avoids calling
11981 rtx_cost() on the constant. If rtx_cost() is called on a
11982 constant, then it is usually because the constant must be
11983 moved into a register by one or more instructions.
11985 The exception is constant 0, which can be expressed
11986 as XZR/WZR and is therefore free. The exception to this is
11987 if we have (set (reg) (const0_rtx)) in which case we must cost
11988 the move. However, we can catch that when we cost the SET, so
11989 we don't need to consider that here. */
11990 if (x == const0_rtx)
11991 *cost = 0;
11992 else
11994 /* To a first approximation, the cost of building any other
11995 constant is proportional to the number of instructions
11996 required to build that constant. This is true whether we
11997 are compiling for SPEED or otherwise. */
11998 if (!is_a <scalar_int_mode> (mode, &int_mode))
11999 int_mode = word_mode;
12000 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
12001 (NULL_RTX, x, false, int_mode));
12003 return true;
12005 case CONST_DOUBLE:
12007 /* First determine number of instructions to do the move
12008 as an integer constant. */
12009 if (!aarch64_float_const_representable_p (x)
12010 && !aarch64_can_const_movi_rtx_p (x, mode)
12011 && aarch64_float_const_rtx_p (x))
12013 unsigned HOST_WIDE_INT ival;
12014 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
12015 gcc_assert (succeed);
12017 scalar_int_mode imode = (mode == HFmode
12018 ? SImode
12019 : int_mode_for_mode (mode).require ());
12020 int ncost = aarch64_internal_mov_immediate
12021 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
12022 *cost += COSTS_N_INSNS (ncost);
12023 return true;
12026 if (speed)
12028 /* mov[df,sf]_aarch64. */
12029 if (aarch64_float_const_representable_p (x))
12030 /* FMOV (scalar immediate). */
12031 *cost += extra_cost->fp[mode == DFmode].fpconst;
12032 else if (!aarch64_float_const_zero_rtx_p (x))
12034 /* This will be a load from memory. */
12035 if (mode == DFmode)
12036 *cost += extra_cost->ldst.loadd;
12037 else
12038 *cost += extra_cost->ldst.loadf;
12040 else
12041 /* Otherwise this is +0.0. We get this using MOVI d0, #0
12042 or MOV v0.s[0], wzr, neither of which is modeled by the
12043 cost tables. Just use the default cost. */
12048 return true;
12050 case MEM:
12051 if (speed)
12053 /* For loads we want the base cost of a load, plus an
12054 approximation for the additional cost of the addressing
12055 mode. */
12056 rtx address = XEXP (x, 0);
12057 if (VECTOR_MODE_P (mode))
12058 *cost += extra_cost->ldst.loadv;
12059 else if (GET_MODE_CLASS (mode) == MODE_INT)
12060 *cost += extra_cost->ldst.load;
12061 else if (mode == SFmode)
12062 *cost += extra_cost->ldst.loadf;
12063 else if (mode == DFmode)
12064 *cost += extra_cost->ldst.loadd;
12066 *cost +=
12067 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12068 0, speed));
12071 return true;
12073 case NEG:
12074 op0 = XEXP (x, 0);
12076 if (VECTOR_MODE_P (mode))
12078 if (speed)
12080 /* FNEG. */
12081 *cost += extra_cost->vect.alu;
12083 return false;
12086 if (GET_MODE_CLASS (mode) == MODE_INT)
12088 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12089 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12091 /* CSETM. */
12092 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
12093 return true;
12096 /* Cost this as SUB wzr, X. */
12097 op0 = CONST0_RTX (mode);
12098 op1 = XEXP (x, 0);
12099 goto cost_minus;
12102 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12104 /* Support (neg(fma...)) as a single instruction only if
12105 sign of zeros is unimportant. This matches the decision
12106 making in aarch64.md. */
12107 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
12109 /* FNMADD. */
12110 *cost = rtx_cost (op0, mode, NEG, 0, speed);
12111 return true;
12113 if (GET_CODE (op0) == MULT)
12115 /* FNMUL. */
12116 *cost = rtx_cost (op0, mode, NEG, 0, speed);
12117 return true;
12119 if (speed)
12120 /* FNEG. */
12121 *cost += extra_cost->fp[mode == DFmode].neg;
12122 return false;
12125 return false;
12127 case CLRSB:
12128 case CLZ:
12129 if (speed)
12131 if (VECTOR_MODE_P (mode))
12132 *cost += extra_cost->vect.alu;
12133 else
12134 *cost += extra_cost->alu.clz;
12137 return false;
12139 case CTZ:
12140 *cost = COSTS_N_INSNS (2);
12142 if (speed)
12143 *cost += extra_cost->alu.clz + extra_cost->alu.rev;
12144 return false;
12146 case COMPARE:
12147 op0 = XEXP (x, 0);
12148 op1 = XEXP (x, 1);
12150 if (op1 == const0_rtx
12151 && GET_CODE (op0) == AND)
12153 x = op0;
12154 mode = GET_MODE (op0);
12155 goto cost_logic;
12158 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
12160 /* TODO: A write to the CC flags possibly costs extra, this
12161 needs encoding in the cost tables. */
12163 mode = GET_MODE (op0);
12164 /* ANDS. */
12165 if (GET_CODE (op0) == AND)
12167 x = op0;
12168 goto cost_logic;
12171 if (GET_CODE (op0) == PLUS)
12173 /* ADDS (and CMN alias). */
12174 x = op0;
12175 goto cost_plus;
12178 if (GET_CODE (op0) == MINUS)
12180 /* SUBS. */
12181 x = op0;
12182 goto cost_minus;
12185 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
12186 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
12187 && CONST_INT_P (XEXP (op0, 2)))
12189 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
12190 Handle it here directly rather than going to cost_logic
12191 since we know the immediate generated for the TST is valid
12192 so we can avoid creating an intermediate rtx for it only
12193 for costing purposes. */
12194 if (speed)
12195 *cost += extra_cost->alu.logical;
12197 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
12198 ZERO_EXTRACT, 0, speed);
12199 return true;
12202 if (GET_CODE (op1) == NEG)
12204 /* CMN. */
12205 if (speed)
12206 *cost += extra_cost->alu.arith;
12208 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
12209 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
12210 return true;
12213 /* CMP.
12215 Compare can freely swap the order of operands, and
12216 canonicalization puts the more complex operation first.
12217 But the integer MINUS logic expects the shift/extend
12218 operation in op1. */
12219 if (! (REG_P (op0)
12220 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
12222 op0 = XEXP (x, 1);
12223 op1 = XEXP (x, 0);
12225 goto cost_minus;
12228 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
12230 /* FCMP. */
12231 if (speed)
12232 *cost += extra_cost->fp[mode == DFmode].compare;
12234 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
12236 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
12237 /* FCMP supports constant 0.0 for no extra cost. */
12238 return true;
12240 return false;
12243 if (VECTOR_MODE_P (mode))
12245 /* Vector compare. */
12246 if (speed)
12247 *cost += extra_cost->vect.alu;
12249 if (aarch64_float_const_zero_rtx_p (op1))
12251 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
12252 cost. */
12253 return true;
12255 return false;
12257 return false;
12259 case MINUS:
12261 op0 = XEXP (x, 0);
12262 op1 = XEXP (x, 1);
12264 cost_minus:
12265 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
12267 /* Detect valid immediates. */
12268 if ((GET_MODE_CLASS (mode) == MODE_INT
12269 || (GET_MODE_CLASS (mode) == MODE_CC
12270 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
12271 && CONST_INT_P (op1)
12272 && aarch64_uimm12_shift (INTVAL (op1)))
12274 if (speed)
12275 /* SUB(S) (immediate). */
12276 *cost += extra_cost->alu.arith;
12277 return true;
12280 /* Look for SUB (extended register). */
12281 if (is_a <scalar_int_mode> (mode, &int_mode)
12282 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
12284 if (speed)
12285 *cost += extra_cost->alu.extend_arith;
12287 op1 = aarch64_strip_extend (op1, true);
12288 *cost += rtx_cost (op1, VOIDmode,
12289 (enum rtx_code) GET_CODE (op1), 0, speed);
12290 return true;
12293 rtx new_op1 = aarch64_strip_extend (op1, false);
12295 /* Cost this as an FMA-alike operation. */
12296 if ((GET_CODE (new_op1) == MULT
12297 || aarch64_shift_p (GET_CODE (new_op1)))
12298 && code != COMPARE)
12300 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
12301 (enum rtx_code) code,
12302 speed);
12303 return true;
12306 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
12308 if (speed)
12310 if (VECTOR_MODE_P (mode))
12312 /* Vector SUB. */
12313 *cost += extra_cost->vect.alu;
12315 else if (GET_MODE_CLASS (mode) == MODE_INT)
12317 /* SUB(S). */
12318 *cost += extra_cost->alu.arith;
12320 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12322 /* FSUB. */
12323 *cost += extra_cost->fp[mode == DFmode].addsub;
12326 return true;
12329 case PLUS:
12331 rtx new_op0;
12333 op0 = XEXP (x, 0);
12334 op1 = XEXP (x, 1);
12336 cost_plus:
12337 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
12338 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
12340 /* CSINC. */
12341 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
12342 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12343 return true;
12346 if (GET_MODE_CLASS (mode) == MODE_INT
12347 && (aarch64_plus_immediate (op1, mode)
12348 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
12350 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
12352 if (speed)
12353 /* ADD (immediate). */
12354 *cost += extra_cost->alu.arith;
12355 return true;
12358 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
12360 /* Look for ADD (extended register). */
12361 if (is_a <scalar_int_mode> (mode, &int_mode)
12362 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
12364 if (speed)
12365 *cost += extra_cost->alu.extend_arith;
12367 op0 = aarch64_strip_extend (op0, true);
12368 *cost += rtx_cost (op0, VOIDmode,
12369 (enum rtx_code) GET_CODE (op0), 0, speed);
12370 return true;
12373 /* Strip any extend, leave shifts behind as we will
12374 cost them through mult_cost. */
12375 new_op0 = aarch64_strip_extend (op0, false);
12377 if (GET_CODE (new_op0) == MULT
12378 || aarch64_shift_p (GET_CODE (new_op0)))
12380 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
12381 speed);
12382 return true;
12385 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
12387 if (speed)
12389 if (VECTOR_MODE_P (mode))
12391 /* Vector ADD. */
12392 *cost += extra_cost->vect.alu;
12394 else if (GET_MODE_CLASS (mode) == MODE_INT)
12396 /* ADD. */
12397 *cost += extra_cost->alu.arith;
12399 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12401 /* FADD. */
12402 *cost += extra_cost->fp[mode == DFmode].addsub;
12405 return true;
12408 case BSWAP:
12409 *cost = COSTS_N_INSNS (1);
12411 if (speed)
12413 if (VECTOR_MODE_P (mode))
12414 *cost += extra_cost->vect.alu;
12415 else
12416 *cost += extra_cost->alu.rev;
12418 return false;
12420 case IOR:
12421 if (aarch_rev16_p (x))
12423 *cost = COSTS_N_INSNS (1);
12425 if (speed)
12427 if (VECTOR_MODE_P (mode))
12428 *cost += extra_cost->vect.alu;
12429 else
12430 *cost += extra_cost->alu.rev;
12432 return true;
12435 if (aarch64_extr_rtx_p (x, &op0, &op1))
12437 *cost += rtx_cost (op0, mode, IOR, 0, speed);
12438 *cost += rtx_cost (op1, mode, IOR, 1, speed);
12439 if (speed)
12440 *cost += extra_cost->alu.shift;
12442 return true;
12444 /* Fall through. */
12445 case XOR:
12446 case AND:
12447 cost_logic:
12448 op0 = XEXP (x, 0);
12449 op1 = XEXP (x, 1);
12451 if (VECTOR_MODE_P (mode))
12453 if (speed)
12454 *cost += extra_cost->vect.alu;
12455 return true;
12458 if (code == AND
12459 && GET_CODE (op0) == MULT
12460 && CONST_INT_P (XEXP (op0, 1))
12461 && CONST_INT_P (op1)
12462 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
12463 INTVAL (op1)) != 0)
12465 /* This is a UBFM/SBFM. */
12466 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
12467 if (speed)
12468 *cost += extra_cost->alu.bfx;
12469 return true;
12472 if (is_int_mode (mode, &int_mode))
12474 if (CONST_INT_P (op1))
12476 /* We have a mask + shift version of a UBFIZ
12477 i.e. the *andim_ashift<mode>_bfiz pattern. */
12478 if (GET_CODE (op0) == ASHIFT
12479 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
12480 XEXP (op0, 1)))
12482 *cost += rtx_cost (XEXP (op0, 0), int_mode,
12483 (enum rtx_code) code, 0, speed);
12484 if (speed)
12485 *cost += extra_cost->alu.bfx;
12487 return true;
12489 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
12491 /* We possibly get the immediate for free, this is not
12492 modelled. */
12493 *cost += rtx_cost (op0, int_mode,
12494 (enum rtx_code) code, 0, speed);
12495 if (speed)
12496 *cost += extra_cost->alu.logical;
12498 return true;
12501 else
12503 rtx new_op0 = op0;
12505 /* Handle ORN, EON, or BIC. */
12506 if (GET_CODE (op0) == NOT)
12507 op0 = XEXP (op0, 0);
12509 new_op0 = aarch64_strip_shift (op0);
12511 /* If we had a shift on op0 then this is a logical-shift-
12512 by-register/immediate operation. Otherwise, this is just
12513 a logical operation. */
12514 if (speed)
12516 if (new_op0 != op0)
12518 /* Shift by immediate. */
12519 if (CONST_INT_P (XEXP (op0, 1)))
12520 *cost += extra_cost->alu.log_shift;
12521 else
12522 *cost += extra_cost->alu.log_shift_reg;
12524 else
12525 *cost += extra_cost->alu.logical;
12528 /* In both cases we want to cost both operands. */
12529 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
12530 0, speed);
12531 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
12532 1, speed);
12534 return true;
12537 return false;
12539 case NOT:
12540 x = XEXP (x, 0);
12541 op0 = aarch64_strip_shift (x);
12543 if (VECTOR_MODE_P (mode))
12545 /* Vector NOT. */
12546 *cost += extra_cost->vect.alu;
12547 return false;
12550 /* MVN-shifted-reg. */
12551 if (op0 != x)
12553 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12555 if (speed)
12556 *cost += extra_cost->alu.log_shift;
12558 return true;
12560 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
12561 Handle the second form here taking care that 'a' in the above can
12562 be a shift. */
12563 else if (GET_CODE (op0) == XOR)
12565 rtx newop0 = XEXP (op0, 0);
12566 rtx newop1 = XEXP (op0, 1);
12567 rtx op0_stripped = aarch64_strip_shift (newop0);
12569 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
12570 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
12572 if (speed)
12574 if (op0_stripped != newop0)
12575 *cost += extra_cost->alu.log_shift;
12576 else
12577 *cost += extra_cost->alu.logical;
12580 return true;
12582 /* MVN. */
12583 if (speed)
12584 *cost += extra_cost->alu.logical;
12586 return false;
12588 case ZERO_EXTEND:
12590 op0 = XEXP (x, 0);
12591 /* If a value is written in SI mode, then zero extended to DI
12592 mode, the operation will in general be free as a write to
12593 a 'w' register implicitly zeroes the upper bits of an 'x'
12594 register. However, if this is
12596 (set (reg) (zero_extend (reg)))
12598 we must cost the explicit register move. */
12599 if (mode == DImode
12600 && GET_MODE (op0) == SImode
12601 && outer == SET)
12603 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
12605 /* If OP_COST is non-zero, then the cost of the zero extend
12606 is effectively the cost of the inner operation. Otherwise
12607 we have a MOV instruction and we take the cost from the MOV
12608 itself. This is true independently of whether we are
12609 optimizing for space or time. */
12610 if (op_cost)
12611 *cost = op_cost;
12613 return true;
12615 else if (MEM_P (op0))
12617 /* All loads can zero extend to any size for free. */
12618 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
12619 return true;
12622 op0 = aarch64_extend_bitfield_pattern_p (x);
12623 if (op0)
12625 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
12626 if (speed)
12627 *cost += extra_cost->alu.bfx;
12628 return true;
12631 if (speed)
12633 if (VECTOR_MODE_P (mode))
12635 /* UMOV. */
12636 *cost += extra_cost->vect.alu;
12638 else
12640 /* We generate an AND instead of UXTB/UXTH. */
12641 *cost += extra_cost->alu.logical;
12644 return false;
12646 case SIGN_EXTEND:
12647 if (MEM_P (XEXP (x, 0)))
12649 /* LDRSH. */
12650 if (speed)
12652 rtx address = XEXP (XEXP (x, 0), 0);
12653 *cost += extra_cost->ldst.load_sign_extend;
12655 *cost +=
12656 COSTS_N_INSNS (aarch64_address_cost (address, mode,
12657 0, speed));
12659 return true;
12662 op0 = aarch64_extend_bitfield_pattern_p (x);
12663 if (op0)
12665 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
12666 if (speed)
12667 *cost += extra_cost->alu.bfx;
12668 return true;
12671 if (speed)
12673 if (VECTOR_MODE_P (mode))
12674 *cost += extra_cost->vect.alu;
12675 else
12676 *cost += extra_cost->alu.extend;
12678 return false;
12680 case ASHIFT:
12681 op0 = XEXP (x, 0);
12682 op1 = XEXP (x, 1);
12684 if (CONST_INT_P (op1))
12686 if (speed)
12688 if (VECTOR_MODE_P (mode))
12690 /* Vector shift (immediate). */
12691 *cost += extra_cost->vect.alu;
12693 else
12695 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
12696 aliases. */
12697 *cost += extra_cost->alu.shift;
12701 /* We can incorporate zero/sign extend for free. */
12702 if (GET_CODE (op0) == ZERO_EXTEND
12703 || GET_CODE (op0) == SIGN_EXTEND)
12704 op0 = XEXP (op0, 0);
12706 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
12707 return true;
12709 else
12711 if (VECTOR_MODE_P (mode))
12713 if (speed)
12714 /* Vector shift (register). */
12715 *cost += extra_cost->vect.alu;
12717 else
12719 if (speed)
12720 /* LSLV. */
12721 *cost += extra_cost->alu.shift_reg;
12723 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12724 && CONST_INT_P (XEXP (op1, 1))
12725 && known_eq (INTVAL (XEXP (op1, 1)),
12726 GET_MODE_BITSIZE (mode) - 1))
12728 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12729 /* We already demanded XEXP (op1, 0) to be REG_P, so
12730 don't recurse into it. */
12731 return true;
12734 return false; /* All arguments need to be in registers. */
12737 case ROTATE:
12738 case ROTATERT:
12739 case LSHIFTRT:
12740 case ASHIFTRT:
12741 op0 = XEXP (x, 0);
12742 op1 = XEXP (x, 1);
12744 if (CONST_INT_P (op1))
12746 /* ASR (immediate) and friends. */
12747 if (speed)
12749 if (VECTOR_MODE_P (mode))
12750 *cost += extra_cost->vect.alu;
12751 else
12752 *cost += extra_cost->alu.shift;
12755 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
12756 return true;
12758 else
12760 if (VECTOR_MODE_P (mode))
12762 if (speed)
12763 /* Vector shift (register). */
12764 *cost += extra_cost->vect.alu;
12766 else
12768 if (speed)
12769 /* ASR (register) and friends. */
12770 *cost += extra_cost->alu.shift_reg;
12772 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
12773 && CONST_INT_P (XEXP (op1, 1))
12774 && known_eq (INTVAL (XEXP (op1, 1)),
12775 GET_MODE_BITSIZE (mode) - 1))
12777 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
12778 /* We already demanded XEXP (op1, 0) to be REG_P, so
12779 don't recurse into it. */
12780 return true;
12783 return false; /* All arguments need to be in registers. */
12786 case SYMBOL_REF:
12788 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
12789 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
12791 /* LDR. */
12792 if (speed)
12793 *cost += extra_cost->ldst.load;
12795 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
12796 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
12798 /* ADRP, followed by ADD. */
12799 *cost += COSTS_N_INSNS (1);
12800 if (speed)
12801 *cost += 2 * extra_cost->alu.arith;
12803 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
12804 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
12806 /* ADR. */
12807 if (speed)
12808 *cost += extra_cost->alu.arith;
12811 if (flag_pic)
12813 /* One extra load instruction, after accessing the GOT. */
12814 *cost += COSTS_N_INSNS (1);
12815 if (speed)
12816 *cost += extra_cost->ldst.load;
12818 return true;
12820 case HIGH:
12821 case LO_SUM:
12822 /* ADRP/ADD (immediate). */
12823 if (speed)
12824 *cost += extra_cost->alu.arith;
12825 return true;
12827 case ZERO_EXTRACT:
12828 case SIGN_EXTRACT:
12829 /* UBFX/SBFX. */
12830 if (speed)
12832 if (VECTOR_MODE_P (mode))
12833 *cost += extra_cost->vect.alu;
12834 else
12835 *cost += extra_cost->alu.bfx;
12838 /* We can trust that the immediates used will be correct (there
12839 are no by-register forms), so we need only cost op0. */
12840 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
12841 return true;
12843 case MULT:
12844 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
12845 /* aarch64_rtx_mult_cost always handles recursion to its
12846 operands. */
12847 return true;
12849 case MOD:
12850 /* We can expand signed mod by power of 2 using a NEGS, two parallel
12851 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
12852 an unconditional negate. This case should only ever be reached through
12853 the set_smod_pow2_cheap check in expmed.c. */
12854 if (CONST_INT_P (XEXP (x, 1))
12855 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
12856 && (mode == SImode || mode == DImode))
12858 /* We expand to 4 instructions. Reset the baseline. */
12859 *cost = COSTS_N_INSNS (4);
12861 if (speed)
12862 *cost += 2 * extra_cost->alu.logical
12863 + 2 * extra_cost->alu.arith;
12865 return true;
12868 /* Fall-through. */
12869 case UMOD:
12870 if (speed)
12872 /* Slightly prefer UMOD over SMOD. */
12873 if (VECTOR_MODE_P (mode))
12874 *cost += extra_cost->vect.alu;
12875 else if (GET_MODE_CLASS (mode) == MODE_INT)
12876 *cost += (extra_cost->mult[mode == DImode].add
12877 + extra_cost->mult[mode == DImode].idiv
12878 + (code == MOD ? 1 : 0));
12880 return false; /* All arguments need to be in registers. */
12882 case DIV:
12883 case UDIV:
12884 case SQRT:
12885 if (speed)
12887 if (VECTOR_MODE_P (mode))
12888 *cost += extra_cost->vect.alu;
12889 else if (GET_MODE_CLASS (mode) == MODE_INT)
12890 /* There is no integer SQRT, so only DIV and UDIV can get
12891 here. */
12892 *cost += (extra_cost->mult[mode == DImode].idiv
12893 /* Slightly prefer UDIV over SDIV. */
12894 + (code == DIV ? 1 : 0));
12895 else
12896 *cost += extra_cost->fp[mode == DFmode].div;
12898 return false; /* All arguments need to be in registers. */
12900 case IF_THEN_ELSE:
12901 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
12902 XEXP (x, 2), cost, speed);
12904 case EQ:
12905 case NE:
12906 case GT:
12907 case GTU:
12908 case LT:
12909 case LTU:
12910 case GE:
12911 case GEU:
12912 case LE:
12913 case LEU:
12915 return false; /* All arguments must be in registers. */
12917 case FMA:
12918 op0 = XEXP (x, 0);
12919 op1 = XEXP (x, 1);
12920 op2 = XEXP (x, 2);
12922 if (speed)
12924 if (VECTOR_MODE_P (mode))
12925 *cost += extra_cost->vect.alu;
12926 else
12927 *cost += extra_cost->fp[mode == DFmode].fma;
12930 /* FMSUB, FNMADD, and FNMSUB are free. */
12931 if (GET_CODE (op0) == NEG)
12932 op0 = XEXP (op0, 0);
12934 if (GET_CODE (op2) == NEG)
12935 op2 = XEXP (op2, 0);
12937 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
12938 and the by-element operand as operand 0. */
12939 if (GET_CODE (op1) == NEG)
12940 op1 = XEXP (op1, 0);
12942 /* Catch vector-by-element operations. The by-element operand can
12943 either be (vec_duplicate (vec_select (x))) or just
12944 (vec_select (x)), depending on whether we are multiplying by
12945 a vector or a scalar.
12947 Canonicalization is not very good in these cases: FMA4 will put the
12948 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
12949 if (GET_CODE (op0) == VEC_DUPLICATE)
12950 op0 = XEXP (op0, 0);
12951 else if (GET_CODE (op1) == VEC_DUPLICATE)
12952 op1 = XEXP (op1, 0);
12954 if (GET_CODE (op0) == VEC_SELECT)
12955 op0 = XEXP (op0, 0);
12956 else if (GET_CODE (op1) == VEC_SELECT)
12957 op1 = XEXP (op1, 0);
12959 /* If the remaining parameters are not registers,
12960 get the cost to put them into registers. */
12961 *cost += rtx_cost (op0, mode, FMA, 0, speed);
12962 *cost += rtx_cost (op1, mode, FMA, 1, speed);
12963 *cost += rtx_cost (op2, mode, FMA, 2, speed);
12964 return true;
12966 case FLOAT:
12967 case UNSIGNED_FLOAT:
12968 if (speed)
12969 *cost += extra_cost->fp[mode == DFmode].fromint;
12970 return false;
12972 case FLOAT_EXTEND:
12973 if (speed)
12975 if (VECTOR_MODE_P (mode))
12977 /* Vector widen. */
12978 *cost += extra_cost->vect.alu;
12980 else
12981 *cost += extra_cost->fp[mode == DFmode].widen;
12983 return false;
12985 case FLOAT_TRUNCATE:
12986 if (speed)
12988 if (VECTOR_MODE_P (mode))
12990 /* Vector conversion. */
12991 *cost += extra_cost->vect.alu;
12993 else
12994 *cost += extra_cost->fp[mode == DFmode].narrow;
12996 return false;
12998 case FIX:
12999 case UNSIGNED_FIX:
13000 x = XEXP (x, 0);
13001 /* Strip the rounding part. They will all be implemented
13002 by the fcvt* family of instructions anyway. */
13003 if (GET_CODE (x) == UNSPEC)
13005 unsigned int uns_code = XINT (x, 1);
13007 if (uns_code == UNSPEC_FRINTA
13008 || uns_code == UNSPEC_FRINTM
13009 || uns_code == UNSPEC_FRINTN
13010 || uns_code == UNSPEC_FRINTP
13011 || uns_code == UNSPEC_FRINTZ)
13012 x = XVECEXP (x, 0, 0);
13015 if (speed)
13017 if (VECTOR_MODE_P (mode))
13018 *cost += extra_cost->vect.alu;
13019 else
13020 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
13023 /* We can combine fmul by a power of 2 followed by a fcvt into a single
13024 fixed-point fcvt. */
13025 if (GET_CODE (x) == MULT
13026 && ((VECTOR_MODE_P (mode)
13027 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
13028 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
13030 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
13031 0, speed);
13032 return true;
13035 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
13036 return true;
13038 case ABS:
13039 if (VECTOR_MODE_P (mode))
13041 /* ABS (vector). */
13042 if (speed)
13043 *cost += extra_cost->vect.alu;
13045 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
13047 op0 = XEXP (x, 0);
13049 /* FABD, which is analogous to FADD. */
13050 if (GET_CODE (op0) == MINUS)
13052 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
13053 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
13054 if (speed)
13055 *cost += extra_cost->fp[mode == DFmode].addsub;
13057 return true;
13059 /* Simple FABS is analogous to FNEG. */
13060 if (speed)
13061 *cost += extra_cost->fp[mode == DFmode].neg;
13063 else
13065 /* Integer ABS will either be split into
13066 two arithmetic instructions or will be an ABS
13067 (scalar), which we don't model. */
13068 *cost = COSTS_N_INSNS (2);
13069 if (speed)
13070 *cost += 2 * extra_cost->alu.arith;
13072 return false;
13074 case SMAX:
13075 case SMIN:
13076 if (speed)
13078 if (VECTOR_MODE_P (mode))
13079 *cost += extra_cost->vect.alu;
13080 else
13082 /* FMAXNM/FMINNM/FMAX/FMIN.
13083 TODO: This may not be accurate for all implementations, but
13084 we do not model this in the cost tables. */
13085 *cost += extra_cost->fp[mode == DFmode].addsub;
13088 return false;
13090 case UNSPEC:
13091 /* The floating point round to integer frint* instructions. */
13092 if (aarch64_frint_unspec_p (XINT (x, 1)))
13094 if (speed)
13095 *cost += extra_cost->fp[mode == DFmode].roundint;
13097 return false;
13100 if (XINT (x, 1) == UNSPEC_RBIT)
13102 if (speed)
13103 *cost += extra_cost->alu.rev;
13105 return false;
13107 break;
13109 case TRUNCATE:
13111 /* Decompose <su>muldi3_highpart. */
13112 if (/* (truncate:DI */
13113 mode == DImode
13114 /* (lshiftrt:TI */
13115 && GET_MODE (XEXP (x, 0)) == TImode
13116 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
13117 /* (mult:TI */
13118 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
13119 /* (ANY_EXTEND:TI (reg:DI))
13120 (ANY_EXTEND:TI (reg:DI))) */
13121 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
13122 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
13123 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
13124 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
13125 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
13126 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
13127 /* (const_int 64) */
13128 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
13129 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
13131 /* UMULH/SMULH. */
13132 if (speed)
13133 *cost += extra_cost->mult[mode == DImode].extend;
13134 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
13135 mode, MULT, 0, speed);
13136 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
13137 mode, MULT, 1, speed);
13138 return true;
13141 /* Fall through. */
13142 default:
13143 break;
13146 if (dump_file
13147 && flag_aarch64_verbose_cost)
13148 fprintf (dump_file,
13149 "\nFailed to cost RTX. Assuming default cost.\n");
13151 return true;
13154 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
13155 calculated for X. This cost is stored in *COST. Returns true
13156 if the total cost of X was calculated. */
13157 static bool
13158 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
13159 int param, int *cost, bool speed)
13161 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
13163 if (dump_file
13164 && flag_aarch64_verbose_cost)
13166 print_rtl_single (dump_file, x);
13167 fprintf (dump_file, "\n%s cost: %d (%s)\n",
13168 speed ? "Hot" : "Cold",
13169 *cost, result ? "final" : "partial");
13172 return result;
13175 static int
13176 aarch64_register_move_cost (machine_mode mode,
13177 reg_class_t from_i, reg_class_t to_i)
13179 enum reg_class from = (enum reg_class) from_i;
13180 enum reg_class to = (enum reg_class) to_i;
13181 const struct cpu_regmove_cost *regmove_cost
13182 = aarch64_tune_params.regmove_cost;
13184 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
13185 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS
13186 || to == STUB_REGS)
13187 to = GENERAL_REGS;
13189 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS
13190 || from == STUB_REGS)
13191 from = GENERAL_REGS;
13193 /* Make RDFFR very expensive. In particular, if we know that the FFR
13194 contains a PTRUE (e.g. after a SETFFR), we must never use RDFFR
13195 as a way of obtaining a PTRUE. */
13196 if (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL
13197 && hard_reg_set_subset_p (reg_class_contents[from_i],
13198 reg_class_contents[FFR_REGS]))
13199 return 80;
13201 /* Moving between GPR and stack cost is the same as GP2GP. */
13202 if ((from == GENERAL_REGS && to == STACK_REG)
13203 || (to == GENERAL_REGS && from == STACK_REG))
13204 return regmove_cost->GP2GP;
13206 /* To/From the stack register, we move via the gprs. */
13207 if (to == STACK_REG || from == STACK_REG)
13208 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
13209 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
13211 if (known_eq (GET_MODE_SIZE (mode), 16))
13213 /* 128-bit operations on general registers require 2 instructions. */
13214 if (from == GENERAL_REGS && to == GENERAL_REGS)
13215 return regmove_cost->GP2GP * 2;
13216 else if (from == GENERAL_REGS)
13217 return regmove_cost->GP2FP * 2;
13218 else if (to == GENERAL_REGS)
13219 return regmove_cost->FP2GP * 2;
13221 /* When AdvSIMD instructions are disabled it is not possible to move
13222 a 128-bit value directly between Q registers. This is handled in
13223 secondary reload. A general register is used as a scratch to move
13224 the upper DI value and the lower DI value is moved directly,
13225 hence the cost is the sum of three moves. */
13226 if (! TARGET_SIMD)
13227 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
13229 return regmove_cost->FP2FP;
13232 if (from == GENERAL_REGS && to == GENERAL_REGS)
13233 return regmove_cost->GP2GP;
13234 else if (from == GENERAL_REGS)
13235 return regmove_cost->GP2FP;
13236 else if (to == GENERAL_REGS)
13237 return regmove_cost->FP2GP;
13239 return regmove_cost->FP2FP;
13242 static int
13243 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
13244 reg_class_t rclass ATTRIBUTE_UNUSED,
13245 bool in ATTRIBUTE_UNUSED)
13247 return aarch64_tune_params.memmov_cost;
13250 /* Implement TARGET_INIT_BUILTINS. */
13251 static void
13252 aarch64_init_builtins ()
13254 aarch64_general_init_builtins ();
13255 aarch64_sve::init_builtins ();
13258 /* Implement TARGET_FOLD_BUILTIN. */
13259 static tree
13260 aarch64_fold_builtin (tree fndecl, int nargs, tree *args, bool)
13262 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13263 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13264 tree type = TREE_TYPE (TREE_TYPE (fndecl));
13265 switch (code & AARCH64_BUILTIN_CLASS)
13267 case AARCH64_BUILTIN_GENERAL:
13268 return aarch64_general_fold_builtin (subcode, type, nargs, args);
13270 case AARCH64_BUILTIN_SVE:
13271 return NULL_TREE;
13273 gcc_unreachable ();
13276 /* Implement TARGET_GIMPLE_FOLD_BUILTIN. */
13277 static bool
13278 aarch64_gimple_fold_builtin (gimple_stmt_iterator *gsi)
13280 gcall *stmt = as_a <gcall *> (gsi_stmt (*gsi));
13281 tree fndecl = gimple_call_fndecl (stmt);
13282 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13283 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13284 gimple *new_stmt = NULL;
13285 switch (code & AARCH64_BUILTIN_CLASS)
13287 case AARCH64_BUILTIN_GENERAL:
13288 new_stmt = aarch64_general_gimple_fold_builtin (subcode, stmt);
13289 break;
13291 case AARCH64_BUILTIN_SVE:
13292 new_stmt = aarch64_sve::gimple_fold_builtin (subcode, gsi, stmt);
13293 break;
13296 if (!new_stmt)
13297 return false;
13299 gsi_replace (gsi, new_stmt, true);
13300 return true;
13303 /* Implement TARGET_EXPAND_BUILTIN. */
13304 static rtx
13305 aarch64_expand_builtin (tree exp, rtx target, rtx, machine_mode, int ignore)
13307 tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
13308 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13309 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13310 switch (code & AARCH64_BUILTIN_CLASS)
13312 case AARCH64_BUILTIN_GENERAL:
13313 return aarch64_general_expand_builtin (subcode, exp, target, ignore);
13315 case AARCH64_BUILTIN_SVE:
13316 return aarch64_sve::expand_builtin (subcode, exp, target);
13318 gcc_unreachable ();
13321 /* Implement TARGET_BUILTIN_DECL. */
13322 static tree
13323 aarch64_builtin_decl (unsigned int code, bool initialize_p)
13325 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13326 switch (code & AARCH64_BUILTIN_CLASS)
13328 case AARCH64_BUILTIN_GENERAL:
13329 return aarch64_general_builtin_decl (subcode, initialize_p);
13331 case AARCH64_BUILTIN_SVE:
13332 return aarch64_sve::builtin_decl (subcode, initialize_p);
13334 gcc_unreachable ();
13337 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
13338 to optimize 1.0/sqrt. */
13340 static bool
13341 use_rsqrt_p (machine_mode mode)
13343 return (!flag_trapping_math
13344 && flag_unsafe_math_optimizations
13345 && ((aarch64_tune_params.approx_modes->recip_sqrt
13346 & AARCH64_APPROX_MODE (mode))
13347 || flag_mrecip_low_precision_sqrt));
13350 /* Function to decide when to use the approximate reciprocal square root
13351 builtin. */
13353 static tree
13354 aarch64_builtin_reciprocal (tree fndecl)
13356 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
13358 if (!use_rsqrt_p (mode))
13359 return NULL_TREE;
13360 unsigned int code = DECL_MD_FUNCTION_CODE (fndecl);
13361 unsigned int subcode = code >> AARCH64_BUILTIN_SHIFT;
13362 switch (code & AARCH64_BUILTIN_CLASS)
13364 case AARCH64_BUILTIN_GENERAL:
13365 return aarch64_general_builtin_rsqrt (subcode);
13367 case AARCH64_BUILTIN_SVE:
13368 return NULL_TREE;
13370 gcc_unreachable ();
13373 /* Emit code to perform the floating-point operation:
13375 DST = SRC1 * SRC2
13377 where all three operands are already known to be registers.
13378 If the operation is an SVE one, PTRUE is a suitable all-true
13379 predicate. */
13381 static void
13382 aarch64_emit_mult (rtx dst, rtx ptrue, rtx src1, rtx src2)
13384 if (ptrue)
13385 emit_insn (gen_aarch64_pred (UNSPEC_COND_FMUL, GET_MODE (dst),
13386 dst, ptrue, src1, src2,
13387 gen_int_mode (SVE_RELAXED_GP, SImode)));
13388 else
13389 emit_set_insn (dst, gen_rtx_MULT (GET_MODE (dst), src1, src2));
13392 /* Emit instruction sequence to compute either the approximate square root
13393 or its approximate reciprocal, depending on the flag RECP, and return
13394 whether the sequence was emitted or not. */
13396 bool
13397 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
13399 machine_mode mode = GET_MODE (dst);
13401 if (GET_MODE_INNER (mode) == HFmode)
13403 gcc_assert (!recp);
13404 return false;
13407 if (!recp)
13409 if (!(flag_mlow_precision_sqrt
13410 || (aarch64_tune_params.approx_modes->sqrt
13411 & AARCH64_APPROX_MODE (mode))))
13412 return false;
13414 if (!flag_finite_math_only
13415 || flag_trapping_math
13416 || !flag_unsafe_math_optimizations
13417 || optimize_function_for_size_p (cfun))
13418 return false;
13420 else
13421 /* Caller assumes we cannot fail. */
13422 gcc_assert (use_rsqrt_p (mode));
13424 rtx pg = NULL_RTX;
13425 if (aarch64_sve_mode_p (mode))
13426 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13427 machine_mode mmsk = (VECTOR_MODE_P (mode)
13428 ? related_int_vector_mode (mode).require ()
13429 : int_mode_for_mode (mode).require ());
13430 rtx xmsk = NULL_RTX;
13431 if (!recp)
13433 /* When calculating the approximate square root, compare the
13434 argument with 0.0 and create a mask. */
13435 rtx zero = CONST0_RTX (mode);
13436 if (pg)
13438 xmsk = gen_reg_rtx (GET_MODE (pg));
13439 rtx hint = gen_int_mode (SVE_KNOWN_PTRUE, SImode);
13440 emit_insn (gen_aarch64_pred_fcm (UNSPEC_COND_FCMNE, mode,
13441 xmsk, pg, hint, src, zero));
13443 else
13445 xmsk = gen_reg_rtx (mmsk);
13446 emit_insn (gen_rtx_SET (xmsk,
13447 gen_rtx_NEG (mmsk,
13448 gen_rtx_EQ (mmsk, src, zero))));
13452 /* Estimate the approximate reciprocal square root. */
13453 rtx xdst = gen_reg_rtx (mode);
13454 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
13456 /* Iterate over the series twice for SF and thrice for DF. */
13457 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
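/* Each pass of the loop below applies the Newton-Raphson step implemented
   by the FRSQRTE/FRSQRTS pair (an illustrative summary of the code that
   follows):
     x2 = x * x
     x1 = (3 - src * x2) / 2      (FRSQRTS)
     x  = x * x1
   which converges towards 1 / sqrt (src).  */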
13459 /* Optionally iterate over the series once less for faster performance,
13460 at the cost of some accuracy. */
13461 if ((recp && flag_mrecip_low_precision_sqrt)
13462 || (!recp && flag_mlow_precision_sqrt))
13463 iterations--;
13465 /* Iterate over the series to calculate the approximate reciprocal square
13466 root. */
13467 rtx x1 = gen_reg_rtx (mode);
13468 while (iterations--)
13470 rtx x2 = gen_reg_rtx (mode);
13471 aarch64_emit_mult (x2, pg, xdst, xdst);
13473 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
13475 if (iterations > 0)
13476 aarch64_emit_mult (xdst, pg, xdst, x1);
13479 if (!recp)
13481 if (pg)
13482 /* Multiply nonzero source values by the corresponding intermediate
13483 result elements, so that the final calculation is the approximate
13484 square root rather than its reciprocal. Select a zero result for
13485 zero source values, to avoid the Inf * 0 -> NaN that we'd get
13486 otherwise. */
13487 emit_insn (gen_cond (UNSPEC_COND_FMUL, mode,
13488 xdst, xmsk, xdst, src, CONST0_RTX (mode)));
13489 else
13491 /* Qualify the approximate reciprocal square root when the
13492 argument is 0.0 by squashing the intermediate result to 0.0. */
13493 rtx xtmp = gen_reg_rtx (mmsk);
13494 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
13495 gen_rtx_SUBREG (mmsk, xdst, 0)));
13496 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
13498 /* Calculate the approximate square root. */
13499 aarch64_emit_mult (xdst, pg, xdst, src);
13503 /* Finalize the approximation. */
13504 aarch64_emit_mult (dst, pg, xdst, x1);
13506 return true;
13509 /* Emit the instruction sequence to compute the approximation for the division
13510 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
13512 bool
13513 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
13515 machine_mode mode = GET_MODE (quo);
13517 if (GET_MODE_INNER (mode) == HFmode)
13518 return false;
13520 bool use_approx_division_p = (flag_mlow_precision_div
13521 || (aarch64_tune_params.approx_modes->division
13522 & AARCH64_APPROX_MODE (mode)));
13524 if (!flag_finite_math_only
13525 || flag_trapping_math
13526 || !flag_unsafe_math_optimizations
13527 || optimize_function_for_size_p (cfun)
13528 || !use_approx_division_p)
13529 return false;
13531 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
13532 return false;
13534 rtx pg = NULL_RTX;
13535 if (aarch64_sve_mode_p (mode))
13536 pg = aarch64_ptrue_reg (aarch64_sve_pred_mode (mode));
13538 /* Estimate the approximate reciprocal. */
13539 rtx xrcp = gen_reg_rtx (mode);
13540 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
13542 /* Iterate over the series twice for SF and thrice for DF. */
13543 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
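/* Each pass of the loop below applies the Newton-Raphson refinement
   implemented by FRECPS (an illustrative summary of the code that follows):
     t = 2 - den * x              (FRECPS)
     x = x * t
   which converges towards 1 / den; the quotient is then x * num.  */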
13545 /* Optionally iterate over the series fewer times for faster performance,
13546 at the cost of some accuracy. The default is 2 for DF and 1 for SF. */
13547 if (flag_mlow_precision_div)
13548 iterations = (GET_MODE_INNER (mode) == DFmode
13549 ? aarch64_double_recp_precision
13550 : aarch64_float_recp_precision);
13552 /* Iterate over the series to calculate the approximate reciprocal. */
13553 rtx xtmp = gen_reg_rtx (mode);
13554 while (iterations--)
13556 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
13558 if (iterations > 0)
13559 aarch64_emit_mult (xrcp, pg, xrcp, xtmp);
13562 if (num != CONST1_RTX (mode))
13564 /* As the approximate reciprocal of DEN is already calculated, only
13565 calculate the approximate division when NUM is not 1.0. */
13566 rtx xnum = force_reg (mode, num);
13567 aarch64_emit_mult (xrcp, pg, xrcp, xnum);
13570 /* Finalize the approximation. */
13571 aarch64_emit_mult (quo, pg, xrcp, xtmp);
13572 return true;
13575 /* Return the number of instructions that can be issued per cycle. */
13576 static int
13577 aarch64_sched_issue_rate (void)
13579 return aarch64_tune_params.issue_rate;
13582 /* Implement TARGET_SCHED_VARIABLE_ISSUE. */
13583 static int
13584 aarch64_sched_variable_issue (FILE *, int, rtx_insn *insn, int more)
13586 if (DEBUG_INSN_P (insn))
13587 return more;
13589 rtx_code code = GET_CODE (PATTERN (insn));
13590 if (code == USE || code == CLOBBER)
13591 return more;
13593 if (get_attr_type (insn) == TYPE_NO_INSN)
13594 return more;
13596 return more - 1;
13599 static int
13600 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
13602 int issue_rate = aarch64_sched_issue_rate ();
13604 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
13608 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
13609 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
13610 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
13612 static int
13613 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
13614 int ready_index)
13616 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
13620 /* Vectorizer cost model target hooks. */
13622 /* Implement targetm.vectorize.builtin_vectorization_cost. */
13623 static int
13624 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
13625 tree vectype,
13626 int misalign ATTRIBUTE_UNUSED)
13628 unsigned elements;
13629 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
13630 bool fp = false;
13632 if (vectype != NULL)
13633 fp = FLOAT_TYPE_P (vectype);
13635 switch (type_of_cost)
13637 case scalar_stmt:
13638 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
13640 case scalar_load:
13641 return costs->scalar_load_cost;
13643 case scalar_store:
13644 return costs->scalar_store_cost;
13646 case vector_stmt:
13647 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13649 case vector_load:
13650 return costs->vec_align_load_cost;
13652 case vector_store:
13653 return costs->vec_store_cost;
13655 case vec_to_scalar:
13656 return costs->vec_to_scalar_cost;
13658 case scalar_to_vec:
13659 return costs->scalar_to_vec_cost;
13661 case unaligned_load:
13662 case vector_gather_load:
13663 return costs->vec_unalign_load_cost;
13665 case unaligned_store:
13666 case vector_scatter_store:
13667 return costs->vec_unalign_store_cost;
13669 case cond_branch_taken:
13670 return costs->cond_taken_branch_cost;
13672 case cond_branch_not_taken:
13673 return costs->cond_not_taken_branch_cost;
13675 case vec_perm:
13676 return costs->vec_permute_cost;
13678 case vec_promote_demote:
13679 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
13681 case vec_construct:
13682 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
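/* For example (illustrative): constructing a four-element vector is
   costed as 4 / 2 + 1 = 3 units.  */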
13683 return elements / 2 + 1;
13685 default:
13686 gcc_unreachable ();
13690 /* Return true if creating multiple copies of STMT_INFO for Advanced SIMD
13691 vectors would produce a series of LDP or STP operations. KIND is the
13692 kind of statement that STMT_INFO represents. */
13693 static bool
13694 aarch64_advsimd_ldp_stp_p (enum vect_cost_for_stmt kind,
13695 stmt_vec_info stmt_info)
13697 switch (kind)
13699 case vector_load:
13700 case vector_store:
13701 case unaligned_load:
13702 case unaligned_store:
13703 break;
13705 default:
13706 return false;
13709 if (aarch64_tune_params.extra_tuning_flags
13710 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS)
13711 return false;
13713 return is_gimple_assign (stmt_info->stmt);
13716 /* Return true if STMT_INFO extends the result of a load. */
13717 static bool
13718 aarch64_extending_load_p (class vec_info *vinfo, stmt_vec_info stmt_info)
13720 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13721 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13722 return false;
13724 tree rhs = gimple_assign_rhs1 (stmt_info->stmt);
13725 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13726 tree rhs_type = TREE_TYPE (rhs);
13727 if (!INTEGRAL_TYPE_P (lhs_type)
13728 || !INTEGRAL_TYPE_P (rhs_type)
13729 || TYPE_PRECISION (lhs_type) <= TYPE_PRECISION (rhs_type))
13730 return false;
13732 stmt_vec_info def_stmt_info = vinfo->lookup_def (rhs);
13733 return (def_stmt_info
13734 && STMT_VINFO_DATA_REF (def_stmt_info)
13735 && DR_IS_READ (STMT_VINFO_DATA_REF (def_stmt_info)));
13738 /* Return true if STMT_INFO is an integer truncation. */
13739 static bool
13740 aarch64_integer_truncation_p (stmt_vec_info stmt_info)
13742 gassign *assign = dyn_cast <gassign *> (stmt_info->stmt);
13743 if (!assign || !CONVERT_EXPR_CODE_P (gimple_assign_rhs_code (assign)))
13744 return false;
13746 tree lhs_type = TREE_TYPE (gimple_assign_lhs (assign));
13747 tree rhs_type = TREE_TYPE (gimple_assign_rhs1 (assign));
13748 return (INTEGRAL_TYPE_P (lhs_type)
13749 && INTEGRAL_TYPE_P (rhs_type)
13750 && TYPE_PRECISION (lhs_type) < TYPE_PRECISION (rhs_type));
13753 /* STMT_COST is the cost calculated by aarch64_builtin_vectorization_cost
13754 for STMT_INFO, which has cost kind KIND and which when vectorized would
13755 operate on vector type VECTYPE. Adjust the cost as necessary for SVE
13756 targets. */
13757 static unsigned int
13758 aarch64_sve_adjust_stmt_cost (class vec_info *vinfo, vect_cost_for_stmt kind,
13759 stmt_vec_info stmt_info, tree vectype,
13760 unsigned int stmt_cost)
13762 /* Unlike vec_promote_demote, vector_stmt conversions do not change the
13763 vector register size or number of units. Integer promotions of this
13764 type therefore map to SXT[BHW] or UXT[BHW].
13766 Most loads have extending forms that can do the sign or zero extension
13767 on the fly. Optimistically assume that a load followed by an extension
13768 will fold to this form during combine, and that the extension therefore
13769 comes for free. */
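/* For example (illustrative): an SVE load of bytes followed by a separate
   sign extension to a wider element is assumed to combine into a single
   extending load such as LD1SB, so the extension is costed at zero below.  */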
13770 if (kind == vector_stmt && aarch64_extending_load_p (vinfo, stmt_info))
13771 stmt_cost = 0;
13773 /* For similar reasons, vector_stmt integer truncations are a no-op,
13774 because we can just ignore the unused upper bits of the source. */
13775 if (kind == vector_stmt && aarch64_integer_truncation_p (stmt_info))
13776 stmt_cost = 0;
13778 /* Advanced SIMD can load and store pairs of registers using LDP and STP,
13779 but there are no equivalent instructions for SVE. This means that
13780 (all other things being equal) 128-bit SVE needs twice as many load
13781 and store instructions as Advanced SIMD in order to process vector pairs.
13783 Also, scalar code can often use LDP and STP to access pairs of values,
13784 so it is too simplistic to say that one SVE load or store replaces
13785 VF scalar loads and stores.
13787 Ideally we would account for this in the scalar and Advanced SIMD
13788 costs by making suitable load/store pairs as cheap as a single
13789 load/store. However, that would be a very invasive change and in
13790 practice it tends to stress other parts of the cost model too much.
13791 E.g. stores of scalar constants currently count just a store,
13792 whereas stores of vector constants count a store and a vec_init.
13793 This is an artificial distinction for AArch64, where stores of
13794 nonzero scalar constants need the same kind of register invariant
13795 as vector stores.
13797 An alternative would be to double the cost of any SVE loads and stores
13798 that could be paired in Advanced SIMD (and possibly also paired in
13799 scalar code). But this tends to stress other parts of the cost model
13800 in the same way. It also means that we can fall back to Advanced SIMD
13801 even if full-loop predication would have been useful.
13803 Here we go for a more conservative version: double the costs of SVE
13804 loads and stores if one iteration of the scalar loop processes enough
13805 elements for it to use a whole number of Advanced SIMD LDP or STP
13806 instructions. This makes it very likely that the VF would be 1 for
13807 Advanced SIMD, and so no epilogue should be needed. */
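/* For example (illustrative): a group of four DImode accesses covers
   4 * 64 = 256 bits per scalar iteration, i.e. exactly one Advanced SIMD
   LDP or STP of Q registers, so the corresponding SVE accesses are costed
   twice as much below.  */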
13808 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
13810 stmt_vec_info first = DR_GROUP_FIRST_ELEMENT (stmt_info);
13811 unsigned int count = DR_GROUP_SIZE (first) - DR_GROUP_GAP (first);
13812 unsigned int elt_bits = GET_MODE_UNIT_BITSIZE (TYPE_MODE (vectype));
13813 if (multiple_p (count * elt_bits, 256)
13814 && aarch64_advsimd_ldp_stp_p (kind, stmt_info))
13815 stmt_cost *= 2;
13818 return stmt_cost;
13821 /* Implement targetm.vectorize.add_stmt_cost. */
13822 static unsigned
13823 aarch64_add_stmt_cost (class vec_info *vinfo, void *data, int count,
13824 enum vect_cost_for_stmt kind,
13825 struct _stmt_vec_info *stmt_info, tree vectype,
13826 int misalign, enum vect_cost_model_location where)
13828 unsigned *cost = (unsigned *) data;
13829 unsigned retval = 0;
13831 if (flag_vect_cost_model)
13833 int stmt_cost =
13834 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
13836 if (stmt_info && vectype && aarch64_sve_mode_p (TYPE_MODE (vectype)))
13837 stmt_cost = aarch64_sve_adjust_stmt_cost (vinfo, kind, stmt_info,
13838 vectype, stmt_cost);
13840 /* Statements in an inner loop relative to the loop being
13841 vectorized are weighted more heavily. The value here is
13842 arbitrary and could potentially be improved with analysis. */
13843 if (where == vect_body && stmt_info
13844 && stmt_in_inner_loop_p (vinfo, stmt_info))
13845 count *= 50; /* FIXME */
13847 retval = (unsigned) (count * stmt_cost);
13848 cost[where] += retval;
13851 return retval;
13854 static void initialize_aarch64_code_model (struct gcc_options *);
13856 /* Parse the TO_PARSE string and put the architecture struct that it
13857 selects into RES and the architectural features into ISA_FLAGS.
13858 Return an aarch64_parse_opt_result describing the parse result.
13859 If there is an error parsing, RES and ISA_FLAGS are left unchanged.
13860 When the TO_PARSE string contains an invalid extension,
13861 a copy of the string is created and stored to INVALID_EXTENSION. */
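/* For example (an illustrative command line, not an exhaustive list):
   "-march=armv8.2-a+sve" matches the architecture name up to the first '+'
   and hands "+sve" to aarch64_parse_extension.  */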
13863 static enum aarch64_parse_opt_result
13864 aarch64_parse_arch (const char *to_parse, const struct processor **res,
13865 uint64_t *isa_flags, std::string *invalid_extension)
13867 const char *ext;
13868 const struct processor *arch;
13869 size_t len;
13871 ext = strchr (to_parse, '+');
13873 if (ext != NULL)
13874 len = ext - to_parse;
13875 else
13876 len = strlen (to_parse);
13878 if (len == 0)
13879 return AARCH64_PARSE_MISSING_ARG;
13882 /* Loop through the list of supported ARCHes to find a match. */
13883 for (arch = all_architectures; arch->name != NULL; arch++)
13885 if (strlen (arch->name) == len
13886 && strncmp (arch->name, to_parse, len) == 0)
13888 uint64_t isa_temp = arch->flags;
13890 if (ext != NULL)
13892 /* TO_PARSE string contains at least one extension. */
13893 enum aarch64_parse_opt_result ext_res
13894 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13896 if (ext_res != AARCH64_PARSE_OK)
13897 return ext_res;
13899 /* Extension parsing was successful. Confirm the result
13900 arch and ISA flags. */
13901 *res = arch;
13902 *isa_flags = isa_temp;
13903 return AARCH64_PARSE_OK;
13907 /* ARCH name not found in list. */
13908 return AARCH64_PARSE_INVALID_ARG;
13911 /* Parse the TO_PARSE string and put the result tuning in RES and the
13912 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
13913 describing the parse result. If there is an error parsing, RES and
13914 ISA_FLAGS are left unchanged.
13915 When the TO_PARSE string contains an invalid extension,
13916 a copy of the string is created and stored to INVALID_EXTENSION. */
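/* For example (illustrative): "-mcpu=cortex-a76+crypto" matches the core
   name up to the first '+' and hands "+crypto" to
   aarch64_parse_extension.  */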
13918 static enum aarch64_parse_opt_result
13919 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
13920 uint64_t *isa_flags, std::string *invalid_extension)
13922 const char *ext;
13923 const struct processor *cpu;
13924 size_t len;
13926 ext = strchr (to_parse, '+');
13928 if (ext != NULL)
13929 len = ext - to_parse;
13930 else
13931 len = strlen (to_parse);
13933 if (len == 0)
13934 return AARCH64_PARSE_MISSING_ARG;
13937 /* Loop through the list of supported CPUs to find a match. */
13938 for (cpu = all_cores; cpu->name != NULL; cpu++)
13940 if (strlen (cpu->name) == len && strncmp (cpu->name, to_parse, len) == 0)
13942 uint64_t isa_temp = cpu->flags;
13945 if (ext != NULL)
13947 /* TO_PARSE string contains at least one extension. */
13948 enum aarch64_parse_opt_result ext_res
13949 = aarch64_parse_extension (ext, &isa_temp, invalid_extension);
13951 if (ext_res != AARCH64_PARSE_OK)
13952 return ext_res;
13954 /* Extension parsing was successful. Confirm the result
13955 cpu and ISA flags. */
13956 *res = cpu;
13957 *isa_flags = isa_temp;
13958 return AARCH64_PARSE_OK;
13962 /* CPU name not found in list. */
13963 return AARCH64_PARSE_INVALID_ARG;
13966 /* Parse the TO_PARSE string and put the cpu it selects into RES.
13967 Return an aarch64_parse_opt_result describing the parse result.
13968 If the parsing fails the RES does not change. */
13970 static enum aarch64_parse_opt_result
13971 aarch64_parse_tune (const char *to_parse, const struct processor **res)
13973 const struct processor *cpu;
13975 /* Loop through the list of supported CPUs to find a match. */
13976 for (cpu = all_cores; cpu->name != NULL; cpu++)
13978 if (strcmp (cpu->name, to_parse) == 0)
13980 *res = cpu;
13981 return AARCH64_PARSE_OK;
13985 /* CPU name not found in list. */
13986 return AARCH64_PARSE_INVALID_ARG;
13989 /* Parse TOKEN, which has length LENGTH, to see if it is an option
13990 described in FLAG. If it is, return the index bit for that fusion type.
13991 If not, error (printing OPTION_NAME) and return zero. */
13993 static unsigned int
13994 aarch64_parse_one_option_token (const char *token,
13995 size_t length,
13996 const struct aarch64_flag_desc *flag,
13997 const char *option_name)
13999 for (; flag->name != NULL; flag++)
14001 if (length == strlen (flag->name)
14002 && !strncmp (flag->name, token, length))
14003 return flag->flag;
14006 error ("unknown flag passed in %<-moverride=%s%> (%s)", option_name, token);
14007 return 0;
14010 /* Parse OPTION which is a comma-separated list of flags to enable.
14011 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
14012 default state we inherit from the CPU tuning structures. OPTION_NAME
14013 gives the top-level option we are parsing in the -moverride string,
14014 for use in error messages. */
14016 static unsigned int
14017 aarch64_parse_boolean_options (const char *option,
14018 const struct aarch64_flag_desc *flags,
14019 unsigned int initial_state,
14020 const char *option_name)
14022 const char separator = '.';
14023 const char* specs = option;
14024 const char* ntoken = option;
14025 unsigned int found_flags = initial_state;
14027 while ((ntoken = strchr (specs, separator)))
14029 size_t token_length = ntoken - specs;
14030 unsigned token_ops = aarch64_parse_one_option_token (specs,
14031 token_length,
14032 flags,
14033 option_name);
14034 /* If we find "none" (or, for simplicity's sake, an error) anywhere
14035 in the token stream, reset the supported operations. So:
14037 adrp+add.cmp+branch.none.adrp+add
14039 would have the result of turning on only adrp+add fusion. */
14040 if (!token_ops)
14041 found_flags = 0;
14043 found_flags |= token_ops;
14044 specs = ++ntoken;
14047 /* The string ended with a trailing separator; diagnose it. */
14048 if (!(*specs))
14050 error ("%s string ill-formed\n", option_name);
14051 return 0;
14054 /* We still have one more token to parse. */
14055 size_t token_length = strlen (specs);
14056 unsigned token_ops = aarch64_parse_one_option_token (specs,
14057 token_length,
14058 flags,
14059 option_name);
14060 if (!token_ops)
14061 found_flags = 0;
14063 found_flags |= token_ops;
14064 return found_flags;
14067 /* Support for overriding instruction fusion. */
14069 static void
14070 aarch64_parse_fuse_string (const char *fuse_string,
14071 struct tune_params *tune)
14073 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
14074 aarch64_fusible_pairs,
14075 tune->fusible_ops,
14076 "fuse=");
14079 /* Support for overriding other tuning flags. */
14081 static void
14082 aarch64_parse_tune_string (const char *tune_string,
14083 struct tune_params *tune)
14085 tune->extra_tuning_flags
14086 = aarch64_parse_boolean_options (tune_string,
14087 aarch64_tuning_flags,
14088 tune->extra_tuning_flags,
14089 "tune=");
14092 /* Parse the sve_width tuning moverride string in TUNE_STRING.
14093 Accept the valid SVE vector widths allowed by
14094 aarch64_sve_vector_bits_enum and use it to override sve_width
14095 in TUNE. */
14097 static void
14098 aarch64_parse_sve_width_string (const char *tune_string,
14099 struct tune_params *tune)
14101 int width = -1;
14103 int n = sscanf (tune_string, "%d", &width);
14104 if (n == EOF)
14106 error ("invalid format for sve_width");
14107 return;
14109 switch (width)
14111 case SVE_128:
14112 case SVE_256:
14113 case SVE_512:
14114 case SVE_1024:
14115 case SVE_2048:
14116 break;
14117 default:
14118 error ("invalid sve_width value: %d", width);
14120 tune->sve_width = (enum aarch64_sve_vector_bits_enum) width;
14123 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
14124 we understand. If it is, extract the option string and hand it off to
14125 the appropriate function. */
14127 void
14128 aarch64_parse_one_override_token (const char* token,
14129 size_t length,
14130 struct tune_params *tune)
14132 const struct aarch64_tuning_override_function *fn
14133 = aarch64_tuning_override_functions;
14135 const char *option_part = strchr (token, '=');
14136 if (!option_part)
14138 error ("tuning string missing in option (%s)", token);
14139 return;
14142 /* Get the length of the option name. */
14143 length = option_part - token;
14144 /* Skip the '=' to get to the option string. */
14145 option_part++;
14147 for (; fn->name != NULL; fn++)
14149 if (!strncmp (fn->name, token, length))
14151 fn->parse_override (option_part, tune);
14152 return;
14156 error ("unknown tuning option (%s)",token);
14157 return;
14160 /* Validate and clamp the TLS size for the selected code model. */
14162 static void
14163 initialize_aarch64_tls_size (struct gcc_options *opts)
14165 if (aarch64_tls_size == 0)
14166 aarch64_tls_size = 24;
14168 switch (opts->x_aarch64_cmodel_var)
14170 case AARCH64_CMODEL_TINY:
14171 /* Both the default and the maximum TLS size allowed under tiny are 1M,
14172 which needs two instructions to address, so we clamp the size to 24. */
14173 if (aarch64_tls_size > 24)
14174 aarch64_tls_size = 24;
14175 break;
14176 case AARCH64_CMODEL_SMALL:
14177 /* The maximum TLS size allowed under small is 4G. */
14178 if (aarch64_tls_size > 32)
14179 aarch64_tls_size = 32;
14180 break;
14181 case AARCH64_CMODEL_LARGE:
14182 /* The maximum TLS size allowed under large is 16E.
14183 FIXME: 16E needs a 64-bit offset, but we only support a 48-bit offset now. */
14184 if (aarch64_tls_size > 48)
14185 aarch64_tls_size = 48;
14186 break;
14187 default:
14188 gcc_unreachable ();
14191 return;
14194 /* Parse STRING looking for options in the format:
14195 string :: option:string
14196 option :: name=substring
14197 name :: {a-z}
14198 substring :: defined by option. */
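/* For example (an illustrative option string):
     -moverride=fuse=adrp+add.cmp+branch:sve_width=256
   is split at ':' and each "name=value" token is handed to the matching
   entry in aarch64_tuning_override_functions.  */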
14200 static void
14201 aarch64_parse_override_string (const char* input_string,
14202 struct tune_params* tune)
14204 const char separator = ':';
14205 size_t string_length = strlen (input_string) + 1;
14206 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
14207 char *string = string_root;
14208 strncpy (string, input_string, string_length);
14209 string[string_length - 1] = '\0';
14211 char* ntoken = string;
14213 while ((ntoken = strchr (string, separator)))
14215 size_t token_length = ntoken - string;
14216 /* Make this substring look like a string. */
14217 *ntoken = '\0';
14218 aarch64_parse_one_override_token (string, token_length, tune);
14219 string = ++ntoken;
14222 /* One last option to parse. */
14223 aarch64_parse_one_override_token (string, strlen (string), tune);
14224 free (string_root);
14228 static void
14229 aarch64_override_options_after_change_1 (struct gcc_options *opts)
14231 if (accepted_branch_protection_string)
14233 opts->x_aarch64_branch_protection_string
14234 = xstrdup (accepted_branch_protection_string);
14237 /* PR 70044: We have to be careful about being called multiple times for the
14238 same function. This means all changes should be repeatable. */
14240 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
14241 Disable the frame pointer flag so the mid-end will not use a frame
14242 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
14243 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
14244 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
14245 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
14246 if (opts->x_flag_omit_frame_pointer == 0)
14247 opts->x_flag_omit_frame_pointer = 2;
14249 /* If not optimizing for size, set the default
14250 alignment to what the target wants. */
14251 if (!opts->x_optimize_size)
14253 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
14254 opts->x_str_align_loops = aarch64_tune_params.loop_align;
14255 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
14256 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
14257 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
14258 opts->x_str_align_functions = aarch64_tune_params.function_align;
14261 /* We default to no pc-relative literal loads. */
14263 aarch64_pcrelative_literal_loads = false;
14265 /* If -mpc-relative-literal-loads is set on the command line, this
14266 implies that the user asked for PC relative literal loads. */
14267 if (opts->x_pcrelative_literal_loads == 1)
14268 aarch64_pcrelative_literal_loads = true;
14270 /* In the tiny memory model it makes no sense to disallow PC relative
14271 literal pool loads. */
14272 if (aarch64_cmodel == AARCH64_CMODEL_TINY
14273 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
14274 aarch64_pcrelative_literal_loads = true;
14276 /* When enabling the lower precision Newton series for the square root, also
14277 enable it for the reciprocal square root, since the latter is an
14278 intermediary step for the former. */
14279 if (flag_mlow_precision_sqrt)
14280 flag_mrecip_low_precision_sqrt = true;
14283 /* 'Unpack' up the internal tuning structs and update the options
14284 in OPTS. The caller must have set up selected_tune and selected_arch
14285 as all the other target-specific codegen decisions are
14286 derived from them. */
14288 void
14289 aarch64_override_options_internal (struct gcc_options *opts)
14291 aarch64_tune_flags = selected_tune->flags;
14292 aarch64_tune = selected_tune->sched_core;
14293 /* Make a copy of the tuning parameters attached to the core, which
14294 we may later overwrite. */
14295 aarch64_tune_params = *(selected_tune->tune);
14296 aarch64_architecture_version = selected_arch->architecture_version;
14298 if (opts->x_aarch64_override_tune_string)
14299 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
14300 &aarch64_tune_params);
14302 /* This target defaults to strict volatile bitfields. */
14303 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
14304 opts->x_flag_strict_volatile_bitfields = 1;
14306 if (aarch64_stack_protector_guard == SSP_GLOBAL
14307 && opts->x_aarch64_stack_protector_guard_offset_str)
14309 error ("incompatible options %<-mstack-protector-guard=global%> and "
14310 "%<-mstack-protector-guard-offset=%s%>",
14311 aarch64_stack_protector_guard_offset_str);
14314 if (aarch64_stack_protector_guard == SSP_SYSREG
14315 && !(opts->x_aarch64_stack_protector_guard_offset_str
14316 && opts->x_aarch64_stack_protector_guard_reg_str))
14318 error ("both %<-mstack-protector-guard-offset%> and "
14319 "%<-mstack-protector-guard-reg%> must be used "
14320 "with %<-mstack-protector-guard=sysreg%>");
14323 if (opts->x_aarch64_stack_protector_guard_reg_str)
14325 if (strlen (opts->x_aarch64_stack_protector_guard_reg_str) > 100)
14326 error ("specify a system register with a small string length.");
14329 if (opts->x_aarch64_stack_protector_guard_offset_str)
14331 char *end;
14332 const char *str = aarch64_stack_protector_guard_offset_str;
14333 errno = 0;
14334 long offs = strtol (aarch64_stack_protector_guard_offset_str, &end, 0);
14335 if (!*str || *end || errno)
14336 error ("%qs is not a valid offset in %qs", str,
14337 "-mstack-protector-guard-offset=");
14338 aarch64_stack_protector_guard_offset = offs;
14341 initialize_aarch64_code_model (opts);
14342 initialize_aarch64_tls_size (opts);
14344 int queue_depth = 0;
14345 switch (aarch64_tune_params.autoprefetcher_model)
14347 case tune_params::AUTOPREFETCHER_OFF:
14348 queue_depth = -1;
14349 break;
14350 case tune_params::AUTOPREFETCHER_WEAK:
14351 queue_depth = 0;
14352 break;
14353 case tune_params::AUTOPREFETCHER_STRONG:
14354 queue_depth = max_insn_queue_index + 1;
14355 break;
14356 default:
14357 gcc_unreachable ();
14360 /* We don't mind passing in global_options_set here as we don't use
14361 the *options_set structs anyway. */
14362 SET_OPTION_IF_UNSET (opts, &global_options_set,
14363 param_sched_autopref_queue_depth, queue_depth);
14365 /* Set up parameters to be used in prefetching algorithm. Do not
14366 override the defaults unless we are tuning for a core we have
14367 researched values for. */
14368 if (aarch64_tune_params.prefetch->num_slots > 0)
14369 SET_OPTION_IF_UNSET (opts, &global_options_set,
14370 param_simultaneous_prefetches,
14371 aarch64_tune_params.prefetch->num_slots);
14372 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
14373 SET_OPTION_IF_UNSET (opts, &global_options_set,
14374 param_l1_cache_size,
14375 aarch64_tune_params.prefetch->l1_cache_size);
14376 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
14377 SET_OPTION_IF_UNSET (opts, &global_options_set,
14378 param_l1_cache_line_size,
14379 aarch64_tune_params.prefetch->l1_cache_line_size);
14380 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
14381 SET_OPTION_IF_UNSET (opts, &global_options_set,
14382 param_l2_cache_size,
14383 aarch64_tune_params.prefetch->l2_cache_size);
14384 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
14385 SET_OPTION_IF_UNSET (opts, &global_options_set,
14386 param_prefetch_dynamic_strides, 0);
14387 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
14388 SET_OPTION_IF_UNSET (opts, &global_options_set,
14389 param_prefetch_minimum_stride,
14390 aarch64_tune_params.prefetch->minimum_stride);
14392 /* Use the alternative scheduling-pressure algorithm by default. */
14393 SET_OPTION_IF_UNSET (opts, &global_options_set,
14394 param_sched_pressure_algorithm,
14395 SCHED_PRESSURE_MODEL);
14397 /* Validate the guard size. */
14398 int guard_size = param_stack_clash_protection_guard_size;
14400 if (guard_size != 12 && guard_size != 16)
14401 error ("only values 12 (4 KB) and 16 (64 KB) are supported for guard "
14402 "size. Given value %d (%llu KB) is out of range",
14403 guard_size, (1ULL << guard_size) / 1024ULL);
14405 /* Enforce that the probing interval is the same as the guard size so the
14406 mid-end does the right thing. */
14407 SET_OPTION_IF_UNSET (opts, &global_options_set,
14408 param_stack_clash_protection_probe_interval,
14409 guard_size);
14411 /* The maybe_set calls won't update the value if the user has explicitly set
14412 one, which means we need to validate that the probing interval and the
14413 guard size are equal. */
14414 int probe_interval
14415 = param_stack_clash_protection_probe_interval;
14416 if (guard_size != probe_interval)
14417 error ("stack clash guard size %<%d%> must be equal to probing interval "
14418 "%<%d%>", guard_size, probe_interval);
14420 /* Enable software prefetching at the specified optimization level for
14421 CPUs that have prefetch. Lower the optimization level threshold by 1
14422 when profiling is enabled. */
14423 if (opts->x_flag_prefetch_loop_arrays < 0
14424 && !opts->x_optimize_size
14425 && aarch64_tune_params.prefetch->default_opt_level >= 0
14426 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
14427 opts->x_flag_prefetch_loop_arrays = 1;
14429 if (opts->x_aarch64_arch_string == NULL)
14430 opts->x_aarch64_arch_string = selected_arch->name;
14431 if (opts->x_aarch64_cpu_string == NULL)
14432 opts->x_aarch64_cpu_string = selected_cpu->name;
14433 if (opts->x_aarch64_tune_string == NULL)
14434 opts->x_aarch64_tune_string = selected_tune->name;
14436 aarch64_override_options_after_change_1 (opts);
14439 /* Print a hint with a suggestion for a core or architecture name that
14440 most closely resembles what the user passed in STR. ARCH is true if
14441 the user is asking for an architecture name. ARCH is false if the user
14442 is asking for a core name. */
14444 static void
14445 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
14447 auto_vec<const char *> candidates;
14448 const struct processor *entry = arch ? all_architectures : all_cores;
14449 for (; entry->name != NULL; entry++)
14450 candidates.safe_push (entry->name);
14452 #ifdef HAVE_LOCAL_CPU_DETECT
14453 /* Add also "native" as possible value. */
14454 if (arch)
14455 candidates.safe_push ("native");
14456 #endif
14458 char *s;
14459 const char *hint = candidates_list_and_hint (str, s, candidates);
14460 if (hint)
14461 inform (input_location, "valid arguments are: %s;"
14462 " did you mean %qs?", s, hint);
14463 else
14464 inform (input_location, "valid arguments are: %s", s);
14466 XDELETEVEC (s);
14469 /* Print a hint with a suggestion for a core name that most closely resembles
14470 what the user passed in STR. */
14472 inline static void
14473 aarch64_print_hint_for_core (const char *str)
14475 aarch64_print_hint_for_core_or_arch (str, false);
14478 /* Print a hint with a suggestion for an architecture name that most closely
14479 resembles what the user passed in STR. */
14481 inline static void
14482 aarch64_print_hint_for_arch (const char *str)
14484 aarch64_print_hint_for_core_or_arch (str, true);
14488 /* Print a hint with a suggestion for an extension name
14489 that most closely resembles what the user passed in STR. */
14491 void
14492 aarch64_print_hint_for_extensions (const std::string &str)
14494 auto_vec<const char *> candidates;
14495 aarch64_get_all_extension_candidates (&candidates);
14496 char *s;
14497 const char *hint = candidates_list_and_hint (str.c_str (), s, candidates);
14498 if (hint)
14499 inform (input_location, "valid arguments are: %s;"
14500 " did you mean %qs?", s, hint);
14501 else
14502 inform (input_location, "valid arguments are: %s;", s);
14504 XDELETEVEC (s);
14507 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
14508 specified in STR and throw errors if appropriate. Put the results if
14509 they are valid in RES and ISA_FLAGS. Return whether the option is
14510 valid. */
14512 static bool
14513 aarch64_validate_mcpu (const char *str, const struct processor **res,
14514 uint64_t *isa_flags)
14516 std::string invalid_extension;
14517 enum aarch64_parse_opt_result parse_res
14518 = aarch64_parse_cpu (str, res, isa_flags, &invalid_extension);
14520 if (parse_res == AARCH64_PARSE_OK)
14521 return true;
14523 switch (parse_res)
14525 case AARCH64_PARSE_MISSING_ARG:
14526 error ("missing cpu name in %<-mcpu=%s%>", str);
14527 break;
14528 case AARCH64_PARSE_INVALID_ARG:
14529 error ("unknown value %qs for %<-mcpu%>", str);
14530 aarch64_print_hint_for_core (str);
14531 break;
14532 case AARCH64_PARSE_INVALID_FEATURE:
14533 error ("invalid feature modifier %qs in %<-mcpu=%s%>",
14534 invalid_extension.c_str (), str);
14535 aarch64_print_hint_for_extensions (invalid_extension);
14536 break;
14537 default:
14538 gcc_unreachable ();
14541 return false;
14544 /* Straight line speculation indicators. */
14545 enum aarch64_sls_hardening_type
14547 SLS_NONE = 0,
14548 SLS_RETBR = 1,
14549 SLS_BLR = 2,
14550 SLS_ALL = 3,
14552 static enum aarch64_sls_hardening_type aarch64_sls_hardening;
14554 /* Return whether we should mitigate Straight Line Speculation for the RET
14555 and BR instructions. */
14556 bool
14557 aarch64_harden_sls_retbr_p (void)
14559 return aarch64_sls_hardening & SLS_RETBR;
14562 /* Return whether we should mitigate Straight Line Speculation for the BLR
14563 instruction. */
14564 bool
14565 aarch64_harden_sls_blr_p (void)
14567 return aarch64_sls_hardening & SLS_BLR;
14570 /* For now we only allow setting these options globally; in the future we may
14571 allow setting them per function. */
14572 static void
14573 aarch64_validate_sls_mitigation (const char *const_str)
14575 char *token_save = NULL;
14576 char *str = NULL;
14578 if (strcmp (const_str, "none") == 0)
14580 aarch64_sls_hardening = SLS_NONE;
14581 return;
14583 if (strcmp (const_str, "all") == 0)
14585 aarch64_sls_hardening = SLS_ALL;
14586 return;
14589 char *str_root = xstrdup (const_str);
14590 str = strtok_r (str_root, ",", &token_save);
14591 if (!str)
14592 error ("invalid argument given to %<-mharden-sls=%>");
14594 int temp = SLS_NONE;
14595 while (str)
14597 if (strcmp (str, "blr") == 0)
14598 temp |= SLS_BLR;
14599 else if (strcmp (str, "retbr") == 0)
14600 temp |= SLS_RETBR;
14601 else if (strcmp (str, "none") == 0 || strcmp (str, "all") == 0)
14603 error ("%<%s%> must be by itself for %<-mharden-sls=%>", str);
14604 break;
14606 else
14608 error ("invalid argument %<%s%> for %<-mharden-sls=%>", str);
14609 break;
14611 str = strtok_r (NULL, ",", &token_save);
14613 aarch64_sls_hardening = (aarch64_sls_hardening_type) temp;
14614 free (str_root);
14617 /* Parses CONST_STR for branch protection features specified in
14618 aarch64_branch_protect_types, and set any global variables required. Returns
14619 the parsing result and assigns LAST_STR to the last processed token from
14620 CONST_STR so that it can be used for error reporting. */
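/* For example (illustrative): "pac-ret+leaf+bti" first matches the pac-ret
   type, then its leaf subtype, and finally the bti type, calling the
   handler registered for each token.  */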
14622 static enum
14623 aarch64_parse_opt_result aarch64_parse_branch_protection (const char *const_str,
14624 char** last_str)
14626 char *str_root = xstrdup (const_str);
14627 char* token_save = NULL;
14628 char *str = strtok_r (str_root, "+", &token_save);
14629 enum aarch64_parse_opt_result res = AARCH64_PARSE_OK;
14630 if (!str)
14631 res = AARCH64_PARSE_MISSING_ARG;
14632 else
14634 char *next_str = strtok_r (NULL, "+", &token_save);
14635 /* Reset the branch protection features to their defaults. */
14636 aarch64_handle_no_branch_protection (NULL, NULL);
14638 while (str && res == AARCH64_PARSE_OK)
14640 const aarch64_branch_protect_type* type = aarch64_branch_protect_types;
14641 bool found = false;
14642 /* Search for this type. */
14643 while (type && type->name && !found && res == AARCH64_PARSE_OK)
14645 if (strcmp (str, type->name) == 0)
14647 found = true;
14648 res = type->handler (str, next_str);
14649 str = next_str;
14650 next_str = strtok_r (NULL, "+", &token_save);
14652 else
14653 type++;
14655 if (found && res == AARCH64_PARSE_OK)
14657 bool found_subtype = true;
14658 /* Loop through each token until we find one that isn't a
14659 subtype. */
14660 while (found_subtype)
14662 found_subtype = false;
14663 const aarch64_branch_protect_type *subtype = type->subtypes;
14664 /* Search for the subtype. */
14665 while (str && subtype && subtype->name && !found_subtype
14666 && res == AARCH64_PARSE_OK)
14668 if (strcmp (str, subtype->name) == 0)
14670 found_subtype = true;
14671 res = subtype->handler (str, next_str);
14672 str = next_str;
14673 next_str = strtok_r (NULL, "+", &token_save);
14675 else
14676 subtype++;
14680 else if (!found)
14681 res = AARCH64_PARSE_INVALID_ARG;
14684 /* Copy the last processed token into the argument to pass it back.
14685 Used by option and attribute validation to print the offending token. */
14686 if (last_str)
14688 if (str) strcpy (*last_str, str);
14689 else *last_str = NULL;
14691 if (res == AARCH64_PARSE_OK)
14693 /* If needed, alloc the accepted string then copy in const_str.
14694 Used by override_option_after_change_1. */
14695 if (!accepted_branch_protection_string)
14696 accepted_branch_protection_string = (char *) xmalloc (
14697 BRANCH_PROTECT_STR_MAX
14698 + 1);
14699 strncpy (accepted_branch_protection_string, const_str,
14700 BRANCH_PROTECT_STR_MAX + 1);
14701 /* Forcibly null-terminate. */
14702 accepted_branch_protection_string[BRANCH_PROTECT_STR_MAX] = '\0';
14704 return res;
14707 static bool
14708 aarch64_validate_mbranch_protection (const char *const_str)
14710 char *str = (char *) xmalloc (strlen (const_str) + 1);
14711 enum aarch64_parse_opt_result res =
14712 aarch64_parse_branch_protection (const_str, &str);
14713 if (res == AARCH64_PARSE_INVALID_ARG)
14714 error ("invalid argument %<%s%> for %<-mbranch-protection=%>", str);
14715 else if (res == AARCH64_PARSE_MISSING_ARG)
14716 error ("missing argument for %<-mbranch-protection=%>");
14717 free (str);
14718 return res == AARCH64_PARSE_OK;
14721 /* Validate a command-line -march option. Parse the arch and extensions
14722 (if any) specified in STR and throw errors if appropriate. Put the
14723 results, if they are valid, in RES and ISA_FLAGS. Return whether the
14724 option is valid. */
14726 static bool
14727 aarch64_validate_march (const char *str, const struct processor **res,
14728 uint64_t *isa_flags)
14730 std::string invalid_extension;
14731 enum aarch64_parse_opt_result parse_res
14732 = aarch64_parse_arch (str, res, isa_flags, &invalid_extension);
14734 if (parse_res == AARCH64_PARSE_OK)
14735 return true;
14737 switch (parse_res)
14739 case AARCH64_PARSE_MISSING_ARG:
14740 error ("missing arch name in %<-march=%s%>", str);
14741 break;
14742 case AARCH64_PARSE_INVALID_ARG:
14743 error ("unknown value %qs for %<-march%>", str);
14744 aarch64_print_hint_for_arch (str);
14745 break;
14746 case AARCH64_PARSE_INVALID_FEATURE:
14747 error ("invalid feature modifier %qs in %<-march=%s%>",
14748 invalid_extension.c_str (), str);
14749 aarch64_print_hint_for_extensions (invalid_extension);
14750 break;
14751 default:
14752 gcc_unreachable ();
14755 return false;
14758 /* Validate a command-line -mtune option. Parse the cpu
14759 specified in STR and throw errors if appropriate. Put the
14760 result, if it is valid, in RES. Return whether the option is
14761 valid. */
14763 static bool
14764 aarch64_validate_mtune (const char *str, const struct processor **res)
14766 enum aarch64_parse_opt_result parse_res
14767 = aarch64_parse_tune (str, res);
14769 if (parse_res == AARCH64_PARSE_OK)
14770 return true;
14772 switch (parse_res)
14774 case AARCH64_PARSE_MISSING_ARG:
14775 error ("missing cpu name in %<-mtune=%s%>", str);
14776 break;
14777 case AARCH64_PARSE_INVALID_ARG:
14778 error ("unknown value %qs for %<-mtune%>", str);
14779 aarch64_print_hint_for_core (str);
14780 break;
14781 default:
14782 gcc_unreachable ();
14784 return false;
14787 /* Return the CPU corresponding to the enum CPU.
14788 If it doesn't specify a cpu, return the default. */
14790 static const struct processor *
14791 aarch64_get_tune_cpu (enum aarch64_processor cpu)
14793 if (cpu != aarch64_none)
14794 return &all_cores[cpu];
14796 /* The & 0x3f is to extract the bottom 6 bits that encode the
14797 default cpu as selected by the --with-cpu GCC configure option
14798 in config.gcc.
14799 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
14800 flags mechanism should be reworked to make it more sane. */
14801 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
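/* Sketch of the TARGET_CPU_DEFAULT packing that the "& 0x3f" above and
   the ">> 6" in aarch64_override_options rely on: config.gcc places
   the configured default core's aarch64_processor value in the low 6
   bits and that core's default ISA flag bits above them.  */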
14804 /* Return the architecture corresponding to the enum ARCH.
14805 If it doesn't specify a valid architecture, return the default. */
14807 static const struct processor *
14808 aarch64_get_arch (enum aarch64_arch arch)
14810 if (arch != aarch64_no_arch)
14811 return &all_architectures[arch];
14813 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
14815 return &all_architectures[cpu->arch];
14818 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
14820 static poly_uint16
14821 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
14823 /* 128-bit SVE and Advanced SIMD modes use different register layouts
14824 on big-endian targets, so we would need to forbid subregs that convert
14825 from one to the other. By default a reinterpret sequence would then
14826 involve a store to memory in one mode and a load back in the other.
14827 Even if we optimize that sequence using reverse instructions,
14828 it would still be a significant potential overhead.
14830 For now, it seems better to generate length-agnostic code for that
14831 case instead. */
14832 if (value == SVE_SCALABLE
14833 || (value == SVE_128 && BYTES_BIG_ENDIAN))
14834 return poly_uint16 (2, 2);
14835 else
14836 return (int) value / 64;
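/* Worked example for the conversion above: -msve-vector-bits=256 gives
   SVE_256 and 256 / 64 = 4, i.e. a fixed granule count of 4.
   -msve-vector-bits=scalable (and the big-endian 128-bit case) instead
   yields the length-agnostic poly_uint16 (2, 2), i.e. 2 + 2 * x 64-bit
   granules for an unknown runtime multiplier x.  */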
14839 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
14840 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
14841 tuning structs. In particular it must set selected_tune and
14842 aarch64_isa_flags that define the available ISA features and tuning
14843 decisions. It must also set selected_arch as this will be used to
14844 output the .arch asm tags for each function. */
14846 static void
14847 aarch64_override_options (void)
14849 uint64_t cpu_isa = 0;
14850 uint64_t arch_isa = 0;
14851 aarch64_isa_flags = 0;
14853 bool valid_cpu = true;
14854 bool valid_tune = true;
14855 bool valid_arch = true;
14857 selected_cpu = NULL;
14858 selected_arch = NULL;
14859 selected_tune = NULL;
14861 if (aarch64_harden_sls_string)
14862 aarch64_validate_sls_mitigation (aarch64_harden_sls_string);
14864 if (aarch64_branch_protection_string)
14865 aarch64_validate_mbranch_protection (aarch64_branch_protection_string);
14867 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
14868 If either of -march or -mtune is given, they override their
14869 respective component of -mcpu. */
14870 if (aarch64_cpu_string)
14871 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
14872 &cpu_isa);
14874 if (aarch64_arch_string)
14875 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
14876 &arch_isa);
14878 if (aarch64_tune_string)
14879 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
14881 #ifdef SUBTARGET_OVERRIDE_OPTIONS
14882 SUBTARGET_OVERRIDE_OPTIONS;
14883 #endif
14885 /* If the user did not specify a processor, choose the default
14886 one for them. This will be the CPU set during configuration using
14887 --with-cpu, otherwise it is "generic". */
14888 if (!selected_cpu)
14890 if (selected_arch)
14892 selected_cpu = &all_cores[selected_arch->ident];
14893 aarch64_isa_flags = arch_isa;
14894 explicit_arch = selected_arch->arch;
14896 else
14898 /* Get default configure-time CPU. */
14899 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
14900 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
14903 if (selected_tune)
14904 explicit_tune_core = selected_tune->ident;
14906 /* If both -mcpu and -march are specified check that they are architecturally
14907 compatible, warn if they're not and prefer the -march ISA flags. */
14908 else if (selected_arch)
14910 if (selected_arch->arch != selected_cpu->arch)
14912 warning (0, "switch %<-mcpu=%s%> conflicts with %<-march=%s%> switch",
14913 aarch64_cpu_string,
14914 aarch64_arch_string);
14916 aarch64_isa_flags = arch_isa;
14917 explicit_arch = selected_arch->arch;
14918 explicit_tune_core = selected_tune ? selected_tune->ident
14919 : selected_cpu->ident;
14921 else
14923 /* -mcpu but no -march. */
14924 aarch64_isa_flags = cpu_isa;
14925 explicit_tune_core = selected_tune ? selected_tune->ident
14926 : selected_cpu->ident;
14927 gcc_assert (selected_cpu);
14928 selected_arch = &all_architectures[selected_cpu->arch];
14929 explicit_arch = selected_arch->arch;
14932 /* Set the arch as well, as we will need it when outputting
14933 the .arch directive in assembly. */
14934 if (!selected_arch)
14936 gcc_assert (selected_cpu);
14937 selected_arch = &all_architectures[selected_cpu->arch];
14940 if (!selected_tune)
14941 selected_tune = selected_cpu;
14943 if (aarch64_enable_bti == 2)
14945 #ifdef TARGET_ENABLE_BTI
14946 aarch64_enable_bti = 1;
14947 #else
14948 aarch64_enable_bti = 0;
14949 #endif
14952 /* Return address signing is currently not supported for ILP32 targets. For
14953 LP64 targets use the configured option in the absence of a command-line
14954 option for -mbranch-protection. */
14955 if (!TARGET_ILP32 && accepted_branch_protection_string == NULL)
14957 #ifdef TARGET_ENABLE_PAC_RET
14958 aarch64_ra_sign_scope = AARCH64_FUNCTION_NON_LEAF;
14959 #else
14960 aarch64_ra_sign_scope = AARCH64_FUNCTION_NONE;
14961 #endif
14964 #ifndef HAVE_AS_MABI_OPTION
14965 /* The compiler may have been configured with 2.23.* binutils, which does
14966 not have support for ILP32. */
14967 if (TARGET_ILP32)
14968 error ("assembler does not support %<-mabi=ilp32%>");
14969 #endif
14971 /* Convert -msve-vector-bits to a VG count. */
14972 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
14974 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
14975 sorry ("return address signing is only supported for %<-mabi=lp64%>");
14977 /* Make sure we properly set up the explicit options. */
14978 if ((aarch64_cpu_string && valid_cpu)
14979 || (aarch64_tune_string && valid_tune))
14980 gcc_assert (explicit_tune_core != aarch64_none);
14982 if ((aarch64_cpu_string && valid_cpu)
14983 || (aarch64_arch_string && valid_arch))
14984 gcc_assert (explicit_arch != aarch64_no_arch);
14986 /* The pass to insert speculation tracking runs before
14987 shrink-wrapping and the latter does not know how to update the
14988 tracking status. So disable it in this case. */
14989 if (aarch64_track_speculation)
14990 flag_shrink_wrap = 0;
14992 aarch64_override_options_internal (&global_options);
14994 /* Save these options as the default ones in case we push and pop them later
14995 while processing functions with potential target attributes. */
14996 target_option_default_node = target_option_current_node
14997 = build_target_option_node (&global_options);
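/* Illustrative combinations of the options handled above (example
   values only): "-mcpu=cortex-a57" alone selects the core, its
   architecture and its ISA flags; adding "-march=armv8.2-a" makes the
   -march ISA flags win and triggers the conflict warning, since
   cortex-a57 implements armv8-a; adding "-mtune=cortex-a72" overrides
   only the tuning choice.  */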
15000 /* Implement targetm.override_options_after_change. */
15002 static void
15003 aarch64_override_options_after_change (void)
15005 aarch64_override_options_after_change_1 (&global_options);
15008 /* Implement the TARGET_OFFLOAD_OPTIONS hook. */
15009 static char *
15010 aarch64_offload_options (void)
15012 if (TARGET_ILP32)
15013 return xstrdup ("-foffload-abi=ilp32");
15014 else
15015 return xstrdup ("-foffload-abi=lp64");
15018 static struct machine_function *
15019 aarch64_init_machine_status (void)
15021 struct machine_function *machine;
15022 machine = ggc_cleared_alloc<machine_function> ();
15023 return machine;
15026 void
15027 aarch64_init_expanders (void)
15029 init_machine_status = aarch64_init_machine_status;
15032 /* A checking mechanism for the implementation of the various code models. */
15033 static void
15034 initialize_aarch64_code_model (struct gcc_options *opts)
15036 aarch64_cmodel = opts->x_aarch64_cmodel_var;
15037 switch (opts->x_aarch64_cmodel_var)
15039 case AARCH64_CMODEL_TINY:
15040 if (opts->x_flag_pic)
15041 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
15042 break;
15043 case AARCH64_CMODEL_SMALL:
15044 if (opts->x_flag_pic)
15046 #ifdef HAVE_AS_SMALL_PIC_RELOCS
15047 aarch64_cmodel = (flag_pic == 2
15048 ? AARCH64_CMODEL_SMALL_PIC
15049 : AARCH64_CMODEL_SMALL_SPIC);
15050 #else
15051 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
15052 #endif
15054 break;
15055 case AARCH64_CMODEL_LARGE:
15056 if (opts->x_flag_pic)
15057 sorry ("code model %qs with %<-f%s%>", "large",
15058 opts->x_flag_pic > 1 ? "PIC" : "pic");
15059 if (opts->x_aarch64_abi == AARCH64_ABI_ILP32)
15060 sorry ("code model %qs not supported in ilp32 mode", "large");
15061 break;
15062 case AARCH64_CMODEL_TINY_PIC:
15063 case AARCH64_CMODEL_SMALL_PIC:
15064 case AARCH64_CMODEL_SMALL_SPIC:
15065 gcc_unreachable ();
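/* For example (illustrative): "-mcmodel=small -fPIC" selects
   AARCH64_CMODEL_SMALL_PIC, while "-fpic" selects
   AARCH64_CMODEL_SMALL_SPIC when the assembler supports the small GOT
   relocations (HAVE_AS_SMALL_PIC_RELOCS); "-mcmodel=large" combined
   with any PIC flag is rejected with a sorry () above.  */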
15069 /* Implement TARGET_OPTION_SAVE. */
15071 static void
15072 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
15074 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
15075 ptr->x_aarch64_branch_protection_string
15076 = opts->x_aarch64_branch_protection_string;
15079 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
15080 using the information saved in PTR. */
15082 static void
15083 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
15085 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
15086 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15087 opts->x_explicit_arch = ptr->x_explicit_arch;
15088 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
15089 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
15090 opts->x_aarch64_branch_protection_string
15091 = ptr->x_aarch64_branch_protection_string;
15092 if (opts->x_aarch64_branch_protection_string)
15094 aarch64_parse_branch_protection (opts->x_aarch64_branch_protection_string,
15095 NULL);
15098 aarch64_override_options_internal (opts);
15101 /* Implement TARGET_OPTION_PRINT. */
15103 static void
15104 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
15106 const struct processor *cpu
15107 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
15108 uint64_t isa_flags = ptr->x_aarch64_isa_flags;
15109 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
15110 std::string extension
15111 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
15113 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
15114 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
15115 arch->name, extension.c_str ());
15118 static GTY(()) tree aarch64_previous_fndecl;
15120 void
15121 aarch64_reset_previous_fndecl (void)
15123 aarch64_previous_fndecl = NULL;
15126 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
15127 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
15128 make sure optab availability predicates are recomputed when necessary. */
15130 void
15131 aarch64_save_restore_target_globals (tree new_tree)
15133 if (TREE_TARGET_GLOBALS (new_tree))
15134 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
15135 else if (new_tree == target_option_default_node)
15136 restore_target_globals (&default_target_globals);
15137 else
15138 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
15141 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
15142 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
15143 of the function, if such exists. This function may be called multiple
15144 times on a single function so use aarch64_previous_fndecl to avoid
15145 setting up identical state. */
15147 static void
15148 aarch64_set_current_function (tree fndecl)
15150 if (!fndecl || fndecl == aarch64_previous_fndecl)
15151 return;
15153 tree old_tree = (aarch64_previous_fndecl
15154 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
15155 : NULL_TREE);
15157 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15159 /* If current function has no attributes but the previous one did,
15160 use the default node. */
15161 if (!new_tree && old_tree)
15162 new_tree = target_option_default_node;
15164 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
15165 the default have been handled by aarch64_save_restore_target_globals from
15166 aarch64_pragma_target_parse. */
15167 if (old_tree == new_tree)
15168 return;
15170 aarch64_previous_fndecl = fndecl;
15172 /* First set the target options. */
15173 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
15175 aarch64_save_restore_target_globals (new_tree);
15178 /* Enum describing the various ways we can handle attributes.
15179 In many cases we can reuse the generic option handling machinery. */
15181 enum aarch64_attr_opt_type
15183 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
15184 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
15185 aarch64_attr_enum, /* Attribute sets an enum variable. */
15186 aarch64_attr_custom /* Attribute requires a custom handling function. */
15189 /* All the information needed to handle a target attribute.
15190 NAME is the name of the attribute.
15191 ATTR_TYPE specifies the type of behavior of the attribute as described
15192 in the definition of enum aarch64_attr_opt_type.
15193 ALLOW_NEG is true if the attribute supports a "no-" form.
15194 HANDLER is the function that takes the attribute string as an argument.
15195 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
15196 OPT_NUM is the enum specifying the option that the attribute modifies.
15197 This is needed for attributes that mirror the behavior of a command-line
15198 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool
15199 or aarch64_attr_enum. */
15201 struct aarch64_attribute_info
15203 const char *name;
15204 enum aarch64_attr_opt_type attr_type;
15205 bool allow_neg;
15206 bool (*handler) (const char *);
15207 enum opt_code opt_num;
15210 /* Handle the ARCH_STR argument to the arch= target attribute. */
15212 static bool
15213 aarch64_handle_attr_arch (const char *str)
15215 const struct processor *tmp_arch = NULL;
15216 std::string invalid_extension;
15217 enum aarch64_parse_opt_result parse_res
15218 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags, &invalid_extension);
15220 if (parse_res == AARCH64_PARSE_OK)
15222 gcc_assert (tmp_arch);
15223 selected_arch = tmp_arch;
15224 explicit_arch = selected_arch->arch;
15225 return true;
15228 switch (parse_res)
15230 case AARCH64_PARSE_MISSING_ARG:
15231 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
15232 break;
15233 case AARCH64_PARSE_INVALID_ARG:
15234 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
15235 aarch64_print_hint_for_arch (str);
15236 break;
15237 case AARCH64_PARSE_INVALID_FEATURE:
15238 error ("invalid feature modifier %s of value (\"%s\") in "
15239 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15240 aarch64_print_hint_for_extensions (invalid_extension);
15241 break;
15242 default:
15243 gcc_unreachable ();
15246 return false;
15249 /* Handle the argument CPU_STR to the cpu= target attribute. */
15251 static bool
15252 aarch64_handle_attr_cpu (const char *str)
15254 const struct processor *tmp_cpu = NULL;
15255 std::string invalid_extension;
15256 enum aarch64_parse_opt_result parse_res
15257 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags, &invalid_extension);
15259 if (parse_res == AARCH64_PARSE_OK)
15261 gcc_assert (tmp_cpu);
15262 selected_tune = tmp_cpu;
15263 explicit_tune_core = selected_tune->ident;
15265 selected_arch = &all_architectures[tmp_cpu->arch];
15266 explicit_arch = selected_arch->arch;
15267 return true;
15270 switch (parse_res)
15272 case AARCH64_PARSE_MISSING_ARG:
15273 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
15274 break;
15275 case AARCH64_PARSE_INVALID_ARG:
15276 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
15277 aarch64_print_hint_for_core (str);
15278 break;
15279 case AARCH64_PARSE_INVALID_FEATURE:
15280 error ("invalid feature modifier %s of value (\"%s\") in "
15281 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15282 aarch64_print_hint_for_extensions (invalid_extension);
15283 break;
15284 default:
15285 gcc_unreachable ();
15288 return false;
15291 /* Handle the argument STR to the branch-protection= attribute. */
15293 static bool
15294 aarch64_handle_attr_branch_protection (const char* str)
15296 char *err_str = (char *) xmalloc (strlen (str) + 1);
15297 enum aarch64_parse_opt_result res = aarch64_parse_branch_protection (str,
15298 &err_str);
15299 bool success = false;
15300 switch (res)
15302 case AARCH64_PARSE_MISSING_ARG:
15303 error ("missing argument to %<target(\"branch-protection=\")%> pragma or"
15304 " attribute");
15305 break;
15306 case AARCH64_PARSE_INVALID_ARG:
15307 error ("invalid protection type (\"%s\") in %<target(\"branch-protection"
15308 "=\")%> pragma or attribute", err_str);
15309 break;
15310 case AARCH64_PARSE_OK:
15311 success = true;
15312 /* Fall through. */
15313 case AARCH64_PARSE_INVALID_FEATURE:
15314 break;
15315 default:
15316 gcc_unreachable ();
15318 free (err_str);
15319 return success;
15322 /* Handle the argument STR to the tune= target attribute. */
15324 static bool
15325 aarch64_handle_attr_tune (const char *str)
15327 const struct processor *tmp_tune = NULL;
15328 enum aarch64_parse_opt_result parse_res
15329 = aarch64_parse_tune (str, &tmp_tune);
15331 if (parse_res == AARCH64_PARSE_OK)
15333 gcc_assert (tmp_tune);
15334 selected_tune = tmp_tune;
15335 explicit_tune_core = selected_tune->ident;
15336 return true;
15339 switch (parse_res)
15341 case AARCH64_PARSE_INVALID_ARG:
15342 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
15343 aarch64_print_hint_for_core (str);
15344 break;
15345 default:
15346 gcc_unreachable ();
15349 return false;
15352 /* Parse an architecture extensions target attribute string specified in STR.
15353 For example "+fp+nosimd". Show any errors if needed. Return TRUE
15354 if successful. Update aarch64_isa_flags to reflect the ISA features
15355 modified. */
15357 static bool
15358 aarch64_handle_attr_isa_flags (char *str)
15360 enum aarch64_parse_opt_result parse_res;
15361 uint64_t isa_flags = aarch64_isa_flags;
15363 /* We allow "+nothing" in the beginning to clear out all architectural
15364 features if the user wants to handpick specific features. */
15365 if (strncmp ("+nothing", str, 8) == 0)
15367 isa_flags = 0;
15368 str += 8;
15371 std::string invalid_extension;
15372 parse_res = aarch64_parse_extension (str, &isa_flags, &invalid_extension);
15374 if (parse_res == AARCH64_PARSE_OK)
15376 aarch64_isa_flags = isa_flags;
15377 return true;
15380 switch (parse_res)
15382 case AARCH64_PARSE_MISSING_ARG:
15383 error ("missing value in %<target()%> pragma or attribute");
15384 break;
15386 case AARCH64_PARSE_INVALID_FEATURE:
15387 error ("invalid feature modifier %s of value (\"%s\") in "
15388 "%<target()%> pragma or attribute", invalid_extension.c_str (), str);
15389 break;
15391 default:
15392 gcc_unreachable ();
15395 return false;
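/* Example of the handling above (illustrative): the attribute string
   "+nothing+simd" first clears every architectural feature and then
   lets aarch64_parse_extension switch "+simd" (and whatever it
   implies) back on, whereas a plain "+nosimd" only removes SIMD from
   the currently selected set.  */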
15398 /* The target attributes that we support. On top of these we also support just
15399 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
15400 handled explicitly in aarch64_process_one_target_attr. */
15402 static const struct aarch64_attribute_info aarch64_attributes[] =
15404 { "general-regs-only", aarch64_attr_mask, false, NULL,
15405 OPT_mgeneral_regs_only },
15406 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
15407 OPT_mfix_cortex_a53_835769 },
15408 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
15409 OPT_mfix_cortex_a53_843419 },
15410 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
15411 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
15412 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
15413 OPT_momit_leaf_frame_pointer },
15414 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
15415 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
15416 OPT_march_ },
15417 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
15418 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
15419 OPT_mtune_ },
15420 { "branch-protection", aarch64_attr_custom, false,
15421 aarch64_handle_attr_branch_protection, OPT_mbranch_protection_ },
15422 { "sign-return-address", aarch64_attr_enum, false, NULL,
15423 OPT_msign_return_address_ },
15424 { "outline-atomics", aarch64_attr_bool, true, NULL,
15425 OPT_moutline_atomics},
15426 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
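/* Usage sketch for the table above (example code, not part of the
   port):

     __attribute__ ((target ("arch=armv8.2-a+crc,no-strict-align")))
     void example (void);

   "arch=..." is routed to aarch64_handle_attr_arch through the "arch"
   entry, while "no-strict-align" goes through the aarch64_attr_mask
   entry for "strict-align" in its negated form.  */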
15429 /* Parse ARG_STR which contains the definition of one target attribute.
15430 Show appropriate errors if any or return true if the attribute is valid. */
15432 static bool
15433 aarch64_process_one_target_attr (char *arg_str)
15435 bool invert = false;
15437 size_t len = strlen (arg_str);
15439 if (len == 0)
15441 error ("malformed %<target()%> pragma or attribute");
15442 return false;
15445 char *str_to_check = (char *) alloca (len + 1);
15446 strcpy (str_to_check, arg_str);
15448 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
15449 It is easier to detect and handle it explicitly here rather than going
15450 through the machinery for the rest of the target attributes in this
15451 function. */
15452 if (*str_to_check == '+')
15453 return aarch64_handle_attr_isa_flags (str_to_check);
15455 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
15457 invert = true;
15458 str_to_check += 3;
15460 char *arg = strchr (str_to_check, '=');
15462 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
15463 and point ARG to "foo". */
15464 if (arg)
15466 *arg = '\0';
15467 arg++;
15469 const struct aarch64_attribute_info *p_attr;
15470 bool found = false;
15471 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
15473 /* If the names don't match up, or the user has given an argument
15474 to an attribute that doesn't accept one, or didn't give an argument
15475 to an attribute that expects one, fail to match. */
15476 if (strcmp (str_to_check, p_attr->name) != 0)
15477 continue;
15479 found = true;
15480 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
15481 || p_attr->attr_type == aarch64_attr_enum;
15483 if (attr_need_arg_p ^ (arg != NULL))
15485 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
15486 return false;
15489 /* If the name matches but the attribute does not allow "no-" versions
15490 then we can't match. */
15491 if (invert && !p_attr->allow_neg)
15493 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
15494 return false;
15497 switch (p_attr->attr_type)
15499 /* Has a custom handler registered.
15500 For example, cpu=, arch=, tune=. */
15501 case aarch64_attr_custom:
15502 gcc_assert (p_attr->handler);
15503 if (!p_attr->handler (arg))
15504 return false;
15505 break;
15507 /* Either set or unset a boolean option. */
15508 case aarch64_attr_bool:
15510 struct cl_decoded_option decoded;
15512 generate_option (p_attr->opt_num, NULL, !invert,
15513 CL_TARGET, &decoded);
15514 aarch64_handle_option (&global_options, &global_options_set,
15515 &decoded, input_location);
15516 break;
15518 /* Set or unset a bit in the target_flags. aarch64_handle_option
15519 should know what mask to apply given the option number. */
15520 case aarch64_attr_mask:
15522 struct cl_decoded_option decoded;
15523 /* We only need to specify the option number.
15524 aarch64_handle_option will know which mask to apply. */
15525 decoded.opt_index = p_attr->opt_num;
15526 decoded.value = !invert;
15527 aarch64_handle_option (&global_options, &global_options_set,
15528 &decoded, input_location);
15529 break;
15531 /* Use the option setting machinery to set an option to an enum. */
15532 case aarch64_attr_enum:
15534 gcc_assert (arg);
15535 bool valid;
15536 int value;
15537 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
15538 &value, CL_TARGET);
15539 if (valid)
15541 set_option (&global_options, NULL, p_attr->opt_num, value,
15542 NULL, DK_UNSPECIFIED, input_location,
15543 global_dc);
15545 else
15547 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
15549 break;
15551 default:
15552 gcc_unreachable ();
15556 /* If we reached here we either have found an attribute and validated
15557 it or didn't match any. If we matched an attribute but its arguments
15558 were malformed we will have returned false already. */
15559 return found;
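/* Walking through the code above with "no-omit-leaf-frame-pointer"
   (illustrative): the "no-" prefix sets INVERT, the remaining name
   matches the aarch64_attr_bool entry for "omit-leaf-frame-pointer",
   and generate_option is called with value !INVERT, i.e. 0, switching
   the option off for this function.  */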
15562 /* Count how many times the character C appears in
15563 NULL-terminated string STR. */
15565 static unsigned int
15566 num_occurences_in_str (char c, char *str)
15568 unsigned int res = 0;
15569 while (*str != '\0')
15571 if (*str == c)
15572 res++;
15574 str++;
15577 return res;
15580 /* Parse the tree in ARGS that contains the target attribute information
15581 and update the global target options space. */
15583 bool
15584 aarch64_process_target_attr (tree args)
15586 if (TREE_CODE (args) == TREE_LIST)
15590 tree head = TREE_VALUE (args);
15591 if (head)
15593 if (!aarch64_process_target_attr (head))
15594 return false;
15596 args = TREE_CHAIN (args);
15597 } while (args);
15599 return true;
15602 if (TREE_CODE (args) != STRING_CST)
15604 error ("attribute %<target%> argument not a string");
15605 return false;
15608 size_t len = strlen (TREE_STRING_POINTER (args));
15609 char *str_to_check = (char *) alloca (len + 1);
15610 strcpy (str_to_check, TREE_STRING_POINTER (args));
15612 if (len == 0)
15614 error ("malformed %<target()%> pragma or attribute");
15615 return false;
15618 /* Used to catch empty tokens between commas, i.e.
15619 attribute ((target ("attr1,,attr2"))). */
15620 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
15622 /* Handle multiple target attributes separated by ','. */
15623 char *token = strtok_r (str_to_check, ",", &str_to_check);
15625 unsigned int num_attrs = 0;
15626 while (token)
15628 num_attrs++;
15629 if (!aarch64_process_one_target_attr (token))
15631 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
15632 return false;
15635 token = strtok_r (NULL, ",", &str_to_check);
15638 if (num_attrs != num_commas + 1)
15640 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
15641 return false;
15644 return true;
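/* For instance (illustrative),
   target ("arch=armv8-a,tune=cortex-a75") is split into two tokens,
   each handled by aarch64_process_one_target_attr, while
   target ("arch=armv8-a,,tune=cortex-a75") has two commas but only two
   non-empty tokens, so the NUM_ATTRS != NUM_COMMAS + 1 check above
   reports it as malformed.  */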
15647 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
15648 process attribute ((target ("..."))). */
15650 static bool
15651 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
15653 struct cl_target_option cur_target;
15654 bool ret;
15655 tree old_optimize;
15656 tree new_target, new_optimize;
15657 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
15659 /* If what we're processing is the current pragma string then the
15660 target option node is already stored in target_option_current_node
15661 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
15662 having to re-parse the string. This is especially useful to keep
15663 arm_neon.h compile times down since that header contains a lot
15664 of intrinsics enclosed in pragmas. */
15665 if (!existing_target && args == current_target_pragma)
15667 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
15668 return true;
15670 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15672 old_optimize = build_optimization_node (&global_options);
15673 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
15675 /* If the function changed the optimization levels as well as setting
15676 target options, start with the optimizations specified. */
15677 if (func_optimize && func_optimize != old_optimize)
15678 cl_optimization_restore (&global_options,
15679 TREE_OPTIMIZATION (func_optimize));
15681 /* Save the current target options to restore at the end. */
15682 cl_target_option_save (&cur_target, &global_options);
15684 /* If fndecl already has some target attributes applied to it, unpack
15685 them so that we add this attribute on top of them, rather than
15686 overwriting them. */
15687 if (existing_target)
15689 struct cl_target_option *existing_options
15690 = TREE_TARGET_OPTION (existing_target);
15692 if (existing_options)
15693 cl_target_option_restore (&global_options, existing_options);
15695 else
15696 cl_target_option_restore (&global_options,
15697 TREE_TARGET_OPTION (target_option_current_node));
15699 ret = aarch64_process_target_attr (args);
15701 /* Set up any additional state. */
15702 if (ret)
15704 aarch64_override_options_internal (&global_options);
15705 /* Initialize SIMD builtins if we haven't already.
15706 Set current_target_pragma to NULL for the duration so that
15707 the builtin initialization code doesn't try to tag the functions
15708 being built with the attributes specified by any current pragma, thus
15709 going into an infinite recursion. */
15710 if (TARGET_SIMD)
15712 tree saved_current_target_pragma = current_target_pragma;
15713 current_target_pragma = NULL;
15714 aarch64_init_simd_builtins ();
15715 current_target_pragma = saved_current_target_pragma;
15717 new_target = build_target_option_node (&global_options);
15719 else
15720 new_target = NULL;
15722 new_optimize = build_optimization_node (&global_options);
15724 if (fndecl && ret)
15726 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
15728 if (old_optimize != new_optimize)
15729 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
15732 cl_target_option_restore (&global_options, &cur_target);
15734 if (old_optimize != new_optimize)
15735 cl_optimization_restore (&global_options,
15736 TREE_OPTIMIZATION (old_optimize));
15737 return ret;
15740 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
15741 tri-bool options (yes, no, don't care) and the default value is
15742 DEF, determine whether inlining is allowed. */
15744 static bool
15745 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
15746 int dont_care, int def)
15748 /* If the callee doesn't care, always allow inlining. */
15749 if (callee == dont_care)
15750 return true;
15752 /* If the caller doesn't care, always allow inlining. */
15753 if (caller == dont_care)
15754 return true;
15756 /* Otherwise, allow inlining if either the callee and caller values
15757 agree, or if the callee is using the default value. */
15758 return (callee == caller || callee == def);
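/* Sample evaluations of the rule above, with DONT_CARE = 2 and DEF = 1
   as in the -momit-leaf-frame-pointer call below: callee 2 -> allow;
   caller 2 -> allow; caller 1, callee 1 -> allow (they agree);
   caller 0, callee 1 -> allow (callee uses the default);
   caller 1, callee 0 -> reject.  */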
15761 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
15762 to inline CALLEE into CALLER based on target-specific info.
15763 Make sure that the caller and callee have compatible architectural
15764 features. Then go through the other possible target attributes
15765 and see if they can block inlining. Try not to reject always_inline
15766 callees unless they are incompatible architecturally. */
15768 static bool
15769 aarch64_can_inline_p (tree caller, tree callee)
15771 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
15772 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
15774 struct cl_target_option *caller_opts
15775 = TREE_TARGET_OPTION (caller_tree ? caller_tree
15776 : target_option_default_node);
15778 struct cl_target_option *callee_opts
15779 = TREE_TARGET_OPTION (callee_tree ? callee_tree
15780 : target_option_default_node);
15782 /* Callee's ISA flags should be a subset of the caller's. */
15783 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
15784 != callee_opts->x_aarch64_isa_flags)
15785 return false;
15787 /* Allow non-strict aligned functions inlining into strict
15788 aligned ones. */
15789 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
15790 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
15791 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
15792 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
15793 return false;
15795 bool always_inline = lookup_attribute ("always_inline",
15796 DECL_ATTRIBUTES (callee));
15798 /* If the architectural features match up and the callee is always_inline
15799 then the other attributes don't matter. */
15800 if (always_inline)
15801 return true;
15803 if (caller_opts->x_aarch64_cmodel_var
15804 != callee_opts->x_aarch64_cmodel_var)
15805 return false;
15807 if (caller_opts->x_aarch64_tls_dialect
15808 != callee_opts->x_aarch64_tls_dialect)
15809 return false;
15811 /* Honour explicit requests to workaround errata. */
15812 if (!aarch64_tribools_ok_for_inlining_p (
15813 caller_opts->x_aarch64_fix_a53_err835769,
15814 callee_opts->x_aarch64_fix_a53_err835769,
15815 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
15816 return false;
15818 if (!aarch64_tribools_ok_for_inlining_p (
15819 caller_opts->x_aarch64_fix_a53_err843419,
15820 callee_opts->x_aarch64_fix_a53_err843419,
15821 2, TARGET_FIX_ERR_A53_843419))
15822 return false;
15824 /* If the user explicitly specified -momit-leaf-frame-pointer for the
15825 caller and callee and they don't match up, reject inlining. */
15826 if (!aarch64_tribools_ok_for_inlining_p (
15827 caller_opts->x_flag_omit_leaf_frame_pointer,
15828 callee_opts->x_flag_omit_leaf_frame_pointer,
15829 2, 1))
15830 return false;
15832 /* If the callee has specific tuning overrides, respect them. */
15833 if (callee_opts->x_aarch64_override_tune_string != NULL
15834 && caller_opts->x_aarch64_override_tune_string == NULL)
15835 return false;
15837 /* If the user specified tuning override strings for the
15838 caller and callee and they don't match up, reject inlining.
15839 We just do a string compare here, we don't analyze the meaning
15840 of the string, as it would be too costly for little gain. */
15841 if (callee_opts->x_aarch64_override_tune_string
15842 && caller_opts->x_aarch64_override_tune_string
15843 && (strcmp (callee_opts->x_aarch64_override_tune_string,
15844 caller_opts->x_aarch64_override_tune_string) != 0))
15845 return false;
15847 return true;
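/* Example outcome of the checks above (illustrative): a callee marked
   __attribute__ ((target ("+crypto"))) cannot be inlined into a caller
   compiled without +crypto, because its ISA flags are not a subset of
   the caller's; marking the callee always_inline does not lift that
   restriction, it only skips the later cmodel, TLS dialect, errata and
   tuning checks.  */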
15850 /* Return the ID of the TLSDESC ABI, initializing the descriptor if it hasn't
15851 been already. */
15853 unsigned int
15854 aarch64_tlsdesc_abi_id ()
15856 predefined_function_abi &tlsdesc_abi = function_abis[ARM_PCS_TLSDESC];
15857 if (!tlsdesc_abi.initialized_p ())
15859 HARD_REG_SET full_reg_clobbers;
15860 CLEAR_HARD_REG_SET (full_reg_clobbers);
15861 SET_HARD_REG_BIT (full_reg_clobbers, R0_REGNUM);
15862 SET_HARD_REG_BIT (full_reg_clobbers, CC_REGNUM);
15863 for (int regno = P0_REGNUM; regno <= P15_REGNUM; ++regno)
15864 SET_HARD_REG_BIT (full_reg_clobbers, regno);
15865 tlsdesc_abi.initialize (ARM_PCS_TLSDESC, full_reg_clobbers);
15867 return tlsdesc_abi.id ();
15870 /* Return true if SYMBOL_REF X binds locally. */
15872 static bool
15873 aarch64_symbol_binds_local_p (const_rtx x)
15875 return (SYMBOL_REF_DECL (x)
15876 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
15877 : SYMBOL_REF_LOCAL_P (x));
15880 /* Return true if SYMBOL_REF X is thread local.  */
15881 static bool
15882 aarch64_tls_symbol_p (rtx x)
15884 if (! TARGET_HAVE_TLS)
15885 return false;
15887 if (GET_CODE (x) != SYMBOL_REF)
15888 return false;
15890 return SYMBOL_REF_TLS_MODEL (x) != 0;
15893 /* Classify a TLS symbol into one of the TLS kinds. */
15894 enum aarch64_symbol_type
15895 aarch64_classify_tls_symbol (rtx x)
15897 enum tls_model tls_kind = tls_symbolic_operand_type (x);
15899 switch (tls_kind)
15901 case TLS_MODEL_GLOBAL_DYNAMIC:
15902 case TLS_MODEL_LOCAL_DYNAMIC:
15903 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
15905 case TLS_MODEL_INITIAL_EXEC:
15906 switch (aarch64_cmodel)
15908 case AARCH64_CMODEL_TINY:
15909 case AARCH64_CMODEL_TINY_PIC:
15910 return SYMBOL_TINY_TLSIE;
15911 default:
15912 return SYMBOL_SMALL_TLSIE;
15915 case TLS_MODEL_LOCAL_EXEC:
15916 if (aarch64_tls_size == 12)
15917 return SYMBOL_TLSLE12;
15918 else if (aarch64_tls_size == 24)
15919 return SYMBOL_TLSLE24;
15920 else if (aarch64_tls_size == 32)
15921 return SYMBOL_TLSLE32;
15922 else if (aarch64_tls_size == 48)
15923 return SYMBOL_TLSLE48;
15924 else
15925 gcc_unreachable ();
15927 case TLS_MODEL_EMULATED:
15928 case TLS_MODEL_NONE:
15929 return SYMBOL_FORCE_TO_MEM;
15931 default:
15932 gcc_unreachable ();
15936 /* Return the correct method for accessing X + OFFSET, where X is either
15937 a SYMBOL_REF or LABEL_REF. */
15939 enum aarch64_symbol_type
15940 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
15942 if (GET_CODE (x) == LABEL_REF)
15944 switch (aarch64_cmodel)
15946 case AARCH64_CMODEL_LARGE:
15947 return SYMBOL_FORCE_TO_MEM;
15949 case AARCH64_CMODEL_TINY_PIC:
15950 case AARCH64_CMODEL_TINY:
15951 return SYMBOL_TINY_ABSOLUTE;
15953 case AARCH64_CMODEL_SMALL_SPIC:
15954 case AARCH64_CMODEL_SMALL_PIC:
15955 case AARCH64_CMODEL_SMALL:
15956 return SYMBOL_SMALL_ABSOLUTE;
15958 default:
15959 gcc_unreachable ();
15963 if (GET_CODE (x) == SYMBOL_REF)
15965 if (aarch64_tls_symbol_p (x))
15966 return aarch64_classify_tls_symbol (x);
15968 switch (aarch64_cmodel)
15970 case AARCH64_CMODEL_TINY:
15971 /* When we retrieve symbol + offset address, we have to make sure
15972 the offset does not cause overflow of the final address. But
15973 we have no way of knowing the address of symbol at compile time
15974 so we can't accurately say if the distance between the PC and
15975 symbol + offset is outside the addressable range of +/-1MB in the
15976 TINY code model. So we limit the maximum offset to +/-64KB and
15977 assume the offset to the symbol is not larger than +/-(1MB - 64KB).
15978 If offset_within_block_p is true we allow larger offsets.
15979 Furthermore force to memory if the symbol is a weak reference to
15980 something that doesn't resolve to a symbol in this module. */
15982 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15983 return SYMBOL_FORCE_TO_MEM;
15984 if (!(IN_RANGE (offset, -0x10000, 0x10000)
15985 || offset_within_block_p (x, offset)))
15986 return SYMBOL_FORCE_TO_MEM;
15988 return SYMBOL_TINY_ABSOLUTE;
15990 case AARCH64_CMODEL_SMALL:
15991 /* Same reasoning as the tiny code model, but the offset cap here is
15992 1MB, allowing +/-3.9GB for the offset to the symbol. */
15994 if (SYMBOL_REF_WEAK (x) && !aarch64_symbol_binds_local_p (x))
15995 return SYMBOL_FORCE_TO_MEM;
15996 if (!(IN_RANGE (offset, -0x100000, 0x100000)
15997 || offset_within_block_p (x, offset)))
15998 return SYMBOL_FORCE_TO_MEM;
16000 return SYMBOL_SMALL_ABSOLUTE;
16002 case AARCH64_CMODEL_TINY_PIC:
16003 if (!aarch64_symbol_binds_local_p (x))
16004 return SYMBOL_TINY_GOT;
16005 return SYMBOL_TINY_ABSOLUTE;
16007 case AARCH64_CMODEL_SMALL_SPIC:
16008 case AARCH64_CMODEL_SMALL_PIC:
16009 if (!aarch64_symbol_binds_local_p (x))
16010 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
16011 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
16012 return SYMBOL_SMALL_ABSOLUTE;
16014 case AARCH64_CMODEL_LARGE:
16015 /* This is alright even in PIC code as the constant
16016 pool reference is always PC relative and within
16017 the same translation unit. */
16018 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
16019 return SYMBOL_SMALL_ABSOLUTE;
16020 else
16021 return SYMBOL_FORCE_TO_MEM;
16023 default:
16024 gcc_unreachable ();
16028 /* By default push everything into the constant pool. */
16029 return SYMBOL_FORCE_TO_MEM;
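/* Worked example for the tiny-model limits above (illustrative): with
   -mcmodel=tiny, "sym + 0x8000" stays within the +/-64KB cap and is
   classified as SYMBOL_TINY_ABSOLUTE, whereas "sym + 0x20000" exceeds
   it and is forced to memory unless offset_within_block_p shows that
   the offset stays inside SYM's own object.  */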
16032 bool
16033 aarch64_constant_address_p (rtx x)
16035 return (CONSTANT_P (x) && memory_address_p (DImode, x));
16038 bool
16039 aarch64_legitimate_pic_operand_p (rtx x)
16041 if (GET_CODE (x) == SYMBOL_REF
16042 || (GET_CODE (x) == CONST
16043 && GET_CODE (XEXP (x, 0)) == PLUS
16044 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
16045 return false;
16047 return true;
16050 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
16051 that should be rematerialized rather than spilled. */
16053 static bool
16054 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
16056 /* Support CSE and rematerialization of common constants. */
16057 if (CONST_INT_P (x)
16058 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
16059 || GET_CODE (x) == CONST_VECTOR)
16060 return true;
16062 /* Do not allow vector struct mode constants for Advanced SIMD.
16063 We could support 0 and -1 easily, but they need support in
16064 aarch64-simd.md. */
16065 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
16066 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
16067 return false;
16069 /* Only accept variable-length vector constants if they can be
16070 handled directly.
16072 ??? It would be possible to handle rematerialization of other
16073 constants via secondary reloads. */
16074 if (vec_flags & VEC_ANY_SVE)
16075 return aarch64_simd_valid_immediate (x, NULL);
16077 if (GET_CODE (x) == HIGH)
16078 x = XEXP (x, 0);
16080 /* Accept polynomial constants that can be calculated by using the
16081 destination of a move as the sole temporary. Constants that
16082 require a second temporary cannot be rematerialized (they can't be
16083 forced to memory and also aren't legitimate constants). */
16084 poly_int64 offset;
16085 if (poly_int_rtx_p (x, &offset))
16086 return aarch64_offset_temporaries (false, offset) <= 1;
16088 /* If an offset is being added to something else, we need to allow the
16089 base to be moved into the destination register, meaning that there
16090 are no free temporaries for the offset. */
16091 x = strip_offset (x, &offset);
16092 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
16093 return false;
16095 /* Do not allow const (plus (anchor_symbol, const_int)). */
16096 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
16097 return false;
16099 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
16100 so spilling them is better than rematerialization. */
16101 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
16102 return true;
16104 /* Label references are always constant. */
16105 if (GET_CODE (x) == LABEL_REF)
16106 return true;
16108 return false;
16112 aarch64_load_tp (rtx target)
16114 if (!target
16115 || GET_MODE (target) != Pmode
16116 || !register_operand (target, Pmode))
16117 target = gen_reg_rtx (Pmode);
16119 /* Can return in any reg. */
16120 emit_insn (gen_aarch64_load_tp_hard (target));
16121 return target;
16124 /* On AAPCS systems, this is the "struct __va_list". */
16125 static GTY(()) tree va_list_type;
16127 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
16128 Return the type to use as __builtin_va_list.
16130 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
16132 struct __va_list
16134 void *__stack;
16135 void *__gr_top;
16136 void *__vr_top;
16137 int __gr_offs;
16138 int __vr_offs;
16139 }; */
16141 static tree
16142 aarch64_build_builtin_va_list (void)
16144 tree va_list_name;
16145 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16147 /* Create the type. */
16148 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
16149 /* Give it the required name. */
16150 va_list_name = build_decl (BUILTINS_LOCATION,
16151 TYPE_DECL,
16152 get_identifier ("__va_list"),
16153 va_list_type);
16154 DECL_ARTIFICIAL (va_list_name) = 1;
16155 TYPE_NAME (va_list_type) = va_list_name;
16156 TYPE_STUB_DECL (va_list_type) = va_list_name;
16158 /* Create the fields. */
16159 f_stack = build_decl (BUILTINS_LOCATION,
16160 FIELD_DECL, get_identifier ("__stack"),
16161 ptr_type_node);
16162 f_grtop = build_decl (BUILTINS_LOCATION,
16163 FIELD_DECL, get_identifier ("__gr_top"),
16164 ptr_type_node);
16165 f_vrtop = build_decl (BUILTINS_LOCATION,
16166 FIELD_DECL, get_identifier ("__vr_top"),
16167 ptr_type_node);
16168 f_groff = build_decl (BUILTINS_LOCATION,
16169 FIELD_DECL, get_identifier ("__gr_offs"),
16170 integer_type_node);
16171 f_vroff = build_decl (BUILTINS_LOCATION,
16172 FIELD_DECL, get_identifier ("__vr_offs"),
16173 integer_type_node);
16175 /* Tell tree-stdarg pass about our internal offset fields.
16176 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
16177 purposes, to identify whether the code is updating the va_list internal
16178 offset fields in an irregular way. */
16179 va_list_gpr_counter_field = f_groff;
16180 va_list_fpr_counter_field = f_vroff;
16182 DECL_ARTIFICIAL (f_stack) = 1;
16183 DECL_ARTIFICIAL (f_grtop) = 1;
16184 DECL_ARTIFICIAL (f_vrtop) = 1;
16185 DECL_ARTIFICIAL (f_groff) = 1;
16186 DECL_ARTIFICIAL (f_vroff) = 1;
16188 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
16189 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
16190 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
16191 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
16192 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
16194 TYPE_FIELDS (va_list_type) = f_stack;
16195 DECL_CHAIN (f_stack) = f_grtop;
16196 DECL_CHAIN (f_grtop) = f_vrtop;
16197 DECL_CHAIN (f_vrtop) = f_groff;
16198 DECL_CHAIN (f_groff) = f_vroff;
16200 /* Compute its layout. */
16201 layout_type (va_list_type);
16203 return va_list_type;
16206 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
16207 static void
16208 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
16210 const CUMULATIVE_ARGS *cum;
16211 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16212 tree stack, grtop, vrtop, groff, vroff;
16213 tree t;
16214 int gr_save_area_size = cfun->va_list_gpr_size;
16215 int vr_save_area_size = cfun->va_list_fpr_size;
16216 int vr_offset;
16218 cum = &crtl->args.info;
16219 if (cfun->va_list_gpr_size)
16220 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
16221 cfun->va_list_gpr_size);
16222 if (cfun->va_list_fpr_size)
16223 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
16224 * UNITS_PER_VREG, cfun->va_list_fpr_size);
16226 if (!TARGET_FLOAT)
16228 gcc_assert (cum->aapcs_nvrn == 0);
16229 vr_save_area_size = 0;
16232 f_stack = TYPE_FIELDS (va_list_type_node);
16233 f_grtop = DECL_CHAIN (f_stack);
16234 f_vrtop = DECL_CHAIN (f_grtop);
16235 f_groff = DECL_CHAIN (f_vrtop);
16236 f_vroff = DECL_CHAIN (f_groff);
16238 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
16239 NULL_TREE);
16240 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
16241 NULL_TREE);
16242 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
16243 NULL_TREE);
16244 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
16245 NULL_TREE);
16246 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
16247 NULL_TREE);
16249 /* Emit code to initialize STACK, which points to the next varargs stack
16250 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
16251 by named arguments. STACK is 8-byte aligned. */
16252 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
16253 if (cum->aapcs_stack_size > 0)
16254 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
16255 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
16256 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16258 /* Emit code to initialize GRTOP, the top of the GR save area.
16259 virtual_incoming_args_rtx should have been 16 byte aligned. */
16260 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
16261 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
16262 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16264 /* Emit code to initialize VRTOP, the top of the VR save area.
16265 This address is gr_save_area_bytes below GRTOP, rounded
16266 down to the next 16-byte boundary. */
16267 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
16268 vr_offset = ROUND_UP (gr_save_area_size,
16269 STACK_BOUNDARY / BITS_PER_UNIT);
16271 if (vr_offset)
16272 t = fold_build_pointer_plus_hwi (t, -vr_offset);
16273 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
16274 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16276 /* Emit code to initialize GROFF, the offset from GRTOP of the
16277 next GPR argument. */
16278 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
16279 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
16280 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
16282 /* Likewise emit code to initialize VROFF, the offset from FTOP
16283 of the next VR argument. */
16284 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
16285 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
16286 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
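/* Worked example for the initialization above (illustrative, assuming
   the tree-stdarg pass keeps the full save areas): for a variadic
   function whose named arguments use one general register and one
   vector register, gr_save_area_size = (8 - 1) * 8 = 56 and
   vr_save_area_size = (8 - 1) * 16 = 112, so __gr_offs = -56,
   __vr_offs = -112, __gr_top points at virtual_incoming_args_rtx and
   __vr_top sits ROUND_UP (56, 16) = 64 bytes below it.  */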
16289 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
16291 static tree
16292 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
16293 gimple_seq *post_p ATTRIBUTE_UNUSED)
16295 tree addr;
16296 bool indirect_p;
16297 bool is_ha; /* is HFA or HVA. */
16298 bool dw_align; /* double-word align. */
16299 machine_mode ag_mode = VOIDmode;
16300 int nregs;
16301 machine_mode mode;
16303 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
16304 tree stack, f_top, f_off, off, arg, roundup, on_stack;
16305 HOST_WIDE_INT size, rsize, adjust, align;
16306 tree t, u, cond1, cond2;
16308 indirect_p = pass_va_arg_by_reference (type);
16309 if (indirect_p)
16310 type = build_pointer_type (type);
16312 mode = TYPE_MODE (type);
16314 f_stack = TYPE_FIELDS (va_list_type_node);
16315 f_grtop = DECL_CHAIN (f_stack);
16316 f_vrtop = DECL_CHAIN (f_grtop);
16317 f_groff = DECL_CHAIN (f_vrtop);
16318 f_vroff = DECL_CHAIN (f_groff);
16320 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
16321 f_stack, NULL_TREE);
16322 size = int_size_in_bytes (type);
16324 bool abi_break;
16325 align
16326 = aarch64_function_arg_alignment (mode, type, &abi_break) / BITS_PER_UNIT;
16328 dw_align = false;
16329 adjust = 0;
16330 if (aarch64_vfp_is_call_or_return_candidate (mode, type, &ag_mode, &nregs,
16331 &is_ha, false))
16333 /* No frontends can create types with variable-sized modes, so we
16334 shouldn't be asked to pass or return them. */
16335 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
16337 /* TYPE passed in fp/simd registers. */
16338 if (!TARGET_FLOAT)
16339 aarch64_err_no_fpadvsimd (mode);
16341 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
16342 unshare_expr (valist), f_vrtop, NULL_TREE);
16343 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
16344 unshare_expr (valist), f_vroff, NULL_TREE);
16346 rsize = nregs * UNITS_PER_VREG;
16348 if (is_ha)
16350 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
16351 adjust = UNITS_PER_VREG - ag_size;
16353 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16354 && size < UNITS_PER_VREG)
16356 adjust = UNITS_PER_VREG - size;
16359 else
16361 /* TYPE passed in general registers. */
16362 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
16363 unshare_expr (valist), f_grtop, NULL_TREE);
16364 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
16365 unshare_expr (valist), f_groff, NULL_TREE);
16366 rsize = ROUND_UP (size, UNITS_PER_WORD);
16367 nregs = rsize / UNITS_PER_WORD;
16369 if (align > 8)
16371 if (abi_break && warn_psabi)
16372 inform (input_location, "parameter passing for argument of type "
16373 "%qT changed in GCC 9.1", type);
16374 dw_align = true;
16377 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16378 && size < UNITS_PER_WORD)
16380 adjust = UNITS_PER_WORD - size;
16384 /* Get a local temporary for the field value. */
16385 off = get_initialized_tmp_var (f_off, pre_p, NULL);
16387 /* Emit code to branch if off >= 0. */
16388 t = build2 (GE_EXPR, boolean_type_node, off,
16389 build_int_cst (TREE_TYPE (off), 0));
16390 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
16392 if (dw_align)
16394 /* Emit: offs = (offs + 15) & -16. */
16395 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16396 build_int_cst (TREE_TYPE (off), 15));
16397 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
16398 build_int_cst (TREE_TYPE (off), -16));
16399 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
16401 else
16402 roundup = NULL;
16404 /* Update ap.__[g|v]r_offs */
16405 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
16406 build_int_cst (TREE_TYPE (off), rsize));
16407 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
16409 /* String up. */
16410 if (roundup)
16411 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16413 /* [cond2] if (ap.__[g|v]r_offs > 0) */
16414 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
16415 build_int_cst (TREE_TYPE (f_off), 0));
16416 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
16418 /* String up: make sure the assignment happens before the use. */
16419 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
16420 COND_EXPR_ELSE (cond1) = t;
16422 /* Prepare the trees handling the argument that is passed on the stack;
16423 the top level node will store in ON_STACK. */
16424 arg = get_initialized_tmp_var (stack, pre_p, NULL);
16425 if (align > 8)
16427 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
16428 t = fold_build_pointer_plus_hwi (arg, 15);
16429 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16430 build_int_cst (TREE_TYPE (t), -16));
16431 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
16433 else
16434 roundup = NULL;
16435 /* Advance ap.__stack */
16436 t = fold_build_pointer_plus_hwi (arg, size + 7);
16437 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
16438 build_int_cst (TREE_TYPE (t), -8));
16439 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
16440 /* String up roundup and advance. */
16441 if (roundup)
16442 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
16443 /* String up with arg */
16444 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
16445 /* Big-endianness related address adjustment. */
16446 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
16447 && size < UNITS_PER_WORD)
16449 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
16450 size_int (UNITS_PER_WORD - size));
16451 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
16454 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
16455 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
16457 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
16458 t = off;
16459 if (adjust)
16460 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
16461 build_int_cst (TREE_TYPE (off), adjust));
16463 t = fold_convert (sizetype, t);
16464 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
16466 if (is_ha)
16468 /* type ha; // treat as "struct {ftype field[n];}"
16469 ... [computing offs]
16470 for (i = 0; i <nregs; ++i, offs += 16)
16471 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
16472 return ha; */
16473 int i;
16474 tree tmp_ha, field_t, field_ptr_t;
16476 /* Declare a local variable. */
16477 tmp_ha = create_tmp_var_raw (type, "ha");
16478 gimple_add_tmp_var (tmp_ha);
16480 /* Establish the base type. */
16481 switch (ag_mode)
16483 case E_SFmode:
16484 field_t = float_type_node;
16485 field_ptr_t = float_ptr_type_node;
16486 break;
16487 case E_DFmode:
16488 field_t = double_type_node;
16489 field_ptr_t = double_ptr_type_node;
16490 break;
16491 case E_TFmode:
16492 field_t = long_double_type_node;
16493 field_ptr_t = long_double_ptr_type_node;
16494 break;
16495 case E_HFmode:
16496 field_t = aarch64_fp16_type_node;
16497 field_ptr_t = aarch64_fp16_ptr_type_node;
16498 break;
16499 case E_BFmode:
16500 field_t = aarch64_bf16_type_node;
16501 field_ptr_t = aarch64_bf16_ptr_type_node;
16502 break;
16503 case E_V2SImode:
16504 case E_V4SImode:
16506 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
16507 field_t = build_vector_type_for_mode (innertype, ag_mode);
16508 field_ptr_t = build_pointer_type (field_t);
16510 break;
16511 default:
16512 gcc_assert (0);
16515 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
16516 TREE_ADDRESSABLE (tmp_ha) = 1;
16517 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
16518 addr = t;
16519 t = fold_convert (field_ptr_t, addr);
16520 t = build2 (MODIFY_EXPR, field_t,
16521 build1 (INDIRECT_REF, field_t, tmp_ha),
16522 build1 (INDIRECT_REF, field_t, t));
16524 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
16525 for (i = 1; i < nregs; ++i)
16527 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
16528 u = fold_convert (field_ptr_t, addr);
16529 u = build2 (MODIFY_EXPR, field_t,
16530 build2 (MEM_REF, field_t, tmp_ha,
16531 build_int_cst (field_ptr_t,
16532 (i *
16533 int_size_in_bytes (field_t)))),
16534 build1 (INDIRECT_REF, field_t, u));
16535 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
16538 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
16539 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
16542 COND_EXPR_ELSE (cond2) = t;
16543 addr = fold_convert (build_pointer_type (type), cond1);
16544 addr = build_va_arg_indirect_ref (addr);
16546 if (indirect_p)
16547 addr = build_va_arg_indirect_ref (addr);
16549 return addr;
16552 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
16554 static void
16555 aarch64_setup_incoming_varargs (cumulative_args_t cum_v,
16556 const function_arg_info &arg,
16557 int *pretend_size ATTRIBUTE_UNUSED, int no_rtl)
16559 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
16560 CUMULATIVE_ARGS local_cum;
16561 int gr_saved = cfun->va_list_gpr_size;
16562 int vr_saved = cfun->va_list_fpr_size;
16564 /* The caller has advanced CUM up to, but not beyond, the last named
16565 argument. Advance a local copy of CUM past the last "real" named
16566 argument, to find out how many registers are left over. */
16567 local_cum = *cum;
16568 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), arg);
16570 /* Find out how many registers we need to save.
16571 Honor tree-stdarg analysis results. */
16572 if (cfun->va_list_gpr_size)
16573 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
16574 cfun->va_list_gpr_size / UNITS_PER_WORD);
16575 if (cfun->va_list_fpr_size)
16576 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
16577 cfun->va_list_fpr_size / UNITS_PER_VREG);
16579 if (!TARGET_FLOAT)
16581 gcc_assert (local_cum.aapcs_nvrn == 0);
16582 vr_saved = 0;
16585 if (!no_rtl)
16587 if (gr_saved > 0)
16589 rtx ptr, mem;
16591 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
16592 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
16593 - gr_saved * UNITS_PER_WORD);
16594 mem = gen_frame_mem (BLKmode, ptr);
16595 set_mem_alias_set (mem, get_varargs_alias_set ());
16597 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
16598 mem, gr_saved);
16600 if (vr_saved > 0)
16602 /* We can't use move_block_from_reg, because it will use
16603 the wrong mode, storing D regs only. */
16604 machine_mode mode = TImode;
16605 int off, i, vr_start;
16607 /* Set OFF to the offset from virtual_incoming_args_rtx of
16608 the first vector register. The VR save area lies below
16609 the GR one, and is aligned to 16 bytes. */
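/* Rough worked example: with gr_saved == 3 and vr_saved == 2 the GR
   save area takes 24 bytes, rounded up to 32, so OFF starts at
   -32 - 2 * UNITS_PER_VREG == -64 and the two vector registers are
   stored at offsets -64 and -48 from virtual_incoming_args_rtx.  */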
16610 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
16611 STACK_BOUNDARY / BITS_PER_UNIT);
16612 off -= vr_saved * UNITS_PER_VREG;
16614 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
16615 for (i = 0; i < vr_saved; ++i)
16617 rtx ptr, mem;
16619 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
16620 mem = gen_frame_mem (mode, ptr);
16621 set_mem_alias_set (mem, get_varargs_alias_set ());
16622 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
16623 off += UNITS_PER_VREG;
16628 /* We don't save the size into *PRETEND_SIZE because we want to avoid
16629 any complication of having crtl->args.pretend_args_size changed. */
16630 cfun->machine->frame.saved_varargs_size
16631 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
16632 STACK_BOUNDARY / BITS_PER_UNIT)
16633 + vr_saved * UNITS_PER_VREG);
16636 static void
16637 aarch64_conditional_register_usage (void)
16639 int i;
16640 if (!TARGET_FLOAT)
16642 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
16644 fixed_regs[i] = 1;
16645 call_used_regs[i] = 1;
16648 if (!TARGET_SVE)
16649 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
16651 fixed_regs[i] = 1;
16652 call_used_regs[i] = 1;
16655 /* Only allow the FFR and FFRT to be accessed via special patterns. */
16656 CLEAR_HARD_REG_BIT (operand_reg_set, FFR_REGNUM);
16657 CLEAR_HARD_REG_BIT (operand_reg_set, FFRT_REGNUM);
16659 /* When tracking speculation, we need a couple of call-clobbered registers
16660 to track the speculation state. It would be nice to just use
16661 IP0 and IP1, but currently there are numerous places that just
16662 assume these registers are free for other uses (eg pointer
16663 authentication). */
16664 if (aarch64_track_speculation)
16666 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
16667 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
16668 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16669 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
16673 /* Implement TARGET_MEMBER_TYPE_FORCES_BLK. */
16675 bool
16676 aarch64_member_type_forces_blk (const_tree field_or_array, machine_mode mode)
16678 /* For records we're passed a FIELD_DECL, for arrays we're passed
16679 an ARRAY_TYPE. In both cases we're interested in the TREE_TYPE. */
16680 const_tree type = TREE_TYPE (field_or_array);
16682 /* Assign BLKmode to anything that contains multiple SVE predicates.
16683 For structures, the "multiple" case is indicated by MODE being
16684 VOIDmode. */
16685 unsigned int num_zr, num_pr;
16686 if (aarch64_sve::builtin_type_p (type, &num_zr, &num_pr) && num_pr != 0)
16688 if (TREE_CODE (field_or_array) == ARRAY_TYPE)
16689 return !simple_cst_equal (TYPE_SIZE (field_or_array),
16690 TYPE_SIZE (type));
16691 return mode == VOIDmode;
16694 return default_member_type_forces_blk (field_or_array, mode);
16697 /* Bitmasks that indicate whether earlier versions of GCC would have
16698 taken a different path through the ABI logic. This should result in
16699 a -Wpsabi warning if the earlier path led to a different ABI decision.
16701 WARN_PSABI_EMPTY_CXX17_BASE
16702 Indicates that the type includes an artificial empty C++17 base field
16703 that, prior to GCC 10.1, would prevent the type from being treated as
16704 a HFA or HVA. See PR94383 for details.
16706 WARN_PSABI_NO_UNIQUE_ADDRESS
16707 Indicates that the type includes an empty [[no_unique_address]] field
16708 that, prior to GCC 10.1, would prevent the type from being treated as
16709 a HFA or HVA. */
16710 const unsigned int WARN_PSABI_EMPTY_CXX17_BASE = 1U << 0;
16711 const unsigned int WARN_PSABI_NO_UNIQUE_ADDRESS = 1U << 1;
16713 /* Walk down the type tree of TYPE counting consecutive base elements.
16714 If *MODEP is VOIDmode, then set it to the first valid floating point
16715 type. If a non-floating point type is found, or if a floating point
16716 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
16717 otherwise return the count in the sub-tree.
16719 The WARN_PSABI_FLAGS argument allows the caller to check whether this
16720 function has changed its behavior relative to earlier versions of GCC.
16721 Normally the argument should be nonnull and point to a zero-initialized
16722 variable. The function then records whether the ABI decision might
16723 be affected by a known fix to the ABI logic, setting the associated
16724 WARN_PSABI_* bits if so.
16726 When the argument is instead a null pointer, the function tries to
16727 simulate the behavior of GCC before all such ABI fixes were made.
16728 This is useful to check whether the function returns something
16729 different after the ABI fixes. */
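/* A sketch of the intended results, for illustration:
     struct { double d[2]; }        -> returns 2 with *MODEP == DFmode
     struct { float f; double d; }  -> returns -1 (mismatched modes).  */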
16730 static int
16731 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep,
16732 unsigned int *warn_psabi_flags)
16734 machine_mode mode;
16735 HOST_WIDE_INT size;
16737 if (aarch64_sve::builtin_type_p (type))
16738 return -1;
16740 switch (TREE_CODE (type))
16742 case REAL_TYPE:
16743 mode = TYPE_MODE (type);
16744 if (mode != DFmode && mode != SFmode
16745 && mode != TFmode && mode != HFmode)
16746 return -1;
16748 if (*modep == VOIDmode)
16749 *modep = mode;
16751 if (*modep == mode)
16752 return 1;
16754 break;
16756 case COMPLEX_TYPE:
16757 mode = TYPE_MODE (TREE_TYPE (type));
16758 if (mode != DFmode && mode != SFmode
16759 && mode != TFmode && mode != HFmode)
16760 return -1;
16762 if (*modep == VOIDmode)
16763 *modep = mode;
16765 if (*modep == mode)
16766 return 2;
16768 break;
16770 case VECTOR_TYPE:
16771 /* Use V2SImode and V4SImode as representatives of all 64-bit
16772 and 128-bit vector types. */
16773 size = int_size_in_bytes (type);
16774 switch (size)
16776 case 8:
16777 mode = V2SImode;
16778 break;
16779 case 16:
16780 mode = V4SImode;
16781 break;
16782 default:
16783 return -1;
16786 if (*modep == VOIDmode)
16787 *modep = mode;
16789 /* Vector modes are considered to be opaque: two vectors are
16790 equivalent for the purposes of being homogeneous aggregates
16791 if they are the same size. */
16792 if (*modep == mode)
16793 return 1;
16795 break;
16797 case ARRAY_TYPE:
16799 int count;
16800 tree index = TYPE_DOMAIN (type);
16802 /* Can't handle incomplete types nor sizes that are not
16803 fixed. */
16804 if (!COMPLETE_TYPE_P (type)
16805 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16806 return -1;
16808 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep,
16809 warn_psabi_flags);
16810 if (count == -1
16811 || !index
16812 || !TYPE_MAX_VALUE (index)
16813 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
16814 || !TYPE_MIN_VALUE (index)
16815 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
16816 || count < 0)
16817 return -1;
16819 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
16820 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
16822 /* There must be no padding. */
16823 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16824 count * GET_MODE_BITSIZE (*modep)))
16825 return -1;
16827 return count;
16830 case RECORD_TYPE:
16832 int count = 0;
16833 int sub_count;
16834 tree field;
16836 /* Can't handle incomplete types nor sizes that are not
16837 fixed. */
16838 if (!COMPLETE_TYPE_P (type)
16839 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16840 return -1;
16842 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16844 if (TREE_CODE (field) != FIELD_DECL)
16845 continue;
16847 if (DECL_FIELD_ABI_IGNORED (field))
16849 /* See whether this is something that earlier versions of
16850 GCC failed to ignore. */
16851 unsigned int flag;
16852 if (lookup_attribute ("no_unique_address",
16853 DECL_ATTRIBUTES (field)))
16854 flag = WARN_PSABI_NO_UNIQUE_ADDRESS;
16855 else if (cxx17_empty_base_field_p (field))
16856 flag = WARN_PSABI_EMPTY_CXX17_BASE;
16857 else
16858 /* No compatibility problem. */
16859 continue;
16861 /* Simulate the old behavior when WARN_PSABI_FLAGS is null. */
16862 if (warn_psabi_flags)
16864 *warn_psabi_flags |= flag;
16865 continue;
16869 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
16870 warn_psabi_flags);
16871 if (sub_count < 0)
16872 return -1;
16873 count += sub_count;
16876 /* There must be no padding. */
16877 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16878 count * GET_MODE_BITSIZE (*modep)))
16879 return -1;
16881 return count;
16884 case UNION_TYPE:
16885 case QUAL_UNION_TYPE:
16887 /* These aren't very interesting except in a degenerate case. */
16888 int count = 0;
16889 int sub_count;
16890 tree field;
16892 /* Can't handle incomplete types nor sizes that are not
16893 fixed. */
16894 if (!COMPLETE_TYPE_P (type)
16895 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
16896 return -1;
16898 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
16900 if (TREE_CODE (field) != FIELD_DECL)
16901 continue;
16903 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep,
16904 warn_psabi_flags);
16905 if (sub_count < 0)
16906 return -1;
16907 count = count > sub_count ? count : sub_count;
16910 /* There must be no padding. */
16911 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
16912 count * GET_MODE_BITSIZE (*modep)))
16913 return -1;
16915 return count;
16918 default:
16919 break;
16922 return -1;
16925 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
16926 type as described in AAPCS64 \S 4.1.2.
16928 See the comment above aarch64_composite_type_p for the notes on MODE. */
16930 static bool
16931 aarch64_short_vector_p (const_tree type,
16932 machine_mode mode)
16934 poly_int64 size = -1;
16936 if (type && TREE_CODE (type) == VECTOR_TYPE)
16938 if (aarch64_sve::builtin_type_p (type))
16939 return false;
16940 size = int_size_in_bytes (type);
16942 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
16943 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
16945 /* Rely only on the type, not the mode, when processing SVE types. */
16946 if (type && aarch64_some_values_include_pst_objects_p (type))
16947 /* Leave later code to report an error if SVE is disabled. */
16948 gcc_assert (!TARGET_SVE || aarch64_sve_mode_p (mode));
16949 else
16950 size = GET_MODE_SIZE (mode);
16952 if (known_eq (size, 8) || known_eq (size, 16))
16954 /* 64-bit and 128-bit vectors should only acquire an SVE mode if
16955 they are being treated as scalable AAPCS64 types. */
16956 gcc_assert (!aarch64_sve_mode_p (mode));
16957 return true;
16959 return false;
16962 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
16963 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
16964 array types. The C99 floating-point complex types are also considered
16965 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
16966 types, which are GCC extensions and out of the scope of AAPCS64, are
16967 treated as composite types here as well.
16969 Note that MODE itself is not sufficient in determining whether a type
16970 is such a composite type or not. This is because
16971 stor-layout.c:compute_record_mode may have already changed the MODE
16972 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
16973 structure with only one field may have its MODE set to the mode of the
16974 field. Also an integer mode whose size matches the size of the
16975 RECORD_TYPE type may be used to substitute the original mode
16976 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
16977 solely relied on. */
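/* For instance, a type such as
     struct { float f; }
   may have been given SFmode by compute_record_mode, but it is still
   a composite type for AAPCS64 purposes; the AGGREGATE_TYPE_P check
   below catches it regardless of MODE.  */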
16979 static bool
16980 aarch64_composite_type_p (const_tree type,
16981 machine_mode mode)
16983 if (aarch64_short_vector_p (type, mode))
16984 return false;
16986 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
16987 return true;
16989 if (mode == BLKmode
16990 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
16991 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
16992 return true;
16994 return false;
16997 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
16998 shall be passed or returned in simd/fp register(s) (providing these
16999 parameter passing registers are available).
17001 Upon successful return, *COUNT returns the number of needed registers,
17002 *BASE_MODE returns the mode of the individual register and, when IS_HA
17003 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
17004 floating-point aggregate or a homogeneous short-vector aggregate.
17006 SILENT_P is true if the function should refrain from reporting any
17007 diagnostics. This should only be used if the caller is certain that
17008 any ABI decisions would eventually come through this function with
17009 SILENT_P set to false. */
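/* Roughly, for
     struct { float x, y, z, w; }
   the expected result is true with *COUNT == 4, *BASE_MODE == SFmode
   and *IS_HA set, whereas a struct of five floats exceeds
   HA_MAX_NUM_FLDS and is not a candidate.  */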
17011 static bool
17012 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
17013 const_tree type,
17014 machine_mode *base_mode,
17015 int *count,
17016 bool *is_ha,
17017 bool silent_p)
17019 if (is_ha != NULL) *is_ha = false;
17021 machine_mode new_mode = VOIDmode;
17022 bool composite_p = aarch64_composite_type_p (type, mode);
17024 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
17025 || aarch64_short_vector_p (type, mode))
17027 *count = 1;
17028 new_mode = mode;
17030 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
17032 if (is_ha != NULL) *is_ha = true;
17033 *count = 2;
17034 new_mode = GET_MODE_INNER (mode);
17036 else if (type && composite_p)
17038 unsigned int warn_psabi_flags = 0;
17039 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode,
17040 &warn_psabi_flags);
17041 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
17043 static unsigned last_reported_type_uid;
17044 unsigned uid = TYPE_UID (TYPE_MAIN_VARIANT (type));
17045 int alt;
17046 if (!silent_p
17047 && warn_psabi
17048 && warn_psabi_flags
17049 && uid != last_reported_type_uid
17050 && ((alt = aapcs_vfp_sub_candidate (type, &new_mode, NULL))
17051 != ag_count))
17053 const char *url
17054 = CHANGES_ROOT_URL "gcc-10/changes.html#empty_base";
17055 gcc_assert (alt == -1);
17056 last_reported_type_uid = uid;
17057 /* Use TYPE_MAIN_VARIANT to strip any redundant const
17058 qualification. */
17059 if (warn_psabi_flags & WARN_PSABI_NO_UNIQUE_ADDRESS)
17060 inform (input_location, "parameter passing for argument of "
17061 "type %qT with %<[[no_unique_address]]%> members "
17062 "changed %{in GCC 10.1%}",
17063 TYPE_MAIN_VARIANT (type), url);
17064 else if (warn_psabi_flags & WARN_PSABI_EMPTY_CXX17_BASE)
17065 inform (input_location, "parameter passing for argument of "
17066 "type %qT when C++17 is enabled changed to match "
17067 "C++14 %{in GCC 10.1%}",
17068 TYPE_MAIN_VARIANT (type), url);
17071 if (is_ha != NULL) *is_ha = true;
17072 *count = ag_count;
17074 else
17075 return false;
17077 else
17078 return false;
17080 gcc_assert (!aarch64_sve_mode_p (new_mode));
17081 *base_mode = new_mode;
17082 return true;
17085 /* Implement TARGET_STRUCT_VALUE_RTX. */
17087 static rtx
17088 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
17089 int incoming ATTRIBUTE_UNUSED)
17091 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
17094 /* Implements target hook vector_mode_supported_p. */
17095 static bool
17096 aarch64_vector_mode_supported_p (machine_mode mode)
17098 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17099 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
17102 /* Return the full-width SVE vector mode for element mode MODE, if one
17103 exists. */
17104 opt_machine_mode
17105 aarch64_full_sve_mode (scalar_mode mode)
17107 switch (mode)
17109 case E_DFmode:
17110 return VNx2DFmode;
17111 case E_SFmode:
17112 return VNx4SFmode;
17113 case E_HFmode:
17114 return VNx8HFmode;
17115 case E_BFmode:
17116 return VNx8BFmode;
17117 case E_DImode:
17118 return VNx2DImode;
17119 case E_SImode:
17120 return VNx4SImode;
17121 case E_HImode:
17122 return VNx8HImode;
17123 case E_QImode:
17124 return VNx16QImode;
17125 default:
17126 return opt_machine_mode ();
17130 /* Return the 128-bit Advanced SIMD vector mode for element mode MODE,
17131 if it exists. */
17132 opt_machine_mode
17133 aarch64_vq_mode (scalar_mode mode)
17135 switch (mode)
17137 case E_DFmode:
17138 return V2DFmode;
17139 case E_SFmode:
17140 return V4SFmode;
17141 case E_HFmode:
17142 return V8HFmode;
17143 case E_BFmode:
17144 return V8BFmode;
17145 case E_SImode:
17146 return V4SImode;
17147 case E_HImode:
17148 return V8HImode;
17149 case E_QImode:
17150 return V16QImode;
17151 case E_DImode:
17152 return V2DImode;
17153 default:
17154 return opt_machine_mode ();
17158 /* Return appropriate SIMD container
17159 for MODE within a vector of WIDTH bits. */
17160 static machine_mode
17161 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
17163 if (TARGET_SVE
17164 && maybe_ne (width, 128)
17165 && known_eq (width, BITS_PER_SVE_VECTOR))
17166 return aarch64_full_sve_mode (mode).else_mode (word_mode);
17168 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
17169 if (TARGET_SIMD)
17171 if (known_eq (width, 128))
17172 return aarch64_vq_mode (mode).else_mode (word_mode);
17173 else
17174 switch (mode)
17176 case E_SFmode:
17177 return V2SFmode;
17178 case E_HFmode:
17179 return V4HFmode;
17180 case E_BFmode:
17181 return V4BFmode;
17182 case E_SImode:
17183 return V2SImode;
17184 case E_HImode:
17185 return V4HImode;
17186 case E_QImode:
17187 return V8QImode;
17188 default:
17189 break;
17192 return word_mode;
17195 /* Return 128-bit container as the preferred SIMD mode for MODE. */
17196 static machine_mode
17197 aarch64_preferred_simd_mode (scalar_mode mode)
17199 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
17200 return aarch64_simd_container_mode (mode, bits);
17203 /* Return a list of possible vector sizes for the vectorizer
17204 to iterate over. */
17205 static unsigned int
17206 aarch64_autovectorize_vector_modes (vector_modes *modes, bool)
17208 static const machine_mode sve_modes[] = {
17209 /* Try using full vectors for all element types. */
17210 VNx16QImode,
17212 /* Try using 16-bit containers for 8-bit elements and full vectors
17213 for wider elements. */
17214 VNx8QImode,
17216 /* Try using 32-bit containers for 8-bit and 16-bit elements and
17217 full vectors for wider elements. */
17218 VNx4QImode,
17220 /* Try using 64-bit containers for all element types. */
17221 VNx2QImode
17224 static const machine_mode advsimd_modes[] = {
17225 /* Try using 128-bit vectors for all element types. */
17226 V16QImode,
17228 /* Try using 64-bit vectors for 8-bit elements and 128-bit vectors
17229 for wider elements. */
17230 V8QImode,
17232 /* Try using 64-bit vectors for 16-bit elements and 128-bit vectors
17233 for wider elements.
17235 TODO: We could support a limited form of V4QImode too, so that
17236 we use 32-bit vectors for 8-bit elements. */
17237 V4HImode,
17239 /* Try using 64-bit vectors for 32-bit elements and 128-bit vectors
17240 for 64-bit elements.
17242 TODO: We could similarly support limited forms of V2QImode and V2HImode
17243 for this case. */
17244 V2SImode
17247 /* Try using N-byte SVE modes only after trying N-byte Advanced SIMD mode.
17248 This is because:
17250 - If we can't use N-byte Advanced SIMD vectors then the placement
17251 doesn't matter; we'll just continue as though the Advanced SIMD
17252 entry didn't exist.
17254 - If an SVE main loop with N bytes ends up being cheaper than an
17255 Advanced SIMD main loop with N bytes then by default we'll replace
17256 the Advanced SIMD version with the SVE one.
17258 - If an Advanced SIMD main loop with N bytes ends up being cheaper
17259 than an SVE main loop with N bytes then by default we'll try to
17260 use the SVE loop to vectorize the epilogue instead. */
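/* As a hypothetical illustration, with -msve-vector-bits=128 the
   interleaving below should visit the modes roughly in the order
     V16QI, VNx16QI, V8QI, VNx8QI, V4HI, VNx4QI, V2SI, VNx2QI
   i.e. each SVE entry directly after the equally-sized Advanced SIMD
   entry.  */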
17261 unsigned int sve_i = TARGET_SVE ? 0 : ARRAY_SIZE (sve_modes);
17262 unsigned int advsimd_i = 0;
17263 while (advsimd_i < ARRAY_SIZE (advsimd_modes))
17265 if (sve_i < ARRAY_SIZE (sve_modes)
17266 && maybe_gt (GET_MODE_NUNITS (sve_modes[sve_i]),
17267 GET_MODE_NUNITS (advsimd_modes[advsimd_i])))
17268 modes->safe_push (sve_modes[sve_i++]);
17269 else
17270 modes->safe_push (advsimd_modes[advsimd_i++]);
17272 while (sve_i < ARRAY_SIZE (sve_modes))
17273 modes->safe_push (sve_modes[sve_i++]);
17275 unsigned int flags = 0;
17276 /* Consider enabling VECT_COMPARE_COSTS for SVE, both so that we
17277 can compare SVE against Advanced SIMD and so that we can compare
17278 multiple SVE vectorization approaches against each other. There's
17279 not really any point doing this for Advanced SIMD only, since the
17280 first mode that works should always be the best. */
17281 if (TARGET_SVE && aarch64_sve_compare_costs)
17282 flags |= VECT_COMPARE_COSTS;
17283 return flags;
17286 /* Implement TARGET_MANGLE_TYPE. */
17288 static const char *
17289 aarch64_mangle_type (const_tree type)
17291 /* The AArch64 ABI documents say that "__va_list" has to be
17292 mangled as if it is in the "std" namespace. */
17293 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
17294 return "St9__va_list";
17296 /* Half-precision floating point types. */
17297 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
17299 if (TYPE_MODE (type) == BFmode)
17300 return "u6__bf16";
17301 else
17302 return "Dh";
17305 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
17306 builtin types. */
17307 if (TYPE_NAME (type) != NULL)
17309 const char *res;
17310 if ((res = aarch64_general_mangle_builtin_type (type))
17311 || (res = aarch64_sve::mangle_builtin_type (type)))
17312 return res;
17315 /* Use the default mangling. */
17316 return NULL;
17319 /* Implement TARGET_VERIFY_TYPE_CONTEXT. */
17321 static bool
17322 aarch64_verify_type_context (location_t loc, type_context_kind context,
17323 const_tree type, bool silent_p)
17325 return aarch64_sve::verify_type_context (loc, context, type, silent_p);
17328 /* Find the first rtx_insn before insn that will generate an assembly
17329 instruction. */
17331 static rtx_insn *
17332 aarch64_prev_real_insn (rtx_insn *insn)
17334 if (!insn)
17335 return NULL;
17339 insn = prev_real_insn (insn);
17341 while (insn && recog_memoized (insn) < 0);
17343 return insn;
17346 static bool
17347 is_madd_op (enum attr_type t1)
17349 unsigned int i;
17350 /* A number of these may be AArch32 only. */
17351 enum attr_type mlatypes[] = {
17352 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
17353 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
17354 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
17357 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
17359 if (t1 == mlatypes[i])
17360 return true;
17363 return false;
17366 /* Check if there is a register dependency between a load and the insn
17367 for which we hold recog_data. */
17369 static bool
17370 dep_between_memop_and_curr (rtx memop)
17372 rtx load_reg;
17373 int opno;
17375 gcc_assert (GET_CODE (memop) == SET);
17377 if (!REG_P (SET_DEST (memop)))
17378 return false;
17380 load_reg = SET_DEST (memop);
17381 for (opno = 1; opno < recog_data.n_operands; opno++)
17383 rtx operand = recog_data.operand[opno];
17384 if (REG_P (operand)
17385 && reg_overlap_mentioned_p (load_reg, operand))
17386 return true;
17389 return false;
17393 /* When working around the Cortex-A53 erratum 835769,
17394 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
17395 instruction and has a preceding memory instruction such that a NOP
17396 should be inserted between them. */
17398 bool
17399 aarch64_madd_needs_nop (rtx_insn* insn)
17401 enum attr_type attr_type;
17402 rtx_insn *prev;
17403 rtx body;
17405 if (!TARGET_FIX_ERR_A53_835769)
17406 return false;
17408 if (!INSN_P (insn) || recog_memoized (insn) < 0)
17409 return false;
17411 attr_type = get_attr_type (insn);
17412 if (!is_madd_op (attr_type))
17413 return false;
17415 prev = aarch64_prev_real_insn (insn);
17416 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
17417 Restore recog state to INSN to avoid state corruption. */
17418 extract_constrain_insn_cached (insn);
17420 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
17421 return false;
17423 body = single_set (prev);
17425 /* If the previous insn is a memory op and there is no dependency between
17426 it and the DImode madd, emit a NOP between them. If body is NULL then we
17427 have a complex memory operation, probably a load/store pair.
17428 Be conservative for now and emit a NOP. */
17429 if (GET_MODE (recog_data.operand[0]) == DImode
17430 && (!body || !dep_between_memop_and_curr (body)))
17431 return true;
17433 return false;
17438 /* Implement FINAL_PRESCAN_INSN. */
17440 void
17441 aarch64_final_prescan_insn (rtx_insn *insn)
17443 if (aarch64_madd_needs_nop (insn))
17444 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
17448 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
17449 instruction. */
17451 bool
17452 aarch64_sve_index_immediate_p (rtx base_or_step)
17454 return (CONST_INT_P (base_or_step)
17455 && IN_RANGE (INTVAL (base_or_step), -16, 15));
17458 /* Return true if X is a valid immediate for the SVE ADD and SUB instructions
17459 when applied to mode MODE. Negate X first if NEGATE_P is true. */
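/* For example, after masking to the element width, #0x7f and #0x4500
   should be accepted (an 8-bit immediate, optionally shifted left by
   8), while #0x101 needs bits in both bytes and should be rejected.  */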
17461 bool
17462 aarch64_sve_arith_immediate_p (machine_mode mode, rtx x, bool negate_p)
17464 rtx elt = unwrap_const_vec_duplicate (x);
17465 if (!CONST_INT_P (elt))
17466 return false;
17468 HOST_WIDE_INT val = INTVAL (elt);
17469 if (negate_p)
17470 val = -val;
17471 val &= GET_MODE_MASK (GET_MODE_INNER (mode));
17473 if (val & 0xff)
17474 return IN_RANGE (val, 0, 0xff);
17475 return IN_RANGE (val, 0, 0xff00);
17478 /* Return true if X is a valid immediate for the SVE SQADD and SQSUB
17479 instructions when applied to mode MODE. Negate X first if NEGATE_P
17480 is true. */
17482 bool
17483 aarch64_sve_sqadd_sqsub_immediate_p (machine_mode mode, rtx x, bool negate_p)
17485 if (!aarch64_sve_arith_immediate_p (mode, x, negate_p))
17486 return false;
17488 /* After the optional negation, the immediate must be nonnegative.
17489 E.g. a saturating add of -127 must be done via SQSUB Zn.B, Zn.B, #127
17490 instead of SQADD Zn.B, Zn.B, #129. */
17491 rtx elt = unwrap_const_vec_duplicate (x);
17492 return negate_p == (INTVAL (elt) < 0);
17495 /* Return true if X is a valid immediate operand for an SVE logical
17496 instruction such as AND. */
17498 bool
17499 aarch64_sve_bitmask_immediate_p (rtx x)
17501 rtx elt;
17503 return (const_vec_duplicate_p (x, &elt)
17504 && CONST_INT_P (elt)
17505 && aarch64_bitmask_imm (INTVAL (elt),
17506 GET_MODE_INNER (GET_MODE (x))));
17509 /* Return true if X is a valid immediate for the SVE DUP and CPY
17510 instructions. */
17512 bool
17513 aarch64_sve_dup_immediate_p (rtx x)
17515 x = aarch64_bit_representation (unwrap_const_vec_duplicate (x));
17516 if (!CONST_INT_P (x))
17517 return false;
17519 HOST_WIDE_INT val = INTVAL (x);
17520 if (val & 0xff)
17521 return IN_RANGE (val, -0x80, 0x7f);
17522 return IN_RANGE (val, -0x8000, 0x7f00);
17525 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
17526 SIGNED_P says whether the operand is signed rather than unsigned. */
17528 bool
17529 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
17531 x = unwrap_const_vec_duplicate (x);
17532 return (CONST_INT_P (x)
17533 && (signed_p
17534 ? IN_RANGE (INTVAL (x), -16, 15)
17535 : IN_RANGE (INTVAL (x), 0, 127)));
17538 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
17539 instruction. Negate X first if NEGATE_P is true. */
17541 bool
17542 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
17544 rtx elt;
17545 REAL_VALUE_TYPE r;
17547 if (!const_vec_duplicate_p (x, &elt)
17548 || GET_CODE (elt) != CONST_DOUBLE)
17549 return false;
17551 r = *CONST_DOUBLE_REAL_VALUE (elt);
17553 if (negate_p)
17554 r = real_value_negate (&r);
17556 if (real_equal (&r, &dconst1))
17557 return true;
17558 if (real_equal (&r, &dconsthalf))
17559 return true;
17560 return false;
17563 /* Return true if X is a valid immediate operand for an SVE FMUL
17564 instruction. */
17566 bool
17567 aarch64_sve_float_mul_immediate_p (rtx x)
17569 rtx elt;
17571 return (const_vec_duplicate_p (x, &elt)
17572 && GET_CODE (elt) == CONST_DOUBLE
17573 && (real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf)
17574 || real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconst2)));
17577 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
17578 for the Advanced SIMD operation described by WHICH and INSN. If INFO
17579 is nonnull, use it to describe valid immediates. */
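/* Sketch of the cases handled below: replicating 0x0000ab00 matches
   the 4-byte form with LSL #8, 0xab00ab00 matches the 2-byte form
   with LSL #8, and (for the MOV checks) 0x0012ffff matches the MSL
   shifted-ones form with MSL #16.  */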
17580 static bool
17581 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
17582 simd_immediate_info *info,
17583 enum simd_immediate_check which,
17584 simd_immediate_info::insn_type insn)
17586 /* Try a 4-byte immediate with LSL. */
17587 for (unsigned int shift = 0; shift < 32; shift += 8)
17588 if ((val32 & (0xff << shift)) == val32)
17590 if (info)
17591 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17592 simd_immediate_info::LSL, shift);
17593 return true;
17596 /* Try a 2-byte immediate with LSL. */
17597 unsigned int imm16 = val32 & 0xffff;
17598 if (imm16 == (val32 >> 16))
17599 for (unsigned int shift = 0; shift < 16; shift += 8)
17600 if ((imm16 & (0xff << shift)) == imm16)
17602 if (info)
17603 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
17604 simd_immediate_info::LSL, shift);
17605 return true;
17608 /* Try a 4-byte immediate with MSL, except for cases that MVN
17609 can handle. */
17610 if (which == AARCH64_CHECK_MOV)
17611 for (unsigned int shift = 8; shift < 24; shift += 8)
17613 unsigned int low = (1 << shift) - 1;
17614 if (((val32 & (0xff << shift)) | low) == val32)
17616 if (info)
17617 *info = simd_immediate_info (SImode, val32 >> shift, insn,
17618 simd_immediate_info::MSL, shift);
17619 return true;
17623 return false;
17626 /* Return true if replicating VAL64 is a valid immediate for the
17627 Advanced SIMD operation described by WHICH. If INFO is nonnull,
17628 use it to describe valid immediates. */
17629 static bool
17630 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
17631 simd_immediate_info *info,
17632 enum simd_immediate_check which)
17634 unsigned int val32 = val64 & 0xffffffff;
17635 unsigned int val16 = val64 & 0xffff;
17636 unsigned int val8 = val64 & 0xff;
17638 if (val32 == (val64 >> 32))
17640 if ((which & AARCH64_CHECK_ORR) != 0
17641 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
17642 simd_immediate_info::MOV))
17643 return true;
17645 if ((which & AARCH64_CHECK_BIC) != 0
17646 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
17647 simd_immediate_info::MVN))
17648 return true;
17650 /* Try using a replicated byte. */
17651 if (which == AARCH64_CHECK_MOV
17652 && val16 == (val32 >> 16)
17653 && val8 == (val16 >> 8))
17655 if (info)
17656 *info = simd_immediate_info (QImode, val8);
17657 return true;
17661 /* Try using a bit-to-bytemask. */
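/* For example, 0x00ff00ff00ff00ff, where every byte is either 0x00
   or 0xff, should be accepted here and loaded with a single 64-bit
   immediate move.  */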
17662 if (which == AARCH64_CHECK_MOV)
17664 unsigned int i;
17665 for (i = 0; i < 64; i += 8)
17667 unsigned char byte = (val64 >> i) & 0xff;
17668 if (byte != 0 && byte != 0xff)
17669 break;
17671 if (i == 64)
17673 if (info)
17674 *info = simd_immediate_info (DImode, val64);
17675 return true;
17678 return false;
17681 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
17682 instruction. If INFO is nonnull, use it to describe valid immediates. */
17684 static bool
17685 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
17686 simd_immediate_info *info)
17688 scalar_int_mode mode = DImode;
17689 unsigned int val32 = val64 & 0xffffffff;
17690 if (val32 == (val64 >> 32))
17692 mode = SImode;
17693 unsigned int val16 = val32 & 0xffff;
17694 if (val16 == (val32 >> 16))
17696 mode = HImode;
17697 unsigned int val8 = val16 & 0xff;
17698 if (val8 == (val16 >> 8))
17699 mode = QImode;
17702 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
17703 if (IN_RANGE (val, -0x80, 0x7f))
17705 /* DUP with no shift. */
17706 if (info)
17707 *info = simd_immediate_info (mode, val);
17708 return true;
17710 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
17712 /* DUP with LSL #8. */
17713 if (info)
17714 *info = simd_immediate_info (mode, val);
17715 return true;
17717 if (aarch64_bitmask_imm (val64, mode))
17719 /* DUPM. */
17720 if (info)
17721 *info = simd_immediate_info (mode, val);
17722 return true;
17724 return false;
17727 /* Return true if X is an UNSPEC_PTRUE constant of the form:
17729 (const (unspec [PATTERN ZERO] UNSPEC_PTRUE))
17731 where PATTERN is the svpattern as a CONST_INT and where ZERO
17732 is a zero constant of the required PTRUE mode (which can have
17733 fewer elements than X's mode, if zero bits are significant).
17735 If so, and if INFO is nonnull, describe the immediate in INFO. */
17736 bool
17737 aarch64_sve_ptrue_svpattern_p (rtx x, struct simd_immediate_info *info)
17739 if (GET_CODE (x) != CONST)
17740 return false;
17742 x = XEXP (x, 0);
17743 if (GET_CODE (x) != UNSPEC || XINT (x, 1) != UNSPEC_PTRUE)
17744 return false;
17746 if (info)
17748 aarch64_svpattern pattern
17749 = (aarch64_svpattern) INTVAL (XVECEXP (x, 0, 0));
17750 machine_mode pred_mode = GET_MODE (XVECEXP (x, 0, 1));
17751 scalar_int_mode int_mode = aarch64_sve_element_int_mode (pred_mode);
17752 *info = simd_immediate_info (int_mode, pattern);
17754 return true;
17757 /* Return true if X is a valid SVE predicate. If INFO is nonnull, use
17758 it to describe valid immediates. */
17760 static bool
17761 aarch64_sve_pred_valid_immediate (rtx x, simd_immediate_info *info)
17763 if (aarch64_sve_ptrue_svpattern_p (x, info))
17764 return true;
17766 if (x == CONST0_RTX (GET_MODE (x)))
17768 if (info)
17769 *info = simd_immediate_info (DImode, 0);
17770 return true;
17773 /* Analyze the value as a VNx16BImode. This should be relatively
17774 efficient, since rtx_vector_builder has enough built-in capacity
17775 to store all VLA predicate constants without needing the heap. */
17776 rtx_vector_builder builder;
17777 if (!aarch64_get_sve_pred_bits (builder, x))
17778 return false;
17780 unsigned int elt_size = aarch64_widest_sve_pred_elt_size (builder);
17781 if (int vl = aarch64_partial_ptrue_length (builder, elt_size))
17783 machine_mode mode = aarch64_sve_pred_mode (elt_size).require ();
17784 aarch64_svpattern pattern = aarch64_svpattern_for_vl (mode, vl);
17785 if (pattern != AARCH64_NUM_SVPATTERNS)
17787 if (info)
17789 scalar_int_mode int_mode = aarch64_sve_element_int_mode (mode);
17790 *info = simd_immediate_info (int_mode, pattern);
17792 return true;
17795 return false;
17798 /* Return true if OP is a valid SIMD immediate for the operation
17799 described by WHICH. If INFO is nonnull, use it to describe valid
17800 immediates. */
17801 bool
17802 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
17803 enum simd_immediate_check which)
17805 machine_mode mode = GET_MODE (op);
17806 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17807 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
17808 return false;
17810 if (vec_flags & VEC_SVE_PRED)
17811 return aarch64_sve_pred_valid_immediate (op, info);
17813 scalar_mode elt_mode = GET_MODE_INNER (mode);
17814 rtx base, step;
17815 unsigned int n_elts;
17816 if (GET_CODE (op) == CONST_VECTOR
17817 && CONST_VECTOR_DUPLICATE_P (op))
17818 n_elts = CONST_VECTOR_NPATTERNS (op);
17819 else if ((vec_flags & VEC_SVE_DATA)
17820 && const_vec_series_p (op, &base, &step))
17822 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
17823 if (!aarch64_sve_index_immediate_p (base)
17824 || !aarch64_sve_index_immediate_p (step))
17825 return false;
17827 if (info)
17829 /* Get the corresponding container mode. E.g. an INDEX on VNx2SI
17830 should yield two integer values per 128-bit block, meaning
17831 that we need to treat it in the same way as VNx2DI and then
17832 ignore the upper 32 bits of each element. */
17833 elt_mode = aarch64_sve_container_int_mode (mode);
17834 *info = simd_immediate_info (elt_mode, base, step);
17836 return true;
17838 else if (GET_CODE (op) == CONST_VECTOR
17839 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
17840 /* N_ELTS set above. */;
17841 else
17842 return false;
17844 scalar_float_mode elt_float_mode;
17845 if (n_elts == 1
17846 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
17848 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
17849 if (aarch64_float_const_zero_rtx_p (elt)
17850 || aarch64_float_const_representable_p (elt))
17852 if (info)
17853 *info = simd_immediate_info (elt_float_mode, elt);
17854 return true;
17858 /* If all elements in an SVE vector have the same value, we have a free
17859 choice between using the element mode and using the container mode.
17860 Using the element mode means that unused parts of the vector are
17861 duplicates of the used elements, while using the container mode means
17862 that the unused parts are an extension of the used elements. Using the
17863 element mode is better for (say) VNx4HI 0x101, since 0x01010101 is valid
17864 for its container mode VNx4SI while 0x00000101 isn't.
17866 If not all elements in an SVE vector have the same value, we need the
17867 transition from one element to the next to occur at container boundaries.
17868 E.g. a fixed-length VNx4HI containing { 1, 2, 3, 4 } should be treated
17869 in the same way as a VNx4SI containing { 1, 2, 3, 4 }. */
17870 scalar_int_mode elt_int_mode;
17871 if ((vec_flags & VEC_SVE_DATA) && n_elts > 1)
17872 elt_int_mode = aarch64_sve_container_int_mode (mode);
17873 else
17874 elt_int_mode = int_mode_for_mode (elt_mode).require ();
17876 unsigned int elt_size = GET_MODE_SIZE (elt_int_mode);
17877 if (elt_size > 8)
17878 return false;
17880 /* Expand the vector constant out into a byte vector, with the least
17881 significant byte of the register first. */
17882 auto_vec<unsigned char, 16> bytes;
17883 bytes.reserve (n_elts * elt_size);
17884 for (unsigned int i = 0; i < n_elts; i++)
17886 /* The vector is provided in gcc endian-neutral fashion.
17887 For aarch64_be Advanced SIMD, it must be laid out in the vector
17888 register in reverse order. */
17889 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
17890 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
17892 if (elt_mode != elt_int_mode)
17893 elt = gen_lowpart (elt_int_mode, elt);
17895 if (!CONST_INT_P (elt))
17896 return false;
17898 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
17899 for (unsigned int byte = 0; byte < elt_size; byte++)
17901 bytes.quick_push (elt_val & 0xff);
17902 elt_val >>= BITS_PER_UNIT;
17906 /* The immediate must repeat every eight bytes. */
17907 unsigned int nbytes = bytes.length ();
17908 for (unsigned i = 8; i < nbytes; ++i)
17909 if (bytes[i] != bytes[i - 8])
17910 return false;
17912 /* Get the repeating 8-byte value as an integer. No endian correction
17913 is needed here because bytes is already in lsb-first order. */
17914 unsigned HOST_WIDE_INT val64 = 0;
17915 for (unsigned int i = 0; i < 8; i++)
17916 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
17917 << (i * BITS_PER_UNIT));
17919 if (vec_flags & VEC_SVE_DATA)
17920 return aarch64_sve_valid_immediate (val64, info);
17921 else
17922 return aarch64_advsimd_valid_immediate (val64, info, which);
17925 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
17926 has a step in the range of INDEX. Return the index expression if so,
17927 otherwise return null. */
17929 aarch64_check_zero_based_sve_index_immediate (rtx x)
17931 rtx base, step;
17932 if (const_vec_series_p (x, &base, &step)
17933 && base == const0_rtx
17934 && aarch64_sve_index_immediate_p (step))
17935 return step;
17936 return NULL_RTX;
17939 /* Check if immediate shift constants are within range. */
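/* For example, for 16-bit elements a left-shift immediate should lie
   in [0, 15] while a right-shift immediate should lie in [1, 16].  */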
17940 bool
17941 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
17943 x = unwrap_const_vec_duplicate (x);
17944 if (!CONST_INT_P (x))
17945 return false;
17946 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
17947 if (left)
17948 return IN_RANGE (INTVAL (x), 0, bit_width - 1);
17949 else
17950 return IN_RANGE (INTVAL (x), 1, bit_width);
17953 /* Return the bitmask CONST_INT to select the bits required by a zero extract
17954 operation of width WIDTH at bit position POS. */
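/* For instance, WIDTH == 8 and POS == 16 should give
   ((1 << 8) - 1) << 16 == 0xff0000.  */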
17957 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
17959 gcc_assert (CONST_INT_P (width));
17960 gcc_assert (CONST_INT_P (pos));
17962 unsigned HOST_WIDE_INT mask
17963 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
17964 return GEN_INT (mask << UINTVAL (pos));
17967 bool
17968 aarch64_mov_operand_p (rtx x, machine_mode mode)
17970 if (GET_CODE (x) == HIGH
17971 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
17972 return true;
17974 if (CONST_INT_P (x))
17975 return true;
17977 if (VECTOR_MODE_P (GET_MODE (x)))
17979 /* Require predicate constants to be VNx16BI before RA, so that we
17980 force everything to have a canonical form. */
17981 if (!lra_in_progress
17982 && !reload_completed
17983 && GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_BOOL
17984 && GET_MODE (x) != VNx16BImode)
17985 return false;
17987 return aarch64_simd_valid_immediate (x, NULL);
17990 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
17991 return true;
17993 if (TARGET_SVE && aarch64_sve_cnt_immediate_p (x))
17994 return true;
17996 return aarch64_classify_symbolic_expression (x)
17997 == SYMBOL_TINY_ABSOLUTE;
18000 /* Return a const_int vector of VAL. */
18002 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
18004 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
18005 return gen_const_vec_duplicate (mode, c);
18008 /* Check OP is a legal scalar immediate for the MOVI instruction. */
18010 bool
18011 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
18013 machine_mode vmode;
18015 vmode = aarch64_simd_container_mode (mode, 64);
18016 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
18017 return aarch64_simd_valid_immediate (op_v, NULL);
18020 /* Construct and return a PARALLEL RTX vector with elements numbering the
18021 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
18022 the vector - from the perspective of the architecture. This does not
18023 line up with GCC's perspective on lane numbers, so we end up with
18024 different masks depending on our target endian-ness. The diagram
18025 below may help. We must draw the distinction when building masks
18026 which select one half of the vector. An instruction selecting
18027 architectural low-lanes for a big-endian target must be described using
18028 a mask selecting GCC high-lanes.
18030 Big-Endian Little-Endian
18032 GCC 0 1 2 3 3 2 1 0
18033 | x | x | x | x | | x | x | x | x |
18034 Architecture 3 2 1 0 3 2 1 0
18036 Low Mask: { 2, 3 } { 0, 1 }
18037 High Mask: { 0, 1 } { 2, 3 }
18039 MODE Is the mode of the vector and NUNITS is the number of units in it. */
18042 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
18044 rtvec v = rtvec_alloc (nunits / 2);
18045 int high_base = nunits / 2;
18046 int low_base = 0;
18047 int base;
18048 rtx t1;
18049 int i;
18051 if (BYTES_BIG_ENDIAN)
18052 base = high ? low_base : high_base;
18053 else
18054 base = high ? high_base : low_base;
18056 for (i = 0; i < nunits / 2; i++)
18057 RTVEC_ELT (v, i) = GEN_INT (base + i);
18059 t1 = gen_rtx_PARALLEL (mode, v);
18060 return t1;
18063 /* Check OP for validity as a PARALLEL RTX vector with elements
18064 numbering the lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE)
18065 half, from the perspective of the architecture. See the diagram above
18066 aarch64_simd_vect_par_cnst_half for more details. */
18068 bool
18069 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
18070 bool high)
18072 int nelts;
18073 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
18074 return false;
18076 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
18077 HOST_WIDE_INT count_op = XVECLEN (op, 0);
18078 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
18079 int i = 0;
18081 if (count_op != count_ideal)
18082 return false;
18084 for (i = 0; i < count_ideal; i++)
18086 rtx elt_op = XVECEXP (op, 0, i);
18087 rtx elt_ideal = XVECEXP (ideal, 0, i);
18089 if (!CONST_INT_P (elt_op)
18090 || INTVAL (elt_ideal) != INTVAL (elt_op))
18091 return false;
18093 return true;
18096 /* Return a PARALLEL containing NELTS elements, with element I equal
18097 to BASE + I * STEP. */
18100 aarch64_gen_stepped_int_parallel (unsigned int nelts, int base, int step)
18102 rtvec vec = rtvec_alloc (nelts);
18103 for (unsigned int i = 0; i < nelts; ++i)
18104 RTVEC_ELT (vec, i) = gen_int_mode (base + i * step, DImode);
18105 return gen_rtx_PARALLEL (VOIDmode, vec);
18108 /* Return true if OP is a PARALLEL of CONST_INTs that form a linear
18109 series with step STEP. */
18111 bool
18112 aarch64_stepped_int_parallel_p (rtx op, int step)
18114 if (GET_CODE (op) != PARALLEL || !CONST_INT_P (XVECEXP (op, 0, 0)))
18115 return false;
18117 unsigned HOST_WIDE_INT base = UINTVAL (XVECEXP (op, 0, 0));
18118 for (int i = 1; i < XVECLEN (op, 0); ++i)
18119 if (!CONST_INT_P (XVECEXP (op, 0, i))
18120 || UINTVAL (XVECEXP (op, 0, i)) != base + i * step)
18121 return false;
18123 return true;
18126 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
18127 HIGH (exclusive). */
18128 void
18129 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
18130 const_tree exp)
18132 HOST_WIDE_INT lane;
18133 gcc_assert (CONST_INT_P (operand));
18134 lane = INTVAL (operand);
18136 if (lane < low || lane >= high)
18138 if (exp)
18139 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
18140 else
18141 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
18145 /* Perform endian correction on lane number N, which indexes a vector
18146 of mode MODE, and return the result as an SImode rtx. */
18149 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
18151 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
18154 /* Return TRUE if OP is a valid vector addressing mode. */
18156 bool
18157 aarch64_simd_mem_operand_p (rtx op)
18159 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
18160 || REG_P (XEXP (op, 0)));
18163 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
18165 bool
18166 aarch64_sve_ld1r_operand_p (rtx op)
18168 struct aarch64_address_info addr;
18169 scalar_mode mode;
18171 return (MEM_P (op)
18172 && is_a <scalar_mode> (GET_MODE (op), &mode)
18173 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
18174 && addr.type == ADDRESS_REG_IMM
18175 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
18178 /* Return true if OP is a valid MEM operand for an SVE LD1R{Q,O} instruction
18179 where the size of the read data is specified by `mode` and the size of the
18180 vector elements is specified by `elem_mode`. */
18181 bool
18182 aarch64_sve_ld1rq_ld1ro_operand_p (rtx op, machine_mode mode,
18183 scalar_mode elem_mode)
18185 struct aarch64_address_info addr;
18186 if (!MEM_P (op)
18187 || !aarch64_classify_address (&addr, XEXP (op, 0), elem_mode, false))
18188 return false;
18190 if (addr.type == ADDRESS_REG_IMM)
18191 return offset_4bit_signed_scaled_p (mode, addr.const_offset);
18193 if (addr.type == ADDRESS_REG_REG)
18194 return (1U << addr.shift) == GET_MODE_SIZE (elem_mode);
18196 return false;
18199 /* Return true if OP is a valid MEM operand for an SVE LD1RQ instruction. */
18200 bool
18201 aarch64_sve_ld1rq_operand_p (rtx op)
18203 return aarch64_sve_ld1rq_ld1ro_operand_p (op, TImode,
18204 GET_MODE_INNER (GET_MODE (op)));
18207 /* Return true if OP is a valid MEM operand for an SVE LD1RO instruction for
18208 accessing a vector where the element size is specified by `elem_mode`. */
18209 bool
18210 aarch64_sve_ld1ro_operand_p (rtx op, scalar_mode elem_mode)
18212 return aarch64_sve_ld1rq_ld1ro_operand_p (op, OImode, elem_mode);
18215 /* Return true if OP is a valid MEM operand for an SVE LDFF1 instruction. */
18216 bool
18217 aarch64_sve_ldff1_operand_p (rtx op)
18219 if (!MEM_P (op))
18220 return false;
18222 struct aarch64_address_info addr;
18223 if (!aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op), false))
18224 return false;
18226 if (addr.type == ADDRESS_REG_IMM)
18227 return known_eq (addr.const_offset, 0);
18229 return addr.type == ADDRESS_REG_REG;
18232 /* Return true if OP is a valid MEM operand for an SVE LDNF1 instruction. */
18233 bool
18234 aarch64_sve_ldnf1_operand_p (rtx op)
18236 struct aarch64_address_info addr;
18238 return (MEM_P (op)
18239 && aarch64_classify_address (&addr, XEXP (op, 0),
18240 GET_MODE (op), false)
18241 && addr.type == ADDRESS_REG_IMM);
18244 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
18245 The conditions for STR are the same. */
18246 bool
18247 aarch64_sve_ldr_operand_p (rtx op)
18249 struct aarch64_address_info addr;
18251 return (MEM_P (op)
18252 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
18253 false, ADDR_QUERY_ANY)
18254 && addr.type == ADDRESS_REG_IMM);
18257 /* Return true if OP is a valid address for an SVE PRF[BHWD] instruction,
18258 addressing memory of mode MODE. */
18259 bool
18260 aarch64_sve_prefetch_operand_p (rtx op, machine_mode mode)
18262 struct aarch64_address_info addr;
18263 if (!aarch64_classify_address (&addr, op, mode, false))
18264 return false;
18266 if (addr.type == ADDRESS_REG_IMM)
18267 return known_eq (addr.const_offset, 0);
18269 return addr.type == ADDRESS_REG_REG;
18272 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
18273 We need to be able to access the individual pieces, so the range
18274 is different from LD[234] and ST[234]. */
18275 bool
18276 aarch64_sve_struct_memory_operand_p (rtx op)
18278 if (!MEM_P (op))
18279 return false;
18281 machine_mode mode = GET_MODE (op);
18282 struct aarch64_address_info addr;
18283 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
18284 ADDR_QUERY_ANY)
18285 || addr.type != ADDRESS_REG_IMM)
18286 return false;
18288 poly_int64 first = addr.const_offset;
18289 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
18290 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
18291 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
18294 /* Emit a register copy from operand to operand, taking care not to
18295 early-clobber source registers in the process.
18297 COUNT is the number of components into which the copy needs to be
18298 decomposed. */
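/* Rough example of the hazard being avoided: copying a two-register
   value from {v1, v2} to {v2, v3} in ascending order would clobber v2
   before it is read, so the descending loop below is used when the
   destination overlaps the source at a higher register number.  */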
18299 void
18300 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
18301 unsigned int count)
18303 unsigned int i;
18304 int rdest = REGNO (operands[0]);
18305 int rsrc = REGNO (operands[1]);
18307 if (!reg_overlap_mentioned_p (operands[0], operands[1])
18308 || rdest < rsrc)
18309 for (i = 0; i < count; i++)
18310 emit_move_insn (gen_rtx_REG (mode, rdest + i),
18311 gen_rtx_REG (mode, rsrc + i));
18312 else
18313 for (i = 0; i < count; i++)
18314 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
18315 gen_rtx_REG (mode, rsrc + count - i - 1));
18318 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
18319 one of the VSTRUCT modes: OI, CI, or XI. */
18321 aarch64_simd_attr_length_rglist (machine_mode mode)
18323 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
18324 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
18327 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
18328 alignment of a vector to 128 bits. SVE predicates have an alignment of
18329 16 bits. */
18330 static HOST_WIDE_INT
18331 aarch64_simd_vector_alignment (const_tree type)
18333 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
18334 be set for non-predicate vectors of booleans. Modes are the most
18335 direct way we have of identifying real SVE predicate types. */
18336 if (GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL)
18337 return 16;
18338 widest_int min_size
18339 = constant_lower_bound (wi::to_poly_widest (TYPE_SIZE (type)));
18340 return wi::umin (min_size, 128).to_uhwi ();
18343 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
18344 static poly_uint64
18345 aarch64_vectorize_preferred_vector_alignment (const_tree type)
18347 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
18349 /* If the length of the vector is fixed, try to align to that length,
18350 otherwise don't try to align at all. */
18351 HOST_WIDE_INT result;
18352 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
18353 result = TYPE_ALIGN (TREE_TYPE (type));
18354 return result;
18356 return TYPE_ALIGN (type);
18359 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
18360 static bool
18361 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
18363 if (is_packed)
18364 return false;
18366 /* For fixed-length vectors, check that the vectorizer will aim for
18367 full-vector alignment. This isn't true for generic GCC vectors
18368 that are wider than the ABI maximum of 128 bits. */
18369 poly_uint64 preferred_alignment =
18370 aarch64_vectorize_preferred_vector_alignment (type);
18371 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
18372 && maybe_ne (wi::to_widest (TYPE_SIZE (type)),
18373 preferred_alignment))
18374 return false;
18376 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
18377 return true;
18380 /* Return true if the vector misalignment factor is supported by the
18381 target. */
18382 static bool
18383 aarch64_builtin_support_vector_misalignment (machine_mode mode,
18384 const_tree type, int misalignment,
18385 bool is_packed)
18387 if (TARGET_SIMD && STRICT_ALIGNMENT)
18389 /* Return if movmisalign pattern is not supported for this mode. */
18390 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
18391 return false;
18393 /* Misalignment factor is unknown at compile time. */
18394 if (misalignment == -1)
18395 return false;
18397 return default_builtin_support_vector_misalignment (mode, type, misalignment,
18398 is_packed);
18401 /* If VALS is a vector constant that can be loaded into a register
18402 using DUP, generate instructions to do so and return an RTX to
18403 assign to the register. Otherwise return NULL_RTX. */
18404 static rtx
18405 aarch64_simd_dup_constant (rtx vals)
18407 machine_mode mode = GET_MODE (vals);
18408 machine_mode inner_mode = GET_MODE_INNER (mode);
18409 rtx x;
18411 if (!const_vec_duplicate_p (vals, &x))
18412 return NULL_RTX;
18414 /* We can load this constant by using DUP and a constant in a
18415 single ARM register. This will be cheaper than a vector
18416 load. */
18417 x = copy_to_mode_reg (inner_mode, x);
18418 return gen_vec_duplicate (mode, x);
18422 /* Generate code to load VALS, which is a PARALLEL containing only
18423 constants (for vec_init) or CONST_VECTOR, efficiently into a
18424 register. Returns an RTX to copy into the register, or NULL_RTX
18425 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
18426 static rtx
18427 aarch64_simd_make_constant (rtx vals)
18429 machine_mode mode = GET_MODE (vals);
18430 rtx const_dup;
18431 rtx const_vec = NULL_RTX;
18432 int n_const = 0;
18433 int i;
18435 if (GET_CODE (vals) == CONST_VECTOR)
18436 const_vec = vals;
18437 else if (GET_CODE (vals) == PARALLEL)
18439 /* A CONST_VECTOR must contain only CONST_INTs and
18440 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
18441 Only store valid constants in a CONST_VECTOR. */
18442 int n_elts = XVECLEN (vals, 0);
18443 for (i = 0; i < n_elts; ++i)
18445 rtx x = XVECEXP (vals, 0, i);
18446 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18447 n_const++;
18449 if (n_const == n_elts)
18450 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
18452 else
18453 gcc_unreachable ();
18455 if (const_vec != NULL_RTX
18456 && aarch64_simd_valid_immediate (const_vec, NULL))
18457 /* Load using MOVI/MVNI. */
18458 return const_vec;
18459 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
18460 /* Loaded using DUP. */
18461 return const_dup;
18462 else if (const_vec != NULL_RTX)
18463 /* Load from constant pool. We cannot take advantage of single-cycle
18464 LD1 because we need a PC-relative addressing mode. */
18465 return const_vec;
18466 else
18467 /* A PARALLEL containing something not valid inside CONST_VECTOR.
18468 We cannot construct an initializer. */
18469 return NULL_RTX;
18472 /* Expand a vector initialisation sequence, such that TARGET is
18473 initialised to contain VALS. */
18475 void
18476 aarch64_expand_vector_init (rtx target, rtx vals)
18478 machine_mode mode = GET_MODE (target);
18479 scalar_mode inner_mode = GET_MODE_INNER (mode);
18480 /* The number of vector elements. */
18481 int n_elts = XVECLEN (vals, 0);
18482 /* The number of vector elements which are not constant. */
18483 int n_var = 0;
18484 rtx any_const = NULL_RTX;
18485 /* The first element of vals. */
18486 rtx v0 = XVECEXP (vals, 0, 0);
18487 bool all_same = true;
18489 /* This is a special vec_init<M><N> where N is not an element mode but a
18490 vector mode with half the elements of M. We expect to find two entries
18491 of mode N in VALS and we must put their concatenation into TARGET. */
18492 if (XVECLEN (vals, 0) == 2 && VECTOR_MODE_P (GET_MODE (XVECEXP (vals, 0, 0))))
18494 gcc_assert (known_eq (GET_MODE_SIZE (mode),
18495 2 * GET_MODE_SIZE (GET_MODE (XVECEXP (vals, 0, 0)))));
18496 rtx lo = XVECEXP (vals, 0, 0);
18497 rtx hi = XVECEXP (vals, 0, 1);
18498 machine_mode narrow_mode = GET_MODE (lo);
18499 gcc_assert (GET_MODE_INNER (narrow_mode) == inner_mode);
18500 gcc_assert (narrow_mode == GET_MODE (hi));
18502 /* When we want to concatenate a half-width vector with zeroes we can
18503 use the aarch64_combinez[_be] patterns. Just make sure that the
18504 zeroes are in the right half. */
18505 if (BYTES_BIG_ENDIAN
18506 && aarch64_simd_imm_zero (lo, narrow_mode)
18507 && general_operand (hi, narrow_mode))
18508 emit_insn (gen_aarch64_combinez_be (narrow_mode, target, hi, lo));
18509 else if (!BYTES_BIG_ENDIAN
18510 && aarch64_simd_imm_zero (hi, narrow_mode)
18511 && general_operand (lo, narrow_mode))
18512 emit_insn (gen_aarch64_combinez (narrow_mode, target, lo, hi));
18513 else
18515 /* Else create the two half-width registers and combine them. */
18516 if (!REG_P (lo))
18517 lo = force_reg (GET_MODE (lo), lo);
18518 if (!REG_P (hi))
18519 hi = force_reg (GET_MODE (hi), hi);
18521 if (BYTES_BIG_ENDIAN)
18522 std::swap (lo, hi);
18523 emit_insn (gen_aarch64_simd_combine (narrow_mode, target, lo, hi));
18525 return;
18528 /* Count the number of variable elements to initialise. */
18529 for (int i = 0; i < n_elts; ++i)
18531 rtx x = XVECEXP (vals, 0, i);
18532 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
18533 ++n_var;
18534 else
18535 any_const = x;
18537 all_same &= rtx_equal_p (x, v0);
18540 /* No variable elements, hand off to aarch64_simd_make_constant which knows
18541 how best to handle this. */
18542 if (n_var == 0)
18544 rtx constant = aarch64_simd_make_constant (vals);
18545 if (constant != NULL_RTX)
18547 emit_move_insn (target, constant);
18548 return;
18552 /* Splat a single non-constant element if we can. */
18553 if (all_same)
18555 rtx x = copy_to_mode_reg (inner_mode, v0);
18556 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18557 return;
18560 enum insn_code icode = optab_handler (vec_set_optab, mode);
18561 gcc_assert (icode != CODE_FOR_nothing);
18563 /* If there are only variable elements, try to optimize
18564 the insertion using dup for the most common element
18565 followed by insertions. */
18567 /* The algorithm will fill matches[*][0] with the earliest matching element,
18568 and matches[X][1] with the count of duplicate elements (if X is the
18569 earliest element which has duplicates). */
18571 if (n_var == n_elts && n_elts <= 16)
18573 int matches[16][2] = {0};
18574 for (int i = 0; i < n_elts; i++)
18576 for (int j = 0; j <= i; j++)
18578 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
18580 matches[i][0] = j;
18581 matches[j][1]++;
18582 break;
18586 int maxelement = 0;
18587 int maxv = 0;
18588 for (int i = 0; i < n_elts; i++)
18589 if (matches[i][1] > maxv)
18591 maxelement = i;
18592 maxv = matches[i][1];
18595 /* Create a duplicate of the most common element, unless all elements
18596 are equally useless to us, in which case just immediately set the
18597 vector register using the first element. */
18599 if (maxv == 1)
18601 /* For vectors of two 64-bit elements, we can do even better. */
18602 if (n_elts == 2
18603 && (inner_mode == E_DImode
18604 || inner_mode == E_DFmode))
18607 rtx x0 = XVECEXP (vals, 0, 0);
18608 rtx x1 = XVECEXP (vals, 0, 1);
18609 /* Combine can pick up this case, but handling it directly
18610 here leaves clearer RTL.
18612 This is load_pair_lanes<mode>, and also gives us a clean-up
18613 for store_pair_lanes<mode>. */
18614 if (memory_operand (x0, inner_mode)
18615 && memory_operand (x1, inner_mode)
18616 && !STRICT_ALIGNMENT
18617 && rtx_equal_p (XEXP (x1, 0),
18618 plus_constant (Pmode,
18619 XEXP (x0, 0),
18620 GET_MODE_SIZE (inner_mode))))
18622 rtx t;
18623 if (inner_mode == DFmode)
18624 t = gen_load_pair_lanesdf (target, x0, x1);
18625 else
18626 t = gen_load_pair_lanesdi (target, x0, x1);
18627 emit_insn (t);
18628 return;
18631 /* The subreg-move sequence below will move into lane zero of the
18632 vector register. For big-endian we want that position to hold
18633 the last element of VALS. */
18634 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
18635 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18636 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
18638 else
18640 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
18641 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
18644 /* Insert the rest. */
18645 for (int i = 0; i < n_elts; i++)
18647 rtx x = XVECEXP (vals, 0, i);
18648 if (matches[i][0] == maxelement)
18649 continue;
18650 x = copy_to_mode_reg (inner_mode, x);
18651 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
18653 return;
18656 /* Initialise a vector which is part-variable. We want to first try
18657 to build those lanes which are constant in the most efficient way we
18658 can. */
18659 if (n_var != n_elts)
18661 rtx copy = copy_rtx (vals);
18663 /* Load constant part of vector. We really don't care what goes into the
18664 parts we will overwrite, but we're more likely to be able to load the
18665 constant efficiently if it has fewer, larger, repeating parts
18666 (see aarch64_simd_valid_immediate). */
18667 for (int i = 0; i < n_elts; i++)
18669 rtx x = XVECEXP (vals, 0, i);
18670 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18671 continue;
18672 rtx subst = any_const;
18673 for (int bit = n_elts / 2; bit > 0; bit /= 2)
18675 /* Look in the copied vector, as more elements are const. */
18676 rtx test = XVECEXP (copy, 0, i ^ bit);
18677 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
18679 subst = test;
18680 break;
18683 XVECEXP (copy, 0, i) = subst;
18685 aarch64_expand_vector_init (target, copy);
18688 /* Insert the variable lanes directly. */
18689 for (int i = 0; i < n_elts; i++)
18691 rtx x = XVECEXP (vals, 0, i);
18692 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
18693 continue;
18694 x = copy_to_mode_reg (inner_mode, x);
18695 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
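/* A hypothetical standalone sketch (plain ints instead of rtxes) of the
   duplicate-counting heuristic used above for an all-variable vector:
   matches[i][0] records the earliest element equal to element i and
   matches[j][1] counts how many elements map to j; the lane with the
   highest count is broadcast with DUP and every other lane is inserted
   afterwards.  For {a, b, a, a} the counts are {3, 1, 0, 0}, so a is
   broadcast and only lane 1 needs an insert.  */

static int
most_common_lane (const int *elts, int n, int matches[][2])
{
  for (int i = 0; i < n; i++)
    matches[i][0] = matches[i][1] = 0;

  for (int i = 0; i < n; i++)
    for (int j = 0; j <= i; j++)
      if (elts[i] == elts[j])
	{
	  matches[i][0] = j;
	  matches[j][1]++;
	  break;
	}

  int maxelement = 0, maxv = 0;
  for (int i = 0; i < n; i++)
    if (matches[i][1] > maxv)
      {
	maxelement = i;
	maxv = matches[i][1];
      }
  return maxelement;
}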
18699 /* Emit RTL corresponding to:
18700 insr TARGET, ELEM. */
18702 static void
18703 emit_insr (rtx target, rtx elem)
18705 machine_mode mode = GET_MODE (target);
18706 scalar_mode elem_mode = GET_MODE_INNER (mode);
18707 elem = force_reg (elem_mode, elem);
18709 insn_code icode = optab_handler (vec_shl_insert_optab, mode);
18710 gcc_assert (icode != CODE_FOR_nothing);
18711 emit_insn (GEN_FCN (icode) (target, target, elem));
18714 /* Subroutine of aarch64_sve_expand_vector_init for handling
18715 trailing constants.
18716 This function works as follows:
18717 (a) Create a new vector consisting of trailing constants.
18718 (b) Initialize TARGET with the constant vector using emit_move_insn.
18719 (c) Insert remaining elements in TARGET using insr.
18720 NELTS is the total number of elements in the original vector, while
18721 NELTS_REQD is the number of elements that are actually
18722 significant.
18724 ??? The heuristic used is to do the above only if the number of constants
18725 is at least half the total number of elements. May need fine tuning. */
18727 static bool
18728 aarch64_sve_expand_vector_init_handle_trailing_constants
18729 (rtx target, const rtx_vector_builder &builder, int nelts, int nelts_reqd)
18731 machine_mode mode = GET_MODE (target);
18732 scalar_mode elem_mode = GET_MODE_INNER (mode);
18733 int n_trailing_constants = 0;
18735 for (int i = nelts_reqd - 1;
18736 i >= 0 && valid_for_const_vector_p (elem_mode, builder.elt (i));
18737 i--)
18738 n_trailing_constants++;
18740 if (n_trailing_constants >= nelts_reqd / 2)
18742 /* Try to use the natural pattern of BUILDER to extend the trailing
18743 constant elements to a full vector. Replace any variables in the
18744 extra elements with zeros.
18746 ??? It would be better if the builders supported "don't care"
18747 elements, with the builder filling in whichever elements
18748 give the most compact encoding. */
18749 rtx_vector_builder v (mode, nelts, 1);
18750 for (int i = 0; i < nelts; i++)
18752 rtx x = builder.elt (i + nelts_reqd - n_trailing_constants);
18753 if (!valid_for_const_vector_p (elem_mode, x))
18754 x = const0_rtx;
18755 v.quick_push (x);
18757 rtx const_vec = v.build ();
18758 emit_move_insn (target, const_vec);
18760 for (int i = nelts_reqd - n_trailing_constants - 1; i >= 0; i--)
18761 emit_insr (target, builder.elt (i));
18763 return true;
18766 return false;
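/* A hypothetical sketch of the heuristic above: count how many elements at
   the end of the initialiser are compile-time constants and take the
   "move a constant vector, then INSR the remaining elements" path only
   when those constants make up at least half of the required elements.  */

#include <stdbool.h>

static bool
use_trailing_constant_path (const bool *elt_is_const, int nelts_reqd)
{
  int n_trailing = 0;
  for (int i = nelts_reqd - 1; i >= 0 && elt_is_const[i]; i--)
    n_trailing++;
  return n_trailing >= nelts_reqd / 2;
}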
18769 /* Subroutine of aarch64_sve_expand_vector_init.
18770 Works as follows:
18771 (a) Initialize TARGET by broadcasting element NELTS_REQD - 1 of BUILDER.
18772 (b) Skip trailing elements from BUILDER, which are the same as
18773 element NELTS_REQD - 1.
18774 (c) Insert earlier elements in reverse order in TARGET using insr. */
18776 static void
18777 aarch64_sve_expand_vector_init_insert_elems (rtx target,
18778 const rtx_vector_builder &builder,
18779 int nelts_reqd)
18781 machine_mode mode = GET_MODE (target);
18782 scalar_mode elem_mode = GET_MODE_INNER (mode);
18784 struct expand_operand ops[2];
18785 enum insn_code icode = optab_handler (vec_duplicate_optab, mode);
18786 gcc_assert (icode != CODE_FOR_nothing);
18788 create_output_operand (&ops[0], target, mode);
18789 create_input_operand (&ops[1], builder.elt (nelts_reqd - 1), elem_mode);
18790 expand_insn (icode, 2, ops);
18792 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18793 for (int i = nelts_reqd - ndups - 1; i >= 0; i--)
18794 emit_insr (target, builder.elt (i));
18797 /* Subroutine of aarch64_sve_expand_vector_init to handle case
18798 when all trailing elements of builder are same.
18799 This works as follows:
18800 (a) Use expand_insn interface to broadcast last vector element in TARGET.
18801 (b) Insert remaining elements in TARGET using insr.
18803 ??? The heuristic used is to do the above if the number of identical trailing
18804 elements is at least 3/4 of the total number of elements, loosely based on
18805 the heuristic from mostly_zeros_p. May need fine-tuning. */
18807 static bool
18808 aarch64_sve_expand_vector_init_handle_trailing_same_elem
18809 (rtx target, const rtx_vector_builder &builder, int nelts_reqd)
18811 int ndups = builder.count_dups (nelts_reqd - 1, -1, -1);
18812 if (ndups >= (3 * nelts_reqd) / 4)
18814 aarch64_sve_expand_vector_init_insert_elems (target, builder,
18815 nelts_reqd - ndups + 1);
18816 return true;
18819 return false;
18822 /* Initialize register TARGET from BUILDER. NELTS is the constant number
18823 of elements in BUILDER.
18825 The function tries to initialize TARGET from BUILDER if it fits one
18826 of the special cases outlined below.
18828 Failing that, the function divides BUILDER into two sub-vectors:
18829 v_even = even elements of BUILDER;
18830 v_odd = odd elements of BUILDER;
18832 and recursively calls itself with v_even and v_odd.
18834 if (recursive call succeeded for v_even or v_odd)
18835 TARGET = zip (v_even, v_odd)
18837 The function returns true if it managed to build TARGET from BUILDER
18838 with one of the special cases, false otherwise.
18840 Example: {a, 1, b, 2, c, 3, d, 4}
18842 The vector gets divided into:
18843 v_even = {a, b, c, d}
18844 v_odd = {1, 2, 3, 4}
18846 aarch64_sve_expand_vector_init(v_odd) hits case 1 and
18847 initializes tmp2 from the constant vector v_odd using emit_move_insn.
18849 aarch64_sve_expand_vector_init(v_even) fails since v_even contains
18850 4 distinct variable elements, so we construct tmp1 from v_even using insr:
18851 tmp1 = dup(d)
18852 insr tmp1, c
18853 insr tmp1, b
18854 insr tmp1, a
18856 And finally:
18857 TARGET = zip (tmp1, tmp2)
18858 which sets TARGET to {a, 1, b, 2, c, 3, d, 4}. */
18860 static bool
18861 aarch64_sve_expand_vector_init (rtx target, const rtx_vector_builder &builder,
18862 int nelts, int nelts_reqd)
18864 machine_mode mode = GET_MODE (target);
18866 /* Case 1: Vector contains trailing constants. */
18868 if (aarch64_sve_expand_vector_init_handle_trailing_constants
18869 (target, builder, nelts, nelts_reqd))
18870 return true;
18872 /* Case 2: Vector contains leading constants. */
18874 rtx_vector_builder rev_builder (mode, nelts_reqd, 1);
18875 for (int i = 0; i < nelts_reqd; i++)
18876 rev_builder.quick_push (builder.elt (nelts_reqd - i - 1));
18877 rev_builder.finalize ();
18879 if (aarch64_sve_expand_vector_init_handle_trailing_constants
18880 (target, rev_builder, nelts, nelts_reqd))
18882 emit_insn (gen_aarch64_sve_rev (mode, target, target));
18883 return true;
18886 /* Case 3: Vector contains trailing same element. */
18888 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18889 (target, builder, nelts_reqd))
18890 return true;
18892 /* Case 4: Vector contains leading same element. */
18894 if (aarch64_sve_expand_vector_init_handle_trailing_same_elem
18895 (target, rev_builder, nelts_reqd) && nelts_reqd == nelts)
18897 emit_insn (gen_aarch64_sve_rev (mode, target, target));
18898 return true;
18901 /* Avoid recursing below 4-elements.
18902 ??? The threshold 4 may need fine-tuning. */
18904 if (nelts_reqd <= 4)
18905 return false;
18907 rtx_vector_builder v_even (mode, nelts, 1);
18908 rtx_vector_builder v_odd (mode, nelts, 1);
18910 for (int i = 0; i < nelts * 2; i += 2)
18912 v_even.quick_push (builder.elt (i));
18913 v_odd.quick_push (builder.elt (i + 1));
18916 v_even.finalize ();
18917 v_odd.finalize ();
18919 rtx tmp1 = gen_reg_rtx (mode);
18920 bool did_even_p = aarch64_sve_expand_vector_init (tmp1, v_even,
18921 nelts, nelts_reqd / 2);
18923 rtx tmp2 = gen_reg_rtx (mode);
18924 bool did_odd_p = aarch64_sve_expand_vector_init (tmp2, v_odd,
18925 nelts, nelts_reqd / 2);
18927 if (!did_even_p && !did_odd_p)
18928 return false;
18930 /* Initialize v_even and v_odd using INSR if it didn't match any of the
18931 special cases and zip v_even, v_odd. */
18933 if (!did_even_p)
18934 aarch64_sve_expand_vector_init_insert_elems (tmp1, v_even, nelts_reqd / 2);
18936 if (!did_odd_p)
18937 aarch64_sve_expand_vector_init_insert_elems (tmp2, v_odd, nelts_reqd / 2);
18939 rtvec v = gen_rtvec (2, tmp1, tmp2);
18940 emit_set_insn (target, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
18941 return true;
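/* A small self-contained demonstration (hypothetical, on plain ints) of the
   decomposition used above: splitting {a, 1, b, 2, c, 3, d, 4} into its
   even-indexed and odd-indexed elements and then interleaving the two
   halves again (which is what the final ZIP1 does) reproduces the original
   sequence.  Assumes N is even and at most 32.  */

#include <assert.h>

static void
split_and_zip (const int *in, int n, int *out)
{
  int even[16], odd[16];
  for (int i = 0; i < n / 2; i++)
    {
      even[i] = in[2 * i];		/* v_even */
      odd[i] = in[2 * i + 1];		/* v_odd */
    }
  for (int i = 0; i < n / 2; i++)
    {
      out[2 * i] = even[i];		/* zip: even lanes */
      out[2 * i + 1] = odd[i];		/* zip: odd lanes */
    }
}

int
main (void)
{
  int in[8] = { 10, 1, 20, 2, 30, 3, 40, 4 };
  int out[8];
  split_and_zip (in, 8, out);
  for (int i = 0; i < 8; i++)
    assert (out[i] == in[i]);
  return 0;
}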
18944 /* Initialize register TARGET from the elements in PARALLEL rtx VALS. */
18946 void
18947 aarch64_sve_expand_vector_init (rtx target, rtx vals)
18949 machine_mode mode = GET_MODE (target);
18950 int nelts = XVECLEN (vals, 0);
18952 rtx_vector_builder v (mode, nelts, 1);
18953 for (int i = 0; i < nelts; i++)
18954 v.quick_push (XVECEXP (vals, 0, i));
18955 v.finalize ();
18957 /* If neither sub-vector of v could be initialized specially,
18958 then use INSR to insert all elements from v into TARGET.
18959 ??? This might not be optimal for vectors with large
18960 initializers like 16-element or above.
18961 For nelts < 4, it probably isn't useful to handle specially. */
18963 if (nelts < 4
18964 || !aarch64_sve_expand_vector_init (target, v, nelts, nelts))
18965 aarch64_sve_expand_vector_init_insert_elems (target, v, nelts);
18968 /* Check whether VALUE is a vector constant in which every element
18969 is either a power of 2 or a negated power of 2. If so, return
18970 a constant vector of log2s, and flip CODE between PLUS and MINUS
18971 if VALUE contains negated powers of 2. Return NULL_RTX otherwise. */
18973 static rtx
18974 aarch64_convert_mult_to_shift (rtx value, rtx_code &code)
18976 if (GET_CODE (value) != CONST_VECTOR)
18977 return NULL_RTX;
18979 rtx_vector_builder builder;
18980 if (!builder.new_unary_operation (GET_MODE (value), value, false))
18981 return NULL_RTX;
18983 scalar_mode int_mode = GET_MODE_INNER (GET_MODE (value));
18984 /* 1 if the result of the multiplication must be negated,
18985 0 if it mustn't, or -1 if we don't yet care. */
18986 int negate = -1;
18987 unsigned int encoded_nelts = const_vector_encoded_nelts (value);
18988 for (unsigned int i = 0; i < encoded_nelts; ++i)
18990 rtx elt = CONST_VECTOR_ENCODED_ELT (value, i);
18991 if (!CONST_SCALAR_INT_P (elt))
18992 return NULL_RTX;
18993 rtx_mode_t val (elt, int_mode);
18994 wide_int pow2 = wi::neg (val);
18995 if (val != pow2)
18997 /* It matters whether we negate or not. Make that choice,
18998 and make sure that it's consistent with previous elements. */
18999 if (negate == !wi::neg_p (val))
19000 return NULL_RTX;
19001 negate = wi::neg_p (val);
19002 if (!negate)
19003 pow2 = val;
19005 /* POW2 is now the value that we want to be a power of 2. */
19006 int shift = wi::exact_log2 (pow2);
19007 if (shift < 0)
19008 return NULL_RTX;
19009 builder.quick_push (gen_int_mode (shift, int_mode));
19011 if (negate == -1)
19012 /* PLUS and MINUS are equivalent; canonicalize on PLUS. */
19013 code = PLUS;
19014 else if (negate == 1)
19015 code = code == PLUS ? MINUS : PLUS;
19016 return builder.build ();
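/* A hypothetical scalar version of the transformation above: if MULT is
   +/- 2^k, return true and set *SHIFT to k and *NEGATE to whether the
   product has to be subtracted rather than added, since
   x * -(2^k) == -(x << k).  */

#include <stdbool.h>
#include <stdint.h>

static bool
mult_to_shift (int64_t mult, int *shift, bool *negate)
{
  *negate = mult < 0;
  uint64_t pow2 = *negate ? -(uint64_t) mult : (uint64_t) mult;
  if (pow2 == 0 || (pow2 & (pow2 - 1)) != 0)
    return false;			/* Not a power of 2.  */
  int k = 0;
  while ((pow2 >> k) != 1)
    k++;
  *shift = k;
  return true;
}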
19019 /* Prepare for an integer SVE multiply-add or multiply-subtract pattern;
19020 CODE is PLUS for the former and MINUS for the latter. OPERANDS is the
19021 operands array, in the same order as for fma_optab. Return true if
19022 the function emitted all the necessary instructions, false if the caller
19023 should generate the pattern normally with the new OPERANDS array. */
19025 bool
19026 aarch64_prepare_sve_int_fma (rtx *operands, rtx_code code)
19028 machine_mode mode = GET_MODE (operands[0]);
19029 if (rtx shifts = aarch64_convert_mult_to_shift (operands[2], code))
19031 rtx product = expand_binop (mode, vashl_optab, operands[1], shifts,
19032 NULL_RTX, true, OPTAB_DIRECT);
19033 force_expand_binop (mode, code == PLUS ? add_optab : sub_optab,
19034 operands[3], product, operands[0], true,
19035 OPTAB_DIRECT);
19036 return true;
19038 operands[2] = force_reg (mode, operands[2]);
19039 return false;
19042 /* Likewise, but for a conditional pattern. */
19044 bool
19045 aarch64_prepare_sve_cond_int_fma (rtx *operands, rtx_code code)
19047 machine_mode mode = GET_MODE (operands[0]);
19048 if (rtx shifts = aarch64_convert_mult_to_shift (operands[3], code))
19050 rtx product = expand_binop (mode, vashl_optab, operands[2], shifts,
19051 NULL_RTX, true, OPTAB_DIRECT);
19052 emit_insn (gen_cond (code, mode, operands[0], operands[1],
19053 operands[4], product, operands[5]));
19054 return true;
19056 operands[3] = force_reg (mode, operands[3]);
19057 return false;
19060 static unsigned HOST_WIDE_INT
19061 aarch64_shift_truncation_mask (machine_mode mode)
19063 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
19064 return 0;
19065 return GET_MODE_UNIT_BITSIZE (mode) - 1;
19068 /* Select a format to encode pointers in exception handling data. */
19070 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
19072 int type;
19073 switch (aarch64_cmodel)
19075 case AARCH64_CMODEL_TINY:
19076 case AARCH64_CMODEL_TINY_PIC:
19077 case AARCH64_CMODEL_SMALL:
19078 case AARCH64_CMODEL_SMALL_PIC:
19079 case AARCH64_CMODEL_SMALL_SPIC:
19080 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
19081 for everything. */
19082 type = DW_EH_PE_sdata4;
19083 break;
19084 default:
19085 /* No assumptions here. 8-byte relocs required. */
19086 type = DW_EH_PE_sdata8;
19087 break;
19089 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
19092 /* Output .variant_pcs for aarch64_vector_pcs function symbols. */
19094 static void
19095 aarch64_asm_output_variant_pcs (FILE *stream, const tree decl, const char* name)
19097 if (TREE_CODE (decl) == FUNCTION_DECL)
19099 arm_pcs pcs = (arm_pcs) fndecl_abi (decl).id ();
19100 if (pcs == ARM_PCS_SIMD || pcs == ARM_PCS_SVE)
19102 fprintf (stream, "\t.variant_pcs\t");
19103 assemble_name (stream, name);
19104 fprintf (stream, "\n");
19109 /* The last .arch and .tune assembly strings that we printed. */
19110 static std::string aarch64_last_printed_arch_string;
19111 static std::string aarch64_last_printed_tune_string;
19113 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
19114 by the function fndecl. */
19116 void
19117 aarch64_declare_function_name (FILE *stream, const char* name,
19118 tree fndecl)
19120 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
19122 struct cl_target_option *targ_options;
19123 if (target_parts)
19124 targ_options = TREE_TARGET_OPTION (target_parts);
19125 else
19126 targ_options = TREE_TARGET_OPTION (target_option_current_node);
19127 gcc_assert (targ_options);
19129 const struct processor *this_arch
19130 = aarch64_get_arch (targ_options->x_explicit_arch);
19132 uint64_t isa_flags = targ_options->x_aarch64_isa_flags;
19133 std::string extension
19134 = aarch64_get_extension_string_for_isa_flags (isa_flags,
19135 this_arch->flags);
19136 /* Only update the assembler .arch string if it is distinct from the last
19137 such string we printed. */
19138 std::string to_print = this_arch->name + extension;
19139 if (to_print != aarch64_last_printed_arch_string)
19141 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
19142 aarch64_last_printed_arch_string = to_print;
19145 /* Print the cpu name we're tuning for in the comments; it might be
19146 useful to readers of the generated asm. Do it only when it changes
19147 from function to function and verbose assembly is requested. */
19148 const struct processor *this_tune
19149 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
19151 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
19153 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
19154 this_tune->name);
19155 aarch64_last_printed_tune_string = this_tune->name;
19158 aarch64_asm_output_variant_pcs (stream, fndecl, name);
19160 /* Don't forget the type directive for ELF. */
19161 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
19162 ASM_OUTPUT_LABEL (stream, name);
19164 cfun->machine->label_is_assembled = true;
19167 /* Implement PRINT_PATCHABLE_FUNCTION_ENTRY. Check if the patch area is after
19168 the function label and emit a BTI if necessary. */
19170 void
19171 aarch64_print_patchable_function_entry (FILE *file,
19172 unsigned HOST_WIDE_INT patch_area_size,
19173 bool record_p)
19175 if (cfun->machine->label_is_assembled
19176 && aarch64_bti_enabled ()
19177 && !cgraph_node::get (cfun->decl)->only_called_directly_p ())
19179 /* Remove the BTI that follows the patch area and insert a new BTI
19180 before the patch area right after the function label. */
19181 rtx_insn *insn = next_real_nondebug_insn (get_insns ());
19182 if (insn
19183 && INSN_P (insn)
19184 && GET_CODE (PATTERN (insn)) == UNSPEC_VOLATILE
19185 && XINT (PATTERN (insn), 1) == UNSPECV_BTI_C)
19186 delete_insn (insn);
19187 asm_fprintf (file, "\thint\t34 // bti c\n");
19190 default_print_patchable_function_entry (file, patch_area_size, record_p);
19193 /* Implement ASM_OUTPUT_DEF_FROM_DECLS. Output .variant_pcs for aliases. */
19195 void
19196 aarch64_asm_output_alias (FILE *stream, const tree decl, const tree target)
19198 const char *name = XSTR (XEXP (DECL_RTL (decl), 0), 0);
19199 const char *value = IDENTIFIER_POINTER (target);
19200 aarch64_asm_output_variant_pcs (stream, decl, name);
19201 ASM_OUTPUT_DEF (stream, name, value);
19204 /* Implement ASM_OUTPUT_EXTERNAL. Output .variant_pcs for undefined
19205 function symbol references. */
19207 void
19208 aarch64_asm_output_external (FILE *stream, tree decl, const char* name)
19210 default_elf_asm_output_external (stream, decl, name);
19211 aarch64_asm_output_variant_pcs (stream, decl, name);
19214 /* Triggered after a .cfi_startproc directive is emitted into the assembly file.
19215 Used to output the .cfi_b_key_frame directive when signing the current
19216 function with the B key. */
19218 void
19219 aarch64_post_cfi_startproc (FILE *f, tree ignored ATTRIBUTE_UNUSED)
19221 if (cfun->machine->frame.laid_out && aarch64_return_address_signing_enabled ()
19222 && aarch64_ra_sign_key == AARCH64_KEY_B)
19223 asm_fprintf (f, "\t.cfi_b_key_frame\n");
19226 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
19228 static void
19229 aarch64_start_file (void)
19231 struct cl_target_option *default_options
19232 = TREE_TARGET_OPTION (target_option_default_node);
19234 const struct processor *default_arch
19235 = aarch64_get_arch (default_options->x_explicit_arch);
19236 uint64_t default_isa_flags = default_options->x_aarch64_isa_flags;
19237 std::string extension
19238 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
19239 default_arch->flags);
19241 aarch64_last_printed_arch_string = default_arch->name + extension;
19242 aarch64_last_printed_tune_string = "";
19243 asm_fprintf (asm_out_file, "\t.arch %s\n",
19244 aarch64_last_printed_arch_string.c_str ());
19246 default_file_start ();
19249 /* Emit load exclusive. */
19251 static void
19252 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
19253 rtx mem, rtx model_rtx)
19255 if (mode == TImode)
19256 emit_insn (gen_aarch64_load_exclusive_pair (gen_lowpart (DImode, rval),
19257 gen_highpart (DImode, rval),
19258 mem, model_rtx));
19259 else
19260 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
19263 /* Emit store exclusive. */
19265 static void
19266 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
19267 rtx mem, rtx rval, rtx model_rtx)
19269 if (mode == TImode)
19270 emit_insn (gen_aarch64_store_exclusive_pair
19271 (bval, mem, operand_subword (rval, 0, 0, TImode),
19272 operand_subword (rval, 1, 0, TImode), model_rtx));
19273 else
19274 emit_insn (gen_aarch64_store_exclusive (mode, bval, mem, rval, model_rtx));
19277 /* Mark the previous jump instruction as unlikely. */
19279 static void
19280 aarch64_emit_unlikely_jump (rtx insn)
19282 rtx_insn *jump = emit_jump_insn (insn);
19283 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
19286 /* We store the names of the various atomic helpers in a 5x4 array.
19287 Return the libcall function given MODE, MODEL and NAMES. */
19290 aarch64_atomic_ool_func(machine_mode mode, rtx model_rtx,
19291 const atomic_ool_names *names)
19293 memmodel model = memmodel_base (INTVAL (model_rtx));
19294 int mode_idx, model_idx;
19296 switch (mode)
19298 case E_QImode:
19299 mode_idx = 0;
19300 break;
19301 case E_HImode:
19302 mode_idx = 1;
19303 break;
19304 case E_SImode:
19305 mode_idx = 2;
19306 break;
19307 case E_DImode:
19308 mode_idx = 3;
19309 break;
19310 case E_TImode:
19311 mode_idx = 4;
19312 break;
19313 default:
19314 gcc_unreachable ();
19317 switch (model)
19319 case MEMMODEL_RELAXED:
19320 model_idx = 0;
19321 break;
19322 case MEMMODEL_CONSUME:
19323 case MEMMODEL_ACQUIRE:
19324 model_idx = 1;
19325 break;
19326 case MEMMODEL_RELEASE:
19327 model_idx = 2;
19328 break;
19329 case MEMMODEL_ACQ_REL:
19330 case MEMMODEL_SEQ_CST:
19331 model_idx = 3;
19332 break;
19333 default:
19334 gcc_unreachable ();
19337 return init_one_libfunc_visibility (names->str[mode_idx][model_idx],
19338 VISIBILITY_HIDDEN);
19341 #define DEF0(B, N) \
19342 { "__aarch64_" #B #N "_relax", \
19343 "__aarch64_" #B #N "_acq", \
19344 "__aarch64_" #B #N "_rel", \
19345 "__aarch64_" #B #N "_acq_rel" }
19347 #define DEF4(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), \
19348 { NULL, NULL, NULL, NULL }
19349 #define DEF5(B) DEF0(B, 1), DEF0(B, 2), DEF0(B, 4), DEF0(B, 8), DEF0(B, 16)
19351 static const atomic_ool_names aarch64_ool_cas_names = { { DEF5(cas) } };
19352 const atomic_ool_names aarch64_ool_swp_names = { { DEF4(swp) } };
19353 const atomic_ool_names aarch64_ool_ldadd_names = { { DEF4(ldadd) } };
19354 const atomic_ool_names aarch64_ool_ldset_names = { { DEF4(ldset) } };
19355 const atomic_ool_names aarch64_ool_ldclr_names = { { DEF4(ldclr) } };
19356 const atomic_ool_names aarch64_ool_ldeor_names = { { DEF4(ldeor) } };
19358 #undef DEF0
19359 #undef DEF4
19360 #undef DEF5
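/* A hypothetical illustration of the naming scheme encoded by the table
   above: each out-of-line helper is called
   "__aarch64_" <operation> <access size in bytes> "_" <ordering>, with the
   ordering suffix one of relax, acq, rel or acq_rel.  */

#include <stdio.h>

static void
print_ool_name (const char *op, int size, const char *order)
{
  printf ("__aarch64_%s%d_%s\n", op, size, order);
}

int
main (void)
{
  print_ool_name ("cas", 4, "acq");	/* __aarch64_cas4_acq */
  print_ool_name ("ldadd", 8, "relax");	/* __aarch64_ldadd8_relax */
  print_ool_name ("swp", 2, "rel");	/* __aarch64_swp2_rel */
  return 0;
}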
19362 /* Expand a compare and swap pattern. */
19364 void
19365 aarch64_expand_compare_and_swap (rtx operands[])
19367 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x, cc_reg;
19368 machine_mode mode, r_mode;
19370 bval = operands[0];
19371 rval = operands[1];
19372 mem = operands[2];
19373 oldval = operands[3];
19374 newval = operands[4];
19375 is_weak = operands[5];
19376 mod_s = operands[6];
19377 mod_f = operands[7];
19378 mode = GET_MODE (mem);
19380 /* Normally the succ memory model must be stronger than fail, but in the
19381 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
19382 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
19383 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
19384 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
19385 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
19387 r_mode = mode;
19388 if (mode == QImode || mode == HImode)
19390 r_mode = SImode;
19391 rval = gen_reg_rtx (r_mode);
19394 if (TARGET_LSE)
19396 /* The CAS insn requires oldval and rval overlap, but we need to
19397 have a copy of oldval saved across the operation to tell if
19398 the operation is successful. */
19399 if (reg_overlap_mentioned_p (rval, oldval))
19400 rval = copy_to_mode_reg (r_mode, oldval);
19401 else
19402 emit_move_insn (rval, gen_lowpart (r_mode, oldval));
19404 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem,
19405 newval, mod_s));
19406 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19408 else if (TARGET_OUTLINE_ATOMICS)
19410 /* Oldval must satisfy compare afterward. */
19411 if (!aarch64_plus_operand (oldval, mode))
19412 oldval = force_reg (mode, oldval);
19413 rtx func = aarch64_atomic_ool_func (mode, mod_s, &aarch64_ool_cas_names);
19414 rval = emit_library_call_value (func, NULL_RTX, LCT_NORMAL, r_mode,
19415 oldval, mode, newval, mode,
19416 XEXP (mem, 0), Pmode);
19417 cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19419 else
19421 /* The oldval predicate varies by mode. Test it and force to reg. */
19422 insn_code code = code_for_aarch64_compare_and_swap (mode);
19423 if (!insn_data[code].operand[2].predicate (oldval, mode))
19424 oldval = force_reg (mode, oldval);
19426 emit_insn (GEN_FCN (code) (rval, mem, oldval, newval,
19427 is_weak, mod_s, mod_f));
19428 cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
19431 if (r_mode != mode)
19432 rval = gen_lowpart (mode, rval);
19433 emit_move_insn (operands[1], rval);
19435 x = gen_rtx_EQ (SImode, cc_reg, const0_rtx);
19436 emit_insn (gen_rtx_SET (bval, x));
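/* A hypothetical sketch, using the C11 enumerators, of the success-ordering
   promotion described in the comment above: if the failure ordering asks
   for acquire semantics while the success ordering is only release, the
   success ordering is strengthened to acq_rel so the acquire part is not
   lost.  */

#include <stdatomic.h>

static memory_order
effective_success_order (memory_order succ, memory_order fail)
{
  if (fail == memory_order_acquire && succ == memory_order_release)
    return memory_order_acq_rel;
  return succ;
}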
19439 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
19440 sequence implementing an atomic operation. */
19442 static void
19443 aarch64_emit_post_barrier (enum memmodel model)
19445 const enum memmodel base_model = memmodel_base (model);
19447 if (is_mm_sync (model)
19448 && (base_model == MEMMODEL_ACQUIRE
19449 || base_model == MEMMODEL_ACQ_REL
19450 || base_model == MEMMODEL_SEQ_CST))
19452 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
19456 /* Split a compare and swap pattern. */
19458 void
19459 aarch64_split_compare_and_swap (rtx operands[])
19461 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19462 gcc_assert (epilogue_completed);
19464 rtx rval, mem, oldval, newval, scratch, x, model_rtx;
19465 machine_mode mode;
19466 bool is_weak;
19467 rtx_code_label *label1, *label2;
19468 enum memmodel model;
19470 rval = operands[0];
19471 mem = operands[1];
19472 oldval = operands[2];
19473 newval = operands[3];
19474 is_weak = (operands[4] != const0_rtx);
19475 model_rtx = operands[5];
19476 scratch = operands[7];
19477 mode = GET_MODE (mem);
19478 model = memmodel_from_int (INTVAL (model_rtx));
19480 /* When OLDVAL is zero and we want the strong version we can emit a tighter
19481 loop:
19482 .label1:
19483 LD[A]XR rval, [mem]
19484 CBNZ rval, .label2
19485 ST[L]XR scratch, newval, [mem]
19486 CBNZ scratch, .label1
19487 .label2:
19488 CMP rval, 0. */
19489 bool strong_zero_p = (!is_weak && !aarch64_track_speculation &&
19490 oldval == const0_rtx && mode != TImode);
19492 label1 = NULL;
19493 if (!is_weak)
19495 label1 = gen_label_rtx ();
19496 emit_label (label1);
19498 label2 = gen_label_rtx ();
19500 /* The initial load can be relaxed for a __sync operation since a final
19501 barrier will be emitted to stop code hoisting. */
19502 if (is_mm_sync (model))
19503 aarch64_emit_load_exclusive (mode, rval, mem, GEN_INT (MEMMODEL_RELAXED));
19504 else
19505 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
19507 if (strong_zero_p)
19508 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
19509 else
19511 rtx cc_reg = aarch64_gen_compare_reg_maybe_ze (NE, rval, oldval, mode);
19512 x = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
19514 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19515 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
19516 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19518 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
19520 if (!is_weak)
19522 if (aarch64_track_speculation)
19524 /* Emit an explicit compare instruction, so that we can correctly
19525 track the condition codes. */
19526 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19527 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19529 else
19530 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
19532 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19533 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
19534 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19536 else
19537 aarch64_gen_compare_reg (NE, scratch, const0_rtx);
19539 emit_label (label2);
19541 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
19542 to set the condition flags. If this is not used it will be removed by
19543 later passes. */
19544 if (strong_zero_p)
19545 aarch64_gen_compare_reg (NE, rval, const0_rtx);
19547 /* Emit any final barrier needed for a __sync operation. */
19548 if (is_mm_sync (model))
19549 aarch64_emit_post_barrier (model);
19552 /* Split an atomic operation. */
19554 void
19555 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
19556 rtx value, rtx model_rtx, rtx cond)
19558 /* Split after prolog/epilog to avoid interactions with shrinkwrapping. */
19559 gcc_assert (epilogue_completed);
19561 machine_mode mode = GET_MODE (mem);
19562 machine_mode wmode = (mode == DImode ? DImode : SImode);
19563 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
19564 const bool is_sync = is_mm_sync (model);
19565 rtx_code_label *label;
19566 rtx x;
19568 /* Split the atomic operation into a sequence. */
19569 label = gen_label_rtx ();
19570 emit_label (label);
19572 if (new_out)
19573 new_out = gen_lowpart (wmode, new_out);
19574 if (old_out)
19575 old_out = gen_lowpart (wmode, old_out);
19576 else
19577 old_out = new_out;
19578 value = simplify_gen_subreg (wmode, value, mode, 0);
19580 /* The initial load can be relaxed for a __sync operation since a final
19581 barrier will be emitted to stop code hoisting. */
19582 if (is_sync)
19583 aarch64_emit_load_exclusive (mode, old_out, mem,
19584 GEN_INT (MEMMODEL_RELAXED));
19585 else
19586 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
19588 switch (code)
19590 case SET:
19591 new_out = value;
19592 break;
19594 case NOT:
19595 x = gen_rtx_AND (wmode, old_out, value);
19596 emit_insn (gen_rtx_SET (new_out, x));
19597 x = gen_rtx_NOT (wmode, new_out);
19598 emit_insn (gen_rtx_SET (new_out, x));
19599 break;
19601 case MINUS:
19602 if (CONST_INT_P (value))
19604 value = GEN_INT (-INTVAL (value));
19605 code = PLUS;
19607 /* Fall through. */
19609 default:
19610 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
19611 emit_insn (gen_rtx_SET (new_out, x));
19612 break;
19615 aarch64_emit_store_exclusive (mode, cond, mem,
19616 gen_lowpart (mode, new_out), model_rtx);
19618 if (aarch64_track_speculation)
19620 /* Emit an explicit compare instruction, so that we can correctly
19621 track the condition codes. */
19622 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
19623 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
19625 else
19626 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
19628 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
19629 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
19630 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
19632 /* Emit any final barrier needed for a __sync operation. */
19633 if (is_sync)
19634 aarch64_emit_post_barrier (model);
19637 static void
19638 aarch64_init_libfuncs (void)
19640 /* Half-precision float operations. The compiler handles all operations
19641 with NULL libfuncs by converting to SFmode. */
19643 /* Conversions. */
19644 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
19645 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
19647 /* Arithmetic. */
19648 set_optab_libfunc (add_optab, HFmode, NULL);
19649 set_optab_libfunc (sdiv_optab, HFmode, NULL);
19650 set_optab_libfunc (smul_optab, HFmode, NULL);
19651 set_optab_libfunc (neg_optab, HFmode, NULL);
19652 set_optab_libfunc (sub_optab, HFmode, NULL);
19654 /* Comparisons. */
19655 set_optab_libfunc (eq_optab, HFmode, NULL);
19656 set_optab_libfunc (ne_optab, HFmode, NULL);
19657 set_optab_libfunc (lt_optab, HFmode, NULL);
19658 set_optab_libfunc (le_optab, HFmode, NULL);
19659 set_optab_libfunc (ge_optab, HFmode, NULL);
19660 set_optab_libfunc (gt_optab, HFmode, NULL);
19661 set_optab_libfunc (unord_optab, HFmode, NULL);
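/* A hypothetical source-level example of the effect described above for an
   AArch64 target without native half-precision arithmetic: the addition is
   performed in single precision, so each __fp16 operand is widened to float
   (in hardware, or through conversion helpers such as __gnu_h2f_ieee), the
   floats are added, and the result is narrowed back.  */

__fp16
add_half (__fp16 a, __fp16 b)
{
  return a + b;
}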
19664 /* Target hook for c_mode_for_suffix. */
19665 static machine_mode
19666 aarch64_c_mode_for_suffix (char suffix)
19668 if (suffix == 'q')
19669 return TFmode;
19671 return VOIDmode;
19674 /* We can only represent floating point constants which will fit in
19675 "quarter-precision" values. These values are characterised by
19676 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
19679 (-1)^s * (n/16) * 2^r
19681 Where:
19682 's' is the sign bit.
19683 'n' is an integer in the range 16 <= n <= 31.
19684 'r' is an integer in the range -3 <= r <= 4. */
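/* A hypothetical standalone check of the formula above: X is representable
   iff X == (-1)^s * (n/16) * 2^r with integer n in [16, 31] and integer
   r in [-3, 4]; zero, infinities and NaNs are excluded.  */

#include <math.h>
#include <stdbool.h>

static bool
quarter_precision_p (double x)
{
  if (x == 0.0 || !isfinite (x))
    return false;
  int exp;
  double m = frexp (fabs (x), &exp);	/* fabs (x) == m * 2^exp, m in [0.5, 1).  */
  double n = m * 32.0;			/* fabs (x) == (n/16) * 2^(exp - 1).  */
  int r = exp - 1;
  return n == floor (n) && n >= 16.0 && n <= 31.0 && r >= -3 && r <= 4;
}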
19686 /* Return true iff X can be represented by a quarter-precision
19687 floating point immediate operand X. Note, we cannot represent 0.0. */
19688 bool
19689 aarch64_float_const_representable_p (rtx x)
19691 /* This represents our current view of how many bits
19692 make up the mantissa. */
19693 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
19694 int exponent;
19695 unsigned HOST_WIDE_INT mantissa, mask;
19696 REAL_VALUE_TYPE r, m;
19697 bool fail;
19699 x = unwrap_const_vec_duplicate (x);
19700 if (!CONST_DOUBLE_P (x))
19701 return false;
19703 if (GET_MODE (x) == VOIDmode
19704 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
19705 return false;
19707 r = *CONST_DOUBLE_REAL_VALUE (x);
19709 /* We cannot represent infinities, NaNs or +/-zero. We won't
19710 know if we have +zero until we analyse the mantissa, but we
19711 can reject the other invalid values. */
19712 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
19713 || REAL_VALUE_MINUS_ZERO (r))
19714 return false;
19716 /* Extract exponent. */
19717 r = real_value_abs (&r);
19718 exponent = REAL_EXP (&r);
19720 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
19721 highest (sign) bit, with a fixed binary point at bit point_pos.
19722 m1 holds the low part of the mantissa, m2 the high part.
19723 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
19724 bits for the mantissa, this can fail (low bits will be lost). */
19725 real_ldexp (&m, &r, point_pos - exponent);
19726 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
19728 /* If the low part of the mantissa has bits set we cannot represent
19729 the value. */
19730 if (w.ulow () != 0)
19731 return false;
19732 /* We have rejected the lower HOST_WIDE_INT, so update our
19733 understanding of how many bits lie in the mantissa and
19734 look only at the high HOST_WIDE_INT. */
19735 mantissa = w.elt (1);
19736 point_pos -= HOST_BITS_PER_WIDE_INT;
19738 /* We can only represent values with a mantissa of the form 1.xxxx. */
19739 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
19740 if ((mantissa & mask) != 0)
19741 return false;
19743 /* Having filtered unrepresentable values, we may now remove all
19744 but the highest 5 bits. */
19745 mantissa >>= point_pos - 5;
19747 /* We cannot represent the value 0.0, so reject it. This is handled
19748 elsewhere. */
19749 if (mantissa == 0)
19750 return false;
19752 /* Then, as bit 4 is always set, we can mask it off, leaving
19753 the mantissa in the range [0, 15]. */
19754 mantissa &= ~(1 << 4);
19755 gcc_assert (mantissa <= 15);
19757 /* GCC internally does not use IEEE754-like encoding (where normalized
19758 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
19759 Our mantissa values are shifted 4 places to the left relative to
19760 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
19761 by 5 places to correct for GCC's representation. */
19762 exponent = 5 - exponent;
19764 return (exponent >= 0 && exponent <= 7);
19767 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
19768 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
19769 output MOVI/MVNI, ORR or BIC immediate. */
19770 char*
19771 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
19772 enum simd_immediate_check which)
19774 bool is_valid;
19775 static char templ[40];
19776 const char *mnemonic;
19777 const char *shift_op;
19778 unsigned int lane_count = 0;
19779 char element_char;
19781 struct simd_immediate_info info;
19783 /* This will return true to show const_vector is legal for use as either
19784 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
19785 It will also update INFO to show how the immediate should be generated.
19786 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
19787 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
19788 gcc_assert (is_valid);
19790 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19791 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
19793 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19795 gcc_assert (info.insn == simd_immediate_info::MOV
19796 && info.u.mov.shift == 0);
19797 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
19798 move immediate path. */
19799 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19800 info.u.mov.value = GEN_INT (0);
19801 else
19803 const unsigned int buf_size = 20;
19804 char float_buf[buf_size] = {'\0'};
19805 real_to_decimal_for_mode (float_buf,
19806 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19807 buf_size, buf_size, 1, info.elt_mode);
19809 if (lane_count == 1)
19810 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
19811 else
19812 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
19813 lane_count, element_char, float_buf);
19814 return templ;
19818 gcc_assert (CONST_INT_P (info.u.mov.value));
19820 if (which == AARCH64_CHECK_MOV)
19822 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
19823 shift_op = (info.u.mov.modifier == simd_immediate_info::MSL
19824 ? "msl" : "lsl");
19825 if (lane_count == 1)
19826 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
19827 mnemonic, UINTVAL (info.u.mov.value));
19828 else if (info.u.mov.shift)
19829 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19830 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
19831 element_char, UINTVAL (info.u.mov.value), shift_op,
19832 info.u.mov.shift);
19833 else
19834 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
19835 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
19836 element_char, UINTVAL (info.u.mov.value));
19838 else
19840 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
19841 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
19842 if (info.u.mov.shift)
19843 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19844 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
19845 element_char, UINTVAL (info.u.mov.value), "lsl",
19846 info.u.mov.shift);
19847 else
19848 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
19849 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
19850 element_char, UINTVAL (info.u.mov.value));
19852 return templ;
19855 char*
19856 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
19859 /* If a floating point number was passed and we want to use it in an
19860 integer mode, convert it to an integer first. */
19861 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
19863 unsigned HOST_WIDE_INT ival;
19864 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
19865 gcc_unreachable ();
19866 immediate = gen_int_mode (ival, mode);
19869 machine_mode vmode;
19870 /* Use a 64-bit mode for everything except DI/DF mode, where we use
19871 a 128-bit vector mode. */
19872 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
19874 vmode = aarch64_simd_container_mode (mode, width);
19875 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
19876 return aarch64_output_simd_mov_immediate (v_op, width);
19879 /* Return the output string to use for moving immediate CONST_VECTOR
19880 into an SVE register. */
19882 char *
19883 aarch64_output_sve_mov_immediate (rtx const_vector)
19885 static char templ[40];
19886 struct simd_immediate_info info;
19887 char element_char;
19889 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
19890 gcc_assert (is_valid);
19892 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19894 machine_mode vec_mode = GET_MODE (const_vector);
19895 if (aarch64_sve_pred_mode_p (vec_mode))
19897 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
19898 if (info.insn == simd_immediate_info::MOV)
19900 gcc_assert (info.u.mov.value == const0_rtx);
19901 snprintf (buf, sizeof (buf), "pfalse\t%%0.b");
19903 else
19905 gcc_assert (info.insn == simd_immediate_info::PTRUE);
19906 unsigned int total_bytes;
19907 if (info.u.pattern == AARCH64_SV_ALL
19908 && BYTES_PER_SVE_VECTOR.is_constant (&total_bytes))
19909 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", element_char,
19910 total_bytes / GET_MODE_SIZE (info.elt_mode));
19911 else
19912 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, %s", element_char,
19913 svpattern_token (info.u.pattern));
19915 return buf;
19918 if (info.insn == simd_immediate_info::INDEX)
19920 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
19921 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
19922 element_char, INTVAL (info.u.index.base),
19923 INTVAL (info.u.index.step));
19924 return templ;
19927 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
19929 if (aarch64_float_const_zero_rtx_p (info.u.mov.value))
19930 info.u.mov.value = GEN_INT (0);
19931 else
19933 const int buf_size = 20;
19934 char float_buf[buf_size] = {};
19935 real_to_decimal_for_mode (float_buf,
19936 CONST_DOUBLE_REAL_VALUE (info.u.mov.value),
19937 buf_size, buf_size, 1, info.elt_mode);
19939 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
19940 element_char, float_buf);
19941 return templ;
19945 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
19946 element_char, INTVAL (info.u.mov.value));
19947 return templ;
19950 /* Return the asm template for a PTRUES. CONST_UNSPEC is the
19951 aarch64_sve_ptrue_svpattern_immediate that describes the predicate
19952 pattern. */
19954 char *
19955 aarch64_output_sve_ptrues (rtx const_unspec)
19957 static char templ[40];
19959 struct simd_immediate_info info;
19960 bool is_valid = aarch64_simd_valid_immediate (const_unspec, &info);
19961 gcc_assert (is_valid && info.insn == simd_immediate_info::PTRUE);
19963 char element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
19964 snprintf (templ, sizeof (templ), "ptrues\t%%0.%c, %s", element_char,
19965 svpattern_token (info.u.pattern));
19966 return templ;
19969 /* Split operands into moves from op[1] + op[2] into op[0]. */
19971 void
19972 aarch64_split_combinev16qi (rtx operands[3])
19974 unsigned int dest = REGNO (operands[0]);
19975 unsigned int src1 = REGNO (operands[1]);
19976 unsigned int src2 = REGNO (operands[2]);
19977 machine_mode halfmode = GET_MODE (operands[1]);
19978 unsigned int halfregs = REG_NREGS (operands[1]);
19979 rtx destlo, desthi;
19981 gcc_assert (halfmode == V16QImode);
19983 if (src1 == dest && src2 == dest + halfregs)
19985 /* No-op move. Can't split to nothing; emit something. */
19986 emit_note (NOTE_INSN_DELETED);
19987 return;
19990 /* Preserve register attributes for variable tracking. */
19991 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
19992 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
19993 GET_MODE_SIZE (halfmode));
19995 /* Special case of reversed high/low parts. */
19996 if (reg_overlap_mentioned_p (operands[2], destlo)
19997 && reg_overlap_mentioned_p (operands[1], desthi))
19999 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20000 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
20001 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
20003 else if (!reg_overlap_mentioned_p (operands[2], destlo))
20005 /* Try to avoid unnecessary moves if part of the result
20006 is in the right place already. */
20007 if (src1 != dest)
20008 emit_move_insn (destlo, operands[1]);
20009 if (src2 != dest + halfregs)
20010 emit_move_insn (desthi, operands[2]);
20012 else
20014 if (src2 != dest + halfregs)
20015 emit_move_insn (desthi, operands[2]);
20016 if (src1 != dest)
20017 emit_move_insn (destlo, operands[1]);
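/* A scalar demonstration (hypothetical) of the three-XOR trick used above
   for the reversed high/low case: the sequence a ^= b; b ^= a; a ^= b;
   exchanges the contents of two distinct objects without a temporary,
   which is why no scratch register is needed.  */

#include <assert.h>

static void
xor_swap (unsigned *a, unsigned *b)
{
  *a ^= *b;
  *b ^= *a;
  *a ^= *b;
}

int
main (void)
{
  unsigned x = 0x1234, y = 0xabcd;
  xor_swap (&x, &y);
  assert (x == 0xabcd && y == 0x1234);
  return 0;
}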
20021 /* vec_perm support. */
20023 struct expand_vec_perm_d
20025 rtx target, op0, op1;
20026 vec_perm_indices perm;
20027 machine_mode vmode;
20028 unsigned int vec_flags;
20029 bool one_vector_p;
20030 bool testing_p;
20033 static bool aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d);
20035 /* Generate a variable permutation. */
20037 static void
20038 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
20040 machine_mode vmode = GET_MODE (target);
20041 bool one_vector_p = rtx_equal_p (op0, op1);
20043 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
20044 gcc_checking_assert (GET_MODE (op0) == vmode);
20045 gcc_checking_assert (GET_MODE (op1) == vmode);
20046 gcc_checking_assert (GET_MODE (sel) == vmode);
20047 gcc_checking_assert (TARGET_SIMD);
20049 if (one_vector_p)
20051 if (vmode == V8QImode)
20053 /* Expand the argument to a V16QI mode by duplicating it. */
20054 rtx pair = gen_reg_rtx (V16QImode);
20055 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
20056 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20058 else
20060 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
20063 else
20065 rtx pair;
20067 if (vmode == V8QImode)
20069 pair = gen_reg_rtx (V16QImode);
20070 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
20071 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
20073 else
20075 pair = gen_reg_rtx (OImode);
20076 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
20077 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
20082 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
20083 NELT is the number of elements in the vector. */
20085 void
20086 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
20087 unsigned int nelt)
20089 machine_mode vmode = GET_MODE (target);
20090 bool one_vector_p = rtx_equal_p (op0, op1);
20091 rtx mask;
20093 /* The TBL instruction does not use a modulo index, so we must take care
20094 of that ourselves. */
20095 mask = aarch64_simd_gen_const_vector_dup (vmode,
20096 one_vector_p ? nelt - 1 : 2 * nelt - 1);
20097 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
20099 /* For big-endian, we also need to reverse the index within the vector
20100 (but not which vector). */
20101 if (BYTES_BIG_ENDIAN)
20103 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
20104 if (!one_vector_p)
20105 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
20106 sel = expand_simple_binop (vmode, XOR, sel, mask,
20107 NULL, 0, OPTAB_LIB_WIDEN);
20109 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
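/* A hypothetical little-endian, byte-level emulation of the masked TBL
   lookup arranged above: the selector is ANDed with 2 * NELT - 1 (or
   NELT - 1 when both inputs are the same), so every index wraps around
   and selects some byte of {OP0, OP1} instead of being out of range.  */

#include <stdint.h>

static void
tbl2_emulate (uint8_t *out, const uint8_t *op0, const uint8_t *op1,
	      const uint8_t *sel, int nelt)
{
  for (int i = 0; i < nelt; i++)
    {
      int idx = sel[i] & (2 * nelt - 1);
      out[i] = idx < nelt ? op0[idx] : op1[idx - nelt];
    }
}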
20112 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
20114 static void
20115 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
20117 emit_insn (gen_rtx_SET (target,
20118 gen_rtx_UNSPEC (GET_MODE (target),
20119 gen_rtvec (2, op0, op1), code)));
20122 /* Expand an SVE vec_perm with the given operands. */
20124 void
20125 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
20127 machine_mode data_mode = GET_MODE (target);
20128 machine_mode sel_mode = GET_MODE (sel);
20129 /* Enforced by the pattern condition. */
20130 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
20132 /* Note: vec_perm indices are supposed to wrap when they go beyond the
20133 size of the two value vectors, i.e. the upper bits of the indices
20134 are effectively ignored. SVE TBL instead produces 0 for any
20135 out-of-range indices, so we need to modulo all the vec_perm indices
20136 to ensure they are all in range. */
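/* Illustrative example: with 4-element vectors and op0 == op1, a vec_perm
   index of 6 should select element 2, but SVE TBL would return 0 for it;
   ANDing the index with 3 first gives the intended element.  */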
20137 rtx sel_reg = force_reg (sel_mode, sel);
20139 /* Check if the sel only references the first values vector. */
20140 if (GET_CODE (sel) == CONST_VECTOR
20141 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
20143 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
20144 return;
20147 /* Check if the two values vectors are the same. */
20148 if (rtx_equal_p (op0, op1))
20150 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
20151 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20152 NULL, 0, OPTAB_DIRECT);
20153 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
20154 return;
20157 /* Run TBL on each value vector and combine the results. */
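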
20159 rtx res0 = gen_reg_rtx (data_mode);
20160 rtx res1 = gen_reg_rtx (data_mode);
20161 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
20162 if (GET_CODE (sel) != CONST_VECTOR
20163 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
20165 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
20166 2 * nunits - 1);
20167 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
20168 NULL, 0, OPTAB_DIRECT);
20170 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
20171 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
20172 NULL, 0, OPTAB_DIRECT);
20173 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
20174 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
20175 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
20176 else
20177 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
20180 /* Recognize patterns suitable for the TRN instructions. */
20181 static bool
20182 aarch64_evpc_trn (struct expand_vec_perm_d *d)
20184 HOST_WIDE_INT odd;
20185 poly_uint64 nelt = d->perm.length ();
20186 rtx out, in0, in1, x;
20187 machine_mode vmode = d->vmode;
20189 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20190 return false;
20192 /* Note that these are little-endian tests.
20193 We correct for big-endian later. */
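/* Illustrative example: for a 4-element vector the accepted index patterns
   are { 0, 4, 2, 6 } (TRN1) and { 1, 5, 3, 7 } (TRN2).  */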
20194 if (!d->perm[0].is_constant (&odd)
20195 || (odd != 0 && odd != 1)
20196 || !d->perm.series_p (0, 2, odd, 2)
20197 || !d->perm.series_p (1, 2, nelt + odd, 2))
20198 return false;
20200 /* Success! */
20201 if (d->testing_p)
20202 return true;
20204 in0 = d->op0;
20205 in1 = d->op1;
20206 /* We don't need a big-endian lane correction for SVE; see the comment
20207 at the head of aarch64-sve.md for details. */
20208 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20210 x = in0, in0 = in1, in1 = x;
20211 odd = !odd;
20213 out = d->target;
20215 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20216 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
20217 return true;
20220 /* Try to re-encode the PERM constant so it combines odd and even elements.
20221 This rewrites constants such as {0, 1, 4, 5}/V4SF to {0, 2}/V2DI.
20222 We retry with this new constant with the full suite of patterns. */
20223 static bool
20224 aarch64_evpc_reencode (struct expand_vec_perm_d *d)
20226 expand_vec_perm_d newd;
20227 unsigned HOST_WIDE_INT nelt;
20229 if (d->vec_flags != VEC_ADVSIMD)
20230 return false;
20232 /* Get the new mode. Always twice the size of the inner
20233 and half the elements. */
20234 poly_uint64 vec_bits = GET_MODE_BITSIZE (d->vmode);
20235 unsigned int new_elt_bits = GET_MODE_UNIT_BITSIZE (d->vmode) * 2;
20236 auto new_elt_mode = int_mode_for_size (new_elt_bits, false).require ();
20237 machine_mode new_mode = aarch64_simd_container_mode (new_elt_mode, vec_bits);
20239 if (new_mode == word_mode)
20240 return false;
20242 /* to_constant is safe since this routine is specific to Advanced SIMD
20243 vectors. */
20244 nelt = d->perm.length ().to_constant ();
20246 vec_perm_builder newpermconst;
20247 newpermconst.new_vector (nelt / 2, nelt / 2, 1);
20249 /* Convert the perm constant if we can. Require even, odd as the pairs. */
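/* Illustrative example: { 2, 3, 6, 7 } becomes { 1, 3 }, whereas a constant
   such as { 1, 2, ... } is rejected because the pair does not start on an
   even element.  */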
20250 for (unsigned int i = 0; i < nelt; i += 2)
20252 poly_int64 elt0 = d->perm[i];
20253 poly_int64 elt1 = d->perm[i + 1];
20254 poly_int64 newelt;
20255 if (!multiple_p (elt0, 2, &newelt) || maybe_ne (elt0 + 1, elt1))
20256 return false;
20257 newpermconst.quick_push (newelt.to_constant ());
20259 newpermconst.finalize ();
20261 newd.vmode = new_mode;
20262 newd.vec_flags = VEC_ADVSIMD;
20263 newd.target = d->target ? gen_lowpart (new_mode, d->target) : NULL;
20264 newd.op0 = d->op0 ? gen_lowpart (new_mode, d->op0) : NULL;
20265 newd.op1 = d->op1 ? gen_lowpart (new_mode, d->op1) : NULL;
20266 newd.testing_p = d->testing_p;
20267 newd.one_vector_p = d->one_vector_p;
20269 newd.perm.new_vector (newpermconst, newd.one_vector_p ? 1 : 2, nelt / 2);
20270 return aarch64_expand_vec_perm_const_1 (&newd);
20273 /* Recognize patterns suitable for the UZP instructions. */
20274 static bool
20275 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
20277 HOST_WIDE_INT odd;
20278 rtx out, in0, in1, x;
20279 machine_mode vmode = d->vmode;
20281 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20282 return false;
20284 /* Note that these are little-endian tests.
20285 We correct for big-endian later. */
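/* Illustrative example: for a 4-element vector the accepted index patterns
   are { 0, 2, 4, 6 } (UZP1) and { 1, 3, 5, 7 } (UZP2).  */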
20286 if (!d->perm[0].is_constant (&odd)
20287 || (odd != 0 && odd != 1)
20288 || !d->perm.series_p (0, 1, odd, 2))
20289 return false;
20291 /* Success! */
20292 if (d->testing_p)
20293 return true;
20295 in0 = d->op0;
20296 in1 = d->op1;
20297 /* We don't need a big-endian lane correction for SVE; see the comment
20298 at the head of aarch64-sve.md for details. */
20299 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20301 x = in0, in0 = in1, in1 = x;
20302 odd = !odd;
20304 out = d->target;
20306 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20307 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
20308 return true;
20311 /* Recognize patterns suitable for the ZIP instructions. */
20312 static bool
20313 aarch64_evpc_zip (struct expand_vec_perm_d *d)
20315 unsigned int high;
20316 poly_uint64 nelt = d->perm.length ();
20317 rtx out, in0, in1, x;
20318 machine_mode vmode = d->vmode;
20320 if (GET_MODE_UNIT_SIZE (vmode) > 8)
20321 return false;
20323 /* Note that these are little-endian tests.
20324 We correct for big-endian later. */
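/* Illustrative example: for a 4-element vector the accepted index patterns
   are { 0, 4, 1, 5 } (ZIP1) and { 2, 6, 3, 7 } (ZIP2).  */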
20325 poly_uint64 first = d->perm[0];
20326 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
20327 || !d->perm.series_p (0, 2, first, 1)
20328 || !d->perm.series_p (1, 2, first + nelt, 1))
20329 return false;
20330 high = maybe_ne (first, 0U);
20332 /* Success! */
20333 if (d->testing_p)
20334 return true;
20336 in0 = d->op0;
20337 in1 = d->op1;
20338 /* We don't need a big-endian lane correction for SVE; see the comment
20339 at the head of aarch64-sve.md for details. */
20340 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
20342 x = in0, in0 = in1, in1 = x;
20343 high = !high;
20345 out = d->target;
20347 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
20348 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
20349 return true;
20352 /* Recognize patterns for the EXT insn. */
20354 static bool
20355 aarch64_evpc_ext (struct expand_vec_perm_d *d)
20357 HOST_WIDE_INT location;
20358 rtx offset;
20360 /* The first element always refers to the first vector.
20361 Check if the extracted indices are increasing by one. */
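/* Illustrative example: { 1, 2, 3, 4 } on 4-element vectors is an EXT with
   an element offset of 1, i.e. the last three elements of op0 followed by
   the first element of op1 in the little-endian view.  */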
20362 if (d->vec_flags == VEC_SVE_PRED
20363 || !d->perm[0].is_constant (&location)
20364 || !d->perm.series_p (0, 1, location, 1))
20365 return false;
20367 /* Success! */
20368 if (d->testing_p)
20369 return true;
20371 /* The case where (location == 0) is a no-op for both big- and little-endian,
20372 and is removed by the mid-end at optimization levels -O1 and higher.
20374 We don't need a big-endian lane correction for SVE; see the comment
20375 at the head of aarch64-sve.md for details. */
20376 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
20378 /* After setup, we want the high elements of the first vector (stored
20379 at the LSB end of the register), and the low elements of the second
20380 vector (stored at the MSB end of the register). So swap. */
20381 std::swap (d->op0, d->op1);
20382 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
20383 to_constant () is safe since this is restricted to Advanced SIMD
20384 vectors. */
20385 location = d->perm.length ().to_constant () - location;
20388 offset = GEN_INT (location);
20389 emit_set_insn (d->target,
20390 gen_rtx_UNSPEC (d->vmode,
20391 gen_rtvec (3, d->op0, d->op1, offset),
20392 UNSPEC_EXT));
20393 return true;
20396 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
20397 within each 64-bit, 32-bit or 16-bit granule. */
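/* Illustrative example: { 3, 2, 1, 0, 7, 6, 5, 4 } on a vector of 16-bit
   elements reverses each 64-bit granule and therefore maps to REV64.  */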
20399 static bool
20400 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
20402 HOST_WIDE_INT diff;
20403 unsigned int i, size, unspec;
20404 machine_mode pred_mode;
20406 if (d->vec_flags == VEC_SVE_PRED
20407 || !d->one_vector_p
20408 || !d->perm[0].is_constant (&diff)
20409 || !diff)
20410 return false;
20412 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
20413 if (size == 8)
20415 unspec = UNSPEC_REV64;
20416 pred_mode = VNx2BImode;
20418 else if (size == 4)
20420 unspec = UNSPEC_REV32;
20421 pred_mode = VNx4BImode;
20423 else if (size == 2)
20425 unspec = UNSPEC_REV16;
20426 pred_mode = VNx8BImode;
20428 else
20429 return false;
20431 unsigned int step = diff + 1;
20432 for (i = 0; i < step; ++i)
20433 if (!d->perm.series_p (i, step, diff - i, step))
20434 return false;
20436 /* Success! */
20437 if (d->testing_p)
20438 return true;
20440 if (d->vec_flags == VEC_SVE_DATA)
20442 machine_mode int_mode = aarch64_sve_int_mode (pred_mode);
20443 rtx target = gen_reg_rtx (int_mode);
20444 if (BYTES_BIG_ENDIAN)
20445 /* The act of taking a subreg between INT_MODE and d->vmode
20446 is itself a reversing operation on big-endian targets;
20447 see the comment at the head of aarch64-sve.md for details.
20448 First reinterpret OP0 as INT_MODE without using a subreg
20449 and without changing the contents. */
20450 emit_insn (gen_aarch64_sve_reinterpret (int_mode, target, d->op0));
20451 else
20453 /* For SVE we use REV[BHW] unspecs derived from the element size
20454 of d->vmode and vector modes whose elements have SIZE bytes.
20455 This ensures that the vector modes match the predicate modes. */
20456 int unspec = aarch64_sve_rev_unspec (d->vmode);
20457 rtx pred = aarch64_ptrue_reg (pred_mode);
20458 emit_insn (gen_aarch64_pred (unspec, int_mode, target, pred,
20459 gen_lowpart (int_mode, d->op0)));
20461 emit_move_insn (d->target, gen_lowpart (d->vmode, target));
20462 return true;
20464 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
20465 emit_set_insn (d->target, src);
20466 return true;
20469 /* Recognize patterns for the REV insn, which reverses elements within
20470 a full vector. */
20472 static bool
20473 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
20475 poly_uint64 nelt = d->perm.length ();
20477 if (!d->one_vector_p || d->vec_flags == VEC_ADVSIMD)
20478 return false;
20480 if (!d->perm.series_p (0, 1, nelt - 1, -1))
20481 return false;
20483 /* Success! */
20484 if (d->testing_p)
20485 return true;
20487 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
20488 emit_set_insn (d->target, src);
20489 return true;
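/* Recognize permutations that broadcast a single element, e.g. { 2, 2, 2, 2 },
   which can be implemented with a DUP of the selected lane.  */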
20492 static bool
20493 aarch64_evpc_dup (struct expand_vec_perm_d *d)
20495 rtx out = d->target;
20496 rtx in0;
20497 HOST_WIDE_INT elt;
20498 machine_mode vmode = d->vmode;
20499 rtx lane;
20501 if (d->vec_flags == VEC_SVE_PRED
20502 || d->perm.encoding ().encoded_nelts () != 1
20503 || !d->perm[0].is_constant (&elt))
20504 return false;
20506 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
20507 return false;
20509 /* Success! */
20510 if (d->testing_p)
20511 return true;
20513 /* The generic preparation in aarch64_expand_vec_perm_const_1
20514 swaps the operand order and the permute indices if it finds
20515 d->perm[0] to be in the second operand. Thus, we can always
20516 use d->op0 and need not do any extra arithmetic to get the
20517 correct lane number. */
20518 in0 = d->op0;
20519 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
20521 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
20522 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
20523 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
20524 return true;
20527 static bool
20528 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
20530 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
20531 machine_mode vmode = d->vmode;
20533 /* Make sure that the indices are constant. */
20534 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
20535 for (unsigned int i = 0; i < encoded_nelts; ++i)
20536 if (!d->perm[i].is_constant ())
20537 return false;
20539 if (d->testing_p)
20540 return true;
20542 /* Generic code will try constant permutation twice. Once with the
20543 original mode and again with the elements lowered to QImode.
20544 So wait and don't do the selector expansion ourselves. */
20545 if (vmode != V8QImode && vmode != V16QImode)
20546 return false;
20548 /* to_constant is safe since this routine is specific to Advanced SIMD
20549 vectors. */
20550 unsigned int nelt = d->perm.length ().to_constant ();
20551 for (unsigned int i = 0; i < nelt; ++i)
20552 /* If big-endian and two vectors we end up with a weird mixed-endian
20553 mode on NEON. Reverse the index within each word but not the word
20554 itself. to_constant is safe because we checked is_constant above. */
20555 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
20556 ? d->perm[i].to_constant () ^ (nelt - 1)
20557 : d->perm[i].to_constant ());
20559 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
20560 sel = force_reg (vmode, sel);
20562 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
20563 return true;
20566 /* Try to implement D using an SVE TBL instruction. */
20568 static bool
20569 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
20571 unsigned HOST_WIDE_INT nelt;
20573 /* Permuting two variable-length vectors could overflow the
20574 index range. */
20575 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
20576 return false;
20578 if (d->testing_p)
20579 return true;
20581 machine_mode sel_mode = related_int_vector_mode (d->vmode).require ();
20582 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
20583 if (d->one_vector_p)
20584 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
20585 else
20586 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
20587 return true;
20590 /* Try to implement D using SVE SEL instruction. */
20592 static bool
20593 aarch64_evpc_sel (struct expand_vec_perm_d *d)
20595 machine_mode vmode = d->vmode;
20596 int unit_size = GET_MODE_UNIT_SIZE (vmode);
20598 if (d->vec_flags != VEC_SVE_DATA
20599 || unit_size > 8)
20600 return false;
20602 int n_patterns = d->perm.encoding ().npatterns ();
20603 poly_int64 vec_len = d->perm.length ();
20605 for (int i = 0; i < n_patterns; ++i)
20606 if (!known_eq (d->perm[i], i)
20607 && !known_eq (d->perm[i], vec_len + i))
20608 return false;
20610 for (int i = n_patterns; i < n_patterns * 2; i++)
20611 if (!d->perm.series_p (i, n_patterns, i, n_patterns)
20612 && !d->perm.series_p (i, n_patterns, vec_len + i, n_patterns))
20613 return false;
20615 if (d->testing_p)
20616 return true;
20618 machine_mode pred_mode = aarch64_sve_pred_mode (vmode);
20620 /* Build a predicate that is true when op0 elements should be used. */
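/* Illustrative example: a permutation { 0, LEN + 1, 2, LEN + 3, ... } takes
   even elements from op0 and odd elements from op1, so the predicate built
   here is { 1, 0, 1, 0, ... }.  */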
20621 rtx_vector_builder builder (pred_mode, n_patterns, 2);
20622 for (int i = 0; i < n_patterns * 2; i++)
20624 rtx elem = known_eq (d->perm[i], i) ? CONST1_RTX (BImode)
20625 : CONST0_RTX (BImode);
20626 builder.quick_push (elem);
20629 rtx const_vec = builder.build ();
20630 rtx pred = force_reg (pred_mode, const_vec);
20631 /* TARGET = PRED ? OP0 : OP1. */
20632 emit_insn (gen_vcond_mask (vmode, vmode, d->target, d->op0, d->op1, pred));
20633 return true;
20636 /* Recognize patterns suitable for the INS instructions. */
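/* Illustrative example: { 0, 1, 6, 3 } on 4-element vectors copies element 2
   of op1 into lane 2 of the result and takes every other lane from op0.  */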
20637 static bool
20638 aarch64_evpc_ins (struct expand_vec_perm_d *d)
20640 machine_mode mode = d->vmode;
20641 unsigned HOST_WIDE_INT nelt;
20643 if (d->vec_flags != VEC_ADVSIMD)
20644 return false;
20646 /* to_constant is safe since this routine is specific to Advanced SIMD
20647 vectors. */
20648 nelt = d->perm.length ().to_constant ();
20649 rtx insv = d->op0;
20651 HOST_WIDE_INT idx = -1;
20653 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
20655 HOST_WIDE_INT elt;
20656 if (!d->perm[i].is_constant (&elt))
20657 return false;
20658 if (elt == (HOST_WIDE_INT) i)
20659 continue;
20660 if (idx != -1)
20662 idx = -1;
20663 break;
20665 idx = i;
20668 if (idx == -1)
20670 insv = d->op1;
20671 for (unsigned HOST_WIDE_INT i = 0; i < nelt; i++)
20673 if (d->perm[i].to_constant () == (HOST_WIDE_INT) (i + nelt))
20674 continue;
20675 if (idx != -1)
20676 return false;
20677 idx = i;
20680 if (idx == -1)
20681 return false;
20684 if (d->testing_p)
20685 return true;
20687 gcc_assert (idx != -1);
20689 unsigned extractindex = d->perm[idx].to_constant ();
20690 rtx extractv = d->op0;
20691 if (extractindex >= nelt)
20693 extractv = d->op1;
20694 extractindex -= nelt;
20696 gcc_assert (extractindex < nelt);
20698 emit_move_insn (d->target, insv);
20699 insn_code icode = code_for_aarch64_simd_vec_copy_lane (mode);
20700 expand_operand ops[5];
20701 create_output_operand (&ops[0], d->target, mode);
20702 create_input_operand (&ops[1], d->target, mode);
20703 create_integer_operand (&ops[2], 1 << idx);
20704 create_input_operand (&ops[3], extractv, mode);
20705 create_integer_operand (&ops[4], extractindex);
20706 expand_insn (icode, 5, ops);
20708 return true;
20711 static bool
20712 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
20714 /* The pattern matching functions above are written to look for a small
20715 number to begin the sequence (0, 1, N/2). If we begin with an index
20716 from the second operand, we can swap the operands. */
20717 poly_int64 nelt = d->perm.length ();
20718 if (known_ge (d->perm[0], nelt))
20720 d->perm.rotate_inputs (1);
20721 std::swap (d->op0, d->op1);
20724 if ((d->vec_flags == VEC_ADVSIMD
20725 || d->vec_flags == VEC_SVE_DATA
20726 || d->vec_flags == VEC_SVE_PRED)
20727 && known_gt (nelt, 1))
20729 if (aarch64_evpc_rev_local (d))
20730 return true;
20731 else if (aarch64_evpc_rev_global (d))
20732 return true;
20733 else if (aarch64_evpc_ext (d))
20734 return true;
20735 else if (aarch64_evpc_dup (d))
20736 return true;
20737 else if (aarch64_evpc_zip (d))
20738 return true;
20739 else if (aarch64_evpc_uzp (d))
20740 return true;
20741 else if (aarch64_evpc_trn (d))
20742 return true;
20743 else if (aarch64_evpc_sel (d))
20744 return true;
20745 else if (aarch64_evpc_ins (d))
20746 return true;
20747 else if (aarch64_evpc_reencode (d))
20748 return true;
20749 if (d->vec_flags == VEC_SVE_DATA)
20750 return aarch64_evpc_sve_tbl (d);
20751 else if (d->vec_flags == VEC_ADVSIMD)
20752 return aarch64_evpc_tbl (d);
20754 return false;
20757 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
20759 static bool
20760 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
20761 rtx op1, const vec_perm_indices &sel)
20763 struct expand_vec_perm_d d;
20765 /* Check whether the mask can be applied to a single vector. */
20766 if (sel.ninputs () == 1
20767 || (op0 && rtx_equal_p (op0, op1)))
20768 d.one_vector_p = true;
20769 else if (sel.all_from_input_p (0))
20771 d.one_vector_p = true;
20772 op1 = op0;
20774 else if (sel.all_from_input_p (1))
20776 d.one_vector_p = true;
20777 op0 = op1;
20779 else
20780 d.one_vector_p = false;
20782 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
20783 sel.nelts_per_input ());
20784 d.vmode = vmode;
20785 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
20786 d.target = target;
20787 d.op0 = op0;
20788 d.op1 = op1;
20789 d.testing_p = !target;
20791 if (!d.testing_p)
20792 return aarch64_expand_vec_perm_const_1 (&d);
20794 rtx_insn *last = get_last_insn ();
20795 bool ret = aarch64_expand_vec_perm_const_1 (&d);
20796 gcc_assert (last == get_last_insn ());
20798 return ret;
20801 /* Generate a byte permute mask for a register of mode MODE,
20802 which has NUNITS units. */
20805 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
20807 /* We have to reverse each vector because we don't have
20808 a permuted load that can reverse-load according to ABI rules. */
20809 rtx mask;
20810 rtvec v = rtvec_alloc (16);
20811 unsigned int i, j;
20812 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
20814 gcc_assert (BYTES_BIG_ENDIAN);
20815 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
20817 for (i = 0; i < nunits; i++)
20818 for (j = 0; j < usize; j++)
20819 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
20820 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
20821 return force_reg (V16QImode, mask);
20824 /* Expand an SVE integer comparison using the SVE equivalent of:
20826 (set TARGET (CODE OP0 OP1)). */
20828 void
20829 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
20831 machine_mode pred_mode = GET_MODE (target);
20832 machine_mode data_mode = GET_MODE (op0);
20833 rtx res = aarch64_sve_emit_int_cmp (target, pred_mode, code, data_mode,
20834 op0, op1);
20835 if (!rtx_equal_p (target, res))
20836 emit_move_insn (target, res);
20839 /* Return the UNSPEC_COND_* code for comparison CODE. */
20841 static unsigned int
20842 aarch64_unspec_cond_code (rtx_code code)
20844 switch (code)
20846 case NE:
20847 return UNSPEC_COND_FCMNE;
20848 case EQ:
20849 return UNSPEC_COND_FCMEQ;
20850 case LT:
20851 return UNSPEC_COND_FCMLT;
20852 case GT:
20853 return UNSPEC_COND_FCMGT;
20854 case LE:
20855 return UNSPEC_COND_FCMLE;
20856 case GE:
20857 return UNSPEC_COND_FCMGE;
20858 case UNORDERED:
20859 return UNSPEC_COND_FCMUO;
20860 default:
20861 gcc_unreachable ();
20865 /* Emit:
20867 (set TARGET (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
20869 where <X> is the operation associated with comparison CODE.
20870 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
20872 static void
20873 aarch64_emit_sve_fp_cond (rtx target, rtx_code code, rtx pred,
20874 bool known_ptrue_p, rtx op0, rtx op1)
20876 rtx flag = gen_int_mode (known_ptrue_p, SImode);
20877 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
20878 gen_rtvec (4, pred, flag, op0, op1),
20879 aarch64_unspec_cond_code (code));
20880 emit_set_insn (target, unspec);
20883 /* Emit the SVE equivalent of:
20885 (set TMP1 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X1>))
20886 (set TMP2 (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X2>))
20887 (set TARGET (ior:PRED_MODE TMP1 TMP2))
20889 where <Xi> is the operation associated with comparison CODEi.
20890 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
20892 static void
20893 aarch64_emit_sve_or_fp_conds (rtx target, rtx_code code1, rtx_code code2,
20894 rtx pred, bool known_ptrue_p, rtx op0, rtx op1)
20896 machine_mode pred_mode = GET_MODE (pred);
20897 rtx tmp1 = gen_reg_rtx (pred_mode);
20898 aarch64_emit_sve_fp_cond (tmp1, code1, pred, known_ptrue_p, op0, op1);
20899 rtx tmp2 = gen_reg_rtx (pred_mode);
20900 aarch64_emit_sve_fp_cond (tmp2, code2, pred, known_ptrue_p, op0, op1);
20901 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
20904 /* Emit the SVE equivalent of:
20906 (set TMP (unspec [PRED KNOWN_PTRUE_P OP0 OP1] UNSPEC_COND_<X>))
20907 (set TARGET (not TMP))
20909 where <X> is the operation associated with comparison CODE.
20910 KNOWN_PTRUE_P is true if PRED is known to be a PTRUE. */
20912 static void
20913 aarch64_emit_sve_invert_fp_cond (rtx target, rtx_code code, rtx pred,
20914 bool known_ptrue_p, rtx op0, rtx op1)
20916 machine_mode pred_mode = GET_MODE (pred);
20917 rtx tmp = gen_reg_rtx (pred_mode);
20918 aarch64_emit_sve_fp_cond (tmp, code, pred, known_ptrue_p, op0, op1);
20919 aarch64_emit_unop (target, one_cmpl_optab, tmp);
20922 /* Expand an SVE floating-point comparison using the SVE equivalent of:
20924 (set TARGET (CODE OP0 OP1))
20926 If CAN_INVERT_P is true, the caller can also handle inverted results;
20927 return true if the result is in fact inverted. */
20929 bool
20930 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
20931 rtx op0, rtx op1, bool can_invert_p)
20933 machine_mode pred_mode = GET_MODE (target);
20934 machine_mode data_mode = GET_MODE (op0);
20936 rtx ptrue = aarch64_ptrue_reg (pred_mode);
20937 switch (code)
20939 case UNORDERED:
20940 /* UNORDERED has no immediate form. */
20941 op1 = force_reg (data_mode, op1);
20942 /* fall through */
20943 case LT:
20944 case LE:
20945 case GT:
20946 case GE:
20947 case EQ:
20948 case NE:
20950 /* There is native support for the comparison. */
20951 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
20952 return false;
20955 case LTGT:
20956 /* This is a trapping operation (LT or GT). */
20957 aarch64_emit_sve_or_fp_conds (target, LT, GT, ptrue, true, op0, op1);
20958 return false;
20960 case UNEQ:
20961 if (!flag_trapping_math)
20963 /* This would trap for signaling NaNs. */
20964 op1 = force_reg (data_mode, op1);
20965 aarch64_emit_sve_or_fp_conds (target, UNORDERED, EQ,
20966 ptrue, true, op0, op1);
20967 return false;
20969 /* fall through */
20970 case UNLT:
20971 case UNLE:
20972 case UNGT:
20973 case UNGE:
20974 if (flag_trapping_math)
20976 /* Work out which elements are ordered. */
20977 rtx ordered = gen_reg_rtx (pred_mode);
20978 op1 = force_reg (data_mode, op1);
20979 aarch64_emit_sve_invert_fp_cond (ordered, UNORDERED,
20980 ptrue, true, op0, op1);
20982 /* Test the opposite condition for the ordered elements,
20983 then invert the result. */
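/* Illustrative example: UNLT becomes NOT (GE) restricted to the ordered
   elements; ordered lanes yield NOT (GE) == LT, while unordered lanes fail
   the predicated GE test and become true after inversion, as UNLT requires.  */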
20984 if (code == UNEQ)
20985 code = NE;
20986 else
20987 code = reverse_condition_maybe_unordered (code);
20988 if (can_invert_p)
20990 aarch64_emit_sve_fp_cond (target, code,
20991 ordered, false, op0, op1);
20992 return true;
20994 aarch64_emit_sve_invert_fp_cond (target, code,
20995 ordered, false, op0, op1);
20996 return false;
20998 break;
21000 case ORDERED:
21001 /* ORDERED has no immediate form. */
21002 op1 = force_reg (data_mode, op1);
21003 break;
21005 default:
21006 gcc_unreachable ();
21009 /* There is native support for the inverse comparison. */
21010 code = reverse_condition_maybe_unordered (code);
21011 if (can_invert_p)
21013 aarch64_emit_sve_fp_cond (target, code, ptrue, true, op0, op1);
21014 return true;
21016 aarch64_emit_sve_invert_fp_cond (target, code, ptrue, true, op0, op1);
21017 return false;
21020 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
21021 of the data being selected and CMP_MODE is the mode of the values being
21022 compared. */
21024 void
21025 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
21026 rtx *ops)
21028 machine_mode pred_mode = aarch64_get_mask_mode (cmp_mode).require ();
21029 rtx pred = gen_reg_rtx (pred_mode);
21030 if (FLOAT_MODE_P (cmp_mode))
21032 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
21033 ops[4], ops[5], true))
21034 std::swap (ops[1], ops[2]);
21036 else
21037 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
21039 if (!aarch64_sve_reg_or_dup_imm (ops[1], data_mode))
21040 ops[1] = force_reg (data_mode, ops[1]);
21041 /* The "false" value can only be zero if the "true" value is a constant. */
21042 if (register_operand (ops[1], data_mode)
21043 || !aarch64_simd_reg_or_zero (ops[2], data_mode))
21044 ops[2] = force_reg (data_mode, ops[2]);
21046 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
21047 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
21050 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
21051 true. However due to issues with register allocation it is preferable
21052 to avoid tying integer scalar and FP scalar modes. Executing integer
21053 operations in general registers is better than treating them as scalar
21054 vector operations. This reduces latency and avoids redundant int<->FP
21055 moves. So tie modes if they are either the same class, or vector modes
21056 with other vector modes, vector structs or any scalar mode. */
21058 static bool
21059 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
21061 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
21062 return true;
21064 /* We specifically want to allow elements of "structure" modes to
21065 be tieable to the structure. This more general condition allows
21066 other rarer situations too. The reason we don't extend this to
21067 predicate modes is that there are no predicate structure modes
21068 nor any specific instructions for extracting part of a predicate
21069 register. */
21070 if (aarch64_vector_data_mode_p (mode1)
21071 && aarch64_vector_data_mode_p (mode2))
21072 return true;
21074 /* Also allow any scalar modes with vectors. */
21075 if (aarch64_vector_mode_supported_p (mode1)
21076 || aarch64_vector_mode_supported_p (mode2))
21077 return true;
21079 return false;
21082 /* Return a new RTX holding the result of moving POINTER forward by
21083 AMOUNT bytes. */
21085 static rtx
21086 aarch64_move_pointer (rtx pointer, poly_int64 amount)
21088 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
21090 return adjust_automodify_address (pointer, GET_MODE (pointer),
21091 next, amount);
21094 /* Return a new RTX holding the result of moving POINTER forward by the
21095 size of the mode it points to. */
21097 static rtx
21098 aarch64_progress_pointer (rtx pointer)
21100 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
21103 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
21104 MODE bytes. */
21106 static void
21107 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
21108 machine_mode mode)
21110 rtx reg = gen_reg_rtx (mode);
21112 /* "Cast" the pointers to the correct mode. */
21113 *src = adjust_address (*src, mode, 0);
21114 *dst = adjust_address (*dst, mode, 0);
21115 /* Emit the memcpy. */
21116 emit_move_insn (reg, *src);
21117 emit_move_insn (*dst, reg);
21118 /* Move the pointers forward. */
21119 *src = aarch64_progress_pointer (*src);
21120 *dst = aarch64_progress_pointer (*dst);
21123 /* Expand cpymem, as if from a __builtin_memcpy. Return true if
21124 we succeed, otherwise return false. */
21126 bool
21127 aarch64_expand_cpymem (rtx *operands)
21129 int n, mode_bits;
21130 rtx dst = operands[0];
21131 rtx src = operands[1];
21132 rtx base;
21133 machine_mode cur_mode = BLKmode, next_mode;
21134 bool speed_p = !optimize_function_for_size_p (cfun);
21136 /* When optimizing for size, give a better estimate of the length of a
21137 memcpy call, but use the default otherwise. Moves larger than 8 bytes
21138 will always require an even number of instructions, and each
21139 operation requires both a load and a store, so divide the max number by 2. */
21140 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
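/* Illustrative example: when optimizing for speed, max_num_moves is 16 / 2
   == 8, so a 64-byte copy (four 16-byte blocks) is expanded inline, while a
   256-byte copy is left to the library memcpy call.  */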
21142 /* We can't do anything smart if the amount to copy is not constant. */
21143 if (!CONST_INT_P (operands[2]))
21144 return false;
21146 n = INTVAL (operands[2]);
21148 /* Try to keep the number of instructions low. For all cases we will do at
21149 most two moves for the residual amount, since we'll always overlap the
21150 remainder. */
21151 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
21152 return false;
21154 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
21155 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
21157 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
21158 src = adjust_automodify_address (src, VOIDmode, base, 0);
21160 /* Convert n to bits to make the rest of the code simpler. */
21161 n = n * BITS_PER_UNIT;
21163 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
21164 larger than TImode, but we should not use them for loads/stores here. */
21165 const int copy_limit = GET_MODE_BITSIZE (TImode);
21167 while (n > 0)
21169 /* Find the largest mode in which to do the copy without over-reading
21170 or over-writing. */
21171 opt_scalar_int_mode mode_iter;
21172 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
21173 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
21174 cur_mode = mode_iter.require ();
21176 gcc_assert (cur_mode != BLKmode);
21178 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
21179 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
21181 n -= mode_bits;
21183 /* Do certain trailing copies as overlapping if it's going to be
21184 cheaper, i.e. if fewer instructions are needed. For instance, for a
21185 15-byte copy it is more efficient to do two overlapping 8-byte copies
21186 than 8 + 6 + 1. */
21187 if (n > 0 && n <= 8 * BITS_PER_UNIT)
21189 next_mode = smallest_mode_for_size (n, MODE_INT);
21190 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
21191 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
21192 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
21193 n = n_bits;
21197 return true;
21200 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
21201 SImode stores. Handle the case when the constant has identical
21202 bottom and top halves. This is beneficial when the two stores can be
21203 merged into an STP and we avoid synthesising potentially expensive
21204 immediates twice. Return true if such a split is possible. */
21206 bool
21207 aarch64_split_dimode_const_store (rtx dst, rtx src)
21209 rtx lo = gen_lowpart (SImode, src);
21210 rtx hi = gen_highpart_mode (SImode, DImode, src);
21212 bool size_p = optimize_function_for_size_p (cfun);
21214 if (!rtx_equal_p (lo, hi))
21215 return false;
21217 unsigned int orig_cost
21218 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
21219 unsigned int lo_cost
21220 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
21222 /* We want to transform:
21223 MOV x1, 49370
21224 MOVK x1, 0x140, lsl 16
21225 MOVK x1, 0xc0da, lsl 32
21226 MOVK x1, 0x140, lsl 48
21227 STR x1, [x0]
21228 into:
21229 MOV w1, 49370
21230 MOVK w1, 0x140, lsl 16
21231 STP w1, w1, [x0]
21232 So we want to perform this only when we save two instructions
21233 or more. When optimizing for size, however, accept any code size
21234 savings we can. */
21235 if (size_p && orig_cost <= lo_cost)
21236 return false;
21238 if (!size_p
21239 && (orig_cost <= lo_cost + 1))
21240 return false;
21242 rtx mem_lo = adjust_address (dst, SImode, 0);
21243 if (!aarch64_mem_pair_operand (mem_lo, SImode))
21244 return false;
21246 rtx tmp_reg = gen_reg_rtx (SImode);
21247 aarch64_expand_mov_immediate (tmp_reg, lo);
21248 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
21249 /* Don't emit an explicit store pair as this may not always be profitable.
21250 Let the sched-fusion logic decide whether to merge them. */
21251 emit_move_insn (mem_lo, tmp_reg);
21252 emit_move_insn (mem_hi, tmp_reg);
21254 return true;
21257 /* Generate RTL for a conditional branch with rtx comparison CODE in
21258 mode CC_MODE. The destination of the unlikely conditional branch
21259 is LABEL_REF. */
21261 void
21262 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
21263 rtx label_ref)
21265 rtx x;
21266 x = gen_rtx_fmt_ee (code, VOIDmode,
21267 gen_rtx_REG (cc_mode, CC_REGNUM),
21268 const0_rtx);
21270 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
21271 gen_rtx_LABEL_REF (VOIDmode, label_ref),
21272 pc_rtx);
21273 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
21276 /* Generate DImode scratch registers for 128-bit (TImode) addition.
21278 OP1 represents the TImode destination operand 1
21279 OP2 represents the TImode destination operand 2
21280 LOW_DEST represents the low half (DImode) of TImode operand 0
21281 LOW_IN1 represents the low half (DImode) of TImode operand 1
21282 LOW_IN2 represents the low half (DImode) of TImode operand 2
21283 HIGH_DEST represents the high half (DImode) of TImode operand 0
21284 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21285 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
21287 void
21288 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21289 rtx *low_in1, rtx *low_in2,
21290 rtx *high_dest, rtx *high_in1,
21291 rtx *high_in2)
21293 *low_dest = gen_reg_rtx (DImode);
21294 *low_in1 = gen_lowpart (DImode, op1);
21295 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21296 subreg_lowpart_offset (DImode, TImode));
21297 *high_dest = gen_reg_rtx (DImode);
21298 *high_in1 = gen_highpart (DImode, op1);
21299 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21300 subreg_highpart_offset (DImode, TImode));
21303 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
21305 This function differs from 'aarch64_addti_scratch_regs' in that
21306 OP1 can be an immediate constant (zero). We must call
21307 subreg_highpart_offset with DImode and TImode arguments, otherwise
21308 VOIDmode will be used for the const_int which generates an internal
21309 error from subreg_size_highpart_offset which does not expect a size of zero.
21311 OP1 represents the TImode destination operand 1
21312 OP2 represents the TImode destination operand 2
21313 LOW_DEST represents the low half (DImode) of TImode operand 0
21314 LOW_IN1 represents the low half (DImode) of TImode operand 1
21315 LOW_IN2 represents the low half (DImode) of TImode operand 2
21316 HIGH_DEST represents the high half (DImode) of TImode operand 0
21317 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21318 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
21321 void
21322 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
21323 rtx *low_in1, rtx *low_in2,
21324 rtx *high_dest, rtx *high_in1,
21325 rtx *high_in2)
21327 *low_dest = gen_reg_rtx (DImode);
21328 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
21329 subreg_lowpart_offset (DImode, TImode));
21331 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
21332 subreg_lowpart_offset (DImode, TImode));
21333 *high_dest = gen_reg_rtx (DImode);
21335 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
21336 subreg_highpart_offset (DImode, TImode));
21337 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
21338 subreg_highpart_offset (DImode, TImode));
21341 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
21343 OP0 represents the TImode destination operand 0
21344 LOW_DEST represents the low half (DImode) of TImode operand 0
21345 LOW_IN1 represents the low half (DImode) of TImode operand 1
21346 LOW_IN2 represents the low half (DImode) of TImode operand 2
21347 HIGH_DEST represents the high half (DImode) of TImode operand 0
21348 HIGH_IN1 represents the high half (DImode) of TImode operand 1
21349 HIGH_IN2 represents the high half (DImode) of TImode operand 2
21350 UNSIGNED_P is true if the operation is being performed on unsigned
21351 values. */
21352 void
21353 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
21354 rtx low_in2, rtx high_dest, rtx high_in1,
21355 rtx high_in2, bool unsigned_p)
21357 if (low_in2 == const0_rtx)
21359 low_dest = low_in1;
21360 high_in2 = force_reg (DImode, high_in2);
21361 if (unsigned_p)
21362 emit_insn (gen_subdi3_compare1 (high_dest, high_in1, high_in2));
21363 else
21364 emit_insn (gen_subvdi_insn (high_dest, high_in1, high_in2));
21366 else
21368 if (aarch64_plus_immediate (low_in2, DImode))
21369 emit_insn (gen_subdi3_compare1_imm (low_dest, low_in1, low_in2,
21370 GEN_INT (-INTVAL (low_in2))));
21371 else
21373 low_in2 = force_reg (DImode, low_in2);
21374 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
21376 high_in2 = force_reg (DImode, high_in2);
21378 if (unsigned_p)
21379 emit_insn (gen_usubdi3_carryinC (high_dest, high_in1, high_in2));
21380 else
21381 emit_insn (gen_subdi3_carryinV (high_dest, high_in1, high_in2));
21384 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
21385 emit_move_insn (gen_highpart (DImode, op0), high_dest);
21389 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
21391 static unsigned HOST_WIDE_INT
21392 aarch64_asan_shadow_offset (void)
21394 if (TARGET_ILP32)
21395 return (HOST_WIDE_INT_1 << 29);
21396 else
21397 return (HOST_WIDE_INT_1 << 36);
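/* Implement TARGET_GEN_CCMP_FIRST (see the hook definition further down):
   expand the first comparison of a conditional-compare chain, e.g. the
   'a < b' in 'a < b && c == d'.  Emits the preparation and compare sequences
   into *PREP_SEQ and *GEN_SEQ and returns a comparison against the CC
   register, or NULL_RTX if the operand mode is not supported.  */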
21400 static rtx
21401 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
21402 int code, tree treeop0, tree treeop1)
21404 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21405 rtx op0, op1;
21406 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21407 insn_code icode;
21408 struct expand_operand ops[4];
21410 start_sequence ();
21411 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21413 op_mode = GET_MODE (op0);
21414 if (op_mode == VOIDmode)
21415 op_mode = GET_MODE (op1);
21417 switch (op_mode)
21419 case E_QImode:
21420 case E_HImode:
21421 case E_SImode:
21422 cmp_mode = SImode;
21423 icode = CODE_FOR_cmpsi;
21424 break;
21426 case E_DImode:
21427 cmp_mode = DImode;
21428 icode = CODE_FOR_cmpdi;
21429 break;
21431 case E_SFmode:
21432 cmp_mode = SFmode;
21433 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21434 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
21435 break;
21437 case E_DFmode:
21438 cmp_mode = DFmode;
21439 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
21440 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
21441 break;
21443 default:
21444 end_sequence ();
21445 return NULL_RTX;
21448 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
21449 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
21450 if (!op0 || !op1)
21452 end_sequence ();
21453 return NULL_RTX;
21455 *prep_seq = get_insns ();
21456 end_sequence ();
21458 create_fixed_operand (&ops[0], op0);
21459 create_fixed_operand (&ops[1], op1);
21461 start_sequence ();
21462 if (!maybe_expand_insn (icode, 2, ops))
21464 end_sequence ();
21465 return NULL_RTX;
21467 *gen_seq = get_insns ();
21468 end_sequence ();
21470 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
21471 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
21474 static rtx
21475 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
21476 int cmp_code, tree treeop0, tree treeop1, int bit_code)
21478 rtx op0, op1, target;
21479 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
21480 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
21481 insn_code icode;
21482 struct expand_operand ops[6];
21483 int aarch64_cond;
21485 push_to_sequence (*prep_seq);
21486 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
21488 op_mode = GET_MODE (op0);
21489 if (op_mode == VOIDmode)
21490 op_mode = GET_MODE (op1);
21492 switch (op_mode)
21494 case E_QImode:
21495 case E_HImode:
21496 case E_SImode:
21497 cmp_mode = SImode;
21498 break;
21500 case E_DImode:
21501 cmp_mode = DImode;
21502 break;
21504 case E_SFmode:
21505 cmp_mode = SFmode;
21506 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21507 break;
21509 case E_DFmode:
21510 cmp_mode = DFmode;
21511 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
21512 break;
21514 default:
21515 end_sequence ();
21516 return NULL_RTX;
21519 icode = code_for_ccmp (cc_mode, cmp_mode);
21521 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
21522 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
21523 if (!op0 || !op1)
21525 end_sequence ();
21526 return NULL_RTX;
21528 *prep_seq = get_insns ();
21529 end_sequence ();
21531 target = gen_rtx_REG (cc_mode, CC_REGNUM);
21532 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
21534 if (bit_code != AND)
21536 /* Treat the ccmp patterns as canonical and use them where possible,
21537 but fall back to ccmp_rev patterns if there's no other option. */
21538 rtx_code prev_code = GET_CODE (prev);
21539 machine_mode prev_mode = GET_MODE (XEXP (prev, 0));
21540 if ((prev_mode == CCFPmode || prev_mode == CCFPEmode)
21541 && !(prev_code == EQ
21542 || prev_code == NE
21543 || prev_code == ORDERED
21544 || prev_code == UNORDERED))
21545 icode = code_for_ccmp_rev (cc_mode, cmp_mode);
21546 else
21548 rtx_code code = reverse_condition (prev_code);
21549 prev = gen_rtx_fmt_ee (code, VOIDmode, XEXP (prev, 0), const0_rtx);
21551 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
21554 create_fixed_operand (&ops[0], XEXP (prev, 0));
21555 create_fixed_operand (&ops[1], target);
21556 create_fixed_operand (&ops[2], op0);
21557 create_fixed_operand (&ops[3], op1);
21558 create_fixed_operand (&ops[4], prev);
21559 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
21561 push_to_sequence (*gen_seq);
21562 if (!maybe_expand_insn (icode, 6, ops))
21564 end_sequence ();
21565 return NULL_RTX;
21568 *gen_seq = get_insns ();
21569 end_sequence ();
21571 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
21574 #undef TARGET_GEN_CCMP_FIRST
21575 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
21577 #undef TARGET_GEN_CCMP_NEXT
21578 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
21580 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
21581 instruction fusion of some sort. */
21583 static bool
21584 aarch64_macro_fusion_p (void)
21586 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
21590 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
21591 should be kept together during scheduling. */
21593 static bool
21594 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
21596 rtx set_dest;
21597 rtx prev_set = single_set (prev);
21598 rtx curr_set = single_set (curr);
21599 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
21600 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
21602 if (!aarch64_macro_fusion_p ())
21603 return false;
21605 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
21607 /* We are trying to match:
21608 prev (mov) == (set (reg r0) (const_int imm16))
21609 curr (movk) == (set (zero_extract (reg r0)
21610 (const_int 16)
21611 (const_int 16))
21612 (const_int imm16_1)) */
21614 set_dest = SET_DEST (curr_set);
21616 if (GET_CODE (set_dest) == ZERO_EXTRACT
21617 && CONST_INT_P (SET_SRC (curr_set))
21618 && CONST_INT_P (SET_SRC (prev_set))
21619 && CONST_INT_P (XEXP (set_dest, 2))
21620 && INTVAL (XEXP (set_dest, 2)) == 16
21621 && REG_P (XEXP (set_dest, 0))
21622 && REG_P (SET_DEST (prev_set))
21623 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
21625 return true;
21629 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
21632 /* We're trying to match:
21633 prev (adrp) == (set (reg r1)
21634 (high (symbol_ref ("SYM"))))
21635 curr (add) == (set (reg r0)
21636 (lo_sum (reg r1)
21637 (symbol_ref ("SYM"))))
21638 Note that r0 need not necessarily be the same as r1, especially
21639 during pre-regalloc scheduling. */
21641 if (satisfies_constraint_Ush (SET_SRC (prev_set))
21642 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21644 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
21645 && REG_P (XEXP (SET_SRC (curr_set), 0))
21646 && REGNO (XEXP (SET_SRC (curr_set), 0))
21647 == REGNO (SET_DEST (prev_set))
21648 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
21649 XEXP (SET_SRC (curr_set), 1)))
21650 return true;
21654 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
21657 /* We're trying to match:
21658 prev (movk) == (set (zero_extract (reg r0)
21659 (const_int 16)
21660 (const_int 32))
21661 (const_int imm16_1))
21662 curr (movk) == (set (zero_extract (reg r0)
21663 (const_int 16)
21664 (const_int 48))
21665 (const_int imm16_2)) */
21667 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
21668 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
21669 && REG_P (XEXP (SET_DEST (prev_set), 0))
21670 && REG_P (XEXP (SET_DEST (curr_set), 0))
21671 && REGNO (XEXP (SET_DEST (prev_set), 0))
21672 == REGNO (XEXP (SET_DEST (curr_set), 0))
21673 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
21674 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
21675 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
21676 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
21677 && CONST_INT_P (SET_SRC (prev_set))
21678 && CONST_INT_P (SET_SRC (curr_set)))
21679 return true;
21682 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
21684 /* We're trying to match:
21685 prev (adrp) == (set (reg r0)
21686 (high (symbol_ref ("SYM"))))
21687 curr (ldr) == (set (reg r1)
21688 (mem (lo_sum (reg r0)
21689 (symbol_ref ("SYM")))))
21691 curr (ldr) == (set (reg r1)
21692 (zero_extend (mem
21693 (lo_sum (reg r0)
21694 (symbol_ref ("SYM")))))) */
21695 if (satisfies_constraint_Ush (SET_SRC (prev_set))
21696 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
21698 rtx curr_src = SET_SRC (curr_set);
21700 if (GET_CODE (curr_src) == ZERO_EXTEND)
21701 curr_src = XEXP (curr_src, 0);
21703 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
21704 && REG_P (XEXP (XEXP (curr_src, 0), 0))
21705 && REGNO (XEXP (XEXP (curr_src, 0), 0))
21706 == REGNO (SET_DEST (prev_set))
21707 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
21708 XEXP (SET_SRC (prev_set), 0)))
21709 return true;
21713 /* Fuse compare (CMP/CMN/TST/BICS) and conditional branch. */
21714 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
21715 && prev_set && curr_set && any_condjump_p (curr)
21716 && GET_CODE (SET_SRC (prev_set)) == COMPARE
21717 && SCALAR_INT_MODE_P (GET_MODE (XEXP (SET_SRC (prev_set), 0)))
21718 && reg_referenced_p (SET_DEST (prev_set), PATTERN (curr)))
21719 return true;
21721 /* Fuse flag-setting ALU instructions and conditional branch. */
21722 if (aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
21723 && any_condjump_p (curr))
21725 unsigned int condreg1, condreg2;
21726 rtx cc_reg_1;
21727 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
21728 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
21730 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
21731 && prev
21732 && modified_in_p (cc_reg_1, prev))
21734 enum attr_type prev_type = get_attr_type (prev);
21736 /* FIXME: this misses some cases that are considered simple arithmetic
21737 instructions for ThunderX. Simple shifts are missed here. */
21738 if (prev_type == TYPE_ALUS_SREG
21739 || prev_type == TYPE_ALUS_IMM
21740 || prev_type == TYPE_LOGICS_REG
21741 || prev_type == TYPE_LOGICS_IMM)
21742 return true;
21746 /* Fuse ALU instructions and CBZ/CBNZ. */
21747 if (prev_set
21748 && curr_set
21749 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_CBZ)
21750 && any_condjump_p (curr))
21752 /* We're trying to match:
21753 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
21754 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
21755 (const_int 0))
21756 (label_ref ("SYM"))
21757 (pc)) */
21758 if (SET_DEST (curr_set) == (pc_rtx)
21759 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
21760 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
21761 && REG_P (SET_DEST (prev_set))
21762 && REGNO (SET_DEST (prev_set))
21763 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
21765 /* Fuse ALU operations followed by conditional branch instruction. */
21766 switch (get_attr_type (prev))
21768 case TYPE_ALU_IMM:
21769 case TYPE_ALU_SREG:
21770 case TYPE_ADC_REG:
21771 case TYPE_ADC_IMM:
21772 case TYPE_ADCS_REG:
21773 case TYPE_ADCS_IMM:
21774 case TYPE_LOGIC_REG:
21775 case TYPE_LOGIC_IMM:
21776 case TYPE_CSEL:
21777 case TYPE_ADR:
21778 case TYPE_MOV_IMM:
21779 case TYPE_SHIFT_REG:
21780 case TYPE_SHIFT_IMM:
21781 case TYPE_BFM:
21782 case TYPE_RBIT:
21783 case TYPE_REV:
21784 case TYPE_EXTEND:
21785 return true;
21787 default:;
21792 return false;
21795 /* Return true iff the instruction fusion described by OP is enabled. */
21797 bool
21798 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
21800 return (aarch64_tune_params.fusible_ops & op) != 0;
21803 /* If MEM is in the form of [base+offset], extract the two parts
21804 of address and set to BASE and OFFSET, otherwise return false
21805 after clearing BASE and OFFSET. */
21807 bool
21808 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
21810 rtx addr;
21812 gcc_assert (MEM_P (mem));
21814 addr = XEXP (mem, 0);
21816 if (REG_P (addr))
21818 *base = addr;
21819 *offset = const0_rtx;
21820 return true;
21823 if (GET_CODE (addr) == PLUS
21824 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
21826 *base = XEXP (addr, 0);
21827 *offset = XEXP (addr, 1);
21828 return true;
21831 *base = NULL_RTX;
21832 *offset = NULL_RTX;
21834 return false;
21837 /* Types for scheduling fusion. */
21838 enum sched_fusion_type
21840 SCHED_FUSION_NONE = 0,
21841 SCHED_FUSION_LD_SIGN_EXTEND,
21842 SCHED_FUSION_LD_ZERO_EXTEND,
21843 SCHED_FUSION_LD,
21844 SCHED_FUSION_ST,
21845 SCHED_FUSION_NUM
21848 /* If INSN is a load or store of address in the form of [base+offset],
21849 extract the two parts and set to BASE and OFFSET. Return scheduling
21850 fusion type this INSN is. */
21852 static enum sched_fusion_type
21853 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
21855 rtx x, dest, src;
21856 enum sched_fusion_type fusion = SCHED_FUSION_LD;
21858 gcc_assert (INSN_P (insn));
21859 x = PATTERN (insn);
21860 if (GET_CODE (x) != SET)
21861 return SCHED_FUSION_NONE;
21863 src = SET_SRC (x);
21864 dest = SET_DEST (x);
21866 machine_mode dest_mode = GET_MODE (dest);
21868 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
21869 return SCHED_FUSION_NONE;
21871 if (GET_CODE (src) == SIGN_EXTEND)
21873 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
21874 src = XEXP (src, 0);
21875 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21876 return SCHED_FUSION_NONE;
21878 else if (GET_CODE (src) == ZERO_EXTEND)
21880 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
21881 src = XEXP (src, 0);
21882 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
21883 return SCHED_FUSION_NONE;
21886 if (GET_CODE (src) == MEM && REG_P (dest))
21887 extract_base_offset_in_addr (src, base, offset);
21888 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
21890 fusion = SCHED_FUSION_ST;
21891 extract_base_offset_in_addr (dest, base, offset);
21893 else
21894 return SCHED_FUSION_NONE;
21896 if (*base == NULL_RTX || *offset == NULL_RTX)
21897 fusion = SCHED_FUSION_NONE;
21899 return fusion;
21902 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
21904 Currently we only support fusing ldr or str instructions, so FUSION_PRI
21905 and PRI are only calculated for these instructions. For other instructions,
21906 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
21907 types of instruction fusion can be added by returning different priorities.
21909 It's important that irrelevant instructions get the largest FUSION_PRI. */
21911 static void
21912 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
21913 int *fusion_pri, int *pri)
21915 int tmp, off_val;
21916 rtx base, offset;
21917 enum sched_fusion_type fusion;
21919 gcc_assert (INSN_P (insn));
21921 tmp = max_pri - 1;
21922 fusion = fusion_load_store (insn, &base, &offset);
21923 if (fusion == SCHED_FUSION_NONE)
21925 *pri = tmp;
21926 *fusion_pri = tmp;
21927 return;
21930 /* Set FUSION_PRI according to fusion type and base register. */
21931 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
21933 /* Calculate PRI. */
21934 tmp /= 2;
21936 /* INSN with smaller offset goes first. */
21937 off_val = (int)(INTVAL (offset));
21938 if (off_val >= 0)
21939 tmp -= (off_val & 0xfffff);
21940 else
21941 tmp += ((- off_val) & 0xfffff);
21943 *pri = tmp;
21944 return;
21947 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
21948 Adjust priority of sha1h instructions so they are scheduled before
21949 other SHA1 instructions. */
21951 static int
21952 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
21954 rtx x = PATTERN (insn);
21956 if (GET_CODE (x) == SET)
21958 x = SET_SRC (x);
21960 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
21961 return priority + 10;
21964 return priority;
21967 /* Given OPERANDS of consecutive load/store, check if we can merge
21968 them into ldp/stp. LOAD is true if they are load instructions.
21969 MODE is the mode of memory operands. */
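/* Illustrative example: 'ldr w1, [x0]' followed by 'ldr w2, [x0, 4]' can be
   merged into 'ldp w1, w2, [x0]' when the checks below succeed.  */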
21971 bool
21972 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
21973 machine_mode mode)
21975 HOST_WIDE_INT offval_1, offval_2, msize;
21976 enum reg_class rclass_1, rclass_2;
21977 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
21979 if (load)
21981 mem_1 = operands[1];
21982 mem_2 = operands[3];
21983 reg_1 = operands[0];
21984 reg_2 = operands[2];
21985 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
21986 if (REGNO (reg_1) == REGNO (reg_2))
21987 return false;
21989 else
21991 mem_1 = operands[0];
21992 mem_2 = operands[2];
21993 reg_1 = operands[1];
21994 reg_2 = operands[3];
21997 /* The mems cannot be volatile. */
21998 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
21999 return false;
22001 /* If we have SImode and slow unaligned ldp,
22002 check that the alignment is at least 8 bytes. */
22003 if (mode == SImode
22004 && (aarch64_tune_params.extra_tuning_flags
22005 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22006 && !optimize_size
22007 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
22008 return false;
22010 /* Check if the addresses are in the form of [base+offset]. */
22011 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22012 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
22013 return false;
22014 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22015 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
22016 return false;
22018 /* Check if the bases are the same. */
22019 if (!rtx_equal_p (base_1, base_2))
22020 return false;
22022 /* The operands must be of the same size. */
22023 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
22024 GET_MODE_SIZE (GET_MODE (mem_2))));
22026 offval_1 = INTVAL (offset_1);
22027 offval_2 = INTVAL (offset_2);
22028 /* We should only be trying this for fixed-sized modes. There is no
22029 SVE LDP/STP instruction. */
22030 msize = GET_MODE_SIZE (mode).to_constant ();
22031 /* Check if the offsets are consecutive. */
22032 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
22033 return false;
22035 /* Check if the addresses are clobbered by load. */
22036 if (load)
22038 if (reg_mentioned_p (reg_1, mem_1))
22039 return false;
22041 /* In increasing order, the last load can clobber the address. */
22042 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
22043 return false;
22046 /* One of the memory accesses must be a mempair operand.
22047 If it is not the first one, they need to be swapped by the
22048 peephole. */
22049 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
22050 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
22051 return false;
22053 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
22054 rclass_1 = FP_REGS;
22055 else
22056 rclass_1 = GENERAL_REGS;
22058 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
22059 rclass_2 = FP_REGS;
22060 else
22061 rclass_2 = GENERAL_REGS;
22063 /* Check if the registers are of the same class. */
22064 if (rclass_1 != rclass_2)
22065 return false;
22067 return true;
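/* As an illustration of the checks above, for DImode the pair

     ldr x0, [x2, 8]
     ldr x1, [x2, 16]

   qualifies (same base x2, distinct destination registers of the same
   class, offsets differing by exactly msize == 8) and can be merged into
   "ldp x0, x1, [x2, 8]", whereas offsets 8 and 24 would be rejected
   because they are not consecutive.  */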
22070 /* Given OPERANDS of consecutive load/store that can be merged,
22071 swap them if they are not in ascending order. */
22072 void
22073 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
22075 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
22076 HOST_WIDE_INT offval_1, offval_2;
22078 if (load)
22080 mem_1 = operands[1];
22081 mem_2 = operands[3];
22083 else
22085 mem_1 = operands[0];
22086 mem_2 = operands[2];
22089 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
22090 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
22092 offval_1 = INTVAL (offset_1);
22093 offval_2 = INTVAL (offset_2);
22095 if (offval_1 > offval_2)
22097 /* Irrespective of whether this is a load or a store,
22098 we do the same swap. */
22099 std::swap (operands[0], operands[2]);
22100 std::swap (operands[1], operands[3]);
22104 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
22105 comparison between the two. */
22107 aarch64_host_wide_int_compare (const void *x, const void *y)
22109 return wi::cmps (* ((const HOST_WIDE_INT *) x),
22110 * ((const HOST_WIDE_INT *) y));
22113 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
22114 other pointing to a REG rtx containing an offset, compare the offsets
22115 of the two pairs.
22117 Return:
22119 1 iff offset (X) > offset (Y)
22120 0 iff offset (X) == offset (Y)
22121 -1 iff offset (X) < offset (Y) */
22123 aarch64_ldrstr_offset_compare (const void *x, const void *y)
22125 const rtx * operands_1 = (const rtx *) x;
22126 const rtx * operands_2 = (const rtx *) y;
22127 rtx mem_1, mem_2, base, offset_1, offset_2;
22129 if (MEM_P (operands_1[0]))
22130 mem_1 = operands_1[0];
22131 else
22132 mem_1 = operands_1[1];
22134 if (MEM_P (operands_2[0]))
22135 mem_2 = operands_2[0];
22136 else
22137 mem_2 = operands_2[1];
22139 /* Extract the offsets. */
22140 extract_base_offset_in_addr (mem_1, &base, &offset_1);
22141 extract_base_offset_in_addr (mem_2, &base, &offset_2);
22143 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
22145 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
22148 /* Given OPERANDS of consecutive load/store, check if we can merge
22149 them into ldp/stp by adjusting the offset. LOAD is true if they
22150 are load instructions. MODE is the mode of memory operands.
22152 Given below consecutive stores:
22154 str w1, [xb, 0x100]
22155 str w1, [xb, 0x104]
22156 str w1, [xb, 0x108]
22157 str w1, [xb, 0x10c]
22159 Though the offsets are out of the range supported by stp, we can
22160 still pair them after adjusting the offset, like:
22162 add scratch, xb, 0x100
22163 stp w1, w1, [scratch]
22164 stp w1, w1, [scratch, 0x8]
22166 The peephole patterns detecting this opportunity should guarantee
22167 the scratch register is available. */
22169 bool
22170 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
22171 machine_mode mode)
22173 const int num_insns = 4;
22174 enum reg_class rclass;
22175 HOST_WIDE_INT offvals[num_insns], msize;
22176 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
22178 if (load)
22180 for (int i = 0; i < num_insns; i++)
22182 reg[i] = operands[2 * i];
22183 mem[i] = operands[2 * i + 1];
22185 gcc_assert (REG_P (reg[i]));
22188 /* Do not attempt to merge the loads if the loads clobber each other. */
22189 for (int i = 0; i < 8; i += 2)
22190 for (int j = i + 2; j < 8; j += 2)
22191 if (reg_overlap_mentioned_p (operands[i], operands[j]))
22192 return false;
22194 else
22195 for (int i = 0; i < num_insns; i++)
22197 mem[i] = operands[2 * i];
22198 reg[i] = operands[2 * i + 1];
22201 /* Skip if memory operand is by itself valid for ldp/stp. */
22202 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
22203 return false;
22205 for (int i = 0; i < num_insns; i++)
22207 /* The mems cannot be volatile. */
22208 if (MEM_VOLATILE_P (mem[i]))
22209 return false;
22211 /* Check if the addresses are in the form of [base+offset]. */
22212 extract_base_offset_in_addr (mem[i], base + i, offset + i);
22213 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
22214 return false;
22217 /* Check if the registers are of the same class. */
22218 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
22219 ? FP_REGS : GENERAL_REGS;
22221 for (int i = 1; i < num_insns; i++)
22222 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
22224 if (rclass != FP_REGS)
22225 return false;
22227 else
22229 if (rclass != GENERAL_REGS)
22230 return false;
22233 /* Only the last register in the order in which they occur
22234 may be clobbered by the load. */
22235 if (rclass == GENERAL_REGS && load)
22236 for (int i = 0; i < num_insns - 1; i++)
22237 if (reg_mentioned_p (reg[i], mem[i]))
22238 return false;
22240 /* Check if the bases are the same. */
22241 for (int i = 0; i < num_insns - 1; i++)
22242 if (!rtx_equal_p (base[i], base[i + 1]))
22243 return false;
22245 for (int i = 0; i < num_insns; i++)
22246 offvals[i] = INTVAL (offset[i]);
22248 msize = GET_MODE_SIZE (mode).to_constant ();
22250 /* Check if the offsets can be put in the right order to do a ldp/stp. */
22251 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
22252 aarch64_host_wide_int_compare);
22254 if (!(offvals[1] == offvals[0] + msize
22255 && offvals[3] == offvals[2] + msize))
22256 return false;
22258 /* Check that offsets are within range of each other. The ldp/stp
22259 instructions have 7 bit immediate offsets, so use 0x80. */
22260 if (offvals[2] - offvals[0] >= msize * 0x80)
22261 return false;
22263 /* The offsets must be aligned with respect to each other. */
22264 if (offvals[0] % msize != offvals[2] % msize)
22265 return false;
22267 /* If we have SImode and slow unaligned ldp,
22268 check that the alignment is at least 8 bytes. */
22269 if (mode == SImode
22270 && (aarch64_tune_params.extra_tuning_flags
22271 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
22272 && !optimize_size
22273 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
22274 return false;
22276 return true;
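/* A worked example of the range check above: for SImode, msize == 4, so
   all four offsets must lie within 4 * 0x80 == 512 bytes of each other.
   The offsets 0x100, 0x104, 0x108, 0x10c from the example above easily
   satisfy this, whereas 0x0, 0x4, 0x300, 0x304 would be rejected because
   0x300 - 0x0 >= 512 even though each pair on its own is consecutive.  */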
22279 /* Given OPERANDS of consecutive load/store, this function pairs them
22280 into LDP/STP after adjusting the offset. It depends on the fact
22281 that the operands can be sorted so the offsets are correct for STP.
22282 MODE is the mode of memory operands. CODE is the rtl operator
22283 which should be applied to all memory operands; it is SIGN_EXTEND,
22284 ZERO_EXTEND or UNKNOWN. */
22286 bool
22287 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
22288 machine_mode mode, RTX_CODE code)
22290 rtx base, offset_1, offset_3, t1, t2;
22291 rtx mem_1, mem_2, mem_3, mem_4;
22292 rtx temp_operands[8];
22293 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
22294 stp_off_upper_limit, stp_off_lower_limit, msize;
22296 /* We make changes on a copy as we may still bail out. */
22297 for (int i = 0; i < 8; i ++)
22298 temp_operands[i] = operands[i];
22300 /* Sort the operands. */
22301 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
22303 /* Copy the memory operands so that if we have to bail for some
22304 reason the original addresses are unchanged. */
22305 if (load)
22307 mem_1 = copy_rtx (temp_operands[1]);
22308 mem_2 = copy_rtx (temp_operands[3]);
22309 mem_3 = copy_rtx (temp_operands[5]);
22310 mem_4 = copy_rtx (temp_operands[7]);
22312 else
22314 mem_1 = copy_rtx (temp_operands[0]);
22315 mem_2 = copy_rtx (temp_operands[2]);
22316 mem_3 = copy_rtx (temp_operands[4]);
22317 mem_4 = copy_rtx (temp_operands[6]);
22318 gcc_assert (code == UNKNOWN);
22321 extract_base_offset_in_addr (mem_1, &base, &offset_1);
22322 extract_base_offset_in_addr (mem_3, &base, &offset_3);
22323 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
22324 && offset_3 != NULL_RTX);
22326 /* Adjust offset so it can fit in LDP/STP instruction. */
22327 msize = GET_MODE_SIZE (mode).to_constant ();
22328 stp_off_upper_limit = msize * (0x40 - 1);
22329 stp_off_lower_limit = - msize * 0x40;
22331 off_val_1 = INTVAL (offset_1);
22332 off_val_3 = INTVAL (offset_3);
22334 /* The base offset is optimally half way between the two STP/LDP offsets. */
22335 if (msize <= 4)
22336 base_off = (off_val_1 + off_val_3) / 2;
22337 else
22338 /* However, due to issues with negative LDP/STP offset generation for
22339 larger modes such as DF, DI and vector modes, we must not use negative
22340 addresses smaller than what 9 signed unadjusted bits can store. This
22341 provides the most range in this case. */
22342 base_off = off_val_1;
22344 /* Adjust the base so that it is aligned with the addresses but still
22345 optimal. */
22346 if (base_off % msize != off_val_1 % msize)
22347 /* Fix the offset, bearing in mind we want to make it bigger not
22348 smaller. */
22349 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22350 else if (msize <= 4)
22351 /* The negative range of LDP/STP is one larger than the positive range. */
22352 base_off += msize;
22354 /* Check if base offset is too big or too small. We can attempt to resolve
22355 this issue by setting it to the maximum value and seeing if the offsets
22356 still fit. */
22357 if (base_off >= 0x1000)
22359 base_off = 0x1000 - 1;
22360 /* We must still make sure that the base offset is aligned with respect
22361 to the address. But it may not be made any bigger. */
22362 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22365 /* Likewise for the case where the base is too small. */
22366 if (base_off <= -0x1000)
22368 base_off = -0x1000 + 1;
22369 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
22372 /* Offset of the first STP/LDP. */
22373 new_off_1 = off_val_1 - base_off;
22375 /* Offset of the second STP/LDP. */
22376 new_off_3 = off_val_3 - base_off;
22378 /* The offsets must be within the range of the LDP/STP instructions. */
22379 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
22380 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
22381 return false;
22383 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
22384 new_off_1), true);
22385 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
22386 new_off_1 + msize), true);
22387 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
22388 new_off_3), true);
22389 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
22390 new_off_3 + msize), true);
22392 if (!aarch64_mem_pair_operand (mem_1, mode)
22393 || !aarch64_mem_pair_operand (mem_3, mode))
22394 return false;
22396 if (code == ZERO_EXTEND)
22398 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
22399 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
22400 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
22401 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
22403 else if (code == SIGN_EXTEND)
22405 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
22406 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
22407 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
22408 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
22411 if (load)
22413 operands[0] = temp_operands[0];
22414 operands[1] = mem_1;
22415 operands[2] = temp_operands[2];
22416 operands[3] = mem_2;
22417 operands[4] = temp_operands[4];
22418 operands[5] = mem_3;
22419 operands[6] = temp_operands[6];
22420 operands[7] = mem_4;
22422 else
22424 operands[0] = mem_1;
22425 operands[1] = temp_operands[1];
22426 operands[2] = mem_2;
22427 operands[3] = temp_operands[3];
22428 operands[4] = mem_3;
22429 operands[5] = temp_operands[5];
22430 operands[6] = mem_4;
22431 operands[7] = temp_operands[7];
22434 /* Emit adjusting instruction. */
22435 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
22436 /* Emit ldp/stp instructions. */
22437 t1 = gen_rtx_SET (operands[0], operands[1]);
22438 t2 = gen_rtx_SET (operands[2], operands[3]);
22439 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22440 t1 = gen_rtx_SET (operands[4], operands[5]);
22441 t2 = gen_rtx_SET (operands[6], operands[7]);
22442 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
22443 return true;
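/* A rough worked example of the offset adjustment above, using the four
   SImode stores at 0x100..0x10c from the earlier comment: msize == 4, so
   base_off starts as (0x100 + 0x108) / 2 == 0x104 and is then bumped by
   msize to 0x108 to exploit the larger negative range.  That gives
   new_off_1 == -8 and new_off_3 == 0, both within the [-0x100, 0xfc]
   range of a 32-bit STP, so the emitted sequence is roughly:

     add  scratch, xb, 0x108
     stp  w1, w1, [scratch, -8]
     stp  w1, w1, [scratch]  */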
22446 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
22447 it isn't worth branching around empty masked ops (including masked
22448 stores). */
22450 static bool
22451 aarch64_empty_mask_is_expensive (unsigned)
22453 return false;
22456 /* Return 1 if pseudo register should be created and used to hold
22457 GOT address for PIC code. */
22459 bool
22460 aarch64_use_pseudo_pic_reg (void)
22462 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
22465 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
22467 static int
22468 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
22470 switch (XINT (x, 1))
22472 case UNSPEC_GOTSMALLPIC:
22473 case UNSPEC_GOTSMALLPIC28K:
22474 case UNSPEC_GOTTINYPIC:
22475 return 0;
22476 default:
22477 break;
22480 return default_unspec_may_trap_p (x, flags);
22484 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
22485 return the log2 of that value. Otherwise return -1. */
22488 aarch64_fpconst_pow_of_2 (rtx x)
22490 const REAL_VALUE_TYPE *r;
22492 if (!CONST_DOUBLE_P (x))
22493 return -1;
22495 r = CONST_DOUBLE_REAL_VALUE (x);
22497 if (REAL_VALUE_NEGATIVE (*r)
22498 || REAL_VALUE_ISNAN (*r)
22499 || REAL_VALUE_ISINF (*r)
22500 || !real_isinteger (r, DFmode))
22501 return -1;
22503 return exact_log2 (real_to_integer (r));
22506 /* If X is a positive CONST_DOUBLE with a value that is the reciprocal of a
22507 power of 2 (i.e. 1/2^n), return the number of float bits, e.g. for x==(1/2^n)
22508 return n. Otherwise return -1. */
22511 aarch64_fpconst_pow2_recip (rtx x)
22513 REAL_VALUE_TYPE r0;
22515 if (!CONST_DOUBLE_P (x))
22516 return -1;
22518 r0 = *CONST_DOUBLE_REAL_VALUE (x);
22519 if (exact_real_inverse (DFmode, &r0)
22520 && !REAL_VALUE_NEGATIVE (r0))
22522 int ret = exact_log2 (real_to_integer (&r0));
22523 if (ret >= 1 && ret <= 32)
22524 return ret;
22526 return -1;
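/* Some example values for the two helpers above:

     aarch64_fpconst_pow_of_2 on 8.0     returns 3  (8.0 == 2^3)
     aarch64_fpconst_pow_of_2 on 0.5     returns -1 (not an integer)
     aarch64_fpconst_pow2_recip on 0.125 returns 3  (0.125 == 1/2^3)
     aarch64_fpconst_pow2_recip on 3.0   returns -1 (1/3 is not exact).  */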
22529 /* If X is a vector of equal CONST_DOUBLE values and that value is
22530 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
22533 aarch64_vec_fpconst_pow_of_2 (rtx x)
22535 int nelts;
22536 if (GET_CODE (x) != CONST_VECTOR
22537 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
22538 return -1;
22540 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
22541 return -1;
22543 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
22544 if (firstval <= 0)
22545 return -1;
22547 for (int i = 1; i < nelts; i++)
22548 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
22549 return -1;
22551 return firstval;
22554 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
22555 to float.
22557 __fp16 always promotes through this hook.
22558 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
22559 through the generic excess precision logic rather than here. */
22561 static tree
22562 aarch64_promoted_type (const_tree t)
22564 if (SCALAR_FLOAT_TYPE_P (t)
22565 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
22566 return float_type_node;
22568 return NULL_TREE;
22571 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
22573 static bool
22574 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
22575 optimization_type opt_type)
22577 switch (op)
22579 case rsqrt_optab:
22580 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
22582 default:
22583 return true;
22587 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
22589 static unsigned int
22590 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
22591 int *offset)
22593 /* Polynomial invariant 1 == (VG / 2) - 1. */
22594 gcc_assert (i == 1);
22595 *factor = 2;
22596 *offset = 1;
22597 return AARCH64_DWARF_VG;
22600 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
22601 if MODE is HFmode, and punt to the generic implementation otherwise. */
22603 static bool
22604 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
22606 return (mode == HFmode
22607 ? true
22608 : default_libgcc_floating_mode_supported_p (mode));
22611 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
22612 if MODE is HFmode, and punt to the generic implementation otherwise. */
22614 static bool
22615 aarch64_scalar_mode_supported_p (scalar_mode mode)
22617 return (mode == HFmode
22618 ? true
22619 : default_scalar_mode_supported_p (mode));
22622 /* Set the value of FLT_EVAL_METHOD.
22623 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
22625 0: evaluate all operations and constants, whose semantic type has at
22626 most the range and precision of type float, to the range and
22627 precision of float; evaluate all other operations and constants to
22628 the range and precision of the semantic type;
22630 N, where _FloatN is a supported interchange floating type:
22631 evaluate all operations and constants, whose semantic type has at
22632 most the range and precision of _FloatN type, to the range and
22633 precision of the _FloatN type; evaluate all other operations and
22634 constants to the range and precision of the semantic type;
22636 If we have the ARMv8.2-A extensions then we support _Float16 in native
22637 precision, so we should set this to 16. Otherwise, we support the type,
22638 but want to evaluate expressions in float precision, so set this to
22639 0. */
22641 static enum flt_eval_method
22642 aarch64_excess_precision (enum excess_precision_type type)
22644 switch (type)
22646 case EXCESS_PRECISION_TYPE_FAST:
22647 case EXCESS_PRECISION_TYPE_STANDARD:
22648 /* We can calculate either in 16-bit range and precision or
22649 32-bit range and precision. Make that decision based on whether
22650 we have native support for the ARMv8.2-A 16-bit floating-point
22651 instructions or not. */
22652 return (TARGET_FP_F16INST
22653 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
22654 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
22655 case EXCESS_PRECISION_TYPE_IMPLICIT:
22656 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
22657 default:
22658 gcc_unreachable ();
22660 return FLT_EVAL_METHOD_UNPREDICTABLE;
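/* For example, when compiling "_Float16 a, b; ... a * b" with the
   ARMv8.2-A half-precision instructions enabled (+fp16), the hook above
   selects FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16 and the multiply stays in
   HFmode; without them the operands are promoted and the arithmetic is
   carried out in SFmode before converting back.  */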
22663 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
22664 scheduled for speculative execution. Reject the long-running division
22665 and square-root instructions. */
22667 static bool
22668 aarch64_sched_can_speculate_insn (rtx_insn *insn)
22670 switch (get_attr_type (insn))
22672 case TYPE_SDIV:
22673 case TYPE_UDIV:
22674 case TYPE_FDIVS:
22675 case TYPE_FDIVD:
22676 case TYPE_FSQRTS:
22677 case TYPE_FSQRTD:
22678 case TYPE_NEON_FP_SQRT_S:
22679 case TYPE_NEON_FP_SQRT_D:
22680 case TYPE_NEON_FP_SQRT_S_Q:
22681 case TYPE_NEON_FP_SQRT_D_Q:
22682 case TYPE_NEON_FP_DIV_S:
22683 case TYPE_NEON_FP_DIV_D:
22684 case TYPE_NEON_FP_DIV_S_Q:
22685 case TYPE_NEON_FP_DIV_D_Q:
22686 return false;
22687 default:
22688 return true;
22692 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
22694 static int
22695 aarch64_compute_pressure_classes (reg_class *classes)
22697 int i = 0;
22698 classes[i++] = GENERAL_REGS;
22699 classes[i++] = FP_REGS;
22700 /* PR_REGS isn't a useful pressure class because many predicate pseudo
22701 registers need to go in PR_LO_REGS at some point during their
22702 lifetime. Splitting it into two halves has the effect of making
22703 all predicates count against PR_LO_REGS, so that we try whenever
22704 possible to restrict the number of live predicates to 8. This
22705 greatly reduces the amount of spilling in certain loops. */
22706 classes[i++] = PR_LO_REGS;
22707 classes[i++] = PR_HI_REGS;
22708 return i;
22711 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
22713 static bool
22714 aarch64_can_change_mode_class (machine_mode from,
22715 machine_mode to, reg_class_t)
22717 unsigned int from_flags = aarch64_classify_vector_mode (from);
22718 unsigned int to_flags = aarch64_classify_vector_mode (to);
22720 bool from_sve_p = (from_flags & VEC_ANY_SVE);
22721 bool to_sve_p = (to_flags & VEC_ANY_SVE);
22723 bool from_partial_sve_p = from_sve_p && (from_flags & VEC_PARTIAL);
22724 bool to_partial_sve_p = to_sve_p && (to_flags & VEC_PARTIAL);
22726 bool from_pred_p = (from_flags & VEC_SVE_PRED);
22727 bool to_pred_p = (to_flags & VEC_SVE_PRED);
22729 /* Don't allow changes between predicate modes and other modes.
22730 Only predicate registers can hold predicate modes and only
22731 non-predicate registers can hold non-predicate modes, so any
22732 attempt to mix them would require a round trip through memory. */
22733 if (from_pred_p != to_pred_p)
22734 return false;
22736 /* Don't allow changes between partial SVE modes and other modes.
22737 The contents of partial SVE modes are distributed evenly across
22738 the register, whereas GCC expects them to be clustered together. */
22739 if (from_partial_sve_p != to_partial_sve_p)
22740 return false;
22742 /* Similarly reject changes between partial SVE modes that have
22743 different patterns of significant and insignificant bits. */
22744 if (from_partial_sve_p
22745 && (aarch64_sve_container_bits (from) != aarch64_sve_container_bits (to)
22746 || GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to)))
22747 return false;
22749 if (maybe_ne (BITS_PER_SVE_VECTOR, 128u))
22751 /* Don't allow changes between SVE modes and other modes that might
22752 be bigger than 128 bits. In particular, OImode, CImode and XImode
22753 divide into 128-bit quantities while SVE modes divide into
22754 BITS_PER_SVE_VECTOR quantities. */
22755 if (from_sve_p && !to_sve_p && maybe_gt (GET_MODE_BITSIZE (to), 128))
22756 return false;
22757 if (to_sve_p && !from_sve_p && maybe_gt (GET_MODE_BITSIZE (from), 128))
22758 return false;
22761 if (BYTES_BIG_ENDIAN)
22763 /* Don't allow changes between SVE data modes and non-SVE modes.
22764 See the comment at the head of aarch64-sve.md for details. */
22765 if (from_sve_p != to_sve_p)
22766 return false;
22768 /* Don't allow changes in element size: lane 0 of the new vector
22769 would not then be lane 0 of the old vector. See the comment
22770 above aarch64_maybe_expand_sve_subreg_move for a more detailed
22771 description.
22773 In the worst case, this forces a register to be spilled in
22774 one mode and reloaded in the other, which handles the
22775 endianness correctly. */
22776 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
22777 return false;
22779 return true;
22782 /* Implement TARGET_EARLY_REMAT_MODES. */
22784 static void
22785 aarch64_select_early_remat_modes (sbitmap modes)
22787 /* SVE values are not normally live across a call, so it should be
22788 worth doing early rematerialization even in VL-specific mode. */
22789 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
22790 if (aarch64_sve_mode_p ((machine_mode) i))
22791 bitmap_set_bit (modes, i);
22794 /* Override the default target speculation_safe_value. */
22795 static rtx
22796 aarch64_speculation_safe_value (machine_mode mode,
22797 rtx result, rtx val, rtx failval)
22799 /* Maybe we should warn if falling back to hard barriers. They are
22800 likely to be noticeably more expensive than the alternative below. */
22801 if (!aarch64_track_speculation)
22802 return default_speculation_safe_value (mode, result, val, failval);
22804 if (!REG_P (val))
22805 val = copy_to_mode_reg (mode, val);
22807 if (!aarch64_reg_or_zero (failval, mode))
22808 failval = copy_to_mode_reg (mode, failval);
22810 emit_insn (gen_despeculate_copy (mode, result, val, failval));
22811 return result;
22814 /* Implement TARGET_ESTIMATED_POLY_VALUE.
22815 Look into the tuning structure for an estimate.
22816 VAL.coeffs[1] is multiplied by the number of VQ chunks over the initial
22817 Advanced SIMD 128 bits. */
22819 static HOST_WIDE_INT
22820 aarch64_estimated_poly_value (poly_int64 val)
22822 enum aarch64_sve_vector_bits_enum width_source
22823 = aarch64_tune_params.sve_width;
22825 /* If we still don't have an estimate, use the default. */
22826 if (width_source == SVE_SCALABLE)
22827 return default_estimated_poly_value (val);
22829 HOST_WIDE_INT over_128 = width_source - 128;
22830 return val.coeffs[0] + val.coeffs[1] * over_128 / 128;
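/* A worked example, assuming the SVE vector length in bytes is the
   poly_int 16 + 16x (coeffs[0] == 16, coeffs[1] == 16): if the selected
   tuning sets sve_width to 256 bits, over_128 == 128 and the estimate is
   16 + 16 * 128 / 128 == 32 bytes, i.e. a 256-bit vector, while
   SVE_SCALABLE tunings fall back to the generic estimate.  */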
22834 /* Return true for types that could be supported as SIMD return or
22835 argument types. */
22837 static bool
22838 supported_simd_type (tree t)
22840 if (SCALAR_FLOAT_TYPE_P (t) || INTEGRAL_TYPE_P (t) || POINTER_TYPE_P (t))
22842 HOST_WIDE_INT s = tree_to_shwi (TYPE_SIZE_UNIT (t));
22843 return s == 1 || s == 2 || s == 4 || s == 8;
22845 return false;
22848 /* Return true for types that currently are supported as SIMD return
22849 or argument types. */
22851 static bool
22852 currently_supported_simd_type (tree t, tree b)
22854 if (COMPLEX_FLOAT_TYPE_P (t))
22855 return false;
22857 if (TYPE_SIZE (t) != TYPE_SIZE (b))
22858 return false;
22860 return supported_simd_type (t);
22863 /* Implement TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN. */
22865 static int
22866 aarch64_simd_clone_compute_vecsize_and_simdlen (struct cgraph_node *node,
22867 struct cgraph_simd_clone *clonei,
22868 tree base_type, int num)
22870 tree t, ret_type, arg_type;
22871 unsigned int elt_bits, vec_bits, count;
22873 if (!TARGET_SIMD)
22874 return 0;
22876 if (clonei->simdlen
22877 && (clonei->simdlen < 2
22878 || clonei->simdlen > 1024
22879 || (clonei->simdlen & (clonei->simdlen - 1)) != 0))
22881 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22882 "unsupported simdlen %d", clonei->simdlen);
22883 return 0;
22886 ret_type = TREE_TYPE (TREE_TYPE (node->decl));
22887 if (TREE_CODE (ret_type) != VOID_TYPE
22888 && !currently_supported_simd_type (ret_type, base_type))
22890 if (TYPE_SIZE (ret_type) != TYPE_SIZE (base_type))
22891 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22892 "GCC does not currently support mixed size types "
22893 "for %<simd%> functions");
22894 else if (supported_simd_type (ret_type))
22895 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22896 "GCC does not currently support return type %qT "
22897 "for %<simd%> functions", ret_type);
22898 else
22899 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22900 "unsupported return type %qT for %<simd%> functions",
22901 ret_type);
22902 return 0;
22905 for (t = DECL_ARGUMENTS (node->decl); t; t = DECL_CHAIN (t))
22907 arg_type = TREE_TYPE (t);
22909 if (!currently_supported_simd_type (arg_type, base_type))
22911 if (TYPE_SIZE (arg_type) != TYPE_SIZE (base_type))
22912 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22913 "GCC does not currently support mixed size types "
22914 "for %<simd%> functions");
22915 else
22916 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22917 "GCC does not currently support argument type %qT "
22918 "for %<simd%> functions", arg_type);
22919 return 0;
22923 clonei->vecsize_mangle = 'n';
22924 clonei->mask_mode = VOIDmode;
22925 elt_bits = GET_MODE_BITSIZE (SCALAR_TYPE_MODE (base_type));
22926 if (clonei->simdlen == 0)
22928 count = 2;
22929 vec_bits = (num == 0 ? 64 : 128);
22930 clonei->simdlen = vec_bits / elt_bits;
22932 else
22934 count = 1;
22935 vec_bits = clonei->simdlen * elt_bits;
22936 if (vec_bits != 64 && vec_bits != 128)
22938 warning_at (DECL_SOURCE_LOCATION (node->decl), 0,
22939 "GCC does not currently support simdlen %d for type %qT",
22940 clonei->simdlen, base_type);
22941 return 0;
22944 clonei->vecsize_int = vec_bits;
22945 clonei->vecsize_float = vec_bits;
22946 return count;
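/* For illustration, a declaration along the lines of

     #pragma omp declare simd
     float f (float x);

   has base_type float, so elt_bits == 32, and with no explicit simdlen
   two clones are produced: one with simdlen 2 (64-bit Advanced SIMD
   vectors) and one with simdlen 4 (128-bit vectors), both using the 'n'
   mangling letter and the vector PCS applied by the hook below.  */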
22949 /* Implement TARGET_SIMD_CLONE_ADJUST. */
22951 static void
22952 aarch64_simd_clone_adjust (struct cgraph_node *node)
22954 /* Add aarch64_vector_pcs target attribute to SIMD clones so they
22955 use the correct ABI. */
22957 tree t = TREE_TYPE (node->decl);
22958 TYPE_ATTRIBUTES (t) = make_attribute ("aarch64_vector_pcs", "default",
22959 TYPE_ATTRIBUTES (t));
22962 /* Implement TARGET_SIMD_CLONE_USABLE. */
22964 static int
22965 aarch64_simd_clone_usable (struct cgraph_node *node)
22967 switch (node->simdclone->vecsize_mangle)
22969 case 'n':
22970 if (!TARGET_SIMD)
22971 return -1;
22972 return 0;
22973 default:
22974 gcc_unreachable ();
22978 /* Implement TARGET_COMP_TYPE_ATTRIBUTES */
22980 static int
22981 aarch64_comp_type_attributes (const_tree type1, const_tree type2)
22983 auto check_attr = [&](const char *name) {
22984 tree attr1 = lookup_attribute (name, TYPE_ATTRIBUTES (type1));
22985 tree attr2 = lookup_attribute (name, TYPE_ATTRIBUTES (type2));
22986 if (!attr1 && !attr2)
22987 return true;
22989 return attr1 && attr2 && attribute_value_equal (attr1, attr2);
22992 if (!check_attr ("aarch64_vector_pcs"))
22993 return 0;
22994 if (!check_attr ("Advanced SIMD type"))
22995 return 0;
22996 return 1;
22999 /* Implement TARGET_GET_MULTILIB_ABI_NAME */
23001 static const char *
23002 aarch64_get_multilib_abi_name (void)
23004 if (TARGET_BIG_END)
23005 return TARGET_ILP32 ? "aarch64_be_ilp32" : "aarch64_be";
23006 return TARGET_ILP32 ? "aarch64_ilp32" : "aarch64";
23009 /* Implement TARGET_STACK_PROTECT_GUARD. In case of a
23010 global variable based guard use the default else
23011 return a null tree. */
23012 static tree
23013 aarch64_stack_protect_guard (void)
23015 if (aarch64_stack_protector_guard == SSP_GLOBAL)
23016 return default_stack_protect_guard ();
23018 return NULL_TREE;
23021 /* Return the diagnostic message string if conversion from FROMTYPE to
23022 TOTYPE is not allowed, NULL otherwise. */
23024 static const char *
23025 aarch64_invalid_conversion (const_tree fromtype, const_tree totype)
23027 if (element_mode (fromtype) != element_mode (totype))
23029 /* Do not allow conversions to/from BFmode scalar types. */
23030 if (TYPE_MODE (fromtype) == BFmode)
23031 return N_("invalid conversion from type %<bfloat16_t%>");
23032 if (TYPE_MODE (totype) == BFmode)
23033 return N_("invalid conversion to type %<bfloat16_t%>");
23036 /* Conversion allowed. */
23037 return NULL;
23040 /* Return the diagnostic message string if the unary operation OP is
23041 not permitted on TYPE, NULL otherwise. */
23043 static const char *
23044 aarch64_invalid_unary_op (int op, const_tree type)
23046 /* Reject all single-operand operations on BFmode except for &. */
23047 if (element_mode (type) == BFmode && op != ADDR_EXPR)
23048 return N_("operation not permitted on type %<bfloat16_t%>");
23050 /* Operation allowed. */
23051 return NULL;
23054 /* Return the diagnostic message string if the binary operation OP is
23055 not permitted on TYPE1 and TYPE2, NULL otherwise. */
23057 static const char *
23058 aarch64_invalid_binary_op (int op ATTRIBUTE_UNUSED, const_tree type1,
23059 const_tree type2)
23061 /* Reject all 2-operand operations on BFmode. */
23062 if (element_mode (type1) == BFmode
23063 || element_mode (type2) == BFmode)
23064 return N_("operation not permitted on type %<bfloat16_t%>");
23066 if (VECTOR_TYPE_P (type1)
23067 && VECTOR_TYPE_P (type2)
23068 && !TYPE_INDIVISIBLE_P (type1)
23069 && !TYPE_INDIVISIBLE_P (type2)
23070 && (aarch64_sve::builtin_type_p (type1)
23071 != aarch64_sve::builtin_type_p (type2)))
23072 return N_("cannot combine GNU and SVE vectors in a binary operation");
23074 /* Operation allowed. */
23075 return NULL;
23078 /* Implement TARGET_ASM_FILE_END for AArch64. This adds the AArch64 GNU NOTE
23079 section at the end if needed. */
23080 #define GNU_PROPERTY_AARCH64_FEATURE_1_AND 0xc0000000
23081 #define GNU_PROPERTY_AARCH64_FEATURE_1_BTI (1U << 0)
23082 #define GNU_PROPERTY_AARCH64_FEATURE_1_PAC (1U << 1)
23083 void
23084 aarch64_file_end_indicate_exec_stack ()
23086 file_end_indicate_exec_stack ();
23088 unsigned feature_1_and = 0;
23089 if (aarch64_bti_enabled ())
23090 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
23092 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE)
23093 feature_1_and |= GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
23095 if (feature_1_and)
23097 /* Generate .note.gnu.property section. */
23098 switch_to_section (get_section (".note.gnu.property",
23099 SECTION_NOTYPE, NULL));
23101 /* PT_NOTE header: namesz, descsz, type.
23102 namesz = 4 ("GNU\0")
23103 descsz = 16 (Size of the program property array)
23104 [(12 + padding) * Number of array elements]
23105 type = 5 (NT_GNU_PROPERTY_TYPE_0). */
23106 assemble_align (POINTER_SIZE);
23107 assemble_integer (GEN_INT (4), 4, 32, 1);
23108 assemble_integer (GEN_INT (ROUND_UP (12, POINTER_BYTES)), 4, 32, 1);
23109 assemble_integer (GEN_INT (5), 4, 32, 1);
23111 /* PT_NOTE name. */
23112 assemble_string ("GNU", 4);
23114 /* PT_NOTE contents for NT_GNU_PROPERTY_TYPE_0:
23115 type = GNU_PROPERTY_AARCH64_FEATURE_1_AND
23116 datasz = 4
23117 data = feature_1_and. */
23118 assemble_integer (GEN_INT (GNU_PROPERTY_AARCH64_FEATURE_1_AND), 4, 32, 1);
23119 assemble_integer (GEN_INT (4), 4, 32, 1);
23120 assemble_integer (GEN_INT (feature_1_and), 4, 32, 1);
23122 /* Pad the size of the note to the required alignment. */
23123 assemble_align (POINTER_SIZE);
23126 #undef GNU_PROPERTY_AARCH64_FEATURE_1_PAC
23127 #undef GNU_PROPERTY_AARCH64_FEATURE_1_BTI
23128 #undef GNU_PROPERTY_AARCH64_FEATURE_1_AND
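/* For reference, with both BTI and return-address signing enabled the
   function above emits roughly the following (the exact directives depend
   on the configured assembler):

     .section  .note.gnu.property
     .align    3
     .word     4            // namesz ("GNU\0")
     .word     16           // descsz
     .word     5            // NT_GNU_PROPERTY_TYPE_0
     .string   "GNU"
     .word     0xc0000000   // GNU_PROPERTY_AARCH64_FEATURE_1_AND
     .word     4            // datasz
     .word     3            // BTI | PAC
     .align    3  */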
23130 /* Helper function for straight line speculation.
23131 Return what barrier should be emitted for straight line speculation
23132 mitigation.
23133 When not mitigating against straight line speculation this function returns
23134 an empty string.
23135 When mitigating against straight line speculation, use:
23136 * SB when the v8.5-A SB extension is enabled.
23137 * DSB+ISB otherwise. */
23138 const char *
23139 aarch64_sls_barrier (int mitigation_required)
23141 return mitigation_required
23142 ? (TARGET_SB ? "sb" : "dsb\tsy\n\tisb")
23143 : "";
23146 static GTY (()) tree aarch64_sls_shared_thunks[30];
23147 static GTY (()) bool aarch64_sls_shared_thunks_needed = false;
23148 const char *indirect_symbol_names[30] = {
23149 "__call_indirect_x0",
23150 "__call_indirect_x1",
23151 "__call_indirect_x2",
23152 "__call_indirect_x3",
23153 "__call_indirect_x4",
23154 "__call_indirect_x5",
23155 "__call_indirect_x6",
23156 "__call_indirect_x7",
23157 "__call_indirect_x8",
23158 "__call_indirect_x9",
23159 "__call_indirect_x10",
23160 "__call_indirect_x11",
23161 "__call_indirect_x12",
23162 "__call_indirect_x13",
23163 "__call_indirect_x14",
23164 "__call_indirect_x15",
23165 "", /* "__call_indirect_x16", */
23166 "", /* "__call_indirect_x17", */
23167 "__call_indirect_x18",
23168 "__call_indirect_x19",
23169 "__call_indirect_x20",
23170 "__call_indirect_x21",
23171 "__call_indirect_x22",
23172 "__call_indirect_x23",
23173 "__call_indirect_x24",
23174 "__call_indirect_x25",
23175 "__call_indirect_x26",
23176 "__call_indirect_x27",
23177 "__call_indirect_x28",
23178 "__call_indirect_x29",
23181 /* Function to create a BLR thunk. This thunk is used to mitigate straight
23182 line speculation. Instead of a simple BLR that can be speculated past,
23183 we emit a BL to this thunk, and this thunk contains a BR to the relevant
23184 register. These thunks have the relevant speculation barriers put after
23185 their indirect branch so that speculation is blocked.
23187 We use such a thunk so the speculation barriers are kept off the
23188 architecturally executed path in order to reduce the performance overhead.
23190 When optimizing for size we use stubs shared by the linked object.
23191 When optimizing for performance we emit stubs for each function in the hope
23192 that the branch predictor can better train on jumps specific to a given
23193 function. */
23195 aarch64_sls_create_blr_label (int regnum)
23197 gcc_assert (STUB_REGNUM_P (regnum));
23198 if (optimize_function_for_size_p (cfun))
23200 /* For the thunks shared between different functions in this compilation
23201 unit we use a named symbol -- this is just for users to more easily
23202 understand the generated assembly. */
23203 aarch64_sls_shared_thunks_needed = true;
23204 const char *thunk_name = indirect_symbol_names[regnum];
23205 if (aarch64_sls_shared_thunks[regnum] == NULL)
23207 /* Build a decl representing this function stub and record it for
23208 later. We build a decl here so we can use the GCC machinery for
23209 handling sections automatically (through `get_named_section` and
23210 `make_decl_one_only`). That saves us a lot of trouble handling
23211 the specifics of different output file formats. */
23212 tree decl = build_decl (BUILTINS_LOCATION, FUNCTION_DECL,
23213 get_identifier (thunk_name),
23214 build_function_type_list (void_type_node,
23215 NULL_TREE));
23216 DECL_RESULT (decl) = build_decl (BUILTINS_LOCATION, RESULT_DECL,
23217 NULL_TREE, void_type_node);
23218 TREE_PUBLIC (decl) = 1;
23219 TREE_STATIC (decl) = 1;
23220 DECL_IGNORED_P (decl) = 1;
23221 DECL_ARTIFICIAL (decl) = 1;
23222 make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
23223 resolve_unique_section (decl, 0, false);
23224 aarch64_sls_shared_thunks[regnum] = decl;
23227 return gen_rtx_SYMBOL_REF (Pmode, thunk_name);
23230 if (cfun->machine->call_via[regnum] == NULL)
23231 cfun->machine->call_via[regnum]
23232 = gen_rtx_LABEL_REF (Pmode, gen_label_rtx ());
23233 return cfun->machine->call_via[regnum];
23236 /* Helper function for aarch64_sls_emit_blr_function_thunks and
23237 aarch64_sls_emit_shared_blr_thunks below. */
23238 static void
23239 aarch64_sls_emit_function_stub (FILE *out_file, int regnum)
23241 /* Save in x16 and branch to that function so this transformation does
23242 not prevent jumping to `BTI c` instructions. */
23243 asm_fprintf (out_file, "\tmov\tx16, x%d\n", regnum);
23244 asm_fprintf (out_file, "\tbr\tx16\n");
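/* As an illustration, when BLR hardening is enabled an indirect call
   "blr x3" is instead emitted as a BL to the corresponding stub (the
   shared "__call_indirect_x3" symbol when optimizing for size), and the
   stub printed above, followed by the conservative barrier, looks like:

     __call_indirect_x3:
             mov  x16, x3
             br   x16
             dsb  sy
             isb  */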
23247 /* Emit all BLR stubs for this particular function.
23248 Here we emit all the BLR stubs needed for the current function. Since we
23249 emit these stubs in a consecutive block we know there will be no speculation
23250 gadgets between each stub, and hence we only emit a speculation barrier at
23251 the end of the stub sequences.
23253 This is called in the TARGET_ASM_FUNCTION_EPILOGUE hook. */
23254 void
23255 aarch64_sls_emit_blr_function_thunks (FILE *out_file)
23257 if (! aarch64_harden_sls_blr_p ())
23258 return;
23260 bool any_functions_emitted = false;
23261 /* We must save and restore the current function section since this assembly
23262 is emitted at the end of the function. This means it can be emitted *just
23263 after* the cold section of a function. That cold part would be emitted in
23264 a different section. That switch would trigger a `.cfi_endproc` directive
23265 to be emitted in the original section and a `.cfi_startproc` directive to
23266 be emitted in the new section. Switching to the original section without
23267 restoring would mean that the `.cfi_endproc` emitted as a function ends
23268 would happen in a different section -- leaving an unmatched
23269 `.cfi_startproc` in the cold text section and an unmatched `.cfi_endproc`
23270 in the standard text section. */
23271 section *save_text_section = in_section;
23272 switch_to_section (function_section (current_function_decl));
23273 for (int regnum = 0; regnum < 30; ++regnum)
23275 rtx specu_label = cfun->machine->call_via[regnum];
23276 if (specu_label == NULL)
23277 continue;
23279 targetm.asm_out.print_operand (out_file, specu_label, 0);
23280 asm_fprintf (out_file, ":\n");
23281 aarch64_sls_emit_function_stub (out_file, regnum);
23282 any_functions_emitted = true;
23284 if (any_functions_emitted)
23285 /* Can use the SB if needs be here, since this stub will only be used
23286 by the current function, and hence for the current target. */
23287 asm_fprintf (out_file, "\t%s\n", aarch64_sls_barrier (true));
23288 switch_to_section (save_text_section);
23291 /* Emit shared BLR stubs for the current compilation unit.
23292 Over the course of compiling this unit we may have converted some BLR
23293 instructions to a BL to a shared stub function. This is where we emit those
23294 stub functions.
23295 This function is for the stubs shared between different functions in this
23296 compilation unit. We share when optimizing for size instead of speed.
23298 This function is called through the TARGET_ASM_FILE_END hook. */
23299 void
23300 aarch64_sls_emit_shared_blr_thunks (FILE *out_file)
23302 if (! aarch64_sls_shared_thunks_needed)
23303 return;
23305 for (int regnum = 0; regnum < 30; ++regnum)
23307 tree decl = aarch64_sls_shared_thunks[regnum];
23308 if (!decl)
23309 continue;
23311 const char *name = indirect_symbol_names[regnum];
23312 switch_to_section (get_named_section (decl, NULL, 0));
23313 ASM_OUTPUT_ALIGN (out_file, 2);
23314 targetm.asm_out.globalize_label (out_file, name);
23315 /* Only emits if the compiler is configured for an assembler that can
23316 handle visibility directives. */
23317 targetm.asm_out.assemble_visibility (decl, VISIBILITY_HIDDEN);
23318 ASM_OUTPUT_TYPE_DIRECTIVE (out_file, name, "function");
23319 ASM_OUTPUT_LABEL (out_file, name);
23320 aarch64_sls_emit_function_stub (out_file, regnum);
23321 /* Use the most conservative target to ensure it can always be used by any
23322 function in the translation unit. */
23323 asm_fprintf (out_file, "\tdsb\tsy\n\tisb\n");
23324 ASM_DECLARE_FUNCTION_SIZE (out_file, name, decl);
23328 /* Implement TARGET_ASM_FILE_END. */
23329 void
23330 aarch64_asm_file_end ()
23332 aarch64_sls_emit_shared_blr_thunks (asm_out_file);
23333 /* Since this function will be called for the ASM_FILE_END hook, we ensure
23334 that what would be called otherwise (e.g. `file_end_indicate_exec_stack`
23335 for FreeBSD) still gets called. */
23336 #ifdef TARGET_ASM_FILE_END
23337 TARGET_ASM_FILE_END ();
23338 #endif
23341 const char *
23342 aarch64_indirect_call_asm (rtx addr)
23344 gcc_assert (REG_P (addr));
23345 if (aarch64_harden_sls_blr_p ())
23347 rtx stub_label = aarch64_sls_create_blr_label (REGNO (addr));
23348 output_asm_insn ("bl\t%0", &stub_label);
23350 else
23351 output_asm_insn ("blr\t%0", &addr);
23352 return "";
23355 /* Target-specific selftests. */
23357 #if CHECKING_P
23359 namespace selftest {
23361 /* Selftest for the RTL loader.
23362 Verify that the RTL loader copes with a dump from
23363 print_rtx_function. This is essentially just a test that class
23364 function_reader can handle a real dump, but it also verifies
23365 that lookup_reg_by_dump_name correctly handles hard regs.
23366 The presence of hard reg names in the dump means that the test is
23367 target-specific, hence it is in this file. */
23369 static void
23370 aarch64_test_loading_full_dump ()
23372 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
23374 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
23376 rtx_insn *insn_1 = get_insn_by_uid (1);
23377 ASSERT_EQ (NOTE, GET_CODE (insn_1));
23379 rtx_insn *insn_15 = get_insn_by_uid (15);
23380 ASSERT_EQ (INSN, GET_CODE (insn_15));
23381 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
23383 /* Verify crtl->return_rtx. */
23384 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
23385 ASSERT_EQ (0, REGNO (crtl->return_rtx));
23386 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
23389 /* Run all target-specific selftests. */
23391 static void
23392 aarch64_run_selftests (void)
23394 aarch64_test_loading_full_dump ();
23397 } // namespace selftest
23399 #endif /* #if CHECKING_P */
23401 #undef TARGET_STACK_PROTECT_GUARD
23402 #define TARGET_STACK_PROTECT_GUARD aarch64_stack_protect_guard
23404 #undef TARGET_ADDRESS_COST
23405 #define TARGET_ADDRESS_COST aarch64_address_cost
23407 /* This hook determines whether unnamed bitfields affect the alignment
23408 of the containing structure. The hook returns true if the structure
23409 should inherit the alignment requirements of an unnamed bitfield's
23410 type. */
23411 #undef TARGET_ALIGN_ANON_BITFIELD
23412 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
23414 #undef TARGET_ASM_ALIGNED_DI_OP
23415 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
23417 #undef TARGET_ASM_ALIGNED_HI_OP
23418 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
23420 #undef TARGET_ASM_ALIGNED_SI_OP
23421 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
23423 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
23424 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
23425 hook_bool_const_tree_hwi_hwi_const_tree_true
23427 #undef TARGET_ASM_FILE_START
23428 #define TARGET_ASM_FILE_START aarch64_start_file
23430 #undef TARGET_ASM_OUTPUT_MI_THUNK
23431 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
23433 #undef TARGET_ASM_SELECT_RTX_SECTION
23434 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
23436 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
23437 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
23439 #undef TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY
23440 #define TARGET_ASM_PRINT_PATCHABLE_FUNCTION_ENTRY aarch64_print_patchable_function_entry
23442 #undef TARGET_BUILD_BUILTIN_VA_LIST
23443 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
23445 #undef TARGET_CALLEE_COPIES
23446 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_arg_info_false
23448 #undef TARGET_CAN_ELIMINATE
23449 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
23451 #undef TARGET_CAN_INLINE_P
23452 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
23454 #undef TARGET_CANNOT_FORCE_CONST_MEM
23455 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
23457 #undef TARGET_CASE_VALUES_THRESHOLD
23458 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
23460 #undef TARGET_CONDITIONAL_REGISTER_USAGE
23461 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
23463 #undef TARGET_MEMBER_TYPE_FORCES_BLK
23464 #define TARGET_MEMBER_TYPE_FORCES_BLK aarch64_member_type_forces_blk
23466 /* Only the least significant bit is used for initialization guard
23467 variables. */
23468 #undef TARGET_CXX_GUARD_MASK_BIT
23469 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
23471 #undef TARGET_C_MODE_FOR_SUFFIX
23472 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
23474 #ifdef TARGET_BIG_ENDIAN_DEFAULT
23475 #undef TARGET_DEFAULT_TARGET_FLAGS
23476 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
23477 #endif
23479 #undef TARGET_CLASS_MAX_NREGS
23480 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
23482 #undef TARGET_BUILTIN_DECL
23483 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
23485 #undef TARGET_BUILTIN_RECIPROCAL
23486 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
23488 #undef TARGET_C_EXCESS_PRECISION
23489 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
23491 #undef TARGET_EXPAND_BUILTIN
23492 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
23494 #undef TARGET_EXPAND_BUILTIN_VA_START
23495 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
23497 #undef TARGET_FOLD_BUILTIN
23498 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
23500 #undef TARGET_FUNCTION_ARG
23501 #define TARGET_FUNCTION_ARG aarch64_function_arg
23503 #undef TARGET_FUNCTION_ARG_ADVANCE
23504 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
23506 #undef TARGET_FUNCTION_ARG_BOUNDARY
23507 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
23509 #undef TARGET_FUNCTION_ARG_PADDING
23510 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
23512 #undef TARGET_GET_RAW_RESULT_MODE
23513 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
23514 #undef TARGET_GET_RAW_ARG_MODE
23515 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
23517 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
23518 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
23520 #undef TARGET_FUNCTION_VALUE
23521 #define TARGET_FUNCTION_VALUE aarch64_function_value
23523 #undef TARGET_FUNCTION_VALUE_REGNO_P
23524 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
23526 #undef TARGET_GIMPLE_FOLD_BUILTIN
23527 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
23529 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
23530 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
23532 #undef TARGET_INIT_BUILTINS
23533 #define TARGET_INIT_BUILTINS aarch64_init_builtins
23535 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
23536 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
23537 aarch64_ira_change_pseudo_allocno_class
23539 #undef TARGET_LEGITIMATE_ADDRESS_P
23540 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
23542 #undef TARGET_LEGITIMATE_CONSTANT_P
23543 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
23545 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
23546 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
23547 aarch64_legitimize_address_displacement
23549 #undef TARGET_LIBGCC_CMP_RETURN_MODE
23550 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
23552 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
23553 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
23554 aarch64_libgcc_floating_mode_supported_p
23556 #undef TARGET_MANGLE_TYPE
23557 #define TARGET_MANGLE_TYPE aarch64_mangle_type
23559 #undef TARGET_INVALID_CONVERSION
23560 #define TARGET_INVALID_CONVERSION aarch64_invalid_conversion
23562 #undef TARGET_INVALID_UNARY_OP
23563 #define TARGET_INVALID_UNARY_OP aarch64_invalid_unary_op
23565 #undef TARGET_INVALID_BINARY_OP
23566 #define TARGET_INVALID_BINARY_OP aarch64_invalid_binary_op
23568 #undef TARGET_VERIFY_TYPE_CONTEXT
23569 #define TARGET_VERIFY_TYPE_CONTEXT aarch64_verify_type_context
23571 #undef TARGET_MEMORY_MOVE_COST
23572 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
23574 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
23575 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
23577 #undef TARGET_MUST_PASS_IN_STACK
23578 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
23580 /* This target hook should return true if accesses to volatile bitfields
23581 should use the narrowest mode possible. It should return false if these
23582 accesses should use the bitfield container type. */
23583 #undef TARGET_NARROW_VOLATILE_BITFIELD
23584 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
23586 #undef TARGET_OPTION_OVERRIDE
23587 #define TARGET_OPTION_OVERRIDE aarch64_override_options
23589 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
23590 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
23591 aarch64_override_options_after_change
23593 #undef TARGET_OFFLOAD_OPTIONS
23594 #define TARGET_OFFLOAD_OPTIONS aarch64_offload_options
23596 #undef TARGET_OPTION_SAVE
23597 #define TARGET_OPTION_SAVE aarch64_option_save
23599 #undef TARGET_OPTION_RESTORE
23600 #define TARGET_OPTION_RESTORE aarch64_option_restore
23602 #undef TARGET_OPTION_PRINT
23603 #define TARGET_OPTION_PRINT aarch64_option_print
23605 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
23606 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
23608 #undef TARGET_SET_CURRENT_FUNCTION
23609 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
23611 #undef TARGET_PASS_BY_REFERENCE
23612 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
23614 #undef TARGET_PREFERRED_RELOAD_CLASS
23615 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
23617 #undef TARGET_SCHED_REASSOCIATION_WIDTH
23618 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
23620 #undef TARGET_PROMOTED_TYPE
23621 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
23623 #undef TARGET_SECONDARY_RELOAD
23624 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
23626 #undef TARGET_SHIFT_TRUNCATION_MASK
23627 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
23629 #undef TARGET_SETUP_INCOMING_VARARGS
23630 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
23632 #undef TARGET_STRUCT_VALUE_RTX
23633 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
23635 #undef TARGET_REGISTER_MOVE_COST
23636 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
23638 #undef TARGET_RETURN_IN_MEMORY
23639 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
23641 #undef TARGET_RETURN_IN_MSB
23642 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
23644 #undef TARGET_RTX_COSTS
23645 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
23647 #undef TARGET_SCALAR_MODE_SUPPORTED_P
23648 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
23650 #undef TARGET_SCHED_ISSUE_RATE
23651 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
23653 #undef TARGET_SCHED_VARIABLE_ISSUE
23654 #define TARGET_SCHED_VARIABLE_ISSUE aarch64_sched_variable_issue
23656 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
23657 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
23658 aarch64_sched_first_cycle_multipass_dfa_lookahead
23660 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
23661 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
23662 aarch64_first_cycle_multipass_dfa_lookahead_guard
23664 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
23665 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
23666 aarch64_get_separate_components
23668 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
23669 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
23670 aarch64_components_for_bb
23672 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
23673 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
23674 aarch64_disqualify_components
23676 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
23677 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
23678 aarch64_emit_prologue_components
23680 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
23681 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
23682 aarch64_emit_epilogue_components
23684 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
23685 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
23686 aarch64_set_handled_components
23688 #undef TARGET_TRAMPOLINE_INIT
23689 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
23691 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
23692 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
23694 #undef TARGET_VECTOR_MODE_SUPPORTED_P
23695 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
23697 #undef TARGET_COMPATIBLE_VECTOR_TYPES_P
23698 #define TARGET_COMPATIBLE_VECTOR_TYPES_P aarch64_compatible_vector_types_p
23700 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
23701 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
23702 aarch64_builtin_support_vector_misalignment
23704 #undef TARGET_ARRAY_MODE
23705 #define TARGET_ARRAY_MODE aarch64_array_mode
23707 #undef TARGET_ARRAY_MODE_SUPPORTED_P
23708 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
23710 #undef TARGET_VECTORIZE_ADD_STMT_COST
23711 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
23713 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
23714 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
23715 aarch64_builtin_vectorization_cost
23717 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
23718 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
23720 #undef TARGET_VECTORIZE_BUILTINS
23721 #define TARGET_VECTORIZE_BUILTINS
23723 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
23724 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
23725 aarch64_builtin_vectorized_function
23727 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES
23728 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_MODES \
23729 aarch64_autovectorize_vector_modes
23731 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
23732 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
23733 aarch64_atomic_assign_expand_fenv
23735 /* Section anchor support. */
23737 #undef TARGET_MIN_ANCHOR_OFFSET
23738 #define TARGET_MIN_ANCHOR_OFFSET -256
23740 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
23741 byte offset; we can do much more for larger data types, but have no way
23742 to determine the size of the access. We assume accesses are aligned. */
23743 #undef TARGET_MAX_ANCHOR_OFFSET
23744 #define TARGET_MAX_ANCHOR_OFFSET 4095
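/* Editor's note (not part of the original source): an illustrative sketch of
   why these bounds are chosen.  With section anchors, nearby objects are
   addressed relative to a single anchor symbol, e.g.

     adrp x0, .LANCHOR0
     add  x0, x0, :lo12:.LANCHOR0
     ldrb w1, [x0, #4095]    // unsigned byte offsets reach 0..4095
     ldur w2, [x0, #-256]    // unscaled signed offsets reach -256..255

   so -256 and 4095 are the offset ranges that stay valid even for
   single-byte accesses, since the access size is unknown at this point.  */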
23746 #undef TARGET_VECTOR_ALIGNMENT
23747 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
23749 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
23750 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
23751 aarch64_vectorize_preferred_vector_alignment
23752 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
23753 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
23754 aarch64_simd_vector_alignment_reachable
23756 /* vec_perm support. */
23758 #undef TARGET_VECTORIZE_VEC_PERM_CONST
23759 #define TARGET_VECTORIZE_VEC_PERM_CONST \
23760 aarch64_vectorize_vec_perm_const
23762 #undef TARGET_VECTORIZE_RELATED_MODE
23763 #define TARGET_VECTORIZE_RELATED_MODE aarch64_vectorize_related_mode
23764 #undef TARGET_VECTORIZE_GET_MASK_MODE
23765 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
23766 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
23767 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
23768 aarch64_empty_mask_is_expensive
23769 #undef TARGET_PREFERRED_ELSE_VALUE
23770 #define TARGET_PREFERRED_ELSE_VALUE \
23771 aarch64_preferred_else_value
23773 #undef TARGET_INIT_LIBFUNCS
23774 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
23776 #undef TARGET_FIXED_CONDITION_CODE_REGS
23777 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
23779 #undef TARGET_FLAGS_REGNUM
23780 #define TARGET_FLAGS_REGNUM CC_REGNUM
23782 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
23783 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
23785 #undef TARGET_ASAN_SHADOW_OFFSET
23786 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
23788 #undef TARGET_LEGITIMIZE_ADDRESS
23789 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
23791 #undef TARGET_SCHED_CAN_SPECULATE_INSN
23792 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
23794 #undef TARGET_CAN_USE_DOLOOP_P
23795 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
23797 #undef TARGET_SCHED_ADJUST_PRIORITY
23798 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
23800 #undef TARGET_SCHED_MACRO_FUSION_P
23801 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
23803 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
23804 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
23806 #undef TARGET_SCHED_FUSION_PRIORITY
23807 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
23809 #undef TARGET_UNSPEC_MAY_TRAP_P
23810 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
23812 #undef TARGET_USE_PSEUDO_PIC_REG
23813 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
23815 #undef TARGET_PRINT_OPERAND
23816 #define TARGET_PRINT_OPERAND aarch64_print_operand
23818 #undef TARGET_PRINT_OPERAND_ADDRESS
23819 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
23821 #undef TARGET_OPTAB_SUPPORTED_P
23822 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
23824 #undef TARGET_OMIT_STRUCT_RETURN_REG
23825 #define TARGET_OMIT_STRUCT_RETURN_REG true
23827 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
23828 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
23829 aarch64_dwarf_poly_indeterminate_value
23831 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
23832 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
23833 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
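/* Editor's note (not part of the original source): 4 is (1 << 2), so it is
   bit 2 of a function pointer that distinguishes a descriptor from a plain
   code address, consistent with the comment above.  */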
23835 #undef TARGET_HARD_REGNO_NREGS
23836 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
23837 #undef TARGET_HARD_REGNO_MODE_OK
23838 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
23840 #undef TARGET_MODES_TIEABLE_P
23841 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
23843 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
23844 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
23845 aarch64_hard_regno_call_part_clobbered
23847 #undef TARGET_INSN_CALLEE_ABI
23848 #define TARGET_INSN_CALLEE_ABI aarch64_insn_callee_abi
23850 #undef TARGET_CONSTANT_ALIGNMENT
23851 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
23853 #undef TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE
23854 #define TARGET_STACK_CLASH_PROTECTION_ALLOCA_PROBE_RANGE \
23855 aarch64_stack_clash_protection_alloca_probe_range
23857 #undef TARGET_COMPUTE_PRESSURE_CLASSES
23858 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
23860 #undef TARGET_CAN_CHANGE_MODE_CLASS
23861 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
23863 #undef TARGET_SELECT_EARLY_REMAT_MODES
23864 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
23866 #undef TARGET_SPECULATION_SAFE_VALUE
23867 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
23869 #undef TARGET_ESTIMATED_POLY_VALUE
23870 #define TARGET_ESTIMATED_POLY_VALUE aarch64_estimated_poly_value
23872 #undef TARGET_ATTRIBUTE_TABLE
23873 #define TARGET_ATTRIBUTE_TABLE aarch64_attribute_table
23875 #undef TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN
23876 #define TARGET_SIMD_CLONE_COMPUTE_VECSIZE_AND_SIMDLEN \
23877 aarch64_simd_clone_compute_vecsize_and_simdlen
23879 #undef TARGET_SIMD_CLONE_ADJUST
23880 #define TARGET_SIMD_CLONE_ADJUST aarch64_simd_clone_adjust
23882 #undef TARGET_SIMD_CLONE_USABLE
23883 #define TARGET_SIMD_CLONE_USABLE aarch64_simd_clone_usable
23885 #undef TARGET_COMP_TYPE_ATTRIBUTES
23886 #define TARGET_COMP_TYPE_ATTRIBUTES aarch64_comp_type_attributes
23888 #undef TARGET_GET_MULTILIB_ABI_NAME
23889 #define TARGET_GET_MULTILIB_ABI_NAME aarch64_get_multilib_abi_name
23891 #undef TARGET_FNTYPE_ABI
23892 #define TARGET_FNTYPE_ABI aarch64_fntype_abi
23894 #if CHECKING_P
23895 #undef TARGET_RUN_TARGET_SELFTESTS
23896 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
23897 #endif /* #if CHECKING_P */
23899 #undef TARGET_ASM_POST_CFI_STARTPROC
23900 #define TARGET_ASM_POST_CFI_STARTPROC aarch64_post_cfi_startproc
23902 #undef TARGET_STRICT_ARGUMENT_NAMING
23903 #define TARGET_STRICT_ARGUMENT_NAMING hook_bool_CUMULATIVE_ARGS_true
23905 #undef TARGET_MD_ASM_ADJUST
23906 #define TARGET_MD_ASM_ADJUST arm_md_asm_adjust
23908 #undef TARGET_ASM_FILE_END
23909 #define TARGET_ASM_FILE_END aarch64_asm_file_end
23911 #undef TARGET_ASM_FUNCTION_EPILOGUE
23912 #define TARGET_ASM_FUNCTION_EPILOGUE aarch64_sls_emit_blr_function_thunks
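/* Editor's note (not part of the original source): TARGET_INITIALIZER,
   provided by target-def.h, expands to an aggregate initializer that picks up
   every TARGET_* override defined above and falls back to the documented
   defaults for the remaining hooks.  */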
23914 struct gcc_target targetm = TARGET_INITIALIZER;
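/* Editor's note (not part of the original source): gt-aarch64.h is generated
   by gengtype and registers this file's GTY-marked statics with the garbage
   collector.  */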
23916 #include "gt-aarch64.h"