1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
 101    ADDRESS_SYMBOLIC
 102      A constant symbolic address, in the pc-relative literal pool.  */
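/* Illustrative examples of the classes above (a sketch added here, not
   part of the original documentation; the concrete registers, offsets
   and symbols are arbitrary):

     ADDRESS_REG_IMM    ldr  x0, [x1, #16]
     ADDRESS_REG_WB     ldr  x0, [x1, #16]!    or    ldr  x0, [x1], #16
     ADDRESS_REG_REG    ldr  x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr  x0, [x1, w2, uxtw #2]
     ADDRESS_REG_SXTW   ldr  x0, [x1, w2, sxtw #2]
     ADDRESS_LO_SUM     ldr  x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr  x0, .Lliteral_pool_entry  */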
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
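/* A worked example (illustrative only, not from the original source):
   the Advanced SIMD constant { 0x00FF0000, 0x00FF0000, 0x00FF0000,
   0x00FF0000 } in V4SImode could be described as

     simd_immediate_info (SImode, 0xff, simd_immediate_info::MOV,
                          simd_immediate_info::LSL, 16);

   i.e. a "movi v0.4s, #0xff, lsl #16", while an SVE series constant
   such as { 0, 1, 2, ... } would use the (mode, value, step)
   constructor with value 0 and step 1.  */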
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Global flag for whether frame pointer is enabled. */
224 bool aarch64_use_frame_pointer;
226 /* Support for command line parsing of boolean flags in the tuning
227 structures. */
228 struct aarch64_flag_desc
230 const char* name;
231 unsigned int flag;
234 #define AARCH64_FUSION_PAIR(name, internal_name) \
235 { name, AARCH64_FUSE_##internal_name },
236 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 { "none", AARCH64_FUSE_NOTHING },
239 #include "aarch64-fusion-pairs.def"
240 { "all", AARCH64_FUSE_ALL },
241 { NULL, AARCH64_FUSE_NOTHING }
244 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
245 { name, AARCH64_EXTRA_TUNE_##internal_name },
246 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 { "none", AARCH64_EXTRA_TUNE_NONE },
249 #include "aarch64-tuning-flags.def"
250 { "all", AARCH64_EXTRA_TUNE_ALL },
251 { NULL, AARCH64_EXTRA_TUNE_NONE }
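/* These tables back the -moverride string parsing (see
   aarch64_tuning_override_functions further below).  As an
   illustration only (exact spellings of the individual entries come
   from the .def files included above), a command line along the lines
   of

     gcc -mcpu=cortex-a57 -moverride=fuse=all

   or -moverride=tune=none is resolved against the "name" strings in
   these tables.  */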
254 /* Tuning parameters. */
256 static const struct cpu_addrcost_table generic_addrcost_table =
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
264 0, /* pre_modify */
265 0, /* post_modify */
266 0, /* register_offset */
267 0, /* register_sextend */
268 0, /* register_zextend */
269 0 /* imm_offset */
272 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 0, /* hi */
276 0, /* si */
277 0, /* di */
278 2, /* ti */
280 0, /* pre_modify */
281 0, /* post_modify */
282 1, /* register_offset */
283 1, /* register_sextend */
284 2, /* register_zextend */
285 0, /* imm_offset */
288 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
296 1, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
304 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
312 0, /* pre_modify */
313 0, /* post_modify */
314 2, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 0, /* imm_offset */
320 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
323 1, /* hi */
324 1, /* si */
325 1, /* di */
326 2, /* ti */
328 1, /* pre_modify */
329 1, /* post_modify */
330 3, /* register_offset */
331 4, /* register_sextend */
332 3, /* register_zextend */
333 2, /* imm_offset */
336 static const struct cpu_regmove_cost generic_regmove_cost =
338 1, /* GP2GP */
339 /* Avoid the use of slow int<->fp moves for spilling by setting
340 their cost higher than memmov_cost. */
341 5, /* GP2FP */
342 5, /* FP2GP */
343 2 /* FP2FP */
346 static const struct cpu_regmove_cost cortexa57_regmove_cost =
348 1, /* GP2GP */
349 /* Avoid the use of slow int<->fp moves for spilling by setting
350 their cost higher than memmov_cost. */
351 5, /* GP2FP */
352 5, /* FP2GP */
353 2 /* FP2FP */
356 static const struct cpu_regmove_cost cortexa53_regmove_cost =
358 1, /* GP2GP */
359 /* Avoid the use of slow int<->fp moves for spilling by setting
360 their cost higher than memmov_cost. */
361 5, /* GP2FP */
362 5, /* FP2GP */
363 2 /* FP2FP */
366 static const struct cpu_regmove_cost exynosm1_regmove_cost =
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
 370      their cost higher than memmov_cost (actually 4 and 9).  */
371 9, /* GP2FP */
372 9, /* FP2GP */
373 1 /* FP2FP */
376 static const struct cpu_regmove_cost thunderx_regmove_cost =
378 2, /* GP2GP */
379 2, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost xgene1_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 2 /* FP2FP */
394 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
396 2, /* GP2GP */
397 /* Avoid the use of int<->fp moves for spilling. */
398 6, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
405 1, /* GP2GP */
406 /* Avoid the use of int<->fp moves for spilling. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 4 /* FP2FP */
412 /* Generic costs for vector insn classes. */
413 static const struct cpu_vector_cost generic_vector_cost =
415 1, /* scalar_int_stmt_cost */
416 1, /* scalar_fp_stmt_cost */
417 1, /* scalar_load_cost */
418 1, /* scalar_store_cost */
419 1, /* vec_int_stmt_cost */
420 1, /* vec_fp_stmt_cost */
421 2, /* vec_permute_cost */
422 1, /* vec_to_scalar_cost */
423 1, /* scalar_to_vec_cost */
424 1, /* vec_align_load_cost */
425 1, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 3, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
432 /* ThunderX costs for vector insn classes. */
433 static const struct cpu_vector_cost thunderx_vector_cost =
435 1, /* scalar_int_stmt_cost */
436 1, /* scalar_fp_stmt_cost */
437 3, /* scalar_load_cost */
438 1, /* scalar_store_cost */
439 4, /* vec_int_stmt_cost */
440 1, /* vec_fp_stmt_cost */
441 4, /* vec_permute_cost */
442 2, /* vec_to_scalar_cost */
443 2, /* scalar_to_vec_cost */
444 3, /* vec_align_load_cost */
445 5, /* vec_unalign_load_cost */
446 5, /* vec_unalign_store_cost */
447 1, /* vec_store_cost */
448 3, /* cond_taken_branch_cost */
449 3 /* cond_not_taken_branch_cost */
 452 /* Cortex-A57 costs for vector insn classes.  */
453 static const struct cpu_vector_cost cortexa57_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 4, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 2, /* vec_int_stmt_cost */
460 2, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 8, /* vec_to_scalar_cost */
463 8, /* scalar_to_vec_cost */
464 4, /* vec_align_load_cost */
465 4, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 static const struct cpu_vector_cost exynosm1_vector_cost =
474 1, /* scalar_int_stmt_cost */
475 1, /* scalar_fp_stmt_cost */
476 5, /* scalar_load_cost */
477 1, /* scalar_store_cost */
478 3, /* vec_int_stmt_cost */
479 3, /* vec_fp_stmt_cost */
480 3, /* vec_permute_cost */
481 3, /* vec_to_scalar_cost */
482 3, /* scalar_to_vec_cost */
483 5, /* vec_align_load_cost */
484 5, /* vec_unalign_load_cost */
485 1, /* vec_unalign_store_cost */
486 1, /* vec_store_cost */
487 1, /* cond_taken_branch_cost */
488 1 /* cond_not_taken_branch_cost */
 491 /* X-Gene 1 costs for vector insn classes.  */
492 static const struct cpu_vector_cost xgene1_vector_cost =
494 1, /* scalar_int_stmt_cost */
495 1, /* scalar_fp_stmt_cost */
496 5, /* scalar_load_cost */
497 1, /* scalar_store_cost */
498 2, /* vec_int_stmt_cost */
499 2, /* vec_fp_stmt_cost */
500 2, /* vec_permute_cost */
501 4, /* vec_to_scalar_cost */
502 4, /* scalar_to_vec_cost */
503 10, /* vec_align_load_cost */
504 10, /* vec_unalign_load_cost */
505 2, /* vec_unalign_store_cost */
506 2, /* vec_store_cost */
507 2, /* cond_taken_branch_cost */
508 1 /* cond_not_taken_branch_cost */
511 /* Costs for vector insn classes for Vulcan. */
512 static const struct cpu_vector_cost thunderx2t99_vector_cost =
514 1, /* scalar_int_stmt_cost */
515 6, /* scalar_fp_stmt_cost */
516 4, /* scalar_load_cost */
517 1, /* scalar_store_cost */
518 5, /* vec_int_stmt_cost */
519 6, /* vec_fp_stmt_cost */
520 3, /* vec_permute_cost */
521 6, /* vec_to_scalar_cost */
522 5, /* scalar_to_vec_cost */
523 8, /* vec_align_load_cost */
524 8, /* vec_unalign_load_cost */
525 4, /* vec_unalign_store_cost */
526 4, /* vec_store_cost */
527 2, /* cond_taken_branch_cost */
528 1 /* cond_not_taken_branch_cost */
531 /* Generic costs for branch instructions. */
532 static const struct cpu_branch_cost generic_branch_cost =
534 1, /* Predictable. */
535 3 /* Unpredictable. */
538 /* Generic approximation modes. */
539 static const cpu_approx_modes generic_approx_modes =
541 AARCH64_APPROX_NONE, /* division */
542 AARCH64_APPROX_NONE, /* sqrt */
543 AARCH64_APPROX_NONE /* recip_sqrt */
546 /* Approximation modes for Exynos M1. */
547 static const cpu_approx_modes exynosm1_approx_modes =
549 AARCH64_APPROX_NONE, /* division */
550 AARCH64_APPROX_ALL, /* sqrt */
551 AARCH64_APPROX_ALL /* recip_sqrt */
554 /* Approximation modes for X-Gene 1. */
555 static const cpu_approx_modes xgene1_approx_modes =
557 AARCH64_APPROX_NONE, /* division */
558 AARCH64_APPROX_NONE, /* sqrt */
559 AARCH64_APPROX_ALL /* recip_sqrt */
562 /* Generic prefetch settings (which disable prefetch). */
563 static const cpu_prefetch_tune generic_prefetch_tune =
565 0, /* num_slots */
566 -1, /* l1_cache_size */
567 -1, /* l1_cache_line_size */
568 -1, /* l2_cache_size */
569 true, /* prefetch_dynamic_strides */
570 -1, /* minimum_stride */
571 -1 /* default_opt_level */
574 static const cpu_prefetch_tune exynosm1_prefetch_tune =
576 0, /* num_slots */
577 -1, /* l1_cache_size */
578 64, /* l1_cache_line_size */
579 -1, /* l2_cache_size */
580 true, /* prefetch_dynamic_strides */
581 -1, /* minimum_stride */
582 -1 /* default_opt_level */
585 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
587 4, /* num_slots */
588 32, /* l1_cache_size */
589 64, /* l1_cache_line_size */
590 512, /* l2_cache_size */
591 false, /* prefetch_dynamic_strides */
592 2048, /* minimum_stride */
593 3 /* default_opt_level */
596 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
598 8, /* num_slots */
599 32, /* l1_cache_size */
600 128, /* l1_cache_line_size */
601 16*1024, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 3 /* default_opt_level */
607 static const cpu_prefetch_tune thunderx_prefetch_tune =
609 8, /* num_slots */
610 32, /* l1_cache_size */
611 128, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
618 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
620 8, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 256, /* l2_cache_size */
624 true, /* prefetch_dynamic_strides */
625 -1, /* minimum_stride */
626 -1 /* default_opt_level */
629 static const struct tune_params generic_tunings =
631 &cortexa57_extra_costs,
632 &generic_addrcost_table,
633 &generic_regmove_cost,
634 &generic_vector_cost,
635 &generic_branch_cost,
636 &generic_approx_modes,
637 4, /* memmov_cost */
638 2, /* issue_rate */
639 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
640 "8", /* function_align. */
641 "4", /* jump_align. */
642 "8", /* loop_align. */
643 2, /* int_reassoc_width. */
644 4, /* fp_reassoc_width. */
645 1, /* vec_reassoc_width. */
646 2, /* min_div_recip_mul_sf. */
647 2, /* min_div_recip_mul_df. */
648 0, /* max_case_values. */
649 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
650 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
651 &generic_prefetch_tune
654 static const struct tune_params cortexa35_tunings =
656 &cortexa53_extra_costs,
657 &generic_addrcost_table,
658 &cortexa53_regmove_cost,
659 &generic_vector_cost,
660 &generic_branch_cost,
661 &generic_approx_modes,
662 4, /* memmov_cost */
663 1, /* issue_rate */
664 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
665 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
666 "16", /* function_align. */
667 "4", /* jump_align. */
668 "8", /* loop_align. */
669 2, /* int_reassoc_width. */
670 4, /* fp_reassoc_width. */
671 1, /* vec_reassoc_width. */
672 2, /* min_div_recip_mul_sf. */
673 2, /* min_div_recip_mul_df. */
674 0, /* max_case_values. */
675 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
676 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
677 &generic_prefetch_tune
680 static const struct tune_params cortexa53_tunings =
682 &cortexa53_extra_costs,
683 &generic_addrcost_table,
684 &cortexa53_regmove_cost,
685 &generic_vector_cost,
686 &generic_branch_cost,
687 &generic_approx_modes,
688 4, /* memmov_cost */
689 2, /* issue_rate */
690 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
691 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
692 "16", /* function_align. */
693 "4", /* jump_align. */
694 "8", /* loop_align. */
695 2, /* int_reassoc_width. */
696 4, /* fp_reassoc_width. */
697 1, /* vec_reassoc_width. */
698 2, /* min_div_recip_mul_sf. */
699 2, /* min_div_recip_mul_df. */
700 0, /* max_case_values. */
701 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
702 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
703 &generic_prefetch_tune
706 static const struct tune_params cortexa57_tunings =
708 &cortexa57_extra_costs,
709 &generic_addrcost_table,
710 &cortexa57_regmove_cost,
711 &cortexa57_vector_cost,
712 &generic_branch_cost,
713 &generic_approx_modes,
714 4, /* memmov_cost */
715 3, /* issue_rate */
716 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
717 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
718 "16", /* function_align. */
719 "4", /* jump_align. */
720 "8", /* loop_align. */
721 2, /* int_reassoc_width. */
722 4, /* fp_reassoc_width. */
723 1, /* vec_reassoc_width. */
724 2, /* min_div_recip_mul_sf. */
725 2, /* min_div_recip_mul_df. */
726 0, /* max_case_values. */
727 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
728 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
729 &generic_prefetch_tune
732 static const struct tune_params cortexa72_tunings =
734 &cortexa57_extra_costs,
735 &generic_addrcost_table,
736 &cortexa57_regmove_cost,
737 &cortexa57_vector_cost,
738 &generic_branch_cost,
739 &generic_approx_modes,
740 4, /* memmov_cost */
741 3, /* issue_rate */
742 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
743 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
744 "16", /* function_align. */
745 "4", /* jump_align. */
746 "8", /* loop_align. */
747 2, /* int_reassoc_width. */
748 4, /* fp_reassoc_width. */
749 1, /* vec_reassoc_width. */
750 2, /* min_div_recip_mul_sf. */
751 2, /* min_div_recip_mul_df. */
752 0, /* max_case_values. */
753 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
754 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
755 &generic_prefetch_tune
758 static const struct tune_params cortexa73_tunings =
760 &cortexa57_extra_costs,
761 &generic_addrcost_table,
762 &cortexa57_regmove_cost,
763 &cortexa57_vector_cost,
764 &generic_branch_cost,
765 &generic_approx_modes,
766 4, /* memmov_cost. */
767 2, /* issue_rate. */
768 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
769 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
770 "16", /* function_align. */
771 "4", /* jump_align. */
772 "8", /* loop_align. */
773 2, /* int_reassoc_width. */
774 4, /* fp_reassoc_width. */
775 1, /* vec_reassoc_width. */
776 2, /* min_div_recip_mul_sf. */
777 2, /* min_div_recip_mul_df. */
778 0, /* max_case_values. */
779 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
780 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
781 &generic_prefetch_tune
786 static const struct tune_params exynosm1_tunings =
788 &exynosm1_extra_costs,
789 &exynosm1_addrcost_table,
790 &exynosm1_regmove_cost,
791 &exynosm1_vector_cost,
792 &generic_branch_cost,
793 &exynosm1_approx_modes,
794 4, /* memmov_cost */
795 3, /* issue_rate */
796 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
797 "4", /* function_align. */
798 "4", /* jump_align. */
799 "4", /* loop_align. */
800 2, /* int_reassoc_width. */
801 4, /* fp_reassoc_width. */
802 1, /* vec_reassoc_width. */
803 2, /* min_div_recip_mul_sf. */
804 2, /* min_div_recip_mul_df. */
805 48, /* max_case_values. */
806 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
808 &exynosm1_prefetch_tune
811 static const struct tune_params thunderxt88_tunings =
813 &thunderx_extra_costs,
814 &generic_addrcost_table,
815 &thunderx_regmove_cost,
816 &thunderx_vector_cost,
817 &generic_branch_cost,
818 &generic_approx_modes,
819 6, /* memmov_cost */
820 2, /* issue_rate */
821 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
822 "8", /* function_align. */
823 "8", /* jump_align. */
824 "8", /* loop_align. */
825 2, /* int_reassoc_width. */
826 4, /* fp_reassoc_width. */
827 1, /* vec_reassoc_width. */
828 2, /* min_div_recip_mul_sf. */
829 2, /* min_div_recip_mul_df. */
830 0, /* max_case_values. */
831 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
832 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
833 &thunderxt88_prefetch_tune
836 static const struct tune_params thunderx_tunings =
838 &thunderx_extra_costs,
839 &generic_addrcost_table,
840 &thunderx_regmove_cost,
841 &thunderx_vector_cost,
842 &generic_branch_cost,
843 &generic_approx_modes,
844 6, /* memmov_cost */
845 2, /* issue_rate */
846 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
847 "8", /* function_align. */
848 "8", /* jump_align. */
849 "8", /* loop_align. */
850 2, /* int_reassoc_width. */
851 4, /* fp_reassoc_width. */
852 1, /* vec_reassoc_width. */
853 2, /* min_div_recip_mul_sf. */
854 2, /* min_div_recip_mul_df. */
855 0, /* max_case_values. */
856 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
857 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
858 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
859 &thunderx_prefetch_tune
862 static const struct tune_params xgene1_tunings =
864 &xgene1_extra_costs,
865 &xgene1_addrcost_table,
866 &xgene1_regmove_cost,
867 &xgene1_vector_cost,
868 &generic_branch_cost,
869 &xgene1_approx_modes,
870 6, /* memmov_cost */
871 4, /* issue_rate */
872 AARCH64_FUSE_NOTHING, /* fusible_ops */
873 "16", /* function_align. */
874 "8", /* jump_align. */
875 "16", /* loop_align. */
876 2, /* int_reassoc_width. */
877 4, /* fp_reassoc_width. */
878 1, /* vec_reassoc_width. */
879 2, /* min_div_recip_mul_sf. */
880 2, /* min_div_recip_mul_df. */
881 0, /* max_case_values. */
882 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
883 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
884 &generic_prefetch_tune
887 static const struct tune_params qdf24xx_tunings =
889 &qdf24xx_extra_costs,
890 &qdf24xx_addrcost_table,
891 &qdf24xx_regmove_cost,
892 &generic_vector_cost,
893 &generic_branch_cost,
894 &generic_approx_modes,
895 4, /* memmov_cost */
896 4, /* issue_rate */
897 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 898    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
899 "16", /* function_align. */
900 "8", /* jump_align. */
901 "16", /* loop_align. */
902 2, /* int_reassoc_width. */
903 4, /* fp_reassoc_width. */
904 1, /* vec_reassoc_width. */
905 2, /* min_div_recip_mul_sf. */
906 2, /* min_div_recip_mul_df. */
907 0, /* max_case_values. */
908 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
909 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
910 &qdf24xx_prefetch_tune
913 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
914 for now. */
915 static const struct tune_params saphira_tunings =
917 &generic_extra_costs,
918 &generic_addrcost_table,
919 &generic_regmove_cost,
920 &generic_vector_cost,
921 &generic_branch_cost,
922 &generic_approx_modes,
923 4, /* memmov_cost */
924 4, /* issue_rate */
925 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 926    | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
927 "16", /* function_align. */
928 "8", /* jump_align. */
929 "16", /* loop_align. */
930 2, /* int_reassoc_width. */
931 4, /* fp_reassoc_width. */
932 1, /* vec_reassoc_width. */
933 2, /* min_div_recip_mul_sf. */
934 2, /* min_div_recip_mul_df. */
935 0, /* max_case_values. */
936 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
937 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
938 &generic_prefetch_tune
941 static const struct tune_params thunderx2t99_tunings =
943 &thunderx2t99_extra_costs,
944 &thunderx2t99_addrcost_table,
945 &thunderx2t99_regmove_cost,
946 &thunderx2t99_vector_cost,
947 &generic_branch_cost,
948 &generic_approx_modes,
949 4, /* memmov_cost. */
950 4, /* issue_rate. */
951 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
952 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
953 "16", /* function_align. */
954 "8", /* jump_align. */
955 "16", /* loop_align. */
956 3, /* int_reassoc_width. */
957 2, /* fp_reassoc_width. */
958 2, /* vec_reassoc_width. */
959 2, /* min_div_recip_mul_sf. */
960 2, /* min_div_recip_mul_df. */
961 0, /* max_case_values. */
962 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
963 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
964 &thunderx2t99_prefetch_tune
967 /* Support for fine-grained override of the tuning structures. */
968 struct aarch64_tuning_override_function
970 const char* name;
971 void (*parse_override)(const char*, struct tune_params*);
974 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
975 static void aarch64_parse_tune_string (const char*, struct tune_params*);
977 static const struct aarch64_tuning_override_function
978 aarch64_tuning_override_functions[] =
980 { "fuse", aarch64_parse_fuse_string },
981 { "tune", aarch64_parse_tune_string },
982 { NULL, NULL }
985 /* A processor implementing AArch64. */
986 struct processor
988 const char *const name;
989 enum aarch64_processor ident;
990 enum aarch64_processor sched_core;
991 enum aarch64_arch arch;
992 unsigned architecture_version;
993 const unsigned long flags;
994 const struct tune_params *const tune;
997 /* Architectures implementing AArch64. */
998 static const struct processor all_architectures[] =
1000 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1001 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1002 #include "aarch64-arches.def"
1003 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1006 /* Processor cores implementing AArch64. */
1007 static const struct processor all_cores[] =
1009 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1010 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1011 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1012 FLAGS, &COSTS##_tunings},
1013 #include "aarch64-cores.def"
1014 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1015 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1016 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1020 /* Target specification. These are populated by the -march, -mtune, -mcpu
1021 handling code or by target attributes. */
1022 static const struct processor *selected_arch;
1023 static const struct processor *selected_cpu;
1024 static const struct processor *selected_tune;
1026 /* The current tuning set. */
1027 struct tune_params aarch64_tune_params = generic_tunings;
1029 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1031 /* An ISA extension in the co-processor and main instruction set space. */
1032 struct aarch64_option_extension
1034 const char *const name;
1035 const unsigned long flags_on;
1036 const unsigned long flags_off;
1039 typedef enum aarch64_cond_code
1041 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1042 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1043 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1045 aarch64_cc;
1047 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1049 /* The condition codes of the processor, and the inverse function. */
1050 static const char * const aarch64_condition_codes[] =
1052 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1053 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1056 /* Generate code to enable conditional branches in functions over 1 MiB. */
1057 const char *
1058 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1059 const char * branch_format)
1061 rtx_code_label * tmp_label = gen_label_rtx ();
1062 char label_buf[256];
1063 char buffer[128];
1064 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1065 CODE_LABEL_NUMBER (tmp_label));
1066 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1067 rtx dest_label = operands[pos_label];
1068 operands[pos_label] = tmp_label;
1070 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1071 output_asm_insn (buffer, operands);
1073 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1074 operands[pos_label] = dest_label;
1075 output_asm_insn (buffer, operands);
1076 return "";
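/* For example (an illustrative sketch, not from the original source):
   if a "cbz x0, <target>" would overflow its +/-1 MiB range, the
   caller passes the inverted test as BRANCH_FORMAT and this routine
   emits something like

     cbnz    x0, .Ltmp
     b       <target>
   .Ltmp:

   where .Ltmp is the freshly generated local label.  */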
1079 void
1080 aarch64_err_no_fpadvsimd (machine_mode mode)
1082 if (TARGET_GENERAL_REGS_ONLY)
1083 if (FLOAT_MODE_P (mode))
1084 error ("%qs is incompatible with the use of floating-point types",
1085 "-mgeneral-regs-only");
1086 else
1087 error ("%qs is incompatible with the use of vector types",
1088 "-mgeneral-regs-only");
1089 else
1090 if (FLOAT_MODE_P (mode))
1091 error ("%qs feature modifier is incompatible with the use of"
1092 " floating-point types", "+nofp");
1093 else
1094 error ("%qs feature modifier is incompatible with the use of"
1095 " vector types", "+nofp");
1098 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1099 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1100 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1101 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1102 and GENERAL_REGS is lower than the memory cost (in this case the best class
1104    is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1104 cost results in bad allocations with many redundant int<->FP moves which
1105 are expensive on various cores.
1106 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1107 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1108 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1109 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1110 The result of this is that it is no longer inefficient to have a higher
1111 memory move cost than the register move cost.
1114 static reg_class_t
1115 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1116 reg_class_t best_class)
1118 machine_mode mode;
1120 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1121 || !reg_class_subset_p (FP_REGS, allocno_class))
1122 return allocno_class;
1124 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1125 || !reg_class_subset_p (FP_REGS, best_class))
1126 return best_class;
1128 mode = PSEUDO_REGNO_MODE (regno);
1129 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
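/* For example (illustrative, not from the original source): a pseudo
   of DFmode or V4SImode whose allocno class would otherwise be
   POINTER_AND_FP_REGS is narrowed to FP_REGS, while an SImode or
   DImode pseudo in the same situation is narrowed to GENERAL_REGS.  */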
1132 static unsigned int
1133 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1135 if (GET_MODE_UNIT_SIZE (mode) == 4)
1136 return aarch64_tune_params.min_div_recip_mul_sf;
1137 return aarch64_tune_params.min_div_recip_mul_df;
1140 /* Return the reassociation width of treeop OPC with mode MODE. */
1141 static int
1142 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1144 if (VECTOR_MODE_P (mode))
1145 return aarch64_tune_params.vec_reassoc_width;
1146 if (INTEGRAL_MODE_P (mode))
1147 return aarch64_tune_params.int_reassoc_width;
1148 /* Avoid reassociating floating point addition so we emit more FMAs. */
1149 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1150 return aarch64_tune_params.fp_reassoc_width;
1151 return 1;
1154 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1155 unsigned
1156 aarch64_dbx_register_number (unsigned regno)
1158 if (GP_REGNUM_P (regno))
1159 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1160 else if (regno == SP_REGNUM)
1161 return AARCH64_DWARF_SP;
1162 else if (FP_REGNUM_P (regno))
1163 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1164 else if (PR_REGNUM_P (regno))
1165 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1166 else if (regno == VG_REGNUM)
1167 return AARCH64_DWARF_VG;
1169 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1170 equivalent DWARF register. */
1171 return DWARF_FRAME_REGISTERS;
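/* For example (values as defined by the AArch64 DWARF register
   numbering; illustration added here): x0-x30 map to 0-30, sp to 31,
   v0-v31 to 64-95, the SVE predicate registers p0-p15 to 48-63, and
   the vector-granule register VG to 46.  Anything else (e.g. the
   condition flags) gets DWARF_FRAME_REGISTERS, meaning "no DWARF
   equivalent".  */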
1174 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1175 static bool
1176 aarch64_advsimd_struct_mode_p (machine_mode mode)
1178 return (TARGET_SIMD
1179 && (mode == OImode || mode == CImode || mode == XImode));
1182 /* Return true if MODE is an SVE predicate mode. */
1183 static bool
1184 aarch64_sve_pred_mode_p (machine_mode mode)
1186 return (TARGET_SVE
1187 && (mode == VNx16BImode
1188 || mode == VNx8BImode
1189 || mode == VNx4BImode
1190 || mode == VNx2BImode));
1193 /* Three mutually-exclusive flags describing a vector or predicate type. */
1194 const unsigned int VEC_ADVSIMD = 1;
1195 const unsigned int VEC_SVE_DATA = 2;
1196 const unsigned int VEC_SVE_PRED = 4;
1197 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1198 a structure of 2, 3 or 4 vectors. */
1199 const unsigned int VEC_STRUCT = 8;
1200 /* Useful combinations of the above. */
1201 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1202 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1204 /* Return a set of flags describing the vector properties of mode MODE.
1205 Ignore modes that are not supported by the current target. */
1206 static unsigned int
1207 aarch64_classify_vector_mode (machine_mode mode)
1209 if (aarch64_advsimd_struct_mode_p (mode))
1210 return VEC_ADVSIMD | VEC_STRUCT;
1212 if (aarch64_sve_pred_mode_p (mode))
1213 return VEC_SVE_PRED;
1215 scalar_mode inner = GET_MODE_INNER (mode);
1216 if (VECTOR_MODE_P (mode)
1217 && (inner == QImode
1218 || inner == HImode
1219 || inner == HFmode
1220 || inner == SImode
1221 || inner == SFmode
1222 || inner == DImode
1223 || inner == DFmode))
1225 if (TARGET_SVE)
1227 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1228 return VEC_SVE_DATA;
1229 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1230 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1231 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1232 return VEC_SVE_DATA | VEC_STRUCT;
1235 /* This includes V1DF but not V1DI (which doesn't exist). */
1236 if (TARGET_SIMD
1237 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1238 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1239 return VEC_ADVSIMD;
1242 return 0;
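/* Some illustrative classifications (examples added here, assuming the
   corresponding target features are enabled):

     V16QImode, V2DFmode  -> VEC_ADVSIMD                (128-bit Advanced SIMD)
     V8QImode,  V2SImode  -> VEC_ADVSIMD                (64-bit Advanced SIMD)
     VNx4SImode           -> VEC_SVE_DATA               (one SVE vector)
     VNx8SImode           -> VEC_SVE_DATA | VEC_STRUCT  (an SVE 2-vector tuple)
     VNx4BImode           -> VEC_SVE_PRED
     OImode               -> VEC_ADVSIMD | VEC_STRUCT
     SImode               -> 0 (not a vector mode)  */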
1245 /* Return true if MODE is any of the data vector modes, including
1246 structure modes. */
1247 static bool
1248 aarch64_vector_data_mode_p (machine_mode mode)
1250 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1253 /* Return true if MODE is an SVE data vector mode; either a single vector
1254 or a structure of vectors. */
1255 static bool
1256 aarch64_sve_data_mode_p (machine_mode mode)
1258 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1261 /* Implement target hook TARGET_ARRAY_MODE. */
1262 static opt_machine_mode
1263 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1265 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1266 && IN_RANGE (nelems, 2, 4))
1267 return mode_for_vector (GET_MODE_INNER (mode),
1268 GET_MODE_NUNITS (mode) * nelems);
1270 return opt_machine_mode ();
1273 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1274 static bool
1275 aarch64_array_mode_supported_p (machine_mode mode,
1276 unsigned HOST_WIDE_INT nelems)
1278 if (TARGET_SIMD
1279 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1280 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1281 && (nelems >= 2 && nelems <= 4))
1282 return true;
1284 return false;
1287 /* Return the SVE predicate mode to use for elements that have
1288 ELEM_NBYTES bytes, if such a mode exists. */
1290 opt_machine_mode
1291 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1293 if (TARGET_SVE)
1295 if (elem_nbytes == 1)
1296 return VNx16BImode;
1297 if (elem_nbytes == 2)
1298 return VNx8BImode;
1299 if (elem_nbytes == 4)
1300 return VNx4BImode;
1301 if (elem_nbytes == 8)
1302 return VNx2BImode;
1304 return opt_machine_mode ();
1307 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1309 static opt_machine_mode
1310 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1312 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1314 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1315 machine_mode pred_mode;
1316 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1317 return pred_mode;
1320 return default_get_mask_mode (nunits, nbytes);
1323 /* Implement TARGET_HARD_REGNO_NREGS. */
1325 static unsigned int
1326 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1328 /* ??? Logically we should only need to provide a value when
1329 HARD_REGNO_MODE_OK says that the combination is valid,
1330 but at the moment we need to handle all modes. Just ignore
1331 any runtime parts for registers that can't store them. */
1332 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1333 switch (aarch64_regno_regclass (regno))
1335 case FP_REGS:
1336 case FP_LO_REGS:
1337 if (aarch64_sve_data_mode_p (mode))
1338 return exact_div (GET_MODE_SIZE (mode),
1339 BYTES_PER_SVE_VECTOR).to_constant ();
1340 return CEIL (lowest_size, UNITS_PER_VREG);
1341 case PR_REGS:
1342 case PR_LO_REGS:
1343 case PR_HI_REGS:
1344 return 1;
1345 default:
1346 return CEIL (lowest_size, UNITS_PER_WORD);
1348 gcc_unreachable ();
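/* For example (illustrative, added here): a TImode value (16 bytes)
   occupies two GP registers but a single 128-bit FP/SIMD register; an
   SVE data mode such as VNx4SImode occupies one Z register regardless
   of the runtime vector length; and every predicate mode fits in a
   single P register.  */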
1351 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1353 static bool
1354 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1356 if (GET_MODE_CLASS (mode) == MODE_CC)
1357 return regno == CC_REGNUM;
1359 if (regno == VG_REGNUM)
1360 /* This must have the same size as _Unwind_Word. */
1361 return mode == DImode;
1363 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1364 if (vec_flags & VEC_SVE_PRED)
1365 return PR_REGNUM_P (regno);
1367 if (PR_REGNUM_P (regno))
1368 return 0;
1370 if (regno == SP_REGNUM)
1371 /* The purpose of comparing with ptr_mode is to support the
1372 global register variable associated with the stack pointer
1373 register via the syntax of asm ("wsp") in ILP32. */
1374 return mode == Pmode || mode == ptr_mode;
1376 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1377 return mode == Pmode;
1379 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1380 return true;
1382 if (FP_REGNUM_P (regno))
1384 if (vec_flags & VEC_STRUCT)
1385 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1386 else
1387 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1390 return false;
1393 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1394 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1395 clobbers the top 64 bits when restoring the bottom 64 bits. */
1397 static bool
1398 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1400 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1403 /* Implement REGMODE_NATURAL_SIZE. */
1404 poly_uint64
1405 aarch64_regmode_natural_size (machine_mode mode)
1407 /* The natural size for SVE data modes is one SVE data vector,
1408 and similarly for predicates. We can't independently modify
1409 anything smaller than that. */
1410 /* ??? For now, only do this for variable-width SVE registers.
1411 Doing it for constant-sized registers breaks lower-subreg.c. */
1412 /* ??? And once that's fixed, we should probably have similar
1413 code for Advanced SIMD. */
1414 if (!aarch64_sve_vg.is_constant ())
1416 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1417 if (vec_flags & VEC_SVE_PRED)
1418 return BYTES_PER_SVE_PRED;
1419 if (vec_flags & VEC_SVE_DATA)
1420 return BYTES_PER_SVE_VECTOR;
1422 return UNITS_PER_WORD;
1425 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1426 machine_mode
1427 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1428 machine_mode mode)
1430 /* The predicate mode determines which bits are significant and
1431 which are "don't care". Decreasing the number of lanes would
1432 lose data while increasing the number of lanes would make bits
1433 unnecessarily significant. */
1434 if (PR_REGNUM_P (regno))
1435 return mode;
1436 if (known_ge (GET_MODE_SIZE (mode), 4))
1437 return mode;
1438 else
1439 return SImode;
1442 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1443 that strcpy from constants will be faster. */
1445 static HOST_WIDE_INT
1446 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1448 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1449 return MAX (align, BITS_PER_WORD);
1450 return align;
1453 /* Return true if calls to DECL should be treated as
1454    long-calls (i.e. called via a register).  */
1455 static bool
1456 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1458 return false;
1461 /* Return true if calls to symbol-ref SYM should be treated as
1462    long-calls (i.e. called via a register).  */
1463 bool
1464 aarch64_is_long_call_p (rtx sym)
1466 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1469 /* Return true if calls to symbol-ref SYM should not go through
1470 plt stubs. */
1472 bool
1473 aarch64_is_noplt_call_p (rtx sym)
1475 const_tree decl = SYMBOL_REF_DECL (sym);
1477 if (flag_pic
1478 && decl
1479 && (!flag_plt
1480 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1481 && !targetm.binds_local_p (decl))
1482 return true;
1484 return false;
1487 /* Return true if the offsets to a zero/sign-extract operation
1488 represent an expression that matches an extend operation. The
1489    operands represent the parameters from
1491 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1492 bool
1493 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1494 rtx extract_imm)
1496 HOST_WIDE_INT mult_val, extract_val;
1498 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1499 return false;
1501 mult_val = INTVAL (mult_imm);
1502 extract_val = INTVAL (extract_imm);
1504 if (extract_val > 8
1505 && extract_val < GET_MODE_BITSIZE (mode)
1506 && exact_log2 (extract_val & ~7) > 0
1507 && (extract_val & 7) <= 4
1508 && mult_val == (1 << (extract_val & 7)))
1509 return true;
1511 return false;
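/* A worked example (added for illustration, not from the original
   source): with MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34 we
   have extract_val & ~7 == 32 (a power of two), extract_val & 7 == 2
   and mult_val == 1 << 2, so the function returns true: the rtx

     (zero_extract:DI (mult:DI (reg:DI r) (const_int 4))
                      (const_int 34) (const_int 0))

   is equivalent to zero-extending the low 32 bits of r and shifting
   the result left by 2, i.e. a "uxtw ... lsl #2" style operand.  */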
1514 /* Emit an insn that's a simple single-set. Both the operands must be
1515 known to be valid. */
1516 inline static rtx_insn *
1517 emit_set_insn (rtx x, rtx y)
1519 return emit_insn (gen_rtx_SET (x, y));
1522 /* X and Y are two things to compare using CODE. Emit the compare insn and
1523 return the rtx for register 0 in the proper mode. */
1525 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1527 machine_mode mode = SELECT_CC_MODE (code, x, y);
1528 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1530 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1531 return cc_reg;
1534 /* Build the SYMBOL_REF for __tls_get_addr. */
1536 static GTY(()) rtx tls_get_addr_libfunc;
1539 aarch64_tls_get_addr (void)
1541 if (!tls_get_addr_libfunc)
1542 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1543 return tls_get_addr_libfunc;
1546 /* Return the TLS model to use for ADDR. */
1548 static enum tls_model
1549 tls_symbolic_operand_type (rtx addr)
1551 enum tls_model tls_kind = TLS_MODEL_NONE;
1552 if (GET_CODE (addr) == CONST)
1554 poly_int64 addend;
1555 rtx sym = strip_offset (addr, &addend);
1556 if (GET_CODE (sym) == SYMBOL_REF)
1557 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1559 else if (GET_CODE (addr) == SYMBOL_REF)
1560 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1562 return tls_kind;
1565 /* We'll allow lo_sums in our legitimate addresses
1566    so that combine can take care of combining addresses where
1567    necessary, but for generation purposes, we'll generate the address
1568    as:
1569 RTL Absolute
1570 tmp = hi (symbol_ref); adrp x1, foo
1571 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1574 PIC TLS
1575 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1576 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1577 bl __tls_get_addr
1580 Load TLS symbol, depending on TLS mechanism and TLS access model.
1582 Global Dynamic - Traditional TLS:
1583 adrp tmp, :tlsgd:imm
1584 add dest, tmp, #:tlsgd_lo12:imm
1585 bl __tls_get_addr
1587 Global Dynamic - TLS Descriptors:
1588 adrp dest, :tlsdesc:imm
1589 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1590 add dest, dest, #:tlsdesc_lo12:imm
1591 blr tmp
1592 mrs tp, tpidr_el0
1593 add dest, dest, tp
1595 Initial Exec:
1596 mrs tp, tpidr_el0
1597 adrp tmp, :gottprel:imm
1598 ldr dest, [tmp, #:gottprel_lo12:imm]
1599 add dest, dest, tp
1601 Local Exec:
1602 mrs tp, tpidr_el0
1603 add t0, tp, #:tprel_hi12:imm, lsl #12
1604 add t0, t0, #:tprel_lo12_nc:imm
1607 static void
1608 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1609 enum aarch64_symbol_type type)
1611 switch (type)
1613 case SYMBOL_SMALL_ABSOLUTE:
1615 /* In ILP32, the mode of dest can be either SImode or DImode. */
1616 rtx tmp_reg = dest;
1617 machine_mode mode = GET_MODE (dest);
1619 gcc_assert (mode == Pmode || mode == ptr_mode);
1621 if (can_create_pseudo_p ())
1622 tmp_reg = gen_reg_rtx (mode);
1624 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1625 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1626 return;
1629 case SYMBOL_TINY_ABSOLUTE:
1630 emit_insn (gen_rtx_SET (dest, imm));
1631 return;
1633 case SYMBOL_SMALL_GOT_28K:
1635 machine_mode mode = GET_MODE (dest);
1636 rtx gp_rtx = pic_offset_table_rtx;
1637 rtx insn;
1638 rtx mem;
1640 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1641            here before rtl expansion.  Tree IVOPTS will generate an rtl
1642            pattern to decide rtx costs, in which case pic_offset_table_rtx
1643            is not initialized.  In that case there is no need to generate
1644            the first adrp instruction, as the final cost for global
1645            variable access is one instruction.  */
1646 if (gp_rtx != NULL)
1648            /* -fpic for -mcmodel=small allows a 32K GOT table size (but
1649               since we are using the page base as the GOT base, the first
1650               page may be wasted; in the worst scenario there is only 28K
1652               of space for the GOT).  The generated instruction sequence
                   for accessing a global variable is:
1655 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1657               Only one instruction is needed.  But we must initialize
1658               pic_offset_table_rtx properly.  We generate an initialization
1659               insn for every global access, and allow CSE to remove all
                   redundant copies.
1661               The final instruction sequence will look like the following
1662               for multiple global variable accesses.
1664 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1666 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1667 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1668 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1669 ... */
1671 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1672 crtl->uses_pic_offset_table = 1;
1673 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1675 if (mode != GET_MODE (gp_rtx))
1676 gp_rtx = gen_lowpart (mode, gp_rtx);
1680 if (mode == ptr_mode)
1682 if (mode == DImode)
1683 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1684 else
1685 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1687 mem = XVECEXP (SET_SRC (insn), 0, 0);
1689 else
1691 gcc_assert (mode == Pmode);
1693 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1694 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1697        /* The operand is expected to be a MEM.  Whenever the related insn
1698           pattern changes, the code above which calculates MEM should be
1699           updated.  */
1700 gcc_assert (GET_CODE (mem) == MEM);
1701 MEM_READONLY_P (mem) = 1;
1702 MEM_NOTRAP_P (mem) = 1;
1703 emit_insn (insn);
1704 return;
1707 case SYMBOL_SMALL_GOT_4G:
1709 /* In ILP32, the mode of dest can be either SImode or DImode,
1710 while the got entry is always of SImode size. The mode of
1711 dest depends on how dest is used: if dest is assigned to a
1712 pointer (e.g. in the memory), it has SImode; it may have
1713       DImode if dest is dereferenced to access the memory.
1714 This is why we have to handle three different ldr_got_small
1715 patterns here (two patterns for ILP32). */
1717 rtx insn;
1718 rtx mem;
1719 rtx tmp_reg = dest;
1720 machine_mode mode = GET_MODE (dest);
1722 if (can_create_pseudo_p ())
1723 tmp_reg = gen_reg_rtx (mode);
1725 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1726 if (mode == ptr_mode)
1728 if (mode == DImode)
1729 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1730 else
1731 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1733 mem = XVECEXP (SET_SRC (insn), 0, 0);
1735 else
1737 gcc_assert (mode == Pmode);
1739 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1740 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1743 gcc_assert (GET_CODE (mem) == MEM);
1744 MEM_READONLY_P (mem) = 1;
1745 MEM_NOTRAP_P (mem) = 1;
1746 emit_insn (insn);
1747 return;
1750 case SYMBOL_SMALL_TLSGD:
1752 rtx_insn *insns;
1753 machine_mode mode = GET_MODE (dest);
1754 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1756 start_sequence ();
1757 if (TARGET_ILP32)
1758 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1759 else
1760 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1761 insns = get_insns ();
1762 end_sequence ();
1764 RTL_CONST_CALL_P (insns) = 1;
1765 emit_libcall_block (insns, dest, result, imm);
1766 return;
1769 case SYMBOL_SMALL_TLSDESC:
1771 machine_mode mode = GET_MODE (dest);
1772 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1773 rtx tp;
1775 gcc_assert (mode == Pmode || mode == ptr_mode);
1777 /* In ILP32, the got entry is always of SImode size. Unlike
1778 small GOT, the dest is fixed at reg 0. */
1779 if (TARGET_ILP32)
1780 emit_insn (gen_tlsdesc_small_si (imm));
1781 else
1782 emit_insn (gen_tlsdesc_small_di (imm));
1783 tp = aarch64_load_tp (NULL);
1785 if (mode != Pmode)
1786 tp = gen_lowpart (mode, tp);
1788 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1789 if (REG_P (dest))
1790 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1791 return;
1794 case SYMBOL_SMALL_TLSIE:
1796 /* In ILP32, the mode of dest can be either SImode or DImode,
1797 while the got entry is always of SImode size. The mode of
1798 dest depends on how dest is used: if dest is assigned to a
1799 pointer (e.g. in the memory), it has SImode; it may have
1800       DImode if dest is dereferenced to access the memory.
1801 This is why we have to handle three different tlsie_small
1802 patterns here (two patterns for ILP32). */
1803 machine_mode mode = GET_MODE (dest);
1804 rtx tmp_reg = gen_reg_rtx (mode);
1805 rtx tp = aarch64_load_tp (NULL);
1807 if (mode == ptr_mode)
1809 if (mode == DImode)
1810 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1811 else
1813 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1814 tp = gen_lowpart (mode, tp);
1817 else
1819 gcc_assert (mode == Pmode);
1820 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1823 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1824 if (REG_P (dest))
1825 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1826 return;
1829 case SYMBOL_TLSLE12:
1830 case SYMBOL_TLSLE24:
1831 case SYMBOL_TLSLE32:
1832 case SYMBOL_TLSLE48:
1834 machine_mode mode = GET_MODE (dest);
1835 rtx tp = aarch64_load_tp (NULL);
1837 if (mode != Pmode)
1838 tp = gen_lowpart (mode, tp);
1840 switch (type)
1842 case SYMBOL_TLSLE12:
1843 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1844 (dest, tp, imm));
1845 break;
1846 case SYMBOL_TLSLE24:
1847 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1848 (dest, tp, imm));
1849 break;
1850 case SYMBOL_TLSLE32:
1851 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1852 (dest, imm));
1853 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1854 (dest, dest, tp));
1855 break;
1856 case SYMBOL_TLSLE48:
1857 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1858 (dest, imm));
1859 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1860 (dest, dest, tp));
1861 break;
1862 default:
1863 gcc_unreachable ();
1866 if (REG_P (dest))
1867 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1868 return;
1871 case SYMBOL_TINY_GOT:
1872 emit_insn (gen_ldr_got_tiny (dest, imm));
1873 return;
1875 case SYMBOL_TINY_TLSIE:
1877 machine_mode mode = GET_MODE (dest);
1878 rtx tp = aarch64_load_tp (NULL);
1880 if (mode == ptr_mode)
1882 if (mode == DImode)
1883 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1884 else
1886 tp = gen_lowpart (mode, tp);
1887 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1890 else
1892 gcc_assert (mode == Pmode);
1893 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1896 if (REG_P (dest))
1897 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1898 return;
1901 default:
1902 gcc_unreachable ();
1906 /* Emit a move from SRC to DEST. Assume that the move expanders can
1907 handle all moves if !can_create_pseudo_p (). The distinction is
1908 important because, unlike emit_move_insn, the move expanders know
1909 how to force Pmode objects into the constant pool even when the
1910 constant pool address is not itself legitimate. */
1911 static rtx
1912 aarch64_emit_move (rtx dest, rtx src)
1914 return (can_create_pseudo_p ()
1915 ? emit_move_insn (dest, src)
1916 : emit_move_insn_1 (dest, src));
1919 /* Apply UNOPTAB to OP and store the result in DEST. */
1921 static void
1922 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1924 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1925 if (dest != tmp)
1926 emit_move_insn (dest, tmp);
1929 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1931 static void
1932 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1934 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1935 OPTAB_DIRECT);
1936 if (dest != tmp)
1937 emit_move_insn (dest, tmp);
1940 /* Split a 128-bit move operation into two 64-bit move operations,
1941 taking care to handle partial overlap of register to register
1942 copies. Special cases are needed when moving between GP regs and
1943 FP regs. SRC can be a register, constant or memory; DST a register
1944 or memory. If either operand is memory it must not have any side
1945 effects. */
1946 void
1947 aarch64_split_128bit_move (rtx dst, rtx src)
1949 rtx dst_lo, dst_hi;
1950 rtx src_lo, src_hi;
1952 machine_mode mode = GET_MODE (dst);
1954 gcc_assert (mode == TImode || mode == TFmode);
1955 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1956 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1958 if (REG_P (dst) && REG_P (src))
1960 int src_regno = REGNO (src);
1961 int dst_regno = REGNO (dst);
1963 /* Handle FP <-> GP regs. */
1964 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1966 src_lo = gen_lowpart (word_mode, src);
1967 src_hi = gen_highpart (word_mode, src);
1969 if (mode == TImode)
1971 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1972 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1974 else
1976 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1977 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1979 return;
1981 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1983 dst_lo = gen_lowpart (word_mode, dst);
1984 dst_hi = gen_highpart (word_mode, dst);
1986 if (mode == TImode)
1988 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1989 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1991 else
1993 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1994 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1996 return;
2000 dst_lo = gen_lowpart (word_mode, dst);
2001 dst_hi = gen_highpart (word_mode, dst);
2002 src_lo = gen_lowpart (word_mode, src);
2003 src_hi = gen_highpart_mode (word_mode, mode, src);
2005 /* At most one pairing may overlap. */
2006 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2008 aarch64_emit_move (dst_hi, src_hi);
2009 aarch64_emit_move (dst_lo, src_lo);
2011 else
2013 aarch64_emit_move (dst_lo, src_lo);
2014 aarch64_emit_move (dst_hi, src_hi);
2018 bool
2019 aarch64_split_128bit_move_p (rtx dst, rtx src)
2021 return (! REG_P (src)
2022 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2025 /* Split a complex SIMD combine. */
2027 void
2028 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2030 machine_mode src_mode = GET_MODE (src1);
2031 machine_mode dst_mode = GET_MODE (dst);
2033 gcc_assert (VECTOR_MODE_P (dst_mode));
2034 gcc_assert (register_operand (dst, dst_mode)
2035 && register_operand (src1, src_mode)
2036 && register_operand (src2, src_mode));
2038 rtx (*gen) (rtx, rtx, rtx);
2040 switch (src_mode)
2042 case E_V8QImode:
2043 gen = gen_aarch64_simd_combinev8qi;
2044 break;
2045 case E_V4HImode:
2046 gen = gen_aarch64_simd_combinev4hi;
2047 break;
2048 case E_V2SImode:
2049 gen = gen_aarch64_simd_combinev2si;
2050 break;
2051 case E_V4HFmode:
2052 gen = gen_aarch64_simd_combinev4hf;
2053 break;
2054 case E_V2SFmode:
2055 gen = gen_aarch64_simd_combinev2sf;
2056 break;
2057 case E_DImode:
2058 gen = gen_aarch64_simd_combinedi;
2059 break;
2060 case E_DFmode:
2061 gen = gen_aarch64_simd_combinedf;
2062 break;
2063 default:
2064 gcc_unreachable ();
2067 emit_insn (gen (dst, src1, src2));
2068 return;
2071 /* Split a complex SIMD move. */
2073 void
2074 aarch64_split_simd_move (rtx dst, rtx src)
2076 machine_mode src_mode = GET_MODE (src);
2077 machine_mode dst_mode = GET_MODE (dst);
2079 gcc_assert (VECTOR_MODE_P (dst_mode));
2081 if (REG_P (dst) && REG_P (src))
2083 rtx (*gen) (rtx, rtx);
2085 gcc_assert (VECTOR_MODE_P (src_mode));
2087 switch (src_mode)
2089 case E_V16QImode:
2090 gen = gen_aarch64_split_simd_movv16qi;
2091 break;
2092 case E_V8HImode:
2093 gen = gen_aarch64_split_simd_movv8hi;
2094 break;
2095 case E_V4SImode:
2096 gen = gen_aarch64_split_simd_movv4si;
2097 break;
2098 case E_V2DImode:
2099 gen = gen_aarch64_split_simd_movv2di;
2100 break;
2101 case E_V8HFmode:
2102 gen = gen_aarch64_split_simd_movv8hf;
2103 break;
2104 case E_V4SFmode:
2105 gen = gen_aarch64_split_simd_movv4sf;
2106 break;
2107 case E_V2DFmode:
2108 gen = gen_aarch64_split_simd_movv2df;
2109 break;
2110 default:
2111 gcc_unreachable ();
2114 emit_insn (gen (dst, src));
2115 return;
2119 bool
2120 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2121 machine_mode ymode, rtx y)
2123 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2124 gcc_assert (r != NULL);
2125 return rtx_equal_p (x, r);
2129 static rtx
2130 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2132 if (can_create_pseudo_p ())
2133 return force_reg (mode, value);
2134 else
2136 gcc_assert (x);
2137 aarch64_emit_move (x, value);
2138 return x;
2142 /* Return true if we can move VALUE into a register using a single
2143 CNT[BHWD] instruction. */
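/* For example (an illustrative sketch of the encoding used here): with the
   (constant, per-quadword) poly_int64 representation, a VALUE of (16, 16)
   is the number of bytes in a vector and can be loaded with a single CNTB,
   while (32, 32) needs "cntb ..., all, mul #2".  A VALUE such as (3, 3) is
   rejected, since every CNT[BHWD] count per 128-bit quadword is even.  */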
2145 static bool
2146 aarch64_sve_cnt_immediate_p (poly_int64 value)
2148 HOST_WIDE_INT factor = value.coeffs[0];
2149 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2150 return (value.coeffs[1] == factor
2151 && IN_RANGE (factor, 2, 16 * 16)
2152 && (factor & 1) == 0
2153 && factor <= 16 * (factor & -factor));
2156 /* Likewise for rtx X. */
2158 bool
2159 aarch64_sve_cnt_immediate_p (rtx x)
2161 poly_int64 value;
2162 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2165 /* Return the asm string for an instruction with a CNT-like vector size
2166 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2167 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2168 first part of the operands template (the part that comes before the
2169 vector size itself). FACTOR is the number of quadwords.
2170 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2171 If it is zero, we can use any element size. */
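/* For instance (illustrative only): called with PREFIX "cnt" and OPERANDS
   "%x0", a FACTOR of 2 with NELTS_PER_VQ 0 prints "cntd\t%x0", while a
   FACTOR of 32 prints "cntb\t%x0, all, mul #2".  */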
2173 static char *
2174 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2175 unsigned int factor,
2176 unsigned int nelts_per_vq)
2178 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2180 if (nelts_per_vq == 0)
2181 /* There is some overlap in the ranges of the four CNT instructions.
2182 Here we always use the smallest possible element size, so that the
2183 multiplier is 1 wherever possible. */
2184 nelts_per_vq = factor & -factor;
2185 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2186 gcc_assert (IN_RANGE (shift, 1, 4));
2187 char suffix = "dwhb"[shift - 1];
2189 factor >>= shift;
2190 unsigned int written;
2191 if (factor == 1)
2192 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2193 prefix, suffix, operands);
2194 else
2195 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2196 prefix, suffix, operands, factor);
2197 gcc_assert (written < sizeof (buffer));
2198 return buffer;
2201 /* Return the asm string for an instruction with a CNT-like vector size
2202 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2203 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2204 first part of the operands template (the part that comes before the
2205 vector size itself). X is the value of the vector size operand,
2206 as a polynomial integer rtx. */
2208 char *
2209 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2210 rtx x)
2212 poly_int64 value = rtx_to_poly_int64 (x);
2213 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2214 return aarch64_output_sve_cnt_immediate (prefix, operands,
2215 value.coeffs[1], 0);
2218 /* Return true if we can add VALUE to a register using a single ADDVL
2219 or ADDPL instruction. */
2221 static bool
2222 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2224 HOST_WIDE_INT factor = value.coeffs[0];
2225 if (factor == 0 || value.coeffs[1] != factor)
2226 return false;
2227 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2228 and a value of 16 is one vector width. */
2229 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2230 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2233 /* Likewise for rtx X. */
2235 bool
2236 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2238 poly_int64 value;
2239 return (poly_int_rtx_p (x, &value)
2240 && aarch64_sve_addvl_addpl_immediate_p (value));
2243 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2244 and storing the result in operand 0. */
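/* A hedged sketch of the expected output: an OFFSET of (16, 16), i.e. one
   vector length in bytes, prints "addvl\t%x0, %x1, #1", or "incb\t%x0"
   when DEST and BASE are the same GP register; an OFFSET of (2, 2), i.e.
   one predicate length, prints "addpl\t%x0, %x1, #1" or "incd\t%x0".  */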
2246 char *
2247 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2249 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2250 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2251 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2253 /* Use INC or DEC if possible. */
2254 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2256 if (aarch64_sve_cnt_immediate_p (offset_value))
2257 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2258 offset_value.coeffs[1], 0);
2259 if (aarch64_sve_cnt_immediate_p (-offset_value))
2260 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2261 -offset_value.coeffs[1], 0);
2264 int factor = offset_value.coeffs[1];
2265 if ((factor & 15) == 0)
2266 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2267 else
2268 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2269 return buffer;
2272 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2273 instruction. If it is, store the number of elements in each vector
2274 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2275 factor in *FACTOR_OUT (if nonnull). */
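/* For example (an illustrative sketch): a VNx4SI constant in which every
   element is the poly_int64 (4, 4), i.e. the number of 32-bit elements in
   a vector, is a valid INCW/DECW immediate, with *FACTOR_OUT set to 4 and
   *NELTS_PER_VQ_OUT set to 4.  */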
2277 bool
2278 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2279 unsigned int *nelts_per_vq_out)
2281 rtx elt;
2282 poly_int64 value;
2284 if (!const_vec_duplicate_p (x, &elt)
2285 || !poly_int_rtx_p (elt, &value))
2286 return false;
2288 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2289 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2290 /* There's no vector INCB. */
2291 return false;
2293 HOST_WIDE_INT factor = value.coeffs[0];
2294 if (value.coeffs[1] != factor)
2295 return false;
2297 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2298 if ((factor % nelts_per_vq) != 0
2299 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2300 return false;
2302 if (factor_out)
2303 *factor_out = factor;
2304 if (nelts_per_vq_out)
2305 *nelts_per_vq_out = nelts_per_vq;
2306 return true;
2309 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2310 instruction. */
2312 bool
2313 aarch64_sve_inc_dec_immediate_p (rtx x)
2315 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2318 /* Return the asm template for an SVE vector INC or DEC instruction.
2319 OPERANDS gives the operands before the vector count and X is the
2320 value of the vector count operand itself. */
2322 char *
2323 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2325 int factor;
2326 unsigned int nelts_per_vq;
2327 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2328 gcc_unreachable ();
2329 if (factor < 0)
2330 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2331 nelts_per_vq);
2332 else
2333 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2334 nelts_per_vq);
2337 static int
2338 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2339 scalar_int_mode mode)
2341 int i;
2342 unsigned HOST_WIDE_INT val, val2, mask;
2343 int one_match, zero_match;
2344 int num_insns;
2346 val = INTVAL (imm);
2348 if (aarch64_move_imm (val, mode))
2350 if (generate)
2351 emit_insn (gen_rtx_SET (dest, imm));
2352 return 1;
2355 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2356 (with XXXX non-zero). In that case check to see if the move can be done in
2357 a smaller mode. */
2358 val2 = val & 0xffffffff;
2359 if (mode == DImode
2360 && aarch64_move_imm (val2, SImode)
2361 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2363 if (generate)
2364 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2366 /* Check if we have to emit a second instruction by checking to see
2367 if any of the upper 32 bits of the original DImode value are set. */
2368 if (val == val2)
2369 return 1;
2371 i = (val >> 48) ? 48 : 32;
2373 if (generate)
2374 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2375 GEN_INT ((val >> i) & 0xffff)));
2377 return 2;
2380 if ((val >> 32) == 0 || mode == SImode)
2382 if (generate)
2384 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2385 if (mode == SImode)
2386 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2387 GEN_INT ((val >> 16) & 0xffff)));
2388 else
2389 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2390 GEN_INT ((val >> 16) & 0xffff)));
2392 return 2;
2395 /* Remaining cases are all for DImode. */
2397 mask = 0xffff;
2398 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2399 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2400 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2401 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2403 if (zero_match != 2 && one_match != 2)
2405 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2406 For a 64-bit bitmask try whether changing 16 bits to all ones or
2407 zeroes creates a valid bitmask. To check any repeated bitmask,
2408 try using 16 bits from the other 32-bit half of val. */
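/* Illustrative sketch: 0x1234aaaaaaaaaaaa would typically be built here as
   the bitmask immediate 0xaaaaaaaaaaaaaaaa followed by
   "movk\tdest, #0x1234, lsl #48".  */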
2410 for (i = 0; i < 64; i += 16, mask <<= 16)
2412 val2 = val & ~mask;
2413 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2414 break;
2415 val2 = val | mask;
2416 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2417 break;
2418 val2 = val2 & ~mask;
2419 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2420 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2421 break;
2423 if (i != 64)
2425 if (generate)
2427 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2428 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2429 GEN_INT ((val >> i) & 0xffff)));
2431 return 2;
2435 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2436 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2437 otherwise skip zero bits. */
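/* Illustrative sketch: 0x1234567800000000 has two all-zero halfwords, so it
   is built as "movz\tdest, #0x5678, lsl #32" followed by
   "movk\tdest, #0x1234, lsl #48", i.e. two instructions.  */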
2439 num_insns = 1;
2440 mask = 0xffff;
2441 val2 = one_match > zero_match ? ~val : val;
2442 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2444 if (generate)
2445 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2446 ? (val | ~(mask << i))
2447 : (val & (mask << i)))));
2448 for (i += 16; i < 64; i += 16)
2450 if ((val2 & (mask << i)) == 0)
2451 continue;
2452 if (generate)
2453 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2454 GEN_INT ((val >> i) & 0xffff)));
2455 num_insns++;
2458 return num_insns;
2461 /* Return whether imm is a 128-bit immediate which is simple enough to
2462 expand inline. */
2463 bool
2464 aarch64_mov128_immediate (rtx imm)
2466 if (GET_CODE (imm) == CONST_INT)
2467 return true;
2469 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2471 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2472 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2474 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2475 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2479 /* Return the number of temporary registers that aarch64_add_offset_1
2480 would need to add OFFSET to a register. */
2482 static unsigned int
2483 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2485 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2488 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2489 a non-polynomial OFFSET. MODE is the mode of the addition.
2490 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2491 be set and CFA adjustments added to the generated instructions.
2493 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2494 temporary if register allocation is already complete. This temporary
2495 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2496 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2497 the immediate again.
2499 Since this function may be used to adjust the stack pointer, we must
2500 ensure that it cannot cause transient stack deallocation (for example
2501 by first incrementing SP and then decrementing when adjusting by a
2502 large immediate). */
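/* A hedged example: an OFFSET of 0x101234 is not a (possibly shifted)
   12-bit immediate, so it would typically be split into
   "add\tdest, src, #0x234" followed by "add\tdest, dest, #0x101000"
   (a shifted 12-bit immediate); both steps adjust in the same direction,
   so the register never transiently passes its final value.  */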
2504 static void
2505 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2506 rtx src, HOST_WIDE_INT offset, rtx temp1,
2507 bool frame_related_p, bool emit_move_imm)
2509 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2510 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2512 HOST_WIDE_INT moffset = abs_hwi (offset);
2513 rtx_insn *insn;
2515 if (!moffset)
2517 if (!rtx_equal_p (dest, src))
2519 insn = emit_insn (gen_rtx_SET (dest, src));
2520 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2522 return;
2525 /* Single instruction adjustment. */
2526 if (aarch64_uimm12_shift (moffset))
2528 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2529 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2530 return;
2533 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2534 and either:
2536 a) the offset cannot be loaded by a 16-bit move or
2537 b) there is no spare register into which we can move it. */
2538 if (moffset < 0x1000000
2539 && ((!temp1 && !can_create_pseudo_p ())
2540 || !aarch64_move_imm (moffset, mode)))
2542 HOST_WIDE_INT low_off = moffset & 0xfff;
2544 low_off = offset < 0 ? -low_off : low_off;
2545 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2546 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2547 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2548 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2549 return;
2552 /* Emit a move immediate if required and an addition/subtraction. */
2553 if (emit_move_imm)
2555 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2556 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2558 insn = emit_insn (offset < 0
2559 ? gen_sub3_insn (dest, src, temp1)
2560 : gen_add3_insn (dest, src, temp1));
2561 if (frame_related_p)
2563 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2564 rtx adj = plus_constant (mode, src, offset);
2565 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2569 /* Return the number of temporary registers that aarch64_add_offset
2570 would need to move OFFSET into a register or add OFFSET to a register;
2571 ADD_P is true if we want the latter rather than the former. */
2573 static unsigned int
2574 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2576 /* This follows the same structure as aarch64_add_offset. */
2577 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2578 return 0;
2580 unsigned int count = 0;
2581 HOST_WIDE_INT factor = offset.coeffs[1];
2582 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2583 poly_int64 poly_offset (factor, factor);
2584 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2585 /* Need one register for the ADDVL/ADDPL result. */
2586 count += 1;
2587 else if (factor != 0)
2589 factor = abs (factor);
2590 if (factor > 16 * (factor & -factor))
2591 /* Need one register for the CNT result and one for the multiplication
2592 factor. If necessary, the second temporary can be reused for the
2593 constant part of the offset. */
2594 return 2;
2595 /* Need one register for the CNT result (which might then
2596 be shifted). */
2597 count += 1;
2599 return count + aarch64_add_offset_1_temporaries (constant);
2602 /* If X can be represented as a poly_int64, return the number
2603 of temporaries that are required to add it to a register.
2604 Return -1 otherwise. */
2607 aarch64_add_offset_temporaries (rtx x)
2609 poly_int64 offset;
2610 if (!poly_int_rtx_p (x, &offset))
2611 return -1;
2612 return aarch64_offset_temporaries (true, offset);
2615 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2616 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2617 be set and CFA adjustments added to the generated instructions.
2619 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2620 temporary if register allocation is already complete. This temporary
2621 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2622 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2623 false to avoid emitting the immediate again.
2625 TEMP2, if nonnull, is a second temporary register that doesn't
2626 overlap either DEST or SRC.
2628 Since this function may be used to adjust the stack pointer, we must
2629 ensure that it cannot cause transient stack deallocation (for example
2630 by first incrementing SP and then decrementing when adjusting by a
2631 large immediate). */
2633 static void
2634 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2635 poly_int64 offset, rtx temp1, rtx temp2,
2636 bool frame_related_p, bool emit_move_imm = true)
2638 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2639 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2640 gcc_assert (temp1 == NULL_RTX
2641 || !frame_related_p
2642 || !reg_overlap_mentioned_p (temp1, dest));
2643 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2645 /* Try using ADDVL or ADDPL to add the whole value. */
2646 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2648 rtx offset_rtx = gen_int_mode (offset, mode);
2649 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2650 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2651 return;
2654 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2655 SVE vector register, over and above the minimum size of 128 bits.
2656 This is equivalent to half the value returned by CNTD with a
2657 vector shape of ALL. */
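/* For example (illustrative): an OFFSET of (48, 16) is one vector length
   plus 32 bytes; factor is then 16 and constant is 32, so the VG-based
   part can be added with "addvl ..., #1" and the remainder with an
   ordinary immediate add.  */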
2658 HOST_WIDE_INT factor = offset.coeffs[1];
2659 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2661 /* Try using ADDVL or ADDPL to add the VG-based part. */
2662 poly_int64 poly_offset (factor, factor);
2663 if (src != const0_rtx
2664 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2666 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2667 if (frame_related_p)
2669 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2670 RTX_FRAME_RELATED_P (insn) = true;
2671 src = dest;
2673 else
2675 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2676 src = aarch64_force_temporary (mode, temp1, addr);
2677 temp1 = temp2;
2678 temp2 = NULL_RTX;
2681 /* Otherwise use a CNT-based sequence. */
2682 else if (factor != 0)
2684 /* Use a subtraction if we have a negative factor. */
2685 rtx_code code = PLUS;
2686 if (factor < 0)
2688 factor = -factor;
2689 code = MINUS;
2692 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2693 into the multiplication. */
2694 rtx val;
2695 int shift = 0;
2696 if (factor & 1)
2697 /* Use a right shift by 1. */
2698 shift = -1;
2699 else
2700 factor /= 2;
2701 HOST_WIDE_INT low_bit = factor & -factor;
2702 if (factor <= 16 * low_bit)
2704 if (factor > 16 * 8)
2706 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2707 the value with the minimum multiplier and shift it into
2708 position. */
2709 int extra_shift = exact_log2 (low_bit);
2710 shift += extra_shift;
2711 factor >>= extra_shift;
2713 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2715 else
2717 /* Use CNTD, then multiply it by FACTOR. */
2718 val = gen_int_mode (poly_int64 (2, 2), mode);
2719 val = aarch64_force_temporary (mode, temp1, val);
2721 /* Go back to using a negative multiplication factor if we have
2722 no register from which to subtract. */
2723 if (code == MINUS && src == const0_rtx)
2725 factor = -factor;
2726 code = PLUS;
2728 rtx coeff1 = gen_int_mode (factor, mode);
2729 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2730 val = gen_rtx_MULT (mode, val, coeff1);
2733 if (shift > 0)
2735 /* Multiply by 1 << SHIFT. */
2736 val = aarch64_force_temporary (mode, temp1, val);
2737 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2739 else if (shift == -1)
2741 /* Divide by 2. */
2742 val = aarch64_force_temporary (mode, temp1, val);
2743 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2746 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2747 if (src != const0_rtx)
2749 val = aarch64_force_temporary (mode, temp1, val);
2750 val = gen_rtx_fmt_ee (code, mode, src, val);
2752 else if (code == MINUS)
2754 val = aarch64_force_temporary (mode, temp1, val);
2755 val = gen_rtx_NEG (mode, val);
2758 if (constant == 0 || frame_related_p)
2760 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2761 if (frame_related_p)
2763 RTX_FRAME_RELATED_P (insn) = true;
2764 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2765 gen_rtx_SET (dest, plus_constant (Pmode, src,
2766 poly_offset)));
2768 src = dest;
2769 if (constant == 0)
2770 return;
2772 else
2774 src = aarch64_force_temporary (mode, temp1, val);
2775 temp1 = temp2;
2776 temp2 = NULL_RTX;
2779 emit_move_imm = true;
2782 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2783 frame_related_p, emit_move_imm);
2786 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2787 than a poly_int64. */
2789 void
2790 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2791 rtx offset_rtx, rtx temp1, rtx temp2)
2793 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2794 temp1, temp2, false);
2797 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2798 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2799 if TEMP1 already contains abs (DELTA). */
2801 static inline void
2802 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2804 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2805 temp1, temp2, true, emit_move_imm);
2808 /* Subtract DELTA from the stack pointer, marking the instructions
2809 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2810 if nonnull. */
2812 static inline void
2813 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2815 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2816 temp1, temp2, frame_related_p);
2819 /* Set DEST to (vec_series BASE STEP). */
2821 static void
2822 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2824 machine_mode mode = GET_MODE (dest);
2825 scalar_mode inner = GET_MODE_INNER (mode);
2827 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2828 if (!aarch64_sve_index_immediate_p (base))
2829 base = force_reg (inner, base);
2830 if (!aarch64_sve_index_immediate_p (step))
2831 step = force_reg (inner, step);
2833 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2836 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2837 integer of mode SRC_MODE. Return true on success. */
2839 static bool
2840 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2841 rtx src)
2843 /* If the constant is smaller than 128 bits, we can do the move
2844 using a vector of SRC_MODEs. */
2845 if (src_mode != TImode)
2847 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2848 GET_MODE_SIZE (src_mode));
2849 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2850 emit_move_insn (gen_lowpart (dup_mode, dest),
2851 gen_const_vec_duplicate (dup_mode, src));
2852 return true;
2855 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2856 src = force_const_mem (src_mode, src);
2857 if (!src)
2858 return false;
2860 /* Make sure that the address is legitimate. */
2861 if (!aarch64_sve_ld1r_operand_p (src))
2863 rtx addr = force_reg (Pmode, XEXP (src, 0));
2864 src = replace_equiv_address (src, addr);
2867 machine_mode mode = GET_MODE (dest);
2868 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2869 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2870 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2871 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2872 emit_insn (gen_rtx_SET (dest, src));
2873 return true;
2876 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2877 isn't a simple duplicate or series. */
2879 static void
2880 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2882 machine_mode mode = GET_MODE (src);
2883 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2884 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2885 gcc_assert (npatterns > 1);
2887 if (nelts_per_pattern == 1)
2889 /* The constant is a repeating sequence of at least two elements,
2890 where the repeating elements occupy no more than 128 bits.
2891 Get an integer representation of the replicated value. */
2892 scalar_int_mode int_mode;
2893 if (BYTES_BIG_ENDIAN)
2894 /* For now, always use LD1RQ to load the value on big-endian
2895 targets, since the handling of smaller integers includes a
2896 subreg that is semantically an element reverse. */
2897 int_mode = TImode;
2898 else
2900 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2901 gcc_assert (int_bits <= 128);
2902 int_mode = int_mode_for_size (int_bits, 0).require ();
2904 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2905 if (int_value
2906 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2907 return;
2910 /* Expand each pattern individually. */
2911 rtx_vector_builder builder;
2912 auto_vec<rtx, 16> vectors (npatterns);
2913 for (unsigned int i = 0; i < npatterns; ++i)
2915 builder.new_vector (mode, 1, nelts_per_pattern);
2916 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2917 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2918 vectors.quick_push (force_reg (mode, builder.build ()));
2921 /* Use permutes to interleave the separate vectors. */
2922 while (npatterns > 1)
2924 npatterns /= 2;
2925 for (unsigned int i = 0; i < npatterns; ++i)
2927 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2928 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2929 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2930 vectors[i] = tmp;
2933 gcc_assert (vectors[0] == dest);
2936 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2937 is a pattern that can be used to set DEST to a replicated scalar
2938 element. */
2940 void
2941 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2942 rtx (*gen_vec_duplicate) (rtx, rtx))
2944 machine_mode mode = GET_MODE (dest);
2946 /* Check on what type of symbol it is. */
2947 scalar_int_mode int_mode;
2948 if ((GET_CODE (imm) == SYMBOL_REF
2949 || GET_CODE (imm) == LABEL_REF
2950 || GET_CODE (imm) == CONST
2951 || GET_CODE (imm) == CONST_POLY_INT)
2952 && is_a <scalar_int_mode> (mode, &int_mode))
2954 rtx mem;
2955 poly_int64 offset;
2956 HOST_WIDE_INT const_offset;
2957 enum aarch64_symbol_type sty;
2959 /* If we have (const (plus symbol offset)), separate out the offset
2960 before we start classifying the symbol. */
2961 rtx base = strip_offset (imm, &offset);
2963 /* We must always add an offset involving VL separately, rather than
2964 folding it into the relocation. */
2965 if (!offset.is_constant (&const_offset))
2967 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2968 emit_insn (gen_rtx_SET (dest, imm));
2969 else
2971 /* Do arithmetic on 32-bit values if the result is smaller
2972 than that. */
2973 if (partial_subreg_p (int_mode, SImode))
2975 /* It is invalid to do symbol calculations in modes
2976 narrower than SImode. */
2977 gcc_assert (base == const0_rtx);
2978 dest = gen_lowpart (SImode, dest);
2979 int_mode = SImode;
2981 if (base != const0_rtx)
2983 base = aarch64_force_temporary (int_mode, dest, base);
2984 aarch64_add_offset (int_mode, dest, base, offset,
2985 NULL_RTX, NULL_RTX, false);
2987 else
2988 aarch64_add_offset (int_mode, dest, base, offset,
2989 dest, NULL_RTX, false);
2991 return;
2994 sty = aarch64_classify_symbol (base, const_offset);
2995 switch (sty)
2997 case SYMBOL_FORCE_TO_MEM:
2998 if (const_offset != 0
2999 && targetm.cannot_force_const_mem (int_mode, imm))
3001 gcc_assert (can_create_pseudo_p ());
3002 base = aarch64_force_temporary (int_mode, dest, base);
3003 aarch64_add_offset (int_mode, dest, base, const_offset,
3004 NULL_RTX, NULL_RTX, false);
3005 return;
3008 mem = force_const_mem (ptr_mode, imm);
3009 gcc_assert (mem);
3011 /* If we aren't generating PC relative literals, then
3012 we need to expand the literal pool access carefully.
3013 This is something that needs to be done in a number
3014 of places, so could well live as a separate function. */
3015 if (!aarch64_pcrelative_literal_loads)
3017 gcc_assert (can_create_pseudo_p ());
3018 base = gen_reg_rtx (ptr_mode);
3019 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3020 if (ptr_mode != Pmode)
3021 base = convert_memory_address (Pmode, base);
3022 mem = gen_rtx_MEM (ptr_mode, base);
3025 if (int_mode != ptr_mode)
3026 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3028 emit_insn (gen_rtx_SET (dest, mem));
3030 return;
3032 case SYMBOL_SMALL_TLSGD:
3033 case SYMBOL_SMALL_TLSDESC:
3034 case SYMBOL_SMALL_TLSIE:
3035 case SYMBOL_SMALL_GOT_28K:
3036 case SYMBOL_SMALL_GOT_4G:
3037 case SYMBOL_TINY_GOT:
3038 case SYMBOL_TINY_TLSIE:
3039 if (const_offset != 0)
3041 gcc_assert (can_create_pseudo_p ());
3042 base = aarch64_force_temporary (int_mode, dest, base);
3043 aarch64_add_offset (int_mode, dest, base, const_offset,
3044 NULL_RTX, NULL_RTX, false);
3045 return;
3047 /* FALLTHRU */
3049 case SYMBOL_SMALL_ABSOLUTE:
3050 case SYMBOL_TINY_ABSOLUTE:
3051 case SYMBOL_TLSLE12:
3052 case SYMBOL_TLSLE24:
3053 case SYMBOL_TLSLE32:
3054 case SYMBOL_TLSLE48:
3055 aarch64_load_symref_appropriately (dest, imm, sty);
3056 return;
3058 default:
3059 gcc_unreachable ();
3063 if (!CONST_INT_P (imm))
3065 rtx base, step, value;
3066 if (GET_CODE (imm) == HIGH
3067 || aarch64_simd_valid_immediate (imm, NULL))
3068 emit_insn (gen_rtx_SET (dest, imm));
3069 else if (const_vec_series_p (imm, &base, &step))
3070 aarch64_expand_vec_series (dest, base, step);
3071 else if (const_vec_duplicate_p (imm, &value))
3073 /* If the constant is out of range of an SVE vector move,
3074 load it from memory if we can, otherwise move it into
3075 a register and use a DUP. */
3076 scalar_mode inner_mode = GET_MODE_INNER (mode);
3077 rtx op = force_const_mem (inner_mode, value);
3078 if (!op)
3079 op = force_reg (inner_mode, value);
3080 else if (!aarch64_sve_ld1r_operand_p (op))
3082 rtx addr = force_reg (Pmode, XEXP (op, 0));
3083 op = replace_equiv_address (op, addr);
3085 emit_insn (gen_vec_duplicate (dest, op));
3087 else if (GET_CODE (imm) == CONST_VECTOR
3088 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3089 aarch64_expand_sve_const_vector (dest, imm);
3090 else
3092 rtx mem = force_const_mem (mode, imm);
3093 gcc_assert (mem);
3094 emit_move_insn (dest, mem);
3097 return;
3100 aarch64_internal_mov_immediate (dest, imm, true,
3101 as_a <scalar_int_mode> (mode));
3104 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3105 that is known to contain PTRUE. */
3107 void
3108 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3110 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3111 gen_rtvec (2, pred, src),
3112 UNSPEC_MERGE_PTRUE)));
3115 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3116 operand is in memory. In this case we need to use the predicated LD1
3117 and ST1 instead of LDR and STR, both for correctness on big-endian
3118 targets and because LD1 and ST1 support a wider range of addressing modes.
3119 PRED_MODE is the mode of the predicate.
3121 See the comment at the head of aarch64-sve.md for details about the
3122 big-endian handling. */
3124 void
3125 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3127 machine_mode mode = GET_MODE (dest);
3128 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3129 if (!register_operand (src, mode)
3130 && !register_operand (dest, mode))
3132 rtx tmp = gen_reg_rtx (mode);
3133 if (MEM_P (src))
3134 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3135 else
3136 emit_move_insn (tmp, src);
3137 src = tmp;
3139 aarch64_emit_sve_pred_move (dest, ptrue, src);
3142 /* Called only on big-endian targets. See whether an SVE vector move
3143 from SRC to DEST is effectively a REV[BHW] instruction, because at
3144 least one operand is a subreg of an SVE vector that has wider or
3145 narrower elements. Return true and emit the instruction if so.
3147 For example:
3149 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3151 represents a VIEW_CONVERT between the following vectors, viewed
3152 in memory order:
3154 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3155 R1: { [0], [1], [2], [3], ... }
3157 The high part of lane X in R2 should therefore correspond to lane X*2
3158 of R1, but the register representations are:
3160 msb lsb
3161 R2: ...... [1].high [1].low [0].high [0].low
3162 R1: ...... [3] [2] [1] [0]
3164 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3165 We therefore need a reverse operation to swap the high and low values
3166 around.
3168 This is purely an optimization. Without it we would spill the
3169 subreg operand to the stack in one mode and reload it in the
3170 other mode, which has the same effect as the REV. */
3172 bool
3173 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3175 gcc_assert (BYTES_BIG_ENDIAN);
3176 if (GET_CODE (dest) == SUBREG)
3177 dest = SUBREG_REG (dest);
3178 if (GET_CODE (src) == SUBREG)
3179 src = SUBREG_REG (src);
3181 /* The optimization handles two single SVE REGs with different element
3182 sizes. */
3183 if (!REG_P (dest)
3184 || !REG_P (src)
3185 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3186 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3187 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3188 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3189 return false;
3191 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3192 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3193 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3194 UNSPEC_REV_SUBREG);
3195 emit_insn (gen_rtx_SET (dest, unspec));
3196 return true;
3199 /* Return a copy of X with mode MODE, without changing its other
3200 attributes. Unlike gen_lowpart, this doesn't care whether the
3201 mode change is valid. */
3203 static rtx
3204 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3206 if (GET_MODE (x) == mode)
3207 return x;
3209 x = shallow_copy_rtx (x);
3210 set_mode_and_regno (x, mode, REGNO (x));
3211 return x;
3214 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3215 operands. */
3217 void
3218 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3220 /* Decide which REV operation we need. The mode with narrower elements
3221 determines the mode of the operands and the mode with the wider
3222 elements determines the reverse width. */
3223 machine_mode mode_with_wider_elts = GET_MODE (dest);
3224 machine_mode mode_with_narrower_elts = GET_MODE (src);
3225 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3226 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3227 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3229 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3230 unsigned int unspec;
3231 if (wider_bytes == 8)
3232 unspec = UNSPEC_REV64;
3233 else if (wider_bytes == 4)
3234 unspec = UNSPEC_REV32;
3235 else if (wider_bytes == 2)
3236 unspec = UNSPEC_REV16;
3237 else
3238 gcc_unreachable ();
3239 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3241 /* Emit:
3243 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3244 UNSPEC_MERGE_PTRUE))
3246 with the appropriate modes. */
3247 ptrue = gen_lowpart (pred_mode, ptrue);
3248 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3249 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3250 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3251 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3252 UNSPEC_MERGE_PTRUE);
3253 emit_insn (gen_rtx_SET (dest, src));
3256 static bool
3257 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3258 tree exp ATTRIBUTE_UNUSED)
3260 /* Currently, always true. */
3261 return true;
3264 /* Implement TARGET_PASS_BY_REFERENCE. */
3266 static bool
3267 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3268 machine_mode mode,
3269 const_tree type,
3270 bool named ATTRIBUTE_UNUSED)
3272 HOST_WIDE_INT size;
3273 machine_mode dummymode;
3274 int nregs;
3276 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3277 if (mode == BLKmode && type)
3278 size = int_size_in_bytes (type);
3279 else
3280 /* No frontends can create types with variable-sized modes, so we
3281 shouldn't be asked to pass or return them. */
3282 size = GET_MODE_SIZE (mode).to_constant ();
3284 /* Aggregates are passed by reference based on their size. */
3285 if (type && AGGREGATE_TYPE_P (type))
3287 size = int_size_in_bytes (type);
3290 /* Variable-sized arguments are always passed by reference. */
3291 if (size < 0)
3292 return true;
3294 /* Can this be a candidate to be passed in fp/simd register(s)? */
3295 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3296 &dummymode, &nregs,
3297 NULL))
3298 return false;
3300 /* Arguments which are variable sized or larger than 2 registers are
3301 passed by reference unless they are a homogeneous floating-point
3302 aggregate. */
3303 return size > 2 * UNITS_PER_WORD;
3306 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3307 static bool
3308 aarch64_return_in_msb (const_tree valtype)
3310 machine_mode dummy_mode;
3311 int dummy_int;
3313 /* Never happens in little-endian mode. */
3314 if (!BYTES_BIG_ENDIAN)
3315 return false;
3317 /* Only composite types smaller than or equal to 16 bytes can
3318 be potentially returned in registers. */
3319 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3320 || int_size_in_bytes (valtype) <= 0
3321 || int_size_in_bytes (valtype) > 16)
3322 return false;
3324 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3325 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3326 is always passed/returned in the least significant bits of fp/simd
3327 register(s). */
3328 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3329 &dummy_mode, &dummy_int, NULL))
3330 return false;
3332 return true;
3335 /* Implement TARGET_FUNCTION_VALUE.
3336 Define how to find the value returned by a function. */
3338 static rtx
3339 aarch64_function_value (const_tree type, const_tree func,
3340 bool outgoing ATTRIBUTE_UNUSED)
3342 machine_mode mode;
3343 int unsignedp;
3344 int count;
3345 machine_mode ag_mode;
3347 mode = TYPE_MODE (type);
3348 if (INTEGRAL_TYPE_P (type))
3349 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3351 if (aarch64_return_in_msb (type))
3353 HOST_WIDE_INT size = int_size_in_bytes (type);
3355 if (size % UNITS_PER_WORD != 0)
3357 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3358 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3362 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3363 &ag_mode, &count, NULL))
3365 if (!aarch64_composite_type_p (type, mode))
3367 gcc_assert (count == 1 && mode == ag_mode);
3368 return gen_rtx_REG (mode, V0_REGNUM);
3370 else
3372 int i;
3373 rtx par;
3375 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3376 for (i = 0; i < count; i++)
3378 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3379 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3380 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3381 XVECEXP (par, 0, i) = tmp;
3383 return par;
3386 else
3387 return gen_rtx_REG (mode, R0_REGNUM);
3390 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3391 Return true if REGNO is the number of a hard register in which the values
3392 of a called function may come back. */
3394 static bool
3395 aarch64_function_value_regno_p (const unsigned int regno)
3397 /* Maximum of 16 bytes can be returned in the general registers. Examples
3398 of 16-byte return values are: 128-bit integers and 16-byte small
3399 structures (excluding homogeneous floating-point aggregates). */
3400 if (regno == R0_REGNUM || regno == R1_REGNUM)
3401 return true;
3403 /* Up to four fp/simd registers can return a function value, e.g. a
3404 homogeneous floating-point aggregate having four members. */
3405 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3406 return TARGET_FLOAT;
3408 return false;
3411 /* Implement TARGET_RETURN_IN_MEMORY.
3413 If the type T of the result of a function is such that
3414 void func (T arg)
3415 would require that arg be passed as a value in a register (or set of
3416 registers) according to the parameter passing rules, then the result
3417 is returned in the same registers as would be used for such an
3418 argument. */
3420 static bool
3421 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3423 HOST_WIDE_INT size;
3424 machine_mode ag_mode;
3425 int count;
3427 if (!AGGREGATE_TYPE_P (type)
3428 && TREE_CODE (type) != COMPLEX_TYPE
3429 && TREE_CODE (type) != VECTOR_TYPE)
3430 /* Simple scalar types are always returned in registers. */
3431 return false;
3433 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3434 type,
3435 &ag_mode,
3436 &count,
3437 NULL))
3438 return false;
3440 /* Types larger than 2 registers are returned in memory. */
3441 size = int_size_in_bytes (type);
3442 return (size < 0 || size > 2 * UNITS_PER_WORD);
3445 static bool
3446 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3447 const_tree type, int *nregs)
3449 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3450 return aarch64_vfp_is_call_or_return_candidate (mode,
3451 type,
3452 &pcum->aapcs_vfp_rmode,
3453 nregs,
3454 NULL);
3457 /* Given MODE and TYPE of a function argument, return the alignment in
3458 bits. The idea is to suppress any stronger alignment requested by
3459 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3460 This is a helper function for local use only. */
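/* For instance (an illustrative sketch): given
   struct __attribute__ ((aligned (16))) s { int x; };
   the struct-level attribute is ignored here and the result is the 32-bit
   natural alignment of the int field.  */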
3462 static unsigned int
3463 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3465 if (!type)
3466 return GET_MODE_ALIGNMENT (mode);
3468 if (integer_zerop (TYPE_SIZE (type)))
3469 return 0;
3471 gcc_assert (TYPE_MODE (type) == mode);
3473 if (!AGGREGATE_TYPE_P (type))
3474 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3476 if (TREE_CODE (type) == ARRAY_TYPE)
3477 return TYPE_ALIGN (TREE_TYPE (type));
3479 unsigned int alignment = 0;
3480 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3481 if (TREE_CODE (field) == FIELD_DECL)
3482 alignment = std::max (alignment, DECL_ALIGN (field));
3484 return alignment;
3487 /* Layout a function argument according to the AAPCS64 rules. The rule
3488 numbers refer to the rule numbers in the AAPCS64. */
3490 static void
3491 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3492 const_tree type,
3493 bool named ATTRIBUTE_UNUSED)
3495 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3496 int ncrn, nvrn, nregs;
3497 bool allocate_ncrn, allocate_nvrn;
3498 HOST_WIDE_INT size;
3500 /* We need to do this once per argument. */
3501 if (pcum->aapcs_arg_processed)
3502 return;
3504 pcum->aapcs_arg_processed = true;
3506 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3507 if (type)
3508 size = int_size_in_bytes (type);
3509 else
3510 /* No frontends can create types with variable-sized modes, so we
3511 shouldn't be asked to pass or return them. */
3512 size = GET_MODE_SIZE (mode).to_constant ();
3513 size = ROUND_UP (size, UNITS_PER_WORD);
3515 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3516 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3517 mode,
3518 type,
3519 &nregs);
3521 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3522 The following code thus handles passing by SIMD/FP registers first. */
3524 nvrn = pcum->aapcs_nvrn;
3526 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3527 and homogeneous short-vector aggregates (HVA). */
3528 if (allocate_nvrn)
3530 if (!TARGET_FLOAT)
3531 aarch64_err_no_fpadvsimd (mode);
3533 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3535 pcum->aapcs_nextnvrn = nvrn + nregs;
3536 if (!aarch64_composite_type_p (type, mode))
3538 gcc_assert (nregs == 1);
3539 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3541 else
3543 rtx par;
3544 int i;
3545 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3546 for (i = 0; i < nregs; i++)
3548 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3549 V0_REGNUM + nvrn + i);
3550 rtx offset = gen_int_mode
3551 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3552 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3553 XVECEXP (par, 0, i) = tmp;
3555 pcum->aapcs_reg = par;
3557 return;
3559 else
3561 /* C.3 NSRN is set to 8. */
3562 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3563 goto on_stack;
3567 ncrn = pcum->aapcs_ncrn;
3568 nregs = size / UNITS_PER_WORD;
3570 /* C6 - C9, though the sign and zero extension semantics are
3571 handled elsewhere. This is the case where the argument fits
3572 entirely in general registers. */
3573 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3576 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3578 /* C.8 if the argument has an alignment of 16 then the NGRN is
3579 rounded up to the next even number. */
3580 if (nregs == 2
3581 && ncrn % 2
3582 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3583 comparison is there because for > 16 * BITS_PER_UNIT
3584 alignment nregs should be > 2 and therefore it should be
3585 passed by reference rather than value. */
3586 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3588 ++ncrn;
3589 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
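/* A hedged example of rule C.8: for "void f (int a, __int128 b)", A
   occupies the first GP argument register, so the NGRN is 1 when B is laid
   out; B requires 16-byte alignment, so the NGRN is rounded up to 2 and B
   is passed in x2 and x3.  */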
3592 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3593 A reg is still generated for it, but the caller should be smart
3594 enough not to use it. */
3595 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3596 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3597 else
3599 rtx par;
3600 int i;
3602 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3603 for (i = 0; i < nregs; i++)
3605 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3606 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3607 GEN_INT (i * UNITS_PER_WORD));
3608 XVECEXP (par, 0, i) = tmp;
3610 pcum->aapcs_reg = par;
3613 pcum->aapcs_nextncrn = ncrn + nregs;
3614 return;
3617 /* C.11 */
3618 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3620 /* The argument is passed on stack; record the needed number of words for
3621 this argument and align the total size if necessary. */
3622 on_stack:
3623 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3625 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3626 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3627 16 / UNITS_PER_WORD);
3628 return;
3631 /* Implement TARGET_FUNCTION_ARG. */
3633 static rtx
3634 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3635 const_tree type, bool named)
3637 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3638 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3640 if (mode == VOIDmode)
3641 return NULL_RTX;
3643 aarch64_layout_arg (pcum_v, mode, type, named);
3644 return pcum->aapcs_reg;
3647 void
3648 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3649 const_tree fntype ATTRIBUTE_UNUSED,
3650 rtx libname ATTRIBUTE_UNUSED,
3651 const_tree fndecl ATTRIBUTE_UNUSED,
3652 unsigned n_named ATTRIBUTE_UNUSED)
3654 pcum->aapcs_ncrn = 0;
3655 pcum->aapcs_nvrn = 0;
3656 pcum->aapcs_nextncrn = 0;
3657 pcum->aapcs_nextnvrn = 0;
3658 pcum->pcs_variant = ARM_PCS_AAPCS64;
3659 pcum->aapcs_reg = NULL_RTX;
3660 pcum->aapcs_arg_processed = false;
3661 pcum->aapcs_stack_words = 0;
3662 pcum->aapcs_stack_size = 0;
3664 if (!TARGET_FLOAT
3665 && fndecl && TREE_PUBLIC (fndecl)
3666 && fntype && fntype != error_mark_node)
3668 const_tree type = TREE_TYPE (fntype);
3669 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3670 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3671 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3672 &mode, &nregs, NULL))
3673 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3675 return;
3678 static void
3679 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3680 machine_mode mode,
3681 const_tree type,
3682 bool named)
3684 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3685 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3687 aarch64_layout_arg (pcum_v, mode, type, named);
3688 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3689 != (pcum->aapcs_stack_words != 0));
3690 pcum->aapcs_arg_processed = false;
3691 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3692 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3693 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3694 pcum->aapcs_stack_words = 0;
3695 pcum->aapcs_reg = NULL_RTX;
3699 bool
3700 aarch64_function_arg_regno_p (unsigned regno)
3702 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3703 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3706 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3707 PARM_BOUNDARY bits of alignment, but will be given anything up
3708 to STACK_BOUNDARY bits if the type requires it. This makes sure
3709 that both before and after the layout of each argument, the Next
3710 Stacked Argument Address (NSAA) will have a minimum alignment of
3711 8 bytes. */
3713 static unsigned int
3714 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3716 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3717 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3720 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3722 static fixed_size_mode
3723 aarch64_get_reg_raw_mode (int regno)
3725 if (TARGET_SVE && FP_REGNUM_P (regno))
3726 /* Don't use the SVE part of the register for __builtin_apply and
3727 __builtin_return. The SVE registers aren't used by the normal PCS,
3728 so using them there would be a waste of time. The PCS extensions
3729 for SVE types are fundamentally incompatible with the
3730 __builtin_return/__builtin_apply interface. */
3731 return as_a <fixed_size_mode> (V16QImode);
3732 return default_get_reg_raw_mode (regno);
3735 /* Implement TARGET_FUNCTION_ARG_PADDING.
3737 Small aggregate types are placed in the lowest memory address.
3739 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3741 static pad_direction
3742 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3744 /* On little-endian targets, the least significant byte of every stack
3745 argument is passed at the lowest byte address of the stack slot. */
3746 if (!BYTES_BIG_ENDIAN)
3747 return PAD_UPWARD;
3749 /* Otherwise, integral, floating-point and pointer types are padded downward:
3750 the least significant byte of a stack argument is passed at the highest
3751 byte address of the stack slot. */
3752 if (type
3753 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3754 || POINTER_TYPE_P (type))
3755 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3756 return PAD_DOWNWARD;
3758 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3759 return PAD_UPWARD;
3762 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3764 It specifies padding for the last (may also be the only)
3765 element of a block move between registers and memory.
3766 Assuming the block is in memory, padding upward means that
3767 the last element is padded after its most significant byte,
3768 while with downward padding the last element is padded on
3769 its least significant byte side.
3771 Small aggregates and small complex types are always padded
3772 upwards.
3774 We don't need to worry about homogeneous floating-point or
3775 short-vector aggregates; their move is not affected by the
3776 padding direction determined here. Regardless of endianness,
3777 each element of such an aggregate is put in the least
3778 significant bits of a fp/simd register.
3780 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3781 register has useful data, and return the opposite if the most
3782 significant byte does. */
3784 bool
3785 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3786 bool first ATTRIBUTE_UNUSED)
3789 /* Small composite types are always padded upward. */
3790 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3792 HOST_WIDE_INT size;
3793 if (type)
3794 size = int_size_in_bytes (type);
3795 else
3796 /* No frontends can create types with variable-sized modes, so we
3797 shouldn't be asked to pass or return them. */
3798 size = GET_MODE_SIZE (mode).to_constant ();
3799 if (size < 2 * UNITS_PER_WORD)
3800 return true;
3803 /* Otherwise, use the default padding. */
3804 return !BYTES_BIG_ENDIAN;
3807 static scalar_int_mode
3808 aarch64_libgcc_cmp_return_mode (void)
3810 return SImode;
3813 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3815 /* We use the 12-bit shifted immediate arithmetic instructions so values
3816 must be multiple of (1 << 12), i.e. 4096. */
3817 #define ARITH_FACTOR 4096
3819 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3820 #error Cannot use simple address calculation for stack probing
3821 #endif
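/* For example, with the common STACK_CHECK_PROBE_INTERVAL_EXP of 12,
   PROBE_INTERVAL is 4096 and trivially a multiple of ARITH_FACTOR; any
   larger power-of-two interval also passes the check above, so probe
   addresses can always be formed with 12-bit shifted immediates.  */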
3823 /* The pair of scratch registers used for stack probing. */
3824 #define PROBE_STACK_FIRST_REG 9
3825 #define PROBE_STACK_SECOND_REG 10
3827 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3828 inclusive. These are offsets from the current stack pointer. */
3830 static void
3831 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3833 HOST_WIDE_INT size;
3834 if (!poly_size.is_constant (&size))
3836 sorry ("stack probes for SVE frames");
3837 return;
3840 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3842 /* See the same assertion on PROBE_INTERVAL above. */
3843 gcc_assert ((first % ARITH_FACTOR) == 0);
3845 /* See if we have a constant small number of probes to generate. If so,
3846 that's the easy case. */
3847 if (size <= PROBE_INTERVAL)
3849 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3851 emit_set_insn (reg1,
3852 plus_constant (Pmode,
3853 stack_pointer_rtx, -(first + base)));
3854 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3857 /* The run-time loop is made up of 8 insns in the generic case while the
3858 compile-time loop is made up of 4+2*(n-2) insns, where n is the number of intervals. */
3859 else if (size <= 4 * PROBE_INTERVAL)
3861 HOST_WIDE_INT i, rem;
3863 emit_set_insn (reg1,
3864 plus_constant (Pmode,
3865 stack_pointer_rtx,
3866 -(first + PROBE_INTERVAL)));
3867 emit_stack_probe (reg1);
3869 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3870 it exceeds SIZE. If only two probes are needed, this will not
3871 generate any code. Then probe at FIRST + SIZE. */
3872 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3874 emit_set_insn (reg1,
3875 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3876 emit_stack_probe (reg1);
3879 rem = size - (i - PROBE_INTERVAL);
3880 if (rem > 256)
3882 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3884 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3885 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3887 else
3888 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3891 /* Otherwise, do the same as above, but in a loop. Note that we must be
3892 extra careful with variables wrapping around because we might be at
3893 the very top (or the very bottom) of the address space and we have
3894 to be able to handle this case properly; in particular, we use an
3895 equality test for the loop condition. */
3896 else
3898 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3900 /* Step 1: round SIZE to the previous multiple of the interval. */
3902 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3905 /* Step 2: compute initial and final value of the loop counter. */
3907 /* TEST_ADDR = SP + FIRST. */
3908 emit_set_insn (reg1,
3909 plus_constant (Pmode, stack_pointer_rtx, -first));
3911 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3912 HOST_WIDE_INT adjustment = - (first + rounded_size);
3913 if (! aarch64_uimm12_shift (adjustment))
3915 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3916 true, Pmode);
3917 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3919 else
3920 emit_set_insn (reg2,
3921 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3923 /* Step 3: the loop
3927 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3928 probe at TEST_ADDR
3930 while (TEST_ADDR != LAST_ADDR)
3932 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3933 until it is equal to ROUNDED_SIZE. */
3935 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3938 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3939 that SIZE is equal to ROUNDED_SIZE. */
3941 if (size != rounded_size)
3943 HOST_WIDE_INT rem = size - rounded_size;
3945 if (rem > 256)
3947 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3949 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3950 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3952 else
3953 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3957 /* Make sure nothing is scheduled before we are done. */
3958 emit_insn (gen_blockage ());
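/* Rough sketch of the small-frame case above (editorial example): with
   FIRST == 4096 and SIZE == 4096, BASE rounds up to 4096, reg1 is set
   to SP - 8192 with a single shifted-immediate SUB, and one probe (a
   store of xzr) is emitted at that address.  The larger cases repeat
   this per PROBE_INTERVAL or fall back to the run-time loop below.  */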
3961 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3962 absolute addresses. */
3964 const char *
3965 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3967 static int labelno = 0;
3968 char loop_lab[32];
3969 rtx xops[2];
3971 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3973 /* Loop. */
3974 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3976 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3977 xops[0] = reg1;
3978 xops[1] = GEN_INT (PROBE_INTERVAL);
3979 output_asm_insn ("sub\t%0, %0, %1", xops);
3981 /* Probe at TEST_ADDR. */
3982 output_asm_insn ("str\txzr, [%0]", xops);
3984 /* Test if TEST_ADDR == LAST_ADDR. */
3985 xops[1] = reg2;
3986 output_asm_insn ("cmp\t%0, %1", xops);
3988 /* Branch. */
3989 fputs ("\tb.ne\t", asm_out_file);
3990 assemble_name_raw (asm_out_file, loop_lab);
3991 fputc ('\n', asm_out_file);
3993 return "";
3996 /* Determine whether a frame chain needs to be generated. */
3997 static bool
3998 aarch64_needs_frame_chain (void)
4000 /* Force a frame chain for EH returns so the return address is at FP+8. */
4001 if (frame_pointer_needed || crtl->calls_eh_return)
4002 return true;
4004 /* A leaf function cannot have calls or write LR. */
4005 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4007 /* Don't use a frame chain in leaf functions if leaf frame pointers
4008 are disabled. */
4009 if (flag_omit_leaf_frame_pointer && is_leaf)
4010 return false;
4012 return aarch64_use_frame_pointer;
4015 /* Mark the registers that need to be saved by the callee and calculate
4016 the size of the callee-saved registers area and frame record (both FP
4017 and LR may be omitted). */
4018 static void
4019 aarch64_layout_frame (void)
4021 HOST_WIDE_INT offset = 0;
4022 int regno, last_fp_reg = INVALID_REGNUM;
4024 if (reload_completed && cfun->machine->frame.laid_out)
4025 return;
4027 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4029 #define SLOT_NOT_REQUIRED (-2)
4030 #define SLOT_REQUIRED (-1)
4032 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4033 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4035 /* First mark all the registers that really need to be saved... */
4036 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4037 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4039 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4040 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4042 /* ... that includes the eh data registers (if needed)... */
4043 if (crtl->calls_eh_return)
4044 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4045 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4046 = SLOT_REQUIRED;
4048 /* ... and any callee saved register that dataflow says is live. */
4049 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4050 if (df_regs_ever_live_p (regno)
4051 && (regno == R30_REGNUM
4052 || !call_used_regs[regno]))
4053 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4055 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4056 if (df_regs_ever_live_p (regno)
4057 && !call_used_regs[regno])
4059 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4060 last_fp_reg = regno;
4063 if (cfun->machine->frame.emit_frame_chain)
4065 /* FP and LR are placed in the linkage record. */
4066 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4067 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4068 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4069 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4070 offset = 2 * UNITS_PER_WORD;
4073 /* Now assign stack slots for them. */
4074 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4075 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4077 cfun->machine->frame.reg_offset[regno] = offset;
4078 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4079 cfun->machine->frame.wb_candidate1 = regno;
4080 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4081 cfun->machine->frame.wb_candidate2 = regno;
4082 offset += UNITS_PER_WORD;
4085 HOST_WIDE_INT max_int_offset = offset;
4086 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4087 bool has_align_gap = offset != max_int_offset;
4089 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4090 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4092 /* If there is an alignment gap between integer and fp callee-saves,
4093 allocate the last fp register to it if possible. */
4094 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4096 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4097 break;
4100 cfun->machine->frame.reg_offset[regno] = offset;
4101 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4102 cfun->machine->frame.wb_candidate1 = regno;
4103 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4104 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4105 cfun->machine->frame.wb_candidate2 = regno;
4106 offset += UNITS_PER_WORD;
4109 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4111 cfun->machine->frame.saved_regs_size = offset;
4113 HOST_WIDE_INT varargs_and_saved_regs_size
4114 = offset + cfun->machine->frame.saved_varargs_size;
4116 cfun->machine->frame.hard_fp_offset
4117 = aligned_upper_bound (varargs_and_saved_regs_size
4118 + get_frame_size (),
4119 STACK_BOUNDARY / BITS_PER_UNIT);
4121 /* Both these values are already aligned. */
4122 gcc_assert (multiple_p (crtl->outgoing_args_size,
4123 STACK_BOUNDARY / BITS_PER_UNIT));
4124 cfun->machine->frame.frame_size
4125 = (cfun->machine->frame.hard_fp_offset
4126 + crtl->outgoing_args_size);
4128 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4130 cfun->machine->frame.initial_adjust = 0;
4131 cfun->machine->frame.final_adjust = 0;
4132 cfun->machine->frame.callee_adjust = 0;
4133 cfun->machine->frame.callee_offset = 0;
4135 HOST_WIDE_INT max_push_offset = 0;
4136 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4137 max_push_offset = 512;
4138 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4139 max_push_offset = 256;
4141 HOST_WIDE_INT const_size, const_fp_offset;
4142 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4143 && const_size < max_push_offset
4144 && known_eq (crtl->outgoing_args_size, 0))
4146 /* Simple, small frame with no outgoing arguments:
4147 stp reg1, reg2, [sp, -frame_size]!
4148 stp reg3, reg4, [sp, 16] */
4149 cfun->machine->frame.callee_adjust = const_size;
4151 else if (known_lt (crtl->outgoing_args_size
4152 + cfun->machine->frame.saved_regs_size, 512)
4153 && !(cfun->calls_alloca
4154 && known_lt (cfun->machine->frame.hard_fp_offset,
4155 max_push_offset)))
4157 /* Frame with small outgoing arguments:
4158 sub sp, sp, frame_size
4159 stp reg1, reg2, [sp, outgoing_args_size]
4160 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4161 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4162 cfun->machine->frame.callee_offset
4163 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4165 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4166 && const_fp_offset < max_push_offset)
4168 /* Frame with large outgoing arguments but a small local area:
4169 stp reg1, reg2, [sp, -hard_fp_offset]!
4170 stp reg3, reg4, [sp, 16]
4171 sub sp, sp, outgoing_args_size */
4172 cfun->machine->frame.callee_adjust = const_fp_offset;
4173 cfun->machine->frame.final_adjust
4174 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4176 else
4178 /* Frame with large local area and outgoing arguments using frame pointer:
4179 sub sp, sp, hard_fp_offset
4180 stp x29, x30, [sp, 0]
4181 add x29, sp, 0
4182 stp reg3, reg4, [sp, 16]
4183 sub sp, sp, outgoing_args_size */
4184 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4185 cfun->machine->frame.final_adjust
4186 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4189 cfun->machine->frame.laid_out = true;
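/* Editorial example of the selection above: a function with 16 bytes of
   locals that saves only x29/x30 and has no outgoing arguments gets
   frame_size == 32 < 512, so it takes the first case and the whole
   frame is allocated by a single "stp x29, x30, [sp, -32]!".  If the
   same function also needed 1024 bytes of outgoing arguments, it would
   fall into the third case instead: push x29/x30 with writeback, then
   drop SP by the outgoing-argument area in a separate final adjust.  */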
4192 /* Return true if the register REGNO is saved on entry to
4193 the current function. */
4195 static bool
4196 aarch64_register_saved_on_entry (int regno)
4198 return cfun->machine->frame.reg_offset[regno] >= 0;
4201 /* Return the next register, at or after REGNO and up to LIMIT, that needs
4202 to be saved by the callee. */
4204 static unsigned
4205 aarch64_next_callee_save (unsigned regno, unsigned limit)
4207 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4208 regno ++;
4209 return regno;
4212 /* Push the register number REGNO of mode MODE to the stack with write-back
4213 adjusting the stack by ADJUSTMENT. */
4215 static void
4216 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4217 HOST_WIDE_INT adjustment)
4219 rtx base_rtx = stack_pointer_rtx;
4220 rtx insn, reg, mem;
4222 reg = gen_rtx_REG (mode, regno);
4223 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4224 plus_constant (Pmode, base_rtx, -adjustment));
4225 mem = gen_frame_mem (mode, mem);
4227 insn = emit_move_insn (mem, reg);
4228 RTX_FRAME_RELATED_P (insn) = 1;
4231 /* Generate and return an instruction to store the pair of registers
4232 REG and REG2 of mode MODE to location BASE with write-back adjusting
4233 the stack location BASE by ADJUSTMENT. */
4235 static rtx
4236 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4237 HOST_WIDE_INT adjustment)
4239 switch (mode)
4241 case E_DImode:
4242 return gen_storewb_pairdi_di (base, base, reg, reg2,
4243 GEN_INT (-adjustment),
4244 GEN_INT (UNITS_PER_WORD - adjustment));
4245 case E_DFmode:
4246 return gen_storewb_pairdf_di (base, base, reg, reg2,
4247 GEN_INT (-adjustment),
4248 GEN_INT (UNITS_PER_WORD - adjustment));
4249 default:
4250 gcc_unreachable ();
4254 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4255 stack pointer by ADJUSTMENT. */
4257 static void
4258 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4260 rtx_insn *insn;
4261 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4263 if (regno2 == INVALID_REGNUM)
4264 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4266 rtx reg1 = gen_rtx_REG (mode, regno1);
4267 rtx reg2 = gen_rtx_REG (mode, regno2);
4269 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4270 reg2, adjustment));
4271 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4272 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4273 RTX_FRAME_RELATED_P (insn) = 1;
4276 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
4277 adjusting it by ADJUSTMENT afterwards. */
4279 static rtx
4280 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4281 HOST_WIDE_INT adjustment)
4283 switch (mode)
4285 case E_DImode:
4286 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4287 GEN_INT (UNITS_PER_WORD));
4288 case E_DFmode:
4289 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4290 GEN_INT (UNITS_PER_WORD));
4291 default:
4292 gcc_unreachable ();
4296 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4297 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4298 into CFI_OPS. */
4300 static void
4301 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4302 rtx *cfi_ops)
4304 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4305 rtx reg1 = gen_rtx_REG (mode, regno1);
4307 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4309 if (regno2 == INVALID_REGNUM)
4311 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4312 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4313 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4315 else
4317 rtx reg2 = gen_rtx_REG (mode, regno2);
4318 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4319 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4320 reg2, adjustment));
4324 /* Generate and return a store pair instruction of mode MODE to store
4325 register REG1 to MEM1 and register REG2 to MEM2. */
4327 static rtx
4328 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4329 rtx reg2)
4331 switch (mode)
4333 case E_DImode:
4334 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4336 case E_DFmode:
4337 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4339 default:
4340 gcc_unreachable ();
4344 /* Generate and return a load pair instruction of mode MODE to load register
4345 REG1 from MEM1 and register REG2 from MEM2. */
4347 static rtx
4348 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4349 rtx mem2)
4351 switch (mode)
4353 case E_DImode:
4354 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4356 case E_DFmode:
4357 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4359 default:
4360 gcc_unreachable ();
4364 /* Return TRUE if return address signing should be enabled for the current
4365 function, otherwise return FALSE. */
4367 bool
4368 aarch64_return_address_signing_enabled (void)
4370 /* This function should only be called after the frame is laid out. */
4371 gcc_assert (cfun->machine->frame.laid_out);
4373 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4374 if its LR is pushed onto the stack. */
4375 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4376 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4377 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4380 /* Emit code to save the callee-saved registers from register number START
4381 to LIMIT to the stack at the location starting at offset START_OFFSET,
4382 skipping any write-back candidates if SKIP_WB is true. */
4384 static void
4385 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4386 unsigned start, unsigned limit, bool skip_wb)
4388 rtx_insn *insn;
4389 unsigned regno;
4390 unsigned regno2;
4392 for (regno = aarch64_next_callee_save (start, limit);
4393 regno <= limit;
4394 regno = aarch64_next_callee_save (regno + 1, limit))
4396 rtx reg, mem;
4397 poly_int64 offset;
4399 if (skip_wb
4400 && (regno == cfun->machine->frame.wb_candidate1
4401 || regno == cfun->machine->frame.wb_candidate2))
4402 continue;
4404 if (cfun->machine->reg_is_wrapped_separately[regno])
4405 continue;
4407 reg = gen_rtx_REG (mode, regno);
4408 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4409 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4410 offset));
4412 regno2 = aarch64_next_callee_save (regno + 1, limit);
4414 if (regno2 <= limit
4415 && !cfun->machine->reg_is_wrapped_separately[regno2]
4416 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4417 == cfun->machine->frame.reg_offset[regno2]))
4420 rtx reg2 = gen_rtx_REG (mode, regno2);
4421 rtx mem2;
4423 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4424 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4425 offset));
4426 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4427 reg2));
4429 /* The first part of a frame-related parallel insn is
4430 always assumed to be relevant to the frame
4431 calculations; subsequent parts are only
4432 frame-related if explicitly marked. */
4433 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4434 regno = regno2;
4436 else
4437 insn = emit_move_insn (mem, reg);
4439 RTX_FRAME_RELATED_P (insn) = 1;
4443 /* Emit code to restore the callee registers of mode MODE from register
4444 number START up to and including LIMIT. Restore from the stack offset
4445 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4446 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4448 static void
4449 aarch64_restore_callee_saves (machine_mode mode,
4450 poly_int64 start_offset, unsigned start,
4451 unsigned limit, bool skip_wb, rtx *cfi_ops)
4453 rtx base_rtx = stack_pointer_rtx;
4454 unsigned regno;
4455 unsigned regno2;
4456 poly_int64 offset;
4458 for (regno = aarch64_next_callee_save (start, limit);
4459 regno <= limit;
4460 regno = aarch64_next_callee_save (regno + 1, limit))
4462 if (cfun->machine->reg_is_wrapped_separately[regno])
4463 continue;
4465 rtx reg, mem;
4467 if (skip_wb
4468 && (regno == cfun->machine->frame.wb_candidate1
4469 || regno == cfun->machine->frame.wb_candidate2))
4470 continue;
4472 reg = gen_rtx_REG (mode, regno);
4473 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4474 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4476 regno2 = aarch64_next_callee_save (regno + 1, limit);
4478 if (regno2 <= limit
4479 && !cfun->machine->reg_is_wrapped_separately[regno2]
4480 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4481 == cfun->machine->frame.reg_offset[regno2]))
4483 rtx reg2 = gen_rtx_REG (mode, regno2);
4484 rtx mem2;
4486 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4487 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4488 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4490 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4491 regno = regno2;
4493 else
4494 emit_move_insn (reg, mem);
4495 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4499 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4500 of MODE. */
4502 static inline bool
4503 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4505 HOST_WIDE_INT multiple;
4506 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4507 && IN_RANGE (multiple, -8, 7));
4510 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4511 of MODE. */
4513 static inline bool
4514 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4516 HOST_WIDE_INT multiple;
4517 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4518 && IN_RANGE (multiple, 0, 63));
4521 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4522 of MODE. */
4524 bool
4525 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4527 HOST_WIDE_INT multiple;
4528 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4529 && IN_RANGE (multiple, -64, 63));
4532 /* Return true if OFFSET is a signed 9-bit value. */
4534 static inline bool
4535 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4536 poly_int64 offset)
4538 HOST_WIDE_INT const_offset;
4539 return (offset.is_constant (&const_offset)
4540 && IN_RANGE (const_offset, -256, 255));
4543 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4544 of MODE. */
4546 static inline bool
4547 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4549 HOST_WIDE_INT multiple;
4550 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4551 && IN_RANGE (multiple, -256, 255));
4554 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4555 of MODE. */
4557 static inline bool
4558 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4560 HOST_WIDE_INT multiple;
4561 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4562 && IN_RANGE (multiple, 0, 4095));
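/* Putting these predicates in concrete terms for DImode (an 8-byte
   access): the 7-bit signed scaled form covers byte offsets -512..504
   in steps of 8, the 9-bit signed unscaled form covers -256..255 in
   steps of 1, and the 12-bit unsigned scaled form covers 0..32760 in
   steps of 8, matching the LDP/STP, LDUR/STUR and LDR/STR immediate
   ranges respectively.  */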
4565 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4567 static sbitmap
4568 aarch64_get_separate_components (void)
4570 aarch64_layout_frame ();
4572 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4573 bitmap_clear (components);
4575 /* The registers we need saved to the frame. */
4576 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4577 if (aarch64_register_saved_on_entry (regno))
4579 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4580 if (!frame_pointer_needed)
4581 offset += cfun->machine->frame.frame_size
4582 - cfun->machine->frame.hard_fp_offset;
4583 /* Check that we can access the stack slot of the register with one
4584 direct load with no adjustments needed. */
4585 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4586 bitmap_set_bit (components, regno);
4589 /* Don't mess with the hard frame pointer. */
4590 if (frame_pointer_needed)
4591 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4593 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4594 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4595 /* If aarch64_layout_frame has chosen registers to store/restore with
4596 writeback don't interfere with them to avoid having to output explicit
4597 stack adjustment instructions. */
4598 if (reg2 != INVALID_REGNUM)
4599 bitmap_clear_bit (components, reg2);
4600 if (reg1 != INVALID_REGNUM)
4601 bitmap_clear_bit (components, reg1);
4603 bitmap_clear_bit (components, LR_REGNUM);
4604 bitmap_clear_bit (components, SP_REGNUM);
4606 return components;
4609 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4611 static sbitmap
4612 aarch64_components_for_bb (basic_block bb)
4614 bitmap in = DF_LIVE_IN (bb);
4615 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4616 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4618 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4619 bitmap_clear (components);
4621 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4622 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4623 if ((!call_used_regs[regno])
4624 && (bitmap_bit_p (in, regno)
4625 || bitmap_bit_p (gen, regno)
4626 || bitmap_bit_p (kill, regno)))
4628 unsigned regno2, offset, offset2;
4629 bitmap_set_bit (components, regno);
4631 /* If there is a callee-save at an adjacent offset, add it too
4632 to increase the use of LDP/STP. */
4633 offset = cfun->machine->frame.reg_offset[regno];
4634 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4636 if (regno2 <= LAST_SAVED_REGNUM)
4638 offset2 = cfun->machine->frame.reg_offset[regno2];
4639 if ((offset & ~8) == (offset2 & ~8))
4640 bitmap_set_bit (components, regno2);
4644 return components;
4647 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4648 Nothing to do for aarch64. */
4650 static void
4651 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4655 /* Return the next set bit in BMP from START onwards. Return the total number
4656 of bits in BMP if no set bit is found at or after START. */
4658 static unsigned int
4659 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4661 unsigned int nbits = SBITMAP_SIZE (bmp);
4662 if (start == nbits)
4663 return start;
4665 gcc_assert (start < nbits);
4666 for (unsigned int i = start; i < nbits; i++)
4667 if (bitmap_bit_p (bmp, i))
4668 return i;
4670 return nbits;
4673 /* Do the work for aarch64_emit_prologue_components and
4674 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4675 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4676 for these components or the epilogue sequence. That is, it determines
4677 whether we should emit stores or loads and what kind of CFA notes to attach
4678 to the insns. Otherwise the logic for the two sequences is very
4679 similar. */
4681 static void
4682 aarch64_process_components (sbitmap components, bool prologue_p)
4684 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4685 ? HARD_FRAME_POINTER_REGNUM
4686 : STACK_POINTER_REGNUM);
4688 unsigned last_regno = SBITMAP_SIZE (components);
4689 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4690 rtx_insn *insn = NULL;
4692 while (regno != last_regno)
4694 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4695 so DFmode for the vector registers is enough. */
4696 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4697 rtx reg = gen_rtx_REG (mode, regno);
4698 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4699 if (!frame_pointer_needed)
4700 offset += cfun->machine->frame.frame_size
4701 - cfun->machine->frame.hard_fp_offset;
4702 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4703 rtx mem = gen_frame_mem (mode, addr);
4705 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4706 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4707 /* No more registers to handle after REGNO.
4708 Emit a single save/restore and exit. */
4709 if (regno2 == last_regno)
4711 insn = emit_insn (set);
4712 RTX_FRAME_RELATED_P (insn) = 1;
4713 if (prologue_p)
4714 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4715 else
4716 add_reg_note (insn, REG_CFA_RESTORE, reg);
4717 break;
4720 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4721 /* The next register is not of the same class or its offset is not
4722 mergeable with the current one into a pair. */
4723 if (!satisfies_constraint_Ump (mem)
4724 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4725 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4726 GET_MODE_SIZE (mode)))
4728 insn = emit_insn (set);
4729 RTX_FRAME_RELATED_P (insn) = 1;
4730 if (prologue_p)
4731 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4732 else
4733 add_reg_note (insn, REG_CFA_RESTORE, reg);
4735 regno = regno2;
4736 continue;
4739 /* REGNO2 can be saved/restored in a pair with REGNO. */
4740 rtx reg2 = gen_rtx_REG (mode, regno2);
4741 if (!frame_pointer_needed)
4742 offset2 += cfun->machine->frame.frame_size
4743 - cfun->machine->frame.hard_fp_offset;
4744 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4745 rtx mem2 = gen_frame_mem (mode, addr2);
4746 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4747 : gen_rtx_SET (reg2, mem2);
4749 if (prologue_p)
4750 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4751 else
4752 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4754 RTX_FRAME_RELATED_P (insn) = 1;
4755 if (prologue_p)
4757 add_reg_note (insn, REG_CFA_OFFSET, set);
4758 add_reg_note (insn, REG_CFA_OFFSET, set2);
4760 else
4762 add_reg_note (insn, REG_CFA_RESTORE, reg);
4763 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4766 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4770 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4772 static void
4773 aarch64_emit_prologue_components (sbitmap components)
4775 aarch64_process_components (components, true);
4778 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4780 static void
4781 aarch64_emit_epilogue_components (sbitmap components)
4783 aarch64_process_components (components, false);
4786 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4788 static void
4789 aarch64_set_handled_components (sbitmap components)
4791 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4792 if (bitmap_bit_p (components, regno))
4793 cfun->machine->reg_is_wrapped_separately[regno] = true;
4796 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4797 is saved at BASE + OFFSET. */
4799 static void
4800 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4801 rtx base, poly_int64 offset)
4803 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4804 add_reg_note (insn, REG_CFA_EXPRESSION,
4805 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4808 /* AArch64 stack frames generated by this compiler look like:
4810 +-------------------------------+
4812 | incoming stack arguments |
4814 +-------------------------------+
4815 | | <-- incoming stack pointer (aligned)
4816 | callee-allocated save area |
4817 | for register varargs |
4819 +-------------------------------+
4820 | local variables | <-- frame_pointer_rtx
4822 +-------------------------------+
4823 | padding0 | \
4824 +-------------------------------+ |
4825 | callee-saved registers | | frame.saved_regs_size
4826 +-------------------------------+ |
4827 | LR' | |
4828 +-------------------------------+ |
4829 | FP' | / <- hard_frame_pointer_rtx (aligned)
4830 +-------------------------------+
4831 | dynamic allocation |
4832 +-------------------------------+
4833 | padding |
4834 +-------------------------------+
4835 | outgoing stack arguments | <-- arg_pointer
4837 +-------------------------------+
4838 | | <-- stack_pointer_rtx (aligned)
4840 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4841 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4842 unchanged. */
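/* Editorial note tying the picture to aarch64_layout_frame: roughly,
   hard_fp_offset is the distance from the incoming (aligned) stack
   pointer down to FP', frame_size additionally covers the outgoing
   argument area, and initial_adjust/callee_adjust/final_adjust describe
   how that total is split across the SP adjustments the prologue
   emits.  */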
4844 /* Generate the prologue instructions for entry into a function.
4845 Establish the stack frame by decreasing the stack pointer with a
4846 properly calculated size and, if necessary, create a frame record
4847 filled with the values of LR and previous frame pointer. The
4848 current FP is also set up if it is in use. */
4850 void
4851 aarch64_expand_prologue (void)
4853 aarch64_layout_frame ();
4855 poly_int64 frame_size = cfun->machine->frame.frame_size;
4856 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4857 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4858 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4859 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4860 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4861 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4862 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4863 rtx_insn *insn;
4865 /* Sign return address for functions. */
4866 if (aarch64_return_address_signing_enabled ())
4868 insn = emit_insn (gen_pacisp ());
4869 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4870 RTX_FRAME_RELATED_P (insn) = 1;
4873 if (flag_stack_usage_info)
4874 current_function_static_stack_size = constant_lower_bound (frame_size);
4876 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4878 if (crtl->is_leaf && !cfun->calls_alloca)
4880 if (maybe_gt (frame_size, PROBE_INTERVAL)
4881 && maybe_gt (frame_size, get_stack_check_protect ()))
4882 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4883 (frame_size
4884 - get_stack_check_protect ()));
4886 else if (maybe_gt (frame_size, 0))
4887 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4890 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4891 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4893 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4895 if (callee_adjust != 0)
4896 aarch64_push_regs (reg1, reg2, callee_adjust);
4898 if (emit_frame_chain)
4900 poly_int64 reg_offset = callee_adjust;
4901 if (callee_adjust == 0)
4903 reg1 = R29_REGNUM;
4904 reg2 = R30_REGNUM;
4905 reg_offset = callee_offset;
4906 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4908 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4909 stack_pointer_rtx, callee_offset,
4910 ip1_rtx, ip0_rtx, frame_pointer_needed);
4911 if (frame_pointer_needed && !frame_size.is_constant ())
4913 /* Variable-sized frames need to describe the save slot
4914 address using DW_CFA_expression rather than DW_CFA_offset.
4915 This means that, without taking further action, the
4916 locations of the registers that we've already saved would
4917 remain based on the stack pointer even after we redefine
4918 the CFA based on the frame pointer. We therefore need new
4919 DW_CFA_expressions to re-express the save slots with addresses
4920 based on the frame pointer. */
4921 rtx_insn *insn = get_last_insn ();
4922 gcc_assert (RTX_FRAME_RELATED_P (insn));
4924 /* Add an explicit CFA definition if this was previously
4925 implicit. */
4926 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4928 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4929 callee_offset);
4930 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4931 gen_rtx_SET (hard_frame_pointer_rtx, src));
4934 /* Change the save slot expressions for the registers that
4935 we've already saved. */
4936 reg_offset -= callee_offset;
4937 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4938 reg_offset + UNITS_PER_WORD);
4939 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4940 reg_offset);
4942 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4945 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4946 callee_adjust != 0 || emit_frame_chain);
4947 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4948 callee_adjust != 0 || emit_frame_chain);
4949 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4952 /* Return TRUE if we can use a simple_return insn.
4954 This function checks whether the callee-saved stack is empty, which
4955 means that no restore actions are needed. The pro_and_epilogue pass uses
4956 this to check whether the shrink-wrapping optimization is feasible. */
4958 bool
4959 aarch64_use_return_insn_p (void)
4961 if (!reload_completed)
4962 return false;
4964 if (crtl->profile)
4965 return false;
4967 aarch64_layout_frame ();
4969 return known_eq (cfun->machine->frame.frame_size, 0);
4972 /* Generate the epilogue instructions for returning from a function.
4973 This is almost exactly the reverse of the prolog sequence, except
4974 that we need to insert barriers to avoid scheduling loads that read
4975 from a deallocated stack, and we optimize the unwind records by
4976 emitting them all together if possible. */
4977 void
4978 aarch64_expand_epilogue (bool for_sibcall)
4980 aarch64_layout_frame ();
4982 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4983 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4984 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4985 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4986 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4987 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4988 rtx cfi_ops = NULL;
4989 rtx_insn *insn;
4990 /* A stack clash protection prologue may not have left IP0_REGNUM or
4991 IP1_REGNUM in a usable state. The same is true for allocations
4992 with an SVE component, since we then need both temporary registers
4993 for each allocation. */
4994 bool can_inherit_p = (initial_adjust.is_constant ()
4995 && final_adjust.is_constant ()
4996 && !flag_stack_clash_protection);
4998 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4999 bool need_barrier_p
5000 = maybe_ne (get_frame_size ()
5001 + cfun->machine->frame.saved_varargs_size, 0);
5003 /* Emit a barrier to prevent loads from a deallocated stack. */
5004 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5005 || cfun->calls_alloca
5006 || crtl->calls_eh_return)
5008 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5009 need_barrier_p = false;
5012 /* Restore the stack pointer from the frame pointer if it may not
5013 be the same as the stack pointer. */
5014 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5015 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5016 if (frame_pointer_needed
5017 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5018 /* If writeback is used when restoring callee-saves, the CFA
5019 is restored on the instruction doing the writeback. */
5020 aarch64_add_offset (Pmode, stack_pointer_rtx,
5021 hard_frame_pointer_rtx, -callee_offset,
5022 ip1_rtx, ip0_rtx, callee_adjust == 0);
5023 else
5024 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5025 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5027 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5028 callee_adjust != 0, &cfi_ops);
5029 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5030 callee_adjust != 0, &cfi_ops);
5032 if (need_barrier_p)
5033 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5035 if (callee_adjust != 0)
5036 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5038 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5040 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5041 insn = get_last_insn ();
5042 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5043 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5044 RTX_FRAME_RELATED_P (insn) = 1;
5045 cfi_ops = NULL;
5048 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5049 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5051 if (cfi_ops)
5053 /* Emit delayed restores and reset the CFA to be SP. */
5054 insn = get_last_insn ();
5055 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5056 REG_NOTES (insn) = cfi_ops;
5057 RTX_FRAME_RELATED_P (insn) = 1;
5060 /* We prefer to emit the combined return/authenticate instruction RETAA;
5061 however, there are three cases in which we must instead emit an explicit
5062 authentication instruction.
5064 1) Sibcalls don't return in a normal way, so if we're about to call one
5065 we must authenticate.
5067 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5068 generating code for !TARGET_ARMV8_3 we can't use it and must
5069 explicitly authenticate.
5071 3) On an eh_return path we make extra stack adjustments to update the
5072 canonical frame address to be the exception handler's CFA. We want
5073 to authenticate using the CFA of the function which calls eh_return.
5075 if (aarch64_return_address_signing_enabled ()
5076 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5078 insn = emit_insn (gen_autisp ());
5079 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5080 RTX_FRAME_RELATED_P (insn) = 1;
5083 /* Stack adjustment for exception handler. */
5084 if (crtl->calls_eh_return)
5086 /* We need to unwind the stack by the offset computed by
5087 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5088 to be SP; letting the CFA move during this adjustment
5089 is just as correct as retaining the CFA from the body
5090 of the function. Therefore, do nothing special. */
5091 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5094 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5095 if (!for_sibcall)
5096 emit_jump_insn (ret_rtx);
5099 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5100 normally or return to a previous frame after unwinding.
5102 An EH return uses a single shared return sequence. The epilogue is
5103 exactly like a normal epilogue except that it has an extra input
5104 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5105 that must be applied after the frame has been destroyed. An extra label
5106 is inserted before the epilogue which initializes this register to zero,
5107 and this is the entry point for a normal return.
5109 An actual EH return updates the return address, initializes the stack
5110 adjustment and jumps directly into the epilogue (bypassing the zeroing
5111 of the adjustment). Since the return address is typically saved on the
5112 stack when a function makes a call, the saved LR must be updated outside
5113 the epilogue.
5115 This poses problems as the store is generated well before the epilogue,
5116 so the offset of LR is not known yet. Also optimizations will remove the
5117 store as it appears dead, even after the epilogue is generated (as the
5118 base or offset for loading LR is different in many cases).
5120 To avoid these problems this implementation forces the frame pointer
5121 in eh_return functions so that the location of LR is fixed and known early.
5122 It also marks the store volatile, so no optimization is permitted to
5123 remove the store. */
5124 rtx
5125 aarch64_eh_return_handler_rtx (void)
5127 rtx tmp = gen_frame_mem (Pmode,
5128 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5130 /* Mark the store volatile, so no optimization is permitted to remove it. */
5131 MEM_VOLATILE_P (tmp) = true;
5132 return tmp;
5135 /* Output code to add DELTA to the first argument, and then jump
5136 to FUNCTION. Used for C++ multiple inheritance. */
5137 static void
5138 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5139 HOST_WIDE_INT delta,
5140 HOST_WIDE_INT vcall_offset,
5141 tree function)
5143 /* The this pointer is always in x0. Note that this differs from
5144 Arm where the this pointer may be bumped to r1 if r0 is required
5145 to return a pointer to an aggregate. On AArch64 a result value
5146 pointer will be in x8. */
5147 int this_regno = R0_REGNUM;
5148 rtx this_rtx, temp0, temp1, addr, funexp;
5149 rtx_insn *insn;
5151 reload_completed = 1;
5152 emit_note (NOTE_INSN_PROLOGUE_END);
5154 this_rtx = gen_rtx_REG (Pmode, this_regno);
5155 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5156 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5158 if (vcall_offset == 0)
5159 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5160 else
5162 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5164 addr = this_rtx;
5165 if (delta != 0)
5167 if (delta >= -256 && delta < 256)
5168 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5169 plus_constant (Pmode, this_rtx, delta));
5170 else
5171 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5172 temp1, temp0, false);
5175 if (Pmode == ptr_mode)
5176 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5177 else
5178 aarch64_emit_move (temp0,
5179 gen_rtx_ZERO_EXTEND (Pmode,
5180 gen_rtx_MEM (ptr_mode, addr)));
5182 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5183 addr = plus_constant (Pmode, temp0, vcall_offset);
5184 else
5186 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5187 Pmode);
5188 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5191 if (Pmode == ptr_mode)
5192 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5193 else
5194 aarch64_emit_move (temp1,
5195 gen_rtx_SIGN_EXTEND (Pmode,
5196 gen_rtx_MEM (ptr_mode, addr)));
5198 emit_insn (gen_add2_insn (this_rtx, temp1));
5201 /* Generate a tail call to the target function. */
5202 if (!TREE_USED (function))
5204 assemble_external (function);
5205 TREE_USED (function) = 1;
5207 funexp = XEXP (DECL_RTL (function), 0);
5208 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5209 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5210 SIBLING_CALL_P (insn) = 1;
5212 insn = get_insns ();
5213 shorten_branches (insn);
5214 final_start_function (insn, file, 1);
5215 final (insn, file, 1);
5216 final_end_function ();
5218 /* Stop pretending to be a post-reload pass. */
5219 reload_completed = 0;
5222 static bool
5223 aarch64_tls_referenced_p (rtx x)
5225 if (!TARGET_HAVE_TLS)
5226 return false;
5227 subrtx_iterator::array_type array;
5228 FOR_EACH_SUBRTX (iter, array, x, ALL)
5230 const_rtx x = *iter;
5231 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5232 return true;
5233 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5234 TLS offsets, not real symbol references. */
5235 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5236 iter.skip_subrtxes ();
5238 return false;
5242 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5243 a left shift of 0 or 12 bits. */
5244 bool
5245 aarch64_uimm12_shift (HOST_WIDE_INT val)
5247 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5248 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
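/* For instance, 0xabc and 0xabc000 both satisfy this test, whereas
   0xabcd does not: the ADD/SUB immediate field is 12 bits wide with an
   optional left shift by 12, so only values of those two shapes can be
   encoded directly.  */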
5253 /* Return true if val is an immediate that can be loaded into a
5254 register by a MOVZ instruction. */
5255 static bool
5256 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5258 if (GET_MODE_SIZE (mode) > 4)
5260 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5261 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5262 return 1;
5264 else
5266 /* Ignore sign extension. */
5267 val &= (HOST_WIDE_INT) 0xffffffff;
5269 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5270 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
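/* Examples (editorial): in DImode, 0x12340000 is accepted (16 bits at
   shift 16) and 0xffff00000000 is accepted (16 bits at shift 32),
   whereas 0x12345678 spans more than one 16-bit field and is
   rejected.  */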
5273 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5274 64-bit (DImode) integer. */
5276 static unsigned HOST_WIDE_INT
5277 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5279 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5280 while (size < 64)
5282 val &= (HOST_WIDE_INT_1U << size) - 1;
5283 val |= val << size;
5284 size *= 2;
5286 return val;
5289 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5291 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5293 0x0000000100000001ull,
5294 0x0001000100010001ull,
5295 0x0101010101010101ull,
5296 0x1111111111111111ull,
5297 0x5555555555555555ull,
5301 /* Return true if val is a valid bitmask immediate. */
5303 bool
5304 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5306 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5307 int bits;
5309 /* Check for a single sequence of one bits and return quickly if so.
5310 The special cases of all ones and all zeroes return false.
5311 val = aarch64_replicate_bitmask_imm (val_in, mode);
5312 tmp = val + (val & -val);
5314 if (tmp == (tmp & -tmp))
5315 return (val + 1) > 1;
5317 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5318 if (mode == SImode)
5319 val = (val << 32) | (val & 0xffffffff);
5321 /* Invert if the immediate doesn't start with a zero bit - this means we
5322 only need to search for sequences of one bits. */
5323 if (val & 1)
5324 val = ~val;
5326 /* Find the first set bit and set tmp to val with the first sequence of one
5327 bits removed. Return success if there is a single sequence of ones. */
5328 first_one = val & -val;
5329 tmp = val & (val + first_one);
5331 if (tmp == 0)
5332 return true;
5334 /* Find the next set bit and compute the difference in bit position. */
5335 next_one = tmp & -tmp;
5336 bits = clz_hwi (first_one) - clz_hwi (next_one);
5337 mask = val ^ tmp;
5339 /* Check the bit position difference is a power of 2, and that the first
5340 sequence of one bits fits within 'bits' bits. */
5341 if ((mask >> bits) != 0 || bits != (bits & -bits))
5342 return false;
5344 /* Check the sequence of one bits is repeated 64/bits times. */
5345 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
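/* Editorial examples, assuming DImode: 0x00ff00ff00ff00ff is a valid
   bitmask immediate (a run of eight ones replicated in every 16-bit
   chunk) and 0x0000fffffffff000 is valid as a single run of ones,
   whereas 0x0000000012345678 is rejected because its set bits do not
   form one run repeated at a power-of-two period.  */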
5348 /* Create a mask of ones covering the lowest to the highest bits set in VAL_IN.
5349 Assumed precondition: VAL_IN is not zero. */
5351 unsigned HOST_WIDE_INT
5352 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5354 int lowest_bit_set = ctz_hwi (val_in);
5355 int highest_bit_set = floor_log2 (val_in);
5356 gcc_assert (val_in != 0);
5358 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5359 (HOST_WIDE_INT_1U << lowest_bit_set));
5362 /* Create a constant in which all bits outside the range from the lowest set
5363 bit to the highest set bit of VAL_IN are set to 1. */
5365 unsigned HOST_WIDE_INT
5366 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5368 return val_in | ~aarch64_and_split_imm1 (val_in);
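/* Sketch of how the split is used (editorial example): for
   VAL_IN == 0x0000ff00ff000000,
     imm1 == 0x0000ffffff000000  (ones from the lowest to the highest set bit)
     imm2 == 0xffffff00ffffffff  (VAL_IN with all bits outside that range set).
   Both halves are bitmask immediates and imm1 & imm2 == VAL_IN, so an
   AND with VAL_IN can be implemented as two AND-immediate
   instructions.  */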
5371 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5373 bool
5374 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5376 scalar_int_mode int_mode;
5377 if (!is_a <scalar_int_mode> (mode, &int_mode))
5378 return false;
5380 if (aarch64_bitmask_imm (val_in, int_mode))
5381 return false;
5383 if (aarch64_move_imm (val_in, int_mode))
5384 return false;
5386 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5388 return aarch64_bitmask_imm (imm2, int_mode);
5391 /* Return true if val is an immediate that can be loaded into a
5392 register in a single instruction. */
5393 bool
5394 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5396 scalar_int_mode int_mode;
5397 if (!is_a <scalar_int_mode> (mode, &int_mode))
5398 return false;
5400 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5401 return 1;
5402 return aarch64_bitmask_imm (val, int_mode);
5405 static bool
5406 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5408 rtx base, offset;
5410 if (GET_CODE (x) == HIGH)
5411 return true;
5413 /* There's no way to calculate VL-based values using relocations. */
5414 subrtx_iterator::array_type array;
5415 FOR_EACH_SUBRTX (iter, array, x, ALL)
5416 if (GET_CODE (*iter) == CONST_POLY_INT)
5417 return true;
5419 split_const (x, &base, &offset);
5420 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5422 if (aarch64_classify_symbol (base, INTVAL (offset))
5423 != SYMBOL_FORCE_TO_MEM)
5424 return true;
5425 else
5426 /* Avoid generating a 64-bit relocation in ILP32; leave it
5427 to aarch64_expand_mov_immediate to handle properly. */
5428 return mode != ptr_mode;
5431 return aarch64_tls_referenced_p (x);
5434 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5435 The expansion for a table switch is quite expensive due to the number
5436 of instructions, the table lookup and the hard-to-predict indirect jump.
5437 When optimizing for speed, and -O3 enabled, use the per-core tuning if
5438 set, otherwise use tables for > 16 cases as a tradeoff between size and
5439 performance. When optimizing for size, use the default setting. */
5441 static unsigned int
5442 aarch64_case_values_threshold (void)
5444 /* Use the specified limit for the number of cases before using jump
5445 tables at higher optimization levels. */
5446 if (optimize > 2
5447 && selected_cpu->tune->max_case_values != 0)
5448 return selected_cpu->tune->max_case_values;
5449 else
5450 return optimize_size ? default_case_values_threshold () : 17;
5453 /* Return true if register REGNO is a valid index register.
5454 STRICT_P is true if REG_OK_STRICT is in effect. */
5456 bool
5457 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5459 if (!HARD_REGISTER_NUM_P (regno))
5461 if (!strict_p)
5462 return true;
5464 if (!reg_renumber)
5465 return false;
5467 regno = reg_renumber[regno];
5469 return GP_REGNUM_P (regno);
5472 /* Return true if register REGNO is a valid base register.
5473 STRICT_P is true if REG_OK_STRICT is in effect. */
5475 bool
5476 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5478 if (!HARD_REGISTER_NUM_P (regno))
5480 if (!strict_p)
5481 return true;
5483 if (!reg_renumber)
5484 return false;
5486 regno = reg_renumber[regno];
5489 /* The fake registers will be eliminated to either the stack or
5490 hard frame pointer, both of which are usually valid base registers.
5491 Reload deals with the cases where the eliminated form isn't valid. */
5492 return (GP_REGNUM_P (regno)
5493 || regno == SP_REGNUM
5494 || regno == FRAME_POINTER_REGNUM
5495 || regno == ARG_POINTER_REGNUM);
5498 /* Return true if X is a valid base register.
5499 STRICT_P is true if REG_OK_STRICT is in effect. */
5501 static bool
5502 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5504 if (!strict_p
5505 && GET_CODE (x) == SUBREG
5506 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5507 x = SUBREG_REG (x);
5509 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5512 /* Return true if address offset is a valid index. If it is, fill in INFO
5513 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5515 static bool
5516 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5517 machine_mode mode, bool strict_p)
5519 enum aarch64_address_type type;
5520 rtx index;
5521 int shift;
5523 /* (reg:P) */
5524 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5525 && GET_MODE (x) == Pmode)
5527 type = ADDRESS_REG_REG;
5528 index = x;
5529 shift = 0;
5531 /* (sign_extend:DI (reg:SI)) */
5532 else if ((GET_CODE (x) == SIGN_EXTEND
5533 || GET_CODE (x) == ZERO_EXTEND)
5534 && GET_MODE (x) == DImode
5535 && GET_MODE (XEXP (x, 0)) == SImode)
5537 type = (GET_CODE (x) == SIGN_EXTEND)
5538 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5539 index = XEXP (x, 0);
5540 shift = 0;
5542 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5543 else if (GET_CODE (x) == MULT
5544 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5545 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5546 && GET_MODE (XEXP (x, 0)) == DImode
5547 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5548 && CONST_INT_P (XEXP (x, 1)))
5550 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5551 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5552 index = XEXP (XEXP (x, 0), 0);
5553 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5555 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5556 else if (GET_CODE (x) == ASHIFT
5557 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5558 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5559 && GET_MODE (XEXP (x, 0)) == DImode
5560 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5561 && CONST_INT_P (XEXP (x, 1)))
5563 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5564 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5565 index = XEXP (XEXP (x, 0), 0);
5566 shift = INTVAL (XEXP (x, 1));
5568 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5569 else if ((GET_CODE (x) == SIGN_EXTRACT
5570 || GET_CODE (x) == ZERO_EXTRACT)
5571 && GET_MODE (x) == DImode
5572 && GET_CODE (XEXP (x, 0)) == MULT
5573 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5574 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5576 type = (GET_CODE (x) == SIGN_EXTRACT)
5577 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5578 index = XEXP (XEXP (x, 0), 0);
5579 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5580 if (INTVAL (XEXP (x, 1)) != 32 + shift
5581 || INTVAL (XEXP (x, 2)) != 0)
5582 shift = -1;
5584 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5585 (const_int 0xffffffff<<shift)) */
5586 else if (GET_CODE (x) == AND
5587 && GET_MODE (x) == DImode
5588 && GET_CODE (XEXP (x, 0)) == MULT
5589 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5590 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5591 && CONST_INT_P (XEXP (x, 1)))
5593 type = ADDRESS_REG_UXTW;
5594 index = XEXP (XEXP (x, 0), 0);
5595 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5596 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5597 shift = -1;
5599 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5600 else if ((GET_CODE (x) == SIGN_EXTRACT
5601 || GET_CODE (x) == ZERO_EXTRACT)
5602 && GET_MODE (x) == DImode
5603 && GET_CODE (XEXP (x, 0)) == ASHIFT
5604 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5605 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5607 type = (GET_CODE (x) == SIGN_EXTRACT)
5608 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5609 index = XEXP (XEXP (x, 0), 0);
5610 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5611 if (INTVAL (XEXP (x, 1)) != 32 + shift
5612 || INTVAL (XEXP (x, 2)) != 0)
5613 shift = -1;
5615 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5616 (const_int 0xffffffff<<shift)) */
5617 else if (GET_CODE (x) == AND
5618 && GET_MODE (x) == DImode
5619 && GET_CODE (XEXP (x, 0)) == ASHIFT
5620 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5621 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5622 && CONST_INT_P (XEXP (x, 1)))
5624 type = ADDRESS_REG_UXTW;
5625 index = XEXP (XEXP (x, 0), 0);
5626 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5627 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5628 shift = -1;
5630 /* (mult:P (reg:P) (const_int scale)) */
5631 else if (GET_CODE (x) == MULT
5632 && GET_MODE (x) == Pmode
5633 && GET_MODE (XEXP (x, 0)) == Pmode
5634 && CONST_INT_P (XEXP (x, 1)))
5636 type = ADDRESS_REG_REG;
5637 index = XEXP (x, 0);
5638 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5640 /* (ashift:P (reg:P) (const_int shift)) */
5641 else if (GET_CODE (x) == ASHIFT
5642 && GET_MODE (x) == Pmode
5643 && GET_MODE (XEXP (x, 0)) == Pmode
5644 && CONST_INT_P (XEXP (x, 1)))
5646 type = ADDRESS_REG_REG;
5647 index = XEXP (x, 0);
5648 shift = INTVAL (XEXP (x, 1));
5650 else
5651 return false;
5653 if (!strict_p
5654 && GET_CODE (index) == SUBREG
5655 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5656 index = SUBREG_REG (index);
5658 if (aarch64_sve_data_mode_p (mode))
5660 if (type != ADDRESS_REG_REG
5661 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5662 return false;
5664 else
5666 if (shift != 0
5667 && !(IN_RANGE (shift, 1, 3)
5668 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5669 return false;
5672 if (REG_P (index)
5673 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5675 info->type = type;
5676 info->offset = index;
5677 info->shift = shift;
5678 return true;
5681 return false;
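/* Illustrative example: for an SImode access, the index rtx
   (mult:DI (sign_extend:DI (reg:SI)) (const_int 4)) is classified as
   ADDRESS_REG_SXTW with shift == 2, corresponding to the
   [base, wN, sxtw 2] addressing form printed later.  */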
5684 /* Return true if MODE is one of the modes for which we
5685 support LDP/STP operations. */
5687 static bool
5688 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5690 return mode == SImode || mode == DImode
5691 || mode == SFmode || mode == DFmode
5692 || (aarch64_vector_mode_supported_p (mode)
5693 && (known_eq (GET_MODE_SIZE (mode), 8)
5694 || (known_eq (GET_MODE_SIZE (mode), 16)
5695 && (aarch64_tune_params.extra_tuning_flags
5696 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5699 /* Return true if REGNO is a virtual pointer register, or an eliminable
5700 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5701 include stack_pointer or hard_frame_pointer. */
5702 static bool
5703 virt_or_elim_regno_p (unsigned regno)
5705 return ((regno >= FIRST_VIRTUAL_REGISTER
5706 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5707 || regno == FRAME_POINTER_REGNUM
5708 || regno == ARG_POINTER_REGNUM);
5711 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5712 If it is, fill in INFO appropriately. STRICT_P is true if
5713 REG_OK_STRICT is in effect. */
5715 static bool
5716 aarch64_classify_address (struct aarch64_address_info *info,
5717 rtx x, machine_mode mode, bool strict_p,
5718 aarch64_addr_query_type type = ADDR_QUERY_M)
5720 enum rtx_code code = GET_CODE (x);
5721 rtx op0, op1;
5722 poly_int64 offset;
5724 HOST_WIDE_INT const_size;
5726 /* On BE, we use load/store pair for all large int mode load/stores.
5727 TI/TFmode may also use a load/store pair. */
5728 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5729 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5730 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5731 || mode == TImode
5732 || mode == TFmode
5733 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5735 bool allow_reg_index_p = (!load_store_pair_p
5736 && (known_lt (GET_MODE_SIZE (mode), 16)
5737 || vec_flags == VEC_ADVSIMD
5738 || vec_flags == VEC_SVE_DATA));
5740 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5741 [Rn, #offset, MUL VL]. */
5742 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5743 && (code != REG && code != PLUS))
5744 return false;
5746 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5747 REG addressing. */
5748 if (advsimd_struct_p
5749 && !BYTES_BIG_ENDIAN
5750 && (code != POST_INC && code != REG))
5751 return false;
5753 gcc_checking_assert (GET_MODE (x) == VOIDmode
5754 || SCALAR_INT_MODE_P (GET_MODE (x)));
5756 switch (code)
5758 case REG:
5759 case SUBREG:
5760 info->type = ADDRESS_REG_IMM;
5761 info->base = x;
5762 info->offset = const0_rtx;
5763 info->const_offset = 0;
5764 return aarch64_base_register_rtx_p (x, strict_p);
5766 case PLUS:
5767 op0 = XEXP (x, 0);
5768 op1 = XEXP (x, 1);
5770 if (! strict_p
5771 && REG_P (op0)
5772 && virt_or_elim_regno_p (REGNO (op0))
5773 && poly_int_rtx_p (op1, &offset))
5775 info->type = ADDRESS_REG_IMM;
5776 info->base = op0;
5777 info->offset = op1;
5778 info->const_offset = offset;
5780 return true;
5783 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5784 && aarch64_base_register_rtx_p (op0, strict_p)
5785 && poly_int_rtx_p (op1, &offset))
5787 info->type = ADDRESS_REG_IMM;
5788 info->base = op0;
5789 info->offset = op1;
5790 info->const_offset = offset;
5792 /* TImode and TFmode values are allowed in both pairs of X
5793 registers and individual Q registers. The available
5794 address modes are:
5795 X,X: 7-bit signed scaled offset
5796 Q: 9-bit signed offset
5797 We conservatively require an offset representable in either mode.
5798 When performing the check for pairs of X registers i.e. LDP/STP
5799 pass down DImode since that is the natural size of the LDP/STP
5800 instruction memory accesses. */
5801 if (mode == TImode || mode == TFmode)
5802 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5803 && (offset_9bit_signed_unscaled_p (mode, offset)
5804 || offset_12bit_unsigned_scaled_p (mode, offset)));
5806 /* A 7-bit offset check because OImode will emit an ldp/stp
5807 instruction (only big endian will get here).
5808 For ldp/stp instructions, the offset is scaled for the size of a
5809 single element of the pair. */
5810 if (mode == OImode)
5811 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5813 /* Three 9/12-bit offset checks because CImode will emit three
5814 ldr/str instructions (only big endian will get here). */
5815 if (mode == CImode)
5816 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5817 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5818 || offset_12bit_unsigned_scaled_p (V16QImode,
5819 offset + 32)));
5821 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5822 instructions (only big endian will get here). */
5823 if (mode == XImode)
5824 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5825 && aarch64_offset_7bit_signed_scaled_p (TImode,
5826 offset + 32));
5828 /* Make "m" use the LD1 offset range for SVE data modes, so
5829 that pre-RTL optimizers like ivopts will work to that range
5830 instead of the wider LDR/STR range. */
5831 if (vec_flags == VEC_SVE_DATA)
5832 return (type == ADDR_QUERY_M
5833 ? offset_4bit_signed_scaled_p (mode, offset)
5834 : offset_9bit_signed_scaled_p (mode, offset));
5836 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5838 poly_int64 end_offset = (offset
5839 + GET_MODE_SIZE (mode)
5840 - BYTES_PER_SVE_VECTOR);
5841 return (type == ADDR_QUERY_M
5842 ? offset_4bit_signed_scaled_p (mode, offset)
5843 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5844 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5845 end_offset)));
5848 if (vec_flags == VEC_SVE_PRED)
5849 return offset_9bit_signed_scaled_p (mode, offset);
5851 if (load_store_pair_p)
5852 return ((known_eq (GET_MODE_SIZE (mode), 4)
5853 || known_eq (GET_MODE_SIZE (mode), 8)
5854 || known_eq (GET_MODE_SIZE (mode), 16))
5855 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5856 else
5857 return (offset_9bit_signed_unscaled_p (mode, offset)
5858 || offset_12bit_unsigned_scaled_p (mode, offset));
5861 if (allow_reg_index_p)
5863 /* Look for base + (scaled/extended) index register. */
5864 if (aarch64_base_register_rtx_p (op0, strict_p)
5865 && aarch64_classify_index (info, op1, mode, strict_p))
5867 info->base = op0;
5868 return true;
5870 if (aarch64_base_register_rtx_p (op1, strict_p)
5871 && aarch64_classify_index (info, op0, mode, strict_p))
5873 info->base = op1;
5874 return true;
5878 return false;
5880 case POST_INC:
5881 case POST_DEC:
5882 case PRE_INC:
5883 case PRE_DEC:
5884 info->type = ADDRESS_REG_WB;
5885 info->base = XEXP (x, 0);
5886 info->offset = NULL_RTX;
5887 return aarch64_base_register_rtx_p (info->base, strict_p);
5889 case POST_MODIFY:
5890 case PRE_MODIFY:
5891 info->type = ADDRESS_REG_WB;
5892 info->base = XEXP (x, 0);
5893 if (GET_CODE (XEXP (x, 1)) == PLUS
5894 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5895 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5896 && aarch64_base_register_rtx_p (info->base, strict_p))
5898 info->offset = XEXP (XEXP (x, 1), 1);
5899 info->const_offset = offset;
5901 /* TImode and TFmode values are allowed in both pairs of X
5902 registers and individual Q registers. The available
5903 address modes are:
5904 X,X: 7-bit signed scaled offset
5905 Q: 9-bit signed offset
5906 We conservatively require an offset representable in either mode.
5908 if (mode == TImode || mode == TFmode)
5909 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5910 && offset_9bit_signed_unscaled_p (mode, offset));
5912 if (load_store_pair_p)
5913 return ((known_eq (GET_MODE_SIZE (mode), 4)
5914 || known_eq (GET_MODE_SIZE (mode), 8)
5915 || known_eq (GET_MODE_SIZE (mode), 16))
5916 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5917 else
5918 return offset_9bit_signed_unscaled_p (mode, offset);
5920 return false;
5922 case CONST:
5923 case SYMBOL_REF:
5924 case LABEL_REF:
5925 /* load literal: pc-relative constant pool entry. Only supported
5926 for SI mode or larger. */
5927 info->type = ADDRESS_SYMBOLIC;
5929 if (!load_store_pair_p
5930 && GET_MODE_SIZE (mode).is_constant (&const_size)
5931 && const_size >= 4)
5933 rtx sym, addend;
5935 split_const (x, &sym, &addend);
5936 return ((GET_CODE (sym) == LABEL_REF
5937 || (GET_CODE (sym) == SYMBOL_REF
5938 && CONSTANT_POOL_ADDRESS_P (sym)
5939 && aarch64_pcrelative_literal_loads)));
5941 return false;
5943 case LO_SUM:
5944 info->type = ADDRESS_LO_SUM;
5945 info->base = XEXP (x, 0);
5946 info->offset = XEXP (x, 1);
5947 if (allow_reg_index_p
5948 && aarch64_base_register_rtx_p (info->base, strict_p))
5950 rtx sym, offs;
5951 split_const (info->offset, &sym, &offs);
5952 if (GET_CODE (sym) == SYMBOL_REF
5953 && (aarch64_classify_symbol (sym, INTVAL (offs))
5954 == SYMBOL_SMALL_ABSOLUTE))
5956 /* The symbol and offset must be aligned to the access size. */
5957 unsigned int align;
5959 if (CONSTANT_POOL_ADDRESS_P (sym))
5960 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5961 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5963 tree exp = SYMBOL_REF_DECL (sym);
5964 align = TYPE_ALIGN (TREE_TYPE (exp));
5965 align = aarch64_constant_alignment (exp, align);
5967 else if (SYMBOL_REF_DECL (sym))
5968 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5969 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5970 && SYMBOL_REF_BLOCK (sym) != NULL)
5971 align = SYMBOL_REF_BLOCK (sym)->alignment;
5972 else
5973 align = BITS_PER_UNIT;
5975 poly_int64 ref_size = GET_MODE_SIZE (mode);
5976 if (known_eq (ref_size, 0))
5977 ref_size = GET_MODE_SIZE (DImode);
5979 return (multiple_p (INTVAL (offs), ref_size)
5980 && multiple_p (align / BITS_PER_UNIT, ref_size));
5983 return false;
5985 default:
5986 return false;
5990 /* Return true if the address X is valid for a PRFM instruction.
5991 STRICT_P is true if we should do strict checking with
5992 aarch64_classify_address. */
5994 bool
5995 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5997 struct aarch64_address_info addr;
5999 /* PRFM accepts the same addresses as DImode... */
6000 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6001 if (!res)
6002 return false;
6004 /* ... except writeback forms. */
6005 return addr.type != ADDRESS_REG_WB;
6008 bool
6009 aarch64_symbolic_address_p (rtx x)
6011 rtx offset;
6013 split_const (x, &x, &offset);
6014 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6017 /* Classify the base of symbolic expression X. */
6019 enum aarch64_symbol_type
6020 aarch64_classify_symbolic_expression (rtx x)
6022 rtx offset;
6024 split_const (x, &x, &offset);
6025 return aarch64_classify_symbol (x, INTVAL (offset));
6029 /* Return TRUE if X is a legitimate address for accessing memory in
6030 mode MODE. */
6031 static bool
6032 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6034 struct aarch64_address_info addr;
6036 return aarch64_classify_address (&addr, x, mode, strict_p);
6039 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6040 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6041 bool
6042 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6043 aarch64_addr_query_type type)
6045 struct aarch64_address_info addr;
6047 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6050 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6052 static bool
6053 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6054 poly_int64 orig_offset,
6055 machine_mode mode)
6057 HOST_WIDE_INT size;
6058 if (GET_MODE_SIZE (mode).is_constant (&size))
6060 HOST_WIDE_INT const_offset, second_offset;
6062 /* A general SVE offset is A * VQ + B. Remove the A component from
6063 coefficient 0 in order to get the constant B. */
6064 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6066 /* Split an out-of-range address displacement into a base and
6067 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6068 range otherwise to increase opportunities for sharing the base
6069 address of different sizes. Unaligned accesses use the signed
6070 9-bit range, TImode/TFmode use the intersection of signed
6071 scaled 7-bit and signed 9-bit offset. */
6072 if (mode == TImode || mode == TFmode)
6073 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6074 else if ((const_offset & (size - 1)) != 0)
6075 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6076 else
6077 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6079 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6080 return false;
6082 /* Split the offset into second_offset and the rest. */
6083 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6084 *offset2 = gen_int_mode (second_offset, Pmode);
6085 return true;
6087 else
6089 /* Get the mode we should use as the basis of the range. For structure
6090 modes this is the mode of one vector. */
6091 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6092 machine_mode step_mode
6093 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6095 /* Get the "mul vl" multiplier we'd like to use. */
6096 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6097 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6098 if (vec_flags & VEC_SVE_DATA)
6099 /* LDR supports a 9-bit range, but the move patterns for
6100 structure modes require all vectors to be in range of the
6101 same base. The simplest way of accommodating that while still
6102 promoting reuse of anchor points between different modes is
6103 to use an 8-bit range unconditionally. */
6104 vnum = ((vnum + 128) & 255) - 128;
6105 else
6106 /* Predicates are only handled singly, so we might as well use
6107 the full range. */
6108 vnum = ((vnum + 256) & 511) - 256;
6109 if (vnum == 0)
6110 return false;
6112 /* Convert the "mul vl" multiplier into a byte offset. */
6113 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6114 if (known_eq (second_offset, orig_offset))
6115 return false;
6117 /* Split the offset into second_offset and the rest. */
6118 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6119 *offset2 = gen_int_mode (second_offset, Pmode);
6120 return true;
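/* Illustrative example (constant-size case): for a TImode access at
   orig_offset 0x2010, second_offset becomes ((0x2010 + 0x100) & 0x1f8)
   - 0x100 == 0x10, so the address is split into an anchor at +0x2000
   plus a residual offset of 16, which satisfies both the signed scaled
   7-bit and the signed unscaled 9-bit ranges.  */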
6124 /* Return the binary representation of floating point constant VALUE in INTVAL.
6125 If the value cannot be converted, return false without setting INTVAL.
6126 The conversion is done in the given MODE. */
6127 bool
6128 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6131 /* We make a general exception for 0. */
6132 if (aarch64_float_const_zero_rtx_p (value))
6134 *intval = 0;
6135 return true;
6138 scalar_float_mode mode;
6139 if (GET_CODE (value) != CONST_DOUBLE
6140 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6141 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6142 /* Only support up to DF mode. */
6143 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6144 return false;
6146 unsigned HOST_WIDE_INT ival = 0;
6148 long res[2];
6149 real_to_target (res,
6150 CONST_DOUBLE_REAL_VALUE (value),
6151 REAL_MODE_FORMAT (mode));
6153 if (mode == DFmode)
6155 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6156 ival = zext_hwi (res[order], 32);
6157 ival |= (zext_hwi (res[1 - order], 32) << 32);
6159 else
6160 ival = zext_hwi (res[0], 32);
6162 *intval = ival;
6163 return true;
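/* For example, the DFmode constant 1.0 yields 0x3ff0000000000000 and the
   SFmode constant 1.0 yields 0x3f800000 (zero-extended to 64 bits).  */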
6166 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6167 single MOV(+MOVK) followed by an FMOV. */
6168 bool
6169 aarch64_float_const_rtx_p (rtx x)
6171 machine_mode mode = GET_MODE (x);
6172 if (mode == VOIDmode)
6173 return false;
6175 /* Determine whether it's cheaper to write float constants as
6176 mov/movk pairs rather than as ldr/adrp pairs. */
6177 unsigned HOST_WIDE_INT ival;
6179 if (GET_CODE (x) == CONST_DOUBLE
6180 && SCALAR_FLOAT_MODE_P (mode)
6181 && aarch64_reinterpret_float_as_int (x, &ival))
6183 scalar_int_mode imode = (mode == HFmode
6184 ? SImode
6185 : int_mode_for_mode (mode).require ());
6186 int num_instr = aarch64_internal_mov_immediate
6187 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6188 return num_instr < 3;
6191 return false;
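/* Illustrative example: DFmode 100.0 has the bit pattern
   0x4059000000000000, which a single MOVZ (#0x4059, LSL #48) can
   materialize; num_instr == 1 < 3, so the function returns true and the
   MOV+FMOV sequence is considered cheap enough.  */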
6194 /* Return TRUE if rtx X is immediate constant 0.0 */
6195 bool
6196 aarch64_float_const_zero_rtx_p (rtx x)
6198 if (GET_MODE (x) == VOIDmode)
6199 return false;
6201 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6202 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6203 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6206 /* Return TRUE if rtx X is an immediate constant that fits in a single
6207 MOVI immediate operation. */
6208 bool
6209 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6211 if (!TARGET_SIMD)
6212 return false;
6214 machine_mode vmode;
6215 scalar_int_mode imode;
6216 unsigned HOST_WIDE_INT ival;
6218 if (GET_CODE (x) == CONST_DOUBLE
6219 && SCALAR_FLOAT_MODE_P (mode))
6221 if (!aarch64_reinterpret_float_as_int (x, &ival))
6222 return false;
6224 /* We make a general exception for 0. */
6225 if (aarch64_float_const_zero_rtx_p (x))
6226 return true;
6228 imode = int_mode_for_mode (mode).require ();
6230 else if (GET_CODE (x) == CONST_INT
6231 && is_a <scalar_int_mode> (mode, &imode))
6232 ival = INTVAL (x);
6233 else
6234 return false;
6236 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
6237 a 128-bit vector mode. */
6238 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6240 vmode = aarch64_simd_container_mode (imode, width);
6241 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6243 return aarch64_simd_valid_immediate (v_op, NULL);
6247 /* Return the fixed registers used for condition codes. */
6249 static bool
6250 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6252 *p1 = CC_REGNUM;
6253 *p2 = INVALID_REGNUM;
6254 return true;
6257 /* This function is used by the call expanders of the machine description.
6258 RESULT is the register in which the result is returned. It's NULL for
6259 "call" and "sibcall".
6260 MEM is the location of the function call.
6261 SIBCALL indicates whether this function call is a normal call or a sibling call.
6262 It will generate a different pattern accordingly. */
6264 void
6265 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6267 rtx call, callee, tmp;
6268 rtvec vec;
6269 machine_mode mode;
6271 gcc_assert (MEM_P (mem));
6272 callee = XEXP (mem, 0);
6273 mode = GET_MODE (callee);
6274 gcc_assert (mode == Pmode);
6276 /* Decide if we should generate indirect calls by loading the
6277 address of the callee into a register before performing
6278 the branch-and-link. */
6279 if (SYMBOL_REF_P (callee)
6280 ? (aarch64_is_long_call_p (callee)
6281 || aarch64_is_noplt_call_p (callee))
6282 : !REG_P (callee))
6283 XEXP (mem, 0) = force_reg (mode, callee);
6285 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6287 if (result != NULL_RTX)
6288 call = gen_rtx_SET (result, call);
6290 if (sibcall)
6291 tmp = ret_rtx;
6292 else
6293 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6295 vec = gen_rtvec (2, call, tmp);
6296 call = gen_rtx_PARALLEL (VOIDmode, vec);
6298 aarch64_emit_call_insn (call);
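/* The emitted pattern is therefore roughly
   (parallel [(call (mem:DI (symbol_ref "fn")) (const_int 0))
              (clobber (reg:DI LR_REGNUM))])
   for a normal call, with the CALL wrapped in a SET of RESULT when a
   value is returned, and the clobber replaced by (return) for sibcalls.  */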
6301 /* Emit call insn with PAT and do aarch64-specific handling. */
6303 void
6304 aarch64_emit_call_insn (rtx pat)
6306 rtx insn = emit_call_insn (pat);
6308 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6309 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6310 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6313 machine_mode
6314 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6316 /* All floating point compares return CCFP if it is an equality
6317 comparison, and CCFPE otherwise. */
6318 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6320 switch (code)
6322 case EQ:
6323 case NE:
6324 case UNORDERED:
6325 case ORDERED:
6326 case UNLT:
6327 case UNLE:
6328 case UNGT:
6329 case UNGE:
6330 case UNEQ:
6331 return CCFPmode;
6333 case LT:
6334 case LE:
6335 case GT:
6336 case GE:
6337 case LTGT:
6338 return CCFPEmode;
6340 default:
6341 gcc_unreachable ();
6345 /* Equality comparisons of short modes against zero can be performed
6346 using the TST instruction with the appropriate bitmask. */
6347 if (y == const0_rtx && REG_P (x)
6348 && (code == EQ || code == NE)
6349 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6350 return CC_NZmode;
6352 /* Similarly, comparisons of zero_extends from shorter modes can
6353 be performed using an ANDS with an immediate mask. */
6354 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6355 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6356 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6357 && (code == EQ || code == NE))
6358 return CC_NZmode;
6360 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6361 && y == const0_rtx
6362 && (code == EQ || code == NE || code == LT || code == GE)
6363 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6364 || GET_CODE (x) == NEG
6365 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6366 && CONST_INT_P (XEXP (x, 2)))))
6367 return CC_NZmode;
6369 /* A compare with a shifted operand. Because of canonicalization,
6370 the comparison will have to be swapped when we emit the assembly
6371 code. */
6372 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6373 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6374 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6375 || GET_CODE (x) == LSHIFTRT
6376 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6377 return CC_SWPmode;
6379 /* Similarly for a negated operand, but we can only do this for
6380 equalities. */
6381 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6382 && (REG_P (y) || GET_CODE (y) == SUBREG)
6383 && (code == EQ || code == NE)
6384 && GET_CODE (x) == NEG)
6385 return CC_Zmode;
6387 /* A test for unsigned overflow. */
6388 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6389 && code == NE
6390 && GET_CODE (x) == PLUS
6391 && GET_CODE (y) == ZERO_EXTEND)
6392 return CC_Cmode;
6394 /* For everything else, return CCmode. */
6395 return CCmode;
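/* Illustrative example: (compare (ashift:DI (reg:DI a) (const_int 2))
   (reg:DI b)) gets CC_SWPmode; a shifted operand can only appear as the
   second source of CMP, so the comparison is emitted with the operands
   swapped and aarch64_get_condition_code_1 below swaps GT/LT, GE/LE
   etc. to compensate.  */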
6398 static int
6399 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6402 aarch64_get_condition_code (rtx x)
6404 machine_mode mode = GET_MODE (XEXP (x, 0));
6405 enum rtx_code comp_code = GET_CODE (x);
6407 if (GET_MODE_CLASS (mode) != MODE_CC)
6408 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6409 return aarch64_get_condition_code_1 (mode, comp_code);
6412 static int
6413 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6415 switch (mode)
6417 case E_CCFPmode:
6418 case E_CCFPEmode:
6419 switch (comp_code)
6421 case GE: return AARCH64_GE;
6422 case GT: return AARCH64_GT;
6423 case LE: return AARCH64_LS;
6424 case LT: return AARCH64_MI;
6425 case NE: return AARCH64_NE;
6426 case EQ: return AARCH64_EQ;
6427 case ORDERED: return AARCH64_VC;
6428 case UNORDERED: return AARCH64_VS;
6429 case UNLT: return AARCH64_LT;
6430 case UNLE: return AARCH64_LE;
6431 case UNGT: return AARCH64_HI;
6432 case UNGE: return AARCH64_PL;
6433 default: return -1;
6435 break;
6437 case E_CCmode:
6438 switch (comp_code)
6440 case NE: return AARCH64_NE;
6441 case EQ: return AARCH64_EQ;
6442 case GE: return AARCH64_GE;
6443 case GT: return AARCH64_GT;
6444 case LE: return AARCH64_LE;
6445 case LT: return AARCH64_LT;
6446 case GEU: return AARCH64_CS;
6447 case GTU: return AARCH64_HI;
6448 case LEU: return AARCH64_LS;
6449 case LTU: return AARCH64_CC;
6450 default: return -1;
6452 break;
6454 case E_CC_SWPmode:
6455 switch (comp_code)
6457 case NE: return AARCH64_NE;
6458 case EQ: return AARCH64_EQ;
6459 case GE: return AARCH64_LE;
6460 case GT: return AARCH64_LT;
6461 case LE: return AARCH64_GE;
6462 case LT: return AARCH64_GT;
6463 case GEU: return AARCH64_LS;
6464 case GTU: return AARCH64_CC;
6465 case LEU: return AARCH64_CS;
6466 case LTU: return AARCH64_HI;
6467 default: return -1;
6469 break;
6471 case E_CC_NZmode:
6472 switch (comp_code)
6474 case NE: return AARCH64_NE;
6475 case EQ: return AARCH64_EQ;
6476 case GE: return AARCH64_PL;
6477 case LT: return AARCH64_MI;
6478 default: return -1;
6480 break;
6482 case E_CC_Zmode:
6483 switch (comp_code)
6485 case NE: return AARCH64_NE;
6486 case EQ: return AARCH64_EQ;
6487 default: return -1;
6489 break;
6491 case E_CC_Cmode:
6492 switch (comp_code)
6494 case NE: return AARCH64_CS;
6495 case EQ: return AARCH64_CC;
6496 default: return -1;
6498 break;
6500 default:
6501 return -1;
6504 return -1;
6507 bool
6508 aarch64_const_vec_all_same_in_range_p (rtx x,
6509 HOST_WIDE_INT minval,
6510 HOST_WIDE_INT maxval)
6512 rtx elt;
6513 return (const_vec_duplicate_p (x, &elt)
6514 && CONST_INT_P (elt)
6515 && IN_RANGE (INTVAL (elt), minval, maxval));
6518 bool
6519 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6521 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6524 /* Return true if VEC is a constant in which every element is in the range
6525 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6527 static bool
6528 aarch64_const_vec_all_in_range_p (rtx vec,
6529 HOST_WIDE_INT minval,
6530 HOST_WIDE_INT maxval)
6532 if (GET_CODE (vec) != CONST_VECTOR
6533 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6534 return false;
6536 int nunits;
6537 if (!CONST_VECTOR_STEPPED_P (vec))
6538 nunits = const_vector_encoded_nelts (vec);
6539 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6540 return false;
6542 for (int i = 0; i < nunits; i++)
6544 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6545 if (!CONST_INT_P (vec_elem)
6546 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6547 return false;
6549 return true;
6552 /* N Z C V. */
6553 #define AARCH64_CC_V 1
6554 #define AARCH64_CC_C (1 << 1)
6555 #define AARCH64_CC_Z (1 << 2)
6556 #define AARCH64_CC_N (1 << 3)
6558 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6559 static const int aarch64_nzcv_codes[] =
6561 0, /* EQ, Z == 1. */
6562 AARCH64_CC_Z, /* NE, Z == 0. */
6563 0, /* CS, C == 1. */
6564 AARCH64_CC_C, /* CC, C == 0. */
6565 0, /* MI, N == 1. */
6566 AARCH64_CC_N, /* PL, N == 0. */
6567 0, /* VS, V == 1. */
6568 AARCH64_CC_V, /* VC, V == 0. */
6569 0, /* HI, C == 1 && Z == 0. */
6570 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6571 AARCH64_CC_V, /* GE, N == V. */
6572 0, /* LT, N != V. */
6573 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6574 0, /* LE, !(Z == 0 && N == V). */
6575 0, /* AL, Any. */
6576 0 /* NV, Any. */
6579 /* Print floating-point vector immediate operand X to F, negating it
6580 first if NEGATE is true. Return true on success, false if it isn't
6581 a constant we can handle. */
6583 static bool
6584 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6586 rtx elt;
6588 if (!const_vec_duplicate_p (x, &elt))
6589 return false;
6591 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6592 if (negate)
6593 r = real_value_negate (&r);
6595 /* We only handle the SVE single-bit immediates here. */
6596 if (real_equal (&r, &dconst0))
6597 asm_fprintf (f, "0.0");
6598 else if (real_equal (&r, &dconst1))
6599 asm_fprintf (f, "1.0");
6600 else if (real_equal (&r, &dconsthalf))
6601 asm_fprintf (f, "0.5");
6602 else
6603 return false;
6605 return true;
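/* These immediates appear, for example, in SVE instructions such as
   "fadd z0.d, p0/m, z0.d, #0.5", whose output uses this routine to
   print the constant.  */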
6608 /* Return the equivalent letter for size. */
6609 static char
6610 sizetochar (int size)
6612 switch (size)
6614 case 64: return 'd';
6615 case 32: return 's';
6616 case 16: return 'h';
6617 case 8 : return 'b';
6618 default: gcc_unreachable ();
6622 /* Print operand X to file F in a target specific manner according to CODE.
6623 The acceptable formatting commands given by CODE are:
6624 'c': An integer or symbol address without a preceding #
6625 sign.
6626 'C': Take the duplicated element in a vector constant
6627 and print it in hex.
6628 'D': Take the duplicated element in a vector constant
6629 and print it as an unsigned integer, in decimal.
6630 'e': Print the sign/zero-extend size as a character 8->b,
6631 16->h, 32->w.
6632 'p': Prints N such that 2^N == X (X must be a power of 2 and
6633 a const_int).
6634 'P': Print the number of non-zero bits in X (a const_int).
6635 'H': Print the higher numbered register of a pair (TImode)
6636 of regs.
6637 'm': Print a condition (eq, ne, etc).
6638 'M': Same as 'm', but invert condition.
6639 'N': Take the duplicated element in a vector constant
6640 and print the negative of it in decimal.
6641 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6642 'S/T/U/V': Print a FP/SIMD register name for a register list.
6643 The register printed is the FP/SIMD register name
6644 of X + 0/1/2/3 for S/T/U/V.
6645 'R': Print a scalar FP/SIMD register name + 1.
6646 'X': Print bottom 16 bits of integer constant in hex.
6647 'w/x': Print a general register name or the zero register
6648 (32-bit or 64-bit).
6649 '0': Print a normal operand, if it's a general register,
6650 then we assume DImode.
6651 'k': Print NZCV for conditional compare instructions.
6652 'A': Output address constant representing the first
6653 argument of X, specifying a relocation offset
6654 if appropriate.
6655 'L': Output constant address specified by X
6656 with a relocation offset if appropriate.
6657 'G': Prints address of X, specifying a PC relative
6658 relocation mode if appropriate.
6659 'y': Output address of LDP or STP - this is used for
6660 some LDP/STPs which don't use a PARALLEL in their
6661 pattern (so the mode needs to be adjusted).
6662 'z': Output address of a typical LDP or STP. */
6664 static void
6665 aarch64_print_operand (FILE *f, rtx x, int code)
6667 rtx elt;
6668 switch (code)
6670 case 'c':
6671 switch (GET_CODE (x))
6673 case CONST_INT:
6674 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6675 break;
6677 case SYMBOL_REF:
6678 output_addr_const (f, x);
6679 break;
6681 case CONST:
6682 if (GET_CODE (XEXP (x, 0)) == PLUS
6683 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6685 output_addr_const (f, x);
6686 break;
6688 /* Fall through. */
6690 default:
6691 output_operand_lossage ("unsupported operand for code '%c'", code);
6693 break;
6695 case 'e':
6697 int n;
6699 if (!CONST_INT_P (x)
6700 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6702 output_operand_lossage ("invalid operand for '%%%c'", code);
6703 return;
6706 switch (n)
6708 case 3:
6709 fputc ('b', f);
6710 break;
6711 case 4:
6712 fputc ('h', f);
6713 break;
6714 case 5:
6715 fputc ('w', f);
6716 break;
6717 default:
6718 output_operand_lossage ("invalid operand for '%%%c'", code);
6719 return;
6722 break;
6724 case 'p':
6726 int n;
6728 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6730 output_operand_lossage ("invalid operand for '%%%c'", code);
6731 return;
6734 asm_fprintf (f, "%d", n);
6736 break;
6738 case 'P':
6739 if (!CONST_INT_P (x))
6741 output_operand_lossage ("invalid operand for '%%%c'", code);
6742 return;
6745 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6746 break;
6748 case 'H':
6749 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6751 output_operand_lossage ("invalid operand for '%%%c'", code);
6752 return;
6755 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6756 break;
6758 case 'M':
6759 case 'm':
6761 int cond_code;
6762 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6763 if (x == const_true_rtx)
6765 if (code == 'M')
6766 fputs ("nv", f);
6767 return;
6770 if (!COMPARISON_P (x))
6772 output_operand_lossage ("invalid operand for '%%%c'", code);
6773 return;
6776 cond_code = aarch64_get_condition_code (x);
6777 gcc_assert (cond_code >= 0);
6778 if (code == 'M')
6779 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6780 fputs (aarch64_condition_codes[cond_code], f);
6782 break;
6784 case 'N':
6785 if (!const_vec_duplicate_p (x, &elt))
6787 output_operand_lossage ("invalid vector constant");
6788 return;
6791 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6792 asm_fprintf (f, "%wd", -INTVAL (elt));
6793 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6794 && aarch64_print_vector_float_operand (f, x, true))
6796 else
6798 output_operand_lossage ("invalid vector constant");
6799 return;
6801 break;
6803 case 'b':
6804 case 'h':
6805 case 's':
6806 case 'd':
6807 case 'q':
6808 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6810 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6811 return;
6813 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6814 break;
6816 case 'S':
6817 case 'T':
6818 case 'U':
6819 case 'V':
6820 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6822 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6823 return;
6825 asm_fprintf (f, "%c%d",
6826 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6827 REGNO (x) - V0_REGNUM + (code - 'S'));
6828 break;
6830 case 'R':
6831 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6833 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6834 return;
6836 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6837 break;
6839 case 'X':
6840 if (!CONST_INT_P (x))
6842 output_operand_lossage ("invalid operand for '%%%c'", code);
6843 return;
6845 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6846 break;
6848 case 'C':
6850 /* Print a replicated constant in hex. */
6851 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6853 output_operand_lossage ("invalid operand for '%%%c'", code);
6854 return;
6856 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6857 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6859 break;
6861 case 'D':
6863 /* Print a replicated constant in decimal, treating it as
6864 unsigned. */
6865 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6867 output_operand_lossage ("invalid operand for '%%%c'", code);
6868 return;
6870 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6871 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6873 break;
6875 case 'w':
6876 case 'x':
6877 if (x == const0_rtx
6878 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6880 asm_fprintf (f, "%czr", code);
6881 break;
6884 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6886 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6887 break;
6890 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6892 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6893 break;
6896 /* Fall through */
6898 case 0:
6899 if (x == NULL)
6901 output_operand_lossage ("missing operand");
6902 return;
6905 switch (GET_CODE (x))
6907 case REG:
6908 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6910 if (REG_NREGS (x) == 1)
6911 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6912 else
6914 char suffix
6915 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6916 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6917 REGNO (x) - V0_REGNUM, suffix,
6918 END_REGNO (x) - V0_REGNUM - 1, suffix);
6921 else
6922 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6923 break;
6925 case MEM:
6926 output_address (GET_MODE (x), XEXP (x, 0));
6927 break;
6929 case LABEL_REF:
6930 case SYMBOL_REF:
6931 output_addr_const (asm_out_file, x);
6932 break;
6934 case CONST_INT:
6935 asm_fprintf (f, "%wd", INTVAL (x));
6936 break;
6938 case CONST:
6939 if (!VECTOR_MODE_P (GET_MODE (x)))
6941 output_addr_const (asm_out_file, x);
6942 break;
6944 /* fall through */
6946 case CONST_VECTOR:
6947 if (!const_vec_duplicate_p (x, &elt))
6949 output_operand_lossage ("invalid vector constant");
6950 return;
6953 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6954 asm_fprintf (f, "%wd", INTVAL (elt));
6955 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6956 && aarch64_print_vector_float_operand (f, x, false))
6958 else
6960 output_operand_lossage ("invalid vector constant");
6961 return;
6963 break;
6965 case CONST_DOUBLE:
6966 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6967 be getting CONST_DOUBLEs holding integers. */
6968 gcc_assert (GET_MODE (x) != VOIDmode);
6969 if (aarch64_float_const_zero_rtx_p (x))
6971 fputc ('0', f);
6972 break;
6974 else if (aarch64_float_const_representable_p (x))
6976 #define buf_size 20
6977 char float_buf[buf_size] = {'\0'};
6978 real_to_decimal_for_mode (float_buf,
6979 CONST_DOUBLE_REAL_VALUE (x),
6980 buf_size, buf_size,
6981 1, GET_MODE (x));
6982 asm_fprintf (asm_out_file, "%s", float_buf);
6983 break;
6984 #undef buf_size
6986 output_operand_lossage ("invalid constant");
6987 return;
6988 default:
6989 output_operand_lossage ("invalid operand");
6990 return;
6992 break;
6994 case 'A':
6995 if (GET_CODE (x) == HIGH)
6996 x = XEXP (x, 0);
6998 switch (aarch64_classify_symbolic_expression (x))
7000 case SYMBOL_SMALL_GOT_4G:
7001 asm_fprintf (asm_out_file, ":got:");
7002 break;
7004 case SYMBOL_SMALL_TLSGD:
7005 asm_fprintf (asm_out_file, ":tlsgd:");
7006 break;
7008 case SYMBOL_SMALL_TLSDESC:
7009 asm_fprintf (asm_out_file, ":tlsdesc:");
7010 break;
7012 case SYMBOL_SMALL_TLSIE:
7013 asm_fprintf (asm_out_file, ":gottprel:");
7014 break;
7016 case SYMBOL_TLSLE24:
7017 asm_fprintf (asm_out_file, ":tprel:");
7018 break;
7020 case SYMBOL_TINY_GOT:
7021 gcc_unreachable ();
7022 break;
7024 default:
7025 break;
7027 output_addr_const (asm_out_file, x);
7028 break;
7030 case 'L':
7031 switch (aarch64_classify_symbolic_expression (x))
7033 case SYMBOL_SMALL_GOT_4G:
7034 asm_fprintf (asm_out_file, ":lo12:");
7035 break;
7037 case SYMBOL_SMALL_TLSGD:
7038 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7039 break;
7041 case SYMBOL_SMALL_TLSDESC:
7042 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7043 break;
7045 case SYMBOL_SMALL_TLSIE:
7046 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7047 break;
7049 case SYMBOL_TLSLE12:
7050 asm_fprintf (asm_out_file, ":tprel_lo12:");
7051 break;
7053 case SYMBOL_TLSLE24:
7054 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7055 break;
7057 case SYMBOL_TINY_GOT:
7058 asm_fprintf (asm_out_file, ":got:");
7059 break;
7061 case SYMBOL_TINY_TLSIE:
7062 asm_fprintf (asm_out_file, ":gottprel:");
7063 break;
7065 default:
7066 break;
7068 output_addr_const (asm_out_file, x);
7069 break;
7071 case 'G':
7072 switch (aarch64_classify_symbolic_expression (x))
7074 case SYMBOL_TLSLE24:
7075 asm_fprintf (asm_out_file, ":tprel_hi12:");
7076 break;
7077 default:
7078 break;
7080 output_addr_const (asm_out_file, x);
7081 break;
7083 case 'k':
7085 HOST_WIDE_INT cond_code;
7087 if (!CONST_INT_P (x))
7089 output_operand_lossage ("invalid operand for '%%%c'", code);
7090 return;
7093 cond_code = INTVAL (x);
7094 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7095 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7097 break;
7099 case 'y':
7100 case 'z':
7102 machine_mode mode = GET_MODE (x);
7104 if (GET_CODE (x) != MEM
7105 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7107 output_operand_lossage ("invalid operand for '%%%c'", code);
7108 return;
7111 if (code == 'y')
7112 /* LDP/STP which uses a single double-width memory operand.
7113 Adjust the mode to appear like a typical LDP/STP.
7114 Currently this is supported for 16-byte accesses only. */
7115 mode = DFmode;
7117 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7118 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7120 break;
7122 default:
7123 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7124 return;
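/* For instance, "%w0" prints "w3" for a general register operand in x3
   and "wzr" for const0_rtx, while "%x0" prints "x3"/"xzr" for the same
   operands.  */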
7128 /* Print address 'x' of a memory access with mode 'mode'.
7129 TYPE is the aarch64_addr_query_type context required by aarch64_classify_address:
7130 e.g. ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
7131 static bool
7132 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7133 aarch64_addr_query_type type)
7135 struct aarch64_address_info addr;
7136 unsigned int size;
7138 /* Check all addresses are Pmode - including ILP32. */
7139 if (GET_MODE (x) != Pmode)
7140 output_operand_lossage ("invalid address mode");
7142 if (aarch64_classify_address (&addr, x, mode, true, type))
7143 switch (addr.type)
7145 case ADDRESS_REG_IMM:
7146 if (known_eq (addr.const_offset, 0))
7147 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7148 else if (aarch64_sve_data_mode_p (mode))
7150 HOST_WIDE_INT vnum
7151 = exact_div (addr.const_offset,
7152 BYTES_PER_SVE_VECTOR).to_constant ();
7153 asm_fprintf (f, "[%s, #%wd, mul vl]",
7154 reg_names[REGNO (addr.base)], vnum);
7156 else if (aarch64_sve_pred_mode_p (mode))
7158 HOST_WIDE_INT vnum
7159 = exact_div (addr.const_offset,
7160 BYTES_PER_SVE_PRED).to_constant ();
7161 asm_fprintf (f, "[%s, #%wd, mul vl]",
7162 reg_names[REGNO (addr.base)], vnum);
7164 else
7165 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7166 INTVAL (addr.offset));
7167 return true;
7169 case ADDRESS_REG_REG:
7170 if (addr.shift == 0)
7171 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7172 reg_names [REGNO (addr.offset)]);
7173 else
7174 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7175 reg_names [REGNO (addr.offset)], addr.shift);
7176 return true;
7178 case ADDRESS_REG_UXTW:
7179 if (addr.shift == 0)
7180 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7181 REGNO (addr.offset) - R0_REGNUM);
7182 else
7183 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7184 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7185 return true;
7187 case ADDRESS_REG_SXTW:
7188 if (addr.shift == 0)
7189 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7190 REGNO (addr.offset) - R0_REGNUM);
7191 else
7192 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7193 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7194 return true;
7196 case ADDRESS_REG_WB:
7197 /* Writeback is only supported for fixed-width modes. */
7198 size = GET_MODE_SIZE (mode).to_constant ();
7199 switch (GET_CODE (x))
7201 case PRE_INC:
7202 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7203 return true;
7204 case POST_INC:
7205 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7206 return true;
7207 case PRE_DEC:
7208 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7209 return true;
7210 case POST_DEC:
7211 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7212 return true;
7213 case PRE_MODIFY:
7214 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7215 INTVAL (addr.offset));
7216 return true;
7217 case POST_MODIFY:
7218 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7219 INTVAL (addr.offset));
7220 return true;
7221 default:
7222 break;
7224 break;
7226 case ADDRESS_LO_SUM:
7227 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7228 output_addr_const (f, addr.offset);
7229 asm_fprintf (f, "]");
7230 return true;
7232 case ADDRESS_SYMBOLIC:
7233 output_addr_const (f, x);
7234 return true;
7237 return false;
7240 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7241 static bool
7242 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7244 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7247 /* Print address 'x' of a memory access with mode 'mode'. */
7248 static void
7249 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7251 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7252 output_addr_const (f, x);
7255 bool
7256 aarch64_label_mentioned_p (rtx x)
7258 const char *fmt;
7259 int i;
7261 if (GET_CODE (x) == LABEL_REF)
7262 return true;
7264 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7265 referencing instruction, but they are constant offsets, not
7266 symbols. */
7267 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7268 return false;
7270 fmt = GET_RTX_FORMAT (GET_CODE (x));
7271 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7273 if (fmt[i] == 'E')
7275 int j;
7277 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7278 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7279 return 1;
7281 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7282 return 1;
7285 return 0;
7288 /* Implement REGNO_REG_CLASS. */
7290 enum reg_class
7291 aarch64_regno_regclass (unsigned regno)
7293 if (GP_REGNUM_P (regno))
7294 return GENERAL_REGS;
7296 if (regno == SP_REGNUM)
7297 return STACK_REG;
7299 if (regno == FRAME_POINTER_REGNUM
7300 || regno == ARG_POINTER_REGNUM)
7301 return POINTER_REGS;
7303 if (FP_REGNUM_P (regno))
7304 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7306 if (PR_REGNUM_P (regno))
7307 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7309 return NO_REGS;
7312 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7313 If OFFSET is out of range, return an offset of an anchor point
7314 that is in range. Return 0 otherwise. */
7316 static HOST_WIDE_INT
7317 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7318 machine_mode mode)
7320 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7321 if (size > 16)
7322 return (offset + 0x400) & ~0x7f0;
7324 /* For offsets that aren't a multiple of the access size, the limit is
7325 -256...255. */
7326 if (offset & (size - 1))
7328 /* BLKmode typically uses LDP of X-registers. */
7329 if (mode == BLKmode)
7330 return (offset + 512) & ~0x3ff;
7331 return (offset + 0x100) & ~0x1ff;
7334 /* Small negative offsets are supported. */
7335 if (IN_RANGE (offset, -256, 0))
7336 return 0;
7338 if (mode == TImode || mode == TFmode)
7339 return (offset + 0x100) & ~0x1ff;
7341 /* Use a 12-bit offset scaled by the access size. */
7342 return offset & (~0xfff * size);
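/* Worked example: a 4-byte access at offset 0x12344 is aligned, so the
   final case applies and the anchor is 0x12344 & ~0x3fff == 0x10000,
   leaving a residual offset of 0x2344 that fits the unsigned scaled
   12-bit LDR/STR range (at most 4095 * 4).  */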
7345 static rtx
7346 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7348 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7349 where mask is selected by alignment and size of the offset.
7350 We try to pick as large a range for the offset as possible to
7351 maximize the chance of a CSE. However, for aligned addresses
7352 we limit the range to 4k so that structures with different sized
7353 elements are likely to use the same base. We need to be careful
7354 not to split a CONST for some forms of address expression, otherwise
7355 it will generate sub-optimal code. */
7357 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7359 rtx base = XEXP (x, 0);
7360 rtx offset_rtx = XEXP (x, 1);
7361 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7363 if (GET_CODE (base) == PLUS)
7365 rtx op0 = XEXP (base, 0);
7366 rtx op1 = XEXP (base, 1);
7368 /* Force any scaling into a temp for CSE. */
7369 op0 = force_reg (Pmode, op0);
7370 op1 = force_reg (Pmode, op1);
7372 /* Let the pointer register be in op0. */
7373 if (REG_POINTER (op1))
7374 std::swap (op0, op1);
7376 /* If the pointer is virtual or frame related, then we know that
7377 virtual register instantiation or register elimination is going
7378 to apply a second constant. We want the two constants folded
7379 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7380 if (virt_or_elim_regno_p (REGNO (op0)))
7382 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7383 NULL_RTX, true, OPTAB_DIRECT);
7384 return gen_rtx_PLUS (Pmode, base, op1);
7387 /* Otherwise, in order to encourage CSE (and thence loop strength
7388 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7389 base = expand_binop (Pmode, add_optab, op0, op1,
7390 NULL_RTX, true, OPTAB_DIRECT);
7391 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7394 HOST_WIDE_INT size;
7395 if (GET_MODE_SIZE (mode).is_constant (&size))
7397 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7398 mode);
7399 if (base_offset != 0)
7401 base = plus_constant (Pmode, base, base_offset);
7402 base = force_operand (base, NULL_RTX);
7403 return plus_constant (Pmode, base, offset - base_offset);
7408 return x;
7411 /* Return the reload icode required for a constant pool in mode. */
7412 static enum insn_code
7413 aarch64_constant_pool_reload_icode (machine_mode mode)
7415 switch (mode)
7417 case E_SFmode:
7418 return CODE_FOR_aarch64_reload_movcpsfdi;
7420 case E_DFmode:
7421 return CODE_FOR_aarch64_reload_movcpdfdi;
7423 case E_TFmode:
7424 return CODE_FOR_aarch64_reload_movcptfdi;
7426 case E_V8QImode:
7427 return CODE_FOR_aarch64_reload_movcpv8qidi;
7429 case E_V16QImode:
7430 return CODE_FOR_aarch64_reload_movcpv16qidi;
7432 case E_V4HImode:
7433 return CODE_FOR_aarch64_reload_movcpv4hidi;
7435 case E_V8HImode:
7436 return CODE_FOR_aarch64_reload_movcpv8hidi;
7438 case E_V2SImode:
7439 return CODE_FOR_aarch64_reload_movcpv2sidi;
7441 case E_V4SImode:
7442 return CODE_FOR_aarch64_reload_movcpv4sidi;
7444 case E_V2DImode:
7445 return CODE_FOR_aarch64_reload_movcpv2didi;
7447 case E_V2DFmode:
7448 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7450 default:
7451 gcc_unreachable ();
7454 gcc_unreachable ();
7456 static reg_class_t
7457 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7458 reg_class_t rclass,
7459 machine_mode mode,
7460 secondary_reload_info *sri)
7462 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7463 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7464 comment at the head of aarch64-sve.md for more details about the
7465 big-endian handling. */
7466 if (BYTES_BIG_ENDIAN
7467 && reg_class_subset_p (rclass, FP_REGS)
7468 && !((REG_P (x) && HARD_REGISTER_P (x))
7469 || aarch64_simd_valid_immediate (x, NULL))
7470 && aarch64_sve_data_mode_p (mode))
7472 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7473 return NO_REGS;
7476 /* If we have to disable direct literal pool loads and stores because the
7477 function is too big, then we need a scratch register. */
7478 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7479 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7480 || targetm.vector_mode_supported_p (GET_MODE (x)))
7481 && !aarch64_pcrelative_literal_loads)
7483 sri->icode = aarch64_constant_pool_reload_icode (mode);
7484 return NO_REGS;
7487 /* Without the TARGET_SIMD instructions we cannot move a Q register
7488 to a Q register directly. We need a scratch. */
7489 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7490 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7491 && reg_class_subset_p (rclass, FP_REGS))
7493 if (mode == TFmode)
7494 sri->icode = CODE_FOR_aarch64_reload_movtf;
7495 else if (mode == TImode)
7496 sri->icode = CODE_FOR_aarch64_reload_movti;
7497 return NO_REGS;
7500 /* A TFmode or TImode memory access should be handled via FP_REGS
7501 because AArch64 has richer addressing modes for LDR/STR instructions
7502 than LDP/STP instructions. */
7503 if (TARGET_FLOAT && rclass == GENERAL_REGS
7504 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7505 return FP_REGS;
7507 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7508 return GENERAL_REGS;
7510 return NO_REGS;
7513 static bool
7514 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7516 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7518 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7519 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7520 if (frame_pointer_needed)
7521 return to == HARD_FRAME_POINTER_REGNUM;
7522 return true;
7525 poly_int64
7526 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7528 aarch64_layout_frame ();
7530 if (to == HARD_FRAME_POINTER_REGNUM)
7532 if (from == ARG_POINTER_REGNUM)
7533 return cfun->machine->frame.hard_fp_offset;
7535 if (from == FRAME_POINTER_REGNUM)
7536 return cfun->machine->frame.hard_fp_offset
7537 - cfun->machine->frame.locals_offset;
7540 if (to == STACK_POINTER_REGNUM)
7542 if (from == FRAME_POINTER_REGNUM)
7543 return cfun->machine->frame.frame_size
7544 - cfun->machine->frame.locals_offset;
7547 return cfun->machine->frame.frame_size;
7550 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7551 previous frame. */
7554 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7556 if (count != 0)
7557 return const0_rtx;
7558 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7562 static void
7563 aarch64_asm_trampoline_template (FILE *f)
7565 if (TARGET_ILP32)
7567 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7568 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7570 else
7572 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7573 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7575 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7576 assemble_aligned_integer (4, const0_rtx);
7577 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7578 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
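/* The template therefore occupies 16 bytes of code (three instructions
   plus 4 bytes of zero padding), followed by two pointer-sized data
   slots; aarch64_trampoline_init below stores the target address and
   the static chain value into those slots at offsets 16 and
   16 + POINTER_BYTES.  */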
7581 static void
7582 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7584 rtx fnaddr, mem, a_tramp;
7585 const int tramp_code_sz = 16;
7587 /* We don't need to copy the trailing D-words; we fill those in below. */
7588 emit_block_move (m_tramp, assemble_trampoline_template (),
7589 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7590 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7591 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7592 if (GET_MODE (fnaddr) != ptr_mode)
7593 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7594 emit_move_insn (mem, fnaddr);
7596 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7597 emit_move_insn (mem, chain_value);
7599 /* XXX We should really define a "clear_cache" pattern and use
7600 gen_clear_cache(). */
7601 a_tramp = XEXP (m_tramp, 0);
7602 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7603 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7604 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7605 ptr_mode);
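/* [Editor's illustrative sketch, not part of the original source.] The
   trampoline layout implied by the two functions above: 16 bytes of code
   followed by two pointer-sized data slots, the target function address
   first and the static chain value second.  The constants mirror
   tramp_code_sz and POINTER_BYTES for LP64; ILP32 would use 4-byte slots.
   Guarded with #if 0 so it never enters the build.  */
#if 0
static void
example_trampoline_layout (void)
{
  const int tramp_code_sz = 16;  /* code part emitted by the template */
  const int pointer_bytes = 8;   /* POINTER_BYTES under LP64 */

  int fnaddr_offset = tramp_code_sz;                     /* == 16 */
  int chain_offset  = tramp_code_sz + pointer_bytes;     /* == 24 */
  int total_size    = tramp_code_sz + 2 * pointer_bytes; /* == 32 */

  (void) fnaddr_offset; (void) chain_offset; (void) total_size;
}
#endif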
7608 static unsigned char
7609 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7611 /* ??? Logically we should only need to provide a value when
7612 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7613 can hold MODE, but at the moment we need to handle all modes.
7614 Just ignore any runtime parts for registers that can't store them. */
7615 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7616 unsigned int nregs;
7617 switch (regclass)
7619 case TAILCALL_ADDR_REGS:
7620 case POINTER_REGS:
7621 case GENERAL_REGS:
7622 case ALL_REGS:
7623 case POINTER_AND_FP_REGS:
7624 case FP_REGS:
7625 case FP_LO_REGS:
7626 if (aarch64_sve_data_mode_p (mode)
7627 && constant_multiple_p (GET_MODE_SIZE (mode),
7628 BYTES_PER_SVE_VECTOR, &nregs))
7629 return nregs;
7630 return (aarch64_vector_data_mode_p (mode)
7631 ? CEIL (lowest_size, UNITS_PER_VREG)
7632 : CEIL (lowest_size, UNITS_PER_WORD));
7633 case STACK_REG:
7634 case PR_REGS:
7635 case PR_LO_REGS:
7636 case PR_HI_REGS:
7637 return 1;
7639 case NO_REGS:
7640 return 0;
7642 default:
7643 break;
7645 gcc_unreachable ();
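/* [Editor's illustrative sketch, not part of the original source.] The
   CEIL-style register-count arithmetic used above, assuming LP64 values of
   UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16.  A 16-byte TImode value
   needs two X registers, while a 16-byte vector fits in one Q register.
   Guarded with #if 0 so it never enters the build.  */
#if 0
static unsigned int
example_class_nregs (unsigned int mode_size_bytes, int is_vector_mode)
{
  const unsigned int units_per_word = 8;   /* X register, LP64 */
  const unsigned int units_per_vreg = 16;  /* Q register */
  unsigned int unit = is_vector_mode ? units_per_vreg : units_per_word;

  /* CEIL (size, unit): e.g. (16, 8) -> 2, (16, 16) -> 1.  */
  return (mode_size_bytes + unit - 1) / unit;
}
#endif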
7648 static reg_class_t
7649 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7651 if (regclass == POINTER_REGS)
7652 return GENERAL_REGS;
7654 if (regclass == STACK_REG)
7656 if (REG_P(x)
7657 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7658 return regclass;
7660 return NO_REGS;
7663 /* Register elimination can result in a request for
7664 SP+constant->FP_REGS. We cannot support such operations which
7665 use SP as source and an FP_REG as destination, so reject them
7666 right now. */
7667 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7669 rtx lhs = XEXP (x, 0);
7671 /* Look through a possible SUBREG introduced by ILP32. */
7672 if (GET_CODE (lhs) == SUBREG)
7673 lhs = SUBREG_REG (lhs);
7675 gcc_assert (REG_P (lhs));
7676 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7677 POINTER_REGS));
7678 return NO_REGS;
7681 return regclass;
7684 void
7685 aarch64_asm_output_labelref (FILE* f, const char *name)
7687 asm_fprintf (f, "%U%s", name);
7690 static void
7691 aarch64_elf_asm_constructor (rtx symbol, int priority)
7693 if (priority == DEFAULT_INIT_PRIORITY)
7694 default_ctor_section_asm_out_constructor (symbol, priority);
7695 else
7697 section *s;
7698 /* Although priority is known to be in the range [0, 65535], so that
7699 18 bytes would be enough, the compiler might not know that. To avoid
7700 a -Wformat-truncation false positive, use a larger size. */
7701 char buf[23];
7702 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7703 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7704 switch_to_section (s);
7705 assemble_align (POINTER_SIZE);
7706 assemble_aligned_integer (POINTER_BYTES, symbol);
7710 static void
7711 aarch64_elf_asm_destructor (rtx symbol, int priority)
7713 if (priority == DEFAULT_INIT_PRIORITY)
7714 default_dtor_section_asm_out_destructor (symbol, priority);
7715 else
7717 section *s;
7718 /* Although priority is known to be in the range [0, 65535], so that
7719 18 bytes would be enough, the compiler might not know that. To avoid
7720 a -Wformat-truncation false positive, use a larger size. */
7721 char buf[23];
7722 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7723 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7724 switch_to_section (s);
7725 assemble_align (POINTER_SIZE);
7726 assemble_aligned_integer (POINTER_BYTES, symbol);
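/* [Editor's illustrative sketch, not part of the original source.] The
   section names produced by the snprintf calls above: a constructor with
   priority 101 goes into ".init_array.00101" and a destructor into
   ".fini_array.00101".  The "%.5u" format zero-pads the priority so that
   lexicographic section ordering matches numeric priority order.  The
   helper assumes the usual <stdio.h> snprintf and is guarded with #if 0.  */
#if 0
static void
example_priority_section_name (char *buf, unsigned int bufsize, int priority,
			       int is_ctor)
{
  /* e.g. is_ctor == 1, priority == 101  ->  ".init_array.00101".  */
  snprintf (buf, bufsize, "%s.%.5u",
	    is_ctor ? ".init_array" : ".fini_array", priority);
}
#endif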
7730 const char*
7731 aarch64_output_casesi (rtx *operands)
7733 char buf[100];
7734 char label[100];
7735 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7736 int index;
7737 static const char *const patterns[4][2] =
7740 "ldrb\t%w3, [%0,%w1,uxtw]",
7741 "add\t%3, %4, %w3, sxtb #2"
7744 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7745 "add\t%3, %4, %w3, sxth #2"
7748 "ldr\t%w3, [%0,%w1,uxtw #2]",
7749 "add\t%3, %4, %w3, sxtw #2"
7751 /* We assume that DImode is only generated when not optimizing and
7752 that we don't really need 64-bit address offsets. That would
7753 imply an object file with 8GB of code in a single function! */
7755 "ldr\t%w3, [%0,%w1,uxtw #2]",
7756 "add\t%3, %4, %w3, sxtw #2"
7760 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7762 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7763 index = exact_log2 (GET_MODE_SIZE (mode));
7765 gcc_assert (index >= 0 && index <= 3);
7767 /* Need to implement table size reduction by changing the code below. */
7768 output_asm_insn (patterns[index][0], operands);
7769 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7770 snprintf (buf, sizeof (buf),
7771 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7772 output_asm_insn (buf, operands);
7773 output_asm_insn (patterns[index][1], operands);
7774 output_asm_insn ("br\t%3", operands);
7775 assemble_label (asm_out_file, label);
7776 return "";
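/* [Editor's illustrative sketch, not part of the original source.] How the
   pattern index above is derived from the jump-table element size:
   exact_log2 of the size in bytes selects the row, so 1-byte entries use
   the LDRB/SXTB pair, 2-byte entries the LDRH/SXTH pair and 4-byte (or,
   per the comment above, 8-byte) entries the LDR/SXTW pair.  Guarded with
   #if 0 so it never enters the build.  */
#if 0
static int
example_casesi_pattern_index (unsigned int element_size_bytes)
{
  switch (element_size_bytes)
    {
    case 1: return 0;	/* ldrb + add ..., sxtb #2 */
    case 2: return 1;	/* ldrh + add ..., sxth #2 */
    case 4: return 2;	/* ldr  + add ..., sxtw #2 */
    case 8: return 3;	/* same code as the 4-byte row */
    default: return -1;	/* not a valid ADDR_DIFF_VEC element size */
    }
}
#endif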
7780 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7781 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7782 operator. */
7785 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7787 if (shift >= 0 && shift <= 3)
7789 int size;
7790 for (size = 8; size <= 32; size *= 2)
7792 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7793 if (mask == bits << shift)
7794 return size;
7797 return 0;
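/* [Editor's illustrative sketch, not part of the original source.] A
   standalone restatement of the mask test performed above.  For example, a
   shift of 2 with mask 0x3fc is 0xff << 2 and so corresponds to a UXTB
   (8-bit) extend, while a shift of 1 with mask 0x1fffe is 0xffff << 1,
   i.e. UXTH.  Guarded with #if 0 so it never enters the build.  */
#if 0
static int
example_uxt_size (int shift, unsigned long long mask)
{
  if (shift < 0 || shift > 3)
    return 0;
  for (int size = 8; size <= 32; size *= 2)
    {
      unsigned long long bits = (1ULL << size) - 1;
      if (mask == (bits << shift))
	return size;	/* 8 -> UXTB, 16 -> UXTH, 32 -> UXTW */
    }
  return 0;
}
#endif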
7800 /* Constant pools are per-function only when PC-relative
7801 literal loads are enabled or we are using the large memory
7802 model. */
7804 static inline bool
7805 aarch64_can_use_per_function_literal_pools_p (void)
7807 return (aarch64_pcrelative_literal_loads
7808 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7811 static bool
7812 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7814 /* We can't use blocks for constants when we're using a per-function
7815 constant pool. */
7816 return !aarch64_can_use_per_function_literal_pools_p ();
7819 /* Select appropriate section for constants depending
7820 on where we place literal pools. */
7822 static section *
7823 aarch64_select_rtx_section (machine_mode mode,
7824 rtx x,
7825 unsigned HOST_WIDE_INT align)
7827 if (aarch64_can_use_per_function_literal_pools_p ())
7828 return function_section (current_function_decl);
7830 return default_elf_select_rtx_section (mode, x, align);
7833 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7834 void
7835 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7836 HOST_WIDE_INT offset)
7838 /* When using per-function literal pools, we must ensure that any code
7839 section is aligned to the minimal instruction length, lest we get
7840 errors from the assembler about "unaligned instructions". */
7841 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7842 ASM_OUTPUT_ALIGN (f, 2);
7845 /* Costs. */
7847 /* Helper function for rtx cost calculation. Strip a shift expression
7848 from X. Returns the inner operand if successful, or the original
7849 expression on failure. */
7850 static rtx
7851 aarch64_strip_shift (rtx x)
7853 rtx op = x;
7855 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7856 we can convert both to ROR during final output. */
7857 if ((GET_CODE (op) == ASHIFT
7858 || GET_CODE (op) == ASHIFTRT
7859 || GET_CODE (op) == LSHIFTRT
7860 || GET_CODE (op) == ROTATERT
7861 || GET_CODE (op) == ROTATE)
7862 && CONST_INT_P (XEXP (op, 1)))
7863 return XEXP (op, 0);
7865 if (GET_CODE (op) == MULT
7866 && CONST_INT_P (XEXP (op, 1))
7867 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7868 return XEXP (op, 0);
7870 return x;
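/* [Editor's illustrative sketch, not part of the original source.] The
   multiply-as-shift equivalence the helper above relies on when it strips
   a MULT by a constant power of two: x * 2^n is the same value as x << n,
   which is why (mult x 8) can be stripped or costed like (ashift x 3).
   Guarded with #if 0 so it never enters the build.  */
#if 0
static int
example_mult_is_shift (unsigned long long x, unsigned int log2_factor)
{
  /* Holds for any log2_factor below the operand width (64 here).  */
  return (x * (1ULL << log2_factor)) == (x << log2_factor);
}
#endif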
7873 /* Helper function for rtx cost calculation. Strip an extend
7874 expression from X. Returns the inner operand if successful, or the
7875 original expression on failure. We deal with a number of possible
7876 canonicalization variations here. If STRIP_SHIFT is true, then
7877 we can strip off a shift also. */
7878 static rtx
7879 aarch64_strip_extend (rtx x, bool strip_shift)
7881 scalar_int_mode mode;
7882 rtx op = x;
7884 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7885 return op;
7887 /* Zero and sign extraction of a widened value. */
7888 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7889 && XEXP (op, 2) == const0_rtx
7890 && GET_CODE (XEXP (op, 0)) == MULT
7891 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7892 XEXP (op, 1)))
7893 return XEXP (XEXP (op, 0), 0);
7895 /* It can also be represented (for zero-extend) as an AND with an
7896 immediate. */
7897 if (GET_CODE (op) == AND
7898 && GET_CODE (XEXP (op, 0)) == MULT
7899 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7900 && CONST_INT_P (XEXP (op, 1))
7901 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7902 INTVAL (XEXP (op, 1))) != 0)
7903 return XEXP (XEXP (op, 0), 0);
7905 /* Now handle extended register, as this may also have an optional
7906 left shift by 1..4. */
7907 if (strip_shift
7908 && GET_CODE (op) == ASHIFT
7909 && CONST_INT_P (XEXP (op, 1))
7910 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7911 op = XEXP (op, 0);
7913 if (GET_CODE (op) == ZERO_EXTEND
7914 || GET_CODE (op) == SIGN_EXTEND)
7915 op = XEXP (op, 0);
7917 if (op != x)
7918 return op;
7920 return x;
7923 /* Return true iff CODE is a shift supported in combination
7924 with arithmetic instructions. */
7926 static bool
7927 aarch64_shift_p (enum rtx_code code)
7929 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7933 /* Return true iff X is a cheap shift without a sign extend. */
7935 static bool
7936 aarch64_cheap_mult_shift_p (rtx x)
7938 rtx op0, op1;
7940 op0 = XEXP (x, 0);
7941 op1 = XEXP (x, 1);
7943 if (!(aarch64_tune_params.extra_tuning_flags
7944 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7945 return false;
7947 if (GET_CODE (op0) == SIGN_EXTEND)
7948 return false;
7950 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7951 && UINTVAL (op1) <= 4)
7952 return true;
7954 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7955 return false;
7957 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7959 if (l2 > 0 && l2 <= 4)
7960 return true;
7962 return false;
7965 /* Helper function for rtx cost calculation. Calculate the cost of
7966 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7967 Return the calculated cost of the expression, recursing manually into
7968 operands where needed. */
7970 static int
7971 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7973 rtx op0, op1;
7974 const struct cpu_cost_table *extra_cost
7975 = aarch64_tune_params.insn_extra_cost;
7976 int cost = 0;
7977 bool compound_p = (outer == PLUS || outer == MINUS);
7978 machine_mode mode = GET_MODE (x);
7980 gcc_checking_assert (code == MULT);
7982 op0 = XEXP (x, 0);
7983 op1 = XEXP (x, 1);
7985 if (VECTOR_MODE_P (mode))
7986 mode = GET_MODE_INNER (mode);
7988 /* Integer multiply/fma. */
7989 if (GET_MODE_CLASS (mode) == MODE_INT)
7991 /* The multiply will be canonicalized as a shift, cost it as such. */
7992 if (aarch64_shift_p (GET_CODE (x))
7993 || (CONST_INT_P (op1)
7994 && exact_log2 (INTVAL (op1)) > 0))
7996 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7997 || GET_CODE (op0) == SIGN_EXTEND;
7998 if (speed)
8000 if (compound_p)
8002 /* If the shift is considered cheap,
8003 then don't add any cost. */
8004 if (aarch64_cheap_mult_shift_p (x))
8006 else if (REG_P (op1))
8007 /* ARITH + shift-by-register. */
8008 cost += extra_cost->alu.arith_shift_reg;
8009 else if (is_extend)
8010 /* ARITH + extended register. We don't have a cost field
8011 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8012 cost += extra_cost->alu.extend_arith;
8013 else
8014 /* ARITH + shift-by-immediate. */
8015 cost += extra_cost->alu.arith_shift;
8017 else
8018 /* LSL (immediate). */
8019 cost += extra_cost->alu.shift;
8022 /* Strip extends as we will have costed them in the case above. */
8023 if (is_extend)
8024 op0 = aarch64_strip_extend (op0, true);
8026 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8028 return cost;
8031 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8032 compound and let the below cases handle it. After all, MNEG is a
8033 special-case alias of MSUB. */
8034 if (GET_CODE (op0) == NEG)
8036 op0 = XEXP (op0, 0);
8037 compound_p = true;
8040 /* Integer multiplies or FMAs have zero/sign extending variants. */
8041 if ((GET_CODE (op0) == ZERO_EXTEND
8042 && GET_CODE (op1) == ZERO_EXTEND)
8043 || (GET_CODE (op0) == SIGN_EXTEND
8044 && GET_CODE (op1) == SIGN_EXTEND))
8046 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8047 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8049 if (speed)
8051 if (compound_p)
8052 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8053 cost += extra_cost->mult[0].extend_add;
8054 else
8055 /* MUL/SMULL/UMULL. */
8056 cost += extra_cost->mult[0].extend;
8059 return cost;
8062 /* This is either an integer multiply or a MADD. In both cases
8063 we want to recurse and cost the operands. */
8064 cost += rtx_cost (op0, mode, MULT, 0, speed);
8065 cost += rtx_cost (op1, mode, MULT, 1, speed);
8067 if (speed)
8069 if (compound_p)
8070 /* MADD/MSUB. */
8071 cost += extra_cost->mult[mode == DImode].add;
8072 else
8073 /* MUL. */
8074 cost += extra_cost->mult[mode == DImode].simple;
8077 return cost;
8079 else
8081 if (speed)
8083 /* Floating-point FMA/FMUL can also support negations of the
8084 operands, unless the rounding mode is upward or downward in
8085 which case FNMUL differs from FMUL with operand negation. */
8086 bool neg0 = GET_CODE (op0) == NEG;
8087 bool neg1 = GET_CODE (op1) == NEG;
8088 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8090 if (neg0)
8091 op0 = XEXP (op0, 0);
8092 if (neg1)
8093 op1 = XEXP (op1, 0);
8096 if (compound_p)
8097 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8098 cost += extra_cost->fp[mode == DFmode].fma;
8099 else
8100 /* FMUL/FNMUL. */
8101 cost += extra_cost->fp[mode == DFmode].mult;
8104 cost += rtx_cost (op0, mode, MULT, 0, speed);
8105 cost += rtx_cost (op1, mode, MULT, 1, speed);
8106 return cost;
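/* [Editor's illustrative sketch, not part of the original source.] How the
   widening multiply-add branch above composes a speed cost: for something
   shaped like (plus (mult (sign_extend a) (sign_extend b)) c), i.e. an
   SMADDL, the cost is the cost of the two inner multiply operands plus the
   tuning table's extend_add entry.  The table value below is an invented
   placeholder, not a real tuning number.  Guarded with #if 0.  */
#if 0
static int
example_smaddl_cost (int cost_of_op0, int cost_of_op1)
{
  int extend_add_cost = 3;  /* hypothetical extra_cost->mult[0].extend_add */
  return cost_of_op0 + cost_of_op1 + extend_add_cost;
}
#endif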
8110 static int
8111 aarch64_address_cost (rtx x,
8112 machine_mode mode,
8113 addr_space_t as ATTRIBUTE_UNUSED,
8114 bool speed)
8116 enum rtx_code c = GET_CODE (x);
8117 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8118 struct aarch64_address_info info;
8119 int cost = 0;
8120 info.shift = 0;
8122 if (!aarch64_classify_address (&info, x, mode, false))
8124 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8126 /* This is a CONST or SYMBOL ref which will be split
8127 in a different way depending on the code model in use.
8128 Cost it through the generic infrastructure. */
8129 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8130 /* Divide through by the cost of one instruction to
8131 bring it to the same units as the address costs. */
8132 cost_symbol_ref /= COSTS_N_INSNS (1);
8133 /* The cost is then the cost of preparing the address,
8134 followed by an immediate (possibly 0) offset. */
8135 return cost_symbol_ref + addr_cost->imm_offset;
8137 else
8139 /* This is most likely a jump table from a case
8140 statement. */
8141 return addr_cost->register_offset;
8145 switch (info.type)
8147 case ADDRESS_LO_SUM:
8148 case ADDRESS_SYMBOLIC:
8149 case ADDRESS_REG_IMM:
8150 cost += addr_cost->imm_offset;
8151 break;
8153 case ADDRESS_REG_WB:
8154 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8155 cost += addr_cost->pre_modify;
8156 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8157 cost += addr_cost->post_modify;
8158 else
8159 gcc_unreachable ();
8161 break;
8163 case ADDRESS_REG_REG:
8164 cost += addr_cost->register_offset;
8165 break;
8167 case ADDRESS_REG_SXTW:
8168 cost += addr_cost->register_sextend;
8169 break;
8171 case ADDRESS_REG_UXTW:
8172 cost += addr_cost->register_zextend;
8173 break;
8175 default:
8176 gcc_unreachable ();
8180 if (info.shift > 0)
8182 /* For the sake of calculating the cost of the shifted register
8183 component, we can treat same-sized modes in the same way. */
8184 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8185 cost += addr_cost->addr_scale_costs.hi;
8186 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8187 cost += addr_cost->addr_scale_costs.si;
8188 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8189 cost += addr_cost->addr_scale_costs.di;
8190 else
8191 /* We can't tell, or this is a 128-bit vector. */
8192 cost += addr_cost->addr_scale_costs.ti;
8195 return cost;
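/* [Editor's illustrative sketch, not part of the original source.] How the
   pieces above add up for a scaled, sign-extended register-offset address
   such as [x0, w1, sxtw #2] used for an SImode access: the
   register_sextend cost for the extended index plus the
   addr_scale_costs.si cost for the #2 scaling.  The numbers below are
   invented placeholders, not real tuning values.  Guarded with #if 0.  */
#if 0
static int
example_scaled_sxtw_address_cost (void)
{
  int register_sextend = 1;  /* hypothetical addr_cost->register_sextend */
  int scale_si = 1;          /* hypothetical addr_scale_costs.si */
  return register_sextend + scale_si;
}
#endif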
8198 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8199 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8200 to be taken. */
8203 aarch64_branch_cost (bool speed_p, bool predictable_p)
8205 /* When optimizing for speed, use the cost of unpredictable branches. */
8206 const struct cpu_branch_cost *branch_costs =
8207 aarch64_tune_params.branch_costs;
8209 if (!speed_p || predictable_p)
8210 return branch_costs->predictable;
8211 else
8212 return branch_costs->unpredictable;
8215 /* Return true if the RTX X in mode MODE is a zero or sign extract
8216 usable in an ADD or SUB (extended register) instruction. */
8217 static bool
8218 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8220 /* Catch add with a sign extract.
8221 This is add_<optab><mode>_multp2. */
8222 if (GET_CODE (x) == SIGN_EXTRACT
8223 || GET_CODE (x) == ZERO_EXTRACT)
8225 rtx op0 = XEXP (x, 0);
8226 rtx op1 = XEXP (x, 1);
8227 rtx op2 = XEXP (x, 2);
8229 if (GET_CODE (op0) == MULT
8230 && CONST_INT_P (op1)
8231 && op2 == const0_rtx
8232 && CONST_INT_P (XEXP (op0, 1))
8233 && aarch64_is_extend_from_extract (mode,
8234 XEXP (op0, 1),
8235 op1))
8237 return true;
8240 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8241 No shift. */
8242 else if (GET_CODE (x) == SIGN_EXTEND
8243 || GET_CODE (x) == ZERO_EXTEND)
8244 return REG_P (XEXP (x, 0));
8246 return false;
8249 static bool
8250 aarch64_frint_unspec_p (unsigned int u)
8252 switch (u)
8254 case UNSPEC_FRINTZ:
8255 case UNSPEC_FRINTP:
8256 case UNSPEC_FRINTM:
8257 case UNSPEC_FRINTA:
8258 case UNSPEC_FRINTN:
8259 case UNSPEC_FRINTX:
8260 case UNSPEC_FRINTI:
8261 return true;
8263 default:
8264 return false;
8268 /* Return true iff X is an rtx that will match an extr instruction
8269 i.e. as described in the *extr<mode>5_insn family of patterns.
8270 OP0 and OP1 will be set to the operands of the shifts involved
8271 on success and will be NULL_RTX otherwise. */
8273 static bool
8274 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8276 rtx op0, op1;
8277 scalar_int_mode mode;
8278 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8279 return false;
8281 *res_op0 = NULL_RTX;
8282 *res_op1 = NULL_RTX;
8284 if (GET_CODE (x) != IOR)
8285 return false;
8287 op0 = XEXP (x, 0);
8288 op1 = XEXP (x, 1);
8290 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8291 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8293 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8294 if (GET_CODE (op1) == ASHIFT)
8295 std::swap (op0, op1);
8297 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8298 return false;
8300 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8301 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8303 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8304 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8306 *res_op0 = XEXP (op0, 0);
8307 *res_op1 = XEXP (op1, 0);
8308 return true;
8312 return false;
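/* [Editor's illustrative sketch, not part of the original source.] The
   value computed by the IOR of two complementary shifts that the predicate
   above recognises.  For 64-bit operands, (a << n) | (b >> (64 - n)) is
   what an EXTR with immediate (64 - n) extracts from the concatenation of
   the two registers, which is why the two shift amounts must sum to the
   mode's bit size.  Guarded with #if 0 so it never enters the build.  */
#if 0
static unsigned long long
example_extr_value (unsigned long long a, unsigned long long b, unsigned n)
{
  /* Valid for 0 < n < 64; n == 0 or n == 64 would be a plain move.  */
  return (a << n) | (b >> (64 - n));
}
#endif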
8315 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8316 storing it in *COST. Result is true if the total cost of the operation
8317 has now been calculated. */
8318 static bool
8319 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8321 rtx inner;
8322 rtx comparator;
8323 enum rtx_code cmpcode;
8325 if (COMPARISON_P (op0))
8327 inner = XEXP (op0, 0);
8328 comparator = XEXP (op0, 1);
8329 cmpcode = GET_CODE (op0);
8331 else
8333 inner = op0;
8334 comparator = const0_rtx;
8335 cmpcode = NE;
8338 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8340 /* Conditional branch. */
8341 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8342 return true;
8343 else
8345 if (cmpcode == NE || cmpcode == EQ)
8347 if (comparator == const0_rtx)
8349 /* TBZ/TBNZ/CBZ/CBNZ. */
8350 if (GET_CODE (inner) == ZERO_EXTRACT)
8351 /* TBZ/TBNZ. */
8352 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8353 ZERO_EXTRACT, 0, speed);
8354 else
8355 /* CBZ/CBNZ. */
8356 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8358 return true;
8361 else if (cmpcode == LT || cmpcode == GE)
8363 /* TBZ/TBNZ. */
8364 if (comparator == const0_rtx)
8365 return true;
8369 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8371 /* CCMP. */
8372 if (GET_CODE (op1) == COMPARE)
8374 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8375 if (XEXP (op1, 1) == const0_rtx)
8376 *cost += 1;
8377 if (speed)
8379 machine_mode mode = GET_MODE (XEXP (op1, 0));
8380 const struct cpu_cost_table *extra_cost
8381 = aarch64_tune_params.insn_extra_cost;
8383 if (GET_MODE_CLASS (mode) == MODE_INT)
8384 *cost += extra_cost->alu.arith;
8385 else
8386 *cost += extra_cost->fp[mode == DFmode].compare;
8388 return true;
8391 /* It's a conditional operation based on the status flags,
8392 so it must be some flavor of CSEL. */
8394 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8395 if (GET_CODE (op1) == NEG
8396 || GET_CODE (op1) == NOT
8397 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8398 op1 = XEXP (op1, 0);
8399 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8401 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8402 op1 = XEXP (op1, 0);
8403 op2 = XEXP (op2, 0);
8406 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8407 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8408 return true;
8411 /* We don't know what this is, so cost all operands. */
8412 return false;
8415 /* Check whether X is a bitfield operation of the form shift + extend that
8416 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8417 operand to which the bitfield operation is applied. Otherwise return
8418 NULL_RTX. */
8420 static rtx
8421 aarch64_extend_bitfield_pattern_p (rtx x)
8423 rtx_code outer_code = GET_CODE (x);
8424 machine_mode outer_mode = GET_MODE (x);
8426 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8427 && outer_mode != SImode && outer_mode != DImode)
8428 return NULL_RTX;
8430 rtx inner = XEXP (x, 0);
8431 rtx_code inner_code = GET_CODE (inner);
8432 machine_mode inner_mode = GET_MODE (inner);
8433 rtx op = NULL_RTX;
8435 switch (inner_code)
8437 case ASHIFT:
8438 if (CONST_INT_P (XEXP (inner, 1))
8439 && (inner_mode == QImode || inner_mode == HImode))
8440 op = XEXP (inner, 0);
8441 break;
8442 case LSHIFTRT:
8443 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8444 && (inner_mode == QImode || inner_mode == HImode))
8445 op = XEXP (inner, 0);
8446 break;
8447 case ASHIFTRT:
8448 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8449 && (inner_mode == QImode || inner_mode == HImode))
8450 op = XEXP (inner, 0);
8451 break;
8452 default:
8453 break;
8456 return op;
8459 /* Return true if the mask and a shift amount from an RTX of the form
8460 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8461 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8463 bool
8464 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8465 rtx shft_amnt)
8467 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8468 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8469 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8470 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
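/* [Editor's illustrative sketch, not part of the original source.] The two
   conditions checked above, restated on plain integers.  For
   (x << 8) & 0x0000ff00 the mask shifted right by the shift amount is
   0xff, one less than a power of two, and the mask has no bits set below
   the shift amount, so the pair qualifies; a mask of 0x0000ff01 would fail
   the second test.  Guarded with #if 0 so it never enters the build.  */
#if 0
static int
example_ubfiz_ok (unsigned long long mask, unsigned int shift)
{
  unsigned long long field = (mask >> shift) + 1;
  int field_is_pow2 = field != 0 && (field & (field - 1)) == 0;
  int no_low_bits = (mask & ((1ULL << shift) - 1)) == 0;
  return field_is_pow2 && no_low_bits;
}
#endif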
8473 /* Calculate the cost of calculating X, storing it in *COST. Result
8474 is true if the total cost of the operation has now been calculated. */
8475 static bool
8476 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8477 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8479 rtx op0, op1, op2;
8480 const struct cpu_cost_table *extra_cost
8481 = aarch64_tune_params.insn_extra_cost;
8482 int code = GET_CODE (x);
8483 scalar_int_mode int_mode;
8485 /* By default, assume that everything has equivalent cost to the
8486 cheapest instruction. Any additional costs are applied as a delta
8487 above this default. */
8488 *cost = COSTS_N_INSNS (1);
8490 switch (code)
8492 case SET:
8493 /* The cost depends entirely on the operands to SET. */
8494 *cost = 0;
8495 op0 = SET_DEST (x);
8496 op1 = SET_SRC (x);
8498 switch (GET_CODE (op0))
8500 case MEM:
8501 if (speed)
8503 rtx address = XEXP (op0, 0);
8504 if (VECTOR_MODE_P (mode))
8505 *cost += extra_cost->ldst.storev;
8506 else if (GET_MODE_CLASS (mode) == MODE_INT)
8507 *cost += extra_cost->ldst.store;
8508 else if (mode == SFmode)
8509 *cost += extra_cost->ldst.storef;
8510 else if (mode == DFmode)
8511 *cost += extra_cost->ldst.stored;
8513 *cost +=
8514 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8515 0, speed));
8518 *cost += rtx_cost (op1, mode, SET, 1, speed);
8519 return true;
8521 case SUBREG:
8522 if (! REG_P (SUBREG_REG (op0)))
8523 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8525 /* Fall through. */
8526 case REG:
8527 /* The cost is one per vector-register copied. */
8528 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8530 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8531 *cost = COSTS_N_INSNS (nregs);
8533 /* const0_rtx is in general free, but we will use an
8534 instruction to set a register to 0. */
8535 else if (REG_P (op1) || op1 == const0_rtx)
8537 /* The cost is 1 per register copied. */
8538 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8539 *cost = COSTS_N_INSNS (nregs);
8541 else
8542 /* Cost is just the cost of the RHS of the set. */
8543 *cost += rtx_cost (op1, mode, SET, 1, speed);
8544 return true;
8546 case ZERO_EXTRACT:
8547 case SIGN_EXTRACT:
8548 /* Bit-field insertion. Strip any redundant widening of
8549 the RHS to meet the width of the target. */
8550 if (GET_CODE (op1) == SUBREG)
8551 op1 = SUBREG_REG (op1);
8552 if ((GET_CODE (op1) == ZERO_EXTEND
8553 || GET_CODE (op1) == SIGN_EXTEND)
8554 && CONST_INT_P (XEXP (op0, 1))
8555 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8556 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8557 op1 = XEXP (op1, 0);
8559 if (CONST_INT_P (op1))
8561 /* MOV immediate is assumed to always be cheap. */
8562 *cost = COSTS_N_INSNS (1);
8564 else
8566 /* BFM. */
8567 if (speed)
8568 *cost += extra_cost->alu.bfi;
8569 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8572 return true;
8574 default:
8575 /* We can't make sense of this, assume default cost. */
8576 *cost = COSTS_N_INSNS (1);
8577 return false;
8579 return false;
8581 case CONST_INT:
8582 /* If an instruction can incorporate a constant within the
8583 instruction, the instruction's expression avoids calling
8584 rtx_cost() on the constant. If rtx_cost() is called on a
8585 constant, then it is usually because the constant must be
8586 moved into a register by one or more instructions.
8588 The exception is constant 0, which can be expressed
8589 as XZR/WZR and is therefore free. The exception to this is
8590 if we have (set (reg) (const0_rtx)), in which case we must cost
8591 the move. However, we can catch that when we cost the SET, so
8592 we don't need to consider that here. */
8593 if (x == const0_rtx)
8594 *cost = 0;
8595 else
8597 /* To an approximation, the cost of building any other constant
8598 is proportional to the number of instructions required to
8599 build that constant. This is true whether we are compiling
8600 for SPEED or otherwise. */
8601 if (!is_a <scalar_int_mode> (mode, &int_mode))
8602 int_mode = word_mode;
8603 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8604 (NULL_RTX, x, false, int_mode));
8606 return true;
8608 case CONST_DOUBLE:
8610 /* First determine number of instructions to do the move
8611 as an integer constant. */
8612 if (!aarch64_float_const_representable_p (x)
8613 && !aarch64_can_const_movi_rtx_p (x, mode)
8614 && aarch64_float_const_rtx_p (x))
8616 unsigned HOST_WIDE_INT ival;
8617 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8618 gcc_assert (succeed);
8620 scalar_int_mode imode = (mode == HFmode
8621 ? SImode
8622 : int_mode_for_mode (mode).require ());
8623 int ncost = aarch64_internal_mov_immediate
8624 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8625 *cost += COSTS_N_INSNS (ncost);
8626 return true;
8629 if (speed)
8631 /* mov[df,sf]_aarch64. */
8632 if (aarch64_float_const_representable_p (x))
8633 /* FMOV (scalar immediate). */
8634 *cost += extra_cost->fp[mode == DFmode].fpconst;
8635 else if (!aarch64_float_const_zero_rtx_p (x))
8637 /* This will be a load from memory. */
8638 if (mode == DFmode)
8639 *cost += extra_cost->ldst.loadd;
8640 else
8641 *cost += extra_cost->ldst.loadf;
8643 else
8644 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8645 or MOV v0.s[0], wzr - neither of which is modeled by the
8646 cost tables. Just use the default cost. */
8651 return true;
8653 case MEM:
8654 if (speed)
8656 /* For loads we want the base cost of a load, plus an
8657 approximation for the additional cost of the addressing
8658 mode. */
8659 rtx address = XEXP (x, 0);
8660 if (VECTOR_MODE_P (mode))
8661 *cost += extra_cost->ldst.loadv;
8662 else if (GET_MODE_CLASS (mode) == MODE_INT)
8663 *cost += extra_cost->ldst.load;
8664 else if (mode == SFmode)
8665 *cost += extra_cost->ldst.loadf;
8666 else if (mode == DFmode)
8667 *cost += extra_cost->ldst.loadd;
8669 *cost +=
8670 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8671 0, speed));
8674 return true;
8676 case NEG:
8677 op0 = XEXP (x, 0);
8679 if (VECTOR_MODE_P (mode))
8681 if (speed)
8683 /* FNEG. */
8684 *cost += extra_cost->vect.alu;
8686 return false;
8689 if (GET_MODE_CLASS (mode) == MODE_INT)
8691 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8692 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8694 /* CSETM. */
8695 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8696 return true;
8699 /* Cost this as SUB wzr, X. */
8700 op0 = CONST0_RTX (mode);
8701 op1 = XEXP (x, 0);
8702 goto cost_minus;
8705 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8707 /* Support (neg(fma...)) as a single instruction only if
8708 sign of zeros is unimportant. This matches the decision
8709 making in aarch64.md. */
8710 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8712 /* FNMADD. */
8713 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8714 return true;
8716 if (GET_CODE (op0) == MULT)
8718 /* FNMUL. */
8719 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8720 return true;
8722 if (speed)
8723 /* FNEG. */
8724 *cost += extra_cost->fp[mode == DFmode].neg;
8725 return false;
8728 return false;
8730 case CLRSB:
8731 case CLZ:
8732 if (speed)
8734 if (VECTOR_MODE_P (mode))
8735 *cost += extra_cost->vect.alu;
8736 else
8737 *cost += extra_cost->alu.clz;
8740 return false;
8742 case COMPARE:
8743 op0 = XEXP (x, 0);
8744 op1 = XEXP (x, 1);
8746 if (op1 == const0_rtx
8747 && GET_CODE (op0) == AND)
8749 x = op0;
8750 mode = GET_MODE (op0);
8751 goto cost_logic;
8754 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8756 /* TODO: A write to the CC flags possibly costs extra; this
8757 needs encoding in the cost tables. */
8759 mode = GET_MODE (op0);
8760 /* ANDS. */
8761 if (GET_CODE (op0) == AND)
8763 x = op0;
8764 goto cost_logic;
8767 if (GET_CODE (op0) == PLUS)
8769 /* ADDS (and CMN alias). */
8770 x = op0;
8771 goto cost_plus;
8774 if (GET_CODE (op0) == MINUS)
8776 /* SUBS. */
8777 x = op0;
8778 goto cost_minus;
8781 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8782 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8783 && CONST_INT_P (XEXP (op0, 2)))
8785 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8786 Handle it here directly rather than going to cost_logic
8787 since we know the immediate generated for the TST is valid,
8788 which lets us avoid creating an intermediate rtx for it only
8789 for costing purposes. */
8790 if (speed)
8791 *cost += extra_cost->alu.logical;
8793 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8794 ZERO_EXTRACT, 0, speed);
8795 return true;
8798 if (GET_CODE (op1) == NEG)
8800 /* CMN. */
8801 if (speed)
8802 *cost += extra_cost->alu.arith;
8804 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8805 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8806 return true;
8809 /* CMP.
8811 Compare can freely swap the order of operands, and
8812 canonicalization puts the more complex operation first.
8813 But the integer MINUS logic expects the shift/extend
8814 operation in op1. */
8815 if (! (REG_P (op0)
8816 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8818 op0 = XEXP (x, 1);
8819 op1 = XEXP (x, 0);
8821 goto cost_minus;
8824 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8826 /* FCMP. */
8827 if (speed)
8828 *cost += extra_cost->fp[mode == DFmode].compare;
8830 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8832 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8833 /* FCMP supports constant 0.0 for no extra cost. */
8834 return true;
8836 return false;
8839 if (VECTOR_MODE_P (mode))
8841 /* Vector compare. */
8842 if (speed)
8843 *cost += extra_cost->vect.alu;
8845 if (aarch64_float_const_zero_rtx_p (op1))
8847 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8848 cost. */
8849 return true;
8851 return false;
8853 return false;
8855 case MINUS:
8857 op0 = XEXP (x, 0);
8858 op1 = XEXP (x, 1);
8860 cost_minus:
8861 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8863 /* Detect valid immediates. */
8864 if ((GET_MODE_CLASS (mode) == MODE_INT
8865 || (GET_MODE_CLASS (mode) == MODE_CC
8866 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8867 && CONST_INT_P (op1)
8868 && aarch64_uimm12_shift (INTVAL (op1)))
8870 if (speed)
8871 /* SUB(S) (immediate). */
8872 *cost += extra_cost->alu.arith;
8873 return true;
8876 /* Look for SUB (extended register). */
8877 if (is_a <scalar_int_mode> (mode, &int_mode)
8878 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8880 if (speed)
8881 *cost += extra_cost->alu.extend_arith;
8883 op1 = aarch64_strip_extend (op1, true);
8884 *cost += rtx_cost (op1, VOIDmode,
8885 (enum rtx_code) GET_CODE (op1), 0, speed);
8886 return true;
8889 rtx new_op1 = aarch64_strip_extend (op1, false);
8891 /* Cost this as an FMA-alike operation. */
8892 if ((GET_CODE (new_op1) == MULT
8893 || aarch64_shift_p (GET_CODE (new_op1)))
8894 && code != COMPARE)
8896 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8897 (enum rtx_code) code,
8898 speed);
8899 return true;
8902 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8904 if (speed)
8906 if (VECTOR_MODE_P (mode))
8908 /* Vector SUB. */
8909 *cost += extra_cost->vect.alu;
8911 else if (GET_MODE_CLASS (mode) == MODE_INT)
8913 /* SUB(S). */
8914 *cost += extra_cost->alu.arith;
8916 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8918 /* FSUB. */
8919 *cost += extra_cost->fp[mode == DFmode].addsub;
8922 return true;
8925 case PLUS:
8927 rtx new_op0;
8929 op0 = XEXP (x, 0);
8930 op1 = XEXP (x, 1);
8932 cost_plus:
8933 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8934 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8936 /* CSINC. */
8937 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8938 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8939 return true;
8942 if (GET_MODE_CLASS (mode) == MODE_INT
8943 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8944 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8946 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8948 if (speed)
8949 /* ADD (immediate). */
8950 *cost += extra_cost->alu.arith;
8951 return true;
8954 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8956 /* Look for ADD (extended register). */
8957 if (is_a <scalar_int_mode> (mode, &int_mode)
8958 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8960 if (speed)
8961 *cost += extra_cost->alu.extend_arith;
8963 op0 = aarch64_strip_extend (op0, true);
8964 *cost += rtx_cost (op0, VOIDmode,
8965 (enum rtx_code) GET_CODE (op0), 0, speed);
8966 return true;
8969 /* Strip any extend but leave shifts behind, as we will
8970 cost them through mult_cost. */
8971 new_op0 = aarch64_strip_extend (op0, false);
8973 if (GET_CODE (new_op0) == MULT
8974 || aarch64_shift_p (GET_CODE (new_op0)))
8976 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8977 speed);
8978 return true;
8981 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8983 if (speed)
8985 if (VECTOR_MODE_P (mode))
8987 /* Vector ADD. */
8988 *cost += extra_cost->vect.alu;
8990 else if (GET_MODE_CLASS (mode) == MODE_INT)
8992 /* ADD. */
8993 *cost += extra_cost->alu.arith;
8995 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8997 /* FADD. */
8998 *cost += extra_cost->fp[mode == DFmode].addsub;
9001 return true;
9004 case BSWAP:
9005 *cost = COSTS_N_INSNS (1);
9007 if (speed)
9009 if (VECTOR_MODE_P (mode))
9010 *cost += extra_cost->vect.alu;
9011 else
9012 *cost += extra_cost->alu.rev;
9014 return false;
9016 case IOR:
9017 if (aarch_rev16_p (x))
9019 *cost = COSTS_N_INSNS (1);
9021 if (speed)
9023 if (VECTOR_MODE_P (mode))
9024 *cost += extra_cost->vect.alu;
9025 else
9026 *cost += extra_cost->alu.rev;
9028 return true;
9031 if (aarch64_extr_rtx_p (x, &op0, &op1))
9033 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9034 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9035 if (speed)
9036 *cost += extra_cost->alu.shift;
9038 return true;
9040 /* Fall through. */
9041 case XOR:
9042 case AND:
9043 cost_logic:
9044 op0 = XEXP (x, 0);
9045 op1 = XEXP (x, 1);
9047 if (VECTOR_MODE_P (mode))
9049 if (speed)
9050 *cost += extra_cost->vect.alu;
9051 return true;
9054 if (code == AND
9055 && GET_CODE (op0) == MULT
9056 && CONST_INT_P (XEXP (op0, 1))
9057 && CONST_INT_P (op1)
9058 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9059 INTVAL (op1)) != 0)
9061 /* This is a UBFM/SBFM. */
9062 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9063 if (speed)
9064 *cost += extra_cost->alu.bfx;
9065 return true;
9068 if (is_int_mode (mode, &int_mode))
9070 if (CONST_INT_P (op1))
9072 /* We have a mask + shift version of a UBFIZ
9073 i.e. the *andim_ashift<mode>_bfiz pattern. */
9074 if (GET_CODE (op0) == ASHIFT
9075 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9076 XEXP (op0, 1)))
9078 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9079 (enum rtx_code) code, 0, speed);
9080 if (speed)
9081 *cost += extra_cost->alu.bfx;
9083 return true;
9085 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9087 /* We possibly get the immediate for free; this is not
9088 modelled. */
9089 *cost += rtx_cost (op0, int_mode,
9090 (enum rtx_code) code, 0, speed);
9091 if (speed)
9092 *cost += extra_cost->alu.logical;
9094 return true;
9097 else
9099 rtx new_op0 = op0;
9101 /* Handle ORN, EON, or BIC. */
9102 if (GET_CODE (op0) == NOT)
9103 op0 = XEXP (op0, 0);
9105 new_op0 = aarch64_strip_shift (op0);
9107 /* If we had a shift on op0 then this is a logical-shift-
9108 by-register/immediate operation. Otherwise, this is just
9109 a logical operation. */
9110 if (speed)
9112 if (new_op0 != op0)
9114 /* Shift by immediate. */
9115 if (CONST_INT_P (XEXP (op0, 1)))
9116 *cost += extra_cost->alu.log_shift;
9117 else
9118 *cost += extra_cost->alu.log_shift_reg;
9120 else
9121 *cost += extra_cost->alu.logical;
9124 /* In both cases we want to cost both operands. */
9125 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9126 0, speed);
9127 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9128 1, speed);
9130 return true;
9133 return false;
9135 case NOT:
9136 x = XEXP (x, 0);
9137 op0 = aarch64_strip_shift (x);
9139 if (VECTOR_MODE_P (mode))
9141 /* Vector NOT. */
9142 *cost += extra_cost->vect.alu;
9143 return false;
9146 /* MVN-shifted-reg. */
9147 if (op0 != x)
9149 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9151 if (speed)
9152 *cost += extra_cost->alu.log_shift;
9154 return true;
9156 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9157 Handle the second form here, taking care that 'a' above can
9158 be a shift. */
9159 else if (GET_CODE (op0) == XOR)
9161 rtx newop0 = XEXP (op0, 0);
9162 rtx newop1 = XEXP (op0, 1);
9163 rtx op0_stripped = aarch64_strip_shift (newop0);
9165 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9166 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9168 if (speed)
9170 if (op0_stripped != newop0)
9171 *cost += extra_cost->alu.log_shift;
9172 else
9173 *cost += extra_cost->alu.logical;
9176 return true;
9178 /* MVN. */
9179 if (speed)
9180 *cost += extra_cost->alu.logical;
9182 return false;
9184 case ZERO_EXTEND:
9186 op0 = XEXP (x, 0);
9187 /* If a value is written in SI mode, then zero extended to DI
9188 mode, the operation will in general be free as a write to
9189 a 'w' register implicitly zeroes the upper bits of an 'x'
9190 register. However, if this is
9192 (set (reg) (zero_extend (reg)))
9194 we must cost the explicit register move. */
9195 if (mode == DImode
9196 && GET_MODE (op0) == SImode
9197 && outer == SET)
9199 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9201 /* If OP_COST is non-zero, then the cost of the zero extend
9202 is effectively the cost of the inner operation. Otherwise
9203 we have a MOV instruction and we take the cost from the MOV
9204 itself. This is true independently of whether we are
9205 optimizing for space or time. */
9206 if (op_cost)
9207 *cost = op_cost;
9209 return true;
9211 else if (MEM_P (op0))
9213 /* All loads can zero extend to any size for free. */
9214 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9215 return true;
9218 op0 = aarch64_extend_bitfield_pattern_p (x);
9219 if (op0)
9221 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9222 if (speed)
9223 *cost += extra_cost->alu.bfx;
9224 return true;
9227 if (speed)
9229 if (VECTOR_MODE_P (mode))
9231 /* UMOV. */
9232 *cost += extra_cost->vect.alu;
9234 else
9236 /* We generate an AND instead of UXTB/UXTH. */
9237 *cost += extra_cost->alu.logical;
9240 return false;
9242 case SIGN_EXTEND:
9243 if (MEM_P (XEXP (x, 0)))
9245 /* LDRSH. */
9246 if (speed)
9248 rtx address = XEXP (XEXP (x, 0), 0);
9249 *cost += extra_cost->ldst.load_sign_extend;
9251 *cost +=
9252 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9253 0, speed));
9255 return true;
9258 op0 = aarch64_extend_bitfield_pattern_p (x);
9259 if (op0)
9261 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9262 if (speed)
9263 *cost += extra_cost->alu.bfx;
9264 return true;
9267 if (speed)
9269 if (VECTOR_MODE_P (mode))
9270 *cost += extra_cost->vect.alu;
9271 else
9272 *cost += extra_cost->alu.extend;
9274 return false;
9276 case ASHIFT:
9277 op0 = XEXP (x, 0);
9278 op1 = XEXP (x, 1);
9280 if (CONST_INT_P (op1))
9282 if (speed)
9284 if (VECTOR_MODE_P (mode))
9286 /* Vector shift (immediate). */
9287 *cost += extra_cost->vect.alu;
9289 else
9291 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9292 aliases. */
9293 *cost += extra_cost->alu.shift;
9297 /* We can incorporate zero/sign extend for free. */
9298 if (GET_CODE (op0) == ZERO_EXTEND
9299 || GET_CODE (op0) == SIGN_EXTEND)
9300 op0 = XEXP (op0, 0);
9302 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9303 return true;
9305 else
9307 if (VECTOR_MODE_P (mode))
9309 if (speed)
9310 /* Vector shift (register). */
9311 *cost += extra_cost->vect.alu;
9313 else
9315 if (speed)
9316 /* LSLV. */
9317 *cost += extra_cost->alu.shift_reg;
9319 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9320 && CONST_INT_P (XEXP (op1, 1))
9321 && known_eq (INTVAL (XEXP (op1, 1)),
9322 GET_MODE_BITSIZE (mode) - 1))
9324 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9325 /* We already demanded XEXP (op1, 0) to be REG_P, so
9326 don't recurse into it. */
9327 return true;
9330 return false; /* All arguments need to be in registers. */
9333 case ROTATE:
9334 case ROTATERT:
9335 case LSHIFTRT:
9336 case ASHIFTRT:
9337 op0 = XEXP (x, 0);
9338 op1 = XEXP (x, 1);
9340 if (CONST_INT_P (op1))
9342 /* ASR (immediate) and friends. */
9343 if (speed)
9345 if (VECTOR_MODE_P (mode))
9346 *cost += extra_cost->vect.alu;
9347 else
9348 *cost += extra_cost->alu.shift;
9351 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9352 return true;
9354 else
9356 if (VECTOR_MODE_P (mode))
9358 if (speed)
9359 /* Vector shift (register). */
9360 *cost += extra_cost->vect.alu;
9362 else
9364 if (speed)
9365 /* ASR (register) and friends. */
9366 *cost += extra_cost->alu.shift_reg;
9368 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9369 && CONST_INT_P (XEXP (op1, 1))
9370 && known_eq (INTVAL (XEXP (op1, 1)),
9371 GET_MODE_BITSIZE (mode) - 1))
9373 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9374 /* We already demanded XEXP (op1, 0) to be REG_P, so
9375 don't recurse into it. */
9376 return true;
9379 return false; /* All arguments need to be in registers. */
9382 case SYMBOL_REF:
9384 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9385 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9387 /* LDR. */
9388 if (speed)
9389 *cost += extra_cost->ldst.load;
9391 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9392 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9394 /* ADRP, followed by ADD. */
9395 *cost += COSTS_N_INSNS (1);
9396 if (speed)
9397 *cost += 2 * extra_cost->alu.arith;
9399 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9400 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9402 /* ADR. */
9403 if (speed)
9404 *cost += extra_cost->alu.arith;
9407 if (flag_pic)
9409 /* One extra load instruction, after accessing the GOT. */
9410 *cost += COSTS_N_INSNS (1);
9411 if (speed)
9412 *cost += extra_cost->ldst.load;
9414 return true;
9416 case HIGH:
9417 case LO_SUM:
9418 /* ADRP/ADD (immediate). */
9419 if (speed)
9420 *cost += extra_cost->alu.arith;
9421 return true;
9423 case ZERO_EXTRACT:
9424 case SIGN_EXTRACT:
9425 /* UBFX/SBFX. */
9426 if (speed)
9428 if (VECTOR_MODE_P (mode))
9429 *cost += extra_cost->vect.alu;
9430 else
9431 *cost += extra_cost->alu.bfx;
9434 /* We can trust that the immediates used will be correct (there
9435 are no by-register forms), so we need only cost op0. */
9436 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9437 return true;
9439 case MULT:
9440 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9441 /* aarch64_rtx_mult_cost always handles recursion to its
9442 operands. */
9443 return true;
9445 case MOD:
9446 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9447 ANDs and a CSNEG. Assume here that a CSNEG costs the same as
9448 an unconditional negate. This case should only ever be reached through
9449 the set_smod_pow2_cheap check in expmed.c. */
9450 if (CONST_INT_P (XEXP (x, 1))
9451 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9452 && (mode == SImode || mode == DImode))
9454 /* We expand to 4 instructions. Reset the baseline. */
9455 *cost = COSTS_N_INSNS (4);
9457 if (speed)
9458 *cost += 2 * extra_cost->alu.logical
9459 + 2 * extra_cost->alu.arith;
9461 return true;
9464 /* Fall-through. */
9465 case UMOD:
9466 if (speed)
9468 /* Slightly prefer UMOD over SMOD. */
9469 if (VECTOR_MODE_P (mode))
9470 *cost += extra_cost->vect.alu;
9471 else if (GET_MODE_CLASS (mode) == MODE_INT)
9472 *cost += (extra_cost->mult[mode == DImode].add
9473 + extra_cost->mult[mode == DImode].idiv
9474 + (code == MOD ? 1 : 0));
9476 return false; /* All arguments need to be in registers. */
9478 case DIV:
9479 case UDIV:
9480 case SQRT:
9481 if (speed)
9483 if (VECTOR_MODE_P (mode))
9484 *cost += extra_cost->vect.alu;
9485 else if (GET_MODE_CLASS (mode) == MODE_INT)
9486 /* There is no integer SQRT, so only DIV and UDIV can get
9487 here. */
9488 *cost += (extra_cost->mult[mode == DImode].idiv
9489 /* Slightly prefer UDIV over SDIV. */
9490 + (code == DIV ? 1 : 0));
9491 else
9492 *cost += extra_cost->fp[mode == DFmode].div;
9494 return false; /* All arguments need to be in registers. */
9496 case IF_THEN_ELSE:
9497 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9498 XEXP (x, 2), cost, speed);
9500 case EQ:
9501 case NE:
9502 case GT:
9503 case GTU:
9504 case LT:
9505 case LTU:
9506 case GE:
9507 case GEU:
9508 case LE:
9509 case LEU:
9511 return false; /* All arguments must be in registers. */
9513 case FMA:
9514 op0 = XEXP (x, 0);
9515 op1 = XEXP (x, 1);
9516 op2 = XEXP (x, 2);
9518 if (speed)
9520 if (VECTOR_MODE_P (mode))
9521 *cost += extra_cost->vect.alu;
9522 else
9523 *cost += extra_cost->fp[mode == DFmode].fma;
9526 /* FMSUB, FNMADD, and FNMSUB are free. */
9527 if (GET_CODE (op0) == NEG)
9528 op0 = XEXP (op0, 0);
9530 if (GET_CODE (op2) == NEG)
9531 op2 = XEXP (op2, 0);
9533 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9534 and the by-element operand as operand 0. */
9535 if (GET_CODE (op1) == NEG)
9536 op1 = XEXP (op1, 0);
9538 /* Catch vector-by-element operations. The by-element operand can
9539 either be (vec_duplicate (vec_select (x))) or just
9540 (vec_select (x)), depending on whether we are multiplying by
9541 a vector or a scalar.
9543 Canonicalization is not very good in these cases: FMA4 will put the
9544 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9545 if (GET_CODE (op0) == VEC_DUPLICATE)
9546 op0 = XEXP (op0, 0);
9547 else if (GET_CODE (op1) == VEC_DUPLICATE)
9548 op1 = XEXP (op1, 0);
9550 if (GET_CODE (op0) == VEC_SELECT)
9551 op0 = XEXP (op0, 0);
9552 else if (GET_CODE (op1) == VEC_SELECT)
9553 op1 = XEXP (op1, 0);
9555 /* If the remaining parameters are not registers,
9556 get the cost to put them into registers. */
9557 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9558 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9559 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9560 return true;
9562 case FLOAT:
9563 case UNSIGNED_FLOAT:
9564 if (speed)
9565 *cost += extra_cost->fp[mode == DFmode].fromint;
9566 return false;
9568 case FLOAT_EXTEND:
9569 if (speed)
9571 if (VECTOR_MODE_P (mode))
9573 /* Vector extend. */
9574 *cost += extra_cost->vect.alu;
9576 else
9577 *cost += extra_cost->fp[mode == DFmode].widen;
9579 return false;
9581 case FLOAT_TRUNCATE:
9582 if (speed)
9584 if (VECTOR_MODE_P (mode))
9586 /* Vector truncate. */
9587 *cost += extra_cost->vect.alu;
9589 else
9590 *cost += extra_cost->fp[mode == DFmode].narrow;
9592 return false;
9594 case FIX:
9595 case UNSIGNED_FIX:
9596 x = XEXP (x, 0);
9597 /* Strip the rounding part. They will all be implemented
9598 by the fcvt* family of instructions anyway. */
9599 if (GET_CODE (x) == UNSPEC)
9601 unsigned int uns_code = XINT (x, 1);
9603 if (uns_code == UNSPEC_FRINTA
9604 || uns_code == UNSPEC_FRINTM
9605 || uns_code == UNSPEC_FRINTN
9606 || uns_code == UNSPEC_FRINTP
9607 || uns_code == UNSPEC_FRINTZ)
9608 x = XVECEXP (x, 0, 0);
9611 if (speed)
9613 if (VECTOR_MODE_P (mode))
9614 *cost += extra_cost->vect.alu;
9615 else
9616 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9619 /* We can combine an fmul by a power of 2 followed by an fcvt into a single
9620 fixed-point fcvt. */
9621 if (GET_CODE (x) == MULT
9622 && ((VECTOR_MODE_P (mode)
9623 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9624 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9626 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9627 0, speed);
9628 return true;
9631 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9632 return true;
9634 case ABS:
9635 if (VECTOR_MODE_P (mode))
9637 /* ABS (vector). */
9638 if (speed)
9639 *cost += extra_cost->vect.alu;
9641 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9643 op0 = XEXP (x, 0);
9645 /* FABD, which is analogous to FADD. */
9646 if (GET_CODE (op0) == MINUS)
9648 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9649 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9650 if (speed)
9651 *cost += extra_cost->fp[mode == DFmode].addsub;
9653 return true;
9655 /* Simple FABS is analogous to FNEG. */
9656 if (speed)
9657 *cost += extra_cost->fp[mode == DFmode].neg;
9659 else
9661 /* Integer ABS will either be split into
9662 two arithmetic instructions, or will be an ABS
9663 (scalar), which we don't model. */
9664 *cost = COSTS_N_INSNS (2);
9665 if (speed)
9666 *cost += 2 * extra_cost->alu.arith;
9668 return false;
9670 case SMAX:
9671 case SMIN:
9672 if (speed)
9674 if (VECTOR_MODE_P (mode))
9675 *cost += extra_cost->vect.alu;
9676 else
9678 /* FMAXNM/FMINNM/FMAX/FMIN.
9679 TODO: This may not be accurate for all implementations, but
9680 we do not model this in the cost tables. */
9681 *cost += extra_cost->fp[mode == DFmode].addsub;
9684 return false;
9686 case UNSPEC:
9687 /* The floating point round to integer frint* instructions. */
9688 if (aarch64_frint_unspec_p (XINT (x, 1)))
9690 if (speed)
9691 *cost += extra_cost->fp[mode == DFmode].roundint;
9693 return false;
9696 if (XINT (x, 1) == UNSPEC_RBIT)
9698 if (speed)
9699 *cost += extra_cost->alu.rev;
9701 return false;
9703 break;
9705 case TRUNCATE:
9707 /* Decompose <su>muldi3_highpart. */
9708 if (/* (truncate:DI */
9709 mode == DImode
9710 /* (lshiftrt:TI */
9711 && GET_MODE (XEXP (x, 0)) == TImode
9712 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9713 /* (mult:TI */
9714 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9715 /* (ANY_EXTEND:TI (reg:DI))
9716 (ANY_EXTEND:TI (reg:DI))) */
9717 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9718 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9719 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9720 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9721 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9722 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9723 /* (const_int 64) */
9724 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9725 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9727 /* UMULH/SMULH. */
9728 if (speed)
9729 *cost += extra_cost->mult[mode == DImode].extend;
9730 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9731 mode, MULT, 0, speed);
9732 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9733 mode, MULT, 1, speed);
9734 return true;
9737 /* Fall through. */
9738 default:
9739 break;
9742 if (dump_file
9743 && flag_aarch64_verbose_cost)
9744 fprintf (dump_file,
9745 "\nFailed to cost RTX. Assuming default cost.\n");
9747 return true;
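/* [Editor's illustrative sketch, not part of the original source.] The
   baseline-plus-delta scheme the function above uses: every expression
   starts at COSTS_N_INSNS (1), the individual cases add deltas from the
   tuning tables when optimizing for speed (or reset the baseline for
   multi-instruction expansions), and the boolean result says whether the
   operands have already been costed.  The numbers below are invented
   placeholders.  Guarded with #if 0 so it never enters the build.  */
#if 0
static int
example_cost_of_add_immediate (int speed)
{
  int cost = 4;              /* stand-in for COSTS_N_INSNS (1) */
  int alu_arith_delta = 1;   /* hypothetical extra_cost->alu.arith */
  if (speed)
    cost += alu_arith_delta; /* the ADD (immediate) path above */
  return cost;
}
#endif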
9750 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9751 calculated for X. This cost is stored in *COST. Returns true
9752 if the total cost of X was calculated. */
9753 static bool
9754 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9755 int param, int *cost, bool speed)
9757 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9759 if (dump_file
9760 && flag_aarch64_verbose_cost)
9762 print_rtl_single (dump_file, x);
9763 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9764 speed ? "Hot" : "Cold",
9765 *cost, result ? "final" : "partial");
9768 return result;
9771 static int
9772 aarch64_register_move_cost (machine_mode mode,
9773 reg_class_t from_i, reg_class_t to_i)
9775 enum reg_class from = (enum reg_class) from_i;
9776 enum reg_class to = (enum reg_class) to_i;
9777 const struct cpu_regmove_cost *regmove_cost
9778 = aarch64_tune_params.regmove_cost;
9780 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9781 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9782 to = GENERAL_REGS;
9784 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9785 from = GENERAL_REGS;
9787 /* Moving between a GPR and the stack register costs the same as GP2GP. */
9788 if ((from == GENERAL_REGS && to == STACK_REG)
9789 || (to == GENERAL_REGS && from == STACK_REG))
9790 return regmove_cost->GP2GP;
9792 /* To/From the stack register, we move via the gprs. */
9793 if (to == STACK_REG || from == STACK_REG)
9794 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9795 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9797 if (known_eq (GET_MODE_SIZE (mode), 16))
9799 /* 128-bit operations on general registers require 2 instructions. */
9800 if (from == GENERAL_REGS && to == GENERAL_REGS)
9801 return regmove_cost->GP2GP * 2;
9802 else if (from == GENERAL_REGS)
9803 return regmove_cost->GP2FP * 2;
9804 else if (to == GENERAL_REGS)
9805 return regmove_cost->FP2GP * 2;
9807 /* When AdvSIMD instructions are disabled it is not possible to move
9808 a 128-bit value directly between Q registers. This is handled in
9809 secondary reload. A general register is used as a scratch to move
9810 the upper DI value and the lower DI value is moved directly,
9811 hence the cost is the sum of three moves. */
9812 if (! TARGET_SIMD)
9813 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9815 return regmove_cost->FP2FP;
9818 if (from == GENERAL_REGS && to == GENERAL_REGS)
9819 return regmove_cost->GP2GP;
9820 else if (from == GENERAL_REGS)
9821 return regmove_cost->GP2FP;
9822 else if (to == GENERAL_REGS)
9823 return regmove_cost->FP2GP;
9825 return regmove_cost->FP2FP;
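/* Illustrative sketch (not part of the original source): how the rules
   above combine for a 128-bit FP-to-FP move.  With AdvSIMD it is a single
   Q-register move; without AdvSIMD the value goes through a general
   register scratch, so the cost is the sum of three moves.  The helper
   name and parameters are hypothetical.  */
static inline int
regmove_cost_q_fp2fp_example (int gp2fp, int fp2gp, int fp2fp, int have_simd)
{
  return have_simd ? fp2fp : gp2fp + fp2gp + fp2fp;
}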
9828 static int
9829 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9830 reg_class_t rclass ATTRIBUTE_UNUSED,
9831 bool in ATTRIBUTE_UNUSED)
9833 return aarch64_tune_params.memmov_cost;
9836 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9837 to optimize 1.0/sqrt. */
9839 static bool
9840 use_rsqrt_p (machine_mode mode)
9842 return (!flag_trapping_math
9843 && flag_unsafe_math_optimizations
9844 && ((aarch64_tune_params.approx_modes->recip_sqrt
9845 & AARCH64_APPROX_MODE (mode))
9846 || flag_mrecip_low_precision_sqrt));
9849 /* Function to decide when to use the approximate reciprocal square root
9850 builtin. */
9852 static tree
9853 aarch64_builtin_reciprocal (tree fndecl)
9855 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9857 if (!use_rsqrt_p (mode))
9858 return NULL_TREE;
9859 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
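/* Illustrative sketch (not part of the original source): the kind of source
   pattern that use_rsqrt_p / aarch64_builtin_reciprocal can rewrite.  With
   -funsafe-math-optimizations and either the CPU's approximation mode or
   -mlow-precision-recip-sqrt enabled, 1.0f / sqrtf (x) may be expanded via
   the FRSQRTE/FRSQRTS estimate sequence instead of FSQRT + FDIV.  */
static inline float
rsqrt_source_example (float x)
{
  extern float sqrtf (float);   /* normally obtained from <math.h> */
  return 1.0f / sqrtf (x);
}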
9862 typedef rtx (*rsqrte_type) (rtx, rtx);
9864 /* Select reciprocal square root initial estimate insn depending on machine
9865 mode. */
9867 static rsqrte_type
9868 get_rsqrte_type (machine_mode mode)
9870 switch (mode)
9872 case E_DFmode: return gen_aarch64_rsqrtedf;
9873 case E_SFmode: return gen_aarch64_rsqrtesf;
9874 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9875 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9876 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9877 default: gcc_unreachable ();
9881 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9883 /* Select reciprocal square root series step insn depending on machine mode. */
9885 static rsqrts_type
9886 get_rsqrts_type (machine_mode mode)
9888 switch (mode)
9890 case E_DFmode: return gen_aarch64_rsqrtsdf;
9891 case E_SFmode: return gen_aarch64_rsqrtssf;
9892 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9893 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9894 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9895 default: gcc_unreachable ();
9899 /* Emit instruction sequence to compute either the approximate square root
9900 or its approximate reciprocal, depending on the flag RECP, and return
9901 whether the sequence was emitted or not. */
9903 bool
9904 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9906 machine_mode mode = GET_MODE (dst);
9908 if (GET_MODE_INNER (mode) == HFmode)
9910 gcc_assert (!recp);
9911 return false;
9914 if (!recp)
9916 if (!(flag_mlow_precision_sqrt
9917 || (aarch64_tune_params.approx_modes->sqrt
9918 & AARCH64_APPROX_MODE (mode))))
9919 return false;
9921 if (flag_finite_math_only
9922 || flag_trapping_math
9923 || !flag_unsafe_math_optimizations
9924 || optimize_function_for_size_p (cfun))
9925 return false;
9927 else
9928 /* Caller assumes we cannot fail. */
9929 gcc_assert (use_rsqrt_p (mode));
9931 machine_mode mmsk = mode_for_int_vector (mode).require ();
9932 rtx xmsk = gen_reg_rtx (mmsk);
9933 if (!recp)
9934 /* When calculating the approximate square root, compare the
9935 argument with 0.0 and create a mask. */
9936 emit_insn (gen_rtx_SET (xmsk,
9937 gen_rtx_NEG (mmsk,
9938 gen_rtx_EQ (mmsk, src,
9939 CONST0_RTX (mode)))));
9941 /* Estimate the approximate reciprocal square root. */
9942 rtx xdst = gen_reg_rtx (mode);
9943 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9945 /* Iterate over the series twice for SF and thrice for DF. */
9946 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9948 /* Optionally run one fewer iteration for better performance while
9949 sacrificing some accuracy. */
9950 if ((recp && flag_mrecip_low_precision_sqrt)
9951 || (!recp && flag_mlow_precision_sqrt))
9952 iterations--;
9954 /* Iterate over the series to calculate the approximate reciprocal square
9955 root. */
9956 rtx x1 = gen_reg_rtx (mode);
9957 while (iterations--)
9959 rtx x2 = gen_reg_rtx (mode);
9960 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9962 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9964 if (iterations > 0)
9965 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9968 if (!recp)
9970 /* Qualify the approximate reciprocal square root when the argument is
9971 0.0 by squashing the intermediary result to 0.0. */
9972 rtx xtmp = gen_reg_rtx (mmsk);
9973 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9974 gen_rtx_SUBREG (mmsk, xdst, 0)));
9975 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9977 /* Calculate the approximate square root. */
9978 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9981 /* Finalize the approximation. */
9982 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9984 return true;
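/* Illustrative scalar model (not part of the original source) of the RTL
   sequence emitted above.  FRSQRTS computes (3 - a*b) / 2, so each loop
   iteration mirrors one FMUL + FRSQRTS (+ FMUL) group; the trailing
   multiply by SRC turns the reciprocal square root into the square root.
   The 0.0 masking done above is omitted here and the names are
   hypothetical.  */
static inline double
approx_sqrt_model (double src, double estimate, int iterations, int recp)
{
  double x = estimate;                       /* FRSQRTE initial estimate.  */
  double step = 1.0;
  while (iterations--)
    {
      step = (3.0 - src * (x * x)) / 2.0;    /* FRSQRTS (src, x * x).  */
      if (iterations > 0)
        x = x * step;
    }
  if (!recp)
    x = x * src;                             /* rsqrt(src) -> sqrt(src).  */
  return x * step;                           /* Finalize the approximation.  */
}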
9987 typedef rtx (*recpe_type) (rtx, rtx);
9989 /* Select reciprocal initial estimate insn depending on machine mode. */
9991 static recpe_type
9992 get_recpe_type (machine_mode mode)
9994 switch (mode)
9996 case E_SFmode: return (gen_aarch64_frecpesf);
9997 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9998 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9999 case E_DFmode: return (gen_aarch64_frecpedf);
10000 case E_V2DFmode: return (gen_aarch64_frecpev2df);
10001 default: gcc_unreachable ();
10005 typedef rtx (*recps_type) (rtx, rtx, rtx);
10007 /* Select reciprocal series step insn depending on machine mode. */
10009 static recps_type
10010 get_recps_type (machine_mode mode)
10012 switch (mode)
10014 case E_SFmode: return (gen_aarch64_frecpssf);
10015 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
10016 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
10017 case E_DFmode: return (gen_aarch64_frecpsdf);
10018 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
10019 default: gcc_unreachable ();
10023 /* Emit the instruction sequence to compute the approximation for the division
10024 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10026 bool
10027 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10029 machine_mode mode = GET_MODE (quo);
10031 if (GET_MODE_INNER (mode) == HFmode)
10032 return false;
10034 bool use_approx_division_p = (flag_mlow_precision_div
10035 || (aarch64_tune_params.approx_modes->division
10036 & AARCH64_APPROX_MODE (mode)));
10038 if (!flag_finite_math_only
10039 || flag_trapping_math
10040 || !flag_unsafe_math_optimizations
10041 || optimize_function_for_size_p (cfun)
10042 || !use_approx_division_p)
10043 return false;
10045 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10046 return false;
10048 /* Estimate the approximate reciprocal. */
10049 rtx xrcp = gen_reg_rtx (mode);
10050 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10052 /* Iterate over the series twice for SF and thrice for DF. */
10053 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10055 /* Optionally run one fewer iteration for better performance, while
10056 sacrificing some accuracy. */
10057 if (flag_mlow_precision_div)
10058 iterations--;
10060 /* Iterate over the series to calculate the approximate reciprocal. */
10061 rtx xtmp = gen_reg_rtx (mode);
10062 while (iterations--)
10064 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10066 if (iterations > 0)
10067 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10070 if (num != CONST1_RTX (mode))
10072 /* As the approximate reciprocal of DEN is already calculated, only
10073 calculate the approximate division when NUM is not 1.0. */
10074 rtx xnum = force_reg (mode, num);
10075 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10078 /* Finalize the approximation. */
10079 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10080 return true;
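/* Illustrative scalar model (not part of the original source) of the
   division approximation above.  FRECPS computes 2 - a*b, so each loop
   iteration refines the reciprocal estimate X of DEN as x * (2 - den*x);
   the quotient is then num * (1/den).  Names are hypothetical.  */
static inline double
approx_div_model (double num, double den, double estimate, int iterations)
{
  double x = estimate;                /* FRECPE initial estimate of 1/den.  */
  double step = 1.0;
  while (iterations--)
    {
      step = 2.0 - den * x;           /* FRECPS (x, den).  */
      if (iterations > 0)
        x = x * step;
    }
  if (num != 1.0)
    x = x * num;                      /* Skip the multiply when NUM is 1.0.  */
  return x * step;                    /* Finalize the approximation.  */
}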
10083 /* Return the number of instructions that can be issued per cycle. */
10084 static int
10085 aarch64_sched_issue_rate (void)
10087 return aarch64_tune_params.issue_rate;
10090 static int
10091 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10093 int issue_rate = aarch64_sched_issue_rate ();
10095 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10099 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10100 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10101 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10103 static int
10104 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10105 int ready_index)
10107 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10111 /* Vectorizer cost model target hooks. */
10113 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10114 static int
10115 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10116 tree vectype,
10117 int misalign ATTRIBUTE_UNUSED)
10119 unsigned elements;
10120 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10121 bool fp = false;
10123 if (vectype != NULL)
10124 fp = FLOAT_TYPE_P (vectype);
10126 switch (type_of_cost)
10128 case scalar_stmt:
10129 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10131 case scalar_load:
10132 return costs->scalar_load_cost;
10134 case scalar_store:
10135 return costs->scalar_store_cost;
10137 case vector_stmt:
10138 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10140 case vector_load:
10141 return costs->vec_align_load_cost;
10143 case vector_store:
10144 return costs->vec_store_cost;
10146 case vec_to_scalar:
10147 return costs->vec_to_scalar_cost;
10149 case scalar_to_vec:
10150 return costs->scalar_to_vec_cost;
10152 case unaligned_load:
10153 case vector_gather_load:
10154 return costs->vec_unalign_load_cost;
10156 case unaligned_store:
10157 case vector_scatter_store:
10158 return costs->vec_unalign_store_cost;
10160 case cond_branch_taken:
10161 return costs->cond_taken_branch_cost;
10163 case cond_branch_not_taken:
10164 return costs->cond_not_taken_branch_cost;
10166 case vec_perm:
10167 return costs->vec_permute_cost;
10169 case vec_promote_demote:
10170 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10172 case vec_construct:
10173 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10174 return elements / 2 + 1;
10176 default:
10177 gcc_unreachable ();
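/* Illustrative sketch (not part of the original source): the vec_construct
   formula above for a few common fixed-width vector types.  The helper name
   is hypothetical.  */
static inline unsigned
vec_construct_cost_example (unsigned elements)
{
  /* V2DI/V2DF: 2/2 + 1 = 2;  V4SI/V4SF: 4/2 + 1 = 3;  V16QI: 16/2 + 1 = 9.  */
  return elements / 2 + 1;
}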
10181 /* Implement targetm.vectorize.add_stmt_cost. */
10182 static unsigned
10183 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10184 struct _stmt_vec_info *stmt_info, int misalign,
10185 enum vect_cost_model_location where)
10187 unsigned *cost = (unsigned *) data;
10188 unsigned retval = 0;
10190 if (flag_vect_cost_model)
10192 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10193 int stmt_cost =
10194 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10196 /* Statements in an inner loop relative to the loop being
10197 vectorized are weighted more heavily. The value here is
10198 arbitrary and could potentially be improved with analysis. */
10199 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10200 count *= 50; /* FIXME */
10202 retval = (unsigned) (count * stmt_cost);
10203 cost[where] += retval;
10206 return retval;
10209 static void initialize_aarch64_code_model (struct gcc_options *);
10211 /* Parse the TO_PARSE string and put the architecture struct that it
10212 selects into RES and the architectural features into ISA_FLAGS.
10213 Return an aarch64_parse_opt_result describing the parse result.
10214 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10216 static enum aarch64_parse_opt_result
10217 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10218 unsigned long *isa_flags)
10220 char *ext;
10221 const struct processor *arch;
10222 char *str = (char *) alloca (strlen (to_parse) + 1);
10223 size_t len;
10225 strcpy (str, to_parse);
10227 ext = strchr (str, '+');
10229 if (ext != NULL)
10230 len = ext - str;
10231 else
10232 len = strlen (str);
10234 if (len == 0)
10235 return AARCH64_PARSE_MISSING_ARG;
10238 /* Loop through the list of supported ARCHes to find a match. */
10239 for (arch = all_architectures; arch->name != NULL; arch++)
10241 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10243 unsigned long isa_temp = arch->flags;
10245 if (ext != NULL)
10247 /* TO_PARSE string contains at least one extension. */
10248 enum aarch64_parse_opt_result ext_res
10249 = aarch64_parse_extension (ext, &isa_temp);
10251 if (ext_res != AARCH64_PARSE_OK)
10252 return ext_res;
10254 /* Extension parsing was successful. Confirm the result
10255 arch and ISA flags. */
10256 *res = arch;
10257 *isa_flags = isa_temp;
10258 return AARCH64_PARSE_OK;
10262 /* ARCH name not found in list. */
10263 return AARCH64_PARSE_INVALID_ARG;
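/* Illustrative sketch (not part of the original source) of the
   name/extension split used above: everything before the first '+' names
   the base architecture (or CPU), and the remainder is handed to
   aarch64_parse_extension.  The helper name is hypothetical.  */
static inline size_t
arch_name_length_example (const char *to_parse)
{
  /* "armv8.2-a+fp16" -> 9 (the length of "armv8.2-a");
     "armv8-a"        -> strlen ("armv8-a").  */
  const char *ext = strchr (to_parse, '+');
  return ext != NULL ? (size_t) (ext - to_parse) : strlen (to_parse);
}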
10266 /* Parse the TO_PARSE string and put the result tuning in RES and the
10267 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10268 describing the parse result. If there is an error parsing, RES and
10269 ISA_FLAGS are left unchanged. */
10271 static enum aarch64_parse_opt_result
10272 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10273 unsigned long *isa_flags)
10275 char *ext;
10276 const struct processor *cpu;
10277 char *str = (char *) alloca (strlen (to_parse) + 1);
10278 size_t len;
10280 strcpy (str, to_parse);
10282 ext = strchr (str, '+');
10284 if (ext != NULL)
10285 len = ext - str;
10286 else
10287 len = strlen (str);
10289 if (len == 0)
10290 return AARCH64_PARSE_MISSING_ARG;
10293 /* Loop through the list of supported CPUs to find a match. */
10294 for (cpu = all_cores; cpu->name != NULL; cpu++)
10296 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10298 unsigned long isa_temp = cpu->flags;
10301 if (ext != NULL)
10303 /* TO_PARSE string contains at least one extension. */
10304 enum aarch64_parse_opt_result ext_res
10305 = aarch64_parse_extension (ext, &isa_temp);
10307 if (ext_res != AARCH64_PARSE_OK)
10308 return ext_res;
10310 /* Extension parsing was successful. Confirm the result
10311 cpu and ISA flags. */
10312 *res = cpu;
10313 *isa_flags = isa_temp;
10314 return AARCH64_PARSE_OK;
10318 /* CPU name not found in list. */
10319 return AARCH64_PARSE_INVALID_ARG;
10322 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10323 Return an aarch64_parse_opt_result describing the parse result.
10324 If the parsing fails, RES does not change. */
10326 static enum aarch64_parse_opt_result
10327 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10329 const struct processor *cpu;
10330 char *str = (char *) alloca (strlen (to_parse) + 1);
10332 strcpy (str, to_parse);
10334 /* Loop through the list of supported CPUs to find a match. */
10335 for (cpu = all_cores; cpu->name != NULL; cpu++)
10337 if (strcmp (cpu->name, str) == 0)
10339 *res = cpu;
10340 return AARCH64_PARSE_OK;
10344 /* CPU name not found in list. */
10345 return AARCH64_PARSE_INVALID_ARG;
10348 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10349 described in FLAG. If it is, return the index bit for that fusion type.
10350 If not, error (printing OPTION_NAME) and return zero. */
10352 static unsigned int
10353 aarch64_parse_one_option_token (const char *token,
10354 size_t length,
10355 const struct aarch64_flag_desc *flag,
10356 const char *option_name)
10358 for (; flag->name != NULL; flag++)
10360 if (length == strlen (flag->name)
10361 && !strncmp (flag->name, token, length))
10362 return flag->flag;
10365 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10366 return 0;
10369 /* Parse OPTION which is a comma-separated list of flags to enable.
10370 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10371 default state we inherit from the CPU tuning structures. OPTION_NAME
10372 gives the top-level option we are parsing in the -moverride string,
10373 for use in error messages. */
10375 static unsigned int
10376 aarch64_parse_boolean_options (const char *option,
10377 const struct aarch64_flag_desc *flags,
10378 unsigned int initial_state,
10379 const char *option_name)
10381 const char separator = '.';
10382 const char* specs = option;
10383 const char* ntoken = option;
10384 unsigned int found_flags = initial_state;
10386 while ((ntoken = strchr (specs, separator)))
10388 size_t token_length = ntoken - specs;
10389 unsigned token_ops = aarch64_parse_one_option_token (specs,
10390 token_length,
10391 flags,
10392 option_name);
10393 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10394 in the token stream, reset the supported operations. So:
10396 adrp+add.cmp+branch.none.adrp+add
10398 would have the result of turning on only adrp+add fusion. */
10399 if (!token_ops)
10400 found_flags = 0;
10402 found_flags |= token_ops;
10403 specs = ++ntoken;
10406 /* The string ended with a trailing separator, leaving no final token; report an error. */
10407 if (!(*specs))
10409 error ("%s string ill-formed\n", option_name);
10410 return 0;
10413 /* We still have one more token to parse. */
10414 size_t token_length = strlen (specs);
10415 unsigned token_ops = aarch64_parse_one_option_token (specs,
10416 token_length,
10417 flags,
10418 option_name);
10419 if (!token_ops)
10420 found_flags = 0;
10422 found_flags |= token_ops;
10423 return found_flags;
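/* Illustrative model (not part of the original source) of the semantics
   above: flags accumulate left to right across '.'-separated tokens, and a
   token that yields no bits (e.g. "none", or an unrecognised name) clears
   everything accumulated so far.  The bit values are hypothetical.  */
static inline unsigned int
boolean_options_model_example (void)
{
  const unsigned int adrp_add = 1u << 0, cmp_branch = 1u << 1;
  unsigned int flags = 0;
  flags |= adrp_add;     /* "adrp+add"   -> ADRP_ADD                */
  flags |= cmp_branch;   /* "cmp+branch" -> ADRP_ADD | CMP_BRANCH   */
  flags = 0;             /* "none"       -> everything reset        */
  flags |= adrp_add;     /* "adrp+add"   -> ADRP_ADD only           */
  return flags;
}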
10426 /* Support for overriding instruction fusion. */
10428 static void
10429 aarch64_parse_fuse_string (const char *fuse_string,
10430 struct tune_params *tune)
10432 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10433 aarch64_fusible_pairs,
10434 tune->fusible_ops,
10435 "fuse=");
10438 /* Support for overriding other tuning flags. */
10440 static void
10441 aarch64_parse_tune_string (const char *tune_string,
10442 struct tune_params *tune)
10444 tune->extra_tuning_flags
10445 = aarch64_parse_boolean_options (tune_string,
10446 aarch64_tuning_flags,
10447 tune->extra_tuning_flags,
10448 "tune=");
10451 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10452 we understand. If it is, extract the option string and hand it off to
10453 the appropriate function. */
10455 void
10456 aarch64_parse_one_override_token (const char* token,
10457 size_t length,
10458 struct tune_params *tune)
10460 const struct aarch64_tuning_override_function *fn
10461 = aarch64_tuning_override_functions;
10463 const char *option_part = strchr (token, '=');
10464 if (!option_part)
10466 error ("tuning string missing in option (%s)", token);
10467 return;
10470 /* Get the length of the option name. */
10471 length = option_part - token;
10472 /* Skip the '=' to get to the option string. */
10473 option_part++;
10475 for (; fn->name != NULL; fn++)
10477 if (!strncmp (fn->name, token, length))
10479 fn->parse_override (option_part, tune);
10480 return;
10484 error ("unknown tuning option (%s)", token);
10485 return;
10488 /* Validate and clamp the TLS size against the selected code model. */
10490 static void
10491 initialize_aarch64_tls_size (struct gcc_options *opts)
10493 if (aarch64_tls_size == 0)
10494 aarch64_tls_size = 24;
10496 switch (opts->x_aarch64_cmodel_var)
10498 case AARCH64_CMODEL_TINY:
10499 /* Both the default and the maximum TLS size allowed under tiny are 1M,
10500 which needs two instructions to address, so we clamp the size to 24. */
10501 if (aarch64_tls_size > 24)
10502 aarch64_tls_size = 24;
10503 break;
10504 case AARCH64_CMODEL_SMALL:
10505 /* The maximum TLS size allowed under small is 4G. */
10506 if (aarch64_tls_size > 32)
10507 aarch64_tls_size = 32;
10508 break;
10509 case AARCH64_CMODEL_LARGE:
10510 /* The maximum TLS size allowed under large is 16E.
10511 FIXME: 16E would need a 64-bit offset, but we only support a 48-bit offset now. */
10512 if (aarch64_tls_size > 48)
10513 aarch64_tls_size = 48;
10514 break;
10515 default:
10516 gcc_unreachable ();
10519 return;
10522 /* Parse STRING looking for options in the format:
10523 string :: option:string
10524 option :: name=substring
10525 name :: {a-z}
10526 substring :: defined by option. */
10528 static void
10529 aarch64_parse_override_string (const char* input_string,
10530 struct tune_params* tune)
10532 const char separator = ':';
10533 size_t string_length = strlen (input_string) + 1;
10534 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10535 char *string = string_root;
10536 strncpy (string, input_string, string_length);
10537 string[string_length - 1] = '\0';
10539 char* ntoken = string;
10541 while ((ntoken = strchr (string, separator)))
10543 size_t token_length = ntoken - string;
10544 /* Make this substring look like a string. */
10545 *ntoken = '\0';
10546 aarch64_parse_one_override_token (string, token_length, tune);
10547 string = ++ntoken;
10550 /* One last option to parse. */
10551 aarch64_parse_one_override_token (string, strlen (string), tune);
10552 free (string_root);
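/* Illustrative sketch (not part of the original source): an -moverride
   string such as "fuse=adrp+add.cmp+branch:tune=<flags>" is split at ':'
   into per-option tokens, each of which is then split at '=' by
   aarch64_parse_one_override_token.  The counting helper is hypothetical.  */
static inline int
override_token_count_example (const char *str)
{
  int count = 1;
  for (const char *p = strchr (str, ':'); p != NULL; p = strchr (p + 1, ':'))
    count++;
  return count;   /* 2 for the example string above.  */
}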
10556 static void
10557 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10559 /* PR 70044: We have to be careful about being called multiple times for the
10560 same function. This means all changes should be repeatable. */
10562 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10563 Disable the frame pointer flag so the mid-end will not use a frame
10564 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10565 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10566 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10567 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10568 if (opts->x_flag_omit_frame_pointer == 0)
10569 opts->x_flag_omit_frame_pointer = 2;
10571 /* If not optimizing for size, set the default
10572 alignment to what the target wants. */
10573 if (!opts->x_optimize_size)
10575 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10576 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10577 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10578 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10579 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10580 opts->x_str_align_functions = aarch64_tune_params.function_align;
10583 /* We default to no pc-relative literal loads. */
10585 aarch64_pcrelative_literal_loads = false;
10587 /* If -mpc-relative-literal-loads is set on the command line, this
10588 implies that the user asked for PC relative literal loads. */
10589 if (opts->x_pcrelative_literal_loads == 1)
10590 aarch64_pcrelative_literal_loads = true;
10592 /* In the tiny memory model it makes no sense to disallow PC relative
10593 literal pool loads. */
10594 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10595 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10596 aarch64_pcrelative_literal_loads = true;
10598 /* When enabling the lower precision Newton series for the square root, also
10599 enable it for the reciprocal square root, since the latter is an
10600 intermediary step for the former. */
10601 if (flag_mlow_precision_sqrt)
10602 flag_mrecip_low_precision_sqrt = true;
10605 /* 'Unpack' the internal tuning structs and update the options
10606 in OPTS. The caller must have set up selected_tune and selected_arch
10607 as all the other target-specific codegen decisions are
10608 derived from them. */
10610 void
10611 aarch64_override_options_internal (struct gcc_options *opts)
10613 aarch64_tune_flags = selected_tune->flags;
10614 aarch64_tune = selected_tune->sched_core;
10615 /* Make a copy of the tuning parameters attached to the core, which
10616 we may later overwrite. */
10617 aarch64_tune_params = *(selected_tune->tune);
10618 aarch64_architecture_version = selected_arch->architecture_version;
10620 if (opts->x_aarch64_override_tune_string)
10621 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10622 &aarch64_tune_params);
10624 /* This target defaults to strict volatile bitfields. */
10625 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10626 opts->x_flag_strict_volatile_bitfields = 1;
10628 initialize_aarch64_code_model (opts);
10629 initialize_aarch64_tls_size (opts);
10631 int queue_depth = 0;
10632 switch (aarch64_tune_params.autoprefetcher_model)
10634 case tune_params::AUTOPREFETCHER_OFF:
10635 queue_depth = -1;
10636 break;
10637 case tune_params::AUTOPREFETCHER_WEAK:
10638 queue_depth = 0;
10639 break;
10640 case tune_params::AUTOPREFETCHER_STRONG:
10641 queue_depth = max_insn_queue_index + 1;
10642 break;
10643 default:
10644 gcc_unreachable ();
10647 /* We don't mind passing in global_options_set here as we don't use
10648 the *options_set structs anyway. */
10649 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10650 queue_depth,
10651 opts->x_param_values,
10652 global_options_set.x_param_values);
10654 /* Set up parameters to be used in prefetching algorithm. Do not
10655 override the defaults unless we are tuning for a core we have
10656 researched values for. */
10657 if (aarch64_tune_params.prefetch->num_slots > 0)
10658 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10659 aarch64_tune_params.prefetch->num_slots,
10660 opts->x_param_values,
10661 global_options_set.x_param_values);
10662 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10663 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10664 aarch64_tune_params.prefetch->l1_cache_size,
10665 opts->x_param_values,
10666 global_options_set.x_param_values);
10667 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10668 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10669 aarch64_tune_params.prefetch->l1_cache_line_size,
10670 opts->x_param_values,
10671 global_options_set.x_param_values);
10672 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10673 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10674 aarch64_tune_params.prefetch->l2_cache_size,
10675 opts->x_param_values,
10676 global_options_set.x_param_values);
10677 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10678 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10680 opts->x_param_values,
10681 global_options_set.x_param_values);
10682 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10683 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10684 aarch64_tune_params.prefetch->minimum_stride,
10685 opts->x_param_values,
10686 global_options_set.x_param_values);
10688 /* Use the alternative scheduling-pressure algorithm by default. */
10689 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10690 opts->x_param_values,
10691 global_options_set.x_param_values);
10693 /* Enable software prefetching at the specified optimization level for
10694 CPUs that have prefetch tuning parameters. Lower the optimization level
10695 threshold by 1 when profiling is enabled. */
10696 if (opts->x_flag_prefetch_loop_arrays < 0
10697 && !opts->x_optimize_size
10698 && aarch64_tune_params.prefetch->default_opt_level >= 0
10699 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10700 opts->x_flag_prefetch_loop_arrays = 1;
10702 aarch64_override_options_after_change_1 (opts);
10705 /* Print a hint with a suggestion for a core or architecture name that
10706 most closely resembles what the user passed in STR. ARCH is true if
10707 the user is asking for an architecture name. ARCH is false if the user
10708 is asking for a core name. */
10710 static void
10711 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10713 auto_vec<const char *> candidates;
10714 const struct processor *entry = arch ? all_architectures : all_cores;
10715 for (; entry->name != NULL; entry++)
10716 candidates.safe_push (entry->name);
10718 #ifdef HAVE_LOCAL_CPU_DETECT
10719 /* Add also "native" as possible value. */
10720 if (arch)
10721 candidates.safe_push ("native");
10722 #endif
10724 char *s;
10725 const char *hint = candidates_list_and_hint (str, s, candidates);
10726 if (hint)
10727 inform (input_location, "valid arguments are: %s;"
10728 " did you mean %qs?", s, hint);
10729 else
10730 inform (input_location, "valid arguments are: %s", s);
10732 XDELETEVEC (s);
10735 /* Print a hint with a suggestion for a core name that most closely resembles
10736 what the user passed in STR. */
10738 inline static void
10739 aarch64_print_hint_for_core (const char *str)
10741 aarch64_print_hint_for_core_or_arch (str, false);
10744 /* Print a hint with a suggestion for an architecture name that most closely
10745 resembles what the user passed in STR. */
10747 inline static void
10748 aarch64_print_hint_for_arch (const char *str)
10750 aarch64_print_hint_for_core_or_arch (str, true);
10753 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10754 specified in STR and throw errors if appropriate. Put the results if
10755 they are valid in RES and ISA_FLAGS. Return whether the option is
10756 valid. */
10758 static bool
10759 aarch64_validate_mcpu (const char *str, const struct processor **res,
10760 unsigned long *isa_flags)
10762 enum aarch64_parse_opt_result parse_res
10763 = aarch64_parse_cpu (str, res, isa_flags);
10765 if (parse_res == AARCH64_PARSE_OK)
10766 return true;
10768 switch (parse_res)
10770 case AARCH64_PARSE_MISSING_ARG:
10771 error ("missing cpu name in %<-mcpu=%s%>", str);
10772 break;
10773 case AARCH64_PARSE_INVALID_ARG:
10774 error ("unknown value %qs for -mcpu", str);
10775 aarch64_print_hint_for_core (str);
10776 break;
10777 case AARCH64_PARSE_INVALID_FEATURE:
10778 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10779 break;
10780 default:
10781 gcc_unreachable ();
10784 return false;
10787 /* Validate a command-line -march option. Parse the arch and extensions
10788 (if any) specified in STR and throw errors if appropriate. Put the
10789 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10790 option is valid. */
10792 static bool
10793 aarch64_validate_march (const char *str, const struct processor **res,
10794 unsigned long *isa_flags)
10796 enum aarch64_parse_opt_result parse_res
10797 = aarch64_parse_arch (str, res, isa_flags);
10799 if (parse_res == AARCH64_PARSE_OK)
10800 return true;
10802 switch (parse_res)
10804 case AARCH64_PARSE_MISSING_ARG:
10805 error ("missing arch name in %<-march=%s%>", str);
10806 break;
10807 case AARCH64_PARSE_INVALID_ARG:
10808 error ("unknown value %qs for -march", str);
10809 aarch64_print_hint_for_arch (str);
10810 break;
10811 case AARCH64_PARSE_INVALID_FEATURE:
10812 error ("invalid feature modifier in %<-march=%s%>", str);
10813 break;
10814 default:
10815 gcc_unreachable ();
10818 return false;
10821 /* Validate a command-line -mtune option. Parse the cpu
10822 specified in STR and throw errors if appropriate. Put the
10823 result, if it is valid, in RES. Return whether the option is
10824 valid. */
10826 static bool
10827 aarch64_validate_mtune (const char *str, const struct processor **res)
10829 enum aarch64_parse_opt_result parse_res
10830 = aarch64_parse_tune (str, res);
10832 if (parse_res == AARCH64_PARSE_OK)
10833 return true;
10835 switch (parse_res)
10837 case AARCH64_PARSE_MISSING_ARG:
10838 error ("missing cpu name in %<-mtune=%s%>", str);
10839 break;
10840 case AARCH64_PARSE_INVALID_ARG:
10841 error ("unknown value %qs for -mtune", str);
10842 aarch64_print_hint_for_core (str);
10843 break;
10844 default:
10845 gcc_unreachable ();
10847 return false;
10850 /* Return the CPU corresponding to the enum CPU.
10851 If it doesn't specify a cpu, return the default. */
10853 static const struct processor *
10854 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10856 if (cpu != aarch64_none)
10857 return &all_cores[cpu];
10859 /* The & 0x3f is to extract the bottom 6 bits that encode the
10860 default cpu as selected by the --with-cpu GCC configure option
10861 in config.gcc.
10862 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10863 flags mechanism should be reworked to make it more sane. */
10864 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10867 /* Return the architecture corresponding to the enum ARCH.
10868 If it doesn't specify a valid architecture, return the default. */
10870 static const struct processor *
10871 aarch64_get_arch (enum aarch64_arch arch)
10873 if (arch != aarch64_no_arch)
10874 return &all_architectures[arch];
10876 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10878 return &all_architectures[cpu->arch];
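/* Illustrative sketch (not part of the original source) of the packing that
   this code and aarch64_override_options rely on: the configure-time
   TARGET_CPU_DEFAULT keeps the CPU index in its bottom 6 bits and the
   default ISA flags in the remaining bits, unpacked as (x & 0x3f) and
   (x >> 6) respectively.  The helper name is hypothetical.  */
static inline unsigned long
cpu_default_pack_example (unsigned int cpu_index, unsigned long isa_flags)
{
  return (isa_flags << 6) | (cpu_index & 0x3f);
}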
10881 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10883 static poly_uint16
10884 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10886 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10887 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10888 deciding which .md file patterns to use and when deciding whether
10889 something is a legitimate address or constant. */
10890 if (value == SVE_SCALABLE || value == SVE_128)
10891 return poly_uint16 (2, 2);
10892 else
10893 return (int) value / 64;
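/* Illustrative sketch (not part of the original source): the fixed-width
   cases of the mapping above.  VG counts 64-bit granules, so
   -msve-vector-bits=256 gives VG = 4 and -msve-vector-bits=512 gives
   VG = 8, while 128 and "scalable" take the poly_uint16 (2, 2) path
   instead.  The helper name is hypothetical.  */
static inline int
sve_bits_to_vg_example (int bits)
{
  return bits / 64;   /* 256 -> 4, 512 -> 8, 1024 -> 16, 2048 -> 32.  */
}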
10896 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10897 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10898 tuning structs. In particular it must set selected_tune and
10899 aarch64_isa_flags that define the available ISA features and tuning
10900 decisions. It must also set selected_arch as this will be used to
10901 output the .arch asm tags for each function. */
10903 static void
10904 aarch64_override_options (void)
10906 unsigned long cpu_isa = 0;
10907 unsigned long arch_isa = 0;
10908 aarch64_isa_flags = 0;
10910 bool valid_cpu = true;
10911 bool valid_tune = true;
10912 bool valid_arch = true;
10914 selected_cpu = NULL;
10915 selected_arch = NULL;
10916 selected_tune = NULL;
10918 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10919 If either of -march or -mtune is given, they override their
10920 respective component of -mcpu. */
10921 if (aarch64_cpu_string)
10922 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10923 &cpu_isa);
10925 if (aarch64_arch_string)
10926 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10927 &arch_isa);
10929 if (aarch64_tune_string)
10930 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10932 /* If the user did not specify a processor, choose the default
10933 one for them. This will be the CPU set during configuration using
10934 --with-cpu, otherwise it is "generic". */
10935 if (!selected_cpu)
10937 if (selected_arch)
10939 selected_cpu = &all_cores[selected_arch->ident];
10940 aarch64_isa_flags = arch_isa;
10941 explicit_arch = selected_arch->arch;
10943 else
10945 /* Get default configure-time CPU. */
10946 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10947 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10950 if (selected_tune)
10951 explicit_tune_core = selected_tune->ident;
10953 /* If both -mcpu and -march are specified check that they are architecturally
10954 compatible, warn if they're not and prefer the -march ISA flags. */
10955 else if (selected_arch)
10957 if (selected_arch->arch != selected_cpu->arch)
10959 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10960 all_architectures[selected_cpu->arch].name,
10961 selected_arch->name);
10963 aarch64_isa_flags = arch_isa;
10964 explicit_arch = selected_arch->arch;
10965 explicit_tune_core = selected_tune ? selected_tune->ident
10966 : selected_cpu->ident;
10968 else
10970 /* -mcpu but no -march. */
10971 aarch64_isa_flags = cpu_isa;
10972 explicit_tune_core = selected_tune ? selected_tune->ident
10973 : selected_cpu->ident;
10974 gcc_assert (selected_cpu);
10975 selected_arch = &all_architectures[selected_cpu->arch];
10976 explicit_arch = selected_arch->arch;
10979 /* Set the arch as well, as we will need it when outputting
10980 the .arch directive in assembly. */
10981 if (!selected_arch)
10983 gcc_assert (selected_cpu);
10984 selected_arch = &all_architectures[selected_cpu->arch];
10987 if (!selected_tune)
10988 selected_tune = selected_cpu;
10990 #ifndef HAVE_AS_MABI_OPTION
10991 /* The compiler may have been configured with 2.23.* binutils, which does
10992 not have support for ILP32. */
10993 if (TARGET_ILP32)
10994 error ("assembler does not support -mabi=ilp32");
10995 #endif
10997 /* Convert -msve-vector-bits to a VG count. */
10998 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11000 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11001 sorry ("return address signing is only supported for -mabi=lp64");
11003 /* Make sure we properly set up the explicit options. */
11004 if ((aarch64_cpu_string && valid_cpu)
11005 || (aarch64_tune_string && valid_tune))
11006 gcc_assert (explicit_tune_core != aarch64_none);
11008 if ((aarch64_cpu_string && valid_cpu)
11009 || (aarch64_arch_string && valid_arch))
11010 gcc_assert (explicit_arch != aarch64_no_arch);
11012 aarch64_override_options_internal (&global_options);
11014 /* Save these options as the default ones in case we push and pop them later
11015 while processing functions with potential target attributes. */
11016 target_option_default_node = target_option_current_node
11017 = build_target_option_node (&global_options);
11020 /* Implement targetm.override_options_after_change. */
11022 static void
11023 aarch64_override_options_after_change (void)
11025 aarch64_override_options_after_change_1 (&global_options);
11028 static struct machine_function *
11029 aarch64_init_machine_status (void)
11031 struct machine_function *machine;
11032 machine = ggc_cleared_alloc<machine_function> ();
11033 return machine;
11036 void
11037 aarch64_init_expanders (void)
11039 init_machine_status = aarch64_init_machine_status;
11042 /* Resolve the code model to use, taking -fpic/-fPIC into account. */
11043 static void
11044 initialize_aarch64_code_model (struct gcc_options *opts)
11046 if (opts->x_flag_pic)
11048 switch (opts->x_aarch64_cmodel_var)
11050 case AARCH64_CMODEL_TINY:
11051 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11052 break;
11053 case AARCH64_CMODEL_SMALL:
11054 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11055 aarch64_cmodel = (flag_pic == 2
11056 ? AARCH64_CMODEL_SMALL_PIC
11057 : AARCH64_CMODEL_SMALL_SPIC);
11058 #else
11059 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11060 #endif
11061 break;
11062 case AARCH64_CMODEL_LARGE:
11063 sorry ("code model %qs with -f%s", "large",
11064 opts->x_flag_pic > 1 ? "PIC" : "pic");
11065 break;
11066 default:
11067 gcc_unreachable ();
11070 else
11071 aarch64_cmodel = opts->x_aarch64_cmodel_var;
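/* Illustrative sketch (not part of the original source), assuming the
   assembler supports the small-PIC relocations: with the small code model,
   -fPIC (flag_pic == 2) selects AARCH64_CMODEL_SMALL_PIC and -fpic
   (flag_pic == 1) selects AARCH64_CMODEL_SMALL_SPIC, mirroring the switch
   above.  The return values here are stand-in integers.  */
static inline int
small_code_model_pic_example (int flag_pic_value)
{
  return flag_pic_value == 2 ? /* SMALL_PIC */ 1 : /* SMALL_SPIC */ 2;
}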
11074 /* Implement TARGET_OPTION_SAVE. */
11076 static void
11077 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11079 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11082 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11083 using the information saved in PTR. */
11085 static void
11086 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11088 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11089 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11090 opts->x_explicit_arch = ptr->x_explicit_arch;
11091 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11092 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11094 aarch64_override_options_internal (opts);
11097 /* Implement TARGET_OPTION_PRINT. */
11099 static void
11100 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11102 const struct processor *cpu
11103 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11104 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11105 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11106 std::string extension
11107 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11109 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11110 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11111 arch->name, extension.c_str ());
11114 static GTY(()) tree aarch64_previous_fndecl;
11116 void
11117 aarch64_reset_previous_fndecl (void)
11119 aarch64_previous_fndecl = NULL;
11122 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11123 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11124 make sure optab availability predicates are recomputed when necessary. */
11126 void
11127 aarch64_save_restore_target_globals (tree new_tree)
11129 if (TREE_TARGET_GLOBALS (new_tree))
11130 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11131 else if (new_tree == target_option_default_node)
11132 restore_target_globals (&default_target_globals);
11133 else
11134 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11137 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11138 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11139 of the function, if such exists. This function may be called multiple
11140 times on a single function so use aarch64_previous_fndecl to avoid
11141 setting up identical state. */
11143 static void
11144 aarch64_set_current_function (tree fndecl)
11146 if (!fndecl || fndecl == aarch64_previous_fndecl)
11147 return;
11149 tree old_tree = (aarch64_previous_fndecl
11150 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11151 : NULL_TREE);
11153 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11155 /* If current function has no attributes but the previous one did,
11156 use the default node. */
11157 if (!new_tree && old_tree)
11158 new_tree = target_option_default_node;
11160 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11161 the default have been handled by aarch64_save_restore_target_globals from
11162 aarch64_pragma_target_parse. */
11163 if (old_tree == new_tree)
11164 return;
11166 aarch64_previous_fndecl = fndecl;
11168 /* First set the target options. */
11169 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11171 aarch64_save_restore_target_globals (new_tree);
11174 /* Enum describing the various ways we can handle attributes.
11175 In many cases we can reuse the generic option handling machinery. */
11177 enum aarch64_attr_opt_type
11179 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11180 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11181 aarch64_attr_enum, /* Attribute sets an enum variable. */
11182 aarch64_attr_custom /* Attribute requires a custom handling function. */
11185 /* All the information needed to handle a target attribute.
11186 NAME is the name of the attribute.
11187 ATTR_TYPE specifies the type of behavior of the attribute as described
11188 in the definition of enum aarch64_attr_opt_type.
11189 ALLOW_NEG is true if the attribute supports a "no-" form.
11190 HANDLER is the function that takes the attribute string as an argument.
11191 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11192 OPT_NUM is the enum specifying the option that the attribute modifies.
11193 This is needed for attributes that mirror the behavior of a command-line
11194 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11195 aarch64_attr_enum. */
11197 struct aarch64_attribute_info
11199 const char *name;
11200 enum aarch64_attr_opt_type attr_type;
11201 bool allow_neg;
11202 bool (*handler) (const char *);
11203 enum opt_code opt_num;
11206 /* Handle the ARCH_STR argument to the arch= target attribute. */
11208 static bool
11209 aarch64_handle_attr_arch (const char *str)
11211 const struct processor *tmp_arch = NULL;
11212 enum aarch64_parse_opt_result parse_res
11213 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11215 if (parse_res == AARCH64_PARSE_OK)
11217 gcc_assert (tmp_arch);
11218 selected_arch = tmp_arch;
11219 explicit_arch = selected_arch->arch;
11220 return true;
11223 switch (parse_res)
11225 case AARCH64_PARSE_MISSING_ARG:
11226 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11227 break;
11228 case AARCH64_PARSE_INVALID_ARG:
11229 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11230 aarch64_print_hint_for_arch (str);
11231 break;
11232 case AARCH64_PARSE_INVALID_FEATURE:
11233 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11234 break;
11235 default:
11236 gcc_unreachable ();
11239 return false;
11242 /* Handle the argument CPU_STR to the cpu= target attribute. */
11244 static bool
11245 aarch64_handle_attr_cpu (const char *str)
11247 const struct processor *tmp_cpu = NULL;
11248 enum aarch64_parse_opt_result parse_res
11249 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11251 if (parse_res == AARCH64_PARSE_OK)
11253 gcc_assert (tmp_cpu);
11254 selected_tune = tmp_cpu;
11255 explicit_tune_core = selected_tune->ident;
11257 selected_arch = &all_architectures[tmp_cpu->arch];
11258 explicit_arch = selected_arch->arch;
11259 return true;
11262 switch (parse_res)
11264 case AARCH64_PARSE_MISSING_ARG:
11265 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11266 break;
11267 case AARCH64_PARSE_INVALID_ARG:
11268 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11269 aarch64_print_hint_for_core (str);
11270 break;
11271 case AARCH64_PARSE_INVALID_FEATURE:
11272 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11273 break;
11274 default:
11275 gcc_unreachable ();
11278 return false;
11281 /* Handle the argument STR to the tune= target attribute. */
11283 static bool
11284 aarch64_handle_attr_tune (const char *str)
11286 const struct processor *tmp_tune = NULL;
11287 enum aarch64_parse_opt_result parse_res
11288 = aarch64_parse_tune (str, &tmp_tune);
11290 if (parse_res == AARCH64_PARSE_OK)
11292 gcc_assert (tmp_tune);
11293 selected_tune = tmp_tune;
11294 explicit_tune_core = selected_tune->ident;
11295 return true;
11298 switch (parse_res)
11300 case AARCH64_PARSE_INVALID_ARG:
11301 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11302 aarch64_print_hint_for_core (str);
11303 break;
11304 default:
11305 gcc_unreachable ();
11308 return false;
11311 /* Parse an architecture extensions target attribute string specified in STR.
11312 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11313 if successful. Update aarch64_isa_flags to reflect the ISA features
11314 modified. */
11316 static bool
11317 aarch64_handle_attr_isa_flags (char *str)
11319 enum aarch64_parse_opt_result parse_res;
11320 unsigned long isa_flags = aarch64_isa_flags;
11322 /* We allow "+nothing" in the beginning to clear out all architectural
11323 features if the user wants to handpick specific features. */
11324 if (strncmp ("+nothing", str, 8) == 0)
11326 isa_flags = 0;
11327 str += 8;
11330 parse_res = aarch64_parse_extension (str, &isa_flags);
11332 if (parse_res == AARCH64_PARSE_OK)
11334 aarch64_isa_flags = isa_flags;
11335 return true;
11338 switch (parse_res)
11340 case AARCH64_PARSE_MISSING_ARG:
11341 error ("missing value in %<target()%> pragma or attribute");
11342 break;
11344 case AARCH64_PARSE_INVALID_FEATURE:
11345 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11346 break;
11348 default:
11349 gcc_unreachable ();
11352 return false;
11355 /* The target attributes that we support. On top of these we also support just
11356 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11357 handled explicitly in aarch64_process_one_target_attr. */
11359 static const struct aarch64_attribute_info aarch64_attributes[] =
11361 { "general-regs-only", aarch64_attr_mask, false, NULL,
11362 OPT_mgeneral_regs_only },
11363 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11364 OPT_mfix_cortex_a53_835769 },
11365 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11366 OPT_mfix_cortex_a53_843419 },
11367 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11368 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11369 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11370 OPT_momit_leaf_frame_pointer },
11371 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11372 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11373 OPT_march_ },
11374 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11375 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11376 OPT_mtune_ },
11377 { "sign-return-address", aarch64_attr_enum, false, NULL,
11378 OPT_msign_return_address_ },
11379 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
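/* Illustrative usage sketch (not part of the original source): attribute
   forms accepted by the table above and by aarch64_process_one_target_attr,
   including a bare ISA-extension string and a "no-" boolean form.  The
   functions are placeholders and the block is disabled.  */
#if 0
__attribute__ ((target ("+crc")))
static int attr_isa_example (void) { return 0; }

__attribute__ ((target ("no-strict-align,cpu=cortex-a57")))
static int attr_list_example (void) { return 0; }
#endif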
11382 /* Parse ARG_STR which contains the definition of one target attribute.
11383 Show appropriate errors if any or return true if the attribute is valid. */
11385 static bool
11386 aarch64_process_one_target_attr (char *arg_str)
11388 bool invert = false;
11390 size_t len = strlen (arg_str);
11392 if (len == 0)
11394 error ("malformed %<target()%> pragma or attribute");
11395 return false;
11398 char *str_to_check = (char *) alloca (len + 1);
11399 strcpy (str_to_check, arg_str);
11401 /* Skip leading whitespace. */
11402 while (*str_to_check == ' ' || *str_to_check == '\t')
11403 str_to_check++;
11405 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11406 It is easier to detect and handle it explicitly here rather than going
11407 through the machinery for the rest of the target attributes in this
11408 function. */
11409 if (*str_to_check == '+')
11410 return aarch64_handle_attr_isa_flags (str_to_check);
11412 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11414 invert = true;
11415 str_to_check += 3;
11417 char *arg = strchr (str_to_check, '=');
11419 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11420 and point ARG to "foo". */
11421 if (arg)
11423 *arg = '\0';
11424 arg++;
11426 const struct aarch64_attribute_info *p_attr;
11427 bool found = false;
11428 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11430 /* If the names don't match up, or the user has given an argument
11431 to an attribute that doesn't accept one, or didn't give an argument
11432 to an attribute that expects one, fail to match. */
11433 if (strcmp (str_to_check, p_attr->name) != 0)
11434 continue;
11436 found = true;
11437 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11438 || p_attr->attr_type == aarch64_attr_enum;
11440 if (attr_need_arg_p ^ (arg != NULL))
11442 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11443 return false;
11446 /* If the name matches but the attribute does not allow "no-" versions
11447 then we can't match. */
11448 if (invert && !p_attr->allow_neg)
11450 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11451 return false;
11454 switch (p_attr->attr_type)
11456 /* Has a custom handler registered.
11457 For example, cpu=, arch=, tune=. */
11458 case aarch64_attr_custom:
11459 gcc_assert (p_attr->handler);
11460 if (!p_attr->handler (arg))
11461 return false;
11462 break;
11464 /* Either set or unset a boolean option. */
11465 case aarch64_attr_bool:
11467 struct cl_decoded_option decoded;
11469 generate_option (p_attr->opt_num, NULL, !invert,
11470 CL_TARGET, &decoded);
11471 aarch64_handle_option (&global_options, &global_options_set,
11472 &decoded, input_location);
11473 break;
11475 /* Set or unset a bit in the target_flags. aarch64_handle_option
11476 should know what mask to apply given the option number. */
11477 case aarch64_attr_mask:
11479 struct cl_decoded_option decoded;
11480 /* We only need to specify the option number.
11481 aarch64_handle_option will know which mask to apply. */
11482 decoded.opt_index = p_attr->opt_num;
11483 decoded.value = !invert;
11484 aarch64_handle_option (&global_options, &global_options_set,
11485 &decoded, input_location);
11486 break;
11488 /* Use the option setting machinery to set an option to an enum. */
11489 case aarch64_attr_enum:
11491 gcc_assert (arg);
11492 bool valid;
11493 int value;
11494 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11495 &value, CL_TARGET);
11496 if (valid)
11498 set_option (&global_options, NULL, p_attr->opt_num, value,
11499 NULL, DK_UNSPECIFIED, input_location,
11500 global_dc);
11502 else
11504 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11506 break;
11508 default:
11509 gcc_unreachable ();
11513 /* If we reached here we either have found an attribute and validated
11514 it or didn't match any. If we matched an attribute but its arguments
11515 were malformed we will have returned false already. */
11516 return found;
11519 /* Count how many times the character C appears in
11520 NULL-terminated string STR. */
11522 static unsigned int
11523 num_occurences_in_str (char c, char *str)
11525 unsigned int res = 0;
11526 while (*str != '\0')
11528 if (*str == c)
11529 res++;
11531 str++;
11534 return res;
11537 /* Parse the tree in ARGS that contains the target attribute information
11538 and update the global target options space. */
11540 bool
11541 aarch64_process_target_attr (tree args)
11543 if (TREE_CODE (args) == TREE_LIST)
11547 tree head = TREE_VALUE (args);
11548 if (head)
11550 if (!aarch64_process_target_attr (head))
11551 return false;
11553 args = TREE_CHAIN (args);
11554 } while (args);
11556 return true;
11559 if (TREE_CODE (args) != STRING_CST)
11561 error ("attribute %<target%> argument not a string");
11562 return false;
11565 size_t len = strlen (TREE_STRING_POINTER (args));
11566 char *str_to_check = (char *) alloca (len + 1);
11567 strcpy (str_to_check, TREE_STRING_POINTER (args));
11569 if (len == 0)
11571 error ("malformed %<target()%> pragma or attribute");
11572 return false;
11575 /* Used to catch empty strings between commas, i.e.
11576 attribute ((target ("attr1,,attr2"))). */
11577 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11579 /* Handle multiple target attributes separated by ','. */
11580 char *token = strtok (str_to_check, ",");
11582 unsigned int num_attrs = 0;
11583 while (token)
11585 num_attrs++;
11586 if (!aarch64_process_one_target_attr (token))
11588 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11589 return false;
11592 token = strtok (NULL, ",");
11595 if (num_attrs != num_commas + 1)
11597 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11598 return false;
11601 return true;
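/* Illustrative sketch (not part of the original source) of the comma check
   above: "a,b" has one comma and two tokens and is accepted, whereas
   "a,,b" has two commas but strtok still yields only two tokens, so
   num_attrs != num_commas + 1 and the attribute is rejected as malformed.
   The helper name is hypothetical.  */
static inline int
target_attr_well_formed_example (unsigned int num_commas,
                                 unsigned int num_attrs)
{
  return num_attrs == num_commas + 1;
}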
11604 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11605 process attribute ((target ("..."))). */
11607 static bool
11608 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11610 struct cl_target_option cur_target;
11611 bool ret;
11612 tree old_optimize;
11613 tree new_target, new_optimize;
11614 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11616 /* If what we're processing is the current pragma string then the
11617 target option node is already stored in target_option_current_node
11618 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11619 having to re-parse the string. This is especially useful to keep
11620 arm_neon.h compile times down since that header contains a lot
11621 of intrinsics enclosed in pragmas. */
11622 if (!existing_target && args == current_target_pragma)
11624 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11625 return true;
11627 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11629 old_optimize = build_optimization_node (&global_options);
11630 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11632 /* If the function changed the optimization levels as well as setting
11633 target options, start with the optimizations specified. */
11634 if (func_optimize && func_optimize != old_optimize)
11635 cl_optimization_restore (&global_options,
11636 TREE_OPTIMIZATION (func_optimize));
11638 /* Save the current target options to restore at the end. */
11639 cl_target_option_save (&cur_target, &global_options);
11641 /* If fndecl already has some target attributes applied to it, unpack
11642 them so that we add this attribute on top of them, rather than
11643 overwriting them. */
11644 if (existing_target)
11646 struct cl_target_option *existing_options
11647 = TREE_TARGET_OPTION (existing_target);
11649 if (existing_options)
11650 cl_target_option_restore (&global_options, existing_options);
11652 else
11653 cl_target_option_restore (&global_options,
11654 TREE_TARGET_OPTION (target_option_current_node));
11656 ret = aarch64_process_target_attr (args);
11658 /* Set up any additional state. */
11659 if (ret)
11661 aarch64_override_options_internal (&global_options);
11662 /* Initialize SIMD builtins if we haven't already.
11663 Set current_target_pragma to NULL for the duration so that
11664 the builtin initialization code doesn't try to tag the functions
11665 being built with the attributes specified by any current pragma, thus
11666 going into an infinite recursion. */
11667 if (TARGET_SIMD)
11669 tree saved_current_target_pragma = current_target_pragma;
11670 current_target_pragma = NULL;
11671 aarch64_init_simd_builtins ();
11672 current_target_pragma = saved_current_target_pragma;
11674 new_target = build_target_option_node (&global_options);
11676 else
11677 new_target = NULL;
11679 new_optimize = build_optimization_node (&global_options);
11681 if (fndecl && ret)
11683 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11685 if (old_optimize != new_optimize)
11686 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11689 cl_target_option_restore (&global_options, &cur_target);
11691 if (old_optimize != new_optimize)
11692 cl_optimization_restore (&global_options,
11693 TREE_OPTIMIZATION (old_optimize));
11694 return ret;
11697 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11698 tri-bool options (yes, no, don't care) and the default value is
11699 DEF, determine whether to reject inlining. */
11701 static bool
11702 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11703 int dont_care, int def)
11705 /* If the callee doesn't care, always allow inlining. */
11706 if (callee == dont_care)
11707 return true;
11709 /* If the caller doesn't care, always allow inlining. */
11710 if (caller == dont_care)
11711 return true;
11713 /* Otherwise, allow inlining if either the callee and caller values
11714 agree, or if the callee is using the default value. */
11715 return (callee == caller || callee == def);
11718 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11719 to inline CALLEE into CALLER based on target-specific info.
11720 Make sure that the caller and callee have compatible architectural
11721 features. Then go through the other possible target attributes
11722 and see if they can block inlining. Try not to reject always_inline
11723 callees unless they are incompatible architecturally. */
11725 static bool
11726 aarch64_can_inline_p (tree caller, tree callee)
11728 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11729 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11731 struct cl_target_option *caller_opts
11732 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11733 : target_option_default_node);
11735 struct cl_target_option *callee_opts
11736 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11737 : target_option_default_node);
11739 /* Callee's ISA flags should be a subset of the caller's. */
11740 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11741 != callee_opts->x_aarch64_isa_flags)
11742 return false;
11744 /* Allow non-strict-align functions to be inlined into
11745 strict-align ones. */
11746 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11747 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11748 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11749 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11750 return false;
11752 bool always_inline = lookup_attribute ("always_inline",
11753 DECL_ATTRIBUTES (callee));
11755 /* If the architectural features match up and the callee is always_inline
11756 then the other attributes don't matter. */
11757 if (always_inline)
11758 return true;
11760 if (caller_opts->x_aarch64_cmodel_var
11761 != callee_opts->x_aarch64_cmodel_var)
11762 return false;
11764 if (caller_opts->x_aarch64_tls_dialect
11765 != callee_opts->x_aarch64_tls_dialect)
11766 return false;
11768 /* Honour explicit requests to workaround errata. */
11769 if (!aarch64_tribools_ok_for_inlining_p (
11770 caller_opts->x_aarch64_fix_a53_err835769,
11771 callee_opts->x_aarch64_fix_a53_err835769,
11772 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11773 return false;
11775 if (!aarch64_tribools_ok_for_inlining_p (
11776 caller_opts->x_aarch64_fix_a53_err843419,
11777 callee_opts->x_aarch64_fix_a53_err843419,
11778 2, TARGET_FIX_ERR_A53_843419))
11779 return false;
11781 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11782 caller and callee and they don't match up, reject inlining. */
11783 if (!aarch64_tribools_ok_for_inlining_p (
11784 caller_opts->x_flag_omit_leaf_frame_pointer,
11785 callee_opts->x_flag_omit_leaf_frame_pointer,
11786 2, 1))
11787 return false;
11789 /* If the callee has specific tuning overrides, respect them. */
11790 if (callee_opts->x_aarch64_override_tune_string != NULL
11791 && caller_opts->x_aarch64_override_tune_string == NULL)
11792 return false;
11794 /* If the user specified tuning override strings for the
11795 caller and callee and they don't match up, reject inlining.
11796 We just do a string compare here, we don't analyze the meaning
11797 of the string, as it would be too costly for little gain. */
11798 if (callee_opts->x_aarch64_override_tune_string
11799 && caller_opts->x_aarch64_override_tune_string
11800 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11801 caller_opts->x_aarch64_override_tune_string) != 0))
11802 return false;
11804 return true;
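/* Illustrative sketch (not part of GCC): how the checks above interact.
   The function names and attribute strings are made up for the example.

     __attribute__ ((target ("+crypto"), always_inline))
     static inline int callee (int x) { return x + 1; }

     int caller (int x) { return callee (x); }   // caller lacks +crypto

   The callee's ISA flags are not a subset of the caller's, so inlining
   is rejected even though the callee is always_inline; always_inline
   only overrides the later, non-architectural checks (code model, TLS
   dialect, errata workarounds, -momit-leaf-frame-pointer and tuning
   strings).  */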
11807 /* Return true if SYMBOL_REF X binds locally. */
11809 static bool
11810 aarch64_symbol_binds_local_p (const_rtx x)
11812 return (SYMBOL_REF_DECL (x)
11813 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11814 : SYMBOL_REF_LOCAL_P (x));
11817 /* Return true if SYMBOL_REF X is thread local */
11818 static bool
11819 aarch64_tls_symbol_p (rtx x)
11821 if (! TARGET_HAVE_TLS)
11822 return false;
11824 if (GET_CODE (x) != SYMBOL_REF)
11825 return false;
11827 return SYMBOL_REF_TLS_MODEL (x) != 0;
11830 /* Classify a TLS symbol into one of the TLS kinds. */
11831 enum aarch64_symbol_type
11832 aarch64_classify_tls_symbol (rtx x)
11834 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11836 switch (tls_kind)
11838 case TLS_MODEL_GLOBAL_DYNAMIC:
11839 case TLS_MODEL_LOCAL_DYNAMIC:
11840 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11842 case TLS_MODEL_INITIAL_EXEC:
11843 switch (aarch64_cmodel)
11845 case AARCH64_CMODEL_TINY:
11846 case AARCH64_CMODEL_TINY_PIC:
11847 return SYMBOL_TINY_TLSIE;
11848 default:
11849 return SYMBOL_SMALL_TLSIE;
11852 case TLS_MODEL_LOCAL_EXEC:
11853 if (aarch64_tls_size == 12)
11854 return SYMBOL_TLSLE12;
11855 else if (aarch64_tls_size == 24)
11856 return SYMBOL_TLSLE24;
11857 else if (aarch64_tls_size == 32)
11858 return SYMBOL_TLSLE32;
11859 else if (aarch64_tls_size == 48)
11860 return SYMBOL_TLSLE48;
11861 else
11862 gcc_unreachable ();
11864 case TLS_MODEL_EMULATED:
11865 case TLS_MODEL_NONE:
11866 return SYMBOL_FORCE_TO_MEM;
11868 default:
11869 gcc_unreachable ();
11873 /* Return the correct method for accessing X + OFFSET, where X is either
11874 a SYMBOL_REF or LABEL_REF. */
11876 enum aarch64_symbol_type
11877 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11879 if (GET_CODE (x) == LABEL_REF)
11881 switch (aarch64_cmodel)
11883 case AARCH64_CMODEL_LARGE:
11884 return SYMBOL_FORCE_TO_MEM;
11886 case AARCH64_CMODEL_TINY_PIC:
11887 case AARCH64_CMODEL_TINY:
11888 return SYMBOL_TINY_ABSOLUTE;
11890 case AARCH64_CMODEL_SMALL_SPIC:
11891 case AARCH64_CMODEL_SMALL_PIC:
11892 case AARCH64_CMODEL_SMALL:
11893 return SYMBOL_SMALL_ABSOLUTE;
11895 default:
11896 gcc_unreachable ();
11900 if (GET_CODE (x) == SYMBOL_REF)
11902 if (aarch64_tls_symbol_p (x))
11903 return aarch64_classify_tls_symbol (x);
11905 switch (aarch64_cmodel)
11907 case AARCH64_CMODEL_TINY:
11908 /* When we retrieve symbol + offset address, we have to make sure
11909 the offset does not cause overflow of the final address. But
11910 we have no way of knowing the address of symbol at compile time
11911 so we can't accurately say if the distance between the PC and
11912 symbol + offset is outside the addressable range of +/-1M in the
11913 TINY code model. So we rely on images not being greater than
11914 1M, cap the offset at 1M, and require anything beyond 1M to
11915 be loaded using an alternative mechanism. Furthermore, if the
11916 symbol is a weak reference to something that isn't known to
11917 resolve to a symbol in this module, then force to memory. */
11918 if ((SYMBOL_REF_WEAK (x)
11919 && !aarch64_symbol_binds_local_p (x))
11920 || !IN_RANGE (offset, -1048575, 1048575))
11921 return SYMBOL_FORCE_TO_MEM;
11922 return SYMBOL_TINY_ABSOLUTE;
11924 case AARCH64_CMODEL_SMALL:
11925 /* Same reasoning as the tiny code model, but the offset cap here is
11926 4G. */
11927 if ((SYMBOL_REF_WEAK (x)
11928 && !aarch64_symbol_binds_local_p (x))
11929 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11930 HOST_WIDE_INT_C (4294967264)))
11931 return SYMBOL_FORCE_TO_MEM;
11932 return SYMBOL_SMALL_ABSOLUTE;
11934 case AARCH64_CMODEL_TINY_PIC:
11935 if (!aarch64_symbol_binds_local_p (x))
11936 return SYMBOL_TINY_GOT;
11937 return SYMBOL_TINY_ABSOLUTE;
11939 case AARCH64_CMODEL_SMALL_SPIC:
11940 case AARCH64_CMODEL_SMALL_PIC:
11941 if (!aarch64_symbol_binds_local_p (x))
11942 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11943 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11944 return SYMBOL_SMALL_ABSOLUTE;
11946 case AARCH64_CMODEL_LARGE:
11947 /* This is alright even in PIC code as the constant
11948 pool reference is always PC relative and within
11949 the same translation unit. */
11950 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11951 return SYMBOL_SMALL_ABSOLUTE;
11952 else
11953 return SYMBOL_FORCE_TO_MEM;
11955 default:
11956 gcc_unreachable ();
11960 /* By default push everything into the constant pool. */
11961 return SYMBOL_FORCE_TO_MEM;
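/* Illustrative sketch (not part of GCC) of the offset capping above.
   The array and the offset are made up for the example.

     extern char image[];
     char *p = &image[3 << 20];      // symbol + 3MiB offset

   Under -mcmodel=tiny the 3MiB offset falls outside the +/-1M range
   checked above, so the address is classified as SYMBOL_FORCE_TO_MEM
   and materialised through memory (a literal-pool style load) instead
   of a single pc-relative ADR.  Weak symbols that may resolve outside
   the module are handled the same way.  */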
11964 bool
11965 aarch64_constant_address_p (rtx x)
11967 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11970 bool
11971 aarch64_legitimate_pic_operand_p (rtx x)
11973 if (GET_CODE (x) == SYMBOL_REF
11974 || (GET_CODE (x) == CONST
11975 && GET_CODE (XEXP (x, 0)) == PLUS
11976 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11977 return false;
11979 return true;
11982 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11983 that should be rematerialized rather than spilled. */
11985 static bool
11986 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11988 /* Support CSE and rematerialization of common constants. */
11989 if (CONST_INT_P (x)
11990 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11991 || GET_CODE (x) == CONST_VECTOR)
11992 return true;
11994 /* Do not allow vector struct mode constants for Advanced SIMD.
11995 We could support 0 and -1 easily, but they need support in
11996 aarch64-simd.md. */
11997 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11998 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11999 return false;
12001 /* Only accept variable-length vector constants if they can be
12002 handled directly.
12004 ??? It would be possible to handle rematerialization of other
12005 constants via secondary reloads. */
12006 if (vec_flags & VEC_ANY_SVE)
12007 return aarch64_simd_valid_immediate (x, NULL);
12009 if (GET_CODE (x) == HIGH)
12010 x = XEXP (x, 0);
12012 /* Accept polynomial constants that can be calculated by using the
12013 destination of a move as the sole temporary. Constants that
12014 require a second temporary cannot be rematerialized (they can't be
12015 forced to memory and also aren't legitimate constants). */
12016 poly_int64 offset;
12017 if (poly_int_rtx_p (x, &offset))
12018 return aarch64_offset_temporaries (false, offset) <= 1;
12020 /* If an offset is being added to something else, we need to allow the
12021 base to be moved into the destination register, meaning that there
12022 are no free temporaries for the offset. */
12023 x = strip_offset (x, &offset);
12024 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12025 return false;
12027 /* Do not allow const (plus (anchor_symbol, const_int)). */
12028 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12029 return false;
12031 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12032 so spilling them is better than rematerialization. */
12033 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12034 return true;
12036 /* Label references are always constant. */
12037 if (GET_CODE (x) == LABEL_REF)
12038 return true;
12040 return false;
12044 aarch64_load_tp (rtx target)
12046 if (!target
12047 || GET_MODE (target) != Pmode
12048 || !register_operand (target, Pmode))
12049 target = gen_reg_rtx (Pmode);
12051 /* Can return in any reg. */
12052 emit_insn (gen_aarch64_load_tp_hard (target));
12053 return target;
12056 /* On AAPCS systems, this is the "struct __va_list". */
12057 static GTY(()) tree va_list_type;
12059 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12060 Return the type to use as __builtin_va_list.
12062 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12064 struct __va_list
12066 void *__stack;
12067 void *__gr_top;
12068 void *__vr_top;
12069 int __gr_offs;
12070 int __vr_offs;
12071 }; */
12073 static tree
12074 aarch64_build_builtin_va_list (void)
12076 tree va_list_name;
12077 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12079 /* Create the type. */
12080 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12081 /* Give it the required name. */
12082 va_list_name = build_decl (BUILTINS_LOCATION,
12083 TYPE_DECL,
12084 get_identifier ("__va_list"),
12085 va_list_type);
12086 DECL_ARTIFICIAL (va_list_name) = 1;
12087 TYPE_NAME (va_list_type) = va_list_name;
12088 TYPE_STUB_DECL (va_list_type) = va_list_name;
12090 /* Create the fields. */
12091 f_stack = build_decl (BUILTINS_LOCATION,
12092 FIELD_DECL, get_identifier ("__stack"),
12093 ptr_type_node);
12094 f_grtop = build_decl (BUILTINS_LOCATION,
12095 FIELD_DECL, get_identifier ("__gr_top"),
12096 ptr_type_node);
12097 f_vrtop = build_decl (BUILTINS_LOCATION,
12098 FIELD_DECL, get_identifier ("__vr_top"),
12099 ptr_type_node);
12100 f_groff = build_decl (BUILTINS_LOCATION,
12101 FIELD_DECL, get_identifier ("__gr_offs"),
12102 integer_type_node);
12103 f_vroff = build_decl (BUILTINS_LOCATION,
12104 FIELD_DECL, get_identifier ("__vr_offs"),
12105 integer_type_node);
12107 /* Tell tree-stdarg pass about our internal offset fields.
12108 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12109 purposes, to identify whether the code is updating va_list internal
12110 offset fields in an irregular way. */
12111 va_list_gpr_counter_field = f_groff;
12112 va_list_fpr_counter_field = f_vroff;
12114 DECL_ARTIFICIAL (f_stack) = 1;
12115 DECL_ARTIFICIAL (f_grtop) = 1;
12116 DECL_ARTIFICIAL (f_vrtop) = 1;
12117 DECL_ARTIFICIAL (f_groff) = 1;
12118 DECL_ARTIFICIAL (f_vroff) = 1;
12120 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12121 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12122 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12123 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12124 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12126 TYPE_FIELDS (va_list_type) = f_stack;
12127 DECL_CHAIN (f_stack) = f_grtop;
12128 DECL_CHAIN (f_grtop) = f_vrtop;
12129 DECL_CHAIN (f_vrtop) = f_groff;
12130 DECL_CHAIN (f_groff) = f_vroff;
12132 /* Compute its layout. */
12133 layout_type (va_list_type);
12135 return va_list_type;
12138 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12139 static void
12140 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12142 const CUMULATIVE_ARGS *cum;
12143 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12144 tree stack, grtop, vrtop, groff, vroff;
12145 tree t;
12146 int gr_save_area_size = cfun->va_list_gpr_size;
12147 int vr_save_area_size = cfun->va_list_fpr_size;
12148 int vr_offset;
12150 cum = &crtl->args.info;
12151 if (cfun->va_list_gpr_size)
12152 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12153 cfun->va_list_gpr_size);
12154 if (cfun->va_list_fpr_size)
12155 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12156 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12158 if (!TARGET_FLOAT)
12160 gcc_assert (cum->aapcs_nvrn == 0);
12161 vr_save_area_size = 0;
12164 f_stack = TYPE_FIELDS (va_list_type_node);
12165 f_grtop = DECL_CHAIN (f_stack);
12166 f_vrtop = DECL_CHAIN (f_grtop);
12167 f_groff = DECL_CHAIN (f_vrtop);
12168 f_vroff = DECL_CHAIN (f_groff);
12170 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12171 NULL_TREE);
12172 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12173 NULL_TREE);
12174 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12175 NULL_TREE);
12176 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12177 NULL_TREE);
12178 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12179 NULL_TREE);
12181 /* Emit code to initialize STACK, which points to the next varargs stack
12182 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12183 by named arguments. STACK is 8-byte aligned. */
12184 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12185 if (cum->aapcs_stack_size > 0)
12186 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12187 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12188 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12190 /* Emit code to initialize GRTOP, the top of the GR save area.
12191 virtual_incoming_args_rtx should have been 16 byte aligned. */
12192 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12193 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12194 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12196 /* Emit code to initialize VRTOP, the top of the VR save area.
12197 This address is gr_save_area_bytes below GRTOP, rounded
12198 down to the next 16-byte boundary. */
12199 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12200 vr_offset = ROUND_UP (gr_save_area_size,
12201 STACK_BOUNDARY / BITS_PER_UNIT);
12203 if (vr_offset)
12204 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12205 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12206 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12208 /* Emit code to initialize GROFF, the offset from GRTOP of the
12209 next GPR argument. */
12210 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12211 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12212 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12214 /* Likewise emit code to initialize VROFF, the offset from FTOP
12215 of the next VR argument. */
12216 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12217 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12218 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
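/* Illustrative sketch (not part of GCC): what va_start leaves in the
   AAPCS64 __va_list for a simple variadic callee.  The numbers assume
   two named GP arguments, no named FP/SIMD arguments, and that the
   tree-stdarg pass has not shrunk the save areas.

     #include <stdarg.h>

     int
     first_vararg (int a, int b, ...)
     {
       va_list ap;
       va_start (ap, b);
       // ap.__stack   -> first stack-passed variadic argument
       // ap.__gr_top  -> just above the GP save area (x2..x7 saved)
       // ap.__vr_top  -> just above the FP/SIMD save area (q0..q7 saved)
       // ap.__gr_offs == -(6 * 8)  == -48
       // ap.__vr_offs == -(8 * 16) == -128
       int r = va_arg (ap, int);
       va_end (ap);
       return r;
     }
*/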
12221 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12223 static tree
12224 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12225 gimple_seq *post_p ATTRIBUTE_UNUSED)
12227 tree addr;
12228 bool indirect_p;
12229 bool is_ha; /* is HFA or HVA. */
12230 bool dw_align; /* double-word align. */
12231 machine_mode ag_mode = VOIDmode;
12232 int nregs;
12233 machine_mode mode;
12235 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12236 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12237 HOST_WIDE_INT size, rsize, adjust, align;
12238 tree t, u, cond1, cond2;
12240 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12241 if (indirect_p)
12242 type = build_pointer_type (type);
12244 mode = TYPE_MODE (type);
12246 f_stack = TYPE_FIELDS (va_list_type_node);
12247 f_grtop = DECL_CHAIN (f_stack);
12248 f_vrtop = DECL_CHAIN (f_grtop);
12249 f_groff = DECL_CHAIN (f_vrtop);
12250 f_vroff = DECL_CHAIN (f_groff);
12252 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12253 f_stack, NULL_TREE);
12254 size = int_size_in_bytes (type);
12255 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12257 dw_align = false;
12258 adjust = 0;
12259 if (aarch64_vfp_is_call_or_return_candidate (mode,
12260 type,
12261 &ag_mode,
12262 &nregs,
12263 &is_ha))
12265 /* No frontends can create types with variable-sized modes, so we
12266 shouldn't be asked to pass or return them. */
12267 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12269 /* TYPE passed in fp/simd registers. */
12270 if (!TARGET_FLOAT)
12271 aarch64_err_no_fpadvsimd (mode);
12273 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12274 unshare_expr (valist), f_vrtop, NULL_TREE);
12275 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12276 unshare_expr (valist), f_vroff, NULL_TREE);
12278 rsize = nregs * UNITS_PER_VREG;
12280 if (is_ha)
12282 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12283 adjust = UNITS_PER_VREG - ag_size;
12285 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12286 && size < UNITS_PER_VREG)
12288 adjust = UNITS_PER_VREG - size;
12291 else
12293 /* TYPE passed in general registers. */
12294 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12295 unshare_expr (valist), f_grtop, NULL_TREE);
12296 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12297 unshare_expr (valist), f_groff, NULL_TREE);
12298 rsize = ROUND_UP (size, UNITS_PER_WORD);
12299 nregs = rsize / UNITS_PER_WORD;
12301 if (align > 8)
12302 dw_align = true;
12304 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12305 && size < UNITS_PER_WORD)
12307 adjust = UNITS_PER_WORD - size;
12311 /* Get a local temporary for the field value. */
12312 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12314 /* Emit code to branch if off >= 0. */
12315 t = build2 (GE_EXPR, boolean_type_node, off,
12316 build_int_cst (TREE_TYPE (off), 0));
12317 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12319 if (dw_align)
12321 /* Emit: offs = (offs + 15) & -16. */
12322 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12323 build_int_cst (TREE_TYPE (off), 15));
12324 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12325 build_int_cst (TREE_TYPE (off), -16));
12326 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12328 else
12329 roundup = NULL;
12331 /* Update ap.__[g|v]r_offs */
12332 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12333 build_int_cst (TREE_TYPE (off), rsize));
12334 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12336 /* String up. */
12337 if (roundup)
12338 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12340 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12341 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12342 build_int_cst (TREE_TYPE (f_off), 0));
12343 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12345 /* String up: make sure the assignment happens before the use. */
12346 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12347 COND_EXPR_ELSE (cond1) = t;
12349 /* Prepare the trees handling the argument that is passed on the stack;
12350 the top-level node will be stored in ON_STACK. */
12351 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12352 if (align > 8)
12354 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12355 t = fold_build_pointer_plus_hwi (arg, 15);
12356 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12357 build_int_cst (TREE_TYPE (t), -16));
12358 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12360 else
12361 roundup = NULL;
12362 /* Advance ap.__stack */
12363 t = fold_build_pointer_plus_hwi (arg, size + 7);
12364 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12365 build_int_cst (TREE_TYPE (t), -8));
12366 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12367 /* String up roundup and advance. */
12368 if (roundup)
12369 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12370 /* String up with arg */
12371 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12372 /* Big-endianness related address adjustment. */
12373 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12374 && size < UNITS_PER_WORD)
12376 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12377 size_int (UNITS_PER_WORD - size));
12378 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12381 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12382 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12384 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12385 t = off;
12386 if (adjust)
12387 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12388 build_int_cst (TREE_TYPE (off), adjust));
12390 t = fold_convert (sizetype, t);
12391 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12393 if (is_ha)
12395 /* type ha; // treat as "struct {ftype field[n];}"
12396 ... [computing offs]
12397 for (i = 0; i < nregs; ++i, offs += 16)
12398 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12399 return ha; */
12400 int i;
12401 tree tmp_ha, field_t, field_ptr_t;
12403 /* Declare a local variable. */
12404 tmp_ha = create_tmp_var_raw (type, "ha");
12405 gimple_add_tmp_var (tmp_ha);
12407 /* Establish the base type. */
12408 switch (ag_mode)
12410 case E_SFmode:
12411 field_t = float_type_node;
12412 field_ptr_t = float_ptr_type_node;
12413 break;
12414 case E_DFmode:
12415 field_t = double_type_node;
12416 field_ptr_t = double_ptr_type_node;
12417 break;
12418 case E_TFmode:
12419 field_t = long_double_type_node;
12420 field_ptr_t = long_double_ptr_type_node;
12421 break;
12422 case E_HFmode:
12423 field_t = aarch64_fp16_type_node;
12424 field_ptr_t = aarch64_fp16_ptr_type_node;
12425 break;
12426 case E_V2SImode:
12427 case E_V4SImode:
12429 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12430 field_t = build_vector_type_for_mode (innertype, ag_mode);
12431 field_ptr_t = build_pointer_type (field_t);
12433 break;
12434 default:
12435 gcc_assert (0);
12438 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12439 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12440 addr = t;
12441 t = fold_convert (field_ptr_t, addr);
12442 t = build2 (MODIFY_EXPR, field_t,
12443 build1 (INDIRECT_REF, field_t, tmp_ha),
12444 build1 (INDIRECT_REF, field_t, t));
12446 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12447 for (i = 1; i < nregs; ++i)
12449 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12450 u = fold_convert (field_ptr_t, addr);
12451 u = build2 (MODIFY_EXPR, field_t,
12452 build2 (MEM_REF, field_t, tmp_ha,
12453 build_int_cst (field_ptr_t,
12454 (i *
12455 int_size_in_bytes (field_t)))),
12456 build1 (INDIRECT_REF, field_t, u));
12457 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12460 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12461 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12464 COND_EXPR_ELSE (cond2) = t;
12465 addr = fold_convert (build_pointer_type (type), cond1);
12466 addr = build_va_arg_indirect_ref (addr);
12468 if (indirect_p)
12469 addr = build_va_arg_indirect_ref (addr);
12471 return addr;
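/* Illustrative sketch (not part of GCC): the rough shape of the code
   the gimplification above produces for a plain integer va_arg.  The
   C below is only a paraphrase of the trees built above.

     int off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                        // register area already used up
     ap.__gr_offs = off + 8;                 // rsize for an int
     if (ap.__gr_offs > 0)
       goto on_stack;                        // argument straddles the end
     result = *(int *) ((char *) ap.__gr_top + off);
     goto done;

   on_stack:
     arg = ap.__stack;
     ap.__stack = (void *) (((uintptr_t) arg + size + 7) & -8);
     result = *(int *) arg;
   done: ;

   The HFA/HVA path instead copies NREGS elements from the VR save
   area into a local aggregate, as sketched in the comment above.  */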
12474 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12476 static void
12477 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12478 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12479 int no_rtl)
12481 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12482 CUMULATIVE_ARGS local_cum;
12483 int gr_saved = cfun->va_list_gpr_size;
12484 int vr_saved = cfun->va_list_fpr_size;
12486 /* The caller has advanced CUM up to, but not beyond, the last named
12487 argument. Advance a local copy of CUM past the last "real" named
12488 argument, to find out how many registers are left over. */
12489 local_cum = *cum;
12490 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12492 /* Find out how many registers we need to save.
12493 Honor the tree-stdarg analysis results. */
12494 if (cfun->va_list_gpr_size)
12495 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12496 cfun->va_list_gpr_size / UNITS_PER_WORD);
12497 if (cfun->va_list_fpr_size)
12498 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12499 cfun->va_list_fpr_size / UNITS_PER_VREG);
12501 if (!TARGET_FLOAT)
12503 gcc_assert (local_cum.aapcs_nvrn == 0);
12504 vr_saved = 0;
12507 if (!no_rtl)
12509 if (gr_saved > 0)
12511 rtx ptr, mem;
12513 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12514 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12515 - gr_saved * UNITS_PER_WORD);
12516 mem = gen_frame_mem (BLKmode, ptr);
12517 set_mem_alias_set (mem, get_varargs_alias_set ());
12519 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12520 mem, gr_saved);
12522 if (vr_saved > 0)
12524 /* We can't use move_block_from_reg, because it will use
12525 the wrong mode, storing D regs only. */
12526 machine_mode mode = TImode;
12527 int off, i, vr_start;
12529 /* Set OFF to the offset from virtual_incoming_args_rtx of
12530 the first vector register. The VR save area lies below
12531 the GR one, and is aligned to 16 bytes. */
12532 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12533 STACK_BOUNDARY / BITS_PER_UNIT);
12534 off -= vr_saved * UNITS_PER_VREG;
12536 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12537 for (i = 0; i < vr_saved; ++i)
12539 rtx ptr, mem;
12541 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12542 mem = gen_frame_mem (mode, ptr);
12543 set_mem_alias_set (mem, get_varargs_alias_set ());
12544 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12545 off += UNITS_PER_VREG;
12550 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12551 any complication of having crtl->args.pretend_args_size changed. */
12552 cfun->machine->frame.saved_varargs_size
12553 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12554 STACK_BOUNDARY / BITS_PER_UNIT)
12555 + vr_saved * UNITS_PER_VREG);
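/* Illustrative worked example (not part of GCC) of the
   saved_varargs_size computation above, assuming gr_saved == 3,
   vr_saved == 2 and UNITS_PER_VREG == 16:

     ROUND_UP (3 * 8, 16) + 2 * 16  ==  32 + 32  ==  64 bytes

   i.e. the GP save area is padded up to the 16-byte stack boundary and
   the FP/SIMD save area sits immediately below it.  */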
12558 static void
12559 aarch64_conditional_register_usage (void)
12561 int i;
12562 if (!TARGET_FLOAT)
12564 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12566 fixed_regs[i] = 1;
12567 call_used_regs[i] = 1;
12570 if (!TARGET_SVE)
12571 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12573 fixed_regs[i] = 1;
12574 call_used_regs[i] = 1;
12578 /* Walk down the type tree of TYPE counting consecutive base elements.
12579 If *MODEP is VOIDmode, then set it to the first valid floating point
12580 type. If a non-floating point type is found, or if a floating point
12581 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12582 otherwise return the count in the sub-tree. */
12583 static int
12584 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12586 machine_mode mode;
12587 HOST_WIDE_INT size;
12589 switch (TREE_CODE (type))
12591 case REAL_TYPE:
12592 mode = TYPE_MODE (type);
12593 if (mode != DFmode && mode != SFmode
12594 && mode != TFmode && mode != HFmode)
12595 return -1;
12597 if (*modep == VOIDmode)
12598 *modep = mode;
12600 if (*modep == mode)
12601 return 1;
12603 break;
12605 case COMPLEX_TYPE:
12606 mode = TYPE_MODE (TREE_TYPE (type));
12607 if (mode != DFmode && mode != SFmode
12608 && mode != TFmode && mode != HFmode)
12609 return -1;
12611 if (*modep == VOIDmode)
12612 *modep = mode;
12614 if (*modep == mode)
12615 return 2;
12617 break;
12619 case VECTOR_TYPE:
12620 /* Use V2SImode and V4SImode as representatives of all 64-bit
12621 and 128-bit vector types. */
12622 size = int_size_in_bytes (type);
12623 switch (size)
12625 case 8:
12626 mode = V2SImode;
12627 break;
12628 case 16:
12629 mode = V4SImode;
12630 break;
12631 default:
12632 return -1;
12635 if (*modep == VOIDmode)
12636 *modep = mode;
12638 /* Vector modes are considered to be opaque: two vectors are
12639 equivalent for the purposes of being homogeneous aggregates
12640 if they are the same size. */
12641 if (*modep == mode)
12642 return 1;
12644 break;
12646 case ARRAY_TYPE:
12648 int count;
12649 tree index = TYPE_DOMAIN (type);
12651 /* Can't handle incomplete types or sizes that are not
12652 fixed. */
12653 if (!COMPLETE_TYPE_P (type)
12654 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12655 return -1;
12657 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12658 if (count == -1
12659 || !index
12660 || !TYPE_MAX_VALUE (index)
12661 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12662 || !TYPE_MIN_VALUE (index)
12663 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12664 || count < 0)
12665 return -1;
12667 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12668 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12670 /* There must be no padding. */
12671 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12672 count * GET_MODE_BITSIZE (*modep)))
12673 return -1;
12675 return count;
12678 case RECORD_TYPE:
12680 int count = 0;
12681 int sub_count;
12682 tree field;
12684 /* Can't handle incomplete types or sizes that are not
12685 fixed. */
12686 if (!COMPLETE_TYPE_P (type)
12687 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12688 return -1;
12690 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12692 if (TREE_CODE (field) != FIELD_DECL)
12693 continue;
12695 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12696 if (sub_count < 0)
12697 return -1;
12698 count += sub_count;
12701 /* There must be no padding. */
12702 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12703 count * GET_MODE_BITSIZE (*modep)))
12704 return -1;
12706 return count;
12709 case UNION_TYPE:
12710 case QUAL_UNION_TYPE:
12712 /* These aren't very interesting except in a degenerate case. */
12713 int count = 0;
12714 int sub_count;
12715 tree field;
12717 /* Can't handle incomplete types or sizes that are not
12718 fixed. */
12719 if (!COMPLETE_TYPE_P (type)
12720 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12721 return -1;
12723 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12725 if (TREE_CODE (field) != FIELD_DECL)
12726 continue;
12728 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12729 if (sub_count < 0)
12730 return -1;
12731 count = count > sub_count ? count : sub_count;
12734 /* There must be no padding. */
12735 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12736 count * GET_MODE_BITSIZE (*modep)))
12737 return -1;
12739 return count;
12742 default:
12743 break;
12746 return -1;
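/* Illustrative sketch (not part of GCC) of what the walk above reports
   for a few user-level types; the struct names are invented and
   int32x4_t is assumed to come from arm_neon.h.

     struct hfa3   { float a, b, c; };           // 3, *modep == SFmode
     struct hva2   { int32x4_t a, b; };          // 2, *modep == V4SImode
     struct mixed  { float a; double b; };       // -1: element modes differ
     struct padded { float a, b[2]; char pad; }; // -1: trailing padding

   Only counts between 1 and HA_MAX_NUM_FLDS are then accepted by
   aarch64_vfp_is_call_or_return_candidate below.  */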
12749 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12750 type as described in AAPCS64 \S 4.1.2.
12752 See the comment above aarch64_composite_type_p for the notes on MODE. */
12754 static bool
12755 aarch64_short_vector_p (const_tree type,
12756 machine_mode mode)
12758 poly_int64 size = -1;
12760 if (type && TREE_CODE (type) == VECTOR_TYPE)
12761 size = int_size_in_bytes (type);
12762 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12763 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12764 size = GET_MODE_SIZE (mode);
12766 return known_eq (size, 8) || known_eq (size, 16);
12769 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12770 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12771 array types. The C99 floating-point complex types are also considered
12772 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12773 types, which are GCC extensions and out of the scope of AAPCS64, are
12774 treated as composite types here as well.
12776 Note that MODE itself is not sufficient in determining whether a type
12777 is such a composite type or not. This is because
12778 stor-layout.c:compute_record_mode may have already changed the MODE
12779 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12780 structure with only one field may have its MODE set to the mode of the
12781 field. Also an integer mode whose size matches the size of the
12782 RECORD_TYPE type may be used to substitute the original mode
12783 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12784 solely relied on. */
12786 static bool
12787 aarch64_composite_type_p (const_tree type,
12788 machine_mode mode)
12790 if (aarch64_short_vector_p (type, mode))
12791 return false;
12793 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12794 return true;
12796 if (mode == BLKmode
12797 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12798 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12799 return true;
12801 return false;
12804 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12805 shall be passed or returned in simd/fp register(s) (providing these
12806 parameter passing registers are available).
12808 Upon successful return, *COUNT returns the number of needed registers,
12809 *BASE_MODE returns the mode of the individual register and when IS_HA
12810 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12811 floating-point aggregate or a homogeneous short-vector aggregate. */
12813 static bool
12814 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12815 const_tree type,
12816 machine_mode *base_mode,
12817 int *count,
12818 bool *is_ha)
12820 machine_mode new_mode = VOIDmode;
12821 bool composite_p = aarch64_composite_type_p (type, mode);
12823 if (is_ha != NULL) *is_ha = false;
12825 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12826 || aarch64_short_vector_p (type, mode))
12828 *count = 1;
12829 new_mode = mode;
12831 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12833 if (is_ha != NULL) *is_ha = true;
12834 *count = 2;
12835 new_mode = GET_MODE_INNER (mode);
12837 else if (type && composite_p)
12839 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12841 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12843 if (is_ha != NULL) *is_ha = true;
12844 *count = ag_count;
12846 else
12847 return false;
12849 else
12850 return false;
12852 *base_mode = new_mode;
12853 return true;
12856 /* Implement TARGET_STRUCT_VALUE_RTX. */
12858 static rtx
12859 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12860 int incoming ATTRIBUTE_UNUSED)
12862 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12865 /* Implements target hook vector_mode_supported_p. */
12866 static bool
12867 aarch64_vector_mode_supported_p (machine_mode mode)
12869 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12870 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12873 /* Return appropriate SIMD container
12874 for MODE within a vector of WIDTH bits. */
12875 static machine_mode
12876 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12878 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12879 switch (mode)
12881 case E_DFmode:
12882 return VNx2DFmode;
12883 case E_SFmode:
12884 return VNx4SFmode;
12885 case E_HFmode:
12886 return VNx8HFmode;
12887 case E_DImode:
12888 return VNx2DImode;
12889 case E_SImode:
12890 return VNx4SImode;
12891 case E_HImode:
12892 return VNx8HImode;
12893 case E_QImode:
12894 return VNx16QImode;
12895 default:
12896 return word_mode;
12899 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12900 if (TARGET_SIMD)
12902 if (known_eq (width, 128))
12903 switch (mode)
12905 case E_DFmode:
12906 return V2DFmode;
12907 case E_SFmode:
12908 return V4SFmode;
12909 case E_HFmode:
12910 return V8HFmode;
12911 case E_SImode:
12912 return V4SImode;
12913 case E_HImode:
12914 return V8HImode;
12915 case E_QImode:
12916 return V16QImode;
12917 case E_DImode:
12918 return V2DImode;
12919 default:
12920 break;
12922 else
12923 switch (mode)
12925 case E_SFmode:
12926 return V2SFmode;
12927 case E_HFmode:
12928 return V4HFmode;
12929 case E_SImode:
12930 return V2SImode;
12931 case E_HImode:
12932 return V4HImode;
12933 case E_QImode:
12934 return V8QImode;
12935 default:
12936 break;
12939 return word_mode;
12942 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12943 static machine_mode
12944 aarch64_preferred_simd_mode (scalar_mode mode)
12946 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12947 return aarch64_simd_container_mode (mode, bits);
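/* Illustrative examples (not part of GCC) of the container selection
   above, assuming the corresponding ISA support is enabled:

     aarch64_simd_container_mode (SFmode, 128)                 -> V4SFmode
     aarch64_simd_container_mode (HImode, 64)                  -> V4HImode
     aarch64_simd_container_mode (SFmode, BITS_PER_SVE_VECTOR) -> VNx4SFmode

   so aarch64_preferred_simd_mode hands the vectorizer a
   variable-length SVE mode when SVE is enabled and a 128-bit Advanced
   SIMD mode otherwise.  */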
12950 /* Return a list of possible vector sizes for the vectorizer
12951 to iterate over. */
12952 static void
12953 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12955 if (TARGET_SVE)
12956 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12957 sizes->safe_push (16);
12958 sizes->safe_push (8);
12961 /* Implement TARGET_MANGLE_TYPE. */
12963 static const char *
12964 aarch64_mangle_type (const_tree type)
12966 /* The AArch64 ABI documents say that "__va_list" has to be
12967 mangled as if it were in the "std" namespace. */
12968 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12969 return "St9__va_list";
12971 /* Half-precision float. */
12972 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12973 return "Dh";
12975 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12976 builtin types. */
12977 if (TYPE_NAME (type) != NULL)
12978 return aarch64_mangle_builtin_type (type);
12980 /* Use the default mangling. */
12981 return NULL;
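/* Illustrative examples (not part of GCC) of the manglings chosen
   above, as they would appear in C++ symbol names:

     void f (__builtin_va_list);   // _Z1fSt9__va_list
     void g (__fp16);              // _Z1gDh

   i.e. va_list mangles as if it were std::__va_list and __fp16 uses
   the generic half-precision encoding "Dh".  */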
12984 /* Find the first rtx_insn before insn that will generate an assembly
12985 instruction. */
12987 static rtx_insn *
12988 aarch64_prev_real_insn (rtx_insn *insn)
12990 if (!insn)
12991 return NULL;
12995 insn = prev_real_insn (insn);
12997 while (insn && recog_memoized (insn) < 0);
12999 return insn;
13002 static bool
13003 is_madd_op (enum attr_type t1)
13005 unsigned int i;
13006 /* A number of these may be AArch32 only. */
13007 enum attr_type mlatypes[] = {
13008 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13009 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13010 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13013 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13015 if (t1 == mlatypes[i])
13016 return true;
13019 return false;
13022 /* Check if there is a register dependency between a load and the insn
13023 for which we hold recog_data. */
13025 static bool
13026 dep_between_memop_and_curr (rtx memop)
13028 rtx load_reg;
13029 int opno;
13031 gcc_assert (GET_CODE (memop) == SET);
13033 if (!REG_P (SET_DEST (memop)))
13034 return false;
13036 load_reg = SET_DEST (memop);
13037 for (opno = 1; opno < recog_data.n_operands; opno++)
13039 rtx operand = recog_data.operand[opno];
13040 if (REG_P (operand)
13041 && reg_overlap_mentioned_p (load_reg, operand))
13042 return true;
13045 return false;
13049 /* When working around the Cortex-A53 erratum 835769,
13050 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13051 instruction and has a preceding memory instruction such that a NOP
13052 should be inserted between them. */
13054 bool
13055 aarch64_madd_needs_nop (rtx_insn* insn)
13057 enum attr_type attr_type;
13058 rtx_insn *prev;
13059 rtx body;
13061 if (!TARGET_FIX_ERR_A53_835769)
13062 return false;
13064 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13065 return false;
13067 attr_type = get_attr_type (insn);
13068 if (!is_madd_op (attr_type))
13069 return false;
13071 prev = aarch64_prev_real_insn (insn);
13072 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13073 Restore recog state to INSN to avoid state corruption. */
13074 extract_constrain_insn_cached (insn);
13076 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13077 return false;
13079 body = single_set (prev);
13081 /* If the previous insn is a memory op and there is no dependency between
13082 it and the DImode madd, emit a NOP between them. If body is NULL then we
13083 have a complex memory operation, probably a load/store pair.
13084 Be conservative for now and emit a NOP. */
13085 if (GET_MODE (recog_data.operand[0]) == DImode
13086 && (!body || !dep_between_memop_and_curr (body)))
13087 return true;
13089 return false;
13094 /* Implement FINAL_PRESCAN_INSN. */
13096 void
13097 aarch64_final_prescan_insn (rtx_insn *insn)
13099 if (aarch64_madd_needs_nop (insn))
13100 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
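/* Illustrative sketch (not part of GCC) of the erratum 835769
   workaround above; the registers are arbitrary.  With
   -mfix-cortex-a53-835769, a sequence such as

       ldr   x3, [x2]
       madd  x0, x1, x1, x4

   is emitted with a separating

       nop   // between mem op and mult-accumulate

   because the 64-bit multiply-accumulate has no register dependency on
   the preceding load.  If the madd instead consumed x3, no NOP would
   be needed.  */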
13104 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13105 instruction. */
13107 bool
13108 aarch64_sve_index_immediate_p (rtx base_or_step)
13110 return (CONST_INT_P (base_or_step)
13111 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13114 /* Return true if X is a valid immediate for the SVE ADD and SUB
13115 instructions. Negate X first if NEGATE_P is true. */
13117 bool
13118 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13120 rtx elt;
13122 if (!const_vec_duplicate_p (x, &elt)
13123 || !CONST_INT_P (elt))
13124 return false;
13126 HOST_WIDE_INT val = INTVAL (elt);
13127 if (negate_p)
13128 val = -val;
13129 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13131 if (val & 0xff)
13132 return IN_RANGE (val, 0, 0xff);
13133 return IN_RANGE (val, 0, 0xff00);
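/* Illustrative worked examples (not part of GCC) of the range check
   above, for a duplicated 16-bit element:

     0x003f : low byte nonzero, 0x3f   <= 0xff   -> valid (ADD #63)
     0x1100 : low byte zero,    0x1100 <= 0xff00 -> valid (ADD #17, LSL #8)
     0x0101 : low byte nonzero, 0x101  >  0xff   -> rejected

   i.e. SVE ADD/SUB take an unsigned 8-bit immediate, optionally
   shifted left by 8.  */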
13136 /* Return true if X is a valid immediate operand for an SVE logical
13137 instruction such as AND. */
13139 bool
13140 aarch64_sve_bitmask_immediate_p (rtx x)
13142 rtx elt;
13144 return (const_vec_duplicate_p (x, &elt)
13145 && CONST_INT_P (elt)
13146 && aarch64_bitmask_imm (INTVAL (elt),
13147 GET_MODE_INNER (GET_MODE (x))));
13150 /* Return true if X is a valid immediate for the SVE DUP and CPY
13151 instructions. */
13153 bool
13154 aarch64_sve_dup_immediate_p (rtx x)
13156 rtx elt;
13158 if (!const_vec_duplicate_p (x, &elt)
13159 || !CONST_INT_P (elt))
13160 return false;
13162 HOST_WIDE_INT val = INTVAL (elt);
13163 if (val & 0xff)
13164 return IN_RANGE (val, -0x80, 0x7f);
13165 return IN_RANGE (val, -0x8000, 0x7f00);
13168 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13169 SIGNED_P says whether the operand is signed rather than unsigned. */
13171 bool
13172 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13174 rtx elt;
13176 return (const_vec_duplicate_p (x, &elt)
13177 && CONST_INT_P (elt)
13178 && (signed_p
13179 ? IN_RANGE (INTVAL (elt), -16, 15)
13180 : IN_RANGE (INTVAL (elt), 0, 127)));
13183 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13184 instruction. Negate X first if NEGATE_P is true. */
13186 bool
13187 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13189 rtx elt;
13190 REAL_VALUE_TYPE r;
13192 if (!const_vec_duplicate_p (x, &elt)
13193 || GET_CODE (elt) != CONST_DOUBLE)
13194 return false;
13196 r = *CONST_DOUBLE_REAL_VALUE (elt);
13198 if (negate_p)
13199 r = real_value_negate (&r);
13201 if (real_equal (&r, &dconst1))
13202 return true;
13203 if (real_equal (&r, &dconsthalf))
13204 return true;
13205 return false;
13208 /* Return true if X is a valid immediate operand for an SVE FMUL
13209 instruction. */
13211 bool
13212 aarch64_sve_float_mul_immediate_p (rtx x)
13214 rtx elt;
13216 /* GCC will never generate a multiply with an immediate of 2, so there is no
13217 point testing for it (even though it is a valid constant). */
13218 return (const_vec_duplicate_p (x, &elt)
13219 && GET_CODE (elt) == CONST_DOUBLE
13220 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13223 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13224 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13225 is nonnull, use it to describe valid immediates. */
13226 static bool
13227 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13228 simd_immediate_info *info,
13229 enum simd_immediate_check which,
13230 simd_immediate_info::insn_type insn)
13232 /* Try a 4-byte immediate with LSL. */
13233 for (unsigned int shift = 0; shift < 32; shift += 8)
13234 if ((val32 & (0xff << shift)) == val32)
13236 if (info)
13237 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13238 simd_immediate_info::LSL, shift);
13239 return true;
13242 /* Try a 2-byte immediate with LSL. */
13243 unsigned int imm16 = val32 & 0xffff;
13244 if (imm16 == (val32 >> 16))
13245 for (unsigned int shift = 0; shift < 16; shift += 8)
13246 if ((imm16 & (0xff << shift)) == imm16)
13248 if (info)
13249 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13250 simd_immediate_info::LSL, shift);
13251 return true;
13254 /* Try a 4-byte immediate with MSL, except for cases that MVN
13255 can handle. */
13256 if (which == AARCH64_CHECK_MOV)
13257 for (unsigned int shift = 8; shift < 24; shift += 8)
13259 unsigned int low = (1 << shift) - 1;
13260 if (((val32 & (0xff << shift)) | low) == val32)
13262 if (info)
13263 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13264 simd_immediate_info::MSL, shift);
13265 return true;
13269 return false;
13272 /* Return true if replicating VAL64 is a valid immediate for the
13273 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13274 use it to describe valid immediates. */
13275 static bool
13276 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13277 simd_immediate_info *info,
13278 enum simd_immediate_check which)
13280 unsigned int val32 = val64 & 0xffffffff;
13281 unsigned int val16 = val64 & 0xffff;
13282 unsigned int val8 = val64 & 0xff;
13284 if (val32 == (val64 >> 32))
13286 if ((which & AARCH64_CHECK_ORR) != 0
13287 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13288 simd_immediate_info::MOV))
13289 return true;
13291 if ((which & AARCH64_CHECK_BIC) != 0
13292 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13293 simd_immediate_info::MVN))
13294 return true;
13296 /* Try using a replicated byte. */
13297 if (which == AARCH64_CHECK_MOV
13298 && val16 == (val32 >> 16)
13299 && val8 == (val16 >> 8))
13301 if (info)
13302 *info = simd_immediate_info (QImode, val8);
13303 return true;
13307 /* Try using a bit-to-bytemask. */
13308 if (which == AARCH64_CHECK_MOV)
13310 unsigned int i;
13311 for (i = 0; i < 64; i += 8)
13313 unsigned char byte = (val64 >> i) & 0xff;
13314 if (byte != 0 && byte != 0xff)
13315 break;
13317 if (i == 64)
13319 if (info)
13320 *info = simd_immediate_info (DImode, val64);
13321 return true;
13324 return false;
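/* Illustrative worked examples (not part of GCC) of 64-bit replicated
   values the checks above accept, with a rough idea of the encoding:

     0xab00ab00ab00ab00  2-byte pattern   -> MOVI Vd.8h, #0xab, LSL #8
     0x0003ffff0003ffff  shifting ones    -> MOVI Vd.4s, #0x3, MSL #16
     0x4242424242424242  replicated byte  -> MOVI Vd.16b, #0x42
     0x00ff0000ff00ff00  bit-to-bytemask  -> 64-bit MOVI form

   A value such as 0x0102030405060708 matches none of the patterns and
   is not a valid Advanced SIMD move immediate.  */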
13327 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13328 instruction. If INFO is nonnull, use it to describe valid immediates. */
13330 static bool
13331 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13332 simd_immediate_info *info)
13334 scalar_int_mode mode = DImode;
13335 unsigned int val32 = val64 & 0xffffffff;
13336 if (val32 == (val64 >> 32))
13338 mode = SImode;
13339 unsigned int val16 = val32 & 0xffff;
13340 if (val16 == (val32 >> 16))
13342 mode = HImode;
13343 unsigned int val8 = val16 & 0xff;
13344 if (val8 == (val16 >> 8))
13345 mode = QImode;
13348 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13349 if (IN_RANGE (val, -0x80, 0x7f))
13351 /* DUP with no shift. */
13352 if (info)
13353 *info = simd_immediate_info (mode, val);
13354 return true;
13356 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13358 /* DUP with LSL #8. */
13359 if (info)
13360 *info = simd_immediate_info (mode, val);
13361 return true;
13363 if (aarch64_bitmask_imm (val64, mode))
13365 /* DUPM. */
13366 if (info)
13367 *info = simd_immediate_info (mode, val);
13368 return true;
13370 return false;
13373 /* Return true if OP is a valid SIMD immediate for the operation
13374 described by WHICH. If INFO is nonnull, use it to describe valid
13375 immediates. */
13376 bool
13377 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13378 enum simd_immediate_check which)
13380 machine_mode mode = GET_MODE (op);
13381 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13382 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13383 return false;
13385 scalar_mode elt_mode = GET_MODE_INNER (mode);
13386 rtx base, step;
13387 unsigned int n_elts;
13388 if (GET_CODE (op) == CONST_VECTOR
13389 && CONST_VECTOR_DUPLICATE_P (op))
13390 n_elts = CONST_VECTOR_NPATTERNS (op);
13391 else if ((vec_flags & VEC_SVE_DATA)
13392 && const_vec_series_p (op, &base, &step))
13394 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13395 if (!aarch64_sve_index_immediate_p (base)
13396 || !aarch64_sve_index_immediate_p (step))
13397 return false;
13399 if (info)
13400 *info = simd_immediate_info (elt_mode, base, step);
13401 return true;
13403 else if (GET_CODE (op) == CONST_VECTOR
13404 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13405 /* N_ELTS set above. */;
13406 else
13407 return false;
13409 /* Handle PFALSE and PTRUE. */
13410 if (vec_flags & VEC_SVE_PRED)
13411 return (op == CONST0_RTX (mode)
13412 || op == CONSTM1_RTX (mode));
13414 scalar_float_mode elt_float_mode;
13415 if (n_elts == 1
13416 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13418 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13419 if (aarch64_float_const_zero_rtx_p (elt)
13420 || aarch64_float_const_representable_p (elt))
13422 if (info)
13423 *info = simd_immediate_info (elt_float_mode, elt);
13424 return true;
13428 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13429 if (elt_size > 8)
13430 return false;
13432 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13434 /* Expand the vector constant out into a byte vector, with the least
13435 significant byte of the register first. */
13436 auto_vec<unsigned char, 16> bytes;
13437 bytes.reserve (n_elts * elt_size);
13438 for (unsigned int i = 0; i < n_elts; i++)
13440 /* The vector is provided in gcc endian-neutral fashion.
13441 For aarch64_be Advanced SIMD, it must be laid out in the vector
13442 register in reverse order. */
13443 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13444 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13446 if (elt_mode != elt_int_mode)
13447 elt = gen_lowpart (elt_int_mode, elt);
13449 if (!CONST_INT_P (elt))
13450 return false;
13452 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13453 for (unsigned int byte = 0; byte < elt_size; byte++)
13455 bytes.quick_push (elt_val & 0xff);
13456 elt_val >>= BITS_PER_UNIT;
13460 /* The immediate must repeat every eight bytes. */
13461 unsigned int nbytes = bytes.length ();
13462 for (unsigned i = 8; i < nbytes; ++i)
13463 if (bytes[i] != bytes[i - 8])
13464 return false;
13466 /* Get the repeating 8-byte value as an integer. No endian correction
13467 is needed here because bytes is already in lsb-first order. */
13468 unsigned HOST_WIDE_INT val64 = 0;
13469 for (unsigned int i = 0; i < 8; i++)
13470 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13471 << (i * BITS_PER_UNIT));
13473 if (vec_flags & VEC_SVE_DATA)
13474 return aarch64_sve_valid_immediate (val64, info);
13475 else
13476 return aarch64_advsimd_valid_immediate (val64, info, which);
13479 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13480 has a step in the range of INDEX. Return the index expression if so,
13481 otherwise return null. */
13483 aarch64_check_zero_based_sve_index_immediate (rtx x)
13485 rtx base, step;
13486 if (const_vec_series_p (x, &base, &step)
13487 && base == const0_rtx
13488 && aarch64_sve_index_immediate_p (step))
13489 return step;
13490 return NULL_RTX;
13493 /* Check if immediate shift constants are within range. */
13494 bool
13495 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13497 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13498 if (left)
13499 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13500 else
13501 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13504 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13505 operation of width WIDTH at bit position POS. */
13508 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13510 gcc_assert (CONST_INT_P (width));
13511 gcc_assert (CONST_INT_P (pos));
13513 unsigned HOST_WIDE_INT mask
13514 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13515 return GEN_INT (mask << UINTVAL (pos));
13518 bool
13519 aarch64_mov_operand_p (rtx x, machine_mode mode)
13521 if (GET_CODE (x) == HIGH
13522 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13523 return true;
13525 if (CONST_INT_P (x))
13526 return true;
13528 if (VECTOR_MODE_P (GET_MODE (x)))
13529 return aarch64_simd_valid_immediate (x, NULL);
13531 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13532 return true;
13534 if (aarch64_sve_cnt_immediate_p (x))
13535 return true;
13537 return aarch64_classify_symbolic_expression (x)
13538 == SYMBOL_TINY_ABSOLUTE;
13541 /* Return a const_int vector of VAL. */
13542 rtx
13543 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13545 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13546 return gen_const_vec_duplicate (mode, c);
13549 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13551 bool
13552 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13554 machine_mode vmode;
13556 vmode = aarch64_simd_container_mode (mode, 64);
13557 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13558 return aarch64_simd_valid_immediate (op_v, NULL);
13561 /* Construct and return a PARALLEL RTX vector with elements numbering the
13562 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13563 the vector - from the perspective of the architecture. This does not
13564 line up with GCC's perspective on lane numbers, so we end up with
13565 different masks depending on our target endian-ness. The diagram
13566 below may help. We must draw the distinction when building masks
13567 which select one half of the vector. An instruction selecting
13568 architectural low-lanes for a big-endian target, must be described using
13569 a mask selecting GCC high-lanes.
13571 Big-Endian Little-Endian
13573 GCC 0 1 2 3 3 2 1 0
13574 | x | x | x | x | | x | x | x | x |
13575 Architecture 3 2 1 0 3 2 1 0
13577 Low Mask: { 2, 3 } { 0, 1 }
13578 High Mask: { 0, 1 } { 2, 3 }
13580 MODE is the mode of the vector and NUNITS is the number of units in it.  */
13582 rtx
13583 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13585 rtvec v = rtvec_alloc (nunits / 2);
13586 int high_base = nunits / 2;
13587 int low_base = 0;
13588 int base;
13589 rtx t1;
13590 int i;
13592 if (BYTES_BIG_ENDIAN)
13593 base = high ? low_base : high_base;
13594 else
13595 base = high ? high_base : low_base;
13597 for (i = 0; i < nunits / 2; i++)
13598 RTVEC_ELT (v, i) = GEN_INT (base + i);
13600 t1 = gen_rtx_PARALLEL (mode, v);
13601 return t1;
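/* A concrete example of the above: for V4SImode (NUNITS == 4) and
   HIGH == true this returns (parallel [2 3]) on little-endian but
   (parallel [0 1]) on big-endian, matching the "High Mask" row of the
   diagram.  */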
13604 /* Check OP for validity as a PARALLEL RTX vector with elements
13605 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13606 from the perspective of the architecture. See the diagram above
13607 aarch64_simd_vect_par_cnst_half for more details. */
13609 bool
13610 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13611 bool high)
13613 int nelts;
13614 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13615 return false;
13617 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13618 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13619 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13620 int i = 0;
13622 if (count_op != count_ideal)
13623 return false;
13625 for (i = 0; i < count_ideal; i++)
13627 rtx elt_op = XVECEXP (op, 0, i);
13628 rtx elt_ideal = XVECEXP (ideal, 0, i);
13630 if (!CONST_INT_P (elt_op)
13631 || INTVAL (elt_ideal) != INTVAL (elt_op))
13632 return false;
13634 return true;
13637 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13638 HIGH (exclusive). */
13639 void
13640 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13641 const_tree exp)
13643 HOST_WIDE_INT lane;
13644 gcc_assert (CONST_INT_P (operand));
13645 lane = INTVAL (operand);
13647 if (lane < low || lane >= high)
13649 if (exp)
13650 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13651 else
13652 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13656 /* Perform endian correction on lane number N, which indexes a vector
13657 of mode MODE, and return the result as an SImode rtx. */
13659 rtx
13660 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13662 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13665 /* Return TRUE if OP is a valid vector addressing mode. */
13667 bool
13668 aarch64_simd_mem_operand_p (rtx op)
13670 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13671 || REG_P (XEXP (op, 0)));
13674 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13676 bool
13677 aarch64_sve_ld1r_operand_p (rtx op)
13679 struct aarch64_address_info addr;
13680 scalar_mode mode;
13682 return (MEM_P (op)
13683 && is_a <scalar_mode> (GET_MODE (op), &mode)
13684 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13685 && addr.type == ADDRESS_REG_IMM
13686 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
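/* In other words, LD1R here accepts a base register plus an unsigned
   immediate of up to 63 element sizes; e.g. for SImode the valid constant
   offsets are 0, 4, 8, ..., 252.  */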
13689 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13690 The conditions for STR are the same. */
13691 bool
13692 aarch64_sve_ldr_operand_p (rtx op)
13694 struct aarch64_address_info addr;
13696 return (MEM_P (op)
13697 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13698 false, ADDR_QUERY_ANY)
13699 && addr.type == ADDRESS_REG_IMM);
13702 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13703 We need to be able to access the individual pieces, so the range
13704 is different from LD[234] and ST[234]. */
13705 bool
13706 aarch64_sve_struct_memory_operand_p (rtx op)
13708 if (!MEM_P (op))
13709 return false;
13711 machine_mode mode = GET_MODE (op);
13712 struct aarch64_address_info addr;
13713 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13714 ADDR_QUERY_ANY)
13715 || addr.type != ADDRESS_REG_IMM)
13716 return false;
13718 poly_int64 first = addr.const_offset;
13719 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13720 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13721 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
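/* A sketch of the resulting range: for a two-vector tuple the mode size is
   2 * BYTES_PER_SVE_VECTOR, so LAST == FIRST + BYTES_PER_SVE_VECTOR and
   both offsets must be whole vectors in [-8, 7] vectors, leaving FIRST in
   the range [-8, 6] vectors.  */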
13724 /* Emit a register copy from operand to operand, taking care not to
13725 early-clobber source registers in the process.
13727 COUNT is the number of components into which the copy needs to be
13728 decomposed. */
13729 void
13730 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13731 unsigned int count)
13733 unsigned int i;
13734 int rdest = REGNO (operands[0]);
13735 int rsrc = REGNO (operands[1]);
13737 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13738 || rdest < rsrc)
13739 for (i = 0; i < count; i++)
13740 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13741 gen_rtx_REG (mode, rsrc + i));
13742 else
13743 for (i = 0; i < count; i++)
13744 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13745 gen_rtx_REG (mode, rsrc + count - i - 1));
13748 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13749 one of VSTRUCT modes: OI, CI, or XI. */
13750 int
13751 aarch64_simd_attr_length_rglist (machine_mode mode)
13753 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13754 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
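/* For example, an OImode register list covers two vector registers and so
   needs two instructions (8 bytes); XImode covers four and needs 16.  */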
13757 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13758 alignment of a vector to 128 bits. SVE predicates have an alignment of
13759 16 bits. */
13760 static HOST_WIDE_INT
13761 aarch64_simd_vector_alignment (const_tree type)
13763 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13764 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13765 be set for non-predicate vectors of booleans. Modes are the most
13766 direct way we have of identifying real SVE predicate types. */
13767 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13768 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13769 return MIN (align, 128);
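/* For instance, a 64-bit V2SI type gets 64-bit alignment, a 256-bit GNU
   vector type is capped at 128 bits, and a variable-length SVE predicate
   type gets 16-bit alignment via the MODE_VECTOR_BOOL test above.  */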
13772 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13773 static HOST_WIDE_INT
13774 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13776 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13778 /* If the length of the vector is fixed, try to align to that length,
13779 otherwise don't try to align at all. */
13780 HOST_WIDE_INT result;
13781 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13782 result = TYPE_ALIGN (TREE_TYPE (type));
13783 return result;
13785 return TYPE_ALIGN (type);
13788 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13789 static bool
13790 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13792 if (is_packed)
13793 return false;
13795 /* For fixed-length vectors, check that the vectorizer will aim for
13796 full-vector alignment. This isn't true for generic GCC vectors
13797 that are wider than the ABI maximum of 128 bits. */
13798 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13799 && (wi::to_widest (TYPE_SIZE (type))
13800 != aarch64_vectorize_preferred_vector_alignment (type)))
13801 return false;
13803 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13804 return true;
13807 /* Return true if the vector misalignment factor is supported by the
13808 target. */
13809 static bool
13810 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13811 const_tree type, int misalignment,
13812 bool is_packed)
13814 if (TARGET_SIMD && STRICT_ALIGNMENT)
13816 /* Return if movmisalign pattern is not supported for this mode. */
13817 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13818 return false;
13820 /* Misalignment factor is unknown at compile time. */
13821 if (misalignment == -1)
13822 return false;
13824 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13825 is_packed);
13828 /* If VALS is a vector constant that can be loaded into a register
13829 using DUP, generate instructions to do so and return an RTX to
13830 assign to the register. Otherwise return NULL_RTX. */
13831 static rtx
13832 aarch64_simd_dup_constant (rtx vals)
13834 machine_mode mode = GET_MODE (vals);
13835 machine_mode inner_mode = GET_MODE_INNER (mode);
13836 rtx x;
13838 if (!const_vec_duplicate_p (vals, &x))
13839 return NULL_RTX;
13841 /* We can load this constant by using DUP and a constant in a
13842 single ARM register. This will be cheaper than a vector
13843 load. */
13844 x = copy_to_mode_reg (inner_mode, x);
13845 return gen_vec_duplicate (mode, x);
13849 /* Generate code to load VALS, which is a PARALLEL containing only
13850 constants (for vec_init) or CONST_VECTOR, efficiently into a
13851 register. Returns an RTX to copy into the register, or NULL_RTX
13852 for a PARALLEL that cannot be converted into a CONST_VECTOR.  */
13853 static rtx
13854 aarch64_simd_make_constant (rtx vals)
13856 machine_mode mode = GET_MODE (vals);
13857 rtx const_dup;
13858 rtx const_vec = NULL_RTX;
13859 int n_const = 0;
13860 int i;
13862 if (GET_CODE (vals) == CONST_VECTOR)
13863 const_vec = vals;
13864 else if (GET_CODE (vals) == PARALLEL)
13866 /* A CONST_VECTOR must contain only CONST_INTs and
13867 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13868 Only store valid constants in a CONST_VECTOR. */
13869 int n_elts = XVECLEN (vals, 0);
13870 for (i = 0; i < n_elts; ++i)
13872 rtx x = XVECEXP (vals, 0, i);
13873 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13874 n_const++;
13876 if (n_const == n_elts)
13877 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13879 else
13880 gcc_unreachable ();
13882 if (const_vec != NULL_RTX
13883 && aarch64_simd_valid_immediate (const_vec, NULL))
13884 /* Load using MOVI/MVNI. */
13885 return const_vec;
13886 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13887 /* Loaded using DUP. */
13888 return const_dup;
13889 else if (const_vec != NULL_RTX)
13890 /* Load from constant pool.  We cannot take advantage of single-cycle
13891 LD1 because we need a PC-relative addressing mode. */
13892 return const_vec;
13893 else
13894 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13895 We cannot construct an initializer.  */
13896 return NULL_RTX;
13899 /* Expand a vector initialisation sequence, such that TARGET is
13900 initialised to contain VALS. */
13902 void
13903 aarch64_expand_vector_init (rtx target, rtx vals)
13905 machine_mode mode = GET_MODE (target);
13906 scalar_mode inner_mode = GET_MODE_INNER (mode);
13907 /* The number of vector elements. */
13908 int n_elts = XVECLEN (vals, 0);
13909 /* The number of vector elements which are not constant. */
13910 int n_var = 0;
13911 rtx any_const = NULL_RTX;
13912 /* The first element of vals. */
13913 rtx v0 = XVECEXP (vals, 0, 0);
13914 bool all_same = true;
13916 /* Count the number of variable elements to initialise. */
13917 for (int i = 0; i < n_elts; ++i)
13919 rtx x = XVECEXP (vals, 0, i);
13920 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13921 ++n_var;
13922 else
13923 any_const = x;
13925 all_same &= rtx_equal_p (x, v0);
13928 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13929 how best to handle this. */
13930 if (n_var == 0)
13932 rtx constant = aarch64_simd_make_constant (vals);
13933 if (constant != NULL_RTX)
13935 emit_move_insn (target, constant);
13936 return;
13940 /* Splat a single non-constant element if we can. */
13941 if (all_same)
13943 rtx x = copy_to_mode_reg (inner_mode, v0);
13944 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13945 return;
13948 enum insn_code icode = optab_handler (vec_set_optab, mode);
13949 gcc_assert (icode != CODE_FOR_nothing);
13951 /* If there are only variable elements, try to optimize
13952 the insertion using dup for the most common element
13953 followed by insertions. */
13955 /* The algorithm will fill matches[*][0] with the earliest matching element,
13956 and matches[X][1] with the count of duplicate elements (if X is the
13957 earliest element which has duplicates). */
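/* For example, with VALS == { a, b, a, a } the loop below records
   matches[2][0] == matches[3][0] == 0 and matches[0][1] == 3, so element 0
   is broadcast with DUP and only the lane holding B needs a separate
   insert.  */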
13959 if (n_var == n_elts && n_elts <= 16)
13961 int matches[16][2] = {0};
13962 for (int i = 0; i < n_elts; i++)
13964 for (int j = 0; j <= i; j++)
13966 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13968 matches[i][0] = j;
13969 matches[j][1]++;
13970 break;
13974 int maxelement = 0;
13975 int maxv = 0;
13976 for (int i = 0; i < n_elts; i++)
13977 if (matches[i][1] > maxv)
13979 maxelement = i;
13980 maxv = matches[i][1];
13983 /* Create a duplicate of the most common element, unless all elements
13984 are equally useless to us, in which case just immediately set the
13985 vector register using the first element. */
13987 if (maxv == 1)
13989 /* For vectors of two 64-bit elements, we can do even better. */
13990 if (n_elts == 2
13991 && (inner_mode == E_DImode
13992 || inner_mode == E_DFmode))
13995 rtx x0 = XVECEXP (vals, 0, 0);
13996 rtx x1 = XVECEXP (vals, 0, 1);
13997 /* Combine can pick up this case, but handling it directly
13998 here leaves clearer RTL.
14000 This is load_pair_lanes<mode>, and also gives us a clean-up
14001 for store_pair_lanes<mode>. */
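/* For example, initialising a V2DF vector from *P and *(P + 1) can use a
   single load-pair into the vector register rather than a load plus a
   lane insert.  */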
14002 if (memory_operand (x0, inner_mode)
14003 && memory_operand (x1, inner_mode)
14004 && !STRICT_ALIGNMENT
14005 && rtx_equal_p (XEXP (x1, 0),
14006 plus_constant (Pmode,
14007 XEXP (x0, 0),
14008 GET_MODE_SIZE (inner_mode))))
14010 rtx t;
14011 if (inner_mode == DFmode)
14012 t = gen_load_pair_lanesdf (target, x0, x1);
14013 else
14014 t = gen_load_pair_lanesdi (target, x0, x1);
14015 emit_insn (t);
14016 return;
14019 /* The subreg-move sequence below will move into lane zero of the
14020 vector register. For big-endian we want that position to hold
14021 the last element of VALS. */
14022 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14023 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14024 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14026 else
14028 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14029 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14032 /* Insert the rest. */
14033 for (int i = 0; i < n_elts; i++)
14035 rtx x = XVECEXP (vals, 0, i);
14036 if (matches[i][0] == maxelement)
14037 continue;
14038 x = copy_to_mode_reg (inner_mode, x);
14039 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14041 return;
14044 /* Initialise a vector which is part-variable. We want to first try
14045 to build those lanes which are constant in the most efficient way we
14046 can. */
14047 if (n_var != n_elts)
14049 rtx copy = copy_rtx (vals);
14051 /* Load constant part of vector. We really don't care what goes into the
14052 parts we will overwrite, but we're more likely to be able to load the
14053 constant efficiently if it has fewer, larger, repeating parts
14054 (see aarch64_simd_valid_immediate). */
14055 for (int i = 0; i < n_elts; i++)
14057 rtx x = XVECEXP (vals, 0, i);
14058 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14059 continue;
14060 rtx subst = any_const;
14061 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14063 /* Look in the copied vector, as more elements are const. */
14064 rtx test = XVECEXP (copy, 0, i ^ bit);
14065 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14067 subst = test;
14068 break;
14071 XVECEXP (copy, 0, i) = subst;
14073 aarch64_expand_vector_init (target, copy);
14076 /* Insert the variable lanes directly. */
14077 for (int i = 0; i < n_elts; i++)
14079 rtx x = XVECEXP (vals, 0, i);
14080 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14081 continue;
14082 x = copy_to_mode_reg (inner_mode, x);
14083 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14087 static unsigned HOST_WIDE_INT
14088 aarch64_shift_truncation_mask (machine_mode mode)
14090 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14091 return 0;
14092 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14095 /* Select a format to encode pointers in exception handling data. */
14096 int
14097 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14099 int type;
14100 switch (aarch64_cmodel)
14102 case AARCH64_CMODEL_TINY:
14103 case AARCH64_CMODEL_TINY_PIC:
14104 case AARCH64_CMODEL_SMALL:
14105 case AARCH64_CMODEL_SMALL_PIC:
14106 case AARCH64_CMODEL_SMALL_SPIC:
14107 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14108 for everything. */
14109 type = DW_EH_PE_sdata4;
14110 break;
14111 default:
14112 /* No assumptions here. 8-byte relocs required. */
14113 type = DW_EH_PE_sdata8;
14114 break;
14116 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14119 /* The last .arch and .tune assembly strings that we printed. */
14120 static std::string aarch64_last_printed_arch_string;
14121 static std::string aarch64_last_printed_tune_string;
14123 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14124 by the function fndecl. */
14126 void
14127 aarch64_declare_function_name (FILE *stream, const char* name,
14128 tree fndecl)
14130 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14132 struct cl_target_option *targ_options;
14133 if (target_parts)
14134 targ_options = TREE_TARGET_OPTION (target_parts);
14135 else
14136 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14137 gcc_assert (targ_options);
14139 const struct processor *this_arch
14140 = aarch64_get_arch (targ_options->x_explicit_arch);
14142 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14143 std::string extension
14144 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14145 this_arch->flags);
14146 /* Only update the assembler .arch string if it is distinct from the last
14147 such string we printed. */
14148 std::string to_print = this_arch->name + extension;
14149 if (to_print != aarch64_last_printed_arch_string)
14151 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14152 aarch64_last_printed_arch_string = to_print;
14155 /* Print the cpu name we're tuning for in the comments; it might be
14156 useful to readers of the generated asm. Do it only when it changes
14157 from function to function and verbose assembly is requested. */
14158 const struct processor *this_tune
14159 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14161 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14163 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14164 this_tune->name);
14165 aarch64_last_printed_tune_string = this_tune->name;
14168 /* Don't forget the type directive for ELF. */
14169 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14170 ASM_OUTPUT_LABEL (stream, name);
14173 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14175 static void
14176 aarch64_start_file (void)
14178 struct cl_target_option *default_options
14179 = TREE_TARGET_OPTION (target_option_default_node);
14181 const struct processor *default_arch
14182 = aarch64_get_arch (default_options->x_explicit_arch);
14183 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14184 std::string extension
14185 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14186 default_arch->flags);
14188 aarch64_last_printed_arch_string = default_arch->name + extension;
14189 aarch64_last_printed_tune_string = "";
14190 asm_fprintf (asm_out_file, "\t.arch %s\n",
14191 aarch64_last_printed_arch_string.c_str ());
14193 default_file_start ();
14196 /* Emit load exclusive. */
14198 static void
14199 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14200 rtx mem, rtx model_rtx)
14202 rtx (*gen) (rtx, rtx, rtx);
14204 switch (mode)
14206 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14207 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14208 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14209 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14210 default:
14211 gcc_unreachable ();
14214 emit_insn (gen (rval, mem, model_rtx));
14217 /* Emit store exclusive. */
14219 static void
14220 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14221 rtx rval, rtx mem, rtx model_rtx)
14223 rtx (*gen) (rtx, rtx, rtx, rtx);
14225 switch (mode)
14227 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14228 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14229 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14230 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14231 default:
14232 gcc_unreachable ();
14235 emit_insn (gen (bval, rval, mem, model_rtx));
14238 /* Emit jump instruction INSN and mark it as unlikely to be taken.  */
14240 static void
14241 aarch64_emit_unlikely_jump (rtx insn)
14243 rtx_insn *jump = emit_jump_insn (insn);
14244 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14247 /* Expand a compare and swap pattern. */
14249 void
14250 aarch64_expand_compare_and_swap (rtx operands[])
14252 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14253 machine_mode mode, cmp_mode;
14254 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14255 int idx;
14256 gen_cas_fn gen;
14257 const gen_cas_fn split_cas[] =
14259 gen_aarch64_compare_and_swapqi,
14260 gen_aarch64_compare_and_swaphi,
14261 gen_aarch64_compare_and_swapsi,
14262 gen_aarch64_compare_and_swapdi
14264 const gen_cas_fn atomic_cas[] =
14266 gen_aarch64_compare_and_swapqi_lse,
14267 gen_aarch64_compare_and_swaphi_lse,
14268 gen_aarch64_compare_and_swapsi_lse,
14269 gen_aarch64_compare_and_swapdi_lse
14272 bval = operands[0];
14273 rval = operands[1];
14274 mem = operands[2];
14275 oldval = operands[3];
14276 newval = operands[4];
14277 is_weak = operands[5];
14278 mod_s = operands[6];
14279 mod_f = operands[7];
14280 mode = GET_MODE (mem);
14281 cmp_mode = mode;
14283 /* Normally the succ memory model must be stronger than fail, but in the
14284 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14285 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14287 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14288 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14289 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
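  /* For instance, a call such as
       __atomic_compare_exchange_n (ptr, &expected, desired, 0,
				    __ATOMIC_RELEASE, __ATOMIC_ACQUIRE);
     is expanded as though the success ordering were __ATOMIC_ACQ_REL.  */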
14291 switch (mode)
14293 case E_QImode:
14294 case E_HImode:
14295 /* For short modes, we're going to perform the comparison in SImode,
14296 so do the zero-extension now. */
14297 cmp_mode = SImode;
14298 rval = gen_reg_rtx (SImode);
14299 oldval = convert_modes (SImode, mode, oldval, true);
14300 /* Fall through. */
14302 case E_SImode:
14303 case E_DImode:
14304 /* Force the value into a register if needed. */
14305 if (!aarch64_plus_operand (oldval, mode))
14306 oldval = force_reg (cmp_mode, oldval);
14307 break;
14309 default:
14310 gcc_unreachable ();
14313 switch (mode)
14315 case E_QImode: idx = 0; break;
14316 case E_HImode: idx = 1; break;
14317 case E_SImode: idx = 2; break;
14318 case E_DImode: idx = 3; break;
14319 default:
14320 gcc_unreachable ();
14322 if (TARGET_LSE)
14323 gen = atomic_cas[idx];
14324 else
14325 gen = split_cas[idx];
14327 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14329 if (mode == QImode || mode == HImode)
14330 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14332 x = gen_rtx_REG (CCmode, CC_REGNUM);
14333 x = gen_rtx_EQ (SImode, x, const0_rtx);
14334 emit_insn (gen_rtx_SET (bval, x));
14337 /* Test whether the target supports using an atomic load-operate instruction
14338 for operation CODE.  Returns FALSE if the operation isn't supported by the
14339 architecture.  */
14343 bool
14344 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14346 if (!TARGET_LSE)
14347 return false;
14349 switch (code)
14351 case SET:
14352 case AND:
14353 case IOR:
14354 case XOR:
14355 case MINUS:
14356 case PLUS:
14357 return true;
14358 default:
14359 return false;
14363 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14364 sequence implementing an atomic operation. */
14366 static void
14367 aarch64_emit_post_barrier (enum memmodel model)
14369 const enum memmodel base_model = memmodel_base (model);
14371 if (is_mm_sync (model)
14372 && (base_model == MEMMODEL_ACQUIRE
14373 || base_model == MEMMODEL_ACQ_REL
14374 || base_model == MEMMODEL_SEQ_CST))
14376 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
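/* For example, a __sync_val_compare_and_swap uses MEMMODEL_SYNC_SEQ_CST and
   so ends with a full memory fence emitted here; a relaxed __atomic
   operation does not.  */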
14380 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14381 for the data in memory. EXPECTED is the value expected to be in memory.
14382 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14383 is the memory ordering to use. */
14385 void
14386 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14387 rtx expected, rtx desired,
14388 rtx model)
14390 rtx (*gen) (rtx, rtx, rtx, rtx);
14391 machine_mode mode;
14393 mode = GET_MODE (mem);
14395 switch (mode)
14397 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14398 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14399 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14400 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14401 default:
14402 gcc_unreachable ();
14405 /* Move the expected value into the CAS destination register. */
14406 emit_insn (gen_rtx_SET (rval, expected));
14408 /* Emit the CAS. */
14409 emit_insn (gen (rval, mem, desired, model));
14411 /* Compare the expected value with the value loaded by the CAS, to establish
14412 whether the swap was made. */
14413 aarch64_gen_compare_reg (EQ, rval, expected);
14416 /* Split a compare and swap pattern. */
14418 void
14419 aarch64_split_compare_and_swap (rtx operands[])
14421 rtx rval, mem, oldval, newval, scratch;
14422 machine_mode mode;
14423 bool is_weak;
14424 rtx_code_label *label1, *label2;
14425 rtx x, cond;
14426 enum memmodel model;
14427 rtx model_rtx;
14429 rval = operands[0];
14430 mem = operands[1];
14431 oldval = operands[2];
14432 newval = operands[3];
14433 is_weak = (operands[4] != const0_rtx);
14434 model_rtx = operands[5];
14435 scratch = operands[7];
14436 mode = GET_MODE (mem);
14437 model = memmodel_from_int (INTVAL (model_rtx));
14439 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14440 loop:
14441 .label1:
14442 LD[A]XR rval, [mem]
14443 CBNZ rval, .label2
14444 ST[L]XR scratch, newval, [mem]
14445 CBNZ scratch, .label1
14446 .label2:
14447 CMP rval, 0. */
14448 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14450 label1 = NULL;
14451 if (!is_weak)
14453 label1 = gen_label_rtx ();
14454 emit_label (label1);
14456 label2 = gen_label_rtx ();
14458 /* The initial load can be relaxed for a __sync operation since a final
14459 barrier will be emitted to stop code hoisting. */
14460 if (is_mm_sync (model))
14461 aarch64_emit_load_exclusive (mode, rval, mem,
14462 GEN_INT (MEMMODEL_RELAXED));
14463 else
14464 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14466 if (strong_zero_p)
14468 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14469 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14470 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14471 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14473 else
14475 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14476 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14477 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14478 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14479 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14482 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14484 if (!is_weak)
14486 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14487 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14488 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14489 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14491 else
14493 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14494 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14495 emit_insn (gen_rtx_SET (cond, x));
14498 emit_label (label2);
14499 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14500 to set the condition flags. If this is not used it will be removed by
14501 later passes. */
14502 if (strong_zero_p)
14504 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14505 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14506 emit_insn (gen_rtx_SET (cond, x));
14508 /* Emit any final barrier needed for a __sync operation. */
14509 if (is_mm_sync (model))
14510 aarch64_emit_post_barrier (model);
14513 /* Emit a BIC instruction. */
14515 static void
14516 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14518 rtx shift_rtx = GEN_INT (shift);
14519 rtx (*gen) (rtx, rtx, rtx, rtx);
14521 switch (mode)
14523 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14524 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14525 default:
14526 gcc_unreachable ();
14529 emit_insn (gen (dst, s2, shift_rtx, s1));
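/* For example, aarch64_emit_bic (SImode, d, a, b, 0) emits d = a & ~b,
   i.e. a BIC with the second source operand unshifted.  */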
14532 /* Emit an atomic swap. */
14534 static void
14535 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14536 rtx mem, rtx model)
14538 rtx (*gen) (rtx, rtx, rtx, rtx);
14540 switch (mode)
14542 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14543 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14544 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14545 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14546 default:
14547 gcc_unreachable ();
14550 emit_insn (gen (dst, mem, value, model));
14553 /* Operations supported by aarch64_emit_atomic_load_op. */
14555 enum aarch64_atomic_load_op_code
14557 AARCH64_LDOP_PLUS, /* A + B */
14558 AARCH64_LDOP_XOR, /* A ^ B */
14559 AARCH64_LDOP_OR, /* A | B */
14560 AARCH64_LDOP_BIC /* A & ~B */
14563 /* Emit an atomic load-operate. */
14565 static void
14566 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14567 machine_mode mode, rtx dst, rtx src,
14568 rtx mem, rtx model)
14570 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14571 const aarch64_atomic_load_op_fn plus[] =
14573 gen_aarch64_atomic_loadaddqi,
14574 gen_aarch64_atomic_loadaddhi,
14575 gen_aarch64_atomic_loadaddsi,
14576 gen_aarch64_atomic_loadadddi
14578 const aarch64_atomic_load_op_fn eor[] =
14580 gen_aarch64_atomic_loadeorqi,
14581 gen_aarch64_atomic_loadeorhi,
14582 gen_aarch64_atomic_loadeorsi,
14583 gen_aarch64_atomic_loadeordi
14585 const aarch64_atomic_load_op_fn ior[] =
14587 gen_aarch64_atomic_loadsetqi,
14588 gen_aarch64_atomic_loadsethi,
14589 gen_aarch64_atomic_loadsetsi,
14590 gen_aarch64_atomic_loadsetdi
14592 const aarch64_atomic_load_op_fn bic[] =
14594 gen_aarch64_atomic_loadclrqi,
14595 gen_aarch64_atomic_loadclrhi,
14596 gen_aarch64_atomic_loadclrsi,
14597 gen_aarch64_atomic_loadclrdi
14599 aarch64_atomic_load_op_fn gen;
14600 int idx = 0;
14602 switch (mode)
14604 case E_QImode: idx = 0; break;
14605 case E_HImode: idx = 1; break;
14606 case E_SImode: idx = 2; break;
14607 case E_DImode: idx = 3; break;
14608 default:
14609 gcc_unreachable ();
14612 switch (code)
14614 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14615 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14616 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14617 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14618 default:
14619 gcc_unreachable ();
14622 emit_insn (gen (dst, mem, src, model));
14625 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14626 location to store the data read from memory. OUT_RESULT is the location to
14627 store the result of the operation. MEM is the memory location to read and
14628 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14629 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14630 be NULL. */
14632 void
14633 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14634 rtx mem, rtx value, rtx model_rtx)
14636 machine_mode mode = GET_MODE (mem);
14637 machine_mode wmode = (mode == DImode ? DImode : SImode);
14638 const bool short_mode = (mode < SImode);
14639 aarch64_atomic_load_op_code ldop_code;
14640 rtx src;
14641 rtx x;
14643 if (out_data)
14644 out_data = gen_lowpart (mode, out_data);
14646 if (out_result)
14647 out_result = gen_lowpart (mode, out_result);
14649 /* Make sure the value is in a register, putting it into a destination
14650 register if it needs to be manipulated. */
14651 if (!register_operand (value, mode)
14652 || code == AND || code == MINUS)
14654 src = out_result ? out_result : out_data;
14655 emit_move_insn (src, gen_lowpart (mode, value));
14657 else
14658 src = value;
14659 gcc_assert (register_operand (src, mode));
14661 /* Preprocess the data for the operation as necessary. If the operation is
14662 a SET then emit a swap instruction and finish. */
14663 switch (code)
14665 case SET:
14666 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14667 return;
14669 case MINUS:
14670 /* Negate the value and treat it as a PLUS. */
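      /* There is no LDSUB instruction, so e.g. __atomic_fetch_sub is
	 handled here by negating the value and issuing an LDADD.  */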
14672 rtx neg_src;
14674 /* Resize the value if necessary. */
14675 if (short_mode)
14676 src = gen_lowpart (wmode, src);
14678 neg_src = gen_rtx_NEG (wmode, src);
14679 emit_insn (gen_rtx_SET (src, neg_src));
14681 if (short_mode)
14682 src = gen_lowpart (mode, src);
14684 /* Fall-through. */
14685 case PLUS:
14686 ldop_code = AARCH64_LDOP_PLUS;
14687 break;
14689 case IOR:
14690 ldop_code = AARCH64_LDOP_OR;
14691 break;
14693 case XOR:
14694 ldop_code = AARCH64_LDOP_XOR;
14695 break;
14697 case AND:
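      /* Similarly there is no LDAND; __atomic_fetch_and is handled by
	 inverting the value and using LDCLR, which has BIC (AND NOT)
	 semantics.  */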
14699 rtx not_src;
14701 /* Resize the value if necessary. */
14702 if (short_mode)
14703 src = gen_lowpart (wmode, src);
14705 not_src = gen_rtx_NOT (wmode, src);
14706 emit_insn (gen_rtx_SET (src, not_src));
14708 if (short_mode)
14709 src = gen_lowpart (mode, src);
14711 ldop_code = AARCH64_LDOP_BIC;
14712 break;
14714 default:
14715 /* The operation can't be done with atomic instructions. */
14716 gcc_unreachable ();
14719 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14721 /* If necessary, calculate the data in memory after the update by redoing the
14722 operation from values in registers. */
14723 if (!out_result)
14724 return;
14726 if (short_mode)
14728 src = gen_lowpart (wmode, src);
14729 out_data = gen_lowpart (wmode, out_data);
14730 out_result = gen_lowpart (wmode, out_result);
14733 x = NULL_RTX;
14735 switch (code)
14737 case MINUS:
14738 case PLUS:
14739 x = gen_rtx_PLUS (wmode, out_data, src);
14740 break;
14741 case IOR:
14742 x = gen_rtx_IOR (wmode, out_data, src);
14743 break;
14744 case XOR:
14745 x = gen_rtx_XOR (wmode, out_data, src);
14746 break;
14747 case AND:
14748 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14749 return;
14750 default:
14751 gcc_unreachable ();
14754 emit_set_insn (out_result, x);
14756 return;
14759 /* Split an atomic operation. */
14761 void
14762 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14763 rtx value, rtx model_rtx, rtx cond)
14765 machine_mode mode = GET_MODE (mem);
14766 machine_mode wmode = (mode == DImode ? DImode : SImode);
14767 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14768 const bool is_sync = is_mm_sync (model);
14769 rtx_code_label *label;
14770 rtx x;
14772 /* Split the atomic operation into a sequence. */
14773 label = gen_label_rtx ();
14774 emit_label (label);
14776 if (new_out)
14777 new_out = gen_lowpart (wmode, new_out);
14778 if (old_out)
14779 old_out = gen_lowpart (wmode, old_out);
14780 else
14781 old_out = new_out;
14782 value = simplify_gen_subreg (wmode, value, mode, 0);
14784 /* The initial load can be relaxed for a __sync operation since a final
14785 barrier will be emitted to stop code hoisting. */
14786 if (is_sync)
14787 aarch64_emit_load_exclusive (mode, old_out, mem,
14788 GEN_INT (MEMMODEL_RELAXED));
14789 else
14790 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14792 switch (code)
14794 case SET:
14795 new_out = value;
14796 break;
14798 case NOT:
14799 x = gen_rtx_AND (wmode, old_out, value);
14800 emit_insn (gen_rtx_SET (new_out, x));
14801 x = gen_rtx_NOT (wmode, new_out);
14802 emit_insn (gen_rtx_SET (new_out, x));
14803 break;
14805 case MINUS:
14806 if (CONST_INT_P (value))
14808 value = GEN_INT (-INTVAL (value));
14809 code = PLUS;
14811 /* Fall through. */
14813 default:
14814 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14815 emit_insn (gen_rtx_SET (new_out, x));
14816 break;
14819 aarch64_emit_store_exclusive (mode, cond, mem,
14820 gen_lowpart (mode, new_out), model_rtx);
14822 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14823 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14824 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14825 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14827 /* Emit any final barrier needed for a __sync operation. */
14828 if (is_sync)
14829 aarch64_emit_post_barrier (model);
14832 static void
14833 aarch64_init_libfuncs (void)
14835 /* Half-precision float operations. The compiler handles all operations
14836 with NULL libfuncs by converting to SFmode. */
14838 /* Conversions. */
14839 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14840 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14842 /* Arithmetic. */
14843 set_optab_libfunc (add_optab, HFmode, NULL);
14844 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14845 set_optab_libfunc (smul_optab, HFmode, NULL);
14846 set_optab_libfunc (neg_optab, HFmode, NULL);
14847 set_optab_libfunc (sub_optab, HFmode, NULL);
14849 /* Comparisons. */
14850 set_optab_libfunc (eq_optab, HFmode, NULL);
14851 set_optab_libfunc (ne_optab, HFmode, NULL);
14852 set_optab_libfunc (lt_optab, HFmode, NULL);
14853 set_optab_libfunc (le_optab, HFmode, NULL);
14854 set_optab_libfunc (ge_optab, HFmode, NULL);
14855 set_optab_libfunc (gt_optab, HFmode, NULL);
14856 set_optab_libfunc (unord_optab, HFmode, NULL);
14859 /* Target hook for c_mode_for_suffix. */
14860 static machine_mode
14861 aarch64_c_mode_for_suffix (char suffix)
14863 if (suffix == 'q')
14864 return TFmode;
14866 return VOIDmode;
14869 /* We can only represent floating point constants which will fit in
14870 "quarter-precision" values.  These values are characterised by
14871 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14874 (-1)^s * (n/16) * 2^r
14876 Where:
14877 's' is the sign bit.
14878 'n' is an integer in the range 16 <= n <= 31.
14879 'r' is an integer in the range -3 <= r <= 4. */
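/* Worked examples of this encoding (illustrative only): 1.0 is n == 16,
   r == 0; the smallest positive value is 0.125 (n == 16, r == -3); and
   the largest is 31.0 (n == 31, r == 4).  */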
14881 /* Return true iff X can be represented by a quarter-precision
14882 floating point immediate operand.  Note, we cannot represent 0.0.  */
14883 bool
14884 aarch64_float_const_representable_p (rtx x)
14886 /* This represents our current view of how many bits
14887 make up the mantissa. */
14888 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14889 int exponent;
14890 unsigned HOST_WIDE_INT mantissa, mask;
14891 REAL_VALUE_TYPE r, m;
14892 bool fail;
14894 if (!CONST_DOUBLE_P (x))
14895 return false;
14897 /* We don't support HFmode constants yet. */
14898 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14899 return false;
14901 r = *CONST_DOUBLE_REAL_VALUE (x);
14903 /* We cannot represent infinities, NaNs or +/-zero. We won't
14904 know if we have +zero until we analyse the mantissa, but we
14905 can reject the other invalid values. */
14906 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14907 || REAL_VALUE_MINUS_ZERO (r))
14908 return false;
14910 /* Extract exponent. */
14911 r = real_value_abs (&r);
14912 exponent = REAL_EXP (&r);
14914 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14915 highest (sign) bit, with a fixed binary point at bit point_pos.
14916 The low element of W below holds the low part of the mantissa, the high element the high part.
14917 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14918 bits for the mantissa, this can fail (low bits will be lost). */
14919 real_ldexp (&m, &r, point_pos - exponent);
14920 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14922 /* If the low part of the mantissa has bits set we cannot represent
14923 the value. */
14924 if (w.ulow () != 0)
14925 return false;
14926 /* We have rejected the lower HOST_WIDE_INT, so update our
14927 understanding of how many bits lie in the mantissa and
14928 look only at the high HOST_WIDE_INT. */
14929 mantissa = w.elt (1);
14930 point_pos -= HOST_BITS_PER_WIDE_INT;
14932 /* We can only represent values with a mantissa of the form 1.xxxx. */
14933 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14934 if ((mantissa & mask) != 0)
14935 return false;
14937 /* Having filtered unrepresentable values, we may now remove all
14938 but the highest 5 bits. */
14939 mantissa >>= point_pos - 5;
14941 /* We cannot represent the value 0.0, so reject it. This is handled
14942 elsewhere. */
14943 if (mantissa == 0)
14944 return false;
14946 /* Then, as bit 4 is always set, we can mask it off, leaving
14947 the mantissa in the range [0, 15]. */
14948 mantissa &= ~(1 << 4);
14949 gcc_assert (mantissa <= 15);
14951 /* GCC internally does not use IEEE754-like encoding (where normalized
14952 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14953 Our mantissa values are shifted 4 places to the left relative to
14954 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14955 by 5 places to correct for GCC's representation. */
14956 exponent = 5 - exponent;
14958 return (exponent >= 0 && exponent <= 7);
14961 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14962 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14963 output MOVI/MVNI, ORR or BIC immediate. */
14964 char*
14965 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14966 enum simd_immediate_check which)
14968 bool is_valid;
14969 static char templ[40];
14970 const char *mnemonic;
14971 const char *shift_op;
14972 unsigned int lane_count = 0;
14973 char element_char;
14975 struct simd_immediate_info info;
14977 /* This will return true to show const_vector is legal for use as either
14978 a AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14979 It will also update INFO to show how the immediate should be generated.
14980 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14981 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14982 gcc_assert (is_valid);
14984 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14985 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14987 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14989 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14990 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14991 move immediate path. */
14992 if (aarch64_float_const_zero_rtx_p (info.value))
14993 info.value = GEN_INT (0);
14994 else
14996 const unsigned int buf_size = 20;
14997 char float_buf[buf_size] = {'\0'};
14998 real_to_decimal_for_mode (float_buf,
14999 CONST_DOUBLE_REAL_VALUE (info.value),
15000 buf_size, buf_size, 1, info.elt_mode);
15002 if (lane_count == 1)
15003 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15004 else
15005 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15006 lane_count, element_char, float_buf);
15007 return templ;
15011 gcc_assert (CONST_INT_P (info.value));
15013 if (which == AARCH64_CHECK_MOV)
15015 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15016 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15017 if (lane_count == 1)
15018 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15019 mnemonic, UINTVAL (info.value));
15020 else if (info.shift)
15021 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15022 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15023 element_char, UINTVAL (info.value), shift_op, info.shift);
15024 else
15025 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15026 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15027 element_char, UINTVAL (info.value));
15029 else
15031 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15032 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15033 if (info.shift)
15034 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15035 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15036 element_char, UINTVAL (info.value), "lsl", info.shift);
15037 else
15038 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15039 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15040 element_char, UINTVAL (info.value));
15042 return templ;
15045 char*
15046 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15049 /* If a floating point number was passed and we desire to use it in an
15050 integer mode, do the conversion to integer.  */
15051 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15053 unsigned HOST_WIDE_INT ival;
15054 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15055 gcc_unreachable ();
15056 immediate = gen_int_mode (ival, mode);
15059 machine_mode vmode;
15060 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
15061 a 128-bit vector mode.  */
15062 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15064 vmode = aarch64_simd_container_mode (mode, width);
15065 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15066 return aarch64_output_simd_mov_immediate (v_op, width);
15069 /* Return the output string to use for moving immediate CONST_VECTOR
15070 into an SVE register. */
15072 char *
15073 aarch64_output_sve_mov_immediate (rtx const_vector)
15075 static char templ[40];
15076 struct simd_immediate_info info;
15077 char element_char;
15079 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15080 gcc_assert (is_valid);
15082 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15084 if (info.step)
15086 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15087 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15088 element_char, INTVAL (info.value), INTVAL (info.step));
15089 return templ;
15092 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15094 if (aarch64_float_const_zero_rtx_p (info.value))
15095 info.value = GEN_INT (0);
15096 else
15098 const int buf_size = 20;
15099 char float_buf[buf_size] = {};
15100 real_to_decimal_for_mode (float_buf,
15101 CONST_DOUBLE_REAL_VALUE (info.value),
15102 buf_size, buf_size, 1, info.elt_mode);
15104 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15105 element_char, float_buf);
15106 return templ;
15110 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15111 element_char, INTVAL (info.value));
15112 return templ;
15115 /* Return the asm format for a PTRUE instruction whose destination has
15116 mode MODE. SUFFIX is the element size suffix. */
15118 char *
15119 aarch64_output_ptrue (machine_mode mode, char suffix)
15121 unsigned int nunits;
15122 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15123 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15124 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15125 else
15126 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15127 return buf;
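/* For example, with -msve-vector-bits=256 and a .s suffix this produces
   "ptrue\t%0.s, vl8", while for a variable-length vector it falls back to
   "ptrue\t%0.s, all".  */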
15130 /* Split operands into moves from op[1] + op[2] into op[0]. */
15132 void
15133 aarch64_split_combinev16qi (rtx operands[3])
15135 unsigned int dest = REGNO (operands[0]);
15136 unsigned int src1 = REGNO (operands[1]);
15137 unsigned int src2 = REGNO (operands[2]);
15138 machine_mode halfmode = GET_MODE (operands[1]);
15139 unsigned int halfregs = REG_NREGS (operands[1]);
15140 rtx destlo, desthi;
15142 gcc_assert (halfmode == V16QImode);
15144 if (src1 == dest && src2 == dest + halfregs)
15146 /* No-op move. Can't split to nothing; emit something. */
15147 emit_note (NOTE_INSN_DELETED);
15148 return;
15151 /* Preserve register attributes for variable tracking. */
15152 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15153 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15154 GET_MODE_SIZE (halfmode));
15156 /* Special case of reversed high/low parts. */
15157 if (reg_overlap_mentioned_p (operands[2], destlo)
15158 && reg_overlap_mentioned_p (operands[1], desthi))
15160 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15161 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15162 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15164 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15166 /* Try to avoid unnecessary moves if part of the result
15167 is in the right place already. */
15168 if (src1 != dest)
15169 emit_move_insn (destlo, operands[1]);
15170 if (src2 != dest + halfregs)
15171 emit_move_insn (desthi, operands[2]);
15173 else
15175 if (src2 != dest + halfregs)
15176 emit_move_insn (desthi, operands[2]);
15177 if (src1 != dest)
15178 emit_move_insn (destlo, operands[1]);
15182 /* vec_perm support. */
15184 struct expand_vec_perm_d
15186 rtx target, op0, op1;
15187 vec_perm_indices perm;
15188 machine_mode vmode;
15189 unsigned int vec_flags;
15190 bool one_vector_p;
15191 bool testing_p;
15194 /* Generate a variable permutation. */
15196 static void
15197 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15199 machine_mode vmode = GET_MODE (target);
15200 bool one_vector_p = rtx_equal_p (op0, op1);
15202 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15203 gcc_checking_assert (GET_MODE (op0) == vmode);
15204 gcc_checking_assert (GET_MODE (op1) == vmode);
15205 gcc_checking_assert (GET_MODE (sel) == vmode);
15206 gcc_checking_assert (TARGET_SIMD);
15208 if (one_vector_p)
15210 if (vmode == V8QImode)
15212 /* Expand the argument to a V16QI mode by duplicating it. */
15213 rtx pair = gen_reg_rtx (V16QImode);
15214 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15215 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15217 else
15219 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15222 else
15224 rtx pair;
15226 if (vmode == V8QImode)
15228 pair = gen_reg_rtx (V16QImode);
15229 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15230 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15232 else
15234 pair = gen_reg_rtx (OImode);
15235 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15236 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15241 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15242 NELT is the number of elements in the vector. */
15244 void
15245 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15246 unsigned int nelt)
15248 machine_mode vmode = GET_MODE (target);
15249 bool one_vector_p = rtx_equal_p (op0, op1);
15250 rtx mask;
15252 /* The TBL instruction does not use a modulo index, so we must take care
15253 of that ourselves. */
15254 mask = aarch64_simd_gen_const_vector_dup (vmode,
15255 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15256 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
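  /* For instance, in a one-vector V16QI permute each selector byte is
     ANDed with 15 here, so an out-of-range index such as 23 selects
     lane 7, which is the modulo behaviour vec_perm expects.  */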
15258 /* For big-endian, we also need to reverse the index within the vector
15259 (but not which vector). */
15260 if (BYTES_BIG_ENDIAN)
15262 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15263 if (!one_vector_p)
15264 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15265 sel = expand_simple_binop (vmode, XOR, sel, mask,
15266 NULL, 0, OPTAB_LIB_WIDEN);
15268 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15271 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15273 static void
15274 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15276 emit_insn (gen_rtx_SET (target,
15277 gen_rtx_UNSPEC (GET_MODE (target),
15278 gen_rtvec (2, op0, op1), code)));
15281 /* Expand an SVE vec_perm with the given operands. */
15283 void
15284 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15286 machine_mode data_mode = GET_MODE (target);
15287 machine_mode sel_mode = GET_MODE (sel);
15288 /* Enforced by the pattern condition. */
15289 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15291 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15292 size of the two value vectors, i.e. the upper bits of the indices
15293 are effectively ignored. SVE TBL instead produces 0 for any
15294 out-of-range indices, so we need to modulo all the vec_perm indices
15295 to ensure they are all in range. */
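/* As a worked example, assume a fixed-length permute with four elements
   per vector and a selector index of 6 (element 2 of the second vector):
   the first TBL sees index 6, which is out of range and yields 0, while
   the second TBL sees 6 - 4 == 2 and picks the right element, so the
   final IOR produces the correct result.  */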
15296 rtx sel_reg = force_reg (sel_mode, sel);
15298 /* Check if the sel only references the first values vector. */
15299 if (GET_CODE (sel) == CONST_VECTOR
15300 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15302 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15303 return;
15306 /* Check if the two values vectors are the same. */
15307 if (rtx_equal_p (op0, op1))
15309 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15310 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15311 NULL, 0, OPTAB_DIRECT);
15312 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15313 return;
15316 /* Run TBL for each value vector and combine the results.  */
15318 rtx res0 = gen_reg_rtx (data_mode);
15319 rtx res1 = gen_reg_rtx (data_mode);
15320 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15321 if (GET_CODE (sel) != CONST_VECTOR
15322 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15324 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15325 2 * nunits - 1);
15326 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15327 NULL, 0, OPTAB_DIRECT);
15329 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15330 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15331 NULL, 0, OPTAB_DIRECT);
15332 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15333 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15334 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15335 else
15336 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15339 /* Recognize patterns suitable for the TRN instructions. */
15340 static bool
15341 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15343 HOST_WIDE_INT odd;
15344 poly_uint64 nelt = d->perm.length ();
15345 rtx out, in0, in1, x;
15346 machine_mode vmode = d->vmode;
15348 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15349 return false;
15351 /* Note that these are little-endian tests.
15352 We correct for big-endian later. */
15353 if (!d->perm[0].is_constant (&odd)
15354 || (odd != 0 && odd != 1)
15355 || !d->perm.series_p (0, 2, odd, 2)
15356 || !d->perm.series_p (1, 2, nelt + odd, 2))
15357 return false;
15359 /* Success! */
15360 if (d->testing_p)
15361 return true;
15363 in0 = d->op0;
15364 in1 = d->op1;
15365 /* We don't need a big-endian lane correction for SVE; see the comment
15366 at the head of aarch64-sve.md for details. */
15367 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15369 x = in0, in0 = in1, in1 = x;
15370 odd = !odd;
15372 out = d->target;
15374 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15375 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15376 return true;
15379 /* Recognize patterns suitable for the UZP instructions. */
15380 static bool
15381 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15383 HOST_WIDE_INT odd;
15384 rtx out, in0, in1, x;
15385 machine_mode vmode = d->vmode;
15387 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15388 return false;
15390 /* Note that these are little-endian tests.
15391 We correct for big-endian later. */
15392 if (!d->perm[0].is_constant (&odd)
15393 || (odd != 0 && odd != 1)
15394 || !d->perm.series_p (0, 1, odd, 2))
15395 return false;
15397 /* Success! */
15398 if (d->testing_p)
15399 return true;
15401 in0 = d->op0;
15402 in1 = d->op1;
15403 /* We don't need a big-endian lane correction for SVE; see the comment
15404 at the head of aarch64-sve.md for details. */
15405 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15407 x = in0, in0 = in1, in1 = x;
15408 odd = !odd;
15410 out = d->target;
15412 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15413 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15414 return true;
15417 /* Recognize patterns suitable for the ZIP instructions. */
15418 static bool
15419 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15421 unsigned int high;
15422 poly_uint64 nelt = d->perm.length ();
15423 rtx out, in0, in1, x;
15424 machine_mode vmode = d->vmode;
15426 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15427 return false;
15429 /* Note that these are little-endian tests.
15430 We correct for big-endian later. */
15431 poly_uint64 first = d->perm[0];
15432 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15433 || !d->perm.series_p (0, 2, first, 1)
15434 || !d->perm.series_p (1, 2, first + nelt, 1))
15435 return false;
15436 high = maybe_ne (first, 0U);
15438 /* Success! */
15439 if (d->testing_p)
15440 return true;
15442 in0 = d->op0;
15443 in1 = d->op1;
15444 /* We don't need a big-endian lane correction for SVE; see the comment
15445 at the head of aarch64-sve.md for details. */
15446 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15448 x = in0, in0 = in1, in1 = x;
15449 high = !high;
15451 out = d->target;
15453 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15454 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15455 return true;
15458 /* Recognize patterns for the EXT insn. */
15460 static bool
15461 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15463 HOST_WIDE_INT location;
15464 rtx offset;
15466 /* The first element always refers to the first vector.
15467 Check if the extracted indices are increasing by one. */
15468 if (d->vec_flags == VEC_SVE_PRED
15469 || !d->perm[0].is_constant (&location)
15470 || !d->perm.series_p (0, 1, location, 1))
15471 return false;
15473 /* Success! */
15474 if (d->testing_p)
15475 return true;
15477 /* The case where (location == 0) is a no-op for both big- and little-endian,
15478 and is removed by the mid-end at optimization levels -O1 and higher.
15480 We don't need a big-endian lane correction for SVE; see the comment
15481 at the head of aarch64-sve.md for details. */
15482 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15484 /* After setup, we want the high elements of the first vector (stored
15485 at the LSB end of the register), and the low elements of the second
15486 vector (stored at the MSB end of the register). So swap. */
15487 std::swap (d->op0, d->op1);
15488 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15489 to_constant () is safe since this is restricted to Advanced SIMD
15490 vectors. */
15491 location = d->perm.length ().to_constant () - location;
15494 offset = GEN_INT (location);
15495 emit_set_insn (d->target,
15496 gen_rtx_UNSPEC (d->vmode,
15497 gen_rtvec (3, d->op0, d->op1, offset),
15498 UNSPEC_EXT));
15499 return true;
15502 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15503 within each 64-bit, 32-bit or 16-bit granule. */
15505 static bool
15506 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15508 HOST_WIDE_INT diff;
15509 unsigned int i, size, unspec;
15510 machine_mode pred_mode;
15512 if (d->vec_flags == VEC_SVE_PRED
15513 || !d->one_vector_p
15514 || !d->perm[0].is_constant (&diff))
15515 return false;
15517 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15518 if (size == 8)
15520 unspec = UNSPEC_REV64;
15521 pred_mode = VNx2BImode;
15523 else if (size == 4)
15525 unspec = UNSPEC_REV32;
15526 pred_mode = VNx4BImode;
15528 else if (size == 2)
15530 unspec = UNSPEC_REV16;
15531 pred_mode = VNx8BImode;
15533 else
15534 return false;
15536 unsigned int step = diff + 1;
15537 for (i = 0; i < step; ++i)
15538 if (!d->perm.series_p (i, step, diff - i, step))
15539 return false;
15541 /* Success! */
15542 if (d->testing_p)
15543 return true;
15545 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15546 if (d->vec_flags == VEC_SVE_DATA)
15548 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15549 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15550 UNSPEC_MERGE_PTRUE);
15552 emit_set_insn (d->target, src);
15553 return true;
15556 /* Recognize patterns for the REV insn, which reverses elements within
15557 a full vector. */
15559 static bool
15560 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15562 poly_uint64 nelt = d->perm.length ();
15564 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15565 return false;
15567 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15568 return false;
15570 /* Success! */
15571 if (d->testing_p)
15572 return true;
15574 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15575 emit_set_insn (d->target, src);
15576 return true;
15579 static bool
15580 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15582 rtx out = d->target;
15583 rtx in0;
15584 HOST_WIDE_INT elt;
15585 machine_mode vmode = d->vmode;
15586 rtx lane;
15588 if (d->vec_flags == VEC_SVE_PRED
15589 || d->perm.encoding ().encoded_nelts () != 1
15590 || !d->perm[0].is_constant (&elt))
15591 return false;
15593 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15594 return false;
15596 /* Success! */
15597 if (d->testing_p)
15598 return true;
15600 /* The generic preparation in aarch64_expand_vec_perm_const_1
15601 swaps the operand order and the permute indices if it finds
15602 d->perm[0] to be in the second operand. Thus, we can always
15603 use d->op0 and need not do any extra arithmetic to get the
15604 correct lane number. */
15605 in0 = d->op0;
15606 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15608 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15609 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15610 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15611 return true;
15614 static bool
15615 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15617 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15618 machine_mode vmode = d->vmode;
15620 /* Make sure that the indices are constant. */
15621 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15622 for (unsigned int i = 0; i < encoded_nelts; ++i)
15623 if (!d->perm[i].is_constant ())
15624 return false;
15626 if (d->testing_p)
15627 return true;
15629   /* Generic code will try constant permutation twice: once with the
15630      original mode and again with the elements lowered to QImode.
15631      So wait and don't do the selector expansion ourselves.  */
15632 if (vmode != V8QImode && vmode != V16QImode)
15633 return false;
15635 /* to_constant is safe since this routine is specific to Advanced SIMD
15636 vectors. */
15637 unsigned int nelt = d->perm.length ().to_constant ();
15638 for (unsigned int i = 0; i < nelt; ++i)
15639 /* If big-endian and two vectors we end up with a weird mixed-endian
15640 mode on NEON. Reverse the index within each word but not the word
15641 itself. to_constant is safe because we checked is_constant above. */
15642 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15643 ? d->perm[i].to_constant () ^ (nelt - 1)
15644 : d->perm[i].to_constant ());
15646 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15647 sel = force_reg (vmode, sel);
15649 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15650 return true;
15653 /* Try to implement D using an SVE TBL instruction. */
15655 static bool
15656 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15658 unsigned HOST_WIDE_INT nelt;
15660 /* Permuting two variable-length vectors could overflow the
15661 index range. */
15662 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15663 return false;
15665 if (d->testing_p)
15666 return true;
15668 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15669 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15670 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15671 return true;
15674 static bool
15675 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15677 /* The pattern matching functions above are written to look for a small
15678 number to begin the sequence (0, 1, N/2). If we begin with an index
15679 from the second operand, we can swap the operands. */
15680 poly_int64 nelt = d->perm.length ();
15681 if (known_ge (d->perm[0], nelt))
15683 d->perm.rotate_inputs (1);
15684 std::swap (d->op0, d->op1);
15687 if ((d->vec_flags == VEC_ADVSIMD
15688 || d->vec_flags == VEC_SVE_DATA
15689 || d->vec_flags == VEC_SVE_PRED)
15690 && known_gt (nelt, 1))
15692 if (aarch64_evpc_rev_local (d))
15693 return true;
15694 else if (aarch64_evpc_rev_global (d))
15695 return true;
15696 else if (aarch64_evpc_ext (d))
15697 return true;
15698 else if (aarch64_evpc_dup (d))
15699 return true;
15700 else if (aarch64_evpc_zip (d))
15701 return true;
15702 else if (aarch64_evpc_uzp (d))
15703 return true;
15704 else if (aarch64_evpc_trn (d))
15705 return true;
15706 if (d->vec_flags == VEC_SVE_DATA)
15707 return aarch64_evpc_sve_tbl (d);
15708   else if (d->vec_flags == VEC_ADVSIMD)
15709 return aarch64_evpc_tbl (d);
15711 return false;
15714 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15716 static bool
15717 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15718 rtx op1, const vec_perm_indices &sel)
15720 struct expand_vec_perm_d d;
15722 /* Check whether the mask can be applied to a single vector. */
15723 if (op0 && rtx_equal_p (op0, op1))
15724 d.one_vector_p = true;
15725 else if (sel.all_from_input_p (0))
15727 d.one_vector_p = true;
15728 op1 = op0;
15730 else if (sel.all_from_input_p (1))
15732 d.one_vector_p = true;
15733 op0 = op1;
15735 else
15736 d.one_vector_p = false;
15738 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15739 sel.nelts_per_input ());
15740 d.vmode = vmode;
15741 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15742 d.target = target;
15743 d.op0 = op0;
15744 d.op1 = op1;
15745 d.testing_p = !target;
15747 if (!d.testing_p)
15748 return aarch64_expand_vec_perm_const_1 (&d);
15750 rtx_insn *last = get_last_insn ();
15751 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15752 gcc_assert (last == get_last_insn ());
15754 return ret;
15757 /* Generate a byte permute mask for a register of mode MODE,
15758 which has NUNITS units. */
15761 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15763   /* We have to reverse each vector because we don't have
15764 a permuted load that can reverse-load according to ABI rules. */
15765 rtx mask;
15766 rtvec v = rtvec_alloc (16);
15767 unsigned int i, j;
15768 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15770 gcc_assert (BYTES_BIG_ENDIAN);
15771 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15773 for (i = 0; i < nunits; i++)
15774 for (j = 0; j < usize; j++)
15775 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15776 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15777 return force_reg (V16QImode, mask);
15780 /* Return true if X is a valid second operand for the SVE instruction
15781 that implements integer comparison OP_CODE. */
15783 static bool
15784 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15786 if (register_operand (x, VOIDmode))
15787 return true;
15789 switch (op_code)
15791 case LTU:
15792 case LEU:
15793 case GEU:
15794 case GTU:
15795 return aarch64_sve_cmp_immediate_p (x, false);
15796 case LT:
15797 case LE:
15798 case GE:
15799 case GT:
15800 case NE:
15801 case EQ:
15802 return aarch64_sve_cmp_immediate_p (x, true);
15803 default:
15804 gcc_unreachable ();
15808 /* Use predicated SVE instructions to implement the equivalent of:
15810 (set TARGET OP)
15812 given that PTRUE is an all-true predicate of the appropriate mode. */
15814 static void
15815 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15817 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15818 gen_rtvec (2, ptrue, op),
15819 UNSPEC_MERGE_PTRUE);
15820 rtx_insn *insn = emit_set_insn (target, unspec);
15821 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15824 /* Likewise, but also clobber the condition codes. */
15826 static void
15827 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15829 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15830 gen_rtvec (2, ptrue, op),
15831 UNSPEC_MERGE_PTRUE);
15832 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15833 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15836 /* Return the UNSPEC_COND_* code for comparison CODE. */
15838 static unsigned int
15839 aarch64_unspec_cond_code (rtx_code code)
15841 switch (code)
15843 case NE:
15844 return UNSPEC_COND_NE;
15845 case EQ:
15846 return UNSPEC_COND_EQ;
15847 case LT:
15848 return UNSPEC_COND_LT;
15849 case GT:
15850 return UNSPEC_COND_GT;
15851 case LE:
15852 return UNSPEC_COND_LE;
15853 case GE:
15854 return UNSPEC_COND_GE;
15855 default:
15856 gcc_unreachable ();
15860 /* Emit:
15862 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15864 where <X> is the operation associated with comparison CODE. This form
15865 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15866 semantics, such as when PRED might not be all-true and when comparing
15867 inactive lanes could have side effects. */
15869 static void
15870 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15871 rtx pred, rtx op0, rtx op1)
15873 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15874 gen_rtvec (3, pred, op0, op1),
15875 aarch64_unspec_cond_code (code));
15876 emit_set_insn (target, unspec);
15879 /* Expand an SVE integer comparison using the SVE equivalent of:
15881 (set TARGET (CODE OP0 OP1)). */
15883 void
15884 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15886 machine_mode pred_mode = GET_MODE (target);
15887 machine_mode data_mode = GET_MODE (op0);
15889 if (!aarch64_sve_cmp_operand_p (code, op1))
15890 op1 = force_reg (data_mode, op1);
15892 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15893 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15894 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15897 /* Emit the SVE equivalent of:
15899 (set TMP1 (CODE1 OP0 OP1))
15900 (set TMP2 (CODE2 OP0 OP1))
15901 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15903 PTRUE is an all-true predicate with the same mode as TARGET. */
15905 static void
15906 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15907 rtx ptrue, rtx op0, rtx op1)
15909 machine_mode pred_mode = GET_MODE (ptrue);
15910 rtx tmp1 = gen_reg_rtx (pred_mode);
15911 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15912 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15913 rtx tmp2 = gen_reg_rtx (pred_mode);
15914 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15915 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15916 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15919 /* Emit the SVE equivalent of:
15921 (set TMP (CODE OP0 OP1))
15922 (set TARGET (not TMP))
15924 PTRUE is an all-true predicate with the same mode as TARGET. */
15926 static void
15927 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15928 rtx op0, rtx op1)
15930 machine_mode pred_mode = GET_MODE (ptrue);
15931 rtx tmp = gen_reg_rtx (pred_mode);
15932 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15933 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15934 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15937 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15939 (set TARGET (CODE OP0 OP1))
15941 If CAN_INVERT_P is true, the caller can also handle inverted results;
15942 return true if the result is in fact inverted. */
15944 bool
15945 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15946 rtx op0, rtx op1, bool can_invert_p)
15948 machine_mode pred_mode = GET_MODE (target);
15949 machine_mode data_mode = GET_MODE (op0);
15951 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15952 switch (code)
15954 case UNORDERED:
15955 /* UNORDERED has no immediate form. */
15956 op1 = force_reg (data_mode, op1);
15957 /* fall through */
15958 case LT:
15959 case LE:
15960 case GT:
15961 case GE:
15962 case EQ:
15963 case NE:
15965 /* There is native support for the comparison. */
15966 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15967 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15968 return false;
15971 case LTGT:
15972 /* This is a trapping operation (LT or GT). */
15973 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15974 return false;
15976 case UNEQ:
15977 if (!flag_trapping_math)
15979 /* This would trap for signaling NaNs. */
15980 op1 = force_reg (data_mode, op1);
15981 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15982 return false;
15984 /* fall through */
15985 case UNLT:
15986 case UNLE:
15987 case UNGT:
15988 case UNGE:
15989 if (flag_trapping_math)
15991 /* Work out which elements are ordered. */
15992 rtx ordered = gen_reg_rtx (pred_mode);
15993 op1 = force_reg (data_mode, op1);
15994 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15996 /* Test the opposite condition for the ordered elements,
15997 then invert the result. */
15998 if (code == UNEQ)
15999 code = NE;
16000 else
16001 code = reverse_condition_maybe_unordered (code);
16002 if (can_invert_p)
16004 aarch64_emit_sve_predicated_cond (target, code,
16005 ordered, op0, op1);
16006 return true;
16008 rtx tmp = gen_reg_rtx (pred_mode);
16009 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16010 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16011 return false;
16013 break;
16015 case ORDERED:
16016 /* ORDERED has no immediate form. */
16017 op1 = force_reg (data_mode, op1);
16018 break;
16020 default:
16021 gcc_unreachable ();
16024 /* There is native support for the inverse comparison. */
16025 code = reverse_condition_maybe_unordered (code);
16026 if (can_invert_p)
16028 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16029 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16030 return true;
16032 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16033 return false;
16036 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16037 of the data being selected and CMP_MODE is the mode of the values being
16038 compared. */
16040 void
16041 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16042 rtx *ops)
16044 machine_mode pred_mode
16045 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16046 GET_MODE_SIZE (cmp_mode)).require ();
16047 rtx pred = gen_reg_rtx (pred_mode);
16048 if (FLOAT_MODE_P (cmp_mode))
16050 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16051 ops[4], ops[5], true))
16052 std::swap (ops[1], ops[2]);
16054 else
16055 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16057 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16058 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16061 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16062    true.  However, due to issues with register allocation it is preferable
16063    to avoid tying integer scalar and FP scalar modes.  Executing integer
16064 operations in general registers is better than treating them as scalar
16065 vector operations. This reduces latency and avoids redundant int<->FP
16066 moves. So tie modes if they are either the same class, or vector modes
16067 with other vector modes, vector structs or any scalar mode. */
16069 static bool
16070 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16072 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16073 return true;
16075 /* We specifically want to allow elements of "structure" modes to
16076 be tieable to the structure. This more general condition allows
16077 other rarer situations too. The reason we don't extend this to
16078 predicate modes is that there are no predicate structure modes
16079 nor any specific instructions for extracting part of a predicate
16080 register. */
16081 if (aarch64_vector_data_mode_p (mode1)
16082 && aarch64_vector_data_mode_p (mode2))
16083 return true;
16085 /* Also allow any scalar modes with vectors. */
16086 if (aarch64_vector_mode_supported_p (mode1)
16087 || aarch64_vector_mode_supported_p (mode2))
16088 return true;
16090 return false;
16093 /* Return a new RTX holding the result of moving POINTER forward by
16094 AMOUNT bytes. */
16096 static rtx
16097 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16099 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16101 return adjust_automodify_address (pointer, GET_MODE (pointer),
16102 next, amount);
16105 /* Return a new RTX holding the result of moving POINTER forward by the
16106 size of the mode it points to. */
16108 static rtx
16109 aarch64_progress_pointer (rtx pointer)
16111 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16114 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16115 MODE bytes. */
16117 static void
16118 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16119 machine_mode mode)
16121 rtx reg = gen_reg_rtx (mode);
16123 /* "Cast" the pointers to the correct mode. */
16124 *src = adjust_address (*src, mode, 0);
16125 *dst = adjust_address (*dst, mode, 0);
16126 /* Emit the memcpy. */
16127 emit_move_insn (reg, *src);
16128 emit_move_insn (*dst, reg);
16129 /* Move the pointers forward. */
16130 *src = aarch64_progress_pointer (*src);
16131 *dst = aarch64_progress_pointer (*dst);
16134 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16135 we succeed, otherwise return false. */
16137 bool
16138 aarch64_expand_movmem (rtx *operands)
16140 unsigned int n;
16141 rtx dst = operands[0];
16142 rtx src = operands[1];
16143 rtx base;
16144 bool speed_p = !optimize_function_for_size_p (cfun);
16146 /* When optimizing for size, give a better estimate of the length of a
16147 memcpy call, but use the default otherwise. */
16148 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16150 /* We can't do anything smart if the amount to copy is not constant. */
16151 if (!CONST_INT_P (operands[2]))
16152 return false;
16154 n = UINTVAL (operands[2]);
16156 /* Try to keep the number of instructions low. For cases below 16 bytes we
16157 need to make at most two moves. For cases above 16 bytes it will be one
16158 move for each 16 byte chunk, then at most two additional moves. */
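  /* For instance, a 37-byte copy when optimizing for speed is estimated at
     37 / 16 == 2 moves for the 16-byte chunks plus at most 2 for the 5-byte
     tail; 4 <= max_instructions (15 / 2 == 7), so the copy is expanded
     inline.  The numbers here are purely illustrative.  */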
16159 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16160 return false;
16162 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16163 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16165 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16166 src = adjust_automodify_address (src, VOIDmode, base, 0);
16168   /* Simple cases.  Copy 0-3 bytes: first (if applicable) a 2-byte chunk,
16169      then a 1-byte chunk.  */
16170 if (n < 4)
16172 if (n >= 2)
16174 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16175 n -= 2;
16178 if (n == 1)
16179 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16181 return true;
16184 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16185 4-byte chunk, partially overlapping with the previously copied chunk. */
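  /* For example, for n == 7 the first chunk covers bytes 0-3; n drops to 3,
     so move == 3 - 4 == -1 steps both pointers back by one byte and the
     second chunk covers bytes 3-6.  Byte 3 is written twice, but nothing
     beyond the requested 7 bytes is touched.  */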
16186 if (n < 8)
16188 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16189 n -= 4;
16190 if (n > 0)
16192 int move = n - 4;
16194 src = aarch64_move_pointer (src, move);
16195 dst = aarch64_move_pointer (dst, move);
16196 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16198 return true;
16201 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16202 them, then (if applicable) an 8-byte chunk. */
16203 while (n >= 8)
16205 if (n / 16)
16207 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16208 n -= 16;
16210 else
16212 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16213 n -= 8;
16217   /* Finish the final bytes of the copy.  We can always do this in one
16218      instruction.  We either copy the exact amount we need, or partially
16219      overlap with the previous chunk we copied and copy 4 or 8 bytes.  */
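  /* For example, if 5 bytes remain we back up by 3 (move == 5 - 8 == -3) and
     issue one 8-byte copy that overlaps the previously copied chunk, instead
     of separate 4-byte and 1-byte copies.  */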
16220 if (n == 0)
16221 return true;
16222 else if (n == 1)
16223 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16224 else if (n == 2)
16225 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16226 else if (n == 4)
16227 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16228 else
16230 if (n == 3)
16232 src = aarch64_move_pointer (src, -1);
16233 dst = aarch64_move_pointer (dst, -1);
16234 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16236 else
16238 int move = n - 8;
16240 src = aarch64_move_pointer (src, move);
16241 dst = aarch64_move_pointer (dst, move);
16242 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16246 return true;
16249 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16250 SImode stores. Handle the case when the constant has identical
16251 bottom and top halves. This is beneficial when the two stores can be
16252 merged into an STP and we avoid synthesising potentially expensive
16253 immediates twice. Return true if such a split is possible. */
16255 bool
16256 aarch64_split_dimode_const_store (rtx dst, rtx src)
16258 rtx lo = gen_lowpart (SImode, src);
16259 rtx hi = gen_highpart_mode (SImode, DImode, src);
16261 bool size_p = optimize_function_for_size_p (cfun);
16263 if (!rtx_equal_p (lo, hi))
16264 return false;
16266 unsigned int orig_cost
16267 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16268 unsigned int lo_cost
16269 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16271 /* We want to transform:
16272 MOV x1, 49370
16273 MOVK x1, 0x140, lsl 16
16274 MOVK x1, 0xc0da, lsl 32
16275 MOVK x1, 0x140, lsl 48
16276 STR x1, [x0]
16277 into:
16278 MOV w1, 49370
16279 MOVK w1, 0x140, lsl 16
16280 STP w1, w1, [x0]
16281 So we want to perform this only when we save two instructions
16282 or more. When optimizing for size, however, accept any code size
16283 savings we can. */
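  /* In the sequence shown above, orig_cost == 4 (four MOV/MOVK instructions
     for the DImode immediate) and lo_cost == 2, so 4 > 2 + 1 and the split
     is performed; the instruction counts are illustrative of the typical
     case this check is aimed at.  */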
16284 if (size_p && orig_cost <= lo_cost)
16285 return false;
16287 if (!size_p
16288 && (orig_cost <= lo_cost + 1))
16289 return false;
16291 rtx mem_lo = adjust_address (dst, SImode, 0);
16292 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16293 return false;
16295 rtx tmp_reg = gen_reg_rtx (SImode);
16296 aarch64_expand_mov_immediate (tmp_reg, lo);
16297 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16298   /* Don't emit an explicit store pair as this may not always be profitable.
16299 Let the sched-fusion logic decide whether to merge them. */
16300 emit_move_insn (mem_lo, tmp_reg);
16301 emit_move_insn (mem_hi, tmp_reg);
16303 return true;
16306 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16308 static unsigned HOST_WIDE_INT
16309 aarch64_asan_shadow_offset (void)
16311 return (HOST_WIDE_INT_1 << 36);
16314 static rtx
16315 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16316 int code, tree treeop0, tree treeop1)
16318 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16319 rtx op0, op1;
16320 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16321 insn_code icode;
16322 struct expand_operand ops[4];
16324 start_sequence ();
16325 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16327 op_mode = GET_MODE (op0);
16328 if (op_mode == VOIDmode)
16329 op_mode = GET_MODE (op1);
16331 switch (op_mode)
16333 case E_QImode:
16334 case E_HImode:
16335 case E_SImode:
16336 cmp_mode = SImode;
16337 icode = CODE_FOR_cmpsi;
16338 break;
16340 case E_DImode:
16341 cmp_mode = DImode;
16342 icode = CODE_FOR_cmpdi;
16343 break;
16345 case E_SFmode:
16346 cmp_mode = SFmode;
16347 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16348 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16349 break;
16351 case E_DFmode:
16352 cmp_mode = DFmode;
16353 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16354 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16355 break;
16357 default:
16358 end_sequence ();
16359 return NULL_RTX;
16362 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16363 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16364 if (!op0 || !op1)
16366 end_sequence ();
16367 return NULL_RTX;
16369 *prep_seq = get_insns ();
16370 end_sequence ();
16372 create_fixed_operand (&ops[0], op0);
16373 create_fixed_operand (&ops[1], op1);
16375 start_sequence ();
16376 if (!maybe_expand_insn (icode, 2, ops))
16378 end_sequence ();
16379 return NULL_RTX;
16381 *gen_seq = get_insns ();
16382 end_sequence ();
16384 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16385 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16388 static rtx
16389 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16390 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16392 rtx op0, op1, target;
16393 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16394 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16395 insn_code icode;
16396 struct expand_operand ops[6];
16397 int aarch64_cond;
16399 push_to_sequence (*prep_seq);
16400 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16402 op_mode = GET_MODE (op0);
16403 if (op_mode == VOIDmode)
16404 op_mode = GET_MODE (op1);
16406 switch (op_mode)
16408 case E_QImode:
16409 case E_HImode:
16410 case E_SImode:
16411 cmp_mode = SImode;
16412 icode = CODE_FOR_ccmpsi;
16413 break;
16415 case E_DImode:
16416 cmp_mode = DImode;
16417 icode = CODE_FOR_ccmpdi;
16418 break;
16420 case E_SFmode:
16421 cmp_mode = SFmode;
16422 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16423 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16424 break;
16426 case E_DFmode:
16427 cmp_mode = DFmode;
16428 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16429 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16430 break;
16432 default:
16433 end_sequence ();
16434 return NULL_RTX;
16437 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16438 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16439 if (!op0 || !op1)
16441 end_sequence ();
16442 return NULL_RTX;
16444 *prep_seq = get_insns ();
16445 end_sequence ();
16447 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16448 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16450 if (bit_code != AND)
16452 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16453 GET_MODE (XEXP (prev, 0))),
16454 VOIDmode, XEXP (prev, 0), const0_rtx);
16455 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16458 create_fixed_operand (&ops[0], XEXP (prev, 0));
16459 create_fixed_operand (&ops[1], target);
16460 create_fixed_operand (&ops[2], op0);
16461 create_fixed_operand (&ops[3], op1);
16462 create_fixed_operand (&ops[4], prev);
16463 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16465 push_to_sequence (*gen_seq);
16466 if (!maybe_expand_insn (icode, 6, ops))
16468 end_sequence ();
16469 return NULL_RTX;
16472 *gen_seq = get_insns ();
16473 end_sequence ();
16475 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16478 #undef TARGET_GEN_CCMP_FIRST
16479 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16481 #undef TARGET_GEN_CCMP_NEXT
16482 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16484 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16485 instruction fusion of some sort. */
16487 static bool
16488 aarch64_macro_fusion_p (void)
16490 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16494 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16495 should be kept together during scheduling. */
16497 static bool
16498 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16500 rtx set_dest;
16501 rtx prev_set = single_set (prev);
16502 rtx curr_set = single_set (curr);
16503 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16504 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16506 if (!aarch64_macro_fusion_p ())
16507 return false;
16509 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16511 /* We are trying to match:
16512 prev (mov) == (set (reg r0) (const_int imm16))
16513 curr (movk) == (set (zero_extract (reg r0)
16514 (const_int 16)
16515 (const_int 16))
16516 (const_int imm16_1)) */
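      /* In assembly terms this is a pair such as:
	   mov  x0, 0x1234
	   movk x0, 0x5678, lsl 16
	 keeping the two adjacent lets cores that fuse MOV/MOVK benefit.  */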
16518 set_dest = SET_DEST (curr_set);
16520 if (GET_CODE (set_dest) == ZERO_EXTRACT
16521 && CONST_INT_P (SET_SRC (curr_set))
16522 && CONST_INT_P (SET_SRC (prev_set))
16523 && CONST_INT_P (XEXP (set_dest, 2))
16524 && INTVAL (XEXP (set_dest, 2)) == 16
16525 && REG_P (XEXP (set_dest, 0))
16526 && REG_P (SET_DEST (prev_set))
16527 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16529 return true;
16533 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16536 /* We're trying to match:
16537 prev (adrp) == (set (reg r1)
16538 (high (symbol_ref ("SYM"))))
16539 curr (add) == (set (reg r0)
16540 (lo_sum (reg r1)
16541 (symbol_ref ("SYM"))))
16542 Note that r0 need not necessarily be the same as r1, especially
16543 during pre-regalloc scheduling. */
16545 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16546 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16548 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16549 && REG_P (XEXP (SET_SRC (curr_set), 0))
16550 && REGNO (XEXP (SET_SRC (curr_set), 0))
16551 == REGNO (SET_DEST (prev_set))
16552 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16553 XEXP (SET_SRC (curr_set), 1)))
16554 return true;
16558 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16561 /* We're trying to match:
16562 prev (movk) == (set (zero_extract (reg r0)
16563 (const_int 16)
16564 (const_int 32))
16565 (const_int imm16_1))
16566 curr (movk) == (set (zero_extract (reg r0)
16567 (const_int 16)
16568 (const_int 48))
16569 (const_int imm16_2)) */
16571 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16572 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16573 && REG_P (XEXP (SET_DEST (prev_set), 0))
16574 && REG_P (XEXP (SET_DEST (curr_set), 0))
16575 && REGNO (XEXP (SET_DEST (prev_set), 0))
16576 == REGNO (XEXP (SET_DEST (curr_set), 0))
16577 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16578 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16579 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16580 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16581 && CONST_INT_P (SET_SRC (prev_set))
16582 && CONST_INT_P (SET_SRC (curr_set)))
16583 return true;
16586 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16588 /* We're trying to match:
16589 prev (adrp) == (set (reg r0)
16590 (high (symbol_ref ("SYM"))))
16591 curr (ldr) == (set (reg r1)
16592 (mem (lo_sum (reg r0)
16593 (symbol_ref ("SYM")))))
16595 curr (ldr) == (set (reg r1)
16596 (zero_extend (mem
16597 (lo_sum (reg r0)
16598 (symbol_ref ("SYM")))))) */
16599 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16600 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16602 rtx curr_src = SET_SRC (curr_set);
16604 if (GET_CODE (curr_src) == ZERO_EXTEND)
16605 curr_src = XEXP (curr_src, 0);
16607 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16608 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16609 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16610 == REGNO (SET_DEST (prev_set))
16611 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16612 XEXP (SET_SRC (prev_set), 0)))
16613 return true;
16617 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16618 && aarch_crypto_can_dual_issue (prev, curr))
16619 return true;
16621 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16622 && any_condjump_p (curr))
16624 enum attr_type prev_type = get_attr_type (prev);
16626 unsigned int condreg1, condreg2;
16627 rtx cc_reg_1;
16628 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16629 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16631 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16632 && prev
16633 && modified_in_p (cc_reg_1, prev))
16635       /* FIXME: this misses some cases that are considered simple arithmetic
16636          instructions for ThunderX.  Simple shifts are missed here.  */
16637 if (prev_type == TYPE_ALUS_SREG
16638 || prev_type == TYPE_ALUS_IMM
16639 || prev_type == TYPE_LOGICS_REG
16640 || prev_type == TYPE_LOGICS_IMM)
16641 return true;
16645 if (prev_set
16646 && curr_set
16647 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16648 && any_condjump_p (curr))
16650 /* We're trying to match:
16651 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16652 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16653 (const_int 0))
16654 (label_ref ("SYM"))
16655 (pc)) */
16656 if (SET_DEST (curr_set) == (pc_rtx)
16657 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16658 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16659 && REG_P (SET_DEST (prev_set))
16660 && REGNO (SET_DEST (prev_set))
16661 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16663 /* Fuse ALU operations followed by conditional branch instruction. */
16664 switch (get_attr_type (prev))
16666 case TYPE_ALU_IMM:
16667 case TYPE_ALU_SREG:
16668 case TYPE_ADC_REG:
16669 case TYPE_ADC_IMM:
16670 case TYPE_ADCS_REG:
16671 case TYPE_ADCS_IMM:
16672 case TYPE_LOGIC_REG:
16673 case TYPE_LOGIC_IMM:
16674 case TYPE_CSEL:
16675 case TYPE_ADR:
16676 case TYPE_MOV_IMM:
16677 case TYPE_SHIFT_REG:
16678 case TYPE_SHIFT_IMM:
16679 case TYPE_BFM:
16680 case TYPE_RBIT:
16681 case TYPE_REV:
16682 case TYPE_EXTEND:
16683 return true;
16685 default:;
16690 return false;
16693 /* Return true iff the instruction fusion described by OP is enabled. */
16695 bool
16696 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16698 return (aarch64_tune_params.fusible_ops & op) != 0;
16701 /* If MEM is in the form of [base+offset], extract the two parts
16702    of the address and store them in BASE and OFFSET; otherwise return false
16703 after clearing BASE and OFFSET. */
16705 bool
16706 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16708 rtx addr;
16710 gcc_assert (MEM_P (mem));
16712 addr = XEXP (mem, 0);
16714 if (REG_P (addr))
16716 *base = addr;
16717 *offset = const0_rtx;
16718 return true;
16721 if (GET_CODE (addr) == PLUS
16722 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16724 *base = XEXP (addr, 0);
16725 *offset = XEXP (addr, 1);
16726 return true;
16729 *base = NULL_RTX;
16730 *offset = NULL_RTX;
16732 return false;
16735 /* Types for scheduling fusion. */
16736 enum sched_fusion_type
16738 SCHED_FUSION_NONE = 0,
16739 SCHED_FUSION_LD_SIGN_EXTEND,
16740 SCHED_FUSION_LD_ZERO_EXTEND,
16741 SCHED_FUSION_LD,
16742 SCHED_FUSION_ST,
16743 SCHED_FUSION_NUM
16746 /* If INSN is a load or store with an address in the form of [base+offset],
16747    extract the two parts and store them in BASE and OFFSET.  Return the
16748    scheduling fusion type of this INSN.  */
16750 static enum sched_fusion_type
16751 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16753 rtx x, dest, src;
16754 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16756 gcc_assert (INSN_P (insn));
16757 x = PATTERN (insn);
16758 if (GET_CODE (x) != SET)
16759 return SCHED_FUSION_NONE;
16761 src = SET_SRC (x);
16762 dest = SET_DEST (x);
16764 machine_mode dest_mode = GET_MODE (dest);
16766 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16767 return SCHED_FUSION_NONE;
16769 if (GET_CODE (src) == SIGN_EXTEND)
16771 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16772 src = XEXP (src, 0);
16773 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16774 return SCHED_FUSION_NONE;
16776 else if (GET_CODE (src) == ZERO_EXTEND)
16778 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16779 src = XEXP (src, 0);
16780 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16781 return SCHED_FUSION_NONE;
16784 if (GET_CODE (src) == MEM && REG_P (dest))
16785 extract_base_offset_in_addr (src, base, offset);
16786 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16788 fusion = SCHED_FUSION_ST;
16789 extract_base_offset_in_addr (dest, base, offset);
16791 else
16792 return SCHED_FUSION_NONE;
16794 if (*base == NULL_RTX || *offset == NULL_RTX)
16795 fusion = SCHED_FUSION_NONE;
16797 return fusion;
16800 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16802    Currently we only support fusing ldr and str instructions, so FUSION_PRI
16803    and PRI are only calculated for these instructions.  For other instructions,
16804    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16805    types of instruction fusion can be added by returning different priorities.
16807 It's important that irrelevant instructions get the largest FUSION_PRI. */
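/* For example, two loads from the same base register receive identical
   FUSION_PRI values (same fusion type, same base), while their PRI values
   differ by offset so that the insn with the smaller offset goes first,
   keeping candidate ldp/stp pairs adjacent.  */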
16809 static void
16810 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16811 int *fusion_pri, int *pri)
16813 int tmp, off_val;
16814 rtx base, offset;
16815 enum sched_fusion_type fusion;
16817 gcc_assert (INSN_P (insn));
16819 tmp = max_pri - 1;
16820 fusion = fusion_load_store (insn, &base, &offset);
16821 if (fusion == SCHED_FUSION_NONE)
16823 *pri = tmp;
16824 *fusion_pri = tmp;
16825 return;
16828 /* Set FUSION_PRI according to fusion type and base register. */
16829 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16831 /* Calculate PRI. */
16832 tmp /= 2;
16834 /* INSN with smaller offset goes first. */
16835 off_val = (int)(INTVAL (offset));
16836 if (off_val >= 0)
16837 tmp -= (off_val & 0xfffff);
16838 else
16839 tmp += ((- off_val) & 0xfffff);
16841 *pri = tmp;
16842 return;
16845 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16846 Adjust priority of sha1h instructions so they are scheduled before
16847 other SHA1 instructions. */
16849 static int
16850 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16852 rtx x = PATTERN (insn);
16854 if (GET_CODE (x) == SET)
16856 x = SET_SRC (x);
16858 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16859 return priority + 10;
16862 return priority;
16865 /* Given OPERANDS of consecutive load/store, check if we can merge
16866 them into ldp/stp. LOAD is true if they are load instructions.
16867 MODE is the mode of memory operands. */
16869 bool
16870 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16871 machine_mode mode)
16873 HOST_WIDE_INT offval_1, offval_2, msize;
16874 enum reg_class rclass_1, rclass_2;
16875 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16877 if (load)
16879 mem_1 = operands[1];
16880 mem_2 = operands[3];
16881 reg_1 = operands[0];
16882 reg_2 = operands[2];
16883 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16884 if (REGNO (reg_1) == REGNO (reg_2))
16885 return false;
16887 else
16889 mem_1 = operands[0];
16890 mem_2 = operands[2];
16891 reg_1 = operands[1];
16892 reg_2 = operands[3];
16895 /* The mems cannot be volatile. */
16896 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16897 return false;
16899 /* If we have SImode and slow unaligned ldp,
16900      check that the alignment is at least 8 bytes.  */
16901 if (mode == SImode
16902 && (aarch64_tune_params.extra_tuning_flags
16903 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16904 && !optimize_size
16905 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16906 return false;
16908 /* Check if the addresses are in the form of [base+offset]. */
16909 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16910 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16911 return false;
16912 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16913 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16914 return false;
16916   /* Check if the bases are the same.  */
16917 if (!rtx_equal_p (base_1, base_2))
16918 return false;
16920 /* The operands must be of the same size. */
16921 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16922 GET_MODE_SIZE (GET_MODE (mem_2))));
16924 offval_1 = INTVAL (offset_1);
16925 offval_2 = INTVAL (offset_2);
16926 /* We should only be trying this for fixed-sized modes. There is no
16927 SVE LDP/STP instruction. */
16928 msize = GET_MODE_SIZE (mode).to_constant ();
16929 /* Check if the offsets are consecutive. */
16930 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16931 return false;
16933 /* Check if the addresses are clobbered by load. */
16934 if (load)
16936 if (reg_mentioned_p (reg_1, mem_1))
16937 return false;
16939 /* In increasing order, the last load can clobber the address. */
16940 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16941 return false;
16944 /* One of the memory accesses must be a mempair operand.
16945 If it is not the first one, they need to be swapped by the
16946 peephole. */
16947 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16948 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16949 return false;
16951 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16952 rclass_1 = FP_REGS;
16953 else
16954 rclass_1 = GENERAL_REGS;
16956 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16957 rclass_2 = FP_REGS;
16958 else
16959 rclass_2 = GENERAL_REGS;
16961   /* Check if the registers are of the same class.  */
16962 if (rclass_1 != rclass_2)
16963 return false;
16965 return true;
16968 /* Given OPERANDS of consecutive load/store that can be merged,
16969 swap them if they are not in ascending order. */
16970 void
16971 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16973 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16974 HOST_WIDE_INT offval_1, offval_2;
16976 if (load)
16978 mem_1 = operands[1];
16979 mem_2 = operands[3];
16981 else
16983 mem_1 = operands[0];
16984 mem_2 = operands[2];
16987 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16988 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16990 offval_1 = INTVAL (offset_1);
16991 offval_2 = INTVAL (offset_2);
16993 if (offval_1 > offval_2)
16995 /* Irrespective of whether this is a load or a store,
16996 we do the same swap. */
16997 std::swap (operands[0], operands[2]);
16998 std::swap (operands[1], operands[3]);
17002 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17003 comparison between the two. */
17005 aarch64_host_wide_int_compare (const void *x, const void *y)
17007 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17008 * ((const HOST_WIDE_INT *) y));
17011 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17012 other pointing to a REG rtx containing an offset, compare the offsets
17013 of the two pairs.
17015 Return:
17017 1 iff offset (X) > offset (Y)
17018 0 iff offset (X) == offset (Y)
17019 -1 iff offset (X) < offset (Y) */
17021 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17023 const rtx * operands_1 = (const rtx *) x;
17024 const rtx * operands_2 = (const rtx *) y;
17025 rtx mem_1, mem_2, base, offset_1, offset_2;
17027 if (MEM_P (operands_1[0]))
17028 mem_1 = operands_1[0];
17029 else
17030 mem_1 = operands_1[1];
17032 if (MEM_P (operands_2[0]))
17033 mem_2 = operands_2[0];
17034 else
17035 mem_2 = operands_2[1];
17037 /* Extract the offsets. */
17038 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17039 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17041 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17043 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17046 /* Given OPERANDS of consecutive load/store, check if we can merge
17047 them into ldp/stp by adjusting the offset. LOAD is true if they
17048 are load instructions. MODE is the mode of memory operands.
17050 Given below consecutive stores:
17052 str w1, [xb, 0x100]
17053 str w1, [xb, 0x104]
17054 str w1, [xb, 0x108]
17055 str w1, [xb, 0x10c]
17057 Though the offsets are out of the range supported by stp, we can
17058 still pair them after adjusting the offset, like:
17060 add scratch, xb, 0x100
17061 stp w1, w1, [scratch]
17062 stp w1, w1, [scratch, 0x8]
17064 The peephole patterns detecting this opportunity should guarantee
17065    the scratch register is available.  */
17067 bool
17068 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17069 scalar_mode mode)
17071 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
17072 HOST_WIDE_INT offvals[4], msize;
17073 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
17074 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
17076 if (load)
17078 reg_1 = operands[0];
17079 mem_1 = operands[1];
17080 reg_2 = operands[2];
17081 mem_2 = operands[3];
17082 reg_3 = operands[4];
17083 mem_3 = operands[5];
17084 reg_4 = operands[6];
17085 mem_4 = operands[7];
17086 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
17087 && REG_P (reg_3) && REG_P (reg_4));
17089 /* Do not attempt to merge the loads if the loads clobber each other. */
17090 for (int i = 0; i < 8; i += 2)
17091 for (int j = i + 2; j < 8; j += 2)
17092 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17093 return false;
17095 else
17097 mem_1 = operands[0];
17098 reg_1 = operands[1];
17099 mem_2 = operands[2];
17100 reg_2 = operands[3];
17101 mem_3 = operands[4];
17102 reg_3 = operands[5];
17103 mem_4 = operands[6];
17104 reg_4 = operands[7];
17106       /* Skip if the memory operand is by itself valid for ldp/stp.  */
17107 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
17108 return false;
17110 /* The mems cannot be volatile. */
17111 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
17112       || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
17113 return false;
17115 /* Check if the addresses are in the form of [base+offset]. */
17116 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17117 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17118 return false;
17119 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17120 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17121 return false;
17122 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17123 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17124 return false;
17125 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17126 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17127 return false;
17129   /* Check if the bases are the same.  */
17130 if (!rtx_equal_p (base_1, base_2)
17131 || !rtx_equal_p (base_2, base_3)
17132 || !rtx_equal_p (base_3, base_4))
17133 return false;
17135 offvals[0] = INTVAL (offset_1);
17136 offvals[1] = INTVAL (offset_2);
17137 offvals[2] = INTVAL (offset_3);
17138 offvals[3] = INTVAL (offset_4);
17139 msize = GET_MODE_SIZE (mode);
17141 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17142 qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
17144 if (!(offvals[1] == offvals[0] + msize
17145 && offvals[3] == offvals[2] + msize))
17146 return false;
17148 /* Check that offsets are within range of each other. The ldp/stp
17149 instructions have 7 bit immediate offsets, so use 0x80. */
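  /* For example, with SImode accesses (msize == 4) the first offsets of the
     two prospective pairs must differ by less than 4 * 0x80 == 512 bytes,
     since each LDP/STP immediate is a signed 7-bit multiple of the access
     size.  */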
17150 if (offvals[2] - offvals[0] >= msize * 0x80)
17151 return false;
17153 /* The offsets must be aligned with respect to each other. */
17154 if (offvals[0] % msize != offvals[2] % msize)
17155 return false;
17157 /* Check if the addresses are clobbered by load. */
17158 if (load && (reg_mentioned_p (reg_1, mem_1)
17159 || reg_mentioned_p (reg_2, mem_2)
17160 || reg_mentioned_p (reg_3, mem_3)
17161 || reg_mentioned_p (reg_4, mem_4)))
17162 return false;
17164 /* If we have SImode and slow unaligned ldp,
17165      check that the alignment is at least 8 bytes.  */
17166 if (mode == SImode
17167 && (aarch64_tune_params.extra_tuning_flags
17168 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17169 && !optimize_size
17170 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17171 return false;
17173 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17174 rclass_1 = FP_REGS;
17175 else
17176 rclass_1 = GENERAL_REGS;
17178 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17179 rclass_2 = FP_REGS;
17180 else
17181 rclass_2 = GENERAL_REGS;
17183 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17184 rclass_3 = FP_REGS;
17185 else
17186 rclass_3 = GENERAL_REGS;
17188 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17189 rclass_4 = FP_REGS;
17190 else
17191 rclass_4 = GENERAL_REGS;
17193   /* Check if the registers are of the same class.  */
17194 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17195 return false;
17197 return true;
17200 /* Given OPERANDS of consecutive load/store, this function pairs them
17201 into LDP/STP after adjusting the offset. It depends on the fact
17202 that the operands can be sorted so the offsets are correct for STP.
17203 MODE is the mode of memory operands. CODE is the rtl operator
17204    which should be applied to all memory operands; it is SIGN_EXTEND,
17205 ZERO_EXTEND or UNKNOWN. */
17207 bool
17208 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17209 scalar_mode mode, RTX_CODE code)
17211 rtx base, offset_1, offset_3, t1, t2;
17212 rtx mem_1, mem_2, mem_3, mem_4;
17213 rtx temp_operands[8];
17214 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17215 stp_off_upper_limit, stp_off_lower_limit, msize;
17217 /* We make changes on a copy as we may still bail out. */
17218 for (int i = 0; i < 8; i ++)
17219 temp_operands[i] = operands[i];
17221 /* Sort the operands. */
17222 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17224 if (load)
17226 mem_1 = temp_operands[1];
17227 mem_2 = temp_operands[3];
17228 mem_3 = temp_operands[5];
17229 mem_4 = temp_operands[7];
17231 else
17233 mem_1 = temp_operands[0];
17234 mem_2 = temp_operands[2];
17235 mem_3 = temp_operands[4];
17236 mem_4 = temp_operands[6];
17237 gcc_assert (code == UNKNOWN);
17240 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17241 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17242 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17243 && offset_3 != NULL_RTX);
17245 /* Adjust offset so it can fit in LDP/STP instruction. */
17246 msize = GET_MODE_SIZE (mode);
17247 stp_off_upper_limit = msize * (0x40 - 1);
17248 stp_off_lower_limit = - msize * 0x40;
17250 off_val_1 = INTVAL (offset_1);
17251 off_val_3 = INTVAL (offset_3);
17253 /* The base offset is optimally half way between the two STP/LDP offsets. */
17254 if (msize <= 4)
17255 base_off = (off_val_1 + off_val_3) / 2;
17256 else
17257 /* However, due to issues with negative LDP/STP offset generation for
17258        larger modes (DF, DI and vector modes), we must not use negative
17259        addresses beyond what 9 signed unadjusted bits can store.  This
17260 provides the most range in this case. */
17261 base_off = off_val_1;
17263 /* Adjust the base so that it is aligned with the addresses but still
17264 optimal. */
17265 if (base_off % msize != off_val_1 % msize)
17266 /* Fix the offset, bearing in mind we want to make it bigger not
17267 smaller. */
17268 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17269 else if (msize <= 4)
17270 /* The negative range of LDP/STP is one larger than the positive range. */
17271 base_off += msize;
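  /* A worked example: for four SImode stores at offsets 0x100, 0x104, 0x108
     and 0x10c, off_val_1 == 0x100 and off_val_3 == 0x108, so base_off starts
     at 0x104 and, being already suitably aligned, is bumped by msize to
     0x108.  The resulting LDP/STP offsets are -8/-4 and 0/4, comfortably
     inside the [-256, 252] range for SImode.  */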
17273 /* Check if base offset is too big or too small. We can attempt to resolve
17274 this issue by setting it to the maximum value and seeing if the offsets
17275 still fit. */
17276 if (base_off >= 0x1000)
17278 base_off = 0x1000 - 1;
17279 /* We must still make sure that the base offset is aligned with respect
17280        to the address.  But it may not be made any bigger.  */
17281 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17284 /* Likewise for the case where the base is too small. */
17285 if (base_off <= -0x1000)
17287 base_off = -0x1000 + 1;
17288 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17291 /* Offset of the first STP/LDP. */
17292 new_off_1 = off_val_1 - base_off;
17294 /* Offset of the second STP/LDP. */
17295 new_off_3 = off_val_3 - base_off;
17297 /* The offsets must be within the range of the LDP/STP instructions. */
17298 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17299 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17300 return false;
17302 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17303 new_off_1), true);
17304 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17305 new_off_1 + msize), true);
17306 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17307 new_off_3), true);
17308 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17309 new_off_3 + msize), true);
17311 if (!aarch64_mem_pair_operand (mem_1, mode)
17312 || !aarch64_mem_pair_operand (mem_3, mode))
17313 return false;
17315 if (code == ZERO_EXTEND)
17317 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17318 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17319 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17320 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17322 else if (code == SIGN_EXTEND)
17324 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17325 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17326 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17327 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17330 if (load)
17332 operands[0] = temp_operands[0];
17333 operands[1] = mem_1;
17334 operands[2] = temp_operands[2];
17335 operands[3] = mem_2;
17336 operands[4] = temp_operands[4];
17337 operands[5] = mem_3;
17338 operands[6] = temp_operands[6];
17339 operands[7] = mem_4;
17341 else
17343 operands[0] = mem_1;
17344 operands[1] = temp_operands[1];
17345 operands[2] = mem_2;
17346 operands[3] = temp_operands[3];
17347 operands[4] = mem_3;
17348 operands[5] = temp_operands[5];
17349 operands[6] = mem_4;
17350 operands[7] = temp_operands[7];
17353 /* Emit adjusting instruction. */
17354 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17355 /* Emit ldp/stp instructions. */
17356 t1 = gen_rtx_SET (operands[0], operands[1]);
17357 t2 = gen_rtx_SET (operands[2], operands[3]);
17358 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17359 t1 = gen_rtx_SET (operands[4], operands[5]);
17360 t2 = gen_rtx_SET (operands[6], operands[7]);
17361 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17362 return true;
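/* Illustrative worked example (editorial addition, not from the original
   sources): suppose the four accesses are DImode loads at base+264, +272,
   +280 and +288, so msize == 8.  The limits derived above are

     stp_off_upper_limit = 8 * (0x40 - 1) =  504
     stp_off_lower_limit = -8 * 0x40      = -512

   and, since msize > 4, base_off is taken as off_val_1 == 264.  The final
   offsets are therefore 0/8 for the first pair and 16/24 for the second,
   both comfortably inside [-512, 504], giving code along the lines of
   (register names here are purely hypothetical):

     add  x16, xbase, #264
     ldp  x0, x1, [x16]
     ldp  x2, x3, [x16, #16]  */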
17365 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17366 it isn't worth branching around empty masked ops (including masked
17367 stores). */
17369 static bool
17370 aarch64_empty_mask_is_expensive (unsigned)
17372 return false;
17375 /* Return 1 if a pseudo register should be created and used to hold
17376 the GOT address for PIC code. */
17378 bool
17379 aarch64_use_pseudo_pic_reg (void)
17381 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17384 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17386 static int
17387 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17389 switch (XINT (x, 1))
17391 case UNSPEC_GOTSMALLPIC:
17392 case UNSPEC_GOTSMALLPIC28K:
17393 case UNSPEC_GOTTINYPIC:
17394 return 0;
17395 default:
17396 break;
17399 return default_unspec_may_trap_p (x, flags);
17403 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
17404 return the log2 of that value. Otherwise return -1. */
17406 int
17407 aarch64_fpconst_pow_of_2 (rtx x)
17409 const REAL_VALUE_TYPE *r;
17411 if (!CONST_DOUBLE_P (x))
17412 return -1;
17414 r = CONST_DOUBLE_REAL_VALUE (x);
17416 if (REAL_VALUE_NEGATIVE (*r)
17417 || REAL_VALUE_ISNAN (*r)
17418 || REAL_VALUE_ISINF (*r)
17419 || !real_isinteger (r, DFmode))
17420 return -1;
17422 return exact_log2 (real_to_integer (r));
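/* Illustrative note (editorial addition, not from the original sources):
   for a CONST_DOUBLE holding 8.0 every check above passes,
   real_to_integer returns 8 and exact_log2 (8) == 3, so the caller gets
   3; for -4.0, 3.0 or NaN the result is -1.  A typical use (assumed
   here) is a predicate or pattern condition along the lines of

     aarch64_fpconst_pow_of_2 (op) > 0

   so that a multiply or divide by 2^N can be folded into the #fbits
   operand of a fixed-point SCVTF/FCVTZS form.  */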
17425 /* If X is a vector of equal CONST_DOUBLE values and that value is
17426 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17428 int
17429 aarch64_vec_fpconst_pow_of_2 (rtx x)
17431 int nelts;
17432 if (GET_CODE (x) != CONST_VECTOR
17433 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17434 return -1;
17436 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17437 return -1;
17439 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17440 if (firstval <= 0)
17441 return -1;
17443 for (int i = 1; i < nelts; i++)
17444 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17445 return -1;
17447 return firstval;
17450 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17451 to float.
17453 __fp16 always promotes through this hook.
17454 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17455 through the generic excess precision logic rather than here. */
17457 static tree
17458 aarch64_promoted_type (const_tree t)
17460 if (SCALAR_FLOAT_TYPE_P (t)
17461 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17462 return float_type_node;
17464 return NULL_TREE;
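/* Illustrative sketch (editorial addition, not from the original sources):
   the user-visible effect of returning float_type_node above.  For

     __fp16 scale (__fp16 a, __fp16 b)
     {
       return a + b;
     }

   both operands are promoted to float, the addition is performed in
   SFmode and the result is narrowed back to __fp16 on return.  _Float16
   is deliberately not promoted here; its evaluation format is controlled
   by the excess-precision handling further down in this file.  */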
17467 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17469 static bool
17470 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17471 optimization_type opt_type)
17473 switch (op)
17475 case rsqrt_optab:
17476 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17478 default:
17479 return true;
17483 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17485 static unsigned int
17486 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17487 int *offset)
17489 /* Polynomial invariant 1 == (VG / 2) - 1. */
17490 gcc_assert (i == 1);
17491 *factor = 2;
17492 *offset = 1;
17493 return AARCH64_DWARF_VG;
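/* Illustrative note (editorial addition, not from the original sources):
   the return value and out-parameters above describe indeterminate 1 as
   AARCH64_DWARF_VG / factor - offset, i.e. VG / 2 - 1.  For 256-bit SVE
   vectors VG (the number of 64-bit granules per vector) is 4, so the
   indeterminate is 4 / 2 - 1 = 1 and a poly_int size such as 16 + 16x
   for VNx16QI resolves to 32 bytes.  */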
17496 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17497 if MODE is HFmode, and punt to the generic implementation otherwise. */
17499 static bool
17500 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17502 return (mode == HFmode
17503 ? true
17504 : default_libgcc_floating_mode_supported_p (mode));
17507 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17508 if MODE is HFmode, and punt to the generic implementation otherwise. */
17510 static bool
17511 aarch64_scalar_mode_supported_p (scalar_mode mode)
17513 return (mode == HFmode
17514 ? true
17515 : default_scalar_mode_supported_p (mode));
17518 /* Set the value of FLT_EVAL_METHOD.
17519 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17521 0: evaluate all operations and constants, whose semantic type has at
17522 most the range and precision of type float, to the range and
17523 precision of float; evaluate all other operations and constants to
17524 the range and precision of the semantic type;
17526 N, where _FloatN is a supported interchange floating type:
17527 evaluate all operations and constants, whose semantic type has at
17528 most the range and precision of _FloatN type, to the range and
17529 precision of the _FloatN type; evaluate all other operations and
17530 constants to the range and precision of the semantic type;
17532 If we have the ARMv8.2-A extensions then we support _Float16 in native
17533 precision, so we should set this to 16. Otherwise, we support the type,
17534 but want to evaluate expressions in float precision, so set this to
17535 0. */
17537 static enum flt_eval_method
17538 aarch64_excess_precision (enum excess_precision_type type)
17540 switch (type)
17542 case EXCESS_PRECISION_TYPE_FAST:
17543 case EXCESS_PRECISION_TYPE_STANDARD:
17544 /* We can calculate either in 16-bit range and precision or
17545 32-bit range and precision. Make that decision based on whether
17546 we have native support for the ARMv8.2-A 16-bit floating-point
17547 instructions or not. */
17548 return (TARGET_FP_F16INST
17549 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17550 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17551 case EXCESS_PRECISION_TYPE_IMPLICIT:
17552 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17553 default:
17554 gcc_unreachable ();
17556 return FLT_EVAL_METHOD_UNPREDICTABLE;
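/* Illustrative sketch (editorial addition, not from the original sources):
   the consequence of the choice above for a source fragment such as

     _Float16 x, y, z;
     z = x * y + z;

   With TARGET_FP_F16INST (ARMv8.2-A FP16) the whole expression is
   evaluated in _Float16, matching FLT_EVAL_METHOD == 16, and can use the
   H-register FMUL/FADD forms; without it each operation is widened to
   float, evaluated in SFmode and narrowed back, matching
   FLT_EVAL_METHOD == 0.  */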
17559 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17560 scheduled for speculative execution. Reject the long-running division
17561 and square-root instructions. */
17563 static bool
17564 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17566 switch (get_attr_type (insn))
17568 case TYPE_SDIV:
17569 case TYPE_UDIV:
17570 case TYPE_FDIVS:
17571 case TYPE_FDIVD:
17572 case TYPE_FSQRTS:
17573 case TYPE_FSQRTD:
17574 case TYPE_NEON_FP_SQRT_S:
17575 case TYPE_NEON_FP_SQRT_D:
17576 case TYPE_NEON_FP_SQRT_S_Q:
17577 case TYPE_NEON_FP_SQRT_D_Q:
17578 case TYPE_NEON_FP_DIV_S:
17579 case TYPE_NEON_FP_DIV_D:
17580 case TYPE_NEON_FP_DIV_S_Q:
17581 case TYPE_NEON_FP_DIV_D_Q:
17582 return false;
17583 default:
17584 return true;
17588 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17590 static int
17591 aarch64_compute_pressure_classes (reg_class *classes)
17593 int i = 0;
17594 classes[i++] = GENERAL_REGS;
17595 classes[i++] = FP_REGS;
17596 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17597 registers need to go in PR_LO_REGS at some point during their
17598 lifetime. Splitting it into two halves has the effect of making
17599 all predicates count against PR_LO_REGS, so that we try whenever
17600 possible to restrict the number of live predicates to 8. This
17601 greatly reduces the amount of spilling in certain loops. */
17602 classes[i++] = PR_LO_REGS;
17603 classes[i++] = PR_HI_REGS;
17604 return i;
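/* Illustrative note (editorial addition, not from the original sources):
   most predicated SVE instructions encode their governing predicate in a
   3-bit field and can therefore only name p0-p7 (PR_LO_REGS); p8-p15
   (PR_HI_REGS) are mainly usable by predicate-manipulation instructions.
   Reporting the two halves as separate pressure classes makes the
   register-pressure heuristics treat eight live predicates as the
   practical budget, which is what the comment above relies on.  */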
17607 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17609 static bool
17610 aarch64_can_change_mode_class (machine_mode from,
17611 machine_mode to, reg_class_t)
17613 if (BYTES_BIG_ENDIAN)
17615 bool from_sve_p = aarch64_sve_data_mode_p (from);
17616 bool to_sve_p = aarch64_sve_data_mode_p (to);
17618 /* Don't allow changes between SVE data modes and non-SVE modes.
17619 See the comment at the head of aarch64-sve.md for details. */
17620 if (from_sve_p != to_sve_p)
17621 return false;
17623 /* Don't allow changes in element size: lane 0 of the new vector
17624 would not then be lane 0 of the old vector. See the comment
17625 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17626 description.
17628 In the worst case, this forces a register to be spilled in
17629 one mode and reloaded in the other, which handles the
17630 endianness correctly. */
17631 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17632 return false;
17634 return true;
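/* Illustrative note (editorial addition, not from the original sources):
   on big-endian, a change from (say) VNx8HI to VNx16QI is rejected by the
   element-size test above because GET_MODE_UNIT_SIZE differs (2 vs. 1),
   so what was lane 0 of the HI vector would no longer be lane 0 of the
   QI view.  The fallback is the spill-and-reload path mentioned in the
   comment, which goes through memory and so applies the correct byte
   ordering.  */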
17637 /* Implement TARGET_EARLY_REMAT_MODES. */
17639 static void
17640 aarch64_select_early_remat_modes (sbitmap modes)
17642 /* SVE values are not normally live across a call, so it should be
17643 worth doing early rematerialization even in VL-specific mode. */
17644 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17646 machine_mode mode = (machine_mode) i;
17647 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17648 if (vec_flags & VEC_ANY_SVE)
17649 bitmap_set_bit (modes, i);
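/* Illustrative note (editorial addition, not from the original sources):
   under the base AAPCS64 only the low 64 bits of V8-V15 are preserved
   across calls, so a full SVE vector that is live across a call would
   otherwise have to be spilled to a variable-length stack slot.  Flagging
   every VEC_ANY_SVE mode here lets the early-remat pass recompute such
   values after the call instead, which is usually cheaper than the
   spill.  */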
17653 /* Target-specific selftests. */
17655 #if CHECKING_P
17657 namespace selftest {
17659 /* Selftest for the RTL loader.
17660 Verify that the RTL loader copes with a dump from
17661 print_rtx_function. This is essentially just a test that class
17662 function_reader can handle a real dump, but it also verifies
17663 that lookup_reg_by_dump_name correctly handles hard regs.
17664 The presence of hard reg names in the dump means that the test is
17665 target-specific, hence it is in this file. */
17667 static void
17668 aarch64_test_loading_full_dump ()
17670 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17672 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17674 rtx_insn *insn_1 = get_insn_by_uid (1);
17675 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17677 rtx_insn *insn_15 = get_insn_by_uid (15);
17678 ASSERT_EQ (INSN, GET_CODE (insn_15));
17679 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17681 /* Verify crtl->return_rtx. */
17682 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17683 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17684 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17687 /* Run all target-specific selftests. */
17689 static void
17690 aarch64_run_selftests (void)
17692 aarch64_test_loading_full_dump ();
17695 } // namespace selftest
17697 #endif /* #if CHECKING_P */
17699 #undef TARGET_ADDRESS_COST
17700 #define TARGET_ADDRESS_COST aarch64_address_cost
17702 /* This hook determines whether unnamed bitfields affect the alignment
17703 of the containing structure. The hook returns true if the structure
17704 should inherit the alignment requirements of an unnamed bitfield's
17705 type. */
17706 #undef TARGET_ALIGN_ANON_BITFIELD
17707 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17709 #undef TARGET_ASM_ALIGNED_DI_OP
17710 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17712 #undef TARGET_ASM_ALIGNED_HI_OP
17713 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17715 #undef TARGET_ASM_ALIGNED_SI_OP
17716 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17718 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17719 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17720 hook_bool_const_tree_hwi_hwi_const_tree_true
17722 #undef TARGET_ASM_FILE_START
17723 #define TARGET_ASM_FILE_START aarch64_start_file
17725 #undef TARGET_ASM_OUTPUT_MI_THUNK
17726 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17728 #undef TARGET_ASM_SELECT_RTX_SECTION
17729 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17731 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17732 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17734 #undef TARGET_BUILD_BUILTIN_VA_LIST
17735 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17737 #undef TARGET_CALLEE_COPIES
17738 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17740 #undef TARGET_CAN_ELIMINATE
17741 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17743 #undef TARGET_CAN_INLINE_P
17744 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17746 #undef TARGET_CANNOT_FORCE_CONST_MEM
17747 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17749 #undef TARGET_CASE_VALUES_THRESHOLD
17750 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17752 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17753 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17755 /* Only the least significant bit is used for initialization guard
17756 variables. */
17757 #undef TARGET_CXX_GUARD_MASK_BIT
17758 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17760 #undef TARGET_C_MODE_FOR_SUFFIX
17761 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17763 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17764 #undef TARGET_DEFAULT_TARGET_FLAGS
17765 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17766 #endif
17768 #undef TARGET_CLASS_MAX_NREGS
17769 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17771 #undef TARGET_BUILTIN_DECL
17772 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17774 #undef TARGET_BUILTIN_RECIPROCAL
17775 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17777 #undef TARGET_C_EXCESS_PRECISION
17778 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17780 #undef TARGET_EXPAND_BUILTIN
17781 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17783 #undef TARGET_EXPAND_BUILTIN_VA_START
17784 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17786 #undef TARGET_FOLD_BUILTIN
17787 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17789 #undef TARGET_FUNCTION_ARG
17790 #define TARGET_FUNCTION_ARG aarch64_function_arg
17792 #undef TARGET_FUNCTION_ARG_ADVANCE
17793 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17795 #undef TARGET_FUNCTION_ARG_BOUNDARY
17796 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17798 #undef TARGET_FUNCTION_ARG_PADDING
17799 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17801 #undef TARGET_GET_RAW_RESULT_MODE
17802 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17803 #undef TARGET_GET_RAW_ARG_MODE
17804 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17806 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17807 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17809 #undef TARGET_FUNCTION_VALUE
17810 #define TARGET_FUNCTION_VALUE aarch64_function_value
17812 #undef TARGET_FUNCTION_VALUE_REGNO_P
17813 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17815 #undef TARGET_GIMPLE_FOLD_BUILTIN
17816 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17818 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17819 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17821 #undef TARGET_INIT_BUILTINS
17822 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17824 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17825 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17826 aarch64_ira_change_pseudo_allocno_class
17828 #undef TARGET_LEGITIMATE_ADDRESS_P
17829 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17831 #undef TARGET_LEGITIMATE_CONSTANT_P
17832 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17834 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17835 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17836 aarch64_legitimize_address_displacement
17838 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17839 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17841 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17842 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17843 aarch64_libgcc_floating_mode_supported_p
17845 #undef TARGET_MANGLE_TYPE
17846 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17848 #undef TARGET_MEMORY_MOVE_COST
17849 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17851 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17852 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17854 #undef TARGET_MUST_PASS_IN_STACK
17855 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17857 /* This target hook should return true if accesses to volatile bitfields
17858 should use the narrowest mode possible. It should return false if these
17859 accesses should use the bitfield container type. */
17860 #undef TARGET_NARROW_VOLATILE_BITFIELD
17861 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17863 #undef TARGET_OPTION_OVERRIDE
17864 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17866 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17867 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17868 aarch64_override_options_after_change
17870 #undef TARGET_OPTION_SAVE
17871 #define TARGET_OPTION_SAVE aarch64_option_save
17873 #undef TARGET_OPTION_RESTORE
17874 #define TARGET_OPTION_RESTORE aarch64_option_restore
17876 #undef TARGET_OPTION_PRINT
17877 #define TARGET_OPTION_PRINT aarch64_option_print
17879 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17880 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17882 #undef TARGET_SET_CURRENT_FUNCTION
17883 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17885 #undef TARGET_PASS_BY_REFERENCE
17886 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17888 #undef TARGET_PREFERRED_RELOAD_CLASS
17889 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17891 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17892 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17894 #undef TARGET_PROMOTED_TYPE
17895 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17897 #undef TARGET_SECONDARY_RELOAD
17898 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17900 #undef TARGET_SHIFT_TRUNCATION_MASK
17901 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17903 #undef TARGET_SETUP_INCOMING_VARARGS
17904 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17906 #undef TARGET_STRUCT_VALUE_RTX
17907 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17909 #undef TARGET_REGISTER_MOVE_COST
17910 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17912 #undef TARGET_RETURN_IN_MEMORY
17913 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17915 #undef TARGET_RETURN_IN_MSB
17916 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17918 #undef TARGET_RTX_COSTS
17919 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17921 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17922 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17924 #undef TARGET_SCHED_ISSUE_RATE
17925 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17927 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17928 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17929 aarch64_sched_first_cycle_multipass_dfa_lookahead
17931 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17932 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17933 aarch64_first_cycle_multipass_dfa_lookahead_guard
17935 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17936 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17937 aarch64_get_separate_components
17939 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17940 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17941 aarch64_components_for_bb
17943 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17944 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17945 aarch64_disqualify_components
17947 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17948 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17949 aarch64_emit_prologue_components
17951 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17952 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17953 aarch64_emit_epilogue_components
17955 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17956 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17957 aarch64_set_handled_components
17959 #undef TARGET_TRAMPOLINE_INIT
17960 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17962 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17963 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17965 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17966 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17968 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17969 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17970 aarch64_builtin_support_vector_misalignment
17972 #undef TARGET_ARRAY_MODE
17973 #define TARGET_ARRAY_MODE aarch64_array_mode
17975 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17976 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17978 #undef TARGET_VECTORIZE_ADD_STMT_COST
17979 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17981 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17982 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17983 aarch64_builtin_vectorization_cost
17985 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17986 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17988 #undef TARGET_VECTORIZE_BUILTINS
17989 #define TARGET_VECTORIZE_BUILTINS
17991 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17992 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17993 aarch64_builtin_vectorized_function
17995 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17996 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17997 aarch64_autovectorize_vector_sizes
17999 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18000 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18001 aarch64_atomic_assign_expand_fenv
18003 /* Section anchor support. */
18005 #undef TARGET_MIN_ANCHOR_OFFSET
18006 #define TARGET_MIN_ANCHOR_OFFSET -256
18008 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18009 byte offset; we can do much more for larger data types, but have no way
18010 to determine the size of the access. We assume accesses are aligned. */
18011 #undef TARGET_MAX_ANCHOR_OFFSET
18012 #define TARGET_MAX_ANCHOR_OFFSET 4095
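/* Illustrative note (editorial addition, not from the original sources):
   4095 is the largest unsigned 12-bit byte offset, i.e. the immediate
   range of an LDRB/STRB unsigned-offset access.  Wider accesses scale
   the 12-bit immediate by the access size (e.g. up to 32760 for an
   8-byte LDR), but since the anchor code cannot tell how big each access
   will be, the byte-granular limit is the safe choice.  */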
18014 #undef TARGET_VECTOR_ALIGNMENT
18015 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18017 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18018 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18019 aarch64_vectorize_preferred_vector_alignment
18020 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18021 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18022 aarch64_simd_vector_alignment_reachable
18024 /* vec_perm support. */
18026 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18027 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18028 aarch64_vectorize_vec_perm_const
18030 #undef TARGET_VECTORIZE_GET_MASK_MODE
18031 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18032 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18033 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18034 aarch64_empty_mask_is_expensive
18036 #undef TARGET_INIT_LIBFUNCS
18037 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18039 #undef TARGET_FIXED_CONDITION_CODE_REGS
18040 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18042 #undef TARGET_FLAGS_REGNUM
18043 #define TARGET_FLAGS_REGNUM CC_REGNUM
18045 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18046 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18048 #undef TARGET_ASAN_SHADOW_OFFSET
18049 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18051 #undef TARGET_LEGITIMIZE_ADDRESS
18052 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18054 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18055 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18057 #undef TARGET_CAN_USE_DOLOOP_P
18058 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18060 #undef TARGET_SCHED_ADJUST_PRIORITY
18061 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18063 #undef TARGET_SCHED_MACRO_FUSION_P
18064 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18066 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18067 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18069 #undef TARGET_SCHED_FUSION_PRIORITY
18070 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18072 #undef TARGET_UNSPEC_MAY_TRAP_P
18073 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18075 #undef TARGET_USE_PSEUDO_PIC_REG
18076 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18078 #undef TARGET_PRINT_OPERAND
18079 #define TARGET_PRINT_OPERAND aarch64_print_operand
18081 #undef TARGET_PRINT_OPERAND_ADDRESS
18082 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18084 #undef TARGET_OPTAB_SUPPORTED_P
18085 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18087 #undef TARGET_OMIT_STRUCT_RETURN_REG
18088 #define TARGET_OMIT_STRUCT_RETURN_REG true
18090 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18091 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18092 aarch64_dwarf_poly_indeterminate_value
18094 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
18095 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18096 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18098 #undef TARGET_HARD_REGNO_NREGS
18099 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18100 #undef TARGET_HARD_REGNO_MODE_OK
18101 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18103 #undef TARGET_MODES_TIEABLE_P
18104 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18106 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18107 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18108 aarch64_hard_regno_call_part_clobbered
18110 #undef TARGET_CONSTANT_ALIGNMENT
18111 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18113 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18114 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18116 #undef TARGET_CAN_CHANGE_MODE_CLASS
18117 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18119 #undef TARGET_SELECT_EARLY_REMAT_MODES
18120 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18122 #if CHECKING_P
18123 #undef TARGET_RUN_TARGET_SELFTESTS
18124 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18125 #endif /* #if CHECKING_P */
18127 struct gcc_target targetm = TARGET_INITIALIZER;
18129 #include "gt-aarch64.h"