Extend tree code folds to IFN_COND_*
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob: 6ef0cc7501840cb61104f973f9ab130477065cfe
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
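/* Annotation for this listing (not part of the upstream file): a quick
   worked example of the define above.  On LP64, POINTER_SIZE is 64 and
   BITS_PER_UNIT is 8, so POINTER_BYTES is 8; under -mabi=ilp32,
   POINTER_SIZE is 32 and POINTER_BYTES is 4.  */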
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
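/* Annotation for this listing (not part of the upstream file): illustrative
   instances of each address class in assembly syntax:

     ADDRESS_REG_IMM    ldr  x0, [x1, #16]
     ADDRESS_REG_WB     ldr  x0, [x1, #16]!       (or post-index: [x1], #16)
     ADDRESS_REG_REG    ldr  x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr  w0, [x1, w2, uxtw #2]
     ADDRESS_REG_SXTW   ldr  w0, [x1, w2, sxtw #2]
     ADDRESS_LO_SUM     ldr  x0, [x1, #:lo12:sym]
     ADDRESS_SYMBOLIC   ldr  x0, .Lliteral_pool_entry  */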
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Global flag for whether frame pointer is enabled. */
224 bool aarch64_use_frame_pointer;
226 /* Support for command line parsing of boolean flags in the tuning
227 structures. */
228 struct aarch64_flag_desc
230 const char* name;
231 unsigned int flag;
234 #define AARCH64_FUSION_PAIR(name, internal_name) \
235 { name, AARCH64_FUSE_##internal_name },
236 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 { "none", AARCH64_FUSE_NOTHING },
239 #include "aarch64-fusion-pairs.def"
240 { "all", AARCH64_FUSE_ALL },
241 { NULL, AARCH64_FUSE_NOTHING }
244 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
245 { name, AARCH64_EXTRA_TUNE_##internal_name },
246 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 { "none", AARCH64_EXTRA_TUNE_NONE },
249 #include "aarch64-tuning-flags.def"
250 { "all", AARCH64_EXTRA_TUNE_ALL },
251 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 /* Tuning parameters. */
256 static const struct cpu_addrcost_table generic_addrcost_table =
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
264 0, /* pre_modify */
265 0, /* post_modify */
266 0, /* register_offset */
267 0, /* register_sextend */
268 0, /* register_zextend */
269 0 /* imm_offset */
272 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 0, /* hi */
276 0, /* si */
277 0, /* di */
278 2, /* ti */
280 0, /* pre_modify */
281 0, /* post_modify */
282 1, /* register_offset */
283 1, /* register_sextend */
284 2, /* register_zextend */
285 0, /* imm_offset */
288 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
296 1, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
304 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
312 0, /* pre_modify */
313 0, /* post_modify */
314 2, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 0, /* imm_offset */
320 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
323 1, /* hi */
324 1, /* si */
325 1, /* di */
326 2, /* ti */
328 1, /* pre_modify */
329 1, /* post_modify */
330 3, /* register_offset */
331 4, /* register_sextend */
332 3, /* register_zextend */
333 2, /* imm_offset */
336 static const struct cpu_regmove_cost generic_regmove_cost =
338 1, /* GP2GP */
339 /* Avoid the use of slow int<->fp moves for spilling by setting
340 their cost higher than memmov_cost. */
341 5, /* GP2FP */
342 5, /* FP2GP */
343 2 /* FP2FP */
346 static const struct cpu_regmove_cost cortexa57_regmove_cost =
348 1, /* GP2GP */
349 /* Avoid the use of slow int<->fp moves for spilling by setting
350 their cost higher than memmov_cost. */
351 5, /* GP2FP */
352 5, /* FP2GP */
353 2 /* FP2FP */
356 static const struct cpu_regmove_cost cortexa53_regmove_cost =
358 1, /* GP2GP */
359 /* Avoid the use of slow int<->fp moves for spilling by setting
360 their cost higher than memmov_cost. */
361 5, /* GP2FP */
362 5, /* FP2GP */
363 2 /* FP2FP */
366 static const struct cpu_regmove_cost exynosm1_regmove_cost =
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost (the actual costs are 4 and 9). */
371 9, /* GP2FP */
372 9, /* FP2GP */
373 1 /* FP2FP */
376 static const struct cpu_regmove_cost thunderx_regmove_cost =
378 2, /* GP2GP */
379 2, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost xgene1_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 2 /* FP2FP */
394 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
396 2, /* GP2GP */
397 /* Avoid the use of int<->fp moves for spilling. */
398 6, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
405 1, /* GP2GP */
406 /* Avoid the use of int<->fp moves for spilling. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 4 /* FP2FP */
412 /* Generic costs for vector insn classes. */
413 static const struct cpu_vector_cost generic_vector_cost =
415 1, /* scalar_int_stmt_cost */
416 1, /* scalar_fp_stmt_cost */
417 1, /* scalar_load_cost */
418 1, /* scalar_store_cost */
419 1, /* vec_int_stmt_cost */
420 1, /* vec_fp_stmt_cost */
421 2, /* vec_permute_cost */
422 1, /* vec_to_scalar_cost */
423 1, /* scalar_to_vec_cost */
424 1, /* vec_align_load_cost */
425 1, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 3, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
432 /* ThunderX costs for vector insn classes. */
433 static const struct cpu_vector_cost thunderx_vector_cost =
435 1, /* scalar_int_stmt_cost */
436 1, /* scalar_fp_stmt_cost */
437 3, /* scalar_load_cost */
438 1, /* scalar_store_cost */
439 4, /* vec_int_stmt_cost */
440 1, /* vec_fp_stmt_cost */
441 4, /* vec_permute_cost */
442 2, /* vec_to_scalar_cost */
443 2, /* scalar_to_vec_cost */
444 3, /* vec_align_load_cost */
445 5, /* vec_unalign_load_cost */
446 5, /* vec_unalign_store_cost */
447 1, /* vec_store_cost */
448 3, /* cond_taken_branch_cost */
449 3 /* cond_not_taken_branch_cost */
452 /* Cortex-A57 costs for vector insn classes. */
453 static const struct cpu_vector_cost cortexa57_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 4, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 2, /* vec_int_stmt_cost */
460 2, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 8, /* vec_to_scalar_cost */
463 8, /* scalar_to_vec_cost */
464 4, /* vec_align_load_cost */
465 4, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 static const struct cpu_vector_cost exynosm1_vector_cost =
474 1, /* scalar_int_stmt_cost */
475 1, /* scalar_fp_stmt_cost */
476 5, /* scalar_load_cost */
477 1, /* scalar_store_cost */
478 3, /* vec_int_stmt_cost */
479 3, /* vec_fp_stmt_cost */
480 3, /* vec_permute_cost */
481 3, /* vec_to_scalar_cost */
482 3, /* scalar_to_vec_cost */
483 5, /* vec_align_load_cost */
484 5, /* vec_unalign_load_cost */
485 1, /* vec_unalign_store_cost */
486 1, /* vec_store_cost */
487 1, /* cond_taken_branch_cost */
488 1 /* cond_not_taken_branch_cost */
491 /* X-Gene 1 costs for vector insn classes. */
492 static const struct cpu_vector_cost xgene1_vector_cost =
494 1, /* scalar_int_stmt_cost */
495 1, /* scalar_fp_stmt_cost */
496 5, /* scalar_load_cost */
497 1, /* scalar_store_cost */
498 2, /* vec_int_stmt_cost */
499 2, /* vec_fp_stmt_cost */
500 2, /* vec_permute_cost */
501 4, /* vec_to_scalar_cost */
502 4, /* scalar_to_vec_cost */
503 10, /* vec_align_load_cost */
504 10, /* vec_unalign_load_cost */
505 2, /* vec_unalign_store_cost */
506 2, /* vec_store_cost */
507 2, /* cond_taken_branch_cost */
508 1 /* cond_not_taken_branch_cost */
511 /* Costs for vector insn classes for Vulcan. */
512 static const struct cpu_vector_cost thunderx2t99_vector_cost =
514 1, /* scalar_int_stmt_cost */
515 6, /* scalar_fp_stmt_cost */
516 4, /* scalar_load_cost */
517 1, /* scalar_store_cost */
518 5, /* vec_int_stmt_cost */
519 6, /* vec_fp_stmt_cost */
520 3, /* vec_permute_cost */
521 6, /* vec_to_scalar_cost */
522 5, /* scalar_to_vec_cost */
523 8, /* vec_align_load_cost */
524 8, /* vec_unalign_load_cost */
525 4, /* vec_unalign_store_cost */
526 4, /* vec_store_cost */
527 2, /* cond_taken_branch_cost */
528 1 /* cond_not_taken_branch_cost */
531 /* Generic costs for branch instructions. */
532 static const struct cpu_branch_cost generic_branch_cost =
534 1, /* Predictable. */
535 3 /* Unpredictable. */
538 /* Generic approximation modes. */
539 static const cpu_approx_modes generic_approx_modes =
541 AARCH64_APPROX_NONE, /* division */
542 AARCH64_APPROX_NONE, /* sqrt */
543 AARCH64_APPROX_NONE /* recip_sqrt */
546 /* Approximation modes for Exynos M1. */
547 static const cpu_approx_modes exynosm1_approx_modes =
549 AARCH64_APPROX_NONE, /* division */
550 AARCH64_APPROX_ALL, /* sqrt */
551 AARCH64_APPROX_ALL /* recip_sqrt */
554 /* Approximation modes for X-Gene 1. */
555 static const cpu_approx_modes xgene1_approx_modes =
557 AARCH64_APPROX_NONE, /* division */
558 AARCH64_APPROX_NONE, /* sqrt */
559 AARCH64_APPROX_ALL /* recip_sqrt */
562 /* Generic prefetch settings (which disable prefetch). */
563 static const cpu_prefetch_tune generic_prefetch_tune =
565 0, /* num_slots */
566 -1, /* l1_cache_size */
567 -1, /* l1_cache_line_size */
568 -1, /* l2_cache_size */
569 true, /* prefetch_dynamic_strides */
570 -1, /* minimum_stride */
571 -1 /* default_opt_level */
574 static const cpu_prefetch_tune exynosm1_prefetch_tune =
576 0, /* num_slots */
577 -1, /* l1_cache_size */
578 64, /* l1_cache_line_size */
579 -1, /* l2_cache_size */
580 true, /* prefetch_dynamic_strides */
581 -1, /* minimum_stride */
582 -1 /* default_opt_level */
585 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
587 4, /* num_slots */
588 32, /* l1_cache_size */
589 64, /* l1_cache_line_size */
590 512, /* l2_cache_size */
591 false, /* prefetch_dynamic_strides */
592 2048, /* minimum_stride */
593 3 /* default_opt_level */
596 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
598 8, /* num_slots */
599 32, /* l1_cache_size */
600 128, /* l1_cache_line_size */
601 16*1024, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 3 /* default_opt_level */
607 static const cpu_prefetch_tune thunderx_prefetch_tune =
609 8, /* num_slots */
610 32, /* l1_cache_size */
611 128, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
618 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
620 8, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 256, /* l2_cache_size */
624 true, /* prefetch_dynamic_strides */
625 -1, /* minimum_stride */
626 -1 /* default_opt_level */
629 static const struct tune_params generic_tunings =
631 &cortexa57_extra_costs,
632 &generic_addrcost_table,
633 &generic_regmove_cost,
634 &generic_vector_cost,
635 &generic_branch_cost,
636 &generic_approx_modes,
637 4, /* memmov_cost */
638 2, /* issue_rate */
639 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
640 "8", /* function_align. */
641 "4", /* jump_align. */
642 "8", /* loop_align. */
643 2, /* int_reassoc_width. */
644 4, /* fp_reassoc_width. */
645 1, /* vec_reassoc_width. */
646 2, /* min_div_recip_mul_sf. */
647 2, /* min_div_recip_mul_df. */
648 0, /* max_case_values. */
649 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
650 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
651 &generic_prefetch_tune
654 static const struct tune_params cortexa35_tunings =
656 &cortexa53_extra_costs,
657 &generic_addrcost_table,
658 &cortexa53_regmove_cost,
659 &generic_vector_cost,
660 &generic_branch_cost,
661 &generic_approx_modes,
662 4, /* memmov_cost */
663 1, /* issue_rate */
664 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
665 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
666 "16", /* function_align. */
667 "4", /* jump_align. */
668 "8", /* loop_align. */
669 2, /* int_reassoc_width. */
670 4, /* fp_reassoc_width. */
671 1, /* vec_reassoc_width. */
672 2, /* min_div_recip_mul_sf. */
673 2, /* min_div_recip_mul_df. */
674 0, /* max_case_values. */
675 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
676 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
677 &generic_prefetch_tune
680 static const struct tune_params cortexa53_tunings =
682 &cortexa53_extra_costs,
683 &generic_addrcost_table,
684 &cortexa53_regmove_cost,
685 &generic_vector_cost,
686 &generic_branch_cost,
687 &generic_approx_modes,
688 4, /* memmov_cost */
689 2, /* issue_rate */
690 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
691 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
692 "16", /* function_align. */
693 "4", /* jump_align. */
694 "8", /* loop_align. */
695 2, /* int_reassoc_width. */
696 4, /* fp_reassoc_width. */
697 1, /* vec_reassoc_width. */
698 2, /* min_div_recip_mul_sf. */
699 2, /* min_div_recip_mul_df. */
700 0, /* max_case_values. */
701 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
702 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
703 &generic_prefetch_tune
706 static const struct tune_params cortexa57_tunings =
708 &cortexa57_extra_costs,
709 &generic_addrcost_table,
710 &cortexa57_regmove_cost,
711 &cortexa57_vector_cost,
712 &generic_branch_cost,
713 &generic_approx_modes,
714 4, /* memmov_cost */
715 3, /* issue_rate */
716 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
717 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
718 "16", /* function_align. */
719 "4", /* jump_align. */
720 "8", /* loop_align. */
721 2, /* int_reassoc_width. */
722 4, /* fp_reassoc_width. */
723 1, /* vec_reassoc_width. */
724 2, /* min_div_recip_mul_sf. */
725 2, /* min_div_recip_mul_df. */
726 0, /* max_case_values. */
727 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
728 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
729 &generic_prefetch_tune
732 static const struct tune_params cortexa72_tunings =
734 &cortexa57_extra_costs,
735 &generic_addrcost_table,
736 &cortexa57_regmove_cost,
737 &cortexa57_vector_cost,
738 &generic_branch_cost,
739 &generic_approx_modes,
740 4, /* memmov_cost */
741 3, /* issue_rate */
742 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
743 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
744 "16", /* function_align. */
745 "4", /* jump_align. */
746 "8", /* loop_align. */
747 2, /* int_reassoc_width. */
748 4, /* fp_reassoc_width. */
749 1, /* vec_reassoc_width. */
750 2, /* min_div_recip_mul_sf. */
751 2, /* min_div_recip_mul_df. */
752 0, /* max_case_values. */
753 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
754 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
755 &generic_prefetch_tune
758 static const struct tune_params cortexa73_tunings =
760 &cortexa57_extra_costs,
761 &generic_addrcost_table,
762 &cortexa57_regmove_cost,
763 &cortexa57_vector_cost,
764 &generic_branch_cost,
765 &generic_approx_modes,
766 4, /* memmov_cost. */
767 2, /* issue_rate. */
768 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
769 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
770 "16", /* function_align. */
771 "4", /* jump_align. */
772 "8", /* loop_align. */
773 2, /* int_reassoc_width. */
774 4, /* fp_reassoc_width. */
775 1, /* vec_reassoc_width. */
776 2, /* min_div_recip_mul_sf. */
777 2, /* min_div_recip_mul_df. */
778 0, /* max_case_values. */
779 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
780 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
781 &generic_prefetch_tune
786 static const struct tune_params exynosm1_tunings =
788 &exynosm1_extra_costs,
789 &exynosm1_addrcost_table,
790 &exynosm1_regmove_cost,
791 &exynosm1_vector_cost,
792 &generic_branch_cost,
793 &exynosm1_approx_modes,
794 4, /* memmov_cost */
795 3, /* issue_rate */
796 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
797 "4", /* function_align. */
798 "4", /* jump_align. */
799 "4", /* loop_align. */
800 2, /* int_reassoc_width. */
801 4, /* fp_reassoc_width. */
802 1, /* vec_reassoc_width. */
803 2, /* min_div_recip_mul_sf. */
804 2, /* min_div_recip_mul_df. */
805 48, /* max_case_values. */
806 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
808 &exynosm1_prefetch_tune
811 static const struct tune_params thunderxt88_tunings =
813 &thunderx_extra_costs,
814 &generic_addrcost_table,
815 &thunderx_regmove_cost,
816 &thunderx_vector_cost,
817 &generic_branch_cost,
818 &generic_approx_modes,
819 6, /* memmov_cost */
820 2, /* issue_rate */
821 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
822 "8", /* function_align. */
823 "8", /* jump_align. */
824 "8", /* loop_align. */
825 2, /* int_reassoc_width. */
826 4, /* fp_reassoc_width. */
827 1, /* vec_reassoc_width. */
828 2, /* min_div_recip_mul_sf. */
829 2, /* min_div_recip_mul_df. */
830 0, /* max_case_values. */
831 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
832 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
833 &thunderxt88_prefetch_tune
836 static const struct tune_params thunderx_tunings =
838 &thunderx_extra_costs,
839 &generic_addrcost_table,
840 &thunderx_regmove_cost,
841 &thunderx_vector_cost,
842 &generic_branch_cost,
843 &generic_approx_modes,
844 6, /* memmov_cost */
845 2, /* issue_rate */
846 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
847 "8", /* function_align. */
848 "8", /* jump_align. */
849 "8", /* loop_align. */
850 2, /* int_reassoc_width. */
851 4, /* fp_reassoc_width. */
852 1, /* vec_reassoc_width. */
853 2, /* min_div_recip_mul_sf. */
854 2, /* min_div_recip_mul_df. */
855 0, /* max_case_values. */
856 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
857 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
858 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
859 &thunderx_prefetch_tune
862 static const struct tune_params xgene1_tunings =
864 &xgene1_extra_costs,
865 &xgene1_addrcost_table,
866 &xgene1_regmove_cost,
867 &xgene1_vector_cost,
868 &generic_branch_cost,
869 &xgene1_approx_modes,
870 6, /* memmov_cost */
871 4, /* issue_rate */
872 AARCH64_FUSE_NOTHING, /* fusible_ops */
873 "16", /* function_align. */
874 "8", /* jump_align. */
875 "16", /* loop_align. */
876 2, /* int_reassoc_width. */
877 4, /* fp_reassoc_width. */
878 1, /* vec_reassoc_width. */
879 2, /* min_div_recip_mul_sf. */
880 2, /* min_div_recip_mul_df. */
881 0, /* max_case_values. */
882 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
883 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
884 &generic_prefetch_tune
887 static const struct tune_params qdf24xx_tunings =
889 &qdf24xx_extra_costs,
890 &qdf24xx_addrcost_table,
891 &qdf24xx_regmove_cost,
892 &generic_vector_cost,
893 &generic_branch_cost,
894 &generic_approx_modes,
895 4, /* memmov_cost */
896 4, /* issue_rate */
897 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
898 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
899 "16", /* function_align. */
900 "8", /* jump_align. */
901 "16", /* loop_align. */
902 2, /* int_reassoc_width. */
903 4, /* fp_reassoc_width. */
904 1, /* vec_reassoc_width. */
905 2, /* min_div_recip_mul_sf. */
906 2, /* min_div_recip_mul_df. */
907 0, /* max_case_values. */
908 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
909 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
910 &qdf24xx_prefetch_tune
913 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
914 for now. */
915 static const struct tune_params saphira_tunings =
917 &generic_extra_costs,
918 &generic_addrcost_table,
919 &generic_regmove_cost,
920 &generic_vector_cost,
921 &generic_branch_cost,
922 &generic_approx_modes,
923 4, /* memmov_cost */
924 4, /* issue_rate */
925 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
926 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
927 "16", /* function_align. */
928 "8", /* jump_align. */
929 "16", /* loop_align. */
930 2, /* int_reassoc_width. */
931 4, /* fp_reassoc_width. */
932 1, /* vec_reassoc_width. */
933 2, /* min_div_recip_mul_sf. */
934 2, /* min_div_recip_mul_df. */
935 0, /* max_case_values. */
936 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
937 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
938 &generic_prefetch_tune
941 static const struct tune_params thunderx2t99_tunings =
943 &thunderx2t99_extra_costs,
944 &thunderx2t99_addrcost_table,
945 &thunderx2t99_regmove_cost,
946 &thunderx2t99_vector_cost,
947 &generic_branch_cost,
948 &generic_approx_modes,
949 4, /* memmov_cost. */
950 4, /* issue_rate. */
951 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
952 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
953 "16", /* function_align. */
954 "8", /* jump_align. */
955 "16", /* loop_align. */
956 3, /* int_reassoc_width. */
957 2, /* fp_reassoc_width. */
958 2, /* vec_reassoc_width. */
959 2, /* min_div_recip_mul_sf. */
960 2, /* min_div_recip_mul_df. */
961 0, /* max_case_values. */
962 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
963 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
964 &thunderx2t99_prefetch_tune
967 /* Support for fine-grained override of the tuning structures. */
968 struct aarch64_tuning_override_function
970 const char* name;
971 void (*parse_override)(const char*, struct tune_params*);
974 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
975 static void aarch64_parse_tune_string (const char*, struct tune_params*);
977 static const struct aarch64_tuning_override_function
978 aarch64_tuning_override_functions[] =
980 { "fuse", aarch64_parse_fuse_string },
981 { "tune", aarch64_parse_tune_string },
982 { NULL, NULL }
985 /* A processor implementing AArch64. */
986 struct processor
988 const char *const name;
989 enum aarch64_processor ident;
990 enum aarch64_processor sched_core;
991 enum aarch64_arch arch;
992 unsigned architecture_version;
993 const unsigned long flags;
994 const struct tune_params *const tune;
997 /* Architectures implementing AArch64. */
998 static const struct processor all_architectures[] =
1000 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1001 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1002 #include "aarch64-arches.def"
1003 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1006 /* Processor cores implementing AArch64. */
1007 static const struct processor all_cores[] =
1009 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1010 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1011 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1012 FLAGS, &COSTS##_tunings},
1013 #include "aarch64-cores.def"
1014 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1015 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1016 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1020 /* Target specification. These are populated by the -march, -mtune, -mcpu
1021 handling code or by target attributes. */
1022 static const struct processor *selected_arch;
1023 static const struct processor *selected_cpu;
1024 static const struct processor *selected_tune;
1026 /* The current tuning set. */
1027 struct tune_params aarch64_tune_params = generic_tunings;
1029 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1031 /* An ISA extension in the co-processor and main instruction set space. */
1032 struct aarch64_option_extension
1034 const char *const name;
1035 const unsigned long flags_on;
1036 const unsigned long flags_off;
1039 typedef enum aarch64_cond_code
1041 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1042 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1043 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1045 aarch64_cc;
1047 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1049 /* The condition codes of the processor, and the inverse function. */
1050 static const char * const aarch64_condition_codes[] =
1052 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1053 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1056 /* Generate code to enable conditional branches in functions larger than 1 MiB. */
1057 const char *
1058 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1059 const char * branch_format)
1061 rtx_code_label * tmp_label = gen_label_rtx ();
1062 char label_buf[256];
1063 char buffer[128];
1064 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1065 CODE_LABEL_NUMBER (tmp_label));
1066 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1067 rtx dest_label = operands[pos_label];
1068 operands[pos_label] = tmp_label;
1070 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1071 output_asm_insn (buffer, operands);
1073 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1074 operands[pos_label] = dest_label;
1075 output_asm_insn (buffer, operands);
1076 return "";
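/* Annotation for this listing (not part of the upstream file): an
   illustrative expansion.  The caller passes the *inverted* short-range
   branch in BRANCH_FORMAT, e.g. "cbz\tx0, ", and the function emits

	cbz	x0, .Ltmp		// inverted, +/-1 MiB range
	b	<original target>	// unconditional, +/-128 MiB range
   .Ltmp:

   which behaves like "cbnz x0, <original target>" with far greater reach.  */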
1079 void
1080 aarch64_err_no_fpadvsimd (machine_mode mode)
1082 if (TARGET_GENERAL_REGS_ONLY)
1083 if (FLOAT_MODE_P (mode))
1084 error ("%qs is incompatible with the use of floating-point types",
1085 "-mgeneral-regs-only");
1086 else
1087 error ("%qs is incompatible with the use of vector types",
1088 "-mgeneral-regs-only");
1089 else
1090 if (FLOAT_MODE_P (mode))
1091 error ("%qs feature modifier is incompatible with the use of"
1092 " floating-point types", "+nofp");
1093 else
1094 error ("%qs feature modifier is incompatible with the use of"
1095 " vector types", "+nofp");
1098 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1099 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1100 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1101 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1102 and GENERAL_REGS is lower than the memory cost (in this case the best class
1103 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1104 cost results in bad allocations with many redundant int<->FP moves which
1105 are expensive on various cores.
1106 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1107 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1108 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1109 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1110 The result of this is that it is no longer inefficient to have a higher
1111 memory move cost than the register move cost.
1114 static reg_class_t
1115 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1116 reg_class_t best_class)
1118 machine_mode mode;
1120 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1121 || !reg_class_subset_p (FP_REGS, allocno_class))
1122 return allocno_class;
1124 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1125 || !reg_class_subset_p (FP_REGS, best_class))
1126 return best_class;
1128 mode = PSEUDO_REGNO_MODE (regno);
1129 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1132 static unsigned int
1133 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1135 if (GET_MODE_UNIT_SIZE (mode) == 4)
1136 return aarch64_tune_params.min_div_recip_mul_sf;
1137 return aarch64_tune_params.min_div_recip_mul_df;
1140 /* Return the reassociation width of treeop OPC with mode MODE. */
1141 static int
1142 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1144 if (VECTOR_MODE_P (mode))
1145 return aarch64_tune_params.vec_reassoc_width;
1146 if (INTEGRAL_MODE_P (mode))
1147 return aarch64_tune_params.int_reassoc_width;
1148 /* Avoid reassociating floating point addition so we emit more FMAs. */
1149 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1150 return aarch64_tune_params.fp_reassoc_width;
1151 return 1;
1154 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1155 unsigned
1156 aarch64_dbx_register_number (unsigned regno)
1158 if (GP_REGNUM_P (regno))
1159 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1160 else if (regno == SP_REGNUM)
1161 return AARCH64_DWARF_SP;
1162 else if (FP_REGNUM_P (regno))
1163 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1164 else if (PR_REGNUM_P (regno))
1165 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1166 else if (regno == VG_REGNUM)
1167 return AARCH64_DWARF_VG;
1169 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1170 equivalent DWARF register. */
1171 return DWARF_FRAME_REGISTERS;
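/* Annotation for this listing (not part of the upstream file): with the
   AArch64 DWARF numbering this maps x0-x30 to 0-30, sp to 31, vg to 46,
   p0-p15 to 48-63 and v0-v31 to 64-95; e.g. v3 (V0_REGNUM + 3) becomes
   DWARF register 67.  */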
1174 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1175 static bool
1176 aarch64_advsimd_struct_mode_p (machine_mode mode)
1178 return (TARGET_SIMD
1179 && (mode == OImode || mode == CImode || mode == XImode));
1182 /* Return true if MODE is an SVE predicate mode. */
1183 static bool
1184 aarch64_sve_pred_mode_p (machine_mode mode)
1186 return (TARGET_SVE
1187 && (mode == VNx16BImode
1188 || mode == VNx8BImode
1189 || mode == VNx4BImode
1190 || mode == VNx2BImode));
1193 /* Three mutually-exclusive flags describing a vector or predicate type. */
1194 const unsigned int VEC_ADVSIMD = 1;
1195 const unsigned int VEC_SVE_DATA = 2;
1196 const unsigned int VEC_SVE_PRED = 4;
1197 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1198 a structure of 2, 3 or 4 vectors. */
1199 const unsigned int VEC_STRUCT = 8;
1200 /* Useful combinations of the above. */
1201 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1202 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1204 /* Return a set of flags describing the vector properties of mode MODE.
1205 Ignore modes that are not supported by the current target. */
1206 static unsigned int
1207 aarch64_classify_vector_mode (machine_mode mode)
1209 if (aarch64_advsimd_struct_mode_p (mode))
1210 return VEC_ADVSIMD | VEC_STRUCT;
1212 if (aarch64_sve_pred_mode_p (mode))
1213 return VEC_SVE_PRED;
1215 scalar_mode inner = GET_MODE_INNER (mode);
1216 if (VECTOR_MODE_P (mode)
1217 && (inner == QImode
1218 || inner == HImode
1219 || inner == HFmode
1220 || inner == SImode
1221 || inner == SFmode
1222 || inner == DImode
1223 || inner == DFmode))
1225 if (TARGET_SVE)
1227 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1228 return VEC_SVE_DATA;
1229 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1230 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1231 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1232 return VEC_SVE_DATA | VEC_STRUCT;
1235 /* This includes V1DF but not V1DI (which doesn't exist). */
1236 if (TARGET_SIMD
1237 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1238 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1239 return VEC_ADVSIMD;
1242 return 0;
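/* Annotation for this listing (not part of the upstream file): example
   classifications, assuming the corresponding +simd/+sve features:

     V16QImode   -> VEC_ADVSIMD                 (128-bit Advanced SIMD)
     OImode      -> VEC_ADVSIMD | VEC_STRUCT    (pair of 128-bit vectors)
     VNx4SImode  -> VEC_SVE_DATA                (one SVE vector of SImode)
     VNx4BImode  -> VEC_SVE_PRED                (SVE predicate)
     SImode      -> 0                           (not a vector mode)  */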
1245 /* Return true if MODE is any of the data vector modes, including
1246 structure modes. */
1247 static bool
1248 aarch64_vector_data_mode_p (machine_mode mode)
1250 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1253 /* Return true if MODE is an SVE data vector mode; either a single vector
1254 or a structure of vectors. */
1255 static bool
1256 aarch64_sve_data_mode_p (machine_mode mode)
1258 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1261 /* Implement target hook TARGET_ARRAY_MODE. */
1262 static opt_machine_mode
1263 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1265 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1266 && IN_RANGE (nelems, 2, 4))
1267 return mode_for_vector (GET_MODE_INNER (mode),
1268 GET_MODE_NUNITS (mode) * nelems);
1270 return opt_machine_mode ();
1273 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1274 static bool
1275 aarch64_array_mode_supported_p (machine_mode mode,
1276 unsigned HOST_WIDE_INT nelems)
1278 if (TARGET_SIMD
1279 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1280 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1281 && (nelems >= 2 && nelems <= 4))
1282 return true;
1284 return false;
1287 /* Return the SVE predicate mode to use for elements that have
1288 ELEM_NBYTES bytes, if such a mode exists. */
1290 opt_machine_mode
1291 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1293 if (TARGET_SVE)
1295 if (elem_nbytes == 1)
1296 return VNx16BImode;
1297 if (elem_nbytes == 2)
1298 return VNx8BImode;
1299 if (elem_nbytes == 4)
1300 return VNx4BImode;
1301 if (elem_nbytes == 8)
1302 return VNx2BImode;
1304 return opt_machine_mode ();
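/* Annotation for this listing (not part of the upstream file): an SVE
   predicate register holds one bit per byte of vector data, so for 2-, 4-
   and 8-byte elements only every 2nd, 4th or 8th bit is significant.
   VNx8BI, VNx4BI and VNx2BI model those views; e.g.
   aarch64_sve_pred_mode (4) gives the mode used to mask a vector of
   32-bit elements.  */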
1307 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1309 static opt_machine_mode
1310 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1312 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1314 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1315 machine_mode pred_mode;
1316 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1317 return pred_mode;
1320 return default_get_mask_mode (nunits, nbytes);
1323 /* Implement TARGET_PREFERRED_ELSE_VALUE. Prefer to use the first
1324 arithmetic operand as the else value if the else value doesn't matter,
1325 since that exactly matches the SVE destructive merging form. */
1327 static tree
1328 aarch64_preferred_else_value (unsigned, tree, unsigned int, tree *ops)
1330 return ops[0];
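/* Annotation for this listing (not part of the upstream file): for a
   conditional operation such as IFN_COND_ADD (MASK, A, B, ELSE) whose
   else value is otherwise unconstrained, returning ops[0] asks for
   ELSE == A, which maps directly onto the SVE merging form

	add	z0.s, p0/m, z0.s, z1.s	// inactive lanes keep z0 (== A)

   and so avoids a separate select instruction.  */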
1333 /* Implement TARGET_HARD_REGNO_NREGS. */
1335 static unsigned int
1336 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1338 /* ??? Logically we should only need to provide a value when
1339 HARD_REGNO_MODE_OK says that the combination is valid,
1340 but at the moment we need to handle all modes. Just ignore
1341 any runtime parts for registers that can't store them. */
1342 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1343 switch (aarch64_regno_regclass (regno))
1345 case FP_REGS:
1346 case FP_LO_REGS:
1347 if (aarch64_sve_data_mode_p (mode))
1348 return exact_div (GET_MODE_SIZE (mode),
1349 BYTES_PER_SVE_VECTOR).to_constant ();
1350 return CEIL (lowest_size, UNITS_PER_VREG);
1351 case PR_REGS:
1352 case PR_LO_REGS:
1353 case PR_HI_REGS:
1354 return 1;
1355 default:
1356 return CEIL (lowest_size, UNITS_PER_WORD);
1358 gcc_unreachable ();
1361 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1363 static bool
1364 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1366 if (GET_MODE_CLASS (mode) == MODE_CC)
1367 return regno == CC_REGNUM;
1369 if (regno == VG_REGNUM)
1370 /* This must have the same size as _Unwind_Word. */
1371 return mode == DImode;
1373 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1374 if (vec_flags & VEC_SVE_PRED)
1375 return PR_REGNUM_P (regno);
1377 if (PR_REGNUM_P (regno))
1378 return 0;
1380 if (regno == SP_REGNUM)
1381 /* The purpose of comparing with ptr_mode is to support the
1382 global register variable associated with the stack pointer
1383 register via the syntax of asm ("wsp") in ILP32. */
1384 return mode == Pmode || mode == ptr_mode;
1386 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1387 return mode == Pmode;
1389 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1390 return true;
1392 if (FP_REGNUM_P (regno))
1394 if (vec_flags & VEC_STRUCT)
1395 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1396 else
1397 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1400 return false;
1403 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1404 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1405 clobbers the top 64 bits when restoring the bottom 64 bits. */
1407 static bool
1408 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1410 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
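/* Annotation for this listing (not part of the upstream file): under the
   AAPCS64 only the low 64 bits of v8-v15 are callee-saved, so a 16-byte
   value such as TFmode or V4SImode held in one of those registers counts
   as partially clobbered by a call, while an 8-byte DFmode value does
   not (16 > 8, but not 8 > 8, in the test above).  */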
1413 /* Implement REGMODE_NATURAL_SIZE. */
1414 poly_uint64
1415 aarch64_regmode_natural_size (machine_mode mode)
1417 /* The natural size for SVE data modes is one SVE data vector,
1418 and similarly for predicates. We can't independently modify
1419 anything smaller than that. */
1420 /* ??? For now, only do this for variable-width SVE registers.
1421 Doing it for constant-sized registers breaks lower-subreg.c. */
1422 /* ??? And once that's fixed, we should probably have similar
1423 code for Advanced SIMD. */
1424 if (!aarch64_sve_vg.is_constant ())
1426 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1427 if (vec_flags & VEC_SVE_PRED)
1428 return BYTES_PER_SVE_PRED;
1429 if (vec_flags & VEC_SVE_DATA)
1430 return BYTES_PER_SVE_VECTOR;
1432 return UNITS_PER_WORD;
1435 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1436 machine_mode
1437 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1438 machine_mode mode)
1440 /* The predicate mode determines which bits are significant and
1441 which are "don't care". Decreasing the number of lanes would
1442 lose data while increasing the number of lanes would make bits
1443 unnecessarily significant. */
1444 if (PR_REGNUM_P (regno))
1445 return mode;
1446 if (known_ge (GET_MODE_SIZE (mode), 4))
1447 return mode;
1448 else
1449 return SImode;
1452 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1453 that strcpy from constants will be faster. */
1455 static HOST_WIDE_INT
1456 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1458 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1459 return MAX (align, BITS_PER_WORD);
1460 return align;
1463 /* Return true if calls to DECL should be treated as
1464 long-calls (i.e. called via a register). */
1465 static bool
1466 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1468 return false;
1471 /* Return true if calls to symbol-ref SYM should be treated as
1472 long-calls (i.e. called via a register). */
1473 bool
1474 aarch64_is_long_call_p (rtx sym)
1476 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1479 /* Return true if calls to symbol-ref SYM should not go through
1480 plt stubs. */
1482 bool
1483 aarch64_is_noplt_call_p (rtx sym)
1485 const_tree decl = SYMBOL_REF_DECL (sym);
1487 if (flag_pic
1488 && decl
1489 && (!flag_plt
1490 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1491 && !targetm.binds_local_p (decl))
1492 return true;
1494 return false;
1497 /* Return true if the offsets to a zero/sign-extract operation
1498 represent an expression that matches an extend operation. The
1499 operands represent the parameters from
1501 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1502 bool
1503 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1504 rtx extract_imm)
1506 HOST_WIDE_INT mult_val, extract_val;
1508 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1509 return false;
1511 mult_val = INTVAL (mult_imm);
1512 extract_val = INTVAL (extract_imm);
1514 if (extract_val > 8
1515 && extract_val < GET_MODE_BITSIZE (mode)
1516 && exact_log2 (extract_val & ~7) > 0
1517 && (extract_val & 7) <= 4
1518 && mult_val == (1 << (extract_val & 7)))
1519 return true;
1521 return false;
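/* Annotation for this listing (not part of the upstream file): a worked
   example.  With MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34:
   34 > 8, 34 < 64, 34 & ~7 == 32 (a power of two), 34 & 7 == 2 <= 4 and
   4 == 1 << 2, so the function returns true.  The corresponding
   (extract:DI (mult (reg) 4) 34 0) is an extend of the low 32 bits
   shifted left by 2, i.e. an "add x0, x1, w2, sxtw #2" style operand.  */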
1524 /* Emit an insn that's a simple single-set. Both the operands must be
1525 known to be valid. */
1526 inline static rtx_insn *
1527 emit_set_insn (rtx x, rtx y)
1529 return emit_insn (gen_rtx_SET (x, y));
1532 /* X and Y are two things to compare using CODE. Emit the compare insn and
1533 return the rtx for register 0 in the proper mode. */
1534 rtx
1535 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1537 machine_mode mode = SELECT_CC_MODE (code, x, y);
1538 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1540 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1541 return cc_reg;
1544 /* Build the SYMBOL_REF for __tls_get_addr. */
1546 static GTY(()) rtx tls_get_addr_libfunc;
1548 rtx
1549 aarch64_tls_get_addr (void)
1551 if (!tls_get_addr_libfunc)
1552 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1553 return tls_get_addr_libfunc;
1556 /* Return the TLS model to use for ADDR. */
1558 static enum tls_model
1559 tls_symbolic_operand_type (rtx addr)
1561 enum tls_model tls_kind = TLS_MODEL_NONE;
1562 if (GET_CODE (addr) == CONST)
1564 poly_int64 addend;
1565 rtx sym = strip_offset (addr, &addend);
1566 if (GET_CODE (sym) == SYMBOL_REF)
1567 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1569 else if (GET_CODE (addr) == SYMBOL_REF)
1570 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1572 return tls_kind;
1575 /* We'll allow lo_sum's among our legitimate addresses so that
1576 combine can take care of combining addresses where necessary,
1577 but for generation purposes we generate the address as:
1579 RTL Absolute
1580 tmp = hi (symbol_ref); adrp x1, foo
1581 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1584 PIC TLS
1585 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1586 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1587 bl __tls_get_addr
1590 Load TLS symbol, depending on TLS mechanism and TLS access model.
1592 Global Dynamic - Traditional TLS:
1593 adrp tmp, :tlsgd:imm
1594 add dest, tmp, #:tlsgd_lo12:imm
1595 bl __tls_get_addr
1597 Global Dynamic - TLS Descriptors:
1598 adrp dest, :tlsdesc:imm
1599 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1600 add dest, dest, #:tlsdesc_lo12:imm
1601 blr tmp
1602 mrs tp, tpidr_el0
1603 add dest, dest, tp
1605 Initial Exec:
1606 mrs tp, tpidr_el0
1607 adrp tmp, :gottprel:imm
1608 ldr dest, [tmp, #:gottprel_lo12:imm]
1609 add dest, dest, tp
1611 Local Exec:
1612 mrs tp, tpidr_el0
1613 add t0, tp, #:tprel_hi12:imm, lsl #12
1614 add t0, t0, #:tprel_lo12_nc:imm
1617 static void
1618 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1619 enum aarch64_symbol_type type)
1621 switch (type)
1623 case SYMBOL_SMALL_ABSOLUTE:
1625 /* In ILP32, the mode of dest can be either SImode or DImode. */
1626 rtx tmp_reg = dest;
1627 machine_mode mode = GET_MODE (dest);
1629 gcc_assert (mode == Pmode || mode == ptr_mode);
1631 if (can_create_pseudo_p ())
1632 tmp_reg = gen_reg_rtx (mode);
1634 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1635 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1636 return;
1639 case SYMBOL_TINY_ABSOLUTE:
1640 emit_insn (gen_rtx_SET (dest, imm));
1641 return;
1643 case SYMBOL_SMALL_GOT_28K:
1645 machine_mode mode = GET_MODE (dest);
1646 rtx gp_rtx = pic_offset_table_rtx;
1647 rtx insn;
1648 rtx mem;
1650 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1651 here before rtl expansion. Tree IVOPTS will generate rtl patterns to
1652 decide rtx costs, in which case pic_offset_table_rtx is not
1653 initialized. In that case there is no need to generate the first
1654 adrp instruction, as the final cost for a global variable access is
1655 one instruction. */
1656 if (gp_rtx != NULL)
1658 /* -fpic with -mcmodel=small allows a 32K GOT table size (but since
1659 we use the page base as the GOT base, the first page may be wasted;
1660 in the worst case there is only 28K of space for the GOT).
1662 The generated instruction sequence for accessing a global variable is:
1665 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1667 Only one instruction is needed. But we must initialize
1668 pic_offset_table_rtx properly. We generate the initializing insn for
1669 every global access, and let CSE remove all the redundant ones.
1671 The final instruction sequence will look like the following
1672 when multiple global variables are accessed:
1674 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1676 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1677 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1678 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1679 ... */
1681 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1682 crtl->uses_pic_offset_table = 1;
1683 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1685 if (mode != GET_MODE (gp_rtx))
1686 gp_rtx = gen_lowpart (mode, gp_rtx);
1690 if (mode == ptr_mode)
1692 if (mode == DImode)
1693 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1694 else
1695 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1697 mem = XVECEXP (SET_SRC (insn), 0, 0);
1699 else
1701 gcc_assert (mode == Pmode);
1703 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1704 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1707 /* The operand is expected to be a MEM. Whenever the related insn
1708 pattern changes, the code above that computes MEM should be
1709 updated. */
1710 gcc_assert (GET_CODE (mem) == MEM);
1711 MEM_READONLY_P (mem) = 1;
1712 MEM_NOTRAP_P (mem) = 1;
1713 emit_insn (insn);
1714 return;
1717 case SYMBOL_SMALL_GOT_4G:
1719 /* In ILP32, the mode of dest can be either SImode or DImode,
1720 while the got entry is always of SImode size. The mode of
1721 dest depends on how dest is used: if dest is assigned to a
1722 pointer (e.g. in the memory), it has SImode; it may have
1723 DImode if dest is dereferenced to access the memory.
1724 This is why we have to handle three different ldr_got_small
1725 patterns here (two patterns for ILP32). */
1727 rtx insn;
1728 rtx mem;
1729 rtx tmp_reg = dest;
1730 machine_mode mode = GET_MODE (dest);
1732 if (can_create_pseudo_p ())
1733 tmp_reg = gen_reg_rtx (mode);
1735 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1736 if (mode == ptr_mode)
1738 if (mode == DImode)
1739 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1740 else
1741 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1743 mem = XVECEXP (SET_SRC (insn), 0, 0);
1745 else
1747 gcc_assert (mode == Pmode);
1749 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1750 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1753 gcc_assert (GET_CODE (mem) == MEM);
1754 MEM_READONLY_P (mem) = 1;
1755 MEM_NOTRAP_P (mem) = 1;
1756 emit_insn (insn);
1757 return;
1760 case SYMBOL_SMALL_TLSGD:
1762 rtx_insn *insns;
1763 machine_mode mode = GET_MODE (dest);
1764 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1766 start_sequence ();
1767 if (TARGET_ILP32)
1768 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1769 else
1770 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1771 insns = get_insns ();
1772 end_sequence ();
1774 RTL_CONST_CALL_P (insns) = 1;
1775 emit_libcall_block (insns, dest, result, imm);
1776 return;
1779 case SYMBOL_SMALL_TLSDESC:
1781 machine_mode mode = GET_MODE (dest);
1782 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1783 rtx tp;
1785 gcc_assert (mode == Pmode || mode == ptr_mode);
1787 /* In ILP32, the got entry is always of SImode size. Unlike
1788 small GOT, the dest is fixed at reg 0. */
1789 if (TARGET_ILP32)
1790 emit_insn (gen_tlsdesc_small_si (imm));
1791 else
1792 emit_insn (gen_tlsdesc_small_di (imm));
1793 tp = aarch64_load_tp (NULL);
1795 if (mode != Pmode)
1796 tp = gen_lowpart (mode, tp);
1798 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1799 if (REG_P (dest))
1800 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1801 return;
1804 case SYMBOL_SMALL_TLSIE:
1806 /* In ILP32, the mode of dest can be either SImode or DImode,
1807 while the got entry is always of SImode size. The mode of
1808 dest depends on how dest is used: if dest is assigned to a
1809 pointer (e.g. in the memory), it has SImode; it may have
1810 DImode if dest is dereferenced to access the memory.
1811 This is why we have to handle three different tlsie_small
1812 patterns here (two patterns for ILP32). */
1813 machine_mode mode = GET_MODE (dest);
1814 rtx tmp_reg = gen_reg_rtx (mode);
1815 rtx tp = aarch64_load_tp (NULL);
1817 if (mode == ptr_mode)
1819 if (mode == DImode)
1820 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1821 else
1823 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1824 tp = gen_lowpart (mode, tp);
1827 else
1829 gcc_assert (mode == Pmode);
1830 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1833 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1834 if (REG_P (dest))
1835 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1836 return;
1839 case SYMBOL_TLSLE12:
1840 case SYMBOL_TLSLE24:
1841 case SYMBOL_TLSLE32:
1842 case SYMBOL_TLSLE48:
1844 machine_mode mode = GET_MODE (dest);
1845 rtx tp = aarch64_load_tp (NULL);
1847 if (mode != Pmode)
1848 tp = gen_lowpart (mode, tp);
1850 switch (type)
1852 case SYMBOL_TLSLE12:
1853 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1854 (dest, tp, imm));
1855 break;
1856 case SYMBOL_TLSLE24:
1857 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1858 (dest, tp, imm));
1859 break;
1860 case SYMBOL_TLSLE32:
1861 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1862 (dest, imm));
1863 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1864 (dest, dest, tp));
1865 break;
1866 case SYMBOL_TLSLE48:
1867 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1868 (dest, imm));
1869 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1870 (dest, dest, tp));
1871 break;
1872 default:
1873 gcc_unreachable ();
1876 if (REG_P (dest))
1877 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1878 return;
1881 case SYMBOL_TINY_GOT:
1882 emit_insn (gen_ldr_got_tiny (dest, imm));
1883 return;
1885 case SYMBOL_TINY_TLSIE:
1887 machine_mode mode = GET_MODE (dest);
1888 rtx tp = aarch64_load_tp (NULL);
1890 if (mode == ptr_mode)
1892 if (mode == DImode)
1893 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1894 else
1896 tp = gen_lowpart (mode, tp);
1897 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1900 else
1902 gcc_assert (mode == Pmode);
1903 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1906 if (REG_P (dest))
1907 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1908 return;
1911 default:
1912 gcc_unreachable ();
1916 /* Emit a move from SRC to DEST. Assume that the move expanders can
1917 handle all moves if !can_create_pseudo_p (). The distinction is
1918 important because, unlike emit_move_insn, the move expanders know
1919 how to force Pmode objects into the constant pool even when the
1920 constant pool address is not itself legitimate. */
1921 static rtx
1922 aarch64_emit_move (rtx dest, rtx src)
1924 return (can_create_pseudo_p ()
1925 ? emit_move_insn (dest, src)
1926 : emit_move_insn_1 (dest, src));
1929 /* Apply UNOPTAB to OP and store the result in DEST. */
1931 static void
1932 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1934 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1935 if (dest != tmp)
1936 emit_move_insn (dest, tmp);
1939 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1941 static void
1942 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1944 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1945 OPTAB_DIRECT);
1946 if (dest != tmp)
1947 emit_move_insn (dest, tmp);
1950 /* Split a 128-bit move operation into two 64-bit move operations,
1951 taking care to handle partial overlap of register to register
1952 copies. Special cases are needed when moving between GP regs and
1953 FP regs. SRC can be a register, constant or memory; DST a register
1954 or memory. If either operand is memory it must not have any side
1955 effects. */
1956 void
1957 aarch64_split_128bit_move (rtx dst, rtx src)
1959 rtx dst_lo, dst_hi;
1960 rtx src_lo, src_hi;
1962 machine_mode mode = GET_MODE (dst);
1964 gcc_assert (mode == TImode || mode == TFmode);
1965 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1966 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1968 if (REG_P (dst) && REG_P (src))
1970 int src_regno = REGNO (src);
1971 int dst_regno = REGNO (dst);
1973 /* Handle FP <-> GP regs. */
1974 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1976 src_lo = gen_lowpart (word_mode, src);
1977 src_hi = gen_highpart (word_mode, src);
1979 if (mode == TImode)
1981 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1982 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1984 else
1986 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1987 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1989 return;
1991 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1993 dst_lo = gen_lowpart (word_mode, dst);
1994 dst_hi = gen_highpart (word_mode, dst);
1996 if (mode == TImode)
1998 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1999 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
2001 else
2003 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
2004 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
2006 return;
2010 dst_lo = gen_lowpart (word_mode, dst);
2011 dst_hi = gen_highpart (word_mode, dst);
2012 src_lo = gen_lowpart (word_mode, src);
2013 src_hi = gen_highpart_mode (word_mode, mode, src);
2015 /* At most one pairing may overlap. */
2016 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2018 aarch64_emit_move (dst_hi, src_hi);
2019 aarch64_emit_move (dst_lo, src_lo);
2021 else
2023 aarch64_emit_move (dst_lo, src_lo);
2024 aarch64_emit_move (dst_hi, src_hi);
2028 bool
2029 aarch64_split_128bit_move_p (rtx dst, rtx src)
2031 return (! REG_P (src)
2032 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2035 /* Split a complex SIMD combine. */
2037 void
2038 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2040 machine_mode src_mode = GET_MODE (src1);
2041 machine_mode dst_mode = GET_MODE (dst);
2043 gcc_assert (VECTOR_MODE_P (dst_mode));
2044 gcc_assert (register_operand (dst, dst_mode)
2045 && register_operand (src1, src_mode)
2046 && register_operand (src2, src_mode));
2048 rtx (*gen) (rtx, rtx, rtx);
2050 switch (src_mode)
2052 case E_V8QImode:
2053 gen = gen_aarch64_simd_combinev8qi;
2054 break;
2055 case E_V4HImode:
2056 gen = gen_aarch64_simd_combinev4hi;
2057 break;
2058 case E_V2SImode:
2059 gen = gen_aarch64_simd_combinev2si;
2060 break;
2061 case E_V4HFmode:
2062 gen = gen_aarch64_simd_combinev4hf;
2063 break;
2064 case E_V2SFmode:
2065 gen = gen_aarch64_simd_combinev2sf;
2066 break;
2067 case E_DImode:
2068 gen = gen_aarch64_simd_combinedi;
2069 break;
2070 case E_DFmode:
2071 gen = gen_aarch64_simd_combinedf;
2072 break;
2073 default:
2074 gcc_unreachable ();
2077 emit_insn (gen (dst, src1, src2));
2078 return;
2081 /* Split a complex SIMD move. */
2083 void
2084 aarch64_split_simd_move (rtx dst, rtx src)
2086 machine_mode src_mode = GET_MODE (src);
2087 machine_mode dst_mode = GET_MODE (dst);
2089 gcc_assert (VECTOR_MODE_P (dst_mode));
2091 if (REG_P (dst) && REG_P (src))
2093 rtx (*gen) (rtx, rtx);
2095 gcc_assert (VECTOR_MODE_P (src_mode));
2097 switch (src_mode)
2099 case E_V16QImode:
2100 gen = gen_aarch64_split_simd_movv16qi;
2101 break;
2102 case E_V8HImode:
2103 gen = gen_aarch64_split_simd_movv8hi;
2104 break;
2105 case E_V4SImode:
2106 gen = gen_aarch64_split_simd_movv4si;
2107 break;
2108 case E_V2DImode:
2109 gen = gen_aarch64_split_simd_movv2di;
2110 break;
2111 case E_V8HFmode:
2112 gen = gen_aarch64_split_simd_movv8hf;
2113 break;
2114 case E_V4SFmode:
2115 gen = gen_aarch64_split_simd_movv4sf;
2116 break;
2117 case E_V2DFmode:
2118 gen = gen_aarch64_split_simd_movv2df;
2119 break;
2120 default:
2121 gcc_unreachable ();
2124 emit_insn (gen (dst, src));
2125 return;
2129 bool
2130 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2131 machine_mode ymode, rtx y)
2133 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2134 gcc_assert (r != NULL);
2135 return rtx_equal_p (x, r);
2139 static rtx
2140 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2142 if (can_create_pseudo_p ())
2143 return force_reg (mode, value);
2144 else
2146 gcc_assert (x);
2147 aarch64_emit_move (x, value);
2148 return x;
2152 /* Return true if we can move VALUE into a register using a single
2153 CNT[BHWD] instruction. */
2155 static bool
2156 aarch64_sve_cnt_immediate_p (poly_int64 value)
2158 HOST_WIDE_INT factor = value.coeffs[0];
2159 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2160 return (value.coeffs[1] == factor
2161 && IN_RANGE (factor, 2, 16 * 16)
2162 && (factor & 1) == 0
2163 && factor <= 16 * (factor & -factor));
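/* For illustration (values chosen here, not taken from elsewhere in the
   file), some poly_int64 values that satisfy the test above and the single
   instruction they correspond to:

     poly_int64 (2, 2)      CNTD
     poly_int64 (16, 16)    CNTB
     poly_int64 (32, 32)    CNTB with MUL #2

   poly_int64 (1, 1) is rejected because the factor is odd and below the
   minimum of 2, and poly_int64 (512, 512) is rejected because it exceeds
   the 16 * 16 upper bound.  */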
2166 /* Likewise for rtx X. */
2168 bool
2169 aarch64_sve_cnt_immediate_p (rtx x)
2171 poly_int64 value;
2172 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2175 /* Return the asm string for an instruction with a CNT-like vector size
2176 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2177 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2178 first part of the operands template (the part that comes before the
2179 vector size itself). FACTOR is the number of quadwords.
2180 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2181 If it is zero, we can use any element size. */
2183 static char *
2184 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2185 unsigned int factor,
2186 unsigned int nelts_per_vq)
2188 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2190 if (nelts_per_vq == 0)
2191 /* There is some overlap in the ranges of the four CNT instructions.
2192 Here we always use the smallest possible element size, so that the
2193 	 multiplier is 1 wherever possible.  */
2194 nelts_per_vq = factor & -factor;
2195 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2196 gcc_assert (IN_RANGE (shift, 1, 4));
2197 char suffix = "dwhb"[shift - 1];
2199 factor >>= shift;
2200 unsigned int written;
2201 if (factor == 1)
2202 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2203 prefix, suffix, operands);
2204 else
2205 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2206 prefix, suffix, operands, factor);
2207 gcc_assert (written < sizeof (buffer));
2208 return buffer;
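/* A few illustrative results of the routine above, assuming OPERANDS is
   "%x0"; they follow directly from the snprintf formats:

     prefix "cnt", factor 2,  nelts_per_vq 0   ->  "cntd\t%x0"
     prefix "inc", factor 32, nelts_per_vq 0   ->  "incb\t%x0, all, mul #2"
     prefix "inc", factor 8,  nelts_per_vq 2   ->  "incd\t%x0, all, mul #4"  */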
2211 /* Return the asm string for an instruction with a CNT-like vector size
2212 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2213 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2214 first part of the operands template (the part that comes before the
2215 vector size itself). X is the value of the vector size operand,
2216 as a polynomial integer rtx. */
2218 char *
2219 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2220 rtx x)
2222 poly_int64 value = rtx_to_poly_int64 (x);
2223 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2224 return aarch64_output_sve_cnt_immediate (prefix, operands,
2225 value.coeffs[1], 0);
2228 /* Return true if we can add VALUE to a register using a single ADDVL
2229 or ADDPL instruction. */
2231 static bool
2232 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2234 HOST_WIDE_INT factor = value.coeffs[0];
2235 if (factor == 0 || value.coeffs[1] != factor)
2236 return false;
2237 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2238 and a value of 16 is one vector width. */
2239 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2240 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2243 /* Likewise for rtx X. */
2245 bool
2246 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2248 poly_int64 value;
2249 return (poly_int_rtx_p (x, &value)
2250 && aarch64_sve_addvl_addpl_immediate_p (value));
2253 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2254 and storing the result in operand 0. */
2256 char *
2257 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2259 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2260 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2261 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2263 /* Use INC or DEC if possible. */
2264 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2266 if (aarch64_sve_cnt_immediate_p (offset_value))
2267 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2268 offset_value.coeffs[1], 0);
2269 if (aarch64_sve_cnt_immediate_p (-offset_value))
2270 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2271 -offset_value.coeffs[1], 0);
2274 int factor = offset_value.coeffs[1];
2275 if ((factor & 15) == 0)
2276 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2277 else
2278 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2279 return buffer;
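/* Illustrative outputs of the routine above, assuming DEST and BASE are
   distinct general registers so the INC/DEC shortcut does not apply:

     offset poly_int64 (16, 16)   ->  "addvl\t%x0, %x1, #1"
     offset poly_int64 (8, 8)     ->  "addpl\t%x0, %x1, #4"
     offset poly_int64 (-2, -2)   ->  "addpl\t%x0, %x1, #-1"

   With DEST equal to BASE, an offset of poly_int64 (16, 16) would instead
   be emitted as "incb\t%x0".  */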
2282 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2283 instruction. If it is, store the number of elements in each vector
2284 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2285 factor in *FACTOR_OUT (if nonnull). */
2287 bool
2288 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2289 unsigned int *nelts_per_vq_out)
2291 rtx elt;
2292 poly_int64 value;
2294 if (!const_vec_duplicate_p (x, &elt)
2295 || !poly_int_rtx_p (elt, &value))
2296 return false;
2298 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2299 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2300 /* There's no vector INCB. */
2301 return false;
2303 HOST_WIDE_INT factor = value.coeffs[0];
2304 if (value.coeffs[1] != factor)
2305 return false;
2307 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2308 if ((factor % nelts_per_vq) != 0
2309 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2310 return false;
2312 if (factor_out)
2313 *factor_out = factor;
2314 if (nelts_per_vq_out)
2315 *nelts_per_vq_out = nelts_per_vq;
2316 return true;
2319 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2320 instruction. */
2322 bool
2323 aarch64_sve_inc_dec_immediate_p (rtx x)
2325 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2328 /* Return the asm template for an SVE vector INC or DEC instruction.
2329 OPERANDS gives the operands before the vector count and X is the
2330 value of the vector count operand itself. */
2332 char *
2333 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2335 int factor;
2336 unsigned int nelts_per_vq;
2337 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2338 gcc_unreachable ();
2339 if (factor < 0)
2340 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2341 nelts_per_vq);
2342 else
2343 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2344 nelts_per_vq);
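/* For example, a VNx4SI constant that duplicates poly_int64 (4, 4) in every
   element has factor 4 and nelts_per_vq 4 and is printed as
   "incw\t<operands>", while duplicating poly_int64 (-8, -8) in VNx2DI gives
   "decd\t<operands>, all, mul #4".  (Illustrative only; the exact operand
   text comes from the caller.)  */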
2347 static int
2348 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2349 scalar_int_mode mode)
2351 int i;
2352 unsigned HOST_WIDE_INT val, val2, mask;
2353 int one_match, zero_match;
2354 int num_insns;
2356 val = INTVAL (imm);
2358 if (aarch64_move_imm (val, mode))
2360 if (generate)
2361 emit_insn (gen_rtx_SET (dest, imm));
2362 return 1;
2365 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2366 (with XXXX non-zero). In that case check to see if the move can be done in
2367 a smaller mode. */
2368 val2 = val & 0xffffffff;
2369 if (mode == DImode
2370 && aarch64_move_imm (val2, SImode)
2371 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2373 if (generate)
2374 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2376 /* Check if we have to emit a second instruction by checking to see
2377 if any of the upper 32 bits of the original DI mode value is set. */
2378 if (val == val2)
2379 return 1;
2381 i = (val >> 48) ? 48 : 32;
2383 if (generate)
2384 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2385 GEN_INT ((val >> i) & 0xffff)));
2387 return 2;
2390 if ((val >> 32) == 0 || mode == SImode)
2392 if (generate)
2394 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2395 if (mode == SImode)
2396 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2397 GEN_INT ((val >> 16) & 0xffff)));
2398 else
2399 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2400 GEN_INT ((val >> 16) & 0xffff)));
2402 return 2;
2405 /* Remaining cases are all for DImode. */
2407 mask = 0xffff;
2408 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2409 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2410 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2411 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2413 if (zero_match != 2 && one_match != 2)
2415 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2416 For a 64-bit bitmask try whether changing 16 bits to all ones or
2417 zeroes creates a valid bitmask. To check any repeated bitmask,
2418 try using 16 bits from the other 32-bit half of val. */
2420 for (i = 0; i < 64; i += 16, mask <<= 16)
2422 val2 = val & ~mask;
2423 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2424 break;
2425 val2 = val | mask;
2426 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2427 break;
2428 val2 = val2 & ~mask;
2429 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2430 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2431 break;
2433 if (i != 64)
2435 if (generate)
2437 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2438 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2439 GEN_INT ((val >> i) & 0xffff)));
2441 return 2;
2445 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2446 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2447 otherwise skip zero bits. */
2449 num_insns = 1;
2450 mask = 0xffff;
2451 val2 = one_match > zero_match ? ~val : val;
2452 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2454 if (generate)
2455 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2456 ? (val | ~(mask << i))
2457 : (val & (mask << i)))));
2458 for (i += 16; i < 64; i += 16)
2460 if ((val2 & (mask << i)) == 0)
2461 continue;
2462 if (generate)
2463 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2464 GEN_INT ((val >> i) & 0xffff)));
2465 num_insns ++;
2468 return num_insns;
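/* A worked example of the expansion above, using a hypothetical DImode
   value: 0x0000cafe00001234 is not a single MOV immediate, but its low
   32 bits are, and bits 48-63 are zero, so it is emitted roughly as

     mov	x0, #0x1234
     movk	x0, #0xcafe, lsl #32

   and the function returns 2.  */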
2471 /* Return whether imm is a 128-bit immediate which is simple enough to
2472 expand inline. */
2473 bool
2474 aarch64_mov128_immediate (rtx imm)
2476 if (GET_CODE (imm) == CONST_INT)
2477 return true;
2479 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2481 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2482 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2484 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2485 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2489 /* Return the number of temporary registers that aarch64_add_offset_1
2490 would need to add OFFSET to a register. */
2492 static unsigned int
2493 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2495 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2498 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2499 a non-polynomial OFFSET. MODE is the mode of the addition.
2500 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2501 be set and CFA adjustments added to the generated instructions.
2503 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2504 temporary if register allocation is already complete. This temporary
2505 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2506 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2507 the immediate again.
2509 Since this function may be used to adjust the stack pointer, we must
2510 ensure that it cannot cause transient stack deallocation (for example
2511 by first incrementing SP and then decrementing when adjusting by a
2512 large immediate). */
2514 static void
2515 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2516 rtx src, HOST_WIDE_INT offset, rtx temp1,
2517 bool frame_related_p, bool emit_move_imm)
2519 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2520 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2522 HOST_WIDE_INT moffset = abs_hwi (offset);
2523 rtx_insn *insn;
2525 if (!moffset)
2527 if (!rtx_equal_p (dest, src))
2529 insn = emit_insn (gen_rtx_SET (dest, src));
2530 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2532 return;
2535 /* Single instruction adjustment. */
2536 if (aarch64_uimm12_shift (moffset))
2538 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2539 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2540 return;
2543 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2544 and either:
2546 a) the offset cannot be loaded by a 16-bit move or
2547 b) there is no spare register into which we can move it. */
2548 if (moffset < 0x1000000
2549 && ((!temp1 && !can_create_pseudo_p ())
2550 || !aarch64_move_imm (moffset, mode)))
2552 HOST_WIDE_INT low_off = moffset & 0xfff;
2554 low_off = offset < 0 ? -low_off : low_off;
2555 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2556 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2557 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2558 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2559 return;
2562 /* Emit a move immediate if required and an addition/subtraction. */
2563 if (emit_move_imm)
2565 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2566 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2568 insn = emit_insn (offset < 0
2569 ? gen_sub3_insn (dest, src, temp1)
2570 : gen_add3_insn (dest, src, temp1));
2571 if (frame_related_p)
2573 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2574 rtx adj = plus_constant (mode, src, offset);
2575 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
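/* Two illustrative cases for aarch64_add_offset_1, ignoring the
   frame-related bookkeeping:

     OFFSET = 0x123456: below 1 << 24 and not a MOV immediate, so it is
     split into "add dest, src, #0x456" followed by
     "add dest, dest, #0x123000".

     OFFSET = 0x1234567: too large for the two-ADD form, so the absolute
     value is first moved into TEMP1 (or a fresh pseudo) and a single
     register-register ADD or SUB is emitted.  */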
2579 /* Return the number of temporary registers that aarch64_add_offset
2580 would need to move OFFSET into a register or add OFFSET to a register;
2581 ADD_P is true if we want the latter rather than the former. */
2583 static unsigned int
2584 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2586 /* This follows the same structure as aarch64_add_offset. */
2587 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2588 return 0;
2590 unsigned int count = 0;
2591 HOST_WIDE_INT factor = offset.coeffs[1];
2592 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2593 poly_int64 poly_offset (factor, factor);
2594 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2595 /* Need one register for the ADDVL/ADDPL result. */
2596 count += 1;
2597 else if (factor != 0)
2599 factor = abs (factor);
2600 if (factor > 16 * (factor & -factor))
2601 /* Need one register for the CNT result and one for the multiplication
2602 factor. If necessary, the second temporary can be reused for the
2603 constant part of the offset. */
2604 return 2;
2605 /* Need one register for the CNT result (which might then
2606 be shifted). */
2607 count += 1;
2609 return count + aarch64_add_offset_1_temporaries (constant);
2612 /* If X can be represented as a poly_int64, return the number
2613 of temporaries that are required to add it to a register.
2614 Return -1 otherwise. */
2617 aarch64_add_offset_temporaries (rtx x)
2619 poly_int64 offset;
2620 if (!poly_int_rtx_p (x, &offset))
2621 return -1;
2622 return aarch64_offset_temporaries (true, offset);
2625 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2626 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2627 be set and CFA adjustments added to the generated instructions.
2629 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2630 temporary if register allocation is already complete. This temporary
2631 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2632 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2633 false to avoid emitting the immediate again.
2635 TEMP2, if nonnull, is a second temporary register that doesn't
2636    overlap either DEST or SRC.
2638 Since this function may be used to adjust the stack pointer, we must
2639 ensure that it cannot cause transient stack deallocation (for example
2640 by first incrementing SP and then decrementing when adjusting by a
2641 large immediate). */
2643 static void
2644 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2645 poly_int64 offset, rtx temp1, rtx temp2,
2646 bool frame_related_p, bool emit_move_imm = true)
2648 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2649 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2650 gcc_assert (temp1 == NULL_RTX
2651 || !frame_related_p
2652 || !reg_overlap_mentioned_p (temp1, dest));
2653 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2655 /* Try using ADDVL or ADDPL to add the whole value. */
2656 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2658 rtx offset_rtx = gen_int_mode (offset, mode);
2659 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2660 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2661 return;
2664 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2665 SVE vector register, over and above the minimum size of 128 bits.
2666 This is equivalent to half the value returned by CNTD with a
2667 vector shape of ALL. */
2668 HOST_WIDE_INT factor = offset.coeffs[1];
2669 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2671 /* Try using ADDVL or ADDPL to add the VG-based part. */
2672 poly_int64 poly_offset (factor, factor);
2673 if (src != const0_rtx
2674 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2676 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2677 if (frame_related_p)
2679 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2680 RTX_FRAME_RELATED_P (insn) = true;
2681 src = dest;
2683 else
2685 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2686 src = aarch64_force_temporary (mode, temp1, addr);
2687 temp1 = temp2;
2688 temp2 = NULL_RTX;
2691 /* Otherwise use a CNT-based sequence. */
2692 else if (factor != 0)
2694 /* Use a subtraction if we have a negative factor. */
2695 rtx_code code = PLUS;
2696 if (factor < 0)
2698 factor = -factor;
2699 code = MINUS;
2702 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2703 into the multiplication. */
2704 rtx val;
2705 int shift = 0;
2706 if (factor & 1)
2707 /* Use a right shift by 1. */
2708 shift = -1;
2709 else
2710 factor /= 2;
2711 HOST_WIDE_INT low_bit = factor & -factor;
2712 if (factor <= 16 * low_bit)
2714 if (factor > 16 * 8)
2716 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2717 the value with the minimum multiplier and shift it into
2718 position. */
2719 int extra_shift = exact_log2 (low_bit);
2720 shift += extra_shift;
2721 factor >>= extra_shift;
2723 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2725 else
2727 /* Use CNTD, then multiply it by FACTOR. */
2728 val = gen_int_mode (poly_int64 (2, 2), mode);
2729 val = aarch64_force_temporary (mode, temp1, val);
2731 /* Go back to using a negative multiplication factor if we have
2732 no register from which to subtract. */
2733 if (code == MINUS && src == const0_rtx)
2735 factor = -factor;
2736 code = PLUS;
2738 rtx coeff1 = gen_int_mode (factor, mode);
2739 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2740 val = gen_rtx_MULT (mode, val, coeff1);
2743 if (shift > 0)
2745 /* Multiply by 1 << SHIFT. */
2746 val = aarch64_force_temporary (mode, temp1, val);
2747 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2749 else if (shift == -1)
2751 /* Divide by 2. */
2752 val = aarch64_force_temporary (mode, temp1, val);
2753 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2756 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2757 if (src != const0_rtx)
2759 val = aarch64_force_temporary (mode, temp1, val);
2760 val = gen_rtx_fmt_ee (code, mode, src, val);
2762 else if (code == MINUS)
2764 val = aarch64_force_temporary (mode, temp1, val);
2765 val = gen_rtx_NEG (mode, val);
2768 if (constant == 0 || frame_related_p)
2770 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2771 if (frame_related_p)
2773 RTX_FRAME_RELATED_P (insn) = true;
2774 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2775 gen_rtx_SET (dest, plus_constant (Pmode, src,
2776 poly_offset)));
2778 src = dest;
2779 if (constant == 0)
2780 return;
2782 else
2784 src = aarch64_force_temporary (mode, temp1, val);
2785 temp1 = temp2;
2786 temp2 = NULL_RTX;
2789 emit_move_imm = true;
2792 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2793 frame_related_p, emit_move_imm);
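/* Illustrative expansions of aarch64_add_offset, assuming SRC is a register
   (not const0_rtx) and FRAME_RELATED_P is false:

     OFFSET = poly_int64 (16, 16), one full vector length in bytes,
       becomes a single "addvl dest, src, #1".

     OFFSET = poly_int64 (24, 16), one vector length plus 8 bytes,
       becomes "addvl" into a temporary followed by "add dest, ..., #8".

   Offsets whose VG-based part is outside the ADDVL/ADDPL range instead use
   the CNT-based sequence above: a CNT[BHWD] value, optionally multiplied
   and/or shifted, which is then added to or subtracted from SRC.  */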
2796 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2797 than a poly_int64. */
2799 void
2800 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2801 rtx offset_rtx, rtx temp1, rtx temp2)
2803 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2804 temp1, temp2, false);
2807 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2808 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2809 if TEMP1 already contains abs (DELTA). */
2811 static inline void
2812 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2814 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2815 temp1, temp2, true, emit_move_imm);
2818 /* Subtract DELTA from the stack pointer, marking the instructions
2819 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2820 if nonnull. */
2822 static inline void
2823 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2825 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2826 temp1, temp2, frame_related_p);
2829 /* Set DEST to (vec_series BASE STEP). */
2831 static void
2832 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2834 machine_mode mode = GET_MODE (dest);
2835 scalar_mode inner = GET_MODE_INNER (mode);
2837 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2838 if (!aarch64_sve_index_immediate_p (base))
2839 base = force_reg (inner, base);
2840 if (!aarch64_sve_index_immediate_p (step))
2841 step = force_reg (inner, step);
2843 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
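/* For example, with BASE 0 and STEP 1 in VNx4SImode the vec_series rtx
   matches the SVE INDEX instruction ("index z0.s, #0, #1"); operands
   outside the [-16, 15] immediate range are first forced into scalar
   registers, giving the register form ("index z0.s, w0, w1").  The
   register names here are purely illustrative.  */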
2846 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2847 integer of mode INT_MODE. Return true on success. */
2849 static bool
2850 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2851 rtx src)
2853 /* If the constant is smaller than 128 bits, we can do the move
2854 using a vector of SRC_MODEs. */
2855 if (src_mode != TImode)
2857 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2858 GET_MODE_SIZE (src_mode));
2859 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2860 emit_move_insn (gen_lowpart (dup_mode, dest),
2861 gen_const_vec_duplicate (dup_mode, src));
2862 return true;
2865 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2866 src = force_const_mem (src_mode, src);
2867 if (!src)
2868 return false;
2870 /* Make sure that the address is legitimate. */
2871 if (!aarch64_sve_ld1r_operand_p (src))
2873 rtx addr = force_reg (Pmode, XEXP (src, 0));
2874 src = replace_equiv_address (src, addr);
2877 machine_mode mode = GET_MODE (dest);
2878 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2879 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2880 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2881 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2882 emit_insn (gen_rtx_SET (dest, src));
2883 return true;
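/* Sketch of the two paths above: on a little-endian target, duplicating a
   32-bit pattern into a VNx16QI destination, say the byte sequence
   { 1, 2, 3, 4, 1, 2, 3, 4, ... }, is rewritten as a VNx4SI duplicate of
   the SImode value 0x04030201, whereas a full 128-bit (TImode) pattern is
   spilled to the constant pool and loaded with an LD1RQ under an all-true
   predicate.  */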
2886 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2887 isn't a simple duplicate or series. */
2889 static void
2890 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2892 machine_mode mode = GET_MODE (src);
2893 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2894 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2895 gcc_assert (npatterns > 1);
2897 if (nelts_per_pattern == 1)
2899       /* The constant is a repeating sequence of at least two elements,
2900 where the repeating elements occupy no more than 128 bits.
2901 Get an integer representation of the replicated value. */
2902 scalar_int_mode int_mode;
2903 if (BYTES_BIG_ENDIAN)
2904 /* For now, always use LD1RQ to load the value on big-endian
2905 targets, since the handling of smaller integers includes a
2906 subreg that is semantically an element reverse. */
2907 int_mode = TImode;
2908 else
2910 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2911 gcc_assert (int_bits <= 128);
2912 int_mode = int_mode_for_size (int_bits, 0).require ();
2914 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2915 if (int_value
2916 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2917 return;
2920 /* Expand each pattern individually. */
2921 rtx_vector_builder builder;
2922 auto_vec<rtx, 16> vectors (npatterns);
2923 for (unsigned int i = 0; i < npatterns; ++i)
2925 builder.new_vector (mode, 1, nelts_per_pattern);
2926 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2927 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2928 vectors.quick_push (force_reg (mode, builder.build ()));
2931 /* Use permutes to interleave the separate vectors. */
2932 while (npatterns > 1)
2934 npatterns /= 2;
2935 for (unsigned int i = 0; i < npatterns; ++i)
2937 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2938 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2939 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2940 vectors[i] = tmp;
2943 gcc_assert (vectors[0] == dest);
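/* As an example, the VNx4SI constant { 1, 10, 2, 20, 3, 30, ... } has two
   patterns of three elements each; it is built as the two series
   { 1, 2, 3, ... } and { 10, 20, 30, ... } and then interleaved with a
   single ZIP1.  */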
2946 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2947 is a pattern that can be used to set DEST to a replicated scalar
2948 element. */
2950 void
2951 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2952 rtx (*gen_vec_duplicate) (rtx, rtx))
2954 machine_mode mode = GET_MODE (dest);
2956 /* Check on what type of symbol it is. */
2957 scalar_int_mode int_mode;
2958 if ((GET_CODE (imm) == SYMBOL_REF
2959 || GET_CODE (imm) == LABEL_REF
2960 || GET_CODE (imm) == CONST
2961 || GET_CODE (imm) == CONST_POLY_INT)
2962 && is_a <scalar_int_mode> (mode, &int_mode))
2964 rtx mem;
2965 poly_int64 offset;
2966 HOST_WIDE_INT const_offset;
2967 enum aarch64_symbol_type sty;
2969 /* If we have (const (plus symbol offset)), separate out the offset
2970 before we start classifying the symbol. */
2971 rtx base = strip_offset (imm, &offset);
2973 /* We must always add an offset involving VL separately, rather than
2974 folding it into the relocation. */
2975 if (!offset.is_constant (&const_offset))
2977 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2978 emit_insn (gen_rtx_SET (dest, imm));
2979 else
2981 /* Do arithmetic on 32-bit values if the result is smaller
2982 than that. */
2983 if (partial_subreg_p (int_mode, SImode))
2985 /* It is invalid to do symbol calculations in modes
2986 narrower than SImode. */
2987 gcc_assert (base == const0_rtx);
2988 dest = gen_lowpart (SImode, dest);
2989 int_mode = SImode;
2991 if (base != const0_rtx)
2993 base = aarch64_force_temporary (int_mode, dest, base);
2994 aarch64_add_offset (int_mode, dest, base, offset,
2995 NULL_RTX, NULL_RTX, false);
2997 else
2998 aarch64_add_offset (int_mode, dest, base, offset,
2999 dest, NULL_RTX, false);
3001 return;
3004 sty = aarch64_classify_symbol (base, const_offset);
3005 switch (sty)
3007 case SYMBOL_FORCE_TO_MEM:
3008 if (const_offset != 0
3009 && targetm.cannot_force_const_mem (int_mode, imm))
3011 gcc_assert (can_create_pseudo_p ());
3012 base = aarch64_force_temporary (int_mode, dest, base);
3013 aarch64_add_offset (int_mode, dest, base, const_offset,
3014 NULL_RTX, NULL_RTX, false);
3015 return;
3018 mem = force_const_mem (ptr_mode, imm);
3019 gcc_assert (mem);
3021 /* If we aren't generating PC relative literals, then
3022 we need to expand the literal pool access carefully.
3023 This is something that needs to be done in a number
3024 of places, so could well live as a separate function. */
3025 if (!aarch64_pcrelative_literal_loads)
3027 gcc_assert (can_create_pseudo_p ());
3028 base = gen_reg_rtx (ptr_mode);
3029 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3030 if (ptr_mode != Pmode)
3031 base = convert_memory_address (Pmode, base);
3032 mem = gen_rtx_MEM (ptr_mode, base);
3035 if (int_mode != ptr_mode)
3036 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3038 emit_insn (gen_rtx_SET (dest, mem));
3040 return;
3042 case SYMBOL_SMALL_TLSGD:
3043 case SYMBOL_SMALL_TLSDESC:
3044 case SYMBOL_SMALL_TLSIE:
3045 case SYMBOL_SMALL_GOT_28K:
3046 case SYMBOL_SMALL_GOT_4G:
3047 case SYMBOL_TINY_GOT:
3048 case SYMBOL_TINY_TLSIE:
3049 if (const_offset != 0)
3051 gcc_assert(can_create_pseudo_p ());
3052 base = aarch64_force_temporary (int_mode, dest, base);
3053 aarch64_add_offset (int_mode, dest, base, const_offset,
3054 NULL_RTX, NULL_RTX, false);
3055 return;
3057 /* FALLTHRU */
3059 case SYMBOL_SMALL_ABSOLUTE:
3060 case SYMBOL_TINY_ABSOLUTE:
3061 case SYMBOL_TLSLE12:
3062 case SYMBOL_TLSLE24:
3063 case SYMBOL_TLSLE32:
3064 case SYMBOL_TLSLE48:
3065 aarch64_load_symref_appropriately (dest, imm, sty);
3066 return;
3068 default:
3069 gcc_unreachable ();
3073 if (!CONST_INT_P (imm))
3075 rtx base, step, value;
3076 if (GET_CODE (imm) == HIGH
3077 || aarch64_simd_valid_immediate (imm, NULL))
3078 emit_insn (gen_rtx_SET (dest, imm));
3079 else if (const_vec_series_p (imm, &base, &step))
3080 aarch64_expand_vec_series (dest, base, step);
3081 else if (const_vec_duplicate_p (imm, &value))
3083 /* If the constant is out of range of an SVE vector move,
3084 load it from memory if we can, otherwise move it into
3085 a register and use a DUP. */
3086 scalar_mode inner_mode = GET_MODE_INNER (mode);
3087 rtx op = force_const_mem (inner_mode, value);
3088 if (!op)
3089 op = force_reg (inner_mode, value);
3090 else if (!aarch64_sve_ld1r_operand_p (op))
3092 rtx addr = force_reg (Pmode, XEXP (op, 0));
3093 op = replace_equiv_address (op, addr);
3095 emit_insn (gen_vec_duplicate (dest, op));
3097 else if (GET_CODE (imm) == CONST_VECTOR
3098 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3099 aarch64_expand_sve_const_vector (dest, imm);
3100 else
3102 rtx mem = force_const_mem (mode, imm);
3103 gcc_assert (mem);
3104 emit_move_insn (dest, mem);
3107 return;
3110 aarch64_internal_mov_immediate (dest, imm, true,
3111 as_a <scalar_int_mode> (mode));
3114 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3115 that is known to contain PTRUE. */
3117 void
3118 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3120 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3121 gen_rtvec (2, pred, src),
3122 UNSPEC_MERGE_PTRUE)));
3125 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3126 operand is in memory. In this case we need to use the predicated LD1
3127 and ST1 instead of LDR and STR, both for correctness on big-endian
3128 targets and because LD1 and ST1 support a wider range of addressing modes.
3129 PRED_MODE is the mode of the predicate.
3131 See the comment at the head of aarch64-sve.md for details about the
3132 big-endian handling. */
3134 void
3135 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3137 machine_mode mode = GET_MODE (dest);
3138 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3139 if (!register_operand (src, mode)
3140 && !register_operand (dest, mode))
3142 rtx tmp = gen_reg_rtx (mode);
3143 if (MEM_P (src))
3144 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3145 else
3146 emit_move_insn (tmp, src);
3147 src = tmp;
3149 aarch64_emit_sve_pred_move (dest, ptrue, src);
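/* For example, an SVE copy from one memory location to another cannot use
   two memory operands directly, so the code above first loads into a
   temporary register with a predicated LD1 and then stores it with a
   predicated ST1.  */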
3152 /* Called only on big-endian targets. See whether an SVE vector move
3153 from SRC to DEST is effectively a REV[BHW] instruction, because at
3154 least one operand is a subreg of an SVE vector that has wider or
3155 narrower elements. Return true and emit the instruction if so.
3157 For example:
3159 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3161 represents a VIEW_CONVERT between the following vectors, viewed
3162 in memory order:
3164 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3165 R1: { [0], [1], [2], [3], ... }
3167 The high part of lane X in R2 should therefore correspond to lane X*2
3168 of R1, but the register representations are:
3170 msb lsb
3171 R2: ...... [1].high [1].low [0].high [0].low
3172 R1: ...... [3] [2] [1] [0]
3174 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3175 We therefore need a reverse operation to swap the high and low values
3176 around.
3178 This is purely an optimization. Without it we would spill the
3179 subreg operand to the stack in one mode and reload it in the
3180 other mode, which has the same effect as the REV. */
3182 bool
3183 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3185 gcc_assert (BYTES_BIG_ENDIAN);
3186 if (GET_CODE (dest) == SUBREG)
3187 dest = SUBREG_REG (dest);
3188 if (GET_CODE (src) == SUBREG)
3189 src = SUBREG_REG (src);
3191 /* The optimization handles two single SVE REGs with different element
3192 sizes. */
3193 if (!REG_P (dest)
3194 || !REG_P (src)
3195 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3196 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3197 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3198 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3199 return false;
3201 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3202 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3203 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3204 UNSPEC_REV_SUBREG);
3205 emit_insn (gen_rtx_SET (dest, unspec));
3206 return true;
3209 /* Return a copy of X with mode MODE, without changing its other
3210 attributes. Unlike gen_lowpart, this doesn't care whether the
3211 mode change is valid. */
3213 static rtx
3214 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3216 if (GET_MODE (x) == mode)
3217 return x;
3219 x = shallow_copy_rtx (x);
3220 set_mode_and_regno (x, mode, REGNO (x));
3221 return x;
3224 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3225 operands. */
3227 void
3228 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3230 /* Decide which REV operation we need. The mode with narrower elements
3231 determines the mode of the operands and the mode with the wider
3232 elements determines the reverse width. */
3233 machine_mode mode_with_wider_elts = GET_MODE (dest);
3234 machine_mode mode_with_narrower_elts = GET_MODE (src);
3235 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3236 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3237 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3239 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3240 unsigned int unspec;
3241 if (wider_bytes == 8)
3242 unspec = UNSPEC_REV64;
3243 else if (wider_bytes == 4)
3244 unspec = UNSPEC_REV32;
3245 else if (wider_bytes == 2)
3246 unspec = UNSPEC_REV16;
3247 else
3248 gcc_unreachable ();
3249 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3251 /* Emit:
3253 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3254 UNSPEC_MERGE_PTRUE))
3256 with the appropriate modes. */
3257 ptrue = gen_lowpart (pred_mode, ptrue);
3258 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3259 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3260 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3261 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3262 UNSPEC_MERGE_PTRUE);
3263 emit_insn (gen_rtx_SET (dest, src));
3266 static bool
3267 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3268 tree exp ATTRIBUTE_UNUSED)
3270 /* Currently, always true. */
3271 return true;
3274 /* Implement TARGET_PASS_BY_REFERENCE. */
3276 static bool
3277 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3278 machine_mode mode,
3279 const_tree type,
3280 bool named ATTRIBUTE_UNUSED)
3282 HOST_WIDE_INT size;
3283 machine_mode dummymode;
3284 int nregs;
3286 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3287 if (mode == BLKmode && type)
3288 size = int_size_in_bytes (type);
3289 else
3290 /* No frontends can create types with variable-sized modes, so we
3291 shouldn't be asked to pass or return them. */
3292 size = GET_MODE_SIZE (mode).to_constant ();
3294 /* Aggregates are passed by reference based on their size. */
3295 if (type && AGGREGATE_TYPE_P (type))
3297 size = int_size_in_bytes (type);
3300 /* Variable sized arguments are always returned by reference. */
3301 if (size < 0)
3302 return true;
3304 /* Can this be a candidate to be passed in fp/simd register(s)? */
3305 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3306 &dummymode, &nregs,
3307 NULL))
3308 return false;
3310 /* Arguments which are variable sized or larger than 2 registers are
3311      passed by reference unless they are a homogeneous floating-point
3312 aggregate. */
3313 return size > 2 * UNITS_PER_WORD;
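/* Illustrative outcomes of the rules above: a plain 24-byte struct is
   passed by reference, since it needs more than two registers and is not
   an HFA/HVA, whereas an HFA of four doubles, although 32 bytes, is still
   a SIMD/FP candidate and is passed by value.  */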
3316 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3317 static bool
3318 aarch64_return_in_msb (const_tree valtype)
3320 machine_mode dummy_mode;
3321 int dummy_int;
3323 /* Never happens in little-endian mode. */
3324 if (!BYTES_BIG_ENDIAN)
3325 return false;
3327 /* Only composite types smaller than or equal to 16 bytes can
3328 be potentially returned in registers. */
3329 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3330 || int_size_in_bytes (valtype) <= 0
3331 || int_size_in_bytes (valtype) > 16)
3332 return false;
3334 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3335 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3336 is always passed/returned in the least significant bits of fp/simd
3337 register(s). */
3338 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3339 &dummy_mode, &dummy_int, NULL))
3340 return false;
3342 return true;
3345 /* Implement TARGET_FUNCTION_VALUE.
3346 Define how to find the value returned by a function. */
3348 static rtx
3349 aarch64_function_value (const_tree type, const_tree func,
3350 bool outgoing ATTRIBUTE_UNUSED)
3352 machine_mode mode;
3353 int unsignedp;
3354 int count;
3355 machine_mode ag_mode;
3357 mode = TYPE_MODE (type);
3358 if (INTEGRAL_TYPE_P (type))
3359 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3361 if (aarch64_return_in_msb (type))
3363 HOST_WIDE_INT size = int_size_in_bytes (type);
3365 if (size % UNITS_PER_WORD != 0)
3367 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3368 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3372 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3373 &ag_mode, &count, NULL))
3375 if (!aarch64_composite_type_p (type, mode))
3377 gcc_assert (count == 1 && mode == ag_mode);
3378 return gen_rtx_REG (mode, V0_REGNUM);
3380 else
3382 int i;
3383 rtx par;
3385 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3386 for (i = 0; i < count; i++)
3388 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3389 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3390 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3391 XVECEXP (par, 0, i) = tmp;
3393 return par;
3396 else
3397 return gen_rtx_REG (mode, R0_REGNUM);
3400 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3401 Return true if REGNO is the number of a hard register in which the values
3402 of called function may come back. */
3404 static bool
3405 aarch64_function_value_regno_p (const unsigned int regno)
3407 /* Maximum of 16 bytes can be returned in the general registers. Examples
3408 of 16-byte return values are: 128-bit integers and 16-byte small
3409 structures (excluding homogeneous floating-point aggregates). */
3410 if (regno == R0_REGNUM || regno == R1_REGNUM)
3411 return true;
3413 /* Up to four fp/simd registers can return a function value, e.g. a
3414 homogeneous floating-point aggregate having four members. */
3415 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3416 return TARGET_FLOAT;
3418 return false;
3421 /* Implement TARGET_RETURN_IN_MEMORY.
3423 If the type T of the result of a function is such that
3424 void func (T arg)
3425 would require that arg be passed as a value in a register (or set of
3426 registers) according to the parameter passing rules, then the result
3427 is returned in the same registers as would be used for such an
3428 argument. */
3430 static bool
3431 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3433 HOST_WIDE_INT size;
3434 machine_mode ag_mode;
3435 int count;
3437 if (!AGGREGATE_TYPE_P (type)
3438 && TREE_CODE (type) != COMPLEX_TYPE
3439 && TREE_CODE (type) != VECTOR_TYPE)
3440     /* Simple scalar types are always returned in registers.  */
3441 return false;
3443 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3444 type,
3445 &ag_mode,
3446 &count,
3447 NULL))
3448 return false;
3450   /* Types larger than 2 registers are returned in memory.  */
3451 size = int_size_in_bytes (type);
3452 return (size < 0 || size > 2 * UNITS_PER_WORD);
3455 static bool
3456 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3457 const_tree type, int *nregs)
3459 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3460 return aarch64_vfp_is_call_or_return_candidate (mode,
3461 type,
3462 &pcum->aapcs_vfp_rmode,
3463 nregs,
3464 NULL);
3467 /* Given MODE and TYPE of a function argument, return the alignment in
3468 bits. The idea is to suppress any stronger alignment requested by
3469 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3470 This is a helper function for local use only. */
3472 static unsigned int
3473 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3475 if (!type)
3476 return GET_MODE_ALIGNMENT (mode);
3478 if (integer_zerop (TYPE_SIZE (type)))
3479 return 0;
3481 gcc_assert (TYPE_MODE (type) == mode);
3483 if (!AGGREGATE_TYPE_P (type))
3484 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3486 if (TREE_CODE (type) == ARRAY_TYPE)
3487 return TYPE_ALIGN (TREE_TYPE (type));
3489 unsigned int alignment = 0;
3490 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3491 if (TREE_CODE (field) == FIELD_DECL)
3492 alignment = std::max (alignment, DECL_ALIGN (field));
3494 return alignment;
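/* For example, for struct { int i; double d; } the loop above returns the
   largest field alignment, typically 64 bits, while a zero-sized aggregate
   returns 0.  (The struct here is only an illustration.)  */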
3497 /* Layout a function argument according to the AAPCS64 rules. The rule
3498 numbers refer to the rule numbers in the AAPCS64. */
3500 static void
3501 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3502 const_tree type,
3503 bool named ATTRIBUTE_UNUSED)
3505 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3506 int ncrn, nvrn, nregs;
3507 bool allocate_ncrn, allocate_nvrn;
3508 HOST_WIDE_INT size;
3510 /* We need to do this once per argument. */
3511 if (pcum->aapcs_arg_processed)
3512 return;
3514 pcum->aapcs_arg_processed = true;
3516   /* Size in bytes, rounded up to a multiple of 8 bytes.  */
3517 if (type)
3518 size = int_size_in_bytes (type);
3519 else
3520 /* No frontends can create types with variable-sized modes, so we
3521 shouldn't be asked to pass or return them. */
3522 size = GET_MODE_SIZE (mode).to_constant ();
3523 size = ROUND_UP (size, UNITS_PER_WORD);
3525 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3526 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3527 mode,
3528 type,
3529 &nregs);
3531   /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3532 The following code thus handles passing by SIMD/FP registers first. */
3534 nvrn = pcum->aapcs_nvrn;
3536   /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3537      and homogeneous short-vector aggregates (HVA).  */
3538 if (allocate_nvrn)
3540 if (!TARGET_FLOAT)
3541 aarch64_err_no_fpadvsimd (mode);
3543 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3545 pcum->aapcs_nextnvrn = nvrn + nregs;
3546 if (!aarch64_composite_type_p (type, mode))
3548 gcc_assert (nregs == 1);
3549 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3551 else
3553 rtx par;
3554 int i;
3555 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3556 for (i = 0; i < nregs; i++)
3558 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3559 V0_REGNUM + nvrn + i);
3560 rtx offset = gen_int_mode
3561 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3562 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3563 XVECEXP (par, 0, i) = tmp;
3565 pcum->aapcs_reg = par;
3567 return;
3569 else
3571 /* C.3 NSRN is set to 8. */
3572 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3573 goto on_stack;
3577 ncrn = pcum->aapcs_ncrn;
3578 nregs = size / UNITS_PER_WORD;
3580   /* C6 - C9, though the sign and zero extension semantics are
3581      handled elsewhere.  This is the case where the argument fits
3582      entirely in general registers.  */
3583 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3586 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3588 /* C.8 if the argument has an alignment of 16 then the NGRN is
3589 rounded up to the next even number. */
3590 if (nregs == 2
3591 && ncrn % 2
3592 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3593 comparison is there because for > 16 * BITS_PER_UNIT
3594 alignment nregs should be > 2 and therefore it should be
3595 passed by reference rather than value. */
3596 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3598 ++ncrn;
3599 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3602 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3603 A reg is still generated for it, but the caller should be smart
3604 enough not to use it. */
3605 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3606 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3607 else
3609 rtx par;
3610 int i;
3612 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3613 for (i = 0; i < nregs; i++)
3615 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3616 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3617 GEN_INT (i * UNITS_PER_WORD));
3618 XVECEXP (par, 0, i) = tmp;
3620 pcum->aapcs_reg = par;
3623 pcum->aapcs_nextncrn = ncrn + nregs;
3624 return;
3627 /* C.11 */
3628 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3630   /* The argument is passed on the stack; record the needed number of words for
3631 this argument and align the total size if necessary. */
3632 on_stack:
3633 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3635 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3636 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3637 16 / UNITS_PER_WORD);
3638 return;
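/* Two illustrative layouts under the rules above: an HFA of two floats
   consumes the next two SIMD/FP registers (C1 - C5), returned as a
   PARALLEL of two SFmode pieces, while a 16-byte-aligned integer such as
   __int128 arriving when NGRN is odd first bumps NGRN to the next even
   number (C.8) so that it occupies an aligned register pair.  */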
3641 /* Implement TARGET_FUNCTION_ARG. */
3643 static rtx
3644 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3645 const_tree type, bool named)
3647 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3648 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3650 if (mode == VOIDmode)
3651 return NULL_RTX;
3653 aarch64_layout_arg (pcum_v, mode, type, named);
3654 return pcum->aapcs_reg;
3657 void
3658 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3659 const_tree fntype ATTRIBUTE_UNUSED,
3660 rtx libname ATTRIBUTE_UNUSED,
3661 const_tree fndecl ATTRIBUTE_UNUSED,
3662 unsigned n_named ATTRIBUTE_UNUSED)
3664 pcum->aapcs_ncrn = 0;
3665 pcum->aapcs_nvrn = 0;
3666 pcum->aapcs_nextncrn = 0;
3667 pcum->aapcs_nextnvrn = 0;
3668 pcum->pcs_variant = ARM_PCS_AAPCS64;
3669 pcum->aapcs_reg = NULL_RTX;
3670 pcum->aapcs_arg_processed = false;
3671 pcum->aapcs_stack_words = 0;
3672 pcum->aapcs_stack_size = 0;
3674 if (!TARGET_FLOAT
3675 && fndecl && TREE_PUBLIC (fndecl)
3676 && fntype && fntype != error_mark_node)
3678 const_tree type = TREE_TYPE (fntype);
3679 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3680 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3681 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3682 &mode, &nregs, NULL))
3683 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3685 return;
3688 static void
3689 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3690 machine_mode mode,
3691 const_tree type,
3692 bool named)
3694 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3695 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3697 aarch64_layout_arg (pcum_v, mode, type, named);
3698 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3699 != (pcum->aapcs_stack_words != 0));
3700 pcum->aapcs_arg_processed = false;
3701 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3702 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3703 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3704 pcum->aapcs_stack_words = 0;
3705 pcum->aapcs_reg = NULL_RTX;
3709 bool
3710 aarch64_function_arg_regno_p (unsigned regno)
3712 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3713 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3716 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3717 PARM_BOUNDARY bits of alignment, but will be given anything up
3718 to STACK_BOUNDARY bits if the type requires it. This makes sure
3719 that both before and after the layout of each argument, the Next
3720 Stacked Argument Address (NSAA) will have a minimum alignment of
3721 8 bytes. */
3723 static unsigned int
3724 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3726 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3727 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3730 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3732 static fixed_size_mode
3733 aarch64_get_reg_raw_mode (int regno)
3735 if (TARGET_SVE && FP_REGNUM_P (regno))
3736 /* Don't use the SVE part of the register for __builtin_apply and
3737 __builtin_return. The SVE registers aren't used by the normal PCS,
3738 so using them there would be a waste of time. The PCS extensions
3739 for SVE types are fundamentally incompatible with the
3740 __builtin_return/__builtin_apply interface. */
3741 return as_a <fixed_size_mode> (V16QImode);
3742 return default_get_reg_raw_mode (regno);
3745 /* Implement TARGET_FUNCTION_ARG_PADDING.
3747 Small aggregate types are placed in the lowest memory address.
3749 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3751 static pad_direction
3752 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3754 /* On little-endian targets, the least significant byte of every stack
3755 argument is passed at the lowest byte address of the stack slot. */
3756 if (!BYTES_BIG_ENDIAN)
3757 return PAD_UPWARD;
3759 /* Otherwise, integral, floating-point and pointer types are padded downward:
3760 the least significant byte of a stack argument is passed at the highest
3761 byte address of the stack slot. */
3762 if (type
3763 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3764 || POINTER_TYPE_P (type))
3765 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3766 return PAD_DOWNWARD;
3768 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3769 return PAD_UPWARD;
3772 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3774 It specifies padding for the last (may also be the only)
3775    element of a block move between registers and memory.  Assuming
3776    the block is in memory, padding upward means that the last element
3777    is padded after its most significant byte, while with downward
3778    padding the last element is padded at its least significant
3779    byte side.
3781 Small aggregates and small complex types are always padded
3782 upwards.
3784 We don't need to worry about homogeneous floating-point or
3785 short-vector aggregates; their move is not affected by the
3786 padding direction determined here. Regardless of endianness,
3787 each element of such an aggregate is put in the least
3788 significant bits of a fp/simd register.
3790 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3791 register has useful data, and return the opposite if the most
3792 significant byte does. */
3794 bool
3795 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3796 bool first ATTRIBUTE_UNUSED)
3799 /* Small composite types are always padded upward. */
3800 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3802 HOST_WIDE_INT size;
3803 if (type)
3804 size = int_size_in_bytes (type);
3805 else
3806 /* No frontends can create types with variable-sized modes, so we
3807 shouldn't be asked to pass or return them. */
3808 size = GET_MODE_SIZE (mode).to_constant ();
3809 if (size < 2 * UNITS_PER_WORD)
3810 return true;
3813 /* Otherwise, use the default padding. */
3814 return !BYTES_BIG_ENDIAN;
3817 static scalar_int_mode
3818 aarch64_libgcc_cmp_return_mode (void)
3820 return SImode;
3823 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3825 /* We use the 12-bit shifted immediate arithmetic instructions so values
3826    must be a multiple of (1 << 12), i.e. 4096.  */
3827 #define ARITH_FACTOR 4096
3829 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3830 #error Cannot use simple address calculation for stack probing
3831 #endif
3833 /* The pair of scratch registers used for stack probing. */
3834 #define PROBE_STACK_FIRST_REG 9
3835 #define PROBE_STACK_SECOND_REG 10
3837 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3838 inclusive. These are offsets from the current stack pointer. */
3840 static void
3841 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3843 HOST_WIDE_INT size;
3844 if (!poly_size.is_constant (&size))
3846 sorry ("stack probes for SVE frames");
3847 return;
3850 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3852 /* See the same assertion on PROBE_INTERVAL above. */
3853 gcc_assert ((first % ARITH_FACTOR) == 0);
3855 /* See if we have a constant small number of probes to generate. If so,
3856 that's the easy case. */
3857 if (size <= PROBE_INTERVAL)
3859 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3861 emit_set_insn (reg1,
3862 plus_constant (Pmode,
3863 stack_pointer_rtx, -(first + base)));
3864 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3867 /* The run-time loop is made up of 8 insns in the generic case while the
3868    compile-time loop is made up of 4+2*(n-2) insns, where n is the number of intervals.  */
3869 else if (size <= 4 * PROBE_INTERVAL)
3871 HOST_WIDE_INT i, rem;
3873 emit_set_insn (reg1,
3874 plus_constant (Pmode,
3875 stack_pointer_rtx,
3876 -(first + PROBE_INTERVAL)));
3877 emit_stack_probe (reg1);
3879 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3880 it exceeds SIZE. If only two probes are needed, this will not
3881 generate any code. Then probe at FIRST + SIZE. */
3882 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3884 emit_set_insn (reg1,
3885 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3886 emit_stack_probe (reg1);
3889 rem = size - (i - PROBE_INTERVAL);
3890 if (rem > 256)
3892 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3894 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3895 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3897 else
3898 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3901 /* Otherwise, do the same as above, but in a loop. Note that we must be
3902 extra careful with variables wrapping around because we might be at
3903 the very top (or the very bottom) of the address space and we have
3904 to be able to handle this case properly; in particular, we use an
3905 equality test for the loop condition. */
3906 else
3908 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3910 /* Step 1: round SIZE to the previous multiple of the interval. */
3912 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3915 /* Step 2: compute initial and final value of the loop counter. */
3917 /* TEST_ADDR = SP + FIRST. */
3918 emit_set_insn (reg1,
3919 plus_constant (Pmode, stack_pointer_rtx, -first));
3921 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3922 HOST_WIDE_INT adjustment = - (first + rounded_size);
3923 if (! aarch64_uimm12_shift (adjustment))
3925 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3926 true, Pmode);
3927 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3929 else
3930 emit_set_insn (reg2,
3931 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3933 /* Step 3: the loop
3937 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3938 probe at TEST_ADDR
3940 while (TEST_ADDR != LAST_ADDR)
3942 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3943 until it is equal to ROUNDED_SIZE. */
3945 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3948 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3949 that SIZE is equal to ROUNDED_SIZE. */
3951 if (size != rounded_size)
3953 HOST_WIDE_INT rem = size - rounded_size;
3955 if (rem > 256)
3957 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3959 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3960 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3962 else
3963 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3967 /* Make sure nothing is scheduled before we are done. */
3968 emit_insn (gen_blockage ());
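/* As a rough illustration of the second case above: with PROBE_INTERVAL
   == 4096, FIRST == 16384 and a constant SIZE of 8192, the emitted
   sequence is approximately:

	sub	x9, sp, #20480		// FIRST + PROBE_INTERVAL below SP
	str	xzr, [x9]		// probe
	sub	x9, x9, #4096
	str	xzr, [x9]		// probe at FIRST + SIZE

   Larger constant sizes fall back to the run-time loop output by
   aarch64_output_probe_stack_range below.  */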
3971 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3972 absolute addresses. */
3974 const char *
3975 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3977 static int labelno = 0;
3978 char loop_lab[32];
3979 rtx xops[2];
3981 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3983 /* Loop. */
3984 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3986 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3987 xops[0] = reg1;
3988 xops[1] = GEN_INT (PROBE_INTERVAL);
3989 output_asm_insn ("sub\t%0, %0, %1", xops);
3991 /* Probe at TEST_ADDR. */
3992 output_asm_insn ("str\txzr, [%0]", xops);
3994 /* Test if TEST_ADDR == LAST_ADDR. */
3995 xops[1] = reg2;
3996 output_asm_insn ("cmp\t%0, %1", xops);
3998 /* Branch. */
3999 fputs ("\tb.ne\t", asm_out_file);
4000 assemble_name_raw (asm_out_file, loop_lab);
4001 fputc ('\n', asm_out_file);
4003 return "";
4006 /* Determine whether a frame chain needs to be generated. */
4007 static bool
4008 aarch64_needs_frame_chain (void)
4010 /* Force a frame chain for EH returns so the return address is at FP+8. */
4011 if (frame_pointer_needed || crtl->calls_eh_return)
4012 return true;
4014 /* A leaf function cannot have calls or write LR. */
4015 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4017 /* Don't use a frame chain in leaf functions if leaf frame pointers
4018 are disabled. */
4019 if (flag_omit_leaf_frame_pointer && is_leaf)
4020 return false;
4022 return aarch64_use_frame_pointer;
4025 /* Mark the registers that need to be saved by the callee and calculate
4026 the size of the callee-saved registers area and frame record (both FP
4027 and LR may be omitted). */
4028 static void
4029 aarch64_layout_frame (void)
4031 HOST_WIDE_INT offset = 0;
4032 int regno, last_fp_reg = INVALID_REGNUM;
4034 if (reload_completed && cfun->machine->frame.laid_out)
4035 return;
4037 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4039 #define SLOT_NOT_REQUIRED (-2)
4040 #define SLOT_REQUIRED (-1)
4042 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4043 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4045 /* First mark all the registers that really need to be saved... */
4046 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4047 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4049 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4050 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4052 /* ... that includes the eh data registers (if needed)... */
4053 if (crtl->calls_eh_return)
4054 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4055 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4056 = SLOT_REQUIRED;
4058 /* ... and any callee saved register that dataflow says is live. */
4059 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4060 if (df_regs_ever_live_p (regno)
4061 && (regno == R30_REGNUM
4062 || !call_used_regs[regno]))
4063 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4065 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4066 if (df_regs_ever_live_p (regno)
4067 && !call_used_regs[regno])
4069 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4070 last_fp_reg = regno;
4073 if (cfun->machine->frame.emit_frame_chain)
4075 /* FP and LR are placed in the linkage record. */
4076 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4077 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4078 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4079 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4080 offset = 2 * UNITS_PER_WORD;
4083 /* Now assign stack slots for them. */
4084 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4085 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4087 cfun->machine->frame.reg_offset[regno] = offset;
4088 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4089 cfun->machine->frame.wb_candidate1 = regno;
4090 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4091 cfun->machine->frame.wb_candidate2 = regno;
4092 offset += UNITS_PER_WORD;
4095 HOST_WIDE_INT max_int_offset = offset;
4096 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4097 bool has_align_gap = offset != max_int_offset;
4099 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4100 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4102 /* If there is an alignment gap between integer and fp callee-saves,
4103 allocate the last fp register to it if possible. */
4104 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4106 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4107 break;
4110 cfun->machine->frame.reg_offset[regno] = offset;
4111 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4112 cfun->machine->frame.wb_candidate1 = regno;
4113 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4114 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4115 cfun->machine->frame.wb_candidate2 = regno;
4116 offset += UNITS_PER_WORD;
4119 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4121 cfun->machine->frame.saved_regs_size = offset;
4123 HOST_WIDE_INT varargs_and_saved_regs_size
4124 = offset + cfun->machine->frame.saved_varargs_size;
4126 cfun->machine->frame.hard_fp_offset
4127 = aligned_upper_bound (varargs_and_saved_regs_size
4128 + get_frame_size (),
4129 STACK_BOUNDARY / BITS_PER_UNIT);
4131 /* Both these values are already aligned. */
4132 gcc_assert (multiple_p (crtl->outgoing_args_size,
4133 STACK_BOUNDARY / BITS_PER_UNIT));
4134 cfun->machine->frame.frame_size
4135 = (cfun->machine->frame.hard_fp_offset
4136 + crtl->outgoing_args_size);
4138 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4140 cfun->machine->frame.initial_adjust = 0;
4141 cfun->machine->frame.final_adjust = 0;
4142 cfun->machine->frame.callee_adjust = 0;
4143 cfun->machine->frame.callee_offset = 0;
4145 HOST_WIDE_INT max_push_offset = 0;
4146 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4147 max_push_offset = 512;
4148 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4149 max_push_offset = 256;
4151 HOST_WIDE_INT const_size, const_fp_offset;
4152 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4153 && const_size < max_push_offset
4154 && known_eq (crtl->outgoing_args_size, 0))
4156 /* Simple, small frame with no outgoing arguments:
4157 stp reg1, reg2, [sp, -frame_size]!
4158 stp reg3, reg4, [sp, 16] */
4159 cfun->machine->frame.callee_adjust = const_size;
4161 else if (known_lt (crtl->outgoing_args_size
4162 + cfun->machine->frame.saved_regs_size, 512)
4163 && !(cfun->calls_alloca
4164 && known_lt (cfun->machine->frame.hard_fp_offset,
4165 max_push_offset)))
4167 /* Frame with small outgoing arguments:
4168 sub sp, sp, frame_size
4169 stp reg1, reg2, [sp, outgoing_args_size]
4170 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4171 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4172 cfun->machine->frame.callee_offset
4173 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4175 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4176 && const_fp_offset < max_push_offset)
4178 /* Frame with large outgoing arguments but a small local area:
4179 stp reg1, reg2, [sp, -hard_fp_offset]!
4180 stp reg3, reg4, [sp, 16]
4181 sub sp, sp, outgoing_args_size */
4182 cfun->machine->frame.callee_adjust = const_fp_offset;
4183 cfun->machine->frame.final_adjust
4184 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4186 else
4188 /* Frame with large local area and outgoing arguments using frame pointer:
4189 sub sp, sp, hard_fp_offset
4190 stp x29, x30, [sp, 0]
4191 add x29, sp, 0
4192 stp reg3, reg4, [sp, 16]
4193 sub sp, sp, outgoing_args_size */
4194 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4195 cfun->machine->frame.final_adjust
4196 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4199 cfun->machine->frame.laid_out = true;
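/* Rough illustration of the first case above: a function that needs a
   frame chain, saves x19 and x20, and has 16 bytes of locals and no
   outgoing arguments gets reg_offset[] = { x29: 0, x30: 8, x19: 16,
   x20: 24 }, hard_fp_offset == frame_size == 48, hence
   callee_adjust == 48 and a prologue of approximately:

	stp	x29, x30, [sp, -48]!
	mov	x29, sp
	stp	x19, x20, [sp, 16]  */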
4202 /* Return true if the register REGNO is saved on entry to
4203 the current function. */
4205 static bool
4206 aarch64_register_saved_on_entry (int regno)
4208 return cfun->machine->frame.reg_offset[regno] >= 0;
4211 /* Return the next register, from REGNO up to LIMIT, that the callee
4212    needs to save.  */
4214 static unsigned
4215 aarch64_next_callee_save (unsigned regno, unsigned limit)
4217 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4218 regno ++;
4219 return regno;
4222 /* Push the register number REGNO of mode MODE to the stack with write-back
4223 adjusting the stack by ADJUSTMENT. */
4225 static void
4226 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4227 HOST_WIDE_INT adjustment)
4229 rtx base_rtx = stack_pointer_rtx;
4230 rtx insn, reg, mem;
4232 reg = gen_rtx_REG (mode, regno);
4233 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4234 plus_constant (Pmode, base_rtx, -adjustment));
4235 mem = gen_frame_mem (mode, mem);
4237 insn = emit_move_insn (mem, reg);
4238 RTX_FRAME_RELATED_P (insn) = 1;
4241 /* Generate and return an instruction to store the pair of registers
4242 REG and REG2 of mode MODE to location BASE with write-back adjusting
4243 the stack location BASE by ADJUSTMENT. */
4245 static rtx
4246 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4247 HOST_WIDE_INT adjustment)
4249 switch (mode)
4251 case E_DImode:
4252 return gen_storewb_pairdi_di (base, base, reg, reg2,
4253 GEN_INT (-adjustment),
4254 GEN_INT (UNITS_PER_WORD - adjustment));
4255 case E_DFmode:
4256 return gen_storewb_pairdf_di (base, base, reg, reg2,
4257 GEN_INT (-adjustment),
4258 GEN_INT (UNITS_PER_WORD - adjustment));
4259 default:
4260 gcc_unreachable ();
4264 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4265 stack pointer by ADJUSTMENT. */
4267 static void
4268 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4270 rtx_insn *insn;
4271 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4273 if (regno2 == INVALID_REGNUM)
4274 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4276 rtx reg1 = gen_rtx_REG (mode, regno1);
4277 rtx reg2 = gen_rtx_REG (mode, regno2);
4279 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4280 reg2, adjustment));
4281 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4282 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4283 RTX_FRAME_RELATED_P (insn) = 1;
4286 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4287 adjusting it by ADJUSTMENT afterwards. */
4289 static rtx
4290 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4291 HOST_WIDE_INT adjustment)
4293 switch (mode)
4295 case E_DImode:
4296 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4297 GEN_INT (UNITS_PER_WORD));
4298 case E_DFmode:
4299 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4300 GEN_INT (UNITS_PER_WORD));
4301 default:
4302 gcc_unreachable ();
4306 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4307 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4308 into CFI_OPS. */
4310 static void
4311 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4312 rtx *cfi_ops)
4314 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4315 rtx reg1 = gen_rtx_REG (mode, regno1);
4317 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4319 if (regno2 == INVALID_REGNUM)
4321 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4322 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4323 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4325 else
4327 rtx reg2 = gen_rtx_REG (mode, regno2);
4328 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4329 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4330 reg2, adjustment));
4334 /* Generate and return a store pair instruction of mode MODE to store
4335 register REG1 to MEM1 and register REG2 to MEM2. */
4337 static rtx
4338 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4339 rtx reg2)
4341 switch (mode)
4343 case E_DImode:
4344 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4346 case E_DFmode:
4347 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4349 default:
4350 gcc_unreachable ();
4354 /* Generate and return a load pair instruction of mode MODE to load register
4355 REG1 from MEM1 and register REG2 from MEM2. */
4357 static rtx
4358 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4359 rtx mem2)
4361 switch (mode)
4363 case E_DImode:
4364 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4366 case E_DFmode:
4367 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4369 default:
4370 gcc_unreachable ();
4374 /* Return TRUE if return address signing should be enabled for the current
4375 function, otherwise return FALSE. */
4377 bool
4378 aarch64_return_address_signing_enabled (void)
4380   /* This function should only be called after the frame is laid out.  */
4381 gcc_assert (cfun->machine->frame.laid_out);
4383 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4384      if its LR is pushed onto the stack.  */
4385 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4386 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4387 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4390 /* Emit code to save the callee-saved registers from register number START
4391 to LIMIT to the stack at the location starting at offset START_OFFSET,
4392 skipping any write-back candidates if SKIP_WB is true. */
4394 static void
4395 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4396 unsigned start, unsigned limit, bool skip_wb)
4398 rtx_insn *insn;
4399 unsigned regno;
4400 unsigned regno2;
4402 for (regno = aarch64_next_callee_save (start, limit);
4403 regno <= limit;
4404 regno = aarch64_next_callee_save (regno + 1, limit))
4406 rtx reg, mem;
4407 poly_int64 offset;
4409 if (skip_wb
4410 && (regno == cfun->machine->frame.wb_candidate1
4411 || regno == cfun->machine->frame.wb_candidate2))
4412 continue;
4414 if (cfun->machine->reg_is_wrapped_separately[regno])
4415 continue;
4417 reg = gen_rtx_REG (mode, regno);
4418 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4419 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4420 offset));
4422 regno2 = aarch64_next_callee_save (regno + 1, limit);
4424 if (regno2 <= limit
4425 && !cfun->machine->reg_is_wrapped_separately[regno2]
4426 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4427 == cfun->machine->frame.reg_offset[regno2]))
4430 rtx reg2 = gen_rtx_REG (mode, regno2);
4431 rtx mem2;
4433 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4434 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4435 offset));
4436 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4437 reg2));
4439 /* The first part of a frame-related parallel insn is
4440 always assumed to be relevant to the frame
4441      calculations; subsequent parts are only
4442 frame-related if explicitly marked. */
4443 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4444 regno = regno2;
4446 else
4447 insn = emit_move_insn (mem, reg);
4449 RTX_FRAME_RELATED_P (insn) = 1;
4453 /* Emit code to restore the callee registers of mode MODE from register
4454 number START up to and including LIMIT. Restore from the stack offset
4455 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4456 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4458 static void
4459 aarch64_restore_callee_saves (machine_mode mode,
4460 poly_int64 start_offset, unsigned start,
4461 unsigned limit, bool skip_wb, rtx *cfi_ops)
4463 rtx base_rtx = stack_pointer_rtx;
4464 unsigned regno;
4465 unsigned regno2;
4466 poly_int64 offset;
4468 for (regno = aarch64_next_callee_save (start, limit);
4469 regno <= limit;
4470 regno = aarch64_next_callee_save (regno + 1, limit))
4472 if (cfun->machine->reg_is_wrapped_separately[regno])
4473 continue;
4475 rtx reg, mem;
4477 if (skip_wb
4478 && (regno == cfun->machine->frame.wb_candidate1
4479 || regno == cfun->machine->frame.wb_candidate2))
4480 continue;
4482 reg = gen_rtx_REG (mode, regno);
4483 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4484 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4486 regno2 = aarch64_next_callee_save (regno + 1, limit);
4488 if (regno2 <= limit
4489 && !cfun->machine->reg_is_wrapped_separately[regno2]
4490 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4491 == cfun->machine->frame.reg_offset[regno2]))
4493 rtx reg2 = gen_rtx_REG (mode, regno2);
4494 rtx mem2;
4496 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4497 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4498 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4500 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4501 regno = regno2;
4503 else
4504 emit_move_insn (reg, mem);
4505 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4509 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4510 of MODE. */
4512 static inline bool
4513 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4515 HOST_WIDE_INT multiple;
4516 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4517 && IN_RANGE (multiple, -8, 7));
4520 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4521 of MODE. */
4523 static inline bool
4524 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4526 HOST_WIDE_INT multiple;
4527 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4528 && IN_RANGE (multiple, 0, 63));
4531 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4532 of MODE. */
4534 bool
4535 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4537 HOST_WIDE_INT multiple;
4538 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4539 && IN_RANGE (multiple, -64, 63));
4542 /* Return true if OFFSET is a signed 9-bit value. */
4544 static inline bool
4545 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4546 poly_int64 offset)
4548 HOST_WIDE_INT const_offset;
4549 return (offset.is_constant (&const_offset)
4550 && IN_RANGE (const_offset, -256, 255));
4553 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4554 of MODE. */
4556 static inline bool
4557 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4559 HOST_WIDE_INT multiple;
4560 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4561 && IN_RANGE (multiple, -256, 255));
4564 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4565 of MODE. */
4567 static inline bool
4568 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4570 HOST_WIDE_INT multiple;
4571 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4572 && IN_RANGE (multiple, 0, 4095));
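/* Taking MODE == DImode (8-byte units) as an example, the predicates
   above accept the following byte offsets:

     offset_4bit_signed_scaled_p		-64 .. 56, multiples of 8
     offset_6bit_unsigned_scaled_p		0 .. 504, multiples of 8
     aarch64_offset_7bit_signed_scaled_p	-512 .. 504, multiples of 8
     offset_9bit_signed_unscaled_p		-256 .. 255, any byte offset
     offset_9bit_signed_scaled_p		-2048 .. 2040, multiples of 8
     offset_12bit_unsigned_scaled_p		0 .. 32760, multiples of 8

   The 7-bit scaled, 9-bit unscaled and 12-bit scaled forms correspond to
   the immediate ranges of LDP/STP, LDUR/STUR and LDR/STR with an
   unsigned offset respectively.  */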
4575 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4577 static sbitmap
4578 aarch64_get_separate_components (void)
4580 aarch64_layout_frame ();
4582 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4583 bitmap_clear (components);
4585 /* The registers we need saved to the frame. */
4586 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4587 if (aarch64_register_saved_on_entry (regno))
4589 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4590 if (!frame_pointer_needed)
4591 offset += cfun->machine->frame.frame_size
4592 - cfun->machine->frame.hard_fp_offset;
4593 /* Check that we can access the stack slot of the register with one
4594 direct load with no adjustments needed. */
4595 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4596 bitmap_set_bit (components, regno);
4599 /* Don't mess with the hard frame pointer. */
4600 if (frame_pointer_needed)
4601 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4603 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4604 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4605 /* If aarch64_layout_frame has chosen registers to store/restore with
4606      writeback, don't interfere with them to avoid having to output explicit
4607 stack adjustment instructions. */
4608 if (reg2 != INVALID_REGNUM)
4609 bitmap_clear_bit (components, reg2);
4610 if (reg1 != INVALID_REGNUM)
4611 bitmap_clear_bit (components, reg1);
4613 bitmap_clear_bit (components, LR_REGNUM);
4614 bitmap_clear_bit (components, SP_REGNUM);
4616 return components;
4619 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4621 static sbitmap
4622 aarch64_components_for_bb (basic_block bb)
4624 bitmap in = DF_LIVE_IN (bb);
4625 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4626 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4628 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4629 bitmap_clear (components);
4631 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4632 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4633 if ((!call_used_regs[regno])
4634 && (bitmap_bit_p (in, regno)
4635 || bitmap_bit_p (gen, regno)
4636 || bitmap_bit_p (kill, regno)))
4638 unsigned regno2, offset, offset2;
4639 bitmap_set_bit (components, regno);
4641 /* If there is a callee-save at an adjacent offset, add it too
4642 to increase the use of LDP/STP. */
4643 offset = cfun->machine->frame.reg_offset[regno];
4644 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4646 if (regno2 <= LAST_SAVED_REGNUM)
4648 offset2 = cfun->machine->frame.reg_offset[regno2];
4649 if ((offset & ~8) == (offset2 & ~8))
4650 bitmap_set_bit (components, regno2);
4654 return components;
4657 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4658 Nothing to do for aarch64. */
4660 static void
4661 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4665 /* Return the next set bit in BMP from START onwards. Return the total number
4666 of bits in BMP if no set bit is found at or after START. */
4668 static unsigned int
4669 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4671 unsigned int nbits = SBITMAP_SIZE (bmp);
4672 if (start == nbits)
4673 return start;
4675 gcc_assert (start < nbits);
4676 for (unsigned int i = start; i < nbits; i++)
4677 if (bitmap_bit_p (bmp, i))
4678 return i;
4680 return nbits;
4683 /* Do the work for aarch64_emit_prologue_components and
4684 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4685 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4686 for these components or the epilogue sequence. That is, it determines
4687 whether we should emit stores or loads and what kind of CFA notes to attach
4688 to the insns. Otherwise the logic for the two sequences is very
4689 similar. */
4691 static void
4692 aarch64_process_components (sbitmap components, bool prologue_p)
4694 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4695 ? HARD_FRAME_POINTER_REGNUM
4696 : STACK_POINTER_REGNUM);
4698 unsigned last_regno = SBITMAP_SIZE (components);
4699 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4700 rtx_insn *insn = NULL;
4702 while (regno != last_regno)
4704 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4705 so DFmode for the vector registers is enough. */
4706 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4707 rtx reg = gen_rtx_REG (mode, regno);
4708 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4709 if (!frame_pointer_needed)
4710 offset += cfun->machine->frame.frame_size
4711 - cfun->machine->frame.hard_fp_offset;
4712 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4713 rtx mem = gen_frame_mem (mode, addr);
4715 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4716 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4717 /* No more registers to handle after REGNO.
4718 Emit a single save/restore and exit. */
4719 if (regno2 == last_regno)
4721 insn = emit_insn (set);
4722 RTX_FRAME_RELATED_P (insn) = 1;
4723 if (prologue_p)
4724 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4725 else
4726 add_reg_note (insn, REG_CFA_RESTORE, reg);
4727 break;
4730 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4731 /* The next register is not of the same class or its offset is not
4732 mergeable with the current one into a pair. */
4733 if (!satisfies_constraint_Ump (mem)
4734 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4735 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4736 GET_MODE_SIZE (mode)))
4738 insn = emit_insn (set);
4739 RTX_FRAME_RELATED_P (insn) = 1;
4740 if (prologue_p)
4741 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4742 else
4743 add_reg_note (insn, REG_CFA_RESTORE, reg);
4745 regno = regno2;
4746 continue;
4749 /* REGNO2 can be saved/restored in a pair with REGNO. */
4750 rtx reg2 = gen_rtx_REG (mode, regno2);
4751 if (!frame_pointer_needed)
4752 offset2 += cfun->machine->frame.frame_size
4753 - cfun->machine->frame.hard_fp_offset;
4754 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4755 rtx mem2 = gen_frame_mem (mode, addr2);
4756 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4757 : gen_rtx_SET (reg2, mem2);
4759 if (prologue_p)
4760 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4761 else
4762 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4764 RTX_FRAME_RELATED_P (insn) = 1;
4765 if (prologue_p)
4767 add_reg_note (insn, REG_CFA_OFFSET, set);
4768 add_reg_note (insn, REG_CFA_OFFSET, set2);
4770 else
4772 add_reg_note (insn, REG_CFA_RESTORE, reg);
4773 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4776 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4780 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4782 static void
4783 aarch64_emit_prologue_components (sbitmap components)
4785 aarch64_process_components (components, true);
4788 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4790 static void
4791 aarch64_emit_epilogue_components (sbitmap components)
4793 aarch64_process_components (components, false);
4796 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4798 static void
4799 aarch64_set_handled_components (sbitmap components)
4801 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4802 if (bitmap_bit_p (components, regno))
4803 cfun->machine->reg_is_wrapped_separately[regno] = true;
4806 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4807 is saved at BASE + OFFSET. */
4809 static void
4810 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4811 rtx base, poly_int64 offset)
4813 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4814 add_reg_note (insn, REG_CFA_EXPRESSION,
4815 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4818 /* AArch64 stack frames generated by this compiler look like:
4820 +-------------------------------+
4822 | incoming stack arguments |
4824 +-------------------------------+
4825 | | <-- incoming stack pointer (aligned)
4826 | callee-allocated save area |
4827 | for register varargs |
4829 +-------------------------------+
4830 | local variables | <-- frame_pointer_rtx
4832 +-------------------------------+
4833 | padding0 | \
4834 +-------------------------------+ |
4835 | callee-saved registers | | frame.saved_regs_size
4836 +-------------------------------+ |
4837 | LR' | |
4838 +-------------------------------+ |
4839 | FP' | / <- hard_frame_pointer_rtx (aligned)
4840 +-------------------------------+
4841 | dynamic allocation |
4842 +-------------------------------+
4843 | padding |
4844 +-------------------------------+
4845 | outgoing stack arguments | <-- arg_pointer
4847 +-------------------------------+
4848 | | <-- stack_pointer_rtx (aligned)
4850 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4851 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4852 unchanged. */
4854 /* Generate the prologue instructions for entry into a function.
4855 Establish the stack frame by decreasing the stack pointer with a
4856 properly calculated size and, if necessary, create a frame record
4857 filled with the values of LR and previous frame pointer. The
4858 current FP is also set up if it is in use. */
4860 void
4861 aarch64_expand_prologue (void)
4863 aarch64_layout_frame ();
4865 poly_int64 frame_size = cfun->machine->frame.frame_size;
4866 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4867 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4868 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4869 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4870 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4871 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4872 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4873 rtx_insn *insn;
4875 /* Sign return address for functions. */
4876 if (aarch64_return_address_signing_enabled ())
4878 insn = emit_insn (gen_pacisp ());
4879 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4880 RTX_FRAME_RELATED_P (insn) = 1;
4883 if (flag_stack_usage_info)
4884 current_function_static_stack_size = constant_lower_bound (frame_size);
4886 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4888 if (crtl->is_leaf && !cfun->calls_alloca)
4890 if (maybe_gt (frame_size, PROBE_INTERVAL)
4891 && maybe_gt (frame_size, get_stack_check_protect ()))
4892 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4893 (frame_size
4894 - get_stack_check_protect ()));
4896 else if (maybe_gt (frame_size, 0))
4897 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4900 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4901 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4903 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4905 if (callee_adjust != 0)
4906 aarch64_push_regs (reg1, reg2, callee_adjust);
4908 if (emit_frame_chain)
4910 poly_int64 reg_offset = callee_adjust;
4911 if (callee_adjust == 0)
4913 reg1 = R29_REGNUM;
4914 reg2 = R30_REGNUM;
4915 reg_offset = callee_offset;
4916 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4918 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4919 stack_pointer_rtx, callee_offset,
4920 ip1_rtx, ip0_rtx, frame_pointer_needed);
4921 if (frame_pointer_needed && !frame_size.is_constant ())
4923 /* Variable-sized frames need to describe the save slot
4924 address using DW_CFA_expression rather than DW_CFA_offset.
4925 This means that, without taking further action, the
4926 locations of the registers that we've already saved would
4927 remain based on the stack pointer even after we redefine
4928 the CFA based on the frame pointer. We therefore need new
4929 DW_CFA_expressions to re-express the save slots with addresses
4930 based on the frame pointer. */
4931 rtx_insn *insn = get_last_insn ();
4932 gcc_assert (RTX_FRAME_RELATED_P (insn));
4934 /* Add an explicit CFA definition if this was previously
4935 implicit. */
4936 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4938 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4939 callee_offset);
4940 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4941 gen_rtx_SET (hard_frame_pointer_rtx, src));
4944 /* Change the save slot expressions for the registers that
4945 we've already saved. */
4946 reg_offset -= callee_offset;
4947 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4948 reg_offset + UNITS_PER_WORD);
4949 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4950 reg_offset);
4952 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4955 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4956 callee_adjust != 0 || emit_frame_chain);
4957 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4958 callee_adjust != 0 || emit_frame_chain);
4959 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4962 /* Return TRUE if we can use a simple_return insn.
4964    This function checks whether the callee-saved stack is empty, which
4965    means no restore actions are needed.  The pro_and_epilogue pass will use
4966 this to check whether shrink-wrapping opt is feasible. */
4968 bool
4969 aarch64_use_return_insn_p (void)
4971 if (!reload_completed)
4972 return false;
4974 if (crtl->profile)
4975 return false;
4977 aarch64_layout_frame ();
4979 return known_eq (cfun->machine->frame.frame_size, 0);
4982 /* Generate the epilogue instructions for returning from a function.
4983 This is almost exactly the reverse of the prolog sequence, except
4984 that we need to insert barriers to avoid scheduling loads that read
4985 from a deallocated stack, and we optimize the unwind records by
4986 emitting them all together if possible. */
4987 void
4988 aarch64_expand_epilogue (bool for_sibcall)
4990 aarch64_layout_frame ();
4992 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4993 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4994 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4995 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4996 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4997 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4998 rtx cfi_ops = NULL;
4999 rtx_insn *insn;
5000 /* A stack clash protection prologue may not have left IP0_REGNUM or
5001 IP1_REGNUM in a usable state. The same is true for allocations
5002 with an SVE component, since we then need both temporary registers
5003 for each allocation. */
5004 bool can_inherit_p = (initial_adjust.is_constant ()
5005 && final_adjust.is_constant ()
5006 && !flag_stack_clash_protection);
5008   /* We need to add a memory barrier to prevent reads from the deallocated stack.  */
5009 bool need_barrier_p
5010 = maybe_ne (get_frame_size ()
5011 + cfun->machine->frame.saved_varargs_size, 0);
5013 /* Emit a barrier to prevent loads from a deallocated stack. */
5014 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5015 || cfun->calls_alloca
5016 || crtl->calls_eh_return)
5018 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5019 need_barrier_p = false;
5022 /* Restore the stack pointer from the frame pointer if it may not
5023 be the same as the stack pointer. */
5024 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5025 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5026 if (frame_pointer_needed
5027 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5028 /* If writeback is used when restoring callee-saves, the CFA
5029 is restored on the instruction doing the writeback. */
5030 aarch64_add_offset (Pmode, stack_pointer_rtx,
5031 hard_frame_pointer_rtx, -callee_offset,
5032 ip1_rtx, ip0_rtx, callee_adjust == 0);
5033 else
5034 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5035 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5037 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5038 callee_adjust != 0, &cfi_ops);
5039 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5040 callee_adjust != 0, &cfi_ops);
5042 if (need_barrier_p)
5043 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5045 if (callee_adjust != 0)
5046 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5048 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5050 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5051 insn = get_last_insn ();
5052 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5053 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5054 RTX_FRAME_RELATED_P (insn) = 1;
5055 cfi_ops = NULL;
5058 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5059 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5061 if (cfi_ops)
5063 /* Emit delayed restores and reset the CFA to be SP. */
5064 insn = get_last_insn ();
5065 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5066 REG_NOTES (insn) = cfi_ops;
5067 RTX_FRAME_RELATED_P (insn) = 1;
5070 /* We prefer to emit the combined return/authenticate instruction RETAA,
5071 however there are three cases in which we must instead emit an explicit
5072 authentication instruction.
5074 1) Sibcalls don't return in a normal way, so if we're about to call one
5075 we must authenticate.
5077 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5078 generating code for !TARGET_ARMV8_3 we can't use it and must
5079 explicitly authenticate.
5081 3) On an eh_return path we make extra stack adjustments to update the
5082 canonical frame address to be the exception handler's CFA. We want
5083 	   to authenticate using the CFA of the function which calls eh_return.  */
5085 if (aarch64_return_address_signing_enabled ()
5086 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5088 insn = emit_insn (gen_autisp ());
5089 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5090 RTX_FRAME_RELATED_P (insn) = 1;
5093 /* Stack adjustment for exception handler. */
5094 if (crtl->calls_eh_return)
5096 /* We need to unwind the stack by the offset computed by
5097 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5098 to be SP; letting the CFA move during this adjustment
5099 is just as correct as retaining the CFA from the body
5100 of the function. Therefore, do nothing special. */
5101 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5104 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5105 if (!for_sibcall)
5106 emit_jump_insn (ret_rtx);
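/* Continuing the small-frame illustration from aarch64_layout_frame
   above, the matching epilogue is approximately:

	ldp	x19, x20, [sp, 16]
	ldp	x29, x30, [sp], 48
	ret  */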
5109 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5110 normally or return to a previous frame after unwinding.
5112 An EH return uses a single shared return sequence. The epilogue is
5113 exactly like a normal epilogue except that it has an extra input
5114 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5115 that must be applied after the frame has been destroyed. An extra label
5116 is inserted before the epilogue which initializes this register to zero,
5117 and this is the entry point for a normal return.
5119 An actual EH return updates the return address, initializes the stack
5120 adjustment and jumps directly into the epilogue (bypassing the zeroing
5121 of the adjustment). Since the return address is typically saved on the
5122 stack when a function makes a call, the saved LR must be updated outside
5123 the epilogue.
5125 This poses problems as the store is generated well before the epilogue,
5126 so the offset of LR is not known yet. Also optimizations will remove the
5127 store as it appears dead, even after the epilogue is generated (as the
5128 base or offset for loading LR is different in many cases).
5130 To avoid these problems this implementation forces the frame pointer
5131 in eh_return functions so that the location of LR is fixed and known early.
5132 It also marks the store volatile, so no optimization is permitted to
5133 remove the store. */
5134 rtx
5135 aarch64_eh_return_handler_rtx (void)
5137 rtx tmp = gen_frame_mem (Pmode,
5138 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5140 /* Mark the store volatile, so no optimization is permitted to remove it. */
5141 MEM_VOLATILE_P (tmp) = true;
5142 return tmp;
5145 /* Output code to add DELTA to the first argument, and then jump
5146 to FUNCTION. Used for C++ multiple inheritance. */
5147 static void
5148 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5149 HOST_WIDE_INT delta,
5150 HOST_WIDE_INT vcall_offset,
5151 tree function)
5153 /* The this pointer is always in x0. Note that this differs from
5154      Arm where the this pointer may be bumped to r1 if r0 is required
5155 to return a pointer to an aggregate. On AArch64 a result value
5156 pointer will be in x8. */
5157 int this_regno = R0_REGNUM;
5158 rtx this_rtx, temp0, temp1, addr, funexp;
5159 rtx_insn *insn;
5161 reload_completed = 1;
5162 emit_note (NOTE_INSN_PROLOGUE_END);
5164 this_rtx = gen_rtx_REG (Pmode, this_regno);
5165 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5166 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5168 if (vcall_offset == 0)
5169 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5170 else
5172 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5174 addr = this_rtx;
5175 if (delta != 0)
5177 if (delta >= -256 && delta < 256)
5178 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5179 plus_constant (Pmode, this_rtx, delta));
5180 else
5181 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5182 temp1, temp0, false);
5185 if (Pmode == ptr_mode)
5186 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5187 else
5188 aarch64_emit_move (temp0,
5189 gen_rtx_ZERO_EXTEND (Pmode,
5190 gen_rtx_MEM (ptr_mode, addr)));
5192 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5193 addr = plus_constant (Pmode, temp0, vcall_offset);
5194 else
5196 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5197 Pmode);
5198 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5201 if (Pmode == ptr_mode)
5202 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
5203 else
5204 aarch64_emit_move (temp1,
5205 gen_rtx_SIGN_EXTEND (Pmode,
5206 gen_rtx_MEM (ptr_mode, addr)));
5208 emit_insn (gen_add2_insn (this_rtx, temp1));
5211 /* Generate a tail call to the target function. */
5212 if (!TREE_USED (function))
5214 assemble_external (function);
5215 TREE_USED (function) = 1;
5217 funexp = XEXP (DECL_RTL (function), 0);
5218 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5219 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5220 SIBLING_CALL_P (insn) = 1;
5222 insn = get_insns ();
5223 shorten_branches (insn);
5224 final_start_function (insn, file, 1);
5225 final (insn, file, 1);
5226 final_end_function ();
5228 /* Stop pretending to be a post-reload pass. */
5229 reload_completed = 0;
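/* As a rough illustration, a thunk with DELTA == 16 and VCALL_OFFSET == 0
   that forwards to a function f comes out as:

	add	x0, x0, 16
	b	f  */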
5232 static bool
5233 aarch64_tls_referenced_p (rtx x)
5235 if (!TARGET_HAVE_TLS)
5236 return false;
5237 subrtx_iterator::array_type array;
5238 FOR_EACH_SUBRTX (iter, array, x, ALL)
5240 const_rtx x = *iter;
5241 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5242 return true;
5243 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5244 TLS offsets, not real symbol references. */
5245 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5246 iter.skip_subrtxes ();
5248 return false;
5252 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5253 a left shift of 0 or 12 bits. */
5254 bool
5255 aarch64_uimm12_shift (HOST_WIDE_INT val)
5257 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5258 	  || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
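/* For example, 0xabc, 0x1000 and 0xfff000 are all accepted (each fits in
   a single 12-bit field, optionally shifted left by 12), whereas 0x1001
   is rejected because it has bits set in both fields.  */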
5263 /* Return true if val is an immediate that can be loaded into a
5264 register by a MOVZ instruction. */
5265 static bool
5266 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5268 if (GET_MODE_SIZE (mode) > 4)
5270 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5271 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5272 return 1;
5274 else
5276 /* Ignore sign extension. */
5277 val &= (HOST_WIDE_INT) 0xffffffff;
5279 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5280 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5283 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5284 64-bit (DImode) integer. */
5286 static unsigned HOST_WIDE_INT
5287 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5289 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5290 while (size < 64)
5292 val &= (HOST_WIDE_INT_1U << size) - 1;
5293 val |= val << size;
5294 size *= 2;
5296 return val;
5299 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5301 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5303 0x0000000100000001ull,
5304 0x0001000100010001ull,
5305 0x0101010101010101ull,
5306 0x1111111111111111ull,
5307 0x5555555555555555ull,
5311 /* Return true if val is a valid bitmask immediate. */
5313 bool
5314 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5316 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5317 int bits;
5319 /* Check for a single sequence of one bits and return quickly if so.
5320      The special cases of all ones and all zeroes return false.  */
5321 val = aarch64_replicate_bitmask_imm (val_in, mode);
5322 tmp = val + (val & -val);
5324 if (tmp == (tmp & -tmp))
5325 return (val + 1) > 1;
5327 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5328 if (mode == SImode)
5329 val = (val << 32) | (val & 0xffffffff);
5331 /* Invert if the immediate doesn't start with a zero bit - this means we
5332 only need to search for sequences of one bits. */
5333 if (val & 1)
5334 val = ~val;
5336 /* Find the first set bit and set tmp to val with the first sequence of one
5337 bits removed. Return success if there is a single sequence of ones. */
5338 first_one = val & -val;
5339 tmp = val & (val + first_one);
5341 if (tmp == 0)
5342 return true;
5344 /* Find the next set bit and compute the difference in bit position. */
5345 next_one = tmp & -tmp;
5346 bits = clz_hwi (first_one) - clz_hwi (next_one);
5347 mask = val ^ tmp;
5349 /* Check the bit position difference is a power of 2, and that the first
5350 sequence of one bits fits within 'bits' bits. */
5351 if ((mask >> bits) != 0 || bits != (bits & -bits))
5352 return false;
5354 /* Check the sequence of one bits is repeated 64/bits times. */
5355 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
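/* For example, 0x00ff00ff00ff00ff (a run of 8 ones repeated every 16
   bits) and 0x0000ffff0000ffff are valid bitmask immediates, whereas 0x5
   (two separate one bits) and the special cases 0 and ~0 are not.  */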
5358 /* Create a mask of ones, covering the lowest to highest bits set in VAL_IN.
5359    Assumed precondition: VAL_IN is not zero.  */
5361 unsigned HOST_WIDE_INT
5362 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5364 int lowest_bit_set = ctz_hwi (val_in);
5365 int highest_bit_set = floor_log2 (val_in);
5366 gcc_assert (val_in != 0);
5368 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5369 (HOST_WIDE_INT_1U << lowest_bit_set));
5372 /* Create a constant in which all bits outside the range from the lowest set
5373    bit to the highest set bit of VAL_IN are set to 1.  */
5375 unsigned HOST_WIDE_INT
5376 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5378 return val_in | ~aarch64_and_split_imm1 (val_in);
5381 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5383 bool
5384 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5386 scalar_int_mode int_mode;
5387 if (!is_a <scalar_int_mode> (mode, &int_mode))
5388 return false;
5390 if (aarch64_bitmask_imm (val_in, int_mode))
5391 return false;
5393 if (aarch64_move_imm (val_in, int_mode))
5394 return false;
5396 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5398 return aarch64_bitmask_imm (imm2, int_mode);
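/* Note that VAL_IN == imm1 & imm2 by construction: imm1 is the contiguous
   mask covering VAL_IN's set bits and imm2 sets every bit outside that
   mask.  An AND with VAL_IN can therefore be rewritten as an AND with
   imm2 followed by an AND with imm1, provided both are themselves valid
   bitmask immediates.  */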
5401 /* Return true if val is an immediate that can be loaded into a
5402 register in a single instruction. */
5403 bool
5404 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5406 scalar_int_mode int_mode;
5407 if (!is_a <scalar_int_mode> (mode, &int_mode))
5408 return false;
5410 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5411 return 1;
5412 return aarch64_bitmask_imm (val, int_mode);
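/* For example, 0xffff0000 (MOVZ with a shift), 0xffffffffffff0123 (MOVN)
   and 0x5555555555555555 (a bitmask immediate, materialised with ORR
   against XZR) can each be loaded in one instruction, whereas 0x12345678
   needs a MOVZ/MOVK pair.  */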
5415 static bool
5416 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5418 rtx base, offset;
5420 if (GET_CODE (x) == HIGH)
5421 return true;
5423 /* There's no way to calculate VL-based values using relocations. */
5424 subrtx_iterator::array_type array;
5425 FOR_EACH_SUBRTX (iter, array, x, ALL)
5426 if (GET_CODE (*iter) == CONST_POLY_INT)
5427 return true;
5429 split_const (x, &base, &offset);
5430 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5432 if (aarch64_classify_symbol (base, INTVAL (offset))
5433 != SYMBOL_FORCE_TO_MEM)
5434 return true;
5435 else
5436 /* Avoid generating a 64-bit relocation in ILP32; leave
5437 to aarch64_expand_mov_immediate to handle it properly. */
5438 return mode != ptr_mode;
5441 return aarch64_tls_referenced_p (x);
5444 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5445 The expansion for a table switch is quite expensive due to the number
5446    of instructions, the table lookup and the hard-to-predict indirect jump.
5447    When optimizing for speed with -O3 enabled, use the per-core tuning if
5448 set, otherwise use tables for > 16 cases as a tradeoff between size and
5449 performance. When optimizing for size, use the default setting. */
5451 static unsigned int
5452 aarch64_case_values_threshold (void)
5454 /* Use the specified limit for the number of cases before using jump
5455 tables at higher optimization levels. */
5456 if (optimize > 2
5457 && selected_cpu->tune->max_case_values != 0)
5458 return selected_cpu->tune->max_case_values;
5459 else
5460 return optimize_size ? default_case_values_threshold () : 17;
5463 /* Return true if register REGNO is a valid index register.
5464 STRICT_P is true if REG_OK_STRICT is in effect. */
5466 bool
5467 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5469 if (!HARD_REGISTER_NUM_P (regno))
5471 if (!strict_p)
5472 return true;
5474 if (!reg_renumber)
5475 return false;
5477 regno = reg_renumber[regno];
5479 return GP_REGNUM_P (regno);
5482 /* Return true if register REGNO is a valid base register.
5483 STRICT_P is true if REG_OK_STRICT is in effect. */
5485 bool
5486 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5488 if (!HARD_REGISTER_NUM_P (regno))
5490 if (!strict_p)
5491 return true;
5493 if (!reg_renumber)
5494 return false;
5496 regno = reg_renumber[regno];
5499 /* The fake registers will be eliminated to either the stack or
5500 hard frame pointer, both of which are usually valid base registers.
5501 Reload deals with the cases where the eliminated form isn't valid. */
5502 return (GP_REGNUM_P (regno)
5503 || regno == SP_REGNUM
5504 || regno == FRAME_POINTER_REGNUM
5505 || regno == ARG_POINTER_REGNUM);
5508 /* Return true if X is a valid base register.
5509 STRICT_P is true if REG_OK_STRICT is in effect. */
5511 static bool
5512 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5514 if (!strict_p
5515 && GET_CODE (x) == SUBREG
5516 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5517 x = SUBREG_REG (x);
5519 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5522 /* Return true if address offset is a valid index. If it is, fill in INFO
5523 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5525 static bool
5526 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5527 machine_mode mode, bool strict_p)
5529 enum aarch64_address_type type;
5530 rtx index;
5531 int shift;
5533 /* (reg:P) */
5534 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5535 && GET_MODE (x) == Pmode)
5537 type = ADDRESS_REG_REG;
5538 index = x;
5539 shift = 0;
5541 /* (sign_extend:DI (reg:SI)) */
5542 else if ((GET_CODE (x) == SIGN_EXTEND
5543 || GET_CODE (x) == ZERO_EXTEND)
5544 && GET_MODE (x) == DImode
5545 && GET_MODE (XEXP (x, 0)) == SImode)
5547 type = (GET_CODE (x) == SIGN_EXTEND)
5548 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5549 index = XEXP (x, 0);
5550 shift = 0;
5552 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5553 else if (GET_CODE (x) == MULT
5554 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5555 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5556 && GET_MODE (XEXP (x, 0)) == DImode
5557 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5558 && CONST_INT_P (XEXP (x, 1)))
5560 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5561 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5562 index = XEXP (XEXP (x, 0), 0);
5563 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5565 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5566 else if (GET_CODE (x) == ASHIFT
5567 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5568 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5569 && GET_MODE (XEXP (x, 0)) == DImode
5570 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5571 && CONST_INT_P (XEXP (x, 1)))
5573 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5574 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5575 index = XEXP (XEXP (x, 0), 0);
5576 shift = INTVAL (XEXP (x, 1));
5578 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5579 else if ((GET_CODE (x) == SIGN_EXTRACT
5580 || GET_CODE (x) == ZERO_EXTRACT)
5581 && GET_MODE (x) == DImode
5582 && GET_CODE (XEXP (x, 0)) == MULT
5583 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5584 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5586 type = (GET_CODE (x) == SIGN_EXTRACT)
5587 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5588 index = XEXP (XEXP (x, 0), 0);
5589 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5590 if (INTVAL (XEXP (x, 1)) != 32 + shift
5591 || INTVAL (XEXP (x, 2)) != 0)
5592 shift = -1;
5594 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5595 (const_int 0xffffffff<<shift)) */
5596 else if (GET_CODE (x) == AND
5597 && GET_MODE (x) == DImode
5598 && GET_CODE (XEXP (x, 0)) == MULT
5599 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5600 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5601 && CONST_INT_P (XEXP (x, 1)))
5603 type = ADDRESS_REG_UXTW;
5604 index = XEXP (XEXP (x, 0), 0);
5605 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5606 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5607 shift = -1;
5609 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5610 else if ((GET_CODE (x) == SIGN_EXTRACT
5611 || GET_CODE (x) == ZERO_EXTRACT)
5612 && GET_MODE (x) == DImode
5613 && GET_CODE (XEXP (x, 0)) == ASHIFT
5614 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5615 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5617 type = (GET_CODE (x) == SIGN_EXTRACT)
5618 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5619 index = XEXP (XEXP (x, 0), 0);
5620 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5621 if (INTVAL (XEXP (x, 1)) != 32 + shift
5622 || INTVAL (XEXP (x, 2)) != 0)
5623 shift = -1;
5625 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5626 (const_int 0xffffffff<<shift)) */
5627 else if (GET_CODE (x) == AND
5628 && GET_MODE (x) == DImode
5629 && GET_CODE (XEXP (x, 0)) == ASHIFT
5630 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5631 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5632 && CONST_INT_P (XEXP (x, 1)))
5634 type = ADDRESS_REG_UXTW;
5635 index = XEXP (XEXP (x, 0), 0);
5636 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5637 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5638 shift = -1;
5640 /* (mult:P (reg:P) (const_int scale)) */
5641 else if (GET_CODE (x) == MULT
5642 && GET_MODE (x) == Pmode
5643 && GET_MODE (XEXP (x, 0)) == Pmode
5644 && CONST_INT_P (XEXP (x, 1)))
5646 type = ADDRESS_REG_REG;
5647 index = XEXP (x, 0);
5648 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5650 /* (ashift:P (reg:P) (const_int shift)) */
5651 else if (GET_CODE (x) == ASHIFT
5652 && GET_MODE (x) == Pmode
5653 && GET_MODE (XEXP (x, 0)) == Pmode
5654 && CONST_INT_P (XEXP (x, 1)))
5656 type = ADDRESS_REG_REG;
5657 index = XEXP (x, 0);
5658 shift = INTVAL (XEXP (x, 1));
5660 else
5661 return false;
5663 if (!strict_p
5664 && GET_CODE (index) == SUBREG
5665 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5666 index = SUBREG_REG (index);
5668 if (aarch64_sve_data_mode_p (mode))
5670 if (type != ADDRESS_REG_REG
5671 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5672 return false;
5674 else
5676 if (shift != 0
5677 && !(IN_RANGE (shift, 1, 3)
5678 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5679 return false;
5682 if (REG_P (index)
5683 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5685 info->type = type;
5686 info->offset = index;
5687 info->shift = shift;
5688 return true;
5691 return false;
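/* For illustration (not exhaustive), an index rtx such as
     (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 4))
   used for an SImode access classifies as ADDRESS_REG_SXTW with shift 2,
   and is eventually printed by aarch64_print_address_internal as
   something like [x0, w1, sxtw 2].  A plain (reg:DI x1) index gives
   ADDRESS_REG_REG with shift 0, i.e. [x0, x1].  */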
5694 /* Return true if MODE is one of the modes for which we
5695 support LDP/STP operations. */
5697 static bool
5698 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5700 return mode == SImode || mode == DImode
5701 || mode == SFmode || mode == DFmode
5702 || (aarch64_vector_mode_supported_p (mode)
5703 && (known_eq (GET_MODE_SIZE (mode), 8)
5704 || (known_eq (GET_MODE_SIZE (mode), 16)
5705 && (aarch64_tune_params.extra_tuning_flags
5706 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5709 /* Return true if REGNO is a virtual pointer register, or an eliminable
5710 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5711 include stack_pointer or hard_frame_pointer. */
5712 static bool
5713 virt_or_elim_regno_p (unsigned regno)
5715 return ((regno >= FIRST_VIRTUAL_REGISTER
5716 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5717 || regno == FRAME_POINTER_REGNUM
5718 || regno == ARG_POINTER_REGNUM);
5721 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5722 If it is, fill in INFO appropriately. STRICT_P is true if
5723 REG_OK_STRICT is in effect. */
5725 static bool
5726 aarch64_classify_address (struct aarch64_address_info *info,
5727 rtx x, machine_mode mode, bool strict_p,
5728 aarch64_addr_query_type type = ADDR_QUERY_M)
5730 enum rtx_code code = GET_CODE (x);
5731 rtx op0, op1;
5732 poly_int64 offset;
5734 HOST_WIDE_INT const_size;
5736 /* On BE, we use load/store pair for all large int mode load/stores.
5737 TI/TFmode may also use a load/store pair. */
5738 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5739 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5740 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5741 || mode == TImode
5742 || mode == TFmode
5743 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5745 bool allow_reg_index_p = (!load_store_pair_p
5746 && (known_lt (GET_MODE_SIZE (mode), 16)
5747 || vec_flags == VEC_ADVSIMD
5748 || vec_flags == VEC_SVE_DATA));
5750 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5751 [Rn, #offset, MUL VL]. */
5752 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5753 && (code != REG && code != PLUS))
5754 return false;
5756 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5757 REG addressing. */
5758 if (advsimd_struct_p
5759 && !BYTES_BIG_ENDIAN
5760 && (code != POST_INC && code != REG))
5761 return false;
5763 gcc_checking_assert (GET_MODE (x) == VOIDmode
5764 || SCALAR_INT_MODE_P (GET_MODE (x)));
5766 switch (code)
5768 case REG:
5769 case SUBREG:
5770 info->type = ADDRESS_REG_IMM;
5771 info->base = x;
5772 info->offset = const0_rtx;
5773 info->const_offset = 0;
5774 return aarch64_base_register_rtx_p (x, strict_p);
5776 case PLUS:
5777 op0 = XEXP (x, 0);
5778 op1 = XEXP (x, 1);
5780 if (! strict_p
5781 && REG_P (op0)
5782 && virt_or_elim_regno_p (REGNO (op0))
5783 && poly_int_rtx_p (op1, &offset))
5785 info->type = ADDRESS_REG_IMM;
5786 info->base = op0;
5787 info->offset = op1;
5788 info->const_offset = offset;
5790 return true;
5793 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5794 && aarch64_base_register_rtx_p (op0, strict_p)
5795 && poly_int_rtx_p (op1, &offset))
5797 info->type = ADDRESS_REG_IMM;
5798 info->base = op0;
5799 info->offset = op1;
5800 info->const_offset = offset;
5802 /* TImode and TFmode values are allowed in both pairs of X
5803 registers and individual Q registers. The available
5804 address modes are:
5805 X,X: 7-bit signed scaled offset
5806 Q: 9-bit signed offset
5807 We conservatively require an offset representable in either mode.
5808 When performing the check for pairs of X registers i.e. LDP/STP
5809 pass down DImode since that is the natural size of the LDP/STP
5810 instruction memory accesses. */
5811 if (mode == TImode || mode == TFmode)
5812 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5813 && (offset_9bit_signed_unscaled_p (mode, offset)
5814 || offset_12bit_unsigned_scaled_p (mode, offset)));
5816 /* A 7-bit offset check because OImode will emit an ldp/stp
5817 instruction (only big endian will get here).
5818 For ldp/stp instructions, the offset is scaled for the size of a
5819 single element of the pair. */
5820 if (mode == OImode)
5821 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5823 /* For CImode, check a 7-bit scaled offset for the leading pair plus a
5824 9/12-bit offset for the trailing vector (only big endian gets here). */
5825 if (mode == CImode)
5826 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5827 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5828 || offset_12bit_unsigned_scaled_p (V16QImode,
5829 offset + 32)));
5831 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5832 instructions (only big endian will get here). */
5833 if (mode == XImode)
5834 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5835 && aarch64_offset_7bit_signed_scaled_p (TImode,
5836 offset + 32));
5838 /* Make "m" use the LD1 offset range for SVE data modes, so
5839 that pre-RTL optimizers like ivopts will work to that
5840 instead of the wider LDR/STR range. */
5841 if (vec_flags == VEC_SVE_DATA)
5842 return (type == ADDR_QUERY_M
5843 ? offset_4bit_signed_scaled_p (mode, offset)
5844 : offset_9bit_signed_scaled_p (mode, offset));
5846 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5848 poly_int64 end_offset = (offset
5849 + GET_MODE_SIZE (mode)
5850 - BYTES_PER_SVE_VECTOR);
5851 return (type == ADDR_QUERY_M
5852 ? offset_4bit_signed_scaled_p (mode, offset)
5853 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5854 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5855 end_offset)));
5858 if (vec_flags == VEC_SVE_PRED)
5859 return offset_9bit_signed_scaled_p (mode, offset);
5861 if (load_store_pair_p)
5862 return ((known_eq (GET_MODE_SIZE (mode), 4)
5863 || known_eq (GET_MODE_SIZE (mode), 8)
5864 || known_eq (GET_MODE_SIZE (mode), 16))
5865 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5866 else
5867 return (offset_9bit_signed_unscaled_p (mode, offset)
5868 || offset_12bit_unsigned_scaled_p (mode, offset));
5871 if (allow_reg_index_p)
5873 /* Look for base + (scaled/extended) index register. */
5874 if (aarch64_base_register_rtx_p (op0, strict_p)
5875 && aarch64_classify_index (info, op1, mode, strict_p))
5877 info->base = op0;
5878 return true;
5880 if (aarch64_base_register_rtx_p (op1, strict_p)
5881 && aarch64_classify_index (info, op0, mode, strict_p))
5883 info->base = op1;
5884 return true;
5888 return false;
5890 case POST_INC:
5891 case POST_DEC:
5892 case PRE_INC:
5893 case PRE_DEC:
5894 info->type = ADDRESS_REG_WB;
5895 info->base = XEXP (x, 0);
5896 info->offset = NULL_RTX;
5897 return aarch64_base_register_rtx_p (info->base, strict_p);
5899 case POST_MODIFY:
5900 case PRE_MODIFY:
5901 info->type = ADDRESS_REG_WB;
5902 info->base = XEXP (x, 0);
5903 if (GET_CODE (XEXP (x, 1)) == PLUS
5904 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5905 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5906 && aarch64_base_register_rtx_p (info->base, strict_p))
5908 info->offset = XEXP (XEXP (x, 1), 1);
5909 info->const_offset = offset;
5911 /* TImode and TFmode values are allowed in both pairs of X
5912 registers and individual Q registers. The available
5913 address modes are:
5914 X,X: 7-bit signed scaled offset
5915 Q: 9-bit signed offset
5916 We conservatively require an offset representable in either mode. */
5918 if (mode == TImode || mode == TFmode)
5919 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5920 && offset_9bit_signed_unscaled_p (mode, offset));
5922 if (load_store_pair_p)
5923 return ((known_eq (GET_MODE_SIZE (mode), 4)
5924 || known_eq (GET_MODE_SIZE (mode), 8)
5925 || known_eq (GET_MODE_SIZE (mode), 16))
5926 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5927 else
5928 return offset_9bit_signed_unscaled_p (mode, offset);
5930 return false;
5932 case CONST:
5933 case SYMBOL_REF:
5934 case LABEL_REF:
5935 /* load literal: pc-relative constant pool entry. Only supported
5936 for SI mode or larger. */
5937 info->type = ADDRESS_SYMBOLIC;
5939 if (!load_store_pair_p
5940 && GET_MODE_SIZE (mode).is_constant (&const_size)
5941 && const_size >= 4)
5943 rtx sym, addend;
5945 split_const (x, &sym, &addend);
5946 return ((GET_CODE (sym) == LABEL_REF
5947 || (GET_CODE (sym) == SYMBOL_REF
5948 && CONSTANT_POOL_ADDRESS_P (sym)
5949 && aarch64_pcrelative_literal_loads)));
5951 return false;
5953 case LO_SUM:
5954 info->type = ADDRESS_LO_SUM;
5955 info->base = XEXP (x, 0);
5956 info->offset = XEXP (x, 1);
5957 if (allow_reg_index_p
5958 && aarch64_base_register_rtx_p (info->base, strict_p))
5960 rtx sym, offs;
5961 split_const (info->offset, &sym, &offs);
5962 if (GET_CODE (sym) == SYMBOL_REF
5963 && (aarch64_classify_symbol (sym, INTVAL (offs))
5964 == SYMBOL_SMALL_ABSOLUTE))
5966 /* The symbol and offset must be aligned to the access size. */
5967 unsigned int align;
5969 if (CONSTANT_POOL_ADDRESS_P (sym))
5970 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5971 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5973 tree exp = SYMBOL_REF_DECL (sym);
5974 align = TYPE_ALIGN (TREE_TYPE (exp));
5975 align = aarch64_constant_alignment (exp, align);
5977 else if (SYMBOL_REF_DECL (sym))
5978 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5979 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5980 && SYMBOL_REF_BLOCK (sym) != NULL)
5981 align = SYMBOL_REF_BLOCK (sym)->alignment;
5982 else
5983 align = BITS_PER_UNIT;
5985 poly_int64 ref_size = GET_MODE_SIZE (mode);
5986 if (known_eq (ref_size, 0))
5987 ref_size = GET_MODE_SIZE (DImode);
5989 return (multiple_p (INTVAL (offs), ref_size)
5990 && multiple_p (align / BITS_PER_UNIT, ref_size));
5993 return false;
5995 default:
5996 return false;
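/* Rough examples of what the checks above accept for a DImode access
   (assuming a valid base register x0): [x0], [x0, 32760] via the
   12-bit unsigned scaled range, [x0, -256] via the 9-bit signed
   unscaled range, and [x0, x1, lsl 3] when register indexing is
   allowed.  An offset such as -260 fails both immediate ranges and
   must be legitimized separately.  */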
6000 /* Return true if the address X is valid for a PRFM instruction.
6001 STRICT_P is true if we should do strict checking with
6002 aarch64_classify_address. */
6004 bool
6005 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6007 struct aarch64_address_info addr;
6009 /* PRFM accepts the same addresses as DImode... */
6010 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6011 if (!res)
6012 return false;
6014 /* ... except writeback forms. */
6015 return addr.type != ADDRESS_REG_WB;
6018 bool
6019 aarch64_symbolic_address_p (rtx x)
6021 rtx offset;
6023 split_const (x, &x, &offset);
6024 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6027 /* Classify the base of symbolic expression X. */
6029 enum aarch64_symbol_type
6030 aarch64_classify_symbolic_expression (rtx x)
6032 rtx offset;
6034 split_const (x, &x, &offset);
6035 return aarch64_classify_symbol (x, INTVAL (offset));
6039 /* Return TRUE if X is a legitimate address for accessing memory in
6040 mode MODE. */
6041 static bool
6042 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6044 struct aarch64_address_info addr;
6046 return aarch64_classify_address (&addr, x, mode, strict_p);
6049 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6050 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6051 bool
6052 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6053 aarch64_addr_query_type type)
6055 struct aarch64_address_info addr;
6057 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6060 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6062 static bool
6063 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6064 poly_int64 orig_offset,
6065 machine_mode mode)
6067 HOST_WIDE_INT size;
6068 if (GET_MODE_SIZE (mode).is_constant (&size))
6070 HOST_WIDE_INT const_offset, second_offset;
6072 /* A general SVE offset is A * VQ + B. Remove the A component from
6073 coefficient 0 in order to get the constant B. */
6074 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6076 /* Split an out-of-range address displacement into a base and
6077 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6078 range otherwise to increase opportunities for sharing the base
6079 address of different sizes. Unaligned accesses use the signed
6080 9-bit range; TImode/TFmode use the intersection of signed
6081 scaled 7-bit and signed 9-bit offset. */
6082 if (mode == TImode || mode == TFmode)
6083 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6084 else if ((const_offset & (size - 1)) != 0)
6085 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6086 else
6087 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6089 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6090 return false;
6092 /* Split the offset into second_offset and the rest. */
6093 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6094 *offset2 = gen_int_mode (second_offset, Pmode);
6095 return true;
6097 else
6099 /* Get the mode we should use as the basis of the range. For structure
6100 modes this is the mode of one vector. */
6101 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6102 machine_mode step_mode
6103 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6105 /* Get the "mul vl" multiplier we'd like to use. */
6106 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6107 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6108 if (vec_flags & VEC_SVE_DATA)
6109 /* LDR supports a 9-bit range, but the move patterns for
6110 structure modes require all vectors to be in range of the
6111 same base. The simplest way of accommodating that while still
6112 promoting reuse of anchor points between different modes is
6113 to use an 8-bit range unconditionally. */
6114 vnum = ((vnum + 128) & 255) - 128;
6115 else
6116 /* Predicates are only handled singly, so we might as well use
6117 the full range. */
6118 vnum = ((vnum + 256) & 511) - 256;
6119 if (vnum == 0)
6120 return false;
6122 /* Convert the "mul vl" multiplier into a byte offset. */
6123 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6124 if (known_eq (second_offset, orig_offset))
6125 return false;
6127 /* Split the offset into second_offset and the rest. */
6128 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6129 *offset2 = gen_int_mode (second_offset, Pmode);
6130 return true;
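/* Worked example for the constant-size path above: a DImode access at
   orig_offset 0x10010 is aligned, so second_offset = 0x10010 & 0x3ffc
   = 0x10.  The caller then typically materializes base + 0x10000 once
   and uses a [reg, 16] style address for the access itself, letting
   several nearby accesses share the same anchor.  */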
6134 /* Return the binary representation of floating point constant VALUE in INTVAL.
6135 If the value cannot be converted, return false without setting INTVAL.
6136 The conversion is done in the given MODE. */
6137 bool
6138 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6141 /* We make a general exception for 0. */
6142 if (aarch64_float_const_zero_rtx_p (value))
6144 *intval = 0;
6145 return true;
6148 scalar_float_mode mode;
6149 if (GET_CODE (value) != CONST_DOUBLE
6150 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6151 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6152 /* Only support up to DF mode. */
6153 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6154 return false;
6156 unsigned HOST_WIDE_INT ival = 0;
6158 long res[2];
6159 real_to_target (res,
6160 CONST_DOUBLE_REAL_VALUE (value),
6161 REAL_MODE_FORMAT (mode));
6163 if (mode == DFmode)
6165 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6166 ival = zext_hwi (res[order], 32);
6167 ival |= (zext_hwi (res[1 - order], 32) << 32);
6169 else
6170 ival = zext_hwi (res[0], 32);
6172 *intval = ival;
6173 return true;
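/* Example: the DFmode constant 1.0 has the IEEE-754 encoding
   0x3ff0000000000000, and the SFmode constant 1.0 encodes as
   0x3f800000, so those are the values written to *intval.  */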
6176 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6177 single MOV(+MOVK) followed by an FMOV. */
6178 bool
6179 aarch64_float_const_rtx_p (rtx x)
6181 machine_mode mode = GET_MODE (x);
6182 if (mode == VOIDmode)
6183 return false;
6185 /* Determine whether it's cheaper to write float constants as
6186 mov/movk pairs over ldr/adrp pairs. */
6187 unsigned HOST_WIDE_INT ival;
6189 if (GET_CODE (x) == CONST_DOUBLE
6190 && SCALAR_FLOAT_MODE_P (mode)
6191 && aarch64_reinterpret_float_as_int (x, &ival))
6193 scalar_int_mode imode = (mode == HFmode
6194 ? SImode
6195 : int_mode_for_mode (mode).require ());
6196 int num_instr = aarch64_internal_mov_immediate
6197 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6198 return num_instr < 3;
6201 return false;
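/* As a rough example, DFmode 1.0 reinterprets to 0x3ff0000000000000,
   which a single MOVZ (#0x3ff0, lsl #48) can build, so the constant can
   be synthesized as mov+fmov rather than loaded from the literal pool;
   a bit pattern needing three or more MOV/MOVKs is rejected here.  */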
6204 /* Return TRUE if rtx X is immediate constant 0.0 */
6205 bool
6206 aarch64_float_const_zero_rtx_p (rtx x)
6208 if (GET_MODE (x) == VOIDmode)
6209 return false;
6211 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6212 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6213 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6216 /* Return TRUE if rtx X is immediate constant that fits in a single
6217 MOVI immediate operation. */
6218 bool
6219 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6221 if (!TARGET_SIMD)
6222 return false;
6224 machine_mode vmode;
6225 scalar_int_mode imode;
6226 unsigned HOST_WIDE_INT ival;
6228 if (GET_CODE (x) == CONST_DOUBLE
6229 && SCALAR_FLOAT_MODE_P (mode))
6231 if (!aarch64_reinterpret_float_as_int (x, &ival))
6232 return false;
6234 /* We make a general exception for 0. */
6235 if (aarch64_float_const_zero_rtx_p (x))
6236 return true;
6238 imode = int_mode_for_mode (mode).require ();
6240 else if (GET_CODE (x) == CONST_INT
6241 && is_a <scalar_int_mode> (mode, &imode))
6242 ival = INTVAL (x);
6243 else
6244 return false;
6246 /* Use a 64-bit container for everything except DI/DF mode, where we use
6247 a 128-bit vector mode. */
6248 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6250 vmode = aarch64_simd_container_mode (imode, width);
6251 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6253 return aarch64_simd_valid_immediate (v_op, NULL);
6257 /* Return the fixed registers used for condition codes. */
6259 static bool
6260 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6262 *p1 = CC_REGNUM;
6263 *p2 = INVALID_REGNUM;
6264 return true;
6267 /* This function is used by the call expanders of the machine description.
6268 RESULT is the register in which the result is returned. It's NULL for
6269 "call" and "sibcall".
6270 MEM is the location of the function call.
6271 SIBCALL indicates whether this function call is normal call or sibling call.
6272 It will generate different pattern accordingly. */
6274 void
6275 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6277 rtx call, callee, tmp;
6278 rtvec vec;
6279 machine_mode mode;
6281 gcc_assert (MEM_P (mem));
6282 callee = XEXP (mem, 0);
6283 mode = GET_MODE (callee);
6284 gcc_assert (mode == Pmode);
6286 /* Decide if we should generate indirect calls by loading the
6287 address of the callee into a register before performing
6288 the branch-and-link. */
6289 if (SYMBOL_REF_P (callee)
6290 ? (aarch64_is_long_call_p (callee)
6291 || aarch64_is_noplt_call_p (callee))
6292 : !REG_P (callee))
6293 XEXP (mem, 0) = force_reg (mode, callee);
6295 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6297 if (result != NULL_RTX)
6298 call = gen_rtx_SET (result, call);
6300 if (sibcall)
6301 tmp = ret_rtx;
6302 else
6303 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6305 vec = gen_rtvec (2, call, tmp);
6306 call = gen_rtx_PARALLEL (VOIDmode, vec);
6308 aarch64_emit_call_insn (call);
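/* The emitted pattern is therefore roughly
     (parallel [(call (mem ...) (const_int 0)) (clobber (reg LR))])
   for a normal call and
     (parallel [(call (mem ...) (const_int 0)) (return)])
   for a sibcall, with the call wrapped in a SET of RESULT when a
   value is returned.  */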
6311 /* Emit call insn with PAT and do aarch64-specific handling. */
6313 void
6314 aarch64_emit_call_insn (rtx pat)
6316 rtx insn = emit_call_insn (pat);
6318 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6319 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6320 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6323 machine_mode
6324 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6326 /* All floating point compares return CCFP if it is an equality
6327 comparison, and CCFPE otherwise. */
6328 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6330 switch (code)
6332 case EQ:
6333 case NE:
6334 case UNORDERED:
6335 case ORDERED:
6336 case UNLT:
6337 case UNLE:
6338 case UNGT:
6339 case UNGE:
6340 case UNEQ:
6341 return CCFPmode;
6343 case LT:
6344 case LE:
6345 case GT:
6346 case GE:
6347 case LTGT:
6348 return CCFPEmode;
6350 default:
6351 gcc_unreachable ();
6355 /* Equality comparisons of short modes against zero can be performed
6356 using the TST instruction with the appropriate bitmask. */
6357 if (y == const0_rtx && REG_P (x)
6358 && (code == EQ || code == NE)
6359 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6360 return CC_NZmode;
6362 /* Similarly, comparisons of zero_extends from shorter modes can
6363 be performed using an ANDS with an immediate mask. */
6364 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6365 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6366 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6367 && (code == EQ || code == NE))
6368 return CC_NZmode;
6370 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6371 && y == const0_rtx
6372 && (code == EQ || code == NE || code == LT || code == GE)
6373 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6374 || GET_CODE (x) == NEG
6375 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6376 && CONST_INT_P (XEXP (x, 2)))))
6377 return CC_NZmode;
6379 /* A compare with a shifted operand. Because of canonicalization,
6380 the comparison will have to be swapped when we emit the assembly
6381 code. */
6382 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6383 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6384 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6385 || GET_CODE (x) == LSHIFTRT
6386 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6387 return CC_SWPmode;
6389 /* Similarly for a negated operand, but we can only do this for
6390 equalities. */
6391 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6392 && (REG_P (y) || GET_CODE (y) == SUBREG)
6393 && (code == EQ || code == NE)
6394 && GET_CODE (x) == NEG)
6395 return CC_Zmode;
6397 /* A test for unsigned overflow. */
6398 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6399 && code == NE
6400 && GET_CODE (x) == PLUS
6401 && GET_CODE (y) == ZERO_EXTEND)
6402 return CC_Cmode;
6404 /* For everything else, return CCmode. */
6405 return CCmode;
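/* A couple of examples of the classification above: an equality test of
   a QImode register against zero gives CC_NZmode (it can be done with
   TST); comparing (ashift:DI x 2) against a register gives CC_SWPmode,
   since the shifted operand must become the second operand of the
   emitted CMP and the condition is then read swapped by
   aarch64_get_condition_code_1.  */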
6408 static int
6409 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6412 aarch64_get_condition_code (rtx x)
6414 machine_mode mode = GET_MODE (XEXP (x, 0));
6415 enum rtx_code comp_code = GET_CODE (x);
6417 if (GET_MODE_CLASS (mode) != MODE_CC)
6418 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6419 return aarch64_get_condition_code_1 (mode, comp_code);
6422 static int
6423 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6425 switch (mode)
6427 case E_CCFPmode:
6428 case E_CCFPEmode:
6429 switch (comp_code)
6431 case GE: return AARCH64_GE;
6432 case GT: return AARCH64_GT;
6433 case LE: return AARCH64_LS;
6434 case LT: return AARCH64_MI;
6435 case NE: return AARCH64_NE;
6436 case EQ: return AARCH64_EQ;
6437 case ORDERED: return AARCH64_VC;
6438 case UNORDERED: return AARCH64_VS;
6439 case UNLT: return AARCH64_LT;
6440 case UNLE: return AARCH64_LE;
6441 case UNGT: return AARCH64_HI;
6442 case UNGE: return AARCH64_PL;
6443 default: return -1;
6445 break;
6447 case E_CCmode:
6448 switch (comp_code)
6450 case NE: return AARCH64_NE;
6451 case EQ: return AARCH64_EQ;
6452 case GE: return AARCH64_GE;
6453 case GT: return AARCH64_GT;
6454 case LE: return AARCH64_LE;
6455 case LT: return AARCH64_LT;
6456 case GEU: return AARCH64_CS;
6457 case GTU: return AARCH64_HI;
6458 case LEU: return AARCH64_LS;
6459 case LTU: return AARCH64_CC;
6460 default: return -1;
6462 break;
6464 case E_CC_SWPmode:
6465 switch (comp_code)
6467 case NE: return AARCH64_NE;
6468 case EQ: return AARCH64_EQ;
6469 case GE: return AARCH64_LE;
6470 case GT: return AARCH64_LT;
6471 case LE: return AARCH64_GE;
6472 case LT: return AARCH64_GT;
6473 case GEU: return AARCH64_LS;
6474 case GTU: return AARCH64_CC;
6475 case LEU: return AARCH64_CS;
6476 case LTU: return AARCH64_HI;
6477 default: return -1;
6479 break;
6481 case E_CC_NZmode:
6482 switch (comp_code)
6484 case NE: return AARCH64_NE;
6485 case EQ: return AARCH64_EQ;
6486 case GE: return AARCH64_PL;
6487 case LT: return AARCH64_MI;
6488 default: return -1;
6490 break;
6492 case E_CC_Zmode:
6493 switch (comp_code)
6495 case NE: return AARCH64_NE;
6496 case EQ: return AARCH64_EQ;
6497 default: return -1;
6499 break;
6501 case E_CC_Cmode:
6502 switch (comp_code)
6504 case NE: return AARCH64_CS;
6505 case EQ: return AARCH64_CC;
6506 default: return -1;
6508 break;
6510 default:
6511 return -1;
6514 return -1;
6517 bool
6518 aarch64_const_vec_all_same_in_range_p (rtx x,
6519 HOST_WIDE_INT minval,
6520 HOST_WIDE_INT maxval)
6522 rtx elt;
6523 return (const_vec_duplicate_p (x, &elt)
6524 && CONST_INT_P (elt)
6525 && IN_RANGE (INTVAL (elt), minval, maxval));
6528 bool
6529 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6531 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6534 /* Return true if VEC is a constant in which every element is in the range
6535 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6537 static bool
6538 aarch64_const_vec_all_in_range_p (rtx vec,
6539 HOST_WIDE_INT minval,
6540 HOST_WIDE_INT maxval)
6542 if (GET_CODE (vec) != CONST_VECTOR
6543 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6544 return false;
6546 int nunits;
6547 if (!CONST_VECTOR_STEPPED_P (vec))
6548 nunits = const_vector_encoded_nelts (vec);
6549 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6550 return false;
6552 for (int i = 0; i < nunits; i++)
6554 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6555 if (!CONST_INT_P (vec_elem)
6556 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6557 return false;
6559 return true;
6562 /* N Z C V. */
6563 #define AARCH64_CC_V 1
6564 #define AARCH64_CC_C (1 << 1)
6565 #define AARCH64_CC_Z (1 << 2)
6566 #define AARCH64_CC_N (1 << 3)
6568 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6569 static const int aarch64_nzcv_codes[] =
6571 0, /* EQ, Z == 1. */
6572 AARCH64_CC_Z, /* NE, Z == 0. */
6573 0, /* CS, C == 1. */
6574 AARCH64_CC_C, /* CC, C == 0. */
6575 0, /* MI, N == 1. */
6576 AARCH64_CC_N, /* PL, N == 0. */
6577 0, /* VS, V == 1. */
6578 AARCH64_CC_V, /* VC, V == 0. */
6579 0, /* HI, C == 1 && Z == 0. */
6580 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6581 AARCH64_CC_V, /* GE, N == V. */
6582 0, /* LT, N != V. */
6583 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6584 0, /* LE, !(Z == 0 && N == V). */
6585 0, /* AL, Any. */
6586 0 /* NV, Any. */
6589 /* Print floating-point vector immediate operand X to F, negating it
6590 first if NEGATE is true. Return true on success, false if it isn't
6591 a constant we can handle. */
6593 static bool
6594 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6596 rtx elt;
6598 if (!const_vec_duplicate_p (x, &elt))
6599 return false;
6601 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6602 if (negate)
6603 r = real_value_negate (&r);
6605 /* We only handle the SVE single-bit immediates here. */
6606 if (real_equal (&r, &dconst0))
6607 asm_fprintf (f, "0.0");
6608 else if (real_equal (&r, &dconst1))
6609 asm_fprintf (f, "1.0");
6610 else if (real_equal (&r, &dconsthalf))
6611 asm_fprintf (f, "0.5");
6612 else
6613 return false;
6615 return true;
6618 /* Return the equivalent letter for size. */
6619 static char
6620 sizetochar (int size)
6622 switch (size)
6624 case 64: return 'd';
6625 case 32: return 's';
6626 case 16: return 'h';
6627 case 8 : return 'b';
6628 default: gcc_unreachable ();
6632 /* Print operand X to file F in a target specific manner according to CODE.
6633 The acceptable formatting commands given by CODE are:
6634 'c': An integer or symbol address without a preceding #
6635 sign.
6636 'C': Take the duplicated element in a vector constant
6637 and print it in hex.
6638 'D': Take the duplicated element in a vector constant
6639 and print it as an unsigned integer, in decimal.
6640 'e': Print the sign/zero-extend size as a character 8->b,
6641 16->h, 32->w.
6642 'p': Prints N such that 2^N == X (X must be power of 2 and
6643 const int).
6644 'P': Print the number of non-zero bits in X (a const_int).
6645 'H': Print the higher numbered register of a pair (TImode)
6646 of regs.
6647 'm': Print a condition (eq, ne, etc).
6648 'M': Same as 'm', but invert condition.
6649 'N': Take the duplicated element in a vector constant
6650 and print the negative of it in decimal.
6651 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6652 'S/T/U/V': Print a FP/SIMD register name for a register list.
6653 The register printed is the FP/SIMD register name
6654 of X + 0/1/2/3 for S/T/U/V.
6655 'R': Print a scalar FP/SIMD register name + 1.
6656 'X': Print bottom 16 bits of integer constant in hex.
6657 'w/x': Print a general register name or the zero register
6658 (32-bit or 64-bit).
6659 '0': Print a normal operand; if it's a general register,
6660 then we assume DImode.
6661 'k': Print NZCV for conditional compare instructions.
6662 'A': Output address constant representing the first
6663 argument of X, specifying a relocation offset
6664 if appropriate.
6665 'L': Output constant address specified by X
6666 with a relocation offset if appropriate.
6667 'G': Prints address of X, specifying a PC relative
6668 relocation mode if appropriate.
6669 'y': Output address of LDP or STP - this is used for
6670 some LDP/STPs which don't use a PARALLEL in their
6671 pattern (so the mode needs to be adjusted).
6672 'z': Output address of a typical LDP or STP. */
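/* A few informal examples of the codes above: with x = (reg:DI x3) the
   template fragment "%w0" prints "w3" and "%x0" prints "x3"; with a
   duplicated vector constant such as (const_vector:V4SI [17 17 17 17]),
   "%C0" prints 0x11 and "%D0" prints 17.  */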
6674 static void
6675 aarch64_print_operand (FILE *f, rtx x, int code)
6677 rtx elt;
6678 switch (code)
6680 case 'c':
6681 switch (GET_CODE (x))
6683 case CONST_INT:
6684 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6685 break;
6687 case SYMBOL_REF:
6688 output_addr_const (f, x);
6689 break;
6691 case CONST:
6692 if (GET_CODE (XEXP (x, 0)) == PLUS
6693 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6695 output_addr_const (f, x);
6696 break;
6698 /* Fall through. */
6700 default:
6701 output_operand_lossage ("unsupported operand for code '%c'", code);
6703 break;
6705 case 'e':
6707 int n;
6709 if (!CONST_INT_P (x)
6710 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6712 output_operand_lossage ("invalid operand for '%%%c'", code);
6713 return;
6716 switch (n)
6718 case 3:
6719 fputc ('b', f);
6720 break;
6721 case 4:
6722 fputc ('h', f);
6723 break;
6724 case 5:
6725 fputc ('w', f);
6726 break;
6727 default:
6728 output_operand_lossage ("invalid operand for '%%%c'", code);
6729 return;
6732 break;
6734 case 'p':
6736 int n;
6738 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6740 output_operand_lossage ("invalid operand for '%%%c'", code);
6741 return;
6744 asm_fprintf (f, "%d", n);
6746 break;
6748 case 'P':
6749 if (!CONST_INT_P (x))
6751 output_operand_lossage ("invalid operand for '%%%c'", code);
6752 return;
6755 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6756 break;
6758 case 'H':
6759 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6761 output_operand_lossage ("invalid operand for '%%%c'", code);
6762 return;
6765 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6766 break;
6768 case 'M':
6769 case 'm':
6771 int cond_code;
6772 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6773 if (x == const_true_rtx)
6775 if (code == 'M')
6776 fputs ("nv", f);
6777 return;
6780 if (!COMPARISON_P (x))
6782 output_operand_lossage ("invalid operand for '%%%c'", code);
6783 return;
6786 cond_code = aarch64_get_condition_code (x);
6787 gcc_assert (cond_code >= 0);
6788 if (code == 'M')
6789 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6790 fputs (aarch64_condition_codes[cond_code], f);
6792 break;
6794 case 'N':
6795 if (!const_vec_duplicate_p (x, &elt))
6797 output_operand_lossage ("invalid vector constant");
6798 return;
6801 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6802 asm_fprintf (f, "%wd", -INTVAL (elt));
6803 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6804 && aarch64_print_vector_float_operand (f, x, true))
6806 else
6808 output_operand_lossage ("invalid vector constant");
6809 return;
6811 break;
6813 case 'b':
6814 case 'h':
6815 case 's':
6816 case 'd':
6817 case 'q':
6818 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6820 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6821 return;
6823 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6824 break;
6826 case 'S':
6827 case 'T':
6828 case 'U':
6829 case 'V':
6830 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6832 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6833 return;
6835 asm_fprintf (f, "%c%d",
6836 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6837 REGNO (x) - V0_REGNUM + (code - 'S'));
6838 break;
6840 case 'R':
6841 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6843 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6844 return;
6846 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6847 break;
6849 case 'X':
6850 if (!CONST_INT_P (x))
6852 output_operand_lossage ("invalid operand for '%%%c'", code);
6853 return;
6855 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6856 break;
6858 case 'C':
6860 /* Print a replicated constant in hex. */
6861 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6863 output_operand_lossage ("invalid operand for '%%%c'", code);
6864 return;
6866 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6867 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6869 break;
6871 case 'D':
6873 /* Print a replicated constant in decimal, treating it as
6874 unsigned. */
6875 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6877 output_operand_lossage ("invalid operand for '%%%c'", code);
6878 return;
6880 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6881 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6883 break;
6885 case 'w':
6886 case 'x':
6887 if (x == const0_rtx
6888 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6890 asm_fprintf (f, "%czr", code);
6891 break;
6894 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6896 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6897 break;
6900 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6902 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6903 break;
6906 /* Fall through */
6908 case 0:
6909 if (x == NULL)
6911 output_operand_lossage ("missing operand");
6912 return;
6915 switch (GET_CODE (x))
6917 case REG:
6918 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6920 if (REG_NREGS (x) == 1)
6921 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6922 else
6924 char suffix
6925 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6926 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6927 REGNO (x) - V0_REGNUM, suffix,
6928 END_REGNO (x) - V0_REGNUM - 1, suffix);
6931 else
6932 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6933 break;
6935 case MEM:
6936 output_address (GET_MODE (x), XEXP (x, 0));
6937 break;
6939 case LABEL_REF:
6940 case SYMBOL_REF:
6941 output_addr_const (asm_out_file, x);
6942 break;
6944 case CONST_INT:
6945 asm_fprintf (f, "%wd", INTVAL (x));
6946 break;
6948 case CONST:
6949 if (!VECTOR_MODE_P (GET_MODE (x)))
6951 output_addr_const (asm_out_file, x);
6952 break;
6954 /* fall through */
6956 case CONST_VECTOR:
6957 if (!const_vec_duplicate_p (x, &elt))
6959 output_operand_lossage ("invalid vector constant");
6960 return;
6963 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6964 asm_fprintf (f, "%wd", INTVAL (elt));
6965 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6966 && aarch64_print_vector_float_operand (f, x, false))
6968 else
6970 output_operand_lossage ("invalid vector constant");
6971 return;
6973 break;
6975 case CONST_DOUBLE:
6976 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6977 be getting CONST_DOUBLEs holding integers. */
6978 gcc_assert (GET_MODE (x) != VOIDmode);
6979 if (aarch64_float_const_zero_rtx_p (x))
6981 fputc ('0', f);
6982 break;
6984 else if (aarch64_float_const_representable_p (x))
6986 #define buf_size 20
6987 char float_buf[buf_size] = {'\0'};
6988 real_to_decimal_for_mode (float_buf,
6989 CONST_DOUBLE_REAL_VALUE (x),
6990 buf_size, buf_size,
6991 1, GET_MODE (x));
6992 asm_fprintf (asm_out_file, "%s", float_buf);
6993 break;
6994 #undef buf_size
6996 output_operand_lossage ("invalid constant");
6997 return;
6998 default:
6999 output_operand_lossage ("invalid operand");
7000 return;
7002 break;
7004 case 'A':
7005 if (GET_CODE (x) == HIGH)
7006 x = XEXP (x, 0);
7008 switch (aarch64_classify_symbolic_expression (x))
7010 case SYMBOL_SMALL_GOT_4G:
7011 asm_fprintf (asm_out_file, ":got:");
7012 break;
7014 case SYMBOL_SMALL_TLSGD:
7015 asm_fprintf (asm_out_file, ":tlsgd:");
7016 break;
7018 case SYMBOL_SMALL_TLSDESC:
7019 asm_fprintf (asm_out_file, ":tlsdesc:");
7020 break;
7022 case SYMBOL_SMALL_TLSIE:
7023 asm_fprintf (asm_out_file, ":gottprel:");
7024 break;
7026 case SYMBOL_TLSLE24:
7027 asm_fprintf (asm_out_file, ":tprel:");
7028 break;
7030 case SYMBOL_TINY_GOT:
7031 gcc_unreachable ();
7032 break;
7034 default:
7035 break;
7037 output_addr_const (asm_out_file, x);
7038 break;
7040 case 'L':
7041 switch (aarch64_classify_symbolic_expression (x))
7043 case SYMBOL_SMALL_GOT_4G:
7044 asm_fprintf (asm_out_file, ":lo12:");
7045 break;
7047 case SYMBOL_SMALL_TLSGD:
7048 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7049 break;
7051 case SYMBOL_SMALL_TLSDESC:
7052 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7053 break;
7055 case SYMBOL_SMALL_TLSIE:
7056 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7057 break;
7059 case SYMBOL_TLSLE12:
7060 asm_fprintf (asm_out_file, ":tprel_lo12:");
7061 break;
7063 case SYMBOL_TLSLE24:
7064 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7065 break;
7067 case SYMBOL_TINY_GOT:
7068 asm_fprintf (asm_out_file, ":got:");
7069 break;
7071 case SYMBOL_TINY_TLSIE:
7072 asm_fprintf (asm_out_file, ":gottprel:");
7073 break;
7075 default:
7076 break;
7078 output_addr_const (asm_out_file, x);
7079 break;
7081 case 'G':
7082 switch (aarch64_classify_symbolic_expression (x))
7084 case SYMBOL_TLSLE24:
7085 asm_fprintf (asm_out_file, ":tprel_hi12:");
7086 break;
7087 default:
7088 break;
7090 output_addr_const (asm_out_file, x);
7091 break;
7093 case 'k':
7095 HOST_WIDE_INT cond_code;
7097 if (!CONST_INT_P (x))
7099 output_operand_lossage ("invalid operand for '%%%c'", code);
7100 return;
7103 cond_code = INTVAL (x);
7104 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7105 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7107 break;
7109 case 'y':
7110 case 'z':
7112 machine_mode mode = GET_MODE (x);
7114 if (GET_CODE (x) != MEM
7115 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7117 output_operand_lossage ("invalid operand for '%%%c'", code);
7118 return;
7121 if (code == 'y')
7122 /* LDP/STP which uses a single double-width memory operand.
7123 Adjust the mode to appear like a typical LDP/STP.
7124 Currently this is supported for 16-byte accesses only. */
7125 mode = DFmode;
7127 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7128 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7130 break;
7132 default:
7133 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7134 return;
7138 /* Print address 'x' of a memory access with mode 'mode'.
7139 'type' is the aarch64_addr_query_type context required by
7140 aarch64_classify_address (e.g. ADDR_QUERY_M or ADDR_QUERY_LDP_STP). */
7141 static bool
7142 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7143 aarch64_addr_query_type type)
7145 struct aarch64_address_info addr;
7146 unsigned int size;
7148 /* Check all addresses are Pmode - including ILP32. */
7149 if (GET_MODE (x) != Pmode)
7150 output_operand_lossage ("invalid address mode");
7152 if (aarch64_classify_address (&addr, x, mode, true, type))
7153 switch (addr.type)
7155 case ADDRESS_REG_IMM:
7156 if (known_eq (addr.const_offset, 0))
7157 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7158 else if (aarch64_sve_data_mode_p (mode))
7160 HOST_WIDE_INT vnum
7161 = exact_div (addr.const_offset,
7162 BYTES_PER_SVE_VECTOR).to_constant ();
7163 asm_fprintf (f, "[%s, #%wd, mul vl]",
7164 reg_names[REGNO (addr.base)], vnum);
7166 else if (aarch64_sve_pred_mode_p (mode))
7168 HOST_WIDE_INT vnum
7169 = exact_div (addr.const_offset,
7170 BYTES_PER_SVE_PRED).to_constant ();
7171 asm_fprintf (f, "[%s, #%wd, mul vl]",
7172 reg_names[REGNO (addr.base)], vnum);
7174 else
7175 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7176 INTVAL (addr.offset));
7177 return true;
7179 case ADDRESS_REG_REG:
7180 if (addr.shift == 0)
7181 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7182 reg_names [REGNO (addr.offset)]);
7183 else
7184 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7185 reg_names [REGNO (addr.offset)], addr.shift);
7186 return true;
7188 case ADDRESS_REG_UXTW:
7189 if (addr.shift == 0)
7190 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7191 REGNO (addr.offset) - R0_REGNUM);
7192 else
7193 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7194 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7195 return true;
7197 case ADDRESS_REG_SXTW:
7198 if (addr.shift == 0)
7199 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7200 REGNO (addr.offset) - R0_REGNUM);
7201 else
7202 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7203 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7204 return true;
7206 case ADDRESS_REG_WB:
7207 /* Writeback is only supported for fixed-width modes. */
7208 size = GET_MODE_SIZE (mode).to_constant ();
7209 switch (GET_CODE (x))
7211 case PRE_INC:
7212 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7213 return true;
7214 case POST_INC:
7215 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7216 return true;
7217 case PRE_DEC:
7218 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7219 return true;
7220 case POST_DEC:
7221 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7222 return true;
7223 case PRE_MODIFY:
7224 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7225 INTVAL (addr.offset));
7226 return true;
7227 case POST_MODIFY:
7228 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7229 INTVAL (addr.offset));
7230 return true;
7231 default:
7232 break;
7234 break;
7236 case ADDRESS_LO_SUM:
7237 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7238 output_addr_const (f, addr.offset);
7239 asm_fprintf (f, "]");
7240 return true;
7242 case ADDRESS_SYMBOLIC:
7243 output_addr_const (f, x);
7244 return true;
7247 return false;
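/* Example outputs from the cases above: an SVE data-mode access whose
   constant offset is exactly two vectors prints as [x0, #2, mul vl];
   a post-increment DImode access prints as [x0], 8; and a LO_SUM
   address prints as [x0, #:lo12:symbol].  */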
7250 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7251 static bool
7252 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7254 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7257 /* Print address 'x' of a memory access with mode 'mode'. */
7258 static void
7259 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7261 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7262 output_addr_const (f, x);
7265 bool
7266 aarch64_label_mentioned_p (rtx x)
7268 const char *fmt;
7269 int i;
7271 if (GET_CODE (x) == LABEL_REF)
7272 return true;
7274 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7275 referencing instruction, but they are constant offsets, not
7276 symbols. */
7277 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7278 return false;
7280 fmt = GET_RTX_FORMAT (GET_CODE (x));
7281 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7283 if (fmt[i] == 'E')
7285 int j;
7287 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7288 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7289 return 1;
7291 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7292 return 1;
7295 return 0;
7298 /* Implement REGNO_REG_CLASS. */
7300 enum reg_class
7301 aarch64_regno_regclass (unsigned regno)
7303 if (GP_REGNUM_P (regno))
7304 return GENERAL_REGS;
7306 if (regno == SP_REGNUM)
7307 return STACK_REG;
7309 if (regno == FRAME_POINTER_REGNUM
7310 || regno == ARG_POINTER_REGNUM)
7311 return POINTER_REGS;
7313 if (FP_REGNUM_P (regno))
7314 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7316 if (PR_REGNUM_P (regno))
7317 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7319 return NO_REGS;
7322 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7323 If OFFSET is out of range, return an offset of an anchor point
7324 that is in range. Return 0 otherwise. */
7326 static HOST_WIDE_INT
7327 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7328 machine_mode mode)
7330 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7331 if (size > 16)
7332 return (offset + 0x400) & ~0x7f0;
7334 /* For offsets that aren't a multiple of the access size, the limit is
7335 -256...255. */
7336 if (offset & (size - 1))
7338 /* BLKmode typically uses LDP of X-registers. */
7339 if (mode == BLKmode)
7340 return (offset + 512) & ~0x3ff;
7341 return (offset + 0x100) & ~0x1ff;
7344 /* Small negative offsets are supported. */
7345 if (IN_RANGE (offset, -256, 0))
7346 return 0;
7348 if (mode == TImode || mode == TFmode)
7349 return (offset + 0x100) & ~0x1ff;
7351 /* Use a 12-bit offset scaled by the access size. */
7352 return offset & (~0xfff * size);
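/* Worked example: for a DImode (size 8) access at offset 0x12008, the
   offset is a multiple of the access size, so the final case applies:
   0x12008 & (~0xfff * 8) = 0x10000 is returned as the anchor, and the
   remaining displacement 0x2008 still fits the scaled 12-bit range.  */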
7355 static rtx
7356 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7358 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7359 where mask is selected by alignment and size of the offset.
7360 We try to pick as large a range for the offset as possible to
7361 maximize the chance of a CSE. However, for aligned addresses
7362 we limit the range to 4k so that structures with different sized
7363 elements are likely to use the same base. We need to be careful
7364 not to split a CONST for some forms of address expression, otherwise
7365 it will generate sub-optimal code. */
7367 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7369 rtx base = XEXP (x, 0);
7370 rtx offset_rtx = XEXP (x, 1);
7371 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7373 if (GET_CODE (base) == PLUS)
7375 rtx op0 = XEXP (base, 0);
7376 rtx op1 = XEXP (base, 1);
7378 /* Force any scaling into a temp for CSE. */
7379 op0 = force_reg (Pmode, op0);
7380 op1 = force_reg (Pmode, op1);
7382 /* Let the pointer register be in op0. */
7383 if (REG_POINTER (op1))
7384 std::swap (op0, op1);
7386 /* If the pointer is virtual or frame related, then we know that
7387 virtual register instantiation or register elimination is going
7388 to apply a second constant. We want the two constants folded
7389 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7390 if (virt_or_elim_regno_p (REGNO (op0)))
7392 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7393 NULL_RTX, true, OPTAB_DIRECT);
7394 return gen_rtx_PLUS (Pmode, base, op1);
7397 /* Otherwise, in order to encourage CSE (and thence loop strength
7398 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7399 base = expand_binop (Pmode, add_optab, op0, op1,
7400 NULL_RTX, true, OPTAB_DIRECT);
7401 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7404 HOST_WIDE_INT size;
7405 if (GET_MODE_SIZE (mode).is_constant (&size))
7407 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7408 mode);
7409 if (base_offset != 0)
7411 base = plus_constant (Pmode, base, base_offset);
7412 base = force_operand (base, NULL_RTX);
7413 return plus_constant (Pmode, base, offset - base_offset);
7418 return x;
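/* E.g. an address of the form (plus (reg x0) (const_int 0x12008)) for a
   DImode access is rewritten as tmp = x0 + 0x10000 followed by a
   [tmp, 0x2008] style reference, using the anchor chosen by
   aarch64_anchor_offset above.  */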
7421 /* Return the reload icode required for a constant pool in mode. */
7422 static enum insn_code
7423 aarch64_constant_pool_reload_icode (machine_mode mode)
7425 switch (mode)
7427 case E_SFmode:
7428 return CODE_FOR_aarch64_reload_movcpsfdi;
7430 case E_DFmode:
7431 return CODE_FOR_aarch64_reload_movcpdfdi;
7433 case E_TFmode:
7434 return CODE_FOR_aarch64_reload_movcptfdi;
7436 case E_V8QImode:
7437 return CODE_FOR_aarch64_reload_movcpv8qidi;
7439 case E_V16QImode:
7440 return CODE_FOR_aarch64_reload_movcpv16qidi;
7442 case E_V4HImode:
7443 return CODE_FOR_aarch64_reload_movcpv4hidi;
7445 case E_V8HImode:
7446 return CODE_FOR_aarch64_reload_movcpv8hidi;
7448 case E_V2SImode:
7449 return CODE_FOR_aarch64_reload_movcpv2sidi;
7451 case E_V4SImode:
7452 return CODE_FOR_aarch64_reload_movcpv4sidi;
7454 case E_V2DImode:
7455 return CODE_FOR_aarch64_reload_movcpv2didi;
7457 case E_V2DFmode:
7458 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7460 default:
7461 gcc_unreachable ();
7464 gcc_unreachable ();
7466 static reg_class_t
7467 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7468 reg_class_t rclass,
7469 machine_mode mode,
7470 secondary_reload_info *sri)
7472 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7473 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7474 comment at the head of aarch64-sve.md for more details about the
7475 big-endian handling. */
7476 if (BYTES_BIG_ENDIAN
7477 && reg_class_subset_p (rclass, FP_REGS)
7478 && !((REG_P (x) && HARD_REGISTER_P (x))
7479 || aarch64_simd_valid_immediate (x, NULL))
7480 && aarch64_sve_data_mode_p (mode))
7482 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7483 return NO_REGS;
7486 /* If we have to disable direct literal pool loads and stores because the
7487 function is too big, then we need a scratch register. */
7488 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7489 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7490 || targetm.vector_mode_supported_p (GET_MODE (x)))
7491 && !aarch64_pcrelative_literal_loads)
7493 sri->icode = aarch64_constant_pool_reload_icode (mode);
7494 return NO_REGS;
7497 /* Without the TARGET_SIMD instructions we cannot move a Q register
7498 to a Q register directly. We need a scratch. */
7499 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7500 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7501 && reg_class_subset_p (rclass, FP_REGS))
7503 if (mode == TFmode)
7504 sri->icode = CODE_FOR_aarch64_reload_movtf;
7505 else if (mode == TImode)
7506 sri->icode = CODE_FOR_aarch64_reload_movti;
7507 return NO_REGS;
7510 /* A TFmode or TImode memory access should be handled via an FP_REGS
7511 because AArch64 has richer addressing modes for LDR/STR instructions
7512 than LDP/STP instructions. */
7513 if (TARGET_FLOAT && rclass == GENERAL_REGS
7514 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7515 return FP_REGS;
7517 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7518 return GENERAL_REGS;
7520 return NO_REGS;
7523 static bool
7524 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7526 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7528 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7529 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7530 if (frame_pointer_needed)
7531 return to == HARD_FRAME_POINTER_REGNUM;
7532 return true;
7535 poly_int64
7536 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7538 aarch64_layout_frame ();
7540 if (to == HARD_FRAME_POINTER_REGNUM)
7542 if (from == ARG_POINTER_REGNUM)
7543 return cfun->machine->frame.hard_fp_offset;
7545 if (from == FRAME_POINTER_REGNUM)
7546 return cfun->machine->frame.hard_fp_offset
7547 - cfun->machine->frame.locals_offset;
7550 if (to == STACK_POINTER_REGNUM)
7552 if (from == FRAME_POINTER_REGNUM)
7553 return cfun->machine->frame.frame_size
7554 - cfun->machine->frame.locals_offset;
7557 return cfun->machine->frame.frame_size;
7560 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7561 previous frame. */
7564 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7566 if (count != 0)
7567 return const0_rtx;
7568 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7572 static void
7573 aarch64_asm_trampoline_template (FILE *f)
7575 if (TARGET_ILP32)
7577 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7578 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7580 else
7582 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7583 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7585 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7586 assemble_aligned_integer (4, const0_rtx);
7587 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7588 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7591 static void
7592 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7594 rtx fnaddr, mem, a_tramp;
7595 const int tramp_code_sz = 16;
7597   /* Don't need to copy the trailing D-words; we fill those in below.  */
7598 emit_block_move (m_tramp, assemble_trampoline_template (),
7599 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7600 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7601 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7602 if (GET_MODE (fnaddr) != ptr_mode)
7603 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7604 emit_move_insn (mem, fnaddr);
7606 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7607 emit_move_insn (mem, chain_value);
7609 /* XXX We should really define a "clear_cache" pattern and use
7610 gen_clear_cache(). */
7611 a_tramp = XEXP (m_tramp, 0);
7612 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7613 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7614 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7615 ptr_mode);
7618 static unsigned char
7619 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7621 /* ??? Logically we should only need to provide a value when
7622 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7623 can hold MODE, but at the moment we need to handle all modes.
7624 Just ignore any runtime parts for registers that can't store them. */
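     /* For example, a 16-byte vector mode such as V4SImode counts as a
	single vector register (CEIL (16, UNITS_PER_VREG)), whereas TImode
	counts as CEIL (16, UNITS_PER_WORD) == 2 registers.  */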
7625 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7626 unsigned int nregs;
7627 switch (regclass)
7629 case TAILCALL_ADDR_REGS:
7630 case POINTER_REGS:
7631 case GENERAL_REGS:
7632 case ALL_REGS:
7633 case POINTER_AND_FP_REGS:
7634 case FP_REGS:
7635 case FP_LO_REGS:
7636 if (aarch64_sve_data_mode_p (mode)
7637 && constant_multiple_p (GET_MODE_SIZE (mode),
7638 BYTES_PER_SVE_VECTOR, &nregs))
7639 return nregs;
7640 return (aarch64_vector_data_mode_p (mode)
7641 ? CEIL (lowest_size, UNITS_PER_VREG)
7642 : CEIL (lowest_size, UNITS_PER_WORD));
7643 case STACK_REG:
7644 case PR_REGS:
7645 case PR_LO_REGS:
7646 case PR_HI_REGS:
7647 return 1;
7649 case NO_REGS:
7650 return 0;
7652 default:
7653 break;
7655 gcc_unreachable ();
7658 static reg_class_t
7659 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7661 if (regclass == POINTER_REGS)
7662 return GENERAL_REGS;
7664 if (regclass == STACK_REG)
7666 if (REG_P(x)
7667 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7668 return regclass;
7670 return NO_REGS;
7673   /* Register elimination can result in a request for
7674      SP+constant->FP_REGS.  We cannot support such operations which
7675      use SP as source and an FP_REG as destination, so reject them
7676      outright.  */
7677 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7679 rtx lhs = XEXP (x, 0);
7681 /* Look through a possible SUBREG introduced by ILP32. */
7682 if (GET_CODE (lhs) == SUBREG)
7683 lhs = SUBREG_REG (lhs);
7685 gcc_assert (REG_P (lhs));
7686 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7687 POINTER_REGS));
7688 return NO_REGS;
7691 return regclass;
7694 void
7695 aarch64_asm_output_labelref (FILE* f, const char *name)
7697 asm_fprintf (f, "%U%s", name);
7700 static void
7701 aarch64_elf_asm_constructor (rtx symbol, int priority)
7703 if (priority == DEFAULT_INIT_PRIORITY)
7704 default_ctor_section_asm_out_constructor (symbol, priority);
7705 else
7707 section *s;
7708 /* While priority is known to be in range [0, 65535], so 18 bytes
7709 would be enough, the compiler might not know that. To avoid
7710 -Wformat-truncation false positive, use a larger size. */
7711 char buf[23];
7712 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7713 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7714 switch_to_section (s);
7715 assemble_align (POINTER_SIZE);
7716 assemble_aligned_integer (POINTER_BYTES, symbol);
7720 static void
7721 aarch64_elf_asm_destructor (rtx symbol, int priority)
7723 if (priority == DEFAULT_INIT_PRIORITY)
7724 default_dtor_section_asm_out_destructor (symbol, priority);
7725 else
7727 section *s;
7728 /* While priority is known to be in range [0, 65535], so 18 bytes
7729 would be enough, the compiler might not know that. To avoid
7730 -Wformat-truncation false positive, use a larger size. */
7731 char buf[23];
7732 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7733 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7734 switch_to_section (s);
7735 assemble_align (POINTER_SIZE);
7736 assemble_aligned_integer (POINTER_BYTES, symbol);
7740 const char*
7741 aarch64_output_casesi (rtx *operands)
7743 char buf[100];
7744 char label[100];
7745 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7746 int index;
7747 static const char *const patterns[4][2] =
7750 "ldrb\t%w3, [%0,%w1,uxtw]",
7751 "add\t%3, %4, %w3, sxtb #2"
7754 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7755 "add\t%3, %4, %w3, sxth #2"
7758 "ldr\t%w3, [%0,%w1,uxtw #2]",
7759 "add\t%3, %4, %w3, sxtw #2"
7761 /* We assume that DImode is only generated when not optimizing and
7762 that we don't really need 64-bit address offsets. That would
7763 imply an object file with 8GB of code in a single function! */
7765 "ldr\t%w3, [%0,%w1,uxtw #2]",
7766 "add\t%3, %4, %w3, sxtw #2"
7770 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7772 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7773 index = exact_log2 (GET_MODE_SIZE (mode));
7775 gcc_assert (index >= 0 && index <= 3);
7777   /* Need to implement table size reduction, by changing the code below.  */
7778 output_asm_insn (patterns[index][0], operands);
7779 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7780 snprintf (buf, sizeof (buf),
7781 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7782 output_asm_insn (buf, operands);
7783 output_asm_insn (patterns[index][1], operands);
7784 output_asm_insn ("br\t%3", operands);
7785 assemble_label (asm_out_file, label);
7786 return "";
7790 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7791 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7792 operator. */
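     /* For example, aarch64_uxt_size (2, 0x3fc) returns 8, since 0x3fc is
	the byte mask 0xff scaled left by 2; combinations that do not match
	a UXTB, UXTH, or UXTW operand return 0.  */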
7795 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7797 if (shift >= 0 && shift <= 3)
7799 int size;
7800 for (size = 8; size <= 32; size *= 2)
7802 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7803 if (mask == bits << shift)
7804 return size;
7807 return 0;
7810 /* Constant pools are per function only when PC relative
7811 literal loads are true or we are in the large memory
7812 model. */
7814 static inline bool
7815 aarch64_can_use_per_function_literal_pools_p (void)
7817 return (aarch64_pcrelative_literal_loads
7818 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7821 static bool
7822 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7824 /* We can't use blocks for constants when we're using a per-function
7825 constant pool. */
7826 return !aarch64_can_use_per_function_literal_pools_p ();
7829 /* Select appropriate section for constants depending
7830 on where we place literal pools. */
7832 static section *
7833 aarch64_select_rtx_section (machine_mode mode,
7834 rtx x,
7835 unsigned HOST_WIDE_INT align)
7837 if (aarch64_can_use_per_function_literal_pools_p ())
7838 return function_section (current_function_decl);
7840 return default_elf_select_rtx_section (mode, x, align);
7843 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7844 void
7845 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7846 HOST_WIDE_INT offset)
7848 /* When using per-function literal pools, we must ensure that any code
7849 section is aligned to the minimal instruction length, lest we get
7850 errors from the assembler re "unaligned instructions". */
7851 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7852 ASM_OUTPUT_ALIGN (f, 2);
7855 /* Costs. */
7857 /* Helper function for rtx cost calculation. Strip a shift expression
7858 from X. Returns the inner operand if successful, or the original
7859 expression on failure. */
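     /* For example, both (ashift:DI (reg:DI) (const_int 3)) and the
	equivalent (mult:DI (reg:DI) (const_int 8)) strip down to the inner
	register operand.  */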
7860 static rtx
7861 aarch64_strip_shift (rtx x)
7863 rtx op = x;
7865 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7866 we can convert both to ROR during final output. */
7867 if ((GET_CODE (op) == ASHIFT
7868 || GET_CODE (op) == ASHIFTRT
7869 || GET_CODE (op) == LSHIFTRT
7870 || GET_CODE (op) == ROTATERT
7871 || GET_CODE (op) == ROTATE)
7872 && CONST_INT_P (XEXP (op, 1)))
7873 return XEXP (op, 0);
7875 if (GET_CODE (op) == MULT
7876 && CONST_INT_P (XEXP (op, 1))
7877 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7878 return XEXP (op, 0);
7880 return x;
7883 /* Helper function for rtx cost calculation. Strip an extend
7884 expression from X. Returns the inner operand if successful, or the
7885 original expression on failure. We deal with a number of possible
7886 canonicalization variations here. If STRIP_SHIFT is true, then
7887 we can strip off a shift also. */
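     /* For example, (zero_extend:DI (reg:SI)) strips to the SImode
	register, and with STRIP_SHIFT an outer (ashift ... (const_int 1..4))
	around the extend is removed first.  */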
7888 static rtx
7889 aarch64_strip_extend (rtx x, bool strip_shift)
7891 scalar_int_mode mode;
7892 rtx op = x;
7894 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7895 return op;
7897 /* Zero and sign extraction of a widened value. */
7898 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7899 && XEXP (op, 2) == const0_rtx
7900 && GET_CODE (XEXP (op, 0)) == MULT
7901 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7902 XEXP (op, 1)))
7903 return XEXP (XEXP (op, 0), 0);
7905 /* It can also be represented (for zero-extend) as an AND with an
7906 immediate. */
7907 if (GET_CODE (op) == AND
7908 && GET_CODE (XEXP (op, 0)) == MULT
7909 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7910 && CONST_INT_P (XEXP (op, 1))
7911 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7912 INTVAL (XEXP (op, 1))) != 0)
7913 return XEXP (XEXP (op, 0), 0);
7915 /* Now handle extended register, as this may also have an optional
7916 left shift by 1..4. */
7917 if (strip_shift
7918 && GET_CODE (op) == ASHIFT
7919 && CONST_INT_P (XEXP (op, 1))
7920 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7921 op = XEXP (op, 0);
7923 if (GET_CODE (op) == ZERO_EXTEND
7924 || GET_CODE (op) == SIGN_EXTEND)
7925 op = XEXP (op, 0);
7927 if (op != x)
7928 return op;
7930 return x;
7933 /* Return true iff CODE is a shift supported in combination
7934 with arithmetic instructions. */
7936 static bool
7937 aarch64_shift_p (enum rtx_code code)
7939 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7943 /* Return true iff X is a cheap shift without a sign extend. */
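     /* Only true on cores that set AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND,
	e.g. for (ashift x (const_int 3)) or the equivalent
	(mult x (const_int 8)), and only when the shifted operand is not a
	sign extend.  */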
7945 static bool
7946 aarch64_cheap_mult_shift_p (rtx x)
7948 rtx op0, op1;
7950 op0 = XEXP (x, 0);
7951 op1 = XEXP (x, 1);
7953 if (!(aarch64_tune_params.extra_tuning_flags
7954 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7955 return false;
7957 if (GET_CODE (op0) == SIGN_EXTEND)
7958 return false;
7960 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7961 && UINTVAL (op1) <= 4)
7962 return true;
7964 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7965 return false;
7967 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7969 if (l2 > 0 && l2 <= 4)
7970 return true;
7972 return false;
7975 /* Helper function for rtx cost calculation. Calculate the cost of
7976 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7977 Return the calculated cost of the expression, recursing manually in to
7978 operands where needed. */
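     /* For example, (mult x (const_int 4)) appearing inside a PLUS or MINUS
	is costed as a shift folded into the arithmetic instruction (or as
	free on cores with cheap shift+extend), rather than as a separate
	MUL/MADD.  */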
7980 static int
7981 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7983 rtx op0, op1;
7984 const struct cpu_cost_table *extra_cost
7985 = aarch64_tune_params.insn_extra_cost;
7986 int cost = 0;
7987 bool compound_p = (outer == PLUS || outer == MINUS);
7988 machine_mode mode = GET_MODE (x);
7990 gcc_checking_assert (code == MULT);
7992 op0 = XEXP (x, 0);
7993 op1 = XEXP (x, 1);
7995 if (VECTOR_MODE_P (mode))
7996 mode = GET_MODE_INNER (mode);
7998 /* Integer multiply/fma. */
7999 if (GET_MODE_CLASS (mode) == MODE_INT)
8001 /* The multiply will be canonicalized as a shift, cost it as such. */
8002 if (aarch64_shift_p (GET_CODE (x))
8003 || (CONST_INT_P (op1)
8004 && exact_log2 (INTVAL (op1)) > 0))
8006 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
8007 || GET_CODE (op0) == SIGN_EXTEND;
8008 if (speed)
8010 if (compound_p)
8012 /* If the shift is considered cheap,
8013 then don't add any cost. */
8014 if (aarch64_cheap_mult_shift_p (x))
8016 else if (REG_P (op1))
8017 /* ARITH + shift-by-register. */
8018 cost += extra_cost->alu.arith_shift_reg;
8019 else if (is_extend)
8020 /* ARITH + extended register. We don't have a cost field
8021 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
8022 cost += extra_cost->alu.extend_arith;
8023 else
8024 /* ARITH + shift-by-immediate. */
8025 cost += extra_cost->alu.arith_shift;
8027 else
8028 /* LSL (immediate). */
8029 cost += extra_cost->alu.shift;
8032 /* Strip extends as we will have costed them in the case above. */
8033 if (is_extend)
8034 op0 = aarch64_strip_extend (op0, true);
8036 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8038 return cost;
8041 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8042 compound and let the below cases handle it. After all, MNEG is a
8043 special-case alias of MSUB. */
8044 if (GET_CODE (op0) == NEG)
8046 op0 = XEXP (op0, 0);
8047 compound_p = true;
8050 /* Integer multiplies or FMAs have zero/sign extending variants. */
8051 if ((GET_CODE (op0) == ZERO_EXTEND
8052 && GET_CODE (op1) == ZERO_EXTEND)
8053 || (GET_CODE (op0) == SIGN_EXTEND
8054 && GET_CODE (op1) == SIGN_EXTEND))
8056 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8057 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8059 if (speed)
8061 if (compound_p)
8062 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8063 cost += extra_cost->mult[0].extend_add;
8064 else
8065 /* MUL/SMULL/UMULL. */
8066 cost += extra_cost->mult[0].extend;
8069 return cost;
8072 /* This is either an integer multiply or a MADD. In both cases
8073 we want to recurse and cost the operands. */
8074 cost += rtx_cost (op0, mode, MULT, 0, speed);
8075 cost += rtx_cost (op1, mode, MULT, 1, speed);
8077 if (speed)
8079 if (compound_p)
8080 /* MADD/MSUB. */
8081 cost += extra_cost->mult[mode == DImode].add;
8082 else
8083 /* MUL. */
8084 cost += extra_cost->mult[mode == DImode].simple;
8087 return cost;
8089 else
8091 if (speed)
8093 /* Floating-point FMA/FMUL can also support negations of the
8094 operands, unless the rounding mode is upward or downward in
8095 	     which case FNMUL is different from FMUL with operand negation.  */
8096 bool neg0 = GET_CODE (op0) == NEG;
8097 bool neg1 = GET_CODE (op1) == NEG;
8098 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8100 if (neg0)
8101 op0 = XEXP (op0, 0);
8102 if (neg1)
8103 op1 = XEXP (op1, 0);
8106 if (compound_p)
8107 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8108 cost += extra_cost->fp[mode == DFmode].fma;
8109 else
8110 /* FMUL/FNMUL. */
8111 cost += extra_cost->fp[mode == DFmode].mult;
8114 cost += rtx_cost (op0, mode, MULT, 0, speed);
8115 cost += rtx_cost (op1, mode, MULT, 1, speed);
8116 return cost;
8120 static int
8121 aarch64_address_cost (rtx x,
8122 machine_mode mode,
8123 addr_space_t as ATTRIBUTE_UNUSED,
8124 bool speed)
8126 enum rtx_code c = GET_CODE (x);
8127 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8128 struct aarch64_address_info info;
8129 int cost = 0;
8130 info.shift = 0;
8132 if (!aarch64_classify_address (&info, x, mode, false))
8134 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8136 /* This is a CONST or SYMBOL ref which will be split
8137 in a different way depending on the code model in use.
8138 Cost it through the generic infrastructure. */
8139 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8140 /* Divide through by the cost of one instruction to
8141 bring it to the same units as the address costs. */
8142 cost_symbol_ref /= COSTS_N_INSNS (1);
8143 /* The cost is then the cost of preparing the address,
8144 followed by an immediate (possibly 0) offset. */
8145 return cost_symbol_ref + addr_cost->imm_offset;
8147 else
8149 /* This is most likely a jump table from a case
8150 statement. */
8151 return addr_cost->register_offset;
8155 switch (info.type)
8157 case ADDRESS_LO_SUM:
8158 case ADDRESS_SYMBOLIC:
8159 case ADDRESS_REG_IMM:
8160 cost += addr_cost->imm_offset;
8161 break;
8163 case ADDRESS_REG_WB:
8164 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8165 cost += addr_cost->pre_modify;
8166 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8167 cost += addr_cost->post_modify;
8168 else
8169 gcc_unreachable ();
8171 break;
8173 case ADDRESS_REG_REG:
8174 cost += addr_cost->register_offset;
8175 break;
8177 case ADDRESS_REG_SXTW:
8178 cost += addr_cost->register_sextend;
8179 break;
8181 case ADDRESS_REG_UXTW:
8182 cost += addr_cost->register_zextend;
8183 break;
8185 default:
8186 gcc_unreachable ();
8190 if (info.shift > 0)
8192 /* For the sake of calculating the cost of the shifted register
8193 component, we can treat same sized modes in the same way. */
8194 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8195 cost += addr_cost->addr_scale_costs.hi;
8196 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8197 cost += addr_cost->addr_scale_costs.si;
8198 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8199 cost += addr_cost->addr_scale_costs.di;
8200 else
8201 /* We can't tell, or this is a 128-bit vector. */
8202 cost += addr_cost->addr_scale_costs.ti;
8205 return cost;
8208 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8209 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8210 to be taken. */
8213 aarch64_branch_cost (bool speed_p, bool predictable_p)
8215 /* When optimizing for speed, use the cost of unpredictable branches. */
8216 const struct cpu_branch_cost *branch_costs =
8217 aarch64_tune_params.branch_costs;
8219 if (!speed_p || predictable_p)
8220 return branch_costs->predictable;
8221 else
8222 return branch_costs->unpredictable;
8225 /* Return true if the RTX X in mode MODE is a zero or sign extract
8226 usable in an ADD or SUB (extended register) instruction. */
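     /* For example, (sign_extend:DI (reg:SI)) qualifies for the simple
	extended-register form (e.g. ADD Xd, Xn, Wm, SXTW), as does the
	(sign_extract (mult ...) ...) representation matched by the
	add_<optab><mode>_multp2 pattern.  */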
8227 static bool
8228 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8230 /* Catch add with a sign extract.
8231 This is add_<optab><mode>_multp2. */
8232 if (GET_CODE (x) == SIGN_EXTRACT
8233 || GET_CODE (x) == ZERO_EXTRACT)
8235 rtx op0 = XEXP (x, 0);
8236 rtx op1 = XEXP (x, 1);
8237 rtx op2 = XEXP (x, 2);
8239 if (GET_CODE (op0) == MULT
8240 && CONST_INT_P (op1)
8241 && op2 == const0_rtx
8242 && CONST_INT_P (XEXP (op0, 1))
8243 && aarch64_is_extend_from_extract (mode,
8244 XEXP (op0, 1),
8245 op1))
8247 return true;
8250 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8251 No shift. */
8252 else if (GET_CODE (x) == SIGN_EXTEND
8253 || GET_CODE (x) == ZERO_EXTEND)
8254 return REG_P (XEXP (x, 0));
8256 return false;
8259 static bool
8260 aarch64_frint_unspec_p (unsigned int u)
8262 switch (u)
8264 case UNSPEC_FRINTZ:
8265 case UNSPEC_FRINTP:
8266 case UNSPEC_FRINTM:
8267 case UNSPEC_FRINTA:
8268 case UNSPEC_FRINTN:
8269 case UNSPEC_FRINTX:
8270 case UNSPEC_FRINTI:
8271 return true;
8273 default:
8274 return false;
8278 /* Return true iff X is an rtx that will match an extr instruction
8279 i.e. as described in the *extr<mode>5_insn family of patterns.
8280      *RES_OP0 and *RES_OP1 will be set to the operands of the shifts involved
8281 on success and will be NULL_RTX otherwise. */
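     /* For example, in SImode (ior (ashift X (const_int 10))
	(lshiftrt Y (const_int 22))) matches, since the two shift amounts
	sum to the 32-bit mode width; X and Y are returned through the
	result pointers.  */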
8283 static bool
8284 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8286 rtx op0, op1;
8287 scalar_int_mode mode;
8288 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8289 return false;
8291 *res_op0 = NULL_RTX;
8292 *res_op1 = NULL_RTX;
8294 if (GET_CODE (x) != IOR)
8295 return false;
8297 op0 = XEXP (x, 0);
8298 op1 = XEXP (x, 1);
8300 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8301 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8303 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8304 if (GET_CODE (op1) == ASHIFT)
8305 std::swap (op0, op1);
8307 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8308 return false;
8310 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8311 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8313 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8314 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8316 *res_op0 = XEXP (op0, 0);
8317 *res_op1 = XEXP (op1, 0);
8318 return true;
8322 return false;
8325 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8326 storing it in *COST. Result is true if the total cost of the operation
8327 has now been calculated. */
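     /* For example, a conditional branch such as
	(if_then_else (eq (reg) (const_int 0)) (label_ref ...) (pc)) is
	recognized as a CBZ and only the register operand is costed.  */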
8328 static bool
8329 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8331 rtx inner;
8332 rtx comparator;
8333 enum rtx_code cmpcode;
8335 if (COMPARISON_P (op0))
8337 inner = XEXP (op0, 0);
8338 comparator = XEXP (op0, 1);
8339 cmpcode = GET_CODE (op0);
8341 else
8343 inner = op0;
8344 comparator = const0_rtx;
8345 cmpcode = NE;
8348 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8350 /* Conditional branch. */
8351 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8352 return true;
8353 else
8355 if (cmpcode == NE || cmpcode == EQ)
8357 if (comparator == const0_rtx)
8359 /* TBZ/TBNZ/CBZ/CBNZ. */
8360 if (GET_CODE (inner) == ZERO_EXTRACT)
8361 /* TBZ/TBNZ. */
8362 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8363 ZERO_EXTRACT, 0, speed);
8364 else
8365 /* CBZ/CBNZ. */
8366 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8368 return true;
8371 else if (cmpcode == LT || cmpcode == GE)
8373 /* TBZ/TBNZ. */
8374 if (comparator == const0_rtx)
8375 return true;
8379 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8381 /* CCMP. */
8382 if (GET_CODE (op1) == COMPARE)
8384 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8385 if (XEXP (op1, 1) == const0_rtx)
8386 *cost += 1;
8387 if (speed)
8389 machine_mode mode = GET_MODE (XEXP (op1, 0));
8390 const struct cpu_cost_table *extra_cost
8391 = aarch64_tune_params.insn_extra_cost;
8393 if (GET_MODE_CLASS (mode) == MODE_INT)
8394 *cost += extra_cost->alu.arith;
8395 else
8396 *cost += extra_cost->fp[mode == DFmode].compare;
8398 return true;
8401 /* It's a conditional operation based on the status flags,
8402 so it must be some flavor of CSEL. */
8404 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8405 if (GET_CODE (op1) == NEG
8406 || GET_CODE (op1) == NOT
8407 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8408 op1 = XEXP (op1, 0);
8409 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8411 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8412 op1 = XEXP (op1, 0);
8413 op2 = XEXP (op2, 0);
8416 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8417 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8418 return true;
8421 /* We don't know what this is, cost all operands. */
8422 return false;
8425 /* Check whether X is a bitfield operation of the form shift + extend that
8426 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8427 operand to which the bitfield operation is applied. Otherwise return
8428 NULL_RTX. */
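     /* For example, (zero_extend:SI (lshiftrt:HI (reg:HI) (const_int 3)))
	corresponds to a UBFX and the inner HImode register is returned.  */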
8430 static rtx
8431 aarch64_extend_bitfield_pattern_p (rtx x)
8433 rtx_code outer_code = GET_CODE (x);
8434 machine_mode outer_mode = GET_MODE (x);
8436 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8437 && outer_mode != SImode && outer_mode != DImode)
8438 return NULL_RTX;
8440 rtx inner = XEXP (x, 0);
8441 rtx_code inner_code = GET_CODE (inner);
8442 machine_mode inner_mode = GET_MODE (inner);
8443 rtx op = NULL_RTX;
8445 switch (inner_code)
8447 case ASHIFT:
8448 if (CONST_INT_P (XEXP (inner, 1))
8449 && (inner_mode == QImode || inner_mode == HImode))
8450 op = XEXP (inner, 0);
8451 break;
8452 case LSHIFTRT:
8453 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8454 && (inner_mode == QImode || inner_mode == HImode))
8455 op = XEXP (inner, 0);
8456 break;
8457 case ASHIFTRT:
8458 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8459 && (inner_mode == QImode || inner_mode == HImode))
8460 op = XEXP (inner, 0);
8461 break;
8462 default:
8463 break;
8466 return op;
8469 /* Return true if the mask and a shift amount from an RTX of the form
8470 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8471 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
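     /* For example, in SImode a mask of 0xff0 with a shift amount of 4 is
	accepted: (0xff0 >> 4) + 1 is a power of two and the low 4 bits of
	the mask are clear.  */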
8473 bool
8474 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8475 rtx shft_amnt)
8477 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8478 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8479 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8480 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8483 /* Calculate the cost of calculating X, storing it in *COST. Result
8484 is true if the total cost of the operation has now been calculated. */
8485 static bool
8486 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8487 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8489 rtx op0, op1, op2;
8490 const struct cpu_cost_table *extra_cost
8491 = aarch64_tune_params.insn_extra_cost;
8492 int code = GET_CODE (x);
8493 scalar_int_mode int_mode;
8495 /* By default, assume that everything has equivalent cost to the
8496 cheapest instruction. Any additional costs are applied as a delta
8497 above this default. */
8498 *cost = COSTS_N_INSNS (1);
8500 switch (code)
8502 case SET:
8503 /* The cost depends entirely on the operands to SET. */
8504 *cost = 0;
8505 op0 = SET_DEST (x);
8506 op1 = SET_SRC (x);
8508 switch (GET_CODE (op0))
8510 case MEM:
8511 if (speed)
8513 rtx address = XEXP (op0, 0);
8514 if (VECTOR_MODE_P (mode))
8515 *cost += extra_cost->ldst.storev;
8516 else if (GET_MODE_CLASS (mode) == MODE_INT)
8517 *cost += extra_cost->ldst.store;
8518 else if (mode == SFmode)
8519 *cost += extra_cost->ldst.storef;
8520 else if (mode == DFmode)
8521 *cost += extra_cost->ldst.stored;
8523 *cost +=
8524 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8525 0, speed));
8528 *cost += rtx_cost (op1, mode, SET, 1, speed);
8529 return true;
8531 case SUBREG:
8532 if (! REG_P (SUBREG_REG (op0)))
8533 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8535 /* Fall through. */
8536 case REG:
8537 /* The cost is one per vector-register copied. */
8538 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8540 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8541 *cost = COSTS_N_INSNS (nregs);
8543 /* const0_rtx is in general free, but we will use an
8544 instruction to set a register to 0. */
8545 else if (REG_P (op1) || op1 == const0_rtx)
8547 /* The cost is 1 per register copied. */
8548 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8549 *cost = COSTS_N_INSNS (nregs);
8551 else
8552 /* Cost is just the cost of the RHS of the set. */
8553 *cost += rtx_cost (op1, mode, SET, 1, speed);
8554 return true;
8556 case ZERO_EXTRACT:
8557 case SIGN_EXTRACT:
8558 /* Bit-field insertion. Strip any redundant widening of
8559 the RHS to meet the width of the target. */
8560 if (GET_CODE (op1) == SUBREG)
8561 op1 = SUBREG_REG (op1);
8562 if ((GET_CODE (op1) == ZERO_EXTEND
8563 || GET_CODE (op1) == SIGN_EXTEND)
8564 && CONST_INT_P (XEXP (op0, 1))
8565 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8566 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8567 op1 = XEXP (op1, 0);
8569 if (CONST_INT_P (op1))
8571 /* MOV immediate is assumed to always be cheap. */
8572 *cost = COSTS_N_INSNS (1);
8574 else
8576 /* BFM. */
8577 if (speed)
8578 *cost += extra_cost->alu.bfi;
8579 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8582 return true;
8584 default:
8585 /* We can't make sense of this, assume default cost. */
8586 *cost = COSTS_N_INSNS (1);
8587 return false;
8589 return false;
8591 case CONST_INT:
8592 /* If an instruction can incorporate a constant within the
8593 instruction, the instruction's expression avoids calling
8594 rtx_cost() on the constant. If rtx_cost() is called on a
8595 constant, then it is usually because the constant must be
8596 moved into a register by one or more instructions.
8598 The exception is constant 0, which can be expressed
8599 as XZR/WZR and is therefore free. The exception to this is
8600 if we have (set (reg) (const0_rtx)) in which case we must cost
8601 the move. However, we can catch that when we cost the SET, so
8602 we don't need to consider that here. */
8603 if (x == const0_rtx)
8604 *cost = 0;
8605 else
8607 	  /* To an approximation, the cost of building any other constant
8608 	     is proportional to the number of instructions required to
8609 	     build that constant.  This is true whether we
8610 are compiling for SPEED or otherwise. */
8611 if (!is_a <scalar_int_mode> (mode, &int_mode))
8612 int_mode = word_mode;
8613 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8614 (NULL_RTX, x, false, int_mode));
8616 return true;
8618 case CONST_DOUBLE:
8620 /* First determine number of instructions to do the move
8621 as an integer constant. */
8622 if (!aarch64_float_const_representable_p (x)
8623 && !aarch64_can_const_movi_rtx_p (x, mode)
8624 && aarch64_float_const_rtx_p (x))
8626 unsigned HOST_WIDE_INT ival;
8627 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8628 gcc_assert (succeed);
8630 scalar_int_mode imode = (mode == HFmode
8631 ? SImode
8632 : int_mode_for_mode (mode).require ());
8633 int ncost = aarch64_internal_mov_immediate
8634 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8635 *cost += COSTS_N_INSNS (ncost);
8636 return true;
8639 if (speed)
8641 /* mov[df,sf]_aarch64. */
8642 if (aarch64_float_const_representable_p (x))
8643 /* FMOV (scalar immediate). */
8644 *cost += extra_cost->fp[mode == DFmode].fpconst;
8645 else if (!aarch64_float_const_zero_rtx_p (x))
8647 /* This will be a load from memory. */
8648 if (mode == DFmode)
8649 *cost += extra_cost->ldst.loadd;
8650 else
8651 *cost += extra_cost->ldst.loadf;
8653 else
8654 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8655 	       or MOV v0.s[0], wzr, neither of which is modeled by the
8656 cost tables. Just use the default cost. */
8661 return true;
8663 case MEM:
8664 if (speed)
8666 /* For loads we want the base cost of a load, plus an
8667 approximation for the additional cost of the addressing
8668 mode. */
8669 rtx address = XEXP (x, 0);
8670 if (VECTOR_MODE_P (mode))
8671 *cost += extra_cost->ldst.loadv;
8672 else if (GET_MODE_CLASS (mode) == MODE_INT)
8673 *cost += extra_cost->ldst.load;
8674 else if (mode == SFmode)
8675 *cost += extra_cost->ldst.loadf;
8676 else if (mode == DFmode)
8677 *cost += extra_cost->ldst.loadd;
8679 *cost +=
8680 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8681 0, speed));
8684 return true;
8686 case NEG:
8687 op0 = XEXP (x, 0);
8689 if (VECTOR_MODE_P (mode))
8691 if (speed)
8693 /* FNEG. */
8694 *cost += extra_cost->vect.alu;
8696 return false;
8699 if (GET_MODE_CLASS (mode) == MODE_INT)
8701 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8702 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8704 /* CSETM. */
8705 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8706 return true;
8709 /* Cost this as SUB wzr, X. */
8710 op0 = CONST0_RTX (mode);
8711 op1 = XEXP (x, 0);
8712 goto cost_minus;
8715 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8717 /* Support (neg(fma...)) as a single instruction only if
8718 sign of zeros is unimportant. This matches the decision
8719 making in aarch64.md. */
8720 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8722 /* FNMADD. */
8723 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8724 return true;
8726 if (GET_CODE (op0) == MULT)
8728 /* FNMUL. */
8729 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8730 return true;
8732 if (speed)
8733 /* FNEG. */
8734 *cost += extra_cost->fp[mode == DFmode].neg;
8735 return false;
8738 return false;
8740 case CLRSB:
8741 case CLZ:
8742 if (speed)
8744 if (VECTOR_MODE_P (mode))
8745 *cost += extra_cost->vect.alu;
8746 else
8747 *cost += extra_cost->alu.clz;
8750 return false;
8752 case COMPARE:
8753 op0 = XEXP (x, 0);
8754 op1 = XEXP (x, 1);
8756 if (op1 == const0_rtx
8757 && GET_CODE (op0) == AND)
8759 x = op0;
8760 mode = GET_MODE (op0);
8761 goto cost_logic;
8764 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8766 	  /* TODO: A write to the CC flags possibly costs extra; this
8767 needs encoding in the cost tables. */
8769 mode = GET_MODE (op0);
8770 /* ANDS. */
8771 if (GET_CODE (op0) == AND)
8773 x = op0;
8774 goto cost_logic;
8777 if (GET_CODE (op0) == PLUS)
8779 /* ADDS (and CMN alias). */
8780 x = op0;
8781 goto cost_plus;
8784 if (GET_CODE (op0) == MINUS)
8786 /* SUBS. */
8787 x = op0;
8788 goto cost_minus;
8791 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8792 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8793 && CONST_INT_P (XEXP (op0, 2)))
8795 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8796 Handle it here directly rather than going to cost_logic
8797 since we know the immediate generated for the TST is valid
8798 so we can avoid creating an intermediate rtx for it only
8799 for costing purposes. */
8800 if (speed)
8801 *cost += extra_cost->alu.logical;
8803 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8804 ZERO_EXTRACT, 0, speed);
8805 return true;
8808 if (GET_CODE (op1) == NEG)
8810 /* CMN. */
8811 if (speed)
8812 *cost += extra_cost->alu.arith;
8814 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8815 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8816 return true;
8819 /* CMP.
8821 Compare can freely swap the order of operands, and
8822 canonicalization puts the more complex operation first.
8823 But the integer MINUS logic expects the shift/extend
8824 operation in op1. */
8825 if (! (REG_P (op0)
8826 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8828 op0 = XEXP (x, 1);
8829 op1 = XEXP (x, 0);
8831 goto cost_minus;
8834 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8836 /* FCMP. */
8837 if (speed)
8838 *cost += extra_cost->fp[mode == DFmode].compare;
8840 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8842 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8843 /* FCMP supports constant 0.0 for no extra cost. */
8844 return true;
8846 return false;
8849 if (VECTOR_MODE_P (mode))
8851 /* Vector compare. */
8852 if (speed)
8853 *cost += extra_cost->vect.alu;
8855 if (aarch64_float_const_zero_rtx_p (op1))
8857 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8858 cost. */
8859 return true;
8861 return false;
8863 return false;
8865 case MINUS:
8867 op0 = XEXP (x, 0);
8868 op1 = XEXP (x, 1);
8870 cost_minus:
8871 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8873 /* Detect valid immediates. */
8874 if ((GET_MODE_CLASS (mode) == MODE_INT
8875 || (GET_MODE_CLASS (mode) == MODE_CC
8876 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8877 && CONST_INT_P (op1)
8878 && aarch64_uimm12_shift (INTVAL (op1)))
8880 if (speed)
8881 /* SUB(S) (immediate). */
8882 *cost += extra_cost->alu.arith;
8883 return true;
8886 /* Look for SUB (extended register). */
8887 if (is_a <scalar_int_mode> (mode, &int_mode)
8888 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8890 if (speed)
8891 *cost += extra_cost->alu.extend_arith;
8893 op1 = aarch64_strip_extend (op1, true);
8894 *cost += rtx_cost (op1, VOIDmode,
8895 (enum rtx_code) GET_CODE (op1), 0, speed);
8896 return true;
8899 rtx new_op1 = aarch64_strip_extend (op1, false);
8901 /* Cost this as an FMA-alike operation. */
8902 if ((GET_CODE (new_op1) == MULT
8903 || aarch64_shift_p (GET_CODE (new_op1)))
8904 && code != COMPARE)
8906 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8907 (enum rtx_code) code,
8908 speed);
8909 return true;
8912 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8914 if (speed)
8916 if (VECTOR_MODE_P (mode))
8918 /* Vector SUB. */
8919 *cost += extra_cost->vect.alu;
8921 else if (GET_MODE_CLASS (mode) == MODE_INT)
8923 /* SUB(S). */
8924 *cost += extra_cost->alu.arith;
8926 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8928 /* FSUB. */
8929 *cost += extra_cost->fp[mode == DFmode].addsub;
8932 return true;
8935 case PLUS:
8937 rtx new_op0;
8939 op0 = XEXP (x, 0);
8940 op1 = XEXP (x, 1);
8942 cost_plus:
8943 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8944 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8946 /* CSINC. */
8947 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8948 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8949 return true;
8952 if (GET_MODE_CLASS (mode) == MODE_INT
8953 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8954 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8956 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8958 if (speed)
8959 /* ADD (immediate). */
8960 *cost += extra_cost->alu.arith;
8961 return true;
8964 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8966 /* Look for ADD (extended register). */
8967 if (is_a <scalar_int_mode> (mode, &int_mode)
8968 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8970 if (speed)
8971 *cost += extra_cost->alu.extend_arith;
8973 op0 = aarch64_strip_extend (op0, true);
8974 *cost += rtx_cost (op0, VOIDmode,
8975 (enum rtx_code) GET_CODE (op0), 0, speed);
8976 return true;
8979 /* Strip any extend, leave shifts behind as we will
8980 cost them through mult_cost. */
8981 new_op0 = aarch64_strip_extend (op0, false);
8983 if (GET_CODE (new_op0) == MULT
8984 || aarch64_shift_p (GET_CODE (new_op0)))
8986 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8987 speed);
8988 return true;
8991 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8993 if (speed)
8995 if (VECTOR_MODE_P (mode))
8997 /* Vector ADD. */
8998 *cost += extra_cost->vect.alu;
9000 else if (GET_MODE_CLASS (mode) == MODE_INT)
9002 /* ADD. */
9003 *cost += extra_cost->alu.arith;
9005 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9007 /* FADD. */
9008 *cost += extra_cost->fp[mode == DFmode].addsub;
9011 return true;
9014 case BSWAP:
9015 *cost = COSTS_N_INSNS (1);
9017 if (speed)
9019 if (VECTOR_MODE_P (mode))
9020 *cost += extra_cost->vect.alu;
9021 else
9022 *cost += extra_cost->alu.rev;
9024 return false;
9026 case IOR:
9027 if (aarch_rev16_p (x))
9029 *cost = COSTS_N_INSNS (1);
9031 if (speed)
9033 if (VECTOR_MODE_P (mode))
9034 *cost += extra_cost->vect.alu;
9035 else
9036 *cost += extra_cost->alu.rev;
9038 return true;
9041 if (aarch64_extr_rtx_p (x, &op0, &op1))
9043 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9044 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9045 if (speed)
9046 *cost += extra_cost->alu.shift;
9048 return true;
9050 /* Fall through. */
9051 case XOR:
9052 case AND:
9053 cost_logic:
9054 op0 = XEXP (x, 0);
9055 op1 = XEXP (x, 1);
9057 if (VECTOR_MODE_P (mode))
9059 if (speed)
9060 *cost += extra_cost->vect.alu;
9061 return true;
9064 if (code == AND
9065 && GET_CODE (op0) == MULT
9066 && CONST_INT_P (XEXP (op0, 1))
9067 && CONST_INT_P (op1)
9068 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9069 INTVAL (op1)) != 0)
9071 /* This is a UBFM/SBFM. */
9072 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9073 if (speed)
9074 *cost += extra_cost->alu.bfx;
9075 return true;
9078 if (is_int_mode (mode, &int_mode))
9080 if (CONST_INT_P (op1))
9082 /* We have a mask + shift version of a UBFIZ
9083 i.e. the *andim_ashift<mode>_bfiz pattern. */
9084 if (GET_CODE (op0) == ASHIFT
9085 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9086 XEXP (op0, 1)))
9088 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9089 (enum rtx_code) code, 0, speed);
9090 if (speed)
9091 *cost += extra_cost->alu.bfx;
9093 return true;
9095 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9097 		  /* We possibly get the immediate for free; this is not
9098 modelled. */
9099 *cost += rtx_cost (op0, int_mode,
9100 (enum rtx_code) code, 0, speed);
9101 if (speed)
9102 *cost += extra_cost->alu.logical;
9104 return true;
9107 else
9109 rtx new_op0 = op0;
9111 /* Handle ORN, EON, or BIC. */
9112 if (GET_CODE (op0) == NOT)
9113 op0 = XEXP (op0, 0);
9115 new_op0 = aarch64_strip_shift (op0);
9117 /* If we had a shift on op0 then this is a logical-shift-
9118 by-register/immediate operation. Otherwise, this is just
9119 a logical operation. */
9120 if (speed)
9122 if (new_op0 != op0)
9124 /* Shift by immediate. */
9125 if (CONST_INT_P (XEXP (op0, 1)))
9126 *cost += extra_cost->alu.log_shift;
9127 else
9128 *cost += extra_cost->alu.log_shift_reg;
9130 else
9131 *cost += extra_cost->alu.logical;
9134 /* In both cases we want to cost both operands. */
9135 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9136 0, speed);
9137 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9138 1, speed);
9140 return true;
9143 return false;
9145 case NOT:
9146 x = XEXP (x, 0);
9147 op0 = aarch64_strip_shift (x);
9149 if (VECTOR_MODE_P (mode))
9151 /* Vector NOT. */
9152 *cost += extra_cost->vect.alu;
9153 return false;
9156 /* MVN-shifted-reg. */
9157 if (op0 != x)
9159 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9161 if (speed)
9162 *cost += extra_cost->alu.log_shift;
9164 return true;
9166 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9167 	 Handle the second form here, taking care that 'a' in the above can
9168 be a shift. */
9169 else if (GET_CODE (op0) == XOR)
9171 rtx newop0 = XEXP (op0, 0);
9172 rtx newop1 = XEXP (op0, 1);
9173 rtx op0_stripped = aarch64_strip_shift (newop0);
9175 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9176 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9178 if (speed)
9180 if (op0_stripped != newop0)
9181 *cost += extra_cost->alu.log_shift;
9182 else
9183 *cost += extra_cost->alu.logical;
9186 return true;
9188 /* MVN. */
9189 if (speed)
9190 *cost += extra_cost->alu.logical;
9192 return false;
9194 case ZERO_EXTEND:
9196 op0 = XEXP (x, 0);
9197 /* If a value is written in SI mode, then zero extended to DI
9198 	 mode, the operation will in general be free, as a write to
9199 a 'w' register implicitly zeroes the upper bits of an 'x'
9200 register. However, if this is
9202 (set (reg) (zero_extend (reg)))
9204 we must cost the explicit register move. */
9205 if (mode == DImode
9206 && GET_MODE (op0) == SImode
9207 && outer == SET)
9209 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9211 /* If OP_COST is non-zero, then the cost of the zero extend
9212 is effectively the cost of the inner operation. Otherwise
9213 we have a MOV instruction and we take the cost from the MOV
9214 itself. This is true independently of whether we are
9215 optimizing for space or time. */
9216 if (op_cost)
9217 *cost = op_cost;
9219 return true;
9221 else if (MEM_P (op0))
9223 /* All loads can zero extend to any size for free. */
9224 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9225 return true;
9228 op0 = aarch64_extend_bitfield_pattern_p (x);
9229 if (op0)
9231 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9232 if (speed)
9233 *cost += extra_cost->alu.bfx;
9234 return true;
9237 if (speed)
9239 if (VECTOR_MODE_P (mode))
9241 /* UMOV. */
9242 *cost += extra_cost->vect.alu;
9244 else
9246 /* We generate an AND instead of UXTB/UXTH. */
9247 *cost += extra_cost->alu.logical;
9250 return false;
9252 case SIGN_EXTEND:
9253 if (MEM_P (XEXP (x, 0)))
9255 /* LDRSH. */
9256 if (speed)
9258 rtx address = XEXP (XEXP (x, 0), 0);
9259 *cost += extra_cost->ldst.load_sign_extend;
9261 *cost +=
9262 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9263 0, speed));
9265 return true;
9268 op0 = aarch64_extend_bitfield_pattern_p (x);
9269 if (op0)
9271 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9272 if (speed)
9273 *cost += extra_cost->alu.bfx;
9274 return true;
9277 if (speed)
9279 if (VECTOR_MODE_P (mode))
9280 *cost += extra_cost->vect.alu;
9281 else
9282 *cost += extra_cost->alu.extend;
9284 return false;
9286 case ASHIFT:
9287 op0 = XEXP (x, 0);
9288 op1 = XEXP (x, 1);
9290 if (CONST_INT_P (op1))
9292 if (speed)
9294 if (VECTOR_MODE_P (mode))
9296 /* Vector shift (immediate). */
9297 *cost += extra_cost->vect.alu;
9299 else
9301 	      /* LSL (immediate), UBFM, UBFIZ and friends.  These are all
9302 aliases. */
9303 *cost += extra_cost->alu.shift;
9307 /* We can incorporate zero/sign extend for free. */
9308 if (GET_CODE (op0) == ZERO_EXTEND
9309 || GET_CODE (op0) == SIGN_EXTEND)
9310 op0 = XEXP (op0, 0);
9312 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9313 return true;
9315 else
9317 if (VECTOR_MODE_P (mode))
9319 if (speed)
9320 /* Vector shift (register). */
9321 *cost += extra_cost->vect.alu;
9323 else
9325 if (speed)
9326 /* LSLV. */
9327 *cost += extra_cost->alu.shift_reg;
9329 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9330 && CONST_INT_P (XEXP (op1, 1))
9331 && known_eq (INTVAL (XEXP (op1, 1)),
9332 GET_MODE_BITSIZE (mode) - 1))
9334 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9335 /* We already demanded XEXP (op1, 0) to be REG_P, so
9336 don't recurse into it. */
9337 return true;
9340 return false; /* All arguments need to be in registers. */
9343 case ROTATE:
9344 case ROTATERT:
9345 case LSHIFTRT:
9346 case ASHIFTRT:
9347 op0 = XEXP (x, 0);
9348 op1 = XEXP (x, 1);
9350 if (CONST_INT_P (op1))
9352 /* ASR (immediate) and friends. */
9353 if (speed)
9355 if (VECTOR_MODE_P (mode))
9356 *cost += extra_cost->vect.alu;
9357 else
9358 *cost += extra_cost->alu.shift;
9361 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9362 return true;
9364 else
9366 if (VECTOR_MODE_P (mode))
9368 if (speed)
9369 /* Vector shift (register). */
9370 *cost += extra_cost->vect.alu;
9372 else
9374 if (speed)
9375 /* ASR (register) and friends. */
9376 *cost += extra_cost->alu.shift_reg;
9378 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9379 && CONST_INT_P (XEXP (op1, 1))
9380 && known_eq (INTVAL (XEXP (op1, 1)),
9381 GET_MODE_BITSIZE (mode) - 1))
9383 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9384 /* We already demanded XEXP (op1, 0) to be REG_P, so
9385 don't recurse into it. */
9386 return true;
9389 return false; /* All arguments need to be in registers. */
9392 case SYMBOL_REF:
9394 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9395 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9397 /* LDR. */
9398 if (speed)
9399 *cost += extra_cost->ldst.load;
9401 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9402 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9404 /* ADRP, followed by ADD. */
9405 *cost += COSTS_N_INSNS (1);
9406 if (speed)
9407 *cost += 2 * extra_cost->alu.arith;
9409 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9410 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9412 /* ADR. */
9413 if (speed)
9414 *cost += extra_cost->alu.arith;
9417 if (flag_pic)
9419 /* One extra load instruction, after accessing the GOT. */
9420 *cost += COSTS_N_INSNS (1);
9421 if (speed)
9422 *cost += extra_cost->ldst.load;
9424 return true;
9426 case HIGH:
9427 case LO_SUM:
9428 /* ADRP/ADD (immediate). */
9429 if (speed)
9430 *cost += extra_cost->alu.arith;
9431 return true;
9433 case ZERO_EXTRACT:
9434 case SIGN_EXTRACT:
9435 /* UBFX/SBFX. */
9436 if (speed)
9438 if (VECTOR_MODE_P (mode))
9439 *cost += extra_cost->vect.alu;
9440 else
9441 *cost += extra_cost->alu.bfx;
9444 /* We can trust that the immediates used will be correct (there
9445 are no by-register forms), so we need only cost op0. */
9446 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9447 return true;
9449 case MULT:
9450 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9451 /* aarch64_rtx_mult_cost always handles recursion to its
9452 operands. */
9453 return true;
9455 case MOD:
9456 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9457        ANDs and a CSNEG.  Assume here that a CSNEG costs the same as an
9458        unconditional negate.  This case should only ever be reached through
9459 the set_smod_pow2_cheap check in expmed.c. */
9460 if (CONST_INT_P (XEXP (x, 1))
9461 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9462 && (mode == SImode || mode == DImode))
9464 /* We expand to 4 instructions. Reset the baseline. */
9465 *cost = COSTS_N_INSNS (4);
9467 if (speed)
9468 *cost += 2 * extra_cost->alu.logical
9469 + 2 * extra_cost->alu.arith;
9471 return true;
9474 /* Fall-through. */
9475 case UMOD:
9476 if (speed)
9478 	/* Slightly prefer UMOD over SMOD.  */
9479 if (VECTOR_MODE_P (mode))
9480 *cost += extra_cost->vect.alu;
9481 else if (GET_MODE_CLASS (mode) == MODE_INT)
9482 *cost += (extra_cost->mult[mode == DImode].add
9483 + extra_cost->mult[mode == DImode].idiv
9484 + (code == MOD ? 1 : 0));
9486 return false; /* All arguments need to be in registers. */
9488 case DIV:
9489 case UDIV:
9490 case SQRT:
9491 if (speed)
9493 if (VECTOR_MODE_P (mode))
9494 *cost += extra_cost->vect.alu;
9495 else if (GET_MODE_CLASS (mode) == MODE_INT)
9496 /* There is no integer SQRT, so only DIV and UDIV can get
9497 here. */
9498 *cost += (extra_cost->mult[mode == DImode].idiv
9499 		  /* Slightly prefer UDIV over SDIV.  */
9500 + (code == DIV ? 1 : 0));
9501 else
9502 *cost += extra_cost->fp[mode == DFmode].div;
9504 return false; /* All arguments need to be in registers. */
9506 case IF_THEN_ELSE:
9507 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9508 XEXP (x, 2), cost, speed);
9510 case EQ:
9511 case NE:
9512 case GT:
9513 case GTU:
9514 case LT:
9515 case LTU:
9516 case GE:
9517 case GEU:
9518 case LE:
9519 case LEU:
9521 return false; /* All arguments must be in registers. */
9523 case FMA:
9524 op0 = XEXP (x, 0);
9525 op1 = XEXP (x, 1);
9526 op2 = XEXP (x, 2);
9528 if (speed)
9530 if (VECTOR_MODE_P (mode))
9531 *cost += extra_cost->vect.alu;
9532 else
9533 *cost += extra_cost->fp[mode == DFmode].fma;
9536 /* FMSUB, FNMADD, and FNMSUB are free. */
9537 if (GET_CODE (op0) == NEG)
9538 op0 = XEXP (op0, 0);
9540 if (GET_CODE (op2) == NEG)
9541 op2 = XEXP (op2, 0);
9543 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9544 and the by-element operand as operand 0. */
9545 if (GET_CODE (op1) == NEG)
9546 op1 = XEXP (op1, 0);
9548 /* Catch vector-by-element operations. The by-element operand can
9549 either be (vec_duplicate (vec_select (x))) or just
9550 (vec_select (x)), depending on whether we are multiplying by
9551 a vector or a scalar.
9553      Canonicalization is not very good in these cases: FMA4 will put the
9554      by-element operand as operand 0, while FNMA4 will have it as operand 1.  */
9555 if (GET_CODE (op0) == VEC_DUPLICATE)
9556 op0 = XEXP (op0, 0);
9557 else if (GET_CODE (op1) == VEC_DUPLICATE)
9558 op1 = XEXP (op1, 0);
9560 if (GET_CODE (op0) == VEC_SELECT)
9561 op0 = XEXP (op0, 0);
9562 else if (GET_CODE (op1) == VEC_SELECT)
9563 op1 = XEXP (op1, 0);
9565 /* If the remaining parameters are not registers,
9566 get the cost to put them into registers. */
9567 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9568 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9569 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9570 return true;
9572 case FLOAT:
9573 case UNSIGNED_FLOAT:
9574 if (speed)
9575 *cost += extra_cost->fp[mode == DFmode].fromint;
9576 return false;
9578 case FLOAT_EXTEND:
9579 if (speed)
9581 if (VECTOR_MODE_P (mode))
9583 	    /* Vector widen.  */
9584 *cost += extra_cost->vect.alu;
9586 else
9587 *cost += extra_cost->fp[mode == DFmode].widen;
9589 return false;
9591 case FLOAT_TRUNCATE:
9592 if (speed)
9594 if (VECTOR_MODE_P (mode))
9596 	    /* Vector conversion.  */
9597 *cost += extra_cost->vect.alu;
9599 else
9600 *cost += extra_cost->fp[mode == DFmode].narrow;
9602 return false;
9604 case FIX:
9605 case UNSIGNED_FIX:
9606 x = XEXP (x, 0);
9607 /* Strip the rounding part. They will all be implemented
9608 by the fcvt* family of instructions anyway. */
9609 if (GET_CODE (x) == UNSPEC)
9611 unsigned int uns_code = XINT (x, 1);
9613 if (uns_code == UNSPEC_FRINTA
9614 || uns_code == UNSPEC_FRINTM
9615 || uns_code == UNSPEC_FRINTN
9616 || uns_code == UNSPEC_FRINTP
9617 || uns_code == UNSPEC_FRINTZ)
9618 x = XVECEXP (x, 0, 0);
9621 if (speed)
9623 if (VECTOR_MODE_P (mode))
9624 *cost += extra_cost->vect.alu;
9625 else
9626 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9629 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9630 fixed-point fcvt. */
9631 if (GET_CODE (x) == MULT
9632 && ((VECTOR_MODE_P (mode)
9633 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9634 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9636 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9637 0, speed);
9638 return true;
9641 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9642 return true;
9644 case ABS:
9645 if (VECTOR_MODE_P (mode))
9647 /* ABS (vector). */
9648 if (speed)
9649 *cost += extra_cost->vect.alu;
9651 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9653 op0 = XEXP (x, 0);
9655 /* FABD, which is analogous to FADD. */
9656 if (GET_CODE (op0) == MINUS)
9658 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9659 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9660 if (speed)
9661 *cost += extra_cost->fp[mode == DFmode].addsub;
9663 return true;
9665 /* Simple FABS is analogous to FNEG. */
9666 if (speed)
9667 *cost += extra_cost->fp[mode == DFmode].neg;
9669 else
9671 	  /* Integer ABS will either be split into
9672 two arithmetic instructions, or will be an ABS
9673 (scalar), which we don't model. */
9674 *cost = COSTS_N_INSNS (2);
9675 if (speed)
9676 *cost += 2 * extra_cost->alu.arith;
9678 return false;
9680 case SMAX:
9681 case SMIN:
9682 if (speed)
9684 if (VECTOR_MODE_P (mode))
9685 *cost += extra_cost->vect.alu;
9686 else
9688 /* FMAXNM/FMINNM/FMAX/FMIN.
9689 TODO: This may not be accurate for all implementations, but
9690 we do not model this in the cost tables. */
9691 *cost += extra_cost->fp[mode == DFmode].addsub;
9694 return false;
9696 case UNSPEC:
9697 /* The floating point round to integer frint* instructions. */
9698 if (aarch64_frint_unspec_p (XINT (x, 1)))
9700 if (speed)
9701 *cost += extra_cost->fp[mode == DFmode].roundint;
9703 return false;
9706 if (XINT (x, 1) == UNSPEC_RBIT)
9708 if (speed)
9709 *cost += extra_cost->alu.rev;
9711 return false;
9713 break;
9715 case TRUNCATE:
9717 /* Decompose <su>muldi3_highpart. */
9718 if (/* (truncate:DI */
9719 mode == DImode
9720 /* (lshiftrt:TI */
9721 && GET_MODE (XEXP (x, 0)) == TImode
9722 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9723 /* (mult:TI */
9724 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9725 /* (ANY_EXTEND:TI (reg:DI))
9726 (ANY_EXTEND:TI (reg:DI))) */
9727 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9728 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9729 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9730 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9731 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9732 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9733 /* (const_int 64) */
9734 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9735 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9737 /* UMULH/SMULH. */
9738 if (speed)
9739 *cost += extra_cost->mult[mode == DImode].extend;
9740 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9741 mode, MULT, 0, speed);
9742 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9743 mode, MULT, 1, speed);
9744 return true;
9747 /* Fall through. */
9748 default:
9749 break;
9752 if (dump_file
9753 && flag_aarch64_verbose_cost)
9754 fprintf (dump_file,
9755 "\nFailed to cost RTX. Assuming default cost.\n");
9757 return true;
9760 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9761 calculated for X. This cost is stored in *COST. Returns true
9762 if the total cost of X was calculated. */
9763 static bool
9764 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9765 int param, int *cost, bool speed)
9767 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9769 if (dump_file
9770 && flag_aarch64_verbose_cost)
9772 print_rtl_single (dump_file, x);
9773 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9774 speed ? "Hot" : "Cold",
9775 *cost, result ? "final" : "partial");
9778 return result;
9781 static int
9782 aarch64_register_move_cost (machine_mode mode,
9783 reg_class_t from_i, reg_class_t to_i)
9785 enum reg_class from = (enum reg_class) from_i;
9786 enum reg_class to = (enum reg_class) to_i;
9787 const struct cpu_regmove_cost *regmove_cost
9788 = aarch64_tune_params.regmove_cost;
9790 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9791 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9792 to = GENERAL_REGS;
9794 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9795 from = GENERAL_REGS;
9797 /* A move between a GPR and the stack register costs the same as GP2GP. */
9798 if ((from == GENERAL_REGS && to == STACK_REG)
9799 || (to == GENERAL_REGS && from == STACK_REG))
9800 return regmove_cost->GP2GP;
9802 /* To/From the stack register, we move via the gprs. */
9803 if (to == STACK_REG || from == STACK_REG)
9804 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9805 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9807 if (known_eq (GET_MODE_SIZE (mode), 16))
9809 /* 128-bit operations on general registers require 2 instructions. */
9810 if (from == GENERAL_REGS && to == GENERAL_REGS)
9811 return regmove_cost->GP2GP * 2;
9812 else if (from == GENERAL_REGS)
9813 return regmove_cost->GP2FP * 2;
9814 else if (to == GENERAL_REGS)
9815 return regmove_cost->FP2GP * 2;
9817 /* When AdvSIMD instructions are disabled it is not possible to move
9818 a 128-bit value directly between Q registers. This is handled in
9819 secondary reload. A general register is used as a scratch to move
9820 the upper DI value and the lower DI value is moved directly,
9821 hence the cost is the sum of three moves. */
9822 if (! TARGET_SIMD)
9823 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9825 return regmove_cost->FP2FP;
9828 if (from == GENERAL_REGS && to == GENERAL_REGS)
9829 return regmove_cost->GP2GP;
9830 else if (from == GENERAL_REGS)
9831 return regmove_cost->GP2FP;
9832 else if (to == GENERAL_REGS)
9833 return regmove_cost->FP2GP;
9835 return regmove_cost->FP2FP;
9838 static int
9839 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9840 reg_class_t rclass ATTRIBUTE_UNUSED,
9841 bool in ATTRIBUTE_UNUSED)
9843 return aarch64_tune_params.memmov_cost;
9846 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9847 to optimize 1.0/sqrt. */
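/* In practice this means -funsafe-math-optimizations together with
   -fno-trapping-math (both implied by -Ofast), on a core whose tuning
   enables the reciprocal square root approximation for MODE, or with the
   low-precision variant requested explicitly (the flag behind
   -mlow-precision-recip-sqrt).  */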
9849 static bool
9850 use_rsqrt_p (machine_mode mode)
9852 return (!flag_trapping_math
9853 && flag_unsafe_math_optimizations
9854 && ((aarch64_tune_params.approx_modes->recip_sqrt
9855 & AARCH64_APPROX_MODE (mode))
9856 || flag_mrecip_low_precision_sqrt));
9859 /* Function to decide when to use the approximate reciprocal square root
9860 builtin. */
9862 static tree
9863 aarch64_builtin_reciprocal (tree fndecl)
9865 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9867 if (!use_rsqrt_p (mode))
9868 return NULL_TREE;
9869 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9872 typedef rtx (*rsqrte_type) (rtx, rtx);
9874 /* Select reciprocal square root initial estimate insn depending on machine
9875 mode. */
9877 static rsqrte_type
9878 get_rsqrte_type (machine_mode mode)
9880 switch (mode)
9882 case E_DFmode: return gen_aarch64_rsqrtedf;
9883 case E_SFmode: return gen_aarch64_rsqrtesf;
9884 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9885 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9886 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9887 default: gcc_unreachable ();
9891 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9893 /* Select reciprocal square root series step insn depending on machine mode. */
9895 static rsqrts_type
9896 get_rsqrts_type (machine_mode mode)
9898 switch (mode)
9900 case E_DFmode: return gen_aarch64_rsqrtsdf;
9901 case E_SFmode: return gen_aarch64_rsqrtssf;
9902 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9903 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9904 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9905 default: gcc_unreachable ();
9909 /* Emit instruction sequence to compute either the approximate square root
9910 or its approximate reciprocal, depending on the flag RECP, and return
9911 whether the sequence was emitted or not. */
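/* The sequence emitted below follows the usual Newton-Raphson series for
   1/sqrt(d): starting from the FRSQRTE estimate x0, each step refines the
   estimate as

       x_{n+1} = x_n * (3 - d * x_n * x_n) / 2

   where the (3 - a * b) / 2 factor is what the FRSQRTS step instruction
   provides.  A rough scalar C model of one refinement step, for
   illustration only and not used by the compiler:

       static double
       rsqrt_step (double d, double x)
       {
         return x * (3.0 - d * x * x) / 2.0;
       }

   Two steps give roughly single precision and three roughly double
   precision, matching the iteration counts chosen below.  */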
9913 bool
9914 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9916 machine_mode mode = GET_MODE (dst);
9918 if (GET_MODE_INNER (mode) == HFmode)
9920 gcc_assert (!recp);
9921 return false;
9924 if (!recp)
9926 if (!(flag_mlow_precision_sqrt
9927 || (aarch64_tune_params.approx_modes->sqrt
9928 & AARCH64_APPROX_MODE (mode))))
9929 return false;
9931 if (flag_finite_math_only
9932 || flag_trapping_math
9933 || !flag_unsafe_math_optimizations
9934 || optimize_function_for_size_p (cfun))
9935 return false;
9937 else
9938 /* Caller assumes we cannot fail. */
9939 gcc_assert (use_rsqrt_p (mode));
9941 machine_mode mmsk = mode_for_int_vector (mode).require ();
9942 rtx xmsk = gen_reg_rtx (mmsk);
9943 if (!recp)
9944 /* When calculating the approximate square root, compare the
9945 argument with 0.0 and create a mask. */
9946 emit_insn (gen_rtx_SET (xmsk,
9947 gen_rtx_NEG (mmsk,
9948 gen_rtx_EQ (mmsk, src,
9949 CONST0_RTX (mode)))));
9951 /* Estimate the approximate reciprocal square root. */
9952 rtx xdst = gen_reg_rtx (mode);
9953 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9955 /* Iterate over the series twice for SF and thrice for DF. */
9956 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9958 /* Optionally iterate over the series once less for faster performance
9959 while sacrificing some accuracy. */
9960 if ((recp && flag_mrecip_low_precision_sqrt)
9961 || (!recp && flag_mlow_precision_sqrt))
9962 iterations--;
9964 /* Iterate over the series to calculate the approximate reciprocal square
9965 root. */
9966 rtx x1 = gen_reg_rtx (mode);
9967 while (iterations--)
9969 rtx x2 = gen_reg_rtx (mode);
9970 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9972 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9974 if (iterations > 0)
9975 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9978 if (!recp)
9980 /* Qualify the approximate reciprocal square root when the argument is
9981 0.0 by squashing the intermediary result to 0.0. */
9982 rtx xtmp = gen_reg_rtx (mmsk);
9983 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9984 gen_rtx_SUBREG (mmsk, xdst, 0)));
9985 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9987 /* Calculate the approximate square root. */
9988 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9991 /* Finalize the approximation. */
9992 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9994 return true;
9997 typedef rtx (*recpe_type) (rtx, rtx);
9999 /* Select reciprocal initial estimate insn depending on machine mode. */
10001 static recpe_type
10002 get_recpe_type (machine_mode mode)
10004 switch (mode)
10006 case E_SFmode: return (gen_aarch64_frecpesf);
10007 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
10008 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
10009 case E_DFmode: return (gen_aarch64_frecpedf);
10010 case E_V2DFmode: return (gen_aarch64_frecpev2df);
10011 default: gcc_unreachable ();
10015 typedef rtx (*recps_type) (rtx, rtx, rtx);
10017 /* Select reciprocal series step insn depending on machine mode. */
10019 static recps_type
10020 get_recps_type (machine_mode mode)
10022 switch (mode)
10024 case E_SFmode: return (gen_aarch64_frecpssf);
10025 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
10026 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
10027 case E_DFmode: return (gen_aarch64_frecpsdf);
10028 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
10029 default: gcc_unreachable ();
10033 /* Emit the instruction sequence to compute the approximation for the division
10034 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
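/* The code below uses the same Newton-Raphson scheme as the square root
   above, but for the reciprocal 1/d: starting from the FRECPE estimate x0,
   each step refines it as

       x_{n+1} = x_n * (2 - d * x_n)

   where (2 - a * b) is what the FRECPS step instruction provides.  The
   quotient is then obtained by multiplying the reciprocal approximation by
   NUM; that multiply is skipped when NUM is 1.0.  */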
10036 bool
10037 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10039 machine_mode mode = GET_MODE (quo);
10041 if (GET_MODE_INNER (mode) == HFmode)
10042 return false;
10044 bool use_approx_division_p = (flag_mlow_precision_div
10045 || (aarch64_tune_params.approx_modes->division
10046 & AARCH64_APPROX_MODE (mode)));
10048 if (!flag_finite_math_only
10049 || flag_trapping_math
10050 || !flag_unsafe_math_optimizations
10051 || optimize_function_for_size_p (cfun)
10052 || !use_approx_division_p)
10053 return false;
10055 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10056 return false;
10058 /* Estimate the approximate reciprocal. */
10059 rtx xrcp = gen_reg_rtx (mode);
10060 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10062 /* Iterate over the series twice for SF and thrice for DF. */
10063 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10065 /* Optionally iterate over the series once less for faster performance,
10066 while sacrificing some accuracy. */
10067 if (flag_mlow_precision_div)
10068 iterations--;
10070 /* Iterate over the series to calculate the approximate reciprocal. */
10071 rtx xtmp = gen_reg_rtx (mode);
10072 while (iterations--)
10074 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10076 if (iterations > 0)
10077 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10080 if (num != CONST1_RTX (mode))
10082 /* As the approximate reciprocal of DEN is already calculated, only
10083 calculate the approximate division when NUM is not 1.0. */
10084 rtx xnum = force_reg (mode, num);
10085 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10088 /* Finalize the approximation. */
10089 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10090 return true;
10093 /* Return the number of instructions that can be issued per cycle. */
10094 static int
10095 aarch64_sched_issue_rate (void)
10097 return aarch64_tune_params.issue_rate;
10100 static int
10101 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10103 int issue_rate = aarch64_sched_issue_rate ();
10105 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10109 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10110 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10111 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10113 static int
10114 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10115 int ready_index)
10117 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10121 /* Vectorizer cost model target hooks. */
10123 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10124 static int
10125 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10126 tree vectype,
10127 int misalign ATTRIBUTE_UNUSED)
10129 unsigned elements;
10130 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10131 bool fp = false;
10133 if (vectype != NULL)
10134 fp = FLOAT_TYPE_P (vectype);
10136 switch (type_of_cost)
10138 case scalar_stmt:
10139 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10141 case scalar_load:
10142 return costs->scalar_load_cost;
10144 case scalar_store:
10145 return costs->scalar_store_cost;
10147 case vector_stmt:
10148 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10150 case vector_load:
10151 return costs->vec_align_load_cost;
10153 case vector_store:
10154 return costs->vec_store_cost;
10156 case vec_to_scalar:
10157 return costs->vec_to_scalar_cost;
10159 case scalar_to_vec:
10160 return costs->scalar_to_vec_cost;
10162 case unaligned_load:
10163 case vector_gather_load:
10164 return costs->vec_unalign_load_cost;
10166 case unaligned_store:
10167 case vector_scatter_store:
10168 return costs->vec_unalign_store_cost;
10170 case cond_branch_taken:
10171 return costs->cond_taken_branch_cost;
10173 case cond_branch_not_taken:
10174 return costs->cond_not_taken_branch_cost;
10176 case vec_perm:
10177 return costs->vec_permute_cost;
10179 case vec_promote_demote:
10180 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10182 case vec_construct:
10183 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10184 return elements / 2 + 1;
10186 default:
10187 gcc_unreachable ();
10191 /* Implement targetm.vectorize.add_stmt_cost. */
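/* For example, with the cost model enabled, a statement of cost 1 that
   lies in an inner loop relative to the loop being vectorized is
   accumulated as 1 * 50 = 50 into the vect_body bucket.  */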
10192 static unsigned
10193 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10194 struct _stmt_vec_info *stmt_info, int misalign,
10195 enum vect_cost_model_location where)
10197 unsigned *cost = (unsigned *) data;
10198 unsigned retval = 0;
10200 if (flag_vect_cost_model)
10202 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10203 int stmt_cost =
10204 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10206 /* Statements in an inner loop relative to the loop being
10207 vectorized are weighted more heavily. The value here is
10208 arbitrary and could potentially be improved with analysis. */
10209 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10210 count *= 50; /* FIXME */
10212 retval = (unsigned) (count * stmt_cost);
10213 cost[where] += retval;
10216 return retval;
10219 static void initialize_aarch64_code_model (struct gcc_options *);
10221 /* Parse the TO_PARSE string and put the architecture struct that it
10222 selects into RES and the architectural features into ISA_FLAGS.
10223 Return an aarch64_parse_opt_result describing the parse result.
10224 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
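/* As an illustration of the splitting done below (the values are only
   examples): for TO_PARSE == "armv8-a+crc",

       ext = strchr (str, '+');     => points at "+crc"
       len = ext - str;             => 7, i.e. "armv8-a"

   and the "+crc" tail is handed to aarch64_parse_extension.  The -mcpu
   parser further down follows the same NAME[+EXT...] scheme, e.g.
   "cortex-a57+nofp".  */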
10226 static enum aarch64_parse_opt_result
10227 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10228 unsigned long *isa_flags)
10230 char *ext;
10231 const struct processor *arch;
10232 char *str = (char *) alloca (strlen (to_parse) + 1);
10233 size_t len;
10235 strcpy (str, to_parse);
10237 ext = strchr (str, '+');
10239 if (ext != NULL)
10240 len = ext - str;
10241 else
10242 len = strlen (str);
10244 if (len == 0)
10245 return AARCH64_PARSE_MISSING_ARG;
10248 /* Loop through the list of supported ARCHes to find a match. */
10249 for (arch = all_architectures; arch->name != NULL; arch++)
10251 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10253 unsigned long isa_temp = arch->flags;
10255 if (ext != NULL)
10257 /* TO_PARSE string contains at least one extension. */
10258 enum aarch64_parse_opt_result ext_res
10259 = aarch64_parse_extension (ext, &isa_temp);
10261 if (ext_res != AARCH64_PARSE_OK)
10262 return ext_res;
10264 /* Extension parsing was successful. Confirm the result
10265 arch and ISA flags. */
10266 *res = arch;
10267 *isa_flags = isa_temp;
10268 return AARCH64_PARSE_OK;
10272 /* ARCH name not found in list. */
10273 return AARCH64_PARSE_INVALID_ARG;
10276 /* Parse the TO_PARSE string and put the result tuning in RES and the
10277 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10278 describing the parse result. If there is an error parsing, RES and
10279 ISA_FLAGS are left unchanged. */
10281 static enum aarch64_parse_opt_result
10282 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10283 unsigned long *isa_flags)
10285 char *ext;
10286 const struct processor *cpu;
10287 char *str = (char *) alloca (strlen (to_parse) + 1);
10288 size_t len;
10290 strcpy (str, to_parse);
10292 ext = strchr (str, '+');
10294 if (ext != NULL)
10295 len = ext - str;
10296 else
10297 len = strlen (str);
10299 if (len == 0)
10300 return AARCH64_PARSE_MISSING_ARG;
10303 /* Loop through the list of supported CPUs to find a match. */
10304 for (cpu = all_cores; cpu->name != NULL; cpu++)
10306 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10308 unsigned long isa_temp = cpu->flags;
10311 if (ext != NULL)
10313 /* TO_PARSE string contains at least one extension. */
10314 enum aarch64_parse_opt_result ext_res
10315 = aarch64_parse_extension (ext, &isa_temp);
10317 if (ext_res != AARCH64_PARSE_OK)
10318 return ext_res;
10320 /* Extension parsing was successful. Confirm the result
10321 cpu and ISA flags. */
10322 *res = cpu;
10323 *isa_flags = isa_temp;
10324 return AARCH64_PARSE_OK;
10328 /* CPU name not found in list. */
10329 return AARCH64_PARSE_INVALID_ARG;
10332 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10333 Return an aarch64_parse_opt_result describing the parse result.
10334 If the parsing fails the RES does not change. */
10336 static enum aarch64_parse_opt_result
10337 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10339 const struct processor *cpu;
10340 char *str = (char *) alloca (strlen (to_parse) + 1);
10342 strcpy (str, to_parse);
10344 /* Loop through the list of supported CPUs to find a match. */
10345 for (cpu = all_cores; cpu->name != NULL; cpu++)
10347 if (strcmp (cpu->name, str) == 0)
10349 *res = cpu;
10350 return AARCH64_PARSE_OK;
10354 /* CPU name not found in list. */
10355 return AARCH64_PARSE_INVALID_ARG;
10358 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10359 described in FLAG. If it is, return the index bit for that fusion type.
10360 If not, error (printing OPTION_NAME) and return zero. */
10362 static unsigned int
10363 aarch64_parse_one_option_token (const char *token,
10364 size_t length,
10365 const struct aarch64_flag_desc *flag,
10366 const char *option_name)
10368 for (; flag->name != NULL; flag++)
10370 if (length == strlen (flag->name)
10371 && !strncmp (flag->name, token, length))
10372 return flag->flag;
10375 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10376 return 0;
10379 /* Parse OPTION which is a comma-separated list of flags to enable.
10380 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10381 default state we inherit from the CPU tuning structures. OPTION_NAME
10382 gives the top-level option we are parsing in the -moverride string,
10383 for use in error messages. */
10385 static unsigned int
10386 aarch64_parse_boolean_options (const char *option,
10387 const struct aarch64_flag_desc *flags,
10388 unsigned int initial_state,
10389 const char *option_name)
10391 const char separator = '.';
10392 const char* specs = option;
10393 const char* ntoken = option;
10394 unsigned int found_flags = initial_state;
10396 while ((ntoken = strchr (specs, separator)))
10398 size_t token_length = ntoken - specs;
10399 unsigned token_ops = aarch64_parse_one_option_token (specs,
10400 token_length,
10401 flags,
10402 option_name);
10403 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10404 in the token stream, reset the supported operations. So:
10406 adrp+add.cmp+branch.none.adrp+add
10408 would turn on only adrp+add fusion. */
10409 if (!token_ops)
10410 found_flags = 0;
10412 found_flags |= token_ops;
10413 specs = ++ntoken;
10416 /* The string ended with a trailing separator, which is ill-formed. */
10417 if (!(*specs))
10419 error ("%s string ill-formed\n", option_name);
10420 return 0;
10423 /* We still have one more token to parse. */
10424 size_t token_length = strlen (specs);
10425 unsigned token_ops = aarch64_parse_one_option_token (specs,
10426 token_length,
10427 flags,
10428 option_name);
10429 if (!token_ops)
10430 found_flags = 0;
10432 found_flags |= token_ops;
10433 return found_flags;
10436 /* Support for overriding instruction fusion. */
10438 static void
10439 aarch64_parse_fuse_string (const char *fuse_string,
10440 struct tune_params *tune)
10442 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10443 aarch64_fusible_pairs,
10444 tune->fusible_ops,
10445 "fuse=");
10448 /* Support for overriding other tuning flags. */
10450 static void
10451 aarch64_parse_tune_string (const char *tune_string,
10452 struct tune_params *tune)
10454 tune->extra_tuning_flags
10455 = aarch64_parse_boolean_options (tune_string,
10456 aarch64_tuning_flags,
10457 tune->extra_tuning_flags,
10458 "tune=");
10461 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10462 we understand. If it is, extract the option string and hand it off to
10463 the appropriate function. */
10465 void
10466 aarch64_parse_one_override_token (const char* token,
10467 size_t length,
10468 struct tune_params *tune)
10470 const struct aarch64_tuning_override_function *fn
10471 = aarch64_tuning_override_functions;
10473 const char *option_part = strchr (token, '=');
10474 if (!option_part)
10476 error ("tuning string missing in option (%s)", token);
10477 return;
10480 /* Get the length of the option name. */
10481 length = option_part - token;
10482 /* Skip the '=' to get to the option string. */
10483 option_part++;
10485 for (; fn->name != NULL; fn++)
10487 if (!strncmp (fn->name, token, length))
10489 fn->parse_override (option_part, tune);
10490 return;
10494 error ("unknown tuning option (%s)",token);
10495 return;
10498 /* Validate the TLS size and clamp it to what the code model supports. */
10500 static void
10501 initialize_aarch64_tls_size (struct gcc_options *opts)
10503 if (aarch64_tls_size == 0)
10504 aarch64_tls_size = 24;
10506 switch (opts->x_aarch64_cmodel_var)
10508 case AARCH64_CMODEL_TINY:
10509 /* Both the default and the maximum TLS size allowed under tiny are 1M,
10510 which needs two instructions to address, so we clamp the size to 24. */
10511 if (aarch64_tls_size > 24)
10512 aarch64_tls_size = 24;
10513 break;
10514 case AARCH64_CMODEL_SMALL:
10515 /* The maximum TLS size allowed under small is 4G. */
10516 if (aarch64_tls_size > 32)
10517 aarch64_tls_size = 32;
10518 break;
10519 case AARCH64_CMODEL_LARGE:
10520 /* The maximum TLS size allowed under large is 16E.
10521 FIXME: 16E needs a 64-bit offset, but we only support a 48-bit offset now. */
10522 if (aarch64_tls_size > 48)
10523 aarch64_tls_size = 48;
10524 break;
10525 default:
10526 gcc_unreachable ();
10529 return;
10532 /* Parse STRING looking for options in the format:
10533 string :: option:string
10534 option :: name=substring
10535 name :: {a-z}
10536 substring :: defined by option. */
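/* For example, -moverride=fuse=adrp+add.cmp+branch is split on ':' into
   single options (here just one), each option is split at '=' into a name
   ("fuse") and a substring ("adrp+add.cmp+branch"), and the substring is
   then broken up on '.' by aarch64_parse_boolean_options above.  */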
10538 static void
10539 aarch64_parse_override_string (const char* input_string,
10540 struct tune_params* tune)
10542 const char separator = ':';
10543 size_t string_length = strlen (input_string) + 1;
10544 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10545 char *string = string_root;
10546 strncpy (string, input_string, string_length);
10547 string[string_length - 1] = '\0';
10549 char* ntoken = string;
10551 while ((ntoken = strchr (string, separator)))
10553 size_t token_length = ntoken - string;
10554 /* Make this substring look like a string. */
10555 *ntoken = '\0';
10556 aarch64_parse_one_override_token (string, token_length, tune);
10557 string = ++ntoken;
10560 /* One last option to parse. */
10561 aarch64_parse_one_override_token (string, strlen (string), tune);
10562 free (string_root);
10566 static void
10567 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10569 /* PR 70044: We have to be careful about being called multiple times for the
10570 same function. This means all changes should be repeatable. */
10572 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10573 Disable the frame pointer flag so the mid-end will not use a frame
10574 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10575 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10576 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10577 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10578 if (opts->x_flag_omit_frame_pointer == 0)
10579 opts->x_flag_omit_frame_pointer = 2;
10581 /* If not optimizing for size, set the default
10582 alignment to what the target wants. */
10583 if (!opts->x_optimize_size)
10585 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10586 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10587 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10588 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10589 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10590 opts->x_str_align_functions = aarch64_tune_params.function_align;
10593 /* We default to no pc-relative literal loads. */
10595 aarch64_pcrelative_literal_loads = false;
10597 /* If -mpc-relative-literal-loads is set on the command line, this
10598 implies that the user asked for PC relative literal loads. */
10599 if (opts->x_pcrelative_literal_loads == 1)
10600 aarch64_pcrelative_literal_loads = true;
10602 /* In the tiny memory model it makes no sense to disallow PC relative
10603 literal pool loads. */
10604 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10605 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10606 aarch64_pcrelative_literal_loads = true;
10608 /* When enabling the lower precision Newton series for the square root, also
10609 enable it for the reciprocal square root, since the latter is an
10610 intermediary step for the former. */
10611 if (flag_mlow_precision_sqrt)
10612 flag_mrecip_low_precision_sqrt = true;
10615 /* 'Unpack' up the internal tuning structs and update the options
10616 in OPTS. The caller must have set up selected_tune and selected_arch
10617 as all the other target-specific codegen decisions are
10618 derived from them. */
10620 void
10621 aarch64_override_options_internal (struct gcc_options *opts)
10623 aarch64_tune_flags = selected_tune->flags;
10624 aarch64_tune = selected_tune->sched_core;
10625 /* Make a copy of the tuning parameters attached to the core, which
10626 we may later overwrite. */
10627 aarch64_tune_params = *(selected_tune->tune);
10628 aarch64_architecture_version = selected_arch->architecture_version;
10630 if (opts->x_aarch64_override_tune_string)
10631 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10632 &aarch64_tune_params);
10634 /* This target defaults to strict volatile bitfields. */
10635 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10636 opts->x_flag_strict_volatile_bitfields = 1;
10638 initialize_aarch64_code_model (opts);
10639 initialize_aarch64_tls_size (opts);
10641 int queue_depth = 0;
10642 switch (aarch64_tune_params.autoprefetcher_model)
10644 case tune_params::AUTOPREFETCHER_OFF:
10645 queue_depth = -1;
10646 break;
10647 case tune_params::AUTOPREFETCHER_WEAK:
10648 queue_depth = 0;
10649 break;
10650 case tune_params::AUTOPREFETCHER_STRONG:
10651 queue_depth = max_insn_queue_index + 1;
10652 break;
10653 default:
10654 gcc_unreachable ();
10657 /* We don't mind passing in global_options_set here as we don't use
10658 the *options_set structs anyway. */
10659 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10660 queue_depth,
10661 opts->x_param_values,
10662 global_options_set.x_param_values);
10664 /* Set up parameters to be used in prefetching algorithm. Do not
10665 override the defaults unless we are tuning for a core we have
10666 researched values for. */
10667 if (aarch64_tune_params.prefetch->num_slots > 0)
10668 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10669 aarch64_tune_params.prefetch->num_slots,
10670 opts->x_param_values,
10671 global_options_set.x_param_values);
10672 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10673 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10674 aarch64_tune_params.prefetch->l1_cache_size,
10675 opts->x_param_values,
10676 global_options_set.x_param_values);
10677 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10678 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10679 aarch64_tune_params.prefetch->l1_cache_line_size,
10680 opts->x_param_values,
10681 global_options_set.x_param_values);
10682 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10683 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10684 aarch64_tune_params.prefetch->l2_cache_size,
10685 opts->x_param_values,
10686 global_options_set.x_param_values);
10687 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10688 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10689 0,
10690 opts->x_param_values,
10691 global_options_set.x_param_values);
10692 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10693 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10694 aarch64_tune_params.prefetch->minimum_stride,
10695 opts->x_param_values,
10696 global_options_set.x_param_values);
10698 /* Use the alternative scheduling-pressure algorithm by default. */
10699 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10700 opts->x_param_values,
10701 global_options_set.x_param_values);
10703 /* Enable sw prefetching at specified optimization level for
10704 CPUS that have prefetch. Lower optimization level threshold by 1
10705 when profiling is enabled. */
10706 if (opts->x_flag_prefetch_loop_arrays < 0
10707 && !opts->x_optimize_size
10708 && aarch64_tune_params.prefetch->default_opt_level >= 0
10709 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10710 opts->x_flag_prefetch_loop_arrays = 1;
10712 aarch64_override_options_after_change_1 (opts);
10715 /* Print a hint with a suggestion for a core or architecture name that
10716 most closely resembles what the user passed in STR. ARCH is true if
10717 the user is asking for an architecture name. ARCH is false if the user
10718 is asking for a core name. */
10720 static void
10721 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10723 auto_vec<const char *> candidates;
10724 const struct processor *entry = arch ? all_architectures : all_cores;
10725 for (; entry->name != NULL; entry++)
10726 candidates.safe_push (entry->name);
10728 #ifdef HAVE_LOCAL_CPU_DETECT
10729 /* Add also "native" as possible value. */
10730 if (arch)
10731 candidates.safe_push ("native");
10732 #endif
10734 char *s;
10735 const char *hint = candidates_list_and_hint (str, s, candidates);
10736 if (hint)
10737 inform (input_location, "valid arguments are: %s;"
10738 " did you mean %qs?", s, hint);
10739 else
10740 inform (input_location, "valid arguments are: %s", s);
10742 XDELETEVEC (s);
10745 /* Print a hint with a suggestion for a core name that most closely resembles
10746 what the user passed in STR. */
10748 inline static void
10749 aarch64_print_hint_for_core (const char *str)
10751 aarch64_print_hint_for_core_or_arch (str, false);
10754 /* Print a hint with a suggestion for an architecture name that most closely
10755 resembles what the user passed in STR. */
10757 inline static void
10758 aarch64_print_hint_for_arch (const char *str)
10760 aarch64_print_hint_for_core_or_arch (str, true);
10763 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10764 specified in STR and throw errors if appropriate. Put the results if
10765 they are valid in RES and ISA_FLAGS. Return whether the option is
10766 valid. */
10768 static bool
10769 aarch64_validate_mcpu (const char *str, const struct processor **res,
10770 unsigned long *isa_flags)
10772 enum aarch64_parse_opt_result parse_res
10773 = aarch64_parse_cpu (str, res, isa_flags);
10775 if (parse_res == AARCH64_PARSE_OK)
10776 return true;
10778 switch (parse_res)
10780 case AARCH64_PARSE_MISSING_ARG:
10781 error ("missing cpu name in %<-mcpu=%s%>", str);
10782 break;
10783 case AARCH64_PARSE_INVALID_ARG:
10784 error ("unknown value %qs for -mcpu", str);
10785 aarch64_print_hint_for_core (str);
10786 break;
10787 case AARCH64_PARSE_INVALID_FEATURE:
10788 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10789 break;
10790 default:
10791 gcc_unreachable ();
10794 return false;
10797 /* Validate a command-line -march option. Parse the arch and extensions
10798 (if any) specified in STR and throw errors if appropriate. Put the
10799 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10800 option is valid. */
10802 static bool
10803 aarch64_validate_march (const char *str, const struct processor **res,
10804 unsigned long *isa_flags)
10806 enum aarch64_parse_opt_result parse_res
10807 = aarch64_parse_arch (str, res, isa_flags);
10809 if (parse_res == AARCH64_PARSE_OK)
10810 return true;
10812 switch (parse_res)
10814 case AARCH64_PARSE_MISSING_ARG:
10815 error ("missing arch name in %<-march=%s%>", str);
10816 break;
10817 case AARCH64_PARSE_INVALID_ARG:
10818 error ("unknown value %qs for -march", str);
10819 aarch64_print_hint_for_arch (str);
10820 break;
10821 case AARCH64_PARSE_INVALID_FEATURE:
10822 error ("invalid feature modifier in %<-march=%s%>", str);
10823 break;
10824 default:
10825 gcc_unreachable ();
10828 return false;
10831 /* Validate a command-line -mtune option. Parse the cpu
10832 specified in STR and throw errors if appropriate. Put the
10833 result, if it is valid, in RES. Return whether the option is
10834 valid. */
10836 static bool
10837 aarch64_validate_mtune (const char *str, const struct processor **res)
10839 enum aarch64_parse_opt_result parse_res
10840 = aarch64_parse_tune (str, res);
10842 if (parse_res == AARCH64_PARSE_OK)
10843 return true;
10845 switch (parse_res)
10847 case AARCH64_PARSE_MISSING_ARG:
10848 error ("missing cpu name in %<-mtune=%s%>", str);
10849 break;
10850 case AARCH64_PARSE_INVALID_ARG:
10851 error ("unknown value %qs for -mtune", str);
10852 aarch64_print_hint_for_core (str);
10853 break;
10854 default:
10855 gcc_unreachable ();
10857 return false;
10860 /* Return the CPU corresponding to the enum CPU.
10861 If it doesn't specify a cpu, return the default. */
10863 static const struct processor *
10864 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10866 if (cpu != aarch64_none)
10867 return &all_cores[cpu];
10869 /* The & 0x3f is to extract the bottom 6 bits that encode the
10870 default cpu as selected by the --with-cpu GCC configure option
10871 in config.gcc.
10872 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10873 flags mechanism should be reworked to make it more sane. */
10874 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10877 /* Return the architecture corresponding to the enum ARCH.
10878 If it doesn't specify a valid architecture, return the default. */
10880 static const struct processor *
10881 aarch64_get_arch (enum aarch64_arch arch)
10883 if (arch != aarch64_no_arch)
10884 return &all_architectures[arch];
10886 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10888 return &all_architectures[cpu->arch];
10891 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
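/* For example, -msve-vector-bits=256 yields 256 / 64 = 4 64-bit granules,
   whereas both "scalable" and 128 map to the (2, 2) poly_int so that
   vector-length agnostic code is still generated in the 128-bit case.  */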
10893 static poly_uint16
10894 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10896 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10897 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10898 deciding which .md file patterns to use and when deciding whether
10899 something is a legitimate address or constant. */
10900 if (value == SVE_SCALABLE || value == SVE_128)
10901 return poly_uint16 (2, 2);
10902 else
10903 return (int) value / 64;
10906 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10907 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10908 tuning structs. In particular it must set selected_tune and
10909 aarch64_isa_flags that define the available ISA features and tuning
10910 decisions. It must also set selected_arch as this will be used to
10911 output the .arch asm tags for each function. */
10913 static void
10914 aarch64_override_options (void)
10916 unsigned long cpu_isa = 0;
10917 unsigned long arch_isa = 0;
10918 aarch64_isa_flags = 0;
10920 bool valid_cpu = true;
10921 bool valid_tune = true;
10922 bool valid_arch = true;
10924 selected_cpu = NULL;
10925 selected_arch = NULL;
10926 selected_tune = NULL;
10928 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10929 If either of -march or -mtune is given, they override their
10930 respective component of -mcpu. */
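/* For example, -mcpu=cortex-a53 -mtune=cortex-a57 keeps the architecture
   and ISA features of cortex-a53 but uses the tuning parameters of
   cortex-a57.  */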
10931 if (aarch64_cpu_string)
10932 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10933 &cpu_isa);
10935 if (aarch64_arch_string)
10936 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10937 &arch_isa);
10939 if (aarch64_tune_string)
10940 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10942 /* If the user did not specify a processor, choose the default
10943 one for them. This will be the CPU set during configuration using
10944 --with-cpu, otherwise it is "generic". */
10945 if (!selected_cpu)
10947 if (selected_arch)
10949 selected_cpu = &all_cores[selected_arch->ident];
10950 aarch64_isa_flags = arch_isa;
10951 explicit_arch = selected_arch->arch;
10953 else
10955 /* Get default configure-time CPU. */
10956 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10957 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10960 if (selected_tune)
10961 explicit_tune_core = selected_tune->ident;
10963 /* If both -mcpu and -march are specified check that they are architecturally
10964 compatible, warn if they're not and prefer the -march ISA flags. */
10965 else if (selected_arch)
10967 if (selected_arch->arch != selected_cpu->arch)
10969 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10970 all_architectures[selected_cpu->arch].name,
10971 selected_arch->name);
10973 aarch64_isa_flags = arch_isa;
10974 explicit_arch = selected_arch->arch;
10975 explicit_tune_core = selected_tune ? selected_tune->ident
10976 : selected_cpu->ident;
10978 else
10980 /* -mcpu but no -march. */
10981 aarch64_isa_flags = cpu_isa;
10982 explicit_tune_core = selected_tune ? selected_tune->ident
10983 : selected_cpu->ident;
10984 gcc_assert (selected_cpu);
10985 selected_arch = &all_architectures[selected_cpu->arch];
10986 explicit_arch = selected_arch->arch;
10989 /* Set the arch as well, as we will need it when outputting
10990 the .arch directive in assembly. */
10991 if (!selected_arch)
10993 gcc_assert (selected_cpu);
10994 selected_arch = &all_architectures[selected_cpu->arch];
10997 if (!selected_tune)
10998 selected_tune = selected_cpu;
11000 #ifndef HAVE_AS_MABI_OPTION
11001 /* The compiler may have been configured with 2.23.* binutils, which does
11002 not have support for ILP32. */
11003 if (TARGET_ILP32)
11004 error ("assembler does not support -mabi=ilp32");
11005 #endif
11007 /* Convert -msve-vector-bits to a VG count. */
11008 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
11010 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
11011 sorry ("return address signing is only supported for -mabi=lp64");
11013 /* Make sure we properly set up the explicit options. */
11014 if ((aarch64_cpu_string && valid_cpu)
11015 || (aarch64_tune_string && valid_tune))
11016 gcc_assert (explicit_tune_core != aarch64_none);
11018 if ((aarch64_cpu_string && valid_cpu)
11019 || (aarch64_arch_string && valid_arch))
11020 gcc_assert (explicit_arch != aarch64_no_arch);
11022 aarch64_override_options_internal (&global_options);
11024 /* Save these options as the default ones in case we push and pop them later
11025 while processing functions with potential target attributes. */
11026 target_option_default_node = target_option_current_node
11027 = build_target_option_node (&global_options);
11030 /* Implement targetm.override_options_after_change. */
11032 static void
11033 aarch64_override_options_after_change (void)
11035 aarch64_override_options_after_change_1 (&global_options);
11038 static struct machine_function *
11039 aarch64_init_machine_status (void)
11041 struct machine_function *machine;
11042 machine = ggc_cleared_alloc<machine_function> ();
11043 return machine;
11046 void
11047 aarch64_init_expanders (void)
11049 init_machine_status = aarch64_init_machine_status;
11052 /* Select the code model to use based on OPTS, taking the PIC flags into account. */
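/* For example, -mcmodel=small together with -fPIC selects
   AARCH64_CMODEL_SMALL_PIC, -fpic selects AARCH64_CMODEL_SMALL_SPIC when
   the assembler has the small PIC relocations, and combining the large
   model with PIC is not supported.  */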
11053 static void
11054 initialize_aarch64_code_model (struct gcc_options *opts)
11056 if (opts->x_flag_pic)
11058 switch (opts->x_aarch64_cmodel_var)
11060 case AARCH64_CMODEL_TINY:
11061 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11062 break;
11063 case AARCH64_CMODEL_SMALL:
11064 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11065 aarch64_cmodel = (flag_pic == 2
11066 ? AARCH64_CMODEL_SMALL_PIC
11067 : AARCH64_CMODEL_SMALL_SPIC);
11068 #else
11069 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11070 #endif
11071 break;
11072 case AARCH64_CMODEL_LARGE:
11073 sorry ("code model %qs with -f%s", "large",
11074 opts->x_flag_pic > 1 ? "PIC" : "pic");
11075 break;
11076 default:
11077 gcc_unreachable ();
11080 else
11081 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11084 /* Implement TARGET_OPTION_SAVE. */
11086 static void
11087 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11089 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11092 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11093 using the information saved in PTR. */
11095 static void
11096 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11098 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11099 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11100 opts->x_explicit_arch = ptr->x_explicit_arch;
11101 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11102 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11104 aarch64_override_options_internal (opts);
11107 /* Implement TARGET_OPTION_PRINT. */
11109 static void
11110 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11112 const struct processor *cpu
11113 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11114 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11115 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11116 std::string extension
11117 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11119 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11120 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11121 arch->name, extension.c_str ());
11124 static GTY(()) tree aarch64_previous_fndecl;
11126 void
11127 aarch64_reset_previous_fndecl (void)
11129 aarch64_previous_fndecl = NULL;
11132 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11133 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11134 make sure optab availability predicates are recomputed when necessary. */
11136 void
11137 aarch64_save_restore_target_globals (tree new_tree)
11139 if (TREE_TARGET_GLOBALS (new_tree))
11140 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11141 else if (new_tree == target_option_default_node)
11142 restore_target_globals (&default_target_globals);
11143 else
11144 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11147 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11148 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11149 of the function, if such exists. This function may be called multiple
11150 times on a single function so use aarch64_previous_fndecl to avoid
11151 setting up identical state. */
11153 static void
11154 aarch64_set_current_function (tree fndecl)
11156 if (!fndecl || fndecl == aarch64_previous_fndecl)
11157 return;
11159 tree old_tree = (aarch64_previous_fndecl
11160 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11161 : NULL_TREE);
11163 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11165 /* If current function has no attributes but the previous one did,
11166 use the default node. */
11167 if (!new_tree && old_tree)
11168 new_tree = target_option_default_node;
11170 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11171 the default have been handled by aarch64_save_restore_target_globals from
11172 aarch64_pragma_target_parse. */
11173 if (old_tree == new_tree)
11174 return;
11176 aarch64_previous_fndecl = fndecl;
11178 /* First set the target options. */
11179 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11181 aarch64_save_restore_target_globals (new_tree);
11184 /* Enum describing the various ways we can handle attributes.
11185 In many cases we can reuse the generic option handling machinery. */
11187 enum aarch64_attr_opt_type
11189 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11190 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11191 aarch64_attr_enum, /* Attribute sets an enum variable. */
11192 aarch64_attr_custom /* Attribute requires a custom handling function. */
11195 /* All the information needed to handle a target attribute.
11196 NAME is the name of the attribute.
11197 ATTR_TYPE specifies the type of behavior of the attribute as described
11198 in the definition of enum aarch64_attr_opt_type.
11199 ALLOW_NEG is true if the attribute supports a "no-" form.
11200 HANDLER is the function that takes the attribute string as an argument
11201 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11202 OPT_NUM is the enum specifying the option that the attribute modifies.
11203 This is needed for attributes that mirror the behavior of a command-line
11204 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11205 aarch64_attr_enum. */
11207 struct aarch64_attribute_info
11209 const char *name;
11210 enum aarch64_attr_opt_type attr_type;
11211 bool allow_neg;
11212 bool (*handler) (const char *);
11213 enum opt_code opt_num;
11216 /* Handle the ARCH_STR argument to the arch= target attribute. */
11218 static bool
11219 aarch64_handle_attr_arch (const char *str)
11221 const struct processor *tmp_arch = NULL;
11222 enum aarch64_parse_opt_result parse_res
11223 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11225 if (parse_res == AARCH64_PARSE_OK)
11227 gcc_assert (tmp_arch);
11228 selected_arch = tmp_arch;
11229 explicit_arch = selected_arch->arch;
11230 return true;
11233 switch (parse_res)
11235 case AARCH64_PARSE_MISSING_ARG:
11236 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11237 break;
11238 case AARCH64_PARSE_INVALID_ARG:
11239 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11240 aarch64_print_hint_for_arch (str);
11241 break;
11242 case AARCH64_PARSE_INVALID_FEATURE:
11243 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11244 break;
11245 default:
11246 gcc_unreachable ();
11249 return false;
11252 /* Handle the argument CPU_STR to the cpu= target attribute. */
11254 static bool
11255 aarch64_handle_attr_cpu (const char *str)
11257 const struct processor *tmp_cpu = NULL;
11258 enum aarch64_parse_opt_result parse_res
11259 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11261 if (parse_res == AARCH64_PARSE_OK)
11263 gcc_assert (tmp_cpu);
11264 selected_tune = tmp_cpu;
11265 explicit_tune_core = selected_tune->ident;
11267 selected_arch = &all_architectures[tmp_cpu->arch];
11268 explicit_arch = selected_arch->arch;
11269 return true;
11272 switch (parse_res)
11274 case AARCH64_PARSE_MISSING_ARG:
11275 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11276 break;
11277 case AARCH64_PARSE_INVALID_ARG:
11278 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11279 aarch64_print_hint_for_core (str);
11280 break;
11281 case AARCH64_PARSE_INVALID_FEATURE:
11282 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11283 break;
11284 default:
11285 gcc_unreachable ();
11288 return false;
11291 /* Handle the argument STR to the tune= target attribute. */
11293 static bool
11294 aarch64_handle_attr_tune (const char *str)
11296 const struct processor *tmp_tune = NULL;
11297 enum aarch64_parse_opt_result parse_res
11298 = aarch64_parse_tune (str, &tmp_tune);
11300 if (parse_res == AARCH64_PARSE_OK)
11302 gcc_assert (tmp_tune);
11303 selected_tune = tmp_tune;
11304 explicit_tune_core = selected_tune->ident;
11305 return true;
11308 switch (parse_res)
11310 case AARCH64_PARSE_INVALID_ARG:
11311 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11312 aarch64_print_hint_for_core (str);
11313 break;
11314 default:
11315 gcc_unreachable ();
11318 return false;
11321 /* Parse an architecture extensions target attribute string specified in STR.
11322 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11323 if successful. Update aarch64_isa_flags to reflect the ISA features
11324 modified. */
11326 static bool
11327 aarch64_handle_attr_isa_flags (char *str)
11329 enum aarch64_parse_opt_result parse_res;
11330 unsigned long isa_flags = aarch64_isa_flags;
11332 /* We allow "+nothing" in the beginning to clear out all architectural
11333 features if the user wants to handpick specific features. */
11334 if (strncmp ("+nothing", str, 8) == 0)
11336 isa_flags = 0;
11337 str += 8;
11340 parse_res = aarch64_parse_extension (str, &isa_flags);
11342 if (parse_res == AARCH64_PARSE_OK)
11344 aarch64_isa_flags = isa_flags;
11345 return true;
11348 switch (parse_res)
11350 case AARCH64_PARSE_MISSING_ARG:
11351 error ("missing value in %<target()%> pragma or attribute");
11352 break;
11354 case AARCH64_PARSE_INVALID_FEATURE:
11355 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11356 break;
11358 default:
11359 gcc_unreachable ();
11362 return false;
11365 /* The target attributes that we support. On top of these we also support just
11366 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11367 handled explicitly in aarch64_process_one_target_attr. */
11369 static const struct aarch64_attribute_info aarch64_attributes[] =
11371 { "general-regs-only", aarch64_attr_mask, false, NULL,
11372 OPT_mgeneral_regs_only },
11373 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11374 OPT_mfix_cortex_a53_835769 },
11375 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11376 OPT_mfix_cortex_a53_843419 },
11377 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11378 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11379 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11380 OPT_momit_leaf_frame_pointer },
11381 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11382 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11383 OPT_march_ },
11384 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11385 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11386 OPT_mtune_ },
11387 { "sign-return-address", aarch64_attr_enum, false, NULL,
11388 OPT_msign_return_address_ },
11389 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11392 /* Parse ARG_STR which contains the definition of one target attribute.
11393 Show appropriate errors if any or return true if the attribute is valid. */
11395 static bool
11396 aarch64_process_one_target_attr (char *arg_str)
11398 bool invert = false;
11400 size_t len = strlen (arg_str);
11402 if (len == 0)
11404 error ("malformed %<target()%> pragma or attribute");
11405 return false;
11408 char *str_to_check = (char *) alloca (len + 1);
11409 strcpy (str_to_check, arg_str);
11411 /* Skip leading whitespace. */
11412 while (*str_to_check == ' ' || *str_to_check == '\t')
11413 str_to_check++;
11415 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11416 It is easier to detect and handle it explicitly here rather than going
11417 through the machinery for the rest of the target attributes in this
11418 function. */
11419 if (*str_to_check == '+')
11420 return aarch64_handle_attr_isa_flags (str_to_check);
11422 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11424 invert = true;
11425 str_to_check += 3;
11427 char *arg = strchr (str_to_check, '=');
11429 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11430 and point ARG to "foo". */
11431 if (arg)
11433 *arg = '\0';
11434 arg++;
11436 const struct aarch64_attribute_info *p_attr;
11437 bool found = false;
11438 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11440 /* If the names don't match up, or the user has given an argument
11441 to an attribute that doesn't accept one, or didn't give an argument
11442 to an attribute that expects one, fail to match. */
11443 if (strcmp (str_to_check, p_attr->name) != 0)
11444 continue;
11446 found = true;
11447 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11448 || p_attr->attr_type == aarch64_attr_enum;
11450 if (attr_need_arg_p ^ (arg != NULL))
11452 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11453 return false;
11456 /* If the name matches but the attribute does not allow "no-" versions
11457 then we can't match. */
11458 if (invert && !p_attr->allow_neg)
11460 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11461 return false;
11464 switch (p_attr->attr_type)
11466 /* Has a custom handler registered.
11467 For example, cpu=, arch=, tune=. */
11468 case aarch64_attr_custom:
11469 gcc_assert (p_attr->handler);
11470 if (!p_attr->handler (arg))
11471 return false;
11472 break;
11474 /* Either set or unset a boolean option. */
11475 case aarch64_attr_bool:
11477 struct cl_decoded_option decoded;
11479 generate_option (p_attr->opt_num, NULL, !invert,
11480 CL_TARGET, &decoded);
11481 aarch64_handle_option (&global_options, &global_options_set,
11482 &decoded, input_location);
11483 break;
11485 /* Set or unset a bit in the target_flags. aarch64_handle_option
11486 should know what mask to apply given the option number. */
11487 case aarch64_attr_mask:
11489 struct cl_decoded_option decoded;
11490 /* We only need to specify the option number.
11491 aarch64_handle_option will know which mask to apply. */
11492 decoded.opt_index = p_attr->opt_num;
11493 decoded.value = !invert;
11494 aarch64_handle_option (&global_options, &global_options_set,
11495 &decoded, input_location);
11496 break;
11498 /* Use the option setting machinery to set an option to an enum. */
11499 case aarch64_attr_enum:
11501 gcc_assert (arg);
11502 bool valid;
11503 int value;
11504 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11505 &value, CL_TARGET);
11506 if (valid)
11508 set_option (&global_options, NULL, p_attr->opt_num, value,
11509 NULL, DK_UNSPECIFIED, input_location,
11510 global_dc);
11512 else
11514 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11516 break;
11518 default:
11519 gcc_unreachable ();
11523 /* If we reached here we either have found an attribute and validated
11524 it or didn't match any. If we matched an attribute but its arguments
11525 were malformed we will have returned false already. */
11526 return found;
11529 /* Count how many times the character C appears in
11530 NULL-terminated string STR. */
11532 static unsigned int
11533 num_occurences_in_str (char c, char *str)
11535 unsigned int res = 0;
11536 while (*str != '\0')
11538 if (*str == c)
11539 res++;
11541 str++;
11544 return res;
11547 /* Parse the tree in ARGS that contains the target attribute information
11548 and update the global target options space. */
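/* For example, __attribute__ ((target ("arch=armv8-a+crc,no-strict-align")))
   reaches this function as the STRING_CST "arch=armv8-a+crc,no-strict-align",
   which is split on ',' and each piece is passed to
   aarch64_process_one_target_attr above.  */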
11550 bool
11551 aarch64_process_target_attr (tree args)
11553 if (TREE_CODE (args) == TREE_LIST)
11557 tree head = TREE_VALUE (args);
11558 if (head)
11560 if (!aarch64_process_target_attr (head))
11561 return false;
11563 args = TREE_CHAIN (args);
11564 } while (args);
11566 return true;
11569 if (TREE_CODE (args) != STRING_CST)
11571 error ("attribute %<target%> argument not a string");
11572 return false;
11575 size_t len = strlen (TREE_STRING_POINTER (args));
11576 char *str_to_check = (char *) alloca (len + 1);
11577 strcpy (str_to_check, TREE_STRING_POINTER (args));
11579 if (len == 0)
11581 error ("malformed %<target()%> pragma or attribute");
11582 return false;
11585 /* Used to catch empty tokens between commas, i.e.
11586 attribute ((target ("attr1,,attr2"))). */
11587 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11589 /* Handle multiple target attributes separated by ','. */
11590 char *token = strtok (str_to_check, ",");
11592 unsigned int num_attrs = 0;
11593 while (token)
11595 num_attrs++;
11596 if (!aarch64_process_one_target_attr (token))
11598 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11599 return false;
11602 token = strtok (NULL, ",");
11605 if (num_attrs != num_commas + 1)
11607 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11608 return false;
11611 return true;
11614 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11615 process attribute ((target ("..."))). */
11617 static bool
11618 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11620 struct cl_target_option cur_target;
11621 bool ret;
11622 tree old_optimize;
11623 tree new_target, new_optimize;
11624 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11626 /* If what we're processing is the current pragma string then the
11627 target option node is already stored in target_option_current_node
11628 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11629 having to re-parse the string. This is especially useful to keep
11630 arm_neon.h compile times down since that header contains a lot
11631 of intrinsics enclosed in pragmas. */
11632 if (!existing_target && args == current_target_pragma)
11634 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11635 return true;
11637 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11639 old_optimize = build_optimization_node (&global_options);
11640 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11642 /* If the function changed the optimization levels as well as setting
11643 target options, start with the optimizations specified. */
11644 if (func_optimize && func_optimize != old_optimize)
11645 cl_optimization_restore (&global_options,
11646 TREE_OPTIMIZATION (func_optimize));
11648 /* Save the current target options to restore at the end. */
11649 cl_target_option_save (&cur_target, &global_options);
11651 /* If fndecl already has some target attributes applied to it, unpack
11652 them so that we add this attribute on top of them, rather than
11653 overwriting them. */
11654 if (existing_target)
11656 struct cl_target_option *existing_options
11657 = TREE_TARGET_OPTION (existing_target);
11659 if (existing_options)
11660 cl_target_option_restore (&global_options, existing_options);
11662 else
11663 cl_target_option_restore (&global_options,
11664 TREE_TARGET_OPTION (target_option_current_node));
11666 ret = aarch64_process_target_attr (args);
11668 /* Set up any additional state. */
11669 if (ret)
11671 aarch64_override_options_internal (&global_options);
11672 /* Initialize SIMD builtins if we haven't already.
11673 Set current_target_pragma to NULL for the duration so that
11674 the builtin initialization code doesn't try to tag the functions
11675 being built with the attributes specified by any current pragma, thus
11676 going into an infinite recursion. */
11677 if (TARGET_SIMD)
11679 tree saved_current_target_pragma = current_target_pragma;
11680 current_target_pragma = NULL;
11681 aarch64_init_simd_builtins ();
11682 current_target_pragma = saved_current_target_pragma;
11684 new_target = build_target_option_node (&global_options);
11686 else
11687 new_target = NULL;
11689 new_optimize = build_optimization_node (&global_options);
11691 if (fndecl && ret)
11693 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11695 if (old_optimize != new_optimize)
11696 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11699 cl_target_option_restore (&global_options, &cur_target);
11701 if (old_optimize != new_optimize)
11702 cl_optimization_restore (&global_options,
11703 TREE_OPTIMIZATION (old_optimize));
11704 return ret;
11707 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11708 tri-bool options (yes, no, don't care) and the default value is
11709 DEF, determine whether to reject inlining. */
11711 static bool
11712 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11713 int dont_care, int def)
11715 /* If the callee doesn't care, always allow inlining. */
11716 if (callee == dont_care)
11717 return true;
11719 /* If the caller doesn't care, always allow inlining. */
11720 if (caller == dont_care)
11721 return true;
11723 /* Otherwise, allow inlining if the caller and callee values
11724 agree, or if the callee is using the default value. */
11725 return (callee == caller || callee == def);
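/* Worked example (not part of the build) with the encoding used by the
   callers below, where DONT_CARE == 2 and DEF is the option's default:
     callee == 2                      -> inline allowed (callee doesn't care)
     caller == 2                      -> inline allowed (caller doesn't care)
     caller != callee, callee == DEF  -> inline allowed
     caller != callee, callee != DEF  -> inlining rejected.  */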
11728 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11729 to inline CALLEE into CALLER based on target-specific info.
11730 Make sure that the caller and callee have compatible architectural
11731 features. Then go through the other possible target attributes
11732 and see if they can block inlining. Try not to reject always_inline
11733 callees unless they are incompatible architecturally. */
11735 static bool
11736 aarch64_can_inline_p (tree caller, tree callee)
11738 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11739 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11741 struct cl_target_option *caller_opts
11742 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11743 : target_option_default_node);
11745 struct cl_target_option *callee_opts
11746 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11747 : target_option_default_node);
11749 /* Callee's ISA flags should be a subset of the caller's. */
11750 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11751 != callee_opts->x_aarch64_isa_flags)
11752 return false;
11754 /* Allow non-strict aligned functions inlining into strict
11755 aligned ones. */
11756 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11757 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11758 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11759 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11760 return false;
11762 bool always_inline = lookup_attribute ("always_inline",
11763 DECL_ATTRIBUTES (callee));
11765 /* If the architectural features match up and the callee is always_inline
11766 then the other attributes don't matter. */
11767 if (always_inline)
11768 return true;
11770 if (caller_opts->x_aarch64_cmodel_var
11771 != callee_opts->x_aarch64_cmodel_var)
11772 return false;
11774 if (caller_opts->x_aarch64_tls_dialect
11775 != callee_opts->x_aarch64_tls_dialect)
11776 return false;
11778 /* Honour explicit requests to workaround errata. */
11779 if (!aarch64_tribools_ok_for_inlining_p (
11780 caller_opts->x_aarch64_fix_a53_err835769,
11781 callee_opts->x_aarch64_fix_a53_err835769,
11782 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11783 return false;
11785 if (!aarch64_tribools_ok_for_inlining_p (
11786 caller_opts->x_aarch64_fix_a53_err843419,
11787 callee_opts->x_aarch64_fix_a53_err843419,
11788 2, TARGET_FIX_ERR_A53_843419))
11789 return false;
11791 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11792 caller and callee and they don't match up, reject inlining. */
11793 if (!aarch64_tribools_ok_for_inlining_p (
11794 caller_opts->x_flag_omit_leaf_frame_pointer,
11795 callee_opts->x_flag_omit_leaf_frame_pointer,
11796 2, 1))
11797 return false;
11799 /* If the callee has specific tuning overrides, respect them. */
11800 if (callee_opts->x_aarch64_override_tune_string != NULL
11801 && caller_opts->x_aarch64_override_tune_string == NULL)
11802 return false;
11804 /* If the user specified tuning override strings for the
11805 caller and callee and they don't match up, reject inlining.
11806 We just do a string compare here, we don't analyze the meaning
11807 of the string, as it would be too costly for little gain. */
11808 if (callee_opts->x_aarch64_override_tune_string
11809 && caller_opts->x_aarch64_override_tune_string
11810 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11811 caller_opts->x_aarch64_override_tune_string) != 0))
11812 return false;
11814 return true;
11817 /* Return true if SYMBOL_REF X binds locally. */
11819 static bool
11820 aarch64_symbol_binds_local_p (const_rtx x)
11822 return (SYMBOL_REF_DECL (x)
11823 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11824 : SYMBOL_REF_LOCAL_P (x));
11827 /* Return true if SYMBOL_REF X is thread local */
11828 static bool
11829 aarch64_tls_symbol_p (rtx x)
11831 if (! TARGET_HAVE_TLS)
11832 return false;
11834 if (GET_CODE (x) != SYMBOL_REF)
11835 return false;
11837 return SYMBOL_REF_TLS_MODEL (x) != 0;
11840 /* Classify a TLS symbol into one of the TLS kinds. */
11841 enum aarch64_symbol_type
11842 aarch64_classify_tls_symbol (rtx x)
11844 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11846 switch (tls_kind)
11848 case TLS_MODEL_GLOBAL_DYNAMIC:
11849 case TLS_MODEL_LOCAL_DYNAMIC:
11850 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11852 case TLS_MODEL_INITIAL_EXEC:
11853 switch (aarch64_cmodel)
11855 case AARCH64_CMODEL_TINY:
11856 case AARCH64_CMODEL_TINY_PIC:
11857 return SYMBOL_TINY_TLSIE;
11858 default:
11859 return SYMBOL_SMALL_TLSIE;
11862 case TLS_MODEL_LOCAL_EXEC:
11863 if (aarch64_tls_size == 12)
11864 return SYMBOL_TLSLE12;
11865 else if (aarch64_tls_size == 24)
11866 return SYMBOL_TLSLE24;
11867 else if (aarch64_tls_size == 32)
11868 return SYMBOL_TLSLE32;
11869 else if (aarch64_tls_size == 48)
11870 return SYMBOL_TLSLE48;
11871 else
11872 gcc_unreachable ();
11874 case TLS_MODEL_EMULATED:
11875 case TLS_MODEL_NONE:
11876 return SYMBOL_FORCE_TO_MEM;
11878 default:
11879 gcc_unreachable ();
11883 /* Return the correct method for accessing X + OFFSET, where X is either
11884 a SYMBOL_REF or LABEL_REF. */
11886 enum aarch64_symbol_type
11887 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11889 if (GET_CODE (x) == LABEL_REF)
11891 switch (aarch64_cmodel)
11893 case AARCH64_CMODEL_LARGE:
11894 return SYMBOL_FORCE_TO_MEM;
11896 case AARCH64_CMODEL_TINY_PIC:
11897 case AARCH64_CMODEL_TINY:
11898 return SYMBOL_TINY_ABSOLUTE;
11900 case AARCH64_CMODEL_SMALL_SPIC:
11901 case AARCH64_CMODEL_SMALL_PIC:
11902 case AARCH64_CMODEL_SMALL:
11903 return SYMBOL_SMALL_ABSOLUTE;
11905 default:
11906 gcc_unreachable ();
11910 if (GET_CODE (x) == SYMBOL_REF)
11912 if (aarch64_tls_symbol_p (x))
11913 return aarch64_classify_tls_symbol (x);
11915 switch (aarch64_cmodel)
11917 case AARCH64_CMODEL_TINY:
11918 /* When we retrieve symbol + offset address, we have to make sure
11919 the offset does not cause overflow of the final address. But
11920 we have no way of knowing the address of symbol at compile time
11921 so we can't accurately say if the distance between the PC and
11922 symbol + offset is outside the addressable range of +/-1M in the
11923 TINY code model. So we rely on images not being greater than
11924 1M, cap the offset at 1M, and require anything beyond 1M to be
11925 loaded using an alternative mechanism. Furthermore, if the
11926 symbol is a weak reference to something that isn't known to
11927 resolve to a symbol in this module, then force to memory. */
11928 if ((SYMBOL_REF_WEAK (x)
11929 && !aarch64_symbol_binds_local_p (x))
11930 || !IN_RANGE (offset, -1048575, 1048575))
11931 return SYMBOL_FORCE_TO_MEM;
11932 return SYMBOL_TINY_ABSOLUTE;
11934 case AARCH64_CMODEL_SMALL:
11935 /* Same reasoning as the tiny code model, but the offset cap here is
11936 4G. */
11937 if ((SYMBOL_REF_WEAK (x)
11938 && !aarch64_symbol_binds_local_p (x))
11939 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11940 HOST_WIDE_INT_C (4294967264)))
11941 return SYMBOL_FORCE_TO_MEM;
11942 return SYMBOL_SMALL_ABSOLUTE;
11944 case AARCH64_CMODEL_TINY_PIC:
11945 if (!aarch64_symbol_binds_local_p (x))
11946 return SYMBOL_TINY_GOT;
11947 return SYMBOL_TINY_ABSOLUTE;
11949 case AARCH64_CMODEL_SMALL_SPIC:
11950 case AARCH64_CMODEL_SMALL_PIC:
11951 if (!aarch64_symbol_binds_local_p (x))
11952 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11953 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11954 return SYMBOL_SMALL_ABSOLUTE;
11956 case AARCH64_CMODEL_LARGE:
11957 /* This is alright even in PIC code as the constant
11958 pool reference is always PC relative and within
11959 the same translation unit. */
11960 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11961 return SYMBOL_SMALL_ABSOLUTE;
11962 else
11963 return SYMBOL_FORCE_TO_MEM;
11965 default:
11966 gcc_unreachable ();
11970 /* By default push everything into the constant pool. */
11971 return SYMBOL_FORCE_TO_MEM;
11974 bool
11975 aarch64_constant_address_p (rtx x)
11977 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11980 bool
11981 aarch64_legitimate_pic_operand_p (rtx x)
11983 if (GET_CODE (x) == SYMBOL_REF
11984 || (GET_CODE (x) == CONST
11985 && GET_CODE (XEXP (x, 0)) == PLUS
11986 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11987 return false;
11989 return true;
11992 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11993 that should be rematerialized rather than spilled. */
11995 static bool
11996 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11998 /* Support CSE and rematerialization of common constants. */
11999 if (CONST_INT_P (x)
12000 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
12001 || GET_CODE (x) == CONST_VECTOR)
12002 return true;
12004 /* Do not allow vector struct mode constants for Advanced SIMD.
12005 We could support 0 and -1 easily, but they need support in
12006 aarch64-simd.md. */
12007 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12008 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
12009 return false;
12011 /* Only accept variable-length vector constants if they can be
12012 handled directly.
12014 ??? It would be possible to handle rematerialization of other
12015 constants via secondary reloads. */
12016 if (vec_flags & VEC_ANY_SVE)
12017 return aarch64_simd_valid_immediate (x, NULL);
12019 if (GET_CODE (x) == HIGH)
12020 x = XEXP (x, 0);
12022 /* Accept polynomial constants that can be calculated by using the
12023 destination of a move as the sole temporary. Constants that
12024 require a second temporary cannot be rematerialized (they can't be
12025 forced to memory and also aren't legitimate constants). */
12026 poly_int64 offset;
12027 if (poly_int_rtx_p (x, &offset))
12028 return aarch64_offset_temporaries (false, offset) <= 1;
12030 /* If an offset is being added to something else, we need to allow the
12031 base to be moved into the destination register, meaning that there
12032 are no free temporaries for the offset. */
12033 x = strip_offset (x, &offset);
12034 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12035 return false;
12037 /* Do not allow const (plus (anchor_symbol, const_int)). */
12038 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12039 return false;
12041 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12042 so spilling them is better than rematerialization. */
12043 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12044 return true;
12046 /* Label references are always constant. */
12047 if (GET_CODE (x) == LABEL_REF)
12048 return true;
12050 return false;
12054 aarch64_load_tp (rtx target)
12056 if (!target
12057 || GET_MODE (target) != Pmode
12058 || !register_operand (target, Pmode))
12059 target = gen_reg_rtx (Pmode);
12061 /* Can return in any reg. */
12062 emit_insn (gen_aarch64_load_tp_hard (target));
12063 return target;
12066 /* On AAPCS systems, this is the "struct __va_list". */
12067 static GTY(()) tree va_list_type;
12069 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12070 Return the type to use as __builtin_va_list.
12072 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12074 struct __va_list
12076 void *__stack;
12077 void *__gr_top;
12078 void *__vr_top;
12079 int __gr_offs;
12080 int __vr_offs;
12081 }; */
12083 static tree
12084 aarch64_build_builtin_va_list (void)
12086 tree va_list_name;
12087 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12089 /* Create the type. */
12090 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12091 /* Give it the required name. */
12092 va_list_name = build_decl (BUILTINS_LOCATION,
12093 TYPE_DECL,
12094 get_identifier ("__va_list"),
12095 va_list_type);
12096 DECL_ARTIFICIAL (va_list_name) = 1;
12097 TYPE_NAME (va_list_type) = va_list_name;
12098 TYPE_STUB_DECL (va_list_type) = va_list_name;
12100 /* Create the fields. */
12101 f_stack = build_decl (BUILTINS_LOCATION,
12102 FIELD_DECL, get_identifier ("__stack"),
12103 ptr_type_node);
12104 f_grtop = build_decl (BUILTINS_LOCATION,
12105 FIELD_DECL, get_identifier ("__gr_top"),
12106 ptr_type_node);
12107 f_vrtop = build_decl (BUILTINS_LOCATION,
12108 FIELD_DECL, get_identifier ("__vr_top"),
12109 ptr_type_node);
12110 f_groff = build_decl (BUILTINS_LOCATION,
12111 FIELD_DECL, get_identifier ("__gr_offs"),
12112 integer_type_node);
12113 f_vroff = build_decl (BUILTINS_LOCATION,
12114 FIELD_DECL, get_identifier ("__vr_offs"),
12115 integer_type_node);
12117 /* Tell tree-stdarg pass about our internal offset fields.
12118 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12119 purposes, to identify whether the code is updating the va_list internal
12120 offset fields in an irregular way. */
12121 va_list_gpr_counter_field = f_groff;
12122 va_list_fpr_counter_field = f_vroff;
12124 DECL_ARTIFICIAL (f_stack) = 1;
12125 DECL_ARTIFICIAL (f_grtop) = 1;
12126 DECL_ARTIFICIAL (f_vrtop) = 1;
12127 DECL_ARTIFICIAL (f_groff) = 1;
12128 DECL_ARTIFICIAL (f_vroff) = 1;
12130 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12131 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12132 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12133 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12134 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12136 TYPE_FIELDS (va_list_type) = f_stack;
12137 DECL_CHAIN (f_stack) = f_grtop;
12138 DECL_CHAIN (f_grtop) = f_vrtop;
12139 DECL_CHAIN (f_vrtop) = f_groff;
12140 DECL_CHAIN (f_groff) = f_vroff;
12142 /* Compute its layout. */
12143 layout_type (va_list_type);
12145 return va_list_type;
12148 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12149 static void
12150 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12152 const CUMULATIVE_ARGS *cum;
12153 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12154 tree stack, grtop, vrtop, groff, vroff;
12155 tree t;
12156 int gr_save_area_size = cfun->va_list_gpr_size;
12157 int vr_save_area_size = cfun->va_list_fpr_size;
12158 int vr_offset;
12160 cum = &crtl->args.info;
12161 if (cfun->va_list_gpr_size)
12162 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12163 cfun->va_list_gpr_size);
12164 if (cfun->va_list_fpr_size)
12165 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12166 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12168 if (!TARGET_FLOAT)
12170 gcc_assert (cum->aapcs_nvrn == 0);
12171 vr_save_area_size = 0;
12174 f_stack = TYPE_FIELDS (va_list_type_node);
12175 f_grtop = DECL_CHAIN (f_stack);
12176 f_vrtop = DECL_CHAIN (f_grtop);
12177 f_groff = DECL_CHAIN (f_vrtop);
12178 f_vroff = DECL_CHAIN (f_groff);
12180 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12181 NULL_TREE);
12182 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12183 NULL_TREE);
12184 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12185 NULL_TREE);
12186 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12187 NULL_TREE);
12188 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12189 NULL_TREE);
12191 /* Emit code to initialize STACK, which points to the next varargs stack
12192 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12193 by named arguments. STACK is 8-byte aligned. */
12194 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12195 if (cum->aapcs_stack_size > 0)
12196 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12197 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12198 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12200 /* Emit code to initialize GRTOP, the top of the GR save area.
12201 virtual_incoming_args_rtx should have been 16 byte aligned. */
12202 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12203 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12204 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12206 /* Emit code to initialize VRTOP, the top of the VR save area.
12207 This address is gr_save_area_bytes below GRTOP, rounded
12208 down to the next 16-byte boundary. */
12209 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12210 vr_offset = ROUND_UP (gr_save_area_size,
12211 STACK_BOUNDARY / BITS_PER_UNIT);
12213 if (vr_offset)
12214 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12215 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12216 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12218 /* Emit code to initialize GROFF, the offset from GRTOP of the
12219 next GPR argument. */
12220 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12221 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12222 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12224 /* Likewise emit code to initialize VROFF, the offset from VRTOP
12225 of the next VR argument. */
12226 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12227 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12228 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
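/* Minimal standalone sketch (not part of the build) of the arithmetic
   expanded above, assuming the usual AAPCS64 parameters: 8 general
   argument registers of UNITS_PER_WORD == 8 bytes and 8 FP/SIMD argument
   registers of UNITS_PER_VREG == 16 bytes.  Given how many of each were
   consumed by named arguments, it returns the initial __gr_offs/__vr_offs
   values and the distance of __vr_top below __gr_top.  All names are
   hypothetical and the tree-stdarg size limiting is ignored.  */
struct sketch_va_start_values
{
  int gr_offs;       /* Negative size of the remaining GR save area.  */
  int vr_offs;       /* Negative size of the remaining VR save area.  */
  int vr_top_delta;  /* __gr_top - __vr_top, rounded to 16 bytes.  */
};
static struct sketch_va_start_values
sketch_compute_va_start (int named_gr_regs, int named_vr_regs)
{
  struct sketch_va_start_values v;
  int gr_save = (8 - named_gr_regs) * 8;
  int vr_save = (8 - named_vr_regs) * 16;
  v.gr_offs = -gr_save;
  v.vr_offs = -vr_save;
  /* The VR save area sits below the GR one, 16-byte aligned.  */
  v.vr_top_delta = (gr_save + 15) & -16;
  return v;
}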
12231 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12233 static tree
12234 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12235 gimple_seq *post_p ATTRIBUTE_UNUSED)
12237 tree addr;
12238 bool indirect_p;
12239 bool is_ha; /* is HFA or HVA. */
12240 bool dw_align; /* double-word align. */
12241 machine_mode ag_mode = VOIDmode;
12242 int nregs;
12243 machine_mode mode;
12245 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12246 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12247 HOST_WIDE_INT size, rsize, adjust, align;
12248 tree t, u, cond1, cond2;
12250 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12251 if (indirect_p)
12252 type = build_pointer_type (type);
12254 mode = TYPE_MODE (type);
12256 f_stack = TYPE_FIELDS (va_list_type_node);
12257 f_grtop = DECL_CHAIN (f_stack);
12258 f_vrtop = DECL_CHAIN (f_grtop);
12259 f_groff = DECL_CHAIN (f_vrtop);
12260 f_vroff = DECL_CHAIN (f_groff);
12262 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12263 f_stack, NULL_TREE);
12264 size = int_size_in_bytes (type);
12265 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12267 dw_align = false;
12268 adjust = 0;
12269 if (aarch64_vfp_is_call_or_return_candidate (mode,
12270 type,
12271 &ag_mode,
12272 &nregs,
12273 &is_ha))
12275 /* No frontends can create types with variable-sized modes, so we
12276 shouldn't be asked to pass or return them. */
12277 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12279 /* TYPE passed in fp/simd registers. */
12280 if (!TARGET_FLOAT)
12281 aarch64_err_no_fpadvsimd (mode);
12283 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12284 unshare_expr (valist), f_vrtop, NULL_TREE);
12285 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12286 unshare_expr (valist), f_vroff, NULL_TREE);
12288 rsize = nregs * UNITS_PER_VREG;
12290 if (is_ha)
12292 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12293 adjust = UNITS_PER_VREG - ag_size;
12295 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12296 && size < UNITS_PER_VREG)
12298 adjust = UNITS_PER_VREG - size;
12301 else
12303 /* TYPE passed in general registers. */
12304 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12305 unshare_expr (valist), f_grtop, NULL_TREE);
12306 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12307 unshare_expr (valist), f_groff, NULL_TREE);
12308 rsize = ROUND_UP (size, UNITS_PER_WORD);
12309 nregs = rsize / UNITS_PER_WORD;
12311 if (align > 8)
12312 dw_align = true;
12314 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12315 && size < UNITS_PER_WORD)
12317 adjust = UNITS_PER_WORD - size;
12321 /* Get a local temporary for the field value. */
12322 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12324 /* Emit code to branch if off >= 0. */
12325 t = build2 (GE_EXPR, boolean_type_node, off,
12326 build_int_cst (TREE_TYPE (off), 0));
12327 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12329 if (dw_align)
12331 /* Emit: offs = (offs + 15) & -16. */
12332 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12333 build_int_cst (TREE_TYPE (off), 15));
12334 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12335 build_int_cst (TREE_TYPE (off), -16));
12336 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12338 else
12339 roundup = NULL;
12341 /* Update ap.__[g|v]r_offs */
12342 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12343 build_int_cst (TREE_TYPE (off), rsize));
12344 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12346 /* String up. */
12347 if (roundup)
12348 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12350 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12351 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12352 build_int_cst (TREE_TYPE (f_off), 0));
12353 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12355 /* String up: make sure the assignment happens before the use. */
12356 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12357 COND_EXPR_ELSE (cond1) = t;
12359 /* Prepare the trees handling the argument that is passed on the stack;
12360 the top level node will store in ON_STACK. */
12361 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12362 if (align > 8)
12364 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12365 t = fold_build_pointer_plus_hwi (arg, 15);
12366 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12367 build_int_cst (TREE_TYPE (t), -16));
12368 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12370 else
12371 roundup = NULL;
12372 /* Advance ap.__stack */
12373 t = fold_build_pointer_plus_hwi (arg, size + 7);
12374 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12375 build_int_cst (TREE_TYPE (t), -8));
12376 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12377 /* String up roundup and advance. */
12378 if (roundup)
12379 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12380 /* String up with arg */
12381 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12382 /* Big-endianness related address adjustment. */
12383 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12384 && size < UNITS_PER_WORD)
12386 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12387 size_int (UNITS_PER_WORD - size));
12388 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12391 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12392 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12394 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12395 t = off;
12396 if (adjust)
12397 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12398 build_int_cst (TREE_TYPE (off), adjust));
12400 t = fold_convert (sizetype, t);
12401 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12403 if (is_ha)
12405 /* type ha; // treat as "struct {ftype field[n];}"
12406 ... [computing offs]
12407 for (i = 0; i < nregs; ++i, offs += 16)
12408 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12409 return ha; */
12410 int i;
12411 tree tmp_ha, field_t, field_ptr_t;
12413 /* Declare a local variable. */
12414 tmp_ha = create_tmp_var_raw (type, "ha");
12415 gimple_add_tmp_var (tmp_ha);
12417 /* Establish the base type. */
12418 switch (ag_mode)
12420 case E_SFmode:
12421 field_t = float_type_node;
12422 field_ptr_t = float_ptr_type_node;
12423 break;
12424 case E_DFmode:
12425 field_t = double_type_node;
12426 field_ptr_t = double_ptr_type_node;
12427 break;
12428 case E_TFmode:
12429 field_t = long_double_type_node;
12430 field_ptr_t = long_double_ptr_type_node;
12431 break;
12432 case E_HFmode:
12433 field_t = aarch64_fp16_type_node;
12434 field_ptr_t = aarch64_fp16_ptr_type_node;
12435 break;
12436 case E_V2SImode:
12437 case E_V4SImode:
12439 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12440 field_t = build_vector_type_for_mode (innertype, ag_mode);
12441 field_ptr_t = build_pointer_type (field_t);
12443 break;
12444 default:
12445 gcc_assert (0);
12448 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12449 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12450 addr = t;
12451 t = fold_convert (field_ptr_t, addr);
12452 t = build2 (MODIFY_EXPR, field_t,
12453 build1 (INDIRECT_REF, field_t, tmp_ha),
12454 build1 (INDIRECT_REF, field_t, t));
12456 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12457 for (i = 1; i < nregs; ++i)
12459 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12460 u = fold_convert (field_ptr_t, addr);
12461 u = build2 (MODIFY_EXPR, field_t,
12462 build2 (MEM_REF, field_t, tmp_ha,
12463 build_int_cst (field_ptr_t,
12464 (i *
12465 int_size_in_bytes (field_t)))),
12466 build1 (INDIRECT_REF, field_t, u));
12467 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12470 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12471 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12474 COND_EXPR_ELSE (cond2) = t;
12475 addr = fold_convert (build_pointer_type (type), cond1);
12476 addr = build_va_arg_indirect_ref (addr);
12478 if (indirect_p)
12479 addr = build_va_arg_indirect_ref (addr);
12481 return addr;
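/* Standalone sketch (not part of the build) of the control flow built
   above, restricted to the general-register case and ignoring the
   big-endian padding adjustments.  SIZE and ALIGN are in bytes; the
   function name and parameters are hypothetical and 64-bit pointers are
   assumed.  */
static void *
sketch_va_arg_gr (void **stack, void *gr_top, int *gr_offs,
		  int size, int align)
{
  if (*gr_offs < 0)
    {
      if (align > 8)
	*gr_offs = (*gr_offs + 15) & -16;
      int offs = *gr_offs;
      *gr_offs += (size + 7) & -8;	/* Round SIZE up to UNITS_PER_WORD.  */
      if (*gr_offs <= 0)
	/* The argument came in registers and lives in the GR save area
	   below __gr_top.  */
	return (char *) gr_top + offs;
    }
  /* Otherwise take it from the stack, with the same alignment rule.  */
  unsigned long long arg = (unsigned long long) *stack;
  if (align > 8)
    arg = (arg + 15) & -16ULL;
  *stack = (void *) (arg + ((size + 7) & -8));
  return (void *) arg;
}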
12484 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12486 static void
12487 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12488 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12489 int no_rtl)
12491 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12492 CUMULATIVE_ARGS local_cum;
12493 int gr_saved = cfun->va_list_gpr_size;
12494 int vr_saved = cfun->va_list_fpr_size;
12496 /* The caller has advanced CUM up to, but not beyond, the last named
12497 argument. Advance a local copy of CUM past the last "real" named
12498 argument, to find out how many registers are left over. */
12499 local_cum = *cum;
12500 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12502 /* Find out how many registers we need to save.
12503 Honor the tree-stdarg analysis results. */
12504 if (cfun->va_list_gpr_size)
12505 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12506 cfun->va_list_gpr_size / UNITS_PER_WORD);
12507 if (cfun->va_list_fpr_size)
12508 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12509 cfun->va_list_fpr_size / UNITS_PER_VREG);
12511 if (!TARGET_FLOAT)
12513 gcc_assert (local_cum.aapcs_nvrn == 0);
12514 vr_saved = 0;
12517 if (!no_rtl)
12519 if (gr_saved > 0)
12521 rtx ptr, mem;
12523 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12524 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12525 - gr_saved * UNITS_PER_WORD);
12526 mem = gen_frame_mem (BLKmode, ptr);
12527 set_mem_alias_set (mem, get_varargs_alias_set ());
12529 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12530 mem, gr_saved);
12532 if (vr_saved > 0)
12534 /* We can't use move_block_from_reg, because it will use
12535 the wrong mode, storing D regs only. */
12536 machine_mode mode = TImode;
12537 int off, i, vr_start;
12539 /* Set OFF to the offset from virtual_incoming_args_rtx of
12540 the first vector register. The VR save area lies below
12541 the GR one, and is aligned to 16 bytes. */
12542 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12543 STACK_BOUNDARY / BITS_PER_UNIT);
12544 off -= vr_saved * UNITS_PER_VREG;
12546 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12547 for (i = 0; i < vr_saved; ++i)
12549 rtx ptr, mem;
12551 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12552 mem = gen_frame_mem (mode, ptr);
12553 set_mem_alias_set (mem, get_varargs_alias_set ());
12554 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12555 off += UNITS_PER_VREG;
12560 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12561 any complication of having crtl->args.pretend_args_size changed. */
12562 cfun->machine->frame.saved_varargs_size
12563 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12564 STACK_BOUNDARY / BITS_PER_UNIT)
12565 + vr_saved * UNITS_PER_VREG);
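/* Standalone sketch (not part of the build) of the layout recorded above:
   the GR save area lies immediately below the incoming-argument pointer
   and the VR save area lies below that, 16-byte aligned; the sum is the
   value stored in saved_varargs_size.  The helper name is hypothetical
   and assumes UNITS_PER_WORD == 8 and UNITS_PER_VREG == 16.  */
static int
sketch_varargs_save_area_size (int gr_saved, int vr_saved)
{
  int gr_bytes = gr_saved * 8;
  int vr_bytes = vr_saved * 16;
  return ((gr_bytes + 15) & -16) + vr_bytes;
}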
12568 static void
12569 aarch64_conditional_register_usage (void)
12571 int i;
12572 if (!TARGET_FLOAT)
12574 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12576 fixed_regs[i] = 1;
12577 call_used_regs[i] = 1;
12580 if (!TARGET_SVE)
12581 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12583 fixed_regs[i] = 1;
12584 call_used_regs[i] = 1;
12588 /* Walk down the type tree of TYPE counting consecutive base elements.
12589 If *MODEP is VOIDmode, then set it to the first valid floating point
12590 type. If a non-floating point type is found, or if a floating point
12591 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12592 otherwise return the count in the sub-tree. */
12593 static int
12594 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12596 machine_mode mode;
12597 HOST_WIDE_INT size;
12599 switch (TREE_CODE (type))
12601 case REAL_TYPE:
12602 mode = TYPE_MODE (type);
12603 if (mode != DFmode && mode != SFmode
12604 && mode != TFmode && mode != HFmode)
12605 return -1;
12607 if (*modep == VOIDmode)
12608 *modep = mode;
12610 if (*modep == mode)
12611 return 1;
12613 break;
12615 case COMPLEX_TYPE:
12616 mode = TYPE_MODE (TREE_TYPE (type));
12617 if (mode != DFmode && mode != SFmode
12618 && mode != TFmode && mode != HFmode)
12619 return -1;
12621 if (*modep == VOIDmode)
12622 *modep = mode;
12624 if (*modep == mode)
12625 return 2;
12627 break;
12629 case VECTOR_TYPE:
12630 /* Use V2SImode and V4SImode as representatives of all 64-bit
12631 and 128-bit vector types. */
12632 size = int_size_in_bytes (type);
12633 switch (size)
12635 case 8:
12636 mode = V2SImode;
12637 break;
12638 case 16:
12639 mode = V4SImode;
12640 break;
12641 default:
12642 return -1;
12645 if (*modep == VOIDmode)
12646 *modep = mode;
12648 /* Vector modes are considered to be opaque: two vectors are
12649 equivalent for the purposes of being homogeneous aggregates
12650 if they are the same size. */
12651 if (*modep == mode)
12652 return 1;
12654 break;
12656 case ARRAY_TYPE:
12658 int count;
12659 tree index = TYPE_DOMAIN (type);
12661 /* Can't handle incomplete types nor sizes that are not
12662 fixed. */
12663 if (!COMPLETE_TYPE_P (type)
12664 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12665 return -1;
12667 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12668 if (count == -1
12669 || !index
12670 || !TYPE_MAX_VALUE (index)
12671 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12672 || !TYPE_MIN_VALUE (index)
12673 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12674 || count < 0)
12675 return -1;
12677 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12678 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12680 /* There must be no padding. */
12681 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12682 count * GET_MODE_BITSIZE (*modep)))
12683 return -1;
12685 return count;
12688 case RECORD_TYPE:
12690 int count = 0;
12691 int sub_count;
12692 tree field;
12694 /* Can't handle incomplete types nor sizes that are not
12695 fixed. */
12696 if (!COMPLETE_TYPE_P (type)
12697 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12698 return -1;
12700 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12702 if (TREE_CODE (field) != FIELD_DECL)
12703 continue;
12705 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12706 if (sub_count < 0)
12707 return -1;
12708 count += sub_count;
12711 /* There must be no padding. */
12712 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12713 count * GET_MODE_BITSIZE (*modep)))
12714 return -1;
12716 return count;
12719 case UNION_TYPE:
12720 case QUAL_UNION_TYPE:
12722 /* These aren't very interesting except in a degenerate case. */
12723 int count = 0;
12724 int sub_count;
12725 tree field;
12727 /* Can't handle incomplete types nor sizes that are not
12728 fixed. */
12729 if (!COMPLETE_TYPE_P (type)
12730 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12731 return -1;
12733 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12735 if (TREE_CODE (field) != FIELD_DECL)
12736 continue;
12738 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12739 if (sub_count < 0)
12740 return -1;
12741 count = count > sub_count ? count : sub_count;
12744 /* There must be no padding. */
12745 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12746 count * GET_MODE_BITSIZE (*modep)))
12747 return -1;
12749 return count;
12752 default:
12753 break;
12756 return -1;
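/* Illustrative examples (not part of the build) of what the walk above
   accepts; the expected outcomes follow from the rules coded above.  */
struct sketch_hfa2 { double a, b; };	    /* Count 2, *MODEP == DFmode.  */
struct sketch_hfa4 { float v[4]; };	    /* Count 4, *MODEP == SFmode.  */
struct sketch_mixed { float a; double b; }; /* Rejected (-1): modes differ.  */
struct sketch_big { float v[5]; };	    /* Count 5; rejected later by the
					       HA_MAX_NUM_FLDS check in
					       aarch64_vfp_is_call_or_return_candidate.  */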
12759 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12760 type as described in AAPCS64 \S 4.1.2.
12762 See the comment above aarch64_composite_type_p for the notes on MODE. */
12764 static bool
12765 aarch64_short_vector_p (const_tree type,
12766 machine_mode mode)
12768 poly_int64 size = -1;
12770 if (type && TREE_CODE (type) == VECTOR_TYPE)
12771 size = int_size_in_bytes (type);
12772 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12773 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12774 size = GET_MODE_SIZE (mode);
12776 return known_eq (size, 8) || known_eq (size, 16);
12779 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12780 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12781 array types. The C99 floating-point complex types are also considered
12782 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12783 types, which are GCC extensions and out of the scope of AAPCS64, are
12784 treated as composite types here as well.
12786 Note that MODE itself is not sufficient in determining whether a type
12787 is such a composite type or not. This is because
12788 stor-layout.c:compute_record_mode may have already changed the MODE
12789 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12790 structure with only one field may have its MODE set to the mode of the
12791 field. Also an integer mode whose size matches the size of the
12792 RECORD_TYPE type may be used to substitute the original mode
12793 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12794 solely relied on. */
12796 static bool
12797 aarch64_composite_type_p (const_tree type,
12798 machine_mode mode)
12800 if (aarch64_short_vector_p (type, mode))
12801 return false;
12803 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12804 return true;
12806 if (mode == BLKmode
12807 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12808 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12809 return true;
12811 return false;
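/* Illustrative example (not part of the build) of why TYPE is consulted
   and MODE alone is not trusted: a single-field record such as
     struct wrapped_float { float f; };
   may be given SFmode by stor-layout.c, yet it is still a composite type
   under AAPCS64 and must be treated as one here.  */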
12814 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12815 shall be passed or returned in simd/fp register(s) (providing these
12816 parameter passing registers are available).
12818 Upon successful return, *COUNT returns the number of needed registers,
12819 *BASE_MODE returns the mode of the individual register and when IS_HA
12820 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12821 floating-point aggregate or a homogeneous short-vector aggregate. */
12823 static bool
12824 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12825 const_tree type,
12826 machine_mode *base_mode,
12827 int *count,
12828 bool *is_ha)
12830 machine_mode new_mode = VOIDmode;
12831 bool composite_p = aarch64_composite_type_p (type, mode);
12833 if (is_ha != NULL) *is_ha = false;
12835 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12836 || aarch64_short_vector_p (type, mode))
12838 *count = 1;
12839 new_mode = mode;
12841 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12843 if (is_ha != NULL) *is_ha = true;
12844 *count = 2;
12845 new_mode = GET_MODE_INNER (mode);
12847 else if (type && composite_p)
12849 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12851 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12853 if (is_ha != NULL) *is_ha = true;
12854 *count = ag_count;
12856 else
12857 return false;
12859 else
12860 return false;
12862 *base_mode = new_mode;
12863 return true;
12866 /* Implement TARGET_STRUCT_VALUE_RTX. */
12868 static rtx
12869 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12870 int incoming ATTRIBUTE_UNUSED)
12872 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12875 /* Implements target hook vector_mode_supported_p. */
12876 static bool
12877 aarch64_vector_mode_supported_p (machine_mode mode)
12879 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12880 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12883 /* Return appropriate SIMD container
12884 for MODE within a vector of WIDTH bits. */
12885 static machine_mode
12886 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12888 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12889 switch (mode)
12891 case E_DFmode:
12892 return VNx2DFmode;
12893 case E_SFmode:
12894 return VNx4SFmode;
12895 case E_HFmode:
12896 return VNx8HFmode;
12897 case E_DImode:
12898 return VNx2DImode;
12899 case E_SImode:
12900 return VNx4SImode;
12901 case E_HImode:
12902 return VNx8HImode;
12903 case E_QImode:
12904 return VNx16QImode;
12905 default:
12906 return word_mode;
12909 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12910 if (TARGET_SIMD)
12912 if (known_eq (width, 128))
12913 switch (mode)
12915 case E_DFmode:
12916 return V2DFmode;
12917 case E_SFmode:
12918 return V4SFmode;
12919 case E_HFmode:
12920 return V8HFmode;
12921 case E_SImode:
12922 return V4SImode;
12923 case E_HImode:
12924 return V8HImode;
12925 case E_QImode:
12926 return V16QImode;
12927 case E_DImode:
12928 return V2DImode;
12929 default:
12930 break;
12932 else
12933 switch (mode)
12935 case E_SFmode:
12936 return V2SFmode;
12937 case E_HFmode:
12938 return V4HFmode;
12939 case E_SImode:
12940 return V2SImode;
12941 case E_HImode:
12942 return V4HImode;
12943 case E_QImode:
12944 return V8QImode;
12945 default:
12946 break;
12949 return word_mode;
12952 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12953 static machine_mode
12954 aarch64_preferred_simd_mode (scalar_mode mode)
12956 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12957 return aarch64_simd_container_mode (mode, bits);
12960 /* Return a list of possible vector sizes for the vectorizer
12961 to iterate over. */
12962 static void
12963 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12965 if (TARGET_SVE)
12966 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12967 sizes->safe_push (16);
12968 sizes->safe_push (8);
12971 /* Implement TARGET_MANGLE_TYPE. */
12973 static const char *
12974 aarch64_mangle_type (const_tree type)
12976 /* The AArch64 ABI documents say that "__va_list" has to be
12977 mangled as if it is in the "std" namespace. */
12978 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12979 return "St9__va_list";
12981 /* Half-precision float. */
12982 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12983 return "Dh";
12985 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12986 builtin types. */
12987 if (TYPE_NAME (type) != NULL)
12988 return aarch64_mangle_builtin_type (type);
12990 /* Use the default mangling. */
12991 return NULL;
12994 /* Find the first rtx_insn before insn that will generate an assembly
12995 instruction. */
12997 static rtx_insn *
12998 aarch64_prev_real_insn (rtx_insn *insn)
13000 if (!insn)
13001 return NULL;
13005 do insn = prev_real_insn (insn);
13007 while (insn && recog_memoized (insn) < 0);
13009 return insn;
13012 static bool
13013 is_madd_op (enum attr_type t1)
13015 unsigned int i;
13016 /* A number of these may be AArch32 only. */
13017 enum attr_type mlatypes[] = {
13018 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
13019 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
13020 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
13023 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13025 if (t1 == mlatypes[i])
13026 return true;
13029 return false;
13032 /* Check if there is a register dependency between a load and the insn
13033 for which we hold recog_data. */
13035 static bool
13036 dep_between_memop_and_curr (rtx memop)
13038 rtx load_reg;
13039 int opno;
13041 gcc_assert (GET_CODE (memop) == SET);
13043 if (!REG_P (SET_DEST (memop)))
13044 return false;
13046 load_reg = SET_DEST (memop);
13047 for (opno = 1; opno < recog_data.n_operands; opno++)
13049 rtx operand = recog_data.operand[opno];
13050 if (REG_P (operand)
13051 && reg_overlap_mentioned_p (load_reg, operand))
13052 return true;
13055 return false;
13059 /* When working around the Cortex-A53 erratum 835769,
13060 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13061 instruction and has a preceding memory instruction such that a NOP
13062 should be inserted between them. */
13064 bool
13065 aarch64_madd_needs_nop (rtx_insn* insn)
13067 enum attr_type attr_type;
13068 rtx_insn *prev;
13069 rtx body;
13071 if (!TARGET_FIX_ERR_A53_835769)
13072 return false;
13074 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13075 return false;
13077 attr_type = get_attr_type (insn);
13078 if (!is_madd_op (attr_type))
13079 return false;
13081 prev = aarch64_prev_real_insn (insn);
13082 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13083 Restore recog state to INSN to avoid state corruption. */
13084 extract_constrain_insn_cached (insn);
13086 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13087 return false;
13089 body = single_set (prev);
13091 /* If the previous insn is a memory op and there is no dependency between
13092 it and the DImode madd, emit a NOP between them. If body is NULL then we
13093 have a complex memory operation, probably a load/store pair.
13094 Be conservative for now and emit a NOP. */
13095 if (GET_MODE (recog_data.operand[0]) == DImode
13096 && (!body || !dep_between_memop_and_curr (body)))
13097 return true;
13099 return false;
13104 /* Implement FINAL_PRESCAN_INSN. */
13106 void
13107 aarch64_final_prescan_insn (rtx_insn *insn)
13109 if (aarch64_madd_needs_nop (insn))
13110 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
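/* For illustration (not part of the build), the workaround turns a
   sequence such as
     ldr  x1, [x2]
     madd x3, x4, x5, x6
   into
     ldr  x1, [x2]
     nop // between mem op and mult-accumulate
     madd x3, x4, x5, x6
   so that the 64-bit multiply-accumulate no longer immediately follows
   the memory operation.  */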
13114 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13115 instruction. */
13117 bool
13118 aarch64_sve_index_immediate_p (rtx base_or_step)
13120 return (CONST_INT_P (base_or_step)
13121 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13124 /* Return true if X is a valid immediate for the SVE ADD and SUB
13125 instructions. Negate X first if NEGATE_P is true. */
13127 bool
13128 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13130 rtx elt;
13132 if (!const_vec_duplicate_p (x, &elt)
13133 || !CONST_INT_P (elt))
13134 return false;
13136 HOST_WIDE_INT val = INTVAL (elt);
13137 if (negate_p)
13138 val = -val;
13139 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13141 if (val & 0xff)
13142 return IN_RANGE (val, 0, 0xff);
13143 return IN_RANGE (val, 0, 0xff00);
13146 /* Return true if X is a valid immediate operand for an SVE logical
13147 instruction such as AND. */
13149 bool
13150 aarch64_sve_bitmask_immediate_p (rtx x)
13152 rtx elt;
13154 return (const_vec_duplicate_p (x, &elt)
13155 && CONST_INT_P (elt)
13156 && aarch64_bitmask_imm (INTVAL (elt),
13157 GET_MODE_INNER (GET_MODE (x))));
13160 /* Return true if X is a valid immediate for the SVE DUP and CPY
13161 instructions. */
13163 bool
13164 aarch64_sve_dup_immediate_p (rtx x)
13166 rtx elt;
13168 if (!const_vec_duplicate_p (x, &elt)
13169 || !CONST_INT_P (elt))
13170 return false;
13172 HOST_WIDE_INT val = INTVAL (elt);
13173 if (val & 0xff)
13174 return IN_RANGE (val, -0x80, 0x7f);
13175 return IN_RANGE (val, -0x8000, 0x7f00);
13178 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13179 SIGNED_P says whether the operand is signed rather than unsigned. */
13181 bool
13182 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13184 rtx elt;
13186 return (const_vec_duplicate_p (x, &elt)
13187 && CONST_INT_P (elt)
13188 && (signed_p
13189 ? IN_RANGE (INTVAL (elt), -16, 15)
13190 : IN_RANGE (INTVAL (elt), 0, 127)));
13193 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13194 instruction. Negate X first if NEGATE_P is true. */
13196 bool
13197 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13199 rtx elt;
13200 REAL_VALUE_TYPE r;
13202 if (!const_vec_duplicate_p (x, &elt)
13203 || GET_CODE (elt) != CONST_DOUBLE)
13204 return false;
13206 r = *CONST_DOUBLE_REAL_VALUE (elt);
13208 if (negate_p)
13209 r = real_value_negate (&r);
13211 if (real_equal (&r, &dconst1))
13212 return true;
13213 if (real_equal (&r, &dconsthalf))
13214 return true;
13215 return false;
13218 /* Return true if X is a valid immediate operand for an SVE FMUL
13219 instruction. */
13221 bool
13222 aarch64_sve_float_mul_immediate_p (rtx x)
13224 rtx elt;
13226 /* GCC will never generate a multiply with an immediate of 2, so there is no
13227 point testing for it (even though it is a valid constant). */
13228 return (const_vec_duplicate_p (x, &elt)
13229 && GET_CODE (elt) == CONST_DOUBLE
13230 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13233 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13234 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13235 is nonnull, use it to describe valid immediates. */
13236 static bool
13237 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13238 simd_immediate_info *info,
13239 enum simd_immediate_check which,
13240 simd_immediate_info::insn_type insn)
13242 /* Try a 4-byte immediate with LSL. */
13243 for (unsigned int shift = 0; shift < 32; shift += 8)
13244 if ((val32 & (0xff << shift)) == val32)
13246 if (info)
13247 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13248 simd_immediate_info::LSL, shift);
13249 return true;
13252 /* Try a 2-byte immediate with LSL. */
13253 unsigned int imm16 = val32 & 0xffff;
13254 if (imm16 == (val32 >> 16))
13255 for (unsigned int shift = 0; shift < 16; shift += 8)
13256 if ((imm16 & (0xff << shift)) == imm16)
13258 if (info)
13259 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13260 simd_immediate_info::LSL, shift);
13261 return true;
13264 /* Try a 4-byte immediate with MSL, except for cases that MVN
13265 can handle. */
13266 if (which == AARCH64_CHECK_MOV)
13267 for (unsigned int shift = 8; shift < 24; shift += 8)
13269 unsigned int low = (1 << shift) - 1;
13270 if (((val32 & (0xff << shift)) | low) == val32)
13272 if (info)
13273 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13274 simd_immediate_info::MSL, shift);
13275 return true;
13279 return false;
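/* Worked examples (not part of the build) of the checks above, for a
   replicated 32-bit value:
     0x00ab0000 -> accepted by the 4-byte test: SImode, 0xab, LSL #16
     0x004a004a -> accepted by the 2-byte test: HImode, 0x4a, LSL #0
     0x0012ffff -> accepted by the MSL test (MOV checks only):
		   SImode, 0x12, MSL #16
     0x00123456 -> rejected here: more than one significant byte.  */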
13282 /* Return true if replicating VAL64 is a valid immediate for the
13283 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13284 use it to describe valid immediates. */
13285 static bool
13286 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13287 simd_immediate_info *info,
13288 enum simd_immediate_check which)
13290 unsigned int val32 = val64 & 0xffffffff;
13291 unsigned int val16 = val64 & 0xffff;
13292 unsigned int val8 = val64 & 0xff;
13294 if (val32 == (val64 >> 32))
13296 if ((which & AARCH64_CHECK_ORR) != 0
13297 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13298 simd_immediate_info::MOV))
13299 return true;
13301 if ((which & AARCH64_CHECK_BIC) != 0
13302 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13303 simd_immediate_info::MVN))
13304 return true;
13306 /* Try using a replicated byte. */
13307 if (which == AARCH64_CHECK_MOV
13308 && val16 == (val32 >> 16)
13309 && val8 == (val16 >> 8))
13311 if (info)
13312 *info = simd_immediate_info (QImode, val8);
13313 return true;
13317 /* Try using a bit-to-bytemask. */
13318 if (which == AARCH64_CHECK_MOV)
13320 unsigned int i;
13321 for (i = 0; i < 64; i += 8)
13323 unsigned char byte = (val64 >> i) & 0xff;
13324 if (byte != 0 && byte != 0xff)
13325 break;
13327 if (i == 64)
13329 if (info)
13330 *info = simd_immediate_info (DImode, val64);
13331 return true;
13334 return false;
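/* Standalone restatement (not part of the build) of the final
   "bit-to-bytemask" test above: every byte of the replicated 64-bit
   value must be 0x00 or 0xff.  The helper name is hypothetical.  */
static bool
sketch_bytemask_immediate_p (unsigned HOST_WIDE_INT val64)
{
  for (unsigned int i = 0; i < 64; i += 8)
    {
      unsigned char byte = (val64 >> i) & 0xff;
      if (byte != 0 && byte != 0xff)
	return false;
    }
  /* E.g. 0x00ff00ffff0000ff is accepted, 0x0102030405060708 is not.  */
  return true;
}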
13337 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13338 instruction. If INFO is nonnull, use it to describe valid immediates. */
13340 static bool
13341 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13342 simd_immediate_info *info)
13344 scalar_int_mode mode = DImode;
13345 unsigned int val32 = val64 & 0xffffffff;
13346 if (val32 == (val64 >> 32))
13348 mode = SImode;
13349 unsigned int val16 = val32 & 0xffff;
13350 if (val16 == (val32 >> 16))
13352 mode = HImode;
13353 unsigned int val8 = val16 & 0xff;
13354 if (val8 == (val16 >> 8))
13355 mode = QImode;
13358 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13359 if (IN_RANGE (val, -0x80, 0x7f))
13361 /* DUP with no shift. */
13362 if (info)
13363 *info = simd_immediate_info (mode, val);
13364 return true;
13366 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13368 /* DUP with LSL #8. */
13369 if (info)
13370 *info = simd_immediate_info (mode, val);
13371 return true;
13373 if (aarch64_bitmask_imm (val64, mode))
13375 /* DUPM. */
13376 if (info)
13377 *info = simd_immediate_info (mode, val);
13378 return true;
13380 return false;
13383 /* Return true if OP is a valid SIMD immediate for the operation
13384 described by WHICH. If INFO is nonnull, use it to describe valid
13385 immediates. */
13386 bool
13387 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13388 enum simd_immediate_check which)
13390 machine_mode mode = GET_MODE (op);
13391 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13392 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13393 return false;
13395 scalar_mode elt_mode = GET_MODE_INNER (mode);
13396 rtx base, step;
13397 unsigned int n_elts;
13398 if (GET_CODE (op) == CONST_VECTOR
13399 && CONST_VECTOR_DUPLICATE_P (op))
13400 n_elts = CONST_VECTOR_NPATTERNS (op);
13401 else if ((vec_flags & VEC_SVE_DATA)
13402 && const_vec_series_p (op, &base, &step))
13404 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13405 if (!aarch64_sve_index_immediate_p (base)
13406 || !aarch64_sve_index_immediate_p (step))
13407 return false;
13409 if (info)
13410 *info = simd_immediate_info (elt_mode, base, step);
13411 return true;
13413 else if (GET_CODE (op) == CONST_VECTOR
13414 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13415 /* N_ELTS set above. */;
13416 else
13417 return false;
13419 /* Handle PFALSE and PTRUE. */
13420 if (vec_flags & VEC_SVE_PRED)
13421 return (op == CONST0_RTX (mode)
13422 || op == CONSTM1_RTX (mode));
13424 scalar_float_mode elt_float_mode;
13425 if (n_elts == 1
13426 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13428 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13429 if (aarch64_float_const_zero_rtx_p (elt)
13430 || aarch64_float_const_representable_p (elt))
13432 if (info)
13433 *info = simd_immediate_info (elt_float_mode, elt);
13434 return true;
13438 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13439 if (elt_size > 8)
13440 return false;
13442 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13444 /* Expand the vector constant out into a byte vector, with the least
13445 significant byte of the register first. */
13446 auto_vec<unsigned char, 16> bytes;
13447 bytes.reserve (n_elts * elt_size);
13448 for (unsigned int i = 0; i < n_elts; i++)
13450 /* The vector is provided in GCC's endian-neutral fashion.
13451 For aarch64_be Advanced SIMD, it must be laid out in the vector
13452 register in reverse order. */
13453 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13454 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13456 if (elt_mode != elt_int_mode)
13457 elt = gen_lowpart (elt_int_mode, elt);
13459 if (!CONST_INT_P (elt))
13460 return false;
13462 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13463 for (unsigned int byte = 0; byte < elt_size; byte++)
13465 bytes.quick_push (elt_val & 0xff);
13466 elt_val >>= BITS_PER_UNIT;
13470 /* The immediate must repeat every eight bytes. */
13471 unsigned int nbytes = bytes.length ();
13472 for (unsigned i = 8; i < nbytes; ++i)
13473 if (bytes[i] != bytes[i - 8])
13474 return false;
13476 /* Get the repeating 8-byte value as an integer. No endian correction
13477 is needed here because bytes is already in lsb-first order. */
13478 unsigned HOST_WIDE_INT val64 = 0;
13479 for (unsigned int i = 0; i < 8; i++)
13480 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13481 << (i * BITS_PER_UNIT));
13483 if (vec_flags & VEC_SVE_DATA)
13484 return aarch64_sve_valid_immediate (val64, info);
13485 else
13486 return aarch64_advsimd_valid_immediate (val64, info, which);
13489 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13490 has a step in the range of INDEX. Return the index expression if so,
13491 otherwise return null. */
13493 aarch64_check_zero_based_sve_index_immediate (rtx x)
13495 rtx base, step;
13496 if (const_vec_series_p (x, &base, &step)
13497 && base == const0_rtx
13498 && aarch64_sve_index_immediate_p (step))
13499 return step;
13500 return NULL_RTX;
13503 /* Check if immediate shift constants are within range. */
13504 bool
13505 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13507 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13508 if (left)
13509 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13510 else
13511 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13514 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13515 operation of width WIDTH at bit position POS. */
13518 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13520 gcc_assert (CONST_INT_P (width));
13521 gcc_assert (CONST_INT_P (pos));
13523 unsigned HOST_WIDE_INT mask
13524 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13525 return GEN_INT (mask << UINTVAL (pos));
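/* For example (illustrative only): WIDTH == 8 and POS == 16 give
   mask == 0xff, so the returned CONST_INT is 0xff0000, i.e. the bits
   covered by a zero_extract of 8 bits starting at bit 16.  */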
13528 bool
13529 aarch64_mov_operand_p (rtx x, machine_mode mode)
13531 if (GET_CODE (x) == HIGH
13532 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13533 return true;
13535 if (CONST_INT_P (x))
13536 return true;
13538 if (VECTOR_MODE_P (GET_MODE (x)))
13539 return aarch64_simd_valid_immediate (x, NULL);
13541 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13542 return true;
13544 if (aarch64_sve_cnt_immediate_p (x))
13545 return true;
13547 return aarch64_classify_symbolic_expression (x)
13548 == SYMBOL_TINY_ABSOLUTE;
13551 /* Return a const_int vector of VAL. */
13553 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13555 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13556 return gen_const_vec_duplicate (mode, c);
13559 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13561 bool
13562 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13564 machine_mode vmode;
13566 vmode = aarch64_simd_container_mode (mode, 64);
13567 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13568 return aarch64_simd_valid_immediate (op_v, NULL);
13571 /* Construct and return a PARALLEL RTX vector with elements numbering the
13572 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13573 the vector - from the perspective of the architecture. This does not
13574 line up with GCC's perspective on lane numbers, so we end up with
13575 different masks depending on our target endian-ness. The diagram
13576 below may help. We must draw the distinction when building masks
13577 which select one half of the vector. An instruction selecting
13578 architectural low-lanes for a big-endian target must be described using
13579 a mask selecting GCC high-lanes.
13581 Big-Endian Little-Endian
13583 GCC 0 1 2 3 3 2 1 0
13584 | x | x | x | x | | x | x | x | x |
13585 Architecture 3 2 1 0 3 2 1 0
13587 Low Mask: { 2, 3 } { 0, 1 }
13588 High Mask: { 0, 1 } { 2, 3 }
13590 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13593 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13595 rtvec v = rtvec_alloc (nunits / 2);
13596 int high_base = nunits / 2;
13597 int low_base = 0;
13598 int base;
13599 rtx t1;
13600 int i;
13602 if (BYTES_BIG_ENDIAN)
13603 base = high ? low_base : high_base;
13604 else
13605 base = high ? high_base : low_base;
13607 for (i = 0; i < nunits / 2; i++)
13608 RTVEC_ELT (v, i) = GEN_INT (base + i);
13610 t1 = gen_rtx_PARALLEL (mode, v);
13611 return t1;
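/* Illustrative example (not part of the original source): for V4SImode
   with NUNITS == 4 and HIGH == true, a little-endian target gets
   (parallel [(const_int 2) (const_int 3)]) while a big-endian target gets
   (parallel [(const_int 0) (const_int 1)]), matching the diagram above.  */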
13614 /* Check OP for validity as a PARALLEL RTX vector with elements
13615 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13616 from the perspective of the architecture. See the diagram above
13617 aarch64_simd_vect_par_cnst_half for more details. */
13619 bool
13620 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13621 bool high)
13623 int nelts;
13624 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13625 return false;
13627 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13628 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13629 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13630 int i = 0;
13632 if (count_op != count_ideal)
13633 return false;
13635 for (i = 0; i < count_ideal; i++)
13637 rtx elt_op = XVECEXP (op, 0, i);
13638 rtx elt_ideal = XVECEXP (ideal, 0, i);
13640 if (!CONST_INT_P (elt_op)
13641 || INTVAL (elt_ideal) != INTVAL (elt_op))
13642 return false;
13644 return true;
13647 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13648 HIGH (exclusive). */
13649 void
13650 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13651 const_tree exp)
13653 HOST_WIDE_INT lane;
13654 gcc_assert (CONST_INT_P (operand));
13655 lane = INTVAL (operand);
13657 if (lane < low || lane >= high)
13659 if (exp)
13660 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13661 else
13662 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13666 /* Perform endian correction on lane number N, which indexes a vector
13667 of mode MODE, and return the result as an SImode rtx. */
13670 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13672 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13675 /* Return TRUE if OP is a valid vector addressing mode. */
13677 bool
13678 aarch64_simd_mem_operand_p (rtx op)
13680 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13681 || REG_P (XEXP (op, 0)));
13684 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13686 bool
13687 aarch64_sve_ld1r_operand_p (rtx op)
13689 struct aarch64_address_info addr;
13690 scalar_mode mode;
13692 return (MEM_P (op)
13693 && is_a <scalar_mode> (GET_MODE (op), &mode)
13694 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13695 && addr.type == ADDRESS_REG_IMM
13696 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
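/* Rough example (a sketch, not from the original source): for SImode the
   accepted addresses have the form [Xn, #imm] with #imm a multiple of 4 in
   the range [0, 252], i.e. a 6-bit unsigned offset scaled by the element
   size, matching the immediate form of LD1RW.  */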
13699 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13700 The conditions for STR are the same. */
13701 bool
13702 aarch64_sve_ldr_operand_p (rtx op)
13704 struct aarch64_address_info addr;
13706 return (MEM_P (op)
13707 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13708 false, ADDR_QUERY_ANY)
13709 && addr.type == ADDRESS_REG_IMM);
13712 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13713 We need to be able to access the individual pieces, so the range
13714 is different from LD[234] and ST[234]. */
13715 bool
13716 aarch64_sve_struct_memory_operand_p (rtx op)
13718 if (!MEM_P (op))
13719 return false;
13721 machine_mode mode = GET_MODE (op);
13722 struct aarch64_address_info addr;
13723 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13724 ADDR_QUERY_ANY)
13725 || addr.type != ADDRESS_REG_IMM)
13726 return false;
13728 poly_int64 first = addr.const_offset;
13729 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13730 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13731 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13734 /* Emit a register copy from operand to operand, taking care not to
13735 early-clobber source registers in the process.
13737 COUNT is the number of components into which the copy needs to be
13738 decomposed. */
13739 void
13740 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13741 unsigned int count)
13743 unsigned int i;
13744 int rdest = REGNO (operands[0]);
13745 int rsrc = REGNO (operands[1]);
13747 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13748 || rdest < rsrc)
13749 for (i = 0; i < count; i++)
13750 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13751 gen_rtx_REG (mode, rsrc + i));
13752 else
13753 for (i = 0; i < count; i++)
13754 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13755 gen_rtx_REG (mode, rsrc + count - i - 1));
13758 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13759 one of the VSTRUCT modes: OI, CI, or XI. */
13761 aarch64_simd_attr_length_rglist (machine_mode mode)
13763 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13764 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13767 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13768 alignment of a vector to 128 bits. SVE predicates have an alignment of
13769 16 bits. */
13770 static HOST_WIDE_INT
13771 aarch64_simd_vector_alignment (const_tree type)
13773 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13774 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13775 be set for non-predicate vectors of booleans. Modes are the most
13776 direct way we have of identifying real SVE predicate types. */
13777 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13778 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13779 return MIN (align, 128);
13782 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13783 static HOST_WIDE_INT
13784 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13786 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13788 /* If the length of the vector is fixed, try to align to that length,
13789 otherwise don't try to align at all. */
13790 HOST_WIDE_INT result;
13791 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13792 result = TYPE_ALIGN (TREE_TYPE (type));
13793 return result;
13795 return TYPE_ALIGN (type);
13798 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13799 static bool
13800 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13802 if (is_packed)
13803 return false;
13805 /* For fixed-length vectors, check that the vectorizer will aim for
13806 full-vector alignment. This isn't true for generic GCC vectors
13807 that are wider than the ABI maximum of 128 bits. */
13808 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13809 && (wi::to_widest (TYPE_SIZE (type))
13810 != aarch64_vectorize_preferred_vector_alignment (type)))
13811 return false;
13813 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13814 return true;
13817 /* Return true if the vector misalignment factor is supported by the
13818 target. */
13819 static bool
13820 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13821 const_tree type, int misalignment,
13822 bool is_packed)
13824 if (TARGET_SIMD && STRICT_ALIGNMENT)
13826 /* Return if movmisalign pattern is not supported for this mode. */
13827 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13828 return false;
13830 /* Misalignment factor is unknown at compile time. */
13831 if (misalignment == -1)
13832 return false;
13834 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13835 is_packed);
13838 /* If VALS is a vector constant that can be loaded into a register
13839 using DUP, generate instructions to do so and return an RTX to
13840 assign to the register. Otherwise return NULL_RTX. */
13841 static rtx
13842 aarch64_simd_dup_constant (rtx vals)
13844 machine_mode mode = GET_MODE (vals);
13845 machine_mode inner_mode = GET_MODE_INNER (mode);
13846 rtx x;
13848 if (!const_vec_duplicate_p (vals, &x))
13849 return NULL_RTX;
13851 /* We can load this constant by using DUP and a constant in a
13852 single scalar register. This will be cheaper than a vector
13853 load. */
13854 x = copy_to_mode_reg (inner_mode, x);
13855 return gen_vec_duplicate (mode, x);
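/* For instance (an informal sketch, register names hypothetical): a
   V4SImode constant whose elements are all 42 would be emitted here as
   "mov w0, 42" followed by "dup v0.4s, w0", which is normally cheaper
   than loading the full vector from the literal pool.  */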
13859 /* Generate code to load VALS, which is a PARALLEL containing only
13860 constants (for vec_init) or CONST_VECTOR, efficiently into a
13861 register. Returns an RTX to copy into the register, or NULL_RTX
13862 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13863 static rtx
13864 aarch64_simd_make_constant (rtx vals)
13866 machine_mode mode = GET_MODE (vals);
13867 rtx const_dup;
13868 rtx const_vec = NULL_RTX;
13869 int n_const = 0;
13870 int i;
13872 if (GET_CODE (vals) == CONST_VECTOR)
13873 const_vec = vals;
13874 else if (GET_CODE (vals) == PARALLEL)
13876 /* A CONST_VECTOR must contain only CONST_INTs and
13877 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13878 Only store valid constants in a CONST_VECTOR. */
13879 int n_elts = XVECLEN (vals, 0);
13880 for (i = 0; i < n_elts; ++i)
13882 rtx x = XVECEXP (vals, 0, i);
13883 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13884 n_const++;
13886 if (n_const == n_elts)
13887 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13889 else
13890 gcc_unreachable ();
13892 if (const_vec != NULL_RTX
13893 && aarch64_simd_valid_immediate (const_vec, NULL))
13894 /* Load using MOVI/MVNI. */
13895 return const_vec;
13896 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13897 /* Loaded using DUP. */
13898 return const_dup;
13899 else if (const_vec != NULL_RTX)
13900 /* Load from constant pool. We cannot take advantage of single-cycle
13901 LD1 because we need a PC-relative addressing mode. */
13902 return const_vec;
13903 else
13904 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13905 We cannot construct an initializer. */
13906 return NULL_RTX;
13909 /* Expand a vector initialisation sequence, such that TARGET is
13910 initialised to contain VALS. */
13912 void
13913 aarch64_expand_vector_init (rtx target, rtx vals)
13915 machine_mode mode = GET_MODE (target);
13916 scalar_mode inner_mode = GET_MODE_INNER (mode);
13917 /* The number of vector elements. */
13918 int n_elts = XVECLEN (vals, 0);
13919 /* The number of vector elements which are not constant. */
13920 int n_var = 0;
13921 rtx any_const = NULL_RTX;
13922 /* The first element of vals. */
13923 rtx v0 = XVECEXP (vals, 0, 0);
13924 bool all_same = true;
13926 /* Count the number of variable elements to initialise. */
13927 for (int i = 0; i < n_elts; ++i)
13929 rtx x = XVECEXP (vals, 0, i);
13930 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13931 ++n_var;
13932 else
13933 any_const = x;
13935 all_same &= rtx_equal_p (x, v0);
13938 /* No variable elements; hand off to aarch64_simd_make_constant, which knows
13939 how best to handle this. */
13940 if (n_var == 0)
13942 rtx constant = aarch64_simd_make_constant (vals);
13943 if (constant != NULL_RTX)
13945 emit_move_insn (target, constant);
13946 return;
13950 /* Splat a single non-constant element if we can. */
13951 if (all_same)
13953 rtx x = copy_to_mode_reg (inner_mode, v0);
13954 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13955 return;
13958 enum insn_code icode = optab_handler (vec_set_optab, mode);
13959 gcc_assert (icode != CODE_FOR_nothing);
13961 /* If there are only variable elements, try to optimize
13962 the insertion using dup for the most common element
13963 followed by insertions. */
13965 /* The algorithm will fill matches[*][0] with the earliest matching element,
13966 and matches[X][1] with the count of duplicate elements (if X is the
13967 earliest element which has duplicates). */
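/* For example (illustrative only): for variable elements { a, b, a, a }
   the loop below produces matches[0] == {0, 3}, matches[1] == {1, 1} and
   matches[2] == matches[3] == {0, 0}, so element 0 is chosen as the value
   to duplicate (maxelement == 0, maxv == 3).  */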
13969 if (n_var == n_elts && n_elts <= 16)
13971 int matches[16][2] = {0};
13972 for (int i = 0; i < n_elts; i++)
13974 for (int j = 0; j <= i; j++)
13976 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13978 matches[i][0] = j;
13979 matches[j][1]++;
13980 break;
13984 int maxelement = 0;
13985 int maxv = 0;
13986 for (int i = 0; i < n_elts; i++)
13987 if (matches[i][1] > maxv)
13989 maxelement = i;
13990 maxv = matches[i][1];
13993 /* Create a duplicate of the most common element, unless all elements
13994 are equally useless to us, in which case just immediately set the
13995 vector register using the first element. */
13997 if (maxv == 1)
13999 /* For vectors of two 64-bit elements, we can do even better. */
14000 if (n_elts == 2
14001 && (inner_mode == E_DImode
14002 || inner_mode == E_DFmode))
14005 rtx x0 = XVECEXP (vals, 0, 0);
14006 rtx x1 = XVECEXP (vals, 0, 1);
14007 /* Combine can pick up this case, but handling it directly
14008 here leaves clearer RTL.
14010 This is load_pair_lanes<mode>, and also gives us a clean-up
14011 for store_pair_lanes<mode>. */
14012 if (memory_operand (x0, inner_mode)
14013 && memory_operand (x1, inner_mode)
14014 && !STRICT_ALIGNMENT
14015 && rtx_equal_p (XEXP (x1, 0),
14016 plus_constant (Pmode,
14017 XEXP (x0, 0),
14018 GET_MODE_SIZE (inner_mode))))
14020 rtx t;
14021 if (inner_mode == DFmode)
14022 t = gen_load_pair_lanesdf (target, x0, x1);
14023 else
14024 t = gen_load_pair_lanesdi (target, x0, x1);
14025 emit_insn (t);
14026 return;
14029 /* The subreg-move sequence below will move into lane zero of the
14030 vector register. For big-endian we want that position to hold
14031 the last element of VALS. */
14032 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14033 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14034 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14036 else
14038 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14039 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14042 /* Insert the rest. */
14043 for (int i = 0; i < n_elts; i++)
14045 rtx x = XVECEXP (vals, 0, i);
14046 if (matches[i][0] == maxelement)
14047 continue;
14048 x = copy_to_mode_reg (inner_mode, x);
14049 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14051 return;
14054 /* Initialise a vector which is part-variable. We want to first try
14055 to build those lanes which are constant in the most efficient way we
14056 can. */
14057 if (n_var != n_elts)
14059 rtx copy = copy_rtx (vals);
14061 /* Load constant part of vector. We really don't care what goes into the
14062 parts we will overwrite, but we're more likely to be able to load the
14063 constant efficiently if it has fewer, larger, repeating parts
14064 (see aarch64_simd_valid_immediate). */
14065 for (int i = 0; i < n_elts; i++)
14067 rtx x = XVECEXP (vals, 0, i);
14068 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14069 continue;
14070 rtx subst = any_const;
14071 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14073 /* Look in the copied vector, as more elements are const. */
14074 rtx test = XVECEXP (copy, 0, i ^ bit);
14075 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14077 subst = test;
14078 break;
14081 XVECEXP (copy, 0, i) = subst;
14083 aarch64_expand_vector_init (target, copy);
14086 /* Insert the variable lanes directly. */
14087 for (int i = 0; i < n_elts; i++)
14089 rtx x = XVECEXP (vals, 0, i);
14090 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14091 continue;
14092 x = copy_to_mode_reg (inner_mode, x);
14093 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14097 static unsigned HOST_WIDE_INT
14098 aarch64_shift_truncation_mask (machine_mode mode)
14100 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14101 return 0;
14102 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14105 /* Select a format to encode pointers in exception handling data. */
14107 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14109 int type;
14110 switch (aarch64_cmodel)
14112 case AARCH64_CMODEL_TINY:
14113 case AARCH64_CMODEL_TINY_PIC:
14114 case AARCH64_CMODEL_SMALL:
14115 case AARCH64_CMODEL_SMALL_PIC:
14116 case AARCH64_CMODEL_SMALL_SPIC:
14117 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14118 for everything. */
14119 type = DW_EH_PE_sdata4;
14120 break;
14121 default:
14122 /* No assumptions here. 8-byte relocs required. */
14123 type = DW_EH_PE_sdata8;
14124 break;
14126 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14129 /* The last .arch and .tune assembly strings that we printed. */
14130 static std::string aarch64_last_printed_arch_string;
14131 static std::string aarch64_last_printed_tune_string;
14133 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14134 by the function fndecl. */
14136 void
14137 aarch64_declare_function_name (FILE *stream, const char* name,
14138 tree fndecl)
14140 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14142 struct cl_target_option *targ_options;
14143 if (target_parts)
14144 targ_options = TREE_TARGET_OPTION (target_parts);
14145 else
14146 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14147 gcc_assert (targ_options);
14149 const struct processor *this_arch
14150 = aarch64_get_arch (targ_options->x_explicit_arch);
14152 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14153 std::string extension
14154 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14155 this_arch->flags);
14156 /* Only update the assembler .arch string if it is distinct from the last
14157 such string we printed. */
14158 std::string to_print = this_arch->name + extension;
14159 if (to_print != aarch64_last_printed_arch_string)
14161 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14162 aarch64_last_printed_arch_string = to_print;
14165 /* Print the cpu name we're tuning for in the comments; it might be
14166 useful to readers of the generated asm. Do it only when it changes
14167 from function to function and verbose assembly is requested. */
14168 const struct processor *this_tune
14169 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14171 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14173 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14174 this_tune->name);
14175 aarch64_last_printed_tune_string = this_tune->name;
14178 /* Don't forget the type directive for ELF. */
14179 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14180 ASM_OUTPUT_LABEL (stream, name);
14183 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14185 static void
14186 aarch64_start_file (void)
14188 struct cl_target_option *default_options
14189 = TREE_TARGET_OPTION (target_option_default_node);
14191 const struct processor *default_arch
14192 = aarch64_get_arch (default_options->x_explicit_arch);
14193 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14194 std::string extension
14195 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14196 default_arch->flags);
14198 aarch64_last_printed_arch_string = default_arch->name + extension;
14199 aarch64_last_printed_tune_string = "";
14200 asm_fprintf (asm_out_file, "\t.arch %s\n",
14201 aarch64_last_printed_arch_string.c_str ());
14203 default_file_start ();
14206 /* Emit load exclusive. */
14208 static void
14209 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14210 rtx mem, rtx model_rtx)
14212 rtx (*gen) (rtx, rtx, rtx);
14214 switch (mode)
14216 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14217 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14218 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14219 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14220 default:
14221 gcc_unreachable ();
14224 emit_insn (gen (rval, mem, model_rtx));
14227 /* Emit store exclusive. */
14229 static void
14230 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14231 rtx rval, rtx mem, rtx model_rtx)
14233 rtx (*gen) (rtx, rtx, rtx, rtx);
14235 switch (mode)
14237 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14238 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14239 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14240 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14241 default:
14242 gcc_unreachable ();
14245 emit_insn (gen (bval, rval, mem, model_rtx));
14248 /* Emit INSN as a jump instruction and mark it as unlikely to be taken. */
14250 static void
14251 aarch64_emit_unlikely_jump (rtx insn)
14253 rtx_insn *jump = emit_jump_insn (insn);
14254 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14257 /* Expand a compare and swap pattern. */
14259 void
14260 aarch64_expand_compare_and_swap (rtx operands[])
14262 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14263 machine_mode mode, cmp_mode;
14264 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14265 int idx;
14266 gen_cas_fn gen;
14267 const gen_cas_fn split_cas[] =
14269 gen_aarch64_compare_and_swapqi,
14270 gen_aarch64_compare_and_swaphi,
14271 gen_aarch64_compare_and_swapsi,
14272 gen_aarch64_compare_and_swapdi
14274 const gen_cas_fn atomic_cas[] =
14276 gen_aarch64_compare_and_swapqi_lse,
14277 gen_aarch64_compare_and_swaphi_lse,
14278 gen_aarch64_compare_and_swapsi_lse,
14279 gen_aarch64_compare_and_swapdi_lse
14282 bval = operands[0];
14283 rval = operands[1];
14284 mem = operands[2];
14285 oldval = operands[3];
14286 newval = operands[4];
14287 is_weak = operands[5];
14288 mod_s = operands[6];
14289 mod_f = operands[7];
14290 mode = GET_MODE (mem);
14291 cmp_mode = mode;
14293 /* Normally the succ memory model must be stronger than fail, but in the
14294 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14295 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14297 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14298 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14299 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14301 switch (mode)
14303 case E_QImode:
14304 case E_HImode:
14305 /* For short modes, we're going to perform the comparison in SImode,
14306 so do the zero-extension now. */
14307 cmp_mode = SImode;
14308 rval = gen_reg_rtx (SImode);
14309 oldval = convert_modes (SImode, mode, oldval, true);
14310 /* Fall through. */
14312 case E_SImode:
14313 case E_DImode:
14314 /* Force the value into a register if needed. */
14315 if (!aarch64_plus_operand (oldval, mode))
14316 oldval = force_reg (cmp_mode, oldval);
14317 break;
14319 default:
14320 gcc_unreachable ();
14323 switch (mode)
14325 case E_QImode: idx = 0; break;
14326 case E_HImode: idx = 1; break;
14327 case E_SImode: idx = 2; break;
14328 case E_DImode: idx = 3; break;
14329 default:
14330 gcc_unreachable ();
14332 if (TARGET_LSE)
14333 gen = atomic_cas[idx];
14334 else
14335 gen = split_cas[idx];
14337 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14339 if (mode == QImode || mode == HImode)
14340 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14342 x = gen_rtx_REG (CCmode, CC_REGNUM);
14343 x = gen_rtx_EQ (SImode, x, const0_rtx);
14344 emit_insn (gen_rtx_SET (bval, x));
14347 /* Test whether the target supports using an atomic load-operate instruction
14348 for operation CODE. Returns FALSE if the operation isn't supported by the
14351 architecture. */
14353 bool
14354 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14356 if (!TARGET_LSE)
14357 return false;
14359 switch (code)
14361 case SET:
14362 case AND:
14363 case IOR:
14364 case XOR:
14365 case MINUS:
14366 case PLUS:
14367 return true;
14368 default:
14369 return false;
14373 /* Emit a barrier that is appropriate for memory model MODEL at the end of a
14374 sequence implementing an atomic operation. */
14376 static void
14377 aarch64_emit_post_barrier (enum memmodel model)
14379 const enum memmodel base_model = memmodel_base (model);
14381 if (is_mm_sync (model)
14382 && (base_model == MEMMODEL_ACQUIRE
14383 || base_model == MEMMODEL_ACQ_REL
14384 || base_model == MEMMODEL_SEQ_CST))
14386 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14390 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14391 for the data in memory. EXPECTED is the value expected to be in memory.
14392 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14393 is the memory ordering to use. */
14395 void
14396 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14397 rtx expected, rtx desired,
14398 rtx model)
14400 rtx (*gen) (rtx, rtx, rtx, rtx);
14401 machine_mode mode;
14403 mode = GET_MODE (mem);
14405 switch (mode)
14407 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14408 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14409 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14410 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14411 default:
14412 gcc_unreachable ();
14415 /* Move the expected value into the CAS destination register. */
14416 emit_insn (gen_rtx_SET (rval, expected));
14418 /* Emit the CAS. */
14419 emit_insn (gen (rval, mem, desired, model));
14421 /* Compare the expected value with the value loaded by the CAS, to establish
14422 whether the swap was made. */
14423 aarch64_gen_compare_reg (EQ, rval, expected);
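/* Roughly speaking (a sketch of the expected output, with purely
   illustrative register names), a sequentially consistent SImode CAS on an
   LSE target becomes:

       mov   w0, w1          // w1 holds EXPECTED
       casal w0, w2, [x3]    // w2 holds DESIRED, x3 the address
       cmp   w0, w1

   so that the EQ condition holds iff the swap was performed.  */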
14426 /* Split a compare and swap pattern. */
14428 void
14429 aarch64_split_compare_and_swap (rtx operands[])
14431 rtx rval, mem, oldval, newval, scratch;
14432 machine_mode mode;
14433 bool is_weak;
14434 rtx_code_label *label1, *label2;
14435 rtx x, cond;
14436 enum memmodel model;
14437 rtx model_rtx;
14439 rval = operands[0];
14440 mem = operands[1];
14441 oldval = operands[2];
14442 newval = operands[3];
14443 is_weak = (operands[4] != const0_rtx);
14444 model_rtx = operands[5];
14445 scratch = operands[7];
14446 mode = GET_MODE (mem);
14447 model = memmodel_from_int (INTVAL (model_rtx));
14449 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14450 loop:
14451 .label1:
14452 LD[A]XR rval, [mem]
14453 CBNZ rval, .label2
14454 ST[L]XR scratch, newval, [mem]
14455 CBNZ scratch, .label1
14456 .label2:
14457 CMP rval, 0. */
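/* In the general strong case (OLDVAL nonzero) the sequence is, roughly
   (an illustrative sketch; the exact comparison comes from
   aarch64_gen_compare_reg):
   .label1:
   LD[A]XR rval, [mem]
   CMP rval, oldval
   BNE .label2
   ST[L]XR scratch, newval, [mem]
   CBNZ scratch, .label1
   .label2:  */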
14458 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14460 label1 = NULL;
14461 if (!is_weak)
14463 label1 = gen_label_rtx ();
14464 emit_label (label1);
14466 label2 = gen_label_rtx ();
14468 /* The initial load can be relaxed for a __sync operation since a final
14469 barrier will be emitted to stop code hoisting. */
14470 if (is_mm_sync (model))
14471 aarch64_emit_load_exclusive (mode, rval, mem,
14472 GEN_INT (MEMMODEL_RELAXED));
14473 else
14474 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14476 if (strong_zero_p)
14478 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14479 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14480 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14481 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14483 else
14485 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14486 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14487 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14488 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14489 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14492 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14494 if (!is_weak)
14496 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14497 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14498 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14499 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14501 else
14503 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14504 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14505 emit_insn (gen_rtx_SET (cond, x));
14508 emit_label (label2);
14509 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14510 to set the condition flags. If this is not used it will be removed by
14511 later passes. */
14512 if (strong_zero_p)
14514 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14515 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14516 emit_insn (gen_rtx_SET (cond, x));
14518 /* Emit any final barrier needed for a __sync operation. */
14519 if (is_mm_sync (model))
14520 aarch64_emit_post_barrier (model);
14523 /* Emit a BIC instruction. */
14525 static void
14526 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14528 rtx shift_rtx = GEN_INT (shift);
14529 rtx (*gen) (rtx, rtx, rtx, rtx);
14531 switch (mode)
14533 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14534 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14535 default:
14536 gcc_unreachable ();
14539 emit_insn (gen (dst, s2, shift_rtx, s1));
14542 /* Emit an atomic swap. */
14544 static void
14545 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14546 rtx mem, rtx model)
14548 rtx (*gen) (rtx, rtx, rtx, rtx);
14550 switch (mode)
14552 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14553 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14554 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14555 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14556 default:
14557 gcc_unreachable ();
14560 emit_insn (gen (dst, mem, value, model));
14563 /* Operations supported by aarch64_emit_atomic_load_op. */
14565 enum aarch64_atomic_load_op_code
14567 AARCH64_LDOP_PLUS, /* A + B */
14568 AARCH64_LDOP_XOR, /* A ^ B */
14569 AARCH64_LDOP_OR, /* A | B */
14570 AARCH64_LDOP_BIC /* A & ~B */
14573 /* Emit an atomic load-operate. */
14575 static void
14576 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14577 machine_mode mode, rtx dst, rtx src,
14578 rtx mem, rtx model)
14580 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14581 const aarch64_atomic_load_op_fn plus[] =
14583 gen_aarch64_atomic_loadaddqi,
14584 gen_aarch64_atomic_loadaddhi,
14585 gen_aarch64_atomic_loadaddsi,
14586 gen_aarch64_atomic_loadadddi
14588 const aarch64_atomic_load_op_fn eor[] =
14590 gen_aarch64_atomic_loadeorqi,
14591 gen_aarch64_atomic_loadeorhi,
14592 gen_aarch64_atomic_loadeorsi,
14593 gen_aarch64_atomic_loadeordi
14595 const aarch64_atomic_load_op_fn ior[] =
14597 gen_aarch64_atomic_loadsetqi,
14598 gen_aarch64_atomic_loadsethi,
14599 gen_aarch64_atomic_loadsetsi,
14600 gen_aarch64_atomic_loadsetdi
14602 const aarch64_atomic_load_op_fn bic[] =
14604 gen_aarch64_atomic_loadclrqi,
14605 gen_aarch64_atomic_loadclrhi,
14606 gen_aarch64_atomic_loadclrsi,
14607 gen_aarch64_atomic_loadclrdi
14609 aarch64_atomic_load_op_fn gen;
14610 int idx = 0;
14612 switch (mode)
14614 case E_QImode: idx = 0; break;
14615 case E_HImode: idx = 1; break;
14616 case E_SImode: idx = 2; break;
14617 case E_DImode: idx = 3; break;
14618 default:
14619 gcc_unreachable ();
14622 switch (code)
14624 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14625 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14626 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14627 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14628 default:
14629 gcc_unreachable ();
14632 emit_insn (gen (dst, mem, src, model));
14635 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14636 location to store the data read from memory. OUT_RESULT is the location to
14637 store the result of the operation. MEM is the memory location to read and
14638 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14639 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14640 be NULL. */
14642 void
14643 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14644 rtx mem, rtx value, rtx model_rtx)
14646 machine_mode mode = GET_MODE (mem);
14647 machine_mode wmode = (mode == DImode ? DImode : SImode);
14648 const bool short_mode = (mode < SImode);
14649 aarch64_atomic_load_op_code ldop_code;
14650 rtx src;
14651 rtx x;
14653 if (out_data)
14654 out_data = gen_lowpart (mode, out_data);
14656 if (out_result)
14657 out_result = gen_lowpart (mode, out_result);
14659 /* Make sure the value is in a register, putting it into a destination
14660 register if it needs to be manipulated. */
14661 if (!register_operand (value, mode)
14662 || code == AND || code == MINUS)
14664 src = out_result ? out_result : out_data;
14665 emit_move_insn (src, gen_lowpart (mode, value));
14667 else
14668 src = value;
14669 gcc_assert (register_operand (src, mode));
14671 /* Preprocess the data for the operation as necessary. If the operation is
14672 a SET then emit a swap instruction and finish. */
14673 switch (code)
14675 case SET:
14676 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14677 return;
14679 case MINUS:
14680 /* Negate the value and treat it as a PLUS. */
14682 rtx neg_src;
14684 /* Resize the value if necessary. */
14685 if (short_mode)
14686 src = gen_lowpart (wmode, src);
14688 neg_src = gen_rtx_NEG (wmode, src);
14689 emit_insn (gen_rtx_SET (src, neg_src));
14691 if (short_mode)
14692 src = gen_lowpart (mode, src);
14694 /* Fall-through. */
14695 case PLUS:
14696 ldop_code = AARCH64_LDOP_PLUS;
14697 break;
14699 case IOR:
14700 ldop_code = AARCH64_LDOP_OR;
14701 break;
14703 case XOR:
14704 ldop_code = AARCH64_LDOP_XOR;
14705 break;
14707 case AND:
14709 rtx not_src;
14711 /* Resize the value if necessary. */
14712 if (short_mode)
14713 src = gen_lowpart (wmode, src);
14715 not_src = gen_rtx_NOT (wmode, src);
14716 emit_insn (gen_rtx_SET (src, not_src));
14718 if (short_mode)
14719 src = gen_lowpart (mode, src);
14721 ldop_code = AARCH64_LDOP_BIC;
14722 break;
14724 default:
14725 /* The operation can't be done with atomic instructions. */
14726 gcc_unreachable ();
14729 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14731 /* If necessary, calculate the data in memory after the update by redoing the
14732 operation from values in registers. */
14733 if (!out_result)
14734 return;
14736 if (short_mode)
14738 src = gen_lowpart (wmode, src);
14739 out_data = gen_lowpart (wmode, out_data);
14740 out_result = gen_lowpart (wmode, out_result);
14743 x = NULL_RTX;
14745 switch (code)
14747 case MINUS:
14748 case PLUS:
14749 x = gen_rtx_PLUS (wmode, out_data, src);
14750 break;
14751 case IOR:
14752 x = gen_rtx_IOR (wmode, out_data, src);
14753 break;
14754 case XOR:
14755 x = gen_rtx_XOR (wmode, out_data, src);
14756 break;
14757 case AND:
14758 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14759 return;
14760 default:
14761 gcc_unreachable ();
14764 emit_set_insn (out_result, x);
14766 return;
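/* Informal example: an LSE __atomic_fetch_and is handled above by
   inverting VALUE in a register and issuing LDCLR (which computes
   A & ~B); if the updated memory value is also required, it is
   recomputed afterwards as a BIC of the loaded data with the inverted
   value, i.e. old & value.  */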
14769 /* Split an atomic operation. */
14771 void
14772 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14773 rtx value, rtx model_rtx, rtx cond)
14775 machine_mode mode = GET_MODE (mem);
14776 machine_mode wmode = (mode == DImode ? DImode : SImode);
14777 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14778 const bool is_sync = is_mm_sync (model);
14779 rtx_code_label *label;
14780 rtx x;
14782 /* Split the atomic operation into a sequence. */
14783 label = gen_label_rtx ();
14784 emit_label (label);
14786 if (new_out)
14787 new_out = gen_lowpart (wmode, new_out);
14788 if (old_out)
14789 old_out = gen_lowpart (wmode, old_out);
14790 else
14791 old_out = new_out;
14792 value = simplify_gen_subreg (wmode, value, mode, 0);
14794 /* The initial load can be relaxed for a __sync operation since a final
14795 barrier will be emitted to stop code hoisting. */
14796 if (is_sync)
14797 aarch64_emit_load_exclusive (mode, old_out, mem,
14798 GEN_INT (MEMMODEL_RELAXED));
14799 else
14800 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14802 switch (code)
14804 case SET:
14805 new_out = value;
14806 break;
14808 case NOT:
14809 x = gen_rtx_AND (wmode, old_out, value);
14810 emit_insn (gen_rtx_SET (new_out, x));
14811 x = gen_rtx_NOT (wmode, new_out);
14812 emit_insn (gen_rtx_SET (new_out, x));
14813 break;
14815 case MINUS:
14816 if (CONST_INT_P (value))
14818 value = GEN_INT (-INTVAL (value));
14819 code = PLUS;
14821 /* Fall through. */
14823 default:
14824 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14825 emit_insn (gen_rtx_SET (new_out, x));
14826 break;
14829 aarch64_emit_store_exclusive (mode, cond, mem,
14830 gen_lowpart (mode, new_out), model_rtx);
14832 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14833 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14834 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14835 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14837 /* Emit any final barrier needed for a __sync operation. */
14838 if (is_sync)
14839 aarch64_emit_post_barrier (model);
14842 static void
14843 aarch64_init_libfuncs (void)
14845 /* Half-precision float operations. The compiler handles all operations
14846 with NULL libfuncs by converting to SFmode. */
14848 /* Conversions. */
14849 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14850 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14852 /* Arithmetic. */
14853 set_optab_libfunc (add_optab, HFmode, NULL);
14854 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14855 set_optab_libfunc (smul_optab, HFmode, NULL);
14856 set_optab_libfunc (neg_optab, HFmode, NULL);
14857 set_optab_libfunc (sub_optab, HFmode, NULL);
14859 /* Comparisons. */
14860 set_optab_libfunc (eq_optab, HFmode, NULL);
14861 set_optab_libfunc (ne_optab, HFmode, NULL);
14862 set_optab_libfunc (lt_optab, HFmode, NULL);
14863 set_optab_libfunc (le_optab, HFmode, NULL);
14864 set_optab_libfunc (ge_optab, HFmode, NULL);
14865 set_optab_libfunc (gt_optab, HFmode, NULL);
14866 set_optab_libfunc (unord_optab, HFmode, NULL);
14869 /* Target hook for c_mode_for_suffix. */
14870 static machine_mode
14871 aarch64_c_mode_for_suffix (char suffix)
14873 if (suffix == 'q')
14874 return TFmode;
14876 return VOIDmode;
14879 /* We can only represent floating point constants which will fit in
14880 "quarter-precision" values. These values are characterised by
14881 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by
14884 (-1)^s * (n/16) * 2^r
14886 Where:
14887 's' is the sign bit.
14888 'n' is an integer in the range 16 <= n <= 31.
14889 'r' is an integer in the range -3 <= r <= 4. */
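/* For example (informally): 1.0 == (16/16) * 2^0, 0.5 == (16/16) * 2^-1
   and 3.0 == (24/16) * 2^1 are all representable, whereas 0.1 has no such
   decomposition and must be loaded some other way.  */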
14891 /* Return true iff X can be represented by a quarter-precision
14892 floating point immediate operand. Note, we cannot represent 0.0. */
14893 bool
14894 aarch64_float_const_representable_p (rtx x)
14896 /* This represents our current view of how many bits
14897 make up the mantissa. */
14898 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14899 int exponent;
14900 unsigned HOST_WIDE_INT mantissa, mask;
14901 REAL_VALUE_TYPE r, m;
14902 bool fail;
14904 if (!CONST_DOUBLE_P (x))
14905 return false;
14907 /* We don't support HFmode constants yet. */
14908 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14909 return false;
14911 r = *CONST_DOUBLE_REAL_VALUE (x);
14913 /* We cannot represent infinities, NaNs or +/-zero. We won't
14914 know if we have +zero until we analyse the mantissa, but we
14915 can reject the other invalid values. */
14916 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14917 || REAL_VALUE_MINUS_ZERO (r))
14918 return false;
14920 /* Extract exponent. */
14921 r = real_value_abs (&r);
14922 exponent = REAL_EXP (&r);
14924 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14925 highest (sign) bit, with a fixed binary point at bit point_pos.
14926 m1 holds the low part of the mantissa, m2 the high part.
14927 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14928 bits for the mantissa, this can fail (low bits will be lost). */
14929 real_ldexp (&m, &r, point_pos - exponent);
14930 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14932 /* If the low part of the mantissa has bits set we cannot represent
14933 the value. */
14934 if (w.ulow () != 0)
14935 return false;
14936 /* We have rejected the lower HOST_WIDE_INT, so update our
14937 understanding of how many bits lie in the mantissa and
14938 look only at the high HOST_WIDE_INT. */
14939 mantissa = w.elt (1);
14940 point_pos -= HOST_BITS_PER_WIDE_INT;
14942 /* We can only represent values with a mantissa of the form 1.xxxx. */
14943 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14944 if ((mantissa & mask) != 0)
14945 return false;
14947 /* Having filtered unrepresentable values, we may now remove all
14948 but the highest 5 bits. */
14949 mantissa >>= point_pos - 5;
14951 /* We cannot represent the value 0.0, so reject it. This is handled
14952 elsewhere. */
14953 if (mantissa == 0)
14954 return false;
14956 /* Then, as bit 4 is always set, we can mask it off, leaving
14957 the mantissa in the range [0, 15]. */
14958 mantissa &= ~(1 << 4);
14959 gcc_assert (mantissa <= 15);
14961 /* GCC internally does not use IEEE754-like encoding (where normalized
14962 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14963 Our mantissa values are shifted 4 places to the left relative to
14964 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14965 by 5 places to correct for GCC's representation. */
14966 exponent = 5 - exponent;
14968 return (exponent >= 0 && exponent <= 7);
14971 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14972 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14973 output MOVI/MVNI, ORR or BIC immediate. */
14974 char*
14975 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14976 enum simd_immediate_check which)
14978 bool is_valid;
14979 static char templ[40];
14980 const char *mnemonic;
14981 const char *shift_op;
14982 unsigned int lane_count = 0;
14983 char element_char;
14985 struct simd_immediate_info info;
14987 /* This will return true to show const_vector is legal for use as either
14988 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14989 It will also update INFO to show how the immediate should be generated.
14990 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14991 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14992 gcc_assert (is_valid);
14994 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14995 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14997 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14999 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
15000 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
15001 move immediate path. */
15002 if (aarch64_float_const_zero_rtx_p (info.value))
15003 info.value = GEN_INT (0);
15004 else
15006 const unsigned int buf_size = 20;
15007 char float_buf[buf_size] = {'\0'};
15008 real_to_decimal_for_mode (float_buf,
15009 CONST_DOUBLE_REAL_VALUE (info.value),
15010 buf_size, buf_size, 1, info.elt_mode);
15012 if (lane_count == 1)
15013 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
15014 else
15015 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
15016 lane_count, element_char, float_buf);
15017 return templ;
15021 gcc_assert (CONST_INT_P (info.value));
15023 if (which == AARCH64_CHECK_MOV)
15025 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15026 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15027 if (lane_count == 1)
15028 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15029 mnemonic, UINTVAL (info.value));
15030 else if (info.shift)
15031 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15032 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15033 element_char, UINTVAL (info.value), shift_op, info.shift);
15034 else
15035 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15036 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15037 element_char, UINTVAL (info.value));
15039 else
15041 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15042 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15043 if (info.shift)
15044 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15045 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15046 element_char, UINTVAL (info.value), "lsl", info.shift);
15047 else
15048 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15049 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15050 element_char, UINTVAL (info.value));
15052 return templ;
15055 char*
15056 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15059 /* If a floating point number was passed and we want to use it in an
15060 integer mode, do the conversion to an integer. */
15061 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15063 unsigned HOST_WIDE_INT ival;
15064 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15065 gcc_unreachable ();
15066 immediate = gen_int_mode (ival, mode);
15069 machine_mode vmode;
15070 /* Use a 64-bit container mode for everything except DI/DF mode, where we use
15071 a 128-bit vector mode. */
15072 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15074 vmode = aarch64_simd_container_mode (mode, width);
15075 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15076 return aarch64_output_simd_mov_immediate (v_op, width);
15079 /* Return the output string to use for moving immediate CONST_VECTOR
15080 into an SVE register. */
15082 char *
15083 aarch64_output_sve_mov_immediate (rtx const_vector)
15085 static char templ[40];
15086 struct simd_immediate_info info;
15087 char element_char;
15089 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15090 gcc_assert (is_valid);
15092 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15094 if (info.step)
15096 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15097 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15098 element_char, INTVAL (info.value), INTVAL (info.step));
15099 return templ;
15102 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15104 if (aarch64_float_const_zero_rtx_p (info.value))
15105 info.value = GEN_INT (0);
15106 else
15108 const int buf_size = 20;
15109 char float_buf[buf_size] = {};
15110 real_to_decimal_for_mode (float_buf,
15111 CONST_DOUBLE_REAL_VALUE (info.value),
15112 buf_size, buf_size, 1, info.elt_mode);
15114 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15115 element_char, float_buf);
15116 return templ;
15120 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15121 element_char, INTVAL (info.value));
15122 return templ;
15125 /* Return the asm format for a PTRUE instruction whose destination has
15126 mode MODE. SUFFIX is the element size suffix. */
15128 char *
15129 aarch64_output_ptrue (machine_mode mode, char suffix)
15131 unsigned int nunits;
15132 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15133 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15134 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15135 else
15136 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15137 return buf;
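/* For instance (illustrative): with a fixed 128-bit vector length and
   SUFFIX == 's', a predicate with four 32-bit elements gives
   "ptrue\t%0.s, vl4"; with a runtime-determined vector length the result
   is "ptrue\t%0.s, all".  */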
15140 /* Split operands into moves from op[1] + op[2] into op[0]. */
15142 void
15143 aarch64_split_combinev16qi (rtx operands[3])
15145 unsigned int dest = REGNO (operands[0]);
15146 unsigned int src1 = REGNO (operands[1]);
15147 unsigned int src2 = REGNO (operands[2]);
15148 machine_mode halfmode = GET_MODE (operands[1]);
15149 unsigned int halfregs = REG_NREGS (operands[1]);
15150 rtx destlo, desthi;
15152 gcc_assert (halfmode == V16QImode);
15154 if (src1 == dest && src2 == dest + halfregs)
15156 /* No-op move. Can't split to nothing; emit something. */
15157 emit_note (NOTE_INSN_DELETED);
15158 return;
15161 /* Preserve register attributes for variable tracking. */
15162 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15163 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15164 GET_MODE_SIZE (halfmode));
15166 /* Special case of reversed high/low parts. */
15167 if (reg_overlap_mentioned_p (operands[2], destlo)
15168 && reg_overlap_mentioned_p (operands[1], desthi))
15170 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15171 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15172 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15174 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15176 /* Try to avoid unnecessary moves if part of the result
15177 is in the right place already. */
15178 if (src1 != dest)
15179 emit_move_insn (destlo, operands[1]);
15180 if (src2 != dest + halfregs)
15181 emit_move_insn (desthi, operands[2]);
15183 else
15185 if (src2 != dest + halfregs)
15186 emit_move_insn (desthi, operands[2]);
15187 if (src1 != dest)
15188 emit_move_insn (destlo, operands[1]);
15192 /* vec_perm support. */
15194 struct expand_vec_perm_d
15196 rtx target, op0, op1;
15197 vec_perm_indices perm;
15198 machine_mode vmode;
15199 unsigned int vec_flags;
15200 bool one_vector_p;
15201 bool testing_p;
15204 /* Generate a variable permutation. */
15206 static void
15207 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15209 machine_mode vmode = GET_MODE (target);
15210 bool one_vector_p = rtx_equal_p (op0, op1);
15212 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15213 gcc_checking_assert (GET_MODE (op0) == vmode);
15214 gcc_checking_assert (GET_MODE (op1) == vmode);
15215 gcc_checking_assert (GET_MODE (sel) == vmode);
15216 gcc_checking_assert (TARGET_SIMD);
15218 if (one_vector_p)
15220 if (vmode == V8QImode)
15222 /* Expand the argument to V16QImode by duplicating it. */
15223 rtx pair = gen_reg_rtx (V16QImode);
15224 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15225 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15227 else
15229 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15232 else
15234 rtx pair;
15236 if (vmode == V8QImode)
15238 pair = gen_reg_rtx (V16QImode);
15239 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15240 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15242 else
15244 pair = gen_reg_rtx (OImode);
15245 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15246 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15251 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15252 NELT is the number of elements in the vector. */
15254 void
15255 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15256 unsigned int nelt)
15258 machine_mode vmode = GET_MODE (target);
15259 bool one_vector_p = rtx_equal_p (op0, op1);
15260 rtx mask;
15262 /* The TBL instruction does not use a modulo index, so we must take care
15263 of that ourselves. */
15264 mask = aarch64_simd_gen_const_vector_dup (vmode,
15265 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15266 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15268 /* For big-endian, we also need to reverse the index within the vector
15269 (but not which vector). */
15270 if (BYTES_BIG_ENDIAN)
15272 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15273 if (!one_vector_p)
15274 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15275 sel = expand_simple_binop (vmode, XOR, sel, mask,
15276 NULL, 0, OPTAB_LIB_WIDEN);
15278 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15281 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15283 static void
15284 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15286 emit_insn (gen_rtx_SET (target,
15287 gen_rtx_UNSPEC (GET_MODE (target),
15288 gen_rtvec (2, op0, op1), code)));
15291 /* Expand an SVE vec_perm with the given operands. */
15293 void
15294 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15296 machine_mode data_mode = GET_MODE (target);
15297 machine_mode sel_mode = GET_MODE (sel);
15298 /* Enforced by the pattern condition. */
15299 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15301 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15302 size of the two value vectors, i.e. the upper bits of the indices
15303 are effectively ignored. SVE TBL instead produces 0 for any
15304 out-of-range indices, so we need to modulo all the vec_perm indices
15305 to ensure they are all in range. */
15306 rtx sel_reg = force_reg (sel_mode, sel);
15308 /* Check if the sel only references the first values vector. */
15309 if (GET_CODE (sel) == CONST_VECTOR
15310 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15312 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15313 return;
15316 /* Check if the two values vectors are the same. */
15317 if (rtx_equal_p (op0, op1))
15319 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15320 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15321 NULL, 0, OPTAB_DIRECT);
15322 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15323 return;
15326 /* Run TBL on each value vector and combine the results. */
15328 rtx res0 = gen_reg_rtx (data_mode);
15329 rtx res1 = gen_reg_rtx (data_mode);
15330 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15331 if (GET_CODE (sel) != CONST_VECTOR
15332 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15334 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15335 2 * nunits - 1);
15336 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15337 NULL, 0, OPTAB_DIRECT);
15339 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15340 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15341 NULL, 0, OPTAB_DIRECT);
15342 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15343 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15344 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15345 else
15346 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15349 /* Recognize patterns suitable for the TRN instructions. */
15350 static bool
15351 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15353 HOST_WIDE_INT odd;
15354 poly_uint64 nelt = d->perm.length ();
15355 rtx out, in0, in1, x;
15356 machine_mode vmode = d->vmode;
15358 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15359 return false;
15361 /* Note that these are little-endian tests.
15362 We correct for big-endian later. */
15363 if (!d->perm[0].is_constant (&odd)
15364 || (odd != 0 && odd != 1)
15365 || !d->perm.series_p (0, 2, odd, 2)
15366 || !d->perm.series_p (1, 2, nelt + odd, 2))
15367 return false;
15369 /* Success! */
15370 if (d->testing_p)
15371 return true;
15373 in0 = d->op0;
15374 in1 = d->op1;
15375 /* We don't need a big-endian lane correction for SVE; see the comment
15376 at the head of aarch64-sve.md for details. */
15377 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15379 x = in0, in0 = in1, in1 = x;
15380 odd = !odd;
15382 out = d->target;
15384 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15385 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15386 return true;
15389 /* Recognize patterns suitable for the UZP instructions. */
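/* For example (illustrative): with nelt == 4, the selector {0, 2, 4, 6}
   matches UZP1 and {1, 3, 5, 7} matches UZP2 (before any big-endian
   correction).  */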
15390 static bool
15391 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15393 HOST_WIDE_INT odd;
15394 rtx out, in0, in1, x;
15395 machine_mode vmode = d->vmode;
15397 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15398 return false;
15400 /* Note that these are little-endian tests.
15401 We correct for big-endian later. */
15402 if (!d->perm[0].is_constant (&odd)
15403 || (odd != 0 && odd != 1)
15404 || !d->perm.series_p (0, 1, odd, 2))
15405 return false;
15407 /* Success! */
15408 if (d->testing_p)
15409 return true;
15411 in0 = d->op0;
15412 in1 = d->op1;
15413 /* We don't need a big-endian lane correction for SVE; see the comment
15414 at the head of aarch64-sve.md for details. */
15415 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15417 x = in0, in0 = in1, in1 = x;
15418 odd = !odd;
15420 out = d->target;
15422 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15423 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15424 return true;
15427 /* Recognize patterns suitable for the ZIP instructions. */
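/* For example (illustrative): with nelt == 4, the selector {0, 4, 1, 5}
   matches ZIP1 and {2, 6, 3, 7} matches ZIP2 (before any big-endian
   correction).  */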
15428 static bool
15429 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15431 unsigned int high;
15432 poly_uint64 nelt = d->perm.length ();
15433 rtx out, in0, in1, x;
15434 machine_mode vmode = d->vmode;
15436 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15437 return false;
15439 /* Note that these are little-endian tests.
15440 We correct for big-endian later. */
15441 poly_uint64 first = d->perm[0];
15442 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15443 || !d->perm.series_p (0, 2, first, 1)
15444 || !d->perm.series_p (1, 2, first + nelt, 1))
15445 return false;
15446 high = maybe_ne (first, 0U);
15448 /* Success! */
15449 if (d->testing_p)
15450 return true;
15452 in0 = d->op0;
15453 in1 = d->op1;
15454 /* We don't need a big-endian lane correction for SVE; see the comment
15455 at the head of aarch64-sve.md for details. */
15456 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15458 x = in0, in0 = in1, in1 = x;
15459 high = !high;
15461 out = d->target;
15463 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15464 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15465 return true;
15468 /* Recognize patterns for the EXT insn. */
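/* For example (illustrative): with nelt == 4, the selector {1, 2, 3, 4}
   is matched here and emitted as an EXT with an offset of one element:
   the last three elements of op0 followed by the first element of op1.  */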
15470 static bool
15471 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15473 HOST_WIDE_INT location;
15474 rtx offset;
15476 /* The first element always refers to the first vector.
15477 Check if the extracted indices are increasing by one. */
15478 if (d->vec_flags == VEC_SVE_PRED
15479 || !d->perm[0].is_constant (&location)
15480 || !d->perm.series_p (0, 1, location, 1))
15481 return false;
15483 /* Success! */
15484 if (d->testing_p)
15485 return true;
15487 /* The case where (location == 0) is a no-op for both big- and little-endian,
15488 and is removed by the mid-end at optimization levels -O1 and higher.
15490 We don't need a big-endian lane correction for SVE; see the comment
15491 at the head of aarch64-sve.md for details. */
15492 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15494 /* After setup, we want the high elements of the first vector (stored
15495 at the LSB end of the register), and the low elements of the second
15496 vector (stored at the MSB end of the register). So swap. */
15497 std::swap (d->op0, d->op1);
15498 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15499 to_constant () is safe since this is restricted to Advanced SIMD
15500 vectors. */
15501 location = d->perm.length ().to_constant () - location;
15504 offset = GEN_INT (location);
15505 emit_set_insn (d->target,
15506 gen_rtx_UNSPEC (d->vmode,
15507 gen_rtvec (3, d->op0, d->op1, offset),
15508 UNSPEC_EXT));
15509 return true;
15512 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15513 within each 64-bit, 32-bit or 16-bit granule. */
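/* For example (illustrative): for V8HImode the selector
   {3, 2, 1, 0, 7, 6, 5, 4} gives diff == 3 and size == 8 and is matched
   as REV64, i.e. the 16-bit elements are reversed within each 64-bit
   granule.  */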
15515 static bool
15516 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15518 HOST_WIDE_INT diff;
15519 unsigned int i, size, unspec;
15520 machine_mode pred_mode;
15522 if (d->vec_flags == VEC_SVE_PRED
15523 || !d->one_vector_p
15524 || !d->perm[0].is_constant (&diff))
15525 return false;
15527 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15528 if (size == 8)
15530 unspec = UNSPEC_REV64;
15531 pred_mode = VNx2BImode;
15533 else if (size == 4)
15535 unspec = UNSPEC_REV32;
15536 pred_mode = VNx4BImode;
15538 else if (size == 2)
15540 unspec = UNSPEC_REV16;
15541 pred_mode = VNx8BImode;
15543 else
15544 return false;
15546 unsigned int step = diff + 1;
15547 for (i = 0; i < step; ++i)
15548 if (!d->perm.series_p (i, step, diff - i, step))
15549 return false;
15551 /* Success! */
15552 if (d->testing_p)
15553 return true;
15555 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15556 if (d->vec_flags == VEC_SVE_DATA)
15558 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15559 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15560 UNSPEC_MERGE_PTRUE);
15562 emit_set_insn (d->target, src);
15563 return true;
15566 /* Recognize patterns for the REV insn, which reverses elements within
15567 a full vector. */
15569 static bool
15570 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15572 poly_uint64 nelt = d->perm.length ();
15574 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15575 return false;
15577 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15578 return false;
15580 /* Success! */
15581 if (d->testing_p)
15582 return true;
15584 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15585 emit_set_insn (d->target, src);
15586 return true;
15589 static bool
15590 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15592 rtx out = d->target;
15593 rtx in0;
15594 HOST_WIDE_INT elt;
15595 machine_mode vmode = d->vmode;
15596 rtx lane;
15598 if (d->vec_flags == VEC_SVE_PRED
15599 || d->perm.encoding ().encoded_nelts () != 1
15600 || !d->perm[0].is_constant (&elt))
15601 return false;
15603 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15604 return false;
15606 /* Success! */
15607 if (d->testing_p)
15608 return true;
15610 /* The generic preparation in aarch64_expand_vec_perm_const_1
15611 swaps the operand order and the permute indices if it finds
15612 d->perm[0] to be in the second operand. Thus, we can always
15613 use d->op0 and need not do any extra arithmetic to get the
15614 correct lane number. */
15615 in0 = d->op0;
15616 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15618 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15619 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15620 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15621 return true;
15624 static bool
15625 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15627 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15628 machine_mode vmode = d->vmode;
15630 /* Make sure that the indices are constant. */
15631 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15632 for (unsigned int i = 0; i < encoded_nelts; ++i)
15633 if (!d->perm[i].is_constant ())
15634 return false;
15636 if (d->testing_p)
15637 return true;
15639 /* Generic code will try constant permutation twice. Once with the
15640 original mode and again with the elements lowered to QImode.
15641 So wait and don't do the selector expansion ourselves. */
15642 if (vmode != V8QImode && vmode != V16QImode)
15643 return false;
15645 /* to_constant is safe since this routine is specific to Advanced SIMD
15646 vectors. */
15647 unsigned int nelt = d->perm.length ().to_constant ();
15648 for (unsigned int i = 0; i < nelt; ++i)
15649 /* If big-endian and using two vectors, we end up with a weird mixed-endian
15650 mode on NEON. Reverse the index within each word but not the word
15651 itself. to_constant is safe because we checked is_constant above. */
15652 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15653 ? d->perm[i].to_constant () ^ (nelt - 1)
15654 : d->perm[i].to_constant ());
15656 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15657 sel = force_reg (vmode, sel);
15659 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15660 return true;
15663 /* Try to implement D using an SVE TBL instruction. */
15665 static bool
15666 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15668 unsigned HOST_WIDE_INT nelt;
15670 /* Permuting two variable-length vectors could overflow the
15671 index range. */
15672 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15673 return false;
15675 if (d->testing_p)
15676 return true;
15678 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15679 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15680 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15681 return true;
15684 static bool
15685 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15687 /* The pattern matching functions above are written to look for a small
15688 number to begin the sequence (0, 1, N/2). If we begin with an index
15689 from the second operand, we can swap the operands. */
15690 poly_int64 nelt = d->perm.length ();
15691 if (known_ge (d->perm[0], nelt))
15693 d->perm.rotate_inputs (1);
15694 std::swap (d->op0, d->op1);
15697 if ((d->vec_flags == VEC_ADVSIMD
15698 || d->vec_flags == VEC_SVE_DATA
15699 || d->vec_flags == VEC_SVE_PRED)
15700 && known_gt (nelt, 1))
15702 if (aarch64_evpc_rev_local (d))
15703 return true;
15704 else if (aarch64_evpc_rev_global (d))
15705 return true;
15706 else if (aarch64_evpc_ext (d))
15707 return true;
15708 else if (aarch64_evpc_dup (d))
15709 return true;
15710 else if (aarch64_evpc_zip (d))
15711 return true;
15712 else if (aarch64_evpc_uzp (d))
15713 return true;
15714 else if (aarch64_evpc_trn (d))
15715 return true;
15716 if (d->vec_flags == VEC_SVE_DATA)
15717 return aarch64_evpc_sve_tbl (d);
15718 else if (d->vec_flags == VEC_ADVSIMD)
15719 return aarch64_evpc_tbl (d);
15721 return false;
15724 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15726 static bool
15727 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15728 rtx op1, const vec_perm_indices &sel)
15730 struct expand_vec_perm_d d;
15732 /* Check whether the mask can be applied to a single vector. */
15733 if (op0 && rtx_equal_p (op0, op1))
15734 d.one_vector_p = true;
15735 else if (sel.all_from_input_p (0))
15737 d.one_vector_p = true;
15738 op1 = op0;
15740 else if (sel.all_from_input_p (1))
15742 d.one_vector_p = true;
15743 op0 = op1;
15745 else
15746 d.one_vector_p = false;
15748 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15749 sel.nelts_per_input ());
15750 d.vmode = vmode;
15751 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15752 d.target = target;
15753 d.op0 = op0;
15754 d.op1 = op1;
15755 d.testing_p = !target;
15757 if (!d.testing_p)
15758 return aarch64_expand_vec_perm_const_1 (&d);
15760 rtx_insn *last = get_last_insn ();
15761 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15762 gcc_assert (last == get_last_insn ());
15764 return ret;
15767 /* Generate a byte permute mask for a register of mode MODE,
15768 which has NUNITS units. */
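/* For example (illustrative): for V8HImode (usize == 2, nunits == 8) the
   mask is {1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14}, i.e. the
   bytes are swapped within each 16-bit element while the element order is
   kept.  */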
15770 rtx
15771 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15773 /* We have to reverse each vector because we don't have
15774 a permuted load that can reverse-load according to ABI rules. */
15775 rtx mask;
15776 rtvec v = rtvec_alloc (16);
15777 unsigned int i, j;
15778 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15780 gcc_assert (BYTES_BIG_ENDIAN);
15781 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15783 for (i = 0; i < nunits; i++)
15784 for (j = 0; j < usize; j++)
15785 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15786 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15787 return force_reg (V16QImode, mask);
15790 /* Return true if X is a valid second operand for the SVE instruction
15791 that implements integer comparison OP_CODE. */
15793 static bool
15794 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15796 if (register_operand (x, VOIDmode))
15797 return true;
15799 switch (op_code)
15801 case LTU:
15802 case LEU:
15803 case GEU:
15804 case GTU:
15805 return aarch64_sve_cmp_immediate_p (x, false);
15806 case LT:
15807 case LE:
15808 case GE:
15809 case GT:
15810 case NE:
15811 case EQ:
15812 return aarch64_sve_cmp_immediate_p (x, true);
15813 default:
15814 gcc_unreachable ();
15818 /* Use predicated SVE instructions to implement the equivalent of:
15820 (set TARGET OP)
15822 given that PTRUE is an all-true predicate of the appropriate mode. */
15824 static void
15825 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15827 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15828 gen_rtvec (2, ptrue, op),
15829 UNSPEC_MERGE_PTRUE);
15830 rtx_insn *insn = emit_set_insn (target, unspec);
15831 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15834 /* Likewise, but also clobber the condition codes. */
15836 static void
15837 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15839 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15840 gen_rtvec (2, ptrue, op),
15841 UNSPEC_MERGE_PTRUE);
15842 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15843 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15846 /* Return the UNSPEC_COND_* code for comparison CODE. */
15848 static unsigned int
15849 aarch64_unspec_cond_code (rtx_code code)
15851 switch (code)
15853 case NE:
15854 return UNSPEC_COND_NE;
15855 case EQ:
15856 return UNSPEC_COND_EQ;
15857 case LT:
15858 return UNSPEC_COND_LT;
15859 case GT:
15860 return UNSPEC_COND_GT;
15861 case LE:
15862 return UNSPEC_COND_LE;
15863 case GE:
15864 return UNSPEC_COND_GE;
15865 default:
15866 gcc_unreachable ();
15870 /* Emit:
15872 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15874 where <X> is the operation associated with comparison CODE. This form
15875 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15876 semantics, such as when PRED might not be all-true and when comparing
15877 inactive lanes could have side effects. */
15879 static void
15880 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15881 rtx pred, rtx op0, rtx op1)
15883 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15884 gen_rtvec (3, pred, op0, op1),
15885 aarch64_unspec_cond_code (code));
15886 emit_set_insn (target, unspec);
15889 /* Expand an SVE integer comparison using the SVE equivalent of:
15891 (set TARGET (CODE OP0 OP1)). */
15893 void
15894 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15896 machine_mode pred_mode = GET_MODE (target);
15897 machine_mode data_mode = GET_MODE (op0);
15899 if (!aarch64_sve_cmp_operand_p (code, op1))
15900 op1 = force_reg (data_mode, op1);
15902 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15903 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15904 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15907 /* Emit the SVE equivalent of:
15909 (set TMP1 (CODE1 OP0 OP1))
15910 (set TMP2 (CODE2 OP0 OP1))
15911 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15913 PTRUE is an all-true predicate with the same mode as TARGET. */
15915 static void
15916 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15917 rtx ptrue, rtx op0, rtx op1)
15919 machine_mode pred_mode = GET_MODE (ptrue);
15920 rtx tmp1 = gen_reg_rtx (pred_mode);
15921 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15922 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15923 rtx tmp2 = gen_reg_rtx (pred_mode);
15924 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15925 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15926 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15929 /* Emit the SVE equivalent of:
15931 (set TMP (CODE OP0 OP1))
15932 (set TARGET (not TMP))
15934 PTRUE is an all-true predicate with the same mode as TARGET. */
15936 static void
15937 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15938 rtx op0, rtx op1)
15940 machine_mode pred_mode = GET_MODE (ptrue);
15941 rtx tmp = gen_reg_rtx (pred_mode);
15942 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15943 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15944 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15947 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15949 (set TARGET (CODE OP0 OP1))
15951 If CAN_INVERT_P is true, the caller can also handle inverted results;
15952 return true if the result is in fact inverted. */
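/* For example (an illustrative note): with -fno-trapping-math, UNGE is
   handled by emitting the native LT comparison; if CAN_INVERT_P the caller
   is simply told the result is inverted, otherwise an extra NOT of the
   predicate is emitted.  */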
15954 bool
15955 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15956 rtx op0, rtx op1, bool can_invert_p)
15958 machine_mode pred_mode = GET_MODE (target);
15959 machine_mode data_mode = GET_MODE (op0);
15961 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15962 switch (code)
15964 case UNORDERED:
15965 /* UNORDERED has no immediate form. */
15966 op1 = force_reg (data_mode, op1);
15967 /* fall through */
15968 case LT:
15969 case LE:
15970 case GT:
15971 case GE:
15972 case EQ:
15973 case NE:
15975 /* There is native support for the comparison. */
15976 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15977 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15978 return false;
15981 case LTGT:
15982 /* This is a trapping operation (LT or GT). */
15983 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15984 return false;
15986 case UNEQ:
15987 if (!flag_trapping_math)
15989 /* This would trap for signaling NaNs. */
15990 op1 = force_reg (data_mode, op1);
15991 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15992 return false;
15994 /* fall through */
15995 case UNLT:
15996 case UNLE:
15997 case UNGT:
15998 case UNGE:
15999 if (flag_trapping_math)
16001 /* Work out which elements are ordered. */
16002 rtx ordered = gen_reg_rtx (pred_mode);
16003 op1 = force_reg (data_mode, op1);
16004 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
16006 /* Test the opposite condition for the ordered elements,
16007 then invert the result. */
16008 if (code == UNEQ)
16009 code = NE;
16010 else
16011 code = reverse_condition_maybe_unordered (code);
16012 if (can_invert_p)
16014 aarch64_emit_sve_predicated_cond (target, code,
16015 ordered, op0, op1);
16016 return true;
16018 rtx tmp = gen_reg_rtx (pred_mode);
16019 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
16020 aarch64_emit_unop (target, one_cmpl_optab, tmp);
16021 return false;
16023 break;
16025 case ORDERED:
16026 /* ORDERED has no immediate form. */
16027 op1 = force_reg (data_mode, op1);
16028 break;
16030 default:
16031 gcc_unreachable ();
16034 /* There is native support for the inverse comparison. */
16035 code = reverse_condition_maybe_unordered (code);
16036 if (can_invert_p)
16038 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16039 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16040 return true;
16042 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16043 return false;
16046 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16047 of the data being selected and CMP_MODE is the mode of the values being
16048 compared. */
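/* For example (illustrative): a vcond such as "r = (a < b) ? x : y" is
   expanded as a comparison that sets a predicate register P, followed by
   (set R (unspec [P X Y] UNSPEC_SEL)), i.e. an SEL that takes X in the
   lanes where P is true and Y elsewhere.  */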
16050 void
16051 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16052 rtx *ops)
16054 machine_mode pred_mode
16055 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16056 GET_MODE_SIZE (cmp_mode)).require ();
16057 rtx pred = gen_reg_rtx (pred_mode);
16058 if (FLOAT_MODE_P (cmp_mode))
16060 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16061 ops[4], ops[5], true))
16062 std::swap (ops[1], ops[2]);
16064 else
16065 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16067 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16068 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16071 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16072 true. However, due to issues with register allocation it is preferable
16073 to avoid tying integer scalar and FP scalar modes. Executing integer
16074 operations in general registers is better than treating them as scalar
16075 vector operations. This reduces latency and avoids redundant int<->FP
16076 moves. So tie modes if they are either the same class, or vector modes
16077 with other vector modes, vector structs or any scalar mode. */
16079 static bool
16080 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16082 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16083 return true;
16085 /* We specifically want to allow elements of "structure" modes to
16086 be tieable to the structure. This more general condition allows
16087 other rarer situations too. The reason we don't extend this to
16088 predicate modes is that there are no predicate structure modes
16089 nor any specific instructions for extracting part of a predicate
16090 register. */
16091 if (aarch64_vector_data_mode_p (mode1)
16092 && aarch64_vector_data_mode_p (mode2))
16093 return true;
16095 /* Also allow any scalar modes with vectors. */
16096 if (aarch64_vector_mode_supported_p (mode1)
16097 || aarch64_vector_mode_supported_p (mode2))
16098 return true;
16100 return false;
16103 /* Return a new RTX holding the result of moving POINTER forward by
16104 AMOUNT bytes. */
16106 static rtx
16107 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16109 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16111 return adjust_automodify_address (pointer, GET_MODE (pointer),
16112 next, amount);
16115 /* Return a new RTX holding the result of moving POINTER forward by the
16116 size of the mode it points to. */
16118 static rtx
16119 aarch64_progress_pointer (rtx pointer)
16121 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16124 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16125 MODE bytes. */
16127 static void
16128 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16129 machine_mode mode)
16131 rtx reg = gen_reg_rtx (mode);
16133 /* "Cast" the pointers to the correct mode. */
16134 *src = adjust_address (*src, mode, 0);
16135 *dst = adjust_address (*dst, mode, 0);
16136 /* Emit the memcpy. */
16137 emit_move_insn (reg, *src);
16138 emit_move_insn (*dst, reg);
16139 /* Move the pointers forward. */
16140 *src = aarch64_progress_pointer (*src);
16141 *dst = aarch64_progress_pointer (*dst);
16144 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16145 we succeed, otherwise return false. */
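/* For illustration (a sketch of the expansion below, not an exhaustive
   description): a call such as

     __builtin_memcpy (dst, src, 15);

   is expanded as an 8-byte (DImode) load/store at offset 0 followed by an
   overlapping 8-byte load/store at offset 7, rather than separate
   8 + 4 + 2 + 1 byte copies.  */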
16147 bool
16148 aarch64_expand_movmem (rtx *operands)
16150 int n, mode_bits;
16151 rtx dst = operands[0];
16152 rtx src = operands[1];
16153 rtx base;
16154 machine_mode cur_mode = BLKmode, next_mode;
16155 bool speed_p = !optimize_function_for_size_p (cfun);
16157 /* When optimizing for size, give a better estimate of the length of a
16158 memcpy call, but use the default otherwise. Moves larger than 8 bytes
16159 will always require an even number of instructions to do now. And each
16160 operation requires both a load and a store, so divide the max number by 2.
16161 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
16163 /* We can't do anything smart if the amount to copy is not constant. */
16164 if (!CONST_INT_P (operands[2]))
16165 return false;
16167 n = INTVAL (operands[2]);
16169 /* Try to keep the number of instructions low. For all cases we will do at
16170 most two moves for the residual amount, since we'll always overlap the
16171 remainder. */
16172 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
16173 return false;
16175 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16176 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16178 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16179 src = adjust_automodify_address (src, VOIDmode, base, 0);
16181 /* Convert n to bits to make the rest of the code simpler. */
16182 n = n * BITS_PER_UNIT;
16184 while (n > 0)
16186 /* Find the largest mode in which to do the copy without over-reading
16187 or over-writing. */
16188 opt_scalar_int_mode mode_iter;
16189 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
16190 if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
16191 cur_mode = mode_iter.require ();
16193 gcc_assert (cur_mode != BLKmode);
16195 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
16196 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
16198 n -= mode_bits;
16200 /* Do certain trailing copies as overlapping if it's going to be
16201 cheaper, i.e. fewer instructions. For instance, for a 15-byte copy
16202 it's more efficient to do two overlapping 8-byte copies than
16203 8 + 6 + 1. */
16204 next_mode = smallest_mode_for_size (n, MODE_INT);
16205 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16206 if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
16208 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16209 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16210 n = n_bits;
16214 return true;
16217 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16218 SImode stores. Handle the case when the constant has identical
16219 bottom and top halves. This is beneficial when the two stores can be
16220 merged into an STP and we avoid synthesising potentially expensive
16221 immediates twice. Return true if such a split is possible. */
16223 bool
16224 aarch64_split_dimode_const_store (rtx dst, rtx src)
16226 rtx lo = gen_lowpart (SImode, src);
16227 rtx hi = gen_highpart_mode (SImode, DImode, src);
16229 bool size_p = optimize_function_for_size_p (cfun);
16231 if (!rtx_equal_p (lo, hi))
16232 return false;
16234 unsigned int orig_cost
16235 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16236 unsigned int lo_cost
16237 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16239 /* We want to transform:
16240 MOV x1, 49370
16241 MOVK x1, 0x140, lsl 16
16242 MOVK x1, 0xc0da, lsl 32
16243 MOVK x1, 0x140, lsl 48
16244 STR x1, [x0]
16245 into:
16246 MOV w1, 49370
16247 MOVK w1, 0x140, lsl 16
16248 STP w1, w1, [x0]
16249 So we want to perform this only when we save two instructions
16250 or more. When optimizing for size, however, accept any code size
16251 savings we can. */
16252 if (size_p && orig_cost <= lo_cost)
16253 return false;
16255 if (!size_p
16256 && (orig_cost <= lo_cost + 1))
16257 return false;
16259 rtx mem_lo = adjust_address (dst, SImode, 0);
16260 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16261 return false;
16263 rtx tmp_reg = gen_reg_rtx (SImode);
16264 aarch64_expand_mov_immediate (tmp_reg, lo);
16265 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16266 /* Don't emit an explicit store pair as this may not be always profitable.
16267 Let the sched-fusion logic decide whether to merge them. */
16268 emit_move_insn (mem_lo, tmp_reg);
16269 emit_move_insn (mem_hi, tmp_reg);
16271 return true;
16274 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16276 static unsigned HOST_WIDE_INT
16277 aarch64_asan_shadow_offset (void)
16279 return (HOST_WIDE_INT_1 << 36);
16282 static rtx
16283 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16284 int code, tree treeop0, tree treeop1)
16286 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16287 rtx op0, op1;
16288 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16289 insn_code icode;
16290 struct expand_operand ops[4];
16292 start_sequence ();
16293 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16295 op_mode = GET_MODE (op0);
16296 if (op_mode == VOIDmode)
16297 op_mode = GET_MODE (op1);
16299 switch (op_mode)
16301 case E_QImode:
16302 case E_HImode:
16303 case E_SImode:
16304 cmp_mode = SImode;
16305 icode = CODE_FOR_cmpsi;
16306 break;
16308 case E_DImode:
16309 cmp_mode = DImode;
16310 icode = CODE_FOR_cmpdi;
16311 break;
16313 case E_SFmode:
16314 cmp_mode = SFmode;
16315 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16316 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16317 break;
16319 case E_DFmode:
16320 cmp_mode = DFmode;
16321 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16322 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16323 break;
16325 default:
16326 end_sequence ();
16327 return NULL_RTX;
16330 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16331 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16332 if (!op0 || !op1)
16334 end_sequence ();
16335 return NULL_RTX;
16337 *prep_seq = get_insns ();
16338 end_sequence ();
16340 create_fixed_operand (&ops[0], op0);
16341 create_fixed_operand (&ops[1], op1);
16343 start_sequence ();
16344 if (!maybe_expand_insn (icode, 2, ops))
16346 end_sequence ();
16347 return NULL_RTX;
16349 *gen_seq = get_insns ();
16350 end_sequence ();
16352 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16353 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16356 static rtx
16357 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16358 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16360 rtx op0, op1, target;
16361 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16362 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16363 insn_code icode;
16364 struct expand_operand ops[6];
16365 int aarch64_cond;
16367 push_to_sequence (*prep_seq);
16368 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16370 op_mode = GET_MODE (op0);
16371 if (op_mode == VOIDmode)
16372 op_mode = GET_MODE (op1);
16374 switch (op_mode)
16376 case E_QImode:
16377 case E_HImode:
16378 case E_SImode:
16379 cmp_mode = SImode;
16380 icode = CODE_FOR_ccmpsi;
16381 break;
16383 case E_DImode:
16384 cmp_mode = DImode;
16385 icode = CODE_FOR_ccmpdi;
16386 break;
16388 case E_SFmode:
16389 cmp_mode = SFmode;
16390 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16391 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16392 break;
16394 case E_DFmode:
16395 cmp_mode = DFmode;
16396 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16397 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16398 break;
16400 default:
16401 end_sequence ();
16402 return NULL_RTX;
16405 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16406 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16407 if (!op0 || !op1)
16409 end_sequence ();
16410 return NULL_RTX;
16412 *prep_seq = get_insns ();
16413 end_sequence ();
16415 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16416 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16418 if (bit_code != AND)
16420 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16421 GET_MODE (XEXP (prev, 0))),
16422 VOIDmode, XEXP (prev, 0), const0_rtx);
16423 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16426 create_fixed_operand (&ops[0], XEXP (prev, 0));
16427 create_fixed_operand (&ops[1], target);
16428 create_fixed_operand (&ops[2], op0);
16429 create_fixed_operand (&ops[3], op1);
16430 create_fixed_operand (&ops[4], prev);
16431 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16433 push_to_sequence (*gen_seq);
16434 if (!maybe_expand_insn (icode, 6, ops))
16436 end_sequence ();
16437 return NULL_RTX;
16440 *gen_seq = get_insns ();
16441 end_sequence ();
16443 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16446 #undef TARGET_GEN_CCMP_FIRST
16447 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16449 #undef TARGET_GEN_CCMP_NEXT
16450 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
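/* Illustrative sketch (a rough example, not a guaranteed code sequence):
   for a condition such as (a == 0 && b > 5) these hooks allow the two
   compares to be chained as approximately

     cmp   w0, #0
     ccmp  w1, #5, #4, eq    // if a != 0, force flags so that "gt" fails
     b.gt  <both-true path>

   The exact NZCV immediate and condition choice depend on the expander.  */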
16452 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16453 instruction fusion of some sort. */
16455 static bool
16456 aarch64_macro_fusion_p (void)
16458 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16462 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16463 should be kept together during scheduling. */
16465 static bool
16466 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16468 rtx set_dest;
16469 rtx prev_set = single_set (prev);
16470 rtx curr_set = single_set (curr);
16471 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16472 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16474 if (!aarch64_macro_fusion_p ())
16475 return false;
16477 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16479 /* We are trying to match:
16480 prev (mov) == (set (reg r0) (const_int imm16))
16481 curr (movk) == (set (zero_extract (reg r0)
16482 (const_int 16)
16483 (const_int 16))
16484 (const_int imm16_1)) */
16486 set_dest = SET_DEST (curr_set);
16488 if (GET_CODE (set_dest) == ZERO_EXTRACT
16489 && CONST_INT_P (SET_SRC (curr_set))
16490 && CONST_INT_P (SET_SRC (prev_set))
16491 && CONST_INT_P (XEXP (set_dest, 2))
16492 && INTVAL (XEXP (set_dest, 2)) == 16
16493 && REG_P (XEXP (set_dest, 0))
16494 && REG_P (SET_DEST (prev_set))
16495 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16497 return true;
16501 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16504 /* We're trying to match:
16505 prev (adrp) == (set (reg r1)
16506 (high (symbol_ref ("SYM"))))
16507 curr (add) == (set (reg r0)
16508 (lo_sum (reg r1)
16509 (symbol_ref ("SYM"))))
16510 Note that r0 need not necessarily be the same as r1, especially
16511 during pre-regalloc scheduling. */
16513 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16514 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16516 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16517 && REG_P (XEXP (SET_SRC (curr_set), 0))
16518 && REGNO (XEXP (SET_SRC (curr_set), 0))
16519 == REGNO (SET_DEST (prev_set))
16520 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16521 XEXP (SET_SRC (curr_set), 1)))
16522 return true;
16526 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16529 /* We're trying to match:
16530 prev (movk) == (set (zero_extract (reg r0)
16531 (const_int 16)
16532 (const_int 32))
16533 (const_int imm16_1))
16534 curr (movk) == (set (zero_extract (reg r0)
16535 (const_int 16)
16536 (const_int 48))
16537 (const_int imm16_2)) */
16539 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16540 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16541 && REG_P (XEXP (SET_DEST (prev_set), 0))
16542 && REG_P (XEXP (SET_DEST (curr_set), 0))
16543 && REGNO (XEXP (SET_DEST (prev_set), 0))
16544 == REGNO (XEXP (SET_DEST (curr_set), 0))
16545 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16546 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16547 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16548 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16549 && CONST_INT_P (SET_SRC (prev_set))
16550 && CONST_INT_P (SET_SRC (curr_set)))
16551 return true;
16554 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16556 /* We're trying to match:
16557 prev (adrp) == (set (reg r0)
16558 (high (symbol_ref ("SYM"))))
16559 curr (ldr) == (set (reg r1)
16560 (mem (lo_sum (reg r0)
16561 (symbol_ref ("SYM")))))
16563 curr (ldr) == (set (reg r1)
16564 (zero_extend (mem
16565 (lo_sum (reg r0)
16566 (symbol_ref ("SYM")))))) */
16567 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16568 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16570 rtx curr_src = SET_SRC (curr_set);
16572 if (GET_CODE (curr_src) == ZERO_EXTEND)
16573 curr_src = XEXP (curr_src, 0);
16575 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16576 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16577 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16578 == REGNO (SET_DEST (prev_set))
16579 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16580 XEXP (SET_SRC (prev_set), 0)))
16581 return true;
16585 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16586 && aarch_crypto_can_dual_issue (prev, curr))
16587 return true;
16589 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16590 && any_condjump_p (curr))
16592 enum attr_type prev_type = get_attr_type (prev);
16594 unsigned int condreg1, condreg2;
16595 rtx cc_reg_1;
16596 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16597 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16599 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16600 && prev
16601 && modified_in_p (cc_reg_1, prev))
16603 /* FIXME: this misses some instructions which are considered simple
16604 arithmetic for ThunderX. Simple shifts are missed here. */
16605 if (prev_type == TYPE_ALUS_SREG
16606 || prev_type == TYPE_ALUS_IMM
16607 || prev_type == TYPE_LOGICS_REG
16608 || prev_type == TYPE_LOGICS_IMM)
16609 return true;
16613 if (prev_set
16614 && curr_set
16615 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16616 && any_condjump_p (curr))
16618 /* We're trying to match:
16619 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16620 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16621 (const_int 0))
16622 (label_ref ("SYM"))
16623 (pc)) */
16624 if (SET_DEST (curr_set) == (pc_rtx)
16625 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16626 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16627 && REG_P (SET_DEST (prev_set))
16628 && REGNO (SET_DEST (prev_set))
16629 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16631 /* Fuse ALU operations followed by conditional branch instruction. */
16632 switch (get_attr_type (prev))
16634 case TYPE_ALU_IMM:
16635 case TYPE_ALU_SREG:
16636 case TYPE_ADC_REG:
16637 case TYPE_ADC_IMM:
16638 case TYPE_ADCS_REG:
16639 case TYPE_ADCS_IMM:
16640 case TYPE_LOGIC_REG:
16641 case TYPE_LOGIC_IMM:
16642 case TYPE_CSEL:
16643 case TYPE_ADR:
16644 case TYPE_MOV_IMM:
16645 case TYPE_SHIFT_REG:
16646 case TYPE_SHIFT_IMM:
16647 case TYPE_BFM:
16648 case TYPE_RBIT:
16649 case TYPE_REV:
16650 case TYPE_EXTEND:
16651 return true;
16653 default:;
16658 return false;
16661 /* Return true iff the instruction fusion described by OP is enabled. */
16663 bool
16664 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16666 return (aarch64_tune_params.fusible_ops & op) != 0;
16669 /* If MEM is in the form of [base+offset], extract the two parts
16670 of the address and set them in BASE and OFFSET; otherwise return false
16671 after clearing BASE and OFFSET. */
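/* For example (illustrative): (mem (plus (reg x1) (const_int 16))) gives
   *base = (reg x1) and *offset = (const_int 16), while a bare
   (mem (reg x1)) gives an offset of const0_rtx.  */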
16673 bool
16674 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16676 rtx addr;
16678 gcc_assert (MEM_P (mem));
16680 addr = XEXP (mem, 0);
16682 if (REG_P (addr))
16684 *base = addr;
16685 *offset = const0_rtx;
16686 return true;
16689 if (GET_CODE (addr) == PLUS
16690 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16692 *base = XEXP (addr, 0);
16693 *offset = XEXP (addr, 1);
16694 return true;
16697 *base = NULL_RTX;
16698 *offset = NULL_RTX;
16700 return false;
16703 /* Types for scheduling fusion. */
16704 enum sched_fusion_type
16706 SCHED_FUSION_NONE = 0,
16707 SCHED_FUSION_LD_SIGN_EXTEND,
16708 SCHED_FUSION_LD_ZERO_EXTEND,
16709 SCHED_FUSION_LD,
16710 SCHED_FUSION_ST,
16711 SCHED_FUSION_NUM
16714 /* If INSN is a load or store whose address is in the form [base+offset],
16715 extract the two parts into BASE and OFFSET. Return the scheduling
16716 fusion type of this INSN. */
16718 static enum sched_fusion_type
16719 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16721 rtx x, dest, src;
16722 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16724 gcc_assert (INSN_P (insn));
16725 x = PATTERN (insn);
16726 if (GET_CODE (x) != SET)
16727 return SCHED_FUSION_NONE;
16729 src = SET_SRC (x);
16730 dest = SET_DEST (x);
16732 machine_mode dest_mode = GET_MODE (dest);
16734 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16735 return SCHED_FUSION_NONE;
16737 if (GET_CODE (src) == SIGN_EXTEND)
16739 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16740 src = XEXP (src, 0);
16741 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16742 return SCHED_FUSION_NONE;
16744 else if (GET_CODE (src) == ZERO_EXTEND)
16746 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16747 src = XEXP (src, 0);
16748 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16749 return SCHED_FUSION_NONE;
16752 if (GET_CODE (src) == MEM && REG_P (dest))
16753 extract_base_offset_in_addr (src, base, offset);
16754 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16756 fusion = SCHED_FUSION_ST;
16757 extract_base_offset_in_addr (dest, base, offset);
16759 else
16760 return SCHED_FUSION_NONE;
16762 if (*base == NULL_RTX || *offset == NULL_RTX)
16763 fusion = SCHED_FUSION_NONE;
16765 return fusion;
16768 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16770 Currently we only support fusing ldr or str instructions, so FUSION_PRI
16771 and PRI are only calculated for these instructions. For other instructions,
16772 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16773 types of instruction fusion can be added by returning different priorities.
16775 It's important that irrelevant instructions get the largest FUSION_PRI. */
16777 static void
16778 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16779 int *fusion_pri, int *pri)
16781 int tmp, off_val;
16782 rtx base, offset;
16783 enum sched_fusion_type fusion;
16785 gcc_assert (INSN_P (insn));
16787 tmp = max_pri - 1;
16788 fusion = fusion_load_store (insn, &base, &offset);
16789 if (fusion == SCHED_FUSION_NONE)
16791 *pri = tmp;
16792 *fusion_pri = tmp;
16793 return;
16796 /* Set FUSION_PRI according to fusion type and base register. */
16797 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16799 /* Calculate PRI. */
16800 tmp /= 2;
16802 /* INSN with smaller offset goes first. */
16803 off_val = (int)(INTVAL (offset));
16804 if (off_val >= 0)
16805 tmp -= (off_val & 0xfffff);
16806 else
16807 tmp += ((- off_val) & 0xfffff);
16809 *pri = tmp;
16810 return;
16813 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16814 Adjust priority of sha1h instructions so they are scheduled before
16815 other SHA1 instructions. */
16817 static int
16818 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16820 rtx x = PATTERN (insn);
16822 if (GET_CODE (x) == SET)
16824 x = SET_SRC (x);
16826 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16827 return priority + 10;
16830 return priority;
16833 /* Given OPERANDS of consecutive load/store, check if we can merge
16834 them into ldp/stp. LOAD is true if they are load instructions.
16835 MODE is the mode of memory operands. */
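/* For example (illustrative): "ldr w0, [x2]" followed by "ldr w1, [x2, 4]"
   can be merged into "ldp w0, w1, [x2]", provided the checks below pass
   (non-volatile, same base, consecutive offsets, matching register classes,
   and the loads do not clobber their own addresses).  */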
16837 bool
16838 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16839 machine_mode mode)
16841 HOST_WIDE_INT offval_1, offval_2, msize;
16842 enum reg_class rclass_1, rclass_2;
16843 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16845 if (load)
16847 mem_1 = operands[1];
16848 mem_2 = operands[3];
16849 reg_1 = operands[0];
16850 reg_2 = operands[2];
16851 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16852 if (REGNO (reg_1) == REGNO (reg_2))
16853 return false;
16855 else
16857 mem_1 = operands[0];
16858 mem_2 = operands[2];
16859 reg_1 = operands[1];
16860 reg_2 = operands[3];
16863 /* The mems cannot be volatile. */
16864 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16865 return false;
16867 /* If we have SImode and slow unaligned ldp,
16868 check that the alignment is at least 8 bytes. */
16869 if (mode == SImode
16870 && (aarch64_tune_params.extra_tuning_flags
16871 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16872 && !optimize_size
16873 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16874 return false;
16876 /* Check if the addresses are in the form of [base+offset]. */
16877 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16878 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16879 return false;
16880 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16881 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16882 return false;
16884 /* Check if the bases are same. */
16885 if (!rtx_equal_p (base_1, base_2))
16886 return false;
16888 /* The operands must be of the same size. */
16889 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16890 GET_MODE_SIZE (GET_MODE (mem_2))));
16892 offval_1 = INTVAL (offset_1);
16893 offval_2 = INTVAL (offset_2);
16894 /* We should only be trying this for fixed-sized modes. There is no
16895 SVE LDP/STP instruction. */
16896 msize = GET_MODE_SIZE (mode).to_constant ();
16897 /* Check if the offsets are consecutive. */
16898 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16899 return false;
16901 /* Check if the addresses are clobbered by load. */
16902 if (load)
16904 if (reg_mentioned_p (reg_1, mem_1))
16905 return false;
16907 /* In increasing order, the last load can clobber the address. */
16908 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16909 return false;
16912 /* One of the memory accesses must be a mempair operand.
16913 If it is not the first one, they need to be swapped by the
16914 peephole. */
16915 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16916 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16917 return false;
16919 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16920 rclass_1 = FP_REGS;
16921 else
16922 rclass_1 = GENERAL_REGS;
16924 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16925 rclass_2 = FP_REGS;
16926 else
16927 rclass_2 = GENERAL_REGS;
16929 /* Check if the registers are of same class. */
16930 if (rclass_1 != rclass_2)
16931 return false;
16933 return true;
16936 /* Given OPERANDS of consecutive load/store that can be merged,
16937 swap them if they are not in ascending order. */
16938 void
16939 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16941 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16942 HOST_WIDE_INT offval_1, offval_2;
16944 if (load)
16946 mem_1 = operands[1];
16947 mem_2 = operands[3];
16949 else
16951 mem_1 = operands[0];
16952 mem_2 = operands[2];
16955 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16956 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16958 offval_1 = INTVAL (offset_1);
16959 offval_2 = INTVAL (offset_2);
16961 if (offval_1 > offval_2)
16963 /* Irrespective of whether this is a load or a store,
16964 we do the same swap. */
16965 std::swap (operands[0], operands[2]);
16966 std::swap (operands[1], operands[3]);
16970 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16971 comparison between the two. */
16972 int
16973 aarch64_host_wide_int_compare (const void *x, const void *y)
16975 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16976 * ((const HOST_WIDE_INT *) y));
16979 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16980 other pointing to a REG rtx containing an offset, compare the offsets
16981 of the two pairs.
16983 Return:
16985 1 iff offset (X) > offset (Y)
16986 0 iff offset (X) == offset (Y)
16987 -1 iff offset (X) < offset (Y) */
16988 int
16989 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16991 const rtx * operands_1 = (const rtx *) x;
16992 const rtx * operands_2 = (const rtx *) y;
16993 rtx mem_1, mem_2, base, offset_1, offset_2;
16995 if (MEM_P (operands_1[0]))
16996 mem_1 = operands_1[0];
16997 else
16998 mem_1 = operands_1[1];
17000 if (MEM_P (operands_2[0]))
17001 mem_2 = operands_2[0];
17002 else
17003 mem_2 = operands_2[1];
17005 /* Extract the offsets. */
17006 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17007 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17009 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17011 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17014 /* Given OPERANDS of consecutive load/store, check if we can merge
17015 them into ldp/stp by adjusting the offset. LOAD is true if they
17016 are load instructions. MODE is the mode of memory operands.
17020 Given the consecutive stores below:
17020 str w1, [xb, 0x100]
17021 str w1, [xb, 0x104]
17022 str w1, [xb, 0x108]
17023 str w1, [xb, 0x10c]
17025 Though the offsets are out of the range supported by stp, we can
17026 still pair them after adjusting the offset, like:
17028 add scratch, xb, 0x100
17029 stp w1, w1, [scratch]
17030 stp w1, w1, [scratch, 0x8]
17032 The peephole patterns detecting this opportunity should guarantee
17033 the scratch register is available. */
17035 bool
17036 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17037 scalar_mode mode)
17039 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
17040 HOST_WIDE_INT offvals[4], msize;
17041 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
17042 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
17044 if (load)
17046 reg_1 = operands[0];
17047 mem_1 = operands[1];
17048 reg_2 = operands[2];
17049 mem_2 = operands[3];
17050 reg_3 = operands[4];
17051 mem_3 = operands[5];
17052 reg_4 = operands[6];
17053 mem_4 = operands[7];
17054 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
17055 && REG_P (reg_3) && REG_P (reg_4));
17057 /* Do not attempt to merge the loads if the loads clobber each other. */
17058 for (int i = 0; i < 8; i += 2)
17059 for (int j = i + 2; j < 8; j += 2)
17060 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17061 return false;
17063 else
17065 mem_1 = operands[0];
17066 reg_1 = operands[1];
17067 mem_2 = operands[2];
17068 reg_2 = operands[3];
17069 mem_3 = operands[4];
17070 reg_3 = operands[5];
17071 mem_4 = operands[6];
17072 reg_4 = operands[7];
17074 /* Skip if the memory operand is by itself valid for ldp/stp. */
17075 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
17076 return false;
17078 /* The mems cannot be volatile. */
17079 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
17080 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
17081 return false;
17083 /* Check if the addresses are in the form of [base+offset]. */
17084 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17085 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17086 return false;
17087 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17088 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17089 return false;
17090 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17091 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17092 return false;
17093 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17094 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17095 return false;
17097 /* Check if the bases are same. */
17098 if (!rtx_equal_p (base_1, base_2)
17099 || !rtx_equal_p (base_2, base_3)
17100 || !rtx_equal_p (base_3, base_4))
17101 return false;
17103 offvals[0] = INTVAL (offset_1);
17104 offvals[1] = INTVAL (offset_2);
17105 offvals[2] = INTVAL (offset_3);
17106 offvals[3] = INTVAL (offset_4);
17107 msize = GET_MODE_SIZE (mode);
17109 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17110 qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
17112 if (!(offvals[1] == offvals[0] + msize
17113 && offvals[3] == offvals[2] + msize))
17114 return false;
17116 /* Check that offsets are within range of each other. The ldp/stp
17117 instructions have 7 bit immediate offsets, so use 0x80. */
17118 if (offvals[2] - offvals[0] >= msize * 0x80)
17119 return false;
17121 /* The offsets must be aligned with respect to each other. */
17122 if (offvals[0] % msize != offvals[2] % msize)
17123 return false;
17125 /* Check if the addresses are clobbered by load. */
17126 if (load && (reg_mentioned_p (reg_1, mem_1)
17127 || reg_mentioned_p (reg_2, mem_2)
17128 || reg_mentioned_p (reg_3, mem_3)
17129 || reg_mentioned_p (reg_4, mem_4)))
17130 return false;
17132 /* If we have SImode and slow unaligned ldp,
17133 check that the alignment is at least 8 bytes. */
17134 if (mode == SImode
17135 && (aarch64_tune_params.extra_tuning_flags
17136 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17137 && !optimize_size
17138 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17139 return false;
17141 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17142 rclass_1 = FP_REGS;
17143 else
17144 rclass_1 = GENERAL_REGS;
17146 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17147 rclass_2 = FP_REGS;
17148 else
17149 rclass_2 = GENERAL_REGS;
17151 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17152 rclass_3 = FP_REGS;
17153 else
17154 rclass_3 = GENERAL_REGS;
17156 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17157 rclass_4 = FP_REGS;
17158 else
17159 rclass_4 = GENERAL_REGS;
17161 /* Check if the registers are of same class. */
17162 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17163 return false;
17165 return true;
17168 /* Given OPERANDS of consecutive load/store, this function pairs them
17169 into LDP/STP after adjusting the offset. It depends on the fact
17170 that the operands can be sorted so the offsets are correct for STP.
17171 MODE is the mode of memory operands. CODE is the rtl operator
17172 which should be applied to all memory operands, it's SIGN_EXTEND,
17173 ZERO_EXTEND or UNKNOWN. */
17175 bool
17176 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17177 scalar_mode mode, RTX_CODE code)
17179 rtx base, offset_1, offset_3, t1, t2;
17180 rtx mem_1, mem_2, mem_3, mem_4;
17181 rtx temp_operands[8];
17182 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17183 stp_off_upper_limit, stp_off_lower_limit, msize;
17185 /* We make changes on a copy as we may still bail out. */
17186 for (int i = 0; i < 8; i ++)
17187 temp_operands[i] = operands[i];
17189 /* Sort the operands. */
17190 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17192 if (load)
17194 mem_1 = temp_operands[1];
17195 mem_2 = temp_operands[3];
17196 mem_3 = temp_operands[5];
17197 mem_4 = temp_operands[7];
17199 else
17201 mem_1 = temp_operands[0];
17202 mem_2 = temp_operands[2];
17203 mem_3 = temp_operands[4];
17204 mem_4 = temp_operands[6];
17205 gcc_assert (code == UNKNOWN);
17208 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17209 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17210 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17211 && offset_3 != NULL_RTX);
17213 /* Adjust offset so it can fit in LDP/STP instruction. */
17214 msize = GET_MODE_SIZE (mode);
17215 stp_off_upper_limit = msize * (0x40 - 1);
17216 stp_off_lower_limit = - msize * 0x40;
17218 off_val_1 = INTVAL (offset_1);
17219 off_val_3 = INTVAL (offset_3);
17221 /* The base offset is optimally half way between the two STP/LDP offsets. */
17222 if (msize <= 4)
17223 base_off = (off_val_1 + off_val_3) / 2;
17224 else
17225 /* However, due to issues with negative LDP/STP offset generation for
17226 larger modes, such as DF, DI and vector modes, we must not use negative
17227 addresses smaller than 9 signed unadjusted bits can store. This
17228 provides the most range in this case. */
17229 base_off = off_val_1;
17231 /* Adjust the base so that it is aligned with the addresses but still
17232 optimal. */
17233 if (base_off % msize != off_val_1 % msize)
17234 /* Fix the offset, bearing in mind we want to make it bigger not
17235 smaller. */
17236 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17237 else if (msize <= 4)
17238 /* The negative range of LDP/STP is one larger than the positive range. */
17239 base_off += msize;
17241 /* Check if base offset is too big or too small. We can attempt to resolve
17242 this issue by setting it to the maximum value and seeing if the offsets
17243 still fit. */
17244 if (base_off >= 0x1000)
17246 base_off = 0x1000 - 1;
17247 /* We must still make sure that the base offset is aligned with respect
17248 to the address, but it may not be made any bigger. */
17249 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17252 /* Likewise for the case where the base is too small. */
17253 if (base_off <= -0x1000)
17255 base_off = -0x1000 + 1;
17256 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17259 /* Offset of the first STP/LDP. */
17260 new_off_1 = off_val_1 - base_off;
17262 /* Offset of the second STP/LDP. */
17263 new_off_3 = off_val_3 - base_off;
17265 /* The offsets must be within the range of the LDP/STP instructions. */
17266 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17267 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17268 return false;
17270 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17271 new_off_1), true);
17272 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17273 new_off_1 + msize), true);
17274 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17275 new_off_3), true);
17276 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17277 new_off_3 + msize), true);
17279 if (!aarch64_mem_pair_operand (mem_1, mode)
17280 || !aarch64_mem_pair_operand (mem_3, mode))
17281 return false;
17283 if (code == ZERO_EXTEND)
17285 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17286 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17287 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17288 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17290 else if (code == SIGN_EXTEND)
17292 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17293 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17294 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17295 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17298 if (load)
17300 operands[0] = temp_operands[0];
17301 operands[1] = mem_1;
17302 operands[2] = temp_operands[2];
17303 operands[3] = mem_2;
17304 operands[4] = temp_operands[4];
17305 operands[5] = mem_3;
17306 operands[6] = temp_operands[6];
17307 operands[7] = mem_4;
17309 else
17311 operands[0] = mem_1;
17312 operands[1] = temp_operands[1];
17313 operands[2] = mem_2;
17314 operands[3] = temp_operands[3];
17315 operands[4] = mem_3;
17316 operands[5] = temp_operands[5];
17317 operands[6] = mem_4;
17318 operands[7] = temp_operands[7];
17321 /* Emit adjusting instruction. */
17322 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17323 /* Emit ldp/stp instructions. */
17324 t1 = gen_rtx_SET (operands[0], operands[1]);
17325 t2 = gen_rtx_SET (operands[2], operands[3]);
17326 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17327 t1 = gen_rtx_SET (operands[4], operands[5]);
17328 t2 = gen_rtx_SET (operands[6], operands[7]);
17329 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17330 return true;
17333 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17334 it isn't worth branching around empty masked ops (including masked
17335 stores). */
17337 static bool
17338 aarch64_empty_mask_is_expensive (unsigned)
17340 return false;
17343 /* Return true if a pseudo register should be created and used to hold
17344 the GOT address for PIC code. */
17346 bool
17347 aarch64_use_pseudo_pic_reg (void)
17349 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17352 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17354 static int
17355 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17357 switch (XINT (x, 1))
17359 case UNSPEC_GOTSMALLPIC:
17360 case UNSPEC_GOTSMALLPIC28K:
17361 case UNSPEC_GOTTINYPIC:
17362 return 0;
17363 default:
17364 break;
17367 return default_unspec_may_trap_p (x, flags);
17371 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
17372 return the log2 of that value. Otherwise return -1. */
17374 int
17375 aarch64_fpconst_pow_of_2 (rtx x)
17377 const REAL_VALUE_TYPE *r;
17379 if (!CONST_DOUBLE_P (x))
17380 return -1;
17382 r = CONST_DOUBLE_REAL_VALUE (x);
17384 if (REAL_VALUE_NEGATIVE (*r)
17385 || REAL_VALUE_ISNAN (*r)
17386 || REAL_VALUE_ISINF (*r)
17387 || !real_isinteger (r, DFmode))
17388 return -1;
17390 return exact_log2 (real_to_integer (r));
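/* For illustration: a CONST_DOUBLE of 8.0 gives real_to_integer == 8 and a
   return value of 3; 3.0 is an integer but not a power of 2, so exact_log2
   yields -1; 0.75 fails real_isinteger and -4.0 fails the
   REAL_VALUE_NEGATIVE check, so both also give -1.  */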
17393 /* If X is a vector of equal CONST_DOUBLE values and that value is
17394 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17396 int
17397 aarch64_vec_fpconst_pow_of_2 (rtx x)
17399 int nelts;
17400 if (GET_CODE (x) != CONST_VECTOR
17401 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17402 return -1;
17404 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17405 return -1;
17407 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17408 if (firstval <= 0)
17409 return -1;
17411 for (int i = 1; i < nelts; i++)
17412 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17413 return -1;
17415 return firstval;
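/* For illustration: a V2DF constant { 4.0, 4.0 } returns 2, while
   { 4.0, 8.0 } returns -1 because the elements differ.  A vector of 1.0s
   also returns -1: aarch64_fpconst_pow_of_2 gives 0 for 1.0 and the
   firstval <= 0 check above rejects it.  */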
17418 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17419 to float.
17421 __fp16 always promotes through this hook.
17422 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17423 through the generic excess precision logic rather than here. */
17425 static tree
17426 aarch64_promoted_type (const_tree t)
17428 if (SCALAR_FLOAT_TYPE_P (t)
17429 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17430 return float_type_node;
17432 return NULL_TREE;
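/* For illustration: because of this promotion, arithmetic such as x + y on
   __fp16 operands is carried out in float, with the result narrowed back to
   __fp16 only when it is stored into an __fp16 object.  _Float16 is left
   alone here and is instead handled by the excess-precision logic below.  */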
17435 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17437 static bool
17438 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17439 optimization_type opt_type)
17441 switch (op)
17443 case rsqrt_optab:
17444 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17446 default:
17447 return true;
17451 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17453 static unsigned int
17454 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17455 int *offset)
17457 /* Polynomial indeterminate 1 == (VG / 2) - 1. */
17458 gcc_assert (i == 1);
17459 *factor = 2;
17460 *offset = 1;
17461 return AARCH64_DWARF_VG;
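/* For illustration: with a 256-bit SVE vector, the DWARF VG register (the
   vector length in 64-bit granules) is 4, so indeterminate 1 evaluates to
   4 / 2 - 1 == 1, and an SVE data mode whose size is the poly_int
   16 + 16x then occupies 16 + 16 * 1 == 32 bytes.  */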
17464 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17465 if MODE is HFmode, and punt to the generic implementation otherwise. */
17467 static bool
17468 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17470 return (mode == HFmode
17471 ? true
17472 : default_libgcc_floating_mode_supported_p (mode));
17475 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17476 if MODE is HFmode, and punt to the generic implementation otherwise. */
17478 static bool
17479 aarch64_scalar_mode_supported_p (scalar_mode mode)
17481 return (mode == HFmode
17482 ? true
17483 : default_scalar_mode_supported_p (mode));
17486 /* Set the value of FLT_EVAL_METHOD.
17487 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17489 0: evaluate all operations and constants whose semantic type has at
17490 most the range and precision of type float, to the range and
17491 precision of float; evaluate all other operations and constants to
17492 the range and precision of the semantic type;
17494 N, where _FloatN is a supported interchange floating type:
17495 evaluate all operations and constants whose semantic type has at
17496 most the range and precision of the _FloatN type, to the range and
17497 precision of the _FloatN type; evaluate all other operations and
17498 constants to the range and precision of the semantic type;
17500 If we have the ARMv8.2-A extensions then we support _Float16 in native
17501 precision, so we should set this to 16. Otherwise, we support the type,
17502 but want to evaluate expressions in float precision, so set this to
17503 0. */
17505 static enum flt_eval_method
17506 aarch64_excess_precision (enum excess_precision_type type)
17508 switch (type)
17510 case EXCESS_PRECISION_TYPE_FAST:
17511 case EXCESS_PRECISION_TYPE_STANDARD:
17512 /* We can calculate either in 16-bit range and precision or
17513 32-bit range and precision. Make that decision based on whether
17514 we have native support for the ARMv8.2-A 16-bit floating-point
17515 instructions or not. */
17516 return (TARGET_FP_F16INST
17517 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17518 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17519 case EXCESS_PRECISION_TYPE_IMPLICIT:
17520 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17521 default:
17522 gcc_unreachable ();
17524 return FLT_EVAL_METHOD_UNPREDICTABLE;
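/* For illustration of the user-visible effect: for _Float16 operands a, b
   and c, the expression a * b + c is evaluated in _Float16 range and
   precision when native FP16 instructions are available (for example with
   -march=armv8.2-a+fp16, giving __FLT_EVAL_METHOD__ == 16), whereas without
   them the operands are promoted and the arithmetic is done in float
   (__FLT_EVAL_METHOD__ == 0), with only the final result converted back to
   _Float16.  */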
17527 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17528 scheduled for speculative execution. Reject the long-running division
17529 and square-root instructions. */
17531 static bool
17532 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17534 switch (get_attr_type (insn))
17536 case TYPE_SDIV:
17537 case TYPE_UDIV:
17538 case TYPE_FDIVS:
17539 case TYPE_FDIVD:
17540 case TYPE_FSQRTS:
17541 case TYPE_FSQRTD:
17542 case TYPE_NEON_FP_SQRT_S:
17543 case TYPE_NEON_FP_SQRT_D:
17544 case TYPE_NEON_FP_SQRT_S_Q:
17545 case TYPE_NEON_FP_SQRT_D_Q:
17546 case TYPE_NEON_FP_DIV_S:
17547 case TYPE_NEON_FP_DIV_D:
17548 case TYPE_NEON_FP_DIV_S_Q:
17549 case TYPE_NEON_FP_DIV_D_Q:
17550 return false;
17551 default:
17552 return true;
17556 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17558 static int
17559 aarch64_compute_pressure_classes (reg_class *classes)
17561 int i = 0;
17562 classes[i++] = GENERAL_REGS;
17563 classes[i++] = FP_REGS;
17564 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17565 registers need to go in PR_LO_REGS at some point during their
17566 lifetime. Splitting it into two halves has the effect of making
17567 all predicates count against PR_LO_REGS, so that we try whenever
17568 possible to restrict the number of live predicates to 8. This
17569 greatly reduces the amount of spilling in certain loops. */
17570 classes[i++] = PR_LO_REGS;
17571 classes[i++] = PR_HI_REGS;
17572 return i;
17575 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17577 static bool
17578 aarch64_can_change_mode_class (machine_mode from,
17579 machine_mode to, reg_class_t)
17581 if (BYTES_BIG_ENDIAN)
17583 bool from_sve_p = aarch64_sve_data_mode_p (from);
17584 bool to_sve_p = aarch64_sve_data_mode_p (to);
17586 /* Don't allow changes between SVE data modes and non-SVE modes.
17587 See the comment at the head of aarch64-sve.md for details. */
17588 if (from_sve_p != to_sve_p)
17589 return false;
17591 /* Don't allow changes in element size: lane 0 of the new vector
17592 would not then be lane 0 of the old vector. See the comment
17593 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17594 description.
17596 In the worst case, this forces a register to be spilled in
17597 one mode and reloaded in the other, which handles the
17598 endianness correctly. */
17599 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17600 return false;
17602 return true;
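/* For illustration: on big-endian, reinterpreting an SVE vector of 32-bit
   elements as one of 16-bit elements (say VNx4SI as VNx8HI) changes the
   element size and is rejected here, so in the worst case the value is
   spilled in one mode and reloaded in the other, which keeps the lane
   numbering consistent.  */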
17605 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
17607 static void
17608 aarch64_select_early_remat_modes (sbitmap modes)
17610 /* SVE values are not normally live across a call, so it should be
17611 worth doing early rematerialization even in VL-specific mode. */
17612 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17614 machine_mode mode = (machine_mode) i;
17615 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17616 if (vec_flags & VEC_ANY_SVE)
17617 bitmap_set_bit (modes, i);
17621 /* Target-specific selftests. */
17623 #if CHECKING_P
17625 namespace selftest {
17627 /* Selftest for the RTL loader.
17628 Verify that the RTL loader copes with a dump from
17629 print_rtx_function. This is essentially just a test that class
17630 function_reader can handle a real dump, but it also verifies
17631 that lookup_reg_by_dump_name correctly handles hard regs.
17632 The presence of hard reg names in the dump means that the test is
17633 target-specific, hence it is in this file. */
17635 static void
17636 aarch64_test_loading_full_dump ()
17638 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17640 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17642 rtx_insn *insn_1 = get_insn_by_uid (1);
17643 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17645 rtx_insn *insn_15 = get_insn_by_uid (15);
17646 ASSERT_EQ (INSN, GET_CODE (insn_15));
17647 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17649 /* Verify crtl->return_rtx. */
17650 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17651 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17652 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17655 /* Run all target-specific selftests. */
17657 static void
17658 aarch64_run_selftests (void)
17660 aarch64_test_loading_full_dump ();
17663 } // namespace selftest
17665 #endif /* #if CHECKING_P */
17667 #undef TARGET_ADDRESS_COST
17668 #define TARGET_ADDRESS_COST aarch64_address_cost
17670 /* This hook determines whether unnamed bitfields affect the alignment
17671 of the containing structure. The hook returns true if the structure
17672 should inherit the alignment requirements of an unnamed bitfield's
17673 type. */
17674 #undef TARGET_ALIGN_ANON_BITFIELD
17675 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17677 #undef TARGET_ASM_ALIGNED_DI_OP
17678 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17680 #undef TARGET_ASM_ALIGNED_HI_OP
17681 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17683 #undef TARGET_ASM_ALIGNED_SI_OP
17684 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17686 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17687 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17688 hook_bool_const_tree_hwi_hwi_const_tree_true
17690 #undef TARGET_ASM_FILE_START
17691 #define TARGET_ASM_FILE_START aarch64_start_file
17693 #undef TARGET_ASM_OUTPUT_MI_THUNK
17694 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17696 #undef TARGET_ASM_SELECT_RTX_SECTION
17697 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17699 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17700 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17702 #undef TARGET_BUILD_BUILTIN_VA_LIST
17703 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17705 #undef TARGET_CALLEE_COPIES
17706 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17708 #undef TARGET_CAN_ELIMINATE
17709 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17711 #undef TARGET_CAN_INLINE_P
17712 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17714 #undef TARGET_CANNOT_FORCE_CONST_MEM
17715 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17717 #undef TARGET_CASE_VALUES_THRESHOLD
17718 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17720 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17721 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17723 /* Only the least significant bit is used for initialization guard
17724 variables. */
17725 #undef TARGET_CXX_GUARD_MASK_BIT
17726 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17728 #undef TARGET_C_MODE_FOR_SUFFIX
17729 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17731 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17732 #undef TARGET_DEFAULT_TARGET_FLAGS
17733 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17734 #endif
17736 #undef TARGET_CLASS_MAX_NREGS
17737 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17739 #undef TARGET_BUILTIN_DECL
17740 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17742 #undef TARGET_BUILTIN_RECIPROCAL
17743 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17745 #undef TARGET_C_EXCESS_PRECISION
17746 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17748 #undef TARGET_EXPAND_BUILTIN
17749 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17751 #undef TARGET_EXPAND_BUILTIN_VA_START
17752 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17754 #undef TARGET_FOLD_BUILTIN
17755 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17757 #undef TARGET_FUNCTION_ARG
17758 #define TARGET_FUNCTION_ARG aarch64_function_arg
17760 #undef TARGET_FUNCTION_ARG_ADVANCE
17761 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17763 #undef TARGET_FUNCTION_ARG_BOUNDARY
17764 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17766 #undef TARGET_FUNCTION_ARG_PADDING
17767 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17769 #undef TARGET_GET_RAW_RESULT_MODE
17770 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17771 #undef TARGET_GET_RAW_ARG_MODE
17772 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17774 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17775 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17777 #undef TARGET_FUNCTION_VALUE
17778 #define TARGET_FUNCTION_VALUE aarch64_function_value
17780 #undef TARGET_FUNCTION_VALUE_REGNO_P
17781 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17783 #undef TARGET_GIMPLE_FOLD_BUILTIN
17784 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17786 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17787 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17789 #undef TARGET_INIT_BUILTINS
17790 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17792 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17793 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17794 aarch64_ira_change_pseudo_allocno_class
17796 #undef TARGET_LEGITIMATE_ADDRESS_P
17797 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17799 #undef TARGET_LEGITIMATE_CONSTANT_P
17800 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17802 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17803 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17804 aarch64_legitimize_address_displacement
17806 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17807 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17809 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17810 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17811 aarch64_libgcc_floating_mode_supported_p
17813 #undef TARGET_MANGLE_TYPE
17814 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17816 #undef TARGET_MEMORY_MOVE_COST
17817 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17819 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17820 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17822 #undef TARGET_MUST_PASS_IN_STACK
17823 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17825 /* This target hook should return true if accesses to volatile bitfields
17826 should use the narrowest mode possible. It should return false if these
17827 accesses should use the bitfield container type. */
17828 #undef TARGET_NARROW_VOLATILE_BITFIELD
17829 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17831 #undef TARGET_OPTION_OVERRIDE
17832 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17834 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17835 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17836 aarch64_override_options_after_change
17838 #undef TARGET_OPTION_SAVE
17839 #define TARGET_OPTION_SAVE aarch64_option_save
17841 #undef TARGET_OPTION_RESTORE
17842 #define TARGET_OPTION_RESTORE aarch64_option_restore
17844 #undef TARGET_OPTION_PRINT
17845 #define TARGET_OPTION_PRINT aarch64_option_print
17847 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17848 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17850 #undef TARGET_SET_CURRENT_FUNCTION
17851 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17853 #undef TARGET_PASS_BY_REFERENCE
17854 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17856 #undef TARGET_PREFERRED_RELOAD_CLASS
17857 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17859 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17860 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17862 #undef TARGET_PROMOTED_TYPE
17863 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17865 #undef TARGET_SECONDARY_RELOAD
17866 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17868 #undef TARGET_SHIFT_TRUNCATION_MASK
17869 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17871 #undef TARGET_SETUP_INCOMING_VARARGS
17872 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17874 #undef TARGET_STRUCT_VALUE_RTX
17875 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17877 #undef TARGET_REGISTER_MOVE_COST
17878 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17880 #undef TARGET_RETURN_IN_MEMORY
17881 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17883 #undef TARGET_RETURN_IN_MSB
17884 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17886 #undef TARGET_RTX_COSTS
17887 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17889 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17890 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17892 #undef TARGET_SCHED_ISSUE_RATE
17893 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17895 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17896 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17897 aarch64_sched_first_cycle_multipass_dfa_lookahead
17899 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17900 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17901 aarch64_first_cycle_multipass_dfa_lookahead_guard
17903 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17904 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17905 aarch64_get_separate_components
17907 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17908 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17909 aarch64_components_for_bb
17911 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17912 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17913 aarch64_disqualify_components
17915 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17916 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17917 aarch64_emit_prologue_components
17919 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17920 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17921 aarch64_emit_epilogue_components
17923 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17924 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17925 aarch64_set_handled_components
17927 #undef TARGET_TRAMPOLINE_INIT
17928 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17930 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17931 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17933 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17934 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17936 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17937 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17938 aarch64_builtin_support_vector_misalignment
17940 #undef TARGET_ARRAY_MODE
17941 #define TARGET_ARRAY_MODE aarch64_array_mode
17943 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17944 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17946 #undef TARGET_VECTORIZE_ADD_STMT_COST
17947 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17949 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17950 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17951 aarch64_builtin_vectorization_cost
17953 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17954 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17956 #undef TARGET_VECTORIZE_BUILTINS
17957 #define TARGET_VECTORIZE_BUILTINS
17959 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17960 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17961 aarch64_builtin_vectorized_function
17963 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17964 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17965 aarch64_autovectorize_vector_sizes
17967 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17968 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17969 aarch64_atomic_assign_expand_fenv
17971 /* Section anchor support. */
17973 #undef TARGET_MIN_ANCHOR_OFFSET
17974 #define TARGET_MIN_ANCHOR_OFFSET -256
17976 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17977 byte offset; we can do much more for larger data types, but have no way
17978 to determine the size of the access. We assume accesses are aligned. */
17979 #undef TARGET_MAX_ANCHOR_OFFSET
17980 #define TARGET_MAX_ANCHOR_OFFSET 4095
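/* For illustration: 4095 is the largest unsigned immediate offset accepted
   by a byte access such as "ldrb w0, [x0, #4095]"; wider accesses scale
   their 12-bit immediate by the access size and could reach further, but
   the anchor code cannot tell which access size will be used.  */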
17982 #undef TARGET_VECTOR_ALIGNMENT
17983 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17985 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17986 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17987 aarch64_vectorize_preferred_vector_alignment
17988 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17989 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17990 aarch64_simd_vector_alignment_reachable
17992 /* vec_perm support. */
17994 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17995 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17996 aarch64_vectorize_vec_perm_const
17998 #undef TARGET_VECTORIZE_GET_MASK_MODE
17999 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18000 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18001 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18002 aarch64_empty_mask_is_expensive
18003 #undef TARGET_PREFERRED_ELSE_VALUE
18004 #define TARGET_PREFERRED_ELSE_VALUE \
18005 aarch64_preferred_else_value
18007 #undef TARGET_INIT_LIBFUNCS
18008 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18010 #undef TARGET_FIXED_CONDITION_CODE_REGS
18011 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18013 #undef TARGET_FLAGS_REGNUM
18014 #define TARGET_FLAGS_REGNUM CC_REGNUM
18016 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18017 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18019 #undef TARGET_ASAN_SHADOW_OFFSET
18020 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18022 #undef TARGET_LEGITIMIZE_ADDRESS
18023 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18025 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18026 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18028 #undef TARGET_CAN_USE_DOLOOP_P
18029 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18031 #undef TARGET_SCHED_ADJUST_PRIORITY
18032 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18034 #undef TARGET_SCHED_MACRO_FUSION_P
18035 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18037 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18038 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18040 #undef TARGET_SCHED_FUSION_PRIORITY
18041 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18043 #undef TARGET_UNSPEC_MAY_TRAP_P
18044 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18046 #undef TARGET_USE_PSEUDO_PIC_REG
18047 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18049 #undef TARGET_PRINT_OPERAND
18050 #define TARGET_PRINT_OPERAND aarch64_print_operand
18052 #undef TARGET_PRINT_OPERAND_ADDRESS
18053 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18055 #undef TARGET_OPTAB_SUPPORTED_P
18056 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18058 #undef TARGET_OMIT_STRUCT_RETURN_REG
18059 #define TARGET_OMIT_STRUCT_RETURN_REG true
18061 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18062 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18063 aarch64_dwarf_poly_indeterminate_value
18065 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
18066 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18067 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18069 #undef TARGET_HARD_REGNO_NREGS
18070 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18071 #undef TARGET_HARD_REGNO_MODE_OK
18072 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18074 #undef TARGET_MODES_TIEABLE_P
18075 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18077 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18078 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18079 aarch64_hard_regno_call_part_clobbered
18081 #undef TARGET_CONSTANT_ALIGNMENT
18082 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18084 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18085 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18087 #undef TARGET_CAN_CHANGE_MODE_CLASS
18088 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18090 #undef TARGET_SELECT_EARLY_REMAT_MODES
18091 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18093 #if CHECKING_P
18094 #undef TARGET_RUN_TARGET_SELFTESTS
18095 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18096 #endif /* #if CHECKING_P */
18098 struct gcc_target targetm = TARGET_INITIALIZER;
18100 #include "gt-aarch64.h"