Fix SVE fallout from r260951
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 98ef45714fd62678a39a10f705f89d322c083c5b
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
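/* As an informal illustration (not an exhaustive list), typical AArch64
   address forms map onto these classifications as follows, using x0 as the
   base and w1/x1 as the index:

     [x0, #16]                 ADDRESS_REG_IMM
     [x0, #16]!   [x0], #16    ADDRESS_REG_WB
     [x0, x1, lsl #3]          ADDRESS_REG_REG
     [x0, w1, uxtw #2]         ADDRESS_REG_UXTW
     [x0, w1, sxtw #2]         ADDRESS_REG_SXTW
     [x0, #:lo12:sym]          ADDRESS_LO_SUM
     literal-pool reference    ADDRESS_SYMBOLIC  */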
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
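/* Informal usage sketch (these calls are hypothetical, not taken from the
   code below): an Advanced SIMD immediate such as "mvni v0.4s, #0x55, lsl #8"
   could be described as

     simd_immediate_info (SImode, 0x55, simd_immediate_info::MVN,
			  simd_immediate_info::LSL, 8);

   while an SVE series constant { 1, 3, 5, ... } of 32-bit elements could be
   described with the series constructor as

     simd_immediate_info (SImode, const1_rtx, GEN_INT (2));  */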
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Global flag for whether frame pointer is enabled. */
224 bool aarch64_use_frame_pointer;
226 /* Support for command line parsing of boolean flags in the tuning
227 structures. */
228 struct aarch64_flag_desc
230 const char* name;
231 unsigned int flag;
234 #define AARCH64_FUSION_PAIR(name, internal_name) \
235 { name, AARCH64_FUSE_##internal_name },
236 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
238 { "none", AARCH64_FUSE_NOTHING },
239 #include "aarch64-fusion-pairs.def"
240 { "all", AARCH64_FUSE_ALL },
241 { NULL, AARCH64_FUSE_NOTHING }
244 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
245 { name, AARCH64_EXTRA_TUNE_##internal_name },
246 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
248 { "none", AARCH64_EXTRA_TUNE_NONE },
249 #include "aarch64-tuning-flags.def"
250 { "all", AARCH64_EXTRA_TUNE_ALL },
251 { NULL, AARCH64_EXTRA_TUNE_NONE }
254 /* Tuning parameters. */
256 static const struct cpu_addrcost_table generic_addrcost_table =
259 1, /* hi */
260 0, /* si */
261 0, /* di */
262 1, /* ti */
264 0, /* pre_modify */
265 0, /* post_modify */
266 0, /* register_offset */
267 0, /* register_sextend */
268 0, /* register_zextend */
269 0 /* imm_offset */
272 static const struct cpu_addrcost_table exynosm1_addrcost_table =
275 0, /* hi */
276 0, /* si */
277 0, /* di */
278 2, /* ti */
280 0, /* pre_modify */
281 0, /* post_modify */
282 1, /* register_offset */
283 1, /* register_sextend */
284 2, /* register_zextend */
285 0, /* imm_offset */
288 static const struct cpu_addrcost_table xgene1_addrcost_table =
291 1, /* hi */
292 0, /* si */
293 0, /* di */
294 1, /* ti */
296 1, /* pre_modify */
297 0, /* post_modify */
298 0, /* register_offset */
299 1, /* register_sextend */
300 1, /* register_zextend */
301 0, /* imm_offset */
304 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
307 1, /* hi */
308 1, /* si */
309 1, /* di */
310 2, /* ti */
312 0, /* pre_modify */
313 0, /* post_modify */
314 2, /* register_offset */
315 3, /* register_sextend */
316 3, /* register_zextend */
317 0, /* imm_offset */
320 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
323 1, /* hi */
324 1, /* si */
325 1, /* di */
326 2, /* ti */
328 1, /* pre_modify */
329 1, /* post_modify */
330 3, /* register_offset */
331 4, /* register_sextend */
332 3, /* register_zextend */
333 2, /* imm_offset */
336 static const struct cpu_regmove_cost generic_regmove_cost =
338 1, /* GP2GP */
339 /* Avoid the use of slow int<->fp moves for spilling by setting
340 their cost higher than memmov_cost. */
341 5, /* GP2FP */
342 5, /* FP2GP */
343 2 /* FP2FP */
346 static const struct cpu_regmove_cost cortexa57_regmove_cost =
348 1, /* GP2GP */
349 /* Avoid the use of slow int<->fp moves for spilling by setting
350 their cost higher than memmov_cost. */
351 5, /* GP2FP */
352 5, /* FP2GP */
353 2 /* FP2FP */
356 static const struct cpu_regmove_cost cortexa53_regmove_cost =
358 1, /* GP2GP */
359 /* Avoid the use of slow int<->fp moves for spilling by setting
360 their cost higher than memmov_cost. */
361 5, /* GP2FP */
362 5, /* FP2GP */
363 2 /* FP2FP */
366 static const struct cpu_regmove_cost exynosm1_regmove_cost =
368 1, /* GP2GP */
369 /* Avoid the use of slow int<->fp moves for spilling by setting
370 their cost higher than memmov_cost (actual, 4 and 9). */
371 9, /* GP2FP */
372 9, /* FP2GP */
373 1 /* FP2FP */
376 static const struct cpu_regmove_cost thunderx_regmove_cost =
378 2, /* GP2GP */
379 2, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost xgene1_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of slow int<->fp moves for spilling by setting
388 their cost higher than memmov_cost. */
389 8, /* GP2FP */
390 8, /* FP2GP */
391 2 /* FP2FP */
394 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
396 2, /* GP2GP */
397 /* Avoid the use of int<->fp moves for spilling. */
398 6, /* GP2FP */
399 6, /* FP2GP */
400 4 /* FP2FP */
403 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
405 1, /* GP2GP */
406 /* Avoid the use of int<->fp moves for spilling. */
407 8, /* GP2FP */
408 8, /* FP2GP */
409 4 /* FP2FP */
412 /* Generic costs for vector insn classes. */
413 static const struct cpu_vector_cost generic_vector_cost =
415 1, /* scalar_int_stmt_cost */
416 1, /* scalar_fp_stmt_cost */
417 1, /* scalar_load_cost */
418 1, /* scalar_store_cost */
419 1, /* vec_int_stmt_cost */
420 1, /* vec_fp_stmt_cost */
421 2, /* vec_permute_cost */
422 1, /* vec_to_scalar_cost */
423 1, /* scalar_to_vec_cost */
424 1, /* vec_align_load_cost */
425 1, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 3, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
432 /* ThunderX costs for vector insn classes. */
433 static const struct cpu_vector_cost thunderx_vector_cost =
435 1, /* scalar_int_stmt_cost */
436 1, /* scalar_fp_stmt_cost */
437 3, /* scalar_load_cost */
438 1, /* scalar_store_cost */
439 4, /* vec_int_stmt_cost */
440 1, /* vec_fp_stmt_cost */
441 4, /* vec_permute_cost */
442 2, /* vec_to_scalar_cost */
443 2, /* scalar_to_vec_cost */
444 3, /* vec_align_load_cost */
445 5, /* vec_unalign_load_cost */
446 5, /* vec_unalign_store_cost */
447 1, /* vec_store_cost */
448 3, /* cond_taken_branch_cost */
449 3 /* cond_not_taken_branch_cost */
452 /* Generic costs for vector insn classes. */
453 static const struct cpu_vector_cost cortexa57_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 4, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 2, /* vec_int_stmt_cost */
460 2, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 8, /* vec_to_scalar_cost */
463 8, /* scalar_to_vec_cost */
464 4, /* vec_align_load_cost */
465 4, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 static const struct cpu_vector_cost exynosm1_vector_cost =
474 1, /* scalar_int_stmt_cost */
475 1, /* scalar_fp_stmt_cost */
476 5, /* scalar_load_cost */
477 1, /* scalar_store_cost */
478 3, /* vec_int_stmt_cost */
479 3, /* vec_fp_stmt_cost */
480 3, /* vec_permute_cost */
481 3, /* vec_to_scalar_cost */
482 3, /* scalar_to_vec_cost */
483 5, /* vec_align_load_cost */
484 5, /* vec_unalign_load_cost */
485 1, /* vec_unalign_store_cost */
486 1, /* vec_store_cost */
487 1, /* cond_taken_branch_cost */
488 1 /* cond_not_taken_branch_cost */
491 /* Generic costs for vector insn classes. */
492 static const struct cpu_vector_cost xgene1_vector_cost =
494 1, /* scalar_int_stmt_cost */
495 1, /* scalar_fp_stmt_cost */
496 5, /* scalar_load_cost */
497 1, /* scalar_store_cost */
498 2, /* vec_int_stmt_cost */
499 2, /* vec_fp_stmt_cost */
500 2, /* vec_permute_cost */
501 4, /* vec_to_scalar_cost */
502 4, /* scalar_to_vec_cost */
503 10, /* vec_align_load_cost */
504 10, /* vec_unalign_load_cost */
505 2, /* vec_unalign_store_cost */
506 2, /* vec_store_cost */
507 2, /* cond_taken_branch_cost */
508 1 /* cond_not_taken_branch_cost */
511 /* Costs for vector insn classes for Vulcan. */
512 static const struct cpu_vector_cost thunderx2t99_vector_cost =
514 1, /* scalar_int_stmt_cost */
515 6, /* scalar_fp_stmt_cost */
516 4, /* scalar_load_cost */
517 1, /* scalar_store_cost */
518 5, /* vec_int_stmt_cost */
519 6, /* vec_fp_stmt_cost */
520 3, /* vec_permute_cost */
521 6, /* vec_to_scalar_cost */
522 5, /* scalar_to_vec_cost */
523 8, /* vec_align_load_cost */
524 8, /* vec_unalign_load_cost */
525 4, /* vec_unalign_store_cost */
526 4, /* vec_store_cost */
527 2, /* cond_taken_branch_cost */
528 1 /* cond_not_taken_branch_cost */
531 /* Generic costs for branch instructions. */
532 static const struct cpu_branch_cost generic_branch_cost =
534 1, /* Predictable. */
535 3 /* Unpredictable. */
538 /* Generic approximation modes. */
539 static const cpu_approx_modes generic_approx_modes =
541 AARCH64_APPROX_NONE, /* division */
542 AARCH64_APPROX_NONE, /* sqrt */
543 AARCH64_APPROX_NONE /* recip_sqrt */
546 /* Approximation modes for Exynos M1. */
547 static const cpu_approx_modes exynosm1_approx_modes =
549 AARCH64_APPROX_NONE, /* division */
550 AARCH64_APPROX_ALL, /* sqrt */
551 AARCH64_APPROX_ALL /* recip_sqrt */
554 /* Approximation modes for X-Gene 1. */
555 static const cpu_approx_modes xgene1_approx_modes =
557 AARCH64_APPROX_NONE, /* division */
558 AARCH64_APPROX_NONE, /* sqrt */
559 AARCH64_APPROX_ALL /* recip_sqrt */
562 /* Generic prefetch settings (which disable prefetch). */
563 static const cpu_prefetch_tune generic_prefetch_tune =
565 0, /* num_slots */
566 -1, /* l1_cache_size */
567 -1, /* l1_cache_line_size */
568 -1, /* l2_cache_size */
569 true, /* prefetch_dynamic_strides */
570 -1, /* minimum_stride */
571 -1 /* default_opt_level */
574 static const cpu_prefetch_tune exynosm1_prefetch_tune =
576 0, /* num_slots */
577 -1, /* l1_cache_size */
578 64, /* l1_cache_line_size */
579 -1, /* l2_cache_size */
580 true, /* prefetch_dynamic_strides */
581 -1, /* minimum_stride */
582 -1 /* default_opt_level */
585 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
587 4, /* num_slots */
588 32, /* l1_cache_size */
589 64, /* l1_cache_line_size */
590 512, /* l2_cache_size */
591 false, /* prefetch_dynamic_strides */
592 2048, /* minimum_stride */
593 3 /* default_opt_level */
596 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
598 8, /* num_slots */
599 32, /* l1_cache_size */
600 128, /* l1_cache_line_size */
601 16*1024, /* l2_cache_size */
602 true, /* prefetch_dynamic_strides */
603 -1, /* minimum_stride */
604 3 /* default_opt_level */
607 static const cpu_prefetch_tune thunderx_prefetch_tune =
609 8, /* num_slots */
610 32, /* l1_cache_size */
611 128, /* l1_cache_line_size */
612 -1, /* l2_cache_size */
613 true, /* prefetch_dynamic_strides */
614 -1, /* minimum_stride */
615 -1 /* default_opt_level */
618 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
620 8, /* num_slots */
621 32, /* l1_cache_size */
622 64, /* l1_cache_line_size */
623 256, /* l2_cache_size */
624 true, /* prefetch_dynamic_strides */
625 -1, /* minimum_stride */
626 -1 /* default_opt_level */
629 static const struct tune_params generic_tunings =
631 &cortexa57_extra_costs,
632 &generic_addrcost_table,
633 &generic_regmove_cost,
634 &generic_vector_cost,
635 &generic_branch_cost,
636 &generic_approx_modes,
637 4, /* memmov_cost */
638 2, /* issue_rate */
639 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
640 8, /* function_align. */
641 4, /* jump_align. */
642 8, /* loop_align. */
643 2, /* int_reassoc_width. */
644 4, /* fp_reassoc_width. */
645 1, /* vec_reassoc_width. */
646 2, /* min_div_recip_mul_sf. */
647 2, /* min_div_recip_mul_df. */
648 0, /* max_case_values. */
649 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
650 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
651 &generic_prefetch_tune
654 static const struct tune_params cortexa35_tunings =
656 &cortexa53_extra_costs,
657 &generic_addrcost_table,
658 &cortexa53_regmove_cost,
659 &generic_vector_cost,
660 &generic_branch_cost,
661 &generic_approx_modes,
662 4, /* memmov_cost */
663 1, /* issue_rate */
664 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
665 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
666 16, /* function_align. */
667 4, /* jump_align. */
668 8, /* loop_align. */
669 2, /* int_reassoc_width. */
670 4, /* fp_reassoc_width. */
671 1, /* vec_reassoc_width. */
672 2, /* min_div_recip_mul_sf. */
673 2, /* min_div_recip_mul_df. */
674 0, /* max_case_values. */
675 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
676 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
677 &generic_prefetch_tune
680 static const struct tune_params cortexa53_tunings =
682 &cortexa53_extra_costs,
683 &generic_addrcost_table,
684 &cortexa53_regmove_cost,
685 &generic_vector_cost,
686 &generic_branch_cost,
687 &generic_approx_modes,
688 4, /* memmov_cost */
689 2, /* issue_rate */
690 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
691 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
692 16, /* function_align. */
693 4, /* jump_align. */
694 8, /* loop_align. */
695 2, /* int_reassoc_width. */
696 4, /* fp_reassoc_width. */
697 1, /* vec_reassoc_width. */
698 2, /* min_div_recip_mul_sf. */
699 2, /* min_div_recip_mul_df. */
700 0, /* max_case_values. */
701 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
702 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
703 &generic_prefetch_tune
706 static const struct tune_params cortexa57_tunings =
708 &cortexa57_extra_costs,
709 &generic_addrcost_table,
710 &cortexa57_regmove_cost,
711 &cortexa57_vector_cost,
712 &generic_branch_cost,
713 &generic_approx_modes,
714 4, /* memmov_cost */
715 3, /* issue_rate */
716 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
717 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
718 16, /* function_align. */
719 4, /* jump_align. */
720 8, /* loop_align. */
721 2, /* int_reassoc_width. */
722 4, /* fp_reassoc_width. */
723 1, /* vec_reassoc_width. */
724 2, /* min_div_recip_mul_sf. */
725 2, /* min_div_recip_mul_df. */
726 0, /* max_case_values. */
727 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
728 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
729 &generic_prefetch_tune
732 static const struct tune_params cortexa72_tunings =
734 &cortexa57_extra_costs,
735 &generic_addrcost_table,
736 &cortexa57_regmove_cost,
737 &cortexa57_vector_cost,
738 &generic_branch_cost,
739 &generic_approx_modes,
740 4, /* memmov_cost */
741 3, /* issue_rate */
742 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
743 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
744 16, /* function_align. */
745 4, /* jump_align. */
746 8, /* loop_align. */
747 2, /* int_reassoc_width. */
748 4, /* fp_reassoc_width. */
749 1, /* vec_reassoc_width. */
750 2, /* min_div_recip_mul_sf. */
751 2, /* min_div_recip_mul_df. */
752 0, /* max_case_values. */
753 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
754 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
755 &generic_prefetch_tune
758 static const struct tune_params cortexa73_tunings =
760 &cortexa57_extra_costs,
761 &generic_addrcost_table,
762 &cortexa57_regmove_cost,
763 &cortexa57_vector_cost,
764 &generic_branch_cost,
765 &generic_approx_modes,
766 4, /* memmov_cost. */
767 2, /* issue_rate. */
768 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
769 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
770 16, /* function_align. */
771 4, /* jump_align. */
772 8, /* loop_align. */
773 2, /* int_reassoc_width. */
774 4, /* fp_reassoc_width. */
775 1, /* vec_reassoc_width. */
776 2, /* min_div_recip_mul_sf. */
777 2, /* min_div_recip_mul_df. */
778 0, /* max_case_values. */
779 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
780 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
781 &generic_prefetch_tune
786 static const struct tune_params exynosm1_tunings =
788 &exynosm1_extra_costs,
789 &exynosm1_addrcost_table,
790 &exynosm1_regmove_cost,
791 &exynosm1_vector_cost,
792 &generic_branch_cost,
793 &exynosm1_approx_modes,
794 4, /* memmov_cost */
795 3, /* issue_rate */
796 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
797 4, /* function_align. */
798 4, /* jump_align. */
799 4, /* loop_align. */
800 2, /* int_reassoc_width. */
801 4, /* fp_reassoc_width. */
802 1, /* vec_reassoc_width. */
803 2, /* min_div_recip_mul_sf. */
804 2, /* min_div_recip_mul_df. */
805 48, /* max_case_values. */
806 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
808 &exynosm1_prefetch_tune
811 static const struct tune_params thunderxt88_tunings =
813 &thunderx_extra_costs,
814 &generic_addrcost_table,
815 &thunderx_regmove_cost,
816 &thunderx_vector_cost,
817 &generic_branch_cost,
818 &generic_approx_modes,
819 6, /* memmov_cost */
820 2, /* issue_rate */
821 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
822 8, /* function_align. */
823 8, /* jump_align. */
824 8, /* loop_align. */
825 2, /* int_reassoc_width. */
826 4, /* fp_reassoc_width. */
827 1, /* vec_reassoc_width. */
828 2, /* min_div_recip_mul_sf. */
829 2, /* min_div_recip_mul_df. */
830 0, /* max_case_values. */
831 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
832 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
833 &thunderxt88_prefetch_tune
836 static const struct tune_params thunderx_tunings =
838 &thunderx_extra_costs,
839 &generic_addrcost_table,
840 &thunderx_regmove_cost,
841 &thunderx_vector_cost,
842 &generic_branch_cost,
843 &generic_approx_modes,
844 6, /* memmov_cost */
845 2, /* issue_rate */
846 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
847 8, /* function_align. */
848 8, /* jump_align. */
849 8, /* loop_align. */
850 2, /* int_reassoc_width. */
851 4, /* fp_reassoc_width. */
852 1, /* vec_reassoc_width. */
853 2, /* min_div_recip_mul_sf. */
854 2, /* min_div_recip_mul_df. */
855 0, /* max_case_values. */
856 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
857 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
858 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
859 &thunderx_prefetch_tune
862 static const struct tune_params xgene1_tunings =
864 &xgene1_extra_costs,
865 &xgene1_addrcost_table,
866 &xgene1_regmove_cost,
867 &xgene1_vector_cost,
868 &generic_branch_cost,
869 &xgene1_approx_modes,
870 6, /* memmov_cost */
871 4, /* issue_rate */
872 AARCH64_FUSE_NOTHING, /* fusible_ops */
873 16, /* function_align. */
874 8, /* jump_align. */
875 16, /* loop_align. */
876 2, /* int_reassoc_width. */
877 4, /* fp_reassoc_width. */
878 1, /* vec_reassoc_width. */
879 2, /* min_div_recip_mul_sf. */
880 2, /* min_div_recip_mul_df. */
881 0, /* max_case_values. */
882 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
883 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
884 &generic_prefetch_tune
887 static const struct tune_params qdf24xx_tunings =
889 &qdf24xx_extra_costs,
890 &qdf24xx_addrcost_table,
891 &qdf24xx_regmove_cost,
892 &generic_vector_cost,
893 &generic_branch_cost,
894 &generic_approx_modes,
895 4, /* memmov_cost */
896 4, /* issue_rate */
897 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
898 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
899 16, /* function_align. */
900 8, /* jump_align. */
901 16, /* loop_align. */
902 2, /* int_reassoc_width. */
903 4, /* fp_reassoc_width. */
904 1, /* vec_reassoc_width. */
905 2, /* min_div_recip_mul_sf. */
906 2, /* min_div_recip_mul_df. */
907 0, /* max_case_values. */
908 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
909 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
910 &qdf24xx_prefetch_tune
913 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
914 for now. */
915 static const struct tune_params saphira_tunings =
917 &generic_extra_costs,
918 &generic_addrcost_table,
919 &generic_regmove_cost,
920 &generic_vector_cost,
921 &generic_branch_cost,
922 &generic_approx_modes,
923 4, /* memmov_cost */
924 4, /* issue_rate */
925 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
926 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
927 16, /* function_align. */
928 8, /* jump_align. */
929 16, /* loop_align. */
930 2, /* int_reassoc_width. */
931 4, /* fp_reassoc_width. */
932 1, /* vec_reassoc_width. */
933 2, /* min_div_recip_mul_sf. */
934 2, /* min_div_recip_mul_df. */
935 0, /* max_case_values. */
936 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
937 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
938 &generic_prefetch_tune
941 static const struct tune_params thunderx2t99_tunings =
943 &thunderx2t99_extra_costs,
944 &thunderx2t99_addrcost_table,
945 &thunderx2t99_regmove_cost,
946 &thunderx2t99_vector_cost,
947 &generic_branch_cost,
948 &generic_approx_modes,
949 4, /* memmov_cost. */
950 4, /* issue_rate. */
951 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
952 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
953 16, /* function_align. */
954 8, /* jump_align. */
955 16, /* loop_align. */
956 3, /* int_reassoc_width. */
957 2, /* fp_reassoc_width. */
958 2, /* vec_reassoc_width. */
959 2, /* min_div_recip_mul_sf. */
960 2, /* min_div_recip_mul_df. */
961 0, /* max_case_values. */
962 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
963 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
964 &thunderx2t99_prefetch_tune
967 /* Support for fine-grained override of the tuning structures. */
968 struct aarch64_tuning_override_function
970 const char* name;
971 void (*parse_override)(const char*, struct tune_params*);
974 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
975 static void aarch64_parse_tune_string (const char*, struct tune_params*);
977 static const struct aarch64_tuning_override_function
978 aarch64_tuning_override_functions[] =
980 { "fuse", aarch64_parse_fuse_string },
981 { "tune", aarch64_parse_tune_string },
982 { NULL, NULL }
985 /* A processor implementing AArch64. */
986 struct processor
988 const char *const name;
989 enum aarch64_processor ident;
990 enum aarch64_processor sched_core;
991 enum aarch64_arch arch;
992 unsigned architecture_version;
993 const unsigned long flags;
994 const struct tune_params *const tune;
997 /* Architectures implementing AArch64. */
998 static const struct processor all_architectures[] =
1000 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1001 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1002 #include "aarch64-arches.def"
1003 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1006 /* Processor cores implementing AArch64. */
1007 static const struct processor all_cores[] =
1009 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1010 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1011 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1012 FLAGS, &COSTS##_tunings},
1013 #include "aarch64-cores.def"
1014 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1015 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1016 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1020 /* Target specification. These are populated by the -march, -mtune, -mcpu
1021 handling code or by target attributes. */
1022 static const struct processor *selected_arch;
1023 static const struct processor *selected_cpu;
1024 static const struct processor *selected_tune;
1026 /* The current tuning set. */
1027 struct tune_params aarch64_tune_params = generic_tunings;
1029 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1031 /* An ISA extension in the co-processor and main instruction set space. */
1032 struct aarch64_option_extension
1034 const char *const name;
1035 const unsigned long flags_on;
1036 const unsigned long flags_off;
1039 typedef enum aarch64_cond_code
1041 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1042 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1043 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1045 aarch64_cc;
1047 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1049 /* The condition codes of the processor, and the inverse function. */
1050 static const char * const aarch64_condition_codes[] =
1052 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1053 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1056 /* Generate code to enable conditional branches in functions over 1 MiB. */
1057 const char *
1058 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1059 const char * branch_format)
1061 rtx_code_label * tmp_label = gen_label_rtx ();
1062 char label_buf[256];
1063 char buffer[128];
1064 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1065 CODE_LABEL_NUMBER (tmp_label));
1066 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1067 rtx dest_label = operands[pos_label];
1068 operands[pos_label] = tmp_label;
1070 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1071 output_asm_insn (buffer, operands);
1073 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1074 operands[pos_label] = dest_label;
1075 output_asm_insn (buffer, operands);
1076 return "";
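/* An illustrative sketch of the output (label names are invented): if a
   conditional branch to the label operand would exceed the +/-1 MiB
   conditional-branch range, a caller passes the inverted condition in
   BRANCH_FORMAT so that this function emits something like

       b.ne    .Lbcond7        // inverted test skips the far jump
       b       .Ltarget        // unconditional branch, +/-128 MiB range
     .Lbcond7:

   where .Lbcond7 is the internal label built from DEST and .Ltarget is the
   original destination in OPERANDS[POS_LABEL].  */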
1079 void
1080 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1082 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1083 if (TARGET_GENERAL_REGS_ONLY)
1084 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1085 else
1086 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1089 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1090 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1091 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1092 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1093 and GENERAL_REGS is lower than the memory cost (in this case the best class
1094 is the lowest cost one). Using POINTER_AND_FP_REGS irrespectively of its
1095 cost results in bad allocations with many redundant int<->FP moves which
1096 are expensive on various cores.
1097 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1098 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1099 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1100 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1101 The result of this is that it is no longer inefficient to have a higher
1102 memory move cost than the register move cost.
1105 static reg_class_t
1106 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1107 reg_class_t best_class)
1109 machine_mode mode;
1111 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1112 || !reg_class_subset_p (FP_REGS, allocno_class))
1113 return allocno_class;
1115 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1116 || !reg_class_subset_p (FP_REGS, best_class))
1117 return best_class;
1119 mode = PSEUDO_REGNO_MODE (regno);
1120 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1123 static unsigned int
1124 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1126 if (GET_MODE_UNIT_SIZE (mode) == 4)
1127 return aarch64_tune_params.min_div_recip_mul_sf;
1128 return aarch64_tune_params.min_div_recip_mul_df;
1131 /* Return the reassociation width of treeop OPC with mode MODE. */
1132 static int
1133 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1135 if (VECTOR_MODE_P (mode))
1136 return aarch64_tune_params.vec_reassoc_width;
1137 if (INTEGRAL_MODE_P (mode))
1138 return aarch64_tune_params.int_reassoc_width;
1139 /* Avoid reassociating floating point addition so we emit more FMAs. */
1140 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1141 return aarch64_tune_params.fp_reassoc_width;
1142 return 1;
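/* For example, with generic_tunings above this returns 2 for integer modes,
   1 for vector modes, 4 for a DFmode multiplication, and 1 for a DFmode
   addition (PLUS_EXPR); the last case deliberately avoids reassociating FP
   additions so that they remain candidates for FMA formation.  */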
1145 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1146 unsigned
1147 aarch64_dbx_register_number (unsigned regno)
1149 if (GP_REGNUM_P (regno))
1150 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1151 else if (regno == SP_REGNUM)
1152 return AARCH64_DWARF_SP;
1153 else if (FP_REGNUM_P (regno))
1154 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1155 else if (PR_REGNUM_P (regno))
1156 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1157 else if (regno == VG_REGNUM)
1158 return AARCH64_DWARF_VG;
1160 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1161 equivalent DWARF register. */
1162 return DWARF_FRAME_REGISTERS;
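/* Illustrative mappings, assuming the usual AArch64 DWARF numbering from
   aarch64.h (AARCH64_DWARF_R0 = 0, AARCH64_DWARF_SP = 31, AARCH64_DWARF_VG
   = 46, AARCH64_DWARF_P0 = 48, AARCH64_DWARF_V0 = 64):  x7 -> 7, sp -> 31,
   v3 -> 67, p2 -> 50, and the SVE vector-granule register VG -> 46.  Any
   other register yields DWARF_FRAME_REGISTERS, i.e. "no DWARF equivalent".  */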
1165 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1166 static bool
1167 aarch64_advsimd_struct_mode_p (machine_mode mode)
1169 return (TARGET_SIMD
1170 && (mode == OImode || mode == CImode || mode == XImode));
1173 /* Return true if MODE is an SVE predicate mode. */
1174 static bool
1175 aarch64_sve_pred_mode_p (machine_mode mode)
1177 return (TARGET_SVE
1178 && (mode == VNx16BImode
1179 || mode == VNx8BImode
1180 || mode == VNx4BImode
1181 || mode == VNx2BImode));
1184 /* Three mutually-exclusive flags describing a vector or predicate type. */
1185 const unsigned int VEC_ADVSIMD = 1;
1186 const unsigned int VEC_SVE_DATA = 2;
1187 const unsigned int VEC_SVE_PRED = 4;
1188 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1189 a structure of 2, 3 or 4 vectors. */
1190 const unsigned int VEC_STRUCT = 8;
1191 /* Useful combinations of the above. */
1192 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1193 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1195 /* Return a set of flags describing the vector properties of mode MODE.
1196 Ignore modes that are not supported by the current target. */
1197 static unsigned int
1198 aarch64_classify_vector_mode (machine_mode mode)
1200 if (aarch64_advsimd_struct_mode_p (mode))
1201 return VEC_ADVSIMD | VEC_STRUCT;
1203 if (aarch64_sve_pred_mode_p (mode))
1204 return VEC_SVE_PRED;
1206 scalar_mode inner = GET_MODE_INNER (mode);
1207 if (VECTOR_MODE_P (mode)
1208 && (inner == QImode
1209 || inner == HImode
1210 || inner == HFmode
1211 || inner == SImode
1212 || inner == SFmode
1213 || inner == DImode
1214 || inner == DFmode))
1216 if (TARGET_SVE)
1218 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1219 return VEC_SVE_DATA;
1220 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1221 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1222 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1223 return VEC_SVE_DATA | VEC_STRUCT;
1226 /* This includes V1DF but not V1DI (which doesn't exist). */
1227 if (TARGET_SIMD
1228 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1229 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1230 return VEC_ADVSIMD;
1233 return 0;
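/* Informally, and assuming the relevant target features are enabled, the
   classification above gives for example:

     V16QImode   -> VEC_ADVSIMD                 (128-bit Advanced SIMD)
     V2SFmode    -> VEC_ADVSIMD                 (64-bit Advanced SIMD)
     OImode      -> VEC_ADVSIMD | VEC_STRUCT    (pair of Q registers)
     VNx4SImode  -> VEC_SVE_DATA                (single SVE data vector)
     VNx8SImode  -> VEC_SVE_DATA | VEC_STRUCT   (two SVE data vectors)
     VNx4BImode  -> VEC_SVE_PRED                (SVE predicate)

   Unsupported modes classify as 0.  */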
1236 /* Return true if MODE is any of the data vector modes, including
1237 structure modes. */
1238 static bool
1239 aarch64_vector_data_mode_p (machine_mode mode)
1241 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1244 /* Return true if MODE is an SVE data vector mode; either a single vector
1245 or a structure of vectors. */
1246 static bool
1247 aarch64_sve_data_mode_p (machine_mode mode)
1249 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1252 /* Implement target hook TARGET_ARRAY_MODE. */
1253 static opt_machine_mode
1254 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1256 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1257 && IN_RANGE (nelems, 2, 4))
1258 return mode_for_vector (GET_MODE_INNER (mode),
1259 GET_MODE_NUNITS (mode) * nelems);
1261 return opt_machine_mode ();
1264 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1265 static bool
1266 aarch64_array_mode_supported_p (machine_mode mode,
1267 unsigned HOST_WIDE_INT nelems)
1269 if (TARGET_SIMD
1270 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1271 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1272 && (nelems >= 2 && nelems <= 4))
1273 return true;
1275 return false;
1278 /* Return the SVE predicate mode to use for elements that have
1279 ELEM_NBYTES bytes, if such a mode exists. */
1281 opt_machine_mode
1282 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1284 if (TARGET_SVE)
1286 if (elem_nbytes == 1)
1287 return VNx16BImode;
1288 if (elem_nbytes == 2)
1289 return VNx8BImode;
1290 if (elem_nbytes == 4)
1291 return VNx4BImode;
1292 if (elem_nbytes == 8)
1293 return VNx2BImode;
1295 return opt_machine_mode ();
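/* In other words, each element size selects the SVE predicate mode that has
   one predicate bit per element byte: VNx16BI for byte elements, VNx8BI for
   halfwords, VNx4BI for words and VNx2BI for doublewords.  For example, a
   predicated operation on VNx4SImode data is governed by a VNx4BImode
   predicate.  */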
1298 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1300 static opt_machine_mode
1301 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1303 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1305 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1306 machine_mode pred_mode;
1307 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1308 return pred_mode;
1311 return default_get_mask_mode (nunits, nbytes);
1314 /* Implement TARGET_HARD_REGNO_NREGS. */
1316 static unsigned int
1317 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1319 /* ??? Logically we should only need to provide a value when
1320 HARD_REGNO_MODE_OK says that the combination is valid,
1321 but at the moment we need to handle all modes. Just ignore
1322 any runtime parts for registers that can't store them. */
1323 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1324 switch (aarch64_regno_regclass (regno))
1326 case FP_REGS:
1327 case FP_LO_REGS:
1328 if (aarch64_sve_data_mode_p (mode))
1329 return exact_div (GET_MODE_SIZE (mode),
1330 BYTES_PER_SVE_VECTOR).to_constant ();
1331 return CEIL (lowest_size, UNITS_PER_VREG);
1332 case PR_REGS:
1333 case PR_LO_REGS:
1334 case PR_HI_REGS:
1335 return 1;
1336 default:
1337 return CEIL (lowest_size, UNITS_PER_WORD);
1339 gcc_unreachable ();
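/* Some illustrative results, assuming 64-bit GP registers and 128-bit
   minimum vector registers:  TImode occupies 2 GP registers but only 1 FP
   register; OImode (an Advanced SIMD pair) occupies 2 FP registers; an SVE
   data mode such as VNx4SImode occupies exactly 1 FP register regardless of
   the runtime vector length; and any predicate mode occupies 1 predicate
   register.  */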
1342 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1344 static bool
1345 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1347 if (GET_MODE_CLASS (mode) == MODE_CC)
1348 return regno == CC_REGNUM;
1350 if (regno == VG_REGNUM)
1351 /* This must have the same size as _Unwind_Word. */
1352 return mode == DImode;
1354 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1355 if (vec_flags & VEC_SVE_PRED)
1356 return PR_REGNUM_P (regno);
1358 if (PR_REGNUM_P (regno))
1359 return 0;
1361 if (regno == SP_REGNUM)
1362 /* The purpose of comparing with ptr_mode is to support the
1363 global register variable associated with the stack pointer
1364 register via the syntax of asm ("wsp") in ILP32. */
1365 return mode == Pmode || mode == ptr_mode;
1367 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1368 return mode == Pmode;
1370 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1371 return true;
1373 if (FP_REGNUM_P (regno))
1375 if (vec_flags & VEC_STRUCT)
1376 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1377 else
1378 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1381 return false;
1384 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1385 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1386 clobbers the top 64 bits when restoring the bottom 64 bits. */
1388 static bool
1389 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1391 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
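/* For example, a TFmode or V4SImode value (16 bytes) live in v8 across a
   call is only partially preserved, because the callee is only required to
   save the low 64 bits of v8-v15; a DFmode value in the same register is
   fully preserved.  */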
1394 /* Implement REGMODE_NATURAL_SIZE. */
1395 poly_uint64
1396 aarch64_regmode_natural_size (machine_mode mode)
1398 /* The natural size for SVE data modes is one SVE data vector,
1399 and similarly for predicates. We can't independently modify
1400 anything smaller than that. */
1401 /* ??? For now, only do this for variable-width SVE registers.
1402 Doing it for constant-sized registers breaks lower-subreg.c. */
1403 /* ??? And once that's fixed, we should probably have similar
1404 code for Advanced SIMD. */
1405 if (!aarch64_sve_vg.is_constant ())
1407 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1408 if (vec_flags & VEC_SVE_PRED)
1409 return BYTES_PER_SVE_PRED;
1410 if (vec_flags & VEC_SVE_DATA)
1411 return BYTES_PER_SVE_VECTOR;
1413 return UNITS_PER_WORD;
1416 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1417 machine_mode
1418 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1419 machine_mode mode)
1421 /* The predicate mode determines which bits are significant and
1422 which are "don't care". Decreasing the number of lanes would
1423 lose data while increasing the number of lanes would make bits
1424 unnecessarily significant. */
1425 if (PR_REGNUM_P (regno))
1426 return mode;
1427 if (known_ge (GET_MODE_SIZE (mode), 4))
1428 return mode;
1429 else
1430 return SImode;
1433 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1434 that strcpy from constants will be faster. */
1436 static HOST_WIDE_INT
1437 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1439 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1440 return MAX (align, BITS_PER_WORD);
1441 return align;
1444 /* Return true if calls to DECL should be treated as
1445 long-calls (i.e. called via a register). */
1446 static bool
1447 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1449 return false;
1452 /* Return true if calls to symbol-ref SYM should be treated as
1453 long-calls (i.e. called via a register). */
1454 bool
1455 aarch64_is_long_call_p (rtx sym)
1457 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1460 /* Return true if calls to symbol-ref SYM should not go through
1461 plt stubs. */
1463 bool
1464 aarch64_is_noplt_call_p (rtx sym)
1466 const_tree decl = SYMBOL_REF_DECL (sym);
1468 if (flag_pic
1469 && decl
1470 && (!flag_plt
1471 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1472 && !targetm.binds_local_p (decl))
1473 return true;
1475 return false;
1478 /* Return true if the offsets to a zero/sign-extract operation
1479 represent an expression that matches an extend operation. The
1480 operands represent the parameters from
1482 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1483 bool
1484 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1485 rtx extract_imm)
1487 HOST_WIDE_INT mult_val, extract_val;
1489 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1490 return false;
1492 mult_val = INTVAL (mult_imm);
1493 extract_val = INTVAL (extract_imm);
1495 if (extract_val > 8
1496 && extract_val < GET_MODE_BITSIZE (mode)
1497 && exact_log2 (extract_val & ~7) > 0
1498 && (extract_val & 7) <= 4
1499 && mult_val == (1 << (extract_val & 7)))
1500 return true;
1502 return false;
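/* A worked example:  for (zero_extract:DI (mult (reg) (const_int 4))
   (const_int 34) (const_int 0)) we have extract_val == 34, so
   extract_val & ~7 == 32 (a power of two), extract_val & 7 == 2, and the
   test requires mult_val == 1 << 2.  That corresponds to zero-extending a
   32-bit value and shifting it left by 2, e.g. a "uxtw #2" scaled index.  */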
1505 /* Emit an insn that's a simple single-set. Both the operands must be
1506 known to be valid. */
1507 inline static rtx_insn *
1508 emit_set_insn (rtx x, rtx y)
1510 return emit_insn (gen_rtx_SET (x, y));
1513 /* X and Y are two things to compare using CODE. Emit the compare insn and
1514 return the rtx for register 0 in the proper mode. */
1515 rtx
1516 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1518 machine_mode mode = SELECT_CC_MODE (code, x, y);
1519 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1521 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1522 return cc_reg;
1525 /* Build the SYMBOL_REF for __tls_get_addr. */
1527 static GTY(()) rtx tls_get_addr_libfunc;
1529 rtx
1530 aarch64_tls_get_addr (void)
1532 if (!tls_get_addr_libfunc)
1533 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1534 return tls_get_addr_libfunc;
1537 /* Return the TLS model to use for ADDR. */
1539 static enum tls_model
1540 tls_symbolic_operand_type (rtx addr)
1542 enum tls_model tls_kind = TLS_MODEL_NONE;
1543 if (GET_CODE (addr) == CONST)
1545 poly_int64 addend;
1546 rtx sym = strip_offset (addr, &addend);
1547 if (GET_CODE (sym) == SYMBOL_REF)
1548 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1550 else if (GET_CODE (addr) == SYMBOL_REF)
1551 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1553 return tls_kind;
1556 /* We'll allow lo_sum's in our legitimate addresses so that combine
1557 can take care of combining addresses where necessary, but for code
1558 generation purposes we'll generate the address
1559 as:
1560 RTL Absolute
1561 tmp = hi (symbol_ref); adrp x1, foo
1562 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1565 PIC TLS
1566 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1567 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1568 bl __tls_get_addr
1571 Load TLS symbol, depending on TLS mechanism and TLS access model.
1573 Global Dynamic - Traditional TLS:
1574 adrp tmp, :tlsgd:imm
1575 add dest, tmp, #:tlsgd_lo12:imm
1576 bl __tls_get_addr
1578 Global Dynamic - TLS Descriptors:
1579 adrp dest, :tlsdesc:imm
1580 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1581 add dest, dest, #:tlsdesc_lo12:imm
1582 blr tmp
1583 mrs tp, tpidr_el0
1584 add dest, dest, tp
1586 Initial Exec:
1587 mrs tp, tpidr_el0
1588 adrp tmp, :gottprel:imm
1589 ldr dest, [tmp, #:gottprel_lo12:imm]
1590 add dest, dest, tp
1592 Local Exec:
1593 mrs tp, tpidr_el0
1594 add t0, tp, #:tprel_hi12:imm, lsl #12
1595 add t0, t0, #:tprel_lo12_nc:imm
1598 static void
1599 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1600 enum aarch64_symbol_type type)
1602 switch (type)
1604 case SYMBOL_SMALL_ABSOLUTE:
1606 /* In ILP32, the mode of dest can be either SImode or DImode. */
1607 rtx tmp_reg = dest;
1608 machine_mode mode = GET_MODE (dest);
1610 gcc_assert (mode == Pmode || mode == ptr_mode);
1612 if (can_create_pseudo_p ())
1613 tmp_reg = gen_reg_rtx (mode);
1615 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1616 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1617 return;
1620 case SYMBOL_TINY_ABSOLUTE:
1621 emit_insn (gen_rtx_SET (dest, imm));
1622 return;
1624 case SYMBOL_SMALL_GOT_28K:
1626 machine_mode mode = GET_MODE (dest);
1627 rtx gp_rtx = pic_offset_table_rtx;
1628 rtx insn;
1629 rtx mem;
1631 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1632 here before rtl expansion. Tree IVOPTS will generate an rtl pattern
1633 to decide rtx costs, in which case pic_offset_table_rtx is not
1634 initialized. In that case there is no need to generate the first
1635 adrp instruction, as the final cost for global variable access is
1636 one instruction.
1637 if (gp_rtx != NULL)
1639 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1640 use the page base as the GOT base, the first page may be wasted;
1641 in the worst case only 28K of space is left for the GOT).
1643 The generated instruction sequence for accessing a global variable is:
1646 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1648 Only one instruction is needed. But we must initialize
1649 pic_offset_table_rtx properly. We generate an initialization insn
1650 for every global access, and allow CSE to remove the redundant ones.
1652 The final instruction sequence will look like the following
1653 for multiple global variable accesses.
1655 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1657 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1658 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1659 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1660 ... */
1662 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1663 crtl->uses_pic_offset_table = 1;
1664 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1666 if (mode != GET_MODE (gp_rtx))
1667 gp_rtx = gen_lowpart (mode, gp_rtx);
1671 if (mode == ptr_mode)
1673 if (mode == DImode)
1674 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1675 else
1676 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1678 mem = XVECEXP (SET_SRC (insn), 0, 0);
1680 else
1682 gcc_assert (mode == Pmode);
1684 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1685 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1688 /* The operand is expected to be a MEM. Whenever the related insn
1689 pattern changes, the above code which calculates mem should be
1690 updated. */
1691 gcc_assert (GET_CODE (mem) == MEM);
1692 MEM_READONLY_P (mem) = 1;
1693 MEM_NOTRAP_P (mem) = 1;
1694 emit_insn (insn);
1695 return;
1698 case SYMBOL_SMALL_GOT_4G:
1700 /* In ILP32, the mode of dest can be either SImode or DImode,
1701 while the got entry is always of SImode size. The mode of
1702 dest depends on how dest is used: if dest is assigned to a
1703 pointer (e.g. in the memory), it has SImode; it may have
1704 DImode if dest is dereferenced to access the memory.
1705 This is why we have to handle three different ldr_got_small
1706 patterns here (two patterns for ILP32). */
1708 rtx insn;
1709 rtx mem;
1710 rtx tmp_reg = dest;
1711 machine_mode mode = GET_MODE (dest);
1713 if (can_create_pseudo_p ())
1714 tmp_reg = gen_reg_rtx (mode);
1716 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1717 if (mode == ptr_mode)
1719 if (mode == DImode)
1720 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1721 else
1722 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1724 mem = XVECEXP (SET_SRC (insn), 0, 0);
1726 else
1728 gcc_assert (mode == Pmode);
1730 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1731 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1734 gcc_assert (GET_CODE (mem) == MEM);
1735 MEM_READONLY_P (mem) = 1;
1736 MEM_NOTRAP_P (mem) = 1;
1737 emit_insn (insn);
1738 return;
1741 case SYMBOL_SMALL_TLSGD:
1743 rtx_insn *insns;
1744 machine_mode mode = GET_MODE (dest);
1745 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1747 start_sequence ();
1748 if (TARGET_ILP32)
1749 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1750 else
1751 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1752 insns = get_insns ();
1753 end_sequence ();
1755 RTL_CONST_CALL_P (insns) = 1;
1756 emit_libcall_block (insns, dest, result, imm);
1757 return;
1760 case SYMBOL_SMALL_TLSDESC:
1762 machine_mode mode = GET_MODE (dest);
1763 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1764 rtx tp;
1766 gcc_assert (mode == Pmode || mode == ptr_mode);
1768 /* In ILP32, the got entry is always of SImode size. Unlike
1769 small GOT, the dest is fixed at reg 0. */
1770 if (TARGET_ILP32)
1771 emit_insn (gen_tlsdesc_small_si (imm));
1772 else
1773 emit_insn (gen_tlsdesc_small_di (imm));
1774 tp = aarch64_load_tp (NULL);
1776 if (mode != Pmode)
1777 tp = gen_lowpart (mode, tp);
1779 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1780 if (REG_P (dest))
1781 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1782 return;
1785 case SYMBOL_SMALL_TLSIE:
1787 /* In ILP32, the mode of dest can be either SImode or DImode,
1788 while the got entry is always of SImode size. The mode of
1789 dest depends on how dest is used: if dest is assigned to a
1790 pointer (e.g. in the memory), it has SImode; it may have
1791 DImode if dest is dereferenced to access the memory.
1792 This is why we have to handle three different tlsie_small
1793 patterns here (two patterns for ILP32). */
1794 machine_mode mode = GET_MODE (dest);
1795 rtx tmp_reg = gen_reg_rtx (mode);
1796 rtx tp = aarch64_load_tp (NULL);
1798 if (mode == ptr_mode)
1800 if (mode == DImode)
1801 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1802 else
1804 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1805 tp = gen_lowpart (mode, tp);
1808 else
1810 gcc_assert (mode == Pmode);
1811 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1814 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1815 if (REG_P (dest))
1816 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1817 return;
1820 case SYMBOL_TLSLE12:
1821 case SYMBOL_TLSLE24:
1822 case SYMBOL_TLSLE32:
1823 case SYMBOL_TLSLE48:
1825 machine_mode mode = GET_MODE (dest);
1826 rtx tp = aarch64_load_tp (NULL);
1828 if (mode != Pmode)
1829 tp = gen_lowpart (mode, tp);
1831 switch (type)
1833 case SYMBOL_TLSLE12:
1834 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1835 (dest, tp, imm));
1836 break;
1837 case SYMBOL_TLSLE24:
1838 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1839 (dest, tp, imm));
1840 break;
1841 case SYMBOL_TLSLE32:
1842 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1843 (dest, imm));
1844 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1845 (dest, dest, tp));
1846 break;
1847 case SYMBOL_TLSLE48:
1848 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1849 (dest, imm));
1850 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1851 (dest, dest, tp));
1852 break;
1853 default:
1854 gcc_unreachable ();
1857 if (REG_P (dest))
1858 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1859 return;
1862 case SYMBOL_TINY_GOT:
1863 emit_insn (gen_ldr_got_tiny (dest, imm));
1864 return;
1866 case SYMBOL_TINY_TLSIE:
1868 machine_mode mode = GET_MODE (dest);
1869 rtx tp = aarch64_load_tp (NULL);
1871 if (mode == ptr_mode)
1873 if (mode == DImode)
1874 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1875 else
1877 tp = gen_lowpart (mode, tp);
1878 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1881 else
1883 gcc_assert (mode == Pmode);
1884 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1887 if (REG_P (dest))
1888 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1889 return;
1892 default:
1893 gcc_unreachable ();
1897 /* Emit a move from SRC to DEST. Assume that the move expanders can
1898 handle all moves if !can_create_pseudo_p (). The distinction is
1899 important because, unlike emit_move_insn, the move expanders know
1900 how to force Pmode objects into the constant pool even when the
1901 constant pool address is not itself legitimate. */
1902 static rtx
1903 aarch64_emit_move (rtx dest, rtx src)
1905 return (can_create_pseudo_p ()
1906 ? emit_move_insn (dest, src)
1907 : emit_move_insn_1 (dest, src));
1910 /* Apply UNOPTAB to OP and store the result in DEST. */
1912 static void
1913 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1915 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1916 if (dest != tmp)
1917 emit_move_insn (dest, tmp);
1920 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1922 static void
1923 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1925 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1926 OPTAB_DIRECT);
1927 if (dest != tmp)
1928 emit_move_insn (dest, tmp);
1931 /* Split a 128-bit move operation into two 64-bit move operations,
1932 taking care to handle partial overlap of register to register
1933 copies. Special cases are needed when moving between GP regs and
1934 FP regs. SRC can be a register, constant or memory; DST a register
1935 or memory. If either operand is memory it must not have any side
1936 effects. */
1937 void
1938 aarch64_split_128bit_move (rtx dst, rtx src)
1940 rtx dst_lo, dst_hi;
1941 rtx src_lo, src_hi;
1943 machine_mode mode = GET_MODE (dst);
1945 gcc_assert (mode == TImode || mode == TFmode);
1946 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1947 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1949 if (REG_P (dst) && REG_P (src))
1951 int src_regno = REGNO (src);
1952 int dst_regno = REGNO (dst);
1954 /* Handle FP <-> GP regs. */
1955 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1957 src_lo = gen_lowpart (word_mode, src);
1958 src_hi = gen_highpart (word_mode, src);
1960 if (mode == TImode)
1962 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1963 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1965 else
1967 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1968 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1970 return;
1972 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1974 dst_lo = gen_lowpart (word_mode, dst);
1975 dst_hi = gen_highpart (word_mode, dst);
1977 if (mode == TImode)
1979 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1980 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1982 else
1984 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1985 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1987 return;
1991 dst_lo = gen_lowpart (word_mode, dst);
1992 dst_hi = gen_highpart (word_mode, dst);
1993 src_lo = gen_lowpart (word_mode, src);
1994 src_hi = gen_highpart_mode (word_mode, mode, src);
1996 /* At most one pairing may overlap. */
1997 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1999 aarch64_emit_move (dst_hi, src_hi);
2000 aarch64_emit_move (dst_lo, src_lo);
2002 else
2004 aarch64_emit_move (dst_lo, src_lo);
2005 aarch64_emit_move (dst_hi, src_hi);
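/* Return true if a 128-bit move from SRC to DST needs to be split into
   two 64-bit moves, i.e. unless SRC is a register and both operands are
   FP registers, in which case a single 128-bit register move suffices.  */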
2009 bool
2010 aarch64_split_128bit_move_p (rtx dst, rtx src)
2012 return (! REG_P (src)
2013 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2016 /* Split a complex SIMD combine. */
2018 void
2019 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2021 machine_mode src_mode = GET_MODE (src1);
2022 machine_mode dst_mode = GET_MODE (dst);
2024 gcc_assert (VECTOR_MODE_P (dst_mode));
2025 gcc_assert (register_operand (dst, dst_mode)
2026 && register_operand (src1, src_mode)
2027 && register_operand (src2, src_mode));
2029 rtx (*gen) (rtx, rtx, rtx);
2031 switch (src_mode)
2033 case E_V8QImode:
2034 gen = gen_aarch64_simd_combinev8qi;
2035 break;
2036 case E_V4HImode:
2037 gen = gen_aarch64_simd_combinev4hi;
2038 break;
2039 case E_V2SImode:
2040 gen = gen_aarch64_simd_combinev2si;
2041 break;
2042 case E_V4HFmode:
2043 gen = gen_aarch64_simd_combinev4hf;
2044 break;
2045 case E_V2SFmode:
2046 gen = gen_aarch64_simd_combinev2sf;
2047 break;
2048 case E_DImode:
2049 gen = gen_aarch64_simd_combinedi;
2050 break;
2051 case E_DFmode:
2052 gen = gen_aarch64_simd_combinedf;
2053 break;
2054 default:
2055 gcc_unreachable ();
2058 emit_insn (gen (dst, src1, src2));
2059 return;
2062 /* Split a complex SIMD move. */
2064 void
2065 aarch64_split_simd_move (rtx dst, rtx src)
2067 machine_mode src_mode = GET_MODE (src);
2068 machine_mode dst_mode = GET_MODE (dst);
2070 gcc_assert (VECTOR_MODE_P (dst_mode));
2072 if (REG_P (dst) && REG_P (src))
2074 rtx (*gen) (rtx, rtx);
2076 gcc_assert (VECTOR_MODE_P (src_mode));
2078 switch (src_mode)
2080 case E_V16QImode:
2081 gen = gen_aarch64_split_simd_movv16qi;
2082 break;
2083 case E_V8HImode:
2084 gen = gen_aarch64_split_simd_movv8hi;
2085 break;
2086 case E_V4SImode:
2087 gen = gen_aarch64_split_simd_movv4si;
2088 break;
2089 case E_V2DImode:
2090 gen = gen_aarch64_split_simd_movv2di;
2091 break;
2092 case E_V8HFmode:
2093 gen = gen_aarch64_split_simd_movv8hf;
2094 break;
2095 case E_V4SFmode:
2096 gen = gen_aarch64_split_simd_movv4sf;
2097 break;
2098 case E_V2DFmode:
2099 gen = gen_aarch64_split_simd_movv2df;
2100 break;
2101 default:
2102 gcc_unreachable ();
2105 emit_insn (gen (dst, src));
2106 return;
2110 bool
2111 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2112 machine_mode ymode, rtx y)
2114 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2115 gcc_assert (r != NULL);
2116 return rtx_equal_p (x, r);
2120 static rtx
2121 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2123 if (can_create_pseudo_p ())
2124 return force_reg (mode, value);
2125 else
2127 gcc_assert (x);
2128 aarch64_emit_move (x, value);
2129 return x;
2133 /* Return true if we can move VALUE into a register using a single
2134 CNT[BHWD] instruction. */
2136 static bool
2137 aarch64_sve_cnt_immediate_p (poly_int64 value)
2139 HOST_WIDE_INT factor = value.coeffs[0];
2140 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2141 return (value.coeffs[1] == factor
2142 && IN_RANGE (factor, 2, 16 * 16)
2143 && (factor & 1) == 0
2144 && factor <= 16 * (factor & -factor));
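/* As a concrete illustration of the test above: CNTD produces the value
   (2, 2), so (6, 6) is accepted (CNTD with MUL #3) and (256, 256) is
   accepted (CNTB with MUL #16), but (34, 34) is rejected because it
   would need a multiplier of 17.  */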
2147 /* Likewise for rtx X. */
2149 bool
2150 aarch64_sve_cnt_immediate_p (rtx x)
2152 poly_int64 value;
2153 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2156 /* Return the asm string for an instruction with a CNT-like vector size
2157 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2158 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2159 first part of the operands template (the part that comes before the
2160 vector size itself). FACTOR is the count per 128-bit quadword.
2161 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2162 If it is zero, we can use any element size. */
2164 static char *
2165 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2166 unsigned int factor,
2167 unsigned int nelts_per_vq)
2169 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2171 if (nelts_per_vq == 0)
2172 /* There is some overlap in the ranges of the four CNT instructions.
2173 Here we always use the smallest possible element size, so that the
2174 multiplier is 1 wherever possible. */
2175 nelts_per_vq = factor & -factor;
2176 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2177 gcc_assert (IN_RANGE (shift, 1, 4));
2178 char suffix = "dwhb"[shift - 1];
2180 factor >>= shift;
2181 unsigned int written;
2182 if (factor == 1)
2183 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2184 prefix, suffix, operands);
2185 else
2186 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2187 prefix, suffix, operands, factor);
2188 gcc_assert (written < sizeof (buffer));
2189 return buffer;
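/* For example, with PREFIX "cnt" and OPERANDS "%x0", a FACTOR of 2 and
   NELTS_PER_VQ of 0 produce "cntd\t%x0", while a FACTOR of 32 produces
   "cntb\t%x0, all, mul #2"; the exact operand text comes from the caller.  */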
2192 /* Return the asm string for an instruction with a CNT-like vector size
2193 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2194 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2195 first part of the operands template (the part that comes before the
2196 vector size itself). X is the value of the vector size operand,
2197 as a polynomial integer rtx. */
2199 char *
2200 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2201 rtx x)
2203 poly_int64 value = rtx_to_poly_int64 (x);
2204 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2205 return aarch64_output_sve_cnt_immediate (prefix, operands,
2206 value.coeffs[1], 0);
2209 /* Return true if we can add VALUE to a register using a single ADDVL
2210 or ADDPL instruction. */
2212 static bool
2213 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2215 HOST_WIDE_INT factor = value.coeffs[0];
2216 if (factor == 0 || value.coeffs[1] != factor)
2217 return false;
2218 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2219 and a value of 16 is one vector width. */
2220 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2221 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
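/* For example, (16, 16) -- one full vector length -- can be added with
   "addvl ..., #1", (2, 2) -- one predicate length -- with "addpl ..., #1",
   and (8, 8) with "addpl ..., #4".  */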
2224 /* Likewise for rtx X. */
2226 bool
2227 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2229 poly_int64 value;
2230 return (poly_int_rtx_p (x, &value)
2231 && aarch64_sve_addvl_addpl_immediate_p (value));
2234 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2235 and storing the result in operand 0. */
2237 char *
2238 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2240 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2241 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2242 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2244 /* Use INC or DEC if possible. */
2245 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2247 if (aarch64_sve_cnt_immediate_p (offset_value))
2248 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2249 offset_value.coeffs[1], 0);
2250 if (aarch64_sve_cnt_immediate_p (-offset_value))
2251 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2252 -offset_value.coeffs[1], 0);
2255 int factor = offset_value.coeffs[1];
2256 if ((factor & 15) == 0)
2257 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2258 else
2259 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2260 return buffer;
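/* For example, if DEST and BASE are the same GP register and OFFSET is
   (32, 32), the INC form above is used and the result is
   "incb\t%x0, all, mul #2"; otherwise the same offset would be printed
   as "addvl\t%x0, %x1, #2".  */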
2263 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2264 instruction. If it is, store the number of elements in each vector
2265 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2266 factor in *FACTOR_OUT (if nonnull). */
2268 bool
2269 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2270 unsigned int *nelts_per_vq_out)
2272 rtx elt;
2273 poly_int64 value;
2275 if (!const_vec_duplicate_p (x, &elt)
2276 || !poly_int_rtx_p (elt, &value))
2277 return false;
2279 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2280 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2281 /* There's no vector INCB. */
2282 return false;
2284 HOST_WIDE_INT factor = value.coeffs[0];
2285 if (value.coeffs[1] != factor)
2286 return false;
2288 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2289 if ((factor % nelts_per_vq) != 0
2290 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2291 return false;
2293 if (factor_out)
2294 *factor_out = factor;
2295 if (nelts_per_vq_out)
2296 *nelts_per_vq_out = nelts_per_vq;
2297 return true;
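/* For example, for a VNx4SI constant (four 32-bit elements per quadword),
   a duplicated (4, 4) is a valid INCW/DECW immediate with multiplier 1
   and (64, 64) is valid with multiplier 16, but (68, 68) is rejected.  */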
2300 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2301 instruction. */
2303 bool
2304 aarch64_sve_inc_dec_immediate_p (rtx x)
2306 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2309 /* Return the asm template for an SVE vector INC or DEC instruction.
2310 OPERANDS gives the operands before the vector count and X is the
2311 value of the vector count operand itself. */
2313 char *
2314 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2316 int factor;
2317 unsigned int nelts_per_vq;
2318 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2319 gcc_unreachable ();
2320 if (factor < 0)
2321 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2322 nelts_per_vq);
2323 else
2324 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2325 nelts_per_vq);
2328 static int
2329 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2330 scalar_int_mode mode)
2332 int i;
2333 unsigned HOST_WIDE_INT val, val2, mask;
2334 int one_match, zero_match;
2335 int num_insns;
2337 val = INTVAL (imm);
2339 if (aarch64_move_imm (val, mode))
2341 if (generate)
2342 emit_insn (gen_rtx_SET (dest, imm));
2343 return 1;
2346 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2347 (with XXXX non-zero). In that case check to see if the move can be done in
2348 a smaller mode. */
2349 val2 = val & 0xffffffff;
2350 if (mode == DImode
2351 && aarch64_move_imm (val2, SImode)
2352 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2354 if (generate)
2355 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2357 /* Check if we have to emit a second instruction by checking to see
2358 if any of the upper 32 bits of the original DI mode value is set. */
2359 if (val == val2)
2360 return 1;
2362 i = (val >> 48) ? 48 : 32;
2364 if (generate)
2365 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2366 GEN_INT ((val >> i) & 0xffff)));
2368 return 2;
2371 if ((val >> 32) == 0 || mode == SImode)
2373 if (generate)
2375 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2376 if (mode == SImode)
2377 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2378 GEN_INT ((val >> 16) & 0xffff)));
2379 else
2380 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2381 GEN_INT ((val >> 16) & 0xffff)));
2383 return 2;
2386 /* Remaining cases are all for DImode. */
2388 mask = 0xffff;
2389 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2390 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2391 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2392 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2394 if (zero_match != 2 && one_match != 2)
2396 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2397 For a 64-bit bitmask try whether changing 16 bits to all ones or
2398 zeroes creates a valid bitmask. To check any repeated bitmask,
2399 try using 16 bits from the other 32-bit half of val. */
2401 for (i = 0; i < 64; i += 16, mask <<= 16)
2403 val2 = val & ~mask;
2404 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2405 break;
2406 val2 = val | mask;
2407 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2408 break;
2409 val2 = val2 & ~mask;
2410 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2411 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2412 break;
2414 if (i != 64)
2416 if (generate)
2418 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2419 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2420 GEN_INT ((val >> i) & 0xffff)));
2422 return 2;
2426 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2427 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2428 otherwise skip zero bits. */
2430 num_insns = 1;
2431 mask = 0xffff;
2432 val2 = one_match > zero_match ? ~val : val;
2433 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2435 if (generate)
2436 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2437 ? (val | ~(mask << i))
2438 : (val & (mask << i)))));
2439 for (i += 16; i < 64; i += 16)
2441 if ((val2 & (mask << i)) == 0)
2442 continue;
2443 if (generate)
2444 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2445 GEN_INT ((val >> i) & 0xffff)));
2446 num_insns ++;
2449 return num_insns;
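/* As a worked example of the above: the DImode constant
   0x1234000000005678 is not a single MOV/MOVN/bitmask immediate, but its
   low 32 bits are, and bits [47:32] are zero, so it is built with two
   instructions, roughly "mov dest, 0x5678" then "movk dest, 0x1234, lsl 48".  */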
2452 /* Return whether imm is a 128-bit immediate which is simple enough to
2453 expand inline. */
2454 bool
2455 aarch64_mov128_immediate (rtx imm)
2457 if (GET_CODE (imm) == CONST_INT)
2458 return true;
2460 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2462 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2463 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2465 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2466 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
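/* For example, a TImode constant whose high half is all ones and whose
   low half is 0x1234 needs only 1 + 1 instructions and is expanded
   inline, whereas one needing four MOV/MOVKs per half (4 + 4 > 4) is not.  */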
2470 /* Return the number of temporary registers that aarch64_add_offset_1
2471 would need to add OFFSET to a register. */
2473 static unsigned int
2474 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2476 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2479 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2480 a non-polynomial OFFSET. MODE is the mode of the addition.
2481 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2482 be set and CFA adjustments added to the generated instructions.
2484 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2485 temporary if register allocation is already complete. This temporary
2486 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2487 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2488 the immediate again.
2490 Since this function may be used to adjust the stack pointer, we must
2491 ensure that it cannot cause transient stack deallocation (for example
2492 by first incrementing SP and then decrementing when adjusting by a
2493 large immediate). */
2495 static void
2496 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2497 rtx src, HOST_WIDE_INT offset, rtx temp1,
2498 bool frame_related_p, bool emit_move_imm)
2500 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2501 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2503 HOST_WIDE_INT moffset = abs_hwi (offset);
2504 rtx_insn *insn;
2506 if (!moffset)
2508 if (!rtx_equal_p (dest, src))
2510 insn = emit_insn (gen_rtx_SET (dest, src));
2511 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2513 return;
2516 /* Single instruction adjustment. */
2517 if (aarch64_uimm12_shift (moffset))
2519 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2520 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2521 return;
2524 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2525 and either:
2527 a) the offset cannot be loaded by a 16-bit move or
2528 b) there is no spare register into which we can move it. */
2529 if (moffset < 0x1000000
2530 && ((!temp1 && !can_create_pseudo_p ())
2531 || !aarch64_move_imm (moffset, mode)))
2533 HOST_WIDE_INT low_off = moffset & 0xfff;
2535 low_off = offset < 0 ? -low_off : low_off;
2536 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2537 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2538 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2539 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2540 return;
2543 /* Emit a move immediate if required and an addition/subtraction. */
2544 if (emit_move_imm)
2546 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2547 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2549 insn = emit_insn (offset < 0
2550 ? gen_sub3_insn (dest, src, temp1)
2551 : gen_add3_insn (dest, src, temp1));
2552 if (frame_related_p)
2554 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2555 rtx adj = plus_constant (mode, src, offset);
2556 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
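/* For example, an adjustment of 0x123456 (not representable as a move
   immediate or a shifted 12-bit add immediate) is emitted as the pair
   "add dest, src, #0x456" followed by "add dest, dest, #0x123000",
   both in the same direction so the stack is never transiently
   deallocated.  */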
2560 /* Return the number of temporary registers that aarch64_add_offset
2561 would need to move OFFSET into a register or add OFFSET to a register;
2562 ADD_P is true if we want the latter rather than the former. */
2564 static unsigned int
2565 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2567 /* This follows the same structure as aarch64_add_offset. */
2568 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2569 return 0;
2571 unsigned int count = 0;
2572 HOST_WIDE_INT factor = offset.coeffs[1];
2573 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2574 poly_int64 poly_offset (factor, factor);
2575 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2576 /* Need one register for the ADDVL/ADDPL result. */
2577 count += 1;
2578 else if (factor != 0)
2580 factor = abs (factor);
2581 if (factor > 16 * (factor & -factor))
2582 /* Need one register for the CNT result and one for the multiplication
2583 factor. If necessary, the second temporary can be reused for the
2584 constant part of the offset. */
2585 return 2;
2586 /* Need one register for the CNT result (which might then
2587 be shifted). */
2588 count += 1;
2590 return count + aarch64_add_offset_1_temporaries (constant);
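/* For example, adding (16, 16) to a register needs no temporaries (a
   single ADDVL), whereas moving it into a register (ADD_P false) is
   counted as needing one temporary for the CNT result.  */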
2593 /* If X can be represented as a poly_int64, return the number
2594 of temporaries that are required to add it to a register.
2595 Return -1 otherwise. */
2598 aarch64_add_offset_temporaries (rtx x)
2600 poly_int64 offset;
2601 if (!poly_int_rtx_p (x, &offset))
2602 return -1;
2603 return aarch64_offset_temporaries (true, offset);
2606 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2607 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2608 be set and CFA adjustments added to the generated instructions.
2610 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2611 temporary if register allocation is already complete. This temporary
2612 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2613 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2614 false to avoid emitting the immediate again.
2616 TEMP2, if nonnull, is a second temporary register that doesn't
2617 overlap either DEST or REG.
2619 Since this function may be used to adjust the stack pointer, we must
2620 ensure that it cannot cause transient stack deallocation (for example
2621 by first incrementing SP and then decrementing when adjusting by a
2622 large immediate). */
2624 static void
2625 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2626 poly_int64 offset, rtx temp1, rtx temp2,
2627 bool frame_related_p, bool emit_move_imm = true)
2629 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2630 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2631 gcc_assert (temp1 == NULL_RTX
2632 || !frame_related_p
2633 || !reg_overlap_mentioned_p (temp1, dest));
2634 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2636 /* Try using ADDVL or ADDPL to add the whole value. */
2637 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2639 rtx offset_rtx = gen_int_mode (offset, mode);
2640 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2641 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2642 return;
2645 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2646 SVE vector register, over and above the minimum size of 128 bits.
2647 This is equivalent to half the value returned by CNTD with a
2648 vector shape of ALL. */
2649 HOST_WIDE_INT factor = offset.coeffs[1];
2650 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2652 /* Try using ADDVL or ADDPL to add the VG-based part. */
2653 poly_int64 poly_offset (factor, factor);
2654 if (src != const0_rtx
2655 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2657 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2658 if (frame_related_p)
2660 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2661 RTX_FRAME_RELATED_P (insn) = true;
2662 src = dest;
2664 else
2666 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2667 src = aarch64_force_temporary (mode, temp1, addr);
2668 temp1 = temp2;
2669 temp2 = NULL_RTX;
2672 /* Otherwise use a CNT-based sequence. */
2673 else if (factor != 0)
2675 /* Use a subtraction if we have a negative factor. */
2676 rtx_code code = PLUS;
2677 if (factor < 0)
2679 factor = -factor;
2680 code = MINUS;
2683 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2684 into the multiplication. */
2685 rtx val;
2686 int shift = 0;
2687 if (factor & 1)
2688 /* Use a right shift by 1. */
2689 shift = -1;
2690 else
2691 factor /= 2;
2692 HOST_WIDE_INT low_bit = factor & -factor;
2693 if (factor <= 16 * low_bit)
2695 if (factor > 16 * 8)
2697 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2698 the value with the minimum multiplier and shift it into
2699 position. */
2700 int extra_shift = exact_log2 (low_bit);
2701 shift += extra_shift;
2702 factor >>= extra_shift;
2704 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2706 else
2708 /* Use CNTD, then multiply it by FACTOR. */
2709 val = gen_int_mode (poly_int64 (2, 2), mode);
2710 val = aarch64_force_temporary (mode, temp1, val);
2712 /* Go back to using a negative multiplication factor if we have
2713 no register from which to subtract. */
2714 if (code == MINUS && src == const0_rtx)
2716 factor = -factor;
2717 code = PLUS;
2719 rtx coeff1 = gen_int_mode (factor, mode);
2720 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2721 val = gen_rtx_MULT (mode, val, coeff1);
2724 if (shift > 0)
2726 /* Multiply by 1 << SHIFT. */
2727 val = aarch64_force_temporary (mode, temp1, val);
2728 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2730 else if (shift == -1)
2732 /* Divide by 2. */
2733 val = aarch64_force_temporary (mode, temp1, val);
2734 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2737 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2738 if (src != const0_rtx)
2740 val = aarch64_force_temporary (mode, temp1, val);
2741 val = gen_rtx_fmt_ee (code, mode, src, val);
2743 else if (code == MINUS)
2745 val = aarch64_force_temporary (mode, temp1, val);
2746 val = gen_rtx_NEG (mode, val);
2749 if (constant == 0 || frame_related_p)
2751 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2752 if (frame_related_p)
2754 RTX_FRAME_RELATED_P (insn) = true;
2755 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2756 gen_rtx_SET (dest, plus_constant (Pmode, src,
2757 poly_offset)));
2759 src = dest;
2760 if (constant == 0)
2761 return;
2763 else
2765 src = aarch64_force_temporary (mode, temp1, val);
2766 temp1 = temp2;
2767 temp2 = NULL_RTX;
2770 emit_move_imm = true;
2773 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2774 frame_related_p, emit_move_imm);
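/* As an example of the decomposition above: an offset of (0, 16),
   i.e. VL - 16 bytes, has FACTOR 16 and CONSTANT -16, so it is emitted
   as an ADDVL #1 followed by a subtraction of 16.  */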
2777 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2778 than a poly_int64. */
2780 void
2781 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2782 rtx offset_rtx, rtx temp1, rtx temp2)
2784 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2785 temp1, temp2, false);
2788 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2789 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2790 if TEMP1 already contains abs (DELTA). */
2792 static inline void
2793 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2795 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2796 temp1, temp2, true, emit_move_imm);
2799 /* Subtract DELTA from the stack pointer, marking the instructions
2800 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2801 if nonnull. */
2803 static inline void
2804 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2806 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2807 temp1, temp2, frame_related_p);
2810 /* Set DEST to (vec_series BASE STEP). */
2812 static void
2813 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2815 machine_mode mode = GET_MODE (dest);
2816 scalar_mode inner = GET_MODE_INNER (mode);
2818 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2819 if (!aarch64_sve_index_immediate_p (base))
2820 base = force_reg (inner, base);
2821 if (!aarch64_sve_index_immediate_p (step))
2822 step = force_reg (inner, step);
2824 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2827 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2828 integer of mode SRC_MODE. Return true on success. */
2830 static bool
2831 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2832 rtx src)
2834 /* If the constant is smaller than 128 bits, we can do the move
2835 using a vector of SRC_MODEs. */
2836 if (src_mode != TImode)
2838 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2839 GET_MODE_SIZE (src_mode));
2840 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2841 emit_move_insn (gen_lowpart (dup_mode, dest),
2842 gen_const_vec_duplicate (dup_mode, src));
2843 return true;
2846 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2847 src = force_const_mem (src_mode, src);
2848 if (!src)
2849 return false;
2851 /* Make sure that the address is legitimate. */
2852 if (!aarch64_sve_ld1r_operand_p (src))
2854 rtx addr = force_reg (Pmode, XEXP (src, 0));
2855 src = replace_equiv_address (src, addr);
2858 machine_mode mode = GET_MODE (dest);
2859 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2860 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2861 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2862 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2863 emit_insn (gen_rtx_SET (dest, src));
2864 return true;
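/* For example, duplicating a DImode constant across a VNx16QI register
   is done with an ordinary move of a VNx2DI constant duplicate; only a
   full 128-bit (TImode) value takes the LD1RQ path above.  */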
2867 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2868 isn't a simple duplicate or series. */
2870 static void
2871 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2873 machine_mode mode = GET_MODE (src);
2874 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2875 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2876 gcc_assert (npatterns > 1);
2878 if (nelts_per_pattern == 1)
2880 /* The constant is a repeating sequence of at least two elements,
2881 where the repeating elements occupy no more than 128 bits.
2882 Get an integer representation of the replicated value. */
2883 scalar_int_mode int_mode;
2884 if (BYTES_BIG_ENDIAN)
2885 /* For now, always use LD1RQ to load the value on big-endian
2886 targets, since the handling of smaller integers includes a
2887 subreg that is semantically an element reverse. */
2888 int_mode = TImode;
2889 else
2891 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2892 gcc_assert (int_bits <= 128);
2893 int_mode = int_mode_for_size (int_bits, 0).require ();
2895 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2896 if (int_value
2897 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2898 return;
2901 /* Expand each pattern individually. */
2902 rtx_vector_builder builder;
2903 auto_vec<rtx, 16> vectors (npatterns);
2904 for (unsigned int i = 0; i < npatterns; ++i)
2906 builder.new_vector (mode, 1, nelts_per_pattern);
2907 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2908 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2909 vectors.quick_push (force_reg (mode, builder.build ()));
2912 /* Use permutes to interleave the separate vectors. */
2913 while (npatterns > 1)
2915 npatterns /= 2;
2916 for (unsigned int i = 0; i < npatterns; ++i)
2918 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2919 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2920 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2921 vectors[i] = tmp;
2924 gcc_assert (vectors[0] == dest);
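/* For example, with four patterns P0..P3 the loop above first forms
   ZIP1 (P0, P2) and ZIP1 (P1, P3) and then zips those two results into
   DEST, recreating the original element order
   P0[0], P1[0], P2[0], P3[0], P0[1], ...  */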
2927 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2928 is a pattern that can be used to set DEST to a replicated scalar
2929 element. */
2931 void
2932 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2933 rtx (*gen_vec_duplicate) (rtx, rtx))
2935 machine_mode mode = GET_MODE (dest);
2937 /* Check on what type of symbol it is. */
2938 scalar_int_mode int_mode;
2939 if ((GET_CODE (imm) == SYMBOL_REF
2940 || GET_CODE (imm) == LABEL_REF
2941 || GET_CODE (imm) == CONST
2942 || GET_CODE (imm) == CONST_POLY_INT)
2943 && is_a <scalar_int_mode> (mode, &int_mode))
2945 rtx mem;
2946 poly_int64 offset;
2947 HOST_WIDE_INT const_offset;
2948 enum aarch64_symbol_type sty;
2950 /* If we have (const (plus symbol offset)), separate out the offset
2951 before we start classifying the symbol. */
2952 rtx base = strip_offset (imm, &offset);
2954 /* We must always add an offset involving VL separately, rather than
2955 folding it into the relocation. */
2956 if (!offset.is_constant (&const_offset))
2958 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2959 emit_insn (gen_rtx_SET (dest, imm));
2960 else
2962 /* Do arithmetic on 32-bit values if the result is smaller
2963 than that. */
2964 if (partial_subreg_p (int_mode, SImode))
2966 /* It is invalid to do symbol calculations in modes
2967 narrower than SImode. */
2968 gcc_assert (base == const0_rtx);
2969 dest = gen_lowpart (SImode, dest);
2970 int_mode = SImode;
2972 if (base != const0_rtx)
2974 base = aarch64_force_temporary (int_mode, dest, base);
2975 aarch64_add_offset (int_mode, dest, base, offset,
2976 NULL_RTX, NULL_RTX, false);
2978 else
2979 aarch64_add_offset (int_mode, dest, base, offset,
2980 dest, NULL_RTX, false);
2982 return;
2985 sty = aarch64_classify_symbol (base, const_offset);
2986 switch (sty)
2988 case SYMBOL_FORCE_TO_MEM:
2989 if (const_offset != 0
2990 && targetm.cannot_force_const_mem (int_mode, imm))
2992 gcc_assert (can_create_pseudo_p ());
2993 base = aarch64_force_temporary (int_mode, dest, base);
2994 aarch64_add_offset (int_mode, dest, base, const_offset,
2995 NULL_RTX, NULL_RTX, false);
2996 return;
2999 mem = force_const_mem (ptr_mode, imm);
3000 gcc_assert (mem);
3002 /* If we aren't generating PC relative literals, then
3003 we need to expand the literal pool access carefully.
3004 This is something that needs to be done in a number
3005 of places, so could well live as a separate function. */
3006 if (!aarch64_pcrelative_literal_loads)
3008 gcc_assert (can_create_pseudo_p ());
3009 base = gen_reg_rtx (ptr_mode);
3010 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3011 if (ptr_mode != Pmode)
3012 base = convert_memory_address (Pmode, base);
3013 mem = gen_rtx_MEM (ptr_mode, base);
3016 if (int_mode != ptr_mode)
3017 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3019 emit_insn (gen_rtx_SET (dest, mem));
3021 return;
3023 case SYMBOL_SMALL_TLSGD:
3024 case SYMBOL_SMALL_TLSDESC:
3025 case SYMBOL_SMALL_TLSIE:
3026 case SYMBOL_SMALL_GOT_28K:
3027 case SYMBOL_SMALL_GOT_4G:
3028 case SYMBOL_TINY_GOT:
3029 case SYMBOL_TINY_TLSIE:
3030 if (const_offset != 0)
3032 gcc_assert (can_create_pseudo_p ());
3033 base = aarch64_force_temporary (int_mode, dest, base);
3034 aarch64_add_offset (int_mode, dest, base, const_offset,
3035 NULL_RTX, NULL_RTX, false);
3036 return;
3038 /* FALLTHRU */
3040 case SYMBOL_SMALL_ABSOLUTE:
3041 case SYMBOL_TINY_ABSOLUTE:
3042 case SYMBOL_TLSLE12:
3043 case SYMBOL_TLSLE24:
3044 case SYMBOL_TLSLE32:
3045 case SYMBOL_TLSLE48:
3046 aarch64_load_symref_appropriately (dest, imm, sty);
3047 return;
3049 default:
3050 gcc_unreachable ();
3054 if (!CONST_INT_P (imm))
3056 rtx base, step, value;
3057 if (GET_CODE (imm) == HIGH
3058 || aarch64_simd_valid_immediate (imm, NULL))
3059 emit_insn (gen_rtx_SET (dest, imm));
3060 else if (const_vec_series_p (imm, &base, &step))
3061 aarch64_expand_vec_series (dest, base, step);
3062 else if (const_vec_duplicate_p (imm, &value))
3064 /* If the constant is out of range of an SVE vector move,
3065 load it from memory if we can, otherwise move it into
3066 a register and use a DUP. */
3067 scalar_mode inner_mode = GET_MODE_INNER (mode);
3068 rtx op = force_const_mem (inner_mode, value);
3069 if (!op)
3070 op = force_reg (inner_mode, value);
3071 else if (!aarch64_sve_ld1r_operand_p (op))
3073 rtx addr = force_reg (Pmode, XEXP (op, 0));
3074 op = replace_equiv_address (op, addr);
3076 emit_insn (gen_vec_duplicate (dest, op));
3078 else if (GET_CODE (imm) == CONST_VECTOR
3079 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3080 aarch64_expand_sve_const_vector (dest, imm);
3081 else
3083 rtx mem = force_const_mem (mode, imm);
3084 gcc_assert (mem);
3085 emit_move_insn (dest, mem);
3088 return;
3091 aarch64_internal_mov_immediate (dest, imm, true,
3092 as_a <scalar_int_mode> (mode));
3095 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3096 that is known to contain PTRUE. */
3098 void
3099 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3101 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3102 gen_rtvec (2, pred, src),
3103 UNSPEC_MERGE_PTRUE)));
3106 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3107 operand is in memory. In this case we need to use the predicated LD1
3108 and ST1 instead of LDR and STR, both for correctness on big-endian
3109 targets and because LD1 and ST1 support a wider range of addressing modes.
3110 PRED_MODE is the mode of the predicate.
3112 See the comment at the head of aarch64-sve.md for details about the
3113 big-endian handling. */
3115 void
3116 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3118 machine_mode mode = GET_MODE (dest);
3119 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3120 if (!register_operand (src, mode)
3121 && !register_operand (dest, mode))
3123 rtx tmp = gen_reg_rtx (mode);
3124 if (MEM_P (src))
3125 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3126 else
3127 emit_move_insn (tmp, src);
3128 src = tmp;
3130 aarch64_emit_sve_pred_move (dest, ptrue, src);
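/* For example, a memory-to-memory SVE copy is expanded as a predicated
   load (LD1) into a fresh register followed by a predicated store (ST1)
   to the destination.  */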
3133 /* Called only on big-endian targets. See whether an SVE vector move
3134 from SRC to DEST is effectively a REV[BHW] instruction, because at
3135 least one operand is a subreg of an SVE vector that has wider or
3136 narrower elements. Return true and emit the instruction if so.
3138 For example:
3140 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3142 represents a VIEW_CONVERT between the following vectors, viewed
3143 in memory order:
3145 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3146 R1: { [0], [1], [2], [3], ... }
3148 The high part of lane X in R2 should therefore correspond to lane X*2
3149 of R1, but the register representations are:
3151 msb lsb
3152 R2: ...... [1].high [1].low [0].high [0].low
3153 R1: ...... [3] [2] [1] [0]
3155 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3156 We therefore need a reverse operation to swap the high and low values
3157 around.
3159 This is purely an optimization. Without it we would spill the
3160 subreg operand to the stack in one mode and reload it in the
3161 other mode, which has the same effect as the REV. */
3163 bool
3164 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3166 gcc_assert (BYTES_BIG_ENDIAN);
3167 if (GET_CODE (dest) == SUBREG)
3168 dest = SUBREG_REG (dest);
3169 if (GET_CODE (src) == SUBREG)
3170 src = SUBREG_REG (src);
3172 /* The optimization handles two single SVE REGs with different element
3173 sizes. */
3174 if (!REG_P (dest)
3175 || !REG_P (src)
3176 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3177 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3178 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3179 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3180 return false;
3182 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3183 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3184 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3185 UNSPEC_REV_SUBREG);
3186 emit_insn (gen_rtx_SET (dest, unspec));
3187 return true;
3190 /* Return a copy of X with mode MODE, without changing its other
3191 attributes. Unlike gen_lowpart, this doesn't care whether the
3192 mode change is valid. */
3194 static rtx
3195 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3197 if (GET_MODE (x) == mode)
3198 return x;
3200 x = shallow_copy_rtx (x);
3201 set_mode_and_regno (x, mode, REGNO (x));
3202 return x;
3205 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3206 operands. */
3208 void
3209 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3211 /* Decide which REV operation we need. The mode with narrower elements
3212 determines the mode of the operands and the mode with the wider
3213 elements determines the reverse width. */
3214 machine_mode mode_with_wider_elts = GET_MODE (dest);
3215 machine_mode mode_with_narrower_elts = GET_MODE (src);
3216 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3217 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3218 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3220 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3221 unsigned int unspec;
3222 if (wider_bytes == 8)
3223 unspec = UNSPEC_REV64;
3224 else if (wider_bytes == 4)
3225 unspec = UNSPEC_REV32;
3226 else if (wider_bytes == 2)
3227 unspec = UNSPEC_REV16;
3228 else
3229 gcc_unreachable ();
3230 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3232 /* Emit:
3234 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3235 UNSPEC_MERGE_PTRUE))
3237 with the appropriate modes. */
3238 ptrue = gen_lowpart (pred_mode, ptrue);
3239 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3240 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3241 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3242 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3243 UNSPEC_MERGE_PTRUE);
3244 emit_insn (gen_rtx_SET (dest, src));
3247 static bool
3248 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3249 tree exp ATTRIBUTE_UNUSED)
3251 /* Currently, always true. */
3252 return true;
3255 /* Implement TARGET_PASS_BY_REFERENCE. */
3257 static bool
3258 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3259 machine_mode mode,
3260 const_tree type,
3261 bool named ATTRIBUTE_UNUSED)
3263 HOST_WIDE_INT size;
3264 machine_mode dummymode;
3265 int nregs;
3267 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3268 if (mode == BLKmode && type)
3269 size = int_size_in_bytes (type);
3270 else
3271 /* No frontends can create types with variable-sized modes, so we
3272 shouldn't be asked to pass or return them. */
3273 size = GET_MODE_SIZE (mode).to_constant ();
3275 /* Aggregates are passed by reference based on their size. */
3276 if (type && AGGREGATE_TYPE_P (type))
3278 size = int_size_in_bytes (type);
3281 /* Variable sized arguments are always passed by reference. */
3282 if (size < 0)
3283 return true;
3285 /* Can this be a candidate to be passed in fp/simd register(s)? */
3286 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3287 &dummymode, &nregs,
3288 NULL))
3289 return false;
3291 /* Arguments which are variable sized or larger than 2 registers are
3292 passed by reference unless they are a homogeneous floating-point
3293 aggregate. */
3294 return size > 2 * UNITS_PER_WORD;
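/* For example, a plain 24-byte struct of three pointers is passed by
   reference, whereas a homogeneous aggregate of four doubles (32 bytes)
   is not, because it is caught by the SIMD/FP candidate check above.  */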
3297 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3298 static bool
3299 aarch64_return_in_msb (const_tree valtype)
3301 machine_mode dummy_mode;
3302 int dummy_int;
3304 /* Never happens in little-endian mode. */
3305 if (!BYTES_BIG_ENDIAN)
3306 return false;
3308 /* Only composite types smaller than or equal to 16 bytes can
3309 be potentially returned in registers. */
3310 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3311 || int_size_in_bytes (valtype) <= 0
3312 || int_size_in_bytes (valtype) > 16)
3313 return false;
3315 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3316 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3317 is always passed/returned in the least significant bits of fp/simd
3318 register(s). */
3319 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3320 &dummy_mode, &dummy_int, NULL))
3321 return false;
3323 return true;
3326 /* Implement TARGET_FUNCTION_VALUE.
3327 Define how to find the value returned by a function. */
3329 static rtx
3330 aarch64_function_value (const_tree type, const_tree func,
3331 bool outgoing ATTRIBUTE_UNUSED)
3333 machine_mode mode;
3334 int unsignedp;
3335 int count;
3336 machine_mode ag_mode;
3338 mode = TYPE_MODE (type);
3339 if (INTEGRAL_TYPE_P (type))
3340 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3342 if (aarch64_return_in_msb (type))
3344 HOST_WIDE_INT size = int_size_in_bytes (type);
3346 if (size % UNITS_PER_WORD != 0)
3348 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3349 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3353 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3354 &ag_mode, &count, NULL))
3356 if (!aarch64_composite_type_p (type, mode))
3358 gcc_assert (count == 1 && mode == ag_mode);
3359 return gen_rtx_REG (mode, V0_REGNUM);
3361 else
3363 int i;
3364 rtx par;
3366 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3367 for (i = 0; i < count; i++)
3369 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3370 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3371 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3372 XVECEXP (par, 0, i) = tmp;
3374 return par;
3377 else
3378 return gen_rtx_REG (mode, R0_REGNUM);
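/* For example, a homogeneous aggregate of two doubles is returned in d0
   and d1 via the PARALLEL built above, whereas a plain 16-byte struct is
   returned in x0 and x1.  */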
3381 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3382 Return true if REGNO is the number of a hard register in which the values
3383 of called function may come back. */
3385 static bool
3386 aarch64_function_value_regno_p (const unsigned int regno)
3388 /* A maximum of 16 bytes can be returned in the general registers. Examples
3389 of 16-byte return values are: 128-bit integers and 16-byte small
3390 structures (excluding homogeneous floating-point aggregates). */
3391 if (regno == R0_REGNUM || regno == R1_REGNUM)
3392 return true;
3394 /* Up to four fp/simd registers can return a function value, e.g. a
3395 homogeneous floating-point aggregate having four members. */
3396 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3397 return TARGET_FLOAT;
3399 return false;
3402 /* Implement TARGET_RETURN_IN_MEMORY.
3404 If the type T of the result of a function is such that
3405 void func (T arg)
3406 would require that arg be passed as a value in a register (or set of
3407 registers) according to the parameter passing rules, then the result
3408 is returned in the same registers as would be used for such an
3409 argument. */
3411 static bool
3412 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3414 HOST_WIDE_INT size;
3415 machine_mode ag_mode;
3416 int count;
3418 if (!AGGREGATE_TYPE_P (type)
3419 && TREE_CODE (type) != COMPLEX_TYPE
3420 && TREE_CODE (type) != VECTOR_TYPE)
3421 /* Simple scalar types are always returned in registers. */
3422 return false;
3424 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3425 type,
3426 &ag_mode,
3427 &count,
3428 NULL))
3429 return false;
3431 /* Types larger than 2 registers are returned in memory. */
3432 size = int_size_in_bytes (type);
3433 return (size < 0 || size > 2 * UNITS_PER_WORD);
3436 static bool
3437 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3438 const_tree type, int *nregs)
3440 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3441 return aarch64_vfp_is_call_or_return_candidate (mode,
3442 type,
3443 &pcum->aapcs_vfp_rmode,
3444 nregs,
3445 NULL);
3448 /* Given MODE and TYPE of a function argument, return the alignment in
3449 bits. The idea is to suppress any stronger alignment requested by
3450 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3451 This is a helper function for local use only. */
3453 static unsigned int
3454 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3456 if (!type)
3457 return GET_MODE_ALIGNMENT (mode);
3459 if (integer_zerop (TYPE_SIZE (type)))
3460 return 0;
3462 gcc_assert (TYPE_MODE (type) == mode);
3464 if (!AGGREGATE_TYPE_P (type))
3465 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3467 if (TREE_CODE (type) == ARRAY_TYPE)
3468 return TYPE_ALIGN (TREE_TYPE (type));
3470 unsigned int alignment = 0;
3471 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3472 if (TREE_CODE (field) == FIELD_DECL)
3473 alignment = std::max (alignment, DECL_ALIGN (field));
3475 return alignment;
3478 /* Layout a function argument according to the AAPCS64 rules. The rule
3479 numbers refer to the rule numbers in the AAPCS64. */
3481 static void
3482 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3483 const_tree type,
3484 bool named ATTRIBUTE_UNUSED)
3486 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3487 int ncrn, nvrn, nregs;
3488 bool allocate_ncrn, allocate_nvrn;
3489 HOST_WIDE_INT size;
3491 /* We need to do this once per argument. */
3492 if (pcum->aapcs_arg_processed)
3493 return;
3495 pcum->aapcs_arg_processed = true;
3497 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3498 if (type)
3499 size = int_size_in_bytes (type);
3500 else
3501 /* No frontends can create types with variable-sized modes, so we
3502 shouldn't be asked to pass or return them. */
3503 size = GET_MODE_SIZE (mode).to_constant ();
3504 size = ROUND_UP (size, UNITS_PER_WORD);
3506 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3507 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3508 mode,
3509 type,
3510 &nregs);
3512 /* allocate_ncrn may be false-positive, but allocate_nvrn is quite reliable.
3513 The following code thus handles passing by SIMD/FP registers first. */
3515 nvrn = pcum->aapcs_nvrn;
3517 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3518 and homogeneous short-vector aggregates (HVA). */
3519 if (allocate_nvrn)
3521 if (!TARGET_FLOAT)
3522 aarch64_err_no_fpadvsimd (mode, "argument");
3524 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3526 pcum->aapcs_nextnvrn = nvrn + nregs;
3527 if (!aarch64_composite_type_p (type, mode))
3529 gcc_assert (nregs == 1);
3530 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3532 else
3534 rtx par;
3535 int i;
3536 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3537 for (i = 0; i < nregs; i++)
3539 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3540 V0_REGNUM + nvrn + i);
3541 rtx offset = gen_int_mode
3542 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3543 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3544 XVECEXP (par, 0, i) = tmp;
3546 pcum->aapcs_reg = par;
3548 return;
3550 else
3552 /* C.3 NSRN is set to 8. */
3553 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3554 goto on_stack;
3558 ncrn = pcum->aapcs_ncrn;
3559 nregs = size / UNITS_PER_WORD;
3561 /* C6 - C9, though the sign and zero extension semantics are
3562 handled elsewhere. This is the case where the argument fits
3563 entirely in general registers. */
3564 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3567 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3569 /* C.8 if the argument has an alignment of 16 then the NGRN is
3570 rounded up to the next even number. */
3571 if (nregs == 2
3572 && ncrn % 2
3573 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3574 comparison is there because for > 16 * BITS_PER_UNIT
3575 alignment nregs should be > 2 and therefore it should be
3576 passed by reference rather than value. */
3577 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3579 ++ncrn;
3580 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3583 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3584 A reg is still generated for it, but the caller should be smart
3585 enough not to use it. */
3586 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3587 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3588 else
3590 rtx par;
3591 int i;
3593 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3594 for (i = 0; i < nregs; i++)
3596 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3597 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3598 GEN_INT (i * UNITS_PER_WORD));
3599 XVECEXP (par, 0, i) = tmp;
3601 pcum->aapcs_reg = par;
3604 pcum->aapcs_nextncrn = ncrn + nregs;
3605 return;
3608 /* C.11 */
3609 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3611 /* The argument is passed on the stack; record the needed number of words for
3612 this argument and align the total size if necessary. */
3613 on_stack:
3614 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3616 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3617 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3618 16 / UNITS_PER_WORD);
3619 return;
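/* For example, under rule C.8 above, a 16-byte argument with 16-byte
   alignment that arrives when the next core register number is odd
   (say x1 is next) is moved up to start at an even register (x2/x3).  */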
3622 /* Implement TARGET_FUNCTION_ARG. */
3624 static rtx
3625 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3626 const_tree type, bool named)
3628 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3629 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3631 if (mode == VOIDmode)
3632 return NULL_RTX;
3634 aarch64_layout_arg (pcum_v, mode, type, named);
3635 return pcum->aapcs_reg;
3638 void
3639 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3640 const_tree fntype ATTRIBUTE_UNUSED,
3641 rtx libname ATTRIBUTE_UNUSED,
3642 const_tree fndecl ATTRIBUTE_UNUSED,
3643 unsigned n_named ATTRIBUTE_UNUSED)
3645 pcum->aapcs_ncrn = 0;
3646 pcum->aapcs_nvrn = 0;
3647 pcum->aapcs_nextncrn = 0;
3648 pcum->aapcs_nextnvrn = 0;
3649 pcum->pcs_variant = ARM_PCS_AAPCS64;
3650 pcum->aapcs_reg = NULL_RTX;
3651 pcum->aapcs_arg_processed = false;
3652 pcum->aapcs_stack_words = 0;
3653 pcum->aapcs_stack_size = 0;
3655 if (!TARGET_FLOAT
3656 && fndecl && TREE_PUBLIC (fndecl)
3657 && fntype && fntype != error_mark_node)
3659 const_tree type = TREE_TYPE (fntype);
3660 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3661 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3662 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3663 &mode, &nregs, NULL))
3664 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3666 return;
3669 static void
3670 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3671 machine_mode mode,
3672 const_tree type,
3673 bool named)
3675 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3676 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3678 aarch64_layout_arg (pcum_v, mode, type, named);
3679 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3680 != (pcum->aapcs_stack_words != 0));
3681 pcum->aapcs_arg_processed = false;
3682 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3683 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3684 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3685 pcum->aapcs_stack_words = 0;
3686 pcum->aapcs_reg = NULL_RTX;
3690 bool
3691 aarch64_function_arg_regno_p (unsigned regno)
3693 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3694 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3697 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3698 PARM_BOUNDARY bits of alignment, but will be given anything up
3699 to STACK_BOUNDARY bits if the type requires it. This makes sure
3700 that both before and after the layout of each argument, the Next
3701 Stacked Argument Address (NSAA) will have a minimum alignment of
3702 8 bytes. */
3704 static unsigned int
3705 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3707 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3708 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3711 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3713 static fixed_size_mode
3714 aarch64_get_reg_raw_mode (int regno)
3716 if (TARGET_SVE && FP_REGNUM_P (regno))
3717 /* Don't use the SVE part of the register for __builtin_apply and
3718 __builtin_return. The SVE registers aren't used by the normal PCS,
3719 so using them there would be a waste of time. The PCS extensions
3720 for SVE types are fundamentally incompatible with the
3721 __builtin_return/__builtin_apply interface. */
3722 return as_a <fixed_size_mode> (V16QImode);
3723 return default_get_reg_raw_mode (regno);
3726 /* Implement TARGET_FUNCTION_ARG_PADDING.
3728 Small aggregate types are placed in the lowest memory address.
3730 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3732 static pad_direction
3733 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3735 /* On little-endian targets, the least significant byte of every stack
3736 argument is passed at the lowest byte address of the stack slot. */
3737 if (!BYTES_BIG_ENDIAN)
3738 return PAD_UPWARD;
3740 /* Otherwise, integral, floating-point and pointer types are padded downward:
3741 the least significant byte of a stack argument is passed at the highest
3742 byte address of the stack slot. */
3743 if (type
3744 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3745 || POINTER_TYPE_P (type))
3746 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3747 return PAD_DOWNWARD;
3749 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3750 return PAD_UPWARD;
3753 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3755 It specifies padding for the last (possibly the only)
3756 element of a block move between registers and memory. Assuming
3757 the block is in memory, padding upward means that the last
3758 element is padded after its most significant byte, while with
3759 downward padding the last element is padded on its least
3760 significant byte side.
3762 Small aggregates and small complex types are always padded
3763 upwards.
3765 We don't need to worry about homogeneous floating-point or
3766 short-vector aggregates; their move is not affected by the
3767 padding direction determined here. Regardless of endianness,
3768 each element of such an aggregate is put in the least
3769 significant bits of a fp/simd register.
3771 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3772 register has useful data, and return the opposite if the most
3773 significant byte does. */
3775 bool
3776 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3777 bool first ATTRIBUTE_UNUSED)
3780 /* Small composite types are always padded upward. */
3781 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3783 HOST_WIDE_INT size;
3784 if (type)
3785 size = int_size_in_bytes (type);
3786 else
3787 /* No frontends can create types with variable-sized modes, so we
3788 shouldn't be asked to pass or return them. */
3789 size = GET_MODE_SIZE (mode).to_constant ();
3790 if (size < 2 * UNITS_PER_WORD)
3791 return true;
3794 /* Otherwise, use the default padding. */
3795 return !BYTES_BIG_ENDIAN;
3798 static scalar_int_mode
3799 aarch64_libgcc_cmp_return_mode (void)
3801 return SImode;
3804 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3806 /* We use the 12-bit shifted immediate arithmetic instructions, so values
3807 must be a multiple of (1 << 12), i.e. 4096. */
3808 #define ARITH_FACTOR 4096
3810 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3811 #error Cannot use simple address calculation for stack probing
3812 #endif
3814 /* The pair of scratch registers used for stack probing. */
3815 #define PROBE_STACK_FIRST_REG 9
3816 #define PROBE_STACK_SECOND_REG 10
3818 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3819 inclusive. These are offsets from the current stack pointer. */
3821 static void
3822 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3824 HOST_WIDE_INT size;
3825 if (!poly_size.is_constant (&size))
3827 sorry ("stack probes for SVE frames");
3828 return;
3831 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3833 /* See the same assertion on PROBE_INTERVAL above. */
3834 gcc_assert ((first % ARITH_FACTOR) == 0);
3836 /* See if we have a constant small number of probes to generate. If so,
3837 that's the easy case. */
3838 if (size <= PROBE_INTERVAL)
3840 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3842 emit_set_insn (reg1,
3843 plus_constant (Pmode,
3844 stack_pointer_rtx, -(first + base)));
3845 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3848 /* The run-time loop is made up of 8 insns in the generic case, while the
3849 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3850 else if (size <= 4 * PROBE_INTERVAL)
3852 HOST_WIDE_INT i, rem;
3854 emit_set_insn (reg1,
3855 plus_constant (Pmode,
3856 stack_pointer_rtx,
3857 -(first + PROBE_INTERVAL)));
3858 emit_stack_probe (reg1);
3860 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3861 it exceeds SIZE. If only two probes are needed, this will not
3862 generate any code. Then probe at FIRST + SIZE. */
3863 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3865 emit_set_insn (reg1,
3866 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3867 emit_stack_probe (reg1);
3870 rem = size - (i - PROBE_INTERVAL);
3871 if (rem > 256)
3873 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3875 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3876 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3878 else
3879 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3882 /* Otherwise, do the same as above, but in a loop. Note that we must be
3883 extra careful with variables wrapping around because we might be at
3884 the very top (or the very bottom) of the address space and we have
3885 to be able to handle this case properly; in particular, we use an
3886 equality test for the loop condition. */
3887 else
3889 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3891 /* Step 1: round SIZE to the previous multiple of the interval. */
3893 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3896 /* Step 2: compute initial and final value of the loop counter. */
3898 /* TEST_ADDR = SP + FIRST. */
3899 emit_set_insn (reg1,
3900 plus_constant (Pmode, stack_pointer_rtx, -first));
3902 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3903 HOST_WIDE_INT adjustment = - (first + rounded_size);
3904 if (! aarch64_uimm12_shift (adjustment))
3906 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3907 true, Pmode);
3908 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3910 else
3911 emit_set_insn (reg2,
3912 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3914 /* Step 3: the loop
3918 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3919 probe at TEST_ADDR
3921 while (TEST_ADDR != LAST_ADDR)
3923 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3924 until it is equal to ROUNDED_SIZE. */
3926 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3929 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3930 that SIZE is equal to ROUNDED_SIZE. */
3932 if (size != rounded_size)
3934 HOST_WIDE_INT rem = size - rounded_size;
3936 if (rem > 256)
3938 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3940 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3941 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3943 else
3944 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3948 /* Make sure nothing is scheduled before we are done. */
3949 emit_insn (gen_blockage ());
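/* Illustrative sketch, assuming the default PROBE_INTERVAL of 4096
   (STACK_CHECK_PROBE_INTERVAL_EXP == 12): for FIRST == 4096 and a constant
   POLY_SIZE of 4096 the single-probe case above emits, schematically,

	sub	x9, sp, #8192
	str	xzr, [x9]

   probing the word at SP - 8192, with x9 being PROBE_STACK_FIRST_REG.  */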
3952 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3953 absolute addresses. */
3955 const char *
3956 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3958 static int labelno = 0;
3959 char loop_lab[32];
3960 rtx xops[2];
3962 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3964 /* Loop. */
3965 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3967 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3968 xops[0] = reg1;
3969 xops[1] = GEN_INT (PROBE_INTERVAL);
3970 output_asm_insn ("sub\t%0, %0, %1", xops);
3972 /* Probe at TEST_ADDR. */
3973 output_asm_insn ("str\txzr, [%0]", xops);
3975 /* Test if TEST_ADDR == LAST_ADDR. */
3976 xops[1] = reg2;
3977 output_asm_insn ("cmp\t%0, %1", xops);
3979 /* Branch. */
3980 fputs ("\tb.ne\t", asm_out_file);
3981 assemble_name_raw (asm_out_file, loop_lab);
3982 fputc ('\n', asm_out_file);
3984 return "";
3987 /* Determine whether a frame chain needs to be generated. */
3988 static bool
3989 aarch64_needs_frame_chain (void)
3991 /* Force a frame chain for EH returns so the return address is at FP+8. */
3992 if (frame_pointer_needed || crtl->calls_eh_return)
3993 return true;
3995 /* A leaf function cannot have calls or write LR. */
3996 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
3998 /* Don't use a frame chain in leaf functions if leaf frame pointers
3999 are disabled. */
4000 if (flag_omit_leaf_frame_pointer && is_leaf)
4001 return false;
4003 return aarch64_use_frame_pointer;
4006 /* Mark the registers that need to be saved by the callee and calculate
4007 the size of the callee-saved registers area and frame record (both FP
4008 and LR may be omitted). */
4009 static void
4010 aarch64_layout_frame (void)
4012 HOST_WIDE_INT offset = 0;
4013 int regno, last_fp_reg = INVALID_REGNUM;
4015 if (reload_completed && cfun->machine->frame.laid_out)
4016 return;
4018 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4020 #define SLOT_NOT_REQUIRED (-2)
4021 #define SLOT_REQUIRED (-1)
4023 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4024 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4026 /* First mark all the registers that really need to be saved... */
4027 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4028 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4030 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4031 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4033 /* ... that includes the eh data registers (if needed)... */
4034 if (crtl->calls_eh_return)
4035 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4036 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4037 = SLOT_REQUIRED;
4039 /* ... and any callee saved register that dataflow says is live. */
4040 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4041 if (df_regs_ever_live_p (regno)
4042 && (regno == R30_REGNUM
4043 || !call_used_regs[regno]))
4044 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4046 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4047 if (df_regs_ever_live_p (regno)
4048 && !call_used_regs[regno])
4050 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4051 last_fp_reg = regno;
4054 if (cfun->machine->frame.emit_frame_chain)
4056 /* FP and LR are placed in the linkage record. */
4057 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4058 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4059 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4060 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4061 offset = 2 * UNITS_PER_WORD;
4064 /* Now assign stack slots for them. */
4065 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4066 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4068 cfun->machine->frame.reg_offset[regno] = offset;
4069 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4070 cfun->machine->frame.wb_candidate1 = regno;
4071 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4072 cfun->machine->frame.wb_candidate2 = regno;
4073 offset += UNITS_PER_WORD;
4076 HOST_WIDE_INT max_int_offset = offset;
4077 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4078 bool has_align_gap = offset != max_int_offset;
4080 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4081 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4083 /* If there is an alignment gap between integer and fp callee-saves,
4084 allocate the last fp register to it if possible. */
4085 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4087 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4088 break;
4091 cfun->machine->frame.reg_offset[regno] = offset;
4092 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4093 cfun->machine->frame.wb_candidate1 = regno;
4094 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4095 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4096 cfun->machine->frame.wb_candidate2 = regno;
4097 offset += UNITS_PER_WORD;
4100 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4102 cfun->machine->frame.saved_regs_size = offset;
4104 HOST_WIDE_INT varargs_and_saved_regs_size
4105 = offset + cfun->machine->frame.saved_varargs_size;
4107 cfun->machine->frame.hard_fp_offset
4108 = aligned_upper_bound (varargs_and_saved_regs_size
4109 + get_frame_size (),
4110 STACK_BOUNDARY / BITS_PER_UNIT);
4112 /* Both these values are already aligned. */
4113 gcc_assert (multiple_p (crtl->outgoing_args_size,
4114 STACK_BOUNDARY / BITS_PER_UNIT));
4115 cfun->machine->frame.frame_size
4116 = (cfun->machine->frame.hard_fp_offset
4117 + crtl->outgoing_args_size);
4119 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4121 cfun->machine->frame.initial_adjust = 0;
4122 cfun->machine->frame.final_adjust = 0;
4123 cfun->machine->frame.callee_adjust = 0;
4124 cfun->machine->frame.callee_offset = 0;
4126 HOST_WIDE_INT max_push_offset = 0;
4127 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4128 max_push_offset = 512;
4129 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4130 max_push_offset = 256;
4132 HOST_WIDE_INT const_size, const_fp_offset;
4133 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4134 && const_size < max_push_offset
4135 && known_eq (crtl->outgoing_args_size, 0))
4137 /* Simple, small frame with no outgoing arguments:
4138 stp reg1, reg2, [sp, -frame_size]!
4139 stp reg3, reg4, [sp, 16] */
4140 cfun->machine->frame.callee_adjust = const_size;
4142 else if (known_lt (crtl->outgoing_args_size
4143 + cfun->machine->frame.saved_regs_size, 512)
4144 && !(cfun->calls_alloca
4145 && known_lt (cfun->machine->frame.hard_fp_offset,
4146 max_push_offset)))
4148 /* Frame with small outgoing arguments:
4149 sub sp, sp, frame_size
4150 stp reg1, reg2, [sp, outgoing_args_size]
4151 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4152 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4153 cfun->machine->frame.callee_offset
4154 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4156 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4157 && const_fp_offset < max_push_offset)
4159 /* Frame with large outgoing arguments but a small local area:
4160 stp reg1, reg2, [sp, -hard_fp_offset]!
4161 stp reg3, reg4, [sp, 16]
4162 sub sp, sp, outgoing_args_size */
4163 cfun->machine->frame.callee_adjust = const_fp_offset;
4164 cfun->machine->frame.final_adjust
4165 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4167 else
4169 /* Frame with large local area and outgoing arguments using frame pointer:
4170 sub sp, sp, hard_fp_offset
4171 stp x29, x30, [sp, 0]
4172 add x29, sp, 0
4173 stp reg3, reg4, [sp, 16]
4174 sub sp, sp, outgoing_args_size */
4175 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4176 cfun->machine->frame.final_adjust
4177 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4180 cfun->machine->frame.laid_out = true;
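/* Worked example (illustrative only): a function that needs a frame chain,
   saves only x29/x30, has 32 bytes of locals and no outgoing arguments gets
   saved_regs_size == 16, hard_fp_offset == 48 and frame_size == 48.  Since
   48 < max_push_offset and there are no outgoing arguments, the first case
   applies and callee_adjust == 48, so the prologue can use a single

	stp	x29, x30, [sp, #-48]!

   followed by setting x29 to sp to establish the frame chain, with the
   locals living above the saved pair at [sp, 16] ... [sp, 47].  */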
4183 /* Return true if the register REGNO is saved on entry to
4184 the current function. */
4186 static bool
4187 aarch64_register_saved_on_entry (int regno)
4189 return cfun->machine->frame.reg_offset[regno] >= 0;
4192 /* Return the next register at or after REGNO, up to LIMIT, that the
4193 callee needs to save. */
4195 static unsigned
4196 aarch64_next_callee_save (unsigned regno, unsigned limit)
4198 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4199 regno ++;
4200 return regno;
4203 /* Push register number REGNO of mode MODE to the stack with write-back,
4204 adjusting the stack pointer by ADJUSTMENT. */
4206 static void
4207 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4208 HOST_WIDE_INT adjustment)
4210 rtx base_rtx = stack_pointer_rtx;
4211 rtx insn, reg, mem;
4213 reg = gen_rtx_REG (mode, regno);
4214 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4215 plus_constant (Pmode, base_rtx, -adjustment));
4216 mem = gen_frame_mem (mode, mem);
4218 insn = emit_move_insn (mem, reg);
4219 RTX_FRAME_RELATED_P (insn) = 1;
4222 /* Generate and return an instruction to store the pair of registers
4223 REG and REG2 of mode MODE to location BASE with write-back adjusting
4224 the stack location BASE by ADJUSTMENT. */
4226 static rtx
4227 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4228 HOST_WIDE_INT adjustment)
4230 switch (mode)
4232 case E_DImode:
4233 return gen_storewb_pairdi_di (base, base, reg, reg2,
4234 GEN_INT (-adjustment),
4235 GEN_INT (UNITS_PER_WORD - adjustment));
4236 case E_DFmode:
4237 return gen_storewb_pairdf_di (base, base, reg, reg2,
4238 GEN_INT (-adjustment),
4239 GEN_INT (UNITS_PER_WORD - adjustment));
4240 default:
4241 gcc_unreachable ();
4245 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4246 stack pointer by ADJUSTMENT. */
4248 static void
4249 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4251 rtx_insn *insn;
4252 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4254 if (regno2 == INVALID_REGNUM)
4255 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4257 rtx reg1 = gen_rtx_REG (mode, regno1);
4258 rtx reg2 = gen_rtx_REG (mode, regno2);
4260 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4261 reg2, adjustment));
4262 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4263 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4264 RTX_FRAME_RELATED_P (insn) = 1;
4267 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
4268 adjusting it by ADJUSTMENT afterwards. */
4270 static rtx
4271 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4272 HOST_WIDE_INT adjustment)
4274 switch (mode)
4276 case E_DImode:
4277 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4278 GEN_INT (UNITS_PER_WORD));
4279 case E_DFmode:
4280 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4281 GEN_INT (UNITS_PER_WORD));
4282 default:
4283 gcc_unreachable ();
4287 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4288 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4289 into CFI_OPS. */
4291 static void
4292 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4293 rtx *cfi_ops)
4295 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4296 rtx reg1 = gen_rtx_REG (mode, regno1);
4298 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4300 if (regno2 == INVALID_REGNUM)
4302 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4303 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4304 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4306 else
4308 rtx reg2 = gen_rtx_REG (mode, regno2);
4309 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4310 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4311 reg2, adjustment));
4315 /* Generate and return a store pair instruction of mode MODE to store
4316 register REG1 to MEM1 and register REG2 to MEM2. */
4318 static rtx
4319 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4320 rtx reg2)
4322 switch (mode)
4324 case E_DImode:
4325 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4327 case E_DFmode:
4328 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4330 default:
4331 gcc_unreachable ();
4335 /* Generate and return a load pair instruction of mode MODE to load register
4336 REG1 from MEM1 and register REG2 from MEM2. */
4338 static rtx
4339 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4340 rtx mem2)
4342 switch (mode)
4344 case E_DImode:
4345 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4347 case E_DFmode:
4348 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4350 default:
4351 gcc_unreachable ();
4355 /* Return TRUE if return address signing should be enabled for the current
4356 function, otherwise return FALSE. */
4358 bool
4359 aarch64_return_address_signing_enabled (void)
4361 /* This function should only be called after the frame is laid out. */
4362 gcc_assert (cfun->machine->frame.laid_out);
4364 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4365 if its LR is pushed onto the stack. */
4366 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4367 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4368 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4371 /* Emit code to save the callee-saved registers from register number START
4372 to LIMIT to the stack at the location starting at offset START_OFFSET,
4373 skipping any write-back candidates if SKIP_WB is true. */
4375 static void
4376 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4377 unsigned start, unsigned limit, bool skip_wb)
4379 rtx_insn *insn;
4380 unsigned regno;
4381 unsigned regno2;
4383 for (regno = aarch64_next_callee_save (start, limit);
4384 regno <= limit;
4385 regno = aarch64_next_callee_save (regno + 1, limit))
4387 rtx reg, mem;
4388 poly_int64 offset;
4390 if (skip_wb
4391 && (regno == cfun->machine->frame.wb_candidate1
4392 || regno == cfun->machine->frame.wb_candidate2))
4393 continue;
4395 if (cfun->machine->reg_is_wrapped_separately[regno])
4396 continue;
4398 reg = gen_rtx_REG (mode, regno);
4399 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4400 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4401 offset));
4403 regno2 = aarch64_next_callee_save (regno + 1, limit);
4405 if (regno2 <= limit
4406 && !cfun->machine->reg_is_wrapped_separately[regno2]
4407 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4408 == cfun->machine->frame.reg_offset[regno2]))
4411 rtx reg2 = gen_rtx_REG (mode, regno2);
4412 rtx mem2;
4414 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4415 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4416 offset));
4417 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4418 reg2));
4420 /* The first part of a frame-related parallel insn is
4421 always assumed to be relevant to the frame
4422 calculations; subsequent parts are only
4423 frame-related if explicitly marked. */
4424 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4425 regno = regno2;
4427 else
4428 insn = emit_move_insn (mem, reg);
4430 RTX_FRAME_RELATED_P (insn) = 1;
4434 /* Emit code to restore the callee registers of mode MODE from register
4435 number START up to and including LIMIT. Restore from the stack offset
4436 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4437 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4439 static void
4440 aarch64_restore_callee_saves (machine_mode mode,
4441 poly_int64 start_offset, unsigned start,
4442 unsigned limit, bool skip_wb, rtx *cfi_ops)
4444 rtx base_rtx = stack_pointer_rtx;
4445 unsigned regno;
4446 unsigned regno2;
4447 poly_int64 offset;
4449 for (regno = aarch64_next_callee_save (start, limit);
4450 regno <= limit;
4451 regno = aarch64_next_callee_save (regno + 1, limit))
4453 if (cfun->machine->reg_is_wrapped_separately[regno])
4454 continue;
4456 rtx reg, mem;
4458 if (skip_wb
4459 && (regno == cfun->machine->frame.wb_candidate1
4460 || regno == cfun->machine->frame.wb_candidate2))
4461 continue;
4463 reg = gen_rtx_REG (mode, regno);
4464 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4465 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4467 regno2 = aarch64_next_callee_save (regno + 1, limit);
4469 if (regno2 <= limit
4470 && !cfun->machine->reg_is_wrapped_separately[regno2]
4471 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4472 == cfun->machine->frame.reg_offset[regno2]))
4474 rtx reg2 = gen_rtx_REG (mode, regno2);
4475 rtx mem2;
4477 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4478 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4479 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4481 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4482 regno = regno2;
4484 else
4485 emit_move_insn (reg, mem);
4486 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4490 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4491 of MODE. */
4493 static inline bool
4494 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4496 HOST_WIDE_INT multiple;
4497 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4498 && IN_RANGE (multiple, -8, 7));
4501 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4502 of MODE. */
4504 static inline bool
4505 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4507 HOST_WIDE_INT multiple;
4508 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4509 && IN_RANGE (multiple, 0, 63));
4512 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4513 of MODE. */
4515 bool
4516 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4518 HOST_WIDE_INT multiple;
4519 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4520 && IN_RANGE (multiple, -64, 63));
4523 /* Return true if OFFSET is a signed 9-bit value. */
4525 static inline bool
4526 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4527 poly_int64 offset)
4529 HOST_WIDE_INT const_offset;
4530 return (offset.is_constant (&const_offset)
4531 && IN_RANGE (const_offset, -256, 255));
4534 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4535 of MODE. */
4537 static inline bool
4538 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4540 HOST_WIDE_INT multiple;
4541 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4542 && IN_RANGE (multiple, -256, 255));
4545 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4546 of MODE. */
4548 static inline bool
4549 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4551 HOST_WIDE_INT multiple;
4552 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4553 && IN_RANGE (multiple, 0, 4095));
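/* For example (illustrative): with MODE == DImode the 12-bit unsigned scaled
   form above accepts byte offsets 0, 8, 16, ..., 32760 (4095 * 8), matching
   the LDR/STR "[base, #imm]" addressing range for 8-byte accesses.  */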
4556 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4558 static sbitmap
4559 aarch64_get_separate_components (void)
4561 aarch64_layout_frame ();
4563 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4564 bitmap_clear (components);
4566 /* The registers we need saved to the frame. */
4567 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4568 if (aarch64_register_saved_on_entry (regno))
4570 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4571 if (!frame_pointer_needed)
4572 offset += cfun->machine->frame.frame_size
4573 - cfun->machine->frame.hard_fp_offset;
4574 /* Check that we can access the stack slot of the register with one
4575 direct load with no adjustments needed. */
4576 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4577 bitmap_set_bit (components, regno);
4580 /* Don't mess with the hard frame pointer. */
4581 if (frame_pointer_needed)
4582 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4584 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4585 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4586 /* If aarch64_layout_frame has chosen registers to store/restore with
4587 writeback don't interfere with them to avoid having to output explicit
4588 stack adjustment instructions. */
4589 if (reg2 != INVALID_REGNUM)
4590 bitmap_clear_bit (components, reg2);
4591 if (reg1 != INVALID_REGNUM)
4592 bitmap_clear_bit (components, reg1);
4594 bitmap_clear_bit (components, LR_REGNUM);
4595 bitmap_clear_bit (components, SP_REGNUM);
4597 return components;
4600 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4602 static sbitmap
4603 aarch64_components_for_bb (basic_block bb)
4605 bitmap in = DF_LIVE_IN (bb);
4606 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4607 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4609 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4610 bitmap_clear (components);
4612 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4613 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4614 if ((!call_used_regs[regno])
4615 && (bitmap_bit_p (in, regno)
4616 || bitmap_bit_p (gen, regno)
4617 || bitmap_bit_p (kill, regno)))
4619 unsigned regno2, offset, offset2;
4620 bitmap_set_bit (components, regno);
4622 /* If there is a callee-save at an adjacent offset, add it as well
4623 to increase the use of LDP/STP. */
4624 offset = cfun->machine->frame.reg_offset[regno];
4625 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4627 if (regno2 <= LAST_SAVED_REGNUM)
4629 offset2 = cfun->machine->frame.reg_offset[regno2];
4630 if ((offset & ~8) == (offset2 & ~8))
4631 bitmap_set_bit (components, regno2);
4635 return components;
4638 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4639 Nothing to do for aarch64. */
4641 static void
4642 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4646 /* Return the next set bit in BMP from START onwards. Return the total number
4647 of bits in BMP if no set bit is found at or after START. */
4649 static unsigned int
4650 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4652 unsigned int nbits = SBITMAP_SIZE (bmp);
4653 if (start == nbits)
4654 return start;
4656 gcc_assert (start < nbits);
4657 for (unsigned int i = start; i < nbits; i++)
4658 if (bitmap_bit_p (bmp, i))
4659 return i;
4661 return nbits;
4664 /* Do the work for aarch64_emit_prologue_components and
4665 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4666 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4667 for these components or the epilogue sequence. That is, it determines
4668 whether we should emit stores or loads and what kind of CFA notes to attach
4669 to the insns. Otherwise the logic for the two sequences is very
4670 similar. */
4672 static void
4673 aarch64_process_components (sbitmap components, bool prologue_p)
4675 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4676 ? HARD_FRAME_POINTER_REGNUM
4677 : STACK_POINTER_REGNUM);
4679 unsigned last_regno = SBITMAP_SIZE (components);
4680 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4681 rtx_insn *insn = NULL;
4683 while (regno != last_regno)
4685 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
4686 so DFmode for the vector registers is enough. */
4687 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4688 rtx reg = gen_rtx_REG (mode, regno);
4689 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4690 if (!frame_pointer_needed)
4691 offset += cfun->machine->frame.frame_size
4692 - cfun->machine->frame.hard_fp_offset;
4693 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4694 rtx mem = gen_frame_mem (mode, addr);
4696 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4697 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4698 /* No more registers to handle after REGNO.
4699 Emit a single save/restore and exit. */
4700 if (regno2 == last_regno)
4702 insn = emit_insn (set);
4703 RTX_FRAME_RELATED_P (insn) = 1;
4704 if (prologue_p)
4705 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4706 else
4707 add_reg_note (insn, REG_CFA_RESTORE, reg);
4708 break;
4711 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4712 /* The next register is not of the same class or its offset is not
4713 mergeable with the current one into a pair. */
4714 if (!satisfies_constraint_Ump (mem)
4715 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4716 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4717 GET_MODE_SIZE (mode)))
4719 insn = emit_insn (set);
4720 RTX_FRAME_RELATED_P (insn) = 1;
4721 if (prologue_p)
4722 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4723 else
4724 add_reg_note (insn, REG_CFA_RESTORE, reg);
4726 regno = regno2;
4727 continue;
4730 /* REGNO2 can be saved/restored in a pair with REGNO. */
4731 rtx reg2 = gen_rtx_REG (mode, regno2);
4732 if (!frame_pointer_needed)
4733 offset2 += cfun->machine->frame.frame_size
4734 - cfun->machine->frame.hard_fp_offset;
4735 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4736 rtx mem2 = gen_frame_mem (mode, addr2);
4737 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4738 : gen_rtx_SET (reg2, mem2);
4740 if (prologue_p)
4741 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4742 else
4743 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4745 RTX_FRAME_RELATED_P (insn) = 1;
4746 if (prologue_p)
4748 add_reg_note (insn, REG_CFA_OFFSET, set);
4749 add_reg_note (insn, REG_CFA_OFFSET, set2);
4751 else
4753 add_reg_note (insn, REG_CFA_RESTORE, reg);
4754 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4757 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4761 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4763 static void
4764 aarch64_emit_prologue_components (sbitmap components)
4766 aarch64_process_components (components, true);
4769 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4771 static void
4772 aarch64_emit_epilogue_components (sbitmap components)
4774 aarch64_process_components (components, false);
4777 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4779 static void
4780 aarch64_set_handled_components (sbitmap components)
4782 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4783 if (bitmap_bit_p (components, regno))
4784 cfun->machine->reg_is_wrapped_separately[regno] = true;
4787 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4788 is saved at BASE + OFFSET. */
4790 static void
4791 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4792 rtx base, poly_int64 offset)
4794 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4795 add_reg_note (insn, REG_CFA_EXPRESSION,
4796 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4799 /* AArch64 stack frames generated by this compiler look like:
4801 +-------------------------------+
4803 | incoming stack arguments |
4805 +-------------------------------+
4806 | | <-- incoming stack pointer (aligned)
4807 | callee-allocated save area |
4808 | for register varargs |
4810 +-------------------------------+
4811 | local variables | <-- frame_pointer_rtx
4813 +-------------------------------+
4814 | padding0 | \
4815 +-------------------------------+ |
4816 | callee-saved registers | | frame.saved_regs_size
4817 +-------------------------------+ |
4818 | LR' | |
4819 +-------------------------------+ |
4820 | FP' | / <- hard_frame_pointer_rtx (aligned)
4821 +-------------------------------+
4822 | dynamic allocation |
4823 +-------------------------------+
4824 | padding |
4825 +-------------------------------+
4826 | outgoing stack arguments | <-- arg_pointer
4828 +-------------------------------+
4829 | | <-- stack_pointer_rtx (aligned)
4831 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4832 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4833 unchanged. */
4835 /* Generate the prologue instructions for entry into a function.
4836 Establish the stack frame by decreasing the stack pointer with a
4837 properly calculated size and, if necessary, create a frame record
4838 filled with the values of LR and previous frame pointer. The
4839 current FP is also set up if it is in use. */
4841 void
4842 aarch64_expand_prologue (void)
4844 aarch64_layout_frame ();
4846 poly_int64 frame_size = cfun->machine->frame.frame_size;
4847 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4848 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4849 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4850 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4851 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4852 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4853 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4854 rtx_insn *insn;
4856 /* Sign return address for functions. */
4857 if (aarch64_return_address_signing_enabled ())
4859 insn = emit_insn (gen_pacisp ());
4860 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4861 RTX_FRAME_RELATED_P (insn) = 1;
4864 if (flag_stack_usage_info)
4865 current_function_static_stack_size = constant_lower_bound (frame_size);
4867 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4869 if (crtl->is_leaf && !cfun->calls_alloca)
4871 if (maybe_gt (frame_size, PROBE_INTERVAL)
4872 && maybe_gt (frame_size, get_stack_check_protect ()))
4873 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4874 (frame_size
4875 - get_stack_check_protect ()));
4877 else if (maybe_gt (frame_size, 0))
4878 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4881 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4882 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4884 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4886 if (callee_adjust != 0)
4887 aarch64_push_regs (reg1, reg2, callee_adjust);
4889 if (emit_frame_chain)
4891 poly_int64 reg_offset = callee_adjust;
4892 if (callee_adjust == 0)
4894 reg1 = R29_REGNUM;
4895 reg2 = R30_REGNUM;
4896 reg_offset = callee_offset;
4897 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4899 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4900 stack_pointer_rtx, callee_offset,
4901 ip1_rtx, ip0_rtx, frame_pointer_needed);
4902 if (frame_pointer_needed && !frame_size.is_constant ())
4904 /* Variable-sized frames need to describe the save slot
4905 address using DW_CFA_expression rather than DW_CFA_offset.
4906 This means that, without taking further action, the
4907 locations of the registers that we've already saved would
4908 remain based on the stack pointer even after we redefine
4909 the CFA based on the frame pointer. We therefore need new
4910 DW_CFA_expressions to re-express the save slots with addresses
4911 based on the frame pointer. */
4912 rtx_insn *insn = get_last_insn ();
4913 gcc_assert (RTX_FRAME_RELATED_P (insn));
4915 /* Add an explicit CFA definition if this was previously
4916 implicit. */
4917 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4919 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4920 callee_offset);
4921 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4922 gen_rtx_SET (hard_frame_pointer_rtx, src));
4925 /* Change the save slot expressions for the registers that
4926 we've already saved. */
4927 reg_offset -= callee_offset;
4928 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4929 reg_offset + UNITS_PER_WORD);
4930 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4931 reg_offset);
4933 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4936 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4937 callee_adjust != 0 || emit_frame_chain);
4938 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4939 callee_adjust != 0 || emit_frame_chain);
4940 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4943 /* Return TRUE if we can use a simple_return insn.
4945 This function checks whether the callee-saved stack is empty, which
4946 means no restore actions are needed. The pro_and_epilogue pass uses
4947 this to check whether the shrink-wrapping optimization is feasible. */
4949 bool
4950 aarch64_use_return_insn_p (void)
4952 if (!reload_completed)
4953 return false;
4955 if (crtl->profile)
4956 return false;
4958 aarch64_layout_frame ();
4960 return known_eq (cfun->machine->frame.frame_size, 0);
4963 /* Generate the epilogue instructions for returning from a function.
4964 This is almost exactly the reverse of the prologue sequence, except
4965 that we need to insert barriers to avoid scheduling loads that read
4966 from a deallocated stack, and we optimize the unwind records by
4967 emitting them all together if possible. */
4968 void
4969 aarch64_expand_epilogue (bool for_sibcall)
4971 aarch64_layout_frame ();
4973 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4974 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4975 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4976 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4977 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4978 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4979 rtx cfi_ops = NULL;
4980 rtx_insn *insn;
4981 /* A stack clash protection prologue may not have left IP0_REGNUM or
4982 IP1_REGNUM in a usable state. The same is true for allocations
4983 with an SVE component, since we then need both temporary registers
4984 for each allocation. */
4985 bool can_inherit_p = (initial_adjust.is_constant ()
4986 && final_adjust.is_constant ()
4987 && !flag_stack_clash_protection);
4989 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4990 bool need_barrier_p
4991 = maybe_ne (get_frame_size ()
4992 + cfun->machine->frame.saved_varargs_size, 0);
4994 /* Emit a barrier to prevent loads from a deallocated stack. */
4995 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4996 || cfun->calls_alloca
4997 || crtl->calls_eh_return)
4999 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5000 need_barrier_p = false;
5003 /* Restore the stack pointer from the frame pointer if it may not
5004 be the same as the stack pointer. */
5005 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5006 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5007 if (frame_pointer_needed
5008 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5009 /* If writeback is used when restoring callee-saves, the CFA
5010 is restored on the instruction doing the writeback. */
5011 aarch64_add_offset (Pmode, stack_pointer_rtx,
5012 hard_frame_pointer_rtx, -callee_offset,
5013 ip1_rtx, ip0_rtx, callee_adjust == 0);
5014 else
5015 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5016 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5018 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5019 callee_adjust != 0, &cfi_ops);
5020 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5021 callee_adjust != 0, &cfi_ops);
5023 if (need_barrier_p)
5024 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5026 if (callee_adjust != 0)
5027 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5029 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5031 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5032 insn = get_last_insn ();
5033 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5034 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5035 RTX_FRAME_RELATED_P (insn) = 1;
5036 cfi_ops = NULL;
5039 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5040 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5042 if (cfi_ops)
5044 /* Emit delayed restores and reset the CFA to be SP. */
5045 insn = get_last_insn ();
5046 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5047 REG_NOTES (insn) = cfi_ops;
5048 RTX_FRAME_RELATED_P (insn) = 1;
5051 /* We prefer to emit the combined return/authenticate instruction RETAA;
5052 however, there are three cases in which we must instead emit an explicit
5053 authentication instruction.
5055 1) Sibcalls don't return in a normal way, so if we're about to call one
5056 we must authenticate.
5058 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5059 generating code for !TARGET_ARMV8_3 we can't use it and must
5060 explicitly authenticate.
5062 3) On an eh_return path we make extra stack adjustments to update the
5063 canonical frame address to be the exception handler's CFA. We want
5064 to authenticate using the CFA of the function which calls eh_return. */
5066 if (aarch64_return_address_signing_enabled ()
5067 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5069 insn = emit_insn (gen_autisp ());
5070 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5071 RTX_FRAME_RELATED_P (insn) = 1;
5074 /* Stack adjustment for exception handler. */
5075 if (crtl->calls_eh_return)
5077 /* We need to unwind the stack by the offset computed by
5078 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5079 to be SP; letting the CFA move during this adjustment
5080 is just as correct as retaining the CFA from the body
5081 of the function. Therefore, do nothing special. */
5082 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5085 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5086 if (!for_sibcall)
5087 emit_jump_insn (ret_rtx);
5090 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5091 normally or return to a previous frame after unwinding.
5093 An EH return uses a single shared return sequence. The epilogue is
5094 exactly like a normal epilogue except that it has an extra input
5095 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5096 that must be applied after the frame has been destroyed. An extra label
5097 is inserted before the epilogue which initializes this register to zero,
5098 and this is the entry point for a normal return.
5100 An actual EH return updates the return address, initializes the stack
5101 adjustment and jumps directly into the epilogue (bypassing the zeroing
5102 of the adjustment). Since the return address is typically saved on the
5103 stack when a function makes a call, the saved LR must be updated outside
5104 the epilogue.
5106 This poses problems as the store is generated well before the epilogue,
5107 so the offset of LR is not known yet. Also optimizations will remove the
5108 store as it appears dead, even after the epilogue is generated (as the
5109 base or offset for loading LR is different in many cases).
5111 To avoid these problems this implementation forces the frame pointer
5112 in eh_return functions so that the location of LR is fixed and known early.
5113 It also marks the store volatile, so no optimization is permitted to
5114 remove the store. */
5115 rtx
5116 aarch64_eh_return_handler_rtx (void)
5118 rtx tmp = gen_frame_mem (Pmode,
5119 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5121 /* Mark the store volatile, so no optimization is permitted to remove it. */
5122 MEM_VOLATILE_P (tmp) = true;
5123 return tmp;
5126 /* Output code to add DELTA to the first argument, and then jump
5127 to FUNCTION. Used for C++ multiple inheritance. */
5128 static void
5129 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5130 HOST_WIDE_INT delta,
5131 HOST_WIDE_INT vcall_offset,
5132 tree function)
5134 /* The this pointer is always in x0. Note that this differs from
5135 Arm where the this pointer may be bumped to r1 if r0 is required
5136 to return a pointer to an aggregate. On AArch64 a result value
5137 pointer will be in x8. */
5138 int this_regno = R0_REGNUM;
5139 rtx this_rtx, temp0, temp1, addr, funexp;
5140 rtx_insn *insn;
5142 reload_completed = 1;
5143 emit_note (NOTE_INSN_PROLOGUE_END);
5145 this_rtx = gen_rtx_REG (Pmode, this_regno);
5146 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5147 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5149 if (vcall_offset == 0)
5150 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5151 else
5153 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5155 addr = this_rtx;
5156 if (delta != 0)
5158 if (delta >= -256 && delta < 256)
5159 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5160 plus_constant (Pmode, this_rtx, delta));
5161 else
5162 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5163 temp1, temp0, false);
5166 if (Pmode == ptr_mode)
5167 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5168 else
5169 aarch64_emit_move (temp0,
5170 gen_rtx_ZERO_EXTEND (Pmode,
5171 gen_rtx_MEM (ptr_mode, addr)));
5173 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5174 addr = plus_constant (Pmode, temp0, vcall_offset);
5175 else
5177 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5178 Pmode);
5179 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5182 if (Pmode == ptr_mode)
5183 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5184 else
5185 aarch64_emit_move (temp1,
5186 gen_rtx_SIGN_EXTEND (Pmode,
5187 gen_rtx_MEM (ptr_mode, addr)));
5189 emit_insn (gen_add2_insn (this_rtx, temp1));
5192 /* Generate a tail call to the target function. */
5193 if (!TREE_USED (function))
5195 assemble_external (function);
5196 TREE_USED (function) = 1;
5198 funexp = XEXP (DECL_RTL (function), 0);
5199 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5200 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5201 SIBLING_CALL_P (insn) = 1;
5203 insn = get_insns ();
5204 shorten_branches (insn);
5205 final_start_function (insn, file, 1);
5206 final (insn, file, 1);
5207 final_end_function ();
5209 /* Stop pretending to be a post-reload pass. */
5210 reload_completed = 0;
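/* Illustrative example (not from the original source): for a non-virtual
   thunk with DELTA == 8 and VCALL_OFFSET == 0 the code above effectively
   emits

	add	x0, x0, 8
	b	<function>

   i.e. it bumps the incoming 'this' pointer in x0 and tail-calls the
   target.  */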
5213 static bool
5214 aarch64_tls_referenced_p (rtx x)
5216 if (!TARGET_HAVE_TLS)
5217 return false;
5218 subrtx_iterator::array_type array;
5219 FOR_EACH_SUBRTX (iter, array, x, ALL)
5221 const_rtx x = *iter;
5222 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5223 return true;
5224 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5225 TLS offsets, not real symbol references. */
5226 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5227 iter.skip_subrtxes ();
5229 return false;
5233 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5234 a left shift of 0 or 12 bits. */
5235 bool
5236 aarch64_uimm12_shift (HOST_WIDE_INT val)
5238 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5239 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
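/* For instance (illustrative): 0xfff and 0xfff000 both satisfy the test
   above (12 bits at shift 0 or 12), so they can be used directly in
   ADD/SUB "#imm12 {, lsl #12}" forms, whereas 0x1001 does not because its
   set bits straddle the two 12-bit fields.  */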
5244 /* Return true if val is an immediate that can be loaded into a
5245 register by a MOVZ instruction. */
5246 static bool
5247 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5249 if (GET_MODE_SIZE (mode) > 4)
5251 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5252 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5253 return 1;
5255 else
5257 /* Ignore sign extension. */
5258 val &= (HOST_WIDE_INT) 0xffffffff;
5260 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5261 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5264 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5265 64-bit (DImode) integer. */
5267 static unsigned HOST_WIDE_INT
5268 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5270 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5271 while (size < 64)
5273 val &= (HOST_WIDE_INT_1U << size) - 1;
5274 val |= val << size;
5275 size *= 2;
5277 return val;
5280 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5282 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5284 0x0000000100000001ull,
5285 0x0001000100010001ull,
5286 0x0101010101010101ull,
5287 0x1111111111111111ull,
5288 0x5555555555555555ull,
5292 /* Return true if val is a valid bitmask immediate. */
5294 bool
5295 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5297 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5298 int bits;
5300 /* Check for a single sequence of one bits and return quickly if so.
5301 The special cases of all ones and all zeroes return false.
5302 val = aarch64_replicate_bitmask_imm (val_in, mode);
5303 tmp = val + (val & -val);
5305 if (tmp == (tmp & -tmp))
5306 return (val + 1) > 1;
5308 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5309 if (mode == SImode)
5310 val = (val << 32) | (val & 0xffffffff);
5312 /* Invert if the immediate doesn't start with a zero bit - this means we
5313 only need to search for sequences of one bits. */
5314 if (val & 1)
5315 val = ~val;
5317 /* Find the first set bit and set tmp to val with the first sequence of one
5318 bits removed. Return success if there is a single sequence of ones. */
5319 first_one = val & -val;
5320 tmp = val & (val + first_one);
5322 if (tmp == 0)
5323 return true;
5325 /* Find the next set bit and compute the difference in bit position. */
5326 next_one = tmp & -tmp;
5327 bits = clz_hwi (first_one) - clz_hwi (next_one);
5328 mask = val ^ tmp;
5330 /* Check the bit position difference is a power of 2, and that the first
5331 sequence of one bits fits within 'bits' bits. */
5332 if ((mask >> bits) != 0 || bits != (bits & -bits))
5333 return false;
5335 /* Check the sequence of one bits is repeated 64/bits times. */
5336 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
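/* Worked example (illustrative): 0x00ff00ff00ff00ffULL in DImode is
   accepted -- a run of eight ones repeated with period 16 -- while
   0x0000ff00ff000000ULL is rejected because its two runs of ones do not
   form a repeating pattern across the full 64 bits (the
   "(mask >> bits) != 0" check above fails).  */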
5339 /* Create a mask of ones covering the range from the lowest to the highest
5340 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
5342 unsigned HOST_WIDE_INT
5343 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5345 int lowest_bit_set = ctz_hwi (val_in);
5346 int highest_bit_set = floor_log2 (val_in);
5347 gcc_assert (val_in != 0);
5349 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5350 (HOST_WIDE_INT_1U << lowest_bit_set));
5353 /* Create a constant in which all bits outside the range from the lowest
5354 set bit to the highest set bit of VAL_IN are set to 1. */
5356 unsigned HOST_WIDE_INT
5357 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5359 return val_in | ~aarch64_and_split_imm1 (val_in);
5362 /* Return true if VAL_IN should be expanded as two 'and' bitmask immediates. */
5364 bool
5365 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5367 scalar_int_mode int_mode;
5368 if (!is_a <scalar_int_mode> (mode, &int_mode))
5369 return false;
5371 if (aarch64_bitmask_imm (val_in, int_mode))
5372 return false;
5374 if (aarch64_move_imm (val_in, int_mode))
5375 return false;
5377 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5379 return aarch64_bitmask_imm (imm2, int_mode);
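/* Worked example (illustrative): VAL_IN == 0x0000ff00ff000000 is neither a
   bitmask nor a MOV immediate, but aarch64_and_split_imm1 gives
   0x0000ffffff000000 and aarch64_and_split_imm2 gives 0xffffff00ffffffff,
   both valid bitmask immediates, and (x & imm1) & imm2 == x & VAL_IN, so
   the AND can be done as two AND-immediate instructions.  */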
5382 /* Return true if val is an immediate that can be loaded into a
5383 register in a single instruction. */
5384 bool
5385 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5387 scalar_int_mode int_mode;
5388 if (!is_a <scalar_int_mode> (mode, &int_mode))
5389 return false;
5391 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5392 return 1;
5393 return aarch64_bitmask_imm (val, int_mode);
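/* For instance (illustrative): 0x2a is a MOVZ immediate,
   0x5555555555555555 is a bitmask immediate (so a single MOV/ORR
   suffices), while 0x0000ff00ff000000 is neither and needs the
   multi-instruction sequence built by aarch64_internal_mov_immediate.  */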
5396 static bool
5397 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5399 rtx base, offset;
5401 if (GET_CODE (x) == HIGH)
5402 return true;
5404 /* There's no way to calculate VL-based values using relocations. */
5405 subrtx_iterator::array_type array;
5406 FOR_EACH_SUBRTX (iter, array, x, ALL)
5407 if (GET_CODE (*iter) == CONST_POLY_INT)
5408 return true;
5410 split_const (x, &base, &offset);
5411 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5413 if (aarch64_classify_symbol (base, INTVAL (offset))
5414 != SYMBOL_FORCE_TO_MEM)
5415 return true;
5416 else
5417 /* Avoid generating a 64-bit relocation in ILP32; leave it
5418 to aarch64_expand_mov_immediate to handle properly. */
5419 return mode != ptr_mode;
5422 return aarch64_tls_referenced_p (x);
5425 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5426 The expansion for a table switch is quite expensive due to the number
5427 of instructions, the table lookup and the hard-to-predict indirect jump.
5428 When optimizing for speed at -O3 and above, use the per-core tuning if
5429 set; otherwise use tables for more than 16 cases as a tradeoff between size and
5430 performance. When optimizing for size, use the default setting. */
5432 static unsigned int
5433 aarch64_case_values_threshold (void)
5435 /* Use the specified limit for the number of cases before using jump
5436 tables at higher optimization levels. */
5437 if (optimize > 2
5438 && selected_cpu->tune->max_case_values != 0)
5439 return selected_cpu->tune->max_case_values;
5440 else
5441 return optimize_size ? default_case_values_threshold () : 17;
5444 /* Return true if register REGNO is a valid index register.
5445 STRICT_P is true if REG_OK_STRICT is in effect. */
5447 bool
5448 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5450 if (!HARD_REGISTER_NUM_P (regno))
5452 if (!strict_p)
5453 return true;
5455 if (!reg_renumber)
5456 return false;
5458 regno = reg_renumber[regno];
5460 return GP_REGNUM_P (regno);
5463 /* Return true if register REGNO is a valid base register.
5464 STRICT_P is true if REG_OK_STRICT is in effect. */
5466 bool
5467 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5469 if (!HARD_REGISTER_NUM_P (regno))
5471 if (!strict_p)
5472 return true;
5474 if (!reg_renumber)
5475 return false;
5477 regno = reg_renumber[regno];
5480 /* The fake registers will be eliminated to either the stack or
5481 hard frame pointer, both of which are usually valid base registers.
5482 Reload deals with the cases where the eliminated form isn't valid. */
5483 return (GP_REGNUM_P (regno)
5484 || regno == SP_REGNUM
5485 || regno == FRAME_POINTER_REGNUM
5486 || regno == ARG_POINTER_REGNUM);
5489 /* Return true if X is a valid base register.
5490 STRICT_P is true if REG_OK_STRICT is in effect. */
5492 static bool
5493 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5495 if (!strict_p
5496 && GET_CODE (x) == SUBREG
5497 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5498 x = SUBREG_REG (x);
5500 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5503 /* Return true if address offset is a valid index. If it is, fill in INFO
5504 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5506 static bool
5507 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5508 machine_mode mode, bool strict_p)
5510 enum aarch64_address_type type;
5511 rtx index;
5512 int shift;
5514 /* (reg:P) */
5515 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5516 && GET_MODE (x) == Pmode)
5518 type = ADDRESS_REG_REG;
5519 index = x;
5520 shift = 0;
5522 /* (sign_extend:DI (reg:SI)) */
5523 else if ((GET_CODE (x) == SIGN_EXTEND
5524 || GET_CODE (x) == ZERO_EXTEND)
5525 && GET_MODE (x) == DImode
5526 && GET_MODE (XEXP (x, 0)) == SImode)
5528 type = (GET_CODE (x) == SIGN_EXTEND)
5529 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5530 index = XEXP (x, 0);
5531 shift = 0;
5533 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5534 else if (GET_CODE (x) == MULT
5535 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5536 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5537 && GET_MODE (XEXP (x, 0)) == DImode
5538 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5539 && CONST_INT_P (XEXP (x, 1)))
5541 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5542 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5543 index = XEXP (XEXP (x, 0), 0);
5544 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5546 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5547 else if (GET_CODE (x) == ASHIFT
5548 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5549 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5550 && GET_MODE (XEXP (x, 0)) == DImode
5551 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5552 && CONST_INT_P (XEXP (x, 1)))
5554 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5555 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5556 index = XEXP (XEXP (x, 0), 0);
5557 shift = INTVAL (XEXP (x, 1));
5559 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5560 else if ((GET_CODE (x) == SIGN_EXTRACT
5561 || GET_CODE (x) == ZERO_EXTRACT)
5562 && GET_MODE (x) == DImode
5563 && GET_CODE (XEXP (x, 0)) == MULT
5564 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5565 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5567 type = (GET_CODE (x) == SIGN_EXTRACT)
5568 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5569 index = XEXP (XEXP (x, 0), 0);
5570 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5571 if (INTVAL (XEXP (x, 1)) != 32 + shift
5572 || INTVAL (XEXP (x, 2)) != 0)
5573 shift = -1;
5575 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5576 (const_int 0xffffffff<<shift)) */
5577 else if (GET_CODE (x) == AND
5578 && GET_MODE (x) == DImode
5579 && GET_CODE (XEXP (x, 0)) == MULT
5580 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5581 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5582 && CONST_INT_P (XEXP (x, 1)))
5584 type = ADDRESS_REG_UXTW;
5585 index = XEXP (XEXP (x, 0), 0);
5586 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5587 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5588 shift = -1;
5590 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5591 else if ((GET_CODE (x) == SIGN_EXTRACT
5592 || GET_CODE (x) == ZERO_EXTRACT)
5593 && GET_MODE (x) == DImode
5594 && GET_CODE (XEXP (x, 0)) == ASHIFT
5595 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5596 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5598 type = (GET_CODE (x) == SIGN_EXTRACT)
5599 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5600 index = XEXP (XEXP (x, 0), 0);
5601 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5602 if (INTVAL (XEXP (x, 1)) != 32 + shift
5603 || INTVAL (XEXP (x, 2)) != 0)
5604 shift = -1;
5606 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5607 (const_int 0xffffffff<<shift)) */
5608 else if (GET_CODE (x) == AND
5609 && GET_MODE (x) == DImode
5610 && GET_CODE (XEXP (x, 0)) == ASHIFT
5611 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5612 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5613 && CONST_INT_P (XEXP (x, 1)))
5615 type = ADDRESS_REG_UXTW;
5616 index = XEXP (XEXP (x, 0), 0);
5617 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5618 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5619 shift = -1;
5621 /* (mult:P (reg:P) (const_int scale)) */
5622 else if (GET_CODE (x) == MULT
5623 && GET_MODE (x) == Pmode
5624 && GET_MODE (XEXP (x, 0)) == Pmode
5625 && CONST_INT_P (XEXP (x, 1)))
5627 type = ADDRESS_REG_REG;
5628 index = XEXP (x, 0);
5629 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5631 /* (ashift:P (reg:P) (const_int shift)) */
5632 else if (GET_CODE (x) == ASHIFT
5633 && GET_MODE (x) == Pmode
5634 && GET_MODE (XEXP (x, 0)) == Pmode
5635 && CONST_INT_P (XEXP (x, 1)))
5637 type = ADDRESS_REG_REG;
5638 index = XEXP (x, 0);
5639 shift = INTVAL (XEXP (x, 1));
5641 else
5642 return false;
5644 if (!strict_p
5645 && GET_CODE (index) == SUBREG
5646 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5647 index = SUBREG_REG (index);
5649 if (aarch64_sve_data_mode_p (mode))
5651 if (type != ADDRESS_REG_REG
5652 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5653 return false;
5655 else
5657 if (shift != 0
5658 && !(IN_RANGE (shift, 1, 3)
5659 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5660 return false;
5663 if (REG_P (index)
5664 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5666 info->type = type;
5667 info->offset = index;
5668 info->shift = shift;
5669 return true;
5672 return false;
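/* A few illustrative index forms that the function above accepts,
   assuming a 64-bit Pmode (register numbers are only examples):

     (reg:DI x1)                                -> ADDRESS_REG_REG, shift 0
     (ashift:DI (reg:DI x1) (const_int 3))      -> ADDRESS_REG_REG, shift 3
     (sign_extend:DI (reg:SI w1))               -> ADDRESS_REG_SXTW, shift 0
     (mult:DI (zero_extend:DI (reg:SI w1))
              (const_int 4))                    -> ADDRESS_REG_UXTW, shift 2

   These correspond to assembly index operands such as [x0, x1],
   [x0, x1, lsl #3], [x0, w1, sxtw] and [x0, w1, uxtw #2].  A non-zero
   shift is only accepted when it matches the access size (or the element
   size for SVE data modes).  */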
5675 /* Return true if MODE is one of the modes for which we
5676 support LDP/STP operations. */
5678 static bool
5679 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5681 return mode == SImode || mode == DImode
5682 || mode == SFmode || mode == DFmode
5683 || (aarch64_vector_mode_supported_p (mode)
5684 && known_eq (GET_MODE_SIZE (mode), 8));
5687 /* Return true if REGNO is a virtual pointer register, or an eliminable
5688 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5689 include stack_pointer or hard_frame_pointer. */
5690 static bool
5691 virt_or_elim_regno_p (unsigned regno)
5693 return ((regno >= FIRST_VIRTUAL_REGISTER
5694 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5695 || regno == FRAME_POINTER_REGNUM
5696 || regno == ARG_POINTER_REGNUM);
5699 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5700 If it is, fill in INFO appropriately. STRICT_P is true if
5701 REG_OK_STRICT is in effect. */
5703 static bool
5704 aarch64_classify_address (struct aarch64_address_info *info,
5705 rtx x, machine_mode mode, bool strict_p,
5706 aarch64_addr_query_type type = ADDR_QUERY_M)
5708 enum rtx_code code = GET_CODE (x);
5709 rtx op0, op1;
5710 poly_int64 offset;
5712 HOST_WIDE_INT const_size;
5714 /* On BE, we use load/store pair for all large int mode load/stores.
5715 TI/TFmode may also use a load/store pair. */
5716 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5717 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5718 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5719 || mode == TImode
5720 || mode == TFmode
5721 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5723 bool allow_reg_index_p = (!load_store_pair_p
5724 && (known_lt (GET_MODE_SIZE (mode), 16)
5725 || vec_flags == VEC_ADVSIMD
5726 || vec_flags == VEC_SVE_DATA));
5728 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5729 [Rn, #offset, MUL VL]. */
5730 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5731 && (code != REG && code != PLUS))
5732 return false;
5734 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5735 REG addressing. */
5736 if (advsimd_struct_p
5737 && !BYTES_BIG_ENDIAN
5738 && (code != POST_INC && code != REG))
5739 return false;
5741 gcc_checking_assert (GET_MODE (x) == VOIDmode
5742 || SCALAR_INT_MODE_P (GET_MODE (x)));
5744 switch (code)
5746 case REG:
5747 case SUBREG:
5748 info->type = ADDRESS_REG_IMM;
5749 info->base = x;
5750 info->offset = const0_rtx;
5751 info->const_offset = 0;
5752 return aarch64_base_register_rtx_p (x, strict_p);
5754 case PLUS:
5755 op0 = XEXP (x, 0);
5756 op1 = XEXP (x, 1);
5758 if (! strict_p
5759 && REG_P (op0)
5760 && virt_or_elim_regno_p (REGNO (op0))
5761 && poly_int_rtx_p (op1, &offset))
5763 info->type = ADDRESS_REG_IMM;
5764 info->base = op0;
5765 info->offset = op1;
5766 info->const_offset = offset;
5768 return true;
5771 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5772 && aarch64_base_register_rtx_p (op0, strict_p)
5773 && poly_int_rtx_p (op1, &offset))
5775 info->type = ADDRESS_REG_IMM;
5776 info->base = op0;
5777 info->offset = op1;
5778 info->const_offset = offset;
5780 /* TImode and TFmode values are allowed in both pairs of X
5781 registers and individual Q registers. The available
5782 address modes are:
5783 X,X: 7-bit signed scaled offset
5784 Q: 9-bit signed offset
5785 We conservatively require an offset representable in either mode.
5786 When performing the check for pairs of X registers i.e. LDP/STP
5787 pass down DImode since that is the natural size of the LDP/STP
5788 instruction memory accesses. */
5789 if (mode == TImode || mode == TFmode)
5790 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5791 && (offset_9bit_signed_unscaled_p (mode, offset)
5792 || offset_12bit_unsigned_scaled_p (mode, offset)));
5794 /* A 7-bit offset check because OImode will emit an ldp/stp
5795 instruction (only big endian will get here).
5796 For ldp/stp instructions, the offset is scaled for the size of a
5797 single element of the pair. */
5798 if (mode == OImode)
5799 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5801 /* Three 9/12-bit offset checks because CImode will emit three
5802 ldr/str instructions (only big endian will get here). */
5803 if (mode == CImode)
5804 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5805 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5806 || offset_12bit_unsigned_scaled_p (V16QImode,
5807 offset + 32)));
5809 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5810 instructions (only big endian will get here). */
5811 if (mode == XImode)
5812 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5813 && aarch64_offset_7bit_signed_scaled_p (TImode,
5814 offset + 32));
5816 /* Make "m" use the LD1 offset range for SVE data modes, so
5817 that pre-RTL optimizers like ivopts will work to that
5818 instead of the wider LDR/STR range. */
5819 if (vec_flags == VEC_SVE_DATA)
5820 return (type == ADDR_QUERY_M
5821 ? offset_4bit_signed_scaled_p (mode, offset)
5822 : offset_9bit_signed_scaled_p (mode, offset));
5824 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5826 poly_int64 end_offset = (offset
5827 + GET_MODE_SIZE (mode)
5828 - BYTES_PER_SVE_VECTOR);
5829 return (type == ADDR_QUERY_M
5830 ? offset_4bit_signed_scaled_p (mode, offset)
5831 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5832 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5833 end_offset)));
5836 if (vec_flags == VEC_SVE_PRED)
5837 return offset_9bit_signed_scaled_p (mode, offset);
5839 if (load_store_pair_p)
5840 return ((known_eq (GET_MODE_SIZE (mode), 4)
5841 || known_eq (GET_MODE_SIZE (mode), 8))
5842 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5843 else
5844 return (offset_9bit_signed_unscaled_p (mode, offset)
5845 || offset_12bit_unsigned_scaled_p (mode, offset));
5848 if (allow_reg_index_p)
5850 /* Look for base + (scaled/extended) index register. */
5851 if (aarch64_base_register_rtx_p (op0, strict_p)
5852 && aarch64_classify_index (info, op1, mode, strict_p))
5854 info->base = op0;
5855 return true;
5857 if (aarch64_base_register_rtx_p (op1, strict_p)
5858 && aarch64_classify_index (info, op0, mode, strict_p))
5860 info->base = op1;
5861 return true;
5865 return false;
5867 case POST_INC:
5868 case POST_DEC:
5869 case PRE_INC:
5870 case PRE_DEC:
5871 info->type = ADDRESS_REG_WB;
5872 info->base = XEXP (x, 0);
5873 info->offset = NULL_RTX;
5874 return aarch64_base_register_rtx_p (info->base, strict_p);
5876 case POST_MODIFY:
5877 case PRE_MODIFY:
5878 info->type = ADDRESS_REG_WB;
5879 info->base = XEXP (x, 0);
5880 if (GET_CODE (XEXP (x, 1)) == PLUS
5881 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5882 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5883 && aarch64_base_register_rtx_p (info->base, strict_p))
5885 info->offset = XEXP (XEXP (x, 1), 1);
5886 info->const_offset = offset;
5888 /* TImode and TFmode values are allowed in both pairs of X
5889 registers and individual Q registers. The available
5890 address modes are:
5891 X,X: 7-bit signed scaled offset
5892 Q: 9-bit signed offset
5893 We conservatively require an offset representable in either mode. */
5895 if (mode == TImode || mode == TFmode)
5896 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5897 && offset_9bit_signed_unscaled_p (mode, offset));
5899 if (load_store_pair_p)
5900 return ((known_eq (GET_MODE_SIZE (mode), 4)
5901 || known_eq (GET_MODE_SIZE (mode), 8))
5902 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5903 else
5904 return offset_9bit_signed_unscaled_p (mode, offset);
5906 return false;
5908 case CONST:
5909 case SYMBOL_REF:
5910 case LABEL_REF:
5911 /* load literal: pc-relative constant pool entry. Only supported
5912 for SI mode or larger. */
5913 info->type = ADDRESS_SYMBOLIC;
5915 if (!load_store_pair_p
5916 && GET_MODE_SIZE (mode).is_constant (&const_size)
5917 && const_size >= 4)
5919 rtx sym, addend;
5921 split_const (x, &sym, &addend);
5922 return ((GET_CODE (sym) == LABEL_REF
5923 || (GET_CODE (sym) == SYMBOL_REF
5924 && CONSTANT_POOL_ADDRESS_P (sym)
5925 && aarch64_pcrelative_literal_loads)));
5927 return false;
5929 case LO_SUM:
5930 info->type = ADDRESS_LO_SUM;
5931 info->base = XEXP (x, 0);
5932 info->offset = XEXP (x, 1);
5933 if (allow_reg_index_p
5934 && aarch64_base_register_rtx_p (info->base, strict_p))
5936 rtx sym, offs;
5937 split_const (info->offset, &sym, &offs);
5938 if (GET_CODE (sym) == SYMBOL_REF
5939 && (aarch64_classify_symbol (sym, INTVAL (offs))
5940 == SYMBOL_SMALL_ABSOLUTE))
5942 /* The symbol and offset must be aligned to the access size. */
5943 unsigned int align;
5945 if (CONSTANT_POOL_ADDRESS_P (sym))
5946 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5947 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5949 tree exp = SYMBOL_REF_DECL (sym);
5950 align = TYPE_ALIGN (TREE_TYPE (exp));
5951 align = aarch64_constant_alignment (exp, align);
5953 else if (SYMBOL_REF_DECL (sym))
5954 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5955 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5956 && SYMBOL_REF_BLOCK (sym) != NULL)
5957 align = SYMBOL_REF_BLOCK (sym)->alignment;
5958 else
5959 align = BITS_PER_UNIT;
5961 poly_int64 ref_size = GET_MODE_SIZE (mode);
5962 if (known_eq (ref_size, 0))
5963 ref_size = GET_MODE_SIZE (DImode);
5965 return (multiple_p (INTVAL (offs), ref_size)
5966 && multiple_p (align / BITS_PER_UNIT, ref_size));
5969 return false;
5971 default:
5972 return false;
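/* Putting the pieces together, some addresses this classification accepts
   for a DImode access (shown in assembly syntax rather than RTL; the
   register names and symbol are only examples):

     [x0]                 ADDRESS_REG_IMM, offset 0
     [x0, #256]           ADDRESS_REG_IMM, 12-bit unsigned scaled offset
     [x0, x1, lsl #3]     ADDRESS_REG_REG via aarch64_classify_index
     [x0, #:lo12:sym]     ADDRESS_LO_SUM
     [x0, #16]!           ADDRESS_REG_WB (PRE_MODIFY/PRE_INC forms)

   SVE data and predicate modes are deliberately restricted to [Rn] and
   the [Rn, #imm, mul vl] / [Rn, Rm, lsl #shift] forms described above.  */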
5976 /* Return true if the address X is valid for a PRFM instruction.
5977 STRICT_P is true if we should do strict checking with
5978 aarch64_classify_address. */
5980 bool
5981 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5983 struct aarch64_address_info addr;
5985 /* PRFM accepts the same addresses as DImode... */
5986 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5987 if (!res)
5988 return false;
5990 /* ... except writeback forms. */
5991 return addr.type != ADDRESS_REG_WB;
5994 bool
5995 aarch64_symbolic_address_p (rtx x)
5997 rtx offset;
5999 split_const (x, &x, &offset);
6000 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6003 /* Classify the base of symbolic expression X. */
6005 enum aarch64_symbol_type
6006 aarch64_classify_symbolic_expression (rtx x)
6008 rtx offset;
6010 split_const (x, &x, &offset);
6011 return aarch64_classify_symbol (x, INTVAL (offset));
6015 /* Return TRUE if X is a legitimate address for accessing memory in
6016 mode MODE. */
6017 static bool
6018 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6020 struct aarch64_address_info addr;
6022 return aarch64_classify_address (&addr, x, mode, strict_p);
6025 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6026 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6027 bool
6028 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6029 aarch64_addr_query_type type)
6031 struct aarch64_address_info addr;
6033 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6036 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6038 static bool
6039 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6040 poly_int64 orig_offset,
6041 machine_mode mode)
6043 HOST_WIDE_INT size;
6044 if (GET_MODE_SIZE (mode).is_constant (&size))
6046 HOST_WIDE_INT const_offset, second_offset;
6048 /* A general SVE offset is A * VQ + B. Remove the A component from
6049 coefficient 0 in order to get the constant B. */
6050 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6052 /* Split an out-of-range address displacement into a base and
6053 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6054 range otherwise to increase opportunities for sharing the base
6055 address of different sizes. Unaligned accesses use the signed
6056 9-bit range, TImode/TFmode use the intersection of signed
6057 scaled 7-bit and signed 9-bit offset. */
6058 if (mode == TImode || mode == TFmode)
6059 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6060 else if ((const_offset & (size - 1)) != 0)
6061 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6062 else
6063 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6065 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6066 return false;
6068 /* Split the offset into second_offset and the rest. */
6069 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6070 *offset2 = gen_int_mode (second_offset, Pmode);
6071 return true;
6073 else
6075 /* Get the mode we should use as the basis of the range. For structure
6076 modes this is the mode of one vector. */
6077 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6078 machine_mode step_mode
6079 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6081 /* Get the "mul vl" multiplier we'd like to use. */
6082 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6083 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6084 if (vec_flags & VEC_SVE_DATA)
6085 /* LDR supports a 9-bit range, but the move patterns for
6086 structure modes require all vectors to be in range of the
6087 same base. The simplest way of accommodating that while still
6088 promoting reuse of anchor points between different modes is
6089 to use an 8-bit range unconditionally. */
6090 vnum = ((vnum + 128) & 255) - 128;
6091 else
6092 /* Predicates are only handled singly, so we might as well use
6093 the full range. */
6094 vnum = ((vnum + 256) & 511) - 256;
6095 if (vnum == 0)
6096 return false;
6098 /* Convert the "mul vl" multiplier into a byte offset. */
6099 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6100 if (known_eq (second_offset, orig_offset))
6101 return false;
6103 /* Split the offset into second_offset and the rest. */
6104 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6105 *offset2 = gen_int_mode (second_offset, Pmode);
6106 return true;
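/* Worked example for the constant-size path above: a DImode access at
   X + 0x20018 has const_offset 0x20018, which is a multiple of the access
   size, so second_offset = 0x20018 & 0x3ffc = 0x18.  The displacement is
   therefore split into OFFSET1 = 0x20000, which is added to the base and
   is a good candidate for sharing between accesses, and OFFSET2 = 0x18,
   which fits the 12-bit scaled range.  */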
6110 /* Return the binary representation of floating point constant VALUE in INTVAL.
6111 If the value cannot be converted, return false without setting INTVAL.
6112 The conversion is done in the given MODE. */
6113 bool
6114 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6117 /* We make a general exception for 0. */
6118 if (aarch64_float_const_zero_rtx_p (value))
6120 *intval = 0;
6121 return true;
6124 scalar_float_mode mode;
6125 if (GET_CODE (value) != CONST_DOUBLE
6126 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6127 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6128 /* Only support up to DF mode. */
6129 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6130 return false;
6132 unsigned HOST_WIDE_INT ival = 0;
6134 long res[2];
6135 real_to_target (res,
6136 CONST_DOUBLE_REAL_VALUE (value),
6137 REAL_MODE_FORMAT (mode));
6139 if (mode == DFmode)
6141 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6142 ival = zext_hwi (res[order], 32);
6143 ival |= (zext_hwi (res[1 - order], 32) << 32);
6145 else
6146 ival = zext_hwi (res[0], 32);
6148 *intval = ival;
6149 return true;
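/* For instance, the DFmode constant 1.0 is returned as the bit pattern
   0x3ff0000000000000 and the SFmode constant 1.0 as 0x3f800000; the
   special-cased zero simply yields 0.  */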
6152 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6153 single MOV(+MOVK) followed by an FMOV. */
6154 bool
6155 aarch64_float_const_rtx_p (rtx x)
6157 machine_mode mode = GET_MODE (x);
6158 if (mode == VOIDmode)
6159 return false;
6161 /* Determine whether it's cheaper to write float constants as
6162 mov/movk pairs rather than as adrp/ldr literal loads. */
6163 unsigned HOST_WIDE_INT ival;
6165 if (GET_CODE (x) == CONST_DOUBLE
6166 && SCALAR_FLOAT_MODE_P (mode)
6167 && aarch64_reinterpret_float_as_int (x, &ival))
6169 scalar_int_mode imode = (mode == HFmode
6170 ? SImode
6171 : int_mode_for_mode (mode).require ());
6172 int num_instr = aarch64_internal_mov_immediate
6173 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6174 return num_instr < 3;
6177 return false;
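/* Continuing the example above: DFmode 1.0 reinterprets to
   0x3ff0000000000000, which a single MOVZ (#0x3ff0, lsl #48) can
   materialize.  That is fewer than three integer instructions, so the
   MOV+FMOV sequence is considered cheaper than an ADRP/LDR literal load
   and the function returns true.  */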
6180 /* Return TRUE if rtx X is the immediate constant 0.0. */
6181 bool
6182 aarch64_float_const_zero_rtx_p (rtx x)
6184 if (GET_MODE (x) == VOIDmode)
6185 return false;
6187 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6188 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6189 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6192 /* Return TRUE if rtx X is an immediate constant that fits in a single
6193 MOVI immediate operation. */
6194 bool
6195 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6197 if (!TARGET_SIMD)
6198 return false;
6200 machine_mode vmode;
6201 scalar_int_mode imode;
6202 unsigned HOST_WIDE_INT ival;
6204 if (GET_CODE (x) == CONST_DOUBLE
6205 && SCALAR_FLOAT_MODE_P (mode))
6207 if (!aarch64_reinterpret_float_as_int (x, &ival))
6208 return false;
6210 /* We make a general exception for 0. */
6211 if (aarch64_float_const_zero_rtx_p (x))
6212 return true;
6214 imode = int_mode_for_mode (mode).require ();
6216 else if (GET_CODE (x) == CONST_INT
6217 && is_a <scalar_int_mode> (mode, &imode))
6218 ival = INTVAL (x);
6219 else
6220 return false;
6222 /* Use a 64-bit mode for everything except DI/DF mode, where we use
6223 a 128-bit vector mode. */
6224 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6226 vmode = aarch64_simd_container_mode (imode, width);
6227 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6229 return aarch64_simd_valid_immediate (v_op, NULL);
6233 /* Return the fixed registers used for condition codes. */
6235 static bool
6236 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6238 *p1 = CC_REGNUM;
6239 *p2 = INVALID_REGNUM;
6240 return true;
6243 /* This function is used by the call expanders of the machine description.
6244 RESULT is the register in which the result is returned. It's NULL for
6245 "call" and "sibcall".
6246 MEM is the location of the function call.
6247 SIBCALL indicates whether this function call is a normal call or a sibling call.
6248 A different pattern is generated accordingly. */
6250 void
6251 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6253 rtx call, callee, tmp;
6254 rtvec vec;
6255 machine_mode mode;
6257 gcc_assert (MEM_P (mem));
6258 callee = XEXP (mem, 0);
6259 mode = GET_MODE (callee);
6260 gcc_assert (mode == Pmode);
6262 /* Decide if we should generate indirect calls by loading the
6263 address of the callee into a register before performing
6264 the branch-and-link. */
6265 if (SYMBOL_REF_P (callee)
6266 ? (aarch64_is_long_call_p (callee)
6267 || aarch64_is_noplt_call_p (callee))
6268 : !REG_P (callee))
6269 XEXP (mem, 0) = force_reg (mode, callee);
6271 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6273 if (result != NULL_RTX)
6274 call = gen_rtx_SET (result, call);
6276 if (sibcall)
6277 tmp = ret_rtx;
6278 else
6279 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6281 vec = gen_rtvec (2, call, tmp);
6282 call = gen_rtx_PARALLEL (VOIDmode, vec);
6284 aarch64_emit_call_insn (call);
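/* The emitted pattern is a PARALLEL of the call and either a (return)
   (for sibcalls) or a clobber of LR.  For a normal call to a hypothetical
   function "foo" returning in x0 it is roughly:

     (parallel [(set (reg:DI x0)
                     (call (mem (symbol_ref "foo")) (const_int 0)))
                (clobber (reg:DI LR_REGNUM))])

   Long calls, and -fno-plt calls to external symbols, force the callee
   address into a register first, so a BLR is emitted instead of a BL.  */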
6287 /* Emit call insn with PAT and do aarch64-specific handling. */
6289 void
6290 aarch64_emit_call_insn (rtx pat)
6292 rtx insn = emit_call_insn (pat);
6294 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6295 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6296 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6299 machine_mode
6300 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6302 /* All floating point compares return CCFP if it is an equality
6303 comparison, and CCFPE otherwise. */
6304 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6306 switch (code)
6308 case EQ:
6309 case NE:
6310 case UNORDERED:
6311 case ORDERED:
6312 case UNLT:
6313 case UNLE:
6314 case UNGT:
6315 case UNGE:
6316 case UNEQ:
6317 return CCFPmode;
6319 case LT:
6320 case LE:
6321 case GT:
6322 case GE:
6323 case LTGT:
6324 return CCFPEmode;
6326 default:
6327 gcc_unreachable ();
6331 /* Equality comparisons of short modes against zero can be performed
6332 using the TST instruction with the appropriate bitmask. */
6333 if (y == const0_rtx && REG_P (x)
6334 && (code == EQ || code == NE)
6335 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6336 return CC_NZmode;
6338 /* Similarly, comparisons of zero_extends from shorter modes can
6339 be performed using an ANDS with an immediate mask. */
6340 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6341 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6342 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6343 && (code == EQ || code == NE))
6344 return CC_NZmode;
6346 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6347 && y == const0_rtx
6348 && (code == EQ || code == NE || code == LT || code == GE)
6349 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6350 || GET_CODE (x) == NEG
6351 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6352 && CONST_INT_P (XEXP (x, 2)))))
6353 return CC_NZmode;
6355 /* A compare with a shifted operand. Because of canonicalization,
6356 the comparison will have to be swapped when we emit the assembly
6357 code. */
6358 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6359 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6360 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6361 || GET_CODE (x) == LSHIFTRT
6362 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6363 return CC_SWPmode;
6365 /* Similarly for a negated operand, but we can only do this for
6366 equalities. */
6367 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6368 && (REG_P (y) || GET_CODE (y) == SUBREG)
6369 && (code == EQ || code == NE)
6370 && GET_CODE (x) == NEG)
6371 return CC_Zmode;
6373 /* A test for unsigned overflow. */
6374 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6375 && code == NE
6376 && GET_CODE (x) == PLUS
6377 && GET_CODE (y) == ZERO_EXTEND)
6378 return CC_Cmode;
6380 /* For everything else, return CCmode. */
6381 return CCmode;
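/* Two examples of the special CC modes chosen above: comparing
   (ashift:SI (reg) (const_int 2)) against a register yields CC_SWPmode,
   because the shifted operand must become the second operand of the
   eventual CMP and the condition has to be swapped when it is printed;
   and an equality test of a QImode or HImode register against zero
   yields CC_NZmode, allowing the comparison to be implemented as a TST
   with an immediate mask.  */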
6384 static int
6385 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6387 int
6388 aarch64_get_condition_code (rtx x)
6390 machine_mode mode = GET_MODE (XEXP (x, 0));
6391 enum rtx_code comp_code = GET_CODE (x);
6393 if (GET_MODE_CLASS (mode) != MODE_CC)
6394 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6395 return aarch64_get_condition_code_1 (mode, comp_code);
6398 static int
6399 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6401 switch (mode)
6403 case E_CCFPmode:
6404 case E_CCFPEmode:
6405 switch (comp_code)
6407 case GE: return AARCH64_GE;
6408 case GT: return AARCH64_GT;
6409 case LE: return AARCH64_LS;
6410 case LT: return AARCH64_MI;
6411 case NE: return AARCH64_NE;
6412 case EQ: return AARCH64_EQ;
6413 case ORDERED: return AARCH64_VC;
6414 case UNORDERED: return AARCH64_VS;
6415 case UNLT: return AARCH64_LT;
6416 case UNLE: return AARCH64_LE;
6417 case UNGT: return AARCH64_HI;
6418 case UNGE: return AARCH64_PL;
6419 default: return -1;
6421 break;
6423 case E_CCmode:
6424 switch (comp_code)
6426 case NE: return AARCH64_NE;
6427 case EQ: return AARCH64_EQ;
6428 case GE: return AARCH64_GE;
6429 case GT: return AARCH64_GT;
6430 case LE: return AARCH64_LE;
6431 case LT: return AARCH64_LT;
6432 case GEU: return AARCH64_CS;
6433 case GTU: return AARCH64_HI;
6434 case LEU: return AARCH64_LS;
6435 case LTU: return AARCH64_CC;
6436 default: return -1;
6438 break;
6440 case E_CC_SWPmode:
6441 switch (comp_code)
6443 case NE: return AARCH64_NE;
6444 case EQ: return AARCH64_EQ;
6445 case GE: return AARCH64_LE;
6446 case GT: return AARCH64_LT;
6447 case LE: return AARCH64_GE;
6448 case LT: return AARCH64_GT;
6449 case GEU: return AARCH64_LS;
6450 case GTU: return AARCH64_CC;
6451 case LEU: return AARCH64_CS;
6452 case LTU: return AARCH64_HI;
6453 default: return -1;
6455 break;
6457 case E_CC_NZmode:
6458 switch (comp_code)
6460 case NE: return AARCH64_NE;
6461 case EQ: return AARCH64_EQ;
6462 case GE: return AARCH64_PL;
6463 case LT: return AARCH64_MI;
6464 default: return -1;
6466 break;
6468 case E_CC_Zmode:
6469 switch (comp_code)
6471 case NE: return AARCH64_NE;
6472 case EQ: return AARCH64_EQ;
6473 default: return -1;
6475 break;
6477 case E_CC_Cmode:
6478 switch (comp_code)
6480 case NE: return AARCH64_CS;
6481 case EQ: return AARCH64_CC;
6482 default: return -1;
6484 break;
6486 default:
6487 return -1;
6490 return -1;
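/* For example, (gt (reg:CC_SWP CC_REGNUM) (const_int 0)) maps to
   AARCH64_LT: because the operands of the original comparison were
   swapped when CC_SWPmode was selected, the condition printed on the
   branch must be the swapped one, not the inverted one.  */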
6493 bool
6494 aarch64_const_vec_all_same_in_range_p (rtx x,
6495 HOST_WIDE_INT minval,
6496 HOST_WIDE_INT maxval)
6498 rtx elt;
6499 return (const_vec_duplicate_p (x, &elt)
6500 && CONST_INT_P (elt)
6501 && IN_RANGE (INTVAL (elt), minval, maxval));
6504 bool
6505 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6507 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6510 /* Return true if VEC is a constant in which every element is in the range
6511 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6513 static bool
6514 aarch64_const_vec_all_in_range_p (rtx vec,
6515 HOST_WIDE_INT minval,
6516 HOST_WIDE_INT maxval)
6518 if (GET_CODE (vec) != CONST_VECTOR
6519 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6520 return false;
6522 int nunits;
6523 if (!CONST_VECTOR_STEPPED_P (vec))
6524 nunits = const_vector_encoded_nelts (vec);
6525 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6526 return false;
6528 for (int i = 0; i < nunits; i++)
6530 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6531 if (!CONST_INT_P (vec_elem)
6532 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6533 return false;
6535 return true;
6538 /* N Z C V. */
6539 #define AARCH64_CC_V 1
6540 #define AARCH64_CC_C (1 << 1)
6541 #define AARCH64_CC_Z (1 << 2)
6542 #define AARCH64_CC_N (1 << 3)
6544 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6545 static const int aarch64_nzcv_codes[] =
6547 0, /* EQ, Z == 1. */
6548 AARCH64_CC_Z, /* NE, Z == 0. */
6549 0, /* CS, C == 1. */
6550 AARCH64_CC_C, /* CC, C == 0. */
6551 0, /* MI, N == 1. */
6552 AARCH64_CC_N, /* PL, N == 0. */
6553 0, /* VS, V == 1. */
6554 AARCH64_CC_V, /* VC, V == 0. */
6555 0, /* HI, C == 1 && Z == 0. */
6556 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6557 AARCH64_CC_V, /* GE, N == V. */
6558 0, /* LT, N != V. */
6559 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6560 0, /* LE, !(Z == 0 && N == V). */
6561 0, /* AL, Any. */
6562 0 /* NV, Any. */
6565 /* Print floating-point vector immediate operand X to F, negating it
6566 first if NEGATE is true. Return true on success, false if it isn't
6567 a constant we can handle. */
6569 static bool
6570 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6572 rtx elt;
6574 if (!const_vec_duplicate_p (x, &elt))
6575 return false;
6577 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6578 if (negate)
6579 r = real_value_negate (&r);
6581 /* We only handle the SVE single-bit immediates here. */
6582 if (real_equal (&r, &dconst0))
6583 asm_fprintf (f, "0.0");
6584 else if (real_equal (&r, &dconst1))
6585 asm_fprintf (f, "1.0");
6586 else if (real_equal (&r, &dconsthalf))
6587 asm_fprintf (f, "0.5");
6588 else
6589 return false;
6591 return true;
6594 /* Return the equivalent letter for size. */
6595 static char
6596 sizetochar (int size)
6598 switch (size)
6600 case 64: return 'd';
6601 case 32: return 's';
6602 case 16: return 'h';
6603 case 8 : return 'b';
6604 default: gcc_unreachable ();
6608 /* Print operand X to file F in a target specific manner according to CODE.
6609 The acceptable formatting commands given by CODE are:
6610 'c': An integer or symbol address without a preceding #
6611 sign.
6612 'C': Take the duplicated element in a vector constant
6613 and print it in hex.
6614 'D': Take the duplicated element in a vector constant
6615 and print it as an unsigned integer, in decimal.
6616 'e': Print the sign/zero-extend size as a character 8->b,
6617 16->h, 32->w.
6618 'p': Prints N such that 2^N == X (X must be a power of 2 and
6619 a const_int).
6620 'P': Print the number of non-zero bits in X (a const_int).
6621 'H': Print the higher numbered register of a pair (TImode)
6622 of regs.
6623 'm': Print a condition (eq, ne, etc).
6624 'M': Same as 'm', but invert condition.
6625 'N': Take the duplicated element in a vector constant
6626 and print the negative of it in decimal.
6627 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6628 'S/T/U/V': Print a FP/SIMD register name for a register list.
6629 The register printed is the FP/SIMD register name
6630 of X + 0/1/2/3 for S/T/U/V.
6631 'R': Print a scalar FP/SIMD register name + 1.
6632 'X': Print bottom 16 bits of integer constant in hex.
6633 'w/x': Print a general register name or the zero register
6634 (32-bit or 64-bit).
6635 '0': Print a normal operand, if it's a general register,
6636 then we assume DImode.
6637 'k': Print NZCV for conditional compare instructions.
6638 'A': Output address constant representing the first
6639 argument of X, specifying a relocation offset
6640 if appropriate.
6641 'L': Output constant address specified by X
6642 with a relocation offset if appropriate.
6643 'G': Prints address of X, specifying a PC relative
6644 relocation mode if appropriate.
6645 'y': Output address of LDP or STP - this is used for
6646 some LDP/STPs which don't use a PARALLEL in their
6647 pattern (so the mode needs to be adjusted).
6648 'z': Output address of a typical LDP or STP. */
6650 static void
6651 aarch64_print_operand (FILE *f, rtx x, int code)
6653 rtx elt;
6654 switch (code)
6656 case 'c':
6657 switch (GET_CODE (x))
6659 case CONST_INT:
6660 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6661 break;
6663 case SYMBOL_REF:
6664 output_addr_const (f, x);
6665 break;
6667 case CONST:
6668 if (GET_CODE (XEXP (x, 0)) == PLUS
6669 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6671 output_addr_const (f, x);
6672 break;
6674 /* Fall through. */
6676 default:
6677 output_operand_lossage ("unsupported operand for code '%c'", code);
6679 break;
6681 case 'e':
6683 int n;
6685 if (!CONST_INT_P (x)
6686 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6688 output_operand_lossage ("invalid operand for '%%%c'", code);
6689 return;
6692 switch (n)
6694 case 3:
6695 fputc ('b', f);
6696 break;
6697 case 4:
6698 fputc ('h', f);
6699 break;
6700 case 5:
6701 fputc ('w', f);
6702 break;
6703 default:
6704 output_operand_lossage ("invalid operand for '%%%c'", code);
6705 return;
6708 break;
6710 case 'p':
6712 int n;
6714 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6716 output_operand_lossage ("invalid operand for '%%%c'", code);
6717 return;
6720 asm_fprintf (f, "%d", n);
6722 break;
6724 case 'P':
6725 if (!CONST_INT_P (x))
6727 output_operand_lossage ("invalid operand for '%%%c'", code);
6728 return;
6731 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6732 break;
6734 case 'H':
6735 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6737 output_operand_lossage ("invalid operand for '%%%c'", code);
6738 return;
6741 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6742 break;
6744 case 'M':
6745 case 'm':
6747 int cond_code;
6748 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6749 if (x == const_true_rtx)
6751 if (code == 'M')
6752 fputs ("nv", f);
6753 return;
6756 if (!COMPARISON_P (x))
6758 output_operand_lossage ("invalid operand for '%%%c'", code);
6759 return;
6762 cond_code = aarch64_get_condition_code (x);
6763 gcc_assert (cond_code >= 0);
6764 if (code == 'M')
6765 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6766 fputs (aarch64_condition_codes[cond_code], f);
6768 break;
6770 case 'N':
6771 if (!const_vec_duplicate_p (x, &elt))
6773 output_operand_lossage ("invalid vector constant");
6774 return;
6777 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6778 asm_fprintf (f, "%wd", -INTVAL (elt));
6779 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6780 && aarch64_print_vector_float_operand (f, x, true))
6782 else
6784 output_operand_lossage ("invalid vector constant");
6785 return;
6787 break;
6789 case 'b':
6790 case 'h':
6791 case 's':
6792 case 'd':
6793 case 'q':
6794 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6796 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6797 return;
6799 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6800 break;
6802 case 'S':
6803 case 'T':
6804 case 'U':
6805 case 'V':
6806 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6808 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6809 return;
6811 asm_fprintf (f, "%c%d",
6812 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6813 REGNO (x) - V0_REGNUM + (code - 'S'));
6814 break;
6816 case 'R':
6817 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6819 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6820 return;
6822 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6823 break;
6825 case 'X':
6826 if (!CONST_INT_P (x))
6828 output_operand_lossage ("invalid operand for '%%%c'", code);
6829 return;
6831 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6832 break;
6834 case 'C':
6836 /* Print a replicated constant in hex. */
6837 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6839 output_operand_lossage ("invalid operand for '%%%c'", code);
6840 return;
6842 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6843 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6845 break;
6847 case 'D':
6849 /* Print a replicated constant in decimal, treating it as
6850 unsigned. */
6851 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6853 output_operand_lossage ("invalid operand for '%%%c'", code);
6854 return;
6856 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6857 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6859 break;
6861 case 'w':
6862 case 'x':
6863 if (x == const0_rtx
6864 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6866 asm_fprintf (f, "%czr", code);
6867 break;
6870 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6872 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6873 break;
6876 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6878 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6879 break;
6882 /* Fall through */
6884 case 0:
6885 if (x == NULL)
6887 output_operand_lossage ("missing operand");
6888 return;
6891 switch (GET_CODE (x))
6893 case REG:
6894 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6896 if (REG_NREGS (x) == 1)
6897 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6898 else
6900 char suffix
6901 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6902 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6903 REGNO (x) - V0_REGNUM, suffix,
6904 END_REGNO (x) - V0_REGNUM - 1, suffix);
6907 else
6908 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6909 break;
6911 case MEM:
6912 output_address (GET_MODE (x), XEXP (x, 0));
6913 break;
6915 case LABEL_REF:
6916 case SYMBOL_REF:
6917 output_addr_const (asm_out_file, x);
6918 break;
6920 case CONST_INT:
6921 asm_fprintf (f, "%wd", INTVAL (x));
6922 break;
6924 case CONST:
6925 if (!VECTOR_MODE_P (GET_MODE (x)))
6927 output_addr_const (asm_out_file, x);
6928 break;
6930 /* fall through */
6932 case CONST_VECTOR:
6933 if (!const_vec_duplicate_p (x, &elt))
6935 output_operand_lossage ("invalid vector constant");
6936 return;
6939 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6940 asm_fprintf (f, "%wd", INTVAL (elt));
6941 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6942 && aarch64_print_vector_float_operand (f, x, false))
6944 else
6946 output_operand_lossage ("invalid vector constant");
6947 return;
6949 break;
6951 case CONST_DOUBLE:
6952 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6953 be getting CONST_DOUBLEs holding integers. */
6954 gcc_assert (GET_MODE (x) != VOIDmode);
6955 if (aarch64_float_const_zero_rtx_p (x))
6957 fputc ('0', f);
6958 break;
6960 else if (aarch64_float_const_representable_p (x))
6962 #define buf_size 20
6963 char float_buf[buf_size] = {'\0'};
6964 real_to_decimal_for_mode (float_buf,
6965 CONST_DOUBLE_REAL_VALUE (x),
6966 buf_size, buf_size,
6967 1, GET_MODE (x));
6968 asm_fprintf (asm_out_file, "%s", float_buf);
6969 break;
6970 #undef buf_size
6972 output_operand_lossage ("invalid constant");
6973 return;
6974 default:
6975 output_operand_lossage ("invalid operand");
6976 return;
6978 break;
6980 case 'A':
6981 if (GET_CODE (x) == HIGH)
6982 x = XEXP (x, 0);
6984 switch (aarch64_classify_symbolic_expression (x))
6986 case SYMBOL_SMALL_GOT_4G:
6987 asm_fprintf (asm_out_file, ":got:");
6988 break;
6990 case SYMBOL_SMALL_TLSGD:
6991 asm_fprintf (asm_out_file, ":tlsgd:");
6992 break;
6994 case SYMBOL_SMALL_TLSDESC:
6995 asm_fprintf (asm_out_file, ":tlsdesc:");
6996 break;
6998 case SYMBOL_SMALL_TLSIE:
6999 asm_fprintf (asm_out_file, ":gottprel:");
7000 break;
7002 case SYMBOL_TLSLE24:
7003 asm_fprintf (asm_out_file, ":tprel:");
7004 break;
7006 case SYMBOL_TINY_GOT:
7007 gcc_unreachable ();
7008 break;
7010 default:
7011 break;
7013 output_addr_const (asm_out_file, x);
7014 break;
7016 case 'L':
7017 switch (aarch64_classify_symbolic_expression (x))
7019 case SYMBOL_SMALL_GOT_4G:
7020 asm_fprintf (asm_out_file, ":lo12:");
7021 break;
7023 case SYMBOL_SMALL_TLSGD:
7024 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7025 break;
7027 case SYMBOL_SMALL_TLSDESC:
7028 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7029 break;
7031 case SYMBOL_SMALL_TLSIE:
7032 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7033 break;
7035 case SYMBOL_TLSLE12:
7036 asm_fprintf (asm_out_file, ":tprel_lo12:");
7037 break;
7039 case SYMBOL_TLSLE24:
7040 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7041 break;
7043 case SYMBOL_TINY_GOT:
7044 asm_fprintf (asm_out_file, ":got:");
7045 break;
7047 case SYMBOL_TINY_TLSIE:
7048 asm_fprintf (asm_out_file, ":gottprel:");
7049 break;
7051 default:
7052 break;
7054 output_addr_const (asm_out_file, x);
7055 break;
7057 case 'G':
7058 switch (aarch64_classify_symbolic_expression (x))
7060 case SYMBOL_TLSLE24:
7061 asm_fprintf (asm_out_file, ":tprel_hi12:");
7062 break;
7063 default:
7064 break;
7066 output_addr_const (asm_out_file, x);
7067 break;
7069 case 'k':
7071 HOST_WIDE_INT cond_code;
7073 if (!CONST_INT_P (x))
7075 output_operand_lossage ("invalid operand for '%%%c'", code);
7076 return;
7079 cond_code = INTVAL (x);
7080 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7081 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7083 break;
7085 case 'y':
7086 case 'z':
7088 machine_mode mode = GET_MODE (x);
7090 if (GET_CODE (x) != MEM
7091 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7093 output_operand_lossage ("invalid operand for '%%%c'", code);
7094 return;
7097 if (code == 'y')
7098 /* LDP/STP which uses a single double-width memory operand.
7099 Adjust the mode to appear like a typical LDP/STP.
7100 Currently this is supported for 16-byte accesses only. */
7101 mode = DFmode;
7103 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7104 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7106 break;
7108 default:
7109 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7110 return;
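/* A few examples of the operand codes above in action (the register
   numbers and constants are only illustrative): %w0 on (reg:SI 0) prints
   "w0" and on const0_rtx prints "wzr"; %x0 on the stack pointer prints
   "sp" (while %w0 would print "wsp"); %C on a vector duplicating
   (const_int 17) prints "0x11"; and %d0 on an FP/SIMD register prints
   the "d" view of that register, e.g. "d5".  */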
7114 /* Print address 'x' of a memory access with mode 'mode'.
7115 TYPE is the aarch64_addr_query_type context required by aarch64_classify_address:
7116 e.g. ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for an LDP/STP. */
7117 static bool
7118 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7119 aarch64_addr_query_type type)
7121 struct aarch64_address_info addr;
7122 unsigned int size;
7124 /* Check all addresses are Pmode - including ILP32. */
7125 if (GET_MODE (x) != Pmode)
7126 output_operand_lossage ("invalid address mode");
7128 if (aarch64_classify_address (&addr, x, mode, true, type))
7129 switch (addr.type)
7131 case ADDRESS_REG_IMM:
7132 if (known_eq (addr.const_offset, 0))
7133 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7134 else if (aarch64_sve_data_mode_p (mode))
7136 HOST_WIDE_INT vnum
7137 = exact_div (addr.const_offset,
7138 BYTES_PER_SVE_VECTOR).to_constant ();
7139 asm_fprintf (f, "[%s, #%wd, mul vl]",
7140 reg_names[REGNO (addr.base)], vnum);
7142 else if (aarch64_sve_pred_mode_p (mode))
7144 HOST_WIDE_INT vnum
7145 = exact_div (addr.const_offset,
7146 BYTES_PER_SVE_PRED).to_constant ();
7147 asm_fprintf (f, "[%s, #%wd, mul vl]",
7148 reg_names[REGNO (addr.base)], vnum);
7150 else
7151 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7152 INTVAL (addr.offset));
7153 return true;
7155 case ADDRESS_REG_REG:
7156 if (addr.shift == 0)
7157 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7158 reg_names [REGNO (addr.offset)]);
7159 else
7160 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7161 reg_names [REGNO (addr.offset)], addr.shift);
7162 return true;
7164 case ADDRESS_REG_UXTW:
7165 if (addr.shift == 0)
7166 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7167 REGNO (addr.offset) - R0_REGNUM);
7168 else
7169 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7170 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7171 return true;
7173 case ADDRESS_REG_SXTW:
7174 if (addr.shift == 0)
7175 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7176 REGNO (addr.offset) - R0_REGNUM);
7177 else
7178 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7179 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7180 return true;
7182 case ADDRESS_REG_WB:
7183 /* Writeback is only supported for fixed-width modes. */
7184 size = GET_MODE_SIZE (mode).to_constant ();
7185 switch (GET_CODE (x))
7187 case PRE_INC:
7188 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7189 return true;
7190 case POST_INC:
7191 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7192 return true;
7193 case PRE_DEC:
7194 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7195 return true;
7196 case POST_DEC:
7197 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7198 return true;
7199 case PRE_MODIFY:
7200 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7201 INTVAL (addr.offset));
7202 return true;
7203 case POST_MODIFY:
7204 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7205 INTVAL (addr.offset));
7206 return true;
7207 default:
7208 break;
7210 break;
7212 case ADDRESS_LO_SUM:
7213 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7214 output_addr_const (f, addr.offset);
7215 asm_fprintf (f, "]");
7216 return true;
7218 case ADDRESS_SYMBOLIC:
7219 output_addr_const (f, x);
7220 return true;
7223 return false;
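/* Typical strings produced for the classifications above (register
   numbers, offsets and the symbol name are only examples):

     ADDRESS_REG_IMM            [x0]            [x0, 16]
     ADDRESS_REG_IMM (SVE)      [x0, #2, mul vl]
     ADDRESS_REG_REG            [x0, x1, lsl 3]
     ADDRESS_REG_SXTW           [x0, w1, sxtw 2]
     ADDRESS_REG_WB (PRE_INC)   [x0, 8]!
     ADDRESS_LO_SUM             [x0, #:lo12:sym]

   The exact offsets and shifts that are accepted depend on the access
   mode, as checked by aarch64_classify_address.  */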
7226 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7227 static bool
7228 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7230 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7233 /* Print address 'x' of a memory access with mode 'mode'. */
7234 static void
7235 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7237 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7238 output_addr_const (f, x);
7241 bool
7242 aarch64_label_mentioned_p (rtx x)
7244 const char *fmt;
7245 int i;
7247 if (GET_CODE (x) == LABEL_REF)
7248 return true;
7250 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7251 referencing instruction, but they are constant offsets, not
7252 symbols. */
7253 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7254 return false;
7256 fmt = GET_RTX_FORMAT (GET_CODE (x));
7257 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7259 if (fmt[i] == 'E')
7261 int j;
7263 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7264 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7265 return 1;
7267 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7268 return 1;
7271 return 0;
7274 /* Implement REGNO_REG_CLASS. */
7276 enum reg_class
7277 aarch64_regno_regclass (unsigned regno)
7279 if (GP_REGNUM_P (regno))
7280 return GENERAL_REGS;
7282 if (regno == SP_REGNUM)
7283 return STACK_REG;
7285 if (regno == FRAME_POINTER_REGNUM
7286 || regno == ARG_POINTER_REGNUM)
7287 return POINTER_REGS;
7289 if (FP_REGNUM_P (regno))
7290 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7292 if (PR_REGNUM_P (regno))
7293 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7295 return NO_REGS;
7298 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7299 If OFFSET is out of range, return an offset of an anchor point
7300 that is in range. Return 0 otherwise. */
7302 static HOST_WIDE_INT
7303 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7304 machine_mode mode)
7306 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7307 if (size > 16)
7308 return (offset + 0x400) & ~0x7f0;
7310 /* For offsets that aren't a multiple of the access size, the limit is
7311 -256...255. */
7312 if (offset & (size - 1))
7314 /* BLKmode typically uses LDP of X-registers. */
7315 if (mode == BLKmode)
7316 return (offset + 512) & ~0x3ff;
7317 return (offset + 0x100) & ~0x1ff;
7320 /* Small negative offsets are supported. */
7321 if (IN_RANGE (offset, -256, 0))
7322 return 0;
7324 if (mode == TImode || mode == TFmode)
7325 return (offset + 0x100) & ~0x1ff;
7327 /* Use a 12-bit offset scaled by the access size. */
7328 return offset & (~0xfff * size);
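/* Worked example: a 4-byte access at offset 0x12340 is aligned, is not a
   small negative offset and is not TImode/TFmode, so the anchor returned
   is 0x12340 & ~0x3fff = 0x10000.  The residual offset 0x2340 (0x8d0
   units of 4 bytes) still fits the 12-bit scaled range, while the
   0x10000 anchor can be shared with neighbouring accesses.  */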
7331 static rtx
7332 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7334 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7335 where mask is selected by alignment and size of the offset.
7336 We try to pick as large a range for the offset as possible to
7337 maximize the chance of a CSE. However, for aligned addresses
7338 we limit the range to 4k so that structures with different sized
7339 elements are likely to use the same base. We need to be careful
7340 not to split a CONST for some forms of address expression, otherwise
7341 it will generate sub-optimal code. */
7343 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7345 rtx base = XEXP (x, 0);
7346 rtx offset_rtx = XEXP (x, 1);
7347 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7349 if (GET_CODE (base) == PLUS)
7351 rtx op0 = XEXP (base, 0);
7352 rtx op1 = XEXP (base, 1);
7354 /* Force any scaling into a temp for CSE. */
7355 op0 = force_reg (Pmode, op0);
7356 op1 = force_reg (Pmode, op1);
7358 /* Let the pointer register be in op0. */
7359 if (REG_POINTER (op1))
7360 std::swap (op0, op1);
7362 /* If the pointer is virtual or frame related, then we know that
7363 virtual register instantiation or register elimination is going
7364 to apply a second constant. We want the two constants folded
7365 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7366 if (virt_or_elim_regno_p (REGNO (op0)))
7368 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7369 NULL_RTX, true, OPTAB_DIRECT);
7370 return gen_rtx_PLUS (Pmode, base, op1);
7373 /* Otherwise, in order to encourage CSE (and thence loop strength
7374 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7375 base = expand_binop (Pmode, add_optab, op0, op1,
7376 NULL_RTX, true, OPTAB_DIRECT);
7377 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7380 HOST_WIDE_INT size;
7381 if (GET_MODE_SIZE (mode).is_constant (&size))
7383 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7384 mode);
7385 if (base_offset != 0)
7387 base = plus_constant (Pmode, base, base_offset);
7388 base = force_operand (base, NULL_RTX);
7389 return plus_constant (Pmode, base, offset - base_offset);
7394 return x;
7397 /* Return the reload icode required for a constant pool in mode. */
7398 static enum insn_code
7399 aarch64_constant_pool_reload_icode (machine_mode mode)
7401 switch (mode)
7403 case E_SFmode:
7404 return CODE_FOR_aarch64_reload_movcpsfdi;
7406 case E_DFmode:
7407 return CODE_FOR_aarch64_reload_movcpdfdi;
7409 case E_TFmode:
7410 return CODE_FOR_aarch64_reload_movcptfdi;
7412 case E_V8QImode:
7413 return CODE_FOR_aarch64_reload_movcpv8qidi;
7415 case E_V16QImode:
7416 return CODE_FOR_aarch64_reload_movcpv16qidi;
7418 case E_V4HImode:
7419 return CODE_FOR_aarch64_reload_movcpv4hidi;
7421 case E_V8HImode:
7422 return CODE_FOR_aarch64_reload_movcpv8hidi;
7424 case E_V2SImode:
7425 return CODE_FOR_aarch64_reload_movcpv2sidi;
7427 case E_V4SImode:
7428 return CODE_FOR_aarch64_reload_movcpv4sidi;
7430 case E_V2DImode:
7431 return CODE_FOR_aarch64_reload_movcpv2didi;
7433 case E_V2DFmode:
7434 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7436 default:
7437 gcc_unreachable ();
7440 gcc_unreachable ();
7442 static reg_class_t
7443 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7444 reg_class_t rclass,
7445 machine_mode mode,
7446 secondary_reload_info *sri)
7448 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7449 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7450 comment at the head of aarch64-sve.md for more details about the
7451 big-endian handling. */
7452 if (BYTES_BIG_ENDIAN
7453 && reg_class_subset_p (rclass, FP_REGS)
7454 && !((REG_P (x) && HARD_REGISTER_P (x))
7455 || aarch64_simd_valid_immediate (x, NULL))
7456 && aarch64_sve_data_mode_p (mode))
7458 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7459 return NO_REGS;
7462 /* If we have to disable direct literal pool loads and stores because the
7463 function is too big, then we need a scratch register. */
7464 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7465 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7466 || targetm.vector_mode_supported_p (GET_MODE (x)))
7467 && !aarch64_pcrelative_literal_loads)
7469 sri->icode = aarch64_constant_pool_reload_icode (mode);
7470 return NO_REGS;
7473 /* Without the TARGET_SIMD instructions we cannot move a Q register
7474 to a Q register directly. We need a scratch. */
7475 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7476 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7477 && reg_class_subset_p (rclass, FP_REGS))
7479 if (mode == TFmode)
7480 sri->icode = CODE_FOR_aarch64_reload_movtf;
7481 else if (mode == TImode)
7482 sri->icode = CODE_FOR_aarch64_reload_movti;
7483 return NO_REGS;
7486 /* A TFmode or TImode memory access should be handled via an FP register
7487 because AArch64 has richer addressing modes for LDR/STR instructions
7488 than LDP/STP instructions. */
7489 if (TARGET_FLOAT && rclass == GENERAL_REGS
7490 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7491 return FP_REGS;
7493 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7494 return GENERAL_REGS;
7496 return NO_REGS;
7499 static bool
7500 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7502 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7504 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7505 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7506 if (frame_pointer_needed)
7507 return to == HARD_FRAME_POINTER_REGNUM;
7508 return true;
7511 poly_int64
7512 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7514 aarch64_layout_frame ();
7516 if (to == HARD_FRAME_POINTER_REGNUM)
7518 if (from == ARG_POINTER_REGNUM)
7519 return cfun->machine->frame.hard_fp_offset;
7521 if (from == FRAME_POINTER_REGNUM)
7522 return cfun->machine->frame.hard_fp_offset
7523 - cfun->machine->frame.locals_offset;
7526 if (to == STACK_POINTER_REGNUM)
7528 if (from == FRAME_POINTER_REGNUM)
7529 return cfun->machine->frame.frame_size
7530 - cfun->machine->frame.locals_offset;
7533 return cfun->machine->frame.frame_size;
7536 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7537 previous frame. */
7539 rtx
7540 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7542 if (count != 0)
7543 return const0_rtx;
7544 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7548 static void
7549 aarch64_asm_trampoline_template (FILE *f)
7551 if (TARGET_ILP32)
7553 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7554 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7556 else
7558 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7559 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7561 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7562 assemble_aligned_integer (4, const0_rtx);
7563 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7564 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
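/* On LP64 the template above expands to roughly the following, assuming
   IP1 is x17 and the static chain register is x18 (the exact register
   names depend on the target headers):

	ldr	x17, .+16	// load the target function address
	ldr	x18, .+20	// load the static chain value
	br	x17
	.word	0		// padding so the data is 8-byte aligned
	.xword	0		// overwritten with the function address
	.xword	0		// overwritten with the static chain

   aarch64_trampoline_init below fills in the two trailing double-words
   and calls __clear_cache over the block.  */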
7567 static void
7568 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7570 rtx fnaddr, mem, a_tramp;
7571 const int tramp_code_sz = 16;
7573 /* We don't need to copy the trailing D-words; we fill those in below. */
7574 emit_block_move (m_tramp, assemble_trampoline_template (),
7575 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7576 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7577 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7578 if (GET_MODE (fnaddr) != ptr_mode)
7579 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7580 emit_move_insn (mem, fnaddr);
7582 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7583 emit_move_insn (mem, chain_value);
7585 /* XXX We should really define a "clear_cache" pattern and use
7586 gen_clear_cache(). */
7587 a_tramp = XEXP (m_tramp, 0);
7588 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7589 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7590 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7591 ptr_mode);
7594 static unsigned char
7595 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7597 /* ??? Logically we should only need to provide a value when
7598 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7599 can hold MODE, but at the moment we need to handle all modes.
7600 Just ignore any runtime parts for registers that can't store them. */
7601 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7602 unsigned int nregs;
7603 switch (regclass)
7605 case TAILCALL_ADDR_REGS:
7606 case POINTER_REGS:
7607 case GENERAL_REGS:
7608 case ALL_REGS:
7609 case POINTER_AND_FP_REGS:
7610 case FP_REGS:
7611 case FP_LO_REGS:
7612 if (aarch64_sve_data_mode_p (mode)
7613 && constant_multiple_p (GET_MODE_SIZE (mode),
7614 BYTES_PER_SVE_VECTOR, &nregs))
7615 return nregs;
7616 return (aarch64_vector_data_mode_p (mode)
7617 ? CEIL (lowest_size, UNITS_PER_VREG)
7618 : CEIL (lowest_size, UNITS_PER_WORD));
7619 case STACK_REG:
7620 case PR_REGS:
7621 case PR_LO_REGS:
7622 case PR_HI_REGS:
7623 return 1;
7625 case NO_REGS:
7626 return 0;
7628 default:
7629 break;
7631 gcc_unreachable ();
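/* Worked example (added for illustration): TImode occupies 16 bytes, so
   for GENERAL_REGS the code above yields CEIL (16, UNITS_PER_WORD) == 2
   X-registers, whereas V4SImode in FP_REGS yields
   CEIL (16, UNITS_PER_VREG) == 1 Q-register.  An SVE data mode such as
   VNx4SImode instead takes the constant_multiple_p path and needs one
   Z-register per BYTES_PER_SVE_VECTOR of its size.  */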
7634 static reg_class_t
7635 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7637 if (regclass == POINTER_REGS)
7638 return GENERAL_REGS;
7640 if (regclass == STACK_REG)
7642 if (REG_P(x)
7643 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7644 return regclass;
7646 return NO_REGS;
7649 /* Register elimination can result in a request for
7650 SP+constant->FP_REGS. We cannot support such operations, which
7651 use SP as source and an FP_REG as destination, so reject them
7652 right away. */
7653 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7655 rtx lhs = XEXP (x, 0);
7657 /* Look through a possible SUBREG introduced by ILP32. */
7658 if (GET_CODE (lhs) == SUBREG)
7659 lhs = SUBREG_REG (lhs);
7661 gcc_assert (REG_P (lhs));
7662 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7663 POINTER_REGS));
7664 return NO_REGS;
7667 return regclass;
7670 void
7671 aarch64_asm_output_labelref (FILE* f, const char *name)
7673 asm_fprintf (f, "%U%s", name);
7676 static void
7677 aarch64_elf_asm_constructor (rtx symbol, int priority)
7679 if (priority == DEFAULT_INIT_PRIORITY)
7680 default_ctor_section_asm_out_constructor (symbol, priority);
7681 else
7683 section *s;
7684 /* Although priority is known to be in the range [0, 65535], so
7685 18 bytes would be enough, the compiler might not know that. To
7686 avoid a -Wformat-truncation false positive, use a larger size. */
7687 char buf[23];
7688 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7689 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7690 switch_to_section (s);
7691 assemble_align (POINTER_SIZE);
7692 assemble_aligned_integer (POINTER_BYTES, symbol);
7696 static void
7697 aarch64_elf_asm_destructor (rtx symbol, int priority)
7699 if (priority == DEFAULT_INIT_PRIORITY)
7700 default_dtor_section_asm_out_destructor (symbol, priority);
7701 else
7703 section *s;
7704 /* Although priority is known to be in the range [0, 65535], so
7705 18 bytes would be enough, the compiler might not know that. To
7706 avoid a -Wformat-truncation false positive, use a larger size. */
7707 char buf[23];
7708 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7709 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7710 switch_to_section (s);
7711 assemble_align (POINTER_SIZE);
7712 assemble_aligned_integer (POINTER_BYTES, symbol);
7716 const char*
7717 aarch64_output_casesi (rtx *operands)
7719 char buf[100];
7720 char label[100];
7721 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7722 int index;
7723 static const char *const patterns[4][2] =
7726 "ldrb\t%w3, [%0,%w1,uxtw]",
7727 "add\t%3, %4, %w3, sxtb #2"
7730 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7731 "add\t%3, %4, %w3, sxth #2"
7734 "ldr\t%w3, [%0,%w1,uxtw #2]",
7735 "add\t%3, %4, %w3, sxtw #2"
7737 /* We assume that DImode is only generated when not optimizing and
7738 that we don't really need 64-bit address offsets. That would
7739 imply an object file with 8GB of code in a single function! */
7741 "ldr\t%w3, [%0,%w1,uxtw #2]",
7742 "add\t%3, %4, %w3, sxtw #2"
7746 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7748 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7749 index = exact_log2 (GET_MODE_SIZE (mode));
7751 gcc_assert (index >= 0 && index <= 3);
7753 /* Need to implement table size reduction, by changing the code below. */
7754 output_asm_insn (patterns[index][0], operands);
7755 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7756 snprintf (buf, sizeof (buf),
7757 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7758 output_asm_insn (buf, operands);
7759 output_asm_insn (patterns[index][1], operands);
7760 output_asm_insn ("br\t%3", operands);
7761 assemble_label (asm_out_file, label);
7762 return "";
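/* Illustrative example (not part of the original code): for a jump table
   with 4-byte entries (index 2 above) the emitted dispatch sequence has
   the shape

	ldr	w3, [x0, w1, uxtw #2]	// load the table entry
	adr	x4, .LrtxN		// table base label
	add	x3, x4, w3, sxtw #2	// convert the entry to an address
	br	x3
   .LrtxN:

   where the register numbers stand for the %0..%4 operand placeholders
   used in the pattern strings above and .LrtxN is the internal label
   generated via ASM_GENERATE_INTERNAL_LABEL.  */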
7766 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7767 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7768 operator. */
7770 int
7771 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7773 if (shift >= 0 && shift <= 3)
7775 int size;
7776 for (size = 8; size <= 32; size *= 2)
7778 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7779 if (mask == bits << shift)
7780 return size;
7783 return 0;
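/* Worked example (added for illustration): aarch64_uxt_size (1, 0x1fe)
   returns 8, because 0x1fe == 0xff << 1, i.e. a byte-sized mask shifted
   left by one, which suits a UXTB-style operand.  aarch64_uxt_size
   (0, 0xffff) returns 16 (UXTH), while a mask like 0x7f matches none of
   the 8/16/32-bit patterns and gives 0.  */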
7786 /* Constant pools are per-function only when PC-relative
7787 literal loads are enabled or we are using the large memory
7788 model. */
7790 static inline bool
7791 aarch64_can_use_per_function_literal_pools_p (void)
7793 return (aarch64_pcrelative_literal_loads
7794 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7797 static bool
7798 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7800 /* We can't use blocks for constants when we're using a per-function
7801 constant pool. */
7802 return !aarch64_can_use_per_function_literal_pools_p ();
7805 /* Select appropriate section for constants depending
7806 on where we place literal pools. */
7808 static section *
7809 aarch64_select_rtx_section (machine_mode mode,
7810 rtx x,
7811 unsigned HOST_WIDE_INT align)
7813 if (aarch64_can_use_per_function_literal_pools_p ())
7814 return function_section (current_function_decl);
7816 return default_elf_select_rtx_section (mode, x, align);
7819 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7820 void
7821 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7822 HOST_WIDE_INT offset)
7824 /* When using per-function literal pools, we must ensure that any code
7825 section is aligned to the minimal instruction length, lest we get
7826 errors from the assembler re "unaligned instructions". */
7827 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7828 ASM_OUTPUT_ALIGN (f, 2);
7831 /* Costs. */
7833 /* Helper function for rtx cost calculation. Strip a shift expression
7834 from X. Returns the inner operand if successful, or the original
7835 expression on failure. */
7836 static rtx
7837 aarch64_strip_shift (rtx x)
7839 rtx op = x;
7841 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7842 we can convert both to ROR during final output. */
7843 if ((GET_CODE (op) == ASHIFT
7844 || GET_CODE (op) == ASHIFTRT
7845 || GET_CODE (op) == LSHIFTRT
7846 || GET_CODE (op) == ROTATERT
7847 || GET_CODE (op) == ROTATE)
7848 && CONST_INT_P (XEXP (op, 1)))
7849 return XEXP (op, 0);
7851 if (GET_CODE (op) == MULT
7852 && CONST_INT_P (XEXP (op, 1))
7853 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7854 return XEXP (op, 0);
7856 return x;
7859 /* Helper function for rtx cost calculation. Strip an extend
7860 expression from X. Returns the inner operand if successful, or the
7861 original expression on failure. We deal with a number of possible
7862 canonicalization variations here. If STRIP_SHIFT is true, then
7863 we can strip off a shift also. */
7864 static rtx
7865 aarch64_strip_extend (rtx x, bool strip_shift)
7867 scalar_int_mode mode;
7868 rtx op = x;
7870 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7871 return op;
7873 /* Zero and sign extraction of a widened value. */
7874 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7875 && XEXP (op, 2) == const0_rtx
7876 && GET_CODE (XEXP (op, 0)) == MULT
7877 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7878 XEXP (op, 1)))
7879 return XEXP (XEXP (op, 0), 0);
7881 /* It can also be represented (for zero-extend) as an AND with an
7882 immediate. */
7883 if (GET_CODE (op) == AND
7884 && GET_CODE (XEXP (op, 0)) == MULT
7885 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7886 && CONST_INT_P (XEXP (op, 1))
7887 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7888 INTVAL (XEXP (op, 1))) != 0)
7889 return XEXP (XEXP (op, 0), 0);
7891 /* Now handle extended register, as this may also have an optional
7892 left shift by 1..4. */
7893 if (strip_shift
7894 && GET_CODE (op) == ASHIFT
7895 && CONST_INT_P (XEXP (op, 1))
7896 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7897 op = XEXP (op, 0);
7899 if (GET_CODE (op) == ZERO_EXTEND
7900 || GET_CODE (op) == SIGN_EXTEND)
7901 op = XEXP (op, 0);
7903 if (op != x)
7904 return op;
7906 return x;
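/* For illustration (not from the original sources): with STRIP_SHIFT
   true, an operand such as
	(ashift:DI (zero_extend:DI (reg:SI x)) (const_int 2))
   is reduced to (reg:SI x) by the code above, modelling extended
   register forms like ADD Xd, Xn, Wm, UXTW #2 in which the extend and
   the small left shift come for free with the arithmetic instruction.  */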
7909 /* Return true iff CODE is a shift supported in combination
7910 with arithmetic instructions. */
7912 static bool
7913 aarch64_shift_p (enum rtx_code code)
7915 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7919 /* Return true iff X is a cheap shift without a sign extend. */
7921 static bool
7922 aarch64_cheap_mult_shift_p (rtx x)
7924 rtx op0, op1;
7926 op0 = XEXP (x, 0);
7927 op1 = XEXP (x, 1);
7929 if (!(aarch64_tune_params.extra_tuning_flags
7930 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7931 return false;
7933 if (GET_CODE (op0) == SIGN_EXTEND)
7934 return false;
7936 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7937 && UINTVAL (op1) <= 4)
7938 return true;
7940 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7941 return false;
7943 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7945 if (l2 > 0 && l2 <= 4)
7946 return true;
7948 return false;
7951 /* Helper function for rtx cost calculation. Calculate the cost of
7952 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7953 Return the calculated cost of the expression, recursing manually in to
7954 operands where needed. */
7956 static int
7957 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7959 rtx op0, op1;
7960 const struct cpu_cost_table *extra_cost
7961 = aarch64_tune_params.insn_extra_cost;
7962 int cost = 0;
7963 bool compound_p = (outer == PLUS || outer == MINUS);
7964 machine_mode mode = GET_MODE (x);
7966 gcc_checking_assert (code == MULT);
7968 op0 = XEXP (x, 0);
7969 op1 = XEXP (x, 1);
7971 if (VECTOR_MODE_P (mode))
7972 mode = GET_MODE_INNER (mode);
7974 /* Integer multiply/fma. */
7975 if (GET_MODE_CLASS (mode) == MODE_INT)
7977 /* The multiply will be canonicalized as a shift, cost it as such. */
7978 if (aarch64_shift_p (GET_CODE (x))
7979 || (CONST_INT_P (op1)
7980 && exact_log2 (INTVAL (op1)) > 0))
7982 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7983 || GET_CODE (op0) == SIGN_EXTEND;
7984 if (speed)
7986 if (compound_p)
7988 /* If the shift is considered cheap,
7989 then don't add any cost. */
7990 if (aarch64_cheap_mult_shift_p (x))
7992 else if (REG_P (op1))
7993 /* ARITH + shift-by-register. */
7994 cost += extra_cost->alu.arith_shift_reg;
7995 else if (is_extend)
7996 /* ARITH + extended register. We don't have a cost field
7997 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7998 cost += extra_cost->alu.extend_arith;
7999 else
8000 /* ARITH + shift-by-immediate. */
8001 cost += extra_cost->alu.arith_shift;
8003 else
8004 /* LSL (immediate). */
8005 cost += extra_cost->alu.shift;
8008 /* Strip extends as we will have costed them in the case above. */
8009 if (is_extend)
8010 op0 = aarch64_strip_extend (op0, true);
8012 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
8014 return cost;
8017 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
8018 compound and let the below cases handle it. After all, MNEG is a
8019 special-case alias of MSUB. */
8020 if (GET_CODE (op0) == NEG)
8022 op0 = XEXP (op0, 0);
8023 compound_p = true;
8026 /* Integer multiplies or FMAs have zero/sign extending variants. */
8027 if ((GET_CODE (op0) == ZERO_EXTEND
8028 && GET_CODE (op1) == ZERO_EXTEND)
8029 || (GET_CODE (op0) == SIGN_EXTEND
8030 && GET_CODE (op1) == SIGN_EXTEND))
8032 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8033 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8035 if (speed)
8037 if (compound_p)
8038 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8039 cost += extra_cost->mult[0].extend_add;
8040 else
8041 /* MUL/SMULL/UMULL. */
8042 cost += extra_cost->mult[0].extend;
8045 return cost;
8048 /* This is either an integer multiply or a MADD. In both cases
8049 we want to recurse and cost the operands. */
8050 cost += rtx_cost (op0, mode, MULT, 0, speed);
8051 cost += rtx_cost (op1, mode, MULT, 1, speed);
8053 if (speed)
8055 if (compound_p)
8056 /* MADD/MSUB. */
8057 cost += extra_cost->mult[mode == DImode].add;
8058 else
8059 /* MUL. */
8060 cost += extra_cost->mult[mode == DImode].simple;
8063 return cost;
8065 else
8067 if (speed)
8069 /* Floating-point FMA/FMUL can also support negations of the
8070 operands, unless the rounding mode is upward or downward, in
8071 which case FNMUL is different from FMUL with operand negation. */
8072 bool neg0 = GET_CODE (op0) == NEG;
8073 bool neg1 = GET_CODE (op1) == NEG;
8074 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8076 if (neg0)
8077 op0 = XEXP (op0, 0);
8078 if (neg1)
8079 op1 = XEXP (op1, 0);
8082 if (compound_p)
8083 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8084 cost += extra_cost->fp[mode == DFmode].fma;
8085 else
8086 /* FMUL/FNMUL. */
8087 cost += extra_cost->fp[mode == DFmode].mult;
8090 cost += rtx_cost (op0, mode, MULT, 0, speed);
8091 cost += rtx_cost (op1, mode, MULT, 1, speed);
8092 return cost;
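/* Costing example (added for illustration): for
	(plus:DI (mult:DI (reg x) (const_int 4)) (reg y))
   the MULT is treated above as a shift by 2 inside a compound PLUS, so
   when optimizing for speed it is charged extra_cost->alu.arith_shift
   (ADD with shifted register) plus the recursive cost of the shifted
   operand, rather than a real multiply cost (unless the tuning sets
   AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, in which case the shift is
   costed as free).  */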
8096 static int
8097 aarch64_address_cost (rtx x,
8098 machine_mode mode,
8099 addr_space_t as ATTRIBUTE_UNUSED,
8100 bool speed)
8102 enum rtx_code c = GET_CODE (x);
8103 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8104 struct aarch64_address_info info;
8105 int cost = 0;
8106 info.shift = 0;
8108 if (!aarch64_classify_address (&info, x, mode, false))
8110 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8112 /* This is a CONST or SYMBOL ref which will be split
8113 in a different way depending on the code model in use.
8114 Cost it through the generic infrastructure. */
8115 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8116 /* Divide through by the cost of one instruction to
8117 bring it to the same units as the address costs. */
8118 cost_symbol_ref /= COSTS_N_INSNS (1);
8119 /* The cost is then the cost of preparing the address,
8120 followed by an immediate (possibly 0) offset. */
8121 return cost_symbol_ref + addr_cost->imm_offset;
8123 else
8125 /* This is most likely a jump table from a case
8126 statement. */
8127 return addr_cost->register_offset;
8131 switch (info.type)
8133 case ADDRESS_LO_SUM:
8134 case ADDRESS_SYMBOLIC:
8135 case ADDRESS_REG_IMM:
8136 cost += addr_cost->imm_offset;
8137 break;
8139 case ADDRESS_REG_WB:
8140 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8141 cost += addr_cost->pre_modify;
8142 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8143 cost += addr_cost->post_modify;
8144 else
8145 gcc_unreachable ();
8147 break;
8149 case ADDRESS_REG_REG:
8150 cost += addr_cost->register_offset;
8151 break;
8153 case ADDRESS_REG_SXTW:
8154 cost += addr_cost->register_sextend;
8155 break;
8157 case ADDRESS_REG_UXTW:
8158 cost += addr_cost->register_zextend;
8159 break;
8161 default:
8162 gcc_unreachable ();
8166 if (info.shift > 0)
8168 /* For the sake of calculating the cost of the shifted register
8169 component, we can treat same sized modes in the same way. */
8170 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8171 cost += addr_cost->addr_scale_costs.hi;
8172 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8173 cost += addr_cost->addr_scale_costs.si;
8174 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8175 cost += addr_cost->addr_scale_costs.di;
8176 else
8177 /* We can't tell, or this is a 128-bit vector. */
8178 cost += addr_cost->addr_scale_costs.ti;
8181 return cost;
8184 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8185 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8186 to be taken. */
8188 int
8189 aarch64_branch_cost (bool speed_p, bool predictable_p)
8191 /* When optimizing for speed, use the cost of unpredictable branches. */
8192 const struct cpu_branch_cost *branch_costs =
8193 aarch64_tune_params.branch_costs;
8195 if (!speed_p || predictable_p)
8196 return branch_costs->predictable;
8197 else
8198 return branch_costs->unpredictable;
8201 /* Return true if the RTX X in mode MODE is a zero or sign extract
8202 usable in an ADD or SUB (extended register) instruction. */
8203 static bool
8204 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8206 /* Catch add with a sign extract.
8207 This is add_<optab><mode>_multp2. */
8208 if (GET_CODE (x) == SIGN_EXTRACT
8209 || GET_CODE (x) == ZERO_EXTRACT)
8211 rtx op0 = XEXP (x, 0);
8212 rtx op1 = XEXP (x, 1);
8213 rtx op2 = XEXP (x, 2);
8215 if (GET_CODE (op0) == MULT
8216 && CONST_INT_P (op1)
8217 && op2 == const0_rtx
8218 && CONST_INT_P (XEXP (op0, 1))
8219 && aarch64_is_extend_from_extract (mode,
8220 XEXP (op0, 1),
8221 op1))
8223 return true;
8226 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8227 No shift. */
8228 else if (GET_CODE (x) == SIGN_EXTEND
8229 || GET_CODE (x) == ZERO_EXTEND)
8230 return REG_P (XEXP (x, 0));
8232 return false;
8235 static bool
8236 aarch64_frint_unspec_p (unsigned int u)
8238 switch (u)
8240 case UNSPEC_FRINTZ:
8241 case UNSPEC_FRINTP:
8242 case UNSPEC_FRINTM:
8243 case UNSPEC_FRINTA:
8244 case UNSPEC_FRINTN:
8245 case UNSPEC_FRINTX:
8246 case UNSPEC_FRINTI:
8247 return true;
8249 default:
8250 return false;
8254 /* Return true iff X is an rtx that will match an extr instruction
8255 i.e. as described in the *extr<mode>5_insn family of patterns.
8256 RES_OP0 and RES_OP1 will be set to the operands of the shifts involved
8257 on success and will be NULL_RTX otherwise. */
8259 static bool
8260 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8262 rtx op0, op1;
8263 scalar_int_mode mode;
8264 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8265 return false;
8267 *res_op0 = NULL_RTX;
8268 *res_op1 = NULL_RTX;
8270 if (GET_CODE (x) != IOR)
8271 return false;
8273 op0 = XEXP (x, 0);
8274 op1 = XEXP (x, 1);
8276 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8277 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8279 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8280 if (GET_CODE (op1) == ASHIFT)
8281 std::swap (op0, op1);
8283 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8284 return false;
8286 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8287 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8289 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8290 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8292 *res_op0 = XEXP (op0, 0);
8293 *res_op1 = XEXP (op1, 0);
8294 return true;
8298 return false;
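/* Example of a matching rtx (added for illustration): in DImode,
	(ior:DI (ashift:DI (reg a) (const_int 48))
		(lshiftrt:DI (reg b) (const_int 16)))
   passes the checks above because 48 + 16 == 64; it corresponds to an
   EXTR Xd, Xa, Xb, #16, with *RES_OP0 set to a and *RES_OP1 to b.  */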
8301 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8302 storing it in *COST. Result is true if the total cost of the operation
8303 has now been calculated. */
8304 static bool
8305 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8307 rtx inner;
8308 rtx comparator;
8309 enum rtx_code cmpcode;
8311 if (COMPARISON_P (op0))
8313 inner = XEXP (op0, 0);
8314 comparator = XEXP (op0, 1);
8315 cmpcode = GET_CODE (op0);
8317 else
8319 inner = op0;
8320 comparator = const0_rtx;
8321 cmpcode = NE;
8324 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8326 /* Conditional branch. */
8327 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8328 return true;
8329 else
8331 if (cmpcode == NE || cmpcode == EQ)
8333 if (comparator == const0_rtx)
8335 /* TBZ/TBNZ/CBZ/CBNZ. */
8336 if (GET_CODE (inner) == ZERO_EXTRACT)
8337 /* TBZ/TBNZ. */
8338 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8339 ZERO_EXTRACT, 0, speed);
8340 else
8341 /* CBZ/CBNZ. */
8342 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8344 return true;
8347 else if (cmpcode == LT || cmpcode == GE)
8349 /* TBZ/TBNZ. */
8350 if (comparator == const0_rtx)
8351 return true;
8355 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8357 /* CCMP. */
8358 if (GET_CODE (op1) == COMPARE)
8360 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8361 if (XEXP (op1, 1) == const0_rtx)
8362 *cost += 1;
8363 if (speed)
8365 machine_mode mode = GET_MODE (XEXP (op1, 0));
8366 const struct cpu_cost_table *extra_cost
8367 = aarch64_tune_params.insn_extra_cost;
8369 if (GET_MODE_CLASS (mode) == MODE_INT)
8370 *cost += extra_cost->alu.arith;
8371 else
8372 *cost += extra_cost->fp[mode == DFmode].compare;
8374 return true;
8377 /* It's a conditional operation based on the status flags,
8378 so it must be some flavor of CSEL. */
8380 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8381 if (GET_CODE (op1) == NEG
8382 || GET_CODE (op1) == NOT
8383 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8384 op1 = XEXP (op1, 0);
8385 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8387 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8388 op1 = XEXP (op1, 0);
8389 op2 = XEXP (op2, 0);
8392 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8393 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8394 return true;
8397 /* We don't know what this is, cost all operands. */
8398 return false;
8401 /* Check whether X is a bitfield operation of the form shift + extend that
8402 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8403 operand to which the bitfield operation is applied. Otherwise return
8404 NULL_RTX. */
8406 static rtx
8407 aarch64_extend_bitfield_pattern_p (rtx x)
8409 rtx_code outer_code = GET_CODE (x);
8410 machine_mode outer_mode = GET_MODE (x);
8412 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8413 && outer_mode != SImode && outer_mode != DImode)
8414 return NULL_RTX;
8416 rtx inner = XEXP (x, 0);
8417 rtx_code inner_code = GET_CODE (inner);
8418 machine_mode inner_mode = GET_MODE (inner);
8419 rtx op = NULL_RTX;
8421 switch (inner_code)
8423 case ASHIFT:
8424 if (CONST_INT_P (XEXP (inner, 1))
8425 && (inner_mode == QImode || inner_mode == HImode))
8426 op = XEXP (inner, 0);
8427 break;
8428 case LSHIFTRT:
8429 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8430 && (inner_mode == QImode || inner_mode == HImode))
8431 op = XEXP (inner, 0);
8432 break;
8433 case ASHIFTRT:
8434 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8435 && (inner_mode == QImode || inner_mode == HImode))
8436 op = XEXP (inner, 0);
8437 break;
8438 default:
8439 break;
8442 return op;
8445 /* Return true if the mask and a shift amount from an RTX of the form
8446 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8447 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8449 bool
8450 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8451 rtx shft_amnt)
8453 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8454 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8455 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8456 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
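/* Worked example (added for illustration): in SImode, mask == 0xff00
   with shft_amnt == 8 satisfies the conditions above, since
   (0xff00 >> 8) + 1 is a power of two and the low 8 bits of the mask
   are clear; the combination (x << 8) & 0xff00 can therefore be
   emitted as a single UBFIZ w0, w1, #8, #8.  */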
8459 /* Calculate the cost of calculating X, storing it in *COST. Result
8460 is true if the total cost of the operation has now been calculated. */
8461 static bool
8462 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8463 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8465 rtx op0, op1, op2;
8466 const struct cpu_cost_table *extra_cost
8467 = aarch64_tune_params.insn_extra_cost;
8468 int code = GET_CODE (x);
8469 scalar_int_mode int_mode;
8471 /* By default, assume that everything has equivalent cost to the
8472 cheapest instruction. Any additional costs are applied as a delta
8473 above this default. */
8474 *cost = COSTS_N_INSNS (1);
8476 switch (code)
8478 case SET:
8479 /* The cost depends entirely on the operands to SET. */
8480 *cost = 0;
8481 op0 = SET_DEST (x);
8482 op1 = SET_SRC (x);
8484 switch (GET_CODE (op0))
8486 case MEM:
8487 if (speed)
8489 rtx address = XEXP (op0, 0);
8490 if (VECTOR_MODE_P (mode))
8491 *cost += extra_cost->ldst.storev;
8492 else if (GET_MODE_CLASS (mode) == MODE_INT)
8493 *cost += extra_cost->ldst.store;
8494 else if (mode == SFmode)
8495 *cost += extra_cost->ldst.storef;
8496 else if (mode == DFmode)
8497 *cost += extra_cost->ldst.stored;
8499 *cost +=
8500 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8501 0, speed));
8504 *cost += rtx_cost (op1, mode, SET, 1, speed);
8505 return true;
8507 case SUBREG:
8508 if (! REG_P (SUBREG_REG (op0)))
8509 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8511 /* Fall through. */
8512 case REG:
8513 /* The cost is one per vector-register copied. */
8514 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8516 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8517 *cost = COSTS_N_INSNS (nregs);
8519 /* const0_rtx is in general free, but we will use an
8520 instruction to set a register to 0. */
8521 else if (REG_P (op1) || op1 == const0_rtx)
8523 /* The cost is 1 per register copied. */
8524 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8525 *cost = COSTS_N_INSNS (nregs);
8527 else
8528 /* Cost is just the cost of the RHS of the set. */
8529 *cost += rtx_cost (op1, mode, SET, 1, speed);
8530 return true;
8532 case ZERO_EXTRACT:
8533 case SIGN_EXTRACT:
8534 /* Bit-field insertion. Strip any redundant widening of
8535 the RHS to meet the width of the target. */
8536 if (GET_CODE (op1) == SUBREG)
8537 op1 = SUBREG_REG (op1);
8538 if ((GET_CODE (op1) == ZERO_EXTEND
8539 || GET_CODE (op1) == SIGN_EXTEND)
8540 && CONST_INT_P (XEXP (op0, 1))
8541 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8542 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8543 op1 = XEXP (op1, 0);
8545 if (CONST_INT_P (op1))
8547 /* MOV immediate is assumed to always be cheap. */
8548 *cost = COSTS_N_INSNS (1);
8550 else
8552 /* BFM. */
8553 if (speed)
8554 *cost += extra_cost->alu.bfi;
8555 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8558 return true;
8560 default:
8561 /* We can't make sense of this, assume default cost. */
8562 *cost = COSTS_N_INSNS (1);
8563 return false;
8565 return false;
8567 case CONST_INT:
8568 /* If an instruction can incorporate a constant within the
8569 instruction, the instruction's expression avoids calling
8570 rtx_cost() on the constant. If rtx_cost() is called on a
8571 constant, then it is usually because the constant must be
8572 moved into a register by one or more instructions.
8574 The exception is constant 0, which can be expressed
8575 as XZR/WZR and is therefore free. The exception to this is
8576 if we have (set (reg) (const0_rtx)) in which case we must cost
8577 the move. However, we can catch that when we cost the SET, so
8578 we don't need to consider that here. */
8579 if (x == const0_rtx)
8580 *cost = 0;
8581 else
8583 /* To a first approximation, the cost of building any other
8584 constant is proportional to the number of instructions
8585 required to build that constant. This is true whether we
8586 are compiling for SPEED or otherwise. */
8587 if (!is_a <scalar_int_mode> (mode, &int_mode))
8588 int_mode = word_mode;
8589 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8590 (NULL_RTX, x, false, int_mode));
8592 return true;
8594 case CONST_DOUBLE:
8596 /* First determine number of instructions to do the move
8597 as an integer constant. */
8598 if (!aarch64_float_const_representable_p (x)
8599 && !aarch64_can_const_movi_rtx_p (x, mode)
8600 && aarch64_float_const_rtx_p (x))
8602 unsigned HOST_WIDE_INT ival;
8603 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8604 gcc_assert (succeed);
8606 scalar_int_mode imode = (mode == HFmode
8607 ? SImode
8608 : int_mode_for_mode (mode).require ());
8609 int ncost = aarch64_internal_mov_immediate
8610 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8611 *cost += COSTS_N_INSNS (ncost);
8612 return true;
8615 if (speed)
8617 /* mov[df,sf]_aarch64. */
8618 if (aarch64_float_const_representable_p (x))
8619 /* FMOV (scalar immediate). */
8620 *cost += extra_cost->fp[mode == DFmode].fpconst;
8621 else if (!aarch64_float_const_zero_rtx_p (x))
8623 /* This will be a load from memory. */
8624 if (mode == DFmode)
8625 *cost += extra_cost->ldst.loadd;
8626 else
8627 *cost += extra_cost->ldst.loadf;
8629 else
8630 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8631 or MOV v0.s[0], wzr - neither of which is modeled by the
8632 cost tables. Just use the default cost. */
8637 return true;
8639 case MEM:
8640 if (speed)
8642 /* For loads we want the base cost of a load, plus an
8643 approximation for the additional cost of the addressing
8644 mode. */
8645 rtx address = XEXP (x, 0);
8646 if (VECTOR_MODE_P (mode))
8647 *cost += extra_cost->ldst.loadv;
8648 else if (GET_MODE_CLASS (mode) == MODE_INT)
8649 *cost += extra_cost->ldst.load;
8650 else if (mode == SFmode)
8651 *cost += extra_cost->ldst.loadf;
8652 else if (mode == DFmode)
8653 *cost += extra_cost->ldst.loadd;
8655 *cost +=
8656 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8657 0, speed));
8660 return true;
8662 case NEG:
8663 op0 = XEXP (x, 0);
8665 if (VECTOR_MODE_P (mode))
8667 if (speed)
8669 /* FNEG. */
8670 *cost += extra_cost->vect.alu;
8672 return false;
8675 if (GET_MODE_CLASS (mode) == MODE_INT)
8677 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8678 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8680 /* CSETM. */
8681 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8682 return true;
8685 /* Cost this as SUB wzr, X. */
8686 op0 = CONST0_RTX (mode);
8687 op1 = XEXP (x, 0);
8688 goto cost_minus;
8691 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8693 /* Support (neg(fma...)) as a single instruction only if
8694 sign of zeros is unimportant. This matches the decision
8695 making in aarch64.md. */
8696 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8698 /* FNMADD. */
8699 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8700 return true;
8702 if (GET_CODE (op0) == MULT)
8704 /* FNMUL. */
8705 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8706 return true;
8708 if (speed)
8709 /* FNEG. */
8710 *cost += extra_cost->fp[mode == DFmode].neg;
8711 return false;
8714 return false;
8716 case CLRSB:
8717 case CLZ:
8718 if (speed)
8720 if (VECTOR_MODE_P (mode))
8721 *cost += extra_cost->vect.alu;
8722 else
8723 *cost += extra_cost->alu.clz;
8726 return false;
8728 case COMPARE:
8729 op0 = XEXP (x, 0);
8730 op1 = XEXP (x, 1);
8732 if (op1 == const0_rtx
8733 && GET_CODE (op0) == AND)
8735 x = op0;
8736 mode = GET_MODE (op0);
8737 goto cost_logic;
8740 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8742 /* TODO: A write to the CC flags possibly costs extra, this
8743 needs encoding in the cost tables. */
8745 mode = GET_MODE (op0);
8746 /* ANDS. */
8747 if (GET_CODE (op0) == AND)
8749 x = op0;
8750 goto cost_logic;
8753 if (GET_CODE (op0) == PLUS)
8755 /* ADDS (and CMN alias). */
8756 x = op0;
8757 goto cost_plus;
8760 if (GET_CODE (op0) == MINUS)
8762 /* SUBS. */
8763 x = op0;
8764 goto cost_minus;
8767 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8768 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8769 && CONST_INT_P (XEXP (op0, 2)))
8771 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8772 Handle it here directly rather than going to cost_logic
8773 since we know the immediate generated for the TST is valid
8774 so we can avoid creating an intermediate rtx for it only
8775 for costing purposes. */
8776 if (speed)
8777 *cost += extra_cost->alu.logical;
8779 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8780 ZERO_EXTRACT, 0, speed);
8781 return true;
8784 if (GET_CODE (op1) == NEG)
8786 /* CMN. */
8787 if (speed)
8788 *cost += extra_cost->alu.arith;
8790 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8791 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8792 return true;
8795 /* CMP.
8797 Compare can freely swap the order of operands, and
8798 canonicalization puts the more complex operation first.
8799 But the integer MINUS logic expects the shift/extend
8800 operation in op1. */
8801 if (! (REG_P (op0)
8802 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8804 op0 = XEXP (x, 1);
8805 op1 = XEXP (x, 0);
8807 goto cost_minus;
8810 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8812 /* FCMP. */
8813 if (speed)
8814 *cost += extra_cost->fp[mode == DFmode].compare;
8816 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8818 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8819 /* FCMP supports constant 0.0 for no extra cost. */
8820 return true;
8822 return false;
8825 if (VECTOR_MODE_P (mode))
8827 /* Vector compare. */
8828 if (speed)
8829 *cost += extra_cost->vect.alu;
8831 if (aarch64_float_const_zero_rtx_p (op1))
8833 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8834 cost. */
8835 return true;
8837 return false;
8839 return false;
8841 case MINUS:
8843 op0 = XEXP (x, 0);
8844 op1 = XEXP (x, 1);
8846 cost_minus:
8847 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8849 /* Detect valid immediates. */
8850 if ((GET_MODE_CLASS (mode) == MODE_INT
8851 || (GET_MODE_CLASS (mode) == MODE_CC
8852 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8853 && CONST_INT_P (op1)
8854 && aarch64_uimm12_shift (INTVAL (op1)))
8856 if (speed)
8857 /* SUB(S) (immediate). */
8858 *cost += extra_cost->alu.arith;
8859 return true;
8862 /* Look for SUB (extended register). */
8863 if (is_a <scalar_int_mode> (mode, &int_mode)
8864 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8866 if (speed)
8867 *cost += extra_cost->alu.extend_arith;
8869 op1 = aarch64_strip_extend (op1, true);
8870 *cost += rtx_cost (op1, VOIDmode,
8871 (enum rtx_code) GET_CODE (op1), 0, speed);
8872 return true;
8875 rtx new_op1 = aarch64_strip_extend (op1, false);
8877 /* Cost this as an FMA-alike operation. */
8878 if ((GET_CODE (new_op1) == MULT
8879 || aarch64_shift_p (GET_CODE (new_op1)))
8880 && code != COMPARE)
8882 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8883 (enum rtx_code) code,
8884 speed);
8885 return true;
8888 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8890 if (speed)
8892 if (VECTOR_MODE_P (mode))
8894 /* Vector SUB. */
8895 *cost += extra_cost->vect.alu;
8897 else if (GET_MODE_CLASS (mode) == MODE_INT)
8899 /* SUB(S). */
8900 *cost += extra_cost->alu.arith;
8902 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8904 /* FSUB. */
8905 *cost += extra_cost->fp[mode == DFmode].addsub;
8908 return true;
8911 case PLUS:
8913 rtx new_op0;
8915 op0 = XEXP (x, 0);
8916 op1 = XEXP (x, 1);
8918 cost_plus:
8919 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8920 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8922 /* CSINC. */
8923 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8924 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8925 return true;
8928 if (GET_MODE_CLASS (mode) == MODE_INT
8929 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8930 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8932 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8934 if (speed)
8935 /* ADD (immediate). */
8936 *cost += extra_cost->alu.arith;
8937 return true;
8940 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8942 /* Look for ADD (extended register). */
8943 if (is_a <scalar_int_mode> (mode, &int_mode)
8944 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8946 if (speed)
8947 *cost += extra_cost->alu.extend_arith;
8949 op0 = aarch64_strip_extend (op0, true);
8950 *cost += rtx_cost (op0, VOIDmode,
8951 (enum rtx_code) GET_CODE (op0), 0, speed);
8952 return true;
8955 /* Strip any extend, leave shifts behind as we will
8956 cost them through mult_cost. */
8957 new_op0 = aarch64_strip_extend (op0, false);
8959 if (GET_CODE (new_op0) == MULT
8960 || aarch64_shift_p (GET_CODE (new_op0)))
8962 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8963 speed);
8964 return true;
8967 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8969 if (speed)
8971 if (VECTOR_MODE_P (mode))
8973 /* Vector ADD. */
8974 *cost += extra_cost->vect.alu;
8976 else if (GET_MODE_CLASS (mode) == MODE_INT)
8978 /* ADD. */
8979 *cost += extra_cost->alu.arith;
8981 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8983 /* FADD. */
8984 *cost += extra_cost->fp[mode == DFmode].addsub;
8987 return true;
8990 case BSWAP:
8991 *cost = COSTS_N_INSNS (1);
8993 if (speed)
8995 if (VECTOR_MODE_P (mode))
8996 *cost += extra_cost->vect.alu;
8997 else
8998 *cost += extra_cost->alu.rev;
9000 return false;
9002 case IOR:
9003 if (aarch_rev16_p (x))
9005 *cost = COSTS_N_INSNS (1);
9007 if (speed)
9009 if (VECTOR_MODE_P (mode))
9010 *cost += extra_cost->vect.alu;
9011 else
9012 *cost += extra_cost->alu.rev;
9014 return true;
9017 if (aarch64_extr_rtx_p (x, &op0, &op1))
9019 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9020 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9021 if (speed)
9022 *cost += extra_cost->alu.shift;
9024 return true;
9026 /* Fall through. */
9027 case XOR:
9028 case AND:
9029 cost_logic:
9030 op0 = XEXP (x, 0);
9031 op1 = XEXP (x, 1);
9033 if (VECTOR_MODE_P (mode))
9035 if (speed)
9036 *cost += extra_cost->vect.alu;
9037 return true;
9040 if (code == AND
9041 && GET_CODE (op0) == MULT
9042 && CONST_INT_P (XEXP (op0, 1))
9043 && CONST_INT_P (op1)
9044 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9045 INTVAL (op1)) != 0)
9047 /* This is a UBFM/SBFM. */
9048 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9049 if (speed)
9050 *cost += extra_cost->alu.bfx;
9051 return true;
9054 if (is_int_mode (mode, &int_mode))
9056 if (CONST_INT_P (op1))
9058 /* We have a mask + shift version of a UBFIZ
9059 i.e. the *andim_ashift<mode>_bfiz pattern. */
9060 if (GET_CODE (op0) == ASHIFT
9061 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9062 XEXP (op0, 1)))
9064 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9065 (enum rtx_code) code, 0, speed);
9066 if (speed)
9067 *cost += extra_cost->alu.bfx;
9069 return true;
9071 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9073 /* We possibly get the immediate for free, this is not
9074 modelled. */
9075 *cost += rtx_cost (op0, int_mode,
9076 (enum rtx_code) code, 0, speed);
9077 if (speed)
9078 *cost += extra_cost->alu.logical;
9080 return true;
9083 else
9085 rtx new_op0 = op0;
9087 /* Handle ORN, EON, or BIC. */
9088 if (GET_CODE (op0) == NOT)
9089 op0 = XEXP (op0, 0);
9091 new_op0 = aarch64_strip_shift (op0);
9093 /* If we had a shift on op0 then this is a logical-shift-
9094 by-register/immediate operation. Otherwise, this is just
9095 a logical operation. */
9096 if (speed)
9098 if (new_op0 != op0)
9100 /* Shift by immediate. */
9101 if (CONST_INT_P (XEXP (op0, 1)))
9102 *cost += extra_cost->alu.log_shift;
9103 else
9104 *cost += extra_cost->alu.log_shift_reg;
9106 else
9107 *cost += extra_cost->alu.logical;
9110 /* In both cases we want to cost both operands. */
9111 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9112 0, speed);
9113 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9114 1, speed);
9116 return true;
9119 return false;
9121 case NOT:
9122 x = XEXP (x, 0);
9123 op0 = aarch64_strip_shift (x);
9125 if (VECTOR_MODE_P (mode))
9127 /* Vector NOT. */
9128 *cost += extra_cost->vect.alu;
9129 return false;
9132 /* MVN-shifted-reg. */
9133 if (op0 != x)
9135 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9137 if (speed)
9138 *cost += extra_cost->alu.log_shift;
9140 return true;
9142 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9143 Handle the second form here taking care that 'a' in the above can
9144 be a shift. */
9145 else if (GET_CODE (op0) == XOR)
9147 rtx newop0 = XEXP (op0, 0);
9148 rtx newop1 = XEXP (op0, 1);
9149 rtx op0_stripped = aarch64_strip_shift (newop0);
9151 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9152 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9154 if (speed)
9156 if (op0_stripped != newop0)
9157 *cost += extra_cost->alu.log_shift;
9158 else
9159 *cost += extra_cost->alu.logical;
9162 return true;
9164 /* MVN. */
9165 if (speed)
9166 *cost += extra_cost->alu.logical;
9168 return false;
9170 case ZERO_EXTEND:
9172 op0 = XEXP (x, 0);
9173 /* If a value is written in SI mode, then zero extended to DI
9174 mode, the operation will in general be free as a write to
9175 a 'w' register implicitly zeroes the upper bits of an 'x'
9176 register. However, if this is
9178 (set (reg) (zero_extend (reg)))
9180 we must cost the explicit register move. */
9181 if (mode == DImode
9182 && GET_MODE (op0) == SImode
9183 && outer == SET)
9185 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9187 /* If OP_COST is non-zero, then the cost of the zero extend
9188 is effectively the cost of the inner operation. Otherwise
9189 we have a MOV instruction and we take the cost from the MOV
9190 itself. This is true independently of whether we are
9191 optimizing for space or time. */
9192 if (op_cost)
9193 *cost = op_cost;
9195 return true;
9197 else if (MEM_P (op0))
9199 /* All loads can zero extend to any size for free. */
9200 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9201 return true;
9204 op0 = aarch64_extend_bitfield_pattern_p (x);
9205 if (op0)
9207 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9208 if (speed)
9209 *cost += extra_cost->alu.bfx;
9210 return true;
9213 if (speed)
9215 if (VECTOR_MODE_P (mode))
9217 /* UMOV. */
9218 *cost += extra_cost->vect.alu;
9220 else
9222 /* We generate an AND instead of UXTB/UXTH. */
9223 *cost += extra_cost->alu.logical;
9226 return false;
9228 case SIGN_EXTEND:
9229 if (MEM_P (XEXP (x, 0)))
9231 /* LDRSH. */
9232 if (speed)
9234 rtx address = XEXP (XEXP (x, 0), 0);
9235 *cost += extra_cost->ldst.load_sign_extend;
9237 *cost +=
9238 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9239 0, speed));
9241 return true;
9244 op0 = aarch64_extend_bitfield_pattern_p (x);
9245 if (op0)
9247 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9248 if (speed)
9249 *cost += extra_cost->alu.bfx;
9250 return true;
9253 if (speed)
9255 if (VECTOR_MODE_P (mode))
9256 *cost += extra_cost->vect.alu;
9257 else
9258 *cost += extra_cost->alu.extend;
9260 return false;
9262 case ASHIFT:
9263 op0 = XEXP (x, 0);
9264 op1 = XEXP (x, 1);
9266 if (CONST_INT_P (op1))
9268 if (speed)
9270 if (VECTOR_MODE_P (mode))
9272 /* Vector shift (immediate). */
9273 *cost += extra_cost->vect.alu;
9275 else
9277 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9278 aliases. */
9279 *cost += extra_cost->alu.shift;
9283 /* We can incorporate zero/sign extend for free. */
9284 if (GET_CODE (op0) == ZERO_EXTEND
9285 || GET_CODE (op0) == SIGN_EXTEND)
9286 op0 = XEXP (op0, 0);
9288 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9289 return true;
9291 else
9293 if (VECTOR_MODE_P (mode))
9295 if (speed)
9296 /* Vector shift (register). */
9297 *cost += extra_cost->vect.alu;
9299 else
9301 if (speed)
9302 /* LSLV. */
9303 *cost += extra_cost->alu.shift_reg;
9305 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9306 && CONST_INT_P (XEXP (op1, 1))
9307 && known_eq (INTVAL (XEXP (op1, 1)),
9308 GET_MODE_BITSIZE (mode) - 1))
9310 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9311 /* We already demanded XEXP (op1, 0) to be REG_P, so
9312 don't recurse into it. */
9313 return true;
9316 return false; /* All arguments need to be in registers. */
9319 case ROTATE:
9320 case ROTATERT:
9321 case LSHIFTRT:
9322 case ASHIFTRT:
9323 op0 = XEXP (x, 0);
9324 op1 = XEXP (x, 1);
9326 if (CONST_INT_P (op1))
9328 /* ASR (immediate) and friends. */
9329 if (speed)
9331 if (VECTOR_MODE_P (mode))
9332 *cost += extra_cost->vect.alu;
9333 else
9334 *cost += extra_cost->alu.shift;
9337 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9338 return true;
9340 else
9342 if (VECTOR_MODE_P (mode))
9344 if (speed)
9345 /* Vector shift (register). */
9346 *cost += extra_cost->vect.alu;
9348 else
9350 if (speed)
9351 /* ASR (register) and friends. */
9352 *cost += extra_cost->alu.shift_reg;
9354 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9355 && CONST_INT_P (XEXP (op1, 1))
9356 && known_eq (INTVAL (XEXP (op1, 1)),
9357 GET_MODE_BITSIZE (mode) - 1))
9359 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9360 /* We already demanded XEXP (op1, 0) to be REG_P, so
9361 don't recurse into it. */
9362 return true;
9365 return false; /* All arguments need to be in registers. */
9368 case SYMBOL_REF:
9370 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9371 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9373 /* LDR. */
9374 if (speed)
9375 *cost += extra_cost->ldst.load;
9377 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9378 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9380 /* ADRP, followed by ADD. */
9381 *cost += COSTS_N_INSNS (1);
9382 if (speed)
9383 *cost += 2 * extra_cost->alu.arith;
9385 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9386 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9388 /* ADR. */
9389 if (speed)
9390 *cost += extra_cost->alu.arith;
9393 if (flag_pic)
9395 /* One extra load instruction, after accessing the GOT. */
9396 *cost += COSTS_N_INSNS (1);
9397 if (speed)
9398 *cost += extra_cost->ldst.load;
9400 return true;
9402 case HIGH:
9403 case LO_SUM:
9404 /* ADRP/ADD (immediate). */
9405 if (speed)
9406 *cost += extra_cost->alu.arith;
9407 return true;
9409 case ZERO_EXTRACT:
9410 case SIGN_EXTRACT:
9411 /* UBFX/SBFX. */
9412 if (speed)
9414 if (VECTOR_MODE_P (mode))
9415 *cost += extra_cost->vect.alu;
9416 else
9417 *cost += extra_cost->alu.bfx;
9420 /* We can trust that the immediates used will be correct (there
9421 are no by-register forms), so we need only cost op0. */
9422 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9423 return true;
9425 case MULT:
9426 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9427 /* aarch64_rtx_mult_cost always handles recursion to its
9428 operands. */
9429 return true;
9431 case MOD:
9432 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9433 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9434 an unconditional negate. This case should only ever be reached through
9435 the set_smod_pow2_cheap check in expmed.c. */
9436 if (CONST_INT_P (XEXP (x, 1))
9437 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9438 && (mode == SImode || mode == DImode))
9440 /* We expand to 4 instructions. Reset the baseline. */
9441 *cost = COSTS_N_INSNS (4);
9443 if (speed)
9444 *cost += 2 * extra_cost->alu.logical
9445 + 2 * extra_cost->alu.arith;
9447 return true;
9450 /* Fall-through. */
9451 case UMOD:
9452 if (speed)
9454 /* Slightly prefer UMOD over SMOD. */
9455 if (VECTOR_MODE_P (mode))
9456 *cost += extra_cost->vect.alu;
9457 else if (GET_MODE_CLASS (mode) == MODE_INT)
9458 *cost += (extra_cost->mult[mode == DImode].add
9459 + extra_cost->mult[mode == DImode].idiv
9460 + (code == MOD ? 1 : 0));
9462 return false; /* All arguments need to be in registers. */
9464 case DIV:
9465 case UDIV:
9466 case SQRT:
9467 if (speed)
9469 if (VECTOR_MODE_P (mode))
9470 *cost += extra_cost->vect.alu;
9471 else if (GET_MODE_CLASS (mode) == MODE_INT)
9472 /* There is no integer SQRT, so only DIV and UDIV can get
9473 here. */
9474 *cost += (extra_cost->mult[mode == DImode].idiv
9475 /* Slightly prefer UDIV over SDIV. */
9476 + (code == DIV ? 1 : 0));
9477 else
9478 *cost += extra_cost->fp[mode == DFmode].div;
9480 return false; /* All arguments need to be in registers. */
9482 case IF_THEN_ELSE:
9483 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9484 XEXP (x, 2), cost, speed);
9486 case EQ:
9487 case NE:
9488 case GT:
9489 case GTU:
9490 case LT:
9491 case LTU:
9492 case GE:
9493 case GEU:
9494 case LE:
9495 case LEU:
9497 return false; /* All arguments must be in registers. */
9499 case FMA:
9500 op0 = XEXP (x, 0);
9501 op1 = XEXP (x, 1);
9502 op2 = XEXP (x, 2);
9504 if (speed)
9506 if (VECTOR_MODE_P (mode))
9507 *cost += extra_cost->vect.alu;
9508 else
9509 *cost += extra_cost->fp[mode == DFmode].fma;
9512 /* FMSUB, FNMADD, and FNMSUB are free. */
9513 if (GET_CODE (op0) == NEG)
9514 op0 = XEXP (op0, 0);
9516 if (GET_CODE (op2) == NEG)
9517 op2 = XEXP (op2, 0);
9519 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9520 and the by-element operand as operand 0. */
9521 if (GET_CODE (op1) == NEG)
9522 op1 = XEXP (op1, 0);
9524 /* Catch vector-by-element operations. The by-element operand can
9525 either be (vec_duplicate (vec_select (x))) or just
9526 (vec_select (x)), depending on whether we are multiplying by
9527 a vector or a scalar.
9529 Canonicalization is not very good in these cases: FMA4 will put the
9530 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9531 if (GET_CODE (op0) == VEC_DUPLICATE)
9532 op0 = XEXP (op0, 0);
9533 else if (GET_CODE (op1) == VEC_DUPLICATE)
9534 op1 = XEXP (op1, 0);
9536 if (GET_CODE (op0) == VEC_SELECT)
9537 op0 = XEXP (op0, 0);
9538 else if (GET_CODE (op1) == VEC_SELECT)
9539 op1 = XEXP (op1, 0);
9541 /* If the remaining parameters are not registers,
9542 get the cost to put them into registers. */
9543 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9544 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9545 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9546 return true;
9548 case FLOAT:
9549 case UNSIGNED_FLOAT:
9550 if (speed)
9551 *cost += extra_cost->fp[mode == DFmode].fromint;
9552 return false;
9554 case FLOAT_EXTEND:
9555 if (speed)
9557 if (VECTOR_MODE_P (mode))
9559 /* Vector widening conversion. */
9560 *cost += extra_cost->vect.alu;
9562 else
9563 *cost += extra_cost->fp[mode == DFmode].widen;
9565 return false;
9567 case FLOAT_TRUNCATE:
9568 if (speed)
9570 if (VECTOR_MODE_P (mode))
9572 /* Vector narrowing conversion. */
9573 *cost += extra_cost->vect.alu;
9575 else
9576 *cost += extra_cost->fp[mode == DFmode].narrow;
9578 return false;
9580 case FIX:
9581 case UNSIGNED_FIX:
9582 x = XEXP (x, 0);
9583 /* Strip the rounding part. They will all be implemented
9584 by the fcvt* family of instructions anyway. */
9585 if (GET_CODE (x) == UNSPEC)
9587 unsigned int uns_code = XINT (x, 1);
9589 if (uns_code == UNSPEC_FRINTA
9590 || uns_code == UNSPEC_FRINTM
9591 || uns_code == UNSPEC_FRINTN
9592 || uns_code == UNSPEC_FRINTP
9593 || uns_code == UNSPEC_FRINTZ)
9594 x = XVECEXP (x, 0, 0);
9597 if (speed)
9599 if (VECTOR_MODE_P (mode))
9600 *cost += extra_cost->vect.alu;
9601 else
9602 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9605 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9606 fixed-point fcvt. */
9607 if (GET_CODE (x) == MULT
9608 && ((VECTOR_MODE_P (mode)
9609 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9610 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9612 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9613 0, speed);
9614 return true;
9617 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9618 return true;
9620 case ABS:
9621 if (VECTOR_MODE_P (mode))
9623 /* ABS (vector). */
9624 if (speed)
9625 *cost += extra_cost->vect.alu;
9627 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9629 op0 = XEXP (x, 0);
9631 /* FABD, which is analogous to FADD. */
9632 if (GET_CODE (op0) == MINUS)
9634 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9635 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9636 if (speed)
9637 *cost += extra_cost->fp[mode == DFmode].addsub;
9639 return true;
9641 /* Simple FABS is analogous to FNEG. */
9642 if (speed)
9643 *cost += extra_cost->fp[mode == DFmode].neg;
9645 else
9647 /* Integer ABS will either be split to
9648 two arithmetic instructions, or will be an ABS
9649 (scalar), which we don't model. */
9650 *cost = COSTS_N_INSNS (2);
9651 if (speed)
9652 *cost += 2 * extra_cost->alu.arith;
9654 return false;
9656 case SMAX:
9657 case SMIN:
9658 if (speed)
9660 if (VECTOR_MODE_P (mode))
9661 *cost += extra_cost->vect.alu;
9662 else
9664 /* FMAXNM/FMINNM/FMAX/FMIN.
9665 TODO: This may not be accurate for all implementations, but
9666 we do not model this in the cost tables. */
9667 *cost += extra_cost->fp[mode == DFmode].addsub;
9670 return false;
9672 case UNSPEC:
9673 /* The floating point round to integer frint* instructions. */
9674 if (aarch64_frint_unspec_p (XINT (x, 1)))
9676 if (speed)
9677 *cost += extra_cost->fp[mode == DFmode].roundint;
9679 return false;
9682 if (XINT (x, 1) == UNSPEC_RBIT)
9684 if (speed)
9685 *cost += extra_cost->alu.rev;
9687 return false;
9689 break;
9691 case TRUNCATE:
9693 /* Decompose <su>muldi3_highpart. */
9694 if (/* (truncate:DI */
9695 mode == DImode
9696 /* (lshiftrt:TI */
9697 && GET_MODE (XEXP (x, 0)) == TImode
9698 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9699 /* (mult:TI */
9700 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9701 /* (ANY_EXTEND:TI (reg:DI))
9702 (ANY_EXTEND:TI (reg:DI))) */
9703 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9704 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9705 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9706 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9707 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9708 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9709 /* (const_int 64) */
9710 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9711 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9713 /* UMULH/SMULH. */
9714 if (speed)
9715 *cost += extra_cost->mult[mode == DImode].extend;
9716 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9717 mode, MULT, 0, speed);
9718 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9719 mode, MULT, 1, speed);
9720 return true;
9723 /* Fall through. */
9724 default:
9725 break;
9728 if (dump_file
9729 && flag_aarch64_verbose_cost)
9730 fprintf (dump_file,
9731 "\nFailed to cost RTX. Assuming default cost.\n");
9733 return true;
9736 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9737 calculated for X. This cost is stored in *COST. Returns true
9738 if the total cost of X was calculated. */
9739 static bool
9740 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9741 int param, int *cost, bool speed)
9743 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9745 if (dump_file
9746 && flag_aarch64_verbose_cost)
9748 print_rtl_single (dump_file, x);
9749 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9750 speed ? "Hot" : "Cold",
9751 *cost, result ? "final" : "partial");
9754 return result;
9757 static int
9758 aarch64_register_move_cost (machine_mode mode,
9759 reg_class_t from_i, reg_class_t to_i)
9761 enum reg_class from = (enum reg_class) from_i;
9762 enum reg_class to = (enum reg_class) to_i;
9763 const struct cpu_regmove_cost *regmove_cost
9764 = aarch64_tune_params.regmove_cost;
9766 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9767 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9768 to = GENERAL_REGS;
9770 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9771 from = GENERAL_REGS;
9773 /* Moving between GPR and stack cost is the same as GP2GP. */
9774 if ((from == GENERAL_REGS && to == STACK_REG)
9775 || (to == GENERAL_REGS && from == STACK_REG))
9776 return regmove_cost->GP2GP;
9778 /* To/From the stack register, we move via the gprs. */
9779 if (to == STACK_REG || from == STACK_REG)
9780 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9781 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9783 if (known_eq (GET_MODE_SIZE (mode), 16))
9785 /* 128-bit operations on general registers require 2 instructions. */
9786 if (from == GENERAL_REGS && to == GENERAL_REGS)
9787 return regmove_cost->GP2GP * 2;
9788 else if (from == GENERAL_REGS)
9789 return regmove_cost->GP2FP * 2;
9790 else if (to == GENERAL_REGS)
9791 return regmove_cost->FP2GP * 2;
9793 /* When AdvSIMD instructions are disabled it is not possible to move
9794 a 128-bit value directly between Q registers. This is handled in
9795 secondary reload. A general register is used as a scratch to move
9796 the upper DI value and the lower DI value is moved directly,
9797 hence the cost is the sum of three moves. */
9798 if (! TARGET_SIMD)
9799 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9801 return regmove_cost->FP2FP;
9804 if (from == GENERAL_REGS && to == GENERAL_REGS)
9805 return regmove_cost->GP2GP;
9806 else if (from == GENERAL_REGS)
9807 return regmove_cost->GP2FP;
9808 else if (to == GENERAL_REGS)
9809 return regmove_cost->FP2GP;
9811 return regmove_cost->FP2FP;
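/* As a worked example with hypothetical per-core costs GP2GP = 1,
   GP2FP = 5, FP2GP = 6 and FP2FP = 2, a 16-byte move costs:

     GPR -> GPR  2 * GP2GP = 2   (two X-register moves)
     GPR -> FPR  2 * GP2FP = 10
     FPR -> GPR  2 * FP2GP = 12
     FPR -> FPR  FP2FP = 2 with SIMD enabled, otherwise
                 GP2FP + FP2GP + FP2FP = 13 via a GPR scratch.

   Smaller modes fall through to the single-move costs below.  */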
9814 static int
9815 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9816 reg_class_t rclass ATTRIBUTE_UNUSED,
9817 bool in ATTRIBUTE_UNUSED)
9819 return aarch64_tune_params.memmov_cost;
9822 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9823 to optimize 1.0/sqrt. */
9825 static bool
9826 use_rsqrt_p (machine_mode mode)
9828 return (!flag_trapping_math
9829 && flag_unsafe_math_optimizations
9830 && ((aarch64_tune_params.approx_modes->recip_sqrt
9831 & AARCH64_APPROX_MODE (mode))
9832 || flag_mrecip_low_precision_sqrt));
9835 /* Function to decide when to use the approximate reciprocal square root
9836 builtin. */
9838 static tree
9839 aarch64_builtin_reciprocal (tree fndecl)
9841 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9843 if (!use_rsqrt_p (mode))
9844 return NULL_TREE;
9845 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9848 typedef rtx (*rsqrte_type) (rtx, rtx);
9850 /* Select reciprocal square root initial estimate insn depending on machine
9851 mode. */
9853 static rsqrte_type
9854 get_rsqrte_type (machine_mode mode)
9856 switch (mode)
9858 case E_DFmode: return gen_aarch64_rsqrtedf;
9859 case E_SFmode: return gen_aarch64_rsqrtesf;
9860 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9861 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9862 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9863 default: gcc_unreachable ();
9867 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9869 /* Select reciprocal square root series step insn depending on machine mode. */
9871 static rsqrts_type
9872 get_rsqrts_type (machine_mode mode)
9874 switch (mode)
9876 case E_DFmode: return gen_aarch64_rsqrtsdf;
9877 case E_SFmode: return gen_aarch64_rsqrtssf;
9878 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9879 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9880 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9881 default: gcc_unreachable ();
9885 /* Emit instruction sequence to compute either the approximate square root
9886 or its approximate reciprocal, depending on the flag RECP, and return
9887 whether the sequence was emitted or not. */
9889 bool
9890 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9892 machine_mode mode = GET_MODE (dst);
9894 if (GET_MODE_INNER (mode) == HFmode)
9896 gcc_assert (!recp);
9897 return false;
9900 if (!recp)
9902 if (!(flag_mlow_precision_sqrt
9903 || (aarch64_tune_params.approx_modes->sqrt
9904 & AARCH64_APPROX_MODE (mode))))
9905 return false;
9907 if (flag_finite_math_only
9908 || flag_trapping_math
9909 || !flag_unsafe_math_optimizations
9910 || optimize_function_for_size_p (cfun))
9911 return false;
9913 else
9914 /* Caller assumes we cannot fail. */
9915 gcc_assert (use_rsqrt_p (mode));
9917 machine_mode mmsk = mode_for_int_vector (mode).require ();
9918 rtx xmsk = gen_reg_rtx (mmsk);
9919 if (!recp)
9920 /* When calculating the approximate square root, compare the
9921 argument with 0.0 and create a mask. */
9922 emit_insn (gen_rtx_SET (xmsk,
9923 gen_rtx_NEG (mmsk,
9924 gen_rtx_EQ (mmsk, src,
9925 CONST0_RTX (mode)))));
9927 /* Estimate the approximate reciprocal square root. */
9928 rtx xdst = gen_reg_rtx (mode);
9929 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9931 /* Iterate over the series twice for SF and thrice for DF. */
9932 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9934 /* Optionally run one fewer iteration of the series for faster performance,
9935 at the expense of some accuracy. */
9936 if ((recp && flag_mrecip_low_precision_sqrt)
9937 || (!recp && flag_mlow_precision_sqrt))
9938 iterations--;
9940 /* Iterate over the series to calculate the approximate reciprocal square
9941 root. */
9942 rtx x1 = gen_reg_rtx (mode);
9943 while (iterations--)
9945 rtx x2 = gen_reg_rtx (mode);
9946 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9948 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9950 if (iterations > 0)
9951 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9954 if (!recp)
9956 /* Qualify the approximate reciprocal square root when the argument is
9957 0.0 by squashing the intermediary result to 0.0. */
9958 rtx xtmp = gen_reg_rtx (mmsk);
9959 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9960 gen_rtx_SUBREG (mmsk, xdst, 0)));
9961 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9963 /* Calculate the approximate square root. */
9964 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9967 /* Finalize the approximation. */
9968 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9970 return true;
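/* The sequence above is the Newton-Raphson iteration for 1/sqrt(a):
   starting from the FRSQRTE estimate x0, each step refines

     x_{n+1} = x_n * (3 - a * x_n^2) / 2

   where the (3 - a*b)/2 term is what the FRSQRTS step instruction
   computes.  For the non-reciprocal case the final multiply by the
   source gives sqrt(a) = a * 1/sqrt(a), with the mask computed above
   forcing the result to 0.0 for a zero input.  */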
9973 typedef rtx (*recpe_type) (rtx, rtx);
9975 /* Select reciprocal initial estimate insn depending on machine mode. */
9977 static recpe_type
9978 get_recpe_type (machine_mode mode)
9980 switch (mode)
9982 case E_SFmode: return (gen_aarch64_frecpesf);
9983 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9984 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9985 case E_DFmode: return (gen_aarch64_frecpedf);
9986 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9987 default: gcc_unreachable ();
9991 typedef rtx (*recps_type) (rtx, rtx, rtx);
9993 /* Select reciprocal series step insn depending on machine mode. */
9995 static recps_type
9996 get_recps_type (machine_mode mode)
9998 switch (mode)
10000 case E_SFmode: return (gen_aarch64_frecpssf);
10001 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
10002 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
10003 case E_DFmode: return (gen_aarch64_frecpsdf);
10004 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
10005 default: gcc_unreachable ();
10009 /* Emit the instruction sequence to compute the approximation for the division
10010 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
10012 bool
10013 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
10015 machine_mode mode = GET_MODE (quo);
10017 if (GET_MODE_INNER (mode) == HFmode)
10018 return false;
10020 bool use_approx_division_p = (flag_mlow_precision_div
10021 || (aarch64_tune_params.approx_modes->division
10022 & AARCH64_APPROX_MODE (mode)));
10024 if (!flag_finite_math_only
10025 || flag_trapping_math
10026 || !flag_unsafe_math_optimizations
10027 || optimize_function_for_size_p (cfun)
10028 || !use_approx_division_p)
10029 return false;
10031 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
10032 return false;
10034 /* Estimate the approximate reciprocal. */
10035 rtx xrcp = gen_reg_rtx (mode);
10036 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
10038 /* Iterate over the series twice for SF and thrice for DF. */
10039 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
10041 /* Optionally run one fewer iteration of the series for faster performance,
10042 at the expense of some accuracy. */
10043 if (flag_mlow_precision_div)
10044 iterations--;
10046 /* Iterate over the series to calculate the approximate reciprocal. */
10047 rtx xtmp = gen_reg_rtx (mode);
10048 while (iterations--)
10050 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10052 if (iterations > 0)
10053 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10056 if (num != CONST1_RTX (mode))
10058 /* As the approximate reciprocal of DEN is already calculated, only
10059 calculate the approximate division when NUM is not 1.0. */
10060 rtx xnum = force_reg (mode, num);
10061 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10064 /* Finalize the approximation. */
10065 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10066 return true;
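/* Likewise, the sequence above is the Newton-Raphson iteration for the
   reciprocal 1/den: starting from the FRECPE estimate x0, each step
   refines

     x_{n+1} = x_n * (2 - den * x_n)

   where the (2 - a*b) term is what the FRECPS step instruction
   computes; the quotient is then formed as num * (1/den).  */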
10069 /* Return the number of instructions that can be issued per cycle. */
10070 static int
10071 aarch64_sched_issue_rate (void)
10073 return aarch64_tune_params.issue_rate;
10076 static int
10077 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10079 int issue_rate = aarch64_sched_issue_rate ();
10081 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10085 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10086 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10087 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10089 static int
10090 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10091 int ready_index)
10093 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10097 /* Vectorizer cost model target hooks. */
10099 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10100 static int
10101 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10102 tree vectype,
10103 int misalign ATTRIBUTE_UNUSED)
10105 unsigned elements;
10106 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10107 bool fp = false;
10109 if (vectype != NULL)
10110 fp = FLOAT_TYPE_P (vectype);
10112 switch (type_of_cost)
10114 case scalar_stmt:
10115 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10117 case scalar_load:
10118 return costs->scalar_load_cost;
10120 case scalar_store:
10121 return costs->scalar_store_cost;
10123 case vector_stmt:
10124 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10126 case vector_load:
10127 return costs->vec_align_load_cost;
10129 case vector_store:
10130 return costs->vec_store_cost;
10132 case vec_to_scalar:
10133 return costs->vec_to_scalar_cost;
10135 case scalar_to_vec:
10136 return costs->scalar_to_vec_cost;
10138 case unaligned_load:
10139 case vector_gather_load:
10140 return costs->vec_unalign_load_cost;
10142 case unaligned_store:
10143 case vector_scatter_store:
10144 return costs->vec_unalign_store_cost;
10146 case cond_branch_taken:
10147 return costs->cond_taken_branch_cost;
10149 case cond_branch_not_taken:
10150 return costs->cond_not_taken_branch_cost;
10152 case vec_perm:
10153 return costs->vec_permute_cost;
10155 case vec_promote_demote:
10156 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10158 case vec_construct:
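      /* Building a vector from scalars is costed as elements / 2 + 1,
         e.g. an estimated 4-element vector costs 4 / 2 + 1 = 3.  */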
10159 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10160 return elements / 2 + 1;
10162 default:
10163 gcc_unreachable ();
10167 /* Implement targetm.vectorize.add_stmt_cost. */
10168 static unsigned
10169 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10170 struct _stmt_vec_info *stmt_info, int misalign,
10171 enum vect_cost_model_location where)
10173 unsigned *cost = (unsigned *) data;
10174 unsigned retval = 0;
10176 if (flag_vect_cost_model)
10178 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10179 int stmt_cost =
10180 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10182 /* Statements in an inner loop relative to the loop being
10183 vectorized are weighted more heavily. The value here is
10184 arbitrary and could potentially be improved with analysis. */
10185 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10186 count *= 50; /* FIXME */
10188 retval = (unsigned) (count * stmt_cost);
10189 cost[where] += retval;
10192 return retval;
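/* For example, with a hypothetical vec_fp_stmt_cost of 2, one vector FP
   statement in the loop body adds 1 * 2 = 2 to cost[vect_body], but
   1 * 50 * 2 = 100 if it belongs to a loop nested inside the loop being
   vectorized.  */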
10195 static void initialize_aarch64_code_model (struct gcc_options *);
10197 /* Parse the TO_PARSE string and put the architecture struct that it
10198 selects into RES and the architectural features into ISA_FLAGS.
10199 Return an aarch64_parse_opt_result describing the parse result.
10200 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10202 static enum aarch64_parse_opt_result
10203 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10204 unsigned long *isa_flags)
10206 char *ext;
10207 const struct processor *arch;
10208 char *str = (char *) alloca (strlen (to_parse) + 1);
10209 size_t len;
10211 strcpy (str, to_parse);
10213 ext = strchr (str, '+');
10215 if (ext != NULL)
10216 len = ext - str;
10217 else
10218 len = strlen (str);
10220 if (len == 0)
10221 return AARCH64_PARSE_MISSING_ARG;
10224 /* Loop through the list of supported ARCHes to find a match. */
10225 for (arch = all_architectures; arch->name != NULL; arch++)
10227 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10229 unsigned long isa_temp = arch->flags;
10231 if (ext != NULL)
10233 /* TO_PARSE string contains at least one extension. */
10234 enum aarch64_parse_opt_result ext_res
10235 = aarch64_parse_extension (ext, &isa_temp);
10237 if (ext_res != AARCH64_PARSE_OK)
10238 return ext_res;
10240 /* Extension parsing was successful. Confirm the result
10241 arch and ISA flags. */
10242 *res = arch;
10243 *isa_flags = isa_temp;
10244 return AARCH64_PARSE_OK;
10248 /* ARCH name not found in list. */
10249 return AARCH64_PARSE_INVALID_ARG;
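/* For example, a string such as "armv8-a+crc" is split at the first '+':
   "armv8-a" is looked up in all_architectures and the remaining "+crc"
   is handed to aarch64_parse_extension to adjust the ISA flags.  */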
10252 /* Parse the TO_PARSE string and put the CPU it selects into RES and the
10253 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10254 describing the parse result. If there is an error parsing, RES and
10255 ISA_FLAGS are left unchanged. */
10257 static enum aarch64_parse_opt_result
10258 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10259 unsigned long *isa_flags)
10261 char *ext;
10262 const struct processor *cpu;
10263 char *str = (char *) alloca (strlen (to_parse) + 1);
10264 size_t len;
10266 strcpy (str, to_parse);
10268 ext = strchr (str, '+');
10270 if (ext != NULL)
10271 len = ext - str;
10272 else
10273 len = strlen (str);
10275 if (len == 0)
10276 return AARCH64_PARSE_MISSING_ARG;
10279 /* Loop through the list of supported CPUs to find a match. */
10280 for (cpu = all_cores; cpu->name != NULL; cpu++)
10282 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10284 unsigned long isa_temp = cpu->flags;
10287 if (ext != NULL)
10289 /* TO_PARSE string contains at least one extension. */
10290 enum aarch64_parse_opt_result ext_res
10291 = aarch64_parse_extension (ext, &isa_temp);
10293 if (ext_res != AARCH64_PARSE_OK)
10294 return ext_res;
10296 /* Extension parsing was successful. Confirm the result
10297 cpu and ISA flags. */
10298 *res = cpu;
10299 *isa_flags = isa_temp;
10300 return AARCH64_PARSE_OK;
10304 /* CPU name not found in list. */
10305 return AARCH64_PARSE_INVALID_ARG;
10308 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10309 Return an aarch64_parse_opt_result describing the parse result.
10310 If the parsing fails, RES does not change. */
10312 static enum aarch64_parse_opt_result
10313 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10315 const struct processor *cpu;
10316 char *str = (char *) alloca (strlen (to_parse) + 1);
10318 strcpy (str, to_parse);
10320 /* Loop through the list of supported CPUs to find a match. */
10321 for (cpu = all_cores; cpu->name != NULL; cpu++)
10323 if (strcmp (cpu->name, str) == 0)
10325 *res = cpu;
10326 return AARCH64_PARSE_OK;
10330 /* CPU name not found in list. */
10331 return AARCH64_PARSE_INVALID_ARG;
10334 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10335 described in FLAG. If it is, return the index bit for that fusion type.
10336 If not, report an error (printing OPTION_NAME) and return zero.
10338 static unsigned int
10339 aarch64_parse_one_option_token (const char *token,
10340 size_t length,
10341 const struct aarch64_flag_desc *flag,
10342 const char *option_name)
10344 for (; flag->name != NULL; flag++)
10346 if (length == strlen (flag->name)
10347 && !strncmp (flag->name, token, length))
10348 return flag->flag;
10351 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10352 return 0;
10355 /* Parse OPTION which is a comma-separated list of flags to enable.
10356 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10357 default state we inherit from the CPU tuning structures. OPTION_NAME
10358 gives the top-level option we are parsing in the -moverride string,
10359 for use in error messages. */
10361 static unsigned int
10362 aarch64_parse_boolean_options (const char *option,
10363 const struct aarch64_flag_desc *flags,
10364 unsigned int initial_state,
10365 const char *option_name)
10367 const char separator = '.';
10368 const char* specs = option;
10369 const char* ntoken = option;
10370 unsigned int found_flags = initial_state;
10372 while ((ntoken = strchr (specs, separator)))
10374 size_t token_length = ntoken - specs;
10375 unsigned token_ops = aarch64_parse_one_option_token (specs,
10376 token_length,
10377 flags,
10378 option_name);
10379 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10380 in the token stream, reset the supported operations. So:
10382 adrp+add.cmp+branch.none.adrp+add
10384 would have the result of turning on only adrp+add fusion. */
10385 if (!token_ops)
10386 found_flags = 0;
10388 found_flags |= token_ops;
10389 specs = ++ntoken;
10392 /* If we ended with a trailing separator, the string is ill-formed. */
10393 if (!(*specs))
10395 error ("%s string ill-formed\n", option_name);
10396 return 0;
10399 /* We still have one more token to parse. */
10400 size_t token_length = strlen (specs);
10401 unsigned token_ops = aarch64_parse_one_option_token (specs,
10402 token_length,
10403 flags,
10404 option_name);
10405 if (!token_ops)
10406 found_flags = 0;
10408 found_flags |= token_ops;
10409 return found_flags;
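/* Walking through the example above, "adrp+add.cmp+branch.none.adrp+add"
   ORs in the adrp+add and cmp+branch bits, hits "none" (a zero mask) and
   resets found_flags to 0, then ORs in adrp+add again, leaving only that
   fusion enabled.  */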
10412 /* Support for overriding instruction fusion. */
10414 static void
10415 aarch64_parse_fuse_string (const char *fuse_string,
10416 struct tune_params *tune)
10418 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10419 aarch64_fusible_pairs,
10420 tune->fusible_ops,
10421 "fuse=");
10424 /* Support for overriding other tuning flags. */
10426 static void
10427 aarch64_parse_tune_string (const char *tune_string,
10428 struct tune_params *tune)
10430 tune->extra_tuning_flags
10431 = aarch64_parse_boolean_options (tune_string,
10432 aarch64_tuning_flags,
10433 tune->extra_tuning_flags,
10434 "tune=");
10437 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10438 we understand. If it is, extract the option string and hand it off to
10439 the appropriate function. */
10441 void
10442 aarch64_parse_one_override_token (const char* token,
10443 size_t length,
10444 struct tune_params *tune)
10446 const struct aarch64_tuning_override_function *fn
10447 = aarch64_tuning_override_functions;
10449 const char *option_part = strchr (token, '=');
10450 if (!option_part)
10452 error ("tuning string missing in option (%s)", token);
10453 return;
10456 /* Get the length of the option name. */
10457 length = option_part - token;
10458 /* Skip the '=' to get to the option string. */
10459 option_part++;
10461 for (; fn->name != NULL; fn++)
10463 if (!strncmp (fn->name, token, length))
10465 fn->parse_override (option_part, tune);
10466 return;
10470 error ("unknown tuning option (%s)",token);
10471 return;
10474 /* Validate and clamp the TLS size according to the selected code model. */
10476 static void
10477 initialize_aarch64_tls_size (struct gcc_options *opts)
10479 if (aarch64_tls_size == 0)
10480 aarch64_tls_size = 24;
10482 switch (opts->x_aarch64_cmodel_var)
10484 case AARCH64_CMODEL_TINY:
10485 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10486 needs two instructions to address, so we clamp the size to 24 bits. */
10487 if (aarch64_tls_size > 24)
10488 aarch64_tls_size = 24;
10489 break;
10490 case AARCH64_CMODEL_SMALL:
10491 /* The maximum TLS size allowed under small is 4G. */
10492 if (aarch64_tls_size > 32)
10493 aarch64_tls_size = 32;
10494 break;
10495 case AARCH64_CMODEL_LARGE:
10496 /* The maximum TLS size allowed under large is 16E.
10497 FIXME: 16E should be 64bit, we only support 48bit offset now. */
10498 if (aarch64_tls_size > 48)
10499 aarch64_tls_size = 48;
10500 break;
10501 default:
10502 gcc_unreachable ();
10505 return;
10508 /* Parse STRING looking for options in the format:
10509 string :: option:string
10510 option :: name=substring
10511 name :: {a-z}
10512 substring :: defined by option. */
10514 static void
10515 aarch64_parse_override_string (const char* input_string,
10516 struct tune_params* tune)
10518 const char separator = ':';
10519 size_t string_length = strlen (input_string) + 1;
10520 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10521 char *string = string_root;
10522 strncpy (string, input_string, string_length);
10523 string[string_length - 1] = '\0';
10525 char* ntoken = string;
10527 while ((ntoken = strchr (string, separator)))
10529 size_t token_length = ntoken - string;
10530 /* Make this substring look like a string. */
10531 *ntoken = '\0';
10532 aarch64_parse_one_override_token (string, token_length, tune);
10533 string = ++ntoken;
10536 /* One last option to parse. */
10537 aarch64_parse_one_override_token (string, strlen (string), tune);
10538 free (string_root);
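/* A complete -moverride string is therefore a ':'-separated list of
   name=value options, each value being a '.'-separated flag list, for
   example (flag names illustrative):

     -moverride=fuse=adrp+add.cmp+branch:tune=<tuning-flag>

   where "fuse=" and "tune=" dispatch to aarch64_parse_fuse_string and
   aarch64_parse_tune_string above, and <tuning-flag> stands for an entry
   in aarch64_tuning_flags.  */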
10542 static void
10543 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10545 /* PR 70044: We have to be careful about being called multiple times for the
10546 same function. This means all changes should be repeatable. */
10548 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10549 Disable the frame pointer flag so the mid-end will not use a frame
10550 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10551 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10552 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10553 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10554 if (opts->x_flag_omit_frame_pointer == 0)
10555 opts->x_flag_omit_frame_pointer = 2;
10557 /* If not optimizing for size, set the default
10558 alignment to what the target wants. */
10559 if (!opts->x_optimize_size)
10561 if (opts->x_align_loops <= 0)
10562 opts->x_align_loops = aarch64_tune_params.loop_align;
10563 if (opts->x_align_jumps <= 0)
10564 opts->x_align_jumps = aarch64_tune_params.jump_align;
10565 if (opts->x_align_functions <= 0)
10566 opts->x_align_functions = aarch64_tune_params.function_align;
10569 /* We default to no pc-relative literal loads. */
10571 aarch64_pcrelative_literal_loads = false;
10573 /* If -mpc-relative-literal-loads is set on the command line, this
10574 implies that the user asked for PC relative literal loads. */
10575 if (opts->x_pcrelative_literal_loads == 1)
10576 aarch64_pcrelative_literal_loads = true;
10578 /* In the tiny memory model it makes no sense to disallow PC relative
10579 literal pool loads. */
10580 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10581 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10582 aarch64_pcrelative_literal_loads = true;
10584 /* When enabling the lower precision Newton series for the square root, also
10585 enable it for the reciprocal square root, since the latter is an
10586 intermediary step for the former. */
10587 if (flag_mlow_precision_sqrt)
10588 flag_mrecip_low_precision_sqrt = true;
10591 /* 'Unpack' the internal tuning structs and update the options
10592 in OPTS. The caller must have set up selected_tune and selected_arch
10593 as all the other target-specific codegen decisions are
10594 derived from them. */
10596 void
10597 aarch64_override_options_internal (struct gcc_options *opts)
10599 aarch64_tune_flags = selected_tune->flags;
10600 aarch64_tune = selected_tune->sched_core;
10601 /* Make a copy of the tuning parameters attached to the core, which
10602 we may later overwrite. */
10603 aarch64_tune_params = *(selected_tune->tune);
10604 aarch64_architecture_version = selected_arch->architecture_version;
10606 if (opts->x_aarch64_override_tune_string)
10607 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10608 &aarch64_tune_params);
10610 /* This target defaults to strict volatile bitfields. */
10611 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10612 opts->x_flag_strict_volatile_bitfields = 1;
10614 initialize_aarch64_code_model (opts);
10615 initialize_aarch64_tls_size (opts);
10617 int queue_depth = 0;
10618 switch (aarch64_tune_params.autoprefetcher_model)
10620 case tune_params::AUTOPREFETCHER_OFF:
10621 queue_depth = -1;
10622 break;
10623 case tune_params::AUTOPREFETCHER_WEAK:
10624 queue_depth = 0;
10625 break;
10626 case tune_params::AUTOPREFETCHER_STRONG:
10627 queue_depth = max_insn_queue_index + 1;
10628 break;
10629 default:
10630 gcc_unreachable ();
10633 /* We don't mind passing in global_options_set here as we don't use
10634 the *options_set structs anyway. */
10635 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10636 queue_depth,
10637 opts->x_param_values,
10638 global_options_set.x_param_values);
10640 /* Set up parameters to be used in prefetching algorithm. Do not
10641 override the defaults unless we are tuning for a core we have
10642 researched values for. */
10643 if (aarch64_tune_params.prefetch->num_slots > 0)
10644 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10645 aarch64_tune_params.prefetch->num_slots,
10646 opts->x_param_values,
10647 global_options_set.x_param_values);
10648 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10649 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10650 aarch64_tune_params.prefetch->l1_cache_size,
10651 opts->x_param_values,
10652 global_options_set.x_param_values);
10653 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10654 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10655 aarch64_tune_params.prefetch->l1_cache_line_size,
10656 opts->x_param_values,
10657 global_options_set.x_param_values);
10658 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10659 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10660 aarch64_tune_params.prefetch->l2_cache_size,
10661 opts->x_param_values,
10662 global_options_set.x_param_values);
10663 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10664 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10666 opts->x_param_values,
10667 global_options_set.x_param_values);
10668 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10669 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10670 aarch64_tune_params.prefetch->minimum_stride,
10671 opts->x_param_values,
10672 global_options_set.x_param_values);
10674 /* Use the alternative scheduling-pressure algorithm by default. */
10675 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10676 opts->x_param_values,
10677 global_options_set.x_param_values);
10679 /* Enable software prefetching at the specified optimization level for
10680 CPUs that have prefetch tuning data. Lower the optimization level
10681 threshold by 1 when profiling is enabled. */
10682 if (opts->x_flag_prefetch_loop_arrays < 0
10683 && !opts->x_optimize_size
10684 && aarch64_tune_params.prefetch->default_opt_level >= 0
10685 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10686 opts->x_flag_prefetch_loop_arrays = 1;
10688 aarch64_override_options_after_change_1 (opts);
10691 /* Print a hint with a suggestion for a core or architecture name that
10692 most closely resembles what the user passed in STR. ARCH is true if
10693 the user is asking for an architecture name. ARCH is false if the user
10694 is asking for a core name. */
10696 static void
10697 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10699 auto_vec<const char *> candidates;
10700 const struct processor *entry = arch ? all_architectures : all_cores;
10701 for (; entry->name != NULL; entry++)
10702 candidates.safe_push (entry->name);
10704 #ifdef HAVE_LOCAL_CPU_DETECT
10705 /* Also add "native" as a possible value. */
10706 if (arch)
10707 candidates.safe_push ("native");
10708 #endif
10710 char *s;
10711 const char *hint = candidates_list_and_hint (str, s, candidates);
10712 if (hint)
10713 inform (input_location, "valid arguments are: %s;"
10714 " did you mean %qs?", s, hint);
10715 else
10716 inform (input_location, "valid arguments are: %s", s);
10718 XDELETEVEC (s);
10721 /* Print a hint with a suggestion for a core name that most closely resembles
10722 what the user passed in STR. */
10724 inline static void
10725 aarch64_print_hint_for_core (const char *str)
10727 aarch64_print_hint_for_core_or_arch (str, false);
10730 /* Print a hint with a suggestion for an architecture name that most closely
10731 resembles what the user passed in STR. */
10733 inline static void
10734 aarch64_print_hint_for_arch (const char *str)
10736 aarch64_print_hint_for_core_or_arch (str, true);
10739 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10740 specified in STR and throw errors if appropriate. Put the results if
10741 they are valid in RES and ISA_FLAGS. Return whether the option is
10742 valid. */
10744 static bool
10745 aarch64_validate_mcpu (const char *str, const struct processor **res,
10746 unsigned long *isa_flags)
10748 enum aarch64_parse_opt_result parse_res
10749 = aarch64_parse_cpu (str, res, isa_flags);
10751 if (parse_res == AARCH64_PARSE_OK)
10752 return true;
10754 switch (parse_res)
10756 case AARCH64_PARSE_MISSING_ARG:
10757 error ("missing cpu name in %<-mcpu=%s%>", str);
10758 break;
10759 case AARCH64_PARSE_INVALID_ARG:
10760 error ("unknown value %qs for -mcpu", str);
10761 aarch64_print_hint_for_core (str);
10762 break;
10763 case AARCH64_PARSE_INVALID_FEATURE:
10764 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10765 break;
10766 default:
10767 gcc_unreachable ();
10770 return false;
10773 /* Validate a command-line -march option. Parse the arch and extensions
10774 (if any) specified in STR and throw errors if appropriate. Put the
10775 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10776 option is valid. */
10778 static bool
10779 aarch64_validate_march (const char *str, const struct processor **res,
10780 unsigned long *isa_flags)
10782 enum aarch64_parse_opt_result parse_res
10783 = aarch64_parse_arch (str, res, isa_flags);
10785 if (parse_res == AARCH64_PARSE_OK)
10786 return true;
10788 switch (parse_res)
10790 case AARCH64_PARSE_MISSING_ARG:
10791 error ("missing arch name in %<-march=%s%>", str);
10792 break;
10793 case AARCH64_PARSE_INVALID_ARG:
10794 error ("unknown value %qs for -march", str);
10795 aarch64_print_hint_for_arch (str);
10796 break;
10797 case AARCH64_PARSE_INVALID_FEATURE:
10798 error ("invalid feature modifier in %<-march=%s%>", str);
10799 break;
10800 default:
10801 gcc_unreachable ();
10804 return false;
10807 /* Validate a command-line -mtune option. Parse the cpu
10808 specified in STR and throw errors if appropriate. Put the
10809 result, if it is valid, in RES. Return whether the option is
10810 valid. */
10812 static bool
10813 aarch64_validate_mtune (const char *str, const struct processor **res)
10815 enum aarch64_parse_opt_result parse_res
10816 = aarch64_parse_tune (str, res);
10818 if (parse_res == AARCH64_PARSE_OK)
10819 return true;
10821 switch (parse_res)
10823 case AARCH64_PARSE_MISSING_ARG:
10824 error ("missing cpu name in %<-mtune=%s%>", str);
10825 break;
10826 case AARCH64_PARSE_INVALID_ARG:
10827 error ("unknown value %qs for -mtune", str);
10828 aarch64_print_hint_for_core (str);
10829 break;
10830 default:
10831 gcc_unreachable ();
10833 return false;
10836 /* Return the CPU corresponding to the enum CPU.
10837 If it doesn't specify a cpu, return the default. */
10839 static const struct processor *
10840 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10842 if (cpu != aarch64_none)
10843 return &all_cores[cpu];
10845 /* The & 0x3f is to extract the bottom 6 bits that encode the
10846 default cpu as selected by the --with-cpu GCC configure option
10847 in config.gcc.
10848 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10849 flags mechanism should be reworked to make it more sane. */
10850 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
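/* That is, TARGET_CPU_DEFAULT packs the configure-time default as
   (default_isa_flags << 6) | default_cpu_index: the low 6 bits select
   the entry in all_cores here, and the remaining bits provide the
   default ISA flags in aarch64_override_options below.  */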
10853 /* Return the architecture corresponding to the enum ARCH.
10854 If it doesn't specify a valid architecture, return the default. */
10856 static const struct processor *
10857 aarch64_get_arch (enum aarch64_arch arch)
10859 if (arch != aarch64_no_arch)
10860 return &all_architectures[arch];
10862 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10864 return &all_architectures[cpu->arch];
10867 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10869 static poly_uint16
10870 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10872 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10873 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10874 deciding which .md file patterns to use and when deciding whether
10875 something is a legitimate address or constant. */
10876 if (value == SVE_SCALABLE || value == SVE_128)
10877 return poly_uint16 (2, 2);
10878 else
10879 return (int) value / 64;
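/* For example, -msve-vector-bits=256 gives a VG of 256 / 64 = 4 and
   -msve-vector-bits=2048 gives 32, while both SVE_SCALABLE and SVE_128
   yield the runtime-variable poly_uint16 (2, 2), i.e. 2 + 2n granules.  */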
10882 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10883 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10884 tuning structs. In particular it must set selected_tune and
10885 aarch64_isa_flags that define the available ISA features and tuning
10886 decisions. It must also set selected_arch as this will be used to
10887 output the .arch asm tags for each function. */
10889 static void
10890 aarch64_override_options (void)
10892 unsigned long cpu_isa = 0;
10893 unsigned long arch_isa = 0;
10894 aarch64_isa_flags = 0;
10896 bool valid_cpu = true;
10897 bool valid_tune = true;
10898 bool valid_arch = true;
10900 selected_cpu = NULL;
10901 selected_arch = NULL;
10902 selected_tune = NULL;
10904 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10905 If either of -march or -mtune is given, they override their
10906 respective component of -mcpu. */
10907 if (aarch64_cpu_string)
10908 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10909 &cpu_isa);
10911 if (aarch64_arch_string)
10912 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10913 &arch_isa);
10915 if (aarch64_tune_string)
10916 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10918 /* If the user did not specify a processor, choose the default
10919 one for them. This will be the CPU set during configuration using
10920 --with-cpu, otherwise it is "generic". */
10921 if (!selected_cpu)
10923 if (selected_arch)
10925 selected_cpu = &all_cores[selected_arch->ident];
10926 aarch64_isa_flags = arch_isa;
10927 explicit_arch = selected_arch->arch;
10929 else
10931 /* Get default configure-time CPU. */
10932 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10933 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10936 if (selected_tune)
10937 explicit_tune_core = selected_tune->ident;
10939 /* If both -mcpu and -march are specified check that they are architecturally
10940 compatible, warn if they're not and prefer the -march ISA flags. */
10941 else if (selected_arch)
10943 if (selected_arch->arch != selected_cpu->arch)
10945 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10946 all_architectures[selected_cpu->arch].name,
10947 selected_arch->name);
10949 aarch64_isa_flags = arch_isa;
10950 explicit_arch = selected_arch->arch;
10951 explicit_tune_core = selected_tune ? selected_tune->ident
10952 : selected_cpu->ident;
10954 else
10956 /* -mcpu but no -march. */
10957 aarch64_isa_flags = cpu_isa;
10958 explicit_tune_core = selected_tune ? selected_tune->ident
10959 : selected_cpu->ident;
10960 gcc_assert (selected_cpu);
10961 selected_arch = &all_architectures[selected_cpu->arch];
10962 explicit_arch = selected_arch->arch;
10965 /* Set the arch as well, as we will need it when outputting
10966 the .arch directive in assembly. */
10967 if (!selected_arch)
10969 gcc_assert (selected_cpu);
10970 selected_arch = &all_architectures[selected_cpu->arch];
10973 if (!selected_tune)
10974 selected_tune = selected_cpu;
10976 #ifndef HAVE_AS_MABI_OPTION
10977 /* The compiler may have been configured with 2.23.* binutils, which does
10978 not have support for ILP32. */
10979 if (TARGET_ILP32)
10980 error ("assembler does not support -mabi=ilp32");
10981 #endif
10983 /* Convert -msve-vector-bits to a VG count. */
10984 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10986 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10987 sorry ("return address signing is only supported for -mabi=lp64");
10989 /* Make sure we properly set up the explicit options. */
10990 if ((aarch64_cpu_string && valid_cpu)
10991 || (aarch64_tune_string && valid_tune))
10992 gcc_assert (explicit_tune_core != aarch64_none);
10994 if ((aarch64_cpu_string && valid_cpu)
10995 || (aarch64_arch_string && valid_arch))
10996 gcc_assert (explicit_arch != aarch64_no_arch);
10998 aarch64_override_options_internal (&global_options);
11000 /* Save these options as the default ones in case we push and pop them later
11001 while processing functions with potential target attributes. */
11002 target_option_default_node = target_option_current_node
11003 = build_target_option_node (&global_options);
11006 /* Implement targetm.override_options_after_change. */
11008 static void
11009 aarch64_override_options_after_change (void)
11011 aarch64_override_options_after_change_1 (&global_options);
11014 static struct machine_function *
11015 aarch64_init_machine_status (void)
11017 struct machine_function *machine;
11018 machine = ggc_cleared_alloc<machine_function> ();
11019 return machine;
11022 void
11023 aarch64_init_expanders (void)
11025 init_machine_status = aarch64_init_machine_status;
11028 /* Set aarch64_cmodel based on the -mcmodel= setting and the PIC flags. */
11029 static void
11030 initialize_aarch64_code_model (struct gcc_options *opts)
11032 if (opts->x_flag_pic)
11034 switch (opts->x_aarch64_cmodel_var)
11036 case AARCH64_CMODEL_TINY:
11037 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
11038 break;
11039 case AARCH64_CMODEL_SMALL:
11040 #ifdef HAVE_AS_SMALL_PIC_RELOCS
11041 aarch64_cmodel = (flag_pic == 2
11042 ? AARCH64_CMODEL_SMALL_PIC
11043 : AARCH64_CMODEL_SMALL_SPIC);
11044 #else
11045 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
11046 #endif
11047 break;
11048 case AARCH64_CMODEL_LARGE:
11049 sorry ("code model %qs with -f%s", "large",
11050 opts->x_flag_pic > 1 ? "PIC" : "pic");
11051 break;
11052 default:
11053 gcc_unreachable ();
11056 else
11057 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11060 /* Implement TARGET_OPTION_SAVE. */
11062 static void
11063 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11065 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11068 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11069 using the information saved in PTR. */
11071 static void
11072 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11074 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11075 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11076 opts->x_explicit_arch = ptr->x_explicit_arch;
11077 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11078 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11080 aarch64_override_options_internal (opts);
11083 /* Implement TARGET_OPTION_PRINT. */
11085 static void
11086 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11088 const struct processor *cpu
11089 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11090 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11091 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11092 std::string extension
11093 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11095 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11096 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11097 arch->name, extension.c_str ());
11100 static GTY(()) tree aarch64_previous_fndecl;
11102 void
11103 aarch64_reset_previous_fndecl (void)
11105 aarch64_previous_fndecl = NULL;
11108 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11109 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11110 make sure optab availability predicates are recomputed when necessary. */
11112 void
11113 aarch64_save_restore_target_globals (tree new_tree)
11115 if (TREE_TARGET_GLOBALS (new_tree))
11116 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11117 else if (new_tree == target_option_default_node)
11118 restore_target_globals (&default_target_globals);
11119 else
11120 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11123 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11124 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11125 of the function, if such exists. This function may be called multiple
11126 times on a single function so use aarch64_previous_fndecl to avoid
11127 setting up identical state. */
11129 static void
11130 aarch64_set_current_function (tree fndecl)
11132 if (!fndecl || fndecl == aarch64_previous_fndecl)
11133 return;
11135 tree old_tree = (aarch64_previous_fndecl
11136 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11137 : NULL_TREE);
11139 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11141 /* If current function has no attributes but the previous one did,
11142 use the default node. */
11143 if (!new_tree && old_tree)
11144 new_tree = target_option_default_node;
11146 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11147 the default have been handled by aarch64_save_restore_target_globals from
11148 aarch64_pragma_target_parse. */
11149 if (old_tree == new_tree)
11150 return;
11152 aarch64_previous_fndecl = fndecl;
11154 /* First set the target options. */
11155 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11157 aarch64_save_restore_target_globals (new_tree);
11160 /* Enum describing the various ways we can handle attributes.
11161 In many cases we can reuse the generic option handling machinery. */
11163 enum aarch64_attr_opt_type
11165 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11166 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11167 aarch64_attr_enum, /* Attribute sets an enum variable. */
11168 aarch64_attr_custom /* Attribute requires a custom handling function. */
11171 /* All the information needed to handle a target attribute.
11172 NAME is the name of the attribute.
11173 ATTR_TYPE specifies the type of behavior of the attribute as described
11174 in the definition of enum aarch64_attr_opt_type.
11175 ALLOW_NEG is true if the attribute supports a "no-" form.
11176 HANDLER is the function that takes the attribute string as an argument
11177 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11178 OPT_NUM is the enum specifying the option that the attribute modifies.
11179 This is needed for attributes that mirror the behavior of a command-line
11180 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11181 aarch64_attr_enum. */
11183 struct aarch64_attribute_info
11185 const char *name;
11186 enum aarch64_attr_opt_type attr_type;
11187 bool allow_neg;
11188 bool (*handler) (const char *);
11189 enum opt_code opt_num;
11192 /* Handle the ARCH_STR argument to the arch= target attribute. */
11194 static bool
11195 aarch64_handle_attr_arch (const char *str)
11197 const struct processor *tmp_arch = NULL;
11198 enum aarch64_parse_opt_result parse_res
11199 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11201 if (parse_res == AARCH64_PARSE_OK)
11203 gcc_assert (tmp_arch);
11204 selected_arch = tmp_arch;
11205 explicit_arch = selected_arch->arch;
11206 return true;
11209 switch (parse_res)
11211 case AARCH64_PARSE_MISSING_ARG:
11212 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11213 break;
11214 case AARCH64_PARSE_INVALID_ARG:
11215 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11216 aarch64_print_hint_for_arch (str);
11217 break;
11218 case AARCH64_PARSE_INVALID_FEATURE:
11219 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11220 break;
11221 default:
11222 gcc_unreachable ();
11225 return false;
11228 /* Handle the argument CPU_STR to the cpu= target attribute. */
11230 static bool
11231 aarch64_handle_attr_cpu (const char *str)
11233 const struct processor *tmp_cpu = NULL;
11234 enum aarch64_parse_opt_result parse_res
11235 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11237 if (parse_res == AARCH64_PARSE_OK)
11239 gcc_assert (tmp_cpu);
11240 selected_tune = tmp_cpu;
11241 explicit_tune_core = selected_tune->ident;
11243 selected_arch = &all_architectures[tmp_cpu->arch];
11244 explicit_arch = selected_arch->arch;
11245 return true;
11248 switch (parse_res)
11250 case AARCH64_PARSE_MISSING_ARG:
11251 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11252 break;
11253 case AARCH64_PARSE_INVALID_ARG:
11254 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11255 aarch64_print_hint_for_core (str);
11256 break;
11257 case AARCH64_PARSE_INVALID_FEATURE:
11258 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11259 break;
11260 default:
11261 gcc_unreachable ();
11264 return false;
11267 /* Handle the argument STR to the tune= target attribute. */
11269 static bool
11270 aarch64_handle_attr_tune (const char *str)
11272 const struct processor *tmp_tune = NULL;
11273 enum aarch64_parse_opt_result parse_res
11274 = aarch64_parse_tune (str, &tmp_tune);
11276 if (parse_res == AARCH64_PARSE_OK)
11278 gcc_assert (tmp_tune);
11279 selected_tune = tmp_tune;
11280 explicit_tune_core = selected_tune->ident;
11281 return true;
11284 switch (parse_res)
11286 case AARCH64_PARSE_INVALID_ARG:
11287 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11288 aarch64_print_hint_for_core (str);
11289 break;
11290 default:
11291 gcc_unreachable ();
11294 return false;
11297 /* Parse an architecture extensions target attribute string specified in STR.
11298 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11299 if successful. Update aarch64_isa_flags to reflect the ISA features
11300 modified. */
11302 static bool
11303 aarch64_handle_attr_isa_flags (char *str)
11305 enum aarch64_parse_opt_result parse_res;
11306 unsigned long isa_flags = aarch64_isa_flags;
11308 /* We allow "+nothing" in the beginning to clear out all architectural
11309 features if the user wants to handpick specific features. */
11310 if (strncmp ("+nothing", str, 8) == 0)
11312 isa_flags = 0;
11313 str += 8;
11316 parse_res = aarch64_parse_extension (str, &isa_flags);
11318 if (parse_res == AARCH64_PARSE_OK)
11320 aarch64_isa_flags = isa_flags;
11321 return true;
11324 switch (parse_res)
11326 case AARCH64_PARSE_MISSING_ARG:
11327 error ("missing value in %<target()%> pragma or attribute");
11328 break;
11330 case AARCH64_PARSE_INVALID_FEATURE:
11331 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11332 break;
11334 default:
11335 gcc_unreachable ();
11338 return false;
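/* For example, an attribute string of "+nothing+fp+nosimd" clears all
   feature bits and then applies "+fp+nosimd" on top, whereas a plain
   "+crc" modifies the current aarch64_isa_flags in place.  */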
11341 /* The target attributes that we support. On top of these we also support just
11342 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11343 handled explicitly in aarch64_process_one_target_attr. */
11345 static const struct aarch64_attribute_info aarch64_attributes[] =
11347 { "general-regs-only", aarch64_attr_mask, false, NULL,
11348 OPT_mgeneral_regs_only },
11349 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11350 OPT_mfix_cortex_a53_835769 },
11351 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11352 OPT_mfix_cortex_a53_843419 },
11353 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11354 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11355 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11356 OPT_momit_leaf_frame_pointer },
11357 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11358 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11359 OPT_march_ },
11360 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11361 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11362 OPT_mtune_ },
11363 { "sign-return-address", aarch64_attr_enum, false, NULL,
11364 OPT_msign_return_address_ },
11365 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11368 /* Parse ARG_STR which contains the definition of one target attribute.
11369 Show appropriate errors if any or return true if the attribute is valid. */
11371 static bool
11372 aarch64_process_one_target_attr (char *arg_str)
11374 bool invert = false;
11376 size_t len = strlen (arg_str);
11378 if (len == 0)
11380 error ("malformed %<target()%> pragma or attribute");
11381 return false;
11384 char *str_to_check = (char *) alloca (len + 1);
11385 strcpy (str_to_check, arg_str);
11387 /* Skip leading whitespace. */
11388 while (*str_to_check == ' ' || *str_to_check == '\t')
11389 str_to_check++;
11391 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11392 It is easier to detect and handle it explicitly here rather than going
11393 through the machinery for the rest of the target attributes in this
11394 function. */
11395 if (*str_to_check == '+')
11396 return aarch64_handle_attr_isa_flags (str_to_check);
11398 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11400 invert = true;
11401 str_to_check += 3;
11403 char *arg = strchr (str_to_check, '=');
11405 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11406 and point ARG to "foo". */
11407 if (arg)
11409 *arg = '\0';
11410 arg++;
11412 const struct aarch64_attribute_info *p_attr;
11413 bool found = false;
11414 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11416 /* If the names don't match up, or the user has given an argument
11417 to an attribute that doesn't accept one, or didn't give an argument
11418 to an attribute that expects one, fail to match. */
11419 if (strcmp (str_to_check, p_attr->name) != 0)
11420 continue;
11422 found = true;
11423 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11424 || p_attr->attr_type == aarch64_attr_enum;
11426 if (attr_need_arg_p ^ (arg != NULL))
11428 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11429 return false;
11432 /* If the name matches but the attribute does not allow "no-" versions
11433 then we can't match. */
11434 if (invert && !p_attr->allow_neg)
11436 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11437 return false;
11440 switch (p_attr->attr_type)
11442 /* Has a custom handler registered.
11443 For example, cpu=, arch=, tune=. */
11444 case aarch64_attr_custom:
11445 gcc_assert (p_attr->handler);
11446 if (!p_attr->handler (arg))
11447 return false;
11448 break;
11450 /* Either set or unset a boolean option. */
11451 case aarch64_attr_bool:
11453 struct cl_decoded_option decoded;
11455 generate_option (p_attr->opt_num, NULL, !invert,
11456 CL_TARGET, &decoded);
11457 aarch64_handle_option (&global_options, &global_options_set,
11458 &decoded, input_location);
11459 break;
11461 /* Set or unset a bit in the target_flags. aarch64_handle_option
11462 should know what mask to apply given the option number. */
11463 case aarch64_attr_mask:
11465 struct cl_decoded_option decoded;
11466 /* We only need to specify the option number.
11467 aarch64_handle_option will know which mask to apply. */
11468 decoded.opt_index = p_attr->opt_num;
11469 decoded.value = !invert;
11470 aarch64_handle_option (&global_options, &global_options_set,
11471 &decoded, input_location);
11472 break;
11474 /* Use the option setting machinery to set an option to an enum. */
11475 case aarch64_attr_enum:
11477 gcc_assert (arg);
11478 bool valid;
11479 int value;
11480 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11481 &value, CL_TARGET);
11482 if (valid)
11484 set_option (&global_options, NULL, p_attr->opt_num, value,
11485 NULL, DK_UNSPECIFIED, input_location,
11486 global_dc);
11488 else
11490 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11492 break;
11494 default:
11495 gcc_unreachable ();
11499 /* If we reached here we either have found an attribute and validated
11500 it or didn't match any. If we matched an attribute but its arguments
11501 were malformed we will have returned false already. */
11502 return found;
11505 /* Count how many times the character C appears in
11506 NULL-terminated string STR. */
11508 static unsigned int
11509 num_occurences_in_str (char c, char *str)
11511 unsigned int res = 0;
11512 while (*str != '\0')
11514 if (*str == c)
11515 res++;
11517 str++;
11520 return res;
11523 /* Parse the tree in ARGS that contains the target attribute information
11524 and update the global target options space. */
11526 bool
11527 aarch64_process_target_attr (tree args)
11529 if (TREE_CODE (args) == TREE_LIST)
11533 tree head = TREE_VALUE (args);
11534 if (head)
11536 if (!aarch64_process_target_attr (head))
11537 return false;
11539 args = TREE_CHAIN (args);
11540 } while (args);
11542 return true;
11545 if (TREE_CODE (args) != STRING_CST)
11547 error ("attribute %<target%> argument not a string");
11548 return false;
11551 size_t len = strlen (TREE_STRING_POINTER (args));
11552 char *str_to_check = (char *) alloca (len + 1);
11553 strcpy (str_to_check, TREE_STRING_POINTER (args));
11555 if (len == 0)
11557 error ("malformed %<target()%> pragma or attribute");
11558 return false;
11561 /* Used to catch empty strings between commas, i.e.
11562 attribute ((target ("attr1,,attr2"))). */
11563 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11565 /* Handle multiple target attributes separated by ','. */
11566 char *token = strtok (str_to_check, ",");
11568 unsigned int num_attrs = 0;
11569 while (token)
11571 num_attrs++;
11572 if (!aarch64_process_one_target_attr (token))
11574 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11575 return false;
11578 token = strtok (NULL, ",");
11581 if (num_attrs != num_commas + 1)
11583 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11584 return false;
11587 return true;
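/* Putting it together, an attribute such as

     __attribute__ ((target ("arch=armv8-a+crc,no-omit-leaf-frame-pointer")))

   (values illustrative) is split on ',' into two tokens, each handled by
   aarch64_process_one_target_attr: the first through the custom "arch"
   handler, the second as the negated form of a boolean option.  */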
11590 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11591 process attribute ((target ("..."))). */
11593 static bool
11594 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11596 struct cl_target_option cur_target;
11597 bool ret;
11598 tree old_optimize;
11599 tree new_target, new_optimize;
11600 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11602 /* If what we're processing is the current pragma string then the
11603 target option node is already stored in target_option_current_node
11604 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11605 having to re-parse the string. This is especially useful to keep
11606 arm_neon.h compile times down since that header contains a lot
11607 of intrinsics enclosed in pragmas. */
11608 if (!existing_target && args == current_target_pragma)
11610 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11611 return true;
11613 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11615 old_optimize = build_optimization_node (&global_options);
11616 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11618 /* If the function changed the optimization levels as well as setting
11619 target options, start with the optimizations specified. */
11620 if (func_optimize && func_optimize != old_optimize)
11621 cl_optimization_restore (&global_options,
11622 TREE_OPTIMIZATION (func_optimize));
11624 /* Save the current target options to restore at the end. */
11625 cl_target_option_save (&cur_target, &global_options);
11627 /* If fndecl already has some target attributes applied to it, unpack
11628 them so that we add this attribute on top of them, rather than
11629 overwriting them. */
11630 if (existing_target)
11632 struct cl_target_option *existing_options
11633 = TREE_TARGET_OPTION (existing_target);
11635 if (existing_options)
11636 cl_target_option_restore (&global_options, existing_options);
11638 else
11639 cl_target_option_restore (&global_options,
11640 TREE_TARGET_OPTION (target_option_current_node));
11642 ret = aarch64_process_target_attr (args);
11644 /* Set up any additional state. */
11645 if (ret)
11647 aarch64_override_options_internal (&global_options);
11648 /* Initialize SIMD builtins if we haven't already.
11649 Set current_target_pragma to NULL for the duration so that
11650 the builtin initialization code doesn't try to tag the functions
11651 being built with the attributes specified by any current pragma, thus
11652 going into an infinite recursion. */
11653 if (TARGET_SIMD)
11655 tree saved_current_target_pragma = current_target_pragma;
11656 current_target_pragma = NULL;
11657 aarch64_init_simd_builtins ();
11658 current_target_pragma = saved_current_target_pragma;
11660 new_target = build_target_option_node (&global_options);
11662 else
11663 new_target = NULL;
11665 new_optimize = build_optimization_node (&global_options);
11667 if (fndecl && ret)
11669 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11671 if (old_optimize != new_optimize)
11672 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11675 cl_target_option_restore (&global_options, &cur_target);
11677 if (old_optimize != new_optimize)
11678 cl_optimization_restore (&global_options,
11679 TREE_OPTIMIZATION (old_optimize));
11680 return ret;
11683 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11684 tri-bool options (yes, no, don't care) and the default value is
11685 DEF, determine whether to reject inlining. */
11687 static bool
11688 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11689 int dont_care, int def)
11691 /* If the callee doesn't care, always allow inlining. */
11692 if (callee == dont_care)
11693 return true;
11695 /* If the caller doesn't care, always allow inlining. */
11696 if (caller == dont_care)
11697 return true;
11699 /* Otherwise, allow inlining if the callee and caller values
11700 agree, or if the callee is using the default value. */
11701 return (callee == caller || callee == def);
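/* Editorial illustration (not part of the original source): with
   DONT_CARE == 2 and DEF == 1 (the values used for
   -momit-leaf-frame-pointer below), (caller 0, callee 2) and
   (caller 2, callee 0) allow inlining, (caller 0, callee 1) allows it
   because the callee uses the default, and (caller 1, callee 0) rejects
   it because the explicit values disagree and the callee's value is not
   the default.  */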
11704 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11705 to inline CALLEE into CALLER based on target-specific info.
11706 Make sure that the caller and callee have compatible architectural
11707 features. Then go through the other possible target attributes
11708 and see if they can block inlining. Try not to reject always_inline
11709 callees unless they are incompatible architecturally. */
11711 static bool
11712 aarch64_can_inline_p (tree caller, tree callee)
11714 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11715 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11717 struct cl_target_option *caller_opts
11718 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11719 : target_option_default_node);
11721 struct cl_target_option *callee_opts
11722 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11723 : target_option_default_node);
11725 /* Callee's ISA flags should be a subset of the caller's. */
11726 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11727 != callee_opts->x_aarch64_isa_flags)
11728 return false;
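/* Editorial illustration (not part of the original source): a callee
   built with just +simd can be inlined into a caller built with
   +simd+sve, since the caller's ISA flags are a superset, but a +sve
   callee cannot be inlined into a caller without SVE enabled.  */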
11730 /* Allow non-strict aligned functions inlining into strict
11731 aligned ones. */
11732 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11733 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11734 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11735 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11736 return false;
11738 bool always_inline = lookup_attribute ("always_inline",
11739 DECL_ATTRIBUTES (callee));
11741 /* If the architectural features match up and the callee is always_inline
11742 then the other attributes don't matter. */
11743 if (always_inline)
11744 return true;
11746 if (caller_opts->x_aarch64_cmodel_var
11747 != callee_opts->x_aarch64_cmodel_var)
11748 return false;
11750 if (caller_opts->x_aarch64_tls_dialect
11751 != callee_opts->x_aarch64_tls_dialect)
11752 return false;
11754 /* Honour explicit requests to work around errata. */
11755 if (!aarch64_tribools_ok_for_inlining_p (
11756 caller_opts->x_aarch64_fix_a53_err835769,
11757 callee_opts->x_aarch64_fix_a53_err835769,
11758 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11759 return false;
11761 if (!aarch64_tribools_ok_for_inlining_p (
11762 caller_opts->x_aarch64_fix_a53_err843419,
11763 callee_opts->x_aarch64_fix_a53_err843419,
11764 2, TARGET_FIX_ERR_A53_843419))
11765 return false;
11767 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11768 caller and callee and they don't match up, reject inlining. */
11769 if (!aarch64_tribools_ok_for_inlining_p (
11770 caller_opts->x_flag_omit_leaf_frame_pointer,
11771 callee_opts->x_flag_omit_leaf_frame_pointer,
11772 2, 1))
11773 return false;
11775 /* If the callee has specific tuning overrides, respect them. */
11776 if (callee_opts->x_aarch64_override_tune_string != NULL
11777 && caller_opts->x_aarch64_override_tune_string == NULL)
11778 return false;
11780 /* If the user specified tuning override strings for the
11781 caller and callee and they don't match up, reject inlining.
11782 We just do a string compare here; we don't analyze the meaning
11783 of the string, as that would be too costly for little gain. */
11784 if (callee_opts->x_aarch64_override_tune_string
11785 && caller_opts->x_aarch64_override_tune_string
11786 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11787 caller_opts->x_aarch64_override_tune_string) != 0))
11788 return false;
11790 return true;
11793 /* Return true if SYMBOL_REF X binds locally. */
11795 static bool
11796 aarch64_symbol_binds_local_p (const_rtx x)
11798 return (SYMBOL_REF_DECL (x)
11799 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11800 : SYMBOL_REF_LOCAL_P (x));
11803 /* Return true if SYMBOL_REF X is thread local */
11804 static bool
11805 aarch64_tls_symbol_p (rtx x)
11807 if (! TARGET_HAVE_TLS)
11808 return false;
11810 if (GET_CODE (x) != SYMBOL_REF)
11811 return false;
11813 return SYMBOL_REF_TLS_MODEL (x) != 0;
11816 /* Classify a TLS symbol into one of the TLS kinds. */
11817 enum aarch64_symbol_type
11818 aarch64_classify_tls_symbol (rtx x)
11820 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11822 switch (tls_kind)
11824 case TLS_MODEL_GLOBAL_DYNAMIC:
11825 case TLS_MODEL_LOCAL_DYNAMIC:
11826 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11828 case TLS_MODEL_INITIAL_EXEC:
11829 switch (aarch64_cmodel)
11831 case AARCH64_CMODEL_TINY:
11832 case AARCH64_CMODEL_TINY_PIC:
11833 return SYMBOL_TINY_TLSIE;
11834 default:
11835 return SYMBOL_SMALL_TLSIE;
11838 case TLS_MODEL_LOCAL_EXEC:
11839 if (aarch64_tls_size == 12)
11840 return SYMBOL_TLSLE12;
11841 else if (aarch64_tls_size == 24)
11842 return SYMBOL_TLSLE24;
11843 else if (aarch64_tls_size == 32)
11844 return SYMBOL_TLSLE32;
11845 else if (aarch64_tls_size == 48)
11846 return SYMBOL_TLSLE48;
11847 else
11848 gcc_unreachable ();
11850 case TLS_MODEL_EMULATED:
11851 case TLS_MODEL_NONE:
11852 return SYMBOL_FORCE_TO_MEM;
11854 default:
11855 gcc_unreachable ();
11859 /* Return the correct method for accessing X + OFFSET, where X is either
11860 a SYMBOL_REF or LABEL_REF. */
11862 enum aarch64_symbol_type
11863 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11865 if (GET_CODE (x) == LABEL_REF)
11867 switch (aarch64_cmodel)
11869 case AARCH64_CMODEL_LARGE:
11870 return SYMBOL_FORCE_TO_MEM;
11872 case AARCH64_CMODEL_TINY_PIC:
11873 case AARCH64_CMODEL_TINY:
11874 return SYMBOL_TINY_ABSOLUTE;
11876 case AARCH64_CMODEL_SMALL_SPIC:
11877 case AARCH64_CMODEL_SMALL_PIC:
11878 case AARCH64_CMODEL_SMALL:
11879 return SYMBOL_SMALL_ABSOLUTE;
11881 default:
11882 gcc_unreachable ();
11886 if (GET_CODE (x) == SYMBOL_REF)
11888 if (aarch64_tls_symbol_p (x))
11889 return aarch64_classify_tls_symbol (x);
11891 switch (aarch64_cmodel)
11893 case AARCH64_CMODEL_TINY:
11894 /* When we retrieve symbol + offset address, we have to make sure
11895 the offset does not cause overflow of the final address. But
11896 we have no way of knowing the address of symbol at compile time
11897 so we can't accurately say if the distance between the PC and
11898 symbol + offset is outside the addressable range of +/-1M in the
11899 TINY code model. So we rely on images not being larger than 1M,
11900 cap the offset at 1M, and require anything beyond that to be
11901 loaded using an alternative mechanism. Furthermore, if the
11902 symbol is a weak reference to something that isn't known to
11903 resolve to a symbol in this module, then force to memory. */
11904 if ((SYMBOL_REF_WEAK (x)
11905 && !aarch64_symbol_binds_local_p (x))
11906 || !IN_RANGE (offset, -1048575, 1048575))
11907 return SYMBOL_FORCE_TO_MEM;
11908 return SYMBOL_TINY_ABSOLUTE;
11910 case AARCH64_CMODEL_SMALL:
11911 /* Same reasoning as the tiny code model, but the offset cap here is
11912 4G. */
11913 if ((SYMBOL_REF_WEAK (x)
11914 && !aarch64_symbol_binds_local_p (x))
11915 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11916 HOST_WIDE_INT_C (4294967264)))
11917 return SYMBOL_FORCE_TO_MEM;
11918 return SYMBOL_SMALL_ABSOLUTE;
11920 case AARCH64_CMODEL_TINY_PIC:
11921 if (!aarch64_symbol_binds_local_p (x))
11922 return SYMBOL_TINY_GOT;
11923 return SYMBOL_TINY_ABSOLUTE;
11925 case AARCH64_CMODEL_SMALL_SPIC:
11926 case AARCH64_CMODEL_SMALL_PIC:
11927 if (!aarch64_symbol_binds_local_p (x))
11928 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11929 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11930 return SYMBOL_SMALL_ABSOLUTE;
11932 case AARCH64_CMODEL_LARGE:
11933 /* This is alright even in PIC code as the constant
11934 pool reference is always PC relative and within
11935 the same translation unit. */
11936 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11937 return SYMBOL_SMALL_ABSOLUTE;
11938 else
11939 return SYMBOL_FORCE_TO_MEM;
11941 default:
11942 gcc_unreachable ();
11946 /* By default push everything into the constant pool. */
11947 return SYMBOL_FORCE_TO_MEM;
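/* Editorial illustration (not part of the original source): under the
   default small code model, a reference to an ordinary global such as
   "extern int x;" is classified as SYMBOL_SMALL_ABSOLUTE, whereas a weak
   symbol that is not known to bind locally, or an offset outside the
   roughly 4G window above, is forced to the constant pool via
   SYMBOL_FORCE_TO_MEM.  */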
11950 bool
11951 aarch64_constant_address_p (rtx x)
11953 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11956 bool
11957 aarch64_legitimate_pic_operand_p (rtx x)
11959 if (GET_CODE (x) == SYMBOL_REF
11960 || (GET_CODE (x) == CONST
11961 && GET_CODE (XEXP (x, 0)) == PLUS
11962 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11963 return false;
11965 return true;
11968 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11969 that should be rematerialized rather than spilled. */
11971 static bool
11972 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11974 /* Support CSE and rematerialization of common constants. */
11975 if (CONST_INT_P (x)
11976 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11977 || GET_CODE (x) == CONST_VECTOR)
11978 return true;
11980 /* Do not allow vector struct mode constants for Advanced SIMD.
11981 We could support 0 and -1 easily, but they need support in
11982 aarch64-simd.md. */
11983 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11984 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11985 return false;
11987 /* Only accept variable-length vector constants if they can be
11988 handled directly.
11990 ??? It would be possible to handle rematerialization of other
11991 constants via secondary reloads. */
11992 if (vec_flags & VEC_ANY_SVE)
11993 return aarch64_simd_valid_immediate (x, NULL);
11995 if (GET_CODE (x) == HIGH)
11996 x = XEXP (x, 0);
11998 /* Accept polynomial constants that can be calculated by using the
11999 destination of a move as the sole temporary. Constants that
12000 require a second temporary cannot be rematerialized (they can't be
12001 forced to memory and also aren't legitimate constants). */
12002 poly_int64 offset;
12003 if (poly_int_rtx_p (x, &offset))
12004 return aarch64_offset_temporaries (false, offset) <= 1;
12006 /* If an offset is being added to something else, we need to allow the
12007 base to be moved into the destination register, meaning that there
12008 are no free temporaries for the offset. */
12009 x = strip_offset (x, &offset);
12010 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
12011 return false;
12013 /* Do not allow const (plus (anchor_symbol, const_int)). */
12014 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
12015 return false;
12017 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
12018 so spilling them is better than rematerialization. */
12019 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
12020 return true;
12022 /* Label references are always constant. */
12023 if (GET_CODE (x) == LABEL_REF)
12024 return true;
12026 return false;
12030 aarch64_load_tp (rtx target)
12032 if (!target
12033 || GET_MODE (target) != Pmode
12034 || !register_operand (target, Pmode))
12035 target = gen_reg_rtx (Pmode);
12037 /* Can return in any reg. */
12038 emit_insn (gen_aarch64_load_tp_hard (target));
12039 return target;
12042 /* On AAPCS systems, this is the "struct __va_list". */
12043 static GTY(()) tree va_list_type;
12045 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
12046 Return the type to use as __builtin_va_list.
12048 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12050 struct __va_list
12052 void *__stack;
12053 void *__gr_top;
12054 void *__vr_top;
12055 int __gr_offs;
12056 int __vr_offs;
12057 }; */
12059 static tree
12060 aarch64_build_builtin_va_list (void)
12062 tree va_list_name;
12063 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12065 /* Create the type. */
12066 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12067 /* Give it the required name. */
12068 va_list_name = build_decl (BUILTINS_LOCATION,
12069 TYPE_DECL,
12070 get_identifier ("__va_list"),
12071 va_list_type);
12072 DECL_ARTIFICIAL (va_list_name) = 1;
12073 TYPE_NAME (va_list_type) = va_list_name;
12074 TYPE_STUB_DECL (va_list_type) = va_list_name;
12076 /* Create the fields. */
12077 f_stack = build_decl (BUILTINS_LOCATION,
12078 FIELD_DECL, get_identifier ("__stack"),
12079 ptr_type_node);
12080 f_grtop = build_decl (BUILTINS_LOCATION,
12081 FIELD_DECL, get_identifier ("__gr_top"),
12082 ptr_type_node);
12083 f_vrtop = build_decl (BUILTINS_LOCATION,
12084 FIELD_DECL, get_identifier ("__vr_top"),
12085 ptr_type_node);
12086 f_groff = build_decl (BUILTINS_LOCATION,
12087 FIELD_DECL, get_identifier ("__gr_offs"),
12088 integer_type_node);
12089 f_vroff = build_decl (BUILTINS_LOCATION,
12090 FIELD_DECL, get_identifier ("__vr_offs"),
12091 integer_type_node);
12093 /* Tell tree-stdarg pass about our internal offset fields.
12094 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12095 purposes, to identify whether the code is updating the va_list internal
12096 offset fields in an irregular way. */
12097 va_list_gpr_counter_field = f_groff;
12098 va_list_fpr_counter_field = f_vroff;
12100 DECL_ARTIFICIAL (f_stack) = 1;
12101 DECL_ARTIFICIAL (f_grtop) = 1;
12102 DECL_ARTIFICIAL (f_vrtop) = 1;
12103 DECL_ARTIFICIAL (f_groff) = 1;
12104 DECL_ARTIFICIAL (f_vroff) = 1;
12106 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12107 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12108 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12109 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12110 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12112 TYPE_FIELDS (va_list_type) = f_stack;
12113 DECL_CHAIN (f_stack) = f_grtop;
12114 DECL_CHAIN (f_grtop) = f_vrtop;
12115 DECL_CHAIN (f_vrtop) = f_groff;
12116 DECL_CHAIN (f_groff) = f_vroff;
12118 /* Compute its layout. */
12119 layout_type (va_list_type);
12121 return va_list_type;
12124 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12125 static void
12126 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12128 const CUMULATIVE_ARGS *cum;
12129 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12130 tree stack, grtop, vrtop, groff, vroff;
12131 tree t;
12132 int gr_save_area_size = cfun->va_list_gpr_size;
12133 int vr_save_area_size = cfun->va_list_fpr_size;
12134 int vr_offset;
12136 cum = &crtl->args.info;
12137 if (cfun->va_list_gpr_size)
12138 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12139 cfun->va_list_gpr_size);
12140 if (cfun->va_list_fpr_size)
12141 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12142 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12144 if (!TARGET_FLOAT)
12146 gcc_assert (cum->aapcs_nvrn == 0);
12147 vr_save_area_size = 0;
12150 f_stack = TYPE_FIELDS (va_list_type_node);
12151 f_grtop = DECL_CHAIN (f_stack);
12152 f_vrtop = DECL_CHAIN (f_grtop);
12153 f_groff = DECL_CHAIN (f_vrtop);
12154 f_vroff = DECL_CHAIN (f_groff);
12156 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12157 NULL_TREE);
12158 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12159 NULL_TREE);
12160 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12161 NULL_TREE);
12162 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12163 NULL_TREE);
12164 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12165 NULL_TREE);
12167 /* Emit code to initialize STACK, which points to the next varargs stack
12168 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12169 by named arguments. STACK is 8-byte aligned. */
12170 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12171 if (cum->aapcs_stack_size > 0)
12172 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12173 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12174 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12176 /* Emit code to initialize GRTOP, the top of the GR save area.
12177 virtual_incoming_args_rtx should have been 16-byte aligned. */
12178 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12179 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12180 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12182 /* Emit code to initialize VRTOP, the top of the VR save area.
12183 This address is gr_save_area_bytes below GRTOP, rounded
12184 down to the next 16-byte boundary. */
12185 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12186 vr_offset = ROUND_UP (gr_save_area_size,
12187 STACK_BOUNDARY / BITS_PER_UNIT);
12189 if (vr_offset)
12190 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12191 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12192 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12194 /* Emit code to initialize GROFF, the offset from GRTOP of the
12195 next GPR argument. */
12196 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12197 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12198 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12200 /* Likewise emit code to initialize VROFF, the offset from FTOP
12201 of the next VR argument. */
12202 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12203 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12204 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
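/* Editorial sketch (not part of the original source) of the frame layout
   that the va_start expansion above assumes, higher addresses first:

       incoming stack arguments      <- __stack
       -----------------------       <- virtual_incoming_args_rtx, __gr_top
       GP register save area         (gr_save_area_size bytes)
       -----------------------       <- __vr_top (16-byte aligned)
       FP/SIMD register save area    (vr_save_area_size bytes)

   __gr_offs and __vr_offs start at -gr_save_area_size and
   -vr_save_area_size respectively and grow towards zero as anonymous
   arguments are consumed.  */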
12207 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12209 static tree
12210 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12211 gimple_seq *post_p ATTRIBUTE_UNUSED)
12213 tree addr;
12214 bool indirect_p;
12215 bool is_ha; /* is HFA or HVA. */
12216 bool dw_align; /* double-word align. */
12217 machine_mode ag_mode = VOIDmode;
12218 int nregs;
12219 machine_mode mode;
12221 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12222 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12223 HOST_WIDE_INT size, rsize, adjust, align;
12224 tree t, u, cond1, cond2;
12226 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12227 if (indirect_p)
12228 type = build_pointer_type (type);
12230 mode = TYPE_MODE (type);
12232 f_stack = TYPE_FIELDS (va_list_type_node);
12233 f_grtop = DECL_CHAIN (f_stack);
12234 f_vrtop = DECL_CHAIN (f_grtop);
12235 f_groff = DECL_CHAIN (f_vrtop);
12236 f_vroff = DECL_CHAIN (f_groff);
12238 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12239 f_stack, NULL_TREE);
12240 size = int_size_in_bytes (type);
12241 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12243 dw_align = false;
12244 adjust = 0;
12245 if (aarch64_vfp_is_call_or_return_candidate (mode,
12246 type,
12247 &ag_mode,
12248 &nregs,
12249 &is_ha))
12251 /* No frontends can create types with variable-sized modes, so we
12252 shouldn't be asked to pass or return them. */
12253 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12255 /* TYPE passed in fp/simd registers. */
12256 if (!TARGET_FLOAT)
12257 aarch64_err_no_fpadvsimd (mode, "varargs");
12259 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12260 unshare_expr (valist), f_vrtop, NULL_TREE);
12261 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12262 unshare_expr (valist), f_vroff, NULL_TREE);
12264 rsize = nregs * UNITS_PER_VREG;
12266 if (is_ha)
12268 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12269 adjust = UNITS_PER_VREG - ag_size;
12271 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12272 && size < UNITS_PER_VREG)
12274 adjust = UNITS_PER_VREG - size;
12277 else
12279 /* TYPE passed in general registers. */
12280 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12281 unshare_expr (valist), f_grtop, NULL_TREE);
12282 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12283 unshare_expr (valist), f_groff, NULL_TREE);
12284 rsize = ROUND_UP (size, UNITS_PER_WORD);
12285 nregs = rsize / UNITS_PER_WORD;
12287 if (align > 8)
12288 dw_align = true;
12290 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12291 && size < UNITS_PER_WORD)
12293 adjust = UNITS_PER_WORD - size;
12297 /* Get a local temporary for the field value. */
12298 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12300 /* Emit code to branch if off >= 0. */
12301 t = build2 (GE_EXPR, boolean_type_node, off,
12302 build_int_cst (TREE_TYPE (off), 0));
12303 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12305 if (dw_align)
12307 /* Emit: offs = (offs + 15) & -16. */
12308 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12309 build_int_cst (TREE_TYPE (off), 15));
12310 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12311 build_int_cst (TREE_TYPE (off), -16));
12312 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12314 else
12315 roundup = NULL;
12317 /* Update ap.__[g|v]r_offs */
12318 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12319 build_int_cst (TREE_TYPE (off), rsize));
12320 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12322 /* String up. */
12323 if (roundup)
12324 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12326 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12327 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12328 build_int_cst (TREE_TYPE (f_off), 0));
12329 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12331 /* String up: make sure the assignment happens before the use. */
12332 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12333 COND_EXPR_ELSE (cond1) = t;
12335 /* Prepare the trees handling the argument that is passed on the stack;
12336 the top-level node is stored in ON_STACK. */
12337 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12338 if (align > 8)
12340 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12341 t = fold_build_pointer_plus_hwi (arg, 15);
12342 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12343 build_int_cst (TREE_TYPE (t), -16));
12344 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12346 else
12347 roundup = NULL;
12348 /* Advance ap.__stack */
12349 t = fold_build_pointer_plus_hwi (arg, size + 7);
12350 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12351 build_int_cst (TREE_TYPE (t), -8));
12352 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12353 /* String up roundup and advance. */
12354 if (roundup)
12355 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12356 /* String up with arg */
12357 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12358 /* Big-endianness related address adjustment. */
12359 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12360 && size < UNITS_PER_WORD)
12362 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12363 size_int (UNITS_PER_WORD - size));
12364 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12367 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12368 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12370 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12371 t = off;
12372 if (adjust)
12373 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12374 build_int_cst (TREE_TYPE (off), adjust));
12376 t = fold_convert (sizetype, t);
12377 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12379 if (is_ha)
12381 /* type ha; // treat as "struct {ftype field[n];}"
12382 ... [computing offs]
12383 for (i = 0; i <nregs; ++i, offs += 16)
12384 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12385 return ha; */
12386 int i;
12387 tree tmp_ha, field_t, field_ptr_t;
12389 /* Declare a local variable. */
12390 tmp_ha = create_tmp_var_raw (type, "ha");
12391 gimple_add_tmp_var (tmp_ha);
12393 /* Establish the base type. */
12394 switch (ag_mode)
12396 case E_SFmode:
12397 field_t = float_type_node;
12398 field_ptr_t = float_ptr_type_node;
12399 break;
12400 case E_DFmode:
12401 field_t = double_type_node;
12402 field_ptr_t = double_ptr_type_node;
12403 break;
12404 case E_TFmode:
12405 field_t = long_double_type_node;
12406 field_ptr_t = long_double_ptr_type_node;
12407 break;
12408 case E_HFmode:
12409 field_t = aarch64_fp16_type_node;
12410 field_ptr_t = aarch64_fp16_ptr_type_node;
12411 break;
12412 case E_V2SImode:
12413 case E_V4SImode:
12415 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12416 field_t = build_vector_type_for_mode (innertype, ag_mode);
12417 field_ptr_t = build_pointer_type (field_t);
12419 break;
12420 default:
12421 gcc_assert (0);
12424 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12425 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12426 addr = t;
12427 t = fold_convert (field_ptr_t, addr);
12428 t = build2 (MODIFY_EXPR, field_t,
12429 build1 (INDIRECT_REF, field_t, tmp_ha),
12430 build1 (INDIRECT_REF, field_t, t));
12432 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12433 for (i = 1; i < nregs; ++i)
12435 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12436 u = fold_convert (field_ptr_t, addr);
12437 u = build2 (MODIFY_EXPR, field_t,
12438 build2 (MEM_REF, field_t, tmp_ha,
12439 build_int_cst (field_ptr_t,
12440 (i *
12441 int_size_in_bytes (field_t)))),
12442 build1 (INDIRECT_REF, field_t, u));
12443 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12446 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12447 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12450 COND_EXPR_ELSE (cond2) = t;
12451 addr = fold_convert (build_pointer_type (type), cond1);
12452 addr = build_va_arg_indirect_ref (addr);
12454 if (indirect_p)
12455 addr = build_va_arg_indirect_ref (addr);
12457 return addr;
12460 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12462 static void
12463 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12464 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12465 int no_rtl)
12467 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12468 CUMULATIVE_ARGS local_cum;
12469 int gr_saved = cfun->va_list_gpr_size;
12470 int vr_saved = cfun->va_list_fpr_size;
12472 /* The caller has advanced CUM up to, but not beyond, the last named
12473 argument. Advance a local copy of CUM past the last "real" named
12474 argument, to find out how many registers are left over. */
12475 local_cum = *cum;
12476 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12478 /* Find out how many registers we need to save.
12479 Honour the tree-stdarg analysis results. */
12480 if (cfun->va_list_gpr_size)
12481 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12482 cfun->va_list_gpr_size / UNITS_PER_WORD);
12483 if (cfun->va_list_fpr_size)
12484 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12485 cfun->va_list_fpr_size / UNITS_PER_VREG);
12487 if (!TARGET_FLOAT)
12489 gcc_assert (local_cum.aapcs_nvrn == 0);
12490 vr_saved = 0;
12493 if (!no_rtl)
12495 if (gr_saved > 0)
12497 rtx ptr, mem;
12499 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12500 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12501 - gr_saved * UNITS_PER_WORD);
12502 mem = gen_frame_mem (BLKmode, ptr);
12503 set_mem_alias_set (mem, get_varargs_alias_set ());
12505 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12506 mem, gr_saved);
12508 if (vr_saved > 0)
12510 /* We can't use move_block_from_reg, because it will use
12511 the wrong mode, storing D regs only. */
12512 machine_mode mode = TImode;
12513 int off, i, vr_start;
12515 /* Set OFF to the offset from virtual_incoming_args_rtx of
12516 the first vector register. The VR save area lies below
12517 the GR one, and is aligned to 16 bytes. */
12518 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12519 STACK_BOUNDARY / BITS_PER_UNIT);
12520 off -= vr_saved * UNITS_PER_VREG;
12522 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12523 for (i = 0; i < vr_saved; ++i)
12525 rtx ptr, mem;
12527 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12528 mem = gen_frame_mem (mode, ptr);
12529 set_mem_alias_set (mem, get_varargs_alias_set ());
12530 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12531 off += UNITS_PER_VREG;
12536 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12537 any complication of having crtl->args.pretend_args_size changed. */
12538 cfun->machine->frame.saved_varargs_size
12539 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12540 STACK_BOUNDARY / BITS_PER_UNIT)
12541 + vr_saved * UNITS_PER_VREG);
12544 static void
12545 aarch64_conditional_register_usage (void)
12547 int i;
12548 if (!TARGET_FLOAT)
12550 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12552 fixed_regs[i] = 1;
12553 call_used_regs[i] = 1;
12556 if (!TARGET_SVE)
12557 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12559 fixed_regs[i] = 1;
12560 call_used_regs[i] = 1;
12564 /* Walk down the type tree of TYPE counting consecutive base elements.
12565 If *MODEP is VOIDmode, then set it to the first valid floating point
12566 type. If a non-floating point type is found, or if a floating point
12567 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12568 otherwise return the count in the sub-tree. */
12569 static int
12570 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12572 machine_mode mode;
12573 HOST_WIDE_INT size;
12575 switch (TREE_CODE (type))
12577 case REAL_TYPE:
12578 mode = TYPE_MODE (type);
12579 if (mode != DFmode && mode != SFmode
12580 && mode != TFmode && mode != HFmode)
12581 return -1;
12583 if (*modep == VOIDmode)
12584 *modep = mode;
12586 if (*modep == mode)
12587 return 1;
12589 break;
12591 case COMPLEX_TYPE:
12592 mode = TYPE_MODE (TREE_TYPE (type));
12593 if (mode != DFmode && mode != SFmode
12594 && mode != TFmode && mode != HFmode)
12595 return -1;
12597 if (*modep == VOIDmode)
12598 *modep = mode;
12600 if (*modep == mode)
12601 return 2;
12603 break;
12605 case VECTOR_TYPE:
12606 /* Use V2SImode and V4SImode as representatives of all 64-bit
12607 and 128-bit vector types. */
12608 size = int_size_in_bytes (type);
12609 switch (size)
12611 case 8:
12612 mode = V2SImode;
12613 break;
12614 case 16:
12615 mode = V4SImode;
12616 break;
12617 default:
12618 return -1;
12621 if (*modep == VOIDmode)
12622 *modep = mode;
12624 /* Vector modes are considered to be opaque: two vectors are
12625 equivalent for the purposes of being homogeneous aggregates
12626 if they are the same size. */
12627 if (*modep == mode)
12628 return 1;
12630 break;
12632 case ARRAY_TYPE:
12634 int count;
12635 tree index = TYPE_DOMAIN (type);
12637 /* Can't handle incomplete types nor sizes that are not
12638 fixed. */
12639 if (!COMPLETE_TYPE_P (type)
12640 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12641 return -1;
12643 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12644 if (count == -1
12645 || !index
12646 || !TYPE_MAX_VALUE (index)
12647 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12648 || !TYPE_MIN_VALUE (index)
12649 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12650 || count < 0)
12651 return -1;
12653 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12654 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12656 /* There must be no padding. */
12657 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12658 count * GET_MODE_BITSIZE (*modep)))
12659 return -1;
12661 return count;
12664 case RECORD_TYPE:
12666 int count = 0;
12667 int sub_count;
12668 tree field;
12670 /* Can't handle incomplete types nor sizes that are not
12671 fixed. */
12672 if (!COMPLETE_TYPE_P (type)
12673 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12674 return -1;
12676 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12678 if (TREE_CODE (field) != FIELD_DECL)
12679 continue;
12681 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12682 if (sub_count < 0)
12683 return -1;
12684 count += sub_count;
12687 /* There must be no padding. */
12688 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12689 count * GET_MODE_BITSIZE (*modep)))
12690 return -1;
12692 return count;
12695 case UNION_TYPE:
12696 case QUAL_UNION_TYPE:
12698 /* These aren't very interesting except in a degenerate case. */
12699 int count = 0;
12700 int sub_count;
12701 tree field;
12703 /* Can't handle incomplete types nor sizes that are not
12704 fixed. */
12705 if (!COMPLETE_TYPE_P (type)
12706 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12707 return -1;
12709 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12711 if (TREE_CODE (field) != FIELD_DECL)
12712 continue;
12714 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12715 if (sub_count < 0)
12716 return -1;
12717 count = count > sub_count ? count : sub_count;
12720 /* There must be no padding. */
12721 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12722 count * GET_MODE_BITSIZE (*modep)))
12723 return -1;
12725 return count;
12728 default:
12729 break;
12732 return -1;
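/* Editorial examples (not part of the original source) for the
   classification above: "struct { float x, y, z; }" yields 3 with
   *MODEP == SFmode (a homogeneous floating-point aggregate);
   "struct { float f; double d; }" yields -1 because the element modes
   differ; an array of two 16-byte vectors yields 2, since vectors are
   compared by size only (V2SImode/V4SImode stand in for all 64-bit and
   128-bit vector types).  */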
12735 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12736 type as described in AAPCS64 \S 4.1.2.
12738 See the comment above aarch64_composite_type_p for the notes on MODE. */
12740 static bool
12741 aarch64_short_vector_p (const_tree type,
12742 machine_mode mode)
12744 poly_int64 size = -1;
12746 if (type && TREE_CODE (type) == VECTOR_TYPE)
12747 size = int_size_in_bytes (type);
12748 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12749 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12750 size = GET_MODE_SIZE (mode);
12752 return known_eq (size, 8) || known_eq (size, 16);
12755 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12756 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12757 array types. The C99 floating-point complex types are also considered
12758 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12759 types, which are GCC extensions and out of the scope of AAPCS64, are
12760 treated as composite types here as well.
12762 Note that MODE itself is not sufficient in determining whether a type
12763 is such a composite type or not. This is because
12764 stor-layout.c:compute_record_mode may have already changed the MODE
12765 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12766 structure with only one field may have its MODE set to the mode of the
12767 field. Also an integer mode whose size matches the size of the
12768 RECORD_TYPE type may be used to substitute the original mode
12769 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12770 solely relied on. */
12772 static bool
12773 aarch64_composite_type_p (const_tree type,
12774 machine_mode mode)
12776 if (aarch64_short_vector_p (type, mode))
12777 return false;
12779 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12780 return true;
12782 if (mode == BLKmode
12783 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12784 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12785 return true;
12787 return false;
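/* Editorial examples (not part of the original source): "_Complex double"
   and "struct { double a, b; }" are composite types here, while a single
   16-byte Advanced SIMD vector such as int32x4_t is not, because it is
   already excluded as a short vector above.  */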
12790 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12791 shall be passed or returned in simd/fp register(s) (providing these
12792 parameter passing registers are available).
12794 Upon successful return, *COUNT returns the number of needed registers,
12795 *BASE_MODE returns the mode of the individual register and, when IS_HA
12796 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12797 floating-point aggregate or a homogeneous short-vector aggregate. */
12799 static bool
12800 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12801 const_tree type,
12802 machine_mode *base_mode,
12803 int *count,
12804 bool *is_ha)
12806 machine_mode new_mode = VOIDmode;
12807 bool composite_p = aarch64_composite_type_p (type, mode);
12809 if (is_ha != NULL) *is_ha = false;
12811 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12812 || aarch64_short_vector_p (type, mode))
12814 *count = 1;
12815 new_mode = mode;
12817 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12819 if (is_ha != NULL) *is_ha = true;
12820 *count = 2;
12821 new_mode = GET_MODE_INNER (mode);
12823 else if (type && composite_p)
12825 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12827 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12829 if (is_ha != NULL) *is_ha = true;
12830 *count = ag_count;
12832 else
12833 return false;
12835 else
12836 return false;
12838 *base_mode = new_mode;
12839 return true;
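/* Editorial examples (not part of the original source):
   "struct { double d[3]; }" gives *COUNT == 3, *BASE_MODE == DFmode and
   *IS_HA true; "_Complex float" gives *COUNT == 2 and *BASE_MODE == SFmode;
   "struct { double d[5]; }" fails because it exceeds HA_MAX_NUM_FLDS.  */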
12842 /* Implement TARGET_STRUCT_VALUE_RTX. */
12844 static rtx
12845 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12846 int incoming ATTRIBUTE_UNUSED)
12848 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12851 /* Implements target hook vector_mode_supported_p. */
12852 static bool
12853 aarch64_vector_mode_supported_p (machine_mode mode)
12855 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12856 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12859 /* Return appropriate SIMD container
12860 for MODE within a vector of WIDTH bits. */
12861 static machine_mode
12862 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12864 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12865 switch (mode)
12867 case E_DFmode:
12868 return VNx2DFmode;
12869 case E_SFmode:
12870 return VNx4SFmode;
12871 case E_HFmode:
12872 return VNx8HFmode;
12873 case E_DImode:
12874 return VNx2DImode;
12875 case E_SImode:
12876 return VNx4SImode;
12877 case E_HImode:
12878 return VNx8HImode;
12879 case E_QImode:
12880 return VNx16QImode;
12881 default:
12882 return word_mode;
12885 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12886 if (TARGET_SIMD)
12888 if (known_eq (width, 128))
12889 switch (mode)
12891 case E_DFmode:
12892 return V2DFmode;
12893 case E_SFmode:
12894 return V4SFmode;
12895 case E_HFmode:
12896 return V8HFmode;
12897 case E_SImode:
12898 return V4SImode;
12899 case E_HImode:
12900 return V8HImode;
12901 case E_QImode:
12902 return V16QImode;
12903 case E_DImode:
12904 return V2DImode;
12905 default:
12906 break;
12908 else
12909 switch (mode)
12911 case E_SFmode:
12912 return V2SFmode;
12913 case E_HFmode:
12914 return V4HFmode;
12915 case E_SImode:
12916 return V2SImode;
12917 case E_HImode:
12918 return V4HImode;
12919 case E_QImode:
12920 return V8QImode;
12921 default:
12922 break;
12925 return word_mode;
12928 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12929 static machine_mode
12930 aarch64_preferred_simd_mode (scalar_mode mode)
12932 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12933 return aarch64_simd_container_mode (mode, bits);
12936 /* Return a list of possible vector sizes for the vectorizer
12937 to iterate over. */
12938 static void
12939 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12941 if (TARGET_SVE)
12942 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12943 sizes->safe_push (16);
12944 sizes->safe_push (8);
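/* Editorial illustration (not part of the original source): without SVE,
   SImode elements get V4SImode as the preferred SIMD mode and the
   vectorizer iterates over vector sizes { 16, 8 }; with SVE enabled the
   preferred mode is VNx4SImode and the size list becomes
   { BYTES_PER_SVE_VECTOR, 16, 8 }.  */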
12947 /* Implement TARGET_MANGLE_TYPE. */
12949 static const char *
12950 aarch64_mangle_type (const_tree type)
12952 /* The AArch64 ABI documents say that "__va_list" has to be
12953 mangled as if it were in the "std" namespace. */
12954 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12955 return "St9__va_list";
12957 /* Half-precision float. */
12958 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12959 return "Dh";
12961 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12962 builtin types. */
12963 if (TYPE_NAME (type) != NULL)
12964 return aarch64_mangle_builtin_type (type);
12966 /* Use the default mangling. */
12967 return NULL;
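/* Editorial illustration (not part of the original source): with this
   hook, "void f (__fp16);" mangles as _Z1fDh, and a function taking
   __builtin_va_list uses the St9__va_list component, matching the
   AAPCS64 name mangling rules.  */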
12970 /* Find the first rtx_insn before INSN that will generate an assembly
12971 instruction. */
12973 static rtx_insn *
12974 aarch64_prev_real_insn (rtx_insn *insn)
12976 if (!insn)
12977 return NULL;
12981 insn = prev_real_insn (insn);
12983 while (insn && recog_memoized (insn) < 0);
12985 return insn;
12988 static bool
12989 is_madd_op (enum attr_type t1)
12991 unsigned int i;
12992 /* A number of these may be AArch32 only. */
12993 enum attr_type mlatypes[] = {
12994 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12995 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12996 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12999 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
13001 if (t1 == mlatypes[i])
13002 return true;
13005 return false;
13008 /* Check if there is a register dependency between a load and the insn
13009 for which we hold recog_data. */
13011 static bool
13012 dep_between_memop_and_curr (rtx memop)
13014 rtx load_reg;
13015 int opno;
13017 gcc_assert (GET_CODE (memop) == SET);
13019 if (!REG_P (SET_DEST (memop)))
13020 return false;
13022 load_reg = SET_DEST (memop);
13023 for (opno = 1; opno < recog_data.n_operands; opno++)
13025 rtx operand = recog_data.operand[opno];
13026 if (REG_P (operand)
13027 && reg_overlap_mentioned_p (load_reg, operand))
13028 return true;
13031 return false;
13035 /* When working around the Cortex-A53 erratum 835769,
13036 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
13037 instruction and has a preceding memory instruction such that a NOP
13038 should be inserted between them. */
13040 bool
13041 aarch64_madd_needs_nop (rtx_insn* insn)
13043 enum attr_type attr_type;
13044 rtx_insn *prev;
13045 rtx body;
13047 if (!TARGET_FIX_ERR_A53_835769)
13048 return false;
13050 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13051 return false;
13053 attr_type = get_attr_type (insn);
13054 if (!is_madd_op (attr_type))
13055 return false;
13057 prev = aarch64_prev_real_insn (insn);
13058 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13059 Restore recog state to INSN to avoid state corruption. */
13060 extract_constrain_insn_cached (insn);
13062 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13063 return false;
13065 body = single_set (prev);
13067 /* If the previous insn is a memory op and there is no dependency between
13068 it and the DImode madd, emit a NOP between them. If body is NULL then we
13069 have a complex memory operation, probably a load/store pair.
13070 Be conservative for now and emit a NOP. */
13071 if (GET_MODE (recog_data.operand[0]) == DImode
13072 && (!body || !dep_between_memop_and_curr (body)))
13073 return true;
13075 return false;
13080 /* Implement FINAL_PRESCAN_INSN. */
13082 void
13083 aarch64_final_prescan_insn (rtx_insn *insn)
13085 if (aarch64_madd_needs_nop (insn))
13086 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
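/* Editorial illustration (not part of the original source): with
   -mfix-cortex-a53-835769, a 64-bit multiply-accumulate that directly
   follows a memory operation is separated by the NOP emitted above,
   giving assembly roughly of the form

       ldr     x2, [x10]
       nop     // between mem op and mult-accumulate
       madd    x0, x1, x3, x4
*/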
13090 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13091 instruction. */
13093 bool
13094 aarch64_sve_index_immediate_p (rtx base_or_step)
13096 return (CONST_INT_P (base_or_step)
13097 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13100 /* Return true if X is a valid immediate for the SVE ADD and SUB
13101 instructions. Negate X first if NEGATE_P is true. */
13103 bool
13104 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13106 rtx elt;
13108 if (!const_vec_duplicate_p (x, &elt)
13109 || !CONST_INT_P (elt))
13110 return false;
13112 HOST_WIDE_INT val = INTVAL (elt);
13113 if (negate_p)
13114 val = -val;
13115 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13117 if (val & 0xff)
13118 return IN_RANGE (val, 0, 0xff);
13119 return IN_RANGE (val, 0, 0xff00);
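/* Editorial examples (not part of the original source): a vector of
   SImode elements all equal to 3, or all equal to 768 (0x300, a multiple
   of 256 no larger than 0xff00), is accepted, while a duplicate of 0x101
   is rejected because it needs both the low and the high byte.  */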
13122 /* Return true if X is a valid immediate operand for an SVE logical
13123 instruction such as AND. */
13125 bool
13126 aarch64_sve_bitmask_immediate_p (rtx x)
13128 rtx elt;
13130 return (const_vec_duplicate_p (x, &elt)
13131 && CONST_INT_P (elt)
13132 && aarch64_bitmask_imm (INTVAL (elt),
13133 GET_MODE_INNER (GET_MODE (x))));
13136 /* Return true if X is a valid immediate for the SVE DUP and CPY
13137 instructions. */
13139 bool
13140 aarch64_sve_dup_immediate_p (rtx x)
13142 rtx elt;
13144 if (!const_vec_duplicate_p (x, &elt)
13145 || !CONST_INT_P (elt))
13146 return false;
13148 HOST_WIDE_INT val = INTVAL (elt);
13149 if (val & 0xff)
13150 return IN_RANGE (val, -0x80, 0x7f);
13151 return IN_RANGE (val, -0x8000, 0x7f00);
13154 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13155 SIGNED_P says whether the operand is signed rather than unsigned. */
13157 bool
13158 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13160 rtx elt;
13162 return (const_vec_duplicate_p (x, &elt)
13163 && CONST_INT_P (elt)
13164 && (signed_p
13165 ? IN_RANGE (INTVAL (elt), -16, 15)
13166 : IN_RANGE (INTVAL (elt), 0, 127)));
13169 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13170 instruction. Negate X first if NEGATE_P is true. */
13172 bool
13173 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13175 rtx elt;
13176 REAL_VALUE_TYPE r;
13178 if (!const_vec_duplicate_p (x, &elt)
13179 || GET_CODE (elt) != CONST_DOUBLE)
13180 return false;
13182 r = *CONST_DOUBLE_REAL_VALUE (elt);
13184 if (negate_p)
13185 r = real_value_negate (&r);
13187 if (real_equal (&r, &dconst1))
13188 return true;
13189 if (real_equal (&r, &dconsthalf))
13190 return true;
13191 return false;
13194 /* Return true if X is a valid immediate operand for an SVE FMUL
13195 instruction. */
13197 bool
13198 aarch64_sve_float_mul_immediate_p (rtx x)
13200 rtx elt;
13202 /* GCC will never generate a multiply with an immediate of 2, so there is no
13203 point testing for it (even though it is a valid constant). */
13204 return (const_vec_duplicate_p (x, &elt)
13205 && GET_CODE (elt) == CONST_DOUBLE
13206 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13209 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13210 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13211 is nonnull, use it to describe valid immediates. */
13212 static bool
13213 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13214 simd_immediate_info *info,
13215 enum simd_immediate_check which,
13216 simd_immediate_info::insn_type insn)
13218 /* Try a 4-byte immediate with LSL. */
13219 for (unsigned int shift = 0; shift < 32; shift += 8)
13220 if ((val32 & (0xff << shift)) == val32)
13222 if (info)
13223 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13224 simd_immediate_info::LSL, shift);
13225 return true;
13228 /* Try a 2-byte immediate with LSL. */
13229 unsigned int imm16 = val32 & 0xffff;
13230 if (imm16 == (val32 >> 16))
13231 for (unsigned int shift = 0; shift < 16; shift += 8)
13232 if ((imm16 & (0xff << shift)) == imm16)
13234 if (info)
13235 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13236 simd_immediate_info::LSL, shift);
13237 return true;
13240 /* Try a 4-byte immediate with MSL, except for cases that MVN
13241 can handle. */
13242 if (which == AARCH64_CHECK_MOV)
13243 for (unsigned int shift = 8; shift < 24; shift += 8)
13245 unsigned int low = (1 << shift) - 1;
13246 if (((val32 & (0xff << shift)) | low) == val32)
13248 if (info)
13249 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13250 simd_immediate_info::MSL, shift);
13251 return true;
13255 return false;
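/* Editorial examples (not part of the original source): VAL32 ==
   0x00ab0000 matches the 4-byte LSL case (0xab shifted left by 16);
   VAL32 == 0x00ab00ab matches the 2-byte LSL case; VAL32 == 0x0003ffff
   matches the MSL ("shifting ones") case with shift 16, which is only
   tried for AARCH64_CHECK_MOV.  */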
13258 /* Return true if replicating VAL64 is a valid immediate for the
13259 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13260 use it to describe valid immediates. */
13261 static bool
13262 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13263 simd_immediate_info *info,
13264 enum simd_immediate_check which)
13266 unsigned int val32 = val64 & 0xffffffff;
13267 unsigned int val16 = val64 & 0xffff;
13268 unsigned int val8 = val64 & 0xff;
13270 if (val32 == (val64 >> 32))
13272 if ((which & AARCH64_CHECK_ORR) != 0
13273 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13274 simd_immediate_info::MOV))
13275 return true;
13277 if ((which & AARCH64_CHECK_BIC) != 0
13278 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13279 simd_immediate_info::MVN))
13280 return true;
13282 /* Try using a replicated byte. */
13283 if (which == AARCH64_CHECK_MOV
13284 && val16 == (val32 >> 16)
13285 && val8 == (val16 >> 8))
13287 if (info)
13288 *info = simd_immediate_info (QImode, val8);
13289 return true;
13293 /* Try using a bit-to-bytemask. */
13294 if (which == AARCH64_CHECK_MOV)
13296 unsigned int i;
13297 for (i = 0; i < 64; i += 8)
13299 unsigned char byte = (val64 >> i) & 0xff;
13300 if (byte != 0 && byte != 0xff)
13301 break;
13303 if (i == 64)
13305 if (info)
13306 *info = simd_immediate_info (DImode, val64);
13307 return true;
13310 return false;
13313 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13314 instruction. If INFO is nonnull, use it to describe valid immediates. */
13316 static bool
13317 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13318 simd_immediate_info *info)
13320 scalar_int_mode mode = DImode;
13321 unsigned int val32 = val64 & 0xffffffff;
13322 if (val32 == (val64 >> 32))
13324 mode = SImode;
13325 unsigned int val16 = val32 & 0xffff;
13326 if (val16 == (val32 >> 16))
13328 mode = HImode;
13329 unsigned int val8 = val16 & 0xff;
13330 if (val8 == (val16 >> 8))
13331 mode = QImode;
13334 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13335 if (IN_RANGE (val, -0x80, 0x7f))
13337 /* DUP with no shift. */
13338 if (info)
13339 *info = simd_immediate_info (mode, val);
13340 return true;
13342 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13344 /* DUP with LSL #8. */
13345 if (info)
13346 *info = simd_immediate_info (mode, val);
13347 return true;
13349 if (aarch64_bitmask_imm (val64, mode))
13351 /* DUPM. */
13352 if (info)
13353 *info = simd_immediate_info (mode, val);
13354 return true;
13356 return false;
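/* Editorial examples (not part of the original source): VAL64 ==
   0x2525252525252525 reduces to a QImode DUP of 0x25; VAL64 ==
   0xff00ff00ff00ff00 reduces to an HImode DUP with LSL #8 (the value
   truncates to -256); a replicated bitmask value such as
   0x00ff00ff00ff00ff falls through to the DUPM case.  */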
13359 /* Return true if OP is a valid SIMD immediate for the operation
13360 described by WHICH. If INFO is nonnull, use it to describe valid
13361 immediates. */
13362 bool
13363 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13364 enum simd_immediate_check which)
13366 machine_mode mode = GET_MODE (op);
13367 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13368 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13369 return false;
13371 scalar_mode elt_mode = GET_MODE_INNER (mode);
13372 rtx base, step;
13373 unsigned int n_elts;
13374 if (GET_CODE (op) == CONST_VECTOR
13375 && CONST_VECTOR_DUPLICATE_P (op))
13376 n_elts = CONST_VECTOR_NPATTERNS (op);
13377 else if ((vec_flags & VEC_SVE_DATA)
13378 && const_vec_series_p (op, &base, &step))
13380 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13381 if (!aarch64_sve_index_immediate_p (base)
13382 || !aarch64_sve_index_immediate_p (step))
13383 return false;
13385 if (info)
13386 *info = simd_immediate_info (elt_mode, base, step);
13387 return true;
13389 else if (GET_CODE (op) == CONST_VECTOR
13390 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13391 /* N_ELTS set above. */;
13392 else
13393 return false;
13395 /* Handle PFALSE and PTRUE. */
13396 if (vec_flags & VEC_SVE_PRED)
13397 return (op == CONST0_RTX (mode)
13398 || op == CONSTM1_RTX (mode));
13400 scalar_float_mode elt_float_mode;
13401 if (n_elts == 1
13402 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13404 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13405 if (aarch64_float_const_zero_rtx_p (elt)
13406 || aarch64_float_const_representable_p (elt))
13408 if (info)
13409 *info = simd_immediate_info (elt_float_mode, elt);
13410 return true;
13414 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13415 if (elt_size > 8)
13416 return false;
13418 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13420 /* Expand the vector constant out into a byte vector, with the least
13421 significant byte of the register first. */
13422 auto_vec<unsigned char, 16> bytes;
13423 bytes.reserve (n_elts * elt_size);
13424 for (unsigned int i = 0; i < n_elts; i++)
13426 /* The vector is provided in GCC's endian-neutral fashion.
13427 For aarch64_be Advanced SIMD, it must be laid out in the vector
13428 register in reverse order. */
13429 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13430 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13432 if (elt_mode != elt_int_mode)
13433 elt = gen_lowpart (elt_int_mode, elt);
13435 if (!CONST_INT_P (elt))
13436 return false;
13438 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13439 for (unsigned int byte = 0; byte < elt_size; byte++)
13441 bytes.quick_push (elt_val & 0xff);
13442 elt_val >>= BITS_PER_UNIT;
13446 /* The immediate must repeat every eight bytes. */
13447 unsigned int nbytes = bytes.length ();
13448 for (unsigned i = 8; i < nbytes; ++i)
13449 if (bytes[i] != bytes[i - 8])
13450 return false;
13452 /* Get the repeating 8-byte value as an integer. No endian correction
13453 is needed here because bytes is already in lsb-first order. */
13454 unsigned HOST_WIDE_INT val64 = 0;
13455 for (unsigned int i = 0; i < 8; i++)
13456 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13457 << (i * BITS_PER_UNIT));
13459 if (vec_flags & VEC_SVE_DATA)
13460 return aarch64_sve_valid_immediate (val64, info);
13461 else
13462 return aarch64_advsimd_valid_immediate (val64, info, which);
13465 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13466 has a step in the range of INDEX. Return the index expression if so,
13467 otherwise return null. */
13469 aarch64_check_zero_based_sve_index_immediate (rtx x)
13471 rtx base, step;
13472 if (const_vec_series_p (x, &base, &step)
13473 && base == const0_rtx
13474 && aarch64_sve_index_immediate_p (step))
13475 return step;
13476 return NULL_RTX;
13479 /* Check if immediate shift constants are within range. */
13480 bool
13481 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13483 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13484 if (left)
13485 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13486 else
13487 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13490 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13491 operation of width WIDTH at bit position POS. */
13494 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13496 gcc_assert (CONST_INT_P (width));
13497 gcc_assert (CONST_INT_P (pos));
13499 unsigned HOST_WIDE_INT mask
13500 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13501 return GEN_INT (mask << UINTVAL (pos));
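/* Editorial illustration (not part of the original source): for a zero
   extract of WIDTH 8 at POS 16 this yields ((1 << 8) - 1) << 16,
   i.e. the CONST_INT 0x00ff0000.  */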
13504 bool
13505 aarch64_mov_operand_p (rtx x, machine_mode mode)
13507 if (GET_CODE (x) == HIGH
13508 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13509 return true;
13511 if (CONST_INT_P (x))
13512 return true;
13514 if (VECTOR_MODE_P (GET_MODE (x)))
13515 return aarch64_simd_valid_immediate (x, NULL);
13517 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13518 return true;
13520 if (aarch64_sve_cnt_immediate_p (x))
13521 return true;
13523 return aarch64_classify_symbolic_expression (x)
13524 == SYMBOL_TINY_ABSOLUTE;
13527 /* Return a const_int vector of VAL. */
13529 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13531 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13532 return gen_const_vec_duplicate (mode, c);
13535 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13537 bool
13538 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13540 machine_mode vmode;
13542 vmode = aarch64_simd_container_mode (mode, 64);
13543 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13544 return aarch64_simd_valid_immediate (op_v, NULL);
13547 /* Construct and return a PARALLEL RTX vector with elements numbering the
13548 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13549 the vector - from the perspective of the architecture. This does not
13550 line up with GCC's perspective on lane numbers, so we end up with
13551 different masks depending on our target endianness. The diagram
13552 below may help. We must draw the distinction when building masks
13553 which select one half of the vector. An instruction selecting
13554 architectural low-lanes for a big-endian target must be described using
13555 a mask selecting GCC high-lanes.
13557 Big-Endian Little-Endian
13559 GCC 0 1 2 3 3 2 1 0
13560 | x | x | x | x | | x | x | x | x |
13561 Architecture 3 2 1 0 3 2 1 0
13563 Low Mask: { 2, 3 } { 0, 1 }
13564 High Mask: { 0, 1 } { 2, 3 }
13566 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13569 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13571 rtvec v = rtvec_alloc (nunits / 2);
13572 int high_base = nunits / 2;
13573 int low_base = 0;
13574 int base;
13575 rtx t1;
13576 int i;
13578 if (BYTES_BIG_ENDIAN)
13579 base = high ? low_base : high_base;
13580 else
13581 base = high ? high_base : low_base;
13583 for (i = 0; i < nunits / 2; i++)
13584 RTVEC_ELT (v, i) = GEN_INT (base + i);
13586 t1 = gen_rtx_PARALLEL (mode, v);
13587 return t1;
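/* As an example of the diagram above: for V4SImode with NUNITS == 4 and
   HIGH == true this returns the PARALLEL { 2, 3 } on little-endian but
   { 0, 1 } on big-endian, matching the "High Mask" row.  */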
13590 /* Check OP for validity as a PARALLEL RTX vector with elements
13591 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13592 from the perspective of the architecture. See the diagram above
13593 aarch64_simd_vect_par_cnst_half for more details. */
13595 bool
13596 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13597 bool high)
13599 int nelts;
13600 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13601 return false;
13603 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13604 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13605 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13606 int i = 0;
13608 if (count_op != count_ideal)
13609 return false;
13611 for (i = 0; i < count_ideal; i++)
13613 rtx elt_op = XVECEXP (op, 0, i);
13614 rtx elt_ideal = XVECEXP (ideal, 0, i);
13616 if (!CONST_INT_P (elt_op)
13617 || INTVAL (elt_ideal) != INTVAL (elt_op))
13618 return false;
13620 return true;
13623 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13624 HIGH (exclusive). */
13625 void
13626 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13627 const_tree exp)
13629 HOST_WIDE_INT lane;
13630 gcc_assert (CONST_INT_P (operand));
13631 lane = INTVAL (operand);
13633 if (lane < low || lane >= high)
13635 if (exp)
13636 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13637 else
13638 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13642 /* Perform endian correction on lane number N, which indexes a vector
13643 of mode MODE, and return the result as an SImode rtx. */
13646 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13648 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
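/* For example, for V4SImode this yields lane N unchanged on little-endian
   but lane 3 - N on big-endian (assuming the usual ENDIAN_LANE_N
   definition in aarch64.h, which reverses lane numbers on big-endian
   targets).  */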
13651 /* Return TRUE if OP is a valid vector addressing mode. */
13653 bool
13654 aarch64_simd_mem_operand_p (rtx op)
13656 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13657 || REG_P (XEXP (op, 0)));
13660 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13662 bool
13663 aarch64_sve_ld1r_operand_p (rtx op)
13665 struct aarch64_address_info addr;
13666 scalar_mode mode;
13668 return (MEM_P (op)
13669 && is_a <scalar_mode> (GET_MODE (op), &mode)
13670 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13671 && addr.type == ADDRESS_REG_IMM
13672 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
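/* In other words, the address must be a plain base register plus an
   unsigned immediate that is a multiple of the element size, i.e. the
   6-bit scaled offset range that LD1R itself accepts.  */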
13675 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13676 The conditions for STR are the same. */
13677 bool
13678 aarch64_sve_ldr_operand_p (rtx op)
13680 struct aarch64_address_info addr;
13682 return (MEM_P (op)
13683 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13684 false, ADDR_QUERY_ANY)
13685 && addr.type == ADDRESS_REG_IMM);
13688 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13689 We need to be able to access the individual pieces, so the range
13690 is different from LD[234] and ST[234]. */
13691 bool
13692 aarch64_sve_struct_memory_operand_p (rtx op)
13694 if (!MEM_P (op))
13695 return false;
13697 machine_mode mode = GET_MODE (op);
13698 struct aarch64_address_info addr;
13699 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13700 ADDR_QUERY_ANY)
13701 || addr.type != ADDRESS_REG_IMM)
13702 return false;
13704 poly_int64 first = addr.const_offset;
13705 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13706 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13707 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13710 /* Emit a register copy from operand to operand, taking care not to
13711 early-clobber source registers in the process.
13713 COUNT is the number of components into which the copy needs to be
13714 decomposed. */
13715 void
13716 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13717 unsigned int count)
13719 unsigned int i;
13720 int rdest = REGNO (operands[0]);
13721 int rsrc = REGNO (operands[1]);
13723 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13724 || rdest < rsrc)
13725 for (i = 0; i < count; i++)
13726 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13727 gen_rtx_REG (mode, rsrc + i));
13728 else
13729 for (i = 0; i < count; i++)
13730 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13731 gen_rtx_REG (mode, rsrc + count - i - 1));
13734 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13735 one of VSTRUCT modes: OI, CI, or XI. */
13737 aarch64_simd_attr_length_rglist (machine_mode mode)
13739 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13740 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13743 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13744 alignment of a vector to 128 bits. SVE predicates have an alignment of
13745 16 bits. */
13746 static HOST_WIDE_INT
13747 aarch64_simd_vector_alignment (const_tree type)
13749 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13750 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13751 be set for non-predicate vectors of booleans. Modes are the most
13752 direct way we have of identifying real SVE predicate types. */
13753 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13754 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13755 return MIN (align, 128);
13758 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13759 static HOST_WIDE_INT
13760 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13762 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13764 /* If the length of the vector is fixed, try to align to that length,
13765 otherwise don't try to align at all. */
13766 HOST_WIDE_INT result;
13767 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13768 result = TYPE_ALIGN (TREE_TYPE (type));
13769 return result;
13771 return TYPE_ALIGN (type);
13774 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13775 static bool
13776 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13778 if (is_packed)
13779 return false;
13781 /* For fixed-length vectors, check that the vectorizer will aim for
13782 full-vector alignment. This isn't true for generic GCC vectors
13783 that are wider than the ABI maximum of 128 bits. */
13784 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13785 && (wi::to_widest (TYPE_SIZE (type))
13786 != aarch64_vectorize_preferred_vector_alignment (type)))
13787 return false;
13789 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13790 return true;
13793 /* Return true if the vector misalignment factor is supported by the
13794 target. */
13795 static bool
13796 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13797 const_tree type, int misalignment,
13798 bool is_packed)
13800 if (TARGET_SIMD && STRICT_ALIGNMENT)
13802 /* Return false if the movmisalign pattern is not supported for this mode. */
13803 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13804 return false;
13806 /* Misalignment factor is unknown at compile time. */
13807 if (misalignment == -1)
13808 return false;
13810 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13811 is_packed);
13814 /* If VALS is a vector constant that can be loaded into a register
13815 using DUP, generate instructions to do so and return an RTX to
13816 assign to the register. Otherwise return NULL_RTX. */
13817 static rtx
13818 aarch64_simd_dup_constant (rtx vals)
13820 machine_mode mode = GET_MODE (vals);
13821 machine_mode inner_mode = GET_MODE_INNER (mode);
13822 rtx x;
13824 if (!const_vec_duplicate_p (vals, &x))
13825 return NULL_RTX;
13827 /* We can load this constant by using DUP and a constant in a
13828 single ARM register. This will be cheaper than a vector
13829 load. */
13830 x = copy_to_mode_reg (inner_mode, x);
13831 return gen_vec_duplicate (mode, x);
13835 /* Generate code to load VALS, which is a PARALLEL containing only
13836 constants (for vec_init) or CONST_VECTOR, efficiently into a
13837 register. Returns an RTX to copy into the register, or NULL_RTX
13838 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13839 static rtx
13840 aarch64_simd_make_constant (rtx vals)
13842 machine_mode mode = GET_MODE (vals);
13843 rtx const_dup;
13844 rtx const_vec = NULL_RTX;
13845 int n_const = 0;
13846 int i;
13848 if (GET_CODE (vals) == CONST_VECTOR)
13849 const_vec = vals;
13850 else if (GET_CODE (vals) == PARALLEL)
13852 /* A CONST_VECTOR must contain only CONST_INTs and
13853 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13854 Only store valid constants in a CONST_VECTOR. */
13855 int n_elts = XVECLEN (vals, 0);
13856 for (i = 0; i < n_elts; ++i)
13858 rtx x = XVECEXP (vals, 0, i);
13859 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13860 n_const++;
13862 if (n_const == n_elts)
13863 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13865 else
13866 gcc_unreachable ();
13868 if (const_vec != NULL_RTX
13869 && aarch64_simd_valid_immediate (const_vec, NULL))
13870 /* Load using MOVI/MVNI. */
13871 return const_vec;
13872 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13873 /* Loaded using DUP. */
13874 return const_dup;
13875 else if (const_vec != NULL_RTX)
13876 /* Load from constant pool. We cannot take advantage of single-cycle
13877 LD1 because we need a PC-relative addressing mode. */
13878 return const_vec;
13879 else
13880 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13881 We cannot construct an initializer. */
13882 return NULL_RTX;
13885 /* Expand a vector initialisation sequence, such that TARGET is
13886 initialised to contain VALS. */
13888 void
13889 aarch64_expand_vector_init (rtx target, rtx vals)
13891 machine_mode mode = GET_MODE (target);
13892 scalar_mode inner_mode = GET_MODE_INNER (mode);
13893 /* The number of vector elements. */
13894 int n_elts = XVECLEN (vals, 0);
13895 /* The number of vector elements which are not constant. */
13896 int n_var = 0;
13897 rtx any_const = NULL_RTX;
13898 /* The first element of vals. */
13899 rtx v0 = XVECEXP (vals, 0, 0);
13900 bool all_same = true;
13902 /* Count the number of variable elements to initialise. */
13903 for (int i = 0; i < n_elts; ++i)
13905 rtx x = XVECEXP (vals, 0, i);
13906 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13907 ++n_var;
13908 else
13909 any_const = x;
13911 all_same &= rtx_equal_p (x, v0);
13914 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13915 how best to handle this. */
13916 if (n_var == 0)
13918 rtx constant = aarch64_simd_make_constant (vals);
13919 if (constant != NULL_RTX)
13921 emit_move_insn (target, constant);
13922 return;
13926 /* Splat a single non-constant element if we can. */
13927 if (all_same)
13929 rtx x = copy_to_mode_reg (inner_mode, v0);
13930 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13931 return;
13934 enum insn_code icode = optab_handler (vec_set_optab, mode);
13935 gcc_assert (icode != CODE_FOR_nothing);
13937 /* If there are only variable elements, try to optimize
13938 the insertion using dup for the most common element
13939 followed by insertions. */
13941 /* The algorithm will fill matches[*][0] with the earliest matching element,
13942 and matches[X][1] with the count of duplicate elements (if X is the
13943 earliest element which has duplicates). */
13945 if (n_var == n_elts && n_elts <= 16)
13947 int matches[16][2] = {0};
13948 for (int i = 0; i < n_elts; i++)
13950 for (int j = 0; j <= i; j++)
13952 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13954 matches[i][0] = j;
13955 matches[j][1]++;
13956 break;
13960 int maxelement = 0;
13961 int maxv = 0;
13962 for (int i = 0; i < n_elts; i++)
13963 if (matches[i][1] > maxv)
13965 maxelement = i;
13966 maxv = matches[i][1];
13969 /* Create a duplicate of the most common element, unless all elements
13970 are equally useless to us, in which case just immediately set the
13971 vector register using the first element. */
13973 if (maxv == 1)
13975 /* For vectors of two 64-bit elements, we can do even better. */
13976 if (n_elts == 2
13977 && (inner_mode == E_DImode
13978 || inner_mode == E_DFmode))
13981 rtx x0 = XVECEXP (vals, 0, 0);
13982 rtx x1 = XVECEXP (vals, 0, 1);
13983 /* Combine can pick up this case, but handling it directly
13984 here leaves clearer RTL.
13986 This is load_pair_lanes<mode>, and also gives us a clean-up
13987 for store_pair_lanes<mode>. */
13988 if (memory_operand (x0, inner_mode)
13989 && memory_operand (x1, inner_mode)
13990 && !STRICT_ALIGNMENT
13991 && rtx_equal_p (XEXP (x1, 0),
13992 plus_constant (Pmode,
13993 XEXP (x0, 0),
13994 GET_MODE_SIZE (inner_mode))))
13996 rtx t;
13997 if (inner_mode == DFmode)
13998 t = gen_load_pair_lanesdf (target, x0, x1);
13999 else
14000 t = gen_load_pair_lanesdi (target, x0, x1);
14001 emit_insn (t);
14002 return;
14005 /* The subreg-move sequence below will move into lane zero of the
14006 vector register. For big-endian we want that position to hold
14007 the last element of VALS. */
14008 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
14009 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14010 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
14012 else
14014 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
14015 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
14018 /* Insert the rest. */
14019 for (int i = 0; i < n_elts; i++)
14021 rtx x = XVECEXP (vals, 0, i);
14022 if (matches[i][0] == maxelement)
14023 continue;
14024 x = copy_to_mode_reg (inner_mode, x);
14025 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14027 return;
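/* A worked example of the matching loop above: for the all-variable
   V4SImode initialiser { x, y, x, x }, matches[0][1] ends up as 3, so x is
   broadcast into every lane first and only lane 1 then needs an insert of
   y; lanes 0, 2 and 3 are skipped because their matches[i][0] equals
   maxelement.  */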
14030 /* Initialise a vector which is part-variable. We want to first try
14031 to build those lanes which are constant in the most efficient way we
14032 can. */
14033 if (n_var != n_elts)
14035 rtx copy = copy_rtx (vals);
14037 /* Load constant part of vector. We really don't care what goes into the
14038 parts we will overwrite, but we're more likely to be able to load the
14039 constant efficiently if it has fewer, larger, repeating parts
14040 (see aarch64_simd_valid_immediate). */
14041 for (int i = 0; i < n_elts; i++)
14043 rtx x = XVECEXP (vals, 0, i);
14044 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14045 continue;
14046 rtx subst = any_const;
14047 for (int bit = n_elts / 2; bit > 0; bit /= 2)
14049 /* Look in the copied vector, as more elements are const. */
14050 rtx test = XVECEXP (copy, 0, i ^ bit);
14051 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14053 subst = test;
14054 break;
14057 XVECEXP (copy, 0, i) = subst;
14059 aarch64_expand_vector_init (target, copy);
14062 /* Insert the variable lanes directly. */
14063 for (int i = 0; i < n_elts; i++)
14065 rtx x = XVECEXP (vals, 0, i);
14066 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14067 continue;
14068 x = copy_to_mode_reg (inner_mode, x);
14069 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14073 static unsigned HOST_WIDE_INT
14074 aarch64_shift_truncation_mask (machine_mode mode)
14076 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14077 return 0;
14078 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14081 /* Select a format to encode pointers in exception handling data. */
14083 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14085 int type;
14086 switch (aarch64_cmodel)
14088 case AARCH64_CMODEL_TINY:
14089 case AARCH64_CMODEL_TINY_PIC:
14090 case AARCH64_CMODEL_SMALL:
14091 case AARCH64_CMODEL_SMALL_PIC:
14092 case AARCH64_CMODEL_SMALL_SPIC:
14093 /* text+got+data < 4GB. 4-byte signed relocs are sufficient
14094 for everything. */
14095 type = DW_EH_PE_sdata4;
14096 break;
14097 default:
14098 /* No assumptions here. 8-byte relocs required. */
14099 type = DW_EH_PE_sdata8;
14100 break;
14102 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14105 /* The last .arch and .tune assembly strings that we printed. */
14106 static std::string aarch64_last_printed_arch_string;
14107 static std::string aarch64_last_printed_tune_string;
14109 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14110 by the function fndecl. */
14112 void
14113 aarch64_declare_function_name (FILE *stream, const char* name,
14114 tree fndecl)
14116 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14118 struct cl_target_option *targ_options;
14119 if (target_parts)
14120 targ_options = TREE_TARGET_OPTION (target_parts);
14121 else
14122 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14123 gcc_assert (targ_options);
14125 const struct processor *this_arch
14126 = aarch64_get_arch (targ_options->x_explicit_arch);
14128 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14129 std::string extension
14130 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14131 this_arch->flags);
14132 /* Only update the assembler .arch string if it is distinct from the last
14133 such string we printed. */
14134 std::string to_print = this_arch->name + extension;
14135 if (to_print != aarch64_last_printed_arch_string)
14137 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14138 aarch64_last_printed_arch_string = to_print;
14141 /* Print the CPU name we're tuning for in the comments; it might be
14142 useful to readers of the generated asm. Do it only when it changes
14143 from function to function and verbose assembly is requested. */
14144 const struct processor *this_tune
14145 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14147 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14149 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14150 this_tune->name);
14151 aarch64_last_printed_tune_string = this_tune->name;
14154 /* Don't forget the type directive for ELF. */
14155 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14156 ASM_OUTPUT_LABEL (stream, name);
14159 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14161 static void
14162 aarch64_start_file (void)
14164 struct cl_target_option *default_options
14165 = TREE_TARGET_OPTION (target_option_default_node);
14167 const struct processor *default_arch
14168 = aarch64_get_arch (default_options->x_explicit_arch);
14169 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14170 std::string extension
14171 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14172 default_arch->flags);
14174 aarch64_last_printed_arch_string = default_arch->name + extension;
14175 aarch64_last_printed_tune_string = "";
14176 asm_fprintf (asm_out_file, "\t.arch %s\n",
14177 aarch64_last_printed_arch_string.c_str ());
14179 default_file_start ();
14182 /* Emit load exclusive. */
14184 static void
14185 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14186 rtx mem, rtx model_rtx)
14188 rtx (*gen) (rtx, rtx, rtx);
14190 switch (mode)
14192 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14193 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14194 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14195 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14196 default:
14197 gcc_unreachable ();
14200 emit_insn (gen (rval, mem, model_rtx));
14203 /* Emit store exclusive. */
14205 static void
14206 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14207 rtx rval, rtx mem, rtx model_rtx)
14209 rtx (*gen) (rtx, rtx, rtx, rtx);
14211 switch (mode)
14213 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14214 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14215 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14216 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14217 default:
14218 gcc_unreachable ();
14221 emit_insn (gen (bval, rval, mem, model_rtx));
14224 /* Mark the previous jump instruction as unlikely. */
14226 static void
14227 aarch64_emit_unlikely_jump (rtx insn)
14229 rtx_insn *jump = emit_jump_insn (insn);
14230 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14233 /* Expand a compare and swap pattern. */
14235 void
14236 aarch64_expand_compare_and_swap (rtx operands[])
14238 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14239 machine_mode mode, cmp_mode;
14240 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14241 int idx;
14242 gen_cas_fn gen;
14243 const gen_cas_fn split_cas[] =
14245 gen_aarch64_compare_and_swapqi,
14246 gen_aarch64_compare_and_swaphi,
14247 gen_aarch64_compare_and_swapsi,
14248 gen_aarch64_compare_and_swapdi
14250 const gen_cas_fn atomic_cas[] =
14252 gen_aarch64_compare_and_swapqi_lse,
14253 gen_aarch64_compare_and_swaphi_lse,
14254 gen_aarch64_compare_and_swapsi_lse,
14255 gen_aarch64_compare_and_swapdi_lse
14258 bval = operands[0];
14259 rval = operands[1];
14260 mem = operands[2];
14261 oldval = operands[3];
14262 newval = operands[4];
14263 is_weak = operands[5];
14264 mod_s = operands[6];
14265 mod_f = operands[7];
14266 mode = GET_MODE (mem);
14267 cmp_mode = mode;
14269 /* Normally the succ memory model must be stronger than fail, but in the
14270 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14271 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14273 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14274 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14275 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14277 switch (mode)
14279 case E_QImode:
14280 case E_HImode:
14281 /* For short modes, we're going to perform the comparison in SImode,
14282 so do the zero-extension now. */
14283 cmp_mode = SImode;
14284 rval = gen_reg_rtx (SImode);
14285 oldval = convert_modes (SImode, mode, oldval, true);
14286 /* Fall through. */
14288 case E_SImode:
14289 case E_DImode:
14290 /* Force the value into a register if needed. */
14291 if (!aarch64_plus_operand (oldval, mode))
14292 oldval = force_reg (cmp_mode, oldval);
14293 break;
14295 default:
14296 gcc_unreachable ();
14299 switch (mode)
14301 case E_QImode: idx = 0; break;
14302 case E_HImode: idx = 1; break;
14303 case E_SImode: idx = 2; break;
14304 case E_DImode: idx = 3; break;
14305 default:
14306 gcc_unreachable ();
14308 if (TARGET_LSE)
14309 gen = atomic_cas[idx];
14310 else
14311 gen = split_cas[idx];
14313 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14315 if (mode == QImode || mode == HImode)
14316 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14318 x = gen_rtx_REG (CCmode, CC_REGNUM);
14319 x = gen_rtx_EQ (SImode, x, const0_rtx);
14320 emit_insn (gen_rtx_SET (bval, x));
14323 /* Test whether the target supports using an atomic load-operate instruction.
14324 CODE is the operation. Returns FALSE if the operation isn't supported by
14325 the architecture. */
14329 bool
14330 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14332 if (!TARGET_LSE)
14333 return false;
14335 switch (code)
14337 case SET:
14338 case AND:
14339 case IOR:
14340 case XOR:
14341 case MINUS:
14342 case PLUS:
14343 return true;
14344 default:
14345 return false;
14349 /* Emit a barrier appropriate for memory model MODEL at the end of a
14350 sequence implementing an atomic operation. */
14352 static void
14353 aarch64_emit_post_barrier (enum memmodel model)
14355 const enum memmodel base_model = memmodel_base (model);
14357 if (is_mm_sync (model)
14358 && (base_model == MEMMODEL_ACQUIRE
14359 || base_model == MEMMODEL_ACQ_REL
14360 || base_model == MEMMODEL_SEQ_CST))
14362 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14366 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14367 for the data in memory. EXPECTED is the value expected to be in memory.
14368 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14369 is the memory ordering to use. */
14371 void
14372 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14373 rtx expected, rtx desired,
14374 rtx model)
14376 rtx (*gen) (rtx, rtx, rtx, rtx);
14377 machine_mode mode;
14379 mode = GET_MODE (mem);
14381 switch (mode)
14383 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14384 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14385 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14386 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14387 default:
14388 gcc_unreachable ();
14391 /* Move the expected value into the CAS destination register. */
14392 emit_insn (gen_rtx_SET (rval, expected));
14394 /* Emit the CAS. */
14395 emit_insn (gen (rval, mem, desired, model));
14397 /* Compare the expected value with the value loaded by the CAS, to establish
14398 whether the swap was made. */
14399 aarch64_gen_compare_reg (EQ, rval, expected);
14402 /* Split a compare and swap pattern. */
14404 void
14405 aarch64_split_compare_and_swap (rtx operands[])
14407 rtx rval, mem, oldval, newval, scratch;
14408 machine_mode mode;
14409 bool is_weak;
14410 rtx_code_label *label1, *label2;
14411 rtx x, cond;
14412 enum memmodel model;
14413 rtx model_rtx;
14415 rval = operands[0];
14416 mem = operands[1];
14417 oldval = operands[2];
14418 newval = operands[3];
14419 is_weak = (operands[4] != const0_rtx);
14420 model_rtx = operands[5];
14421 scratch = operands[7];
14422 mode = GET_MODE (mem);
14423 model = memmodel_from_int (INTVAL (model_rtx));
14425 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14426 loop:
14427 .label1:
14428 LD[A]XR rval, [mem]
14429 CBNZ rval, .label2
14430 ST[L]XR scratch, newval, [mem]
14431 CBNZ scratch, .label1
14432 .label2:
14433 CMP rval, 0. */
14434 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14436 label1 = NULL;
14437 if (!is_weak)
14439 label1 = gen_label_rtx ();
14440 emit_label (label1);
14442 label2 = gen_label_rtx ();
14444 /* The initial load can be relaxed for a __sync operation since a final
14445 barrier will be emitted to stop code hoisting. */
14446 if (is_mm_sync (model))
14447 aarch64_emit_load_exclusive (mode, rval, mem,
14448 GEN_INT (MEMMODEL_RELAXED));
14449 else
14450 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14452 if (strong_zero_p)
14454 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14455 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14456 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14457 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14459 else
14461 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14462 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14463 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14464 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14465 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14468 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14470 if (!is_weak)
14472 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14473 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14474 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14475 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14477 else
14479 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14480 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14481 emit_insn (gen_rtx_SET (cond, x));
14484 emit_label (label2);
14485 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
14486 to set the condition flags. If this is not used it will be removed by
14487 later passes. */
14488 if (strong_zero_p)
14490 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14491 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14492 emit_insn (gen_rtx_SET (cond, x));
14494 /* Emit any final barrier needed for a __sync operation. */
14495 if (is_mm_sync (model))
14496 aarch64_emit_post_barrier (model);
14499 /* Emit a BIC instruction. */
14501 static void
14502 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14504 rtx shift_rtx = GEN_INT (shift);
14505 rtx (*gen) (rtx, rtx, rtx, rtx);
14507 switch (mode)
14509 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14510 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14511 default:
14512 gcc_unreachable ();
14515 emit_insn (gen (dst, s2, shift_rtx, s1));
14518 /* Emit an atomic swap. */
14520 static void
14521 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14522 rtx mem, rtx model)
14524 rtx (*gen) (rtx, rtx, rtx, rtx);
14526 switch (mode)
14528 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14529 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14530 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14531 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14532 default:
14533 gcc_unreachable ();
14536 emit_insn (gen (dst, mem, value, model));
14539 /* Operations supported by aarch64_emit_atomic_load_op. */
14541 enum aarch64_atomic_load_op_code
14543 AARCH64_LDOP_PLUS, /* A + B */
14544 AARCH64_LDOP_XOR, /* A ^ B */
14545 AARCH64_LDOP_OR, /* A | B */
14546 AARCH64_LDOP_BIC /* A & ~B */
14549 /* Emit an atomic load-operate. */
14551 static void
14552 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14553 machine_mode mode, rtx dst, rtx src,
14554 rtx mem, rtx model)
14556 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14557 const aarch64_atomic_load_op_fn plus[] =
14559 gen_aarch64_atomic_loadaddqi,
14560 gen_aarch64_atomic_loadaddhi,
14561 gen_aarch64_atomic_loadaddsi,
14562 gen_aarch64_atomic_loadadddi
14564 const aarch64_atomic_load_op_fn eor[] =
14566 gen_aarch64_atomic_loadeorqi,
14567 gen_aarch64_atomic_loadeorhi,
14568 gen_aarch64_atomic_loadeorsi,
14569 gen_aarch64_atomic_loadeordi
14571 const aarch64_atomic_load_op_fn ior[] =
14573 gen_aarch64_atomic_loadsetqi,
14574 gen_aarch64_atomic_loadsethi,
14575 gen_aarch64_atomic_loadsetsi,
14576 gen_aarch64_atomic_loadsetdi
14578 const aarch64_atomic_load_op_fn bic[] =
14580 gen_aarch64_atomic_loadclrqi,
14581 gen_aarch64_atomic_loadclrhi,
14582 gen_aarch64_atomic_loadclrsi,
14583 gen_aarch64_atomic_loadclrdi
14585 aarch64_atomic_load_op_fn gen;
14586 int idx = 0;
14588 switch (mode)
14590 case E_QImode: idx = 0; break;
14591 case E_HImode: idx = 1; break;
14592 case E_SImode: idx = 2; break;
14593 case E_DImode: idx = 3; break;
14594 default:
14595 gcc_unreachable ();
14598 switch (code)
14600 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14601 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14602 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14603 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14604 default:
14605 gcc_unreachable ();
14608 emit_insn (gen (dst, mem, src, model));
14611 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14612 location to store the data read from memory. OUT_RESULT is the location to
14613 store the result of the operation. MEM is the memory location to read and
14614 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14615 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14616 be NULL. */
14618 void
14619 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14620 rtx mem, rtx value, rtx model_rtx)
14622 machine_mode mode = GET_MODE (mem);
14623 machine_mode wmode = (mode == DImode ? DImode : SImode);
14624 const bool short_mode = (mode < SImode);
14625 aarch64_atomic_load_op_code ldop_code;
14626 rtx src;
14627 rtx x;
14629 if (out_data)
14630 out_data = gen_lowpart (mode, out_data);
14632 if (out_result)
14633 out_result = gen_lowpart (mode, out_result);
14635 /* Make sure the value is in a register, putting it into a destination
14636 register if it needs to be manipulated. */
14637 if (!register_operand (value, mode)
14638 || code == AND || code == MINUS)
14640 src = out_result ? out_result : out_data;
14641 emit_move_insn (src, gen_lowpart (mode, value));
14643 else
14644 src = value;
14645 gcc_assert (register_operand (src, mode));
14647 /* Preprocess the data for the operation as necessary. If the operation is
14648 a SET then emit a swap instruction and finish. */
14649 switch (code)
14651 case SET:
14652 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14653 return;
14655 case MINUS:
14656 /* Negate the value and treat it as a PLUS. */
14658 rtx neg_src;
14660 /* Resize the value if necessary. */
14661 if (short_mode)
14662 src = gen_lowpart (wmode, src);
14664 neg_src = gen_rtx_NEG (wmode, src);
14665 emit_insn (gen_rtx_SET (src, neg_src));
14667 if (short_mode)
14668 src = gen_lowpart (mode, src);
14670 /* Fall-through. */
14671 case PLUS:
14672 ldop_code = AARCH64_LDOP_PLUS;
14673 break;
14675 case IOR:
14676 ldop_code = AARCH64_LDOP_OR;
14677 break;
14679 case XOR:
14680 ldop_code = AARCH64_LDOP_XOR;
14681 break;
14683 case AND:
14685 rtx not_src;
14687 /* Resize the value if necessary. */
14688 if (short_mode)
14689 src = gen_lowpart (wmode, src);
14691 not_src = gen_rtx_NOT (wmode, src);
14692 emit_insn (gen_rtx_SET (src, not_src));
14694 if (short_mode)
14695 src = gen_lowpart (mode, src);
14697 ldop_code = AARCH64_LDOP_BIC;
14698 break;
14700 default:
14701 /* The operation can't be done with atomic instructions. */
14702 gcc_unreachable ();
14705 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14707 /* If necessary, calculate the data in memory after the update by redoing the
14708 operation from values in registers. */
14709 if (!out_result)
14710 return;
14712 if (short_mode)
14714 src = gen_lowpart (wmode, src);
14715 out_data = gen_lowpart (wmode, out_data);
14716 out_result = gen_lowpart (wmode, out_result);
14719 x = NULL_RTX;
14721 switch (code)
14723 case MINUS:
14724 case PLUS:
14725 x = gen_rtx_PLUS (wmode, out_data, src);
14726 break;
14727 case IOR:
14728 x = gen_rtx_IOR (wmode, out_data, src);
14729 break;
14730 case XOR:
14731 x = gen_rtx_XOR (wmode, out_data, src);
14732 break;
14733 case AND:
14734 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14735 return;
14736 default:
14737 gcc_unreachable ();
14740 emit_set_insn (out_result, x);
14742 return;
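/* Note on the AND case above: LSE provides no load-AND instruction, so SRC
   is inverted and the LDCLR (atomic BIC) form is used, relying on
   M & V == M & ~(~V).  When OUT_RESULT is requested it is then recomputed
   as out_data & ~SRC, and since SRC already holds ~VALUE this again equals
   out_data & VALUE.  */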
14745 /* Split an atomic operation. */
14747 void
14748 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14749 rtx value, rtx model_rtx, rtx cond)
14751 machine_mode mode = GET_MODE (mem);
14752 machine_mode wmode = (mode == DImode ? DImode : SImode);
14753 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14754 const bool is_sync = is_mm_sync (model);
14755 rtx_code_label *label;
14756 rtx x;
14758 /* Split the atomic operation into a sequence. */
14759 label = gen_label_rtx ();
14760 emit_label (label);
14762 if (new_out)
14763 new_out = gen_lowpart (wmode, new_out);
14764 if (old_out)
14765 old_out = gen_lowpart (wmode, old_out);
14766 else
14767 old_out = new_out;
14768 value = simplify_gen_subreg (wmode, value, mode, 0);
14770 /* The initial load can be relaxed for a __sync operation since a final
14771 barrier will be emitted to stop code hoisting. */
14772 if (is_sync)
14773 aarch64_emit_load_exclusive (mode, old_out, mem,
14774 GEN_INT (MEMMODEL_RELAXED));
14775 else
14776 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14778 switch (code)
14780 case SET:
14781 new_out = value;
14782 break;
14784 case NOT:
14785 x = gen_rtx_AND (wmode, old_out, value);
14786 emit_insn (gen_rtx_SET (new_out, x));
14787 x = gen_rtx_NOT (wmode, new_out);
14788 emit_insn (gen_rtx_SET (new_out, x));
14789 break;
14791 case MINUS:
14792 if (CONST_INT_P (value))
14794 value = GEN_INT (-INTVAL (value));
14795 code = PLUS;
14797 /* Fall through. */
14799 default:
14800 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14801 emit_insn (gen_rtx_SET (new_out, x));
14802 break;
14805 aarch64_emit_store_exclusive (mode, cond, mem,
14806 gen_lowpart (mode, new_out), model_rtx);
14808 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14809 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14810 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14811 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14813 /* Emit any final barrier needed for a __sync operation. */
14814 if (is_sync)
14815 aarch64_emit_post_barrier (model);
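/* The split above produces the usual load-exclusive/store-exclusive retry
   loop, roughly (a sketch, with the acquire/release forms chosen from
   MODEL_RTX):

     .Lretry:
       ld[a]xr   old, [mem]
       <op>      new, old, value
       st[l]xr   w_status, new, [mem]
       cbnz      w_status, .Lretry  */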
14818 static void
14819 aarch64_init_libfuncs (void)
14821 /* Half-precision float operations. The compiler handles all operations
14822 with NULL libfuncs by converting to SFmode. */
14824 /* Conversions. */
14825 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14826 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14828 /* Arithmetic. */
14829 set_optab_libfunc (add_optab, HFmode, NULL);
14830 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14831 set_optab_libfunc (smul_optab, HFmode, NULL);
14832 set_optab_libfunc (neg_optab, HFmode, NULL);
14833 set_optab_libfunc (sub_optab, HFmode, NULL);
14835 /* Comparisons. */
14836 set_optab_libfunc (eq_optab, HFmode, NULL);
14837 set_optab_libfunc (ne_optab, HFmode, NULL);
14838 set_optab_libfunc (lt_optab, HFmode, NULL);
14839 set_optab_libfunc (le_optab, HFmode, NULL);
14840 set_optab_libfunc (ge_optab, HFmode, NULL);
14841 set_optab_libfunc (gt_optab, HFmode, NULL);
14842 set_optab_libfunc (unord_optab, HFmode, NULL);
14845 /* Target hook for c_mode_for_suffix. */
14846 static machine_mode
14847 aarch64_c_mode_for_suffix (char suffix)
14849 if (suffix == 'q')
14850 return TFmode;
14852 return VOIDmode;
14855 /* We can only represent floating point constants which will fit in
14856 "quarter-precision" values. These values are characterised by
14857 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14860 (-1)^s * (n/16) * 2^r
14862 Where:
14863 's' is the sign bit.
14864 'n' is an integer in the range 16 <= n <= 31.
14865 'r' is an integer in the range -3 <= r <= 4. */
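/* For example, 1.0 is (16/16) * 2^0, 0.5 is (16/16) * 2^-1 and 10.0 is
   (20/16) * 2^3, so all three are representable; 0.0 is not, and the
   largest representable magnitude is (31/16) * 2^4 == 31.0.  */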
14867 /* Return true iff X can be represented by a quarter-precision
14868 floating point immediate operand. Note, we cannot represent 0.0. */
14869 bool
14870 aarch64_float_const_representable_p (rtx x)
14872 /* This represents our current view of how many bits
14873 make up the mantissa. */
14874 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14875 int exponent;
14876 unsigned HOST_WIDE_INT mantissa, mask;
14877 REAL_VALUE_TYPE r, m;
14878 bool fail;
14880 if (!CONST_DOUBLE_P (x))
14881 return false;
14883 /* We don't support HFmode constants yet. */
14884 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14885 return false;
14887 r = *CONST_DOUBLE_REAL_VALUE (x);
14889 /* We cannot represent infinities, NaNs or +/-zero. We won't
14890 know if we have +zero until we analyse the mantissa, but we
14891 can reject the other invalid values. */
14892 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14893 || REAL_VALUE_MINUS_ZERO (r))
14894 return false;
14896 /* Extract exponent. */
14897 r = real_value_abs (&r);
14898 exponent = REAL_EXP (&r);
14900 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14901 highest (sign) bit, with a fixed binary point at bit point_pos.
14902 m1 holds the low part of the mantissa, m2 the high part.
14903 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14904 bits for the mantissa, this can fail (low bits will be lost). */
14905 real_ldexp (&m, &r, point_pos - exponent);
14906 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14908 /* If the low part of the mantissa has bits set we cannot represent
14909 the value. */
14910 if (w.ulow () != 0)
14911 return false;
14912 /* We have rejected the lower HOST_WIDE_INT, so update our
14913 understanding of how many bits lie in the mantissa and
14914 look only at the high HOST_WIDE_INT. */
14915 mantissa = w.elt (1);
14916 point_pos -= HOST_BITS_PER_WIDE_INT;
14918 /* We can only represent values with a mantissa of the form 1.xxxx. */
14919 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14920 if ((mantissa & mask) != 0)
14921 return false;
14923 /* Having filtered unrepresentable values, we may now remove all
14924 but the highest 5 bits. */
14925 mantissa >>= point_pos - 5;
14927 /* We cannot represent the value 0.0, so reject it. This is handled
14928 elsewhere. */
14929 if (mantissa == 0)
14930 return false;
14932 /* Then, as bit 4 is always set, we can mask it off, leaving
14933 the mantissa in the range [0, 15]. */
14934 mantissa &= ~(1 << 4);
14935 gcc_assert (mantissa <= 15);
14937 /* GCC internally does not use IEEE754-like encoding (where normalized
14938 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14939 Our mantissa values are shifted 4 places to the left relative to
14940 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14941 by 5 places to correct for GCC's representation. */
14942 exponent = 5 - exponent;
14944 return (exponent >= 0 && exponent <= 7);
14947 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14948 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14949 output MOVI/MVNI, ORR or BIC immediate. */
14950 char*
14951 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14952 enum simd_immediate_check which)
14954 bool is_valid;
14955 static char templ[40];
14956 const char *mnemonic;
14957 const char *shift_op;
14958 unsigned int lane_count = 0;
14959 char element_char;
14961 struct simd_immediate_info info;
14963 /* This will return true to show const_vector is legal for use as either
14964 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14965 It will also update INFO to show how the immediate should be generated.
14966 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14967 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14968 gcc_assert (is_valid);
14970 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14971 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14973 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14975 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14976 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14977 move immediate path. */
14978 if (aarch64_float_const_zero_rtx_p (info.value))
14979 info.value = GEN_INT (0);
14980 else
14982 const unsigned int buf_size = 20;
14983 char float_buf[buf_size] = {'\0'};
14984 real_to_decimal_for_mode (float_buf,
14985 CONST_DOUBLE_REAL_VALUE (info.value),
14986 buf_size, buf_size, 1, info.elt_mode);
14988 if (lane_count == 1)
14989 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14990 else
14991 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14992 lane_count, element_char, float_buf);
14993 return templ;
14997 gcc_assert (CONST_INT_P (info.value));
14999 if (which == AARCH64_CHECK_MOV)
15001 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
15002 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
15003 if (lane_count == 1)
15004 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
15005 mnemonic, UINTVAL (info.value));
15006 else if (info.shift)
15007 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15008 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
15009 element_char, UINTVAL (info.value), shift_op, info.shift);
15010 else
15011 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
15012 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
15013 element_char, UINTVAL (info.value));
15015 else
15017 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
15018 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
15019 if (info.shift)
15020 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15021 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
15022 element_char, UINTVAL (info.value), "lsl", info.shift);
15023 else
15024 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
15025 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
15026 element_char, UINTVAL (info.value));
15028 return templ;
15031 char*
15032 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
15035 /* If a floating-point number was passed and we want to use it in an
15036 integer mode, do the conversion to integer. */
15037 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
15039 unsigned HOST_WIDE_INT ival;
15040 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
15041 gcc_unreachable ();
15042 immediate = gen_int_mode (ival, mode);
15045 machine_mode vmode;
15046 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
15047 a 128-bit vector mode. */
15048 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15050 vmode = aarch64_simd_container_mode (mode, width);
15051 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15052 return aarch64_output_simd_mov_immediate (v_op, width);
15055 /* Return the output string to use for moving immediate CONST_VECTOR
15056 into an SVE register. */
15058 char *
15059 aarch64_output_sve_mov_immediate (rtx const_vector)
15061 static char templ[40];
15062 struct simd_immediate_info info;
15063 char element_char;
15065 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15066 gcc_assert (is_valid);
15068 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15070 if (info.step)
15072 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15073 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15074 element_char, INTVAL (info.value), INTVAL (info.step));
15075 return templ;
15078 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15080 if (aarch64_float_const_zero_rtx_p (info.value))
15081 info.value = GEN_INT (0);
15082 else
15084 const int buf_size = 20;
15085 char float_buf[buf_size] = {};
15086 real_to_decimal_for_mode (float_buf,
15087 CONST_DOUBLE_REAL_VALUE (info.value),
15088 buf_size, buf_size, 1, info.elt_mode);
15090 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15091 element_char, float_buf);
15092 return templ;
15096 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15097 element_char, INTVAL (info.value));
15098 return templ;
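/* For instance, a byte-element series starting at 0 with step 1 comes out
   as "index\t%0.b, #0, #1", while an integer splat of 7 would use the
   "mov" form above.  */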
15101 /* Return the asm format for a PTRUE instruction whose destination has
15102 mode MODE. SUFFIX is the element size suffix. */
15104 char *
15105 aarch64_output_ptrue (machine_mode mode, char suffix)
15107 unsigned int nunits;
15108 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15109 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15110 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15111 else
15112 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15113 return buf;
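/* So, for example, with SUFFIX == 'b' a fixed-length 16-element predicate
   produces "ptrue\t%0.b, vl16", while a variable-length one produces
   "ptrue\t%0.b, all".  */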
15116 /* Split operands into moves from op[1] + op[2] into op[0]. */
15118 void
15119 aarch64_split_combinev16qi (rtx operands[3])
15121 unsigned int dest = REGNO (operands[0]);
15122 unsigned int src1 = REGNO (operands[1]);
15123 unsigned int src2 = REGNO (operands[2]);
15124 machine_mode halfmode = GET_MODE (operands[1]);
15125 unsigned int halfregs = REG_NREGS (operands[1]);
15126 rtx destlo, desthi;
15128 gcc_assert (halfmode == V16QImode);
15130 if (src1 == dest && src2 == dest + halfregs)
15132 /* No-op move. Can't split to nothing; emit something. */
15133 emit_note (NOTE_INSN_DELETED);
15134 return;
15137 /* Preserve register attributes for variable tracking. */
15138 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15139 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15140 GET_MODE_SIZE (halfmode));
15142 /* Special case of reversed high/low parts. */
15143 if (reg_overlap_mentioned_p (operands[2], destlo)
15144 && reg_overlap_mentioned_p (operands[1], desthi))
15146 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15147 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15148 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15150 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15152 /* Try to avoid unnecessary moves if part of the result
15153 is in the right place already. */
15154 if (src1 != dest)
15155 emit_move_insn (destlo, operands[1]);
15156 if (src2 != dest + halfregs)
15157 emit_move_insn (desthi, operands[2]);
15159 else
15161 if (src2 != dest + halfregs)
15162 emit_move_insn (desthi, operands[2]);
15163 if (src1 != dest)
15164 emit_move_insn (destlo, operands[1]);
15168 /* vec_perm support. */
15170 struct expand_vec_perm_d
15172 rtx target, op0, op1;
15173 vec_perm_indices perm;
15174 machine_mode vmode;
15175 unsigned int vec_flags;
15176 bool one_vector_p;
15177 bool testing_p;
15180 /* Generate a variable permutation. */
15182 static void
15183 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15185 machine_mode vmode = GET_MODE (target);
15186 bool one_vector_p = rtx_equal_p (op0, op1);
15188 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15189 gcc_checking_assert (GET_MODE (op0) == vmode);
15190 gcc_checking_assert (GET_MODE (op1) == vmode);
15191 gcc_checking_assert (GET_MODE (sel) == vmode);
15192 gcc_checking_assert (TARGET_SIMD);
15194 if (one_vector_p)
15196 if (vmode == V8QImode)
15198 /* Expand the argument to a V16QI mode by duplicating it. */
15199 rtx pair = gen_reg_rtx (V16QImode);
15200 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15201 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15203 else
15205 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15208 else
15210 rtx pair;
15212 if (vmode == V8QImode)
15214 pair = gen_reg_rtx (V16QImode);
15215 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15216 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15218 else
15220 pair = gen_reg_rtx (OImode);
15221 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15222 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15227 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15228 NELT is the number of elements in the vector. */
15230 void
15231 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15232 unsigned int nelt)
15234 machine_mode vmode = GET_MODE (target);
15235 bool one_vector_p = rtx_equal_p (op0, op1);
15236 rtx mask;
15238 /* The TBL instruction does not use a modulo index, so we must take care
15239 of that ourselves. */
15240 mask = aarch64_simd_gen_const_vector_dup (vmode,
15241 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15242 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15244 /* For big-endian, we also need to reverse the index within the vector
15245 (but not which vector). */
15246 if (BYTES_BIG_ENDIAN)
15248 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15249 if (!one_vector_p)
15250 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15251 sel = expand_simple_binop (vmode, XOR, sel, mask,
15252 NULL, 0, OPTAB_LIB_WIDEN);
15254 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
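/* As an illustration: for a two-input V16QImode permute the indices are
   first ANDed with 31 to recover TBL's expected modulo behaviour, and on
   big-endian they are additionally XORed with 15, which reverses the lane
   number within each input vector while leaving the "which vector" bit
   untouched.  */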
15257 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15259 static void
15260 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15262 emit_insn (gen_rtx_SET (target,
15263 gen_rtx_UNSPEC (GET_MODE (target),
15264 gen_rtvec (2, op0, op1), code)));
15267 /* Expand an SVE vec_perm with the given operands. */
15269 void
15270 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15272 machine_mode data_mode = GET_MODE (target);
15273 machine_mode sel_mode = GET_MODE (sel);
15274 /* Enforced by the pattern condition. */
15275 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15277 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15278 size of the two value vectors, i.e. the upper bits of the indices
15279 are effectively ignored. SVE TBL instead produces 0 for any
15280 out-of-range indices, so we need to modulo all the vec_perm indices
15281 to ensure they are all in range. */
15282 rtx sel_reg = force_reg (sel_mode, sel);
15284 /* Check if the sel only references the first values vector. */
15285 if (GET_CODE (sel) == CONST_VECTOR
15286 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15288 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15289 return;
15292 /* Check if the two values vectors are the same. */
15293 if (rtx_equal_p (op0, op1))
15295 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15296 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15297 NULL, 0, OPTAB_DIRECT);
15298 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15299 return;
15302 /* Run TBL on each value vector and combine the results. */
15304 rtx res0 = gen_reg_rtx (data_mode);
15305 rtx res1 = gen_reg_rtx (data_mode);
15306 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15307 if (GET_CODE (sel) != CONST_VECTOR
15308 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15310 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15311 2 * nunits - 1);
15312 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15313 NULL, 0, OPTAB_DIRECT);
15315 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15316 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15317 NULL, 0, OPTAB_DIRECT);
15318 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15319 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15320 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15321 else
15322 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15325 /* Recognize patterns suitable for the TRN instructions. */
15326 static bool
15327 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15329 HOST_WIDE_INT odd;
15330 poly_uint64 nelt = d->perm.length ();
15331 rtx out, in0, in1, x;
15332 machine_mode vmode = d->vmode;
15334 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15335 return false;
15337 /* Note that these are little-endian tests.
15338 We correct for big-endian later. */
15339 if (!d->perm[0].is_constant (&odd)
15340 || (odd != 0 && odd != 1)
15341 || !d->perm.series_p (0, 2, odd, 2)
15342 || !d->perm.series_p (1, 2, nelt + odd, 2))
15343 return false;
15345 /* Success! */
15346 if (d->testing_p)
15347 return true;
15349 in0 = d->op0;
15350 in1 = d->op1;
15351 /* We don't need a big-endian lane correction for SVE; see the comment
15352 at the head of aarch64-sve.md for details. */
15353 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15355 x = in0, in0 = in1, in1 = x;
15356 odd = !odd;
15358 out = d->target;
15360 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15361 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15362 return true;
15365 /* Recognize patterns suitable for the UZP instructions. */
15366 static bool
15367 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15369 HOST_WIDE_INT odd;
15370 rtx out, in0, in1, x;
15371 machine_mode vmode = d->vmode;
15373 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15374 return false;
15376 /* Note that these are little-endian tests.
15377 We correct for big-endian later. */
15378 if (!d->perm[0].is_constant (&odd)
15379 || (odd != 0 && odd != 1)
15380 || !d->perm.series_p (0, 1, odd, 2))
15381 return false;
15383 /* Success! */
15384 if (d->testing_p)
15385 return true;
15387 in0 = d->op0;
15388 in1 = d->op1;
15389 /* We don't need a big-endian lane correction for SVE; see the comment
15390 at the head of aarch64-sve.md for details. */
15391 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15393 x = in0, in0 = in1, in1 = x;
15394 odd = !odd;
15396 out = d->target;
15398 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15399 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15400 return true;
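/* For example, with V4SI chosen purely for illustration:

     UZP1: { 0, 2, 4, 6 }   (odd == 0)
     UZP2: { 1, 3, 5, 7 }   (odd == 1)  */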
15403 /* Recognize patterns suitable for the ZIP instructions. */
15404 static bool
15405 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15407 unsigned int high;
15408 poly_uint64 nelt = d->perm.length ();
15409 rtx out, in0, in1, x;
15410 machine_mode vmode = d->vmode;
15412 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15413 return false;
15415 /* Note that these are little-endian tests.
15416 We correct for big-endian later. */
15417 poly_uint64 first = d->perm[0];
15418 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15419 || !d->perm.series_p (0, 2, first, 1)
15420 || !d->perm.series_p (1, 2, first + nelt, 1))
15421 return false;
15422 high = maybe_ne (first, 0U);
15424 /* Success! */
15425 if (d->testing_p)
15426 return true;
15428 in0 = d->op0;
15429 in1 = d->op1;
15430 /* We don't need a big-endian lane correction for SVE; see the comment
15431 at the head of aarch64-sve.md for details. */
15432 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15434 x = in0, in0 = in1, in1 = x;
15435 high = !high;
15437 out = d->target;
15439 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15440 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15441 return true;
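/* For example, with V4SI chosen purely for illustration:

     ZIP1: { 0, 4, 1, 5 }   (first == 0)
     ZIP2: { 2, 6, 3, 7 }   (first == nelt / 2)  */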
15444 /* Recognize patterns for the EXT insn. */
15446 static bool
15447 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15449 HOST_WIDE_INT location;
15450 rtx offset;
15452 /* The first element always refers to the first vector.
15453 Check if the extracted indices are increasing by one. */
15454 if (d->vec_flags == VEC_SVE_PRED
15455 || !d->perm[0].is_constant (&location)
15456 || !d->perm.series_p (0, 1, location, 1))
15457 return false;
15459 /* Success! */
15460 if (d->testing_p)
15461 return true;
15463 /* The case where (location == 0) is a no-op for both big- and little-endian,
15464 and is removed by the mid-end at optimization levels -O1 and higher.
15466 We don't need a big-endian lane correction for SVE; see the comment
15467 at the head of aarch64-sve.md for details. */
15468 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15470 /* After setup, we want the high elements of the first vector (stored
15471 at the LSB end of the register), and the low elements of the second
15472 vector (stored at the MSB end of the register). So swap. */
15473 std::swap (d->op0, d->op1);
15474 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15475 to_constant () is safe since this is restricted to Advanced SIMD
15476 vectors. */
15477 location = d->perm.length ().to_constant () - location;
15480 offset = GEN_INT (location);
15481 emit_set_insn (d->target,
15482 gen_rtx_UNSPEC (d->vmode,
15483 gen_rtvec (3, d->op0, d->op1, offset),
15484 UNSPEC_EXT));
15485 return true;
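/* For example, with V4SI chosen purely for illustration, the
   permutation { 1, 2, 3, 4 } takes the last three elements of the
   first vector followed by the first element of the second vector;
   it is matched above with location == 1.  */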
15488 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15489 within each 64-bit, 32-bit or 16-bit granule. */
15491 static bool
15492 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15494 HOST_WIDE_INT diff;
15495 unsigned int i, size, unspec;
15496 machine_mode pred_mode;
15498 if (d->vec_flags == VEC_SVE_PRED
15499 || !d->one_vector_p
15500 || !d->perm[0].is_constant (&diff))
15501 return false;
15503 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15504 if (size == 8)
15506 unspec = UNSPEC_REV64;
15507 pred_mode = VNx2BImode;
15509 else if (size == 4)
15511 unspec = UNSPEC_REV32;
15512 pred_mode = VNx4BImode;
15514 else if (size == 2)
15516 unspec = UNSPEC_REV16;
15517 pred_mode = VNx8BImode;
15519 else
15520 return false;
15522 unsigned int step = diff + 1;
15523 for (i = 0; i < step; ++i)
15524 if (!d->perm.series_p (i, step, diff - i, step))
15525 return false;
15527 /* Success! */
15528 if (d->testing_p)
15529 return true;
15531 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15532 if (d->vec_flags == VEC_SVE_DATA)
15534 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15535 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15536 UNSPEC_MERGE_PTRUE);
15538 emit_set_insn (d->target, src);
15539 return true;
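/* For example, with V8QI chosen purely for illustration, the
   permutation { 3, 2, 1, 0, 7, 6, 5, 4 } has diff == 3 and a unit
   size of one byte, so size == 4 and it is matched as REV32
   (reverse the bytes within each 32-bit granule).  */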
15542 /* Recognize patterns for the REV insn, which reverses elements within
15543 a full vector. */
15545 static bool
15546 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15548 poly_uint64 nelt = d->perm.length ();
15550 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15551 return false;
15553 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15554 return false;
15556 /* Success! */
15557 if (d->testing_p)
15558 return true;
15560 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15561 emit_set_insn (d->target, src);
15562 return true;
15565 static bool
15566 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15568 rtx out = d->target;
15569 rtx in0;
15570 HOST_WIDE_INT elt;
15571 machine_mode vmode = d->vmode;
15572 rtx lane;
15574 if (d->vec_flags == VEC_SVE_PRED
15575 || d->perm.encoding ().encoded_nelts () != 1
15576 || !d->perm[0].is_constant (&elt))
15577 return false;
15579 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15580 return false;
15582 /* Success! */
15583 if (d->testing_p)
15584 return true;
15586 /* The generic preparation in aarch64_expand_vec_perm_const_1
15587 swaps the operand order and the permute indices if it finds
15588 d->perm[0] to be in the second operand. Thus, we can always
15589 use d->op0 and need not do any extra arithmetic to get the
15590 correct lane number. */
15591 in0 = d->op0;
15592 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15594 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15595 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15596 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15597 return true;
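/* For example, a permutation whose encoding is the single repeated
   index { 2, 2, 2, ... } is accepted above and becomes a DUP of
   lane 2 of the first operand.  */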
15600 static bool
15601 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15603 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15604 machine_mode vmode = d->vmode;
15606 /* Make sure that the indices are constant. */
15607 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15608 for (unsigned int i = 0; i < encoded_nelts; ++i)
15609 if (!d->perm[i].is_constant ())
15610 return false;
15612 if (d->testing_p)
15613 return true;
15615 /* Generic code will try constant permutation twice: once with the
15616 original mode and again with the elements lowered to QImode.
15617 So wait and don't do the selector expansion ourselves. */
15618 if (vmode != V8QImode && vmode != V16QImode)
15619 return false;
15621 /* to_constant is safe since this routine is specific to Advanced SIMD
15622 vectors. */
15623 unsigned int nelt = d->perm.length ().to_constant ();
15624 for (unsigned int i = 0; i < nelt; ++i)
15625 /* If big-endian and two vectors we end up with a weird mixed-endian
15626 mode on NEON. Reverse the index within each word but not the word
15627 itself. to_constant is safe because we checked is_constant above. */
15628 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15629 ? d->perm[i].to_constant () ^ (nelt - 1)
15630 : d->perm[i].to_constant ());
15632 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15633 sel = force_reg (vmode, sel);
15635 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15636 return true;
15639 /* Try to implement D using an SVE TBL instruction. */
15641 static bool
15642 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15644 unsigned HOST_WIDE_INT nelt;
15646 /* Permuting two variable-length vectors could overflow the
15647 index range. */
15648 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15649 return false;
15651 if (d->testing_p)
15652 return true;
15654 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15655 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15656 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15657 return true;
15660 static bool
15661 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15663 /* The pattern matching functions above are written to look for a small
15664 number to begin the sequence (0, 1, N/2). If we begin with an index
15665 from the second operand, we can swap the operands. */
15666 poly_int64 nelt = d->perm.length ();
15667 if (known_ge (d->perm[0], nelt))
15669 d->perm.rotate_inputs (1);
15670 std::swap (d->op0, d->op1);
15673 if ((d->vec_flags == VEC_ADVSIMD
15674 || d->vec_flags == VEC_SVE_DATA
15675 || d->vec_flags == VEC_SVE_PRED)
15676 && known_gt (nelt, 1))
15678 if (aarch64_evpc_rev_local (d))
15679 return true;
15680 else if (aarch64_evpc_rev_global (d))
15681 return true;
15682 else if (aarch64_evpc_ext (d))
15683 return true;
15684 else if (aarch64_evpc_dup (d))
15685 return true;
15686 else if (aarch64_evpc_zip (d))
15687 return true;
15688 else if (aarch64_evpc_uzp (d))
15689 return true;
15690 else if (aarch64_evpc_trn (d))
15691 return true;
15692 if (d->vec_flags == VEC_SVE_DATA)
15693 return aarch64_evpc_sve_tbl (d);
15694 else if (d->vec_flags == VEC_ADVSIMD)
15695 return aarch64_evpc_tbl (d);
15697 return false;
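/* As an example of the input canonicalization above, with nelt == 4
   the permutation { 4, 1, 5, 2 } starts with an index into the second
   operand; rotating the inputs turns it into { 0, 5, 1, 6 } and
   swapping op0/op1 leaves the overall selection unchanged, so the
   matchers only ever see a first index that refers to op0.  */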
15700 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15702 static bool
15703 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15704 rtx op1, const vec_perm_indices &sel)
15706 struct expand_vec_perm_d d;
15708 /* Check whether the mask can be applied to a single vector. */
15709 if (op0 && rtx_equal_p (op0, op1))
15710 d.one_vector_p = true;
15711 else if (sel.all_from_input_p (0))
15713 d.one_vector_p = true;
15714 op1 = op0;
15716 else if (sel.all_from_input_p (1))
15718 d.one_vector_p = true;
15719 op0 = op1;
15721 else
15722 d.one_vector_p = false;
15724 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15725 sel.nelts_per_input ());
15726 d.vmode = vmode;
15727 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15728 d.target = target;
15729 d.op0 = op0;
15730 d.op1 = op1;
15731 d.testing_p = !target;
15733 if (!d.testing_p)
15734 return aarch64_expand_vec_perm_const_1 (&d);
15736 rtx_insn *last = get_last_insn ();
15737 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15738 gcc_assert (last == get_last_insn ());
15740 return ret;
15743 /* Generate a byte permute mask for a register of mode MODE,
15744 which has NUNITS units. */
15746 rtx
15747 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15749 /* We have to reverse each vector because we don't have
15750 a permuted load that can reverse-load according to ABI rules. */
15751 rtx mask;
15752 rtvec v = rtvec_alloc (16);
15753 unsigned int i, j;
15754 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15756 gcc_assert (BYTES_BIG_ENDIAN);
15757 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15759 for (i = 0; i < nunits; i++)
15760 for (j = 0; j < usize; j++)
15761 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15762 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15763 return force_reg (V16QImode, mask);
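/* For example, for V4SI (a unit size of four bytes) the mask built
   above is { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }:
   the bytes are reversed within each element while the elements
   themselves stay in place.  */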
15766 /* Return true if X is a valid second operand for the SVE instruction
15767 that implements integer comparison OP_CODE. */
15769 static bool
15770 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15772 if (register_operand (x, VOIDmode))
15773 return true;
15775 switch (op_code)
15777 case LTU:
15778 case LEU:
15779 case GEU:
15780 case GTU:
15781 return aarch64_sve_cmp_immediate_p (x, false);
15782 case LT:
15783 case LE:
15784 case GE:
15785 case GT:
15786 case NE:
15787 case EQ:
15788 return aarch64_sve_cmp_immediate_p (x, true);
15789 default:
15790 gcc_unreachable ();
15794 /* Use predicated SVE instructions to implement the equivalent of:
15796 (set TARGET OP)
15798 given that PTRUE is an all-true predicate of the appropriate mode. */
15800 static void
15801 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15803 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15804 gen_rtvec (2, ptrue, op),
15805 UNSPEC_MERGE_PTRUE);
15806 rtx_insn *insn = emit_set_insn (target, unspec);
15807 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15810 /* Likewise, but also clobber the condition codes. */
15812 static void
15813 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15815 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15816 gen_rtvec (2, ptrue, op),
15817 UNSPEC_MERGE_PTRUE);
15818 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15819 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15822 /* Return the UNSPEC_COND_* code for comparison CODE. */
15824 static unsigned int
15825 aarch64_unspec_cond_code (rtx_code code)
15827 switch (code)
15829 case NE:
15830 return UNSPEC_COND_NE;
15831 case EQ:
15832 return UNSPEC_COND_EQ;
15833 case LT:
15834 return UNSPEC_COND_LT;
15835 case GT:
15836 return UNSPEC_COND_GT;
15837 case LE:
15838 return UNSPEC_COND_LE;
15839 case GE:
15840 return UNSPEC_COND_GE;
15841 default:
15842 gcc_unreachable ();
15846 /* Emit:
15848 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15850 where <X> is the operation associated with comparison CODE. This form
15851 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15852 semantics, such as when PRED might not be all-true and when comparing
15853 inactive lanes could have side effects. */
15855 static void
15856 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15857 rtx pred, rtx op0, rtx op1)
15859 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15860 gen_rtvec (3, pred, op0, op1),
15861 aarch64_unspec_cond_code (code));
15862 emit_set_insn (target, unspec);
15865 /* Expand an SVE integer comparison using the SVE equivalent of:
15867 (set TARGET (CODE OP0 OP1)). */
15869 void
15870 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15872 machine_mode pred_mode = GET_MODE (target);
15873 machine_mode data_mode = GET_MODE (op0);
15875 if (!aarch64_sve_cmp_operand_p (code, op1))
15876 op1 = force_reg (data_mode, op1);
15878 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15879 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15880 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15883 /* Emit the SVE equivalent of:
15885 (set TMP1 (CODE1 OP0 OP1))
15886 (set TMP2 (CODE2 OP0 OP1))
15887 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15889 PTRUE is an all-true predicate with the same mode as TARGET. */
15891 static void
15892 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15893 rtx ptrue, rtx op0, rtx op1)
15895 machine_mode pred_mode = GET_MODE (ptrue);
15896 rtx tmp1 = gen_reg_rtx (pred_mode);
15897 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15898 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15899 rtx tmp2 = gen_reg_rtx (pred_mode);
15900 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15901 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15902 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15905 /* Emit the SVE equivalent of:
15907 (set TMP (CODE OP0 OP1))
15908 (set TARGET (not TMP))
15910 PTRUE is an all-true predicate with the same mode as TARGET. */
15912 static void
15913 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15914 rtx op0, rtx op1)
15916 machine_mode pred_mode = GET_MODE (ptrue);
15917 rtx tmp = gen_reg_rtx (pred_mode);
15918 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15919 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15920 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15923 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15925 (set TARGET (CODE OP0 OP1))
15927 If CAN_INVERT_P is true, the caller can also handle inverted results;
15928 return true if the result is in fact inverted. */
15930 bool
15931 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15932 rtx op0, rtx op1, bool can_invert_p)
15934 machine_mode pred_mode = GET_MODE (target);
15935 machine_mode data_mode = GET_MODE (op0);
15937 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15938 switch (code)
15940 case UNORDERED:
15941 /* UNORDERED has no immediate form. */
15942 op1 = force_reg (data_mode, op1);
15943 /* fall through */
15944 case LT:
15945 case LE:
15946 case GT:
15947 case GE:
15948 case EQ:
15949 case NE:
15951 /* There is native support for the comparison. */
15952 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15953 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15954 return false;
15957 case LTGT:
15958 /* This is a trapping operation (LT or GT). */
15959 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15960 return false;
15962 case UNEQ:
15963 if (!flag_trapping_math)
15965 /* This would trap for signaling NaNs. */
15966 op1 = force_reg (data_mode, op1);
15967 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15968 return false;
15970 /* fall through */
15971 case UNLT:
15972 case UNLE:
15973 case UNGT:
15974 case UNGE:
15975 if (flag_trapping_math)
15977 /* Work out which elements are ordered. */
15978 rtx ordered = gen_reg_rtx (pred_mode);
15979 op1 = force_reg (data_mode, op1);
15980 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15982 /* Test the opposite condition for the ordered elements,
15983 then invert the result. */
15984 if (code == UNEQ)
15985 code = NE;
15986 else
15987 code = reverse_condition_maybe_unordered (code);
15988 if (can_invert_p)
15990 aarch64_emit_sve_predicated_cond (target, code,
15991 ordered, op0, op1);
15992 return true;
15994 rtx tmp = gen_reg_rtx (pred_mode);
15995 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15996 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15997 return false;
15999 break;
16001 case ORDERED:
16002 /* ORDERED has no immediate form. */
16003 op1 = force_reg (data_mode, op1);
16004 break;
16006 default:
16007 gcc_unreachable ();
16010 /* There is native support for the inverse comparison. */
16011 code = reverse_condition_maybe_unordered (code);
16012 if (can_invert_p)
16014 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
16015 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
16016 return true;
16018 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
16019 return false;
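/* As a concrete walkthrough of the trapping-math path above: for UNLT
   we first compute the ORDERED lanes by inverting an UNORDERED
   comparison, then perform a GE comparison (the reverse of UNLT)
   predicated on those lanes only, and finally either report the
   result as inverted (when CAN_INVERT_P) or invert it explicitly.  */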
16022 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
16023 of the data being selected and CMP_MODE is the mode of the values being
16024 compared. */
16026 void
16027 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
16028 rtx *ops)
16030 machine_mode pred_mode
16031 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
16032 GET_MODE_SIZE (cmp_mode)).require ();
16033 rtx pred = gen_reg_rtx (pred_mode);
16034 if (FLOAT_MODE_P (cmp_mode))
16036 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
16037 ops[4], ops[5], true))
16038 std::swap (ops[1], ops[2]);
16040 else
16041 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
16043 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
16044 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
16047 /* Prepare a cond_<optab><mode> operation that has the operands
16048 given by OPERANDS, where:
16050 - operand 0 is the destination
16051 - operand 1 is a predicate
16052 - operands 2 to NOPS - 2 are the operands to an operation that is
16053 performed for active lanes
16054 - operand NOPS - 1 specifies the values to use for inactive lanes.
16056 COMMUTATIVE_P is true if operands 2 and 3 are commutative. In that case,
16057 no pattern is provided for a tie between operands 3 and NOPS - 1. */
16059 void
16060 aarch64_sve_prepare_conditional_op (rtx *operands, unsigned int nops,
16061 bool commutative_p)
16063 /* We can do the operation directly if the "else" value matches one
16064 of the other inputs. */
16065 for (unsigned int i = 2; i < nops - 1; ++i)
16066 if (rtx_equal_p (operands[i], operands[nops - 1]))
16068 if (i == 3 && commutative_p)
16069 std::swap (operands[2], operands[3]);
16070 return;
16073 /* If the "else" value is different from the other operands, we have
16074 the choice of doing a SEL on the output or a SEL on an input.
16075 Neither choice is better in all cases, but one advantage of
16076 selecting the input is that it can avoid a move when the output
16077 needs to be distinct from the inputs. E.g. if operand N maps to
16078 register N, selecting the output would give:
16080 MOVPRFX Z0.S, Z2.S
16081 ADD Z0.S, P1/M, Z0.S, Z3.S
16082 SEL Z0.S, P1, Z0.S, Z4.S
16084 whereas selecting the input avoids the MOVPRFX:
16086 SEL Z0.S, P1, Z2.S, Z4.S
16087 ADD Z0.S, P1/M, Z0.S, Z3.S. */
16088 machine_mode mode = GET_MODE (operands[0]);
16089 rtx temp = gen_reg_rtx (mode);
16090 rtvec vec = gen_rtvec (3, operands[1], operands[2], operands[nops - 1]);
16091 emit_set_insn (temp, gen_rtx_UNSPEC (mode, vec, UNSPEC_SEL));
16092 operands[2] = operands[nops - 1] = temp;
16095 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
16096 true. However due to issues with register allocation it is preferable
16097 to avoid tying integer scalar and FP scalar modes. Executing integer
16098 operations in general registers is better than treating them as scalar
16099 vector operations. This reduces latency and avoids redundant int<->FP
16100 moves. So tie modes if they are either the same class, or vector modes
16101 with other vector modes, vector structs or any scalar mode. */
16103 static bool
16104 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16106 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16107 return true;
16109 /* We specifically want to allow elements of "structure" modes to
16110 be tieable to the structure. This more general condition allows
16111 other rarer situations too. The reason we don't extend this to
16112 predicate modes is that there are no predicate structure modes
16113 nor any specific instructions for extracting part of a predicate
16114 register. */
16115 if (aarch64_vector_data_mode_p (mode1)
16116 && aarch64_vector_data_mode_p (mode2))
16117 return true;
16119 /* Also allow any scalar modes with vectors. */
16120 if (aarch64_vector_mode_supported_p (mode1)
16121 || aarch64_vector_mode_supported_p (mode2))
16122 return true;
16124 return false;
16127 /* Return a new RTX holding the result of moving POINTER forward by
16128 AMOUNT bytes. */
16130 static rtx
16131 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16133 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16135 return adjust_automodify_address (pointer, GET_MODE (pointer),
16136 next, amount);
16139 /* Return a new RTX holding the result of moving POINTER forward by the
16140 size of the mode it points to. */
16142 static rtx
16143 aarch64_progress_pointer (rtx pointer)
16145 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16148 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16149 MODE bytes. */
16151 static void
16152 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16153 machine_mode mode)
16155 rtx reg = gen_reg_rtx (mode);
16157 /* "Cast" the pointers to the correct mode. */
16158 *src = adjust_address (*src, mode, 0);
16159 *dst = adjust_address (*dst, mode, 0);
16160 /* Emit the memcpy. */
16161 emit_move_insn (reg, *src);
16162 emit_move_insn (*dst, reg);
16163 /* Move the pointers forward. */
16164 *src = aarch64_progress_pointer (*src);
16165 *dst = aarch64_progress_pointer (*dst);
16168 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16169 we succeed, otherwise return false. */
16171 bool
16172 aarch64_expand_movmem (rtx *operands)
16174 unsigned int n;
16175 rtx dst = operands[0];
16176 rtx src = operands[1];
16177 rtx base;
16178 bool speed_p = !optimize_function_for_size_p (cfun);
16180 /* When optimizing for size, give a better estimate of the length of a
16181 memcpy call, but use the default otherwise. */
16182 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16184 /* We can't do anything smart if the amount to copy is not constant. */
16185 if (!CONST_INT_P (operands[2]))
16186 return false;
16188 n = UINTVAL (operands[2]);
16190 /* Try to keep the number of instructions low. For cases below 16 bytes we
16191 need to make at most two moves. For cases above 16 bytes it will be one
16192 move for each 16-byte chunk, then at most two additional moves.
16193 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16194 return false;
16196 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16197 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16199 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16200 src = adjust_automodify_address (src, VOIDmode, base, 0);
16202 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16203 1-byte chunk. */
16204 if (n < 4)
16206 if (n >= 2)
16208 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16209 n -= 2;
16212 if (n == 1)
16213 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16215 return true;
16218 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16219 4-byte chunk, partially overlapping with the previously copied chunk. */
16220 if (n < 8)
16222 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16223 n -= 4;
16224 if (n > 0)
16226 int move = n - 4;
16228 src = aarch64_move_pointer (src, move);
16229 dst = aarch64_move_pointer (dst, move);
16230 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16232 return true;
16235 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16236 them, then (if applicable) an 8-byte chunk. */
16237 while (n >= 8)
16239 if (n / 16)
16241 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16242 n -= 16;
16244 else
16246 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16247 n -= 8;
16251 /* Finish the final bytes of the copy. We can always do this in one
16252 instruction. We either copy the exact amount we need, or partially
16253 overlap with the previous chunk we copied and copy 8 bytes. */
16254 if (n == 0)
16255 return true;
16256 else if (n == 1)
16257 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16258 else if (n == 2)
16259 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16260 else if (n == 4)
16261 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16262 else
16264 if (n == 3)
16266 src = aarch64_move_pointer (src, -1);
16267 dst = aarch64_move_pointer (dst, -1);
16268 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16270 else
16272 int move = n - 8;
16274 src = aarch64_move_pointer (src, move);
16275 dst = aarch64_move_pointer (dst, move);
16276 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16280 return true;
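/* As a worked example, a 27-byte copy expands to one 16-byte (TImode)
   chunk and one 8-byte (DImode) chunk, leaving n == 3; the tail is
   then handled by stepping both pointers back one byte and copying a
   final 4-byte (SImode) chunk that overlaps the previous data by one
   byte.  */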
16283 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16284 SImode stores. Handle the case when the constant has identical
16285 bottom and top halves. This is beneficial when the two stores can be
16286 merged into an STP and we avoid synthesising potentially expensive
16287 immediates twice. Return true if such a split is possible. */
16289 bool
16290 aarch64_split_dimode_const_store (rtx dst, rtx src)
16292 rtx lo = gen_lowpart (SImode, src);
16293 rtx hi = gen_highpart_mode (SImode, DImode, src);
16295 bool size_p = optimize_function_for_size_p (cfun);
16297 if (!rtx_equal_p (lo, hi))
16298 return false;
16300 unsigned int orig_cost
16301 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16302 unsigned int lo_cost
16303 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16305 /* We want to transform:
16306 MOV x1, 49370
16307 MOVK x1, 0x140, lsl 16
16308 MOVK x1, 0xc0da, lsl 32
16309 MOVK x1, 0x140, lsl 48
16310 STR x1, [x0]
16311 into:
16312 MOV w1, 49370
16313 MOVK w1, 0x140, lsl 16
16314 STP w1, w1, [x0]
16315 So we want to perform this only when we save two instructions
16316 or more. When optimizing for size, however, accept any code size
16317 savings we can. */
16318 if (size_p && orig_cost <= lo_cost)
16319 return false;
16321 if (!size_p
16322 && (orig_cost <= lo_cost + 1))
16323 return false;
16325 rtx mem_lo = adjust_address (dst, SImode, 0);
16326 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16327 return false;
16329 rtx tmp_reg = gen_reg_rtx (SImode);
16330 aarch64_expand_mov_immediate (tmp_reg, lo);
16331 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16332 /* Don't emit an explicit store pair as this may not be always profitable.
16333 Let the sched-fusion logic decide whether to merge them. */
16334 emit_move_insn (mem_lo, tmp_reg);
16335 emit_move_insn (mem_hi, tmp_reg);
16337 return true;
16340 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16342 static unsigned HOST_WIDE_INT
16343 aarch64_asan_shadow_offset (void)
16345 return (HOST_WIDE_INT_1 << 36);
16348 static rtx
16349 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16350 int code, tree treeop0, tree treeop1)
16352 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16353 rtx op0, op1;
16354 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16355 insn_code icode;
16356 struct expand_operand ops[4];
16358 start_sequence ();
16359 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16361 op_mode = GET_MODE (op0);
16362 if (op_mode == VOIDmode)
16363 op_mode = GET_MODE (op1);
16365 switch (op_mode)
16367 case E_QImode:
16368 case E_HImode:
16369 case E_SImode:
16370 cmp_mode = SImode;
16371 icode = CODE_FOR_cmpsi;
16372 break;
16374 case E_DImode:
16375 cmp_mode = DImode;
16376 icode = CODE_FOR_cmpdi;
16377 break;
16379 case E_SFmode:
16380 cmp_mode = SFmode;
16381 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16382 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16383 break;
16385 case E_DFmode:
16386 cmp_mode = DFmode;
16387 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16388 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16389 break;
16391 default:
16392 end_sequence ();
16393 return NULL_RTX;
16396 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16397 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16398 if (!op0 || !op1)
16400 end_sequence ();
16401 return NULL_RTX;
16403 *prep_seq = get_insns ();
16404 end_sequence ();
16406 create_fixed_operand (&ops[0], op0);
16407 create_fixed_operand (&ops[1], op1);
16409 start_sequence ();
16410 if (!maybe_expand_insn (icode, 2, ops))
16412 end_sequence ();
16413 return NULL_RTX;
16415 *gen_seq = get_insns ();
16416 end_sequence ();
16418 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16419 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16422 static rtx
16423 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16424 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16426 rtx op0, op1, target;
16427 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16428 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16429 insn_code icode;
16430 struct expand_operand ops[6];
16431 int aarch64_cond;
16433 push_to_sequence (*prep_seq);
16434 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16436 op_mode = GET_MODE (op0);
16437 if (op_mode == VOIDmode)
16438 op_mode = GET_MODE (op1);
16440 switch (op_mode)
16442 case E_QImode:
16443 case E_HImode:
16444 case E_SImode:
16445 cmp_mode = SImode;
16446 icode = CODE_FOR_ccmpsi;
16447 break;
16449 case E_DImode:
16450 cmp_mode = DImode;
16451 icode = CODE_FOR_ccmpdi;
16452 break;
16454 case E_SFmode:
16455 cmp_mode = SFmode;
16456 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16457 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16458 break;
16460 case E_DFmode:
16461 cmp_mode = DFmode;
16462 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16463 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16464 break;
16466 default:
16467 end_sequence ();
16468 return NULL_RTX;
16471 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16472 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16473 if (!op0 || !op1)
16475 end_sequence ();
16476 return NULL_RTX;
16478 *prep_seq = get_insns ();
16479 end_sequence ();
16481 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16482 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16484 if (bit_code != AND)
16486 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16487 GET_MODE (XEXP (prev, 0))),
16488 VOIDmode, XEXP (prev, 0), const0_rtx);
16489 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16492 create_fixed_operand (&ops[0], XEXP (prev, 0));
16493 create_fixed_operand (&ops[1], target);
16494 create_fixed_operand (&ops[2], op0);
16495 create_fixed_operand (&ops[3], op1);
16496 create_fixed_operand (&ops[4], prev);
16497 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16499 push_to_sequence (*gen_seq);
16500 if (!maybe_expand_insn (icode, 6, ops))
16502 end_sequence ();
16503 return NULL_RTX;
16506 *gen_seq = get_insns ();
16507 end_sequence ();
16509 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16512 #undef TARGET_GEN_CCMP_FIRST
16513 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16515 #undef TARGET_GEN_CCMP_NEXT
16516 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16518 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16519 instruction fusion of some sort. */
16521 static bool
16522 aarch64_macro_fusion_p (void)
16524 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16528 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16529 should be kept together during scheduling. */
16531 static bool
16532 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16534 rtx set_dest;
16535 rtx prev_set = single_set (prev);
16536 rtx curr_set = single_set (curr);
16537 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
16538 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16540 if (!aarch64_macro_fusion_p ())
16541 return false;
16543 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16545 /* We are trying to match:
16546 prev (mov) == (set (reg r0) (const_int imm16))
16547 curr (movk) == (set (zero_extract (reg r0)
16548 (const_int 16)
16549 (const_int 16))
16550 (const_int imm16_1)) */
16552 set_dest = SET_DEST (curr_set);
16554 if (GET_CODE (set_dest) == ZERO_EXTRACT
16555 && CONST_INT_P (SET_SRC (curr_set))
16556 && CONST_INT_P (SET_SRC (prev_set))
16557 && CONST_INT_P (XEXP (set_dest, 2))
16558 && INTVAL (XEXP (set_dest, 2)) == 16
16559 && REG_P (XEXP (set_dest, 0))
16560 && REG_P (SET_DEST (prev_set))
16561 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16563 return true;
16567 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16570 /* We're trying to match:
16571 prev (adrp) == (set (reg r1)
16572 (high (symbol_ref ("SYM"))))
16573 curr (add) == (set (reg r0)
16574 (lo_sum (reg r1)
16575 (symbol_ref ("SYM"))))
16576 Note that r0 need not necessarily be the same as r1, especially
16577 during pre-regalloc scheduling. */
16579 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16580 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16582 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16583 && REG_P (XEXP (SET_SRC (curr_set), 0))
16584 && REGNO (XEXP (SET_SRC (curr_set), 0))
16585 == REGNO (SET_DEST (prev_set))
16586 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16587 XEXP (SET_SRC (curr_set), 1)))
16588 return true;
16592 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16595 /* We're trying to match:
16596 prev (movk) == (set (zero_extract (reg r0)
16597 (const_int 16)
16598 (const_int 32))
16599 (const_int imm16_1))
16600 curr (movk) == (set (zero_extract (reg r0)
16601 (const_int 16)
16602 (const_int 48))
16603 (const_int imm16_2)) */
16605 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16606 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16607 && REG_P (XEXP (SET_DEST (prev_set), 0))
16608 && REG_P (XEXP (SET_DEST (curr_set), 0))
16609 && REGNO (XEXP (SET_DEST (prev_set), 0))
16610 == REGNO (XEXP (SET_DEST (curr_set), 0))
16611 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16612 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16613 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16614 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16615 && CONST_INT_P (SET_SRC (prev_set))
16616 && CONST_INT_P (SET_SRC (curr_set)))
16617 return true;
16620 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16622 /* We're trying to match:
16623 prev (adrp) == (set (reg r0)
16624 (high (symbol_ref ("SYM"))))
16625 curr (ldr) == (set (reg r1)
16626 (mem (lo_sum (reg r0)
16627 (symbol_ref ("SYM")))))
16629 curr (ldr) == (set (reg r1)
16630 (zero_extend (mem
16631 (lo_sum (reg r0)
16632 (symbol_ref ("SYM")))))) */
16633 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16634 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16636 rtx curr_src = SET_SRC (curr_set);
16638 if (GET_CODE (curr_src) == ZERO_EXTEND)
16639 curr_src = XEXP (curr_src, 0);
16641 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16642 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16643 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16644 == REGNO (SET_DEST (prev_set))
16645 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16646 XEXP (SET_SRC (prev_set), 0)))
16647 return true;
16651 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16652 && aarch_crypto_can_dual_issue (prev, curr))
16653 return true;
16655 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16656 && any_condjump_p (curr))
16658 enum attr_type prev_type = get_attr_type (prev);
16660 unsigned int condreg1, condreg2;
16661 rtx cc_reg_1;
16662 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16663 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16665 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16666 && prev
16667 && modified_in_p (cc_reg_1, prev))
16669 /* FIXME: this misses some instructions that are considered simple
16670 arithmetic for ThunderX. Simple shifts are missed here. */
16671 if (prev_type == TYPE_ALUS_SREG
16672 || prev_type == TYPE_ALUS_IMM
16673 || prev_type == TYPE_LOGICS_REG
16674 || prev_type == TYPE_LOGICS_IMM)
16675 return true;
16679 if (prev_set
16680 && curr_set
16681 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16682 && any_condjump_p (curr))
16684 /* We're trying to match:
16685 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16686 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16687 (const_int 0))
16688 (label_ref ("SYM"))
16689 (pc)) */
16690 if (SET_DEST (curr_set) == (pc_rtx)
16691 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16692 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16693 && REG_P (SET_DEST (prev_set))
16694 && REGNO (SET_DEST (prev_set))
16695 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16697 /* Fuse ALU operations followed by a conditional branch instruction. */
16698 switch (get_attr_type (prev))
16700 case TYPE_ALU_IMM:
16701 case TYPE_ALU_SREG:
16702 case TYPE_ADC_REG:
16703 case TYPE_ADC_IMM:
16704 case TYPE_ADCS_REG:
16705 case TYPE_ADCS_IMM:
16706 case TYPE_LOGIC_REG:
16707 case TYPE_LOGIC_IMM:
16708 case TYPE_CSEL:
16709 case TYPE_ADR:
16710 case TYPE_MOV_IMM:
16711 case TYPE_SHIFT_REG:
16712 case TYPE_SHIFT_IMM:
16713 case TYPE_BFM:
16714 case TYPE_RBIT:
16715 case TYPE_REV:
16716 case TYPE_EXTEND:
16717 return true;
16719 default:;
16724 return false;
16727 /* Return true iff the instruction fusion described by OP is enabled. */
16729 bool
16730 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16732 return (aarch64_tune_params.fusible_ops & op) != 0;
16735 /* If MEM is in the form of [base+offset], extract the two parts
16736 of the address into BASE and OFFSET; otherwise return false
16737 after clearing BASE and OFFSET. */
16739 bool
16740 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16742 rtx addr;
16744 gcc_assert (MEM_P (mem));
16746 addr = XEXP (mem, 0);
16748 if (REG_P (addr))
16750 *base = addr;
16751 *offset = const0_rtx;
16752 return true;
16755 if (GET_CODE (addr) == PLUS
16756 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16758 *base = XEXP (addr, 0);
16759 *offset = XEXP (addr, 1);
16760 return true;
16763 *base = NULL_RTX;
16764 *offset = NULL_RTX;
16766 return false;
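/* For example, (mem (plus (reg x1) (const_int 16))) gives BASE == x1
   and OFFSET == (const_int 16), while a bare (mem (reg x1)) gives
   OFFSET == const0_rtx.  */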
16769 /* Types for scheduling fusion. */
16770 enum sched_fusion_type
16772 SCHED_FUSION_NONE = 0,
16773 SCHED_FUSION_LD_SIGN_EXTEND,
16774 SCHED_FUSION_LD_ZERO_EXTEND,
16775 SCHED_FUSION_LD,
16776 SCHED_FUSION_ST,
16777 SCHED_FUSION_NUM
16780 /* If INSN is a load or store whose address is in the form [base+offset],
16781 extract the two parts into BASE and OFFSET. Return the scheduling
16782 fusion type of this INSN. */
16784 static enum sched_fusion_type
16785 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16787 rtx x, dest, src;
16788 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16790 gcc_assert (INSN_P (insn));
16791 x = PATTERN (insn);
16792 if (GET_CODE (x) != SET)
16793 return SCHED_FUSION_NONE;
16795 src = SET_SRC (x);
16796 dest = SET_DEST (x);
16798 machine_mode dest_mode = GET_MODE (dest);
16800 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16801 return SCHED_FUSION_NONE;
16803 if (GET_CODE (src) == SIGN_EXTEND)
16805 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16806 src = XEXP (src, 0);
16807 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16808 return SCHED_FUSION_NONE;
16810 else if (GET_CODE (src) == ZERO_EXTEND)
16812 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16813 src = XEXP (src, 0);
16814 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16815 return SCHED_FUSION_NONE;
16818 if (GET_CODE (src) == MEM && REG_P (dest))
16819 extract_base_offset_in_addr (src, base, offset);
16820 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16822 fusion = SCHED_FUSION_ST;
16823 extract_base_offset_in_addr (dest, base, offset);
16825 else
16826 return SCHED_FUSION_NONE;
16828 if (*base == NULL_RTX || *offset == NULL_RTX)
16829 fusion = SCHED_FUSION_NONE;
16831 return fusion;
16834 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16836 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16837 and PRI are only calculated for these instructions. For other instructions,
16838 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16839 types of instruction fusion can be added by returning different priorities.
16841 It's important that irrelevant instructions get the largest FUSION_PRI. */
16843 static void
16844 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16845 int *fusion_pri, int *pri)
16847 int tmp, off_val;
16848 rtx base, offset;
16849 enum sched_fusion_type fusion;
16851 gcc_assert (INSN_P (insn));
16853 tmp = max_pri - 1;
16854 fusion = fusion_load_store (insn, &base, &offset);
16855 if (fusion == SCHED_FUSION_NONE)
16857 *pri = tmp;
16858 *fusion_pri = tmp;
16859 return;
16862 /* Set FUSION_PRI according to fusion type and base register. */
16863 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16865 /* Calculate PRI. */
16866 tmp /= 2;
16868 /* INSN with smaller offset goes first. */
16869 off_val = (int)(INTVAL (offset));
16870 if (off_val >= 0)
16871 tmp -= (off_val & 0xfffff);
16872 else
16873 tmp += ((- off_val) & 0xfffff);
16875 *pri = tmp;
16876 return;
16879 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16880 Adjust priority of sha1h instructions so they are scheduled before
16881 other SHA1 instructions. */
16883 static int
16884 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16886 rtx x = PATTERN (insn);
16888 if (GET_CODE (x) == SET)
16890 x = SET_SRC (x);
16892 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16893 return priority + 10;
16896 return priority;
16899 /* Given OPERANDS of consecutive load/store, check if we can merge
16900 them into ldp/stp. LOAD is true if they are load instructions.
16901 MODE is the mode of memory operands. */
16903 bool
16904 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16905 machine_mode mode)
16907 HOST_WIDE_INT offval_1, offval_2, msize;
16908 enum reg_class rclass_1, rclass_2;
16909 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16911 if (load)
16913 mem_1 = operands[1];
16914 mem_2 = operands[3];
16915 reg_1 = operands[0];
16916 reg_2 = operands[2];
16917 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16918 if (REGNO (reg_1) == REGNO (reg_2))
16919 return false;
16921 else
16923 mem_1 = operands[0];
16924 mem_2 = operands[2];
16925 reg_1 = operands[1];
16926 reg_2 = operands[3];
16929 /* The mems cannot be volatile. */
16930 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16931 return false;
16933 /* If we have SImode and slow unaligned ldp,
16934 check that the alignment is at least 8 bytes. */
16935 if (mode == SImode
16936 && (aarch64_tune_params.extra_tuning_flags
16937 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16938 && !optimize_size
16939 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16940 return false;
16942 /* Check if the addresses are in the form of [base+offset]. */
16943 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16944 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16945 return false;
16946 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16947 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16948 return false;
16950 /* Check if the bases are the same. */
16951 if (!rtx_equal_p (base_1, base_2))
16952 return false;
16954 /* The operands must be of the same size. */
16955 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16956 GET_MODE_SIZE (GET_MODE (mem_2))));
16958 offval_1 = INTVAL (offset_1);
16959 offval_2 = INTVAL (offset_2);
16960 /* We should only be trying this for fixed-sized modes. There is no
16961 SVE LDP/STP instruction. */
16962 msize = GET_MODE_SIZE (mode).to_constant ();
16963 /* Check if the offsets are consecutive. */
16964 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16965 return false;
16967 /* Check if the addresses are clobbered by load. */
16968 if (load)
16970 if (reg_mentioned_p (reg_1, mem_1))
16971 return false;
16973 /* In increasing order, the last load can clobber the address. */
16974 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16975 return false;
16978 /* One of the memory accesses must be a mempair operand.
16979 If it is not the first one, they need to be swapped by the
16980 peephole. */
16981 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16982 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16983 return false;
16985 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16986 rclass_1 = FP_REGS;
16987 else
16988 rclass_1 = GENERAL_REGS;
16990 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16991 rclass_2 = FP_REGS;
16992 else
16993 rclass_2 = GENERAL_REGS;
16995 /* Check if the registers are of the same class. */
16996 if (rclass_1 != rclass_2)
16997 return false;
16999 return true;
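/* For example (register numbers chosen purely for illustration), the
   consecutive loads

     ldr x2, [x0, 8]
     ldr x3, [x0, 16]

   pass all of the checks above: same base, offsets that differ by the
   access size, distinct destination registers of the same class, and
   the first address valid as a pair operand. They can therefore be
   merged into

     ldp x2, x3, [x0, 8]  */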
17002 /* Given OPERANDS of consecutive load/store that can be merged,
17003 swap them if they are not in ascending order. */
17004 void
17005 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
17007 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
17008 HOST_WIDE_INT offval_1, offval_2;
17010 if (load)
17012 mem_1 = operands[1];
17013 mem_2 = operands[3];
17015 else
17017 mem_1 = operands[0];
17018 mem_2 = operands[2];
17021 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17022 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17024 offval_1 = INTVAL (offset_1);
17025 offval_2 = INTVAL (offset_2);
17027 if (offval_1 > offval_2)
17029 /* Irrespective of whether this is a load or a store,
17030 we do the same swap. */
17031 std::swap (operands[0], operands[2]);
17032 std::swap (operands[1], operands[3]);
17036 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
17037 comparison between the two. */
17038 int
17039 aarch64_host_wide_int_compare (const void *x, const void *y)
17041 return wi::cmps (* ((const HOST_WIDE_INT *) x),
17042 * ((const HOST_WIDE_INT *) y));
17045 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
17046 other pointing to a REG rtx containing an offset, compare the offsets
17047 of the two pairs.
17049 Return:
17051 1 iff offset (X) > offset (Y)
17052 0 iff offset (X) == offset (Y)
17053 -1 iff offset (X) < offset (Y) */
17054 int
17055 aarch64_ldrstr_offset_compare (const void *x, const void *y)
17057 const rtx * operands_1 = (const rtx *) x;
17058 const rtx * operands_2 = (const rtx *) y;
17059 rtx mem_1, mem_2, base, offset_1, offset_2;
17061 if (MEM_P (operands_1[0]))
17062 mem_1 = operands_1[0];
17063 else
17064 mem_1 = operands_1[1];
17066 if (MEM_P (operands_2[0]))
17067 mem_2 = operands_2[0];
17068 else
17069 mem_2 = operands_2[1];
17071 /* Extract the offsets. */
17072 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17073 extract_base_offset_in_addr (mem_2, &base, &offset_2);
17075 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
17077 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
17080 /* Given OPERANDS of consecutive load/store, check if we can merge
17081 them into ldp/stp by adjusting the offset. LOAD is true if they
17082 are load instructions. MODE is the mode of memory operands.
17084 Given below consecutive stores:
17086 str w1, [xb, 0x100]
17087 str w1, [xb, 0x104]
17088 str w1, [xb, 0x108]
17089 str w1, [xb, 0x10c]
17091 Though the offsets are out of the range supported by stp, we can
17092 still pair them after adjusting the offset, like:
17094 add scratch, xb, 0x100
17095 stp w1, w1, [scratch]
17096 stp w1, w1, [scratch, 0x8]
17098 The peephole patterns detecting this opportunity should guarantee
17099 the scratch register is available. */
17101 bool
17102 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
17103 scalar_mode mode)
17105 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
17106 HOST_WIDE_INT offvals[4], msize;
17107 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
17108 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
17110 if (load)
17112 reg_1 = operands[0];
17113 mem_1 = operands[1];
17114 reg_2 = operands[2];
17115 mem_2 = operands[3];
17116 reg_3 = operands[4];
17117 mem_3 = operands[5];
17118 reg_4 = operands[6];
17119 mem_4 = operands[7];
17120 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
17121 && REG_P (reg_3) && REG_P (reg_4));
17123 /* Do not attempt to merge the loads if the loads clobber each other. */
17124 for (int i = 0; i < 8; i += 2)
17125 for (int j = i + 2; j < 8; j += 2)
17126 if (reg_overlap_mentioned_p (operands[i], operands[j]))
17127 return false;
17129 else
17131 mem_1 = operands[0];
17132 reg_1 = operands[1];
17133 mem_2 = operands[2];
17134 reg_2 = operands[3];
17135 mem_3 = operands[4];
17136 reg_3 = operands[5];
17137 mem_4 = operands[6];
17138 reg_4 = operands[7];
17140 /* Skip if the memory operand is by itself valid for ldp/stp. */
17141 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
17142 return false;
17144 /* The mems cannot be volatile. */
17145 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
17146 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
17147 return false;
17149 /* Check if the addresses are in the form of [base+offset]. */
17150 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
17151 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
17152 return false;
17153 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
17154 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
17155 return false;
17156 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
17157 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
17158 return false;
17159 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
17160 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
17161 return false;
17163 /* Check if the bases are the same. */
17164 if (!rtx_equal_p (base_1, base_2)
17165 || !rtx_equal_p (base_2, base_3)
17166 || !rtx_equal_p (base_3, base_4))
17167 return false;
17169 offvals[0] = INTVAL (offset_1);
17170 offvals[1] = INTVAL (offset_2);
17171 offvals[2] = INTVAL (offset_3);
17172 offvals[3] = INTVAL (offset_4);
17173 msize = GET_MODE_SIZE (mode);
17175 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17176 qsort (offvals, 4, sizeof (HOST_WIDE_INT), aarch64_host_wide_int_compare);
17178 if (!(offvals[1] == offvals[0] + msize
17179 && offvals[3] == offvals[2] + msize))
17180 return false;
17182 /* Check that offsets are within range of each other. The ldp/stp
17183 instructions have 7-bit immediate offsets, so use 0x80. */
17184 if (offvals[2] - offvals[0] >= msize * 0x80)
17185 return false;
17187 /* The offsets must be aligned with respect to each other. */
17188 if (offvals[0] % msize != offvals[2] % msize)
17189 return false;
17191 /* Check if the addresses are clobbered by load. */
17192 if (load && (reg_mentioned_p (reg_1, mem_1)
17193 || reg_mentioned_p (reg_2, mem_2)
17194 || reg_mentioned_p (reg_3, mem_3)
17195 || reg_mentioned_p (reg_4, mem_4)))
17196 return false;
17198 /* If we have SImode and slow unaligned ldp,
17199 check that the alignment is at least 8 bytes. */
17200 if (mode == SImode
17201 && (aarch64_tune_params.extra_tuning_flags
17202 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17203 && !optimize_size
17204 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17205 return false;
17207 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17208 rclass_1 = FP_REGS;
17209 else
17210 rclass_1 = GENERAL_REGS;
17212 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17213 rclass_2 = FP_REGS;
17214 else
17215 rclass_2 = GENERAL_REGS;
17217 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17218 rclass_3 = FP_REGS;
17219 else
17220 rclass_3 = GENERAL_REGS;
17222 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17223 rclass_4 = FP_REGS;
17224 else
17225 rclass_4 = GENERAL_REGS;
17227 /* Check if the registers are of the same class. */
17228 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17229 return false;
17231 return true;
17234 /* Given OPERANDS of consecutive load/store, this function pairs them
17235 into LDP/STP after adjusting the offset. It depends on the fact
17236 that the operands can be sorted so the offsets are correct for STP.
17237 MODE is the mode of memory operands. CODE is the rtl operator
17238 which should be applied to all memory operands; it is SIGN_EXTEND,
17239 ZERO_EXTEND or UNKNOWN. */
17241 bool
17242 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17243 scalar_mode mode, RTX_CODE code)
17245 rtx base, offset_1, offset_3, t1, t2;
17246 rtx mem_1, mem_2, mem_3, mem_4;
17247 rtx temp_operands[8];
17248 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17249 stp_off_upper_limit, stp_off_lower_limit, msize;
17251 /* We make changes on a copy as we may still bail out. */
17252 for (int i = 0; i < 8; i ++)
17253 temp_operands[i] = operands[i];
17255 /* Sort the operands. */
17256 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17258 if (load)
17260 mem_1 = temp_operands[1];
17261 mem_2 = temp_operands[3];
17262 mem_3 = temp_operands[5];
17263 mem_4 = temp_operands[7];
17265 else
17267 mem_1 = temp_operands[0];
17268 mem_2 = temp_operands[2];
17269 mem_3 = temp_operands[4];
17270 mem_4 = temp_operands[6];
17271 gcc_assert (code == UNKNOWN);
17274 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17275 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17276 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17277 && offset_3 != NULL_RTX);
17279 /* Adjust offset so it can fit in LDP/STP instruction. */
17280 msize = GET_MODE_SIZE (mode);
17281 stp_off_upper_limit = msize * (0x40 - 1);
17282 stp_off_lower_limit = - msize * 0x40;
17284 off_val_1 = INTVAL (offset_1);
17285 off_val_3 = INTVAL (offset_3);
17287 /* The base offset is optimally half way between the two STP/LDP offsets. */
17288 if (msize <= 4)
17289 base_off = (off_val_1 + off_val_3) / 2;
17290 else
17291 /* However, due to issues with negative LDP/STP offset generation for
17292 larger modes (DF, DI and vector modes), we must not use negative
17293 addresses smaller than 9 signed unadjusted bits can store. This
17294 provides the most range in this case. */
17295 base_off = off_val_1;
17297 /* Adjust the base so that it is aligned with the addresses but still
17298 optimal. */
17299 if (base_off % msize != off_val_1 % msize)
17300 /* Fix the offset, bearing in mind we want to make it bigger not
17301 smaller. */
17302 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17303 else if (msize <= 4)
17304 /* The negative range of LDP/STP is one larger than the positive range. */
17305 base_off += msize;
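/* For example, with msize == 4, off_val_1 == 100 and off_val_3 == 108,
base_off starts as (100 + 108) / 2 == 104; the remainders match, so the
msize <= 4 branch above bumps it to 108, giving final LDP/STP offsets of
-8 and 0.  */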
17307 /* Check if base offset is too big or too small. We can attempt to resolve
17308 this issue by setting it to the maximum value and seeing if the offsets
17309 still fit. */
17310 if (base_off >= 0x1000)
17312 base_off = 0x1000 - 1;
17313 /* We must still make sure that the base offset is aligned with respect
17314 to the address. But it may not be made any bigger. */
17315 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
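/* For example, with msize == 8 and off_val_1 % 8 == 4, clamping to 4095
leaves a remainder of 7, so the line above subtracts ((7 - 4) + 8) % 8 == 3
and the base becomes 4092, again congruent to off_val_1 modulo 8.  */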
17318 /* Likewise for the case where the base is too small. */
17319 if (base_off <= -0x1000)
17321 base_off = -0x1000 + 1;
17322 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17325 /* Offset of the first STP/LDP. */
17326 new_off_1 = off_val_1 - base_off;
17328 /* Offset of the second STP/LDP. */
17329 new_off_3 = off_val_3 - base_off;
17331 /* The offsets must be within the range of the LDP/STP instructions. */
17332 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17333 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17334 return false;
17336 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17337 new_off_1), true);
17338 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17339 new_off_1 + msize), true);
17340 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17341 new_off_3), true);
17342 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17343 new_off_3 + msize), true);
17345 if (!aarch64_mem_pair_operand (mem_1, mode)
17346 || !aarch64_mem_pair_operand (mem_3, mode))
17347 return false;
17349 if (code == ZERO_EXTEND)
17351 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17352 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17353 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17354 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17356 else if (code == SIGN_EXTEND)
17358 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17359 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17360 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17361 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17364 if (load)
17366 operands[0] = temp_operands[0];
17367 operands[1] = mem_1;
17368 operands[2] = temp_operands[2];
17369 operands[3] = mem_2;
17370 operands[4] = temp_operands[4];
17371 operands[5] = mem_3;
17372 operands[6] = temp_operands[6];
17373 operands[7] = mem_4;
17375 else
17377 operands[0] = mem_1;
17378 operands[1] = temp_operands[1];
17379 operands[2] = mem_2;
17380 operands[3] = temp_operands[3];
17381 operands[4] = mem_3;
17382 operands[5] = temp_operands[5];
17383 operands[6] = mem_4;
17384 operands[7] = temp_operands[7];
17387 /* Emit adjusting instruction. */
17388 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17389 /* Emit ldp/stp instructions. */
17390 t1 = gen_rtx_SET (operands[0], operands[1]);
17391 t2 = gen_rtx_SET (operands[2], operands[3]);
17392 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17393 t1 = gen_rtx_SET (operands[4], operands[5]);
17394 t2 = gen_rtx_SET (operands[6], operands[7]);
17395 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17396 return true;
17399 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17400 it isn't worth branching around empty masked ops (including masked
17401 stores). */
17403 static bool
17404 aarch64_empty_mask_is_expensive (unsigned)
17406 return false;
17409 /* Return true if a pseudo register should be created and used to hold
17410 the GOT address for PIC code. */
17412 bool
17413 aarch64_use_pseudo_pic_reg (void)
17415 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17418 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17420 static int
17421 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17423 switch (XINT (x, 1))
17425 case UNSPEC_GOTSMALLPIC:
17426 case UNSPEC_GOTSMALLPIC28K:
17427 case UNSPEC_GOTTINYPIC:
17428 return 0;
17429 default:
17430 break;
17433 return default_unspec_may_trap_p (x, flags);
17437 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
17438 return the log2 of that value. Otherwise return -1. */
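/* For example, 8.0 yields 3, while 6.0, 0.5 and -4.0 all yield -1.  */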
17440 int
17441 aarch64_fpconst_pow_of_2 (rtx x)
17443 const REAL_VALUE_TYPE *r;
17445 if (!CONST_DOUBLE_P (x))
17446 return -1;
17448 r = CONST_DOUBLE_REAL_VALUE (x);
17450 if (REAL_VALUE_NEGATIVE (*r)
17451 || REAL_VALUE_ISNAN (*r)
17452 || REAL_VALUE_ISINF (*r)
17453 || !real_isinteger (r, DFmode))
17454 return -1;
17456 return exact_log2 (real_to_integer (r));
17459 /* If X is a vector of equal CONST_DOUBLE values and that value is
17460 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17462 int
17463 aarch64_vec_fpconst_pow_of_2 (rtx x)
17465 int nelts;
17466 if (GET_CODE (x) != CONST_VECTOR
17467 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17468 return -1;
17470 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17471 return -1;
17473 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17474 if (firstval <= 0)
17475 return -1;
17477 for (int i = 1; i < nelts; i++)
17478 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17479 return -1;
17481 return firstval;
17484 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17485 to float.
17487 __fp16 always promotes through this hook.
17488 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17489 through the generic excess precision logic rather than here. */
17491 static tree
17492 aarch64_promoted_type (const_tree t)
17494 if (SCALAR_FLOAT_TYPE_P (t)
17495 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17496 return float_type_node;
17498 return NULL_TREE;
17501 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17503 static bool
17504 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17505 optimization_type opt_type)
17507 switch (op)
17509 case rsqrt_optab:
17510 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17512 default:
17513 return true;
17517 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17519 static unsigned int
17520 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17521 int *offset)
17523 /* Polynomial invariant 1 == (VG / 2) - 1. */
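/* For example, at the minimum 128-bit vector length VG is 2, so the
indeterminate is 0; at 256 bits VG is 4 and the indeterminate is 1.  */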
17524 gcc_assert (i == 1);
17525 *factor = 2;
17526 *offset = 1;
17527 return AARCH64_DWARF_VG;
17530 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17531 if MODE is HFmode, and punt to the generic implementation otherwise. */
17533 static bool
17534 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17536 return (mode == HFmode
17537 ? true
17538 : default_libgcc_floating_mode_supported_p (mode));
17541 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17542 if MODE is HFmode, and punt to the generic implementation otherwise. */
17544 static bool
17545 aarch64_scalar_mode_supported_p (scalar_mode mode)
17547 return (mode == HFmode
17548 ? true
17549 : default_scalar_mode_supported_p (mode));
17552 /* Set the value of FLT_EVAL_METHOD.
17553 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17555 0: evaluate all operations and constants, whose semantic type has at
17556 most the range and precision of type float, to the range and
17557 precision of float; evaluate all other operations and constants to
17558 the range and precision of the semantic type;
17560 N, where _FloatN is a supported interchange floating type:
17561 evaluate all operations and constants, whose semantic type has at
17562 most the range and precision of _FloatN type, to the range and
17563 precision of the _FloatN type; evaluate all other operations and
17564 constants to the range and precision of the semantic type;
17566 If we have the ARMv8.2-A extensions then we support _Float16 in native
17567 precision, so we should set this to 16. Otherwise, we support the type,
17568 but want to evaluate expressions in float precision, so set this to
17569 0. */
17571 static enum flt_eval_method
17572 aarch64_excess_precision (enum excess_precision_type type)
17574 switch (type)
17576 case EXCESS_PRECISION_TYPE_FAST:
17577 case EXCESS_PRECISION_TYPE_STANDARD:
17578 /* We can calculate either in 16-bit range and precision or
17579 32-bit range and precision. Make that decision based on whether
17580 we have native support for the ARMv8.2-A 16-bit floating-point
17581 instructions or not. */
17582 return (TARGET_FP_F16INST
17583 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17584 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17585 case EXCESS_PRECISION_TYPE_IMPLICIT:
17586 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17587 default:
17588 gcc_unreachable ();
17590 return FLT_EVAL_METHOD_UNPREDICTABLE;
17593 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17594 scheduled for speculative execution. Reject the long-running division
17595 and square-root instructions. */
17597 static bool
17598 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17600 switch (get_attr_type (insn))
17602 case TYPE_SDIV:
17603 case TYPE_UDIV:
17604 case TYPE_FDIVS:
17605 case TYPE_FDIVD:
17606 case TYPE_FSQRTS:
17607 case TYPE_FSQRTD:
17608 case TYPE_NEON_FP_SQRT_S:
17609 case TYPE_NEON_FP_SQRT_D:
17610 case TYPE_NEON_FP_SQRT_S_Q:
17611 case TYPE_NEON_FP_SQRT_D_Q:
17612 case TYPE_NEON_FP_DIV_S:
17613 case TYPE_NEON_FP_DIV_D:
17614 case TYPE_NEON_FP_DIV_S_Q:
17615 case TYPE_NEON_FP_DIV_D_Q:
17616 return false;
17617 default:
17618 return true;
17622 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17624 static int
17625 aarch64_compute_pressure_classes (reg_class *classes)
17627 int i = 0;
17628 classes[i++] = GENERAL_REGS;
17629 classes[i++] = FP_REGS;
17630 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17631 registers need to go in PR_LO_REGS at some point during their
17632 lifetime. Splitting it into two halves has the effect of making
17633 all predicates count against PR_LO_REGS, so that we try whenever
17634 possible to restrict the number of live predicates to 8. This
17635 greatly reduces the amount of spilling in certain loops. */
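/* (PR_LO_REGS is P0-P7, the only predicate registers that most SVE
instructions can use as a governing predicate; PR_HI_REGS is P8-P15.)  */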
17636 classes[i++] = PR_LO_REGS;
17637 classes[i++] = PR_HI_REGS;
17638 return i;
17641 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17643 static bool
17644 aarch64_can_change_mode_class (machine_mode from,
17645 machine_mode to, reg_class_t)
17647 if (BYTES_BIG_ENDIAN)
17649 bool from_sve_p = aarch64_sve_data_mode_p (from);
17650 bool to_sve_p = aarch64_sve_data_mode_p (to);
17652 /* Don't allow changes between SVE data modes and non-SVE modes.
17653 See the comment at the head of aarch64-sve.md for details. */
17654 if (from_sve_p != to_sve_p)
17655 return false;
17657 /* Don't allow changes in element size: lane 0 of the new vector
17658 would not then be lane 0 of the old vector. See the comment
17659 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17660 description.
17662 In the worst case, this forces a register to be spilled in
17663 one mode and reloaded in the other, which handles the
17664 endianness correctly. */
17665 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17666 return false;
17668 return true;
17671 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
17673 static void
17674 aarch64_select_early_remat_modes (sbitmap modes)
17676 /* SVE values are not normally live across a call, so it should be
17677 worth doing early rematerialization even in VL-specific mode. */
17678 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17680 machine_mode mode = (machine_mode) i;
17681 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17682 if (vec_flags & VEC_ANY_SVE)
17683 bitmap_set_bit (modes, i);
17687 /* Target-specific selftests. */
17689 #if CHECKING_P
17691 namespace selftest {
17693 /* Selftest for the RTL loader.
17694 Verify that the RTL loader copes with a dump from
17695 print_rtx_function. This is essentially just a test that class
17696 function_reader can handle a real dump, but it also verifies
17697 that lookup_reg_by_dump_name correctly handles hard regs.
17698 The presence of hard reg names in the dump means that the test is
17699 target-specific, hence it is in this file. */
17701 static void
17702 aarch64_test_loading_full_dump ()
17704 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17706 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17708 rtx_insn *insn_1 = get_insn_by_uid (1);
17709 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17711 rtx_insn *insn_15 = get_insn_by_uid (15);
17712 ASSERT_EQ (INSN, GET_CODE (insn_15));
17713 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17715 /* Verify crtl->return_rtx. */
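/* Hard register 0 is the integer result register, so times_two's return
value is expected in w0 as an SImode REG.  */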
17716 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17717 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17718 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17721 /* Run all target-specific selftests. */
17723 static void
17724 aarch64_run_selftests (void)
17726 aarch64_test_loading_full_dump ();
17729 } // namespace selftest
17731 #endif /* #if CHECKING_P */
17733 #undef TARGET_ADDRESS_COST
17734 #define TARGET_ADDRESS_COST aarch64_address_cost
17736 /* This hook determines whether unnamed bitfields affect the alignment
17737 of the containing structure. The hook returns true if the structure
17738 should inherit the alignment requirements of an unnamed bitfield's
17739 type. */
17740 #undef TARGET_ALIGN_ANON_BITFIELD
17741 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17743 #undef TARGET_ASM_ALIGNED_DI_OP
17744 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17746 #undef TARGET_ASM_ALIGNED_HI_OP
17747 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17749 #undef TARGET_ASM_ALIGNED_SI_OP
17750 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17752 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17753 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17754 hook_bool_const_tree_hwi_hwi_const_tree_true
17756 #undef TARGET_ASM_FILE_START
17757 #define TARGET_ASM_FILE_START aarch64_start_file
17759 #undef TARGET_ASM_OUTPUT_MI_THUNK
17760 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17762 #undef TARGET_ASM_SELECT_RTX_SECTION
17763 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17765 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17766 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17768 #undef TARGET_BUILD_BUILTIN_VA_LIST
17769 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17771 #undef TARGET_CALLEE_COPIES
17772 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17774 #undef TARGET_CAN_ELIMINATE
17775 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17777 #undef TARGET_CAN_INLINE_P
17778 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17780 #undef TARGET_CANNOT_FORCE_CONST_MEM
17781 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17783 #undef TARGET_CASE_VALUES_THRESHOLD
17784 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17786 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17787 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17789 /* Only the least significant bit is used for initialization guard
17790 variables. */
17791 #undef TARGET_CXX_GUARD_MASK_BIT
17792 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17794 #undef TARGET_C_MODE_FOR_SUFFIX
17795 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17797 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17798 #undef TARGET_DEFAULT_TARGET_FLAGS
17799 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17800 #endif
17802 #undef TARGET_CLASS_MAX_NREGS
17803 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17805 #undef TARGET_BUILTIN_DECL
17806 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17808 #undef TARGET_BUILTIN_RECIPROCAL
17809 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17811 #undef TARGET_C_EXCESS_PRECISION
17812 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17814 #undef TARGET_EXPAND_BUILTIN
17815 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17817 #undef TARGET_EXPAND_BUILTIN_VA_START
17818 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17820 #undef TARGET_FOLD_BUILTIN
17821 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17823 #undef TARGET_FUNCTION_ARG
17824 #define TARGET_FUNCTION_ARG aarch64_function_arg
17826 #undef TARGET_FUNCTION_ARG_ADVANCE
17827 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17829 #undef TARGET_FUNCTION_ARG_BOUNDARY
17830 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17832 #undef TARGET_FUNCTION_ARG_PADDING
17833 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17835 #undef TARGET_GET_RAW_RESULT_MODE
17836 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17837 #undef TARGET_GET_RAW_ARG_MODE
17838 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17840 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17841 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17843 #undef TARGET_FUNCTION_VALUE
17844 #define TARGET_FUNCTION_VALUE aarch64_function_value
17846 #undef TARGET_FUNCTION_VALUE_REGNO_P
17847 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17849 #undef TARGET_GIMPLE_FOLD_BUILTIN
17850 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17852 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17853 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17855 #undef TARGET_INIT_BUILTINS
17856 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17858 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17859 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17860 aarch64_ira_change_pseudo_allocno_class
17862 #undef TARGET_LEGITIMATE_ADDRESS_P
17863 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17865 #undef TARGET_LEGITIMATE_CONSTANT_P
17866 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17868 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17869 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17870 aarch64_legitimize_address_displacement
17872 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17873 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17875 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17876 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17877 aarch64_libgcc_floating_mode_supported_p
17879 #undef TARGET_MANGLE_TYPE
17880 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17882 #undef TARGET_MEMORY_MOVE_COST
17883 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17885 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17886 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17888 #undef TARGET_MUST_PASS_IN_STACK
17889 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17891 /* This target hook should return true if accesses to volatile bitfields
17892 should use the narrowest mode possible. It should return false if these
17893 accesses should use the bitfield container type. */
17894 #undef TARGET_NARROW_VOLATILE_BITFIELD
17895 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17897 #undef TARGET_OPTION_OVERRIDE
17898 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17900 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17901 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17902 aarch64_override_options_after_change
17904 #undef TARGET_OPTION_SAVE
17905 #define TARGET_OPTION_SAVE aarch64_option_save
17907 #undef TARGET_OPTION_RESTORE
17908 #define TARGET_OPTION_RESTORE aarch64_option_restore
17910 #undef TARGET_OPTION_PRINT
17911 #define TARGET_OPTION_PRINT aarch64_option_print
17913 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17914 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17916 #undef TARGET_SET_CURRENT_FUNCTION
17917 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17919 #undef TARGET_PASS_BY_REFERENCE
17920 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17922 #undef TARGET_PREFERRED_RELOAD_CLASS
17923 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17925 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17926 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17928 #undef TARGET_PROMOTED_TYPE
17929 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17931 #undef TARGET_SECONDARY_RELOAD
17932 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17934 #undef TARGET_SHIFT_TRUNCATION_MASK
17935 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17937 #undef TARGET_SETUP_INCOMING_VARARGS
17938 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17940 #undef TARGET_STRUCT_VALUE_RTX
17941 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17943 #undef TARGET_REGISTER_MOVE_COST
17944 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17946 #undef TARGET_RETURN_IN_MEMORY
17947 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17949 #undef TARGET_RETURN_IN_MSB
17950 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17952 #undef TARGET_RTX_COSTS
17953 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17955 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17956 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17958 #undef TARGET_SCHED_ISSUE_RATE
17959 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17961 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17962 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17963 aarch64_sched_first_cycle_multipass_dfa_lookahead
17965 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17966 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17967 aarch64_first_cycle_multipass_dfa_lookahead_guard
17969 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17970 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17971 aarch64_get_separate_components
17973 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17974 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17975 aarch64_components_for_bb
17977 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17978 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17979 aarch64_disqualify_components
17981 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17982 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17983 aarch64_emit_prologue_components
17985 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17986 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17987 aarch64_emit_epilogue_components
17989 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17990 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17991 aarch64_set_handled_components
17993 #undef TARGET_TRAMPOLINE_INIT
17994 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17996 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17997 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17999 #undef TARGET_VECTOR_MODE_SUPPORTED_P
18000 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
18002 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
18003 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
18004 aarch64_builtin_support_vector_misalignment
18006 #undef TARGET_ARRAY_MODE
18007 #define TARGET_ARRAY_MODE aarch64_array_mode
18009 #undef TARGET_ARRAY_MODE_SUPPORTED_P
18010 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
18012 #undef TARGET_VECTORIZE_ADD_STMT_COST
18013 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
18015 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
18016 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
18017 aarch64_builtin_vectorization_cost
18019 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
18020 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
18022 #undef TARGET_VECTORIZE_BUILTINS
18023 #define TARGET_VECTORIZE_BUILTINS
18025 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
18026 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
18027 aarch64_builtin_vectorized_function
18029 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
18030 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
18031 aarch64_autovectorize_vector_sizes
18033 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
18034 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
18035 aarch64_atomic_assign_expand_fenv
18037 /* Section anchor support. */
18039 #undef TARGET_MIN_ANCHOR_OFFSET
18040 #define TARGET_MIN_ANCHOR_OFFSET -256
18042 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
18043 byte offset; we can do much more for larger data types, but have no way
18044 to determine the size of the access. We assume accesses are aligned. */
18045 #undef TARGET_MAX_ANCHOR_OFFSET
18046 #define TARGET_MAX_ANCHOR_OFFSET 4095
18048 #undef TARGET_VECTOR_ALIGNMENT
18049 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
18051 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
18052 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
18053 aarch64_vectorize_preferred_vector_alignment
18054 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
18055 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
18056 aarch64_simd_vector_alignment_reachable
18058 /* vec_perm support. */
18060 #undef TARGET_VECTORIZE_VEC_PERM_CONST
18061 #define TARGET_VECTORIZE_VEC_PERM_CONST \
18062 aarch64_vectorize_vec_perm_const
18064 #undef TARGET_VECTORIZE_GET_MASK_MODE
18065 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
18066 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
18067 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
18068 aarch64_empty_mask_is_expensive
18070 #undef TARGET_INIT_LIBFUNCS
18071 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
18073 #undef TARGET_FIXED_CONDITION_CODE_REGS
18074 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
18076 #undef TARGET_FLAGS_REGNUM
18077 #define TARGET_FLAGS_REGNUM CC_REGNUM
18079 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
18080 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
18082 #undef TARGET_ASAN_SHADOW_OFFSET
18083 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
18085 #undef TARGET_LEGITIMIZE_ADDRESS
18086 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
18088 #undef TARGET_SCHED_CAN_SPECULATE_INSN
18089 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
18091 #undef TARGET_CAN_USE_DOLOOP_P
18092 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
18094 #undef TARGET_SCHED_ADJUST_PRIORITY
18095 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
18097 #undef TARGET_SCHED_MACRO_FUSION_P
18098 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
18100 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
18101 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
18103 #undef TARGET_SCHED_FUSION_PRIORITY
18104 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
18106 #undef TARGET_UNSPEC_MAY_TRAP_P
18107 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
18109 #undef TARGET_USE_PSEUDO_PIC_REG
18110 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
18112 #undef TARGET_PRINT_OPERAND
18113 #define TARGET_PRINT_OPERAND aarch64_print_operand
18115 #undef TARGET_PRINT_OPERAND_ADDRESS
18116 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
18118 #undef TARGET_OPTAB_SUPPORTED_P
18119 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
18121 #undef TARGET_OMIT_STRUCT_RETURN_REG
18122 #define TARGET_OMIT_STRUCT_RETURN_REG true
18124 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
18125 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18126 aarch64_dwarf_poly_indeterminate_value
18128 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
18129 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18130 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18132 #undef TARGET_HARD_REGNO_NREGS
18133 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18134 #undef TARGET_HARD_REGNO_MODE_OK
18135 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18137 #undef TARGET_MODES_TIEABLE_P
18138 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18140 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18141 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18142 aarch64_hard_regno_call_part_clobbered
18144 #undef TARGET_CONSTANT_ALIGNMENT
18145 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18147 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18148 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18150 #undef TARGET_CAN_CHANGE_MODE_CLASS
18151 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18153 #undef TARGET_SELECT_EARLY_REMAT_MODES
18154 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18156 #if CHECKING_P
18157 #undef TARGET_RUN_TARGET_SELFTESTS
18158 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18159 #endif /* #if CHECKING_P */
18161 struct gcc_target targetm = TARGET_INITIALIZER;
18163 #include "gt-aarch64.h"