[AArch64] PR84114: Avoid reassociating FMA
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 07c55b132a7a8c4ceeaf9f406ac8c3c9d7b6bb20
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
227 const char* name;
228 unsigned int flag;
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
251 /* Tuning parameters. */
253 static const struct cpu_addrcost_table generic_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
317 static const struct cpu_regmove_cost generic_regmove_cost =
319 1, /* GP2GP */
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
322 5, /* GP2FP */
323 5, /* FP2GP */
324 2 /* FP2FP */
327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
332 5, /* GP2FP */
333 5, /* FP2GP */
334 2 /* FP2FP */
337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (the actual costs are 4 and 9). */
352 9, /* GP2FP */
353 9, /* FP2GP */
354 1 /* FP2FP */
357 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 2, /* GP2GP */
360 2, /* GP2FP */
361 6, /* FP2GP */
362 4 /* FP2FP */
365 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 8, /* GP2FP */
371 8, /* FP2GP */
372 2 /* FP2FP */
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 2, /* GP2GP */
378 /* Avoid the use of int<->fp moves for spilling. */
379 6, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of int<->fp moves for spilling. */
388 8, /* GP2FP */
389 8, /* FP2GP */
390 4 /* FP2FP */
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost =
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
433 /* Generic costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
453 static const struct cpu_vector_cost exynosm1_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* Generic costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost =
515 1, /* Predictable. */
516 3 /* Unpredictable. */
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes =
522 AARCH64_APPROX_NONE, /* division */
523 AARCH64_APPROX_NONE, /* sqrt */
524 AARCH64_APPROX_NONE /* recip_sqrt */
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes =
530 AARCH64_APPROX_NONE, /* division */
531 AARCH64_APPROX_ALL, /* sqrt */
532 AARCH64_APPROX_ALL /* recip_sqrt */
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes =
538 AARCH64_APPROX_NONE, /* division */
539 AARCH64_APPROX_NONE, /* sqrt */
540 AARCH64_APPROX_ALL /* recip_sqrt */
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune =
546 0, /* num_slots */
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
564 4, /* num_slots */
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 1024, /* l2_cache_size */
568 -1 /* default_opt_level */
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
573 8, /* num_slots */
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
580 static const cpu_prefetch_tune thunderx_prefetch_tune =
582 8, /* num_slots */
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
591 8, /* num_slots */
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
598 static const struct tune_params generic_tunings =
600 &cortexa57_extra_costs,
601 &generic_addrcost_table,
602 &generic_regmove_cost,
603 &generic_vector_cost,
604 &generic_branch_cost,
605 &generic_approx_modes,
606 4, /* memmov_cost */
607 2, /* issue_rate */
608 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
609 8, /* function_align. */
610 4, /* jump_align. */
611 8, /* loop_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
620 &generic_prefetch_tune
623 static const struct tune_params cortexa35_tunings =
625 &cortexa53_extra_costs,
626 &generic_addrcost_table,
627 &cortexa53_regmove_cost,
628 &generic_vector_cost,
629 &generic_branch_cost,
630 &generic_approx_modes,
631 4, /* memmov_cost */
632 1, /* issue_rate */
633 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
635 16, /* function_align. */
636 4, /* jump_align. */
637 8, /* loop_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
646 &generic_prefetch_tune
649 static const struct tune_params cortexa53_tunings =
651 &cortexa53_extra_costs,
652 &generic_addrcost_table,
653 &cortexa53_regmove_cost,
654 &generic_vector_cost,
655 &generic_branch_cost,
656 &generic_approx_modes,
657 4, /* memmov_cost */
658 2, /* issue_rate */
659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
661 16, /* function_align. */
662 4, /* jump_align. */
663 8, /* loop_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
672 &generic_prefetch_tune
675 static const struct tune_params cortexa57_tunings =
677 &cortexa57_extra_costs,
678 &generic_addrcost_table,
679 &cortexa57_regmove_cost,
680 &cortexa57_vector_cost,
681 &generic_branch_cost,
682 &generic_approx_modes,
683 4, /* memmov_cost */
684 3, /* issue_rate */
685 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
687 16, /* function_align. */
688 4, /* jump_align. */
689 8, /* loop_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
698 &generic_prefetch_tune
701 static const struct tune_params cortexa72_tunings =
703 &cortexa57_extra_costs,
704 &generic_addrcost_table,
705 &cortexa57_regmove_cost,
706 &cortexa57_vector_cost,
707 &generic_branch_cost,
708 &generic_approx_modes,
709 4, /* memmov_cost */
710 3, /* issue_rate */
711 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
713 16, /* function_align. */
714 4, /* jump_align. */
715 8, /* loop_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
724 &generic_prefetch_tune
727 static const struct tune_params cortexa73_tunings =
729 &cortexa57_extra_costs,
730 &generic_addrcost_table,
731 &cortexa57_regmove_cost,
732 &cortexa57_vector_cost,
733 &generic_branch_cost,
734 &generic_approx_modes,
735 4, /* memmov_cost. */
736 2, /* issue_rate. */
737 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
739 16, /* function_align. */
740 4, /* jump_align. */
741 8, /* loop_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
750 &generic_prefetch_tune
755 static const struct tune_params exynosm1_tunings =
757 &exynosm1_extra_costs,
758 &exynosm1_addrcost_table,
759 &exynosm1_regmove_cost,
760 &exynosm1_vector_cost,
761 &generic_branch_cost,
762 &exynosm1_approx_modes,
763 4, /* memmov_cost */
764 3, /* issue_rate */
765 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
766 4, /* function_align. */
767 4, /* jump_align. */
768 4, /* loop_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
777 &exynosm1_prefetch_tune
780 static const struct tune_params thunderxt88_tunings =
782 &thunderx_extra_costs,
783 &generic_addrcost_table,
784 &thunderx_regmove_cost,
785 &thunderx_vector_cost,
786 &generic_branch_cost,
787 &generic_approx_modes,
788 6, /* memmov_cost */
789 2, /* issue_rate */
790 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
791 8, /* function_align. */
792 8, /* jump_align. */
793 8, /* loop_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
802 &thunderxt88_prefetch_tune
805 static const struct tune_params thunderx_tunings =
807 &thunderx_extra_costs,
808 &generic_addrcost_table,
809 &thunderx_regmove_cost,
810 &thunderx_vector_cost,
811 &generic_branch_cost,
812 &generic_approx_modes,
813 6, /* memmov_cost */
814 2, /* issue_rate */
815 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
816 8, /* function_align. */
817 8, /* jump_align. */
818 8, /* loop_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
828 &thunderx_prefetch_tune
831 static const struct tune_params xgene1_tunings =
833 &xgene1_extra_costs,
834 &xgene1_addrcost_table,
835 &xgene1_regmove_cost,
836 &xgene1_vector_cost,
837 &generic_branch_cost,
838 &xgene1_approx_modes,
839 6, /* memmov_cost */
840 4, /* issue_rate */
841 AARCH64_FUSE_NOTHING, /* fusible_ops */
842 16, /* function_align. */
843 8, /* jump_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
853 &generic_prefetch_tune
856 static const struct tune_params qdf24xx_tunings =
858 &qdf24xx_extra_costs,
859 &generic_addrcost_table,
860 &qdf24xx_regmove_cost,
861 &generic_vector_cost,
862 &generic_branch_cost,
863 &generic_approx_modes,
864 4, /* memmov_cost */
865 4, /* issue_rate */
866 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
867 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
868 16, /* function_align. */
869 8, /* jump_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
879 &qdf24xx_prefetch_tune
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
883 for now. */
884 static const struct tune_params saphira_tunings =
886 &generic_extra_costs,
887 &generic_addrcost_table,
888 &generic_regmove_cost,
889 &generic_vector_cost,
890 &generic_branch_cost,
891 &generic_approx_modes,
892 4, /* memmov_cost */
893 4, /* issue_rate */
894 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
895 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
896 16, /* function_align. */
897 8, /* jump_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
907 &generic_prefetch_tune
910 static const struct tune_params thunderx2t99_tunings =
912 &thunderx2t99_extra_costs,
913 &thunderx2t99_addrcost_table,
914 &thunderx2t99_regmove_cost,
915 &thunderx2t99_vector_cost,
916 &generic_branch_cost,
917 &generic_approx_modes,
918 4, /* memmov_cost. */
919 4, /* issue_rate. */
920 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
922 16, /* function_align. */
923 8, /* jump_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
939 const char* name;
940 void (*parse_override)(const char*, struct tune_params*);
943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions[] =
949 { "fuse", aarch64_parse_fuse_string },
950 { "tune", aarch64_parse_tune_string },
951 { NULL, NULL }
954 /* A processor implementing AArch64. */
955 struct processor
957 const char *const name;
958 enum aarch64_processor ident;
959 enum aarch64_processor sched_core;
960 enum aarch64_arch arch;
961 unsigned architecture_version;
962 const unsigned long flags;
963 const struct tune_params *const tune;
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures[] =
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores[] =
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
984 AARCH64_FL_FOR_ARCH8, &generic_tunings},
985 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor *selected_arch;
992 static const struct processor *selected_cpu;
993 static const struct processor *selected_tune;
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params = generic_tunings;
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1003 const char *const name;
1004 const unsigned long flags_on;
1005 const unsigned long flags_off;
1008 typedef enum aarch64_cond_code
1010 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1014 aarch64_cc;
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes[] =
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028 const char * branch_format)
1030 rtx_code_label * tmp_label = gen_label_rtx ();
1031 char label_buf[256];
1032 char buffer[128];
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034 CODE_LABEL_NUMBER (tmp_label));
1035 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036 rtx dest_label = operands[pos_label];
1037 operands[pos_label] = tmp_label;
1039 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040 output_asm_insn (buffer, operands);
1042 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043 operands[pos_label] = dest_label;
1044 output_asm_insn (buffer, operands);
1045 return "";
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1051 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054 else
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespective of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075 reg_class_t best_class)
1077 machine_mode mode;
1079 if (allocno_class != ALL_REGS)
1080 return allocno_class;
1082 if (best_class != ALL_REGS)
1083 return best_class;
1085 mode = PSEUDO_REGNO_MODE (regno);
1086 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1092 if (GET_MODE_UNIT_SIZE (mode) == 4)
1093 return aarch64_tune_params.min_div_recip_mul_sf;
1094 return aarch64_tune_params.min_div_recip_mul_df;
1097 /* Return the reassociation width of treeop OPC with mode MODE. */
1098 static int
1099 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1101 if (VECTOR_MODE_P (mode))
1102 return aarch64_tune_params.vec_reassoc_width;
1103 if (INTEGRAL_MODE_P (mode))
1104 return aarch64_tune_params.int_reassoc_width;
1105 /* Avoid reassociating floating point addition so we emit more FMAs. */
1106 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1107 return aarch64_tune_params.fp_reassoc_width;
1108 return 1;
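/* Editor's illustrative sketch (not part of the original source): with
   -Ofast, a sum of products such as

     double f (double a, double b, double c, double d,
               double e, double g, double h, double i)
     {
       return a * b + c * d + e * g + h * i;
     }

   is best handled as a serial chain of additions, which can be emitted as
   one fmul followed by a sequence of fmadd instructions.  Reassociating the
   floating-point additions into parallel sub-chains tends to leave separate
   fmul/fadd instructions that cannot be fused, which is why a reassociation
   width of 1 is used for floating-point PLUS_EXPR above.  */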
1111 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1112 unsigned
1113 aarch64_dbx_register_number (unsigned regno)
1115 if (GP_REGNUM_P (regno))
1116 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1117 else if (regno == SP_REGNUM)
1118 return AARCH64_DWARF_SP;
1119 else if (FP_REGNUM_P (regno))
1120 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1121 else if (PR_REGNUM_P (regno))
1122 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1123 else if (regno == VG_REGNUM)
1124 return AARCH64_DWARF_VG;
1126 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1127 equivalent DWARF register. */
1128 return DWARF_FRAME_REGISTERS;
1131 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1132 static bool
1133 aarch64_advsimd_struct_mode_p (machine_mode mode)
1135 return (TARGET_SIMD
1136 && (mode == OImode || mode == CImode || mode == XImode));
1139 /* Return true if MODE is an SVE predicate mode. */
1140 static bool
1141 aarch64_sve_pred_mode_p (machine_mode mode)
1143 return (TARGET_SVE
1144 && (mode == VNx16BImode
1145 || mode == VNx8BImode
1146 || mode == VNx4BImode
1147 || mode == VNx2BImode));
1150 /* Three mutually-exclusive flags describing a vector or predicate type. */
1151 const unsigned int VEC_ADVSIMD = 1;
1152 const unsigned int VEC_SVE_DATA = 2;
1153 const unsigned int VEC_SVE_PRED = 4;
1154 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1155 a structure of 2, 3 or 4 vectors. */
1156 const unsigned int VEC_STRUCT = 8;
1157 /* Useful combinations of the above. */
1158 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1159 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1161 /* Return a set of flags describing the vector properties of mode MODE.
1162 Ignore modes that are not supported by the current target. */
1163 static unsigned int
1164 aarch64_classify_vector_mode (machine_mode mode)
1166 if (aarch64_advsimd_struct_mode_p (mode))
1167 return VEC_ADVSIMD | VEC_STRUCT;
1169 if (aarch64_sve_pred_mode_p (mode))
1170 return VEC_SVE_PRED;
1172 scalar_mode inner = GET_MODE_INNER (mode);
1173 if (VECTOR_MODE_P (mode)
1174 && (inner == QImode
1175 || inner == HImode
1176 || inner == HFmode
1177 || inner == SImode
1178 || inner == SFmode
1179 || inner == DImode
1180 || inner == DFmode))
1182 if (TARGET_SVE)
1184 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1185 return VEC_SVE_DATA;
1186 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1187 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1188 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1189 return VEC_SVE_DATA | VEC_STRUCT;
1192 /* This includes V1DF but not V1DI (which doesn't exist). */
1193 if (TARGET_SIMD
1194 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1195 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1196 return VEC_ADVSIMD;
1199 return 0;
1202 /* Return true if MODE is any of the data vector modes, including
1203 structure modes. */
1204 static bool
1205 aarch64_vector_data_mode_p (machine_mode mode)
1207 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1210 /* Return true if MODE is an SVE data vector mode; either a single vector
1211 or a structure of vectors. */
1212 static bool
1213 aarch64_sve_data_mode_p (machine_mode mode)
1215 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1218 /* Implement target hook TARGET_ARRAY_MODE. */
1219 static opt_machine_mode
1220 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1222 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1223 && IN_RANGE (nelems, 2, 4))
1224 return mode_for_vector (GET_MODE_INNER (mode),
1225 GET_MODE_NUNITS (mode) * nelems);
1227 return opt_machine_mode ();
1230 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1231 static bool
1232 aarch64_array_mode_supported_p (machine_mode mode,
1233 unsigned HOST_WIDE_INT nelems)
1235 if (TARGET_SIMD
1236 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1237 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1238 && (nelems >= 2 && nelems <= 4))
1239 return true;
1241 return false;
1244 /* Return the SVE predicate mode to use for elements that have
1245 ELEM_NBYTES bytes, if such a mode exists. */
1247 opt_machine_mode
1248 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1250 if (TARGET_SVE)
1252 if (elem_nbytes == 1)
1253 return VNx16BImode;
1254 if (elem_nbytes == 2)
1255 return VNx8BImode;
1256 if (elem_nbytes == 4)
1257 return VNx4BImode;
1258 if (elem_nbytes == 8)
1259 return VNx2BImode;
1261 return opt_machine_mode ();
1264 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1266 static opt_machine_mode
1267 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1269 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1271 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1272 machine_mode pred_mode;
1273 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1274 return pred_mode;
1277 return default_get_mask_mode (nunits, nbytes);
1280 /* Implement TARGET_HARD_REGNO_NREGS. */
1282 static unsigned int
1283 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1285 /* ??? Logically we should only need to provide a value when
1286 HARD_REGNO_MODE_OK says that the combination is valid,
1287 but at the moment we need to handle all modes. Just ignore
1288 any runtime parts for registers that can't store them. */
1289 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1290 switch (aarch64_regno_regclass (regno))
1292 case FP_REGS:
1293 case FP_LO_REGS:
1294 if (aarch64_sve_data_mode_p (mode))
1295 return exact_div (GET_MODE_SIZE (mode),
1296 BYTES_PER_SVE_VECTOR).to_constant ();
1297 return CEIL (lowest_size, UNITS_PER_VREG);
1298 case PR_REGS:
1299 case PR_LO_REGS:
1300 case PR_HI_REGS:
1301 return 1;
1302 default:
1303 return CEIL (lowest_size, UNITS_PER_WORD);
1305 gcc_unreachable ();
1308 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1310 static bool
1311 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1313 if (GET_MODE_CLASS (mode) == MODE_CC)
1314 return regno == CC_REGNUM;
1316 if (regno == VG_REGNUM)
1317 /* This must have the same size as _Unwind_Word. */
1318 return mode == DImode;
1320 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1321 if (vec_flags & VEC_SVE_PRED)
1322 return PR_REGNUM_P (regno);
1324 if (PR_REGNUM_P (regno))
1325 return 0;
1327 if (regno == SP_REGNUM)
1328 /* The purpose of comparing with ptr_mode is to support the
1329 global register variable associated with the stack pointer
1330 register via the syntax of asm ("wsp") in ILP32. */
1331 return mode == Pmode || mode == ptr_mode;
1333 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1334 return mode == Pmode;
1336 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1337 return true;
1339 if (FP_REGNUM_P (regno))
1341 if (vec_flags & VEC_STRUCT)
1342 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1343 else
1344 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1347 return false;
1350 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1351 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1352 clobbers the top 64 bits when restoring the bottom 64 bits. */
1354 static bool
1355 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1357 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1360 /* Implement REGMODE_NATURAL_SIZE. */
1361 poly_uint64
1362 aarch64_regmode_natural_size (machine_mode mode)
1364 /* The natural size for SVE data modes is one SVE data vector,
1365 and similarly for predicates. We can't independently modify
1366 anything smaller than that. */
1367 /* ??? For now, only do this for variable-width SVE registers.
1368 Doing it for constant-sized registers breaks lower-subreg.c. */
1369 /* ??? And once that's fixed, we should probably have similar
1370 code for Advanced SIMD. */
1371 if (!aarch64_sve_vg.is_constant ())
1373 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1374 if (vec_flags & VEC_SVE_PRED)
1375 return BYTES_PER_SVE_PRED;
1376 if (vec_flags & VEC_SVE_DATA)
1377 return BYTES_PER_SVE_VECTOR;
1379 return UNITS_PER_WORD;
1382 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1383 machine_mode
1384 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1385 machine_mode mode)
1387 /* The predicate mode determines which bits are significant and
1388 which are "don't care". Decreasing the number of lanes would
1389 lose data while increasing the number of lanes would make bits
1390 unnecessarily significant. */
1391 if (PR_REGNUM_P (regno))
1392 return mode;
1393 if (known_ge (GET_MODE_SIZE (mode), 4))
1394 return mode;
1395 else
1396 return SImode;
1399 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1400 that strcpy from constants will be faster. */
1402 static HOST_WIDE_INT
1403 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1405 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1406 return MAX (align, BITS_PER_WORD);
1407 return align;
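/* Editor's illustrative note: a string constant such as "abc" would
   otherwise only need byte alignment; when not optimizing for size the hook
   above raises that to BITS_PER_WORD (64 bits on AArch64), so that copies
   from the constant can use word-sized accesses.  */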
1410 /* Return true if calls to DECL should be treated as
1411 long-calls (i.e. called via a register). */
1412 static bool
1413 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1415 return false;
1418 /* Return true if calls to symbol-ref SYM should be treated as
1419 long-calls (i.e. called via a register). */
1420 bool
1421 aarch64_is_long_call_p (rtx sym)
1423 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1426 /* Return true if calls to symbol-ref SYM should not go through
1427 plt stubs. */
1429 bool
1430 aarch64_is_noplt_call_p (rtx sym)
1432 const_tree decl = SYMBOL_REF_DECL (sym);
1434 if (flag_pic
1435 && decl
1436 && (!flag_plt
1437 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1438 && !targetm.binds_local_p (decl))
1439 return true;
1441 return false;
1444 /* Return true if the offsets to a zero/sign-extract operation
1445 represent an expression that matches an extend operation. The
1446 operands represent the parameters from
1448 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1449 bool
1450 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1451 rtx extract_imm)
1453 HOST_WIDE_INT mult_val, extract_val;
1455 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1456 return false;
1458 mult_val = INTVAL (mult_imm);
1459 extract_val = INTVAL (extract_imm);
1461 if (extract_val > 8
1462 && extract_val < GET_MODE_BITSIZE (mode)
1463 && exact_log2 (extract_val & ~7) > 0
1464 && (extract_val & 7) <= 4
1465 && mult_val == (1 << (extract_val & 7)))
1466 return true;
1468 return false;
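/* Editor's illustrative note: in DImode, MULT_IMM == 4 and EXTRACT_IMM == 34
   pass the checks above.  Extracting the low 34 bits of (reg * 4) is
   equivalent to extending the low 32 bits of the register and shifting the
   result left by 2, i.e. an extend-and-shift operand such as UXTW #2.  */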
1471 /* Emit an insn that's a simple single-set. Both the operands must be
1472 known to be valid. */
1473 inline static rtx_insn *
1474 emit_set_insn (rtx x, rtx y)
1476 return emit_insn (gen_rtx_SET (x, y));
1479 /* X and Y are two things to compare using CODE. Emit the compare insn and
1480 return the rtx for the CC register in the proper mode. */
1482 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1484 machine_mode mode = SELECT_CC_MODE (code, x, y);
1485 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1487 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1488 return cc_reg;
1491 /* Build the SYMBOL_REF for __tls_get_addr. */
1493 static GTY(()) rtx tls_get_addr_libfunc;
1496 aarch64_tls_get_addr (void)
1498 if (!tls_get_addr_libfunc)
1499 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1500 return tls_get_addr_libfunc;
1503 /* Return the TLS model to use for ADDR. */
1505 static enum tls_model
1506 tls_symbolic_operand_type (rtx addr)
1508 enum tls_model tls_kind = TLS_MODEL_NONE;
1509 if (GET_CODE (addr) == CONST)
1511 poly_int64 addend;
1512 rtx sym = strip_offset (addr, &addend);
1513 if (GET_CODE (sym) == SYMBOL_REF)
1514 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1516 else if (GET_CODE (addr) == SYMBOL_REF)
1517 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1519 return tls_kind;
1522 /* We allow LO_SUMs in our legitimate addresses so that combine
1523 can take care of combining addresses where necessary, but for
1524 generation purposes we generate the address
1525 as:
1526 RTL Absolute
1527 tmp = hi (symbol_ref); adrp x1, foo
1528 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1531 PIC TLS
1532 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1533 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1534 bl __tls_get_addr
1537 Load TLS symbol, depending on TLS mechanism and TLS access model.
1539 Global Dynamic - Traditional TLS:
1540 adrp tmp, :tlsgd:imm
1541 add dest, tmp, #:tlsgd_lo12:imm
1542 bl __tls_get_addr
1544 Global Dynamic - TLS Descriptors:
1545 adrp dest, :tlsdesc:imm
1546 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1547 add dest, dest, #:tlsdesc_lo12:imm
1548 blr tmp
1549 mrs tp, tpidr_el0
1550 add dest, dest, tp
1552 Initial Exec:
1553 mrs tp, tpidr_el0
1554 adrp tmp, :gottprel:imm
1555 ldr dest, [tmp, #:gottprel_lo12:imm]
1556 add dest, dest, tp
1558 Local Exec:
1559 mrs tp, tpidr_el0
1560 add t0, tp, #:tprel_hi12:imm, lsl #12
1561 add t0, t0, #:tprel_lo12_nc:imm
1564 static void
1565 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1566 enum aarch64_symbol_type type)
1568 switch (type)
1570 case SYMBOL_SMALL_ABSOLUTE:
1572 /* In ILP32, the mode of dest can be either SImode or DImode. */
1573 rtx tmp_reg = dest;
1574 machine_mode mode = GET_MODE (dest);
1576 gcc_assert (mode == Pmode || mode == ptr_mode);
1578 if (can_create_pseudo_p ())
1579 tmp_reg = gen_reg_rtx (mode);
1581 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1582 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1583 return;
1586 case SYMBOL_TINY_ABSOLUTE:
1587 emit_insn (gen_rtx_SET (dest, imm));
1588 return;
1590 case SYMBOL_SMALL_GOT_28K:
1592 machine_mode mode = GET_MODE (dest);
1593 rtx gp_rtx = pic_offset_table_rtx;
1594 rtx insn;
1595 rtx mem;
1597 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1598 here before RTL expansion. Tree IVOPTS will generate RTL patterns to
1599 decide rtx costs, in which case pic_offset_table_rtx is not
1600 initialized. In that case there is no need to generate the first adrp
1601 instruction, as the final cost for global variable access is
1602 one instruction. */
1603 if (gp_rtx != NULL)
1605 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1606 use the page base as the GOT base, the first page may be wasted;
1607 in the worst case only 28K of space is left for the GOT).
1609 The generated instruction sequence for accessing a global variable is:
1612 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1614 Only one instruction is needed, but we must initialize
1615 pic_offset_table_rtx properly. We generate an initialization insn for
1616 every global access and let CSE remove all the redundant ones.
1618 The final instruction sequence will look like the following
1619 for multiple global variable accesses.
1621 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1623 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1624 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1625 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1626 ... */
1628 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1629 crtl->uses_pic_offset_table = 1;
1630 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1632 if (mode != GET_MODE (gp_rtx))
1633 gp_rtx = gen_lowpart (mode, gp_rtx);
1637 if (mode == ptr_mode)
1639 if (mode == DImode)
1640 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1641 else
1642 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1644 mem = XVECEXP (SET_SRC (insn), 0, 0);
1646 else
1648 gcc_assert (mode == Pmode);
1650 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1651 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1654 /* The operand is expected to be a MEM. Whenever the related insn
1655 pattern changes, the code above which calculates MEM should be
1656 updated. */
1657 gcc_assert (GET_CODE (mem) == MEM);
1658 MEM_READONLY_P (mem) = 1;
1659 MEM_NOTRAP_P (mem) = 1;
1660 emit_insn (insn);
1661 return;
1664 case SYMBOL_SMALL_GOT_4G:
1666 /* In ILP32, the mode of dest can be either SImode or DImode,
1667 while the got entry is always of SImode size. The mode of
1668 dest depends on how dest is used: if dest is assigned to a
1669 pointer (e.g. in the memory), it has SImode; it may have
1670 DImode if dest is dereferenced to access the memory.
1671 This is why we have to handle three different ldr_got_small
1672 patterns here (two patterns for ILP32). */
1674 rtx insn;
1675 rtx mem;
1676 rtx tmp_reg = dest;
1677 machine_mode mode = GET_MODE (dest);
1679 if (can_create_pseudo_p ())
1680 tmp_reg = gen_reg_rtx (mode);
1682 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1683 if (mode == ptr_mode)
1685 if (mode == DImode)
1686 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1687 else
1688 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1690 mem = XVECEXP (SET_SRC (insn), 0, 0);
1692 else
1694 gcc_assert (mode == Pmode);
1696 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1697 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1700 gcc_assert (GET_CODE (mem) == MEM);
1701 MEM_READONLY_P (mem) = 1;
1702 MEM_NOTRAP_P (mem) = 1;
1703 emit_insn (insn);
1704 return;
1707 case SYMBOL_SMALL_TLSGD:
1709 rtx_insn *insns;
1710 machine_mode mode = GET_MODE (dest);
1711 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1713 start_sequence ();
1714 if (TARGET_ILP32)
1715 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1716 else
1717 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1718 insns = get_insns ();
1719 end_sequence ();
1721 RTL_CONST_CALL_P (insns) = 1;
1722 emit_libcall_block (insns, dest, result, imm);
1723 return;
1726 case SYMBOL_SMALL_TLSDESC:
1728 machine_mode mode = GET_MODE (dest);
1729 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1730 rtx tp;
1732 gcc_assert (mode == Pmode || mode == ptr_mode);
1734 /* In ILP32, the got entry is always of SImode size. Unlike
1735 small GOT, the dest is fixed at reg 0. */
1736 if (TARGET_ILP32)
1737 emit_insn (gen_tlsdesc_small_si (imm));
1738 else
1739 emit_insn (gen_tlsdesc_small_di (imm));
1740 tp = aarch64_load_tp (NULL);
1742 if (mode != Pmode)
1743 tp = gen_lowpart (mode, tp);
1745 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1746 if (REG_P (dest))
1747 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1748 return;
1751 case SYMBOL_SMALL_TLSIE:
1753 /* In ILP32, the mode of dest can be either SImode or DImode,
1754 while the got entry is always of SImode size. The mode of
1755 dest depends on how dest is used: if dest is assigned to a
1756 pointer (e.g. in the memory), it has SImode; it may have
1757 DImode if dest is dereferenced to access the memory.
1758 This is why we have to handle three different tlsie_small
1759 patterns here (two patterns for ILP32). */
1760 machine_mode mode = GET_MODE (dest);
1761 rtx tmp_reg = gen_reg_rtx (mode);
1762 rtx tp = aarch64_load_tp (NULL);
1764 if (mode == ptr_mode)
1766 if (mode == DImode)
1767 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1768 else
1770 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1771 tp = gen_lowpart (mode, tp);
1774 else
1776 gcc_assert (mode == Pmode);
1777 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1780 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1781 if (REG_P (dest))
1782 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1783 return;
1786 case SYMBOL_TLSLE12:
1787 case SYMBOL_TLSLE24:
1788 case SYMBOL_TLSLE32:
1789 case SYMBOL_TLSLE48:
1791 machine_mode mode = GET_MODE (dest);
1792 rtx tp = aarch64_load_tp (NULL);
1794 if (mode != Pmode)
1795 tp = gen_lowpart (mode, tp);
1797 switch (type)
1799 case SYMBOL_TLSLE12:
1800 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1801 (dest, tp, imm));
1802 break;
1803 case SYMBOL_TLSLE24:
1804 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1805 (dest, tp, imm));
1806 break;
1807 case SYMBOL_TLSLE32:
1808 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1809 (dest, imm));
1810 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1811 (dest, dest, tp));
1812 break;
1813 case SYMBOL_TLSLE48:
1814 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1815 (dest, imm));
1816 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1817 (dest, dest, tp));
1818 break;
1819 default:
1820 gcc_unreachable ();
1823 if (REG_P (dest))
1824 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1825 return;
1828 case SYMBOL_TINY_GOT:
1829 emit_insn (gen_ldr_got_tiny (dest, imm));
1830 return;
1832 case SYMBOL_TINY_TLSIE:
1834 machine_mode mode = GET_MODE (dest);
1835 rtx tp = aarch64_load_tp (NULL);
1837 if (mode == ptr_mode)
1839 if (mode == DImode)
1840 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1841 else
1843 tp = gen_lowpart (mode, tp);
1844 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1847 else
1849 gcc_assert (mode == Pmode);
1850 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1853 if (REG_P (dest))
1854 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1855 return;
1858 default:
1859 gcc_unreachable ();
1863 /* Emit a move from SRC to DEST. Assume that the move expanders can
1864 handle all moves if !can_create_pseudo_p (). The distinction is
1865 important because, unlike emit_move_insn, the move expanders know
1866 how to force Pmode objects into the constant pool even when the
1867 constant pool address is not itself legitimate. */
1868 static rtx
1869 aarch64_emit_move (rtx dest, rtx src)
1871 return (can_create_pseudo_p ()
1872 ? emit_move_insn (dest, src)
1873 : emit_move_insn_1 (dest, src));
1876 /* Split a 128-bit move operation into two 64-bit move operations,
1877 taking care to handle partial overlap of register to register
1878 copies. Special cases are needed when moving between GP regs and
1879 FP regs. SRC can be a register, constant or memory; DST a register
1880 or memory. If either operand is memory it must not have any side
1881 effects. */
1882 void
1883 aarch64_split_128bit_move (rtx dst, rtx src)
1885 rtx dst_lo, dst_hi;
1886 rtx src_lo, src_hi;
1888 machine_mode mode = GET_MODE (dst);
1890 gcc_assert (mode == TImode || mode == TFmode);
1891 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1892 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1894 if (REG_P (dst) && REG_P (src))
1896 int src_regno = REGNO (src);
1897 int dst_regno = REGNO (dst);
1899 /* Handle FP <-> GP regs. */
1900 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1902 src_lo = gen_lowpart (word_mode, src);
1903 src_hi = gen_highpart (word_mode, src);
1905 if (mode == TImode)
1907 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1908 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1910 else
1912 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1913 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1915 return;
1917 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1919 dst_lo = gen_lowpart (word_mode, dst);
1920 dst_hi = gen_highpart (word_mode, dst);
1922 if (mode == TImode)
1924 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1925 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1927 else
1929 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1930 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1932 return;
1936 dst_lo = gen_lowpart (word_mode, dst);
1937 dst_hi = gen_highpart (word_mode, dst);
1938 src_lo = gen_lowpart (word_mode, src);
1939 src_hi = gen_highpart_mode (word_mode, mode, src);
1941 /* At most one pairing may overlap. */
1942 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1944 aarch64_emit_move (dst_hi, src_hi);
1945 aarch64_emit_move (dst_lo, src_lo);
1947 else
1949 aarch64_emit_move (dst_lo, src_lo);
1950 aarch64_emit_move (dst_hi, src_hi);
1954 bool
1955 aarch64_split_128bit_move_p (rtx dst, rtx src)
1957 return (! REG_P (src)
1958 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1961 /* Split a complex SIMD combine. */
1963 void
1964 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1966 machine_mode src_mode = GET_MODE (src1);
1967 machine_mode dst_mode = GET_MODE (dst);
1969 gcc_assert (VECTOR_MODE_P (dst_mode));
1970 gcc_assert (register_operand (dst, dst_mode)
1971 && register_operand (src1, src_mode)
1972 && register_operand (src2, src_mode));
1974 rtx (*gen) (rtx, rtx, rtx);
1976 switch (src_mode)
1978 case E_V8QImode:
1979 gen = gen_aarch64_simd_combinev8qi;
1980 break;
1981 case E_V4HImode:
1982 gen = gen_aarch64_simd_combinev4hi;
1983 break;
1984 case E_V2SImode:
1985 gen = gen_aarch64_simd_combinev2si;
1986 break;
1987 case E_V4HFmode:
1988 gen = gen_aarch64_simd_combinev4hf;
1989 break;
1990 case E_V2SFmode:
1991 gen = gen_aarch64_simd_combinev2sf;
1992 break;
1993 case E_DImode:
1994 gen = gen_aarch64_simd_combinedi;
1995 break;
1996 case E_DFmode:
1997 gen = gen_aarch64_simd_combinedf;
1998 break;
1999 default:
2000 gcc_unreachable ();
2003 emit_insn (gen (dst, src1, src2));
2004 return;
2007 /* Split a complex SIMD move. */
2009 void
2010 aarch64_split_simd_move (rtx dst, rtx src)
2012 machine_mode src_mode = GET_MODE (src);
2013 machine_mode dst_mode = GET_MODE (dst);
2015 gcc_assert (VECTOR_MODE_P (dst_mode));
2017 if (REG_P (dst) && REG_P (src))
2019 rtx (*gen) (rtx, rtx);
2021 gcc_assert (VECTOR_MODE_P (src_mode));
2023 switch (src_mode)
2025 case E_V16QImode:
2026 gen = gen_aarch64_split_simd_movv16qi;
2027 break;
2028 case E_V8HImode:
2029 gen = gen_aarch64_split_simd_movv8hi;
2030 break;
2031 case E_V4SImode:
2032 gen = gen_aarch64_split_simd_movv4si;
2033 break;
2034 case E_V2DImode:
2035 gen = gen_aarch64_split_simd_movv2di;
2036 break;
2037 case E_V8HFmode:
2038 gen = gen_aarch64_split_simd_movv8hf;
2039 break;
2040 case E_V4SFmode:
2041 gen = gen_aarch64_split_simd_movv4sf;
2042 break;
2043 case E_V2DFmode:
2044 gen = gen_aarch64_split_simd_movv2df;
2045 break;
2046 default:
2047 gcc_unreachable ();
2050 emit_insn (gen (dst, src));
2051 return;
2055 bool
2056 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2057 machine_mode ymode, rtx y)
2059 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2060 gcc_assert (r != NULL);
2061 return rtx_equal_p (x, r);
2065 static rtx
2066 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2068 if (can_create_pseudo_p ())
2069 return force_reg (mode, value);
2070 else
2072 gcc_assert (x);
2073 aarch64_emit_move (x, value);
2074 return x;
2078 /* Return true if we can move VALUE into a register using a single
2079 CNT[BHWD] instruction. */
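/* For example (illustrative): a poly_int64 whose coefficients are both 4
   is the number of 32-bit elements in a vector and can be built with a
   single "cntw"; coefficients of 32 correspond to "cntb ..., all, mul #2";
   coefficients of 34 are rejected because they would need a multiplier
   of 17 with the smallest usable element size.  */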
2081 static bool
2082 aarch64_sve_cnt_immediate_p (poly_int64 value)
2084 HOST_WIDE_INT factor = value.coeffs[0];
2085 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2086 return (value.coeffs[1] == factor
2087 && IN_RANGE (factor, 2, 16 * 16)
2088 && (factor & 1) == 0
2089 && factor <= 16 * (factor & -factor));
2092 /* Likewise for rtx X. */
2094 bool
2095 aarch64_sve_cnt_immediate_p (rtx x)
2097 poly_int64 value;
2098 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2101 /* Return the asm string for an instruction with a CNT-like vector size
2102 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2103 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2104 first part of the operands template (the part that comes before the
2105 vector size itself). FACTOR is the number of quadwords.
2106 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2107 If it is zero, we can use any element size. */
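/* For instance (illustrative), a FACTOR of 8 with NELTS_PER_VQ of 0
   prints as "inch\t%x0" when PREFIX is "inc" and OPERANDS is "%x0",
   while a FACTOR of 64 prints as "incb\t%x0, all, mul #4".  */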
2109 static char *
2110 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2111 unsigned int factor,
2112 unsigned int nelts_per_vq)
2114 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2116 if (nelts_per_vq == 0)
2117 /* There is some overlap in the ranges of the four CNT instructions.
2118 Here we always use the smallest possible element size, so that the
2119 multiplier is 1 wherever possible. */
2120 nelts_per_vq = factor & -factor;
2121 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2122 gcc_assert (IN_RANGE (shift, 1, 4));
2123 char suffix = "dwhb"[shift - 1];
2125 factor >>= shift;
2126 unsigned int written;
2127 if (factor == 1)
2128 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2129 prefix, suffix, operands);
2130 else
2131 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2132 prefix, suffix, operands, factor);
2133 gcc_assert (written < sizeof (buffer));
2134 return buffer;
2137 /* Return the asm string for an instruction with a CNT-like vector size
2138 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2139 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2140 first part of the operands template (the part that comes before the
2141 vector size itself). X is the value of the vector size operand,
2142 as a polynomial integer rtx. */
2144 char *
2145 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2146 rtx x)
2148 poly_int64 value = rtx_to_poly_int64 (x);
2149 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2150 return aarch64_output_sve_cnt_immediate (prefix, operands,
2151 value.coeffs[1], 0);
2154 /* Return true if we can add VALUE to a register using a single ADDVL
2155 or ADDPL instruction. */
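/* Roughly: ADDVL covers [-32, 31] multiples of the vector length
   (FACTOR a multiple of 16 in [-512, 496]) and ADDPL covers [-32, 31]
   multiples of the predicate length (FACTOR a multiple of 2 in
   [-64, 62]).  */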
2157 static bool
2158 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2160 HOST_WIDE_INT factor = value.coeffs[0];
2161 if (factor == 0 || value.coeffs[1] != factor)
2162 return false;
2163 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2164 and a value of 16 is one vector width. */
2165 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2166 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2169 /* Likewise for rtx X. */
2171 bool
2172 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2174 poly_int64 value;
2175 return (poly_int_rtx_p (x, &value)
2176 && aarch64_sve_addvl_addpl_immediate_p (value));
2179 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2180 and storing the result in operand 0. */
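/* For example (illustrative), an OFFSET whose coefficients are both 32
   prints as "addvl\t%x0, %x1, #2" and coefficients of 6 print as
   "addpl\t%x0, %x1, #3", unless DEST equals BASE and an INC/DEC form
   can be used instead.  */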
2182 char *
2183 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2185 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2186 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2187 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2189 /* Use INC or DEC if possible. */
2190 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2192 if (aarch64_sve_cnt_immediate_p (offset_value))
2193 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2194 offset_value.coeffs[1], 0);
2195 if (aarch64_sve_cnt_immediate_p (-offset_value))
2196 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2197 -offset_value.coeffs[1], 0);
2200 int factor = offset_value.coeffs[1];
2201 if ((factor & 15) == 0)
2202 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2203 else
2204 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2205 return buffer;
2208 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2209 instruction. If it is, store the number of elements in each vector
2210 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2211 factor in *FACTOR_OUT (if nonnull). */
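/* For a vector of 32-bit elements (illustrative), NELTS_PER_VQ is 4,
   so a valid duplicated value has both coefficients equal to a multiple
   of 4 whose absolute value is between 4 and 64.  */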
2213 bool
2214 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2215 unsigned int *nelts_per_vq_out)
2217 rtx elt;
2218 poly_int64 value;
2220 if (!const_vec_duplicate_p (x, &elt)
2221 || !poly_int_rtx_p (elt, &value))
2222 return false;
2224 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2225 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2226 /* There's no vector INCB. */
2227 return false;
2229 HOST_WIDE_INT factor = value.coeffs[0];
2230 if (value.coeffs[1] != factor)
2231 return false;
2233 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2234 if ((factor % nelts_per_vq) != 0
2235 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2236 return false;
2238 if (factor_out)
2239 *factor_out = factor;
2240 if (nelts_per_vq_out)
2241 *nelts_per_vq_out = nelts_per_vq;
2242 return true;
2245 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2246 instruction. */
2248 bool
2249 aarch64_sve_inc_dec_immediate_p (rtx x)
2251 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2254 /* Return the asm template for an SVE vector INC or DEC instruction.
2255 OPERANDS gives the operands before the vector count and X is the
2256 value of the vector count operand itself. */
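/* E.g. a duplicated value of 8 in a vector of 32-bit elements prints
   as "incw" with ", all, mul #2" appended after OPERANDS
   (illustrative).  */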
2258 char *
2259 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2261 int factor;
2262 unsigned int nelts_per_vq;
2263 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2264 gcc_unreachable ();
2265 if (factor < 0)
2266 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2267 nelts_per_vq);
2268 else
2269 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2270 nelts_per_vq);
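/* Return the number of instructions needed to build integer immediate
   IMM of mode MODE in DEST, emitting them if GENERATE is true.  For
   example (illustrative), 0x12345678 in SImode needs two instructions:
   a MOVZ of 0x5678 followed by a MOVK of 0x1234 shifted left by 16.  */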
2273 static int
2274 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2275 scalar_int_mode mode)
2277 int i;
2278 unsigned HOST_WIDE_INT val, val2, mask;
2279 int one_match, zero_match;
2280 int num_insns;
2282 val = INTVAL (imm);
2284 if (aarch64_move_imm (val, mode))
2286 if (generate)
2287 emit_insn (gen_rtx_SET (dest, imm));
2288 return 1;
2291 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2292 (with XXXX non-zero). In that case check to see if the move can be done in
2293 a smaller mode. */
2294 val2 = val & 0xffffffff;
2295 if (mode == DImode
2296 && aarch64_move_imm (val2, SImode)
2297 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2299 if (generate)
2300 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2302 /* Check if we have to emit a second instruction by checking to see
2303 if any of the upper 32 bits of the original DI mode value is set. */
2304 if (val == val2)
2305 return 1;
2307 i = (val >> 48) ? 48 : 32;
2309 if (generate)
2310 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2311 GEN_INT ((val >> i) & 0xffff)));
2313 return 2;
2316 if ((val >> 32) == 0 || mode == SImode)
2318 if (generate)
2320 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2321 if (mode == SImode)
2322 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2323 GEN_INT ((val >> 16) & 0xffff)));
2324 else
2325 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2326 GEN_INT ((val >> 16) & 0xffff)));
2328 return 2;
2331 /* Remaining cases are all for DImode. */
2333 mask = 0xffff;
2334 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2335 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2336 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2337 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2339 if (zero_match != 2 && one_match != 2)
2341 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2342 For a 64-bit bitmask try whether changing 16 bits to all ones or
2343 zeroes creates a valid bitmask. To check any repeated bitmask,
2344 try using 16 bits from the other 32-bit half of val. */
2346 for (i = 0; i < 64; i += 16, mask <<= 16)
2348 val2 = val & ~mask;
2349 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2350 break;
2351 val2 = val | mask;
2352 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2353 break;
2354 val2 = val2 & ~mask;
2355 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2356 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2357 break;
2359 if (i != 64)
2361 if (generate)
2363 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2364 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2365 GEN_INT ((val >> i) & 0xffff)));
2367 return 2;
2371 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2372 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2373 otherwise skip zero bits. */
2375 num_insns = 1;
2376 mask = 0xffff;
2377 val2 = one_match > zero_match ? ~val : val;
2378 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2380 if (generate)
2381 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2382 ? (val | ~(mask << i))
2383 : (val & (mask << i)))));
2384 for (i += 16; i < 64; i += 16)
2386 if ((val2 & (mask << i)) == 0)
2387 continue;
2388 if (generate)
2389 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2390 GEN_INT ((val >> i) & 0xffff)));
2391 num_insns ++;
2394 return num_insns;
2397 /* Return whether imm is a 128-bit immediate which is simple enough to
2398 expand inline. */
2399 bool
2400 aarch64_mov128_immediate (rtx imm)
2402 if (GET_CODE (imm) == CONST_INT)
2403 return true;
2405 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2407 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2408 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2410 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2411 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2415 /* Return the number of temporary registers that aarch64_add_offset_1
2416 would need to add OFFSET to a register. */
2418 static unsigned int
2419 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2421 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2424 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2425 a non-polynomial OFFSET. MODE is the mode of the addition.
2426 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2427 be set and CFA adjustments added to the generated instructions.
2429 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2430 temporary if register allocation is already complete. This temporary
2431 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2432 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2433 the immediate again.
2435 Since this function may be used to adjust the stack pointer, we must
2436 ensure that it cannot cause transient stack deallocation (for example
2437 by first incrementing SP and then decrementing when adjusting by a
2438 large immediate). */
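/* For example (illustrative), an OFFSET of 0x123456 that cannot be
   loaded with a single move immediate is split into
   "add dest, src, #0x456" followed by "add dest, dest, #0x123000",
   both of which fit the shifted 12-bit immediate form.  */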
2440 static void
2441 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2442 rtx src, HOST_WIDE_INT offset, rtx temp1,
2443 bool frame_related_p, bool emit_move_imm)
2445 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2446 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2448 HOST_WIDE_INT moffset = abs_hwi (offset);
2449 rtx_insn *insn;
2451 if (!moffset)
2453 if (!rtx_equal_p (dest, src))
2455 insn = emit_insn (gen_rtx_SET (dest, src));
2456 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2458 return;
2461 /* Single instruction adjustment. */
2462 if (aarch64_uimm12_shift (moffset))
2464 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2465 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2466 return;
2469 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2470 and either:
2472 a) the offset cannot be loaded by a 16-bit move or
2473 b) there is no spare register into which we can move it. */
2474 if (moffset < 0x1000000
2475 && ((!temp1 && !can_create_pseudo_p ())
2476 || !aarch64_move_imm (moffset, mode)))
2478 HOST_WIDE_INT low_off = moffset & 0xfff;
2480 low_off = offset < 0 ? -low_off : low_off;
2481 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2482 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2483 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2484 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2485 return;
2488 /* Emit a move immediate if required and an addition/subtraction. */
2489 if (emit_move_imm)
2491 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2492 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2494 insn = emit_insn (offset < 0
2495 ? gen_sub3_insn (dest, src, temp1)
2496 : gen_add3_insn (dest, src, temp1));
2497 if (frame_related_p)
2499 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2500 rtx adj = plus_constant (mode, src, offset);
2501 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2505 /* Return the number of temporary registers that aarch64_add_offset
2506 would need to move OFFSET into a register or add OFFSET to a register;
2507 ADD_P is true if we want the latter rather than the former. */
2509 static unsigned int
2510 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2512 /* This follows the same structure as aarch64_add_offset. */
2513 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2514 return 0;
2516 unsigned int count = 0;
2517 HOST_WIDE_INT factor = offset.coeffs[1];
2518 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2519 poly_int64 poly_offset (factor, factor);
2520 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2521 /* Need one register for the ADDVL/ADDPL result. */
2522 count += 1;
2523 else if (factor != 0)
2525 factor = abs (factor);
2526 if (factor > 16 * (factor & -factor))
2527 /* Need one register for the CNT result and one for the multiplication
2528 factor. If necessary, the second temporary can be reused for the
2529 constant part of the offset. */
2530 return 2;
2531 /* Need one register for the CNT result (which might then
2532 be shifted). */
2533 count += 1;
2535 return count + aarch64_add_offset_1_temporaries (constant);
2538 /* If X can be represented as a poly_int64, return the number
2539 of temporaries that are required to add it to a register.
2540 Return -1 otherwise. */
2543 aarch64_add_offset_temporaries (rtx x)
2545 poly_int64 offset;
2546 if (!poly_int_rtx_p (x, &offset))
2547 return -1;
2548 return aarch64_offset_temporaries (true, offset);
2551 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2552 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2553 be set and CFA adjustments added to the generated instructions.
2555 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2556 temporary if register allocation is already complete. This temporary
2557 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2558 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2559 false to avoid emitting the immediate again.
2561 TEMP2, if nonnull, is a second temporary register that doesn't
2562 overlap either DEST or SRC.
2564 Since this function may be used to adjust the stack pointer, we must
2565 ensure that it cannot cause transient stack deallocation (for example
2566 by first incrementing SP and then decrementing when adjusting by a
2567 large immediate). */
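/* For example (illustrative, assuming SRC is a register rather than
   constant zero): adding an OFFSET equal to the size in bytes of one
   SVE vector plus 1 uses an ADDVL #1 for the variable part followed by
   an ADD #1 for the constant part.  */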
2569 static void
2570 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2571 poly_int64 offset, rtx temp1, rtx temp2,
2572 bool frame_related_p, bool emit_move_imm = true)
2574 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2575 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2576 gcc_assert (temp1 == NULL_RTX
2577 || !frame_related_p
2578 || !reg_overlap_mentioned_p (temp1, dest));
2579 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2581 /* Try using ADDVL or ADDPL to add the whole value. */
2582 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2584 rtx offset_rtx = gen_int_mode (offset, mode);
2585 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2586 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2587 return;
2590 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2591 SVE vector register, over and above the minimum size of 128 bits.
2592 This is equivalent to half the value returned by CNTD with a
2593 vector shape of ALL. */
2594 HOST_WIDE_INT factor = offset.coeffs[1];
2595 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2597 /* Try using ADDVL or ADDPL to add the VG-based part. */
2598 poly_int64 poly_offset (factor, factor);
2599 if (src != const0_rtx
2600 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2602 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2603 if (frame_related_p)
2605 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2606 RTX_FRAME_RELATED_P (insn) = true;
2607 src = dest;
2609 else
2611 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2612 src = aarch64_force_temporary (mode, temp1, addr);
2613 temp1 = temp2;
2614 temp2 = NULL_RTX;
2617 /* Otherwise use a CNT-based sequence. */
2618 else if (factor != 0)
2620 /* Use a subtraction if we have a negative factor. */
2621 rtx_code code = PLUS;
2622 if (factor < 0)
2624 factor = -factor;
2625 code = MINUS;
2628 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2629 into the multiplication. */
2630 rtx val;
2631 int shift = 0;
2632 if (factor & 1)
2633 /* Use a right shift by 1. */
2634 shift = -1;
2635 else
2636 factor /= 2;
2637 HOST_WIDE_INT low_bit = factor & -factor;
2638 if (factor <= 16 * low_bit)
2640 if (factor > 16 * 8)
2642 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2643 the value with the minimum multiplier and shift it into
2644 position. */
2645 int extra_shift = exact_log2 (low_bit);
2646 shift += extra_shift;
2647 factor >>= extra_shift;
2649 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2651 else
2653 /* Use CNTD, then multiply it by FACTOR. */
2654 val = gen_int_mode (poly_int64 (2, 2), mode);
2655 val = aarch64_force_temporary (mode, temp1, val);
2657 /* Go back to using a negative multiplication factor if we have
2658 no register from which to subtract. */
2659 if (code == MINUS && src == const0_rtx)
2661 factor = -factor;
2662 code = PLUS;
2664 rtx coeff1 = gen_int_mode (factor, mode);
2665 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2666 val = gen_rtx_MULT (mode, val, coeff1);
2669 if (shift > 0)
2671 /* Multiply by 1 << SHIFT. */
2672 val = aarch64_force_temporary (mode, temp1, val);
2673 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2675 else if (shift == -1)
2677 /* Divide by 2. */
2678 val = aarch64_force_temporary (mode, temp1, val);
2679 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2682 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2683 if (src != const0_rtx)
2685 val = aarch64_force_temporary (mode, temp1, val);
2686 val = gen_rtx_fmt_ee (code, mode, src, val);
2688 else if (code == MINUS)
2690 val = aarch64_force_temporary (mode, temp1, val);
2691 val = gen_rtx_NEG (mode, val);
2694 if (constant == 0 || frame_related_p)
2696 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2697 if (frame_related_p)
2699 RTX_FRAME_RELATED_P (insn) = true;
2700 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2701 gen_rtx_SET (dest, plus_constant (Pmode, src,
2702 poly_offset)));
2704 src = dest;
2705 if (constant == 0)
2706 return;
2708 else
2710 src = aarch64_force_temporary (mode, temp1, val);
2711 temp1 = temp2;
2712 temp2 = NULL_RTX;
2715 emit_move_imm = true;
2718 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2719 frame_related_p, emit_move_imm);
2722 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2723 than a poly_int64. */
2725 void
2726 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2727 rtx offset_rtx, rtx temp1, rtx temp2)
2729 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2730 temp1, temp2, false);
2733 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2734 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2735 if TEMP1 already contains abs (DELTA). */
2737 static inline void
2738 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2740 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2741 temp1, temp2, true, emit_move_imm);
2744 /* Subtract DELTA from the stack pointer, marking the instructions
2745 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2746 if nonnull. */
2748 static inline void
2749 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2751 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2752 temp1, temp2, frame_related_p);
2755 /* Set DEST to (vec_series BASE STEP). */
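/* Typically this maps onto the SVE INDEX instruction; e.g. BASE 0 and
   STEP 1 for a vector of 32-bit elements can be generated with
   "index z0.s, #0, #1" (illustrative register).  */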
2757 static void
2758 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2760 machine_mode mode = GET_MODE (dest);
2761 scalar_mode inner = GET_MODE_INNER (mode);
2763 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2764 if (!aarch64_sve_index_immediate_p (base))
2765 base = force_reg (inner, base);
2766 if (!aarch64_sve_index_immediate_p (step))
2767 step = force_reg (inner, step);
2769 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2772 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2773 integer of mode SRC_MODE. Return true on success. */
2775 static bool
2776 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2777 rtx src)
2779 /* If the constant is smaller than 128 bits, we can do the move
2780 using a vector of SRC_MODEs. */
2781 if (src_mode != TImode)
2783 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2784 GET_MODE_SIZE (src_mode));
2785 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2786 emit_move_insn (gen_lowpart (dup_mode, dest),
2787 gen_const_vec_duplicate (dup_mode, src));
2788 return true;
2791 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2792 src = force_const_mem (src_mode, src);
2793 if (!src)
2794 return false;
2796 /* Make sure that the address is legitimate. */
2797 if (!aarch64_sve_ld1r_operand_p (src))
2799 rtx addr = force_reg (Pmode, XEXP (src, 0));
2800 src = replace_equiv_address (src, addr);
2803 machine_mode mode = GET_MODE (dest);
2804 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2805 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2806 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2807 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2808 emit_insn (gen_rtx_SET (dest, src));
2809 return true;
2812 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2813 isn't a simple duplicate or series. */
2815 static void
2816 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2818 machine_mode mode = GET_MODE (src);
2819 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2820 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2821 gcc_assert (npatterns > 1);
2823 if (nelts_per_pattern == 1)
2825 /* The constant is a repeating sequence of at least two elements,
2826 where the repeating elements occupy no more than 128 bits.
2827 Get an integer representation of the replicated value. */
2828 scalar_int_mode int_mode;
2829 if (BYTES_BIG_ENDIAN)
2830 /* For now, always use LD1RQ to load the value on big-endian
2831 targets, since the handling of smaller integers includes a
2832 subreg that is semantically an element reverse. */
2833 int_mode = TImode;
2834 else
2836 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2837 gcc_assert (int_bits <= 128);
2838 int_mode = int_mode_for_size (int_bits, 0).require ();
2840 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2841 if (int_value
2842 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2843 return;
2846 /* Expand each pattern individually. */
2847 rtx_vector_builder builder;
2848 auto_vec<rtx, 16> vectors (npatterns);
2849 for (unsigned int i = 0; i < npatterns; ++i)
2851 builder.new_vector (mode, 1, nelts_per_pattern);
2852 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2853 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2854 vectors.quick_push (force_reg (mode, builder.build ()));
2857 /* Use permutes to interleave the separate vectors. */
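/* For example (illustrative), with four patterns replicated into
   vectors A, B, C and D, the first round computes ZIP1 (A, C) and
   ZIP1 (B, D) and the final round zips those two results, leaving the
   elements interleaved in their original A, B, C, D order.  */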
2858 while (npatterns > 1)
2860 npatterns /= 2;
2861 for (unsigned int i = 0; i < npatterns; ++i)
2863 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2864 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2865 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2866 vectors[i] = tmp;
2869 gcc_assert (vectors[0] == dest);
2872 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2873 is a pattern that can be used to set DEST to a replicated scalar
2874 element. */
2876 void
2877 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2878 rtx (*gen_vec_duplicate) (rtx, rtx))
2880 machine_mode mode = GET_MODE (dest);
2882 /* Check on what type of symbol it is. */
2883 scalar_int_mode int_mode;
2884 if ((GET_CODE (imm) == SYMBOL_REF
2885 || GET_CODE (imm) == LABEL_REF
2886 || GET_CODE (imm) == CONST
2887 || GET_CODE (imm) == CONST_POLY_INT)
2888 && is_a <scalar_int_mode> (mode, &int_mode))
2890 rtx mem;
2891 poly_int64 offset;
2892 HOST_WIDE_INT const_offset;
2893 enum aarch64_symbol_type sty;
2895 /* If we have (const (plus symbol offset)), separate out the offset
2896 before we start classifying the symbol. */
2897 rtx base = strip_offset (imm, &offset);
2899 /* We must always add an offset involving VL separately, rather than
2900 folding it into the relocation. */
2901 if (!offset.is_constant (&const_offset))
2903 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2904 emit_insn (gen_rtx_SET (dest, imm));
2905 else
2907 /* Do arithmetic on 32-bit values if the result is smaller
2908 than that. */
2909 if (partial_subreg_p (int_mode, SImode))
2911 /* It is invalid to do symbol calculations in modes
2912 narrower than SImode. */
2913 gcc_assert (base == const0_rtx);
2914 dest = gen_lowpart (SImode, dest);
2915 int_mode = SImode;
2917 if (base != const0_rtx)
2919 base = aarch64_force_temporary (int_mode, dest, base);
2920 aarch64_add_offset (int_mode, dest, base, offset,
2921 NULL_RTX, NULL_RTX, false);
2923 else
2924 aarch64_add_offset (int_mode, dest, base, offset,
2925 dest, NULL_RTX, false);
2927 return;
2930 sty = aarch64_classify_symbol (base, const_offset);
2931 switch (sty)
2933 case SYMBOL_FORCE_TO_MEM:
2934 if (const_offset != 0
2935 && targetm.cannot_force_const_mem (int_mode, imm))
2937 gcc_assert (can_create_pseudo_p ());
2938 base = aarch64_force_temporary (int_mode, dest, base);
2939 aarch64_add_offset (int_mode, dest, base, const_offset,
2940 NULL_RTX, NULL_RTX, false);
2941 return;
2944 mem = force_const_mem (ptr_mode, imm);
2945 gcc_assert (mem);
2947 /* If we aren't generating PC relative literals, then
2948 we need to expand the literal pool access carefully.
2949 This is something that needs to be done in a number
2950 of places, so could well live as a separate function. */
2951 if (!aarch64_pcrelative_literal_loads)
2953 gcc_assert (can_create_pseudo_p ());
2954 base = gen_reg_rtx (ptr_mode);
2955 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2956 if (ptr_mode != Pmode)
2957 base = convert_memory_address (Pmode, base);
2958 mem = gen_rtx_MEM (ptr_mode, base);
2961 if (int_mode != ptr_mode)
2962 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2964 emit_insn (gen_rtx_SET (dest, mem));
2966 return;
2968 case SYMBOL_SMALL_TLSGD:
2969 case SYMBOL_SMALL_TLSDESC:
2970 case SYMBOL_SMALL_TLSIE:
2971 case SYMBOL_SMALL_GOT_28K:
2972 case SYMBOL_SMALL_GOT_4G:
2973 case SYMBOL_TINY_GOT:
2974 case SYMBOL_TINY_TLSIE:
2975 if (const_offset != 0)
2977 gcc_assert(can_create_pseudo_p ());
2978 base = aarch64_force_temporary (int_mode, dest, base);
2979 aarch64_add_offset (int_mode, dest, base, const_offset,
2980 NULL_RTX, NULL_RTX, false);
2981 return;
2983 /* FALLTHRU */
2985 case SYMBOL_SMALL_ABSOLUTE:
2986 case SYMBOL_TINY_ABSOLUTE:
2987 case SYMBOL_TLSLE12:
2988 case SYMBOL_TLSLE24:
2989 case SYMBOL_TLSLE32:
2990 case SYMBOL_TLSLE48:
2991 aarch64_load_symref_appropriately (dest, imm, sty);
2992 return;
2994 default:
2995 gcc_unreachable ();
2999 if (!CONST_INT_P (imm))
3001 rtx base, step, value;
3002 if (GET_CODE (imm) == HIGH
3003 || aarch64_simd_valid_immediate (imm, NULL))
3004 emit_insn (gen_rtx_SET (dest, imm));
3005 else if (const_vec_series_p (imm, &base, &step))
3006 aarch64_expand_vec_series (dest, base, step);
3007 else if (const_vec_duplicate_p (imm, &value))
3009 /* If the constant is out of range of an SVE vector move,
3010 load it from memory if we can, otherwise move it into
3011 a register and use a DUP. */
3012 scalar_mode inner_mode = GET_MODE_INNER (mode);
3013 rtx op = force_const_mem (inner_mode, value);
3014 if (!op)
3015 op = force_reg (inner_mode, value);
3016 else if (!aarch64_sve_ld1r_operand_p (op))
3018 rtx addr = force_reg (Pmode, XEXP (op, 0));
3019 op = replace_equiv_address (op, addr);
3021 emit_insn (gen_vec_duplicate (dest, op));
3023 else if (GET_CODE (imm) == CONST_VECTOR
3024 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3025 aarch64_expand_sve_const_vector (dest, imm);
3026 else
3028 rtx mem = force_const_mem (mode, imm);
3029 gcc_assert (mem);
3030 emit_move_insn (dest, mem);
3033 return;
3036 aarch64_internal_mov_immediate (dest, imm, true,
3037 as_a <scalar_int_mode> (mode));
3040 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3041 that is known to contain PTRUE. */
3043 void
3044 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3046 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3047 gen_rtvec (2, pred, src),
3048 UNSPEC_MERGE_PTRUE)));
3051 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3052 operand is in memory. In this case we need to use the predicated LD1
3053 and ST1 instead of LDR and STR, both for correctness on big-endian
3054 targets and because LD1 and ST1 support a wider range of addressing modes.
3055 PRED_MODE is the mode of the predicate.
3057 See the comment at the head of aarch64-sve.md for details about the
3058 big-endian handling. */
3060 void
3061 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3063 machine_mode mode = GET_MODE (dest);
3064 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3065 if (!register_operand (src, mode)
3066 && !register_operand (dest, mode))
3068 rtx tmp = gen_reg_rtx (mode);
3069 if (MEM_P (src))
3070 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3071 else
3072 emit_move_insn (tmp, src);
3073 src = tmp;
3075 aarch64_emit_sve_pred_move (dest, ptrue, src);
3078 /* Called only on big-endian targets. See whether an SVE vector move
3079 from SRC to DEST is effectively a REV[BHW] instruction, because at
3080 least one operand is a subreg of an SVE vector that has wider or
3081 narrower elements. Return true and emit the instruction if so.
3083 For example:
3085 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3087 represents a VIEW_CONVERT between the following vectors, viewed
3088 in memory order:
3090 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3091 R1: { [0], [1], [2], [3], ... }
3093 The high part of lane X in R2 should therefore correspond to lane X*2
3094 of R1, but the register representations are:
3096 msb lsb
3097 R2: ...... [1].high [1].low [0].high [0].low
3098 R1: ...... [3] [2] [1] [0]
3100 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3101 We therefore need a reverse operation to swap the high and low values
3102 around.
3104 This is purely an optimization. Without it we would spill the
3105 subreg operand to the stack in one mode and reload it in the
3106 other mode, which has the same effect as the REV. */
3108 bool
3109 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3111 gcc_assert (BYTES_BIG_ENDIAN);
3112 if (GET_CODE (dest) == SUBREG)
3113 dest = SUBREG_REG (dest);
3114 if (GET_CODE (src) == SUBREG)
3115 src = SUBREG_REG (src);
3117 /* The optimization handles two single SVE REGs with different element
3118 sizes. */
3119 if (!REG_P (dest)
3120 || !REG_P (src)
3121 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3122 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3123 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3124 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3125 return false;
3127 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3128 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3129 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3130 UNSPEC_REV_SUBREG);
3131 emit_insn (gen_rtx_SET (dest, unspec));
3132 return true;
3135 /* Return a copy of X with mode MODE, without changing its other
3136 attributes. Unlike gen_lowpart, this doesn't care whether the
3137 mode change is valid. */
3139 static rtx
3140 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3142 if (GET_MODE (x) == mode)
3143 return x;
3145 x = shallow_copy_rtx (x);
3146 set_mode_and_regno (x, mode, REGNO (x));
3147 return x;
3150 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3151 operands. */
3153 void
3154 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3156 /* Decide which REV operation we need. The mode with narrower elements
3157 determines the mode of the operands and the mode with the wider
3158 elements determines the reverse width. */
3159 machine_mode mode_with_wider_elts = GET_MODE (dest);
3160 machine_mode mode_with_narrower_elts = GET_MODE (src);
3161 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3162 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3163 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3165 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3166 unsigned int unspec;
3167 if (wider_bytes == 8)
3168 unspec = UNSPEC_REV64;
3169 else if (wider_bytes == 4)
3170 unspec = UNSPEC_REV32;
3171 else if (wider_bytes == 2)
3172 unspec = UNSPEC_REV16;
3173 else
3174 gcc_unreachable ();
3175 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3177 /* Emit:
3179 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3180 UNSPEC_MERGE_PTRUE))
3182 with the appropriate modes. */
3183 ptrue = gen_lowpart (pred_mode, ptrue);
3184 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3185 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3186 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3187 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3188 UNSPEC_MERGE_PTRUE);
3189 emit_insn (gen_rtx_SET (dest, src));
3192 static bool
3193 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3194 tree exp ATTRIBUTE_UNUSED)
3196 /* Currently, always true. */
3197 return true;
3200 /* Implement TARGET_PASS_BY_REFERENCE. */
3202 static bool
3203 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3204 machine_mode mode,
3205 const_tree type,
3206 bool named ATTRIBUTE_UNUSED)
3208 HOST_WIDE_INT size;
3209 machine_mode dummymode;
3210 int nregs;
3212 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3213 if (mode == BLKmode && type)
3214 size = int_size_in_bytes (type);
3215 else
3216 /* No frontends can create types with variable-sized modes, so we
3217 shouldn't be asked to pass or return them. */
3218 size = GET_MODE_SIZE (mode).to_constant ();
3220 /* Aggregates are passed by reference based on their size. */
3221 if (type && AGGREGATE_TYPE_P (type))
3223 size = int_size_in_bytes (type);
3226 /* Variable sized arguments are always returned by reference. */
3227 if (size < 0)
3228 return true;
3230 /* Can this be a candidate to be passed in fp/simd register(s)? */
3231 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3232 &dummymode, &nregs,
3233 NULL))
3234 return false;
3236 /* Arguments which are variable sized or larger than 2 registers are
3237 passed by reference unless they are a homogeneous floating-point
3238 aggregate. */
3239 return size > 2 * UNITS_PER_WORD;
3242 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3243 static bool
3244 aarch64_return_in_msb (const_tree valtype)
3246 machine_mode dummy_mode;
3247 int dummy_int;
3249 /* Never happens in little-endian mode. */
3250 if (!BYTES_BIG_ENDIAN)
3251 return false;
3253 /* Only composite types smaller than or equal to 16 bytes can
3254 be potentially returned in registers. */
3255 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3256 || int_size_in_bytes (valtype) <= 0
3257 || int_size_in_bytes (valtype) > 16)
3258 return false;
3260 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3261 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3262 is always passed/returned in the least significant bits of fp/simd
3263 register(s). */
3264 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3265 &dummy_mode, &dummy_int, NULL))
3266 return false;
3268 return true;
3271 /* Implement TARGET_FUNCTION_VALUE.
3272 Define how to find the value returned by a function. */
3274 static rtx
3275 aarch64_function_value (const_tree type, const_tree func,
3276 bool outgoing ATTRIBUTE_UNUSED)
3278 machine_mode mode;
3279 int unsignedp;
3280 int count;
3281 machine_mode ag_mode;
3283 mode = TYPE_MODE (type);
3284 if (INTEGRAL_TYPE_P (type))
3285 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3287 if (aarch64_return_in_msb (type))
3289 HOST_WIDE_INT size = int_size_in_bytes (type);
3291 if (size % UNITS_PER_WORD != 0)
3293 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3294 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3298 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3299 &ag_mode, &count, NULL))
3301 if (!aarch64_composite_type_p (type, mode))
3303 gcc_assert (count == 1 && mode == ag_mode);
3304 return gen_rtx_REG (mode, V0_REGNUM);
3306 else
3308 int i;
3309 rtx par;
3311 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3312 for (i = 0; i < count; i++)
3314 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3315 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3316 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3317 XVECEXP (par, 0, i) = tmp;
3319 return par;
3322 else
3323 return gen_rtx_REG (mode, R0_REGNUM);
3326 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3327 Return true if REGNO is the number of a hard register in which the values
3328 of called function may come back. */
3330 static bool
3331 aarch64_function_value_regno_p (const unsigned int regno)
3333 /* A maximum of 16 bytes can be returned in the general registers. Examples
3334 of 16-byte return values are: 128-bit integers and 16-byte small
3335 structures (excluding homogeneous floating-point aggregates). */
3336 if (regno == R0_REGNUM || regno == R1_REGNUM)
3337 return true;
3339 /* Up to four fp/simd registers can return a function value, e.g. a
3340 homogeneous floating-point aggregate having four members. */
3341 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3342 return TARGET_FLOAT;
3344 return false;
3347 /* Implement TARGET_RETURN_IN_MEMORY.
3349 If the type T of the result of a function is such that
3350 void func (T arg)
3351 would require that arg be passed as a value in a register (or set of
3352 registers) according to the parameter passing rules, then the result
3353 is returned in the same registers as would be used for such an
3354 argument. */
3356 static bool
3357 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3359 HOST_WIDE_INT size;
3360 machine_mode ag_mode;
3361 int count;
3363 if (!AGGREGATE_TYPE_P (type)
3364 && TREE_CODE (type) != COMPLEX_TYPE
3365 && TREE_CODE (type) != VECTOR_TYPE)
3366 /* Simple scalar types are always returned in registers. */
3367 return false;
3369 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3370 type,
3371 &ag_mode,
3372 &count,
3373 NULL))
3374 return false;
3376 /* Types larger than 2 registers are returned in memory. */
3377 size = int_size_in_bytes (type);
3378 return (size < 0 || size > 2 * UNITS_PER_WORD);
3381 static bool
3382 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3383 const_tree type, int *nregs)
3385 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3386 return aarch64_vfp_is_call_or_return_candidate (mode,
3387 type,
3388 &pcum->aapcs_vfp_rmode,
3389 nregs,
3390 NULL);
3393 /* Given MODE and TYPE of a function argument, return the alignment in
3394 bits. The idea is to suppress any stronger alignment requested by
3395 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3396 This is a helper function for local use only. */
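/* Roughly: a struct whose most strictly aligned field is a double is
   treated as having 64-bit alignment here, even if the struct type
   itself carries a larger user-specified alignment (illustrative).  */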
3398 static unsigned int
3399 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3401 if (!type)
3402 return GET_MODE_ALIGNMENT (mode);
3404 if (integer_zerop (TYPE_SIZE (type)))
3405 return 0;
3407 gcc_assert (TYPE_MODE (type) == mode);
3409 if (!AGGREGATE_TYPE_P (type))
3410 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3412 if (TREE_CODE (type) == ARRAY_TYPE)
3413 return TYPE_ALIGN (TREE_TYPE (type));
3415 unsigned int alignment = 0;
3416 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3417 if (TREE_CODE (field) == FIELD_DECL)
3418 alignment = std::max (alignment, DECL_ALIGN (field));
3420 return alignment;
3423 /* Layout a function argument according to the AAPCS64 rules. The rule
3424 numbers refer to the rule numbers in the AAPCS64. */
3426 static void
3427 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3428 const_tree type,
3429 bool named ATTRIBUTE_UNUSED)
3431 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3432 int ncrn, nvrn, nregs;
3433 bool allocate_ncrn, allocate_nvrn;
3434 HOST_WIDE_INT size;
3436 /* We need to do this once per argument. */
3437 if (pcum->aapcs_arg_processed)
3438 return;
3440 pcum->aapcs_arg_processed = true;
3442 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3443 if (type)
3444 size = int_size_in_bytes (type);
3445 else
3446 /* No frontends can create types with variable-sized modes, so we
3447 shouldn't be asked to pass or return them. */
3448 size = GET_MODE_SIZE (mode).to_constant ();
3449 size = ROUND_UP (size, UNITS_PER_WORD);
3451 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3452 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3453 mode,
3454 type,
3455 &nregs);
3457 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3458 The following code thus handles passing by SIMD/FP registers first. */
3460 nvrn = pcum->aapcs_nvrn;
3462 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3463 and homogeneous short-vector aggregates (HVA). */
3464 if (allocate_nvrn)
3466 if (!TARGET_FLOAT)
3467 aarch64_err_no_fpadvsimd (mode, "argument");
3469 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3471 pcum->aapcs_nextnvrn = nvrn + nregs;
3472 if (!aarch64_composite_type_p (type, mode))
3474 gcc_assert (nregs == 1);
3475 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3477 else
3479 rtx par;
3480 int i;
3481 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3482 for (i = 0; i < nregs; i++)
3484 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3485 V0_REGNUM + nvrn + i);
3486 rtx offset = gen_int_mode
3487 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3488 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3489 XVECEXP (par, 0, i) = tmp;
3491 pcum->aapcs_reg = par;
3493 return;
3495 else
3497 /* C.3 NSRN is set to 8. */
3498 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3499 goto on_stack;
3503 ncrn = pcum->aapcs_ncrn;
3504 nregs = size / UNITS_PER_WORD;
3506 /* C6 - C9, though the sign and zero extension semantics are
3507 handled elsewhere. This is the case where the argument fits
3508 entirely in general registers. */
3509 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3512 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3514 /* C.8 if the argument has an alignment of 16 then the NGRN is
3515 rounded up to the next even number. */
3516 if (nregs == 2
3517 && ncrn % 2
3518 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3519 comparison is there because for > 16 * BITS_PER_UNIT
3520 alignment nregs should be > 2 and therefore it should be
3521 passed by reference rather than value. */
3522 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3524 ++ncrn;
3525 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3528 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3529 A reg is still generated for it, but the caller should be smart
3530 enough not to use it. */
3531 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3532 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3533 else
3535 rtx par;
3536 int i;
3538 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3539 for (i = 0; i < nregs; i++)
3541 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3542 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3543 GEN_INT (i * UNITS_PER_WORD));
3544 XVECEXP (par, 0, i) = tmp;
3546 pcum->aapcs_reg = par;
3549 pcum->aapcs_nextncrn = ncrn + nregs;
3550 return;
3553 /* C.11 */
3554 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3556 /* The argument is passed on stack; record the needed number of words for
3557 this argument and align the total size if necessary. */
3558 on_stack:
3559 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3561 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3562 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3563 16 / UNITS_PER_WORD);
3564 return;
3567 /* Implement TARGET_FUNCTION_ARG. */
3569 static rtx
3570 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3571 const_tree type, bool named)
3573 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3574 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3576 if (mode == VOIDmode)
3577 return NULL_RTX;
3579 aarch64_layout_arg (pcum_v, mode, type, named);
3580 return pcum->aapcs_reg;
3583 void
3584 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3585 const_tree fntype ATTRIBUTE_UNUSED,
3586 rtx libname ATTRIBUTE_UNUSED,
3587 const_tree fndecl ATTRIBUTE_UNUSED,
3588 unsigned n_named ATTRIBUTE_UNUSED)
3590 pcum->aapcs_ncrn = 0;
3591 pcum->aapcs_nvrn = 0;
3592 pcum->aapcs_nextncrn = 0;
3593 pcum->aapcs_nextnvrn = 0;
3594 pcum->pcs_variant = ARM_PCS_AAPCS64;
3595 pcum->aapcs_reg = NULL_RTX;
3596 pcum->aapcs_arg_processed = false;
3597 pcum->aapcs_stack_words = 0;
3598 pcum->aapcs_stack_size = 0;
3600 if (!TARGET_FLOAT
3601 && fndecl && TREE_PUBLIC (fndecl)
3602 && fntype && fntype != error_mark_node)
3604 const_tree type = TREE_TYPE (fntype);
3605 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3606 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3607 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3608 &mode, &nregs, NULL))
3609 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3611 return;
3614 static void
3615 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3616 machine_mode mode,
3617 const_tree type,
3618 bool named)
3620 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3621 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3623 aarch64_layout_arg (pcum_v, mode, type, named);
3624 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3625 != (pcum->aapcs_stack_words != 0));
3626 pcum->aapcs_arg_processed = false;
3627 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3628 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3629 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3630 pcum->aapcs_stack_words = 0;
3631 pcum->aapcs_reg = NULL_RTX;
3635 bool
3636 aarch64_function_arg_regno_p (unsigned regno)
3638 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3639 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3642 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3643 PARM_BOUNDARY bits of alignment, but will be given anything up
3644 to STACK_BOUNDARY bits if the type requires it. This makes sure
3645 that both before and after the layout of each argument, the Next
3646 Stacked Argument Address (NSAA) will have a minimum alignment of
3647 8 bytes. */
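/* With the usual 64-bit PARM_BOUNDARY and 128-bit STACK_BOUNDARY
   (illustrative), a plain int argument gets 64 bits of alignment while
   a 16-byte-aligned aggregate gets 128 bits.  */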
3649 static unsigned int
3650 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3652 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3653 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3656 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3658 static fixed_size_mode
3659 aarch64_get_reg_raw_mode (int regno)
3661 if (TARGET_SVE && FP_REGNUM_P (regno))
3662 /* Don't use the SVE part of the register for __builtin_apply and
3663 __builtin_return. The SVE registers aren't used by the normal PCS,
3664 so using them there would be a waste of time. The PCS extensions
3665 for SVE types are fundamentally incompatible with the
3666 __builtin_return/__builtin_apply interface. */
3667 return as_a <fixed_size_mode> (V16QImode);
3668 return default_get_reg_raw_mode (regno);
3671 /* Implement TARGET_FUNCTION_ARG_PADDING.
3673 Small aggregate types are placed in the lowest memory address.
3675 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3677 static pad_direction
3678 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3680 /* On little-endian targets, the least significant byte of every stack
3681 argument is passed at the lowest byte address of the stack slot. */
3682 if (!BYTES_BIG_ENDIAN)
3683 return PAD_UPWARD;
3685 /* Otherwise, integral, floating-point and pointer types are padded downward:
3686 the least significant byte of a stack argument is passed at the highest
3687 byte address of the stack slot. */
3688 if (type
3689 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3690 || POINTER_TYPE_P (type))
3691 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3692 return PAD_DOWNWARD;
3694 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3695 return PAD_UPWARD;
3698 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3700 It specifies padding for the last (may also be the only)
3701 element of a block move between registers and memory. Assuming
3702 the block is in memory, padding upward means that the last
3703 element is padded after its most significant byte, while with
3704 downward padding the last element is padded at its least
3705 significant byte side.
3707 Small aggregates and small complex types are always padded
3708 upwards.
3710 We don't need to worry about homogeneous floating-point or
3711 short-vector aggregates; their move is not affected by the
3712 padding direction determined here. Regardless of endianness,
3713 each element of such an aggregate is put in the least
3714 significant bits of a fp/simd register.
3716 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3717 register has useful data, and return the opposite if the most
3718 significant byte does. */
3720 bool
3721 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3722 bool first ATTRIBUTE_UNUSED)
3725 /* Small composite types are always padded upward. */
3726 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3728 HOST_WIDE_INT size;
3729 if (type)
3730 size = int_size_in_bytes (type);
3731 else
3732 /* No frontends can create types with variable-sized modes, so we
3733 shouldn't be asked to pass or return them. */
3734 size = GET_MODE_SIZE (mode).to_constant ();
3735 if (size < 2 * UNITS_PER_WORD)
3736 return true;
3739 /* Otherwise, use the default padding. */
3740 return !BYTES_BIG_ENDIAN;
3743 static scalar_int_mode
3744 aarch64_libgcc_cmp_return_mode (void)
3746 return SImode;
3749 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3751 /* We use the 12-bit shifted immediate arithmetic instructions so values
3752 must be multiple of (1 << 12), i.e. 4096. */
3753 #define ARITH_FACTOR 4096
3755 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3756 #error Cannot use simple address calculation for stack probing
3757 #endif
3759 /* The pair of scratch registers used for stack probing. */
3760 #define PROBE_STACK_FIRST_REG 9
3761 #define PROBE_STACK_SECOND_REG 10
3763 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3764 inclusive. These are offsets from the current stack pointer. */
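/* For example (illustrative), with a 4 KiB probe interval, a FIRST of
   0 and a SIZE of 10 KiB, probes are emitted 4 KiB, 8 KiB and 10 KiB
   below the stack pointer.  */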
3766 static void
3767 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3769 HOST_WIDE_INT size;
3770 if (!poly_size.is_constant (&size))
3772 sorry ("stack probes for SVE frames");
3773 return;
3776 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3778 /* See the same assertion on PROBE_INTERVAL above. */
3779 gcc_assert ((first % ARITH_FACTOR) == 0);
3781 /* See if we have a constant small number of probes to generate. If so,
3782 that's the easy case. */
3783 if (size <= PROBE_INTERVAL)
3785 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3787 emit_set_insn (reg1,
3788 plus_constant (Pmode,
3789 stack_pointer_rtx, -(first + base)));
3790 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3793 /* The run-time loop is made up of 8 insns in the generic case while the
3794 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3795 else if (size <= 4 * PROBE_INTERVAL)
3797 HOST_WIDE_INT i, rem;
3799 emit_set_insn (reg1,
3800 plus_constant (Pmode,
3801 stack_pointer_rtx,
3802 -(first + PROBE_INTERVAL)));
3803 emit_stack_probe (reg1);
3805 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3806 it exceeds SIZE. If only two probes are needed, this will not
3807 generate any code. Then probe at FIRST + SIZE. */
3808 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3810 emit_set_insn (reg1,
3811 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3812 emit_stack_probe (reg1);
3815 rem = size - (i - PROBE_INTERVAL);
3816 if (rem > 256)
3818 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3820 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3821 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3823 else
3824 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3827 /* Otherwise, do the same as above, but in a loop. Note that we must be
3828 extra careful with variables wrapping around because we might be at
3829 the very top (or the very bottom) of the address space and we have
3830 to be able to handle this case properly; in particular, we use an
3831 equality test for the loop condition. */
3832 else
3834 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3836 /* Step 1: round SIZE to the previous multiple of the interval. */
3838 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3841 /* Step 2: compute initial and final value of the loop counter. */
3843 /* TEST_ADDR = SP + FIRST. */
3844 emit_set_insn (reg1,
3845 plus_constant (Pmode, stack_pointer_rtx, -first));
3847 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3848 HOST_WIDE_INT adjustment = - (first + rounded_size);
3849 if (! aarch64_uimm12_shift (adjustment))
3851 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3852 true, Pmode);
3853 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3855 else
3856 emit_set_insn (reg2,
3857 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3859 /* Step 3: the loop
3863 do { TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3864 probe at TEST_ADDR }
3866 while (TEST_ADDR != LAST_ADDR)
3868 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3869 until it is equal to ROUNDED_SIZE. */
3871 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3874 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3875 that SIZE is equal to ROUNDED_SIZE. */
3877 if (size != rounded_size)
3879 HOST_WIDE_INT rem = size - rounded_size;
3881 if (rem > 256)
3883 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3885 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3886 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3888 else
3889 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3893 /* Make sure nothing is scheduled before we are done. */
3894 emit_insn (gen_blockage ());
3897 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3898 absolute addresses. */
3900 const char *
3901 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3903 static int labelno = 0;
3904 char loop_lab[32];
3905 rtx xops[2];
3907 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3909 /* Loop. */
3910 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3912 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3913 xops[0] = reg1;
3914 xops[1] = GEN_INT (PROBE_INTERVAL);
3915 output_asm_insn ("sub\t%0, %0, %1", xops);
3917 /* Probe at TEST_ADDR. */
3918 output_asm_insn ("str\txzr, [%0]", xops);
3920 /* Test if TEST_ADDR == LAST_ADDR. */
3921 xops[1] = reg2;
3922 output_asm_insn ("cmp\t%0, %1", xops);
3924 /* Branch. */
3925 fputs ("\tb.ne\t", asm_out_file);
3926 assemble_name_raw (asm_out_file, loop_lab);
3927 fputc ('\n', asm_out_file);
3929 return "";
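
/* A sketch of the assembly the routine above emits, assuming PROBE_INTERVAL
   is 4096 and the scratch registers are x9/x10 (PROBE_STACK_FIRST_REG and
   PROBE_STACK_SECOND_REG); the label name is illustrative only:

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
*/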
3932 /* Mark the registers that need to be saved by the callee and calculate
3933 the size of the callee-saved registers area and frame record (both FP
3934 and LR may be omitted). */
3935 static void
3936 aarch64_layout_frame (void)
3938 HOST_WIDE_INT offset = 0;
3939 int regno, last_fp_reg = INVALID_REGNUM;
3941 if (reload_completed && cfun->machine->frame.laid_out)
3942 return;
3944 /* Force a frame chain for EH returns so the return address is at FP+8. */
3945 cfun->machine->frame.emit_frame_chain
3946 = frame_pointer_needed || crtl->calls_eh_return;
3948 /* Emit a frame chain if the frame pointer is enabled.
3949 If -momit-leaf-frame-pointer is used, do not use a frame chain
3950 in leaf functions which do not use LR. */
3951 if (flag_omit_frame_pointer == 2
3952 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3953 && !df_regs_ever_live_p (LR_REGNUM)))
3954 cfun->machine->frame.emit_frame_chain = true;
3956 #define SLOT_NOT_REQUIRED (-2)
3957 #define SLOT_REQUIRED (-1)
3959 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3960 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3962 /* First mark all the registers that really need to be saved... */
3963 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3964 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3966 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3967 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3969 /* ... that includes the eh data registers (if needed)... */
3970 if (crtl->calls_eh_return)
3971 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3972 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3973 = SLOT_REQUIRED;
3975 /* ... and any callee saved register that dataflow says is live. */
3976 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3977 if (df_regs_ever_live_p (regno)
3978 && (regno == R30_REGNUM
3979 || !call_used_regs[regno]))
3980 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3982 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3983 if (df_regs_ever_live_p (regno)
3984 && !call_used_regs[regno])
3986 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3987 last_fp_reg = regno;
3990 if (cfun->machine->frame.emit_frame_chain)
3992 /* FP and LR are placed in the linkage record. */
3993 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
3994 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
3995 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
3996 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
3997 offset = 2 * UNITS_PER_WORD;
4000 /* Now assign stack slots for them. */
4001 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4002 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4004 cfun->machine->frame.reg_offset[regno] = offset;
4005 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4006 cfun->machine->frame.wb_candidate1 = regno;
4007 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4008 cfun->machine->frame.wb_candidate2 = regno;
4009 offset += UNITS_PER_WORD;
4012 HOST_WIDE_INT max_int_offset = offset;
4013 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4014 bool has_align_gap = offset != max_int_offset;
4016 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4017 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4019 /* If there is an alignment gap between integer and fp callee-saves,
4020 allocate the last fp register to it if possible. */
4021 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4023 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4024 break;
4027 cfun->machine->frame.reg_offset[regno] = offset;
4028 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4029 cfun->machine->frame.wb_candidate1 = regno;
4030 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4031 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4032 cfun->machine->frame.wb_candidate2 = regno;
4033 offset += UNITS_PER_WORD;
4036 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4038 cfun->machine->frame.saved_regs_size = offset;
4040 HOST_WIDE_INT varargs_and_saved_regs_size
4041 = offset + cfun->machine->frame.saved_varargs_size;
4043 cfun->machine->frame.hard_fp_offset
4044 = aligned_upper_bound (varargs_and_saved_regs_size
4045 + get_frame_size (),
4046 STACK_BOUNDARY / BITS_PER_UNIT);
4048 /* Both these values are already aligned. */
4049 gcc_assert (multiple_p (crtl->outgoing_args_size,
4050 STACK_BOUNDARY / BITS_PER_UNIT));
4051 cfun->machine->frame.frame_size
4052 = (cfun->machine->frame.hard_fp_offset
4053 + crtl->outgoing_args_size);
4055 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4057 cfun->machine->frame.initial_adjust = 0;
4058 cfun->machine->frame.final_adjust = 0;
4059 cfun->machine->frame.callee_adjust = 0;
4060 cfun->machine->frame.callee_offset = 0;
4062 HOST_WIDE_INT max_push_offset = 0;
4063 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4064 max_push_offset = 512;
4065 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4066 max_push_offset = 256;
4068 HOST_WIDE_INT const_size, const_fp_offset;
4069 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4070 && const_size < max_push_offset
4071 && known_eq (crtl->outgoing_args_size, 0))
4073 /* Simple, small frame with no outgoing arguments:
4074 stp reg1, reg2, [sp, -frame_size]!
4075 stp reg3, reg4, [sp, 16] */
4076 cfun->machine->frame.callee_adjust = const_size;
4078 else if (known_lt (crtl->outgoing_args_size
4079 + cfun->machine->frame.saved_regs_size, 512)
4080 && !(cfun->calls_alloca
4081 && known_lt (cfun->machine->frame.hard_fp_offset,
4082 max_push_offset)))
4084 /* Frame with small outgoing arguments:
4085 sub sp, sp, frame_size
4086 stp reg1, reg2, [sp, outgoing_args_size]
4087 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4088 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4089 cfun->machine->frame.callee_offset
4090 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4092 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4093 && const_fp_offset < max_push_offset)
4095 /* Frame with large outgoing arguments but a small local area:
4096 stp reg1, reg2, [sp, -hard_fp_offset]!
4097 stp reg3, reg4, [sp, 16]
4098 sub sp, sp, outgoing_args_size */
4099 cfun->machine->frame.callee_adjust = const_fp_offset;
4100 cfun->machine->frame.final_adjust
4101 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4103 else
4105 /* Frame with large local area and outgoing arguments using frame pointer:
4106 sub sp, sp, hard_fp_offset
4107 stp x29, x30, [sp, 0]
4108 add x29, sp, 0
4109 stp reg3, reg4, [sp, 16]
4110 sub sp, sp, outgoing_args_size */
4111 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4112 cfun->machine->frame.final_adjust
4113 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4116 cfun->machine->frame.laid_out = true;
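
/* A worked example of the case selection above, with illustrative numbers:
   a function that needs a frame chain, saves x19/x20, has 16 bytes of
   locals and no outgoing arguments gets reg_offset[x29] = 0,
   reg_offset[x30] = 8, reg_offset[x19] = 16, reg_offset[x20] = 24,
   saved_regs_size = 32 and hard_fp_offset = frame_size = 48.  Since
   48 < 512 and there are no outgoing arguments, callee_adjust = 48 and
   the prologue comes out roughly as
	stp	x29, x30, [sp, -48]!
	stp	x19, x20, [sp, 16]
   plus the frame-chain set-up of x29.  */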
4119 /* Return true if the register REGNO is saved on entry to
4120 the current function. */
4122 static bool
4123 aarch64_register_saved_on_entry (int regno)
4125 return cfun->machine->frame.reg_offset[regno] >= 0;
4128 /* Return the next register up from REGNO up to LIMIT for the callee
4129 to save. */
4131 static unsigned
4132 aarch64_next_callee_save (unsigned regno, unsigned limit)
4134 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4135 regno ++;
4136 return regno;
4139 /* Push the register number REGNO of mode MODE to the stack with write-back
4140 adjusting the stack by ADJUSTMENT. */
4142 static void
4143 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4144 HOST_WIDE_INT adjustment)
4146 rtx base_rtx = stack_pointer_rtx;
4147 rtx insn, reg, mem;
4149 reg = gen_rtx_REG (mode, regno);
4150 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4151 plus_constant (Pmode, base_rtx, -adjustment));
4152 mem = gen_frame_mem (mode, mem);
4154 insn = emit_move_insn (mem, reg);
4155 RTX_FRAME_RELATED_P (insn) = 1;
4158 /* Generate and return an instruction to store the pair of registers
4159 REG and REG2 of mode MODE to location BASE with write-back adjusting
4160 the stack location BASE by ADJUSTMENT. */
4162 static rtx
4163 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4164 HOST_WIDE_INT adjustment)
4166 switch (mode)
4168 case E_DImode:
4169 return gen_storewb_pairdi_di (base, base, reg, reg2,
4170 GEN_INT (-adjustment),
4171 GEN_INT (UNITS_PER_WORD - adjustment));
4172 case E_DFmode:
4173 return gen_storewb_pairdf_di (base, base, reg, reg2,
4174 GEN_INT (-adjustment),
4175 GEN_INT (UNITS_PER_WORD - adjustment));
4176 default:
4177 gcc_unreachable ();
4181 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4182 stack pointer by ADJUSTMENT. */
4184 static void
4185 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4187 rtx_insn *insn;
4188 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4190 if (regno2 == INVALID_REGNUM)
4191 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4193 rtx reg1 = gen_rtx_REG (mode, regno1);
4194 rtx reg2 = gen_rtx_REG (mode, regno2);
4196 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4197 reg2, adjustment));
4198 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4199 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4200 RTX_FRAME_RELATED_P (insn) = 1;
4203 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4204 adjusting it by ADJUSTMENT afterwards. */
4206 static rtx
4207 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4208 HOST_WIDE_INT adjustment)
4210 switch (mode)
4212 case E_DImode:
4213 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4214 GEN_INT (UNITS_PER_WORD));
4215 case E_DFmode:
4216 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4217 GEN_INT (UNITS_PER_WORD));
4218 default:
4219 gcc_unreachable ();
4223 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4224 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4225 into CFI_OPS. */
4227 static void
4228 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4229 rtx *cfi_ops)
4231 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4232 rtx reg1 = gen_rtx_REG (mode, regno1);
4234 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4236 if (regno2 == INVALID_REGNUM)
4238 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4239 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4240 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4242 else
4244 rtx reg2 = gen_rtx_REG (mode, regno2);
4245 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4246 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4247 reg2, adjustment));
4251 /* Generate and return a store pair instruction of mode MODE to store
4252 register REG1 to MEM1 and register REG2 to MEM2. */
4254 static rtx
4255 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4256 rtx reg2)
4258 switch (mode)
4260 case E_DImode:
4261 return gen_store_pairdi (mem1, reg1, mem2, reg2);
4263 case E_DFmode:
4264 return gen_store_pairdf (mem1, reg1, mem2, reg2);
4266 default:
4267 gcc_unreachable ();
4271 /* Generate and return a load pair instruction of mode MODE to load register
4272 REG1 from MEM1 and register REG2 from MEM2. */
4274 static rtx
4275 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4276 rtx mem2)
4278 switch (mode)
4280 case E_DImode:
4281 return gen_load_pairdi (reg1, mem1, reg2, mem2);
4283 case E_DFmode:
4284 return gen_load_pairdf (reg1, mem1, reg2, mem2);
4286 default:
4287 gcc_unreachable ();
4291 /* Return TRUE if return address signing should be enabled for the current
4292 function, otherwise return FALSE. */
4294 bool
4295 aarch64_return_address_signing_enabled (void)
4297 /* This function should only be called after the frame is laid out. */
4298 gcc_assert (cfun->machine->frame.laid_out);
4300 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4301 function if its LR is pushed onto the stack. */
4302 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4303 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4304 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4307 /* Emit code to save the callee-saved registers from register number START
4308 to LIMIT to the stack at the location starting at offset START_OFFSET,
4309 skipping any write-back candidates if SKIP_WB is true. */
4311 static void
4312 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4313 unsigned start, unsigned limit, bool skip_wb)
4315 rtx_insn *insn;
4316 unsigned regno;
4317 unsigned regno2;
4319 for (regno = aarch64_next_callee_save (start, limit);
4320 regno <= limit;
4321 regno = aarch64_next_callee_save (regno + 1, limit))
4323 rtx reg, mem;
4324 poly_int64 offset;
4326 if (skip_wb
4327 && (regno == cfun->machine->frame.wb_candidate1
4328 || regno == cfun->machine->frame.wb_candidate2))
4329 continue;
4331 if (cfun->machine->reg_is_wrapped_separately[regno])
4332 continue;
4334 reg = gen_rtx_REG (mode, regno);
4335 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4336 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4337 offset));
4339 regno2 = aarch64_next_callee_save (regno + 1, limit);
4341 if (regno2 <= limit
4342 && !cfun->machine->reg_is_wrapped_separately[regno2]
4343 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4344 == cfun->machine->frame.reg_offset[regno2]))
4347 rtx reg2 = gen_rtx_REG (mode, regno2);
4348 rtx mem2;
4350 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4351 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4352 offset));
4353 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4354 reg2));
4356 /* The first part of a frame-related parallel insn is
4357 always assumed to be relevant to the frame
4358 calculations; subsequent parts are only
4359 frame-related if explicitly marked. */
4360 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4361 regno = regno2;
4363 else
4364 insn = emit_move_insn (mem, reg);
4366 RTX_FRAME_RELATED_P (insn) = 1;
4370 /* Emit code to restore the callee registers of mode MODE from register
4371 number START up to and including LIMIT. Restore from the stack offset
4372 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4373 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4375 static void
4376 aarch64_restore_callee_saves (machine_mode mode,
4377 poly_int64 start_offset, unsigned start,
4378 unsigned limit, bool skip_wb, rtx *cfi_ops)
4380 rtx base_rtx = stack_pointer_rtx;
4381 unsigned regno;
4382 unsigned regno2;
4383 poly_int64 offset;
4385 for (regno = aarch64_next_callee_save (start, limit);
4386 regno <= limit;
4387 regno = aarch64_next_callee_save (regno + 1, limit))
4389 if (cfun->machine->reg_is_wrapped_separately[regno])
4390 continue;
4392 rtx reg, mem;
4394 if (skip_wb
4395 && (regno == cfun->machine->frame.wb_candidate1
4396 || regno == cfun->machine->frame.wb_candidate2))
4397 continue;
4399 reg = gen_rtx_REG (mode, regno);
4400 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4401 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4403 regno2 = aarch64_next_callee_save (regno + 1, limit);
4405 if (regno2 <= limit
4406 && !cfun->machine->reg_is_wrapped_separately[regno2]
4407 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4408 == cfun->machine->frame.reg_offset[regno2]))
4410 rtx reg2 = gen_rtx_REG (mode, regno2);
4411 rtx mem2;
4413 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4414 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4415 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4417 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4418 regno = regno2;
4420 else
4421 emit_move_insn (reg, mem);
4422 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4426 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4427 of MODE. */
4429 static inline bool
4430 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4432 HOST_WIDE_INT multiple;
4433 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4434 && IN_RANGE (multiple, -8, 7));
4437 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4438 of MODE. */
4440 static inline bool
4441 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4443 HOST_WIDE_INT multiple;
4444 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4445 && IN_RANGE (multiple, 0, 63));
4448 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4449 of MODE. */
4451 bool
4452 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4454 HOST_WIDE_INT multiple;
4455 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4456 && IN_RANGE (multiple, -64, 63));
4459 /* Return true if OFFSET is a signed 9-bit value. */
4461 static inline bool
4462 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4463 poly_int64 offset)
4465 HOST_WIDE_INT const_offset;
4466 return (offset.is_constant (&const_offset)
4467 && IN_RANGE (const_offset, -256, 255));
4470 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4471 of MODE. */
4473 static inline bool
4474 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4476 HOST_WIDE_INT multiple;
4477 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4478 && IN_RANGE (multiple, -256, 255));
4481 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4482 of MODE. */
4484 static inline bool
4485 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4487 HOST_WIDE_INT multiple;
4488 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4489 && IN_RANGE (multiple, 0, 4095));
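
/* A quick numeric check of the predicates above: with MODE == DImode
   (8-byte units), offset_4bit_signed_scaled_p accepts -64..56,
   offset_6bit_unsigned_scaled_p accepts 0..504,
   aarch64_offset_7bit_signed_scaled_p accepts -512..504,
   offset_9bit_signed_unscaled_p accepts any byte offset in -256..255,
   and offset_12bit_unsigned_scaled_p accepts 0..32760, all scaled
   ranges in steps of 8.  */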
4492 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4494 static sbitmap
4495 aarch64_get_separate_components (void)
4497 aarch64_layout_frame ();
4499 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4500 bitmap_clear (components);
4502 /* The registers we need saved to the frame. */
4503 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4504 if (aarch64_register_saved_on_entry (regno))
4506 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4507 if (!frame_pointer_needed)
4508 offset += cfun->machine->frame.frame_size
4509 - cfun->machine->frame.hard_fp_offset;
4510 /* Check that we can access the stack slot of the register with one
4511 direct load with no adjustments needed. */
4512 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4513 bitmap_set_bit (components, regno);
4516 /* Don't mess with the hard frame pointer. */
4517 if (frame_pointer_needed)
4518 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4520 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4521 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4522 /* If aarch64_layout_frame has chosen registers to store/restore with
4523 writeback, don't interfere with them, to avoid having to output explicit
4524 stack adjustment instructions. */
4525 if (reg2 != INVALID_REGNUM)
4526 bitmap_clear_bit (components, reg2);
4527 if (reg1 != INVALID_REGNUM)
4528 bitmap_clear_bit (components, reg1);
4530 bitmap_clear_bit (components, LR_REGNUM);
4531 bitmap_clear_bit (components, SP_REGNUM);
4533 return components;
4536 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4538 static sbitmap
4539 aarch64_components_for_bb (basic_block bb)
4541 bitmap in = DF_LIVE_IN (bb);
4542 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4543 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4545 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4546 bitmap_clear (components);
4548 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4549 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4550 if ((!call_used_regs[regno])
4551 && (bitmap_bit_p (in, regno)
4552 || bitmap_bit_p (gen, regno)
4553 || bitmap_bit_p (kill, regno)))
4555 unsigned regno2, offset, offset2;
4556 bitmap_set_bit (components, regno);
4558 /* If there is a callee-save at an adjacent offset, add it as well,
4559 to increase the chance of using LDP/STP. */
4560 offset = cfun->machine->frame.reg_offset[regno];
4561 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4563 if (regno2 <= LAST_SAVED_REGNUM)
4565 offset2 = cfun->machine->frame.reg_offset[regno2];
4566 if ((offset & ~8) == (offset2 & ~8))
4567 bitmap_set_bit (components, regno2);
4571 return components;
4574 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4575 Nothing to do for aarch64. */
4577 static void
4578 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4582 /* Return the next set bit in BMP from START onwards. Return the total number
4583 of bits in BMP if no set bit is found at or after START. */
4585 static unsigned int
4586 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4588 unsigned int nbits = SBITMAP_SIZE (bmp);
4589 if (start == nbits)
4590 return start;
4592 gcc_assert (start < nbits);
4593 for (unsigned int i = start; i < nbits; i++)
4594 if (bitmap_bit_p (bmp, i))
4595 return i;
4597 return nbits;
4600 /* Do the work for aarch64_emit_prologue_components and
4601 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4602 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4603 for these components or the epilogue sequence. That is, it determines
4604 whether we should emit stores or loads and what kind of CFA notes to attach
4605 to the insns. Otherwise the logic for the two sequences is very
4606 similar. */
4608 static void
4609 aarch64_process_components (sbitmap components, bool prologue_p)
4611 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4612 ? HARD_FRAME_POINTER_REGNUM
4613 : STACK_POINTER_REGNUM);
4615 unsigned last_regno = SBITMAP_SIZE (components);
4616 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4617 rtx_insn *insn = NULL;
4619 while (regno != last_regno)
4621 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
4622 so DFmode for the vector registers is enough. */
4623 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4624 rtx reg = gen_rtx_REG (mode, regno);
4625 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4626 if (!frame_pointer_needed)
4627 offset += cfun->machine->frame.frame_size
4628 - cfun->machine->frame.hard_fp_offset;
4629 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4630 rtx mem = gen_frame_mem (mode, addr);
4632 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4633 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4634 /* No more registers to handle after REGNO.
4635 Emit a single save/restore and exit. */
4636 if (regno2 == last_regno)
4638 insn = emit_insn (set);
4639 RTX_FRAME_RELATED_P (insn) = 1;
4640 if (prologue_p)
4641 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4642 else
4643 add_reg_note (insn, REG_CFA_RESTORE, reg);
4644 break;
4647 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4648 /* The next register is not of the same class or its offset is not
4649 mergeable with the current one into a pair. */
4650 if (!satisfies_constraint_Ump (mem)
4651 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4652 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4653 GET_MODE_SIZE (mode)))
4655 insn = emit_insn (set);
4656 RTX_FRAME_RELATED_P (insn) = 1;
4657 if (prologue_p)
4658 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4659 else
4660 add_reg_note (insn, REG_CFA_RESTORE, reg);
4662 regno = regno2;
4663 continue;
4666 /* REGNO2 can be saved/restored in a pair with REGNO. */
4667 rtx reg2 = gen_rtx_REG (mode, regno2);
4668 if (!frame_pointer_needed)
4669 offset2 += cfun->machine->frame.frame_size
4670 - cfun->machine->frame.hard_fp_offset;
4671 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4672 rtx mem2 = gen_frame_mem (mode, addr2);
4673 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4674 : gen_rtx_SET (reg2, mem2);
4676 if (prologue_p)
4677 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4678 else
4679 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4681 RTX_FRAME_RELATED_P (insn) = 1;
4682 if (prologue_p)
4684 add_reg_note (insn, REG_CFA_OFFSET, set);
4685 add_reg_note (insn, REG_CFA_OFFSET, set2);
4687 else
4689 add_reg_note (insn, REG_CFA_RESTORE, reg);
4690 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4693 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4697 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4699 static void
4700 aarch64_emit_prologue_components (sbitmap components)
4702 aarch64_process_components (components, true);
4705 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4707 static void
4708 aarch64_emit_epilogue_components (sbitmap components)
4710 aarch64_process_components (components, false);
4713 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4715 static void
4716 aarch64_set_handled_components (sbitmap components)
4718 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4719 if (bitmap_bit_p (components, regno))
4720 cfun->machine->reg_is_wrapped_separately[regno] = true;
4723 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4724 is saved at BASE + OFFSET. */
4726 static void
4727 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4728 rtx base, poly_int64 offset)
4730 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4731 add_reg_note (insn, REG_CFA_EXPRESSION,
4732 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4735 /* AArch64 stack frames generated by this compiler look like:
4737 +-------------------------------+
4739 | incoming stack arguments |
4741 +-------------------------------+
4742 | | <-- incoming stack pointer (aligned)
4743 | callee-allocated save area |
4744 | for register varargs |
4746 +-------------------------------+
4747 | local variables | <-- frame_pointer_rtx
4749 +-------------------------------+
4750 | padding0 | \
4751 +-------------------------------+ |
4752 | callee-saved registers | | frame.saved_regs_size
4753 +-------------------------------+ |
4754 | LR' | |
4755 +-------------------------------+ |
4756 | FP' | / <- hard_frame_pointer_rtx (aligned)
4757 +-------------------------------+
4758 | dynamic allocation |
4759 +-------------------------------+
4760 | padding |
4761 +-------------------------------+
4762 | outgoing stack arguments | <-- arg_pointer
4764 +-------------------------------+
4765 | | <-- stack_pointer_rtx (aligned)
4767 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4768 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4769 unchanged. */
4771 /* Generate the prologue instructions for entry into a function.
4772 Establish the stack frame by decreasing the stack pointer with a
4773 properly calculated size and, if necessary, create a frame record
4774 filled with the values of LR and previous frame pointer. The
4775 current FP is also set up if it is in use. */
4777 void
4778 aarch64_expand_prologue (void)
4780 aarch64_layout_frame ();
4782 poly_int64 frame_size = cfun->machine->frame.frame_size;
4783 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4784 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4785 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4786 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4787 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4788 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4789 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4790 rtx_insn *insn;
4792 /* Sign return address for functions. */
4793 if (aarch64_return_address_signing_enabled ())
4795 insn = emit_insn (gen_pacisp ());
4796 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4797 RTX_FRAME_RELATED_P (insn) = 1;
4800 if (flag_stack_usage_info)
4801 current_function_static_stack_size = constant_lower_bound (frame_size);
4803 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4805 if (crtl->is_leaf && !cfun->calls_alloca)
4807 if (maybe_gt (frame_size, PROBE_INTERVAL)
4808 && maybe_gt (frame_size, get_stack_check_protect ()))
4809 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4810 (frame_size
4811 - get_stack_check_protect ()));
4813 else if (maybe_gt (frame_size, 0))
4814 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4817 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4818 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4820 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4822 if (callee_adjust != 0)
4823 aarch64_push_regs (reg1, reg2, callee_adjust);
4825 if (emit_frame_chain)
4827 poly_int64 reg_offset = callee_adjust;
4828 if (callee_adjust == 0)
4830 reg1 = R29_REGNUM;
4831 reg2 = R30_REGNUM;
4832 reg_offset = callee_offset;
4833 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4835 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4836 stack_pointer_rtx, callee_offset,
4837 ip1_rtx, ip0_rtx, frame_pointer_needed);
4838 if (frame_pointer_needed && !frame_size.is_constant ())
4840 /* Variable-sized frames need to describe the save slot
4841 address using DW_CFA_expression rather than DW_CFA_offset.
4842 This means that, without taking further action, the
4843 locations of the registers that we've already saved would
4844 remain based on the stack pointer even after we redefine
4845 the CFA based on the frame pointer. We therefore need new
4846 DW_CFA_expressions to re-express the save slots with addresses
4847 based on the frame pointer. */
4848 rtx_insn *insn = get_last_insn ();
4849 gcc_assert (RTX_FRAME_RELATED_P (insn));
4851 /* Add an explicit CFA definition if this was previously
4852 implicit. */
4853 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4855 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4856 callee_offset);
4857 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4858 gen_rtx_SET (hard_frame_pointer_rtx, src));
4861 /* Change the save slot expressions for the registers that
4862 we've already saved. */
4863 reg_offset -= callee_offset;
4864 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4865 reg_offset + UNITS_PER_WORD);
4866 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4867 reg_offset);
4869 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4872 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4873 callee_adjust != 0 || emit_frame_chain);
4874 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4875 callee_adjust != 0 || emit_frame_chain);
4876 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
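
/* As an illustration only: for the simple frame case described in
   aarch64_layout_frame (no outgoing arguments, callee_adjust == frame_size,
   frame chain needed), the expansion above typically produces something like
	stp	x29, x30, [sp, -frame_size]!
	mov	x29, sp
	stp	x19, x20, [sp, 16]
   with initial_adjust and final_adjust both zero, so no separate
   sub-sp instructions are emitted.  */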
4879 /* Return TRUE if we can use a simple_return insn.
4881 This function checks whether the callee-saved stack is empty, which
4882 means that no restore actions are needed. The pro_and_epilogue pass will
4883 use this to check whether the shrink-wrapping optimization is feasible. */
4885 bool
4886 aarch64_use_return_insn_p (void)
4888 if (!reload_completed)
4889 return false;
4891 if (crtl->profile)
4892 return false;
4894 aarch64_layout_frame ();
4896 return known_eq (cfun->machine->frame.frame_size, 0);
4899 /* Generate the epilogue instructions for returning from a function.
4900 This is almost exactly the reverse of the prolog sequence, except
4901 that we need to insert barriers to avoid scheduling loads that read
4902 from a deallocated stack, and we optimize the unwind records by
4903 emitting them all together if possible. */
4904 void
4905 aarch64_expand_epilogue (bool for_sibcall)
4907 aarch64_layout_frame ();
4909 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4910 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4911 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4912 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4913 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4914 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4915 rtx cfi_ops = NULL;
4916 rtx_insn *insn;
4917 /* A stack clash protection prologue may not have left IP0_REGNUM or
4918 IP1_REGNUM in a usable state. The same is true for allocations
4919 with an SVE component, since we then need both temporary registers
4920 for each allocation. */
4921 bool can_inherit_p = (initial_adjust.is_constant ()
4922 && final_adjust.is_constant ()
4923 && !flag_stack_clash_protection);
4925 /* We need a memory barrier to prevent reads from the deallocated stack. */
4926 bool need_barrier_p
4927 = maybe_ne (get_frame_size ()
4928 + cfun->machine->frame.saved_varargs_size, 0);
4930 /* Emit a barrier to prevent loads from a deallocated stack. */
4931 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4932 || cfun->calls_alloca
4933 || crtl->calls_eh_return)
4935 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4936 need_barrier_p = false;
4939 /* Restore the stack pointer from the frame pointer if it may not
4940 be the same as the stack pointer. */
4941 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4942 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4943 if (frame_pointer_needed
4944 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4945 /* If writeback is used when restoring callee-saves, the CFA
4946 is restored on the instruction doing the writeback. */
4947 aarch64_add_offset (Pmode, stack_pointer_rtx,
4948 hard_frame_pointer_rtx, -callee_offset,
4949 ip1_rtx, ip0_rtx, callee_adjust == 0);
4950 else
4951 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4952 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4954 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4955 callee_adjust != 0, &cfi_ops);
4956 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4957 callee_adjust != 0, &cfi_ops);
4959 if (need_barrier_p)
4960 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4962 if (callee_adjust != 0)
4963 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4965 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4967 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4968 insn = get_last_insn ();
4969 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4970 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4971 RTX_FRAME_RELATED_P (insn) = 1;
4972 cfi_ops = NULL;
4975 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4976 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4978 if (cfi_ops)
4980 /* Emit delayed restores and reset the CFA to be SP. */
4981 insn = get_last_insn ();
4982 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4983 REG_NOTES (insn) = cfi_ops;
4984 RTX_FRAME_RELATED_P (insn) = 1;
4987 /* We prefer to emit the combined return/authenticate instruction RETAA,
4988 however there are three cases in which we must instead emit an explicit
4989 authentication instruction.
4991 1) Sibcalls don't return in a normal way, so if we're about to call one
4992 we must authenticate.
4994 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
4995 generating code for !TARGET_ARMV8_3 we can't use it and must
4996 explicitly authenticate.
4998 3) On an eh_return path we make extra stack adjustments to update the
4999 canonical frame address to be the exception handler's CFA. We want
5000 to authenticate using the CFA of the function which calls eh_return.
5002 if (aarch64_return_address_signing_enabled ()
5003 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5005 insn = emit_insn (gen_autisp ());
5006 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5007 RTX_FRAME_RELATED_P (insn) = 1;
5010 /* Stack adjustment for exception handler. */
5011 if (crtl->calls_eh_return)
5013 /* We need to unwind the stack by the offset computed by
5014 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5015 to be SP; letting the CFA move during this adjustment
5016 is just as correct as retaining the CFA from the body
5017 of the function. Therefore, do nothing special. */
5018 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5021 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5022 if (!for_sibcall)
5023 emit_jump_insn (ret_rtx);
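
/* Again purely as an illustration, the matching epilogue for the simple
   frame sketched before aarch64_use_return_insn_p would typically be
	ldp	x19, x20, [sp, 16]
	ldp	x29, x30, [sp], frame_size
	ret
   i.e. the reverse of the prologue, with the write-back pop also
   deallocating the frame.  */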
5026 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5027 normally or return to a previous frame after unwinding.
5029 An EH return uses a single shared return sequence. The epilogue is
5030 exactly like a normal epilogue except that it has an extra input
5031 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5032 that must be applied after the frame has been destroyed. An extra label
5033 is inserted before the epilogue which initializes this register to zero,
5034 and this is the entry point for a normal return.
5036 An actual EH return updates the return address, initializes the stack
5037 adjustment and jumps directly into the epilogue (bypassing the zeroing
5038 of the adjustment). Since the return address is typically saved on the
5039 stack when a function makes a call, the saved LR must be updated outside
5040 the epilogue.
5042 This poses problems as the store is generated well before the epilogue,
5043 so the offset of LR is not known yet. Also optimizations will remove the
5044 store as it appears dead, even after the epilogue is generated (as the
5045 base or offset for loading LR is different in many cases).
5047 To avoid these problems this implementation forces the frame pointer
5048 in eh_return functions so that the location of LR is fixed and known early.
5049 It also marks the store volatile, so no optimization is permitted to
5050 remove the store. */
5052 aarch64_eh_return_handler_rtx (void)
5054 rtx tmp = gen_frame_mem (Pmode,
5055 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5057 /* Mark the store volatile, so no optimization is permitted to remove it. */
5058 MEM_VOLATILE_P (tmp) = true;
5059 return tmp;
5062 /* Output code to add DELTA to the first argument, and then jump
5063 to FUNCTION. Used for C++ multiple inheritance. */
5064 static void
5065 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5066 HOST_WIDE_INT delta,
5067 HOST_WIDE_INT vcall_offset,
5068 tree function)
5070 /* The this pointer is always in x0. Note that this differs from
5071 Arm where the this pointer may be bumped to r1 if r0 is required
5072 to return a pointer to an aggregate. On AArch64 a result value
5073 pointer will be in x8. */
5074 int this_regno = R0_REGNUM;
5075 rtx this_rtx, temp0, temp1, addr, funexp;
5076 rtx_insn *insn;
5078 reload_completed = 1;
5079 emit_note (NOTE_INSN_PROLOGUE_END);
5081 this_rtx = gen_rtx_REG (Pmode, this_regno);
5082 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5083 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5085 if (vcall_offset == 0)
5086 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5087 else
5089 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5091 addr = this_rtx;
5092 if (delta != 0)
5094 if (delta >= -256 && delta < 256)
5095 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5096 plus_constant (Pmode, this_rtx, delta));
5097 else
5098 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5099 temp1, temp0, false);
5102 if (Pmode == ptr_mode)
5103 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5104 else
5105 aarch64_emit_move (temp0,
5106 gen_rtx_ZERO_EXTEND (Pmode,
5107 gen_rtx_MEM (ptr_mode, addr)));
5109 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5110 addr = plus_constant (Pmode, temp0, vcall_offset);
5111 else
5113 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5114 Pmode);
5115 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5118 if (Pmode == ptr_mode)
5119 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5120 else
5121 aarch64_emit_move (temp1,
5122 gen_rtx_SIGN_EXTEND (Pmode,
5123 gen_rtx_MEM (ptr_mode, addr)));
5125 emit_insn (gen_add2_insn (this_rtx, temp1));
5128 /* Generate a tail call to the target function. */
5129 if (!TREE_USED (function))
5131 assemble_external (function);
5132 TREE_USED (function) = 1;
5134 funexp = XEXP (DECL_RTL (function), 0);
5135 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5136 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5137 SIBLING_CALL_P (insn) = 1;
5139 insn = get_insns ();
5140 shorten_branches (insn);
5141 final_start_function (insn, file, 1);
5142 final (insn, file, 1);
5143 final_end_function ();
5145 /* Stop pretending to be a post-reload pass. */
5146 reload_completed = 0;
5149 static bool
5150 aarch64_tls_referenced_p (rtx x)
5152 if (!TARGET_HAVE_TLS)
5153 return false;
5154 subrtx_iterator::array_type array;
5155 FOR_EACH_SUBRTX (iter, array, x, ALL)
5157 const_rtx x = *iter;
5158 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5159 return true;
5160 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5161 TLS offsets, not real symbol references. */
5162 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5163 iter.skip_subrtxes ();
5165 return false;
5169 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5170 a left shift of 0 or 12 bits. */
5171 bool
5172 aarch64_uimm12_shift (HOST_WIDE_INT val)
5174 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5175 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
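
/* For example, 0xabc and 0xabc000 satisfy the test above (they fit in
   bits 0-11 or bits 12-23 respectively), whereas 0xabc00 does not,
   since its set bits straddle the two 12-bit fields.  */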
5180 /* Return true if val is an immediate that can be loaded into a
5181 register by a MOVZ instruction. */
5182 static bool
5183 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5185 if (GET_MODE_SIZE (mode) > 4)
5187 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5188 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5189 return 1;
5191 else
5193 /* Ignore sign extension. */
5194 val &= (HOST_WIDE_INT) 0xffffffff;
5196 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5197 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5200 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5201 64-bit (DImode) integer. */
5203 static unsigned HOST_WIDE_INT
5204 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5206 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5207 while (size < 64)
5209 val &= (HOST_WIDE_INT_1U << size) - 1;
5210 val |= val << size;
5211 size *= 2;
5213 return val;
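
/* For instance, replicating the QImode value 0xa5 yields
   0xa5a5a5a5a5a5a5a5, and replicating the HImode value 0x00ff yields
   0x00ff00ff00ff00ff.  */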
5216 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5218 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5220 0x0000000100000001ull,
5221 0x0001000100010001ull,
5222 0x0101010101010101ull,
5223 0x1111111111111111ull,
5224 0x5555555555555555ull,
5228 /* Return true if val is a valid bitmask immediate. */
5230 bool
5231 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5233 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5234 int bits;
5236 /* Check for a single sequence of one bits and return quickly if so.
5237 The special cases of all ones and all zeroes return false. */
5238 val = aarch64_replicate_bitmask_imm (val_in, mode);
5239 tmp = val + (val & -val);
5241 if (tmp == (tmp & -tmp))
5242 return (val + 1) > 1;
5244 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5245 if (mode == SImode)
5246 val = (val << 32) | (val & 0xffffffff);
5248 /* Invert if the immediate doesn't start with a zero bit - this means we
5249 only need to search for sequences of one bits. */
5250 if (val & 1)
5251 val = ~val;
5253 /* Find the first set bit and set tmp to val with the first sequence of one
5254 bits removed. Return success if there is a single sequence of ones. */
5255 first_one = val & -val;
5256 tmp = val & (val + first_one);
5258 if (tmp == 0)
5259 return true;
5261 /* Find the next set bit and compute the difference in bit position. */
5262 next_one = tmp & -tmp;
5263 bits = clz_hwi (first_one) - clz_hwi (next_one);
5264 mask = val ^ tmp;
5266 /* Check the bit position difference is a power of 2, and that the first
5267 sequence of one bits fits within 'bits' bits. */
5268 if ((mask >> bits) != 0 || bits != (bits & -bits))
5269 return false;
5271 /* Check the sequence of one bits is repeated 64/bits times. */
5272 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
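
/* A short worked example of the fast path above: for val == 0x3c (a single
   run of four ones), val & -val == 0x4, so tmp == 0x40, which is a power of
   two, and (val + 1) > 1 holds, so the value is accepted.  For val == 0 the
   same test yields (0 + 1) > 1 == false, and for val == ~0 the addition
   wraps tmp to 0 and (~0 + 1) > 1 == false, so the all-zeros and all-ones
   cases are rejected as documented.  */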
5275 /* Create a mask of ones covering the range from the lowest to the highest
5276 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
5278 unsigned HOST_WIDE_INT
5279 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5281 int lowest_bit_set = ctz_hwi (val_in);
5282 int highest_bit_set = floor_log2 (val_in);
5283 gcc_assert (val_in != 0);
5285 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5286 (HOST_WIDE_INT_1U << lowest_bit_set));
5289 /* Create a constant in which all bits outside the range from the lowest set
5290 bit to the highest set bit of VAL_IN are set to 1. */
5292 unsigned HOST_WIDE_INT
5293 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5295 return val_in | ~aarch64_and_split_imm1 (val_in);
5298 /* Return true if the AND of a register with VAL_IN can be performed as two AND instructions with valid bitmask immediates (see the split helpers above). */
5300 bool
5301 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5303 scalar_int_mode int_mode;
5304 if (!is_a <scalar_int_mode> (mode, &int_mode))
5305 return false;
5307 if (aarch64_bitmask_imm (val_in, int_mode))
5308 return false;
5310 if (aarch64_move_imm (val_in, int_mode))
5311 return false;
5313 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5315 return aarch64_bitmask_imm (imm2, int_mode);
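
/* An illustrative example of the split: for VAL_IN == 0x0ff000f0 (SImode),
   which is neither a bitmask nor a MOV immediate, aarch64_and_split_imm1
   gives 0x0ffffff0 (a single run of ones) and aarch64_and_split_imm2 gives
   a value whose low 32 bits are 0xfff000ff (a rotated run of ones).  Since
   imm1 & imm2 == VAL_IN by construction, "x & VAL_IN" can be done as two
   AND-immediate instructions.  */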
5318 /* Return true if val is an immediate that can be loaded into a
5319 register in a single instruction. */
5320 bool
5321 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5323 scalar_int_mode int_mode;
5324 if (!is_a <scalar_int_mode> (mode, &int_mode))
5325 return false;
5327 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5328 return 1;
5329 return aarch64_bitmask_imm (val, int_mode);
5332 static bool
5333 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5335 rtx base, offset;
5337 if (GET_CODE (x) == HIGH)
5338 return true;
5340 /* There's no way to calculate VL-based values using relocations. */
5341 subrtx_iterator::array_type array;
5342 FOR_EACH_SUBRTX (iter, array, x, ALL)
5343 if (GET_CODE (*iter) == CONST_POLY_INT)
5344 return true;
5346 split_const (x, &base, &offset);
5347 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5349 if (aarch64_classify_symbol (base, INTVAL (offset))
5350 != SYMBOL_FORCE_TO_MEM)
5351 return true;
5352 else
5353 /* Avoid generating a 64-bit relocation in ILP32; leave it
5354 to aarch64_expand_mov_immediate to handle properly. */
5355 return mode != ptr_mode;
5358 return aarch64_tls_referenced_p (x);
5361 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5362 The expansion for a table switch is quite expensive due to the number
5363 of instructions, the table lookup and the hard-to-predict indirect jump.
5364 When optimizing for speed at -O3 and above, use the per-core tuning if
5365 set; otherwise use tables for more than 16 cases as a tradeoff between size and
5366 performance. When optimizing for size, use the default setting. */
5368 static unsigned int
5369 aarch64_case_values_threshold (void)
5371 /* Use the specified limit for the number of cases before using jump
5372 tables at higher optimization levels. */
5373 if (optimize > 2
5374 && selected_cpu->tune->max_case_values != 0)
5375 return selected_cpu->tune->max_case_values;
5376 else
5377 return optimize_size ? default_case_values_threshold () : 17;
5380 /* Return true if register REGNO is a valid index register.
5381 STRICT_P is true if REG_OK_STRICT is in effect. */
5383 bool
5384 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5386 if (!HARD_REGISTER_NUM_P (regno))
5388 if (!strict_p)
5389 return true;
5391 if (!reg_renumber)
5392 return false;
5394 regno = reg_renumber[regno];
5396 return GP_REGNUM_P (regno);
5399 /* Return true if register REGNO is a valid base register.
5400 STRICT_P is true if REG_OK_STRICT is in effect. */
5402 bool
5403 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5405 if (!HARD_REGISTER_NUM_P (regno))
5407 if (!strict_p)
5408 return true;
5410 if (!reg_renumber)
5411 return false;
5413 regno = reg_renumber[regno];
5416 /* The fake registers will be eliminated to either the stack or
5417 hard frame pointer, both of which are usually valid base registers.
5418 Reload deals with the cases where the eliminated form isn't valid. */
5419 return (GP_REGNUM_P (regno)
5420 || regno == SP_REGNUM
5421 || regno == FRAME_POINTER_REGNUM
5422 || regno == ARG_POINTER_REGNUM);
5425 /* Return true if X is a valid base register.
5426 STRICT_P is true if REG_OK_STRICT is in effect. */
5428 static bool
5429 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5431 if (!strict_p
5432 && GET_CODE (x) == SUBREG
5433 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5434 x = SUBREG_REG (x);
5436 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5439 /* Return true if address offset is a valid index. If it is, fill in INFO
5440 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5442 static bool
5443 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5444 machine_mode mode, bool strict_p)
5446 enum aarch64_address_type type;
5447 rtx index;
5448 int shift;
5450 /* (reg:P) */
5451 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5452 && GET_MODE (x) == Pmode)
5454 type = ADDRESS_REG_REG;
5455 index = x;
5456 shift = 0;
5458 /* (sign_extend:DI (reg:SI)) */
5459 else if ((GET_CODE (x) == SIGN_EXTEND
5460 || GET_CODE (x) == ZERO_EXTEND)
5461 && GET_MODE (x) == DImode
5462 && GET_MODE (XEXP (x, 0)) == SImode)
5464 type = (GET_CODE (x) == SIGN_EXTEND)
5465 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5466 index = XEXP (x, 0);
5467 shift = 0;
5469 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5470 else if (GET_CODE (x) == MULT
5471 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5472 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5473 && GET_MODE (XEXP (x, 0)) == DImode
5474 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5475 && CONST_INT_P (XEXP (x, 1)))
5477 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5478 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5479 index = XEXP (XEXP (x, 0), 0);
5480 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5482 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5483 else if (GET_CODE (x) == ASHIFT
5484 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5485 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5486 && GET_MODE (XEXP (x, 0)) == DImode
5487 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5488 && CONST_INT_P (XEXP (x, 1)))
5490 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5491 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5492 index = XEXP (XEXP (x, 0), 0);
5493 shift = INTVAL (XEXP (x, 1));
5495 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5496 else if ((GET_CODE (x) == SIGN_EXTRACT
5497 || GET_CODE (x) == ZERO_EXTRACT)
5498 && GET_MODE (x) == DImode
5499 && GET_CODE (XEXP (x, 0)) == MULT
5500 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5501 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5503 type = (GET_CODE (x) == SIGN_EXTRACT)
5504 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5505 index = XEXP (XEXP (x, 0), 0);
5506 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5507 if (INTVAL (XEXP (x, 1)) != 32 + shift
5508 || INTVAL (XEXP (x, 2)) != 0)
5509 shift = -1;
5511 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5512 (const_int 0xffffffff<<shift)) */
5513 else if (GET_CODE (x) == AND
5514 && GET_MODE (x) == DImode
5515 && GET_CODE (XEXP (x, 0)) == MULT
5516 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5517 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5518 && CONST_INT_P (XEXP (x, 1)))
5520 type = ADDRESS_REG_UXTW;
5521 index = XEXP (XEXP (x, 0), 0);
5522 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5523 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5524 shift = -1;
5526 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5527 else if ((GET_CODE (x) == SIGN_EXTRACT
5528 || GET_CODE (x) == ZERO_EXTRACT)
5529 && GET_MODE (x) == DImode
5530 && GET_CODE (XEXP (x, 0)) == ASHIFT
5531 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5532 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5534 type = (GET_CODE (x) == SIGN_EXTRACT)
5535 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5536 index = XEXP (XEXP (x, 0), 0);
5537 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5538 if (INTVAL (XEXP (x, 1)) != 32 + shift
5539 || INTVAL (XEXP (x, 2)) != 0)
5540 shift = -1;
5542 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5543 (const_int 0xffffffff<<shift)) */
5544 else if (GET_CODE (x) == AND
5545 && GET_MODE (x) == DImode
5546 && GET_CODE (XEXP (x, 0)) == ASHIFT
5547 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5548 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5549 && CONST_INT_P (XEXP (x, 1)))
5551 type = ADDRESS_REG_UXTW;
5552 index = XEXP (XEXP (x, 0), 0);
5553 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5554 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5555 shift = -1;
5557 /* (mult:P (reg:P) (const_int scale)) */
5558 else if (GET_CODE (x) == MULT
5559 && GET_MODE (x) == Pmode
5560 && GET_MODE (XEXP (x, 0)) == Pmode
5561 && CONST_INT_P (XEXP (x, 1)))
5563 type = ADDRESS_REG_REG;
5564 index = XEXP (x, 0);
5565 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5567 /* (ashift:P (reg:P) (const_int shift)) */
5568 else if (GET_CODE (x) == ASHIFT
5569 && GET_MODE (x) == Pmode
5570 && GET_MODE (XEXP (x, 0)) == Pmode
5571 && CONST_INT_P (XEXP (x, 1)))
5573 type = ADDRESS_REG_REG;
5574 index = XEXP (x, 0);
5575 shift = INTVAL (XEXP (x, 1));
5577 else
5578 return false;
5580 if (!strict_p
5581 && GET_CODE (index) == SUBREG
5582 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5583 index = SUBREG_REG (index);
5585 if (aarch64_sve_data_mode_p (mode))
5587 if (type != ADDRESS_REG_REG
5588 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5589 return false;
5591 else
5593 if (shift != 0
5594 && !(IN_RANGE (shift, 1, 3)
5595 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5596 return false;
5599 if (REG_P (index)
5600 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5602 info->type = type;
5603 info->offset = index;
5604 info->shift = shift;
5605 return true;
5608 return false;
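/* For example (illustrative): the index rtx
   (mult:DI (sign_extend:DI (reg:SI)) (const_int 4)) used in an SImode access
   is classified above as ADDRESS_REG_SXTW with shift == 2, matching the
   [Xn, Wm, SXTW #2] addressing form.  */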
5611 /* Return true if MODE is one of the modes for which we
5612 support LDP/STP operations. */
5614 static bool
5615 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5617 return mode == SImode || mode == DImode
5618 || mode == SFmode || mode == DFmode
5619 || (aarch64_vector_mode_supported_p (mode)
5620 && known_eq (GET_MODE_SIZE (mode), 8));
5623 /* Return true if REGNO is a virtual pointer register, or an eliminable
5624 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5625 include stack_pointer or hard_frame_pointer. */
5626 static bool
5627 virt_or_elim_regno_p (unsigned regno)
5629 return ((regno >= FIRST_VIRTUAL_REGISTER
5630 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5631 || regno == FRAME_POINTER_REGNUM
5632 || regno == ARG_POINTER_REGNUM);
5635 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5636 If it is, fill in INFO appropriately. STRICT_P is true if
5637 REG_OK_STRICT is in effect. */
5639 static bool
5640 aarch64_classify_address (struct aarch64_address_info *info,
5641 rtx x, machine_mode mode, bool strict_p,
5642 aarch64_addr_query_type type = ADDR_QUERY_M)
5644 enum rtx_code code = GET_CODE (x);
5645 rtx op0, op1;
5646 poly_int64 offset;
5648 HOST_WIDE_INT const_size;
5650 /* On BE, we use load/store pair for all large int mode load/stores.
5651 TI/TFmode may also use a load/store pair. */
5652 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5653 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5654 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5655 || mode == TImode
5656 || mode == TFmode
5657 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5659 bool allow_reg_index_p = (!load_store_pair_p
5660 && (known_lt (GET_MODE_SIZE (mode), 16)
5661 || vec_flags == VEC_ADVSIMD
5662 || vec_flags == VEC_SVE_DATA));
5664 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5665 [Rn, #offset, MUL VL]. */
5666 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5667 && (code != REG && code != PLUS))
5668 return false;
5670 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5671 REG addressing. */
5672 if (advsimd_struct_p
5673 && !BYTES_BIG_ENDIAN
5674 && (code != POST_INC && code != REG))
5675 return false;
5677 gcc_checking_assert (GET_MODE (x) == VOIDmode
5678 || SCALAR_INT_MODE_P (GET_MODE (x)));
5680 switch (code)
5682 case REG:
5683 case SUBREG:
5684 info->type = ADDRESS_REG_IMM;
5685 info->base = x;
5686 info->offset = const0_rtx;
5687 info->const_offset = 0;
5688 return aarch64_base_register_rtx_p (x, strict_p);
5690 case PLUS:
5691 op0 = XEXP (x, 0);
5692 op1 = XEXP (x, 1);
5694 if (! strict_p
5695 && REG_P (op0)
5696 && virt_or_elim_regno_p (REGNO (op0))
5697 && poly_int_rtx_p (op1, &offset))
5699 info->type = ADDRESS_REG_IMM;
5700 info->base = op0;
5701 info->offset = op1;
5702 info->const_offset = offset;
5704 return true;
5707 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5708 && aarch64_base_register_rtx_p (op0, strict_p)
5709 && poly_int_rtx_p (op1, &offset))
5711 info->type = ADDRESS_REG_IMM;
5712 info->base = op0;
5713 info->offset = op1;
5714 info->const_offset = offset;
5716 /* TImode and TFmode values are allowed in both pairs of X
5717 registers and individual Q registers. The available
5718 address modes are:
5719 X,X: 7-bit signed scaled offset
5720 Q: 9-bit signed offset
5721 We conservatively require an offset representable in either mode.
5722 When performing the check for pairs of X registers i.e. LDP/STP
5723 pass down DImode since that is the natural size of the LDP/STP
5724 instruction memory accesses. */
5725 if (mode == TImode || mode == TFmode)
5726 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5727 && (offset_9bit_signed_unscaled_p (mode, offset)
5728 || offset_12bit_unsigned_scaled_p (mode, offset)));
5730 /* A 7-bit offset check because OImode will emit an ldp/stp
5731 instruction (only big endian will get here).
5732 For ldp/stp instructions, the offset is scaled for the size of a
5733 single element of the pair. */
5734 if (mode == OImode)
5735 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5737 /* Three 9/12-bit offset checks because CImode will emit three
5738 ldr/str instructions (only big endian will get here). */
5739 if (mode == CImode)
5740 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5741 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5742 || offset_12bit_unsigned_scaled_p (V16QImode,
5743 offset + 32)));
5745 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5746 instructions (only big endian will get here). */
5747 if (mode == XImode)
5748 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5749 && aarch64_offset_7bit_signed_scaled_p (TImode,
5750 offset + 32));
5752 /* Make "m" use the LD1 offset range for SVE data modes, so
5753 that pre-RTL optimizers like ivopts will optimize for that range
5754 instead of the wider LDR/STR range. */
5755 if (vec_flags == VEC_SVE_DATA)
5756 return (type == ADDR_QUERY_M
5757 ? offset_4bit_signed_scaled_p (mode, offset)
5758 : offset_9bit_signed_scaled_p (mode, offset));
5760 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5762 poly_int64 end_offset = (offset
5763 + GET_MODE_SIZE (mode)
5764 - BYTES_PER_SVE_VECTOR);
5765 return (type == ADDR_QUERY_M
5766 ? offset_4bit_signed_scaled_p (mode, offset)
5767 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5768 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5769 end_offset)));
5772 if (vec_flags == VEC_SVE_PRED)
5773 return offset_9bit_signed_scaled_p (mode, offset);
5775 if (load_store_pair_p)
5776 return ((known_eq (GET_MODE_SIZE (mode), 4)
5777 || known_eq (GET_MODE_SIZE (mode), 8))
5778 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5779 else
5780 return (offset_9bit_signed_unscaled_p (mode, offset)
5781 || offset_12bit_unsigned_scaled_p (mode, offset));
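/* For example, for a DImode access the unscaled 9-bit form accepts offsets
   in [-256, 255] (LDUR/STUR), while the scaled 12-bit form accepts multiples
   of 8 in [0, 32760] (LDR/STR with an unsigned immediate).  */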
5784 if (allow_reg_index_p)
5786 /* Look for base + (scaled/extended) index register. */
5787 if (aarch64_base_register_rtx_p (op0, strict_p)
5788 && aarch64_classify_index (info, op1, mode, strict_p))
5790 info->base = op0;
5791 return true;
5793 if (aarch64_base_register_rtx_p (op1, strict_p)
5794 && aarch64_classify_index (info, op0, mode, strict_p))
5796 info->base = op1;
5797 return true;
5801 return false;
5803 case POST_INC:
5804 case POST_DEC:
5805 case PRE_INC:
5806 case PRE_DEC:
5807 info->type = ADDRESS_REG_WB;
5808 info->base = XEXP (x, 0);
5809 info->offset = NULL_RTX;
5810 return aarch64_base_register_rtx_p (info->base, strict_p);
5812 case POST_MODIFY:
5813 case PRE_MODIFY:
5814 info->type = ADDRESS_REG_WB;
5815 info->base = XEXP (x, 0);
5816 if (GET_CODE (XEXP (x, 1)) == PLUS
5817 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5818 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5819 && aarch64_base_register_rtx_p (info->base, strict_p))
5821 info->offset = XEXP (XEXP (x, 1), 1);
5822 info->const_offset = offset;
5824 /* TImode and TFmode values are allowed in both pairs of X
5825 registers and individual Q registers. The available
5826 address modes are:
5827 X,X: 7-bit signed scaled offset
5828 Q: 9-bit signed offset
5829 We conservatively require an offset representable in either mode. */
5831 if (mode == TImode || mode == TFmode)
5832 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5833 && offset_9bit_signed_unscaled_p (mode, offset));
5835 if (load_store_pair_p)
5836 return ((known_eq (GET_MODE_SIZE (mode), 4)
5837 || known_eq (GET_MODE_SIZE (mode), 8))
5838 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5839 else
5840 return offset_9bit_signed_unscaled_p (mode, offset);
5842 return false;
5844 case CONST:
5845 case SYMBOL_REF:
5846 case LABEL_REF:
5847 /* load literal: pc-relative constant pool entry. Only supported
5848 for SI mode or larger. */
5849 info->type = ADDRESS_SYMBOLIC;
5851 if (!load_store_pair_p
5852 && GET_MODE_SIZE (mode).is_constant (&const_size)
5853 && const_size >= 4)
5855 rtx sym, addend;
5857 split_const (x, &sym, &addend);
5858 return ((GET_CODE (sym) == LABEL_REF
5859 || (GET_CODE (sym) == SYMBOL_REF
5860 && CONSTANT_POOL_ADDRESS_P (sym)
5861 && aarch64_pcrelative_literal_loads)));
5863 return false;
5865 case LO_SUM:
5866 info->type = ADDRESS_LO_SUM;
5867 info->base = XEXP (x, 0);
5868 info->offset = XEXP (x, 1);
5869 if (allow_reg_index_p
5870 && aarch64_base_register_rtx_p (info->base, strict_p))
5872 rtx sym, offs;
5873 split_const (info->offset, &sym, &offs);
5874 if (GET_CODE (sym) == SYMBOL_REF
5875 && (aarch64_classify_symbol (sym, INTVAL (offs))
5876 == SYMBOL_SMALL_ABSOLUTE))
5878 /* The symbol and offset must be aligned to the access size. */
5879 unsigned int align;
5881 if (CONSTANT_POOL_ADDRESS_P (sym))
5882 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5883 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5885 tree exp = SYMBOL_REF_DECL (sym);
5886 align = TYPE_ALIGN (TREE_TYPE (exp));
5887 align = aarch64_constant_alignment (exp, align);
5889 else if (SYMBOL_REF_DECL (sym))
5890 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5891 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5892 && SYMBOL_REF_BLOCK (sym) != NULL)
5893 align = SYMBOL_REF_BLOCK (sym)->alignment;
5894 else
5895 align = BITS_PER_UNIT;
5897 poly_int64 ref_size = GET_MODE_SIZE (mode);
5898 if (known_eq (ref_size, 0))
5899 ref_size = GET_MODE_SIZE (DImode);
5901 return (multiple_p (INTVAL (offs), ref_size)
5902 && multiple_p (align / BITS_PER_UNIT, ref_size));
5905 return false;
5907 default:
5908 return false;
5912 /* Return true if the address X is valid for a PRFM instruction.
5913 STRICT_P is true if we should do strict checking with
5914 aarch64_classify_address. */
5916 bool
5917 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5919 struct aarch64_address_info addr;
5921 /* PRFM accepts the same addresses as DImode... */
5922 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5923 if (!res)
5924 return false;
5926 /* ... except writeback forms. */
5927 return addr.type != ADDRESS_REG_WB;
5930 bool
5931 aarch64_symbolic_address_p (rtx x)
5933 rtx offset;
5935 split_const (x, &x, &offset);
5936 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5939 /* Classify the base of symbolic expression X. */
5941 enum aarch64_symbol_type
5942 aarch64_classify_symbolic_expression (rtx x)
5944 rtx offset;
5946 split_const (x, &x, &offset);
5947 return aarch64_classify_symbol (x, INTVAL (offset));
5951 /* Return TRUE if X is a legitimate address for accessing memory in
5952 mode MODE. */
5953 static bool
5954 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5956 struct aarch64_address_info addr;
5958 return aarch64_classify_address (&addr, x, mode, strict_p);
5961 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5962 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5963 bool
5964 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5965 aarch64_addr_query_type type)
5967 struct aarch64_address_info addr;
5969 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5972 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5974 static bool
5975 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5976 poly_int64 orig_offset,
5977 machine_mode mode)
5979 HOST_WIDE_INT size;
5980 if (GET_MODE_SIZE (mode).is_constant (&size))
5982 HOST_WIDE_INT const_offset, second_offset;
5984 /* A general SVE offset is A * VQ + B. Remove the A component from
5985 coefficient 0 in order to get the constant B. */
5986 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
5988 /* Split an out-of-range address displacement into a base and
5989 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
5990 range otherwise, to increase opportunities for sharing the base
5991 address between accesses of different sizes. Unaligned accesses
5992 use the signed 9-bit range; TImode/TFmode use the intersection of
5993 the signed scaled 7-bit and the signed 9-bit offset ranges. */
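/* Worked example (illustrative): for DImode with an aligned const_offset of
   0x10010, second_offset = 0x10010 & 0x3ffc = 0x10, so the address is split
   into a shareable base at +0x10000 plus an in-range offset of +0x10.  */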
5994 if (mode == TImode || mode == TFmode)
5995 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
5996 else if ((const_offset & (size - 1)) != 0)
5997 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
5998 else
5999 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6001 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6002 return false;
6004 /* Split the offset into second_offset and the rest. */
6005 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6006 *offset2 = gen_int_mode (second_offset, Pmode);
6007 return true;
6009 else
6011 /* Get the mode we should use as the basis of the range. For structure
6012 modes this is the mode of one vector. */
6013 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6014 machine_mode step_mode
6015 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6017 /* Get the "mul vl" multiplier we'd like to use. */
6018 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6019 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6020 if (vec_flags & VEC_SVE_DATA)
6021 /* LDR supports a 9-bit range, but the move patterns for
6022 structure modes require all vectors to be in range of the
6023 same base. The simplest way of accommodating that while still
6024 promoting reuse of anchor points between different modes is
6025 to use an 8-bit range unconditionally. */
6026 vnum = ((vnum + 128) & 255) - 128;
6027 else
6028 /* Predicates are only handled singly, so we might as well use
6029 the full range. */
6030 vnum = ((vnum + 256) & 511) - 256;
6031 if (vnum == 0)
6032 return false;
6034 /* Convert the "mul vl" multiplier into a byte offset. */
6035 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6036 if (known_eq (second_offset, orig_offset))
6037 return false;
6039 /* Split the offset into second_offset and the rest. */
6040 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6041 *offset2 = gen_int_mode (second_offset, Pmode);
6042 return true;
6046 /* Return the binary representation of floating point constant VALUE in INTVAL.
6047 If the value cannot be converted, return false without setting INTVAL.
6048 The conversion is done in the given MODE. */
6049 bool
6050 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6053 /* We make a general exception for 0. */
6054 if (aarch64_float_const_zero_rtx_p (value))
6056 *intval = 0;
6057 return true;
6060 scalar_float_mode mode;
6061 if (GET_CODE (value) != CONST_DOUBLE
6062 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6063 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6064 /* Only support up to DF mode. */
6065 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6066 return false;
6068 unsigned HOST_WIDE_INT ival = 0;
6070 long res[2];
6071 real_to_target (res,
6072 CONST_DOUBLE_REAL_VALUE (value),
6073 REAL_MODE_FORMAT (mode));
6075 if (mode == DFmode)
6077 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6078 ival = zext_hwi (res[order], 32);
6079 ival |= (zext_hwi (res[1 - order], 32) << 32);
6081 else
6082 ival = zext_hwi (res[0], 32);
6084 *intval = ival;
6085 return true;
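/* For example, the DFmode constant 1.0 is returned as 0x3ff0000000000000
   and the SFmode constant 1.0 as 0x3f800000 (zero-extended to 64 bits).  */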
6088 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6089 single MOV(+MOVK) followed by an FMOV. */
6090 bool
6091 aarch64_float_const_rtx_p (rtx x)
6093 machine_mode mode = GET_MODE (x);
6094 if (mode == VOIDmode)
6095 return false;
6097 /* Determine whether it's cheaper to write float constants as
6098 mov/movk pairs rather than ldr/adrp pairs. */
6099 unsigned HOST_WIDE_INT ival;
6101 if (GET_CODE (x) == CONST_DOUBLE
6102 && SCALAR_FLOAT_MODE_P (mode)
6103 && aarch64_reinterpret_float_as_int (x, &ival))
6105 scalar_int_mode imode = (mode == HFmode
6106 ? SImode
6107 : int_mode_for_mode (mode).require ());
6108 int num_instr = aarch64_internal_mov_immediate
6109 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6110 return num_instr < 3;
6113 return false;
6116 /* Return TRUE if rtx X is immediate constant 0.0 */
6117 bool
6118 aarch64_float_const_zero_rtx_p (rtx x)
6120 if (GET_MODE (x) == VOIDmode)
6121 return false;
6123 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6124 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6125 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6128 /* Return TRUE if rtx X is immediate constant that fits in a single
6129 MOVI immediate operation. */
6130 bool
6131 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6133 if (!TARGET_SIMD)
6134 return false;
6136 machine_mode vmode;
6137 scalar_int_mode imode;
6138 unsigned HOST_WIDE_INT ival;
6140 if (GET_CODE (x) == CONST_DOUBLE
6141 && SCALAR_FLOAT_MODE_P (mode))
6143 if (!aarch64_reinterpret_float_as_int (x, &ival))
6144 return false;
6146 /* We make a general exception for 0. */
6147 if (aarch64_float_const_zero_rtx_p (x))
6148 return true;
6150 imode = int_mode_for_mode (mode).require ();
6152 else if (GET_CODE (x) == CONST_INT
6153 && is_a <scalar_int_mode> (mode, &imode))
6154 ival = INTVAL (x);
6155 else
6156 return false;
6158 /* Use a 64-bit vector mode for everything except DImode/DFmode, where
6159 we use a 128-bit vector mode. */
6160 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6162 vmode = aarch64_simd_container_mode (imode, width);
6163 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6165 return aarch64_simd_valid_immediate (v_op, NULL);
6169 /* Return the fixed registers used for condition codes. */
6171 static bool
6172 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6174 *p1 = CC_REGNUM;
6175 *p2 = INVALID_REGNUM;
6176 return true;
6179 /* This function is used by the call expanders of the machine description.
6180 RESULT is the register in which the result is returned. It's NULL for
6181 "call" and "sibcall".
6182 MEM is the location of the function call.
6183 SIBCALL indicates whether this function call is normal call or sibling call.
6184 It will generate different pattern accordingly. */
6186 void
6187 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6189 rtx call, callee, tmp;
6190 rtvec vec;
6191 machine_mode mode;
6193 gcc_assert (MEM_P (mem));
6194 callee = XEXP (mem, 0);
6195 mode = GET_MODE (callee);
6196 gcc_assert (mode == Pmode);
6198 /* Decide if we should generate indirect calls by loading the
6199 address of the callee into a register before performing
6200 the branch-and-link. */
6201 if (SYMBOL_REF_P (callee)
6202 ? (aarch64_is_long_call_p (callee)
6203 || aarch64_is_noplt_call_p (callee))
6204 : !REG_P (callee))
6205 XEXP (mem, 0) = force_reg (mode, callee);
6207 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6209 if (result != NULL_RTX)
6210 call = gen_rtx_SET (result, call);
6212 if (sibcall)
6213 tmp = ret_rtx;
6214 else
6215 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6217 vec = gen_rtvec (2, call, tmp);
6218 call = gen_rtx_PARALLEL (VOIDmode, vec);
6220 aarch64_emit_call_insn (call);
6223 /* Emit call insn with PAT and do aarch64-specific handling. */
6225 void
6226 aarch64_emit_call_insn (rtx pat)
6228 rtx insn = emit_call_insn (pat);
6230 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6231 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6232 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6235 machine_mode
6236 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6238 /* All floating point compares return CCFP if it is an equality
6239 comparison, and CCFPE otherwise. */
6240 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6242 switch (code)
6244 case EQ:
6245 case NE:
6246 case UNORDERED:
6247 case ORDERED:
6248 case UNLT:
6249 case UNLE:
6250 case UNGT:
6251 case UNGE:
6252 case UNEQ:
6253 return CCFPmode;
6255 case LT:
6256 case LE:
6257 case GT:
6258 case GE:
6259 case LTGT:
6260 return CCFPEmode;
6262 default:
6263 gcc_unreachable ();
6267 /* Equality comparisons of short modes against zero can be performed
6268 using the TST instruction with the appropriate bitmask. */
6269 if (y == const0_rtx && REG_P (x)
6270 && (code == EQ || code == NE)
6271 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6272 return CC_NZmode;
6274 /* Similarly, comparisons of zero_extends from shorter modes can
6275 be performed using an ANDS with an immediate mask. */
6276 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6277 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6278 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6279 && (code == EQ || code == NE))
6280 return CC_NZmode;
6282 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6283 && y == const0_rtx
6284 && (code == EQ || code == NE || code == LT || code == GE)
6285 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6286 || GET_CODE (x) == NEG
6287 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6288 && CONST_INT_P (XEXP (x, 2)))))
6289 return CC_NZmode;
6291 /* A compare with a shifted operand. Because of canonicalization,
6292 the comparison will have to be swapped when we emit the assembly
6293 code. */
6294 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6295 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6296 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6297 || GET_CODE (x) == LSHIFTRT
6298 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6299 return CC_SWPmode;
6301 /* Similarly for a negated operand, but we can only do this for
6302 equalities. */
6303 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6304 && (REG_P (y) || GET_CODE (y) == SUBREG)
6305 && (code == EQ || code == NE)
6306 && GET_CODE (x) == NEG)
6307 return CC_Zmode;
6309 /* A test for unsigned overflow. */
6310 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6311 && code == NE
6312 && GET_CODE (x) == PLUS
6313 && GET_CODE (y) == ZERO_EXTEND)
6314 return CC_Cmode;
6316 /* For everything else, return CCmode. */
6317 return CCmode;
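/* For example, comparing (plus:DI a b) against zero with code NE selects
   CC_NZmode above, allowing the addition and the comparison to be
   implemented as a single ADDS followed by a B.NE.  */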
6320 static int
6321 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6324 aarch64_get_condition_code (rtx x)
6326 machine_mode mode = GET_MODE (XEXP (x, 0));
6327 enum rtx_code comp_code = GET_CODE (x);
6329 if (GET_MODE_CLASS (mode) != MODE_CC)
6330 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6331 return aarch64_get_condition_code_1 (mode, comp_code);
6334 static int
6335 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6337 switch (mode)
6339 case E_CCFPmode:
6340 case E_CCFPEmode:
6341 switch (comp_code)
6343 case GE: return AARCH64_GE;
6344 case GT: return AARCH64_GT;
6345 case LE: return AARCH64_LS;
6346 case LT: return AARCH64_MI;
6347 case NE: return AARCH64_NE;
6348 case EQ: return AARCH64_EQ;
6349 case ORDERED: return AARCH64_VC;
6350 case UNORDERED: return AARCH64_VS;
6351 case UNLT: return AARCH64_LT;
6352 case UNLE: return AARCH64_LE;
6353 case UNGT: return AARCH64_HI;
6354 case UNGE: return AARCH64_PL;
6355 default: return -1;
6357 break;
6359 case E_CCmode:
6360 switch (comp_code)
6362 case NE: return AARCH64_NE;
6363 case EQ: return AARCH64_EQ;
6364 case GE: return AARCH64_GE;
6365 case GT: return AARCH64_GT;
6366 case LE: return AARCH64_LE;
6367 case LT: return AARCH64_LT;
6368 case GEU: return AARCH64_CS;
6369 case GTU: return AARCH64_HI;
6370 case LEU: return AARCH64_LS;
6371 case LTU: return AARCH64_CC;
6372 default: return -1;
6374 break;
6376 case E_CC_SWPmode:
6377 switch (comp_code)
6379 case NE: return AARCH64_NE;
6380 case EQ: return AARCH64_EQ;
6381 case GE: return AARCH64_LE;
6382 case GT: return AARCH64_LT;
6383 case LE: return AARCH64_GE;
6384 case LT: return AARCH64_GT;
6385 case GEU: return AARCH64_LS;
6386 case GTU: return AARCH64_CC;
6387 case LEU: return AARCH64_CS;
6388 case LTU: return AARCH64_HI;
6389 default: return -1;
6391 break;
6393 case E_CC_NZmode:
6394 switch (comp_code)
6396 case NE: return AARCH64_NE;
6397 case EQ: return AARCH64_EQ;
6398 case GE: return AARCH64_PL;
6399 case LT: return AARCH64_MI;
6400 default: return -1;
6402 break;
6404 case E_CC_Zmode:
6405 switch (comp_code)
6407 case NE: return AARCH64_NE;
6408 case EQ: return AARCH64_EQ;
6409 default: return -1;
6411 break;
6413 case E_CC_Cmode:
6414 switch (comp_code)
6416 case NE: return AARCH64_CS;
6417 case EQ: return AARCH64_CC;
6418 default: return -1;
6420 break;
6422 default:
6423 return -1;
6426 return -1;
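/* Note the swapped mappings for CC_SWPmode: the comparison was canonicalized
   with a shifted or extended operand first, so the assembly output compares
   the operands in the opposite order and GT must be tested as LT, GE as LE,
   and so on.  */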
6429 bool
6430 aarch64_const_vec_all_same_in_range_p (rtx x,
6431 HOST_WIDE_INT minval,
6432 HOST_WIDE_INT maxval)
6434 rtx elt;
6435 return (const_vec_duplicate_p (x, &elt)
6436 && CONST_INT_P (elt)
6437 && IN_RANGE (INTVAL (elt), minval, maxval));
6440 bool
6441 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6443 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6446 /* Return true if VEC is a constant in which every element is in the range
6447 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6449 static bool
6450 aarch64_const_vec_all_in_range_p (rtx vec,
6451 HOST_WIDE_INT minval,
6452 HOST_WIDE_INT maxval)
6454 if (GET_CODE (vec) != CONST_VECTOR
6455 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6456 return false;
6458 int nunits;
6459 if (!CONST_VECTOR_STEPPED_P (vec))
6460 nunits = const_vector_encoded_nelts (vec);
6461 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6462 return false;
6464 for (int i = 0; i < nunits; i++)
6466 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6467 if (!CONST_INT_P (vec_elem)
6468 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6469 return false;
6471 return true;
6474 /* N Z C V. */
6475 #define AARCH64_CC_V 1
6476 #define AARCH64_CC_C (1 << 1)
6477 #define AARCH64_CC_Z (1 << 2)
6478 #define AARCH64_CC_N (1 << 3)
6480 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6481 static const int aarch64_nzcv_codes[] =
6483 0, /* EQ, Z == 1. */
6484 AARCH64_CC_Z, /* NE, Z == 0. */
6485 0, /* CS, C == 1. */
6486 AARCH64_CC_C, /* CC, C == 0. */
6487 0, /* MI, N == 1. */
6488 AARCH64_CC_N, /* PL, N == 0. */
6489 0, /* VS, V == 1. */
6490 AARCH64_CC_V, /* VC, V == 0. */
6491 0, /* HI, C == 1 && Z == 0. */
6492 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6493 AARCH64_CC_V, /* GE, N == V. */
6494 0, /* LT, N != V. */
6495 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6496 0, /* LE, !(Z == 0 && N == V). */
6497 0, /* AL, Any. */
6498 0 /* NV, Any. */
6501 /* Print floating-point vector immediate operand X to F, negating it
6502 first if NEGATE is true. Return true on success, false if it isn't
6503 a constant we can handle. */
6505 static bool
6506 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6508 rtx elt;
6510 if (!const_vec_duplicate_p (x, &elt))
6511 return false;
6513 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6514 if (negate)
6515 r = real_value_negate (&r);
6517 /* We only handle the SVE single-bit immediates here. */
6518 if (real_equal (&r, &dconst0))
6519 asm_fprintf (f, "0.0");
6520 else if (real_equal (&r, &dconst1))
6521 asm_fprintf (f, "1.0");
6522 else if (real_equal (&r, &dconsthalf))
6523 asm_fprintf (f, "0.5");
6524 else
6525 return false;
6527 return true;
6530 /* Return the equivalent letter for size. */
6531 static char
6532 sizetochar (int size)
6534 switch (size)
6536 case 64: return 'd';
6537 case 32: return 's';
6538 case 16: return 'h';
6539 case 8 : return 'b';
6540 default: gcc_unreachable ();
6544 /* Print operand X to file F in a target-specific manner according to CODE.
6545 The acceptable formatting commands given by CODE are:
6546 'c': An integer or symbol address without a preceding #
6547 sign.
6548 'C': Take the duplicated element in a vector constant
6549 and print it in hex.
6550 'D': Take the duplicated element in a vector constant
6551 and print it as an unsigned integer, in decimal.
6552 'e': Print the sign/zero-extend size as a character 8->b,
6553 16->h, 32->w.
6554 'p': Prints N such that 2^N == X (X must be power of 2 and
6555 const int).
6556 'P': Print the number of non-zero bits in X (a const_int).
6557 'H': Print the higher numbered register of a pair (TImode)
6558 of regs.
6559 'm': Print a condition (eq, ne, etc.).
6560 'M': Same as 'm', but invert condition.
6561 'N': Take the duplicated element in a vector constant
6562 and print the negative of it in decimal.
6563 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6564 'S/T/U/V': Print a FP/SIMD register name for a register list.
6565 The register printed is the FP/SIMD register name
6566 of X + 0/1/2/3 for S/T/U/V.
6567 'R': Print a scalar FP/SIMD register name + 1.
6568 'X': Print bottom 16 bits of integer constant in hex.
6569 'w/x': Print a general register name or the zero register
6570 (32-bit or 64-bit).
6571 '0': Print a normal operand, if it's a general register,
6572 then we assume DImode.
6573 'k': Print NZCV for conditional compare instructions.
6574 'A': Output address constant representing the first
6575 argument of X, specifying a relocation offset
6576 if appropriate.
6577 'L': Output constant address specified by X
6578 with a relocation offset if appropriate.
6579 'G': Prints address of X, specifying a PC relative
6580 relocation mode if appropriate.
6581 'y': Output address of LDP or STP - this is used for
6582 some LDP/STPs which don't use a PARALLEL in their
6583 pattern (so the mode needs to be adjusted).
6584 'z': Output address of a typical LDP or STP. */
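/* For example, an insn template such as "add\t%w0, %w1, %w2" uses the 'w'
   code above to print 32-bit register names (or wzr for a zero operand),
   while "%x0" prints the corresponding 64-bit name.  */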
6586 static void
6587 aarch64_print_operand (FILE *f, rtx x, int code)
6589 rtx elt;
6590 switch (code)
6592 case 'c':
6593 switch (GET_CODE (x))
6595 case CONST_INT:
6596 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6597 break;
6599 case SYMBOL_REF:
6600 output_addr_const (f, x);
6601 break;
6603 case CONST:
6604 if (GET_CODE (XEXP (x, 0)) == PLUS
6605 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6607 output_addr_const (f, x);
6608 break;
6610 /* Fall through. */
6612 default:
6613 output_operand_lossage ("unsupported operand for code '%c'", code);
6615 break;
6617 case 'e':
6619 int n;
6621 if (!CONST_INT_P (x)
6622 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6624 output_operand_lossage ("invalid operand for '%%%c'", code);
6625 return;
6628 switch (n)
6630 case 3:
6631 fputc ('b', f);
6632 break;
6633 case 4:
6634 fputc ('h', f);
6635 break;
6636 case 5:
6637 fputc ('w', f);
6638 break;
6639 default:
6640 output_operand_lossage ("invalid operand for '%%%c'", code);
6641 return;
6644 break;
6646 case 'p':
6648 int n;
6650 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6652 output_operand_lossage ("invalid operand for '%%%c'", code);
6653 return;
6656 asm_fprintf (f, "%d", n);
6658 break;
6660 case 'P':
6661 if (!CONST_INT_P (x))
6663 output_operand_lossage ("invalid operand for '%%%c'", code);
6664 return;
6667 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6668 break;
6670 case 'H':
6671 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6673 output_operand_lossage ("invalid operand for '%%%c'", code);
6674 return;
6677 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6678 break;
6680 case 'M':
6681 case 'm':
6683 int cond_code;
6684 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6685 if (x == const_true_rtx)
6687 if (code == 'M')
6688 fputs ("nv", f);
6689 return;
6692 if (!COMPARISON_P (x))
6694 output_operand_lossage ("invalid operand for '%%%c'", code);
6695 return;
6698 cond_code = aarch64_get_condition_code (x);
6699 gcc_assert (cond_code >= 0);
6700 if (code == 'M')
6701 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6702 fputs (aarch64_condition_codes[cond_code], f);
6704 break;
6706 case 'N':
6707 if (!const_vec_duplicate_p (x, &elt))
6709 output_operand_lossage ("invalid vector constant");
6710 return;
6713 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6714 asm_fprintf (f, "%wd", -INTVAL (elt));
6715 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6716 && aarch64_print_vector_float_operand (f, x, true))
6718 else
6720 output_operand_lossage ("invalid vector constant");
6721 return;
6723 break;
6725 case 'b':
6726 case 'h':
6727 case 's':
6728 case 'd':
6729 case 'q':
6730 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6732 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6733 return;
6735 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6736 break;
6738 case 'S':
6739 case 'T':
6740 case 'U':
6741 case 'V':
6742 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6744 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6745 return;
6747 asm_fprintf (f, "%c%d",
6748 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6749 REGNO (x) - V0_REGNUM + (code - 'S'));
6750 break;
6752 case 'R':
6753 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6755 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6756 return;
6758 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6759 break;
6761 case 'X':
6762 if (!CONST_INT_P (x))
6764 output_operand_lossage ("invalid operand for '%%%c'", code);
6765 return;
6767 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6768 break;
6770 case 'C':
6772 /* Print a replicated constant in hex. */
6773 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6775 output_operand_lossage ("invalid operand for '%%%c'", code);
6776 return;
6778 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6779 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6781 break;
6783 case 'D':
6785 /* Print a replicated constant in decimal, treating it as
6786 unsigned. */
6787 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6789 output_operand_lossage ("invalid operand for '%%%c'", code);
6790 return;
6792 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6793 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6795 break;
6797 case 'w':
6798 case 'x':
6799 if (x == const0_rtx
6800 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6802 asm_fprintf (f, "%czr", code);
6803 break;
6806 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6808 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6809 break;
6812 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6814 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6815 break;
6818 /* Fall through */
6820 case 0:
6821 if (x == NULL)
6823 output_operand_lossage ("missing operand");
6824 return;
6827 switch (GET_CODE (x))
6829 case REG:
6830 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6832 if (REG_NREGS (x) == 1)
6833 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6834 else
6836 char suffix
6837 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6838 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6839 REGNO (x) - V0_REGNUM, suffix,
6840 END_REGNO (x) - V0_REGNUM - 1, suffix);
6843 else
6844 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6845 break;
6847 case MEM:
6848 output_address (GET_MODE (x), XEXP (x, 0));
6849 break;
6851 case LABEL_REF:
6852 case SYMBOL_REF:
6853 output_addr_const (asm_out_file, x);
6854 break;
6856 case CONST_INT:
6857 asm_fprintf (f, "%wd", INTVAL (x));
6858 break;
6860 case CONST:
6861 if (!VECTOR_MODE_P (GET_MODE (x)))
6863 output_addr_const (asm_out_file, x);
6864 break;
6866 /* fall through */
6868 case CONST_VECTOR:
6869 if (!const_vec_duplicate_p (x, &elt))
6871 output_operand_lossage ("invalid vector constant");
6872 return;
6875 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6876 asm_fprintf (f, "%wd", INTVAL (elt));
6877 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6878 && aarch64_print_vector_float_operand (f, x, false))
6880 else
6882 output_operand_lossage ("invalid vector constant");
6883 return;
6885 break;
6887 case CONST_DOUBLE:
6888 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6889 be getting CONST_DOUBLEs holding integers. */
6890 gcc_assert (GET_MODE (x) != VOIDmode);
6891 if (aarch64_float_const_zero_rtx_p (x))
6893 fputc ('0', f);
6894 break;
6896 else if (aarch64_float_const_representable_p (x))
6898 #define buf_size 20
6899 char float_buf[buf_size] = {'\0'};
6900 real_to_decimal_for_mode (float_buf,
6901 CONST_DOUBLE_REAL_VALUE (x),
6902 buf_size, buf_size,
6903 1, GET_MODE (x));
6904 asm_fprintf (asm_out_file, "%s", float_buf);
6905 break;
6906 #undef buf_size
6908 output_operand_lossage ("invalid constant");
6909 return;
6910 default:
6911 output_operand_lossage ("invalid operand");
6912 return;
6914 break;
6916 case 'A':
6917 if (GET_CODE (x) == HIGH)
6918 x = XEXP (x, 0);
6920 switch (aarch64_classify_symbolic_expression (x))
6922 case SYMBOL_SMALL_GOT_4G:
6923 asm_fprintf (asm_out_file, ":got:");
6924 break;
6926 case SYMBOL_SMALL_TLSGD:
6927 asm_fprintf (asm_out_file, ":tlsgd:");
6928 break;
6930 case SYMBOL_SMALL_TLSDESC:
6931 asm_fprintf (asm_out_file, ":tlsdesc:");
6932 break;
6934 case SYMBOL_SMALL_TLSIE:
6935 asm_fprintf (asm_out_file, ":gottprel:");
6936 break;
6938 case SYMBOL_TLSLE24:
6939 asm_fprintf (asm_out_file, ":tprel:");
6940 break;
6942 case SYMBOL_TINY_GOT:
6943 gcc_unreachable ();
6944 break;
6946 default:
6947 break;
6949 output_addr_const (asm_out_file, x);
6950 break;
6952 case 'L':
6953 switch (aarch64_classify_symbolic_expression (x))
6955 case SYMBOL_SMALL_GOT_4G:
6956 asm_fprintf (asm_out_file, ":lo12:");
6957 break;
6959 case SYMBOL_SMALL_TLSGD:
6960 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6961 break;
6963 case SYMBOL_SMALL_TLSDESC:
6964 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6965 break;
6967 case SYMBOL_SMALL_TLSIE:
6968 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6969 break;
6971 case SYMBOL_TLSLE12:
6972 asm_fprintf (asm_out_file, ":tprel_lo12:");
6973 break;
6975 case SYMBOL_TLSLE24:
6976 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6977 break;
6979 case SYMBOL_TINY_GOT:
6980 asm_fprintf (asm_out_file, ":got:");
6981 break;
6983 case SYMBOL_TINY_TLSIE:
6984 asm_fprintf (asm_out_file, ":gottprel:");
6985 break;
6987 default:
6988 break;
6990 output_addr_const (asm_out_file, x);
6991 break;
6993 case 'G':
6994 switch (aarch64_classify_symbolic_expression (x))
6996 case SYMBOL_TLSLE24:
6997 asm_fprintf (asm_out_file, ":tprel_hi12:");
6998 break;
6999 default:
7000 break;
7002 output_addr_const (asm_out_file, x);
7003 break;
7005 case 'k':
7007 HOST_WIDE_INT cond_code;
7009 if (!CONST_INT_P (x))
7011 output_operand_lossage ("invalid operand for '%%%c'", code);
7012 return;
7015 cond_code = INTVAL (x);
7016 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7017 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7019 break;
7021 case 'y':
7022 case 'z':
7024 machine_mode mode = GET_MODE (x);
7026 if (GET_CODE (x) != MEM
7027 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7029 output_operand_lossage ("invalid operand for '%%%c'", code);
7030 return;
7033 if (code == 'y')
7034 /* LDP/STP which uses a single double-width memory operand.
7035 Adjust the mode to appear like a typical LDP/STP.
7036 Currently this is supported for 16-byte accesses only. */
7037 mode = DFmode;
7039 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7040 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7042 break;
7044 default:
7045 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7046 return;
7050 /* Print address 'x' of a memory access with mode 'mode'.
7051 'type' is the aarch64_addr_query_type context passed to
7052 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP address. */
7053 static bool
7054 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7055 aarch64_addr_query_type type)
7057 struct aarch64_address_info addr;
7058 unsigned int size;
7060 /* Check all addresses are Pmode - including ILP32. */
7061 if (GET_MODE (x) != Pmode)
7062 output_operand_lossage ("invalid address mode");
7064 if (aarch64_classify_address (&addr, x, mode, true, type))
7065 switch (addr.type)
7067 case ADDRESS_REG_IMM:
7068 if (known_eq (addr.const_offset, 0))
7069 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7070 else if (aarch64_sve_data_mode_p (mode))
7072 HOST_WIDE_INT vnum
7073 = exact_div (addr.const_offset,
7074 BYTES_PER_SVE_VECTOR).to_constant ();
7075 asm_fprintf (f, "[%s, #%wd, mul vl]",
7076 reg_names[REGNO (addr.base)], vnum);
7078 else if (aarch64_sve_pred_mode_p (mode))
7080 HOST_WIDE_INT vnum
7081 = exact_div (addr.const_offset,
7082 BYTES_PER_SVE_PRED).to_constant ();
7083 asm_fprintf (f, "[%s, #%wd, mul vl]",
7084 reg_names[REGNO (addr.base)], vnum);
7086 else
7087 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7088 INTVAL (addr.offset));
7089 return true;
7091 case ADDRESS_REG_REG:
7092 if (addr.shift == 0)
7093 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7094 reg_names [REGNO (addr.offset)]);
7095 else
7096 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7097 reg_names [REGNO (addr.offset)], addr.shift);
7098 return true;
7100 case ADDRESS_REG_UXTW:
7101 if (addr.shift == 0)
7102 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7103 REGNO (addr.offset) - R0_REGNUM);
7104 else
7105 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7106 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7107 return true;
7109 case ADDRESS_REG_SXTW:
7110 if (addr.shift == 0)
7111 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7112 REGNO (addr.offset) - R0_REGNUM);
7113 else
7114 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7115 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7116 return true;
7118 case ADDRESS_REG_WB:
7119 /* Writeback is only supported for fixed-width modes. */
7120 size = GET_MODE_SIZE (mode).to_constant ();
7121 switch (GET_CODE (x))
7123 case PRE_INC:
7124 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7125 return true;
7126 case POST_INC:
7127 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7128 return true;
7129 case PRE_DEC:
7130 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7131 return true;
7132 case POST_DEC:
7133 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7134 return true;
7135 case PRE_MODIFY:
7136 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7137 INTVAL (addr.offset));
7138 return true;
7139 case POST_MODIFY:
7140 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7141 INTVAL (addr.offset));
7142 return true;
7143 default:
7144 break;
7146 break;
7148 case ADDRESS_LO_SUM:
7149 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7150 output_addr_const (f, addr.offset);
7151 asm_fprintf (f, "]");
7152 return true;
7154 case ADDRESS_SYMBOLIC:
7155 output_addr_const (f, x);
7156 return true;
7159 return false;
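/* For example, a DImode PRE_INC address based on x1 is printed as
   "[x1, 8]!" and the corresponding POST_INC form as "[x1], 8".  */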
7162 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7163 static bool
7164 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7166 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7169 /* Print address 'x' of a memory access with mode 'mode'. */
7170 static void
7171 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7173 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7174 output_addr_const (f, x);
7177 bool
7178 aarch64_label_mentioned_p (rtx x)
7180 const char *fmt;
7181 int i;
7183 if (GET_CODE (x) == LABEL_REF)
7184 return true;
7186 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7187 referencing instruction, but they are constant offsets, not
7188 symbols. */
7189 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7190 return false;
7192 fmt = GET_RTX_FORMAT (GET_CODE (x));
7193 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7195 if (fmt[i] == 'E')
7197 int j;
7199 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7200 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7201 return 1;
7203 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7204 return 1;
7207 return 0;
7210 /* Implement REGNO_REG_CLASS. */
7212 enum reg_class
7213 aarch64_regno_regclass (unsigned regno)
7215 if (GP_REGNUM_P (regno))
7216 return GENERAL_REGS;
7218 if (regno == SP_REGNUM)
7219 return STACK_REG;
7221 if (regno == FRAME_POINTER_REGNUM
7222 || regno == ARG_POINTER_REGNUM)
7223 return POINTER_REGS;
7225 if (FP_REGNUM_P (regno))
7226 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7228 if (PR_REGNUM_P (regno))
7229 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7231 return NO_REGS;
7234 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7235 If OFFSET is out of range, return an offset of an anchor point
7236 that is in range. Return 0 otherwise. */
7238 static HOST_WIDE_INT
7239 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7240 machine_mode mode)
7242 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7243 if (size > 16)
7244 return (offset + 0x400) & ~0x7f0;
7246 /* For offsets that aren't a multiple of the access size, the limit is
7247 -256...255. */
7248 if (offset & (size - 1))
7250 /* BLKmode typically uses LDP of X-registers. */
7251 if (mode == BLKmode)
7252 return (offset + 512) & ~0x3ff;
7253 return (offset + 0x100) & ~0x1ff;
7256 /* Small negative offsets are supported. */
7257 if (IN_RANGE (offset, -256, 0))
7258 return 0;
7260 if (mode == TImode || mode == TFmode)
7261 return (offset + 0x100) & ~0x1ff;
7263 /* Use 12-bit offset by access size. */
7264 return offset & (~0xfff * size);
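/* Worked example (illustrative): for a DImode access at offset 40000 none of
   the earlier cases apply, so the final case returns
   40000 & (~0xfff * 8) = 32768; the remaining 7232 is a multiple of 8 and
   fits the unsigned scaled 12-bit LDR/STR range.  */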
7267 static rtx
7268 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7270 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7271 where mask is selected by alignment and size of the offset.
7272 We try to pick as large a range for the offset as possible to
7273 maximize the chance of a CSE. However, for aligned addresses
7274 we limit the range to 4k so that structures with different sized
7275 elements are likely to use the same base. We need to be careful
7276 not to split a CONST for some forms of address expression, otherwise
7277 it will generate sub-optimal code. */
7279 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7281 rtx base = XEXP (x, 0);
7282 rtx offset_rtx = XEXP (x, 1);
7283 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7285 if (GET_CODE (base) == PLUS)
7287 rtx op0 = XEXP (base, 0);
7288 rtx op1 = XEXP (base, 1);
7290 /* Force any scaling into a temp for CSE. */
7291 op0 = force_reg (Pmode, op0);
7292 op1 = force_reg (Pmode, op1);
7294 /* Let the pointer register be in op0. */
7295 if (REG_POINTER (op1))
7296 std::swap (op0, op1);
7298 /* If the pointer is virtual or frame related, then we know that
7299 virtual register instantiation or register elimination is going
7300 to apply a second constant. We want the two constants folded
7301 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7302 if (virt_or_elim_regno_p (REGNO (op0)))
7304 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7305 NULL_RTX, true, OPTAB_DIRECT);
7306 return gen_rtx_PLUS (Pmode, base, op1);
7309 /* Otherwise, in order to encourage CSE (and thence loop strength
7310 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
7311 base = expand_binop (Pmode, add_optab, op0, op1,
7312 NULL_RTX, true, OPTAB_DIRECT);
7313 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7316 HOST_WIDE_INT size;
7317 if (GET_MODE_SIZE (mode).is_constant (&size))
7319 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7320 mode);
7321 if (base_offset != 0)
7323 base = plus_constant (Pmode, base, base_offset);
7324 base = force_operand (base, NULL_RTX);
7325 return plus_constant (Pmode, base, offset - base_offset);
7330 return x;
7333 /* Return the icode of the reload pattern used to load a constant-pool entry of mode MODE. */
7334 static enum insn_code
7335 aarch64_constant_pool_reload_icode (machine_mode mode)
7337 switch (mode)
7339 case E_SFmode:
7340 return CODE_FOR_aarch64_reload_movcpsfdi;
7342 case E_DFmode:
7343 return CODE_FOR_aarch64_reload_movcpdfdi;
7345 case E_TFmode:
7346 return CODE_FOR_aarch64_reload_movcptfdi;
7348 case E_V8QImode:
7349 return CODE_FOR_aarch64_reload_movcpv8qidi;
7351 case E_V16QImode:
7352 return CODE_FOR_aarch64_reload_movcpv16qidi;
7354 case E_V4HImode:
7355 return CODE_FOR_aarch64_reload_movcpv4hidi;
7357 case E_V8HImode:
7358 return CODE_FOR_aarch64_reload_movcpv8hidi;
7360 case E_V2SImode:
7361 return CODE_FOR_aarch64_reload_movcpv2sidi;
7363 case E_V4SImode:
7364 return CODE_FOR_aarch64_reload_movcpv4sidi;
7366 case E_V2DImode:
7367 return CODE_FOR_aarch64_reload_movcpv2didi;
7369 case E_V2DFmode:
7370 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7372 default:
7373 gcc_unreachable ();
7376 gcc_unreachable ();
7378 static reg_class_t
7379 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7380 reg_class_t rclass,
7381 machine_mode mode,
7382 secondary_reload_info *sri)
7384 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7385 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7386 comment at the head of aarch64-sve.md for more details about the
7387 big-endian handling. */
7388 if (BYTES_BIG_ENDIAN
7389 && reg_class_subset_p (rclass, FP_REGS)
7390 && !((REG_P (x) && HARD_REGISTER_P (x))
7391 || aarch64_simd_valid_immediate (x, NULL))
7392 && aarch64_sve_data_mode_p (mode))
7394 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7395 return NO_REGS;
7398 /* If we have to disable direct literal pool loads and stores because the
7399 function is too big, then we need a scratch register. */
7400 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7401 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7402 || targetm.vector_mode_supported_p (GET_MODE (x)))
7403 && !aarch64_pcrelative_literal_loads)
7405 sri->icode = aarch64_constant_pool_reload_icode (mode);
7406 return NO_REGS;
7409 /* Without the TARGET_SIMD instructions we cannot move a Q register
7410 to a Q register directly. We need a scratch. */
7411 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7412 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7413 && reg_class_subset_p (rclass, FP_REGS))
7415 if (mode == TFmode)
7416 sri->icode = CODE_FOR_aarch64_reload_movtf;
7417 else if (mode == TImode)
7418 sri->icode = CODE_FOR_aarch64_reload_movti;
7419 return NO_REGS;
7422 /* A TFmode or TImode memory access should be handled via FP_REGS
7423 because AArch64 has richer addressing modes for LDR/STR instructions
7424 than LDP/STP instructions. */
7425 if (TARGET_FLOAT && rclass == GENERAL_REGS
7426 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7427 return FP_REGS;
7429 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7430 return GENERAL_REGS;
7432 return NO_REGS;
7435 static bool
7436 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7438 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7440 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7441 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7442 if (frame_pointer_needed)
7443 return to == HARD_FRAME_POINTER_REGNUM;
7444 return true;
7447 poly_int64
7448 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7450 aarch64_layout_frame ();
7452 if (to == HARD_FRAME_POINTER_REGNUM)
7454 if (from == ARG_POINTER_REGNUM)
7455 return cfun->machine->frame.hard_fp_offset;
7457 if (from == FRAME_POINTER_REGNUM)
7458 return cfun->machine->frame.hard_fp_offset
7459 - cfun->machine->frame.locals_offset;
7462 if (to == STACK_POINTER_REGNUM)
7464 if (from == FRAME_POINTER_REGNUM)
7465 return cfun->machine->frame.frame_size
7466 - cfun->machine->frame.locals_offset;
7469 return cfun->machine->frame.frame_size;
7472 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7473 previous frame. */
7476 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7478 if (count != 0)
7479 return const0_rtx;
7480 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7484 static void
7485 aarch64_asm_trampoline_template (FILE *f)
7487 if (TARGET_ILP32)
7489 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7490 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7492 else
7494 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7495 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7497 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7498 assemble_aligned_integer (4, const0_rtx);
7499 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7500 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7503 static void
7504 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7506 rtx fnaddr, mem, a_tramp;
7507 const int tramp_code_sz = 16;
7509 /* Don't need to copy the trailing D-words; we fill those in below. */
7510 emit_block_move (m_tramp, assemble_trampoline_template (),
7511 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7512 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7513 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7514 if (GET_MODE (fnaddr) != ptr_mode)
7515 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7516 emit_move_insn (mem, fnaddr);
7518 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7519 emit_move_insn (mem, chain_value);
7521 /* XXX We should really define a "clear_cache" pattern and use
7522 gen_clear_cache(). */
7523 a_tramp = XEXP (m_tramp, 0);
7524 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7525 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7526 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7527 ptr_mode);
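/* The resulting trampoline layout for LP64 is therefore: bytes 0-15 hold the
   code from aarch64_asm_trampoline_template, bytes 16-23 the target function
   address and bytes 24-31 the static chain value.  */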
7530 static unsigned char
7531 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7533 /* ??? Logically we should only need to provide a value when
7534 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7535 can hold MODE, but at the moment we need to handle all modes.
7536 Just ignore any runtime parts for registers that can't store them. */
7537 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7538 unsigned int nregs;
7539 switch (regclass)
7541 case TAILCALL_ADDR_REGS:
7542 case POINTER_REGS:
7543 case GENERAL_REGS:
7544 case ALL_REGS:
7545 case POINTER_AND_FP_REGS:
7546 case FP_REGS:
7547 case FP_LO_REGS:
7548 if (aarch64_sve_data_mode_p (mode)
7549 && constant_multiple_p (GET_MODE_SIZE (mode),
7550 BYTES_PER_SVE_VECTOR, &nregs))
7551 return nregs;
7552 return (aarch64_vector_data_mode_p (mode)
7553 ? CEIL (lowest_size, UNITS_PER_VREG)
7554 : CEIL (lowest_size, UNITS_PER_WORD));
7555 case STACK_REG:
7556 case PR_REGS:
7557 case PR_LO_REGS:
7558 case PR_HI_REGS:
7559 return 1;
7561 case NO_REGS:
7562 return 0;
7564 default:
7565 break;
7567 gcc_unreachable ();
7570 static reg_class_t
7571 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7573 if (regclass == POINTER_REGS)
7574 return GENERAL_REGS;
7576 if (regclass == STACK_REG)
7578 if (REG_P(x)
7579 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7580 return regclass;
7582 return NO_REGS;
7585 /* Register elimination can result in a request for
7586 SP+constant->FP_REGS. We cannot support such operations, which
7587 use SP as source and an FP_REG as destination, so reject them
7588 right away. */
7589 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7591 rtx lhs = XEXP (x, 0);
7593 /* Look through a possible SUBREG introduced by ILP32. */
7594 if (GET_CODE (lhs) == SUBREG)
7595 lhs = SUBREG_REG (lhs);
7597 gcc_assert (REG_P (lhs));
7598 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7599 POINTER_REGS));
7600 return NO_REGS;
7603 return regclass;
7606 void
7607 aarch64_asm_output_labelref (FILE* f, const char *name)
7609 asm_fprintf (f, "%U%s", name);
7612 static void
7613 aarch64_elf_asm_constructor (rtx symbol, int priority)
7615 if (priority == DEFAULT_INIT_PRIORITY)
7616 default_ctor_section_asm_out_constructor (symbol, priority);
7617 else
7619 section *s;
7620 /* The priority is known to be in the range [0, 65535], so 18 bytes
7621 would be enough, but the compiler might not know that. To avoid
7622 a -Wformat-truncation false positive, use a larger size. */
7623 char buf[23];
7624 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7625 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7626 switch_to_section (s);
7627 assemble_align (POINTER_SIZE);
7628 assemble_aligned_integer (POINTER_BYTES, symbol);
7632 static void
7633 aarch64_elf_asm_destructor (rtx symbol, int priority)
7635 if (priority == DEFAULT_INIT_PRIORITY)
7636 default_dtor_section_asm_out_destructor (symbol, priority);
7637 else
7639 section *s;
7640 /* The priority is known to be in the range [0, 65535], so 18 bytes
7641 would be enough, but the compiler might not know that. To avoid
7642 a -Wformat-truncation false positive, use a larger size. */
7643 char buf[23];
7644 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7645 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7646 switch_to_section (s);
7647 assemble_align (POINTER_SIZE);
7648 assemble_aligned_integer (POINTER_BYTES, symbol);
7652 const char*
7653 aarch64_output_casesi (rtx *operands)
7655 char buf[100];
7656 char label[100];
7657 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7658 int index;
7659 static const char *const patterns[4][2] =
7662 "ldrb\t%w3, [%0,%w1,uxtw]",
7663 "add\t%3, %4, %w3, sxtb #2"
7666 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7667 "add\t%3, %4, %w3, sxth #2"
7670 "ldr\t%w3, [%0,%w1,uxtw #2]",
7671 "add\t%3, %4, %w3, sxtw #2"
7673 /* We assume that DImode is only generated when not optimizing and
7674 that we don't really need 64-bit address offsets. That would
7675 imply an object file with 8GB of code in a single function! */
7677 "ldr\t%w3, [%0,%w1,uxtw #2]",
7678 "add\t%3, %4, %w3, sxtw #2"
7682 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7684 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7685 index = exact_log2 (GET_MODE_SIZE (mode));
7687 gcc_assert (index >= 0 && index <= 3);
7689 /* Need to implement table size reduction, by changing the code below. */
7690 output_asm_insn (patterns[index][0], operands);
7691 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7692 snprintf (buf, sizeof (buf),
7693 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7694 output_asm_insn (buf, operands);
7695 output_asm_insn (patterns[index][1], operands);
7696 output_asm_insn ("br\t%3", operands);
7697 assemble_label (asm_out_file, label);
7698 return "";
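/* As a rough illustration, for a QImode dispatch table the code emitted
   above looks something like

	ldrb	w3, [x0, w1, uxtw]	// load the table entry
	adr	x4, .Lrtx<N>		// anchor label emitted just below
	add	x3, x4, w3, sxtb #2	// scale the entry and add the anchor
	br	x3
   .Lrtx<N>:

   where operand 0 is the table address, operand 1 the index, operand 2 the
   label and operands 3 and 4 are scratch registers; the register names used
   here are only examples.  */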
7702 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7703 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7704 operator. */
7707 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7709 if (shift >= 0 && shift <= 3)
7711 int size;
7712 for (size = 8; size <= 32; size *= 2)
7714 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7715 if (mask == bits << shift)
7716 return size;
7719 return 0;
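/* Worked examples (illustrative): aarch64_uxt_size (0, 0xff) and
   aarch64_uxt_size (1, 0x1fe) both return 8 (a UXTB operand, the second
   with LSL #1), while aarch64_uxt_size (2, 0x3fffffffc) returns 32 (UXTW
   with LSL #2).  Any mask that is not an 8/16/32-bit field shifted left
   by 0..3 yields 0.  */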
7722 /* Constant pools are per-function only when PC-relative
7723 literal loads are enabled or we are using the large memory
7724 model. */
7726 static inline bool
7727 aarch64_can_use_per_function_literal_pools_p (void)
7729 return (aarch64_pcrelative_literal_loads
7730 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7733 static bool
7734 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7736 /* FIXME: In an ideal world this would work similarly
7737 to the logic in aarch64_select_rtx_section, but this
7738 breaks bootstrap in gccgo. For now we work around
7739 this by returning false here. */
7740 return false;
7743 /* Select appropriate section for constants depending
7744 on where we place literal pools. */
7746 static section *
7747 aarch64_select_rtx_section (machine_mode mode,
7748 rtx x,
7749 unsigned HOST_WIDE_INT align)
7751 if (aarch64_can_use_per_function_literal_pools_p ())
7752 return function_section (current_function_decl);
7754 return default_elf_select_rtx_section (mode, x, align);
7757 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7758 void
7759 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7760 HOST_WIDE_INT offset)
7762 /* When using per-function literal pools, we must ensure that any code
7763 section is aligned to the minimal instruction length, lest we get
7764 errors from the assembler re "unaligned instructions". */
7765 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7766 ASM_OUTPUT_ALIGN (f, 2);
7769 /* Costs. */
7771 /* Helper function for rtx cost calculation. Strip a shift expression
7772 from X. Returns the inner operand if successful, or the original
7773 expression on failure. */
7774 static rtx
7775 aarch64_strip_shift (rtx x)
7777 rtx op = x;
7779 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7780 we can convert both to ROR during final output. */
7781 if ((GET_CODE (op) == ASHIFT
7782 || GET_CODE (op) == ASHIFTRT
7783 || GET_CODE (op) == LSHIFTRT
7784 || GET_CODE (op) == ROTATERT
7785 || GET_CODE (op) == ROTATE)
7786 && CONST_INT_P (XEXP (op, 1)))
7787 return XEXP (op, 0);
7789 if (GET_CODE (op) == MULT
7790 && CONST_INT_P (XEXP (op, 1))
7791 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7792 return XEXP (op, 0);
7794 return x;
7797 /* Helper function for rtx cost calculation. Strip an extend
7798 expression from X. Returns the inner operand if successful, or the
7799 original expression on failure. We deal with a number of possible
7800 canonicalization variations here. If STRIP_SHIFT is true, then
7801 we can strip off a shift also. */
7802 static rtx
7803 aarch64_strip_extend (rtx x, bool strip_shift)
7805 scalar_int_mode mode;
7806 rtx op = x;
7808 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7809 return op;
7811 /* Zero and sign extraction of a widened value. */
7812 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7813 && XEXP (op, 2) == const0_rtx
7814 && GET_CODE (XEXP (op, 0)) == MULT
7815 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7816 XEXP (op, 1)))
7817 return XEXP (XEXP (op, 0), 0);
7819 /* It can also be represented (for zero-extend) as an AND with an
7820 immediate. */
7821 if (GET_CODE (op) == AND
7822 && GET_CODE (XEXP (op, 0)) == MULT
7823 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7824 && CONST_INT_P (XEXP (op, 1))
7825 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7826 INTVAL (XEXP (op, 1))) != 0)
7827 return XEXP (XEXP (op, 0), 0);
7829 /* Now handle extended register, as this may also have an optional
7830 left shift by 1..4. */
7831 if (strip_shift
7832 && GET_CODE (op) == ASHIFT
7833 && CONST_INT_P (XEXP (op, 1))
7834 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7835 op = XEXP (op, 0);
7837 if (GET_CODE (op) == ZERO_EXTEND
7838 || GET_CODE (op) == SIGN_EXTEND)
7839 op = XEXP (op, 0);
7841 if (op != x)
7842 return op;
7844 return x;
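/* For example (illustrative), (zero_extend:DI (reg:SI x0)) is stripped to
   (reg:SI x0), and with STRIP_SHIFT set,
   (ashift:DI (sign_extend:DI (reg:SI x0)) (const_int 2)) is stripped down
   to (reg:SI x0) as well; anything that does not look like an
   extended-register operand is returned unchanged.  */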
7847 /* Return true iff CODE is a shift supported in combination
7848 with arithmetic instructions. */
7850 static bool
7851 aarch64_shift_p (enum rtx_code code)
7853 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7857 /* Return true iff X is a cheap shift without a sign extend. */
7859 static bool
7860 aarch64_cheap_mult_shift_p (rtx x)
7862 rtx op0, op1;
7864 op0 = XEXP (x, 0);
7865 op1 = XEXP (x, 1);
7867 if (!(aarch64_tune_params.extra_tuning_flags
7868 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7869 return false;
7871 if (GET_CODE (op0) == SIGN_EXTEND)
7872 return false;
7874 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7875 && UINTVAL (op1) <= 4)
7876 return true;
7878 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7879 return false;
7881 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7883 if (l2 > 0 && l2 <= 4)
7884 return true;
7886 return false;
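/* For example, on a core with AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND set,
   (mult:DI (reg:DI) (const_int 8)) is considered cheap, since log2 (8) == 3
   falls in the 1..4 extended-register shift range; shifts of sign-extended
   values, or shift amounts greater than 4, are not.  */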
7889 /* Helper function for rtx cost calculation. Calculate the cost of
7890 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7891 Return the calculated cost of the expression, recursing manually into
7892 operands where needed. */
7894 static int
7895 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7897 rtx op0, op1;
7898 const struct cpu_cost_table *extra_cost
7899 = aarch64_tune_params.insn_extra_cost;
7900 int cost = 0;
7901 bool compound_p = (outer == PLUS || outer == MINUS);
7902 machine_mode mode = GET_MODE (x);
7904 gcc_checking_assert (code == MULT);
7906 op0 = XEXP (x, 0);
7907 op1 = XEXP (x, 1);
7909 if (VECTOR_MODE_P (mode))
7910 mode = GET_MODE_INNER (mode);
7912 /* Integer multiply/fma. */
7913 if (GET_MODE_CLASS (mode) == MODE_INT)
7915 /* The multiply will be canonicalized as a shift; cost it as such. */
7916 if (aarch64_shift_p (GET_CODE (x))
7917 || (CONST_INT_P (op1)
7918 && exact_log2 (INTVAL (op1)) > 0))
7920 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7921 || GET_CODE (op0) == SIGN_EXTEND;
7922 if (speed)
7924 if (compound_p)
7926 /* If the shift is considered cheap,
7927 then don't add any cost. */
7928 if (aarch64_cheap_mult_shift_p (x))
7930 else if (REG_P (op1))
7931 /* ARITH + shift-by-register. */
7932 cost += extra_cost->alu.arith_shift_reg;
7933 else if (is_extend)
7934 /* ARITH + extended register. We don't have a cost field
7935 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7936 cost += extra_cost->alu.extend_arith;
7937 else
7938 /* ARITH + shift-by-immediate. */
7939 cost += extra_cost->alu.arith_shift;
7941 else
7942 /* LSL (immediate). */
7943 cost += extra_cost->alu.shift;
7946 /* Strip extends as we will have costed them in the case above. */
7947 if (is_extend)
7948 op0 = aarch64_strip_extend (op0, true);
7950 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7952 return cost;
7955 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7956 compound and let the below cases handle it. After all, MNEG is a
7957 special-case alias of MSUB. */
7958 if (GET_CODE (op0) == NEG)
7960 op0 = XEXP (op0, 0);
7961 compound_p = true;
7964 /* Integer multiplies or FMAs have zero/sign extending variants. */
7965 if ((GET_CODE (op0) == ZERO_EXTEND
7966 && GET_CODE (op1) == ZERO_EXTEND)
7967 || (GET_CODE (op0) == SIGN_EXTEND
7968 && GET_CODE (op1) == SIGN_EXTEND))
7970 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7971 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7973 if (speed)
7975 if (compound_p)
7976 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7977 cost += extra_cost->mult[0].extend_add;
7978 else
7979 /* MUL/SMULL/UMULL. */
7980 cost += extra_cost->mult[0].extend;
7983 return cost;
7986 /* This is either an integer multiply or a MADD. In both cases
7987 we want to recurse and cost the operands. */
7988 cost += rtx_cost (op0, mode, MULT, 0, speed);
7989 cost += rtx_cost (op1, mode, MULT, 1, speed);
7991 if (speed)
7993 if (compound_p)
7994 /* MADD/MSUB. */
7995 cost += extra_cost->mult[mode == DImode].add;
7996 else
7997 /* MUL. */
7998 cost += extra_cost->mult[mode == DImode].simple;
8001 return cost;
8003 else
8005 if (speed)
8007 /* Floating-point FMA/FMUL can also support negations of the
8008 operands, unless the rounding mode is upward or downward, in
8009 which case FNMUL is different from FMUL with operand negation. */
8010 bool neg0 = GET_CODE (op0) == NEG;
8011 bool neg1 = GET_CODE (op1) == NEG;
8012 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8014 if (neg0)
8015 op0 = XEXP (op0, 0);
8016 if (neg1)
8017 op1 = XEXP (op1, 0);
8020 if (compound_p)
8021 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8022 cost += extra_cost->fp[mode == DFmode].fma;
8023 else
8024 /* FMUL/FNMUL. */
8025 cost += extra_cost->fp[mode == DFmode].mult;
8028 cost += rtx_cost (op0, mode, MULT, 0, speed);
8029 cost += rtx_cost (op1, mode, MULT, 1, speed);
8030 return cost;
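/* For example (illustrative), costing
   (plus:DI (mult:DI (reg:DI) (const_int 4)) (reg:DI)) takes the shift
   branch above with COMPOUND_P set: on a speed run it adds
   extra_cost->alu.arith_shift (or nothing at all when the tuning treats
   such shifts as cheap) rather than a full multiply cost.  */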
8034 static int
8035 aarch64_address_cost (rtx x,
8036 machine_mode mode,
8037 addr_space_t as ATTRIBUTE_UNUSED,
8038 bool speed)
8040 enum rtx_code c = GET_CODE (x);
8041 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8042 struct aarch64_address_info info;
8043 int cost = 0;
8044 info.shift = 0;
8046 if (!aarch64_classify_address (&info, x, mode, false))
8048 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8050 /* This is a CONST or SYMBOL ref which will be split
8051 in a different way depending on the code model in use.
8052 Cost it through the generic infrastructure. */
8053 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8054 /* Divide through by the cost of one instruction to
8055 bring it to the same units as the address costs. */
8056 cost_symbol_ref /= COSTS_N_INSNS (1);
8057 /* The cost is then the cost of preparing the address,
8058 followed by an immediate (possibly 0) offset. */
8059 return cost_symbol_ref + addr_cost->imm_offset;
8061 else
8063 /* This is most likely a jump table from a case
8064 statement. */
8065 return addr_cost->register_offset;
8069 switch (info.type)
8071 case ADDRESS_LO_SUM:
8072 case ADDRESS_SYMBOLIC:
8073 case ADDRESS_REG_IMM:
8074 cost += addr_cost->imm_offset;
8075 break;
8077 case ADDRESS_REG_WB:
8078 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8079 cost += addr_cost->pre_modify;
8080 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8081 cost += addr_cost->post_modify;
8082 else
8083 gcc_unreachable ();
8085 break;
8087 case ADDRESS_REG_REG:
8088 cost += addr_cost->register_offset;
8089 break;
8091 case ADDRESS_REG_SXTW:
8092 cost += addr_cost->register_sextend;
8093 break;
8095 case ADDRESS_REG_UXTW:
8096 cost += addr_cost->register_zextend;
8097 break;
8099 default:
8100 gcc_unreachable ();
8104 if (info.shift > 0)
8106 /* For the sake of calculating the cost of the shifted register
8107 component, we can treat same sized modes in the same way. */
8108 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8109 cost += addr_cost->addr_scale_costs.hi;
8110 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8111 cost += addr_cost->addr_scale_costs.si;
8112 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8113 cost += addr_cost->addr_scale_costs.di;
8114 else
8115 /* We can't tell, or this is a 128-bit vector. */
8116 cost += addr_cost->addr_scale_costs.ti;
8119 return cost;
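/* For illustration, an SImode access through a base plus scaled
   sign-extended index, e.g. [x1, w2, sxtw #2], is classified as
   ADDRESS_REG_SXTW, so it is costed as register_sextend plus the
   addr_scale_costs.si scaling component.  */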
8122 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8123 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8124 to be taken. */
8127 aarch64_branch_cost (bool speed_p, bool predictable_p)
8129 /* When optimizing for speed, use the cost of unpredictable branches. */
8130 const struct cpu_branch_cost *branch_costs =
8131 aarch64_tune_params.branch_costs;
8133 if (!speed_p || predictable_p)
8134 return branch_costs->predictable;
8135 else
8136 return branch_costs->unpredictable;
8139 /* Return true if the RTX X in mode MODE is a zero or sign extract
8140 usable in an ADD or SUB (extended register) instruction. */
8141 static bool
8142 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8144 /* Catch add with a sign extract.
8145 This is add_<optab><mode>_multp2. */
8146 if (GET_CODE (x) == SIGN_EXTRACT
8147 || GET_CODE (x) == ZERO_EXTRACT)
8149 rtx op0 = XEXP (x, 0);
8150 rtx op1 = XEXP (x, 1);
8151 rtx op2 = XEXP (x, 2);
8153 if (GET_CODE (op0) == MULT
8154 && CONST_INT_P (op1)
8155 && op2 == const0_rtx
8156 && CONST_INT_P (XEXP (op0, 1))
8157 && aarch64_is_extend_from_extract (mode,
8158 XEXP (op0, 1),
8159 op1))
8161 return true;
8164 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8165 No shift. */
8166 else if (GET_CODE (x) == SIGN_EXTEND
8167 || GET_CODE (x) == ZERO_EXTEND)
8168 return REG_P (XEXP (x, 0));
8170 return false;
8173 static bool
8174 aarch64_frint_unspec_p (unsigned int u)
8176 switch (u)
8178 case UNSPEC_FRINTZ:
8179 case UNSPEC_FRINTP:
8180 case UNSPEC_FRINTM:
8181 case UNSPEC_FRINTA:
8182 case UNSPEC_FRINTN:
8183 case UNSPEC_FRINTX:
8184 case UNSPEC_FRINTI:
8185 return true;
8187 default:
8188 return false;
8192 /* Return true iff X is an rtx that will match an extr instruction
8193 i.e. as described in the *extr<mode>5_insn family of patterns.
8194 OP0 and OP1 will be set to the operands of the shifts involved
8195 on success and will be NULL_RTX otherwise. */
8197 static bool
8198 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8200 rtx op0, op1;
8201 scalar_int_mode mode;
8202 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8203 return false;
8205 *res_op0 = NULL_RTX;
8206 *res_op1 = NULL_RTX;
8208 if (GET_CODE (x) != IOR)
8209 return false;
8211 op0 = XEXP (x, 0);
8212 op1 = XEXP (x, 1);
8214 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8215 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8217 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8218 if (GET_CODE (op1) == ASHIFT)
8219 std::swap (op0, op1);
8221 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8222 return false;
8224 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8225 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8227 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8228 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8230 *res_op0 = XEXP (op0, 0);
8231 *res_op1 = XEXP (op1, 0);
8232 return true;
8236 return false;
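/* For example, in DImode
     (ior:DI (ashift:DI (reg:DI a) (const_int 10))
	     (lshiftrt:DI (reg:DI b) (const_int 54)))
   matches because 10 + 54 == 64; *RES_OP0 is set to a and *RES_OP1 to b.
   Shift amounts that do not sum to the mode width are rejected.  */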
8239 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8240 storing it in *COST. Result is true if the total cost of the operation
8241 has now been calculated. */
8242 static bool
8243 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8245 rtx inner;
8246 rtx comparator;
8247 enum rtx_code cmpcode;
8249 if (COMPARISON_P (op0))
8251 inner = XEXP (op0, 0);
8252 comparator = XEXP (op0, 1);
8253 cmpcode = GET_CODE (op0);
8255 else
8257 inner = op0;
8258 comparator = const0_rtx;
8259 cmpcode = NE;
8262 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8264 /* Conditional branch. */
8265 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8266 return true;
8267 else
8269 if (cmpcode == NE || cmpcode == EQ)
8271 if (comparator == const0_rtx)
8273 /* TBZ/TBNZ/CBZ/CBNZ. */
8274 if (GET_CODE (inner) == ZERO_EXTRACT)
8275 /* TBZ/TBNZ. */
8276 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8277 ZERO_EXTRACT, 0, speed);
8278 else
8279 /* CBZ/CBNZ. */
8280 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8282 return true;
8285 else if (cmpcode == LT || cmpcode == GE)
8287 /* TBZ/TBNZ. */
8288 if (comparator == const0_rtx)
8289 return true;
8293 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8295 /* CCMP. */
8296 if (GET_CODE (op1) == COMPARE)
8298 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8299 if (XEXP (op1, 1) == const0_rtx)
8300 *cost += 1;
8301 if (speed)
8303 machine_mode mode = GET_MODE (XEXP (op1, 0));
8304 const struct cpu_cost_table *extra_cost
8305 = aarch64_tune_params.insn_extra_cost;
8307 if (GET_MODE_CLASS (mode) == MODE_INT)
8308 *cost += extra_cost->alu.arith;
8309 else
8310 *cost += extra_cost->fp[mode == DFmode].compare;
8312 return true;
8315 /* It's a conditional operation based on the status flags,
8316 so it must be some flavor of CSEL. */
8318 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8319 if (GET_CODE (op1) == NEG
8320 || GET_CODE (op1) == NOT
8321 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8322 op1 = XEXP (op1, 0);
8323 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8325 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8326 op1 = XEXP (op1, 0);
8327 op2 = XEXP (op2, 0);
8330 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8331 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8332 return true;
8335 /* We don't know what this is, cost all operands. */
8336 return false;
8339 /* Check whether X is a bitfield operation of the form shift + extend that
8340 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8341 operand to which the bitfield operation is applied. Otherwise return
8342 NULL_RTX. */
8344 static rtx
8345 aarch64_extend_bitfield_pattern_p (rtx x)
8347 rtx_code outer_code = GET_CODE (x);
8348 machine_mode outer_mode = GET_MODE (x);
8350 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8351 && outer_mode != SImode && outer_mode != DImode)
8352 return NULL_RTX;
8354 rtx inner = XEXP (x, 0);
8355 rtx_code inner_code = GET_CODE (inner);
8356 machine_mode inner_mode = GET_MODE (inner);
8357 rtx op = NULL_RTX;
8359 switch (inner_code)
8361 case ASHIFT:
8362 if (CONST_INT_P (XEXP (inner, 1))
8363 && (inner_mode == QImode || inner_mode == HImode))
8364 op = XEXP (inner, 0);
8365 break;
8366 case LSHIFTRT:
8367 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8368 && (inner_mode == QImode || inner_mode == HImode))
8369 op = XEXP (inner, 0);
8370 break;
8371 case ASHIFTRT:
8372 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8373 && (inner_mode == QImode || inner_mode == HImode))
8374 op = XEXP (inner, 0);
8375 break;
8376 default:
8377 break;
8380 return op;
8383 /* Return true if the mask and a shift amount from an RTX of the form
8384 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8385 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8387 bool
8388 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8389 rtx shft_amnt)
8391 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8392 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8393 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8394 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
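/* Worked example (SImode, illustrative): for (x << 8) & 0x00ffff00 we have
   (0x00ffff00 >> 8) + 1 == 0x10000, a power of two, and the low 8 bits of
   the mask are clear, so the pair is accepted as a UBFIZ; a mask with any
   bit set below the shift amount would be rejected.  */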
8397 /* Calculate the cost of calculating X, storing it in *COST. Result
8398 is true if the total cost of the operation has now been calculated. */
8399 static bool
8400 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8401 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8403 rtx op0, op1, op2;
8404 const struct cpu_cost_table *extra_cost
8405 = aarch64_tune_params.insn_extra_cost;
8406 int code = GET_CODE (x);
8407 scalar_int_mode int_mode;
8409 /* By default, assume that everything has equivalent cost to the
8410 cheapest instruction. Any additional costs are applied as a delta
8411 above this default. */
8412 *cost = COSTS_N_INSNS (1);
8414 switch (code)
8416 case SET:
8417 /* The cost depends entirely on the operands to SET. */
8418 *cost = 0;
8419 op0 = SET_DEST (x);
8420 op1 = SET_SRC (x);
8422 switch (GET_CODE (op0))
8424 case MEM:
8425 if (speed)
8427 rtx address = XEXP (op0, 0);
8428 if (VECTOR_MODE_P (mode))
8429 *cost += extra_cost->ldst.storev;
8430 else if (GET_MODE_CLASS (mode) == MODE_INT)
8431 *cost += extra_cost->ldst.store;
8432 else if (mode == SFmode)
8433 *cost += extra_cost->ldst.storef;
8434 else if (mode == DFmode)
8435 *cost += extra_cost->ldst.stored;
8437 *cost +=
8438 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8439 0, speed));
8442 *cost += rtx_cost (op1, mode, SET, 1, speed);
8443 return true;
8445 case SUBREG:
8446 if (! REG_P (SUBREG_REG (op0)))
8447 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8449 /* Fall through. */
8450 case REG:
8451 /* The cost is one per vector-register copied. */
8452 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8454 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8455 *cost = COSTS_N_INSNS (nregs);
8457 /* const0_rtx is in general free, but we will use an
8458 instruction to set a register to 0. */
8459 else if (REG_P (op1) || op1 == const0_rtx)
8461 /* The cost is 1 per register copied. */
8462 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8463 *cost = COSTS_N_INSNS (nregs);
8465 else
8466 /* Cost is just the cost of the RHS of the set. */
8467 *cost += rtx_cost (op1, mode, SET, 1, speed);
8468 return true;
8470 case ZERO_EXTRACT:
8471 case SIGN_EXTRACT:
8472 /* Bit-field insertion. Strip any redundant widening of
8473 the RHS to meet the width of the target. */
8474 if (GET_CODE (op1) == SUBREG)
8475 op1 = SUBREG_REG (op1);
8476 if ((GET_CODE (op1) == ZERO_EXTEND
8477 || GET_CODE (op1) == SIGN_EXTEND)
8478 && CONST_INT_P (XEXP (op0, 1))
8479 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8480 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8481 op1 = XEXP (op1, 0);
8483 if (CONST_INT_P (op1))
8485 /* MOV immediate is assumed to always be cheap. */
8486 *cost = COSTS_N_INSNS (1);
8488 else
8490 /* BFM. */
8491 if (speed)
8492 *cost += extra_cost->alu.bfi;
8493 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8496 return true;
8498 default:
8499 /* We can't make sense of this, assume default cost. */
8500 *cost = COSTS_N_INSNS (1);
8501 return false;
8503 return false;
8505 case CONST_INT:
8506 /* If an instruction can incorporate a constant within the
8507 instruction, the instruction's expression avoids calling
8508 rtx_cost() on the constant. If rtx_cost() is called on a
8509 constant, then it is usually because the constant must be
8510 moved into a register by one or more instructions.
8512 The exception is constant 0, which can be expressed
8513 as XZR/WZR and is therefore free. The exception to this is
8514 if we have (set (reg) (const0_rtx)) in which case we must cost
8515 the move. However, we can catch that when we cost the SET, so
8516 we don't need to consider that here. */
8517 if (x == const0_rtx)
8518 *cost = 0;
8519 else
8521 /* To an approximation, building any other constant is
8522 proportionally expensive to the number of instructions
8523 required to build that constant. This is true whether we
8524 are compiling for SPEED or otherwise. */
8525 if (!is_a <scalar_int_mode> (mode, &int_mode))
8526 int_mode = word_mode;
8527 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8528 (NULL_RTX, x, false, int_mode));
8530 return true;
8532 case CONST_DOUBLE:
8534 /* First determine number of instructions to do the move
8535 as an integer constant. */
8536 if (!aarch64_float_const_representable_p (x)
8537 && !aarch64_can_const_movi_rtx_p (x, mode)
8538 && aarch64_float_const_rtx_p (x))
8540 unsigned HOST_WIDE_INT ival;
8541 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8542 gcc_assert (succeed);
8544 scalar_int_mode imode = (mode == HFmode
8545 ? SImode
8546 : int_mode_for_mode (mode).require ());
8547 int ncost = aarch64_internal_mov_immediate
8548 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8549 *cost += COSTS_N_INSNS (ncost);
8550 return true;
8553 if (speed)
8555 /* mov[df,sf]_aarch64. */
8556 if (aarch64_float_const_representable_p (x))
8557 /* FMOV (scalar immediate). */
8558 *cost += extra_cost->fp[mode == DFmode].fpconst;
8559 else if (!aarch64_float_const_zero_rtx_p (x))
8561 /* This will be a load from memory. */
8562 if (mode == DFmode)
8563 *cost += extra_cost->ldst.loadd;
8564 else
8565 *cost += extra_cost->ldst.loadf;
8567 else
8568 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8569 or MOV v0.s[0], wzr - neither of which is modeled by the
8570 cost tables. Just use the default cost. */
8575 return true;
8577 case MEM:
8578 if (speed)
8580 /* For loads we want the base cost of a load, plus an
8581 approximation for the additional cost of the addressing
8582 mode. */
8583 rtx address = XEXP (x, 0);
8584 if (VECTOR_MODE_P (mode))
8585 *cost += extra_cost->ldst.loadv;
8586 else if (GET_MODE_CLASS (mode) == MODE_INT)
8587 *cost += extra_cost->ldst.load;
8588 else if (mode == SFmode)
8589 *cost += extra_cost->ldst.loadf;
8590 else if (mode == DFmode)
8591 *cost += extra_cost->ldst.loadd;
8593 *cost +=
8594 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8595 0, speed));
8598 return true;
8600 case NEG:
8601 op0 = XEXP (x, 0);
8603 if (VECTOR_MODE_P (mode))
8605 if (speed)
8607 /* FNEG. */
8608 *cost += extra_cost->vect.alu;
8610 return false;
8613 if (GET_MODE_CLASS (mode) == MODE_INT)
8615 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8616 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8618 /* CSETM. */
8619 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8620 return true;
8623 /* Cost this as SUB wzr, X. */
8624 op0 = CONST0_RTX (mode);
8625 op1 = XEXP (x, 0);
8626 goto cost_minus;
8629 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8631 /* Support (neg(fma...)) as a single instruction only if
8632 sign of zeros is unimportant. This matches the decision
8633 making in aarch64.md. */
8634 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8636 /* FNMADD. */
8637 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8638 return true;
8640 if (GET_CODE (op0) == MULT)
8642 /* FNMUL. */
8643 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8644 return true;
8646 if (speed)
8647 /* FNEG. */
8648 *cost += extra_cost->fp[mode == DFmode].neg;
8649 return false;
8652 return false;
8654 case CLRSB:
8655 case CLZ:
8656 if (speed)
8658 if (VECTOR_MODE_P (mode))
8659 *cost += extra_cost->vect.alu;
8660 else
8661 *cost += extra_cost->alu.clz;
8664 return false;
8666 case COMPARE:
8667 op0 = XEXP (x, 0);
8668 op1 = XEXP (x, 1);
8670 if (op1 == const0_rtx
8671 && GET_CODE (op0) == AND)
8673 x = op0;
8674 mode = GET_MODE (op0);
8675 goto cost_logic;
8678 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8680 /* TODO: A write to the CC flags possibly costs extra, this
8681 needs encoding in the cost tables. */
8683 mode = GET_MODE (op0);
8684 /* ANDS. */
8685 if (GET_CODE (op0) == AND)
8687 x = op0;
8688 goto cost_logic;
8691 if (GET_CODE (op0) == PLUS)
8693 /* ADDS (and CMN alias). */
8694 x = op0;
8695 goto cost_plus;
8698 if (GET_CODE (op0) == MINUS)
8700 /* SUBS. */
8701 x = op0;
8702 goto cost_minus;
8705 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8706 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8707 && CONST_INT_P (XEXP (op0, 2)))
8709 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8710 Handle it here directly rather than going to cost_logic
8711 since we know the immediate generated for the TST is valid
8712 so we can avoid creating an intermediate rtx for it only
8713 for costing purposes. */
8714 if (speed)
8715 *cost += extra_cost->alu.logical;
8717 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8718 ZERO_EXTRACT, 0, speed);
8719 return true;
8722 if (GET_CODE (op1) == NEG)
8724 /* CMN. */
8725 if (speed)
8726 *cost += extra_cost->alu.arith;
8728 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8729 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8730 return true;
8733 /* CMP.
8735 Compare can freely swap the order of operands, and
8736 canonicalization puts the more complex operation first.
8737 But the integer MINUS logic expects the shift/extend
8738 operation in op1. */
8739 if (! (REG_P (op0)
8740 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8742 op0 = XEXP (x, 1);
8743 op1 = XEXP (x, 0);
8745 goto cost_minus;
8748 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8750 /* FCMP. */
8751 if (speed)
8752 *cost += extra_cost->fp[mode == DFmode].compare;
8754 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8756 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8757 /* FCMP supports constant 0.0 for no extra cost. */
8758 return true;
8760 return false;
8763 if (VECTOR_MODE_P (mode))
8765 /* Vector compare. */
8766 if (speed)
8767 *cost += extra_cost->vect.alu;
8769 if (aarch64_float_const_zero_rtx_p (op1))
8771 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8772 cost. */
8773 return true;
8775 return false;
8777 return false;
8779 case MINUS:
8781 op0 = XEXP (x, 0);
8782 op1 = XEXP (x, 1);
8784 cost_minus:
8785 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8787 /* Detect valid immediates. */
8788 if ((GET_MODE_CLASS (mode) == MODE_INT
8789 || (GET_MODE_CLASS (mode) == MODE_CC
8790 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8791 && CONST_INT_P (op1)
8792 && aarch64_uimm12_shift (INTVAL (op1)))
8794 if (speed)
8795 /* SUB(S) (immediate). */
8796 *cost += extra_cost->alu.arith;
8797 return true;
8800 /* Look for SUB (extended register). */
8801 if (is_a <scalar_int_mode> (mode, &int_mode)
8802 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8804 if (speed)
8805 *cost += extra_cost->alu.extend_arith;
8807 op1 = aarch64_strip_extend (op1, true);
8808 *cost += rtx_cost (op1, VOIDmode,
8809 (enum rtx_code) GET_CODE (op1), 0, speed);
8810 return true;
8813 rtx new_op1 = aarch64_strip_extend (op1, false);
8815 /* Cost this as an FMA-alike operation. */
8816 if ((GET_CODE (new_op1) == MULT
8817 || aarch64_shift_p (GET_CODE (new_op1)))
8818 && code != COMPARE)
8820 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8821 (enum rtx_code) code,
8822 speed);
8823 return true;
8826 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8828 if (speed)
8830 if (VECTOR_MODE_P (mode))
8832 /* Vector SUB. */
8833 *cost += extra_cost->vect.alu;
8835 else if (GET_MODE_CLASS (mode) == MODE_INT)
8837 /* SUB(S). */
8838 *cost += extra_cost->alu.arith;
8840 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8842 /* FSUB. */
8843 *cost += extra_cost->fp[mode == DFmode].addsub;
8846 return true;
8849 case PLUS:
8851 rtx new_op0;
8853 op0 = XEXP (x, 0);
8854 op1 = XEXP (x, 1);
8856 cost_plus:
8857 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8858 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8860 /* CSINC. */
8861 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8862 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8863 return true;
8866 if (GET_MODE_CLASS (mode) == MODE_INT
8867 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8868 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8870 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8872 if (speed)
8873 /* ADD (immediate). */
8874 *cost += extra_cost->alu.arith;
8875 return true;
8878 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8880 /* Look for ADD (extended register). */
8881 if (is_a <scalar_int_mode> (mode, &int_mode)
8882 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8884 if (speed)
8885 *cost += extra_cost->alu.extend_arith;
8887 op0 = aarch64_strip_extend (op0, true);
8888 *cost += rtx_cost (op0, VOIDmode,
8889 (enum rtx_code) GET_CODE (op0), 0, speed);
8890 return true;
8893 /* Strip any extend, leave shifts behind as we will
8894 cost them through mult_cost. */
8895 new_op0 = aarch64_strip_extend (op0, false);
8897 if (GET_CODE (new_op0) == MULT
8898 || aarch64_shift_p (GET_CODE (new_op0)))
8900 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8901 speed);
8902 return true;
8905 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8907 if (speed)
8909 if (VECTOR_MODE_P (mode))
8911 /* Vector ADD. */
8912 *cost += extra_cost->vect.alu;
8914 else if (GET_MODE_CLASS (mode) == MODE_INT)
8916 /* ADD. */
8917 *cost += extra_cost->alu.arith;
8919 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8921 /* FADD. */
8922 *cost += extra_cost->fp[mode == DFmode].addsub;
8925 return true;
8928 case BSWAP:
8929 *cost = COSTS_N_INSNS (1);
8931 if (speed)
8933 if (VECTOR_MODE_P (mode))
8934 *cost += extra_cost->vect.alu;
8935 else
8936 *cost += extra_cost->alu.rev;
8938 return false;
8940 case IOR:
8941 if (aarch_rev16_p (x))
8943 *cost = COSTS_N_INSNS (1);
8945 if (speed)
8947 if (VECTOR_MODE_P (mode))
8948 *cost += extra_cost->vect.alu;
8949 else
8950 *cost += extra_cost->alu.rev;
8952 return true;
8955 if (aarch64_extr_rtx_p (x, &op0, &op1))
8957 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8958 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8959 if (speed)
8960 *cost += extra_cost->alu.shift;
8962 return true;
8964 /* Fall through. */
8965 case XOR:
8966 case AND:
8967 cost_logic:
8968 op0 = XEXP (x, 0);
8969 op1 = XEXP (x, 1);
8971 if (VECTOR_MODE_P (mode))
8973 if (speed)
8974 *cost += extra_cost->vect.alu;
8975 return true;
8978 if (code == AND
8979 && GET_CODE (op0) == MULT
8980 && CONST_INT_P (XEXP (op0, 1))
8981 && CONST_INT_P (op1)
8982 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8983 INTVAL (op1)) != 0)
8985 /* This is a UBFM/SBFM. */
8986 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8987 if (speed)
8988 *cost += extra_cost->alu.bfx;
8989 return true;
8992 if (is_int_mode (mode, &int_mode))
8994 if (CONST_INT_P (op1))
8996 /* We have a mask + shift version of a UBFIZ
8997 i.e. the *andim_ashift<mode>_bfiz pattern. */
8998 if (GET_CODE (op0) == ASHIFT
8999 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9000 XEXP (op0, 1)))
9002 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9003 (enum rtx_code) code, 0, speed);
9004 if (speed)
9005 *cost += extra_cost->alu.bfx;
9007 return true;
9009 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9011 /* We possibly get the immediate for free; this is not
9012 modelled. */
9013 *cost += rtx_cost (op0, int_mode,
9014 (enum rtx_code) code, 0, speed);
9015 if (speed)
9016 *cost += extra_cost->alu.logical;
9018 return true;
9021 else
9023 rtx new_op0 = op0;
9025 /* Handle ORN, EON, or BIC. */
9026 if (GET_CODE (op0) == NOT)
9027 op0 = XEXP (op0, 0);
9029 new_op0 = aarch64_strip_shift (op0);
9031 /* If we had a shift on op0 then this is a logical-shift-
9032 by-register/immediate operation. Otherwise, this is just
9033 a logical operation. */
9034 if (speed)
9036 if (new_op0 != op0)
9038 /* Shift by immediate. */
9039 if (CONST_INT_P (XEXP (op0, 1)))
9040 *cost += extra_cost->alu.log_shift;
9041 else
9042 *cost += extra_cost->alu.log_shift_reg;
9044 else
9045 *cost += extra_cost->alu.logical;
9048 /* In both cases we want to cost both operands. */
9049 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9050 0, speed);
9051 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9052 1, speed);
9054 return true;
9057 return false;
9059 case NOT:
9060 x = XEXP (x, 0);
9061 op0 = aarch64_strip_shift (x);
9063 if (VECTOR_MODE_P (mode))
9065 /* Vector NOT. */
9066 *cost += extra_cost->vect.alu;
9067 return false;
9070 /* MVN-shifted-reg. */
9071 if (op0 != x)
9073 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9075 if (speed)
9076 *cost += extra_cost->alu.log_shift;
9078 return true;
9080 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9081 Handle the second form here taking care that 'a' in the above can
9082 be a shift. */
9083 else if (GET_CODE (op0) == XOR)
9085 rtx newop0 = XEXP (op0, 0);
9086 rtx newop1 = XEXP (op0, 1);
9087 rtx op0_stripped = aarch64_strip_shift (newop0);
9089 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9090 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9092 if (speed)
9094 if (op0_stripped != newop0)
9095 *cost += extra_cost->alu.log_shift;
9096 else
9097 *cost += extra_cost->alu.logical;
9100 return true;
9102 /* MVN. */
9103 if (speed)
9104 *cost += extra_cost->alu.logical;
9106 return false;
9108 case ZERO_EXTEND:
9110 op0 = XEXP (x, 0);
9111 /* If a value is written in SI mode, then zero extended to DI
9112 mode, the operation will in general be free as a write to
9113 a 'w' register implicitly zeroes the upper bits of an 'x'
9114 register. However, if this is
9116 (set (reg) (zero_extend (reg)))
9118 we must cost the explicit register move. */
9119 if (mode == DImode
9120 && GET_MODE (op0) == SImode
9121 && outer == SET)
9123 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9125 /* If OP_COST is non-zero, then the cost of the zero extend
9126 is effectively the cost of the inner operation. Otherwise
9127 we have a MOV instruction and we take the cost from the MOV
9128 itself. This is true independently of whether we are
9129 optimizing for space or time. */
9130 if (op_cost)
9131 *cost = op_cost;
9133 return true;
9135 else if (MEM_P (op0))
9137 /* All loads can zero extend to any size for free. */
9138 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9139 return true;
9142 op0 = aarch64_extend_bitfield_pattern_p (x);
9143 if (op0)
9145 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9146 if (speed)
9147 *cost += extra_cost->alu.bfx;
9148 return true;
9151 if (speed)
9153 if (VECTOR_MODE_P (mode))
9155 /* UMOV. */
9156 *cost += extra_cost->vect.alu;
9158 else
9160 /* We generate an AND instead of UXTB/UXTH. */
9161 *cost += extra_cost->alu.logical;
9164 return false;
9166 case SIGN_EXTEND:
9167 if (MEM_P (XEXP (x, 0)))
9169 /* LDRSH. */
9170 if (speed)
9172 rtx address = XEXP (XEXP (x, 0), 0);
9173 *cost += extra_cost->ldst.load_sign_extend;
9175 *cost +=
9176 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9177 0, speed));
9179 return true;
9182 op0 = aarch64_extend_bitfield_pattern_p (x);
9183 if (op0)
9185 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9186 if (speed)
9187 *cost += extra_cost->alu.bfx;
9188 return true;
9191 if (speed)
9193 if (VECTOR_MODE_P (mode))
9194 *cost += extra_cost->vect.alu;
9195 else
9196 *cost += extra_cost->alu.extend;
9198 return false;
9200 case ASHIFT:
9201 op0 = XEXP (x, 0);
9202 op1 = XEXP (x, 1);
9204 if (CONST_INT_P (op1))
9206 if (speed)
9208 if (VECTOR_MODE_P (mode))
9210 /* Vector shift (immediate). */
9211 *cost += extra_cost->vect.alu;
9213 else
9215 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9216 aliases. */
9217 *cost += extra_cost->alu.shift;
9221 /* We can incorporate zero/sign extend for free. */
9222 if (GET_CODE (op0) == ZERO_EXTEND
9223 || GET_CODE (op0) == SIGN_EXTEND)
9224 op0 = XEXP (op0, 0);
9226 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9227 return true;
9229 else
9231 if (VECTOR_MODE_P (mode))
9233 if (speed)
9234 /* Vector shift (register). */
9235 *cost += extra_cost->vect.alu;
9237 else
9239 if (speed)
9240 /* LSLV. */
9241 *cost += extra_cost->alu.shift_reg;
9243 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9244 && CONST_INT_P (XEXP (op1, 1))
9245 && known_eq (INTVAL (XEXP (op1, 1)),
9246 GET_MODE_BITSIZE (mode) - 1))
9248 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9249 /* We already demanded XEXP (op1, 0) to be REG_P, so
9250 don't recurse into it. */
9251 return true;
9254 return false; /* All arguments need to be in registers. */
9257 case ROTATE:
9258 case ROTATERT:
9259 case LSHIFTRT:
9260 case ASHIFTRT:
9261 op0 = XEXP (x, 0);
9262 op1 = XEXP (x, 1);
9264 if (CONST_INT_P (op1))
9266 /* ASR (immediate) and friends. */
9267 if (speed)
9269 if (VECTOR_MODE_P (mode))
9270 *cost += extra_cost->vect.alu;
9271 else
9272 *cost += extra_cost->alu.shift;
9275 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9276 return true;
9278 else
9280 if (VECTOR_MODE_P (mode))
9282 if (speed)
9283 /* Vector shift (register). */
9284 *cost += extra_cost->vect.alu;
9286 else
9288 if (speed)
9289 /* ASR (register) and friends. */
9290 *cost += extra_cost->alu.shift_reg;
9292 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9293 && CONST_INT_P (XEXP (op1, 1))
9294 && known_eq (INTVAL (XEXP (op1, 1)),
9295 GET_MODE_BITSIZE (mode) - 1))
9297 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9298 /* We already demanded XEXP (op1, 0) to be REG_P, so
9299 don't recurse into it. */
9300 return true;
9303 return false; /* All arguments need to be in registers. */
9306 case SYMBOL_REF:
9308 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9309 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9311 /* LDR. */
9312 if (speed)
9313 *cost += extra_cost->ldst.load;
9315 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9316 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9318 /* ADRP, followed by ADD. */
9319 *cost += COSTS_N_INSNS (1);
9320 if (speed)
9321 *cost += 2 * extra_cost->alu.arith;
9323 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9324 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9326 /* ADR. */
9327 if (speed)
9328 *cost += extra_cost->alu.arith;
9331 if (flag_pic)
9333 /* One extra load instruction, after accessing the GOT. */
9334 *cost += COSTS_N_INSNS (1);
9335 if (speed)
9336 *cost += extra_cost->ldst.load;
9338 return true;
9340 case HIGH:
9341 case LO_SUM:
9342 /* ADRP/ADD (immediate). */
9343 if (speed)
9344 *cost += extra_cost->alu.arith;
9345 return true;
9347 case ZERO_EXTRACT:
9348 case SIGN_EXTRACT:
9349 /* UBFX/SBFX. */
9350 if (speed)
9352 if (VECTOR_MODE_P (mode))
9353 *cost += extra_cost->vect.alu;
9354 else
9355 *cost += extra_cost->alu.bfx;
9358 /* We can trust that the immediates used will be correct (there
9359 are no by-register forms), so we need only cost op0. */
9360 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9361 return true;
9363 case MULT:
9364 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9365 /* aarch64_rtx_mult_cost always handles recursion to its
9366 operands. */
9367 return true;
9369 case MOD:
9370 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
9371 ANDs and a CSNEG. Assume here that CSNEG costs the same as an
9372 unconditional negate. This case should only ever be reached through
9373 the set_smod_pow2_cheap check in expmed.c. */
9374 if (CONST_INT_P (XEXP (x, 1))
9375 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9376 && (mode == SImode || mode == DImode))
9378 /* We expand to 4 instructions. Reset the baseline. */
9379 *cost = COSTS_N_INSNS (4);
9381 if (speed)
9382 *cost += 2 * extra_cost->alu.logical
9383 + 2 * extra_cost->alu.arith;
9385 return true;
9388 /* Fall-through. */
9389 case UMOD:
9390 if (speed)
9392 /* Slightly prefer UMOD over SMOD. */
9393 if (VECTOR_MODE_P (mode))
9394 *cost += extra_cost->vect.alu;
9395 else if (GET_MODE_CLASS (mode) == MODE_INT)
9396 *cost += (extra_cost->mult[mode == DImode].add
9397 + extra_cost->mult[mode == DImode].idiv
9398 + (code == MOD ? 1 : 0));
9400 return false; /* All arguments need to be in registers. */
9402 case DIV:
9403 case UDIV:
9404 case SQRT:
9405 if (speed)
9407 if (VECTOR_MODE_P (mode))
9408 *cost += extra_cost->vect.alu;
9409 else if (GET_MODE_CLASS (mode) == MODE_INT)
9410 /* There is no integer SQRT, so only DIV and UDIV can get
9411 here. */
9412 *cost += (extra_cost->mult[mode == DImode].idiv
9413 /* Slightly prefer UDIV over SDIV. */
9414 + (code == DIV ? 1 : 0));
9415 else
9416 *cost += extra_cost->fp[mode == DFmode].div;
9418 return false; /* All arguments need to be in registers. */
9420 case IF_THEN_ELSE:
9421 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9422 XEXP (x, 2), cost, speed);
9424 case EQ:
9425 case NE:
9426 case GT:
9427 case GTU:
9428 case LT:
9429 case LTU:
9430 case GE:
9431 case GEU:
9432 case LE:
9433 case LEU:
9435 return false; /* All arguments must be in registers. */
9437 case FMA:
9438 op0 = XEXP (x, 0);
9439 op1 = XEXP (x, 1);
9440 op2 = XEXP (x, 2);
9442 if (speed)
9444 if (VECTOR_MODE_P (mode))
9445 *cost += extra_cost->vect.alu;
9446 else
9447 *cost += extra_cost->fp[mode == DFmode].fma;
9450 /* FMSUB, FNMADD, and FNMSUB are free. */
9451 if (GET_CODE (op0) == NEG)
9452 op0 = XEXP (op0, 0);
9454 if (GET_CODE (op2) == NEG)
9455 op2 = XEXP (op2, 0);
9457 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9458 and the by-element operand as operand 0. */
9459 if (GET_CODE (op1) == NEG)
9460 op1 = XEXP (op1, 0);
9462 /* Catch vector-by-element operations. The by-element operand can
9463 either be (vec_duplicate (vec_select (x))) or just
9464 (vec_select (x)), depending on whether we are multiplying by
9465 a vector or a scalar.
9467 Canonicalization is not very good in these cases: FMA4 will put the
9468 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9469 if (GET_CODE (op0) == VEC_DUPLICATE)
9470 op0 = XEXP (op0, 0);
9471 else if (GET_CODE (op1) == VEC_DUPLICATE)
9472 op1 = XEXP (op1, 0);
9474 if (GET_CODE (op0) == VEC_SELECT)
9475 op0 = XEXP (op0, 0);
9476 else if (GET_CODE (op1) == VEC_SELECT)
9477 op1 = XEXP (op1, 0);
9479 /* If the remaining parameters are not registers,
9480 get the cost to put them into registers. */
9481 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9482 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9483 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9484 return true;
9486 case FLOAT:
9487 case UNSIGNED_FLOAT:
9488 if (speed)
9489 *cost += extra_cost->fp[mode == DFmode].fromint;
9490 return false;
9492 case FLOAT_EXTEND:
9493 if (speed)
9495 if (VECTOR_MODE_P (mode))
9497 /* Vector widening conversion. */
9498 *cost += extra_cost->vect.alu;
9500 else
9501 *cost += extra_cost->fp[mode == DFmode].widen;
9503 return false;
9505 case FLOAT_TRUNCATE:
9506 if (speed)
9508 if (VECTOR_MODE_P (mode))
9510 /* Vector narrowing conversion. */
9511 *cost += extra_cost->vect.alu;
9513 else
9514 *cost += extra_cost->fp[mode == DFmode].narrow;
9516 return false;
9518 case FIX:
9519 case UNSIGNED_FIX:
9520 x = XEXP (x, 0);
9521 /* Strip the rounding part. They will all be implemented
9522 by the fcvt* family of instructions anyway. */
9523 if (GET_CODE (x) == UNSPEC)
9525 unsigned int uns_code = XINT (x, 1);
9527 if (uns_code == UNSPEC_FRINTA
9528 || uns_code == UNSPEC_FRINTM
9529 || uns_code == UNSPEC_FRINTN
9530 || uns_code == UNSPEC_FRINTP
9531 || uns_code == UNSPEC_FRINTZ)
9532 x = XVECEXP (x, 0, 0);
9535 if (speed)
9537 if (VECTOR_MODE_P (mode))
9538 *cost += extra_cost->vect.alu;
9539 else
9540 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9543 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9544 fixed-point fcvt. */
9545 if (GET_CODE (x) == MULT
9546 && ((VECTOR_MODE_P (mode)
9547 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9548 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9550 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9551 0, speed);
9552 return true;
9555 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9556 return true;
9558 case ABS:
9559 if (VECTOR_MODE_P (mode))
9561 /* ABS (vector). */
9562 if (speed)
9563 *cost += extra_cost->vect.alu;
9565 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9567 op0 = XEXP (x, 0);
9569 /* FABD, which is analogous to FADD. */
9570 if (GET_CODE (op0) == MINUS)
9572 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9573 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9574 if (speed)
9575 *cost += extra_cost->fp[mode == DFmode].addsub;
9577 return true;
9579 /* Simple FABS is analogous to FNEG. */
9580 if (speed)
9581 *cost += extra_cost->fp[mode == DFmode].neg;
9583 else
9585 /* Integer ABS will either be split into
9586 two arithmetic instructions, or will be an ABS
9587 (scalar), which we don't model. */
9588 *cost = COSTS_N_INSNS (2);
9589 if (speed)
9590 *cost += 2 * extra_cost->alu.arith;
9592 return false;
9594 case SMAX:
9595 case SMIN:
9596 if (speed)
9598 if (VECTOR_MODE_P (mode))
9599 *cost += extra_cost->vect.alu;
9600 else
9602 /* FMAXNM/FMINNM/FMAX/FMIN.
9603 TODO: This may not be accurate for all implementations, but
9604 we do not model this in the cost tables. */
9605 *cost += extra_cost->fp[mode == DFmode].addsub;
9608 return false;
9610 case UNSPEC:
9611 /* The floating point round to integer frint* instructions. */
9612 if (aarch64_frint_unspec_p (XINT (x, 1)))
9614 if (speed)
9615 *cost += extra_cost->fp[mode == DFmode].roundint;
9617 return false;
9620 if (XINT (x, 1) == UNSPEC_RBIT)
9622 if (speed)
9623 *cost += extra_cost->alu.rev;
9625 return false;
9627 break;
9629 case TRUNCATE:
9631 /* Decompose <su>muldi3_highpart. */
9632 if (/* (truncate:DI */
9633 mode == DImode
9634 /* (lshiftrt:TI */
9635 && GET_MODE (XEXP (x, 0)) == TImode
9636 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9637 /* (mult:TI */
9638 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9639 /* (ANY_EXTEND:TI (reg:DI))
9640 (ANY_EXTEND:TI (reg:DI))) */
9641 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9642 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9643 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9644 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9645 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9646 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9647 /* (const_int 64) */
9648 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9649 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9651 /* UMULH/SMULH. */
9652 if (speed)
9653 *cost += extra_cost->mult[mode == DImode].extend;
9654 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9655 mode, MULT, 0, speed);
9656 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9657 mode, MULT, 1, speed);
9658 return true;
9661 /* Fall through. */
9662 default:
9663 break;
9666 if (dump_file
9667 && flag_aarch64_verbose_cost)
9668 fprintf (dump_file,
9669 "\nFailed to cost RTX. Assuming default cost.\n");
9671 return true;
9674 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
9675 calculated for X. This cost is stored in *COST. Returns true
9676 if the total cost of X was calculated. */
9677 static bool
9678 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9679 int param, int *cost, bool speed)
9681 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9683 if (dump_file
9684 && flag_aarch64_verbose_cost)
9686 print_rtl_single (dump_file, x);
9687 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9688 speed ? "Hot" : "Cold",
9689 *cost, result ? "final" : "partial");
9692 return result;
9695 static int
9696 aarch64_register_move_cost (machine_mode mode,
9697 reg_class_t from_i, reg_class_t to_i)
9699 enum reg_class from = (enum reg_class) from_i;
9700 enum reg_class to = (enum reg_class) to_i;
9701 const struct cpu_regmove_cost *regmove_cost
9702 = aarch64_tune_params.regmove_cost;
9704 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9705 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9706 to = GENERAL_REGS;
9708 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9709 from = GENERAL_REGS;
9711 /* The cost of moving between a GPR and the stack is the same as GP2GP. */
9712 if ((from == GENERAL_REGS && to == STACK_REG)
9713 || (to == GENERAL_REGS && from == STACK_REG))
9714 return regmove_cost->GP2GP;
9716 /* To/From the stack register, we move via the gprs. */
9717 if (to == STACK_REG || from == STACK_REG)
9718 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9719 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9721 if (known_eq (GET_MODE_SIZE (mode), 16))
9723 /* 128-bit operations on general registers require 2 instructions. */
9724 if (from == GENERAL_REGS && to == GENERAL_REGS)
9725 return regmove_cost->GP2GP * 2;
9726 else if (from == GENERAL_REGS)
9727 return regmove_cost->GP2FP * 2;
9728 else if (to == GENERAL_REGS)
9729 return regmove_cost->FP2GP * 2;
9731 /* When AdvSIMD instructions are disabled it is not possible to move
9732 a 128-bit value directly between Q registers. This is handled in
9733 secondary reload. A general register is used as a scratch to move
9734 the upper DI value and the lower DI value is moved directly,
9735 hence the cost is the sum of three moves. */
9736 if (! TARGET_SIMD)
9737 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9739 return regmove_cost->FP2FP;
9742 if (from == GENERAL_REGS && to == GENERAL_REGS)
9743 return regmove_cost->GP2GP;
9744 else if (from == GENERAL_REGS)
9745 return regmove_cost->GP2FP;
9746 else if (to == GENERAL_REGS)
9747 return regmove_cost->FP2GP;
9749 return regmove_cost->FP2FP;
9752 static int
9753 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9754 reg_class_t rclass ATTRIBUTE_UNUSED,
9755 bool in ATTRIBUTE_UNUSED)
9757 return aarch64_tune_params.memmov_cost;
9760 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9761 to optimize 1.0/sqrt. */
9763 static bool
9764 use_rsqrt_p (machine_mode mode)
9766 return (!flag_trapping_math
9767 && flag_unsafe_math_optimizations
9768 && ((aarch64_tune_params.approx_modes->recip_sqrt
9769 & AARCH64_APPROX_MODE (mode))
9770 || flag_mrecip_low_precision_sqrt));
9773 /* Function to decide when to use the approximate reciprocal square root
9774 builtin. */
9776 static tree
9777 aarch64_builtin_reciprocal (tree fndecl)
9779 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9781 if (!use_rsqrt_p (mode))
9782 return NULL_TREE;
9783 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9786 typedef rtx (*rsqrte_type) (rtx, rtx);
9788 /* Select reciprocal square root initial estimate insn depending on machine
9789 mode. */
9791 static rsqrte_type
9792 get_rsqrte_type (machine_mode mode)
9794 switch (mode)
9796 case E_DFmode: return gen_aarch64_rsqrtedf;
9797 case E_SFmode: return gen_aarch64_rsqrtesf;
9798 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9799 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9800 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9801 default: gcc_unreachable ();
9805 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9807 /* Select reciprocal square root series step insn depending on machine mode. */
9809 static rsqrts_type
9810 get_rsqrts_type (machine_mode mode)
9812 switch (mode)
9814 case E_DFmode: return gen_aarch64_rsqrtsdf;
9815 case E_SFmode: return gen_aarch64_rsqrtssf;
9816 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9817 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9818 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9819 default: gcc_unreachable ();
9823 /* Emit instruction sequence to compute either the approximate square root
9824 or its approximate reciprocal, depending on the flag RECP, and return
9825 whether the sequence was emitted or not. */
9827 bool
9828 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9830 machine_mode mode = GET_MODE (dst);
9832 if (GET_MODE_INNER (mode) == HFmode)
9834 gcc_assert (!recp);
9835 return false;
9838 if (!recp)
9840 if (!(flag_mlow_precision_sqrt
9841 || (aarch64_tune_params.approx_modes->sqrt
9842 & AARCH64_APPROX_MODE (mode))))
9843 return false;
9845 if (flag_finite_math_only
9846 || flag_trapping_math
9847 || !flag_unsafe_math_optimizations
9848 || optimize_function_for_size_p (cfun))
9849 return false;
9851 else
9852 /* Caller assumes we cannot fail. */
9853 gcc_assert (use_rsqrt_p (mode));
9855 machine_mode mmsk = mode_for_int_vector (mode).require ();
9856 rtx xmsk = gen_reg_rtx (mmsk);
9857 if (!recp)
9858 /* When calculating the approximate square root, compare the
9859 argument with 0.0 and create a mask. */
9860 emit_insn (gen_rtx_SET (xmsk,
9861 gen_rtx_NEG (mmsk,
9862 gen_rtx_EQ (mmsk, src,
9863 CONST0_RTX (mode)))));
9865 /* Estimate the approximate reciprocal square root. */
9866 rtx xdst = gen_reg_rtx (mode);
9867 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9869 /* Iterate over the series twice for SF and thrice for DF. */
9870 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9872 /* Optionally iterate over the series once less for faster performance,
9873 at the cost of some accuracy. */
9874 if ((recp && flag_mrecip_low_precision_sqrt)
9875 || (!recp && flag_mlow_precision_sqrt))
9876 iterations--;
9878 /* Iterate over the series to calculate the approximate reciprocal square
9879 root. */
9880 rtx x1 = gen_reg_rtx (mode);
9881 while (iterations--)
9883 rtx x2 = gen_reg_rtx (mode);
9884 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9886 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9888 if (iterations > 0)
9889 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9892 if (!recp)
9894 /* Qualify the approximate reciprocal square root when the argument is
9895 0.0 by squashing the intermediate result to 0.0. */
9896 rtx xtmp = gen_reg_rtx (mmsk);
9897 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9898 gen_rtx_SUBREG (mmsk, xdst, 0)));
9899 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9901 /* Calculate the approximate square root. */
9902 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9905 /* Finalize the approximation. */
9906 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9908 return true;
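/* A minimal scalar sketch of the Newton-Raphson recurrence that the
   FRSQRTE/FRSQRTS sequence above implements, written as plain C with no
   GCC internals (the name approx_rsqrt is purely illustrative):

     static double
     approx_rsqrt (double d, double x0, int iterations)
     {
       double x = x0;
       while (iterations-- > 0)
         x = x * (3.0 - d * x * x) / 2.0;
       return x;
     }

   X0 stands for the initial FRSQRTE-style estimate and each loop
   iteration for one FRSQRTS-style step; every step roughly doubles the
   number of correct bits, hence the two iterations for SF and three for
   DF chosen above.  */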
9911 typedef rtx (*recpe_type) (rtx, rtx);
9913 /* Select reciprocal initial estimate insn depending on machine mode. */
9915 static recpe_type
9916 get_recpe_type (machine_mode mode)
9918 switch (mode)
9920 case E_SFmode: return (gen_aarch64_frecpesf);
9921 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9922 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9923 case E_DFmode: return (gen_aarch64_frecpedf);
9924 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9925 default: gcc_unreachable ();
9929 typedef rtx (*recps_type) (rtx, rtx, rtx);
9931 /* Select reciprocal series step insn depending on machine mode. */
9933 static recps_type
9934 get_recps_type (machine_mode mode)
9936 switch (mode)
9938 case E_SFmode: return (gen_aarch64_frecpssf);
9939 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9940 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9941 case E_DFmode: return (gen_aarch64_frecpsdf);
9942 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9943 default: gcc_unreachable ();
9947 /* Emit the instruction sequence to compute the approximation for the division
9948 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9950 bool
9951 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9953 machine_mode mode = GET_MODE (quo);
9955 if (GET_MODE_INNER (mode) == HFmode)
9956 return false;
9958 bool use_approx_division_p = (flag_mlow_precision_div
9959 || (aarch64_tune_params.approx_modes->division
9960 & AARCH64_APPROX_MODE (mode)));
9962 if (!flag_finite_math_only
9963 || flag_trapping_math
9964 || !flag_unsafe_math_optimizations
9965 || optimize_function_for_size_p (cfun)
9966 || !use_approx_division_p)
9967 return false;
9969 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9970 return false;
9972 /* Estimate the approximate reciprocal. */
9973 rtx xrcp = gen_reg_rtx (mode);
9974 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9976 /* Iterate over the series twice for SF and thrice for DF. */
9977 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9979 /* Optionally iterate over the series once less for faster performance,
9980 at the cost of some accuracy. */
9981 if (flag_mlow_precision_div)
9982 iterations--;
9984 /* Iterate over the series to calculate the approximate reciprocal. */
9985 rtx xtmp = gen_reg_rtx (mode);
9986 while (iterations--)
9988 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
9990 if (iterations > 0)
9991 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9994 if (num != CONST1_RTX (mode))
9996 /* As the approximate reciprocal of DEN is already calculated, only
9997 calculate the approximate division when NUM is not 1.0. */
9998 rtx xnum = force_reg (mode, num);
9999 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10002 /* Finalize the approximation. */
10003 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10004 return true;
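/* Likewise, a minimal scalar sketch of the reciprocal refinement that the
   FRECPE/FRECPS sequence above implements (plain C, names illustrative):

     static double
     approx_div (double num, double den, double x0, int iterations)
     {
       double x = x0;
       while (iterations-- > 0)
         x = x * (2.0 - den * x);
       return num * x;
     }

   X0 stands for the FRECPE-style estimate of 1/DEN and each iteration for
   one FRECPS-style Newton-Raphson step; the final multiplication by NUM
   mirrors the code above, which skips it when NUM is exactly 1.0.  */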
10007 /* Return the number of instructions that can be issued per cycle. */
10008 static int
10009 aarch64_sched_issue_rate (void)
10011 return aarch64_tune_params.issue_rate;
10014 static int
10015 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10017 int issue_rate = aarch64_sched_issue_rate ();
10019 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10023 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10024 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10025 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10027 static int
10028 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10029 int ready_index)
10031 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10035 /* Vectorizer cost model target hooks. */
10037 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10038 static int
10039 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10040 tree vectype,
10041 int misalign ATTRIBUTE_UNUSED)
10043 unsigned elements;
10044 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10045 bool fp = false;
10047 if (vectype != NULL)
10048 fp = FLOAT_TYPE_P (vectype);
10050 switch (type_of_cost)
10052 case scalar_stmt:
10053 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10055 case scalar_load:
10056 return costs->scalar_load_cost;
10058 case scalar_store:
10059 return costs->scalar_store_cost;
10061 case vector_stmt:
10062 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10064 case vector_load:
10065 return costs->vec_align_load_cost;
10067 case vector_store:
10068 return costs->vec_store_cost;
10070 case vec_to_scalar:
10071 return costs->vec_to_scalar_cost;
10073 case scalar_to_vec:
10074 return costs->scalar_to_vec_cost;
10076 case unaligned_load:
10077 case vector_gather_load:
10078 return costs->vec_unalign_load_cost;
10080 case unaligned_store:
10081 case vector_scatter_store:
10082 return costs->vec_unalign_store_cost;
10084 case cond_branch_taken:
10085 return costs->cond_taken_branch_cost;
10087 case cond_branch_not_taken:
10088 return costs->cond_not_taken_branch_cost;
10090 case vec_perm:
10091 return costs->vec_permute_cost;
10093 case vec_promote_demote:
10094 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10096 case vec_construct:
10097 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10098 return elements / 2 + 1;
10100 default:
10101 gcc_unreachable ();
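/* For example, a vec_construct of a V4SF vector is costed as
   4 / 2 + 1 = 3 regardless of the tuning target, and for variable-length
   SVE vectors the element count comes from estimated_poly_value; all of
   the other cases simply return the per-CPU numbers held in
   aarch64_tune_params.vec_costs.  */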
10105 /* Implement targetm.vectorize.add_stmt_cost. */
10106 static unsigned
10107 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10108 struct _stmt_vec_info *stmt_info, int misalign,
10109 enum vect_cost_model_location where)
10111 unsigned *cost = (unsigned *) data;
10112 unsigned retval = 0;
10114 if (flag_vect_cost_model)
10116 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10117 int stmt_cost =
10118 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10120 /* Statements in an inner loop relative to the loop being
10121 vectorized are weighted more heavily. The value here is
10122 arbitrary and could potentially be improved with analysis. */
10123 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10124 count *= 50; /* FIXME */
10126 retval = (unsigned) (count * stmt_cost);
10127 cost[where] += retval;
10130 return retval;
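/* As a worked example of the weighting above: a statement added with
   COUNT == 2 and a per-statement cost of 1 (a hypothetical tuning value)
   contributes 2 * 50 * 1 = 100 to cost[vect_body] if it lies in a loop
   nested inside the loop being vectorized, but only 2 if it lies directly
   in the vectorized loop body.  */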
10133 static void initialize_aarch64_code_model (struct gcc_options *);
10135 /* Parse the TO_PARSE string and put the architecture struct that it
10136 selects into RES and the architectural features into ISA_FLAGS.
10137 Return an aarch64_parse_opt_result describing the parse result.
10138 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10140 static enum aarch64_parse_opt_result
10141 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10142 unsigned long *isa_flags)
10144 char *ext;
10145 const struct processor *arch;
10146 char *str = (char *) alloca (strlen (to_parse) + 1);
10147 size_t len;
10149 strcpy (str, to_parse);
10151 ext = strchr (str, '+');
10153 if (ext != NULL)
10154 len = ext - str;
10155 else
10156 len = strlen (str);
10158 if (len == 0)
10159 return AARCH64_PARSE_MISSING_ARG;
10162 /* Loop through the list of supported ARCHes to find a match. */
10163 for (arch = all_architectures; arch->name != NULL; arch++)
10165 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10167 unsigned long isa_temp = arch->flags;
10169 if (ext != NULL)
10171 /* TO_PARSE string contains at least one extension. */
10172 enum aarch64_parse_opt_result ext_res
10173 = aarch64_parse_extension (ext, &isa_temp);
10175 if (ext_res != AARCH64_PARSE_OK)
10176 return ext_res;
10178 /* Extension parsing was successful. Confirm the result
10179 arch and ISA flags. */
10180 *res = arch;
10181 *isa_flags = isa_temp;
10182 return AARCH64_PARSE_OK;
10186 /* ARCH name not found in list. */
10187 return AARCH64_PARSE_INVALID_ARG;
10190 /* Parse the TO_PARSE string and put the result tuning in RES and the
10191 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10192 describing the parse result. If there is an error parsing, RES and
10193 ISA_FLAGS are left unchanged. */
10195 static enum aarch64_parse_opt_result
10196 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10197 unsigned long *isa_flags)
10199 char *ext;
10200 const struct processor *cpu;
10201 char *str = (char *) alloca (strlen (to_parse) + 1);
10202 size_t len;
10204 strcpy (str, to_parse);
10206 ext = strchr (str, '+');
10208 if (ext != NULL)
10209 len = ext - str;
10210 else
10211 len = strlen (str);
10213 if (len == 0)
10214 return AARCH64_PARSE_MISSING_ARG;
10217 /* Loop through the list of supported CPUs to find a match. */
10218 for (cpu = all_cores; cpu->name != NULL; cpu++)
10220 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10222 unsigned long isa_temp = cpu->flags;
10225 if (ext != NULL)
10227 /* TO_PARSE string contains at least one extension. */
10228 enum aarch64_parse_opt_result ext_res
10229 = aarch64_parse_extension (ext, &isa_temp);
10231 if (ext_res != AARCH64_PARSE_OK)
10232 return ext_res;
10234 /* Extension parsing was successful. Confirm the result
10235 cpu and ISA flags. */
10236 *res = cpu;
10237 *isa_flags = isa_temp;
10238 return AARCH64_PARSE_OK;
10242 /* CPU name not found in list. */
10243 return AARCH64_PARSE_INVALID_ARG;
10246 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10247 Return an aarch64_parse_opt_result describing the parse result.
10248 If the parsing fails, RES does not change. */
10250 static enum aarch64_parse_opt_result
10251 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10253 const struct processor *cpu;
10254 char *str = (char *) alloca (strlen (to_parse) + 1);
10256 strcpy (str, to_parse);
10258 /* Loop through the list of supported CPUs to find a match. */
10259 for (cpu = all_cores; cpu->name != NULL; cpu++)
10261 if (strcmp (cpu->name, str) == 0)
10263 *res = cpu;
10264 return AARCH64_PARSE_OK;
10268 /* CPU name not found in list. */
10269 return AARCH64_PARSE_INVALID_ARG;
10272 /* Parse TOKEN, which has length LENGTH to see if it is an option
10273 described in FLAG. If it is, return the index bit for that fusion type.
10274 If not, error (printing OPTION_NAME) and return zero. */
10276 static unsigned int
10277 aarch64_parse_one_option_token (const char *token,
10278 size_t length,
10279 const struct aarch64_flag_desc *flag,
10280 const char *option_name)
10282 for (; flag->name != NULL; flag++)
10284 if (length == strlen (flag->name)
10285 && !strncmp (flag->name, token, length))
10286 return flag->flag;
10289 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10290 return 0;
10293 /* Parse OPTION which is a comma-separated list of flags to enable.
10294 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10295 default state we inherit from the CPU tuning structures. OPTION_NAME
10296 gives the top-level option we are parsing in the -moverride string,
10297 for use in error messages. */
10299 static unsigned int
10300 aarch64_parse_boolean_options (const char *option,
10301 const struct aarch64_flag_desc *flags,
10302 unsigned int initial_state,
10303 const char *option_name)
10305 const char separator = '.';
10306 const char* specs = option;
10307 const char* ntoken = option;
10308 unsigned int found_flags = initial_state;
10310 while ((ntoken = strchr (specs, separator)))
10312 size_t token_length = ntoken - specs;
10313 unsigned token_ops = aarch64_parse_one_option_token (specs,
10314 token_length,
10315 flags,
10316 option_name);
10317 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10318 in the token stream, reset the supported operations. So:
10320 adrp+add.cmp+branch.none.adrp+add
10322 would have the result of turning on only adrp+add fusion. */
10323 if (!token_ops)
10324 found_flags = 0;
10326 found_flags |= token_ops;
10327 specs = ++ntoken;
10330 /* The string ended with the separator; diagnose the ill-formed option. */
10331 if (!(*specs))
10333 error ("%s string ill-formed\n", option_name);
10334 return 0;
10337 /* We still have one more token to parse. */
10338 size_t token_length = strlen (specs);
10339 unsigned token_ops = aarch64_parse_one_option_token (specs,
10340 token_length,
10341 flags,
10342 option_name);
10343 if (!token_ops)
10344 found_flags = 0;
10346 found_flags |= token_ops;
10347 return found_flags;
10350 /* Support for overriding instruction fusion. */
10352 static void
10353 aarch64_parse_fuse_string (const char *fuse_string,
10354 struct tune_params *tune)
10356 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10357 aarch64_fusible_pairs,
10358 tune->fusible_ops,
10359 "fuse=");
10362 /* Support for overriding other tuning flags. */
10364 static void
10365 aarch64_parse_tune_string (const char *tune_string,
10366 struct tune_params *tune)
10368 tune->extra_tuning_flags
10369 = aarch64_parse_boolean_options (tune_string,
10370 aarch64_tuning_flags,
10371 tune->extra_tuning_flags,
10372 "tune=");
10375 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10376 we understand. If it is, extract the option string and hand it off to
10377 the appropriate function. */
10379 void
10380 aarch64_parse_one_override_token (const char* token,
10381 size_t length,
10382 struct tune_params *tune)
10384 const struct aarch64_tuning_override_function *fn
10385 = aarch64_tuning_override_functions;
10387 const char *option_part = strchr (token, '=');
10388 if (!option_part)
10390 error ("tuning string missing in option (%s)", token);
10391 return;
10394 /* Get the length of the option name. */
10395 length = option_part - token;
10396 /* Skip the '=' to get to the option string. */
10397 option_part++;
10399 for (; fn->name != NULL; fn++)
10401 if (!strncmp (fn->name, token, length))
10403 fn->parse_override (option_part, tune);
10404 return;
10408 error ("unknown tuning option (%s)",token);
10409 return;
10412 /* Validate and clamp aarch64_tls_size according to the code model in OPTS. */
10414 static void
10415 initialize_aarch64_tls_size (struct gcc_options *opts)
10417 if (aarch64_tls_size == 0)
10418 aarch64_tls_size = 24;
10420 switch (opts->x_aarch64_cmodel_var)
10422 case AARCH64_CMODEL_TINY:
10423 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10424 needs two instructions to address, so we clamp the size to 24. */
10425 if (aarch64_tls_size > 24)
10426 aarch64_tls_size = 24;
10427 break;
10428 case AARCH64_CMODEL_SMALL:
10429 /* The maximum TLS size allowed under small is 4G. */
10430 if (aarch64_tls_size > 32)
10431 aarch64_tls_size = 32;
10432 break;
10433 case AARCH64_CMODEL_LARGE:
10434 /* The maximum TLS size allowed under large is 16E.
10435 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
10436 if (aarch64_tls_size > 48)
10437 aarch64_tls_size = 48;
10438 break;
10439 default:
10440 gcc_unreachable ();
10443 return;
10446 /* Parse STRING looking for options in the format:
10447 string :: option:string
10448 option :: name=substring
10449 name :: {a-z}
10450 substring :: defined by option. */
10452 static void
10453 aarch64_parse_override_string (const char* input_string,
10454 struct tune_params* tune)
10456 const char separator = ':';
10457 size_t string_length = strlen (input_string) + 1;
10458 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10459 char *string = string_root;
10460 strncpy (string, input_string, string_length);
10461 string[string_length - 1] = '\0';
10463 char* ntoken = string;
10465 while ((ntoken = strchr (string, separator)))
10467 size_t token_length = ntoken - string;
10468 /* NUL-terminate this substring so it can be parsed as a string on its own. */
10469 *ntoken = '\0';
10470 aarch64_parse_one_override_token (string, token_length, tune);
10471 string = ++ntoken;
10474 /* One last option to parse. */
10475 aarch64_parse_one_override_token (string, strlen (string), tune);
10476 free (string_root);
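/* As a concrete example of the grammar above, an option such as

     -moverride=fuse=adrp+add.cmp+branch

   is first split on ':' (here there is a single token), the token is then
   split at '=' to select the handler (aarch64_parse_fuse_string in this
   case), and the value is finally split on '.' by
   aarch64_parse_boolean_options, with a "none" entry resetting the flags
   accumulated so far, as described earlier.  */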
10480 static void
10481 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10483 /* PR 70044: We have to be careful about being called multiple times for the
10484 same function. This means all changes should be repeatable. */
10486 /* If the frame pointer is enabled, set it to a special value that behaves
10487 similarly to frame pointer omission. If we don't do this, all leaf functions
10488 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10489 If flag_omit_frame_pointer has this special value, we must force the
10490 frame pointer if not in a leaf function. We also need to force it in a
10491 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10492 if (opts->x_flag_omit_frame_pointer == 0)
10493 opts->x_flag_omit_frame_pointer = 2;
10495 /* If not optimizing for size, set the default
10496 alignment to what the target wants. */
10497 if (!opts->x_optimize_size)
10499 if (opts->x_align_loops <= 0)
10500 opts->x_align_loops = aarch64_tune_params.loop_align;
10501 if (opts->x_align_jumps <= 0)
10502 opts->x_align_jumps = aarch64_tune_params.jump_align;
10503 if (opts->x_align_functions <= 0)
10504 opts->x_align_functions = aarch64_tune_params.function_align;
10507 /* We default to no pc-relative literal loads. */
10509 aarch64_pcrelative_literal_loads = false;
10511 /* If -mpc-relative-literal-loads is set on the command line, this
10512 implies that the user asked for PC relative literal loads. */
10513 if (opts->x_pcrelative_literal_loads == 1)
10514 aarch64_pcrelative_literal_loads = true;
10516 /* In the tiny memory model it makes no sense to disallow PC relative
10517 literal pool loads. */
10518 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10519 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10520 aarch64_pcrelative_literal_loads = true;
10522 /* When enabling the lower precision Newton series for the square root, also
10523 enable it for the reciprocal square root, since the latter is an
10524 intermediate step for the former. */
10525 if (flag_mlow_precision_sqrt)
10526 flag_mrecip_low_precision_sqrt = true;
10529 /* 'Unpack' the internal tuning structs and update the options
10530 in OPTS. The caller must have set up selected_tune and selected_arch
10531 as all the other target-specific codegen decisions are
10532 derived from them. */
10534 void
10535 aarch64_override_options_internal (struct gcc_options *opts)
10537 aarch64_tune_flags = selected_tune->flags;
10538 aarch64_tune = selected_tune->sched_core;
10539 /* Make a copy of the tuning parameters attached to the core, which
10540 we may later overwrite. */
10541 aarch64_tune_params = *(selected_tune->tune);
10542 aarch64_architecture_version = selected_arch->architecture_version;
10544 if (opts->x_aarch64_override_tune_string)
10545 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10546 &aarch64_tune_params);
10548 /* This target defaults to strict volatile bitfields. */
10549 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10550 opts->x_flag_strict_volatile_bitfields = 1;
10552 initialize_aarch64_code_model (opts);
10553 initialize_aarch64_tls_size (opts);
10555 int queue_depth = 0;
10556 switch (aarch64_tune_params.autoprefetcher_model)
10558 case tune_params::AUTOPREFETCHER_OFF:
10559 queue_depth = -1;
10560 break;
10561 case tune_params::AUTOPREFETCHER_WEAK:
10562 queue_depth = 0;
10563 break;
10564 case tune_params::AUTOPREFETCHER_STRONG:
10565 queue_depth = max_insn_queue_index + 1;
10566 break;
10567 default:
10568 gcc_unreachable ();
10571 /* We don't mind passing in global_options_set here as we don't use
10572 the *options_set structs anyway. */
10573 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10574 queue_depth,
10575 opts->x_param_values,
10576 global_options_set.x_param_values);
10578 /* Set up parameters to be used in prefetching algorithm. Do not
10579 override the defaults unless we are tuning for a core we have
10580 researched values for. */
10581 if (aarch64_tune_params.prefetch->num_slots > 0)
10582 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10583 aarch64_tune_params.prefetch->num_slots,
10584 opts->x_param_values,
10585 global_options_set.x_param_values);
10586 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10587 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10588 aarch64_tune_params.prefetch->l1_cache_size,
10589 opts->x_param_values,
10590 global_options_set.x_param_values);
10591 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10592 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10593 aarch64_tune_params.prefetch->l1_cache_line_size,
10594 opts->x_param_values,
10595 global_options_set.x_param_values);
10596 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10597 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10598 aarch64_tune_params.prefetch->l2_cache_size,
10599 opts->x_param_values,
10600 global_options_set.x_param_values);
10602 /* Use the alternative scheduling-pressure algorithm by default. */
10603 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10604 opts->x_param_values,
10605 global_options_set.x_param_values);
10607 /* Enable software prefetching at the specified optimization level for
10608 CPUs that have prefetch. Lower the optimization level threshold by 1
10609 when profiling is enabled. */
10610 if (opts->x_flag_prefetch_loop_arrays < 0
10611 && !opts->x_optimize_size
10612 && aarch64_tune_params.prefetch->default_opt_level >= 0
10613 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10614 opts->x_flag_prefetch_loop_arrays = 1;
10616 aarch64_override_options_after_change_1 (opts);
10619 /* Print a hint with a suggestion for a core or architecture name that
10620 most closely resembles what the user passed in STR. ARCH is true if
10621 the user is asking for an architecture name. ARCH is false if the user
10622 is asking for a core name. */
10624 static void
10625 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10627 auto_vec<const char *> candidates;
10628 const struct processor *entry = arch ? all_architectures : all_cores;
10629 for (; entry->name != NULL; entry++)
10630 candidates.safe_push (entry->name);
10632 #ifdef HAVE_LOCAL_CPU_DETECT
10633 /* Also add "native" as a possible value. */
10634 if (arch)
10635 candidates.safe_push ("native");
10636 #endif
10638 char *s;
10639 const char *hint = candidates_list_and_hint (str, s, candidates);
10640 if (hint)
10641 inform (input_location, "valid arguments are: %s;"
10642 " did you mean %qs?", s, hint);
10643 else
10644 inform (input_location, "valid arguments are: %s", s);
10646 XDELETEVEC (s);
10649 /* Print a hint with a suggestion for a core name that most closely resembles
10650 what the user passed in STR. */
10652 inline static void
10653 aarch64_print_hint_for_core (const char *str)
10655 aarch64_print_hint_for_core_or_arch (str, false);
10658 /* Print a hint with a suggestion for an architecture name that most closely
10659 resembles what the user passed in STR. */
10661 inline static void
10662 aarch64_print_hint_for_arch (const char *str)
10664 aarch64_print_hint_for_core_or_arch (str, true);
10667 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10668 specified in STR and throw errors if appropriate. Put the results if
10669 they are valid in RES and ISA_FLAGS. Return whether the option is
10670 valid. */
10672 static bool
10673 aarch64_validate_mcpu (const char *str, const struct processor **res,
10674 unsigned long *isa_flags)
10676 enum aarch64_parse_opt_result parse_res
10677 = aarch64_parse_cpu (str, res, isa_flags);
10679 if (parse_res == AARCH64_PARSE_OK)
10680 return true;
10682 switch (parse_res)
10684 case AARCH64_PARSE_MISSING_ARG:
10685 error ("missing cpu name in %<-mcpu=%s%>", str);
10686 break;
10687 case AARCH64_PARSE_INVALID_ARG:
10688 error ("unknown value %qs for -mcpu", str);
10689 aarch64_print_hint_for_core (str);
10690 break;
10691 case AARCH64_PARSE_INVALID_FEATURE:
10692 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10693 break;
10694 default:
10695 gcc_unreachable ();
10698 return false;
10701 /* Validate a command-line -march option. Parse the arch and extensions
10702 (if any) specified in STR and throw errors if appropriate. Put the
10703 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10704 option is valid. */
10706 static bool
10707 aarch64_validate_march (const char *str, const struct processor **res,
10708 unsigned long *isa_flags)
10710 enum aarch64_parse_opt_result parse_res
10711 = aarch64_parse_arch (str, res, isa_flags);
10713 if (parse_res == AARCH64_PARSE_OK)
10714 return true;
10716 switch (parse_res)
10718 case AARCH64_PARSE_MISSING_ARG:
10719 error ("missing arch name in %<-march=%s%>", str);
10720 break;
10721 case AARCH64_PARSE_INVALID_ARG:
10722 error ("unknown value %qs for -march", str);
10723 aarch64_print_hint_for_arch (str);
10724 break;
10725 case AARCH64_PARSE_INVALID_FEATURE:
10726 error ("invalid feature modifier in %<-march=%s%>", str);
10727 break;
10728 default:
10729 gcc_unreachable ();
10732 return false;
10735 /* Validate a command-line -mtune option. Parse the cpu
10736 specified in STR and throw errors if appropriate. Put the
10737 result, if it is valid, in RES. Return whether the option is
10738 valid. */
10740 static bool
10741 aarch64_validate_mtune (const char *str, const struct processor **res)
10743 enum aarch64_parse_opt_result parse_res
10744 = aarch64_parse_tune (str, res);
10746 if (parse_res == AARCH64_PARSE_OK)
10747 return true;
10749 switch (parse_res)
10751 case AARCH64_PARSE_MISSING_ARG:
10752 error ("missing cpu name in %<-mtune=%s%>", str);
10753 break;
10754 case AARCH64_PARSE_INVALID_ARG:
10755 error ("unknown value %qs for -mtune", str);
10756 aarch64_print_hint_for_core (str);
10757 break;
10758 default:
10759 gcc_unreachable ();
10761 return false;
10764 /* Return the CPU corresponding to the enum CPU.
10765 If it doesn't specify a cpu, return the default. */
10767 static const struct processor *
10768 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10770 if (cpu != aarch64_none)
10771 return &all_cores[cpu];
10773 /* The & 0x3f is to extract the bottom 6 bits that encode the
10774 default cpu as selected by the --with-cpu GCC configure option
10775 in config.gcc.
10776 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10777 flags mechanism should be reworked to make it more sane. */
10778 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10781 /* Return the architecture corresponding to the enum ARCH.
10782 If it doesn't specify a valid architecture, return the default. */
10784 static const struct processor *
10785 aarch64_get_arch (enum aarch64_arch arch)
10787 if (arch != aarch64_no_arch)
10788 return &all_architectures[arch];
10790 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10792 return &all_architectures[cpu->arch];
10795 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10797 static poly_uint16
10798 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10800 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10801 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10802 deciding which .md file patterns to use and when deciding whether
10803 something is a legitimate address or constant. */
10804 if (value == SVE_SCALABLE || value == SVE_128)
10805 return poly_uint16 (2, 2);
10806 else
10807 return (int) value / 64;
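/* For example, -msve-vector-bits=256 gives a VG of 256 / 64 = 4 (four
   64-bit granules per vector) and -msve-vector-bits=512 gives 8, while
   both "scalable" and 128 map to poly_uint16 (2, 2), i.e. a VG of
   2 + 2 * X for some non-negative runtime value X, so at least two
   granules (128 bits).  */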
10810 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10811 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10812 tuning structs. In particular it must set selected_tune and
10813 aarch64_isa_flags that define the available ISA features and tuning
10814 decisions. It must also set selected_arch as this will be used to
10815 output the .arch asm tags for each function. */
10817 static void
10818 aarch64_override_options (void)
10820 unsigned long cpu_isa = 0;
10821 unsigned long arch_isa = 0;
10822 aarch64_isa_flags = 0;
10824 bool valid_cpu = true;
10825 bool valid_tune = true;
10826 bool valid_arch = true;
10828 selected_cpu = NULL;
10829 selected_arch = NULL;
10830 selected_tune = NULL;
10832 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10833 If either of -march or -mtune is given, they override their
10834 respective component of -mcpu. */
10835 if (aarch64_cpu_string)
10836 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10837 &cpu_isa);
10839 if (aarch64_arch_string)
10840 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10841 &arch_isa);
10843 if (aarch64_tune_string)
10844 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10846 /* If the user did not specify a processor, choose the default
10847 one for them. This will be the CPU set during configuration using
10848 --with-cpu, otherwise it is "generic". */
10849 if (!selected_cpu)
10851 if (selected_arch)
10853 selected_cpu = &all_cores[selected_arch->ident];
10854 aarch64_isa_flags = arch_isa;
10855 explicit_arch = selected_arch->arch;
10857 else
10859 /* Get default configure-time CPU. */
10860 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10861 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10864 if (selected_tune)
10865 explicit_tune_core = selected_tune->ident;
10867 /* If both -mcpu and -march are specified check that they are architecturally
10868 compatible, warn if they're not and prefer the -march ISA flags. */
10869 else if (selected_arch)
10871 if (selected_arch->arch != selected_cpu->arch)
10873 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10874 all_architectures[selected_cpu->arch].name,
10875 selected_arch->name);
10877 aarch64_isa_flags = arch_isa;
10878 explicit_arch = selected_arch->arch;
10879 explicit_tune_core = selected_tune ? selected_tune->ident
10880 : selected_cpu->ident;
10882 else
10884 /* -mcpu but no -march. */
10885 aarch64_isa_flags = cpu_isa;
10886 explicit_tune_core = selected_tune ? selected_tune->ident
10887 : selected_cpu->ident;
10888 gcc_assert (selected_cpu);
10889 selected_arch = &all_architectures[selected_cpu->arch];
10890 explicit_arch = selected_arch->arch;
10893 /* Set the arch as well, as we will need it when outputting
10894 the .arch directive in assembly. */
10895 if (!selected_arch)
10897 gcc_assert (selected_cpu);
10898 selected_arch = &all_architectures[selected_cpu->arch];
10901 if (!selected_tune)
10902 selected_tune = selected_cpu;
10904 #ifndef HAVE_AS_MABI_OPTION
10905 /* The compiler may have been configured with 2.23.* binutils, which does
10906 not have support for ILP32. */
10907 if (TARGET_ILP32)
10908 error ("assembler does not support -mabi=ilp32");
10909 #endif
10911 /* Convert -msve-vector-bits to a VG count. */
10912 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10914 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10915 sorry ("return address signing is only supported for -mabi=lp64");
10917 /* Make sure we properly set up the explicit options. */
10918 if ((aarch64_cpu_string && valid_cpu)
10919 || (aarch64_tune_string && valid_tune))
10920 gcc_assert (explicit_tune_core != aarch64_none);
10922 if ((aarch64_cpu_string && valid_cpu)
10923 || (aarch64_arch_string && valid_arch))
10924 gcc_assert (explicit_arch != aarch64_no_arch);
10926 aarch64_override_options_internal (&global_options);
10928 /* Save these options as the default ones in case we push and pop them later
10929 while processing functions with potential target attributes. */
10930 target_option_default_node = target_option_current_node
10931 = build_target_option_node (&global_options);
10934 /* Implement targetm.override_options_after_change. */
10936 static void
10937 aarch64_override_options_after_change (void)
10939 aarch64_override_options_after_change_1 (&global_options);
10942 static struct machine_function *
10943 aarch64_init_machine_status (void)
10945 struct machine_function *machine;
10946 machine = ggc_cleared_alloc<machine_function> ();
10947 return machine;
10950 void
10951 aarch64_init_expanders (void)
10953 init_machine_status = aarch64_init_machine_status;
10956 /* Select the effective code model, taking the PIC flags in OPTS into account. */
10957 static void
10958 initialize_aarch64_code_model (struct gcc_options *opts)
10960 if (opts->x_flag_pic)
10962 switch (opts->x_aarch64_cmodel_var)
10964 case AARCH64_CMODEL_TINY:
10965 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10966 break;
10967 case AARCH64_CMODEL_SMALL:
10968 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10969 aarch64_cmodel = (flag_pic == 2
10970 ? AARCH64_CMODEL_SMALL_PIC
10971 : AARCH64_CMODEL_SMALL_SPIC);
10972 #else
10973 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10974 #endif
10975 break;
10976 case AARCH64_CMODEL_LARGE:
10977 sorry ("code model %qs with -f%s", "large",
10978 opts->x_flag_pic > 1 ? "PIC" : "pic");
10979 break;
10980 default:
10981 gcc_unreachable ();
10984 else
10985 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10988 /* Implement TARGET_OPTION_SAVE. */
10990 static void
10991 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10993 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10996 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10997 using the information saved in PTR. */
10999 static void
11000 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11002 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11003 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11004 opts->x_explicit_arch = ptr->x_explicit_arch;
11005 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11006 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11008 aarch64_override_options_internal (opts);
11011 /* Implement TARGET_OPTION_PRINT. */
11013 static void
11014 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11016 const struct processor *cpu
11017 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11018 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11019 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11020 std::string extension
11021 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11023 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11024 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11025 arch->name, extension.c_str ());
11028 static GTY(()) tree aarch64_previous_fndecl;
11030 void
11031 aarch64_reset_previous_fndecl (void)
11033 aarch64_previous_fndecl = NULL;
11036 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11037 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11038 make sure optab availability predicates are recomputed when necessary. */
11040 void
11041 aarch64_save_restore_target_globals (tree new_tree)
11043 if (TREE_TARGET_GLOBALS (new_tree))
11044 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11045 else if (new_tree == target_option_default_node)
11046 restore_target_globals (&default_target_globals);
11047 else
11048 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11051 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11052 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11053 of the function, if such exists. This function may be called multiple
11054 times on a single function so use aarch64_previous_fndecl to avoid
11055 setting up identical state. */
11057 static void
11058 aarch64_set_current_function (tree fndecl)
11060 if (!fndecl || fndecl == aarch64_previous_fndecl)
11061 return;
11063 tree old_tree = (aarch64_previous_fndecl
11064 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11065 : NULL_TREE);
11067 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11069 /* If current function has no attributes but the previous one did,
11070 use the default node. */
11071 if (!new_tree && old_tree)
11072 new_tree = target_option_default_node;
11074 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11075 the default have been handled by aarch64_save_restore_target_globals from
11076 aarch64_pragma_target_parse. */
11077 if (old_tree == new_tree)
11078 return;
11080 aarch64_previous_fndecl = fndecl;
11082 /* First set the target options. */
11083 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11085 aarch64_save_restore_target_globals (new_tree);
11088 /* Enum describing the various ways we can handle attributes.
11089 In many cases we can reuse the generic option handling machinery. */
11091 enum aarch64_attr_opt_type
11093 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11094 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11095 aarch64_attr_enum, /* Attribute sets an enum variable. */
11096 aarch64_attr_custom /* Attribute requires a custom handling function. */
11099 /* All the information needed to handle a target attribute.
11100 NAME is the name of the attribute.
11101 ATTR_TYPE specifies the type of behavior of the attribute as described
11102 in the definition of enum aarch64_attr_opt_type.
11103 ALLOW_NEG is true if the attribute supports a "no-" form.
11104 HANDLER is the function that takes the attribute string as an argument.
11105 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11106 OPT_NUM is the enum specifying the option that the attribute modifies.
11107 This is needed for attributes that mirror the behavior of a command-line
11108 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11109 aarch64_attr_enum. */
11111 struct aarch64_attribute_info
11113 const char *name;
11114 enum aarch64_attr_opt_type attr_type;
11115 bool allow_neg;
11116 bool (*handler) (const char *);
11117 enum opt_code opt_num;
11120 /* Handle the ARCH_STR argument to the arch= target attribute. */
11122 static bool
11123 aarch64_handle_attr_arch (const char *str)
11125 const struct processor *tmp_arch = NULL;
11126 enum aarch64_parse_opt_result parse_res
11127 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11129 if (parse_res == AARCH64_PARSE_OK)
11131 gcc_assert (tmp_arch);
11132 selected_arch = tmp_arch;
11133 explicit_arch = selected_arch->arch;
11134 return true;
11137 switch (parse_res)
11139 case AARCH64_PARSE_MISSING_ARG:
11140 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11141 break;
11142 case AARCH64_PARSE_INVALID_ARG:
11143 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11144 aarch64_print_hint_for_arch (str);
11145 break;
11146 case AARCH64_PARSE_INVALID_FEATURE:
11147 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11148 break;
11149 default:
11150 gcc_unreachable ();
11153 return false;
11156 /* Handle the argument CPU_STR to the cpu= target attribute. */
11158 static bool
11159 aarch64_handle_attr_cpu (const char *str)
11161 const struct processor *tmp_cpu = NULL;
11162 enum aarch64_parse_opt_result parse_res
11163 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11165 if (parse_res == AARCH64_PARSE_OK)
11167 gcc_assert (tmp_cpu);
11168 selected_tune = tmp_cpu;
11169 explicit_tune_core = selected_tune->ident;
11171 selected_arch = &all_architectures[tmp_cpu->arch];
11172 explicit_arch = selected_arch->arch;
11173 return true;
11176 switch (parse_res)
11178 case AARCH64_PARSE_MISSING_ARG:
11179 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11180 break;
11181 case AARCH64_PARSE_INVALID_ARG:
11182 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11183 aarch64_print_hint_for_core (str);
11184 break;
11185 case AARCH64_PARSE_INVALID_FEATURE:
11186 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11187 break;
11188 default:
11189 gcc_unreachable ();
11192 return false;
11195 /* Handle the argument STR to the tune= target attribute. */
11197 static bool
11198 aarch64_handle_attr_tune (const char *str)
11200 const struct processor *tmp_tune = NULL;
11201 enum aarch64_parse_opt_result parse_res
11202 = aarch64_parse_tune (str, &tmp_tune);
11204 if (parse_res == AARCH64_PARSE_OK)
11206 gcc_assert (tmp_tune);
11207 selected_tune = tmp_tune;
11208 explicit_tune_core = selected_tune->ident;
11209 return true;
11212 switch (parse_res)
11214 case AARCH64_PARSE_INVALID_ARG:
11215 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11216 aarch64_print_hint_for_core (str);
11217 break;
11218 default:
11219 gcc_unreachable ();
11222 return false;
11225 /* Parse an architecture extensions target attribute string specified in STR.
11226 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11227 if successful. Update aarch64_isa_flags to reflect the ISA features
11228 modified. */
11230 static bool
11231 aarch64_handle_attr_isa_flags (char *str)
11233 enum aarch64_parse_opt_result parse_res;
11234 unsigned long isa_flags = aarch64_isa_flags;
11236 /* We allow "+nothing" at the beginning to clear out all architectural
11237 features if the user wants to handpick specific features. */
11238 if (strncmp ("+nothing", str, 8) == 0)
11240 isa_flags = 0;
11241 str += 8;
11244 parse_res = aarch64_parse_extension (str, &isa_flags);
11246 if (parse_res == AARCH64_PARSE_OK)
11248 aarch64_isa_flags = isa_flags;
11249 return true;
11252 switch (parse_res)
11254 case AARCH64_PARSE_MISSING_ARG:
11255 error ("missing value in %<target()%> pragma or attribute");
11256 break;
11258 case AARCH64_PARSE_INVALID_FEATURE:
11259 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11260 break;
11262 default:
11263 gcc_unreachable ();
11266 return false;
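/* For example, target ("+nothing+fp") first clears every architectural
   feature bit and then turns the floating-point extension (and anything
   it implies) back on, whereas target ("+fp+nosimd") starts from the
   current aarch64_isa_flags and toggles individual features on top of
   it.  */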
11269 /* The target attributes that we support. On top of these we also support just
11270 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11271 handled explicitly in aarch64_process_one_target_attr. */
11273 static const struct aarch64_attribute_info aarch64_attributes[] =
11275 { "general-regs-only", aarch64_attr_mask, false, NULL,
11276 OPT_mgeneral_regs_only },
11277 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11278 OPT_mfix_cortex_a53_835769 },
11279 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11280 OPT_mfix_cortex_a53_843419 },
11281 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11282 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11283 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11284 OPT_momit_leaf_frame_pointer },
11285 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11286 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11287 OPT_march_ },
11288 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11289 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11290 OPT_mtune_ },
11291 { "sign-return-address", aarch64_attr_enum, false, NULL,
11292 OPT_msign_return_address_ },
11293 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11296 /* Parse ARG_STR which contains the definition of one target attribute.
11297 Show appropriate errors if any or return true if the attribute is valid. */
11299 static bool
11300 aarch64_process_one_target_attr (char *arg_str)
11302 bool invert = false;
11304 size_t len = strlen (arg_str);
11306 if (len == 0)
11308 error ("malformed %<target()%> pragma or attribute");
11309 return false;
11312 char *str_to_check = (char *) alloca (len + 1);
11313 strcpy (str_to_check, arg_str);
11315 /* Skip leading whitespace. */
11316 while (*str_to_check == ' ' || *str_to_check == '\t')
11317 str_to_check++;
11319 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11320 It is easier to detect and handle it explicitly here rather than going
11321 through the machinery for the rest of the target attributes in this
11322 function. */
11323 if (*str_to_check == '+')
11324 return aarch64_handle_attr_isa_flags (str_to_check);
11326 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11328 invert = true;
11329 str_to_check += 3;
11331 char *arg = strchr (str_to_check, '=');
11333 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11334 and point ARG to "foo". */
11335 if (arg)
11337 *arg = '\0';
11338 arg++;
11340 const struct aarch64_attribute_info *p_attr;
11341 bool found = false;
11342 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11344 /* If the names don't match up, or the user has given an argument
11345 to an attribute that doesn't accept one, or didn't give an argument
11346 to an attribute that expects one, fail to match. */
11347 if (strcmp (str_to_check, p_attr->name) != 0)
11348 continue;
11350 found = true;
11351 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11352 || p_attr->attr_type == aarch64_attr_enum;
11354 if (attr_need_arg_p ^ (arg != NULL))
11356 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11357 return false;
11360 /* If the name matches but the attribute does not allow "no-" versions
11361 then we can't match. */
11362 if (invert && !p_attr->allow_neg)
11364 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11365 return false;
11368 switch (p_attr->attr_type)
11370 /* Has a custom handler registered.
11371 For example, cpu=, arch=, tune=. */
11372 case aarch64_attr_custom:
11373 gcc_assert (p_attr->handler);
11374 if (!p_attr->handler (arg))
11375 return false;
11376 break;
11378 /* Either set or unset a boolean option. */
11379 case aarch64_attr_bool:
11381 struct cl_decoded_option decoded;
11383 generate_option (p_attr->opt_num, NULL, !invert,
11384 CL_TARGET, &decoded);
11385 aarch64_handle_option (&global_options, &global_options_set,
11386 &decoded, input_location);
11387 break;
11389 /* Set or unset a bit in the target_flags. aarch64_handle_option
11390 should know what mask to apply given the option number. */
11391 case aarch64_attr_mask:
11393 struct cl_decoded_option decoded;
11394 /* We only need to specify the option number.
11395 aarch64_handle_option will know which mask to apply. */
11396 decoded.opt_index = p_attr->opt_num;
11397 decoded.value = !invert;
11398 aarch64_handle_option (&global_options, &global_options_set,
11399 &decoded, input_location);
11400 break;
11402 /* Use the option setting machinery to set an option to an enum. */
11403 case aarch64_attr_enum:
11405 gcc_assert (arg);
11406 bool valid;
11407 int value;
11408 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11409 &value, CL_TARGET);
11410 if (valid)
11412 set_option (&global_options, NULL, p_attr->opt_num, value,
11413 NULL, DK_UNSPECIFIED, input_location,
11414 global_dc);
11416 else
11418 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11420 break;
11422 default:
11423 gcc_unreachable ();
11427 /* If we reached this point, we either found and validated an attribute,
11428 or didn't match any. If we matched an attribute but its arguments
11429 were malformed, we will have returned false already. */
11430 return found;
11433 /* Count how many times the character C appears in
11434 NULL-terminated string STR. */
11436 static unsigned int
11437 num_occurences_in_str (char c, char *str)
11439 unsigned int res = 0;
11440 while (*str != '\0')
11442 if (*str == c)
11443 res++;
11445 str++;
11448 return res;
11451 /* Parse the tree in ARGS that contains the target attribute information
11452 and update the global target options space. */
11454 bool
11455 aarch64_process_target_attr (tree args)
11457 if (TREE_CODE (args) == TREE_LIST)
11461 tree head = TREE_VALUE (args);
11462 if (head)
11464 if (!aarch64_process_target_attr (head))
11465 return false;
11467 args = TREE_CHAIN (args);
11468 } while (args);
11470 return true;
11473 if (TREE_CODE (args) != STRING_CST)
11475 error ("attribute %<target%> argument not a string");
11476 return false;
11479 size_t len = strlen (TREE_STRING_POINTER (args));
11480 char *str_to_check = (char *) alloca (len + 1);
11481 strcpy (str_to_check, TREE_STRING_POINTER (args));
11483 if (len == 0)
11485 error ("malformed %<target()%> pragma or attribute");
11486 return false;
11489 /* Used to catch empty entries between commas, e.g.
11490 attribute ((target ("attr1,,attr2"))). */
11491 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11493 /* Handle multiple target attributes separated by ','. */
11494 char *token = strtok (str_to_check, ",");
11496 unsigned int num_attrs = 0;
11497 while (token)
11499 num_attrs++;
11500 if (!aarch64_process_one_target_attr (token))
11502 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11503 return false;
11506 token = strtok (NULL, ",");
11509 if (num_attrs != num_commas + 1)
11511 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11512 return false;
11515 return true;
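/* Putting the pieces together, an attribute such as

     __attribute__ ((target ("arch=armv8-a+crc,no-omit-leaf-frame-pointer")))

   is split on ',' into two entries: "arch=armv8-a+crc" is dispatched to
   aarch64_handle_attr_arch (which in turn parses the "+crc" extension),
   while "no-omit-leaf-frame-pointer" is the negated form of a boolean
   attribute from the aarch64_attributes table.  The comma counting above
   rejects strings with empty entries such as "attr1,,attr2".  */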
11518 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11519 process attribute ((target ("..."))). */
11521 static bool
11522 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11524 struct cl_target_option cur_target;
11525 bool ret;
11526 tree old_optimize;
11527 tree new_target, new_optimize;
11528 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11530 /* If what we're processing is the current pragma string then the
11531 target option node is already stored in target_option_current_node
11532 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11533 having to re-parse the string. This is especially useful to keep
11534 arm_neon.h compile times down since that header contains a lot
11535 of intrinsics enclosed in pragmas. */
11536 if (!existing_target && args == current_target_pragma)
11538 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11539 return true;
11541 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11543 old_optimize = build_optimization_node (&global_options);
11544 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11546 /* If the function changed the optimization levels as well as setting
11547 target options, start with the optimizations specified. */
11548 if (func_optimize && func_optimize != old_optimize)
11549 cl_optimization_restore (&global_options,
11550 TREE_OPTIMIZATION (func_optimize));
11552 /* Save the current target options to restore at the end. */
11553 cl_target_option_save (&cur_target, &global_options);
11555 /* If fndecl already has some target attributes applied to it, unpack
11556 them so that we add this attribute on top of them, rather than
11557 overwriting them. */
11558 if (existing_target)
11560 struct cl_target_option *existing_options
11561 = TREE_TARGET_OPTION (existing_target);
11563 if (existing_options)
11564 cl_target_option_restore (&global_options, existing_options);
11566 else
11567 cl_target_option_restore (&global_options,
11568 TREE_TARGET_OPTION (target_option_current_node));
11570 ret = aarch64_process_target_attr (args);
11572 /* Set up any additional state. */
11573 if (ret)
11575 aarch64_override_options_internal (&global_options);
11576 /* Initialize SIMD builtins if we haven't already.
11577 Set current_target_pragma to NULL for the duration so that
11578 the builtin initialization code doesn't try to tag the functions
11579 being built with the attributes specified by any current pragma, thus
11580 going into an infinite recursion. */
11581 if (TARGET_SIMD)
11583 tree saved_current_target_pragma = current_target_pragma;
11584 current_target_pragma = NULL;
11585 aarch64_init_simd_builtins ();
11586 current_target_pragma = saved_current_target_pragma;
11588 new_target = build_target_option_node (&global_options);
11590 else
11591 new_target = NULL;
11593 new_optimize = build_optimization_node (&global_options);
11595 if (fndecl && ret)
11597 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11599 if (old_optimize != new_optimize)
11600 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11603 cl_target_option_restore (&global_options, &cur_target);
11605 if (old_optimize != new_optimize)
11606 cl_optimization_restore (&global_options,
11607 TREE_OPTIMIZATION (old_optimize));
11608 return ret;
11611 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11612 tri-bool options (yes, no, don't care) and the default value is
11613 DEF, determine whether to reject inlining. */
11615 static bool
11616 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11617 int dont_care, int def)
11619 /* If the callee doesn't care, always allow inlining. */
11620 if (callee == dont_care)
11621 return true;
11623 /* If the caller doesn't care, always allow inlining. */
11624 if (caller == dont_care)
11625 return true;
11627 /* Otherwise, allow inlining if either the callee and caller values
11628 agree, or if the callee is using the default value. */
11629 return (callee == caller || callee == def);
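/* Illustrative behaviour of the tri-bool check above, assuming the
   usual encoding of 2 for "don't care" and a default (DEF) of 0:

       caller  callee  inline?
         2       2      yes (callee doesn't care)
         1       2      yes (callee doesn't care)
         1       1      yes (values agree)
         0       1      no  (callee explicitly enables, caller doesn't)
         1       0      yes (callee uses the default value)

   This table is a sketch of the rules, not an exhaustive enumeration.  */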
11632 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11633 to inline CALLEE into CALLER based on target-specific info.
11634 Make sure that the caller and callee have compatible architectural
11635 features. Then go through the other possible target attributes
11636 and see if they can block inlining. Try not to reject always_inline
11637 callees unless they are incompatible architecturally. */
11639 static bool
11640 aarch64_can_inline_p (tree caller, tree callee)
11642 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11643 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11645 /* If callee has no option attributes, then it is ok to inline. */
11646 if (!callee_tree)
11647 return true;
11649 struct cl_target_option *caller_opts
11650 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11651 : target_option_default_node);
11653 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11656 /* Callee's ISA flags should be a subset of the caller's. */
11657 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11658 != callee_opts->x_aarch64_isa_flags)
11659 return false;
11661 /* Allow non-strict-aligned functions to be inlined into
11662 strict-aligned ones. */
11663 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11664 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11665 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11666 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11667 return false;
11669 bool always_inline = lookup_attribute ("always_inline",
11670 DECL_ATTRIBUTES (callee));
11672 /* If the architectural features match up and the callee is always_inline
11673 then the other attributes don't matter. */
11674 if (always_inline)
11675 return true;
11677 if (caller_opts->x_aarch64_cmodel_var
11678 != callee_opts->x_aarch64_cmodel_var)
11679 return false;
11681 if (caller_opts->x_aarch64_tls_dialect
11682 != callee_opts->x_aarch64_tls_dialect)
11683 return false;
11685 /* Honour explicit requests to workaround errata. */
11686 if (!aarch64_tribools_ok_for_inlining_p (
11687 caller_opts->x_aarch64_fix_a53_err835769,
11688 callee_opts->x_aarch64_fix_a53_err835769,
11689 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11690 return false;
11692 if (!aarch64_tribools_ok_for_inlining_p (
11693 caller_opts->x_aarch64_fix_a53_err843419,
11694 callee_opts->x_aarch64_fix_a53_err843419,
11695 2, TARGET_FIX_ERR_A53_843419))
11696 return false;
11698 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11699 caller and callee and they don't match up, reject inlining. */
11700 if (!aarch64_tribools_ok_for_inlining_p (
11701 caller_opts->x_flag_omit_leaf_frame_pointer,
11702 callee_opts->x_flag_omit_leaf_frame_pointer,
11703 2, 1))
11704 return false;
11706 /* If the callee has specific tuning overrides, respect them. */
11707 if (callee_opts->x_aarch64_override_tune_string != NULL
11708 && caller_opts->x_aarch64_override_tune_string == NULL)
11709 return false;
11711 /* If the user specified tuning override strings for the
11712 caller and callee and they don't match up, reject inlining.
11713 We just do a string compare here, we don't analyze the meaning
11714 of the string, as it would be too costly for little gain. */
11715 if (callee_opts->x_aarch64_override_tune_string
11716 && caller_opts->x_aarch64_override_tune_string
11717 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11718 caller_opts->x_aarch64_override_tune_string) != 0))
11719 return false;
11721 return true;
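/* A rough user-level illustration (hypothetical code, not taken from
   the testsuite): with the ISA-subset rule above,

     __attribute__ ((target ("+crc")))
     static inline unsigned callee (unsigned x) { return x * 3; }

     unsigned caller (unsigned x) { return callee (x); }

   is rejected for inlining when the translation unit is compiled
   without +crc, because the callee's ISA flags are not a subset of the
   caller's, whereas compiling the whole unit with -march=armv8-a+crc
   makes the flags a subset and allows the inline.  */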
11724 /* Return true if SYMBOL_REF X binds locally. */
11726 static bool
11727 aarch64_symbol_binds_local_p (const_rtx x)
11729 return (SYMBOL_REF_DECL (x)
11730 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11731 : SYMBOL_REF_LOCAL_P (x));
11734 /* Return true if SYMBOL_REF X is thread local */
11735 static bool
11736 aarch64_tls_symbol_p (rtx x)
11738 if (! TARGET_HAVE_TLS)
11739 return false;
11741 if (GET_CODE (x) != SYMBOL_REF)
11742 return false;
11744 return SYMBOL_REF_TLS_MODEL (x) != 0;
11747 /* Classify a TLS symbol into one of the TLS kinds. */
11748 enum aarch64_symbol_type
11749 aarch64_classify_tls_symbol (rtx x)
11751 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11753 switch (tls_kind)
11755 case TLS_MODEL_GLOBAL_DYNAMIC:
11756 case TLS_MODEL_LOCAL_DYNAMIC:
11757 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11759 case TLS_MODEL_INITIAL_EXEC:
11760 switch (aarch64_cmodel)
11762 case AARCH64_CMODEL_TINY:
11763 case AARCH64_CMODEL_TINY_PIC:
11764 return SYMBOL_TINY_TLSIE;
11765 default:
11766 return SYMBOL_SMALL_TLSIE;
11769 case TLS_MODEL_LOCAL_EXEC:
11770 if (aarch64_tls_size == 12)
11771 return SYMBOL_TLSLE12;
11772 else if (aarch64_tls_size == 24)
11773 return SYMBOL_TLSLE24;
11774 else if (aarch64_tls_size == 32)
11775 return SYMBOL_TLSLE32;
11776 else if (aarch64_tls_size == 48)
11777 return SYMBOL_TLSLE48;
11778 else
11779 gcc_unreachable ();
11781 case TLS_MODEL_EMULATED:
11782 case TLS_MODEL_NONE:
11783 return SYMBOL_FORCE_TO_MEM;
11785 default:
11786 gcc_unreachable ();
11790 /* Return the correct method for accessing X + OFFSET, where X is either
11791 a SYMBOL_REF or LABEL_REF. */
11793 enum aarch64_symbol_type
11794 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11796 if (GET_CODE (x) == LABEL_REF)
11798 switch (aarch64_cmodel)
11800 case AARCH64_CMODEL_LARGE:
11801 return SYMBOL_FORCE_TO_MEM;
11803 case AARCH64_CMODEL_TINY_PIC:
11804 case AARCH64_CMODEL_TINY:
11805 return SYMBOL_TINY_ABSOLUTE;
11807 case AARCH64_CMODEL_SMALL_SPIC:
11808 case AARCH64_CMODEL_SMALL_PIC:
11809 case AARCH64_CMODEL_SMALL:
11810 return SYMBOL_SMALL_ABSOLUTE;
11812 default:
11813 gcc_unreachable ();
11817 if (GET_CODE (x) == SYMBOL_REF)
11819 if (aarch64_tls_symbol_p (x))
11820 return aarch64_classify_tls_symbol (x);
11822 switch (aarch64_cmodel)
11824 case AARCH64_CMODEL_TINY:
11825 /* When we retrieve a symbol + offset address, we have to make sure
11826 the offset does not cause overflow of the final address. But
11827 we have no way of knowing the address of the symbol at compile
11828 time, so we can't accurately say whether the distance between the
11829 PC and symbol + offset is outside the addressable range of +/-1M
11830 in the TINY code model. So we rely on images not being greater
11831 than 1M, cap the offset at 1M, and require anything beyond 1M to
11832 be loaded using an alternative mechanism. Furthermore, if the
11833 symbol is a weak reference to something that isn't known to
11834 resolve to a symbol in this module, then force it to memory. */
11835 if ((SYMBOL_REF_WEAK (x)
11836 && !aarch64_symbol_binds_local_p (x))
11837 || !IN_RANGE (offset, -1048575, 1048575))
11838 return SYMBOL_FORCE_TO_MEM;
11839 return SYMBOL_TINY_ABSOLUTE;
11841 case AARCH64_CMODEL_SMALL:
11842 /* Same reasoning as the tiny code model, but the offset cap here is
11843 4G. */
11844 if ((SYMBOL_REF_WEAK (x)
11845 && !aarch64_symbol_binds_local_p (x))
11846 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11847 HOST_WIDE_INT_C (4294967264)))
11848 return SYMBOL_FORCE_TO_MEM;
11849 return SYMBOL_SMALL_ABSOLUTE;
11851 case AARCH64_CMODEL_TINY_PIC:
11852 if (!aarch64_symbol_binds_local_p (x))
11853 return SYMBOL_TINY_GOT;
11854 return SYMBOL_TINY_ABSOLUTE;
11856 case AARCH64_CMODEL_SMALL_SPIC:
11857 case AARCH64_CMODEL_SMALL_PIC:
11858 if (!aarch64_symbol_binds_local_p (x))
11859 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11860 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11861 return SYMBOL_SMALL_ABSOLUTE;
11863 case AARCH64_CMODEL_LARGE:
11864 /* This is alright even in PIC code as the constant
11865 pool reference is always PC relative and within
11866 the same translation unit. */
11867 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11868 return SYMBOL_SMALL_ABSOLUTE;
11869 else
11870 return SYMBOL_FORCE_TO_MEM;
11872 default:
11873 gcc_unreachable ();
11877 /* By default push everything into the constant pool. */
11878 return SYMBOL_FORCE_TO_MEM;
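/* Worked example (illustrative only): with -mcmodel=tiny, an access to
   "extern int arr[]; ... arr[100]" lowers to symbol + 400.  The offset
   fits the +/-1M cap checked above, so the address is classified as
   SYMBOL_TINY_ABSOLUTE and can be materialised with a single ADR.  A
   weak symbol that may resolve outside the module, or an offset beyond
   the cap, falls back to SYMBOL_FORCE_TO_MEM and is loaded from the
   literal pool instead.  The small model follows the same pattern with
   ADRP/ADD and the wider +/-4G cap.  */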
11881 bool
11882 aarch64_constant_address_p (rtx x)
11884 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11887 bool
11888 aarch64_legitimate_pic_operand_p (rtx x)
11890 if (GET_CODE (x) == SYMBOL_REF
11891 || (GET_CODE (x) == CONST
11892 && GET_CODE (XEXP (x, 0)) == PLUS
11893 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11894 return false;
11896 return true;
11899 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11900 that should be rematerialized rather than spilled. */
11902 static bool
11903 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11905 /* Support CSE and rematerialization of common constants. */
11906 if (CONST_INT_P (x)
11907 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11908 || GET_CODE (x) == CONST_VECTOR)
11909 return true;
11911 /* Do not allow vector struct mode constants for Advanced SIMD.
11912 We could support 0 and -1 easily, but they need support in
11913 aarch64-simd.md. */
11914 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11915 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11916 return false;
11918 /* Only accept variable-length vector constants if they can be
11919 handled directly.
11921 ??? It would be possible to handle rematerialization of other
11922 constants via secondary reloads. */
11923 if (vec_flags & VEC_ANY_SVE)
11924 return aarch64_simd_valid_immediate (x, NULL);
11926 if (GET_CODE (x) == HIGH)
11927 x = XEXP (x, 0);
11929 /* Accept polynomial constants that can be calculated by using the
11930 destination of a move as the sole temporary. Constants that
11931 require a second temporary cannot be rematerialized (they can't be
11932 forced to memory and also aren't legitimate constants). */
11933 poly_int64 offset;
11934 if (poly_int_rtx_p (x, &offset))
11935 return aarch64_offset_temporaries (false, offset) <= 1;
11937 /* If an offset is being added to something else, we need to allow the
11938 base to be moved into the destination register, meaning that there
11939 are no free temporaries for the offset. */
11940 x = strip_offset (x, &offset);
11941 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11942 return false;
11944 /* Do not allow const (plus (anchor_symbol, const_int)). */
11945 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11946 return false;
11948 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11949 so spilling them is better than rematerialization. */
11950 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11951 return true;
11953 /* Label references are always constant. */
11954 if (GET_CODE (x) == LABEL_REF)
11955 return true;
11957 return false;
11961 aarch64_load_tp (rtx target)
11963 if (!target
11964 || GET_MODE (target) != Pmode
11965 || !register_operand (target, Pmode))
11966 target = gen_reg_rtx (Pmode);
11968 /* Can return in any reg. */
11969 emit_insn (gen_aarch64_load_tp_hard (target));
11970 return target;
11973 /* On AAPCS systems, this is the "struct __va_list". */
11974 static GTY(()) tree va_list_type;
11976 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11977 Return the type to use as __builtin_va_list.
11979 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11981 struct __va_list
11983 void *__stack;
11984 void *__gr_top;
11985 void *__vr_top;
11986 int __gr_offs;
11987 int __vr_offs;
11988 }; */
11990 static tree
11991 aarch64_build_builtin_va_list (void)
11993 tree va_list_name;
11994 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11996 /* Create the type. */
11997 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11998 /* Give it the required name. */
11999 va_list_name = build_decl (BUILTINS_LOCATION,
12000 TYPE_DECL,
12001 get_identifier ("__va_list"),
12002 va_list_type);
12003 DECL_ARTIFICIAL (va_list_name) = 1;
12004 TYPE_NAME (va_list_type) = va_list_name;
12005 TYPE_STUB_DECL (va_list_type) = va_list_name;
12007 /* Create the fields. */
12008 f_stack = build_decl (BUILTINS_LOCATION,
12009 FIELD_DECL, get_identifier ("__stack"),
12010 ptr_type_node);
12011 f_grtop = build_decl (BUILTINS_LOCATION,
12012 FIELD_DECL, get_identifier ("__gr_top"),
12013 ptr_type_node);
12014 f_vrtop = build_decl (BUILTINS_LOCATION,
12015 FIELD_DECL, get_identifier ("__vr_top"),
12016 ptr_type_node);
12017 f_groff = build_decl (BUILTINS_LOCATION,
12018 FIELD_DECL, get_identifier ("__gr_offs"),
12019 integer_type_node);
12020 f_vroff = build_decl (BUILTINS_LOCATION,
12021 FIELD_DECL, get_identifier ("__vr_offs"),
12022 integer_type_node);
12024 /* Tell tree-stdarg pass about our internal offset fields.
12025 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12026 purposes, to identify whether the code is updating the va_list internal
12027 offset fields in an irregular way. */
12028 va_list_gpr_counter_field = f_groff;
12029 va_list_fpr_counter_field = f_vroff;
12031 DECL_ARTIFICIAL (f_stack) = 1;
12032 DECL_ARTIFICIAL (f_grtop) = 1;
12033 DECL_ARTIFICIAL (f_vrtop) = 1;
12034 DECL_ARTIFICIAL (f_groff) = 1;
12035 DECL_ARTIFICIAL (f_vroff) = 1;
12037 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12038 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12039 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12040 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12041 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12043 TYPE_FIELDS (va_list_type) = f_stack;
12044 DECL_CHAIN (f_stack) = f_grtop;
12045 DECL_CHAIN (f_grtop) = f_vrtop;
12046 DECL_CHAIN (f_vrtop) = f_groff;
12047 DECL_CHAIN (f_groff) = f_vroff;
12049 /* Compute its layout. */
12050 layout_type (va_list_type);
12052 return va_list_type;
12055 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12056 static void
12057 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12059 const CUMULATIVE_ARGS *cum;
12060 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12061 tree stack, grtop, vrtop, groff, vroff;
12062 tree t;
12063 int gr_save_area_size = cfun->va_list_gpr_size;
12064 int vr_save_area_size = cfun->va_list_fpr_size;
12065 int vr_offset;
12067 cum = &crtl->args.info;
12068 if (cfun->va_list_gpr_size)
12069 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12070 cfun->va_list_gpr_size);
12071 if (cfun->va_list_fpr_size)
12072 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12073 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12075 if (!TARGET_FLOAT)
12077 gcc_assert (cum->aapcs_nvrn == 0);
12078 vr_save_area_size = 0;
12081 f_stack = TYPE_FIELDS (va_list_type_node);
12082 f_grtop = DECL_CHAIN (f_stack);
12083 f_vrtop = DECL_CHAIN (f_grtop);
12084 f_groff = DECL_CHAIN (f_vrtop);
12085 f_vroff = DECL_CHAIN (f_groff);
12087 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12088 NULL_TREE);
12089 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12090 NULL_TREE);
12091 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12092 NULL_TREE);
12093 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12094 NULL_TREE);
12095 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12096 NULL_TREE);
12098 /* Emit code to initialize STACK, which points to the next varargs stack
12099 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12100 by named arguments. STACK is 8-byte aligned. */
12101 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12102 if (cum->aapcs_stack_size > 0)
12103 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12104 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12105 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12107 /* Emit code to initialize GRTOP, the top of the GR save area.
12108 virtual_incoming_args_rtx should have been 16 byte aligned. */
12109 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12110 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12111 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12113 /* Emit code to initialize VRTOP, the top of the VR save area.
12114 This address is gr_save_area_bytes below GRTOP, rounded
12115 down to the next 16-byte boundary. */
12116 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12117 vr_offset = ROUND_UP (gr_save_area_size,
12118 STACK_BOUNDARY / BITS_PER_UNIT);
12120 if (vr_offset)
12121 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12122 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12123 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12125 /* Emit code to initialize GROFF, the offset from GRTOP of the
12126 next GPR argument. */
12127 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12128 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12129 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12131 /* Likewise emit code to initialize VROFF, the offset from FTOP
12132 of the next VR argument. */
12133 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12134 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12135 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
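/* Rough worked example (hypothetical prototype, sizes taken from the
   logic above): for

     void f (int n, ...);

   only w0 is named, so gr_save_area_size is (8 - 1) * 8 = 56 bytes and
   vr_save_area_size is 8 * 16 = 128 bytes (assuming the tree-stdarg
   limits do not shrink them).  va_start then records

     __stack   = virtual incoming args pointer (no named stack args)
     __gr_top  = virtual incoming args pointer
     __vr_top  = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128  */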
12138 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12140 static tree
12141 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12142 gimple_seq *post_p ATTRIBUTE_UNUSED)
12144 tree addr;
12145 bool indirect_p;
12146 bool is_ha; /* is HFA or HVA. */
12147 bool dw_align; /* double-word align. */
12148 machine_mode ag_mode = VOIDmode;
12149 int nregs;
12150 machine_mode mode;
12152 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12153 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12154 HOST_WIDE_INT size, rsize, adjust, align;
12155 tree t, u, cond1, cond2;
12157 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12158 if (indirect_p)
12159 type = build_pointer_type (type);
12161 mode = TYPE_MODE (type);
12163 f_stack = TYPE_FIELDS (va_list_type_node);
12164 f_grtop = DECL_CHAIN (f_stack);
12165 f_vrtop = DECL_CHAIN (f_grtop);
12166 f_groff = DECL_CHAIN (f_vrtop);
12167 f_vroff = DECL_CHAIN (f_groff);
12169 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12170 f_stack, NULL_TREE);
12171 size = int_size_in_bytes (type);
12172 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12174 dw_align = false;
12175 adjust = 0;
12176 if (aarch64_vfp_is_call_or_return_candidate (mode,
12177 type,
12178 &ag_mode,
12179 &nregs,
12180 &is_ha))
12182 /* No frontends can create types with variable-sized modes, so we
12183 shouldn't be asked to pass or return them. */
12184 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12186 /* TYPE passed in fp/simd registers. */
12187 if (!TARGET_FLOAT)
12188 aarch64_err_no_fpadvsimd (mode, "varargs");
12190 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12191 unshare_expr (valist), f_vrtop, NULL_TREE);
12192 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12193 unshare_expr (valist), f_vroff, NULL_TREE);
12195 rsize = nregs * UNITS_PER_VREG;
12197 if (is_ha)
12199 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12200 adjust = UNITS_PER_VREG - ag_size;
12202 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12203 && size < UNITS_PER_VREG)
12205 adjust = UNITS_PER_VREG - size;
12208 else
12210 /* TYPE passed in general registers. */
12211 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12212 unshare_expr (valist), f_grtop, NULL_TREE);
12213 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12214 unshare_expr (valist), f_groff, NULL_TREE);
12215 rsize = ROUND_UP (size, UNITS_PER_WORD);
12216 nregs = rsize / UNITS_PER_WORD;
12218 if (align > 8)
12219 dw_align = true;
12221 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12222 && size < UNITS_PER_WORD)
12224 adjust = UNITS_PER_WORD - size;
12228 /* Get a local temporary for the field value. */
12229 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12231 /* Emit code to branch if off >= 0. */
12232 t = build2 (GE_EXPR, boolean_type_node, off,
12233 build_int_cst (TREE_TYPE (off), 0));
12234 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12236 if (dw_align)
12238 /* Emit: offs = (offs + 15) & -16. */
12239 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12240 build_int_cst (TREE_TYPE (off), 15));
12241 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12242 build_int_cst (TREE_TYPE (off), -16));
12243 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12245 else
12246 roundup = NULL;
12248 /* Update ap.__[g|v]r_offs */
12249 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12250 build_int_cst (TREE_TYPE (off), rsize));
12251 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12253 /* String up. */
12254 if (roundup)
12255 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12257 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12258 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12259 build_int_cst (TREE_TYPE (f_off), 0));
12260 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12262 /* String up: make sure the assignment happens before the use. */
12263 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12264 COND_EXPR_ELSE (cond1) = t;
12266 /* Prepare the trees handling the argument that is passed on the stack;
12267 the top level node will store in ON_STACK. */
12268 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12269 if (align > 8)
12271 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12272 t = fold_convert (intDI_type_node, arg);
12273 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12274 build_int_cst (TREE_TYPE (t), 15));
12275 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12276 build_int_cst (TREE_TYPE (t), -16));
12277 t = fold_convert (TREE_TYPE (arg), t);
12278 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12280 else
12281 roundup = NULL;
12282 /* Advance ap.__stack */
12283 t = fold_convert (intDI_type_node, arg);
12284 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12285 build_int_cst (TREE_TYPE (t), size + 7));
12286 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12287 build_int_cst (TREE_TYPE (t), -8));
12288 t = fold_convert (TREE_TYPE (arg), t);
12289 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12290 /* String up roundup and advance. */
12291 if (roundup)
12292 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12293 /* String up with arg */
12294 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12295 /* Big-endianness related address adjustment. */
12296 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12297 && size < UNITS_PER_WORD)
12299 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12300 size_int (UNITS_PER_WORD - size));
12301 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12304 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12305 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12307 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12308 t = off;
12309 if (adjust)
12310 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12311 build_int_cst (TREE_TYPE (off), adjust));
12313 t = fold_convert (sizetype, t);
12314 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12316 if (is_ha)
12318 /* type ha; // treat as "struct {ftype field[n];}"
12319 ... [computing offs]
12320 for (i = 0; i <nregs; ++i, offs += 16)
12321 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12322 return ha; */
12323 int i;
12324 tree tmp_ha, field_t, field_ptr_t;
12326 /* Declare a local variable. */
12327 tmp_ha = create_tmp_var_raw (type, "ha");
12328 gimple_add_tmp_var (tmp_ha);
12330 /* Establish the base type. */
12331 switch (ag_mode)
12333 case E_SFmode:
12334 field_t = float_type_node;
12335 field_ptr_t = float_ptr_type_node;
12336 break;
12337 case E_DFmode:
12338 field_t = double_type_node;
12339 field_ptr_t = double_ptr_type_node;
12340 break;
12341 case E_TFmode:
12342 field_t = long_double_type_node;
12343 field_ptr_t = long_double_ptr_type_node;
12344 break;
12345 case E_HFmode:
12346 field_t = aarch64_fp16_type_node;
12347 field_ptr_t = aarch64_fp16_ptr_type_node;
12348 break;
12349 case E_V2SImode:
12350 case E_V4SImode:
12352 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12353 field_t = build_vector_type_for_mode (innertype, ag_mode);
12354 field_ptr_t = build_pointer_type (field_t);
12356 break;
12357 default:
12358 gcc_assert (0);
12361 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12362 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12363 addr = t;
12364 t = fold_convert (field_ptr_t, addr);
12365 t = build2 (MODIFY_EXPR, field_t,
12366 build1 (INDIRECT_REF, field_t, tmp_ha),
12367 build1 (INDIRECT_REF, field_t, t));
12369 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12370 for (i = 1; i < nregs; ++i)
12372 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12373 u = fold_convert (field_ptr_t, addr);
12374 u = build2 (MODIFY_EXPR, field_t,
12375 build2 (MEM_REF, field_t, tmp_ha,
12376 build_int_cst (field_ptr_t,
12377 (i *
12378 int_size_in_bytes (field_t)))),
12379 build1 (INDIRECT_REF, field_t, u));
12380 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12383 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12384 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12387 COND_EXPR_ELSE (cond2) = t;
12388 addr = fold_convert (build_pointer_type (type), cond1);
12389 addr = build_va_arg_indirect_ref (addr);
12391 if (indirect_p)
12392 addr = build_va_arg_indirect_ref (addr);
12394 return addr;
12397 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12399 static void
12400 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12401 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12402 int no_rtl)
12404 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12405 CUMULATIVE_ARGS local_cum;
12406 int gr_saved = cfun->va_list_gpr_size;
12407 int vr_saved = cfun->va_list_fpr_size;
12409 /* The caller has advanced CUM up to, but not beyond, the last named
12410 argument. Advance a local copy of CUM past the last "real" named
12411 argument, to find out how many registers are left over. */
12412 local_cum = *cum;
12413 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12415 /* Find out how many registers we need to save.
12416 Honor tree-stdarg analysis results. */
12417 if (cfun->va_list_gpr_size)
12418 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12419 cfun->va_list_gpr_size / UNITS_PER_WORD);
12420 if (cfun->va_list_fpr_size)
12421 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12422 cfun->va_list_fpr_size / UNITS_PER_VREG);
12424 if (!TARGET_FLOAT)
12426 gcc_assert (local_cum.aapcs_nvrn == 0);
12427 vr_saved = 0;
12430 if (!no_rtl)
12432 if (gr_saved > 0)
12434 rtx ptr, mem;
12436 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12437 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12438 - gr_saved * UNITS_PER_WORD);
12439 mem = gen_frame_mem (BLKmode, ptr);
12440 set_mem_alias_set (mem, get_varargs_alias_set ());
12442 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12443 mem, gr_saved);
12445 if (vr_saved > 0)
12447 /* We can't use move_block_from_reg, because it will use
12448 the wrong mode, storing D regs only. */
12449 machine_mode mode = TImode;
12450 int off, i, vr_start;
12452 /* Set OFF to the offset from virtual_incoming_args_rtx of
12453 the first vector register. The VR save area lies below
12454 the GR one, and is aligned to 16 bytes. */
12455 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12456 STACK_BOUNDARY / BITS_PER_UNIT);
12457 off -= vr_saved * UNITS_PER_VREG;
12459 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12460 for (i = 0; i < vr_saved; ++i)
12462 rtx ptr, mem;
12464 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12465 mem = gen_frame_mem (mode, ptr);
12466 set_mem_alias_set (mem, get_varargs_alias_set ());
12467 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12468 off += UNITS_PER_VREG;
12473 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12474 any complication of having crtl->args.pretend_args_size changed. */
12475 cfun->machine->frame.saved_varargs_size
12476 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12477 STACK_BOUNDARY / BITS_PER_UNIT)
12478 + vr_saved * UNITS_PER_VREG);
12481 static void
12482 aarch64_conditional_register_usage (void)
12484 int i;
12485 if (!TARGET_FLOAT)
12487 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12489 fixed_regs[i] = 1;
12490 call_used_regs[i] = 1;
12493 if (!TARGET_SVE)
12494 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12496 fixed_regs[i] = 1;
12497 call_used_regs[i] = 1;
12501 /* Walk down the type tree of TYPE counting consecutive base elements.
12502 If *MODEP is VOIDmode, then set it to the first valid floating point
12503 type. If a non-floating point type is found, or if a floating point
12504 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12505 otherwise return the count in the sub-tree. */
12506 static int
12507 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12509 machine_mode mode;
12510 HOST_WIDE_INT size;
12512 switch (TREE_CODE (type))
12514 case REAL_TYPE:
12515 mode = TYPE_MODE (type);
12516 if (mode != DFmode && mode != SFmode
12517 && mode != TFmode && mode != HFmode)
12518 return -1;
12520 if (*modep == VOIDmode)
12521 *modep = mode;
12523 if (*modep == mode)
12524 return 1;
12526 break;
12528 case COMPLEX_TYPE:
12529 mode = TYPE_MODE (TREE_TYPE (type));
12530 if (mode != DFmode && mode != SFmode
12531 && mode != TFmode && mode != HFmode)
12532 return -1;
12534 if (*modep == VOIDmode)
12535 *modep = mode;
12537 if (*modep == mode)
12538 return 2;
12540 break;
12542 case VECTOR_TYPE:
12543 /* Use V2SImode and V4SImode as representatives of all 64-bit
12544 and 128-bit vector types. */
12545 size = int_size_in_bytes (type);
12546 switch (size)
12548 case 8:
12549 mode = V2SImode;
12550 break;
12551 case 16:
12552 mode = V4SImode;
12553 break;
12554 default:
12555 return -1;
12558 if (*modep == VOIDmode)
12559 *modep = mode;
12561 /* Vector modes are considered to be opaque: two vectors are
12562 equivalent for the purposes of being homogeneous aggregates
12563 if they are the same size. */
12564 if (*modep == mode)
12565 return 1;
12567 break;
12569 case ARRAY_TYPE:
12571 int count;
12572 tree index = TYPE_DOMAIN (type);
12574 /* Can't handle incomplete types nor sizes that are not
12575 fixed. */
12576 if (!COMPLETE_TYPE_P (type)
12577 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12578 return -1;
12580 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12581 if (count == -1
12582 || !index
12583 || !TYPE_MAX_VALUE (index)
12584 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12585 || !TYPE_MIN_VALUE (index)
12586 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12587 || count < 0)
12588 return -1;
12590 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12591 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12593 /* There must be no padding. */
12594 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12595 count * GET_MODE_BITSIZE (*modep)))
12596 return -1;
12598 return count;
12601 case RECORD_TYPE:
12603 int count = 0;
12604 int sub_count;
12605 tree field;
12607 /* Can't handle incomplete types nor sizes that are not
12608 fixed. */
12609 if (!COMPLETE_TYPE_P (type)
12610 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12611 return -1;
12613 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12615 if (TREE_CODE (field) != FIELD_DECL)
12616 continue;
12618 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12619 if (sub_count < 0)
12620 return -1;
12621 count += sub_count;
12624 /* There must be no padding. */
12625 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12626 count * GET_MODE_BITSIZE (*modep)))
12627 return -1;
12629 return count;
12632 case UNION_TYPE:
12633 case QUAL_UNION_TYPE:
12635 /* These aren't very interesting except in a degenerate case. */
12636 int count = 0;
12637 int sub_count;
12638 tree field;
12640 /* Can't handle incomplete types nor sizes that are not
12641 fixed. */
12642 if (!COMPLETE_TYPE_P (type)
12643 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12644 return -1;
12646 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12648 if (TREE_CODE (field) != FIELD_DECL)
12649 continue;
12651 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12652 if (sub_count < 0)
12653 return -1;
12654 count = count > sub_count ? count : sub_count;
12657 /* There must be no padding. */
12658 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12659 count * GET_MODE_BITSIZE (*modep)))
12660 return -1;
12662 return count;
12665 default:
12666 break;
12669 return -1;
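/* Worked examples (illustrative only):

     struct { float x, y, z; }        -> 3, *modep == SFmode
     struct { double re, im; } c[2]   -> 4 for the array type,
                                         *modep == DFmode
     struct { float f; double d; }    -> -1 (mixed base types)
     struct { float f; int i; }       -> -1 (non-FP member)  */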
12672 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12673 type as described in AAPCS64 \S 4.1.2.
12675 See the comment above aarch64_composite_type_p for the notes on MODE. */
12677 static bool
12678 aarch64_short_vector_p (const_tree type,
12679 machine_mode mode)
12681 poly_int64 size = -1;
12683 if (type && TREE_CODE (type) == VECTOR_TYPE)
12684 size = int_size_in_bytes (type);
12685 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12686 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12687 size = GET_MODE_SIZE (mode);
12689 return known_eq (size, 8) || known_eq (size, 16);
12692 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12693 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12694 array types. The C99 floating-point complex types are also considered
12695 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12696 types, which are GCC extensions and out of the scope of AAPCS64, are
12697 treated as composite types here as well.
12699 Note that MODE itself is not sufficient in determining whether a type
12700 is such a composite type or not. This is because
12701 stor-layout.c:compute_record_mode may have already changed the MODE
12702 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12703 structure with only one field may have its MODE set to the mode of the
12704 field. Also an integer mode whose size matches the size of the
12705 RECORD_TYPE type may be used to substitute the original mode
12706 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12707 solely relied on. */
12709 static bool
12710 aarch64_composite_type_p (const_tree type,
12711 machine_mode mode)
12713 if (aarch64_short_vector_p (type, mode))
12714 return false;
12716 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12717 return true;
12719 if (mode == BLKmode
12720 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12721 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12722 return true;
12724 return false;
12727 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12728 shall be passed or returned in simd/fp register(s) (providing these
12729 parameter passing registers are available).
12731 Upon successful return, *COUNT returns the number of needed registers,
12732 *BASE_MODE returns the mode of the individual register and, when IS_HA
12733 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12734 floating-point aggregate or a homogeneous short-vector aggregate. */
12736 static bool
12737 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12738 const_tree type,
12739 machine_mode *base_mode,
12740 int *count,
12741 bool *is_ha)
12743 machine_mode new_mode = VOIDmode;
12744 bool composite_p = aarch64_composite_type_p (type, mode);
12746 if (is_ha != NULL) *is_ha = false;
12748 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12749 || aarch64_short_vector_p (type, mode))
12751 *count = 1;
12752 new_mode = mode;
12754 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12756 if (is_ha != NULL) *is_ha = true;
12757 *count = 2;
12758 new_mode = GET_MODE_INNER (mode);
12760 else if (type && composite_p)
12762 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12764 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12766 if (is_ha != NULL) *is_ha = true;
12767 *count = ag_count;
12769 else
12770 return false;
12772 else
12773 return false;
12775 *base_mode = new_mode;
12776 return true;
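/* For instance (illustrative only): a _Complex double argument gives
   *count == 2, *base_mode == DFmode and *is_ha set, while

     struct hfa { float a, b, c, d; };

   gives *count == 4 with *base_mode == SFmode.  A structure of five
   floats exceeds HA_MAX_NUM_FLDS and is therefore not a candidate.  */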
12779 /* Implement TARGET_STRUCT_VALUE_RTX. */
12781 static rtx
12782 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12783 int incoming ATTRIBUTE_UNUSED)
12785 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12788 /* Implements target hook vector_mode_supported_p. */
12789 static bool
12790 aarch64_vector_mode_supported_p (machine_mode mode)
12792 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12793 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12796 /* Return appropriate SIMD container
12797 for MODE within a vector of WIDTH bits. */
12798 static machine_mode
12799 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12801 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12802 switch (mode)
12804 case E_DFmode:
12805 return VNx2DFmode;
12806 case E_SFmode:
12807 return VNx4SFmode;
12808 case E_HFmode:
12809 return VNx8HFmode;
12810 case E_DImode:
12811 return VNx2DImode;
12812 case E_SImode:
12813 return VNx4SImode;
12814 case E_HImode:
12815 return VNx8HImode;
12816 case E_QImode:
12817 return VNx16QImode;
12818 default:
12819 return word_mode;
12822 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12823 if (TARGET_SIMD)
12825 if (known_eq (width, 128))
12826 switch (mode)
12828 case E_DFmode:
12829 return V2DFmode;
12830 case E_SFmode:
12831 return V4SFmode;
12832 case E_HFmode:
12833 return V8HFmode;
12834 case E_SImode:
12835 return V4SImode;
12836 case E_HImode:
12837 return V8HImode;
12838 case E_QImode:
12839 return V16QImode;
12840 case E_DImode:
12841 return V2DImode;
12842 default:
12843 break;
12845 else
12846 switch (mode)
12848 case E_SFmode:
12849 return V2SFmode;
12850 case E_HFmode:
12851 return V4HFmode;
12852 case E_SImode:
12853 return V2SImode;
12854 case E_HImode:
12855 return V4HImode;
12856 case E_QImode:
12857 return V8QImode;
12858 default:
12859 break;
12862 return word_mode;
12865 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12866 static machine_mode
12867 aarch64_preferred_simd_mode (scalar_mode mode)
12869 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12870 return aarch64_simd_container_mode (mode, bits);
12873 /* Return a list of possible vector sizes for the vectorizer
12874 to iterate over. */
12875 static void
12876 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12878 if (TARGET_SVE)
12879 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12880 sizes->safe_push (16);
12881 sizes->safe_push (8);
12884 /* Implement TARGET_MANGLE_TYPE. */
12886 static const char *
12887 aarch64_mangle_type (const_tree type)
12889 /* The AArch64 ABI documents say that "__va_list" has to be
12890 mangled as if it is in the "std" namespace. */
12891 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12892 return "St9__va_list";
12894 /* Half-precision float. */
12895 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12896 return "Dh";
12898 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12899 builtin types. */
12900 if (TYPE_NAME (type) != NULL)
12901 return aarch64_mangle_builtin_type (type);
12903 /* Use the default mangling. */
12904 return NULL;
12907 /* Find the first rtx_insn before insn that will generate an assembly
12908 instruction. */
12910 static rtx_insn *
12911 aarch64_prev_real_insn (rtx_insn *insn)
12913 if (!insn)
12914 return NULL;
12918 insn = prev_real_insn (insn);
12920 while (insn && recog_memoized (insn) < 0);
12922 return insn;
12925 static bool
12926 is_madd_op (enum attr_type t1)
12928 unsigned int i;
12929 /* A number of these may be AArch32 only. */
12930 enum attr_type mlatypes[] = {
12931 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12932 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12933 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12936 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12938 if (t1 == mlatypes[i])
12939 return true;
12942 return false;
12945 /* Check if there is a register dependency between a load and the insn
12946 for which we hold recog_data. */
12948 static bool
12949 dep_between_memop_and_curr (rtx memop)
12951 rtx load_reg;
12952 int opno;
12954 gcc_assert (GET_CODE (memop) == SET);
12956 if (!REG_P (SET_DEST (memop)))
12957 return false;
12959 load_reg = SET_DEST (memop);
12960 for (opno = 1; opno < recog_data.n_operands; opno++)
12962 rtx operand = recog_data.operand[opno];
12963 if (REG_P (operand)
12964 && reg_overlap_mentioned_p (load_reg, operand))
12965 return true;
12968 return false;
12972 /* When working around the Cortex-A53 erratum 835769,
12973 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12974 instruction and has a preceding memory instruction such that a NOP
12975 should be inserted between them. */
12977 bool
12978 aarch64_madd_needs_nop (rtx_insn* insn)
12980 enum attr_type attr_type;
12981 rtx_insn *prev;
12982 rtx body;
12984 if (!TARGET_FIX_ERR_A53_835769)
12985 return false;
12987 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12988 return false;
12990 attr_type = get_attr_type (insn);
12991 if (!is_madd_op (attr_type))
12992 return false;
12994 prev = aarch64_prev_real_insn (insn);
12995 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12996 Restore recog state to INSN to avoid state corruption. */
12997 extract_constrain_insn_cached (insn);
12999 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13000 return false;
13002 body = single_set (prev);
13004 /* If the previous insn is a memory op and there is no dependency between
13005 it and the DImode madd, emit a NOP between them. If body is NULL then we
13006 have a complex memory operation, probably a load/store pair.
13007 Be conservative for now and emit a NOP. */
13008 if (GET_MODE (recog_data.operand[0]) == DImode
13009 && (!body || !dep_between_memop_and_curr (body)))
13010 return true;
13012 return false;
13017 /* Implement FINAL_PRESCAN_INSN. */
13019 void
13020 aarch64_final_prescan_insn (rtx_insn *insn)
13022 if (aarch64_madd_needs_nop (insn))
13023 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
13027 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13028 instruction. */
13030 bool
13031 aarch64_sve_index_immediate_p (rtx base_or_step)
13033 return (CONST_INT_P (base_or_step)
13034 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13037 /* Return true if X is a valid immediate for the SVE ADD and SUB
13038 instructions. Negate X first if NEGATE_P is true. */
13040 bool
13041 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13043 rtx elt;
13045 if (!const_vec_duplicate_p (x, &elt)
13046 || !CONST_INT_P (elt))
13047 return false;
13049 HOST_WIDE_INT val = INTVAL (elt);
13050 if (negate_p)
13051 val = -val;
13052 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13054 if (val & 0xff)
13055 return IN_RANGE (val, 0, 0xff);
13056 return IN_RANGE (val, 0, 0xff00);
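/* Worked examples (illustrative, for a 32-bit element size): a vector
   duplicating 0x7f is accepted as-is, one duplicating 0x3500 is
   accepted as 0x35 shifted left by 8, while 0x101 is rejected because
   its low byte is non-zero and the value does not fit in 8 bits.  A
   SUB-style caller can pass NEGATE_P so that a duplicated -0x1200 is
   accepted as 0x12, LSL #8.  */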
13059 /* Return true if X is a valid immediate operand for an SVE logical
13060 instruction such as AND. */
13062 bool
13063 aarch64_sve_bitmask_immediate_p (rtx x)
13065 rtx elt;
13067 return (const_vec_duplicate_p (x, &elt)
13068 && CONST_INT_P (elt)
13069 && aarch64_bitmask_imm (INTVAL (elt),
13070 GET_MODE_INNER (GET_MODE (x))));
13073 /* Return true if X is a valid immediate for the SVE DUP and CPY
13074 instructions. */
13076 bool
13077 aarch64_sve_dup_immediate_p (rtx x)
13079 rtx elt;
13081 if (!const_vec_duplicate_p (x, &elt)
13082 || !CONST_INT_P (elt))
13083 return false;
13085 HOST_WIDE_INT val = INTVAL (elt);
13086 if (val & 0xff)
13087 return IN_RANGE (val, -0x80, 0x7f);
13088 return IN_RANGE (val, -0x8000, 0x7f00);
13091 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13092 SIGNED_P says whether the operand is signed rather than unsigned. */
13094 bool
13095 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13097 rtx elt;
13099 return (const_vec_duplicate_p (x, &elt)
13100 && CONST_INT_P (elt)
13101 && (signed_p
13102 ? IN_RANGE (INTVAL (elt), -16, 15)
13103 : IN_RANGE (INTVAL (elt), 0, 127)));
13106 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13107 instruction. Negate X first if NEGATE_P is true. */
13109 bool
13110 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13112 rtx elt;
13113 REAL_VALUE_TYPE r;
13115 if (!const_vec_duplicate_p (x, &elt)
13116 || GET_CODE (elt) != CONST_DOUBLE)
13117 return false;
13119 r = *CONST_DOUBLE_REAL_VALUE (elt);
13121 if (negate_p)
13122 r = real_value_negate (&r);
13124 if (real_equal (&r, &dconst1))
13125 return true;
13126 if (real_equal (&r, &dconsthalf))
13127 return true;
13128 return false;
13131 /* Return true if X is a valid immediate operand for an SVE FMUL
13132 instruction. */
13134 bool
13135 aarch64_sve_float_mul_immediate_p (rtx x)
13137 rtx elt;
13139 /* GCC will never generate a multiply with an immediate of 2, so there is no
13140 point testing for it (even though it is a valid constant). */
13141 return (const_vec_duplicate_p (x, &elt)
13142 && GET_CODE (elt) == CONST_DOUBLE
13143 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13146 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13147 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13148 is nonnull, use it to describe valid immediates. */
13149 static bool
13150 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13151 simd_immediate_info *info,
13152 enum simd_immediate_check which,
13153 simd_immediate_info::insn_type insn)
13155 /* Try a 4-byte immediate with LSL. */
13156 for (unsigned int shift = 0; shift < 32; shift += 8)
13157 if ((val32 & (0xff << shift)) == val32)
13159 if (info)
13160 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13161 simd_immediate_info::LSL, shift);
13162 return true;
13165 /* Try a 2-byte immediate with LSL. */
13166 unsigned int imm16 = val32 & 0xffff;
13167 if (imm16 == (val32 >> 16))
13168 for (unsigned int shift = 0; shift < 16; shift += 8)
13169 if ((imm16 & (0xff << shift)) == imm16)
13171 if (info)
13172 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13173 simd_immediate_info::LSL, shift);
13174 return true;
13177 /* Try a 4-byte immediate with MSL, except for cases that MVN
13178 can handle. */
13179 if (which == AARCH64_CHECK_MOV)
13180 for (unsigned int shift = 8; shift < 24; shift += 8)
13182 unsigned int low = (1 << shift) - 1;
13183 if (((val32 & (0xff << shift)) | low) == val32)
13185 if (info)
13186 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13187 simd_immediate_info::MSL, shift);
13188 return true;
13192 return false;
13195 /* Return true if replicating VAL64 is a valid immediate for the
13196 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13197 use it to describe valid immediates. */
13198 static bool
13199 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13200 simd_immediate_info *info,
13201 enum simd_immediate_check which)
13203 unsigned int val32 = val64 & 0xffffffff;
13204 unsigned int val16 = val64 & 0xffff;
13205 unsigned int val8 = val64 & 0xff;
13207 if (val32 == (val64 >> 32))
13209 if ((which & AARCH64_CHECK_ORR) != 0
13210 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13211 simd_immediate_info::MOV))
13212 return true;
13214 if ((which & AARCH64_CHECK_BIC) != 0
13215 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13216 simd_immediate_info::MVN))
13217 return true;
13219 /* Try using a replicated byte. */
13220 if (which == AARCH64_CHECK_MOV
13221 && val16 == (val32 >> 16)
13222 && val8 == (val16 >> 8))
13224 if (info)
13225 *info = simd_immediate_info (QImode, val8);
13226 return true;
13230 /* Try using a bit-to-bytemask. */
13231 if (which == AARCH64_CHECK_MOV)
13233 unsigned int i;
13234 for (i = 0; i < 64; i += 8)
13236 unsigned char byte = (val64 >> i) & 0xff;
13237 if (byte != 0 && byte != 0xff)
13238 break;
13240 if (i == 64)
13242 if (info)
13243 *info = simd_immediate_info (DImode, val64);
13244 return true;
13247 return false;
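/* Worked examples for a MOV-type check (illustrative only): replicating
   0x45 in each 32-bit chunk is accepted as an SImode MOVI with no shift;
   replicating 0xffffffb8 is accepted through the MVN path as ~0x47;
   0x4141414141414141 is a replicated byte (QImode 0x41); and
   0x00ff0000ffff00ff passes the bit-to-bytemask test because every
   byte is either 0x00 or 0xff.  */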
13250 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13251 instruction. If INFO is nonnull, use it to describe valid immediates. */
13253 static bool
13254 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13255 simd_immediate_info *info)
13257 scalar_int_mode mode = DImode;
13258 unsigned int val32 = val64 & 0xffffffff;
13259 if (val32 == (val64 >> 32))
13261 mode = SImode;
13262 unsigned int val16 = val32 & 0xffff;
13263 if (val16 == (val32 >> 16))
13265 mode = HImode;
13266 unsigned int val8 = val16 & 0xff;
13267 if (val8 == (val16 >> 8))
13268 mode = QImode;
13271 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13272 if (IN_RANGE (val, -0x80, 0x7f))
13274 /* DUP with no shift. */
13275 if (info)
13276 *info = simd_immediate_info (mode, val);
13277 return true;
13279 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13281 /* DUP with LSL #8. */
13282 if (info)
13283 *info = simd_immediate_info (mode, val);
13284 return true;
13286 if (aarch64_bitmask_imm (val64, mode))
13288 /* DUPM. */
13289 if (info)
13290 *info = simd_immediate_info (mode, val);
13291 return true;
13293 return false;
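/* Worked examples (illustrative only): replicating 42 in each 32-bit
   chunk is a plain SImode DUP; replicating 0x1200 in each 32-bit chunk
   is a DUP of 0x12 with LSL #8; and replicating 0x00ff in each 16-bit
   chunk is out of DUP range (255) but is a valid bitmask immediate,
   so it is handled as DUPM.  */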
13296 /* Return true if OP is a valid SIMD immediate for the operation
13297 described by WHICH. If INFO is nonnull, use it to describe valid
13298 immediates. */
13299 bool
13300 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13301 enum simd_immediate_check which)
13303 machine_mode mode = GET_MODE (op);
13304 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13305 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13306 return false;
13308 scalar_mode elt_mode = GET_MODE_INNER (mode);
13309 rtx base, step;
13310 unsigned int n_elts;
13311 if (GET_CODE (op) == CONST_VECTOR
13312 && CONST_VECTOR_DUPLICATE_P (op))
13313 n_elts = CONST_VECTOR_NPATTERNS (op);
13314 else if ((vec_flags & VEC_SVE_DATA)
13315 && const_vec_series_p (op, &base, &step))
13317 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13318 if (!aarch64_sve_index_immediate_p (base)
13319 || !aarch64_sve_index_immediate_p (step))
13320 return false;
13322 if (info)
13323 *info = simd_immediate_info (elt_mode, base, step);
13324 return true;
13326 else if (GET_CODE (op) == CONST_VECTOR
13327 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13328 /* N_ELTS set above. */;
13329 else
13330 return false;
13332 /* Handle PFALSE and PTRUE. */
13333 if (vec_flags & VEC_SVE_PRED)
13334 return (op == CONST0_RTX (mode)
13335 || op == CONSTM1_RTX (mode));
13337 scalar_float_mode elt_float_mode;
13338 if (n_elts == 1
13339 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13341 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13342 if (aarch64_float_const_zero_rtx_p (elt)
13343 || aarch64_float_const_representable_p (elt))
13345 if (info)
13346 *info = simd_immediate_info (elt_float_mode, elt);
13347 return true;
13351 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13352 if (elt_size > 8)
13353 return false;
13355 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13357 /* Expand the vector constant out into a byte vector, with the least
13358 significant byte of the register first. */
13359 auto_vec<unsigned char, 16> bytes;
13360 bytes.reserve (n_elts * elt_size);
13361 for (unsigned int i = 0; i < n_elts; i++)
13363 /* The vector is provided in GCC's endian-neutral fashion.
13364 For aarch64_be Advanced SIMD, it must be laid out in the vector
13365 register in reverse order. */
13366 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13367 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13369 if (elt_mode != elt_int_mode)
13370 elt = gen_lowpart (elt_int_mode, elt);
13372 if (!CONST_INT_P (elt))
13373 return false;
13375 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13376 for (unsigned int byte = 0; byte < elt_size; byte++)
13378 bytes.quick_push (elt_val & 0xff);
13379 elt_val >>= BITS_PER_UNIT;
13383 /* The immediate must repeat every eight bytes. */
13384 unsigned int nbytes = bytes.length ();
13385 for (unsigned i = 8; i < nbytes; ++i)
13386 if (bytes[i] != bytes[i - 8])
13387 return false;
13389 /* Get the repeating 8-byte value as an integer. No endian correction
13390 is needed here because bytes is already in lsb-first order. */
13391 unsigned HOST_WIDE_INT val64 = 0;
13392 for (unsigned int i = 0; i < 8; i++)
13393 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13394 << (i * BITS_PER_UNIT));
13396 if (vec_flags & VEC_SVE_DATA)
13397 return aarch64_sve_valid_immediate (val64, info);
13398 else
13399 return aarch64_advsimd_valid_immediate (val64, info, which);
13402 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13403 has a step that is in range for an SVE INDEX instruction. Return the
13404 index expression if so, otherwise return null. */
13406 aarch64_check_zero_based_sve_index_immediate (rtx x)
13408 rtx base, step;
13409 if (const_vec_series_p (x, &base, &step)
13410 && base == const0_rtx
13411 && aarch64_sve_index_immediate_p (step))
13412 return step;
13413 return NULL_RTX;
13416 /* Check whether immediate shift constants are within range. */
13417 bool
13418 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13420 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13421 if (left)
13422 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13423 else
13424 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
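/* For example (illustrative only): for V4SImode the element width is
   32 bits, so a vector immediate left shift must duplicate a value in
   [0, 31] while an immediate right shift must duplicate a value in
   [1, 32], matching the SHL and SSHR/USHR encodings.  */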
13427 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13428 operation of width WIDTH at bit position POS. */
13431 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13433 gcc_assert (CONST_INT_P (width));
13434 gcc_assert (CONST_INT_P (pos));
13436 unsigned HOST_WIDE_INT mask
13437 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13438 return GEN_INT (mask << UINTVAL (pos));
13441 bool
13442 aarch64_mov_operand_p (rtx x, machine_mode mode)
13444 if (GET_CODE (x) == HIGH
13445 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13446 return true;
13448 if (CONST_INT_P (x))
13449 return true;
13451 if (VECTOR_MODE_P (GET_MODE (x)))
13452 return aarch64_simd_valid_immediate (x, NULL);
13454 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13455 return true;
13457 if (aarch64_sve_cnt_immediate_p (x))
13458 return true;
13460 return aarch64_classify_symbolic_expression (x)
13461 == SYMBOL_TINY_ABSOLUTE;
13464 /* Return a const_int vector of VAL. */
13466 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13468 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13469 return gen_const_vec_duplicate (mode, c);
13472 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13474 bool
13475 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13477 machine_mode vmode;
13479 vmode = aarch64_simd_container_mode (mode, 64);
13480 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13481 return aarch64_simd_valid_immediate (op_v, NULL);
13484 /* Construct and return a PARALLEL RTX vector with elements numbering the
13485 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13486 the vector - from the perspective of the architecture. This does not
13487 line up with GCC's perspective on lane numbers, so we end up with
13488 different masks depending on our target endian-ness. The diagram
13489 below may help. We must draw the distinction when building masks
13490 which select one half of the vector. An instruction selecting
13491 architectural low-lanes for a big-endian target must be described using
13492 a mask selecting GCC high-lanes.
13494 Big-Endian Little-Endian
13496 GCC 0 1 2 3 3 2 1 0
13497 | x | x | x | x | | x | x | x | x |
13498 Architecture 3 2 1 0 3 2 1 0
13500 Low Mask: { 2, 3 } { 0, 1 }
13501 High Mask: { 0, 1 } { 2, 3 }
13503 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13506 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13508 rtvec v = rtvec_alloc (nunits / 2);
13509 int high_base = nunits / 2;
13510 int low_base = 0;
13511 int base;
13512 rtx t1;
13513 int i;
13515 if (BYTES_BIG_ENDIAN)
13516 base = high ? low_base : high_base;
13517 else
13518 base = high ? high_base : low_base;
13520 for (i = 0; i < nunits / 2; i++)
13521 RTVEC_ELT (v, i) = GEN_INT (base + i);
13523 t1 = gen_rtx_PARALLEL (mode, v);
13524 return t1;
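/* As a concrete example of the diagram above, for a four-element vector
   (NUNITS == 4) this returns:
     little-endian: HIGH -> (parallel [2 3]),  !HIGH -> (parallel [0 1])
     big-endian:    HIGH -> (parallel [0 1]),  !HIGH -> (parallel [2 3]) */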
13527 /* Check OP for validity as a PARALLEL RTX vector with elements
13528 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13529 from the perspective of the architecture. See the diagram above
13530 aarch64_simd_vect_par_cnst_half for more details. */
13532 bool
13533 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13534 bool high)
13536 int nelts;
13537 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13538 return false;
13540 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13541 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13542 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13543 int i = 0;
13545 if (count_op != count_ideal)
13546 return false;
13548 for (i = 0; i < count_ideal; i++)
13550 rtx elt_op = XVECEXP (op, 0, i);
13551 rtx elt_ideal = XVECEXP (ideal, 0, i);
13553 if (!CONST_INT_P (elt_op)
13554 || INTVAL (elt_ideal) != INTVAL (elt_op))
13555 return false;
13557 return true;
13560 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13561 HIGH (exclusive). */
13562 void
13563 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13564 const_tree exp)
13566 HOST_WIDE_INT lane;
13567 gcc_assert (CONST_INT_P (operand));
13568 lane = INTVAL (operand);
13570 if (lane < low || lane >= high)
13572 if (exp)
13573 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13574 else
13575 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13579 /* Perform endian correction on lane number N, which indexes a vector
13580 of mode MODE, and return the result as an SImode rtx. */
13583 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13585 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13588 /* Return TRUE if OP is a valid vector addressing mode. */
13590 bool
13591 aarch64_simd_mem_operand_p (rtx op)
13593 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13594 || REG_P (XEXP (op, 0)));
13597 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13599 bool
13600 aarch64_sve_ld1r_operand_p (rtx op)
13602 struct aarch64_address_info addr;
13603 scalar_mode mode;
13605 return (MEM_P (op)
13606 && is_a <scalar_mode> (GET_MODE (op), &mode)
13607 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13608 && addr.type == ADDRESS_REG_IMM
13609 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
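/* For instance, assuming offset_6bit_unsigned_scaled_p requires the offset
   to be a non-negative multiple of the element size with a quotient of at
   most 63, an SImode LD1R would accept offsets 0, 4, 8, ..., 252 and reject
   offsets such as 253 or -4. */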
13612 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13613 The conditions for STR are the same. */
13614 bool
13615 aarch64_sve_ldr_operand_p (rtx op)
13617 struct aarch64_address_info addr;
13619 return (MEM_P (op)
13620 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13621 false, ADDR_QUERY_ANY)
13622 && addr.type == ADDRESS_REG_IMM);
13625 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13626 We need to be able to access the individual pieces, so the range
13627 is different from LD[234] and ST[234]. */
13628 bool
13629 aarch64_sve_struct_memory_operand_p (rtx op)
13631 if (!MEM_P (op))
13632 return false;
13634 machine_mode mode = GET_MODE (op);
13635 struct aarch64_address_info addr;
13636 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13637 ADDR_QUERY_ANY)
13638 || addr.type != ADDRESS_REG_IMM)
13639 return false;
13641 poly_int64 first = addr.const_offset;
13642 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13643 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13644 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
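/* Rough sketch of the effect, assuming offset_4bit_signed_scaled_p accepts
   offsets of -8 to +7 vectors: for a two-vector tuple both the first and the
   last vector must be addressable, so the start offset is limited to the
   range of -8 to +6 vectors from the base register. */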
13647 /* Emit a register copy from operand to operand, taking care not to
13648 early-clobber source registers in the process.
13650 COUNT is the number of components into which the copy needs to be
13651 decomposed. */
13652 void
13653 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13654 unsigned int count)
13656 unsigned int i;
13657 int rdest = REGNO (operands[0]);
13658 int rsrc = REGNO (operands[1]);
13660 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13661 || rdest < rsrc)
13662 for (i = 0; i < count; i++)
13663 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13664 gen_rtx_REG (mode, rsrc + i));
13665 else
13666 for (i = 0; i < count; i++)
13667 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13668 gen_rtx_REG (mode, rsrc + count - i - 1));
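/* For example, copying a two-register value from {v1, v2} to {v0, v1} can be
   done in ascending order (v0 <- v1 first), whereas copying from {v0, v1} to
   {v1, v2} must be done in descending order (v2 <- v1 first) so that v1 is
   read before it is overwritten. */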
13671 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13672 one of VSTRUCT modes: OI, CI, or XI. */
13674 aarch64_simd_attr_length_rglist (machine_mode mode)
13676 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13677 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13680 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13681 alignment of a vector to 128 bits. SVE predicates have an alignment of
13682 16 bits. */
13683 static HOST_WIDE_INT
13684 aarch64_simd_vector_alignment (const_tree type)
13686 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13687 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13688 be set for non-predicate vectors of booleans. Modes are the most
13689 direct way we have of identifying real SVE predicate types. */
13690 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13691 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13692 return MIN (align, 128);
13695 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13696 static HOST_WIDE_INT
13697 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13699 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13701 /* If the length of the vector is fixed, try to align to that length,
13702 otherwise don't try to align at all. */
13703 HOST_WIDE_INT result;
13704 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13705 result = TYPE_ALIGN (TREE_TYPE (type));
13706 return result;
13708 return TYPE_ALIGN (type);
13711 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13712 static bool
13713 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13715 if (is_packed)
13716 return false;
13718 /* For fixed-length vectors, check that the vectorizer will aim for
13719 full-vector alignment. This isn't true for generic GCC vectors
13720 that are wider than the ABI maximum of 128 bits. */
13721 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13722 && (wi::to_widest (TYPE_SIZE (type))
13723 != aarch64_vectorize_preferred_vector_alignment (type)))
13724 return false;
13726 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13727 return true;
13730 /* Return true if the vector misalignment factor is supported by the
13731 target. */
13732 static bool
13733 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13734 const_tree type, int misalignment,
13735 bool is_packed)
13737 if (TARGET_SIMD && STRICT_ALIGNMENT)
13740 /* Return false if the movmisalign pattern is not supported for this mode. */
13740 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13741 return false;
13743 /* Misalignment factor is unknown at compile time. */
13744 if (misalignment == -1)
13745 return false;
13747 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13748 is_packed);
13751 /* If VALS is a vector constant that can be loaded into a register
13752 using DUP, generate instructions to do so and return an RTX to
13753 assign to the register. Otherwise return NULL_RTX. */
13754 static rtx
13755 aarch64_simd_dup_constant (rtx vals)
13757 machine_mode mode = GET_MODE (vals);
13758 machine_mode inner_mode = GET_MODE_INNER (mode);
13759 rtx x;
13761 if (!const_vec_duplicate_p (vals, &x))
13762 return NULL_RTX;
13764 /* We can load this constant by using DUP and a constant in a
13765 single ARM register. This will be cheaper than a vector
13766 load. */
13767 x = copy_to_mode_reg (inner_mode, x);
13768 return gen_vec_duplicate (mode, x);
13772 /* Generate code to load VALS, which is a PARALLEL containing only
13773 constants (for vec_init) or CONST_VECTOR, efficiently into a
13774 register. Returns an RTX to copy into the register, or NULL_RTX
13775 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13776 static rtx
13777 aarch64_simd_make_constant (rtx vals)
13779 machine_mode mode = GET_MODE (vals);
13780 rtx const_dup;
13781 rtx const_vec = NULL_RTX;
13782 int n_const = 0;
13783 int i;
13785 if (GET_CODE (vals) == CONST_VECTOR)
13786 const_vec = vals;
13787 else if (GET_CODE (vals) == PARALLEL)
13789 /* A CONST_VECTOR must contain only CONST_INTs and
13790 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13791 Only store valid constants in a CONST_VECTOR. */
13792 int n_elts = XVECLEN (vals, 0);
13793 for (i = 0; i < n_elts; ++i)
13795 rtx x = XVECEXP (vals, 0, i);
13796 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13797 n_const++;
13799 if (n_const == n_elts)
13800 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13802 else
13803 gcc_unreachable ();
13805 if (const_vec != NULL_RTX
13806 && aarch64_simd_valid_immediate (const_vec, NULL))
13807 /* Load using MOVI/MVNI. */
13808 return const_vec;
13809 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13810 /* Loaded using DUP. */
13811 return const_dup;
13812 else if (const_vec != NULL_RTX)
13813 /* Load from constant pool. We cannot take advantage of single-cycle
13814 LD1 because we need a PC-relative addressing mode. */
13815 return const_vec;
13816 else
13817 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13818 We cannot construct an initializer. */
13819 return NULL_RTX;
13822 /* Expand a vector initialisation sequence, such that TARGET is
13823 initialised to contain VALS. */
13825 void
13826 aarch64_expand_vector_init (rtx target, rtx vals)
13828 machine_mode mode = GET_MODE (target);
13829 scalar_mode inner_mode = GET_MODE_INNER (mode);
13830 /* The number of vector elements. */
13831 int n_elts = XVECLEN (vals, 0);
13832 /* The number of vector elements which are not constant. */
13833 int n_var = 0;
13834 rtx any_const = NULL_RTX;
13835 /* The first element of vals. */
13836 rtx v0 = XVECEXP (vals, 0, 0);
13837 bool all_same = true;
13839 /* Count the number of variable elements to initialise. */
13840 for (int i = 0; i < n_elts; ++i)
13842 rtx x = XVECEXP (vals, 0, i);
13843 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13844 ++n_var;
13845 else
13846 any_const = x;
13848 all_same &= rtx_equal_p (x, v0);
13851 /* No variable elements; hand off to aarch64_simd_make_constant, which knows
13852 how best to handle this. */
13853 if (n_var == 0)
13855 rtx constant = aarch64_simd_make_constant (vals);
13856 if (constant != NULL_RTX)
13858 emit_move_insn (target, constant);
13859 return;
13863 /* Splat a single non-constant element if we can. */
13864 if (all_same)
13866 rtx x = copy_to_mode_reg (inner_mode, v0);
13867 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13868 return;
13871 enum insn_code icode = optab_handler (vec_set_optab, mode);
13872 gcc_assert (icode != CODE_FOR_nothing);
13874 /* If there are only variable elements, try to optimize
13875 the insertion using dup for the most common element
13876 followed by insertions. */
13878 /* The algorithm will fill matches[*][0] with the earliest matching element,
13879 and matches[X][1] with the count of duplicate elements (if X is the
13880 earliest element which has duplicates). */
13882 if (n_var == n_elts && n_elts <= 16)
13884 int matches[16][2] = {0};
13885 for (int i = 0; i < n_elts; i++)
13887 for (int j = 0; j <= i; j++)
13889 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13891 matches[i][0] = j;
13892 matches[j][1]++;
13893 break;
13897 int maxelement = 0;
13898 int maxv = 0;
13899 for (int i = 0; i < n_elts; i++)
13900 if (matches[i][1] > maxv)
13902 maxelement = i;
13903 maxv = matches[i][1];
13906 /* Create a duplicate of the most common element. */
13907 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13908 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13910 /* Insert the rest. */
13911 for (int i = 0; i < n_elts; i++)
13913 rtx x = XVECEXP (vals, 0, i);
13914 if (matches[i][0] == maxelement)
13915 continue;
13916 x = copy_to_mode_reg (inner_mode, x);
13917 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13919 return;
13922 /* Initialise a vector which is part-variable. We want to first try
13923 to build those lanes which are constant in the most efficient way we
13924 can. */
13925 if (n_var != n_elts)
13927 rtx copy = copy_rtx (vals);
13929 /* Load constant part of vector. We really don't care what goes into the
13930 parts we will overwrite, but we're more likely to be able to load the
13931 constant efficiently if it has fewer, larger, repeating parts
13932 (see aarch64_simd_valid_immediate). */
13933 for (int i = 0; i < n_elts; i++)
13935 rtx x = XVECEXP (vals, 0, i);
13936 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13937 continue;
13938 rtx subst = any_const;
13939 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13941 /* Look in the copied vector, as more elements are const. */
13942 rtx test = XVECEXP (copy, 0, i ^ bit);
13943 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13945 subst = test;
13946 break;
13949 XVECEXP (copy, 0, i) = subst;
13951 aarch64_expand_vector_init (target, copy);
13954 /* Insert the variable lanes directly. */
13955 for (int i = 0; i < n_elts; i++)
13957 rtx x = XVECEXP (vals, 0, i);
13958 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13959 continue;
13960 x = copy_to_mode_reg (inner_mode, x);
13961 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13965 static unsigned HOST_WIDE_INT
13966 aarch64_shift_truncation_mask (machine_mode mode)
13968 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13969 return 0;
13970 return GET_MODE_UNIT_BITSIZE (mode) - 1;
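/* For example, when SHIFT_COUNT_TRUNCATED holds this yields a mask of 31 for
   SImode shifts and 63 for DImode shifts, while vector modes (and targets
   without shift-count truncation) get 0, meaning no truncation is assumed. */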
13973 /* Select a format to encode pointers in exception handling data. */
13975 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13977 int type;
13978 switch (aarch64_cmodel)
13980 case AARCH64_CMODEL_TINY:
13981 case AARCH64_CMODEL_TINY_PIC:
13982 case AARCH64_CMODEL_SMALL:
13983 case AARCH64_CMODEL_SMALL_PIC:
13984 case AARCH64_CMODEL_SMALL_SPIC:
13985 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13986 for everything. */
13987 type = DW_EH_PE_sdata4;
13988 break;
13989 default:
13990 /* No assumptions here. 8-byte relocs required. */
13991 type = DW_EH_PE_sdata8;
13992 break;
13994 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13997 /* The last .arch and .tune assembly strings that we printed. */
13998 static std::string aarch64_last_printed_arch_string;
13999 static std::string aarch64_last_printed_tune_string;
14001 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14002 by the function fndecl. */
14004 void
14005 aarch64_declare_function_name (FILE *stream, const char* name,
14006 tree fndecl)
14008 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14010 struct cl_target_option *targ_options;
14011 if (target_parts)
14012 targ_options = TREE_TARGET_OPTION (target_parts);
14013 else
14014 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14015 gcc_assert (targ_options);
14017 const struct processor *this_arch
14018 = aarch64_get_arch (targ_options->x_explicit_arch);
14020 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14021 std::string extension
14022 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14023 this_arch->flags);
14024 /* Only update the assembler .arch string if it is distinct from the last
14025 such string we printed. */
14026 std::string to_print = this_arch->name + extension;
14027 if (to_print != aarch64_last_printed_arch_string)
14029 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14030 aarch64_last_printed_arch_string = to_print;
14033 /* Print the cpu name we're tuning for in the comments; it might be
14034 useful to readers of the generated asm. Do it only when it changes
14035 from function to function and verbose assembly is requested. */
14036 const struct processor *this_tune
14037 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14039 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14041 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14042 this_tune->name);
14043 aarch64_last_printed_tune_string = this_tune->name;
14046 /* Don't forget the type directive for ELF. */
14047 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14048 ASM_OUTPUT_LABEL (stream, name);
14051 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14053 static void
14054 aarch64_start_file (void)
14056 struct cl_target_option *default_options
14057 = TREE_TARGET_OPTION (target_option_default_node);
14059 const struct processor *default_arch
14060 = aarch64_get_arch (default_options->x_explicit_arch);
14061 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14062 std::string extension
14063 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14064 default_arch->flags);
14066 aarch64_last_printed_arch_string = default_arch->name + extension;
14067 aarch64_last_printed_tune_string = "";
14068 asm_fprintf (asm_out_file, "\t.arch %s\n",
14069 aarch64_last_printed_arch_string.c_str ());
14071 default_file_start ();
14074 /* Emit load exclusive. */
14076 static void
14077 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14078 rtx mem, rtx model_rtx)
14080 rtx (*gen) (rtx, rtx, rtx);
14082 switch (mode)
14084 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14085 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14086 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14087 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14088 default:
14089 gcc_unreachable ();
14092 emit_insn (gen (rval, mem, model_rtx));
14095 /* Emit store exclusive. */
14097 static void
14098 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14099 rtx rval, rtx mem, rtx model_rtx)
14101 rtx (*gen) (rtx, rtx, rtx, rtx);
14103 switch (mode)
14105 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14106 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14107 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14108 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14109 default:
14110 gcc_unreachable ();
14113 emit_insn (gen (bval, rval, mem, model_rtx));
14116 /* Emit jump instruction INSN and mark it as unlikely to be taken. */
14118 static void
14119 aarch64_emit_unlikely_jump (rtx insn)
14121 rtx_insn *jump = emit_jump_insn (insn);
14122 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14125 /* Expand a compare and swap pattern. */
14127 void
14128 aarch64_expand_compare_and_swap (rtx operands[])
14130 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14131 machine_mode mode, cmp_mode;
14132 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14133 int idx;
14134 gen_cas_fn gen;
14135 const gen_cas_fn split_cas[] =
14137 gen_aarch64_compare_and_swapqi,
14138 gen_aarch64_compare_and_swaphi,
14139 gen_aarch64_compare_and_swapsi,
14140 gen_aarch64_compare_and_swapdi
14142 const gen_cas_fn atomic_cas[] =
14144 gen_aarch64_compare_and_swapqi_lse,
14145 gen_aarch64_compare_and_swaphi_lse,
14146 gen_aarch64_compare_and_swapsi_lse,
14147 gen_aarch64_compare_and_swapdi_lse
14150 bval = operands[0];
14151 rval = operands[1];
14152 mem = operands[2];
14153 oldval = operands[3];
14154 newval = operands[4];
14155 is_weak = operands[5];
14156 mod_s = operands[6];
14157 mod_f = operands[7];
14158 mode = GET_MODE (mem);
14159 cmp_mode = mode;
14161 /* Normally the succ memory model must be stronger than fail, but in the
14162 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14163 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14165 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14166 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14167 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14169 switch (mode)
14171 case E_QImode:
14172 case E_HImode:
14173 /* For short modes, we're going to perform the comparison in SImode,
14174 so do the zero-extension now. */
14175 cmp_mode = SImode;
14176 rval = gen_reg_rtx (SImode);
14177 oldval = convert_modes (SImode, mode, oldval, true);
14178 /* Fall through. */
14180 case E_SImode:
14181 case E_DImode:
14182 /* Force the value into a register if needed. */
14183 if (!aarch64_plus_operand (oldval, mode))
14184 oldval = force_reg (cmp_mode, oldval);
14185 break;
14187 default:
14188 gcc_unreachable ();
14191 switch (mode)
14193 case E_QImode: idx = 0; break;
14194 case E_HImode: idx = 1; break;
14195 case E_SImode: idx = 2; break;
14196 case E_DImode: idx = 3; break;
14197 default:
14198 gcc_unreachable ();
14200 if (TARGET_LSE)
14201 gen = atomic_cas[idx];
14202 else
14203 gen = split_cas[idx];
14205 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14207 if (mode == QImode || mode == HImode)
14208 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14210 x = gen_rtx_REG (CCmode, CC_REGNUM);
14211 x = gen_rtx_EQ (SImode, x, const0_rtx);
14212 emit_insn (gen_rtx_SET (bval, x));
14215 /* Test whether the target supports using an atomic load-operate instruction
14216 to implement operation CODE. Returns FALSE if the operation isn't supported
14217 by the architecture. */
14221 bool
14222 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14224 if (!TARGET_LSE)
14225 return false;
14227 switch (code)
14229 case SET:
14230 case AND:
14231 case IOR:
14232 case XOR:
14233 case MINUS:
14234 case PLUS:
14235 return true;
14236 default:
14237 return false;
14241 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14242 sequence implementing an atomic operation. */
14244 static void
14245 aarch64_emit_post_barrier (enum memmodel model)
14247 const enum memmodel base_model = memmodel_base (model);
14249 if (is_mm_sync (model)
14250 && (base_model == MEMMODEL_ACQUIRE
14251 || base_model == MEMMODEL_ACQ_REL
14252 || base_model == MEMMODEL_SEQ_CST))
14254 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14258 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14259 for the data in memory. EXPECTED is the value expected to be in memory.
14260 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14261 is the memory ordering to use. */
14263 void
14264 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14265 rtx expected, rtx desired,
14266 rtx model)
14268 rtx (*gen) (rtx, rtx, rtx, rtx);
14269 machine_mode mode;
14271 mode = GET_MODE (mem);
14273 switch (mode)
14275 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14276 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14277 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14278 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14279 default:
14280 gcc_unreachable ();
14283 /* Move the expected value into the CAS destination register. */
14284 emit_insn (gen_rtx_SET (rval, expected));
14286 /* Emit the CAS. */
14287 emit_insn (gen (rval, mem, desired, model));
14289 /* Compare the expected value with the value loaded by the CAS, to establish
14290 whether the swap was made. */
14291 aarch64_gen_compare_reg (EQ, rval, expected);
14294 /* Split a compare and swap pattern. */
14296 void
14297 aarch64_split_compare_and_swap (rtx operands[])
14299 rtx rval, mem, oldval, newval, scratch;
14300 machine_mode mode;
14301 bool is_weak;
14302 rtx_code_label *label1, *label2;
14303 rtx x, cond;
14304 enum memmodel model;
14305 rtx model_rtx;
14307 rval = operands[0];
14308 mem = operands[1];
14309 oldval = operands[2];
14310 newval = operands[3];
14311 is_weak = (operands[4] != const0_rtx);
14312 model_rtx = operands[5];
14313 scratch = operands[7];
14314 mode = GET_MODE (mem);
14315 model = memmodel_from_int (INTVAL (model_rtx));
14317 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14318 loop:
14319 .label1:
14320 LD[A]XR rval, [mem]
14321 CBNZ rval, .label2
14322 ST[L]XR scratch, newval, [mem]
14323 CBNZ scratch, .label1
14324 .label2:
14325 CMP rval, 0. */
14326 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14328 label1 = NULL;
14329 if (!is_weak)
14331 label1 = gen_label_rtx ();
14332 emit_label (label1);
14334 label2 = gen_label_rtx ();
14336 /* The initial load can be relaxed for a __sync operation since a final
14337 barrier will be emitted to stop code hoisting. */
14338 if (is_mm_sync (model))
14339 aarch64_emit_load_exclusive (mode, rval, mem,
14340 GEN_INT (MEMMODEL_RELAXED));
14341 else
14342 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14344 if (strong_zero_p)
14346 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14347 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14348 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14349 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14351 else
14353 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14354 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14355 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14356 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14357 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14360 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14362 if (!is_weak)
14364 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14365 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14366 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14367 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14369 else
14371 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14372 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14373 emit_insn (gen_rtx_SET (cond, x));
14376 emit_label (label2);
14377 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
14378 to set the condition flags. If this is not used it will be removed by
14379 later passes. */
14380 if (strong_zero_p)
14382 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14383 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14384 emit_insn (gen_rtx_SET (cond, x));
14386 /* Emit any final barrier needed for a __sync operation. */
14387 if (is_mm_sync (model))
14388 aarch64_emit_post_barrier (model);
14391 /* Emit a BIC instruction. */
14393 static void
14394 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14396 rtx shift_rtx = GEN_INT (shift);
14397 rtx (*gen) (rtx, rtx, rtx, rtx);
14399 switch (mode)
14401 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14402 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14403 default:
14404 gcc_unreachable ();
14407 emit_insn (gen (dst, s2, shift_rtx, s1));
14410 /* Emit an atomic swap. */
14412 static void
14413 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14414 rtx mem, rtx model)
14416 rtx (*gen) (rtx, rtx, rtx, rtx);
14418 switch (mode)
14420 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14421 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14422 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14423 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14424 default:
14425 gcc_unreachable ();
14428 emit_insn (gen (dst, mem, value, model));
14431 /* Operations supported by aarch64_emit_atomic_load_op. */
14433 enum aarch64_atomic_load_op_code
14435 AARCH64_LDOP_PLUS, /* A + B */
14436 AARCH64_LDOP_XOR, /* A ^ B */
14437 AARCH64_LDOP_OR, /* A | B */
14438 AARCH64_LDOP_BIC /* A & ~B */
14441 /* Emit an atomic load-operate. */
14443 static void
14444 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14445 machine_mode mode, rtx dst, rtx src,
14446 rtx mem, rtx model)
14448 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14449 const aarch64_atomic_load_op_fn plus[] =
14451 gen_aarch64_atomic_loadaddqi,
14452 gen_aarch64_atomic_loadaddhi,
14453 gen_aarch64_atomic_loadaddsi,
14454 gen_aarch64_atomic_loadadddi
14456 const aarch64_atomic_load_op_fn eor[] =
14458 gen_aarch64_atomic_loadeorqi,
14459 gen_aarch64_atomic_loadeorhi,
14460 gen_aarch64_atomic_loadeorsi,
14461 gen_aarch64_atomic_loadeordi
14463 const aarch64_atomic_load_op_fn ior[] =
14465 gen_aarch64_atomic_loadsetqi,
14466 gen_aarch64_atomic_loadsethi,
14467 gen_aarch64_atomic_loadsetsi,
14468 gen_aarch64_atomic_loadsetdi
14470 const aarch64_atomic_load_op_fn bic[] =
14472 gen_aarch64_atomic_loadclrqi,
14473 gen_aarch64_atomic_loadclrhi,
14474 gen_aarch64_atomic_loadclrsi,
14475 gen_aarch64_atomic_loadclrdi
14477 aarch64_atomic_load_op_fn gen;
14478 int idx = 0;
14480 switch (mode)
14482 case E_QImode: idx = 0; break;
14483 case E_HImode: idx = 1; break;
14484 case E_SImode: idx = 2; break;
14485 case E_DImode: idx = 3; break;
14486 default:
14487 gcc_unreachable ();
14490 switch (code)
14492 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14493 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14494 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14495 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14496 default:
14497 gcc_unreachable ();
14500 emit_insn (gen (dst, mem, src, model));
14503 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14504 location to store the data read from memory. OUT_RESULT is the location to
14505 store the result of the operation. MEM is the memory location to read and
14506 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14507 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14508 be NULL. */
14510 void
14511 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14512 rtx mem, rtx value, rtx model_rtx)
14514 machine_mode mode = GET_MODE (mem);
14515 machine_mode wmode = (mode == DImode ? DImode : SImode);
14516 const bool short_mode = (mode < SImode);
14517 aarch64_atomic_load_op_code ldop_code;
14518 rtx src;
14519 rtx x;
14521 if (out_data)
14522 out_data = gen_lowpart (mode, out_data);
14524 if (out_result)
14525 out_result = gen_lowpart (mode, out_result);
14527 /* Make sure the value is in a register, putting it into a destination
14528 register if it needs to be manipulated. */
14529 if (!register_operand (value, mode)
14530 || code == AND || code == MINUS)
14532 src = out_result ? out_result : out_data;
14533 emit_move_insn (src, gen_lowpart (mode, value));
14535 else
14536 src = value;
14537 gcc_assert (register_operand (src, mode));
14539 /* Preprocess the data for the operation as necessary. If the operation is
14540 a SET then emit a swap instruction and finish. */
14541 switch (code)
14543 case SET:
14544 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14545 return;
14547 case MINUS:
14548 /* Negate the value and treat it as a PLUS. */
14550 rtx neg_src;
14552 /* Resize the value if necessary. */
14553 if (short_mode)
14554 src = gen_lowpart (wmode, src);
14556 neg_src = gen_rtx_NEG (wmode, src);
14557 emit_insn (gen_rtx_SET (src, neg_src));
14559 if (short_mode)
14560 src = gen_lowpart (mode, src);
14562 /* Fall-through. */
14563 case PLUS:
14564 ldop_code = AARCH64_LDOP_PLUS;
14565 break;
14567 case IOR:
14568 ldop_code = AARCH64_LDOP_OR;
14569 break;
14571 case XOR:
14572 ldop_code = AARCH64_LDOP_XOR;
14573 break;
14575 case AND:
14577 rtx not_src;
14579 /* Resize the value if necessary. */
14580 if (short_mode)
14581 src = gen_lowpart (wmode, src);
14583 not_src = gen_rtx_NOT (wmode, src);
14584 emit_insn (gen_rtx_SET (src, not_src));
14586 if (short_mode)
14587 src = gen_lowpart (mode, src);
14589 ldop_code = AARCH64_LDOP_BIC;
14590 break;
14592 default:
14593 /* The operation can't be done with atomic instructions. */
14594 gcc_unreachable ();
14597 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14599 /* If necessary, calculate the data in memory after the update by redoing the
14600 operation from values in registers. */
14601 if (!out_result)
14602 return;
14604 if (short_mode)
14606 src = gen_lowpart (wmode, src);
14607 out_data = gen_lowpart (wmode, out_data);
14608 out_result = gen_lowpart (wmode, out_result);
14611 x = NULL_RTX;
14613 switch (code)
14615 case MINUS:
14616 case PLUS:
14617 x = gen_rtx_PLUS (wmode, out_data, src);
14618 break;
14619 case IOR:
14620 x = gen_rtx_IOR (wmode, out_data, src);
14621 break;
14622 case XOR:
14623 x = gen_rtx_XOR (wmode, out_data, src);
14624 break;
14625 case AND:
14626 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14627 return;
14628 default:
14629 gcc_unreachable ();
14632 emit_set_insn (out_result, x);
14634 return;
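/* Sketch of the resulting mapping, assuming the standard LSE mnemonics:
     PLUS  -> LDADD value
     MINUS -> LDADD (-value)
     IOR   -> LDSET value
     XOR   -> LDEOR value
     AND   -> LDCLR (~value)
   with OUT_RESULT, when requested, recomputed afterwards from OUT_DATA and the
   (possibly negated or inverted) source register. */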
14637 /* Split an atomic operation. */
14639 void
14640 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14641 rtx value, rtx model_rtx, rtx cond)
14643 machine_mode mode = GET_MODE (mem);
14644 machine_mode wmode = (mode == DImode ? DImode : SImode);
14645 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14646 const bool is_sync = is_mm_sync (model);
14647 rtx_code_label *label;
14648 rtx x;
14650 /* Split the atomic operation into a sequence. */
14651 label = gen_label_rtx ();
14652 emit_label (label);
14654 if (new_out)
14655 new_out = gen_lowpart (wmode, new_out);
14656 if (old_out)
14657 old_out = gen_lowpart (wmode, old_out);
14658 else
14659 old_out = new_out;
14660 value = simplify_gen_subreg (wmode, value, mode, 0);
14662 /* The initial load can be relaxed for a __sync operation since a final
14663 barrier will be emitted to stop code hoisting. */
14664 if (is_sync)
14665 aarch64_emit_load_exclusive (mode, old_out, mem,
14666 GEN_INT (MEMMODEL_RELAXED));
14667 else
14668 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14670 switch (code)
14672 case SET:
14673 new_out = value;
14674 break;
14676 case NOT:
14677 x = gen_rtx_AND (wmode, old_out, value);
14678 emit_insn (gen_rtx_SET (new_out, x));
14679 x = gen_rtx_NOT (wmode, new_out);
14680 emit_insn (gen_rtx_SET (new_out, x));
14681 break;
14683 case MINUS:
14684 if (CONST_INT_P (value))
14686 value = GEN_INT (-INTVAL (value));
14687 code = PLUS;
14689 /* Fall through. */
14691 default:
14692 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14693 emit_insn (gen_rtx_SET (new_out, x));
14694 break;
14697 aarch64_emit_store_exclusive (mode, cond, mem,
14698 gen_lowpart (mode, new_out), model_rtx);
14700 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14701 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14702 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14703 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14705 /* Emit any final barrier needed for a __sync operation. */
14706 if (is_sync)
14707 aarch64_emit_post_barrier (model);
14710 static void
14711 aarch64_init_libfuncs (void)
14713 /* Half-precision float operations. The compiler handles all operations
14714 with NULL libfuncs by converting to SFmode. */
14716 /* Conversions. */
14717 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14718 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14720 /* Arithmetic. */
14721 set_optab_libfunc (add_optab, HFmode, NULL);
14722 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14723 set_optab_libfunc (smul_optab, HFmode, NULL);
14724 set_optab_libfunc (neg_optab, HFmode, NULL);
14725 set_optab_libfunc (sub_optab, HFmode, NULL);
14727 /* Comparisons. */
14728 set_optab_libfunc (eq_optab, HFmode, NULL);
14729 set_optab_libfunc (ne_optab, HFmode, NULL);
14730 set_optab_libfunc (lt_optab, HFmode, NULL);
14731 set_optab_libfunc (le_optab, HFmode, NULL);
14732 set_optab_libfunc (ge_optab, HFmode, NULL);
14733 set_optab_libfunc (gt_optab, HFmode, NULL);
14734 set_optab_libfunc (unord_optab, HFmode, NULL);
14737 /* Target hook for c_mode_for_suffix. */
14738 static machine_mode
14739 aarch64_c_mode_for_suffix (char suffix)
14741 if (suffix == 'q')
14742 return TFmode;
14744 return VOIDmode;
14747 /* We can only represent floating point constants which will fit in
14748 "quarter-precision" values. These values are characterised by
14749 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14752 (-1)^s * (n/16) * 2^r
14754 Where:
14755 's' is the sign bit.
14756 'n' is an integer in the range 16 <= n <= 31.
14757 'r' is an integer in the range -3 <= r <= 4. */
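/* For example, 1.5 is representable as (-1)^0 * (24/16) * 2^0 and 0.5 as
   (-1)^0 * (16/16) * 2^(-1), whereas 32.0 would need r = 5 and is therefore
   not representable, and 0.0 is excluded altogether (see below). */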
14759 /* Return true iff X can be represented by a quarter-precision
14760 floating point immediate operand. Note, we cannot represent 0.0. */
14761 bool
14762 aarch64_float_const_representable_p (rtx x)
14764 /* This represents our current view of how many bits
14765 make up the mantissa. */
14766 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14767 int exponent;
14768 unsigned HOST_WIDE_INT mantissa, mask;
14769 REAL_VALUE_TYPE r, m;
14770 bool fail;
14772 if (!CONST_DOUBLE_P (x))
14773 return false;
14775 /* We don't support HFmode constants yet. */
14776 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14777 return false;
14779 r = *CONST_DOUBLE_REAL_VALUE (x);
14781 /* We cannot represent infinities, NaNs or +/-zero. We won't
14782 know if we have +zero until we analyse the mantissa, but we
14783 can reject the other invalid values. */
14784 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14785 || REAL_VALUE_MINUS_ZERO (r))
14786 return false;
14788 /* Extract exponent. */
14789 r = real_value_abs (&r);
14790 exponent = REAL_EXP (&r);
14792 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14793 highest (sign) bit, with a fixed binary point at bit point_pos.
14794 m1 holds the low part of the mantissa, m2 the high part.
14795 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14796 bits for the mantissa, this can fail (low bits will be lost). */
14797 real_ldexp (&m, &r, point_pos - exponent);
14798 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14800 /* If the low part of the mantissa has bits set we cannot represent
14801 the value. */
14802 if (w.ulow () != 0)
14803 return false;
14804 /* We have rejected the lower HOST_WIDE_INT, so update our
14805 understanding of how many bits lie in the mantissa and
14806 look only at the high HOST_WIDE_INT. */
14807 mantissa = w.elt (1);
14808 point_pos -= HOST_BITS_PER_WIDE_INT;
14810 /* We can only represent values with a mantissa of the form 1.xxxx. */
14811 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14812 if ((mantissa & mask) != 0)
14813 return false;
14815 /* Having filtered unrepresentable values, we may now remove all
14816 but the highest 5 bits. */
14817 mantissa >>= point_pos - 5;
14819 /* We cannot represent the value 0.0, so reject it. This is handled
14820 elsewhere. */
14821 if (mantissa == 0)
14822 return false;
14824 /* Then, as bit 4 is always set, we can mask it off, leaving
14825 the mantissa in the range [0, 15]. */
14826 mantissa &= ~(1 << 4);
14827 gcc_assert (mantissa <= 15);
14829 /* GCC internally does not use IEEE754-like encoding (where normalized
14830 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14831 Our mantissa values are shifted 4 places to the left relative to
14832 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14833 by 5 places to correct for GCC's representation. */
14834 exponent = 5 - exponent;
14836 return (exponent >= 0 && exponent <= 7);
14839 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14840 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14841 output MOVI/MVNI, ORR or BIC immediate. */
14842 char*
14843 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14844 enum simd_immediate_check which)
14846 bool is_valid;
14847 static char templ[40];
14848 const char *mnemonic;
14849 const char *shift_op;
14850 unsigned int lane_count = 0;
14851 char element_char;
14853 struct simd_immediate_info info;
14855 /* This will return true to show const_vector is legal for use as either
14856 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14857 It will also update INFO to show how the immediate should be generated.
14858 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14859 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14860 gcc_assert (is_valid);
14862 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14863 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14865 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14867 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14868 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14869 move immediate path. */
14870 if (aarch64_float_const_zero_rtx_p (info.value))
14871 info.value = GEN_INT (0);
14872 else
14874 const unsigned int buf_size = 20;
14875 char float_buf[buf_size] = {'\0'};
14876 real_to_decimal_for_mode (float_buf,
14877 CONST_DOUBLE_REAL_VALUE (info.value),
14878 buf_size, buf_size, 1, info.elt_mode);
14880 if (lane_count == 1)
14881 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14882 else
14883 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14884 lane_count, element_char, float_buf);
14885 return templ;
14889 gcc_assert (CONST_INT_P (info.value));
14891 if (which == AARCH64_CHECK_MOV)
14893 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14894 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14895 if (lane_count == 1)
14896 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14897 mnemonic, UINTVAL (info.value));
14898 else if (info.shift)
14899 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14900 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14901 element_char, UINTVAL (info.value), shift_op, info.shift);
14902 else
14903 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14904 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14905 element_char, UINTVAL (info.value));
14907 else
14909 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14910 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14911 if (info.shift)
14912 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14913 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14914 element_char, UINTVAL (info.value), "lsl", info.shift);
14915 else
14916 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14917 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14918 element_char, UINTVAL (info.value));
14920 return templ;
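/* As an illustration (assuming the usual AdvSIMD spellings), a V4SImode
   vector with every element equal to 0x20000 would be emitted via the MOV
   path above as something like "movi\t%0.4s, 0x2, lsl 16", while the ORR
   path for the same value would use "orr\t%0.4s, #2, lsl #16". */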
14923 char*
14924 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14927 /* If a floating point number was passed and we desire to use it in an
14928 integer mode do the conversion to integer. */
14929 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14931 unsigned HOST_WIDE_INT ival;
14932 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14933 gcc_unreachable ();
14934 immediate = gen_int_mode (ival, mode);
14937 machine_mode vmode;
14938 /* Use a 64-bit container mode for everything except DImode/DFmode, where we
14939 use a 128-bit vector mode. */
14940 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14942 vmode = aarch64_simd_container_mode (mode, width);
14943 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14944 return aarch64_output_simd_mov_immediate (v_op, width);
14947 /* Return the output string to use for moving immediate CONST_VECTOR
14948 into an SVE register. */
14950 char *
14951 aarch64_output_sve_mov_immediate (rtx const_vector)
14953 static char templ[40];
14954 struct simd_immediate_info info;
14955 char element_char;
14957 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14958 gcc_assert (is_valid);
14960 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14962 if (info.step)
14964 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14965 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14966 element_char, INTVAL (info.value), INTVAL (info.step));
14967 return templ;
14970 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14972 if (aarch64_float_const_zero_rtx_p (info.value))
14973 info.value = GEN_INT (0);
14974 else
14976 const int buf_size = 20;
14977 char float_buf[buf_size] = {};
14978 real_to_decimal_for_mode (float_buf,
14979 CONST_DOUBLE_REAL_VALUE (info.value),
14980 buf_size, buf_size, 1, info.elt_mode);
14982 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14983 element_char, float_buf);
14984 return templ;
14988 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14989 element_char, INTVAL (info.value));
14990 return templ;
14993 /* Return the asm format for a PTRUE instruction whose destination has
14994 mode MODE. SUFFIX is the element size suffix. */
14996 char *
14997 aarch64_output_ptrue (machine_mode mode, char suffix)
14999 unsigned int nunits;
15000 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15001 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15002 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15003 else
15004 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15005 return buf;
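/* For example, with a fixed 256-bit SVE vector length and SUFFIX 'd'
   (four 64-bit elements) this produces "ptrue\t%0.d, vl4"; when the vector
   length is not a compile-time constant it falls back to "ptrue\t%0.d, all". */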
15008 /* Split operands into moves from op[1] + op[2] into op[0]. */
15010 void
15011 aarch64_split_combinev16qi (rtx operands[3])
15013 unsigned int dest = REGNO (operands[0]);
15014 unsigned int src1 = REGNO (operands[1]);
15015 unsigned int src2 = REGNO (operands[2]);
15016 machine_mode halfmode = GET_MODE (operands[1]);
15017 unsigned int halfregs = REG_NREGS (operands[1]);
15018 rtx destlo, desthi;
15020 gcc_assert (halfmode == V16QImode);
15022 if (src1 == dest && src2 == dest + halfregs)
15024 /* No-op move. Can't split to nothing; emit something. */
15025 emit_note (NOTE_INSN_DELETED);
15026 return;
15029 /* Preserve register attributes for variable tracking. */
15030 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15031 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15032 GET_MODE_SIZE (halfmode));
15034 /* Special case of reversed high/low parts. */
15035 if (reg_overlap_mentioned_p (operands[2], destlo)
15036 && reg_overlap_mentioned_p (operands[1], desthi))
15038 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15039 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15040 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15042 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15044 /* Try to avoid unnecessary moves if part of the result
15045 is in the right place already. */
15046 if (src1 != dest)
15047 emit_move_insn (destlo, operands[1]);
15048 if (src2 != dest + halfregs)
15049 emit_move_insn (desthi, operands[2]);
15051 else
15053 if (src2 != dest + halfregs)
15054 emit_move_insn (desthi, operands[2]);
15055 if (src1 != dest)
15056 emit_move_insn (destlo, operands[1]);
15060 /* vec_perm support. */
15062 struct expand_vec_perm_d
15064 rtx target, op0, op1;
15065 vec_perm_indices perm;
15066 machine_mode vmode;
15067 unsigned int vec_flags;
15068 bool one_vector_p;
15069 bool testing_p;
15072 /* Generate a variable permutation. */
15074 static void
15075 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15077 machine_mode vmode = GET_MODE (target);
15078 bool one_vector_p = rtx_equal_p (op0, op1);
15080 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15081 gcc_checking_assert (GET_MODE (op0) == vmode);
15082 gcc_checking_assert (GET_MODE (op1) == vmode);
15083 gcc_checking_assert (GET_MODE (sel) == vmode);
15084 gcc_checking_assert (TARGET_SIMD);
15086 if (one_vector_p)
15088 if (vmode == V8QImode)
15090 /* Expand the argument to a V16QI mode by duplicating it. */
15091 rtx pair = gen_reg_rtx (V16QImode);
15092 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15093 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15095 else
15097 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15100 else
15102 rtx pair;
15104 if (vmode == V8QImode)
15106 pair = gen_reg_rtx (V16QImode);
15107 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15108 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15110 else
15112 pair = gen_reg_rtx (OImode);
15113 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15114 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15119 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15120 NELT is the number of elements in the vector. */
15122 void
15123 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15124 unsigned int nelt)
15126 machine_mode vmode = GET_MODE (target);
15127 bool one_vector_p = rtx_equal_p (op0, op1);
15128 rtx mask;
15130 /* The TBL instruction does not use a modulo index, so we must take care
15131 of that ourselves. */
15132 mask = aarch64_simd_gen_const_vector_dup (vmode,
15133 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15134 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15136 /* For big-endian, we also need to reverse the index within the vector
15137 (but not which vector). */
15138 if (BYTES_BIG_ENDIAN)
15140 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15141 if (!one_vector_p)
15142 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15143 sel = expand_simple_binop (vmode, XOR, sel, mask,
15144 NULL, 0, OPTAB_LIB_WIDEN);
15146 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15149 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15151 static void
15152 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15154 emit_insn (gen_rtx_SET (target,
15155 gen_rtx_UNSPEC (GET_MODE (target),
15156 gen_rtvec (2, op0, op1), code)));
15159 /* Expand an SVE vec_perm with the given operands. */
15161 void
15162 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15164 machine_mode data_mode = GET_MODE (target);
15165 machine_mode sel_mode = GET_MODE (sel);
15166 /* Enforced by the pattern condition. */
15167 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15169 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15170 size of the two value vectors, i.e. the upper bits of the indices
15171 are effectively ignored. SVE TBL instead produces 0 for any
15172 out-of-range indices, so we need to modulo all the vec_perm indices
15173 to ensure they are all in range. */
15174 rtx sel_reg = force_reg (sel_mode, sel);
15176 /* Check if the sel only references the first values vector. */
15177 if (GET_CODE (sel) == CONST_VECTOR
15178 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15180 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15181 return;
15184 /* Check if the two values vectors are the same. */
15185 if (rtx_equal_p (op0, op1))
15187 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15188 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15189 NULL, 0, OPTAB_DIRECT);
15190 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15191 return;
15194 /* Run TBL on each value vector and combine the results. */
15196 rtx res0 = gen_reg_rtx (data_mode);
15197 rtx res1 = gen_reg_rtx (data_mode);
15198 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15199 if (GET_CODE (sel) != CONST_VECTOR
15200 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15202 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15203 2 * nunits - 1);
15204 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15205 NULL, 0, OPTAB_DIRECT);
15207 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15208 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15209 NULL, 0, OPTAB_DIRECT);
15210 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15211 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15212 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15213 else
15214 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15217 /* Recognize patterns suitable for the TRN instructions. */
15218 static bool
15219 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15221 HOST_WIDE_INT odd;
15222 poly_uint64 nelt = d->perm.length ();
15223 rtx out, in0, in1, x;
15224 machine_mode vmode = d->vmode;
15226 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15227 return false;
15229 /* Note that these are little-endian tests.
15230 We correct for big-endian later. */
15231 if (!d->perm[0].is_constant (&odd)
15232 || (odd != 0 && odd != 1)
15233 || !d->perm.series_p (0, 2, odd, 2)
15234 || !d->perm.series_p (1, 2, nelt + odd, 2))
15235 return false;
15237 /* Success! */
15238 if (d->testing_p)
15239 return true;
15241 in0 = d->op0;
15242 in1 = d->op1;
15243 /* We don't need a big-endian lane correction for SVE; see the comment
15244 at the head of aarch64-sve.md for details. */
15245 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15247 x = in0, in0 = in1, in1 = x;
15248 odd = !odd;
15250 out = d->target;
15252 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15253 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15254 return true;
15257 /* Recognize patterns suitable for the UZP instructions. */
15258 static bool
15259 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15261 HOST_WIDE_INT odd;
15262 rtx out, in0, in1, x;
15263 machine_mode vmode = d->vmode;
15265 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15266 return false;
15268 /* Note that these are little-endian tests.
15269 We correct for big-endian later. */
15270 if (!d->perm[0].is_constant (&odd)
15271 || (odd != 0 && odd != 1)
15272 || !d->perm.series_p (0, 1, odd, 2))
15273 return false;
15275 /* Success! */
15276 if (d->testing_p)
15277 return true;
15279 in0 = d->op0;
15280 in1 = d->op1;
15281 /* We don't need a big-endian lane correction for SVE; see the comment
15282 at the head of aarch64-sve.md for details. */
15283 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15285 x = in0, in0 = in1, in1 = x;
15286 odd = !odd;
15288 out = d->target;
15290 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15291 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15292 return true;
15295 /* Recognize patterns suitable for the ZIP instructions. */
15296 static bool
15297 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15299 unsigned int high;
15300 poly_uint64 nelt = d->perm.length ();
15301 rtx out, in0, in1, x;
15302 machine_mode vmode = d->vmode;
15304 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15305 return false;
15307 /* Note that these are little-endian tests.
15308 We correct for big-endian later. */
15309 poly_uint64 first = d->perm[0];
15310 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15311 || !d->perm.series_p (0, 2, first, 1)
15312 || !d->perm.series_p (1, 2, first + nelt, 1))
15313 return false;
15314 high = maybe_ne (first, 0U);
15316 /* Success! */
15317 if (d->testing_p)
15318 return true;
15320 in0 = d->op0;
15321 in1 = d->op1;
15322 /* We don't need a big-endian lane correction for SVE; see the comment
15323 at the head of aarch64-sve.md for details. */
15324 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15326 x = in0, in0 = in1, in1 = x;
15327 high = !high;
15329 out = d->target;
15331 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15332 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15333 return true;
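/* Taking V4SImode (four elements per vector) as an example, the little-endian
   permute index vectors recognized by the three routines above (before the
   big-endian correction) are:
     TRN1 { 0, 4, 2, 6 }   TRN2 { 1, 5, 3, 7 }
     UZP1 { 0, 2, 4, 6 }   UZP2 { 1, 3, 5, 7 }
     ZIP1 { 0, 4, 1, 5 }   ZIP2 { 2, 6, 3, 7 }
   where indices 4..7 refer to lanes of the second input vector. */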
15336 /* Recognize patterns for the EXT insn. */
15338 static bool
15339 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15341 HOST_WIDE_INT location;
15342 rtx offset;
15344 /* The first element always refers to the first vector.
15345 Check if the extracted indices are increasing by one. */
15346 if (d->vec_flags == VEC_SVE_PRED
15347 || !d->perm[0].is_constant (&location)
15348 || !d->perm.series_p (0, 1, location, 1))
15349 return false;
15351 /* Success! */
15352 if (d->testing_p)
15353 return true;
15355 /* The case where (location == 0) is a no-op for both big- and little-endian,
15356 and is removed by the mid-end at optimization levels -O1 and higher.
15358 We don't need a big-endian lane correction for SVE; see the comment
15359 at the head of aarch64-sve.md for details. */
15360 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15362 /* After setup, we want the high elements of the first vector (stored
15363 at the LSB end of the register), and the low elements of the second
15364 vector (stored at the MSB end of the register). So swap. */
15365 std::swap (d->op0, d->op1);
15366 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15367 to_constant () is safe since this is restricted to Advanced SIMD
15368 vectors. */
15369 location = d->perm.length ().to_constant () - location;
15372 offset = GEN_INT (location);
15373 emit_set_insn (d->target,
15374 gen_rtx_UNSPEC (d->vmode,
15375 gen_rtvec (3, d->op0, d->op1, offset),
15376 UNSPEC_EXT));
15377 return true;
15380 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15381 within each 64-bit, 32-bit or 16-bit granule. */
15383 static bool
15384 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15386 HOST_WIDE_INT diff;
15387 unsigned int i, size, unspec;
15388 machine_mode pred_mode;
15390 if (d->vec_flags == VEC_SVE_PRED
15391 || !d->one_vector_p
15392 || !d->perm[0].is_constant (&diff))
15393 return false;
15395 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15396 if (size == 8)
15398 unspec = UNSPEC_REV64;
15399 pred_mode = VNx2BImode;
15401 else if (size == 4)
15403 unspec = UNSPEC_REV32;
15404 pred_mode = VNx4BImode;
15406 else if (size == 2)
15408 unspec = UNSPEC_REV16;
15409 pred_mode = VNx8BImode;
15411 else
15412 return false;
15414 unsigned int step = diff + 1;
15415 for (i = 0; i < step; ++i)
15416 if (!d->perm.series_p (i, step, diff - i, step))
15417 return false;
15419 /* Success! */
15420 if (d->testing_p)
15421 return true;
15423 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15424 if (d->vec_flags == VEC_SVE_DATA)
15426 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15427 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15428 UNSPEC_MERGE_PTRUE);
15430 emit_set_insn (d->target, src);
15431 return true;
15434 /* Recognize patterns for the REV insn, which reverses elements within
15435 a full vector. */
15437 static bool
15438 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15440 poly_uint64 nelt = d->perm.length ();
15442 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15443 return false;
15445 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15446 return false;
15448 /* Success! */
15449 if (d->testing_p)
15450 return true;
15452 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15453 emit_set_insn (d->target, src);
15454 return true;
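/* Recognize permutations that broadcast a single element of the input
   vector, which can be implemented with the DUP instruction (or its SVE
   equivalent).  */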
15457 static bool
15458 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15460 rtx out = d->target;
15461 rtx in0;
15462 HOST_WIDE_INT elt;
15463 machine_mode vmode = d->vmode;
15464 rtx lane;
15466 if (d->vec_flags == VEC_SVE_PRED
15467 || d->perm.encoding ().encoded_nelts () != 1
15468 || !d->perm[0].is_constant (&elt))
15469 return false;
15471 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15472 return false;
15474 /* Success! */
15475 if (d->testing_p)
15476 return true;
15478 /* The generic preparation in aarch64_expand_vec_perm_const_1
15479 swaps the operand order and the permute indices if it finds
15480 d->perm[0] to be in the second operand. Thus, we can always
15481 use d->op0 and need not do any extra arithmetic to get the
15482 correct lane number. */
15483 in0 = d->op0;
15484 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15486 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15487 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15488 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15489 return true;
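/* Recognize any remaining constant permutation of QImode vectors and
   implement it with a TBL instruction, materializing the selector as a
   constant vector.  */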
15492 static bool
15493 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15495 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15496 machine_mode vmode = d->vmode;
15498 /* Make sure that the indices are constant. */
15499 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15500 for (unsigned int i = 0; i < encoded_nelts; ++i)
15501 if (!d->perm[i].is_constant ())
15502 return false;
15504 if (d->testing_p)
15505 return true;
15507 /* Generic code will try constant permutation twice. Once with the
15508 original mode and again with the elements lowered to QImode.
15509 So wait and don't do the selector expansion ourselves. */
15510 if (vmode != V8QImode && vmode != V16QImode)
15511 return false;
15513 /* to_constant is safe since this routine is specific to Advanced SIMD
15514 vectors. */
15515 unsigned int nelt = d->perm.length ().to_constant ();
15516 for (unsigned int i = 0; i < nelt; ++i)
15517 /* If big-endian and two vectors we end up with a weird mixed-endian
15518 mode on NEON. Reverse the index within each word but not the word
15519 itself. to_constant is safe because we checked is_constant above. */
15520 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15521 ? d->perm[i].to_constant () ^ (nelt - 1)
15522 : d->perm[i].to_constant ());
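/* Illustration (not in the original source): with nelt == 16, XORing an
   index with nelt - 1 flips only its low four bits, so index 3 (lane 3 of
   the first vector) becomes 12 and index 19 (lane 3 of the second vector)
   becomes 28; which vector is selected is preserved.  */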
15524 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15525 sel = force_reg (vmode, sel);
15527 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15528 return true;
15531 /* Try to implement D using an SVE TBL instruction. */
15533 static bool
15534 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15536 unsigned HOST_WIDE_INT nelt;
15538 /* Permuting two variable-length vectors could overflow the
15539 index range. */
15540 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15541 return false;
15543 if (d->testing_p)
15544 return true;
15546 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15547 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15548 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15549 return true;
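/* Try to expand the constant permutation described by D.  The special-case
   recognizers above are tried first; if none of them match, fall back to a
   general table-based permutation.  Return true on success.  */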
15552 static bool
15553 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15555 /* The pattern matching functions above are written to look for a small
15556 number to begin the sequence (0, 1, N/2). If we begin with an index
15557 from the second operand, we can swap the operands. */
15558 poly_int64 nelt = d->perm.length ();
15559 if (known_ge (d->perm[0], nelt))
15561 d->perm.rotate_inputs (1);
15562 std::swap (d->op0, d->op1);
15565 if ((d->vec_flags == VEC_ADVSIMD
15566 || d->vec_flags == VEC_SVE_DATA
15567 || d->vec_flags == VEC_SVE_PRED)
15568 && known_gt (nelt, 1))
15570 if (aarch64_evpc_rev_local (d))
15571 return true;
15572 else if (aarch64_evpc_rev_global (d))
15573 return true;
15574 else if (aarch64_evpc_ext (d))
15575 return true;
15576 else if (aarch64_evpc_dup (d))
15577 return true;
15578 else if (aarch64_evpc_zip (d))
15579 return true;
15580 else if (aarch64_evpc_uzp (d))
15581 return true;
15582 else if (aarch64_evpc_trn (d))
15583 return true;
15584 if (d->vec_flags == VEC_SVE_DATA)
15585 return aarch64_evpc_sve_tbl (d);
15586 else if (d->vec_flags == VEC_ADVSIMD)
15587 return aarch64_evpc_tbl (d);
15589 return false;
15592 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15594 static bool
15595 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15596 rtx op1, const vec_perm_indices &sel)
15598 struct expand_vec_perm_d d;
15600 /* Check whether the mask can be applied to a single vector. */
15601 if (op0 && rtx_equal_p (op0, op1))
15602 d.one_vector_p = true;
15603 else if (sel.all_from_input_p (0))
15605 d.one_vector_p = true;
15606 op1 = op0;
15608 else if (sel.all_from_input_p (1))
15610 d.one_vector_p = true;
15611 op0 = op1;
15613 else
15614 d.one_vector_p = false;
15616 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15617 sel.nelts_per_input ());
15618 d.vmode = vmode;
15619 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15620 d.target = target;
15621 d.op0 = op0;
15622 d.op1 = op1;
15623 d.testing_p = !target;
15625 if (!d.testing_p)
15626 return aarch64_expand_vec_perm_const_1 (&d);
15628 rtx_insn *last = get_last_insn ();
15629 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15630 gcc_assert (last == get_last_insn ());
15632 return ret;
15635 /* Generate a byte permute mask for a register of mode MODE,
15636 which has NUNITS units. */
15639 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15641 /* We have to reverse each vector because we don't have
15642 a permuted load that can reverse-load according to ABI rules. */
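/* Illustration (not in the original source): for V8HImode, usize == 2 and
   nunits == 8, so the selector bytes are 1, 0, 3, 2, ..., 15, 14; the byte
   order is reversed within each element while the elements themselves stay
   in place.  */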
15643 rtx mask;
15644 rtvec v = rtvec_alloc (16);
15645 unsigned int i, j;
15646 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15648 gcc_assert (BYTES_BIG_ENDIAN);
15649 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15651 for (i = 0; i < nunits; i++)
15652 for (j = 0; j < usize; j++)
15653 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15654 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15655 return force_reg (V16QImode, mask);
15658 /* Return true if X is a valid second operand for the SVE instruction
15659 that implements integer comparison OP_CODE. */
15661 static bool
15662 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15664 if (register_operand (x, VOIDmode))
15665 return true;
15667 switch (op_code)
15669 case LTU:
15670 case LEU:
15671 case GEU:
15672 case GTU:
15673 return aarch64_sve_cmp_immediate_p (x, false);
15674 case LT:
15675 case LE:
15676 case GE:
15677 case GT:
15678 case NE:
15679 case EQ:
15680 return aarch64_sve_cmp_immediate_p (x, true);
15681 default:
15682 gcc_unreachable ();
15686 /* Return the UNSPEC_COND_* code for comparison CODE. */
15688 static unsigned int
15689 aarch64_unspec_cond_code (rtx_code code)
15691 switch (code)
15693 case NE:
15694 return UNSPEC_COND_NE;
15695 case EQ:
15696 return UNSPEC_COND_EQ;
15697 case LT:
15698 return UNSPEC_COND_LT;
15699 case GT:
15700 return UNSPEC_COND_GT;
15701 case LE:
15702 return UNSPEC_COND_LE;
15703 case GE:
15704 return UNSPEC_COND_GE;
15705 case LTU:
15706 return UNSPEC_COND_LO;
15707 case GTU:
15708 return UNSPEC_COND_HI;
15709 case LEU:
15710 return UNSPEC_COND_LS;
15711 case GEU:
15712 return UNSPEC_COND_HS;
15713 case UNORDERED:
15714 return UNSPEC_COND_UO;
15715 default:
15716 gcc_unreachable ();
15720 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15721 where <X> is the operation associated with comparison CODE. */
15723 static rtx
15724 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15725 rtx pred, rtx op0, rtx op1)
15727 rtvec vec = gen_rtvec (3, pred, op0, op1);
15728 return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15731 /* Expand an SVE integer comparison:
15733 TARGET = CODE (OP0, OP1). */
15735 void
15736 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15738 machine_mode pred_mode = GET_MODE (target);
15739 machine_mode data_mode = GET_MODE (op0);
15741 if (!aarch64_sve_cmp_operand_p (code, op1))
15742 op1 = force_reg (data_mode, op1);
15744 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15745 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15746 emit_insn (gen_set_clobber_cc (target, unspec));
15749 /* Emit an instruction:
15751 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15753 where <X> is the operation associated with comparison CODE. */
15755 static void
15756 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15757 rtx pred, rtx op0, rtx op1)
15759 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15760 emit_set_insn (target, unspec);
15763 /* Emit:
15765 (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15766 (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15767 (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15769 where <Xi> is the operation associated with comparison CODEi. */
15771 static void
15772 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15773 machine_mode pred_mode, rtx ptrue,
15774 rtx op0, rtx op1)
15776 rtx tmp1 = gen_reg_rtx (pred_mode);
15777 aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15778 rtx tmp2 = gen_reg_rtx (pred_mode);
15779 aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15780 emit_set_insn (target, gen_rtx_AND (pred_mode,
15781 gen_rtx_IOR (pred_mode, tmp1, tmp2),
15782 ptrue));
15785 /* If CAN_INVERT_P, emit an instruction:
15787 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15789 where <X> is the operation associated with comparison CODE. Otherwise
15790 emit:
15792 (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15793 (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15795 where the second instruction sets TARGET to the inverse of TMP. */
15797 static void
15798 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15799 machine_mode pred_mode, rtx ptrue, rtx pred,
15800 rtx op0, rtx op1, bool can_invert_p)
15802 if (can_invert_p)
15803 aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15804 else
15806 rtx tmp = gen_reg_rtx (pred_mode);
15807 aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15808 emit_set_insn (target, gen_rtx_AND (pred_mode,
15809 gen_rtx_NOT (pred_mode, tmp),
15810 ptrue));
15814 /* Expand an SVE floating-point comparison:
15816 TARGET = CODE (OP0, OP1)
15818 If CAN_INVERT_P is true, the caller can also handle inverted results;
15819 return true if the result is in fact inverted. */
15821 bool
15822 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15823 rtx op0, rtx op1, bool can_invert_p)
15825 machine_mode pred_mode = GET_MODE (target);
15826 machine_mode data_mode = GET_MODE (op0);
15828 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15829 switch (code)
15831 case UNORDERED:
15832 /* UNORDERED has no immediate form. */
15833 op1 = force_reg (data_mode, op1);
15834 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15835 return false;
15837 case LT:
15838 case LE:
15839 case GT:
15840 case GE:
15841 case EQ:
15842 case NE:
15843 /* There is native support for the comparison. */
15844 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15845 return false;
15847 case ORDERED:
15848 /* There is native support for the inverse comparison. */
15849 op1 = force_reg (data_mode, op1);
15850 aarch64_emit_inverted_unspec_cond (target, UNORDERED,
15851 pred_mode, ptrue, ptrue, op0, op1,
15852 can_invert_p);
15853 return can_invert_p;
15855 case LTGT:
15856 /* This is a trapping operation (LT or GT). */
15857 aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
15858 return false;
15860 case UNEQ:
15861 if (!flag_trapping_math)
15863 /* This would trap for signaling NaNs. */
15864 op1 = force_reg (data_mode, op1);
15865 aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
15866 pred_mode, ptrue, op0, op1);
15867 return false;
15869 /* fall through */
15871 case UNLT:
15872 case UNLE:
15873 case UNGT:
15874 case UNGE:
15876 rtx ordered = ptrue;
15877 if (flag_trapping_math)
15879 /* Only compare the elements that are known to be ordered. */
15880 ordered = gen_reg_rtx (pred_mode);
15881 op1 = force_reg (data_mode, op1);
15882 aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
15883 ptrue, ptrue, op0, op1, false);
15885 if (code == UNEQ)
15886 code = NE;
15887 else
15888 code = reverse_condition_maybe_unordered (code);
15889 aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
15890 ordered, op0, op1, can_invert_p);
15891 return can_invert_p;
15894 default:
15895 gcc_unreachable ();
15899 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15900 of the data being selected and CMP_MODE is the mode of the values being
15901 compared. */
15903 void
15904 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15905 rtx *ops)
15907 machine_mode pred_mode
15908 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15909 GET_MODE_SIZE (cmp_mode)).require ();
15910 rtx pred = gen_reg_rtx (pred_mode);
15911 if (FLOAT_MODE_P (cmp_mode))
15913 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15914 ops[4], ops[5], true))
15915 std::swap (ops[1], ops[2]);
15917 else
15918 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15920 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15921 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15924 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15925 true. However due to issues with register allocation it is preferable
15926 to avoid tying integer scalar and FP scalar modes. Executing integer
15927 operations in general registers is better than treating them as scalar
15928 vector operations. This reduces latency and avoids redundant int<->FP
15929 moves. So tie modes if they are either the same class, or vector modes
15930 with other vector modes, vector structs or any scalar mode. */
15932 static bool
15933 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15935 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15936 return true;
15938 /* We specifically want to allow elements of "structure" modes to
15939 be tieable to the structure. This more general condition allows
15940 other rarer situations too. The reason we don't extend this to
15941 predicate modes is that there are no predicate structure modes
15942 nor any specific instructions for extracting part of a predicate
15943 register. */
15944 if (aarch64_vector_data_mode_p (mode1)
15945 && aarch64_vector_data_mode_p (mode2))
15946 return true;
15948 /* Also allow any scalar modes with vectors. */
15949 if (aarch64_vector_mode_supported_p (mode1)
15950 || aarch64_vector_mode_supported_p (mode2))
15951 return true;
15953 return false;
15956 /* Return a new RTX holding the result of moving POINTER forward by
15957 AMOUNT bytes. */
15959 static rtx
15960 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15962 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15964 return adjust_automodify_address (pointer, GET_MODE (pointer),
15965 next, amount);
15968 /* Return a new RTX holding the result of moving POINTER forward by the
15969 size of the mode it points to. */
15971 static rtx
15972 aarch64_progress_pointer (rtx pointer)
15974 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15977 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15978 MODE bytes. */
15980 static void
15981 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15982 machine_mode mode)
15984 rtx reg = gen_reg_rtx (mode);
15986 /* "Cast" the pointers to the correct mode. */
15987 *src = adjust_address (*src, mode, 0);
15988 *dst = adjust_address (*dst, mode, 0);
15989 /* Emit the memcpy. */
15990 emit_move_insn (reg, *src);
15991 emit_move_insn (*dst, reg);
15992 /* Move the pointers forward. */
15993 *src = aarch64_progress_pointer (*src);
15994 *dst = aarch64_progress_pointer (*dst);
15997 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15998 we succeed, otherwise return false. */
16000 bool
16001 aarch64_expand_movmem (rtx *operands)
16003 unsigned int n;
16004 rtx dst = operands[0];
16005 rtx src = operands[1];
16006 rtx base;
16007 bool speed_p = !optimize_function_for_size_p (cfun);
16009 /* When optimizing for size, give a better estimate of the length of a
16010 memcpy call, but use the default otherwise. */
16011 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16013 /* We can't do anything smart if the amount to copy is not constant. */
16014 if (!CONST_INT_P (operands[2]))
16015 return false;
16017 n = UINTVAL (operands[2]);
16019 /* Try to keep the number of instructions low. For cases below 16 bytes we
16020 need to make at most two moves. For cases above 16 bytes it will be one
16021 move for each 16 byte chunk, then at most two additional moves. */
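/* Illustration (not in the original source): with this strategy a 27-byte
   copy becomes one 16-byte (TImode) chunk, one 8-byte (DImode) chunk and a
   final 4-byte (SImode) copy that overlaps the previous chunk by one
   byte.  */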
16022 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16023 return false;
16025 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16026 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16028 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16029 src = adjust_automodify_address (src, VOIDmode, base, 0);
16031 /* Simple cases.  Copy 0-3 bytes as (if applicable) a 2-byte chunk, then a
16032 1-byte chunk. */
16033 if (n < 4)
16035 if (n >= 2)
16037 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16038 n -= 2;
16041 if (n == 1)
16042 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16044 return true;
16047 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16048 4-byte chunk, partially overlapping with the previously copied chunk. */
16049 if (n < 8)
16051 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16052 n -= 4;
16053 if (n > 0)
16055 int move = n - 4;
16057 src = aarch64_move_pointer (src, move);
16058 dst = aarch64_move_pointer (dst, move);
16059 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16061 return true;
16064 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16065 them, then (if applicable) an 8-byte chunk. */
16066 while (n >= 8)
16068 if (n / 16)
16070 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16071 n -= 16;
16073 else
16075 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16076 n -= 8;
16080 /* Finish the final bytes of the copy. We can always do this in one
16081 instruction. We either copy the exact amount we need, or partially
16082 overlap with the previous chunk we copied and copy 8 bytes. */
16083 if (n == 0)
16084 return true;
16085 else if (n == 1)
16086 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16087 else if (n == 2)
16088 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16089 else if (n == 4)
16090 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16091 else
16093 if (n == 3)
16095 src = aarch64_move_pointer (src, -1);
16096 dst = aarch64_move_pointer (dst, -1);
16097 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16099 else
16101 int move = n - 8;
16103 src = aarch64_move_pointer (src, move);
16104 dst = aarch64_move_pointer (dst, move);
16105 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16109 return true;
16112 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16113 SImode stores. Handle the case when the constant has identical
16114 bottom and top halves. This is beneficial when the two stores can be
16115 merged into an STP and we avoid synthesising potentially expensive
16116 immediates twice. Return true if such a split is possible. */
16118 bool
16119 aarch64_split_dimode_const_store (rtx dst, rtx src)
16121 rtx lo = gen_lowpart (SImode, src);
16122 rtx hi = gen_highpart_mode (SImode, DImode, src);
16124 bool size_p = optimize_function_for_size_p (cfun);
16126 if (!rtx_equal_p (lo, hi))
16127 return false;
16129 unsigned int orig_cost
16130 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16131 unsigned int lo_cost
16132 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16134 /* We want to transform:
16135 MOV x1, 49370
16136 MOVK x1, 0x140, lsl 16
16137 MOVK x1, 0xc0da, lsl 32
16138 MOVK x1, 0x140, lsl 48
16139 STR x1, [x0]
16140 into:
16141 MOV w1, 49370
16142 MOVK w1, 0x140, lsl 16
16143 STP w1, w1, [x0]
16144 So we want to perform this only when we save two instructions
16145 or more. When optimizing for size, however, accept any code size
16146 savings we can. */
16147 if (size_p && orig_cost <= lo_cost)
16148 return false;
16150 if (!size_p
16151 && (orig_cost <= lo_cost + 1))
16152 return false;
16154 rtx mem_lo = adjust_address (dst, SImode, 0);
16155 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16156 return false;
16158 rtx tmp_reg = gen_reg_rtx (SImode);
16159 aarch64_expand_mov_immediate (tmp_reg, lo);
16160 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16161 /* Don't emit an explicit store pair as this may not always be profitable.
16162 Let the sched-fusion logic decide whether to merge them. */
16163 emit_move_insn (mem_lo, tmp_reg);
16164 emit_move_insn (mem_hi, tmp_reg);
16166 return true;
16169 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16171 static unsigned HOST_WIDE_INT
16172 aarch64_asan_shadow_offset (void)
16174 return (HOST_WIDE_INT_1 << 36);
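/* Implement TARGET_GEN_CCMP_FIRST.  Expand the first comparison of a
   conditional-compare sequence: *PREP_SEQ receives the insns that prepare
   the operands and *GEN_SEQ the comparison itself.  Return an rtx that
   compares the CC register against zero, or NULL_RTX if the comparison
   cannot be handled.  */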
16177 static rtx
16178 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16179 int code, tree treeop0, tree treeop1)
16181 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16182 rtx op0, op1;
16183 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16184 insn_code icode;
16185 struct expand_operand ops[4];
16187 start_sequence ();
16188 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16190 op_mode = GET_MODE (op0);
16191 if (op_mode == VOIDmode)
16192 op_mode = GET_MODE (op1);
16194 switch (op_mode)
16196 case E_QImode:
16197 case E_HImode:
16198 case E_SImode:
16199 cmp_mode = SImode;
16200 icode = CODE_FOR_cmpsi;
16201 break;
16203 case E_DImode:
16204 cmp_mode = DImode;
16205 icode = CODE_FOR_cmpdi;
16206 break;
16208 case E_SFmode:
16209 cmp_mode = SFmode;
16210 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16211 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16212 break;
16214 case E_DFmode:
16215 cmp_mode = DFmode;
16216 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16217 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16218 break;
16220 default:
16221 end_sequence ();
16222 return NULL_RTX;
16225 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16226 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16227 if (!op0 || !op1)
16229 end_sequence ();
16230 return NULL_RTX;
16232 *prep_seq = get_insns ();
16233 end_sequence ();
16235 create_fixed_operand (&ops[0], op0);
16236 create_fixed_operand (&ops[1], op1);
16238 start_sequence ();
16239 if (!maybe_expand_insn (icode, 2, ops))
16241 end_sequence ();
16242 return NULL_RTX;
16244 *gen_seq = get_insns ();
16245 end_sequence ();
16247 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16248 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
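/* Implement TARGET_GEN_CCMP_NEXT.  Expand a subsequent comparison of a
   conditional-compare sequence as a conditional compare predicated on the
   result of PREV, combining the two conditions according to BIT_CODE
   (AND or IOR).  Return the new comparison rtx, or NULL_RTX on failure.  */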
16251 static rtx
16252 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16253 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16255 rtx op0, op1, target;
16256 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16257 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16258 insn_code icode;
16259 struct expand_operand ops[6];
16260 int aarch64_cond;
16262 push_to_sequence (*prep_seq);
16263 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16265 op_mode = GET_MODE (op0);
16266 if (op_mode == VOIDmode)
16267 op_mode = GET_MODE (op1);
16269 switch (op_mode)
16271 case E_QImode:
16272 case E_HImode:
16273 case E_SImode:
16274 cmp_mode = SImode;
16275 icode = CODE_FOR_ccmpsi;
16276 break;
16278 case E_DImode:
16279 cmp_mode = DImode;
16280 icode = CODE_FOR_ccmpdi;
16281 break;
16283 case E_SFmode:
16284 cmp_mode = SFmode;
16285 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16286 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16287 break;
16289 case E_DFmode:
16290 cmp_mode = DFmode;
16291 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16292 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16293 break;
16295 default:
16296 end_sequence ();
16297 return NULL_RTX;
16300 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16301 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16302 if (!op0 || !op1)
16304 end_sequence ();
16305 return NULL_RTX;
16307 *prep_seq = get_insns ();
16308 end_sequence ();
16310 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16311 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16313 if (bit_code != AND)
16315 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16316 GET_MODE (XEXP (prev, 0))),
16317 VOIDmode, XEXP (prev, 0), const0_rtx);
16318 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16321 create_fixed_operand (&ops[0], XEXP (prev, 0));
16322 create_fixed_operand (&ops[1], target);
16323 create_fixed_operand (&ops[2], op0);
16324 create_fixed_operand (&ops[3], op1);
16325 create_fixed_operand (&ops[4], prev);
16326 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16328 push_to_sequence (*gen_seq);
16329 if (!maybe_expand_insn (icode, 6, ops))
16331 end_sequence ();
16332 return NULL_RTX;
16335 *gen_seq = get_insns ();
16336 end_sequence ();
16338 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16341 #undef TARGET_GEN_CCMP_FIRST
16342 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16344 #undef TARGET_GEN_CCMP_NEXT
16345 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16347 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16348 instruction fusion of some sort. */
16350 static bool
16351 aarch64_macro_fusion_p (void)
16353 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16357 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16358 should be kept together during scheduling. */
16360 static bool
16361 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16363 rtx set_dest;
16364 rtx prev_set = single_set (prev);
16365 rtx curr_set = single_set (curr);
16366 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16367 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16369 if (!aarch64_macro_fusion_p ())
16370 return false;
16372 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16374 /* We are trying to match:
16375 prev (mov) == (set (reg r0) (const_int imm16))
16376 curr (movk) == (set (zero_extract (reg r0)
16377 (const_int 16)
16378 (const_int 16))
16379 (const_int imm16_1)) */
16381 set_dest = SET_DEST (curr_set);
16383 if (GET_CODE (set_dest) == ZERO_EXTRACT
16384 && CONST_INT_P (SET_SRC (curr_set))
16385 && CONST_INT_P (SET_SRC (prev_set))
16386 && CONST_INT_P (XEXP (set_dest, 2))
16387 && INTVAL (XEXP (set_dest, 2)) == 16
16388 && REG_P (XEXP (set_dest, 0))
16389 && REG_P (SET_DEST (prev_set))
16390 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16392 return true;
16396 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16399 /* We're trying to match:
16400 prev (adrp) == (set (reg r1)
16401 (high (symbol_ref ("SYM"))))
16402 curr (add) == (set (reg r0)
16403 (lo_sum (reg r1)
16404 (symbol_ref ("SYM"))))
16405 Note that r0 need not necessarily be the same as r1, especially
16406 during pre-regalloc scheduling. */
16408 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16409 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16411 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16412 && REG_P (XEXP (SET_SRC (curr_set), 0))
16413 && REGNO (XEXP (SET_SRC (curr_set), 0))
16414 == REGNO (SET_DEST (prev_set))
16415 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16416 XEXP (SET_SRC (curr_set), 1)))
16417 return true;
16421 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16424 /* We're trying to match:
16425 prev (movk) == (set (zero_extract (reg r0)
16426 (const_int 16)
16427 (const_int 32))
16428 (const_int imm16_1))
16429 curr (movk) == (set (zero_extract (reg r0)
16430 (const_int 16)
16431 (const_int 48))
16432 (const_int imm16_2)) */
16434 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16435 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16436 && REG_P (XEXP (SET_DEST (prev_set), 0))
16437 && REG_P (XEXP (SET_DEST (curr_set), 0))
16438 && REGNO (XEXP (SET_DEST (prev_set), 0))
16439 == REGNO (XEXP (SET_DEST (curr_set), 0))
16440 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16441 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16442 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16443 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16444 && CONST_INT_P (SET_SRC (prev_set))
16445 && CONST_INT_P (SET_SRC (curr_set)))
16446 return true;
16449 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16451 /* We're trying to match:
16452 prev (adrp) == (set (reg r0)
16453 (high (symbol_ref ("SYM"))))
16454 curr (ldr) == (set (reg r1)
16455 (mem (lo_sum (reg r0)
16456 (symbol_ref ("SYM")))))
16458 curr (ldr) == (set (reg r1)
16459 (zero_extend (mem
16460 (lo_sum (reg r0)
16461 (symbol_ref ("SYM")))))) */
16462 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16463 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16465 rtx curr_src = SET_SRC (curr_set);
16467 if (GET_CODE (curr_src) == ZERO_EXTEND)
16468 curr_src = XEXP (curr_src, 0);
16470 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16471 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16472 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16473 == REGNO (SET_DEST (prev_set))
16474 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16475 XEXP (SET_SRC (prev_set), 0)))
16476 return true;
16480 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16481 && aarch_crypto_can_dual_issue (prev, curr))
16482 return true;
16484 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16485 && any_condjump_p (curr))
16487 enum attr_type prev_type = get_attr_type (prev);
16489 unsigned int condreg1, condreg2;
16490 rtx cc_reg_1;
16491 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16492 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16494 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16495 && prev
16496 && modified_in_p (cc_reg_1, prev))
16498 /* FIXME: this misses some cases that are considered simple arithmetic
16499 instructions for ThunderX.  Simple shifts are missed here. */
16500 if (prev_type == TYPE_ALUS_SREG
16501 || prev_type == TYPE_ALUS_IMM
16502 || prev_type == TYPE_LOGICS_REG
16503 || prev_type == TYPE_LOGICS_IMM)
16504 return true;
16508 if (prev_set
16509 && curr_set
16510 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16511 && any_condjump_p (curr))
16513 /* We're trying to match:
16514 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16515 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16516 (const_int 0))
16517 (label_ref ("SYM"))
16518 (pc)) */
16519 if (SET_DEST (curr_set) == (pc_rtx)
16520 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16521 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16522 && REG_P (SET_DEST (prev_set))
16523 && REGNO (SET_DEST (prev_set))
16524 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16526 /* Fuse ALU operations followed by conditional branch instruction. */
16527 switch (get_attr_type (prev))
16529 case TYPE_ALU_IMM:
16530 case TYPE_ALU_SREG:
16531 case TYPE_ADC_REG:
16532 case TYPE_ADC_IMM:
16533 case TYPE_ADCS_REG:
16534 case TYPE_ADCS_IMM:
16535 case TYPE_LOGIC_REG:
16536 case TYPE_LOGIC_IMM:
16537 case TYPE_CSEL:
16538 case TYPE_ADR:
16539 case TYPE_MOV_IMM:
16540 case TYPE_SHIFT_REG:
16541 case TYPE_SHIFT_IMM:
16542 case TYPE_BFM:
16543 case TYPE_RBIT:
16544 case TYPE_REV:
16545 case TYPE_EXTEND:
16546 return true;
16548 default:;
16553 return false;
16556 /* Return true iff the instruction fusion described by OP is enabled. */
16558 bool
16559 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16561 return (aarch64_tune_params.fusible_ops & op) != 0;
16564 /* If MEM is in the form [base+offset], extract the two parts of the
16565 address and store them in BASE and OFFSET; otherwise return false
16566 after clearing BASE and OFFSET. */
16568 bool
16569 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16571 rtx addr;
16573 gcc_assert (MEM_P (mem));
16575 addr = XEXP (mem, 0);
16577 if (REG_P (addr))
16579 *base = addr;
16580 *offset = const0_rtx;
16581 return true;
16584 if (GET_CODE (addr) == PLUS
16585 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16587 *base = XEXP (addr, 0);
16588 *offset = XEXP (addr, 1);
16589 return true;
16592 *base = NULL_RTX;
16593 *offset = NULL_RTX;
16595 return false;
16598 /* Types for scheduling fusion. */
16599 enum sched_fusion_type
16601 SCHED_FUSION_NONE = 0,
16602 SCHED_FUSION_LD_SIGN_EXTEND,
16603 SCHED_FUSION_LD_ZERO_EXTEND,
16604 SCHED_FUSION_LD,
16605 SCHED_FUSION_ST,
16606 SCHED_FUSION_NUM
16609 /* If INSN is a load or store whose address is in the form [base+offset],
16610 extract the two parts and store them in BASE and OFFSET.  Return the
16611 scheduling fusion type of INSN. */
16613 static enum sched_fusion_type
16614 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16616 rtx x, dest, src;
16617 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16619 gcc_assert (INSN_P (insn));
16620 x = PATTERN (insn);
16621 if (GET_CODE (x) != SET)
16622 return SCHED_FUSION_NONE;
16624 src = SET_SRC (x);
16625 dest = SET_DEST (x);
16627 machine_mode dest_mode = GET_MODE (dest);
16629 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16630 return SCHED_FUSION_NONE;
16632 if (GET_CODE (src) == SIGN_EXTEND)
16634 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16635 src = XEXP (src, 0);
16636 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16637 return SCHED_FUSION_NONE;
16639 else if (GET_CODE (src) == ZERO_EXTEND)
16641 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16642 src = XEXP (src, 0);
16643 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16644 return SCHED_FUSION_NONE;
16647 if (GET_CODE (src) == MEM && REG_P (dest))
16648 extract_base_offset_in_addr (src, base, offset);
16649 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16651 fusion = SCHED_FUSION_ST;
16652 extract_base_offset_in_addr (dest, base, offset);
16654 else
16655 return SCHED_FUSION_NONE;
16657 if (*base == NULL_RTX || *offset == NULL_RTX)
16658 fusion = SCHED_FUSION_NONE;
16660 return fusion;
16663 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16665 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16666 and PRI are only calculated for these instructions.  For other instructions,
16667 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
16668 types of instruction fusion can be added by returning different priorities.
16670 It's important that irrelevant instructions get the largest FUSION_PRI. */
16672 static void
16673 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16674 int *fusion_pri, int *pri)
16676 int tmp, off_val;
16677 rtx base, offset;
16678 enum sched_fusion_type fusion;
16680 gcc_assert (INSN_P (insn));
16682 tmp = max_pri - 1;
16683 fusion = fusion_load_store (insn, &base, &offset);
16684 if (fusion == SCHED_FUSION_NONE)
16686 *pri = tmp;
16687 *fusion_pri = tmp;
16688 return;
16691 /* Set FUSION_PRI according to fusion type and base register. */
16692 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16694 /* Calculate PRI. */
16695 tmp /= 2;
16697 /* INSN with smaller offset goes first. */
16698 off_val = (int)(INTVAL (offset));
16699 if (off_val >= 0)
16700 tmp -= (off_val & 0xfffff);
16701 else
16702 tmp += ((- off_val) & 0xfffff);
16704 *pri = tmp;
16705 return;
16708 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16709 Adjust priority of sha1h instructions so they are scheduled before
16710 other SHA1 instructions. */
16712 static int
16713 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16715 rtx x = PATTERN (insn);
16717 if (GET_CODE (x) == SET)
16719 x = SET_SRC (x);
16721 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16722 return priority + 10;
16725 return priority;
16728 /* Given OPERANDS of consecutive load/store, check if we can merge
16729 them into ldp/stp. LOAD is true if they are load instructions.
16730 MODE is the mode of memory operands. */
16732 bool
16733 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16734 machine_mode mode)
16736 HOST_WIDE_INT offval_1, offval_2, msize;
16737 enum reg_class rclass_1, rclass_2;
16738 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16740 if (load)
16742 mem_1 = operands[1];
16743 mem_2 = operands[3];
16744 reg_1 = operands[0];
16745 reg_2 = operands[2];
16746 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16747 if (REGNO (reg_1) == REGNO (reg_2))
16748 return false;
16750 else
16752 mem_1 = operands[0];
16753 mem_2 = operands[2];
16754 reg_1 = operands[1];
16755 reg_2 = operands[3];
16758 /* The mems cannot be volatile. */
16759 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16760 return false;
16762 /* If we have SImode and slow unaligned ldp,
16763 check that the alignment is at least 8 bytes. */
16764 if (mode == SImode
16765 && (aarch64_tune_params.extra_tuning_flags
16766 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16767 && !optimize_size
16768 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16769 return false;
16771 /* Check if the addresses are in the form of [base+offset]. */
16772 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16773 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16774 return false;
16775 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16776 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16777 return false;
16779 /* Check if the bases are the same. */
16780 if (!rtx_equal_p (base_1, base_2))
16781 return false;
16783 offval_1 = INTVAL (offset_1);
16784 offval_2 = INTVAL (offset_2);
16785 /* We should only be trying this for fixed-sized modes. There is no
16786 SVE LDP/STP instruction. */
16787 msize = GET_MODE_SIZE (mode).to_constant ();
16788 /* Check if the offsets are consecutive. */
16789 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16790 return false;
16792 /* Check if the addresses are clobbered by load. */
16793 if (load)
16795 if (reg_mentioned_p (reg_1, mem_1))
16796 return false;
16798 /* In increasing order, the last load can clobber the address. */
16799 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16800 return false;
16803 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16804 rclass_1 = FP_REGS;
16805 else
16806 rclass_1 = GENERAL_REGS;
16808 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16809 rclass_2 = FP_REGS;
16810 else
16811 rclass_2 = GENERAL_REGS;
16813 /* Check if the registers are of the same class. */
16814 if (rclass_1 != rclass_2)
16815 return false;
16817 return true;
16820 /* Given OPERANDS of consecutive load/store, check if we can merge
16821 them into ldp/stp by adjusting the offset. LOAD is true if they
16822 are load instructions. MODE is the mode of memory operands.
16824 Given consecutive stores such as:
16826 str w1, [xb, 0x100]
16827 str w1, [xb, 0x104]
16828 str w1, [xb, 0x108]
16829 str w1, [xb, 0x10c]
16831 Though the offsets are out of the range supported by stp, we can
16832 still pair them after adjusting the offset, like:
16834 add scratch, xb, 0x100
16835 stp w1, w1, [scratch]
16836 stp w1, w1, [scratch, 0x8]
16838 The peephole patterns detecting this opportunity should guarantee
16839 the scratch register is available. */
16841 bool
16842 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16843 scalar_mode mode)
16845 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16846 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16847 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16848 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16850 if (load)
16852 reg_1 = operands[0];
16853 mem_1 = operands[1];
16854 reg_2 = operands[2];
16855 mem_2 = operands[3];
16856 reg_3 = operands[4];
16857 mem_3 = operands[5];
16858 reg_4 = operands[6];
16859 mem_4 = operands[7];
16860 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16861 && REG_P (reg_3) && REG_P (reg_4));
16862 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16863 return false;
16865 else
16867 mem_1 = operands[0];
16868 reg_1 = operands[1];
16869 mem_2 = operands[2];
16870 reg_2 = operands[3];
16871 mem_3 = operands[4];
16872 reg_3 = operands[5];
16873 mem_4 = operands[6];
16874 reg_4 = operands[7];
16876 /* Skip if the memory operand is by itself valid for ldp/stp. */
16877 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16878 return false;
16880 /* The mems cannot be volatile. */
16881 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16882 || MEM_VOLATILE_P (mem_3) ||MEM_VOLATILE_P (mem_4))
16883 return false;
16885 /* Check if the addresses are in the form of [base+offset]. */
16886 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16887 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16888 return false;
16889 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16890 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16891 return false;
16892 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16893 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16894 return false;
16895 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16896 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16897 return false;
16899 /* Check if the bases are the same. */
16900 if (!rtx_equal_p (base_1, base_2)
16901 || !rtx_equal_p (base_2, base_3)
16902 || !rtx_equal_p (base_3, base_4))
16903 return false;
16905 offval_1 = INTVAL (offset_1);
16906 offval_2 = INTVAL (offset_2);
16907 offval_3 = INTVAL (offset_3);
16908 offval_4 = INTVAL (offset_4);
16909 msize = GET_MODE_SIZE (mode);
16910 /* Check if the offsets are consecutive. */
16911 if ((offval_1 != (offval_2 + msize)
16912 || offval_1 != (offval_3 + msize * 2)
16913 || offval_1 != (offval_4 + msize * 3))
16914 && (offval_4 != (offval_3 + msize)
16915 || offval_4 != (offval_2 + msize * 2)
16916 || offval_4 != (offval_1 + msize * 3)))
16917 return false;
16919 /* Check if the addresses are clobbered by load. */
16920 if (load)
16922 if (reg_mentioned_p (reg_1, mem_1)
16923 || reg_mentioned_p (reg_2, mem_2)
16924 || reg_mentioned_p (reg_3, mem_3))
16925 return false;
16927 /* In increasing order, the last load can clobber the address. */
16928 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
16929 return false;
16932 /* If we have SImode and slow unaligned ldp,
16933 check that the alignment is at least 8 bytes. */
16934 if (mode == SImode
16935 && (aarch64_tune_params.extra_tuning_flags
16936 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16937 && !optimize_size
16938 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16939 return false;
16941 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16942 rclass_1 = FP_REGS;
16943 else
16944 rclass_1 = GENERAL_REGS;
16946 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16947 rclass_2 = FP_REGS;
16948 else
16949 rclass_2 = GENERAL_REGS;
16951 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
16952 rclass_3 = FP_REGS;
16953 else
16954 rclass_3 = GENERAL_REGS;
16956 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
16957 rclass_4 = FP_REGS;
16958 else
16959 rclass_4 = GENERAL_REGS;
16961 /* Check if the registers are of the same class. */
16962 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
16963 return false;
16965 return true;
16968 /* Given OPERANDS of consecutive load/store, this function pairs them
16969 into ldp/stp after adjusting the offset. It depends on the fact
16970 that addresses of load/store instructions are in increasing order.
16971 MODE is the mode of memory operands. CODE is the rtl operator
16972 which should be applied to all memory operands, it's SIGN_EXTEND,
16973 ZERO_EXTEND or UNKNOWN. */
16975 bool
16976 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
16977 scalar_mode mode, RTX_CODE code)
16979 rtx base, offset, t1, t2;
16980 rtx mem_1, mem_2, mem_3, mem_4;
16981 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
16983 if (load)
16985 mem_1 = operands[1];
16986 mem_2 = operands[3];
16987 mem_3 = operands[5];
16988 mem_4 = operands[7];
16990 else
16992 mem_1 = operands[0];
16993 mem_2 = operands[2];
16994 mem_3 = operands[4];
16995 mem_4 = operands[6];
16996 gcc_assert (code == UNKNOWN);
16999 extract_base_offset_in_addr (mem_1, &base, &offset);
17000 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
17002 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
17003 msize = GET_MODE_SIZE (mode);
17004 stp_off_limit = msize * 0x40;
17005 off_val = INTVAL (offset);
17006 abs_off = (off_val < 0) ? -off_val : off_val;
17007 new_off = abs_off % stp_off_limit;
17008 adj_off = abs_off - new_off;
17010 /* Further adjust to make sure all offsets are OK. */
17011 if ((new_off + msize * 2) >= stp_off_limit)
17013 adj_off += stp_off_limit;
17014 new_off -= stp_off_limit;
17017 /* Make sure the adjustment can be done with ADD/SUB instructions. */
17018 if (adj_off >= 0x1000)
17019 return false;
17021 if (off_val < 0)
17023 adj_off = -adj_off;
17024 new_off = -new_off;
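/* Illustration (not in the original source): for the SImode example in the
   comment above, msize == 4 gives stp_off_limit == 0x100, so an offset of
   0x100 yields adj_off == 0x100 and new_off == 0: one "add scratch, xb,
   0x100" followed by pair accesses at offsets 0 and 8.  */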
17027 /* Create new memory references. */
17028 mem_1 = change_address (mem_1, VOIDmode,
17029 plus_constant (DImode, operands[8], new_off));
17031 /* Check if the adjusted address is OK for ldp/stp. */
17032 if (!aarch64_mem_pair_operand (mem_1, mode))
17033 return false;
17035 msize = GET_MODE_SIZE (mode);
17036 mem_2 = change_address (mem_2, VOIDmode,
17037 plus_constant (DImode,
17038 operands[8],
17039 new_off + msize));
17040 mem_3 = change_address (mem_3, VOIDmode,
17041 plus_constant (DImode,
17042 operands[8],
17043 new_off + msize * 2));
17044 mem_4 = change_address (mem_4, VOIDmode,
17045 plus_constant (DImode,
17046 operands[8],
17047 new_off + msize * 3));
17049 if (code == ZERO_EXTEND)
17051 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17052 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17053 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17054 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17056 else if (code == SIGN_EXTEND)
17058 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17059 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17060 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17061 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17064 if (load)
17066 operands[1] = mem_1;
17067 operands[3] = mem_2;
17068 operands[5] = mem_3;
17069 operands[7] = mem_4;
17071 else
17073 operands[0] = mem_1;
17074 operands[2] = mem_2;
17075 operands[4] = mem_3;
17076 operands[6] = mem_4;
17079 /* Emit adjusting instruction. */
17080 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17081 /* Emit ldp/stp instructions. */
17082 t1 = gen_rtx_SET (operands[0], operands[1]);
17083 t2 = gen_rtx_SET (operands[2], operands[3]);
17084 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17085 t1 = gen_rtx_SET (operands[4], operands[5]);
17086 t2 = gen_rtx_SET (operands[6], operands[7]);
17087 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17088 return true;
17091 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17092 it isn't worth branching around empty masked ops (including masked
17093 stores). */
17095 static bool
17096 aarch64_empty_mask_is_expensive (unsigned)
17098 return false;
17101 /* Return true if a pseudo register should be created and used to hold
17102 the GOT address for PIC code. */
17104 bool
17105 aarch64_use_pseudo_pic_reg (void)
17107 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17110 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17112 static int
17113 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17115 switch (XINT (x, 1))
17117 case UNSPEC_GOTSMALLPIC:
17118 case UNSPEC_GOTSMALLPIC28K:
17119 case UNSPEC_GOTTINYPIC:
17120 return 0;
17121 default:
17122 break;
17125 return default_unspec_may_trap_p (x, flags);
17129 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17130 return the log2 of that value. Otherwise return -1. */
17133 aarch64_fpconst_pow_of_2 (rtx x)
17135 const REAL_VALUE_TYPE *r;
17137 if (!CONST_DOUBLE_P (x))
17138 return -1;
17140 r = CONST_DOUBLE_REAL_VALUE (x);
17142 if (REAL_VALUE_NEGATIVE (*r)
17143 || REAL_VALUE_ISNAN (*r)
17144 || REAL_VALUE_ISINF (*r)
17145 || !real_isinteger (r, DFmode))
17146 return -1;
17148 return exact_log2 (real_to_integer (r));
17151 /* If X is a vector of equal CONST_DOUBLE values and that value is
17152 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17155 aarch64_vec_fpconst_pow_of_2 (rtx x)
17157 int nelts;
17158 if (GET_CODE (x) != CONST_VECTOR
17159 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17160 return -1;
17162 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17163 return -1;
17165 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17166 if (firstval <= 0)
17167 return -1;
17169 for (int i = 1; i < nelts; i++)
17170 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17171 return -1;
17173 return firstval;
17176 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17177 to float.
17179 __fp16 always promotes through this hook.
17180 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17181 through the generic excess precision logic rather than here. */
17183 static tree
17184 aarch64_promoted_type (const_tree t)
17186 if (SCALAR_FLOAT_TYPE_P (t)
17187 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17188 return float_type_node;
17190 return NULL_TREE;
17193 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17195 static bool
17196 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17197 optimization_type opt_type)
17199 switch (op)
17201 case rsqrt_optab:
17202 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17204 default:
17205 return true;
17209 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17211 static unsigned int
17212 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17213 int *offset)
17215 /* Polynomial invariant 1 == (VG / 2) - 1. */
17216 gcc_assert (i == 1);
17217 *factor = 2;
17218 *offset = 1;
17219 return AARCH64_DWARF_VG;
17222 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17223 if MODE is HFmode, and punt to the generic implementation otherwise. */
17225 static bool
17226 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17228 return (mode == HFmode
17229 ? true
17230 : default_libgcc_floating_mode_supported_p (mode));
17233 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17234 if MODE is HFmode, and punt to the generic implementation otherwise. */
17236 static bool
17237 aarch64_scalar_mode_supported_p (scalar_mode mode)
17239 return (mode == HFmode
17240 ? true
17241 : default_scalar_mode_supported_p (mode));
17244 /* Set the value of FLT_EVAL_METHOD.
17245 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17247 0: evaluate all operations and constants, whose semantic type has at
17248 most the range and precision of type float, to the range and
17249 precision of float; evaluate all other operations and constants to
17250 the range and precision of the semantic type;
17252 N, where _FloatN is a supported interchange floating type:
17253 evaluate all operations and constants, whose semantic type has at
17254 most the range and precision of _FloatN type, to the range and
17255 precision of the _FloatN type; evaluate all other operations and
17256 constants to the range and precision of the semantic type;
17258 If we have the ARMv8.2-A extensions then we support _Float16 in native
17259 precision, so we should set this to 16. Otherwise, we support the type,
17260 but want to evaluate expressions in float precision, so set this to
17261 0. */
17263 static enum flt_eval_method
17264 aarch64_excess_precision (enum excess_precision_type type)
17266 switch (type)
17268 case EXCESS_PRECISION_TYPE_FAST:
17269 case EXCESS_PRECISION_TYPE_STANDARD:
17270 /* We can calculate either in 16-bit range and precision or
17271 32-bit range and precision. Make that decision based on whether
17272 we have native support for the ARMv8.2-A 16-bit floating-point
17273 instructions or not. */
17274 return (TARGET_FP_F16INST
17275 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17276 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17277 case EXCESS_PRECISION_TYPE_IMPLICIT:
17278 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17279 default:
17280 gcc_unreachable ();
17282 return FLT_EVAL_METHOD_UNPREDICTABLE;
17285 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17286 scheduled for speculative execution. Reject the long-running division
17287 and square-root instructions. */
17289 static bool
17290 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17292 switch (get_attr_type (insn))
17294 case TYPE_SDIV:
17295 case TYPE_UDIV:
17296 case TYPE_FDIVS:
17297 case TYPE_FDIVD:
17298 case TYPE_FSQRTS:
17299 case TYPE_FSQRTD:
17300 case TYPE_NEON_FP_SQRT_S:
17301 case TYPE_NEON_FP_SQRT_D:
17302 case TYPE_NEON_FP_SQRT_S_Q:
17303 case TYPE_NEON_FP_SQRT_D_Q:
17304 case TYPE_NEON_FP_DIV_S:
17305 case TYPE_NEON_FP_DIV_D:
17306 case TYPE_NEON_FP_DIV_S_Q:
17307 case TYPE_NEON_FP_DIV_D_Q:
17308 return false;
17309 default:
17310 return true;
17314 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17316 static int
17317 aarch64_compute_pressure_classes (reg_class *classes)
17318 {
17319   int i = 0;
17320   classes[i++] = GENERAL_REGS;
17321   classes[i++] = FP_REGS;
17322   /* PR_REGS isn't a useful pressure class because many predicate pseudo
17323      registers need to go in PR_LO_REGS at some point during their
17324      lifetime.  Splitting it into two halves has the effect of making
17325      all predicates count against PR_LO_REGS, so that we try whenever
17326      possible to restrict the number of live predicates to 8.  This
17327      greatly reduces the amount of spilling in certain loops.  */
17328   classes[i++] = PR_LO_REGS;
17329   classes[i++] = PR_HI_REGS;
17330   return i;
17331 }
17333 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17335 static bool
17336 aarch64_can_change_mode_class (machine_mode from,
17337                                machine_mode to, reg_class_t)
17338 {
17339   if (BYTES_BIG_ENDIAN)
17340     {
17341       bool from_sve_p = aarch64_sve_data_mode_p (from);
17342       bool to_sve_p = aarch64_sve_data_mode_p (to);
17344       /* Don't allow changes between SVE data modes and non-SVE modes.
17345          See the comment at the head of aarch64-sve.md for details.  */
17346       if (from_sve_p != to_sve_p)
17347         return false;
17349       /* Don't allow changes in element size: lane 0 of the new vector
17350          would not then be lane 0 of the old vector.  See the comment
17351          above aarch64_maybe_expand_sve_subreg_move for a more detailed
17352          description.
17354          In the worst case, this forces a register to be spilled in
17355          one mode and reloaded in the other, which handles the
17356          endianness correctly.  */
17357       if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17358         return false;
17359     }
17360   return true;
17361 }
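/* By way of example (illustrative, not exhaustive): on a big-endian
   target a subreg-style change from VNx4SImode to VNx8HImode is rejected
   by the element-size check above, because lane 0 of the HI vector would
   no longer be lane 0 of the SI vector, whereas a change between SVE
   modes with equal element sizes, such as VNx4SImode and VNx4SFmode,
   remains allowed.  */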
17363 /* Implement TARGET_EARLY_REMAT_MODES. */
17365 static void
17366 aarch64_select_early_remat_modes (sbitmap modes)
17367 {
17368   /* SVE values are not normally live across a call, so it should be
17369      worth doing early rematerialization even in VL-specific mode.  */
17370   for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17371     {
17372       machine_mode mode = (machine_mode) i;
17373       unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17374       if (vec_flags & VEC_ANY_SVE)
17375         bitmap_set_bit (modes, i);
17376     }
17377 }
17379 /* Target-specific selftests. */
17381 #if CHECKING_P
17383 namespace selftest {
17385 /* Selftest for the RTL loader.
17386 Verify that the RTL loader copes with a dump from
17387 print_rtx_function. This is essentially just a test that class
17388 function_reader can handle a real dump, but it also verifies
17389 that lookup_reg_by_dump_name correctly handles hard regs.
17390 The presence of hard reg names in the dump means that the test is
17391 target-specific, hence it is in this file. */
17393 static void
17394 aarch64_test_loading_full_dump ()
17395 {
17396   rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17398   ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17400   rtx_insn *insn_1 = get_insn_by_uid (1);
17401   ASSERT_EQ (NOTE, GET_CODE (insn_1));
17403   rtx_insn *insn_15 = get_insn_by_uid (15);
17404   ASSERT_EQ (INSN, GET_CODE (insn_15));
17405   ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17407   /* Verify crtl->return_rtx.  */
17408   ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17409   ASSERT_EQ (0, REGNO (crtl->return_rtx));
17410   ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17411 }
17413 /* Run all target-specific selftests. */
17415 static void
17416 aarch64_run_selftests (void)
17417 {
17418   aarch64_test_loading_full_dump ();
17419 }
17421 } // namespace selftest
17423 #endif /* #if CHECKING_P */
17425 #undef TARGET_ADDRESS_COST
17426 #define TARGET_ADDRESS_COST aarch64_address_cost
17428 /* This hook determines whether unnamed bitfields affect the alignment
17429    of the containing structure.  The hook returns true if the structure
17430    should inherit the alignment requirements of an unnamed bitfield's
17431    type.  */
17432 #undef TARGET_ALIGN_ANON_BITFIELD
17433 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
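/* For example (an illustrative sketch): with this hook returning true,

     struct s { char c; int : 0; char d; };

   is laid out with the 4-byte alignment of the unnamed bit-field's
   declared type (int), as the AAPCS64 expects for unnamed bit-fields.  */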
17435 #undef TARGET_ASM_ALIGNED_DI_OP
17436 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17438 #undef TARGET_ASM_ALIGNED_HI_OP
17439 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17441 #undef TARGET_ASM_ALIGNED_SI_OP
17442 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17444 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17445 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17446 hook_bool_const_tree_hwi_hwi_const_tree_true
17448 #undef TARGET_ASM_FILE_START
17449 #define TARGET_ASM_FILE_START aarch64_start_file
17451 #undef TARGET_ASM_OUTPUT_MI_THUNK
17452 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17454 #undef TARGET_ASM_SELECT_RTX_SECTION
17455 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17457 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17458 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17460 #undef TARGET_BUILD_BUILTIN_VA_LIST
17461 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17463 #undef TARGET_CALLEE_COPIES
17464 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17466 #undef TARGET_CAN_ELIMINATE
17467 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17469 #undef TARGET_CAN_INLINE_P
17470 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17472 #undef TARGET_CANNOT_FORCE_CONST_MEM
17473 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17475 #undef TARGET_CASE_VALUES_THRESHOLD
17476 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17478 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17479 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17481 /* Only the least significant bit is used for initialization guard
17482 variables. */
17483 #undef TARGET_CXX_GUARD_MASK_BIT
17484 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
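/* Roughly speaking, the inline check emitted for a function-local static
   then tests only bit 0 of the guard variable, along the lines of

     if (!(guard & 1) && __cxa_guard_acquire (&guard))
       { ... run the constructor ...; __cxa_guard_release (&guard); }

   rather than testing the whole first byte (a sketch of the intent, not
   the exact code GCC emits).  */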
17486 #undef TARGET_C_MODE_FOR_SUFFIX
17487 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17489 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17490 #undef TARGET_DEFAULT_TARGET_FLAGS
17491 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17492 #endif
17494 #undef TARGET_CLASS_MAX_NREGS
17495 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17497 #undef TARGET_BUILTIN_DECL
17498 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17500 #undef TARGET_BUILTIN_RECIPROCAL
17501 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17503 #undef TARGET_C_EXCESS_PRECISION
17504 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17506 #undef TARGET_EXPAND_BUILTIN
17507 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17509 #undef TARGET_EXPAND_BUILTIN_VA_START
17510 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17512 #undef TARGET_FOLD_BUILTIN
17513 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17515 #undef TARGET_FUNCTION_ARG
17516 #define TARGET_FUNCTION_ARG aarch64_function_arg
17518 #undef TARGET_FUNCTION_ARG_ADVANCE
17519 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17521 #undef TARGET_FUNCTION_ARG_BOUNDARY
17522 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17524 #undef TARGET_FUNCTION_ARG_PADDING
17525 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17527 #undef TARGET_GET_RAW_RESULT_MODE
17528 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17529 #undef TARGET_GET_RAW_ARG_MODE
17530 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17532 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17533 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17535 #undef TARGET_FUNCTION_VALUE
17536 #define TARGET_FUNCTION_VALUE aarch64_function_value
17538 #undef TARGET_FUNCTION_VALUE_REGNO_P
17539 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17541 #undef TARGET_GIMPLE_FOLD_BUILTIN
17542 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17544 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17545 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17547 #undef TARGET_INIT_BUILTINS
17548 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17550 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17551 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17552 aarch64_ira_change_pseudo_allocno_class
17554 #undef TARGET_LEGITIMATE_ADDRESS_P
17555 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17557 #undef TARGET_LEGITIMATE_CONSTANT_P
17558 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17560 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17561 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17562 aarch64_legitimize_address_displacement
17564 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17565 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17567 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17568 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17569 aarch64_libgcc_floating_mode_supported_p
17571 #undef TARGET_MANGLE_TYPE
17572 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17574 #undef TARGET_MEMORY_MOVE_COST
17575 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17577 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17578 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17580 #undef TARGET_MUST_PASS_IN_STACK
17581 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17583 /* This target hook should return true if accesses to volatile bitfields
17584 should use the narrowest mode possible. It should return false if these
17585 accesses should use the bitfield container type. */
17586 #undef TARGET_NARROW_VOLATILE_BITFIELD
17587 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
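/* As a sketch of the difference: given

     struct s { volatile int f : 8; } x;

   returning false here means a read of x.f is performed as a 32-bit
   access to the int container rather than as a single-byte load.  */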
17589 #undef TARGET_OPTION_OVERRIDE
17590 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17592 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17593 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17594 aarch64_override_options_after_change
17596 #undef TARGET_OPTION_SAVE
17597 #define TARGET_OPTION_SAVE aarch64_option_save
17599 #undef TARGET_OPTION_RESTORE
17600 #define TARGET_OPTION_RESTORE aarch64_option_restore
17602 #undef TARGET_OPTION_PRINT
17603 #define TARGET_OPTION_PRINT aarch64_option_print
17605 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17606 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17608 #undef TARGET_SET_CURRENT_FUNCTION
17609 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17611 #undef TARGET_PASS_BY_REFERENCE
17612 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17614 #undef TARGET_PREFERRED_RELOAD_CLASS
17615 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17617 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17618 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17620 #undef TARGET_PROMOTED_TYPE
17621 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17623 #undef TARGET_SECONDARY_RELOAD
17624 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17626 #undef TARGET_SHIFT_TRUNCATION_MASK
17627 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17629 #undef TARGET_SETUP_INCOMING_VARARGS
17630 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17632 #undef TARGET_STRUCT_VALUE_RTX
17633 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17635 #undef TARGET_REGISTER_MOVE_COST
17636 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17638 #undef TARGET_RETURN_IN_MEMORY
17639 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17641 #undef TARGET_RETURN_IN_MSB
17642 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17644 #undef TARGET_RTX_COSTS
17645 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17647 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17648 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17650 #undef TARGET_SCHED_ISSUE_RATE
17651 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17653 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17654 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17655 aarch64_sched_first_cycle_multipass_dfa_lookahead
17657 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17658 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17659 aarch64_first_cycle_multipass_dfa_lookahead_guard
17661 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17662 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17663 aarch64_get_separate_components
17665 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17666 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17667 aarch64_components_for_bb
17669 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17670 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17671 aarch64_disqualify_components
17673 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17674 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17675 aarch64_emit_prologue_components
17677 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17678 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17679 aarch64_emit_epilogue_components
17681 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17682 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17683 aarch64_set_handled_components
17685 #undef TARGET_TRAMPOLINE_INIT
17686 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17688 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17689 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17691 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17692 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17694 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17695 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17696 aarch64_builtin_support_vector_misalignment
17698 #undef TARGET_ARRAY_MODE
17699 #define TARGET_ARRAY_MODE aarch64_array_mode
17701 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17702 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17704 #undef TARGET_VECTORIZE_ADD_STMT_COST
17705 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17707 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17708 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17709 aarch64_builtin_vectorization_cost
17711 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17712 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17714 #undef TARGET_VECTORIZE_BUILTINS
17715 #define TARGET_VECTORIZE_BUILTINS
17717 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17718 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17719 aarch64_builtin_vectorized_function
17721 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17722 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17723 aarch64_autovectorize_vector_sizes
17725 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17726 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17727 aarch64_atomic_assign_expand_fenv
17729 /* Section anchor support. */
17731 #undef TARGET_MIN_ANCHOR_OFFSET
17732 #define TARGET_MIN_ANCHOR_OFFSET -256
17734 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17735 byte offset; we can do much more for larger data types, but have no way
17736 to determine the size of the access. We assume accesses are aligned. */
17737 #undef TARGET_MAX_ANCHOR_OFFSET
17738 #define TARGET_MAX_ANCHOR_OFFSET 4095
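/* As a rough illustration, these bounds match the immediate ranges of the
   basic addressing modes when the access size is unknown:

     ldrb w0, [x1, #4095]    // unsigned 12-bit byte offset, at most 4095
     ldur x0, [x1, #-256]    // signed 9-bit unscaled offset, at least -256

   Larger offsets are only valid for suitably sized and aligned
   accesses.  */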
17740 #undef TARGET_VECTOR_ALIGNMENT
17741 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17743 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17744 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17745 aarch64_vectorize_preferred_vector_alignment
17746 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17747 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17748 aarch64_simd_vector_alignment_reachable
17750 /* vec_perm support. */
17752 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17753 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17754 aarch64_vectorize_vec_perm_const
17756 #undef TARGET_VECTORIZE_GET_MASK_MODE
17757 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17758 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17759 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17760 aarch64_empty_mask_is_expensive
17762 #undef TARGET_INIT_LIBFUNCS
17763 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17765 #undef TARGET_FIXED_CONDITION_CODE_REGS
17766 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17768 #undef TARGET_FLAGS_REGNUM
17769 #define TARGET_FLAGS_REGNUM CC_REGNUM
17771 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17772 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17774 #undef TARGET_ASAN_SHADOW_OFFSET
17775 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17777 #undef TARGET_LEGITIMIZE_ADDRESS
17778 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17780 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17781 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17783 #undef TARGET_CAN_USE_DOLOOP_P
17784 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17786 #undef TARGET_SCHED_ADJUST_PRIORITY
17787 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17789 #undef TARGET_SCHED_MACRO_FUSION_P
17790 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17792 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17793 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17795 #undef TARGET_SCHED_FUSION_PRIORITY
17796 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17798 #undef TARGET_UNSPEC_MAY_TRAP_P
17799 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17801 #undef TARGET_USE_PSEUDO_PIC_REG
17802 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17804 #undef TARGET_PRINT_OPERAND
17805 #define TARGET_PRINT_OPERAND aarch64_print_operand
17807 #undef TARGET_PRINT_OPERAND_ADDRESS
17808 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17810 #undef TARGET_OPTAB_SUPPORTED_P
17811 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17813 #undef TARGET_OMIT_STRUCT_RETURN_REG
17814 #define TARGET_OMIT_STRUCT_RETURN_REG true
17816 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17817 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17818 aarch64_dwarf_poly_indeterminate_value
17820 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
17821 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17822 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
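/* That is, 4 == (1 << 2): a function pointer with bit 2 set is taken to
   point at a descriptor rather than directly at code.  */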
17824 #undef TARGET_HARD_REGNO_NREGS
17825 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17826 #undef TARGET_HARD_REGNO_MODE_OK
17827 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17829 #undef TARGET_MODES_TIEABLE_P
17830 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17832 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17833 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17834 aarch64_hard_regno_call_part_clobbered
17836 #undef TARGET_CONSTANT_ALIGNMENT
17837 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17839 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17840 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17842 #undef TARGET_CAN_CHANGE_MODE_CLASS
17843 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17845 #undef TARGET_SELECT_EARLY_REMAT_MODES
17846 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17848 #if CHECKING_P
17849 #undef TARGET_RUN_TARGET_SELFTESTS
17850 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17851 #endif /* #if CHECKING_P */
17853 struct gcc_target targetm = TARGET_INITIALIZER;
17855 #include "gt-aarch64.h"