[gen/AArch64] Generate helpers for substituting iterator values into pattern names
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 13b5448aca88555222481f0955237b6fdcbb38b9
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102	   A constant symbolic address, placed in a pc-relative literal pool.  */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
210 aarch64_addr_query_type);
212 /* Major revision number of the ARM Architecture implemented by the target. */
213 unsigned aarch64_architecture_version;
215 /* The processor for which instructions should be scheduled. */
216 enum aarch64_processor aarch64_tune = cortexa53;
218 /* Mask to specify which instruction scheduling options should be used. */
219 unsigned long aarch64_tune_flags = 0;
221 /* Global flag for PC relative loads. */
222 bool aarch64_pcrelative_literal_loads;
224 /* Global flag for whether frame pointer is enabled. */
225 bool aarch64_use_frame_pointer;
227 /* Support for command line parsing of boolean flags in the tuning
228 structures. */
229 struct aarch64_flag_desc
231 const char* name;
232 unsigned int flag;
235 #define AARCH64_FUSION_PAIR(name, internal_name) \
236 { name, AARCH64_FUSE_##internal_name },
237 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
239 { "none", AARCH64_FUSE_NOTHING },
240 #include "aarch64-fusion-pairs.def"
241 { "all", AARCH64_FUSE_ALL },
242 { NULL, AARCH64_FUSE_NOTHING }
245 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
246 { name, AARCH64_EXTRA_TUNE_##internal_name },
247 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
249 { "none", AARCH64_EXTRA_TUNE_NONE },
250 #include "aarch64-tuning-flags.def"
251 { "all", AARCH64_EXTRA_TUNE_ALL },
252 { NULL, AARCH64_EXTRA_TUNE_NONE }
255 /* Tuning parameters. */
257 static const struct cpu_addrcost_table generic_addrcost_table =
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
273 static const struct cpu_addrcost_table exynosm1_addrcost_table =
276 0, /* hi */
277 0, /* si */
278 0, /* di */
279 2, /* ti */
281 0, /* pre_modify */
282 0, /* post_modify */
283 1, /* register_offset */
284 1, /* register_sextend */
285 2, /* register_zextend */
286 0, /* imm_offset */
289 static const struct cpu_addrcost_table xgene1_addrcost_table =
292 1, /* hi */
293 0, /* si */
294 0, /* di */
295 1, /* ti */
297 1, /* pre_modify */
298 0, /* post_modify */
299 0, /* register_offset */
300 1, /* register_sextend */
301 1, /* register_zextend */
302 0, /* imm_offset */
305 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
308 1, /* hi */
309 1, /* si */
310 1, /* di */
311 2, /* ti */
313 0, /* pre_modify */
314 0, /* post_modify */
315 2, /* register_offset */
316 3, /* register_sextend */
317 3, /* register_zextend */
318 0, /* imm_offset */
321 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
324 1, /* hi */
325 1, /* si */
326 1, /* di */
327 2, /* ti */
329 1, /* pre_modify */
330 1, /* post_modify */
331 3, /* register_offset */
332 4, /* register_sextend */
333 3, /* register_zextend */
334 2, /* imm_offset */
337 static const struct cpu_regmove_cost generic_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost cortexa57_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost. */
352 5, /* GP2FP */
353 5, /* FP2GP */
354 2 /* FP2FP */
357 static const struct cpu_regmove_cost cortexa53_regmove_cost =
359 1, /* GP2GP */
360 /* Avoid the use of slow int<->fp moves for spilling by setting
361 their cost higher than memmov_cost. */
362 5, /* GP2FP */
363 5, /* FP2GP */
364 2 /* FP2FP */
367 static const struct cpu_regmove_cost exynosm1_regmove_cost =
369 1, /* GP2GP */
370 /* Avoid the use of slow int<->fp moves for spilling by setting
371	     their cost higher than memmov_cost (actual costs are 4 and 9).  */
372 9, /* GP2FP */
373 9, /* FP2GP */
374 1 /* FP2FP */
377 static const struct cpu_regmove_cost thunderx_regmove_cost =
379 2, /* GP2GP */
380 2, /* GP2FP */
381 6, /* FP2GP */
382 4 /* FP2FP */
385 static const struct cpu_regmove_cost xgene1_regmove_cost =
387 1, /* GP2GP */
388 /* Avoid the use of slow int<->fp moves for spilling by setting
389 their cost higher than memmov_cost. */
390 8, /* GP2FP */
391 8, /* FP2GP */
392 2 /* FP2FP */
395 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
397 2, /* GP2GP */
398 /* Avoid the use of int<->fp moves for spilling. */
399 6, /* GP2FP */
400 6, /* FP2GP */
401 4 /* FP2FP */
404 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
406 1, /* GP2GP */
407 /* Avoid the use of int<->fp moves for spilling. */
408 8, /* GP2FP */
409 8, /* FP2GP */
410 4 /* FP2FP */
413 /* Generic costs for vector insn classes. */
414 static const struct cpu_vector_cost generic_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 1, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 1, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 2, /* vec_permute_cost */
423 1, /* vec_to_scalar_cost */
424 1, /* scalar_to_vec_cost */
425 1, /* vec_align_load_cost */
426 1, /* vec_unalign_load_cost */
427 1, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 1 /* cond_not_taken_branch_cost */
433 /* ThunderX costs for vector insn classes. */
434 static const struct cpu_vector_cost thunderx_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 3, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 4, /* vec_int_stmt_cost */
441 1, /* vec_fp_stmt_cost */
442 4, /* vec_permute_cost */
443 2, /* vec_to_scalar_cost */
444 2, /* scalar_to_vec_cost */
445 3, /* vec_align_load_cost */
446 5, /* vec_unalign_load_cost */
447 5, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 3, /* cond_taken_branch_cost */
450 3 /* cond_not_taken_branch_cost */
453 /* Generic costs for vector insn classes. */
454 static const struct cpu_vector_cost cortexa57_vector_cost =
456 1, /* scalar_int_stmt_cost */
457 1, /* scalar_fp_stmt_cost */
458 4, /* scalar_load_cost */
459 1, /* scalar_store_cost */
460 2, /* vec_int_stmt_cost */
461 2, /* vec_fp_stmt_cost */
462 3, /* vec_permute_cost */
463 8, /* vec_to_scalar_cost */
464 8, /* scalar_to_vec_cost */
465 4, /* vec_align_load_cost */
466 4, /* vec_unalign_load_cost */
467 1, /* vec_unalign_store_cost */
468 1, /* vec_store_cost */
469 1, /* cond_taken_branch_cost */
470 1 /* cond_not_taken_branch_cost */
473 static const struct cpu_vector_cost exynosm1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 3, /* vec_int_stmt_cost */
480 3, /* vec_fp_stmt_cost */
481 3, /* vec_permute_cost */
482 3, /* vec_to_scalar_cost */
483 3, /* scalar_to_vec_cost */
484 5, /* vec_align_load_cost */
485 5, /* vec_unalign_load_cost */
486 1, /* vec_unalign_store_cost */
487 1, /* vec_store_cost */
488 1, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Generic costs for vector insn classes. */
493 static const struct cpu_vector_cost xgene1_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 1, /* scalar_fp_stmt_cost */
497 5, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 2, /* vec_int_stmt_cost */
500 2, /* vec_fp_stmt_cost */
501 2, /* vec_permute_cost */
502 4, /* vec_to_scalar_cost */
503 4, /* scalar_to_vec_cost */
504 10, /* vec_align_load_cost */
505 10, /* vec_unalign_load_cost */
506 2, /* vec_unalign_store_cost */
507 2, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Costs for vector insn classes for Vulcan. */
513 static const struct cpu_vector_cost thunderx2t99_vector_cost =
515 1, /* scalar_int_stmt_cost */
516 6, /* scalar_fp_stmt_cost */
517 4, /* scalar_load_cost */
518 1, /* scalar_store_cost */
519 5, /* vec_int_stmt_cost */
520 6, /* vec_fp_stmt_cost */
521 3, /* vec_permute_cost */
522 6, /* vec_to_scalar_cost */
523 5, /* scalar_to_vec_cost */
524 8, /* vec_align_load_cost */
525 8, /* vec_unalign_load_cost */
526 4, /* vec_unalign_store_cost */
527 4, /* vec_store_cost */
528 2, /* cond_taken_branch_cost */
529 1 /* cond_not_taken_branch_cost */
532 /* Generic costs for branch instructions. */
533 static const struct cpu_branch_cost generic_branch_cost =
535 1, /* Predictable. */
536 3 /* Unpredictable. */
539 /* Generic approximation modes. */
540 static const cpu_approx_modes generic_approx_modes =
542 AARCH64_APPROX_NONE, /* division */
543 AARCH64_APPROX_NONE, /* sqrt */
544 AARCH64_APPROX_NONE /* recip_sqrt */
547 /* Approximation modes for Exynos M1. */
548 static const cpu_approx_modes exynosm1_approx_modes =
550 AARCH64_APPROX_NONE, /* division */
551 AARCH64_APPROX_ALL, /* sqrt */
552 AARCH64_APPROX_ALL /* recip_sqrt */
555 /* Approximation modes for X-Gene 1. */
556 static const cpu_approx_modes xgene1_approx_modes =
558 AARCH64_APPROX_NONE, /* division */
559 AARCH64_APPROX_NONE, /* sqrt */
560 AARCH64_APPROX_ALL /* recip_sqrt */
563 /* Generic prefetch settings (which disable prefetch). */
564 static const cpu_prefetch_tune generic_prefetch_tune =
566 0, /* num_slots */
567 -1, /* l1_cache_size */
568 -1, /* l1_cache_line_size */
569 -1, /* l2_cache_size */
570 true, /* prefetch_dynamic_strides */
571 -1, /* minimum_stride */
572 -1 /* default_opt_level */
575 static const cpu_prefetch_tune exynosm1_prefetch_tune =
577 0, /* num_slots */
578 -1, /* l1_cache_size */
579 64, /* l1_cache_line_size */
580 -1, /* l2_cache_size */
581 true, /* prefetch_dynamic_strides */
582 -1, /* minimum_stride */
583 -1 /* default_opt_level */
586 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
588 4, /* num_slots */
589 32, /* l1_cache_size */
590 64, /* l1_cache_line_size */
591 512, /* l2_cache_size */
592 false, /* prefetch_dynamic_strides */
593 2048, /* minimum_stride */
594 3 /* default_opt_level */
597 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
599 8, /* num_slots */
600 32, /* l1_cache_size */
601 128, /* l1_cache_line_size */
602 16*1024, /* l2_cache_size */
603 true, /* prefetch_dynamic_strides */
604 -1, /* minimum_stride */
605 3 /* default_opt_level */
608 static const cpu_prefetch_tune thunderx_prefetch_tune =
610 8, /* num_slots */
611 32, /* l1_cache_size */
612 128, /* l1_cache_line_size */
613 -1, /* l2_cache_size */
614 true, /* prefetch_dynamic_strides */
615 -1, /* minimum_stride */
616 -1 /* default_opt_level */
619 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
621 8, /* num_slots */
622 32, /* l1_cache_size */
623 64, /* l1_cache_line_size */
624 256, /* l2_cache_size */
625 true, /* prefetch_dynamic_strides */
626 -1, /* minimum_stride */
627 -1 /* default_opt_level */
630 static const struct tune_params generic_tunings =
632 &cortexa57_extra_costs,
633 &generic_addrcost_table,
634 &generic_regmove_cost,
635 &generic_vector_cost,
636 &generic_branch_cost,
637 &generic_approx_modes,
638 4, /* memmov_cost */
639 2, /* issue_rate */
640 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
641 "8", /* function_align. */
642 "4", /* jump_align. */
643 "8", /* loop_align. */
644 2, /* int_reassoc_width. */
645 4, /* fp_reassoc_width. */
646 1, /* vec_reassoc_width. */
647 2, /* min_div_recip_mul_sf. */
648 2, /* min_div_recip_mul_df. */
649 0, /* max_case_values. */
650 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
651 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
652 &generic_prefetch_tune
655 static const struct tune_params cortexa35_tunings =
657 &cortexa53_extra_costs,
658 &generic_addrcost_table,
659 &cortexa53_regmove_cost,
660 &generic_vector_cost,
661 &generic_branch_cost,
662 &generic_approx_modes,
663 4, /* memmov_cost */
664 1, /* issue_rate */
665 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
666 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
667 "16", /* function_align. */
668 "4", /* jump_align. */
669 "8", /* loop_align. */
670 2, /* int_reassoc_width. */
671 4, /* fp_reassoc_width. */
672 1, /* vec_reassoc_width. */
673 2, /* min_div_recip_mul_sf. */
674 2, /* min_div_recip_mul_df. */
675 0, /* max_case_values. */
676 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
677 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
678 &generic_prefetch_tune
681 static const struct tune_params cortexa53_tunings =
683 &cortexa53_extra_costs,
684 &generic_addrcost_table,
685 &cortexa53_regmove_cost,
686 &generic_vector_cost,
687 &generic_branch_cost,
688 &generic_approx_modes,
689 4, /* memmov_cost */
690 2, /* issue_rate */
691 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
692 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
693 "16", /* function_align. */
694 "4", /* jump_align. */
695 "8", /* loop_align. */
696 2, /* int_reassoc_width. */
697 4, /* fp_reassoc_width. */
698 1, /* vec_reassoc_width. */
699 2, /* min_div_recip_mul_sf. */
700 2, /* min_div_recip_mul_df. */
701 0, /* max_case_values. */
702 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
703 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
704 &generic_prefetch_tune
707 static const struct tune_params cortexa57_tunings =
709 &cortexa57_extra_costs,
710 &generic_addrcost_table,
711 &cortexa57_regmove_cost,
712 &cortexa57_vector_cost,
713 &generic_branch_cost,
714 &generic_approx_modes,
715 4, /* memmov_cost */
716 3, /* issue_rate */
717 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
718 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
719 "16", /* function_align. */
720 "4", /* jump_align. */
721 "8", /* loop_align. */
722 2, /* int_reassoc_width. */
723 4, /* fp_reassoc_width. */
724 1, /* vec_reassoc_width. */
725 2, /* min_div_recip_mul_sf. */
726 2, /* min_div_recip_mul_df. */
727 0, /* max_case_values. */
728 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
729 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
730 &generic_prefetch_tune
733 static const struct tune_params cortexa72_tunings =
735 &cortexa57_extra_costs,
736 &generic_addrcost_table,
737 &cortexa57_regmove_cost,
738 &cortexa57_vector_cost,
739 &generic_branch_cost,
740 &generic_approx_modes,
741 4, /* memmov_cost */
742 3, /* issue_rate */
743 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
744 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
745 "16", /* function_align. */
746 "4", /* jump_align. */
747 "8", /* loop_align. */
748 2, /* int_reassoc_width. */
749 4, /* fp_reassoc_width. */
750 1, /* vec_reassoc_width. */
751 2, /* min_div_recip_mul_sf. */
752 2, /* min_div_recip_mul_df. */
753 0, /* max_case_values. */
754 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
755 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
756 &generic_prefetch_tune
759 static const struct tune_params cortexa73_tunings =
761 &cortexa57_extra_costs,
762 &generic_addrcost_table,
763 &cortexa57_regmove_cost,
764 &cortexa57_vector_cost,
765 &generic_branch_cost,
766 &generic_approx_modes,
767 4, /* memmov_cost. */
768 2, /* issue_rate. */
769 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
770 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
771 "16", /* function_align. */
772 "4", /* jump_align. */
773 "8", /* loop_align. */
774 2, /* int_reassoc_width. */
775 4, /* fp_reassoc_width. */
776 1, /* vec_reassoc_width. */
777 2, /* min_div_recip_mul_sf. */
778 2, /* min_div_recip_mul_df. */
779 0, /* max_case_values. */
780 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
781 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
782 &generic_prefetch_tune
787 static const struct tune_params exynosm1_tunings =
789 &exynosm1_extra_costs,
790 &exynosm1_addrcost_table,
791 &exynosm1_regmove_cost,
792 &exynosm1_vector_cost,
793 &generic_branch_cost,
794 &exynosm1_approx_modes,
795 4, /* memmov_cost */
796 3, /* issue_rate */
797 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
798 "4", /* function_align. */
799 "4", /* jump_align. */
800 "4", /* loop_align. */
801 2, /* int_reassoc_width. */
802 4, /* fp_reassoc_width. */
803 1, /* vec_reassoc_width. */
804 2, /* min_div_recip_mul_sf. */
805 2, /* min_div_recip_mul_df. */
806 48, /* max_case_values. */
807 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
808 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
809 &exynosm1_prefetch_tune
812 static const struct tune_params thunderxt88_tunings =
814 &thunderx_extra_costs,
815 &generic_addrcost_table,
816 &thunderx_regmove_cost,
817 &thunderx_vector_cost,
818 &generic_branch_cost,
819 &generic_approx_modes,
820 6, /* memmov_cost */
821 2, /* issue_rate */
822 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
823 "8", /* function_align. */
824 "8", /* jump_align. */
825 "8", /* loop_align. */
826 2, /* int_reassoc_width. */
827 4, /* fp_reassoc_width. */
828 1, /* vec_reassoc_width. */
829 2, /* min_div_recip_mul_sf. */
830 2, /* min_div_recip_mul_df. */
831 0, /* max_case_values. */
832 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
833 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
834 &thunderxt88_prefetch_tune
837 static const struct tune_params thunderx_tunings =
839 &thunderx_extra_costs,
840 &generic_addrcost_table,
841 &thunderx_regmove_cost,
842 &thunderx_vector_cost,
843 &generic_branch_cost,
844 &generic_approx_modes,
845 6, /* memmov_cost */
846 2, /* issue_rate */
847 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
848 "8", /* function_align. */
849 "8", /* jump_align. */
850 "8", /* loop_align. */
851 2, /* int_reassoc_width. */
852 4, /* fp_reassoc_width. */
853 1, /* vec_reassoc_width. */
854 2, /* min_div_recip_mul_sf. */
855 2, /* min_div_recip_mul_df. */
856 0, /* max_case_values. */
857 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
858 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
859 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
860 &thunderx_prefetch_tune
863 static const struct tune_params xgene1_tunings =
865 &xgene1_extra_costs,
866 &xgene1_addrcost_table,
867 &xgene1_regmove_cost,
868 &xgene1_vector_cost,
869 &generic_branch_cost,
870 &xgene1_approx_modes,
871 6, /* memmov_cost */
872 4, /* issue_rate */
873 AARCH64_FUSE_NOTHING, /* fusible_ops */
874 "16", /* function_align. */
875 "8", /* jump_align. */
876 "16", /* loop_align. */
877 2, /* int_reassoc_width. */
878 4, /* fp_reassoc_width. */
879 1, /* vec_reassoc_width. */
880 2, /* min_div_recip_mul_sf. */
881 2, /* min_div_recip_mul_df. */
882 0, /* max_case_values. */
883 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
884 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
885 &generic_prefetch_tune
888 static const struct tune_params qdf24xx_tunings =
890 &qdf24xx_extra_costs,
891 &qdf24xx_addrcost_table,
892 &qdf24xx_regmove_cost,
893 &generic_vector_cost,
894 &generic_branch_cost,
895 &generic_approx_modes,
896 4, /* memmov_cost */
897 4, /* issue_rate */
898 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
899	   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
900 "16", /* function_align. */
901 "8", /* jump_align. */
902 "16", /* loop_align. */
903 2, /* int_reassoc_width. */
904 4, /* fp_reassoc_width. */
905 1, /* vec_reassoc_width. */
906 2, /* min_div_recip_mul_sf. */
907 2, /* min_div_recip_mul_df. */
908 0, /* max_case_values. */
909 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
910 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
911 &qdf24xx_prefetch_tune
914 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
915 for now. */
916 static const struct tune_params saphira_tunings =
918 &generic_extra_costs,
919 &generic_addrcost_table,
920 &generic_regmove_cost,
921 &generic_vector_cost,
922 &generic_branch_cost,
923 &generic_approx_modes,
924 4, /* memmov_cost */
925 4, /* issue_rate */
926 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
927	   | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
928 "16", /* function_align. */
929 "8", /* jump_align. */
930 "16", /* loop_align. */
931 2, /* int_reassoc_width. */
932 4, /* fp_reassoc_width. */
933 1, /* vec_reassoc_width. */
934 2, /* min_div_recip_mul_sf. */
935 2, /* min_div_recip_mul_df. */
936 0, /* max_case_values. */
937 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
938 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
939 &generic_prefetch_tune
942 static const struct tune_params thunderx2t99_tunings =
944 &thunderx2t99_extra_costs,
945 &thunderx2t99_addrcost_table,
946 &thunderx2t99_regmove_cost,
947 &thunderx2t99_vector_cost,
948 &generic_branch_cost,
949 &generic_approx_modes,
950 4, /* memmov_cost. */
951 4, /* issue_rate. */
952 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
953 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
954 "16", /* function_align. */
955 "8", /* jump_align. */
956 "16", /* loop_align. */
957 3, /* int_reassoc_width. */
958 2, /* fp_reassoc_width. */
959 2, /* vec_reassoc_width. */
960 2, /* min_div_recip_mul_sf. */
961 2, /* min_div_recip_mul_df. */
962 0, /* max_case_values. */
963 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
964 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
965 &thunderx2t99_prefetch_tune
968 /* Support for fine-grained override of the tuning structures. */
969 struct aarch64_tuning_override_function
971 const char* name;
972 void (*parse_override)(const char*, struct tune_params*);
975 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
976 static void aarch64_parse_tune_string (const char*, struct tune_params*);
978 static const struct aarch64_tuning_override_function
979 aarch64_tuning_override_functions[] =
981 { "fuse", aarch64_parse_fuse_string },
982 { "tune", aarch64_parse_tune_string },
983 { NULL, NULL }
986 /* A processor implementing AArch64. */
987 struct processor
989 const char *const name;
990 enum aarch64_processor ident;
991 enum aarch64_processor sched_core;
992 enum aarch64_arch arch;
993 unsigned architecture_version;
994 const unsigned long flags;
995 const struct tune_params *const tune;
998 /* Architectures implementing AArch64. */
999 static const struct processor all_architectures[] =
1001 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1002 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1003 #include "aarch64-arches.def"
1004 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1007 /* Processor cores implementing AArch64. */
1008 static const struct processor all_cores[] =
1010 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1011 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1012 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1013 FLAGS, &COSTS##_tunings},
1014 #include "aarch64-cores.def"
1015 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1016 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1017 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1021 /* Target specification. These are populated by the -march, -mtune, -mcpu
1022 handling code or by target attributes. */
1023 static const struct processor *selected_arch;
1024 static const struct processor *selected_cpu;
1025 static const struct processor *selected_tune;
1027 /* The current tuning set. */
1028 struct tune_params aarch64_tune_params = generic_tunings;
1030 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1032 /* An ISA extension in the co-processor and main instruction set space. */
1033 struct aarch64_option_extension
1035 const char *const name;
1036 const unsigned long flags_on;
1037 const unsigned long flags_off;
1040 typedef enum aarch64_cond_code
1042 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1043 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1044 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1046 aarch64_cc;
1048 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1050 /* The condition codes of the processor, and the inverse function. */
1051 static const char * const aarch64_condition_codes[] =
1053 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1054 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1057 /* Generate code to enable conditional branches in functions over 1 MiB. */
1058 const char *
1059 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1060 const char * branch_format)
1062 rtx_code_label * tmp_label = gen_label_rtx ();
1063 char label_buf[256];
1064 char buffer[128];
1065 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1066 CODE_LABEL_NUMBER (tmp_label));
1067 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1068 rtx dest_label = operands[pos_label];
1069 operands[pos_label] = tmp_label;
1071 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1072 output_asm_insn (buffer, operands);
1074 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1075 operands[pos_label] = dest_label;
1076 output_asm_insn (buffer, operands);
1077 return "";
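/* For illustration, a sketch of the sequence emitted by the code above
   (the mnemonic, operand and label are made up; in practice they come from
   the caller-supplied BRANCH_FORMAT, OPERANDS and DEST):

	cbnz	x0, .Ltmp		// short-range branch to a fresh local label
	b	<far destination>	// unconditional B, range +/-128 MiB
   .Ltmp:

   For this to behave like the original conditional branch, the caller is
   expected to pass the inverted condition in BRANCH_FORMAT, so that the
   unconditional B performs the actual far jump.  */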
1080 void
1081 aarch64_err_no_fpadvsimd (machine_mode mode)
1083 if (TARGET_GENERAL_REGS_ONLY)
1084 if (FLOAT_MODE_P (mode))
1085 error ("%qs is incompatible with the use of floating-point types",
1086 "-mgeneral-regs-only");
1087 else
1088 error ("%qs is incompatible with the use of vector types",
1089 "-mgeneral-regs-only");
1090 else
1091 if (FLOAT_MODE_P (mode))
1092 error ("%qs feature modifier is incompatible with the use of"
1093 " floating-point types", "+nofp");
1094 else
1095 error ("%qs feature modifier is incompatible with the use of"
1096 " vector types", "+nofp");
1099 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1100 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1101 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1102 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1103 and GENERAL_REGS is lower than the memory cost (in this case the best class
1104	   is the lowest cost one).  Using POINTER_AND_FP_REGS irrespective of its
1105 cost results in bad allocations with many redundant int<->FP moves which
1106 are expensive on various cores.
1107 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1108 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1109 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1110 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1111 The result of this is that it is no longer inefficient to have a higher
1112 memory move cost than the register move cost.
1115 static reg_class_t
1116 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1117 reg_class_t best_class)
1119 machine_mode mode;
1121 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1122 || !reg_class_subset_p (FP_REGS, allocno_class))
1123 return allocno_class;
1125 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1126 || !reg_class_subset_p (FP_REGS, best_class))
1127 return best_class;
1129 mode = PSEUDO_REGNO_MODE (regno);
1130 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1133 static unsigned int
1134 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1136 if (GET_MODE_UNIT_SIZE (mode) == 4)
1137 return aarch64_tune_params.min_div_recip_mul_sf;
1138 return aarch64_tune_params.min_div_recip_mul_df;
1141 /* Return the reassociation width of treeop OPC with mode MODE. */
1142 static int
1143 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1145 if (VECTOR_MODE_P (mode))
1146 return aarch64_tune_params.vec_reassoc_width;
1147 if (INTEGRAL_MODE_P (mode))
1148 return aarch64_tune_params.int_reassoc_width;
1149 /* Avoid reassociating floating point addition so we emit more FMAs. */
1150 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1151 return aarch64_tune_params.fp_reassoc_width;
1152 return 1;
1155 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1156 unsigned
1157 aarch64_dbx_register_number (unsigned regno)
1159 if (GP_REGNUM_P (regno))
1160 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1161 else if (regno == SP_REGNUM)
1162 return AARCH64_DWARF_SP;
1163 else if (FP_REGNUM_P (regno))
1164 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1165 else if (PR_REGNUM_P (regno))
1166 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1167 else if (regno == VG_REGNUM)
1168 return AARCH64_DWARF_VG;
1170 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1171 equivalent DWARF register. */
1172 return DWARF_FRAME_REGISTERS;
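/* For example (a sketch): x3 maps to AARCH64_DWARF_R0 + 3, v5 to
   AARCH64_DWARF_V0 + 5 and p2 to AARCH64_DWARF_P0 + 2, while a register
   with no DWARF equivalent is reported as DWARF_FRAME_REGISTERS.  */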
1175 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1176 static bool
1177 aarch64_advsimd_struct_mode_p (machine_mode mode)
1179 return (TARGET_SIMD
1180 && (mode == OImode || mode == CImode || mode == XImode));
1183 /* Return true if MODE is an SVE predicate mode. */
1184 static bool
1185 aarch64_sve_pred_mode_p (machine_mode mode)
1187 return (TARGET_SVE
1188 && (mode == VNx16BImode
1189 || mode == VNx8BImode
1190 || mode == VNx4BImode
1191 || mode == VNx2BImode));
1194 /* Three mutually-exclusive flags describing a vector or predicate type. */
1195 const unsigned int VEC_ADVSIMD = 1;
1196 const unsigned int VEC_SVE_DATA = 2;
1197 const unsigned int VEC_SVE_PRED = 4;
1198 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1199 a structure of 2, 3 or 4 vectors. */
1200 const unsigned int VEC_STRUCT = 8;
1201 /* Useful combinations of the above. */
1202 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1203 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1205 /* Return a set of flags describing the vector properties of mode MODE.
1206 Ignore modes that are not supported by the current target. */
1207 static unsigned int
1208 aarch64_classify_vector_mode (machine_mode mode)
1210 if (aarch64_advsimd_struct_mode_p (mode))
1211 return VEC_ADVSIMD | VEC_STRUCT;
1213 if (aarch64_sve_pred_mode_p (mode))
1214 return VEC_SVE_PRED;
1216 scalar_mode inner = GET_MODE_INNER (mode);
1217 if (VECTOR_MODE_P (mode)
1218 && (inner == QImode
1219 || inner == HImode
1220 || inner == HFmode
1221 || inner == SImode
1222 || inner == SFmode
1223 || inner == DImode
1224 || inner == DFmode))
1226 if (TARGET_SVE)
1228 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1229 return VEC_SVE_DATA;
1230 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1231 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1232 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1233 return VEC_SVE_DATA | VEC_STRUCT;
1236 /* This includes V1DF but not V1DI (which doesn't exist). */
1237 if (TARGET_SIMD
1238 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1239 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1240 return VEC_ADVSIMD;
1243 return 0;
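/* A few worked examples of the classification above (a sketch; the exact
   results also depend on TARGET_SIMD/TARGET_SVE and, for fixed-length SVE,
   on the selected vector length):
     OImode, CImode, XImode  -> VEC_ADVSIMD | VEC_STRUCT
     VNx4BImode              -> VEC_SVE_PRED
     VNx4SImode              -> VEC_SVE_DATA
     V2DImode, V4SImode      -> VEC_ADVSIMD (64-bit and 128-bit Advanced SIMD)
     scalar modes            -> 0  */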
1246 /* Return true if MODE is any of the data vector modes, including
1247 structure modes. */
1248 static bool
1249 aarch64_vector_data_mode_p (machine_mode mode)
1251 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1254 /* Return true if MODE is an SVE data vector mode; either a single vector
1255 or a structure of vectors. */
1256 static bool
1257 aarch64_sve_data_mode_p (machine_mode mode)
1259 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1262 /* Implement target hook TARGET_ARRAY_MODE. */
1263 static opt_machine_mode
1264 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1266 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1267 && IN_RANGE (nelems, 2, 4))
1268 return mode_for_vector (GET_MODE_INNER (mode),
1269 GET_MODE_NUNITS (mode) * nelems);
1271 return opt_machine_mode ();
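/* For example (a sketch): a 3-element array of VNx4SImode, an SVE data
   vector of SImode elements, maps to the vector mode with
   3 * GET_MODE_NUNITS (VNx4SImode) SImode units, i.e. the SVE 3-vector
   tuple mode; other modes and element counts outside 2..4 return an empty
   opt_machine_mode and fall through to the generic handling.  */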
1274 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1275 static bool
1276 aarch64_array_mode_supported_p (machine_mode mode,
1277 unsigned HOST_WIDE_INT nelems)
1279 if (TARGET_SIMD
1280 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1281 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1282 && (nelems >= 2 && nelems <= 4))
1283 return true;
1285 return false;
1288 /* Return the SVE predicate mode to use for elements that have
1289 ELEM_NBYTES bytes, if such a mode exists. */
1291 opt_machine_mode
1292 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1294 if (TARGET_SVE)
1296 if (elem_nbytes == 1)
1297 return VNx16BImode;
1298 if (elem_nbytes == 2)
1299 return VNx8BImode;
1300 if (elem_nbytes == 4)
1301 return VNx4BImode;
1302 if (elem_nbytes == 8)
1303 return VNx2BImode;
1305 return opt_machine_mode ();
1308 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1310 static opt_machine_mode
1311 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1313 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1315 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1316 machine_mode pred_mode;
1317 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1318 return pred_mode;
1321 return default_get_mask_mode (nunits, nbytes);
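/* For example (a sketch, assuming TARGET_SVE): for a fully-packed SVE
   vector of 32-bit elements, NBYTES equals BYTES_PER_SVE_VECTOR and the
   element size works out to 4 bytes, so the mask mode is VNx4BImode;
   Advanced SIMD vectors fall through to default_get_mask_mode.  */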
1324 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1325 prefer to use the first arithmetic operand as the else value if
1326 the else value doesn't matter, since that exactly matches the SVE
1327 destructive merging form. For ternary operations we could either
1328 pick the first operand and use FMAD-like instructions or the last
1329 operand and use FMLA-like instructions; the latter seems more
1330 natural. */
1332 static tree
1333 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1335 return nops == 3 ? ops[2] : ops[0];
1338 /* Implement TARGET_HARD_REGNO_NREGS. */
1340 static unsigned int
1341 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1343 /* ??? Logically we should only need to provide a value when
1344 HARD_REGNO_MODE_OK says that the combination is valid,
1345 but at the moment we need to handle all modes. Just ignore
1346 any runtime parts for registers that can't store them. */
1347 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1348 switch (aarch64_regno_regclass (regno))
1350 case FP_REGS:
1351 case FP_LO_REGS:
1352 if (aarch64_sve_data_mode_p (mode))
1353 return exact_div (GET_MODE_SIZE (mode),
1354 BYTES_PER_SVE_VECTOR).to_constant ();
1355 return CEIL (lowest_size, UNITS_PER_VREG);
1356 case PR_REGS:
1357 case PR_LO_REGS:
1358 case PR_HI_REGS:
1359 return 1;
1360 default:
1361 return CEIL (lowest_size, UNITS_PER_WORD);
1363 gcc_unreachable ();
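/* For example (a sketch, assuming the usual 8-byte UNITS_PER_WORD and
   16-byte UNITS_PER_VREG): TImode needs CEIL (16, 8) == 2 general
   registers but only CEIL (16, 16) == 1 FP register; a single SVE data
   vector always occupies exactly one FP register, whatever its (possibly
   variable) size; and any predicate mode occupies one predicate register.  */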
1366 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1368 static bool
1369 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1371 if (GET_MODE_CLASS (mode) == MODE_CC)
1372 return regno == CC_REGNUM;
1374 if (regno == VG_REGNUM)
1375 /* This must have the same size as _Unwind_Word. */
1376 return mode == DImode;
1378 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1379 if (vec_flags & VEC_SVE_PRED)
1380 return PR_REGNUM_P (regno);
1382 if (PR_REGNUM_P (regno))
1383 return 0;
1385 if (regno == SP_REGNUM)
1386 /* The purpose of comparing with ptr_mode is to support the
1387 global register variable associated with the stack pointer
1388 register via the syntax of asm ("wsp") in ILP32. */
1389 return mode == Pmode || mode == ptr_mode;
1391 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1392 return mode == Pmode;
1394 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1395 return true;
1397 if (FP_REGNUM_P (regno))
1399 if (vec_flags & VEC_STRUCT)
1400 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1401 else
1402 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1405 return false;
1408 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1409 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1410 clobbers the top 64 bits when restoring the bottom 64 bits. */
1412 static bool
1413 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1415 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
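/* For example (a sketch): a V4SImode (16-byte) value that is live in v8
   across a call is treated as partially clobbered, because the callee
   only preserves the low 64 bits of v8-v15, whereas an 8-byte DFmode
   value in the same register is not.  */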
1418 /* Implement REGMODE_NATURAL_SIZE. */
1419 poly_uint64
1420 aarch64_regmode_natural_size (machine_mode mode)
1422 /* The natural size for SVE data modes is one SVE data vector,
1423 and similarly for predicates. We can't independently modify
1424 anything smaller than that. */
1425 /* ??? For now, only do this for variable-width SVE registers.
1426 Doing it for constant-sized registers breaks lower-subreg.c. */
1427 /* ??? And once that's fixed, we should probably have similar
1428 code for Advanced SIMD. */
1429 if (!aarch64_sve_vg.is_constant ())
1431 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1432 if (vec_flags & VEC_SVE_PRED)
1433 return BYTES_PER_SVE_PRED;
1434 if (vec_flags & VEC_SVE_DATA)
1435 return BYTES_PER_SVE_VECTOR;
1437 return UNITS_PER_WORD;
1440 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1441 machine_mode
1442 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1443 machine_mode mode)
1445 /* The predicate mode determines which bits are significant and
1446 which are "don't care". Decreasing the number of lanes would
1447 lose data while increasing the number of lanes would make bits
1448 unnecessarily significant. */
1449 if (PR_REGNUM_P (regno))
1450 return mode;
1451 if (known_ge (GET_MODE_SIZE (mode), 4))
1452 return mode;
1453 else
1454 return SImode;
1457 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1458 that strcpy from constants will be faster. */
1460 static HOST_WIDE_INT
1461 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1463 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1464 return MAX (align, BITS_PER_WORD);
1465 return align;
1468 /* Return true if calls to DECL should be treated as
1469	   long-calls (i.e. called via a register).  */
1470 static bool
1471 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1473 return false;
1476 /* Return true if calls to symbol-ref SYM should be treated as
1477	   long-calls (i.e. called via a register).  */
1478 bool
1479 aarch64_is_long_call_p (rtx sym)
1481 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1484 /* Return true if calls to symbol-ref SYM should not go through
1485 plt stubs. */
1487 bool
1488 aarch64_is_noplt_call_p (rtx sym)
1490 const_tree decl = SYMBOL_REF_DECL (sym);
1492 if (flag_pic
1493 && decl
1494 && (!flag_plt
1495 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1496 && !targetm.binds_local_p (decl))
1497 return true;
1499 return false;
1502 /* Return true if the offsets to a zero/sign-extract operation
1503 represent an expression that matches an extend operation. The
1504	   operands represent the parameters from
1506 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1507 bool
1508 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1509 rtx extract_imm)
1511 HOST_WIDE_INT mult_val, extract_val;
1513 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1514 return false;
1516 mult_val = INTVAL (mult_imm);
1517 extract_val = INTVAL (extract_imm);
1519 if (extract_val > 8
1520 && extract_val < GET_MODE_BITSIZE (mode)
1521 && exact_log2 (extract_val & ~7) > 0
1522 && (extract_val & 7) <= 4
1523 && mult_val == (1 << (extract_val & 7)))
1524 return true;
1526 return false;
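/* A worked example of the check above (a sketch): for DImode with
   EXTRACT_IMM == 34 and MULT_IMM == 4, we have 34 & ~7 == 32 (a power of
   two), 34 & 7 == 2 <= 4 and 4 == 1 << 2, so the function returns true.
   This corresponds to a 32-bit value extended to DImode and shifted left
   by two, matching the extend-and-shift form of extended-register
   operands.  */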
1529 /* Emit an insn that's a simple single-set. Both the operands must be
1530 known to be valid. */
1531 inline static rtx_insn *
1532 emit_set_insn (rtx x, rtx y)
1534 return emit_insn (gen_rtx_SET (x, y));
1537 /* X and Y are two things to compare using CODE. Emit the compare insn and
1538 return the rtx for register 0 in the proper mode. */
1540 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1542 machine_mode mode = SELECT_CC_MODE (code, x, y);
1543 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1545 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1546 return cc_reg;
1549 /* Build the SYMBOL_REF for __tls_get_addr. */
1551 static GTY(()) rtx tls_get_addr_libfunc;
1554 aarch64_tls_get_addr (void)
1556 if (!tls_get_addr_libfunc)
1557 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1558 return tls_get_addr_libfunc;
1561 /* Return the TLS model to use for ADDR. */
1563 static enum tls_model
1564 tls_symbolic_operand_type (rtx addr)
1566 enum tls_model tls_kind = TLS_MODEL_NONE;
1567 if (GET_CODE (addr) == CONST)
1569 poly_int64 addend;
1570 rtx sym = strip_offset (addr, &addend);
1571 if (GET_CODE (sym) == SYMBOL_REF)
1572 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1574 else if (GET_CODE (addr) == SYMBOL_REF)
1575 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1577 return tls_kind;
1580	/* We allow LO_SUMs among our legitimate addresses so that combine
1581	   can take care of combining addresses where necessary, but for
1582	   generation purposes we generate the address
1583	   as:
1584 RTL Absolute
1585 tmp = hi (symbol_ref); adrp x1, foo
1586 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1589 PIC TLS
1590 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1591 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1592 bl __tls_get_addr
1595 Load TLS symbol, depending on TLS mechanism and TLS access model.
1597 Global Dynamic - Traditional TLS:
1598 adrp tmp, :tlsgd:imm
1599 add dest, tmp, #:tlsgd_lo12:imm
1600 bl __tls_get_addr
1602 Global Dynamic - TLS Descriptors:
1603 adrp dest, :tlsdesc:imm
1604 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1605 add dest, dest, #:tlsdesc_lo12:imm
1606 blr tmp
1607 mrs tp, tpidr_el0
1608 add dest, dest, tp
1610 Initial Exec:
1611 mrs tp, tpidr_el0
1612 adrp tmp, :gottprel:imm
1613 ldr dest, [tmp, #:gottprel_lo12:imm]
1614 add dest, dest, tp
1616 Local Exec:
1617 mrs tp, tpidr_el0
1618 add t0, tp, #:tprel_hi12:imm, lsl #12
1619 add t0, t0, #:tprel_lo12_nc:imm
1622 static void
1623 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1624 enum aarch64_symbol_type type)
1626 switch (type)
1628 case SYMBOL_SMALL_ABSOLUTE:
1630 /* In ILP32, the mode of dest can be either SImode or DImode. */
1631 rtx tmp_reg = dest;
1632 machine_mode mode = GET_MODE (dest);
1634 gcc_assert (mode == Pmode || mode == ptr_mode);
1636 if (can_create_pseudo_p ())
1637 tmp_reg = gen_reg_rtx (mode);
1639 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1640 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1641 return;
1644 case SYMBOL_TINY_ABSOLUTE:
1645 emit_insn (gen_rtx_SET (dest, imm));
1646 return;
1648 case SYMBOL_SMALL_GOT_28K:
1650 machine_mode mode = GET_MODE (dest);
1651 rtx gp_rtx = pic_offset_table_rtx;
1652 rtx insn;
1653 rtx mem;
1655 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1656	   here before RTL expansion.  Tree IVOPTS generates RTL patterns to
1657	   compute rtx costs, in which case pic_offset_table_rtx is not
1658	   initialized.  In that case there is no need to generate the first adrp
1659	   instruction, as the final cost for a global variable access is
1660	   one instruction.  */
1661 if (gp_rtx != NULL)
1663	    /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1664	       use the page base as the GOT base, the first page may be wasted;
1665	       in the worst case there is only 28K of space for the GOT).
1667	       The generated instruction sequence for accessing a global variable is:
1670	       ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1672	       Only one instruction is needed.  But we must initialize
1673	       pic_offset_table_rtx properly.  We generate an initialization insn for
1674	       every global access, and allow CSE to remove all redundant ones.
1676	       The final instruction sequence will look like the following
1677	       for multiple global variable accesses.
1679 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1681 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1682 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1683 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1684 ... */
1686 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1687 crtl->uses_pic_offset_table = 1;
1688 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1690 if (mode != GET_MODE (gp_rtx))
1691 gp_rtx = gen_lowpart (mode, gp_rtx);
1695 if (mode == ptr_mode)
1697 if (mode == DImode)
1698 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1699 else
1700 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1702 mem = XVECEXP (SET_SRC (insn), 0, 0);
1704 else
1706 gcc_assert (mode == Pmode);
1708 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1709 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1712	  /* The operand is expected to be a MEM.  Whenever the related insn
1713	     pattern changes, the above code which calculates MEM should be
1714	     updated.  */
1715 gcc_assert (GET_CODE (mem) == MEM);
1716 MEM_READONLY_P (mem) = 1;
1717 MEM_NOTRAP_P (mem) = 1;
1718 emit_insn (insn);
1719 return;
1722 case SYMBOL_SMALL_GOT_4G:
1724 /* In ILP32, the mode of dest can be either SImode or DImode,
1725 while the got entry is always of SImode size. The mode of
1726 dest depends on how dest is used: if dest is assigned to a
1727	   pointer (e.g. in memory), it has SImode; it may have
1728	   DImode if dest is dereferenced to access the memory.
1729 This is why we have to handle three different ldr_got_small
1730 patterns here (two patterns for ILP32). */
1732 rtx insn;
1733 rtx mem;
1734 rtx tmp_reg = dest;
1735 machine_mode mode = GET_MODE (dest);
1737 if (can_create_pseudo_p ())
1738 tmp_reg = gen_reg_rtx (mode);
1740 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1741 if (mode == ptr_mode)
1743 if (mode == DImode)
1744 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1745 else
1746 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1748 mem = XVECEXP (SET_SRC (insn), 0, 0);
1750 else
1752 gcc_assert (mode == Pmode);
1754 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1755 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1758 gcc_assert (GET_CODE (mem) == MEM);
1759 MEM_READONLY_P (mem) = 1;
1760 MEM_NOTRAP_P (mem) = 1;
1761 emit_insn (insn);
1762 return;
1765 case SYMBOL_SMALL_TLSGD:
1767 rtx_insn *insns;
1768 machine_mode mode = GET_MODE (dest);
1769 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1771 start_sequence ();
1772 if (TARGET_ILP32)
1773 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1774 else
1775 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1776 insns = get_insns ();
1777 end_sequence ();
1779 RTL_CONST_CALL_P (insns) = 1;
1780 emit_libcall_block (insns, dest, result, imm);
1781 return;
1784 case SYMBOL_SMALL_TLSDESC:
1786 machine_mode mode = GET_MODE (dest);
1787 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1788 rtx tp;
1790 gcc_assert (mode == Pmode || mode == ptr_mode);
1792 /* In ILP32, the got entry is always of SImode size. Unlike
1793 small GOT, the dest is fixed at reg 0. */
1794 if (TARGET_ILP32)
1795 emit_insn (gen_tlsdesc_small_si (imm));
1796 else
1797 emit_insn (gen_tlsdesc_small_di (imm));
1798 tp = aarch64_load_tp (NULL);
1800 if (mode != Pmode)
1801 tp = gen_lowpart (mode, tp);
1803 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1804 if (REG_P (dest))
1805 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1806 return;
1809 case SYMBOL_SMALL_TLSIE:
1811 /* In ILP32, the mode of dest can be either SImode or DImode,
1812 while the got entry is always of SImode size. The mode of
1813 dest depends on how dest is used: if dest is assigned to a
1814	   pointer (e.g. in memory), it has SImode; it may have
1815	   DImode if dest is dereferenced to access the memory.
1816 This is why we have to handle three different tlsie_small
1817 patterns here (two patterns for ILP32). */
1818 machine_mode mode = GET_MODE (dest);
1819 rtx tmp_reg = gen_reg_rtx (mode);
1820 rtx tp = aarch64_load_tp (NULL);
1822 if (mode == ptr_mode)
1824 if (mode == DImode)
1825 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1826 else
1828 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1829 tp = gen_lowpart (mode, tp);
1832 else
1834 gcc_assert (mode == Pmode);
1835 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1838 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1839 if (REG_P (dest))
1840 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1841 return;
1844 case SYMBOL_TLSLE12:
1845 case SYMBOL_TLSLE24:
1846 case SYMBOL_TLSLE32:
1847 case SYMBOL_TLSLE48:
1849 machine_mode mode = GET_MODE (dest);
1850 rtx tp = aarch64_load_tp (NULL);
1852 if (mode != Pmode)
1853 tp = gen_lowpart (mode, tp);
1855 switch (type)
1857 case SYMBOL_TLSLE12:
1858 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1859 (dest, tp, imm));
1860 break;
1861 case SYMBOL_TLSLE24:
1862 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1863 (dest, tp, imm));
1864 break;
1865 case SYMBOL_TLSLE32:
1866 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1867 (dest, imm));
1868 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1869 (dest, dest, tp));
1870 break;
1871 case SYMBOL_TLSLE48:
1872 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1873 (dest, imm));
1874 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1875 (dest, dest, tp));
1876 break;
1877 default:
1878 gcc_unreachable ();
1881 if (REG_P (dest))
1882 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1883 return;
1886 case SYMBOL_TINY_GOT:
1887 emit_insn (gen_ldr_got_tiny (dest, imm));
1888 return;
1890 case SYMBOL_TINY_TLSIE:
1892 machine_mode mode = GET_MODE (dest);
1893 rtx tp = aarch64_load_tp (NULL);
1895 if (mode == ptr_mode)
1897 if (mode == DImode)
1898 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1899 else
1901 tp = gen_lowpart (mode, tp);
1902 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1905 else
1907 gcc_assert (mode == Pmode);
1908 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1911 if (REG_P (dest))
1912 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1913 return;
1916 default:
1917 gcc_unreachable ();
1921 /* Emit a move from SRC to DEST. Assume that the move expanders can
1922 handle all moves if !can_create_pseudo_p (). The distinction is
1923 important because, unlike emit_move_insn, the move expanders know
1924 how to force Pmode objects into the constant pool even when the
1925 constant pool address is not itself legitimate. */
1926 static rtx
1927 aarch64_emit_move (rtx dest, rtx src)
1929 return (can_create_pseudo_p ()
1930 ? emit_move_insn (dest, src)
1931 : emit_move_insn_1 (dest, src));
1934 /* Apply UNOPTAB to OP and store the result in DEST. */
1936 static void
1937 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1939 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1940 if (dest != tmp)
1941 emit_move_insn (dest, tmp);
1944 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1946 static void
1947 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1949 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1950 OPTAB_DIRECT);
1951 if (dest != tmp)
1952 emit_move_insn (dest, tmp);
1955 /* Split a 128-bit move operation into two 64-bit move operations,
1956 taking care to handle partial overlap of register to register
1957 copies. Special cases are needed when moving between GP regs and
1958 FP regs. SRC can be a register, constant or memory; DST a register
1959 or memory. If either operand is memory it must not have any side
1960 effects. */
1961 void
1962 aarch64_split_128bit_move (rtx dst, rtx src)
1964 rtx dst_lo, dst_hi;
1965 rtx src_lo, src_hi;
1967 machine_mode mode = GET_MODE (dst);
1969 gcc_assert (mode == TImode || mode == TFmode);
1970 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1971 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1973 if (REG_P (dst) && REG_P (src))
1975 int src_regno = REGNO (src);
1976 int dst_regno = REGNO (dst);
1978 /* Handle FP <-> GP regs. */
1979 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1981 src_lo = gen_lowpart (word_mode, src);
1982 src_hi = gen_highpart (word_mode, src);
1984 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
1985 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
1986 return;
1988 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1990 dst_lo = gen_lowpart (word_mode, dst);
1991 dst_hi = gen_highpart (word_mode, dst);
1993 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
1994 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
1995 return;
1999 dst_lo = gen_lowpart (word_mode, dst);
2000 dst_hi = gen_highpart (word_mode, dst);
2001 src_lo = gen_lowpart (word_mode, src);
2002 src_hi = gen_highpart_mode (word_mode, mode, src);
2004 /* At most one pairing may overlap. */
2005 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2007 aarch64_emit_move (dst_hi, src_hi);
2008 aarch64_emit_move (dst_lo, src_lo);
2010 else
2012 aarch64_emit_move (dst_lo, src_lo);
2013 aarch64_emit_move (dst_hi, src_hi);
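/* Illustrative example (added for exposition, not in the upstream
   source; register choices are arbitrary): for a GP-to-GP TImode copy
   such as (x1,x2) <- (x0,x1), dst_lo (x1) overlaps src_hi (x1), so the
   high halves are copied first:

       mov  x2, x1   // dst_hi <- src_hi
       mov  x1, x0   // dst_lo <- src_lo

   Copying the low halves first would clobber x1 before it is read.  */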
2017 bool
2018 aarch64_split_128bit_move_p (rtx dst, rtx src)
2020 return (! REG_P (src)
2021 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2024 /* Split a complex SIMD combine. */
2026 void
2027 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2029 machine_mode src_mode = GET_MODE (src1);
2030 machine_mode dst_mode = GET_MODE (dst);
2032 gcc_assert (VECTOR_MODE_P (dst_mode));
2033 gcc_assert (register_operand (dst, dst_mode)
2034 && register_operand (src1, src_mode)
2035 && register_operand (src2, src_mode));
2037 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2038 return;
2041 /* Split a complex SIMD move. */
2043 void
2044 aarch64_split_simd_move (rtx dst, rtx src)
2046 machine_mode src_mode = GET_MODE (src);
2047 machine_mode dst_mode = GET_MODE (dst);
2049 gcc_assert (VECTOR_MODE_P (dst_mode));
2051 if (REG_P (dst) && REG_P (src))
2053 gcc_assert (VECTOR_MODE_P (src_mode));
2054 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
2058 bool
2059 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2060 machine_mode ymode, rtx y)
2062 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2063 gcc_assert (r != NULL);
2064 return rtx_equal_p (x, r);
2068 static rtx
2069 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2071 if (can_create_pseudo_p ())
2072 return force_reg (mode, value);
2073 else
2075 gcc_assert (x);
2076 aarch64_emit_move (x, value);
2077 return x;
2081 /* Return true if we can move VALUE into a register using a single
2082 CNT[BHWD] instruction. */
2084 static bool
2085 aarch64_sve_cnt_immediate_p (poly_int64 value)
2087 HOST_WIDE_INT factor = value.coeffs[0];
2088 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2089 return (value.coeffs[1] == factor
2090 && IN_RANGE (factor, 2, 16 * 16)
2091 && (factor & 1) == 0
2092 && factor <= 16 * (factor & -factor));
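/* Worked example (added for exposition, not in the upstream source):
   a poly_int64 of (6, 6) represents the runtime value 6 * VQ, where
   VQ is the number of 128-bit quadwords in an SVE vector.  FACTOR is
   6: it is even, lies in [2, 256] and satisfies 6 <= 16 * (6 & -6)
   = 32, so it can be loaded with a single "cntd ..., all, mul #3".
   A factor of 34 fails the last test (34 > 16 * 2) and needs a
   longer sequence.  */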
2095 /* Likewise for rtx X. */
2097 bool
2098 aarch64_sve_cnt_immediate_p (rtx x)
2100 poly_int64 value;
2101 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2104 /* Return the asm string for an instruction with a CNT-like vector size
2105 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2106 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2107 first part of the operands template (the part that comes before the
2108 vector size itself). FACTOR is the number of quadwords.
2109 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2110 If it is zero, we can use any element size. */
2112 static char *
2113 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2114 unsigned int factor,
2115 unsigned int nelts_per_vq)
2117 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2119 if (nelts_per_vq == 0)
2120 /* There is some overlap in the ranges of the four CNT instructions.
2121 Here we always use the smallest possible element size, so that the
2122 multiplier is 1 wherever possible. */
2123 nelts_per_vq = factor & -factor;
2124 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2125 gcc_assert (IN_RANGE (shift, 1, 4));
2126 char suffix = "dwhb"[shift - 1];
2128 factor >>= shift;
2129 unsigned int written;
2130 if (factor == 1)
2131 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2132 prefix, suffix, operands);
2133 else
2134 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2135 prefix, suffix, operands, factor);
2136 gcc_assert (written < sizeof (buffer));
2137 return buffer;
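/* Illustrative examples of the returned templates (added for
   exposition, not in the upstream source; the prefixes and operand
   strings are just sample arguments):

     aarch64_output_sve_cnt_immediate ("cnt", "%x0", 2, 0)
       -> "cntd\t%x0"                (factor 2, doubleword elements)
     aarch64_output_sve_cnt_immediate ("inc", "%x0", 6, 0)
       -> "incd\t%x0, all, mul #3"   (shift 1, factor 6 >> 1 == 3)
     aarch64_output_sve_cnt_immediate ("inc", "%x0", 32, 0)
       -> "incb\t%x0, all, mul #2"   (shift 4, factor 32 >> 4 == 2)  */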
2140 /* Return the asm string for an instruction with a CNT-like vector size
2141 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2142 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2143 first part of the operands template (the part that comes before the
2144 vector size itself). X is the value of the vector size operand,
2145 as a polynomial integer rtx. */
2147 char *
2148 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2149 rtx x)
2151 poly_int64 value = rtx_to_poly_int64 (x);
2152 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2153 return aarch64_output_sve_cnt_immediate (prefix, operands,
2154 value.coeffs[1], 0);
2157 /* Return true if we can add VALUE to a register using a single ADDVL
2158 or ADDPL instruction. */
2160 static bool
2161 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2163 HOST_WIDE_INT factor = value.coeffs[0];
2164 if (factor == 0 || value.coeffs[1] != factor)
2165 return false;
2166 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2167 and a value of 16 is one vector width. */
2168 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2169 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
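/* Worked examples (added for exposition, not in the upstream source):
   a factor of 48 is a multiple of 16 and within [-512, 496], so it can
   be added with "addvl ..., #3"; a factor of 6 is even and within
   [-64, 62], so it can be added with "addpl ..., #3"; an odd factor
   such as 7 is representable by neither instruction.  */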
2172 /* Likewise for rtx X. */
2174 bool
2175 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2177 poly_int64 value;
2178 return (poly_int_rtx_p (x, &value)
2179 && aarch64_sve_addvl_addpl_immediate_p (value));
2182 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2183 and storing the result in operand 0. */
2185 char *
2186 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2188 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2189 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2190 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2192 /* Use INC or DEC if possible. */
2193 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2195 if (aarch64_sve_cnt_immediate_p (offset_value))
2196 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2197 offset_value.coeffs[1], 0);
2198 if (aarch64_sve_cnt_immediate_p (-offset_value))
2199 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2200 -offset_value.coeffs[1], 0);
2203 int factor = offset_value.coeffs[1];
2204 if ((factor & 15) == 0)
2205 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2206 else
2207 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2208 return buffer;
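/* Illustrative examples (added for exposition, not in the upstream
   source): when DEST equals BASE and both are GP registers, an offset
   of 16 * VQ (one vector width) is printed as "incb\t%x0"; otherwise
   the same offset is printed as "addvl\t%x0, %x1, #1", and an offset
   of 2 * VQ (one predicate width) as "addpl\t%x0, %x1, #1".  */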
2211 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2212 instruction. If it is, store the number of elements in each vector
2213 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2214 factor in *FACTOR_OUT (if nonnull). */
2216 bool
2217 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2218 unsigned int *nelts_per_vq_out)
2220 rtx elt;
2221 poly_int64 value;
2223 if (!const_vec_duplicate_p (x, &elt)
2224 || !poly_int_rtx_p (elt, &value))
2225 return false;
2227 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2228 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2229 /* There's no vector INCB. */
2230 return false;
2232 HOST_WIDE_INT factor = value.coeffs[0];
2233 if (value.coeffs[1] != factor)
2234 return false;
2236 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2237 if ((factor % nelts_per_vq) != 0
2238 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2239 return false;
2241 if (factor_out)
2242 *factor_out = factor;
2243 if (nelts_per_vq_out)
2244 *nelts_per_vq_out = nelts_per_vq;
2245 return true;
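/* Worked example (added for exposition, not in the upstream source):
   for a VNx4SI constant in which every element is the poly_int64
   (20, 20), NELTS_PER_VQ is 128 / 32 = 4; 20 is a multiple of 4 and
   lies in [4, 64], so the constant is a valid INC/DEC immediate with
   *FACTOR_OUT == 20, corresponding to "incw ..., all, mul #5".  */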
2248 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2249 instruction. */
2251 bool
2252 aarch64_sve_inc_dec_immediate_p (rtx x)
2254 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2257 /* Return the asm template for an SVE vector INC or DEC instruction.
2258 OPERANDS gives the operands before the vector count and X is the
2259 value of the vector count operand itself. */
2261 char *
2262 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2264 int factor;
2265 unsigned int nelts_per_vq;
2266 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2267 gcc_unreachable ();
2268 if (factor < 0)
2269 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2270 nelts_per_vq);
2271 else
2272 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2273 nelts_per_vq);
2276 static int
2277 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2278 scalar_int_mode mode)
2280 int i;
2281 unsigned HOST_WIDE_INT val, val2, mask;
2282 int one_match, zero_match;
2283 int num_insns;
2285 val = INTVAL (imm);
2287 if (aarch64_move_imm (val, mode))
2289 if (generate)
2290 emit_insn (gen_rtx_SET (dest, imm));
2291 return 1;
2294 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2295 (with XXXX non-zero). In that case check to see if the move can be done in
2296 a smaller mode. */
2297 val2 = val & 0xffffffff;
2298 if (mode == DImode
2299 && aarch64_move_imm (val2, SImode)
2300 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2302 if (generate)
2303 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2305 /* Check if we have to emit a second instruction by checking to see
2306 if any of the upper 32 bits of the original DI mode value is set. */
2307 if (val == val2)
2308 return 1;
2310 i = (val >> 48) ? 48 : 32;
2312 if (generate)
2313 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2314 GEN_INT ((val >> i) & 0xffff)));
2316 return 2;
2319 if ((val >> 32) == 0 || mode == SImode)
2321 if (generate)
2323 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2324 if (mode == SImode)
2325 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2326 GEN_INT ((val >> 16) & 0xffff)));
2327 else
2328 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2329 GEN_INT ((val >> 16) & 0xffff)));
2331 return 2;
2334 /* Remaining cases are all for DImode. */
2336 mask = 0xffff;
2337 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2338 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2339 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2340 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2342 if (zero_match != 2 && one_match != 2)
2344 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2345 For a 64-bit bitmask try whether changing 16 bits to all ones or
2346 zeroes creates a valid bitmask. To check any repeated bitmask,
2347 try using 16 bits from the other 32-bit half of val. */
2349 for (i = 0; i < 64; i += 16, mask <<= 16)
2351 val2 = val & ~mask;
2352 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2353 break;
2354 val2 = val | mask;
2355 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2356 break;
2357 val2 = val2 & ~mask;
2358 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2359 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2360 break;
2362 if (i != 64)
2364 if (generate)
2366 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2367 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2368 GEN_INT ((val >> i) & 0xffff)));
2370 return 2;
2374 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2375 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2376 otherwise skip zero bits. */
2378 num_insns = 1;
2379 mask = 0xffff;
2380 val2 = one_match > zero_match ? ~val : val;
2381 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2383 if (generate)
2384 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2385 ? (val | ~(mask << i))
2386 : (val & (mask << i)))));
2387 for (i += 16; i < 64; i += 16)
2389 if ((val2 & (mask << i)) == 0)
2390 continue;
2391 if (generate)
2392 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2393 GEN_INT ((val >> i) & 0xffff)));
2394 num_insns ++;
2397 return num_insns;
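/* Worked example (added for exposition, not in the upstream source;
   x0 is an arbitrary destination): for VAL == 0x12345678 in DImode,
   neither the full value nor its low 32 bits is a MOVZ/MOVN/bitmask
   immediate, but the upper 32 bits are zero, so the sequence is

       mov   x0, #0x5678
       movk  x0, #0x1234, lsl #16

   and the function returns 2.  With GENERATE false it only counts the
   instructions, which is how callers cost candidate immediates.  */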
2400 /* Return whether imm is a 128-bit immediate which is simple enough to
2401 expand inline. */
2402 bool
2403 aarch64_mov128_immediate (rtx imm)
2405 if (GET_CODE (imm) == CONST_INT)
2406 return true;
2408 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2410 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2411 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2413 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2414 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2418 /* Return the number of temporary registers that aarch64_add_offset_1
2419 would need to add OFFSET to a register. */
2421 static unsigned int
2422 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2424 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2427 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2428 a non-polynomial OFFSET. MODE is the mode of the addition.
2429 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2430 be set and CFA adjustments added to the generated instructions.
2432 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2433 temporary if register allocation is already complete. This temporary
2434 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2435 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2436 the immediate again.
2438 Since this function may be used to adjust the stack pointer, we must
2439 ensure that it cannot cause transient stack deallocation (for example
2440 by first incrementing SP and then decrementing when adjusting by a
2441 large immediate). */
2443 static void
2444 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2445 rtx src, HOST_WIDE_INT offset, rtx temp1,
2446 bool frame_related_p, bool emit_move_imm)
2448 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2449 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2451 HOST_WIDE_INT moffset = abs_hwi (offset);
2452 rtx_insn *insn;
2454 if (!moffset)
2456 if (!rtx_equal_p (dest, src))
2458 insn = emit_insn (gen_rtx_SET (dest, src));
2459 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2461 return;
2464 /* Single instruction adjustment. */
2465 if (aarch64_uimm12_shift (moffset))
2467 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2468 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2469 return;
2472 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2473 and either:
2475 a) the offset cannot be loaded by a 16-bit move or
2476 b) there is no spare register into which we can move it. */
2477 if (moffset < 0x1000000
2478 && ((!temp1 && !can_create_pseudo_p ())
2479 || !aarch64_move_imm (moffset, mode)))
2481 HOST_WIDE_INT low_off = moffset & 0xfff;
2483 low_off = offset < 0 ? -low_off : low_off;
2484 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2485 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2486 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2487 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2488 return;
2491 /* Emit a move immediate if required and an addition/subtraction. */
2492 if (emit_move_imm)
2494 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2495 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2497 insn = emit_insn (offset < 0
2498 ? gen_sub3_insn (dest, src, temp1)
2499 : gen_add3_insn (dest, src, temp1));
2500 if (frame_related_p)
2502 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2503 rtx adj = plus_constant (mode, src, offset);
2504 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
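/* Illustrative sketch (added for exposition, not in the upstream
   source): adding OFFSET == 0x123456 with no spare register available
   takes the two-addition path, since the value fits in 24 bits but not
   in a single 12-bit (optionally shifted) immediate:

       add  dest, src, #0x456
       add  dest, dest, #0x123, lsl #12

   Both adjustments have the same sign, so the intermediate value never
   overshoots the final one, which matters when DEST is the stack
   pointer.  */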
2508 /* Return the number of temporary registers that aarch64_add_offset
2509 would need to move OFFSET into a register or add OFFSET to a register;
2510 ADD_P is true if we want the latter rather than the former. */
2512 static unsigned int
2513 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2515 /* This follows the same structure as aarch64_add_offset. */
2516 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2517 return 0;
2519 unsigned int count = 0;
2520 HOST_WIDE_INT factor = offset.coeffs[1];
2521 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2522 poly_int64 poly_offset (factor, factor);
2523 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2524 /* Need one register for the ADDVL/ADDPL result. */
2525 count += 1;
2526 else if (factor != 0)
2528 factor = abs (factor);
2529 if (factor > 16 * (factor & -factor))
2530 /* Need one register for the CNT result and one for the multiplication
2531 factor. If necessary, the second temporary can be reused for the
2532 constant part of the offset. */
2533 return 2;
2534 /* Need one register for the CNT result (which might then
2535 be shifted). */
2536 count += 1;
2538 return count + aarch64_add_offset_1_temporaries (constant);
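/* Worked example (added for exposition, not in the upstream source):
   for ADD_P and OFFSET == (35, 32), FACTOR is 32 and CONSTANT is 3.
   The (32, 32) part is an ADDVL immediate and needs one temporary for
   its result, while the constant 3 needs none, giving a total of 1.  */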
2541 /* If X can be represented as a poly_int64, return the number
2542 of temporaries that are required to add it to a register.
2543 Return -1 otherwise. */
2546 aarch64_add_offset_temporaries (rtx x)
2548 poly_int64 offset;
2549 if (!poly_int_rtx_p (x, &offset))
2550 return -1;
2551 return aarch64_offset_temporaries (true, offset);
2554 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2555 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2556 be set and CFA adjustments added to the generated instructions.
2558 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2559 temporary if register allocation is already complete. This temporary
2560 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2561 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2562 false to avoid emitting the immediate again.
2564 TEMP2, if nonnull, is a second temporary register that doesn't
2565 overlap either DEST or SRC.
2567 Since this function may be used to adjust the stack pointer, we must
2568 ensure that it cannot cause transient stack deallocation (for example
2569 by first incrementing SP and then decrementing when adjusting by a
2570 large immediate). */
2572 static void
2573 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2574 poly_int64 offset, rtx temp1, rtx temp2,
2575 bool frame_related_p, bool emit_move_imm = true)
2577 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2578 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2579 gcc_assert (temp1 == NULL_RTX
2580 || !frame_related_p
2581 || !reg_overlap_mentioned_p (temp1, dest));
2582 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2584 /* Try using ADDVL or ADDPL to add the whole value. */
2585 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2587 rtx offset_rtx = gen_int_mode (offset, mode);
2588 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2589 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2590 return;
2593 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2594 SVE vector register, over and above the minimum size of 128 bits.
2595 This is equivalent to half the value returned by CNTD with a
2596 vector shape of ALL. */
2597 HOST_WIDE_INT factor = offset.coeffs[1];
2598 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2600 /* Try using ADDVL or ADDPL to add the VG-based part. */
2601 poly_int64 poly_offset (factor, factor);
2602 if (src != const0_rtx
2603 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2605 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2606 if (frame_related_p)
2608 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2609 RTX_FRAME_RELATED_P (insn) = true;
2610 src = dest;
2612 else
2614 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2615 src = aarch64_force_temporary (mode, temp1, addr);
2616 temp1 = temp2;
2617 temp2 = NULL_RTX;
2620 /* Otherwise use a CNT-based sequence. */
2621 else if (factor != 0)
2623 /* Use a subtraction if we have a negative factor. */
2624 rtx_code code = PLUS;
2625 if (factor < 0)
2627 factor = -factor;
2628 code = MINUS;
2631 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2632 into the multiplication. */
2633 rtx val;
2634 int shift = 0;
2635 if (factor & 1)
2636 /* Use a right shift by 1. */
2637 shift = -1;
2638 else
2639 factor /= 2;
2640 HOST_WIDE_INT low_bit = factor & -factor;
2641 if (factor <= 16 * low_bit)
2643 if (factor > 16 * 8)
2645 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2646 the value with the minimum multiplier and shift it into
2647 position. */
2648 int extra_shift = exact_log2 (low_bit);
2649 shift += extra_shift;
2650 factor >>= extra_shift;
2652 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2654 else
2656 /* Use CNTD, then multiply it by FACTOR. */
2657 val = gen_int_mode (poly_int64 (2, 2), mode);
2658 val = aarch64_force_temporary (mode, temp1, val);
2660 /* Go back to using a negative multiplication factor if we have
2661 no register from which to subtract. */
2662 if (code == MINUS && src == const0_rtx)
2664 factor = -factor;
2665 code = PLUS;
2667 rtx coeff1 = gen_int_mode (factor, mode);
2668 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2669 val = gen_rtx_MULT (mode, val, coeff1);
2672 if (shift > 0)
2674 /* Multiply by 1 << SHIFT. */
2675 val = aarch64_force_temporary (mode, temp1, val);
2676 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2678 else if (shift == -1)
2680 /* Divide by 2. */
2681 val = aarch64_force_temporary (mode, temp1, val);
2682 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2685 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2686 if (src != const0_rtx)
2688 val = aarch64_force_temporary (mode, temp1, val);
2689 val = gen_rtx_fmt_ee (code, mode, src, val);
2691 else if (code == MINUS)
2693 val = aarch64_force_temporary (mode, temp1, val);
2694 val = gen_rtx_NEG (mode, val);
2697 if (constant == 0 || frame_related_p)
2699 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2700 if (frame_related_p)
2702 RTX_FRAME_RELATED_P (insn) = true;
2703 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2704 gen_rtx_SET (dest, plus_constant (Pmode, src,
2705 poly_offset)));
2707 src = dest;
2708 if (constant == 0)
2709 return;
2711 else
2713 src = aarch64_force_temporary (mode, temp1, val);
2714 temp1 = temp2;
2715 temp2 = NULL_RTX;
2718 emit_move_imm = true;
2721 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2722 frame_related_p, emit_move_imm);
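/* Illustrative sketch (added for exposition, not in the upstream
   source): for OFFSET == (48, 32), i.e. 32 * VQ + 16 bytes, the
   (32, 32) part is added first as "addvl dest, src, #2" and the
   remaining constant 16 is then handled by aarch64_add_offset_1 as
   "add dest, dest, #16".  VG-based parts that ADDVL/ADDPL cannot
   reach instead use the CNT-based sequence above: CNT[BHWD], an
   optional multiply or shift, then an add, subtract or negate.  */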
2725 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2726 than a poly_int64. */
2728 void
2729 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2730 rtx offset_rtx, rtx temp1, rtx temp2)
2732 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2733 temp1, temp2, false);
2736 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2737 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2738 if TEMP1 already contains abs (DELTA). */
2740 static inline void
2741 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2743 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2744 temp1, temp2, true, emit_move_imm);
2747 /* Subtract DELTA from the stack pointer, marking the instructions
2748 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2749 if nonnull. */
2751 static inline void
2752 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2754 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2755 temp1, temp2, frame_related_p);
2758 /* Set DEST to (vec_series BASE STEP). */
2760 static void
2761 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2763 machine_mode mode = GET_MODE (dest);
2764 scalar_mode inner = GET_MODE_INNER (mode);
2766 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2767 if (!aarch64_sve_index_immediate_p (base))
2768 base = force_reg (inner, base);
2769 if (!aarch64_sve_index_immediate_p (step))
2770 step = force_reg (inner, step);
2772 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
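/* Illustrative example (added for exposition, not in the upstream
   source; z0 is arbitrary): a VNx4SI series with BASE 0 and STEP 1
   can be emitted directly as "index z0.s, #0, #1", since both
   operands fit the immediate range [-16, 15]; a step of 100 would
   first be forced into a scalar register.  */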
2775 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2776 integer of mode INT_MODE. Return true on success. */
2778 static bool
2779 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2780 rtx src)
2782 /* If the constant is smaller than 128 bits, we can do the move
2783 using a vector of SRC_MODEs. */
2784 if (src_mode != TImode)
2786 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2787 GET_MODE_SIZE (src_mode));
2788 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2789 emit_move_insn (gen_lowpart (dup_mode, dest),
2790 gen_const_vec_duplicate (dup_mode, src));
2791 return true;
2794 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2795 src = force_const_mem (src_mode, src);
2796 if (!src)
2797 return false;
2799 /* Make sure that the address is legitimate. */
2800 if (!aarch64_sve_ld1r_operand_p (src))
2802 rtx addr = force_reg (Pmode, XEXP (src, 0));
2803 src = replace_equiv_address (src, addr);
2806 machine_mode mode = GET_MODE (dest);
2807 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2808 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2809 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2810 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2811 emit_insn (gen_rtx_SET (dest, src));
2812 return true;
2815 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2816 isn't a simple duplicate or series. */
2818 static void
2819 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2821 machine_mode mode = GET_MODE (src);
2822 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2823 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2824 gcc_assert (npatterns > 1);
2826 if (nelts_per_pattern == 1)
2828 /* The constant is a repeating sequence of at least two elements,
2829 where the repeating elements occupy no more than 128 bits.
2830 Get an integer representation of the replicated value. */
2831 scalar_int_mode int_mode;
2832 if (BYTES_BIG_ENDIAN)
2833 /* For now, always use LD1RQ to load the value on big-endian
2834 targets, since the handling of smaller integers includes a
2835 subreg that is semantically an element reverse. */
2836 int_mode = TImode;
2837 else
2839 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2840 gcc_assert (int_bits <= 128);
2841 int_mode = int_mode_for_size (int_bits, 0).require ();
2843 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2844 if (int_value
2845 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2846 return;
2849 /* Expand each pattern individually. */
2850 rtx_vector_builder builder;
2851 auto_vec<rtx, 16> vectors (npatterns);
2852 for (unsigned int i = 0; i < npatterns; ++i)
2854 builder.new_vector (mode, 1, nelts_per_pattern);
2855 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2856 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2857 vectors.quick_push (force_reg (mode, builder.build ()));
2860 /* Use permutes to interleave the separate vectors. */
2861 while (npatterns > 1)
2863 npatterns /= 2;
2864 for (unsigned int i = 0; i < npatterns; ++i)
2866 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2867 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2868 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2869 vectors[i] = tmp;
2872 gcc_assert (vectors[0] == dest);
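/* Illustrative sketch of the interleaving step (added for exposition,
   not in the upstream source): for NPATTERNS == 4 with patterns
   A, B, C, D, the four single-pattern vectors are combined as

       ZIP1 {A,A,...} {C,C,...} -> {A,C,A,C,...}
       ZIP1 {B,B,...} {D,D,...} -> {B,D,B,D,...}
       ZIP1 {A,C,...} {B,D,...} -> {A,B,C,D,A,B,C,D,...}

   so each round of permutes halves the number of vectors until only
   DEST remains.  */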
2875 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2876 is a pattern that can be used to set DEST to a replicated scalar
2877 element. */
2879 void
2880 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2881 rtx (*gen_vec_duplicate) (rtx, rtx))
2883 machine_mode mode = GET_MODE (dest);
2885 /* Check on what type of symbol it is. */
2886 scalar_int_mode int_mode;
2887 if ((GET_CODE (imm) == SYMBOL_REF
2888 || GET_CODE (imm) == LABEL_REF
2889 || GET_CODE (imm) == CONST
2890 || GET_CODE (imm) == CONST_POLY_INT)
2891 && is_a <scalar_int_mode> (mode, &int_mode))
2893 rtx mem;
2894 poly_int64 offset;
2895 HOST_WIDE_INT const_offset;
2896 enum aarch64_symbol_type sty;
2898 /* If we have (const (plus symbol offset)), separate out the offset
2899 before we start classifying the symbol. */
2900 rtx base = strip_offset (imm, &offset);
2902 /* We must always add an offset involving VL separately, rather than
2903 folding it into the relocation. */
2904 if (!offset.is_constant (&const_offset))
2906 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2907 emit_insn (gen_rtx_SET (dest, imm));
2908 else
2910 /* Do arithmetic on 32-bit values if the result is smaller
2911 than that. */
2912 if (partial_subreg_p (int_mode, SImode))
2914 /* It is invalid to do symbol calculations in modes
2915 narrower than SImode. */
2916 gcc_assert (base == const0_rtx);
2917 dest = gen_lowpart (SImode, dest);
2918 int_mode = SImode;
2920 if (base != const0_rtx)
2922 base = aarch64_force_temporary (int_mode, dest, base);
2923 aarch64_add_offset (int_mode, dest, base, offset,
2924 NULL_RTX, NULL_RTX, false);
2926 else
2927 aarch64_add_offset (int_mode, dest, base, offset,
2928 dest, NULL_RTX, false);
2930 return;
2933 sty = aarch64_classify_symbol (base, const_offset);
2934 switch (sty)
2936 case SYMBOL_FORCE_TO_MEM:
2937 if (const_offset != 0
2938 && targetm.cannot_force_const_mem (int_mode, imm))
2940 gcc_assert (can_create_pseudo_p ());
2941 base = aarch64_force_temporary (int_mode, dest, base);
2942 aarch64_add_offset (int_mode, dest, base, const_offset,
2943 NULL_RTX, NULL_RTX, false);
2944 return;
2947 mem = force_const_mem (ptr_mode, imm);
2948 gcc_assert (mem);
2950 /* If we aren't generating PC relative literals, then
2951 we need to expand the literal pool access carefully.
2952 This is something that needs to be done in a number
2953 of places, so could well live as a separate function. */
2954 if (!aarch64_pcrelative_literal_loads)
2956 gcc_assert (can_create_pseudo_p ());
2957 base = gen_reg_rtx (ptr_mode);
2958 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2959 if (ptr_mode != Pmode)
2960 base = convert_memory_address (Pmode, base);
2961 mem = gen_rtx_MEM (ptr_mode, base);
2964 if (int_mode != ptr_mode)
2965 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2967 emit_insn (gen_rtx_SET (dest, mem));
2969 return;
2971 case SYMBOL_SMALL_TLSGD:
2972 case SYMBOL_SMALL_TLSDESC:
2973 case SYMBOL_SMALL_TLSIE:
2974 case SYMBOL_SMALL_GOT_28K:
2975 case SYMBOL_SMALL_GOT_4G:
2976 case SYMBOL_TINY_GOT:
2977 case SYMBOL_TINY_TLSIE:
2978 if (const_offset != 0)
2980 gcc_assert (can_create_pseudo_p ());
2981 base = aarch64_force_temporary (int_mode, dest, base);
2982 aarch64_add_offset (int_mode, dest, base, const_offset,
2983 NULL_RTX, NULL_RTX, false);
2984 return;
2986 /* FALLTHRU */
2988 case SYMBOL_SMALL_ABSOLUTE:
2989 case SYMBOL_TINY_ABSOLUTE:
2990 case SYMBOL_TLSLE12:
2991 case SYMBOL_TLSLE24:
2992 case SYMBOL_TLSLE32:
2993 case SYMBOL_TLSLE48:
2994 aarch64_load_symref_appropriately (dest, imm, sty);
2995 return;
2997 default:
2998 gcc_unreachable ();
3002 if (!CONST_INT_P (imm))
3004 rtx base, step, value;
3005 if (GET_CODE (imm) == HIGH
3006 || aarch64_simd_valid_immediate (imm, NULL))
3007 emit_insn (gen_rtx_SET (dest, imm));
3008 else if (const_vec_series_p (imm, &base, &step))
3009 aarch64_expand_vec_series (dest, base, step);
3010 else if (const_vec_duplicate_p (imm, &value))
3012 /* If the constant is out of range of an SVE vector move,
3013 load it from memory if we can, otherwise move it into
3014 a register and use a DUP. */
3015 scalar_mode inner_mode = GET_MODE_INNER (mode);
3016 rtx op = force_const_mem (inner_mode, value);
3017 if (!op)
3018 op = force_reg (inner_mode, value);
3019 else if (!aarch64_sve_ld1r_operand_p (op))
3021 rtx addr = force_reg (Pmode, XEXP (op, 0));
3022 op = replace_equiv_address (op, addr);
3024 emit_insn (gen_vec_duplicate (dest, op));
3026 else if (GET_CODE (imm) == CONST_VECTOR
3027 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3028 aarch64_expand_sve_const_vector (dest, imm);
3029 else
3031 rtx mem = force_const_mem (mode, imm);
3032 gcc_assert (mem);
3033 emit_move_insn (dest, mem);
3036 return;
3039 aarch64_internal_mov_immediate (dest, imm, true,
3040 as_a <scalar_int_mode> (mode));
3043 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3044 that is known to contain PTRUE. */
3046 void
3047 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3049 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3050 gen_rtvec (2, pred, src),
3051 UNSPEC_MERGE_PTRUE)));
3054 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3055 operand is in memory. In this case we need to use the predicated LD1
3056 and ST1 instead of LDR and STR, both for correctness on big-endian
3057 targets and because LD1 and ST1 support a wider range of addressing modes.
3058 PRED_MODE is the mode of the predicate.
3060 See the comment at the head of aarch64-sve.md for details about the
3061 big-endian handling. */
3063 void
3064 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3066 machine_mode mode = GET_MODE (dest);
3067 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3068 if (!register_operand (src, mode)
3069 && !register_operand (dest, mode))
3071 rtx tmp = gen_reg_rtx (mode);
3072 if (MEM_P (src))
3073 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3074 else
3075 emit_move_insn (tmp, src);
3076 src = tmp;
3078 aarch64_emit_sve_pred_move (dest, ptrue, src);
3081 /* Called only on big-endian targets. See whether an SVE vector move
3082 from SRC to DEST is effectively a REV[BHW] instruction, because at
3083 least one operand is a subreg of an SVE vector that has wider or
3084 narrower elements. Return true and emit the instruction if so.
3086 For example:
3088 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3090 represents a VIEW_CONVERT between the following vectors, viewed
3091 in memory order:
3093 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3094 R1: { [0], [1], [2], [3], ... }
3096 The high part of lane X in R2 should therefore correspond to lane X*2
3097 of R1, but the register representations are:
3099         msb                                      lsb
3100 R2: ...... [1].high  [1].low   [0].high  [0].low
3101 R1: ...... [3]       [2]       [1]       [0]
3103 where the low part of lane X in R2 instead corresponds to lane X*2 of R1.
3104 We therefore need a reverse operation to swap the high and low values
3105 around.
3107 This is purely an optimization. Without it we would spill the
3108 subreg operand to the stack in one mode and reload it in the
3109 other mode, which has the same effect as the REV. */
3111 bool
3112 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3114 gcc_assert (BYTES_BIG_ENDIAN);
3115 if (GET_CODE (dest) == SUBREG)
3116 dest = SUBREG_REG (dest);
3117 if (GET_CODE (src) == SUBREG)
3118 src = SUBREG_REG (src);
3120 /* The optimization handles two single SVE REGs with different element
3121 sizes. */
3122 if (!REG_P (dest)
3123 || !REG_P (src)
3124 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3125 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3126 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3127 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3128 return false;
3130 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3131 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3132 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3133 UNSPEC_REV_SUBREG);
3134 emit_insn (gen_rtx_SET (dest, unspec));
3135 return true;
3138 /* Return a copy of X with mode MODE, without changing its other
3139 attributes. Unlike gen_lowpart, this doesn't care whether the
3140 mode change is valid. */
3142 static rtx
3143 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3145 if (GET_MODE (x) == mode)
3146 return x;
3148 x = shallow_copy_rtx (x);
3149 set_mode_and_regno (x, mode, REGNO (x));
3150 return x;
3153 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3154 operands. */
3156 void
3157 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3159 /* Decide which REV operation we need. The mode with narrower elements
3160 determines the mode of the operands and the mode with the wider
3161 elements determines the reverse width. */
3162 machine_mode mode_with_wider_elts = GET_MODE (dest);
3163 machine_mode mode_with_narrower_elts = GET_MODE (src);
3164 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3165 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3166 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3168 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3169 unsigned int unspec;
3170 if (wider_bytes == 8)
3171 unspec = UNSPEC_REV64;
3172 else if (wider_bytes == 4)
3173 unspec = UNSPEC_REV32;
3174 else if (wider_bytes == 2)
3175 unspec = UNSPEC_REV16;
3176 else
3177 gcc_unreachable ();
3178 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3180 /* Emit:
3182 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3183 UNSPEC_MERGE_PTRUE))
3185 with the appropriate modes. */
3186 ptrue = gen_lowpart (pred_mode, ptrue);
3187 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3188 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3189 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3190 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3191 UNSPEC_MERGE_PTRUE);
3192 emit_insn (gen_rtx_SET (dest, src));
3195 static bool
3196 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3197 tree exp ATTRIBUTE_UNUSED)
3199 /* Currently, always true. */
3200 return true;
3203 /* Implement TARGET_PASS_BY_REFERENCE. */
3205 static bool
3206 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3207 machine_mode mode,
3208 const_tree type,
3209 bool named ATTRIBUTE_UNUSED)
3211 HOST_WIDE_INT size;
3212 machine_mode dummymode;
3213 int nregs;
3215 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3216 if (mode == BLKmode && type)
3217 size = int_size_in_bytes (type);
3218 else
3219 /* No frontends can create types with variable-sized modes, so we
3220 shouldn't be asked to pass or return them. */
3221 size = GET_MODE_SIZE (mode).to_constant ();
3223 /* Aggregates are passed by reference based on their size. */
3224 if (type && AGGREGATE_TYPE_P (type))
3226 size = int_size_in_bytes (type);
3229 /* Variable-sized arguments are always passed by reference. */
3230 if (size < 0)
3231 return true;
3233 /* Can this be a candidate to be passed in fp/simd register(s)? */
3234 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3235 &dummymode, &nregs,
3236 NULL))
3237 return false;
3239 /* Arguments which are variable sized or larger than 2 registers are
3240 passed by reference unless they are a homogeneous floating-point
3241 aggregate. */
3242 return size > 2 * UNITS_PER_WORD;
3245 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3246 static bool
3247 aarch64_return_in_msb (const_tree valtype)
3249 machine_mode dummy_mode;
3250 int dummy_int;
3252 /* Never happens in little-endian mode. */
3253 if (!BYTES_BIG_ENDIAN)
3254 return false;
3256 /* Only composite types smaller than or equal to 16 bytes can
3257 be potentially returned in registers. */
3258 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3259 || int_size_in_bytes (valtype) <= 0
3260 || int_size_in_bytes (valtype) > 16)
3261 return false;
3263 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3264 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3265 is always passed/returned in the least significant bits of fp/simd
3266 register(s). */
3267 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3268 &dummy_mode, &dummy_int, NULL))
3269 return false;
3271 return true;
3274 /* Implement TARGET_FUNCTION_VALUE.
3275 Define how to find the value returned by a function. */
3277 static rtx
3278 aarch64_function_value (const_tree type, const_tree func,
3279 bool outgoing ATTRIBUTE_UNUSED)
3281 machine_mode mode;
3282 int unsignedp;
3283 int count;
3284 machine_mode ag_mode;
3286 mode = TYPE_MODE (type);
3287 if (INTEGRAL_TYPE_P (type))
3288 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3290 if (aarch64_return_in_msb (type))
3292 HOST_WIDE_INT size = int_size_in_bytes (type);
3294 if (size % UNITS_PER_WORD != 0)
3296 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3297 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3301 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3302 &ag_mode, &count, NULL))
3304 if (!aarch64_composite_type_p (type, mode))
3306 gcc_assert (count == 1 && mode == ag_mode);
3307 return gen_rtx_REG (mode, V0_REGNUM);
3309 else
3311 int i;
3312 rtx par;
3314 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3315 for (i = 0; i < count; i++)
3317 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3318 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3319 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3320 XVECEXP (par, 0, i) = tmp;
3322 return par;
3325 else
3326 return gen_rtx_REG (mode, R0_REGNUM);
3329 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3330 Return true if REGNO is the number of a hard register in which the values
3331 of called function may come back. */
3333 static bool
3334 aarch64_function_value_regno_p (const unsigned int regno)
3336 /* A maximum of 16 bytes can be returned in the general registers. Examples
3337 of 16-byte return values are: 128-bit integers and 16-byte small
3338 structures (excluding homogeneous floating-point aggregates). */
3339 if (regno == R0_REGNUM || regno == R1_REGNUM)
3340 return true;
3342 /* Up to four fp/simd registers can return a function value, e.g. a
3343 homogeneous floating-point aggregate having four members. */
3344 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3345 return TARGET_FLOAT;
3347 return false;
3350 /* Implement TARGET_RETURN_IN_MEMORY.
3352 If the type T of the result of a function is such that
3353 void func (T arg)
3354 would require that arg be passed as a value in a register (or set of
3355 registers) according to the parameter passing rules, then the result
3356 is returned in the same registers as would be used for such an
3357 argument. */
3359 static bool
3360 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3362 HOST_WIDE_INT size;
3363 machine_mode ag_mode;
3364 int count;
3366 if (!AGGREGATE_TYPE_P (type)
3367 && TREE_CODE (type) != COMPLEX_TYPE
3368 && TREE_CODE (type) != VECTOR_TYPE)
3369 /* Simple scalar types are always returned in registers. */
3370 return false;
3372 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3373 type,
3374 &ag_mode,
3375 &count,
3376 NULL))
3377 return false;
3379 /* Types larger than 2 registers are returned in memory. */
3380 size = int_size_in_bytes (type);
3381 return (size < 0 || size > 2 * UNITS_PER_WORD);
3384 static bool
3385 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3386 const_tree type, int *nregs)
3388 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3389 return aarch64_vfp_is_call_or_return_candidate (mode,
3390 type,
3391 &pcum->aapcs_vfp_rmode,
3392 nregs,
3393 NULL);
3396 /* Given MODE and TYPE of a function argument, return the alignment in
3397 bits. The idea is to suppress any stronger alignment requested by
3398 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3399 This is a helper function for local use only. */
3401 static unsigned int
3402 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3404 if (!type)
3405 return GET_MODE_ALIGNMENT (mode);
3407 if (integer_zerop (TYPE_SIZE (type)))
3408 return 0;
3410 gcc_assert (TYPE_MODE (type) == mode);
3412 if (!AGGREGATE_TYPE_P (type))
3413 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3415 if (TREE_CODE (type) == ARRAY_TYPE)
3416 return TYPE_ALIGN (TREE_TYPE (type));
3418 unsigned int alignment = 0;
3419 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3420 if (TREE_CODE (field) == FIELD_DECL)
3421 alignment = std::max (alignment, DECL_ALIGN (field));
3423 return alignment;
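/* Illustrative example (added for exposition, not in the upstream
   source): for "struct s { long long x; char y; };" the maximum
   FIELD_DECL alignment is 64 bits, so the argument is treated as
   8-byte aligned even if the user declares the struct type itself
   with a larger aligned attribute, in line with the natural-alignment
   rule described above.  */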
3426 /* Layout a function argument according to the AAPCS64 rules. The rule
3427 numbers below refer to those in the AAPCS64 document. */
3429 static void
3430 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3431 const_tree type,
3432 bool named ATTRIBUTE_UNUSED)
3434 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3435 int ncrn, nvrn, nregs;
3436 bool allocate_ncrn, allocate_nvrn;
3437 HOST_WIDE_INT size;
3439 /* We need to do this once per argument. */
3440 if (pcum->aapcs_arg_processed)
3441 return;
3443 pcum->aapcs_arg_processed = true;
3445 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3446 if (type)
3447 size = int_size_in_bytes (type);
3448 else
3449 /* No frontends can create types with variable-sized modes, so we
3450 shouldn't be asked to pass or return them. */
3451 size = GET_MODE_SIZE (mode).to_constant ();
3452 size = ROUND_UP (size, UNITS_PER_WORD);
3454 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3455 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3456 mode,
3457 type,
3458 &nregs);
3460 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3461 The following code thus handles passing by SIMD/FP registers first. */
3463 nvrn = pcum->aapcs_nvrn;
3465 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3466 and homogeneous short-vector aggregates (HVA). */
3467 if (allocate_nvrn)
3469 if (!TARGET_FLOAT)
3470 aarch64_err_no_fpadvsimd (mode);
3472 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3474 pcum->aapcs_nextnvrn = nvrn + nregs;
3475 if (!aarch64_composite_type_p (type, mode))
3477 gcc_assert (nregs == 1);
3478 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3480 else
3482 rtx par;
3483 int i;
3484 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3485 for (i = 0; i < nregs; i++)
3487 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3488 V0_REGNUM + nvrn + i);
3489 rtx offset = gen_int_mode
3490 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3491 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3492 XVECEXP (par, 0, i) = tmp;
3494 pcum->aapcs_reg = par;
3496 return;
3498 else
3500 /* C.3 NSRN is set to 8. */
3501 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3502 goto on_stack;
3506 ncrn = pcum->aapcs_ncrn;
3507 nregs = size / UNITS_PER_WORD;
3509 /* C6 - C9, though the sign and zero extension semantics are
3510 handled elsewhere. This is the case where the argument fits
3511 entirely in general registers. */
3512 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3515 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3517 /* C.8 if the argument has an alignment of 16 then the NGRN is
3518 rounded up to the next even number. */
3519 if (nregs == 2
3520 && ncrn % 2
3521 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3522 comparison is there because for > 16 * BITS_PER_UNIT
3523 alignment nregs should be > 2 and therefore it should be
3524 passed by reference rather than value. */
3525 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3527 ++ncrn;
3528 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3531 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3532 A reg is still generated for it, but the caller should be smart
3533 enough not to use it. */
3534 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3535 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3536 else
3538 rtx par;
3539 int i;
3541 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3542 for (i = 0; i < nregs; i++)
3544 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3545 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3546 GEN_INT (i * UNITS_PER_WORD));
3547 XVECEXP (par, 0, i) = tmp;
3549 pcum->aapcs_reg = par;
3552 pcum->aapcs_nextncrn = ncrn + nregs;
3553 return;
3556 /* C.11 */
3557 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3559 /* The argument is passed on stack; record the needed number of words for
3560 this argument and align the total size if necessary. */
3561 on_stack:
3562 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3564 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3565 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3566 16 / UNITS_PER_WORD);
3567 return;
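/* Illustrative example of rule C.8 (added for exposition, not in the
   upstream source): a 16-byte struct whose members give it 16-byte
   alignment (e.g. one containing a __int128 field) needs two core
   registers.  If the next core register number is odd, it is rounded
   up to the next even number first, so the struct is passed in an
   aligned pair such as x2/x3 rather than x1/x2.  */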
3570 /* Implement TARGET_FUNCTION_ARG. */
3572 static rtx
3573 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3574 const_tree type, bool named)
3576 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3577 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3579 if (mode == VOIDmode)
3580 return NULL_RTX;
3582 aarch64_layout_arg (pcum_v, mode, type, named);
3583 return pcum->aapcs_reg;
3586 void
3587 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3588 const_tree fntype ATTRIBUTE_UNUSED,
3589 rtx libname ATTRIBUTE_UNUSED,
3590 const_tree fndecl ATTRIBUTE_UNUSED,
3591 unsigned n_named ATTRIBUTE_UNUSED)
3593 pcum->aapcs_ncrn = 0;
3594 pcum->aapcs_nvrn = 0;
3595 pcum->aapcs_nextncrn = 0;
3596 pcum->aapcs_nextnvrn = 0;
3597 pcum->pcs_variant = ARM_PCS_AAPCS64;
3598 pcum->aapcs_reg = NULL_RTX;
3599 pcum->aapcs_arg_processed = false;
3600 pcum->aapcs_stack_words = 0;
3601 pcum->aapcs_stack_size = 0;
3603 if (!TARGET_FLOAT
3604 && fndecl && TREE_PUBLIC (fndecl)
3605 && fntype && fntype != error_mark_node)
3607 const_tree type = TREE_TYPE (fntype);
3608 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3609 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3610 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3611 &mode, &nregs, NULL))
3612 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3614 return;
3617 static void
3618 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3619 machine_mode mode,
3620 const_tree type,
3621 bool named)
3623 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3624 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3626 aarch64_layout_arg (pcum_v, mode, type, named);
3627 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3628 != (pcum->aapcs_stack_words != 0));
3629 pcum->aapcs_arg_processed = false;
3630 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3631 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3632 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3633 pcum->aapcs_stack_words = 0;
3634 pcum->aapcs_reg = NULL_RTX;
3638 bool
3639 aarch64_function_arg_regno_p (unsigned regno)
3641 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3642 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3645 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3646 PARM_BOUNDARY bits of alignment, but will be given anything up
3647 to STACK_BOUNDARY bits if the type requires it. This makes sure
3648 that both before and after the layout of each argument, the Next
3649 Stacked Argument Address (NSAA) will have a minimum alignment of
3650 8 bytes. */
3652 static unsigned int
3653 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3655 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3656 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3659 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3661 static fixed_size_mode
3662 aarch64_get_reg_raw_mode (int regno)
3664 if (TARGET_SVE && FP_REGNUM_P (regno))
3665 /* Don't use the SVE part of the register for __builtin_apply and
3666 __builtin_return. The SVE registers aren't used by the normal PCS,
3667 so using them there would be a waste of time. The PCS extensions
3668 for SVE types are fundamentally incompatible with the
3669 __builtin_return/__builtin_apply interface. */
3670 return as_a <fixed_size_mode> (V16QImode);
3671 return default_get_reg_raw_mode (regno);
3674 /* Implement TARGET_FUNCTION_ARG_PADDING.
3676 Small aggregate types are placed in the lowest memory address.
3678 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3680 static pad_direction
3681 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3683 /* On little-endian targets, the least significant byte of every stack
3684 argument is passed at the lowest byte address of the stack slot. */
3685 if (!BYTES_BIG_ENDIAN)
3686 return PAD_UPWARD;
3688 /* Otherwise, integral, floating-point and pointer types are padded downward:
3689 the least significant byte of a stack argument is passed at the highest
3690 byte address of the stack slot. */
3691 if (type
3692 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3693 || POINTER_TYPE_P (type))
3694 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3695 return PAD_DOWNWARD;
3697 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3698 return PAD_UPWARD;
3701 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3703 It specifies padding for the last (and possibly only)
3704 element of a block move between registers and memory.
3705 Assuming the block is in memory, padding upward means that
3706 the last element is padded after its most significant byte,
3707 while with downward padding the last element is padded on
3708 its least significant byte side.
3710 Small aggregates and small complex types are always padded
3711 upwards.
3713 We don't need to worry about homogeneous floating-point or
3714 short-vector aggregates; their move is not affected by the
3715 padding direction determined here. Regardless of endianness,
3716 each element of such an aggregate is put in the least
3717 significant bits of a fp/simd register.
3719 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3720 register has useful data, and return the opposite if the most
3721 significant byte does. */
3723 bool
3724 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3725 bool first ATTRIBUTE_UNUSED)
3728 /* Small composite types are always padded upward. */
3729 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3731 HOST_WIDE_INT size;
3732 if (type)
3733 size = int_size_in_bytes (type);
3734 else
3735 /* No frontends can create types with variable-sized modes, so we
3736 shouldn't be asked to pass or return them. */
3737 size = GET_MODE_SIZE (mode).to_constant ();
3738 if (size < 2 * UNITS_PER_WORD)
3739 return true;
3742 /* Otherwise, use the default padding. */
3743 return !BYTES_BIG_ENDIAN;
3746 static scalar_int_mode
3747 aarch64_libgcc_cmp_return_mode (void)
3749 return SImode;
3752 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3754 /* We use the 12-bit shifted immediate arithmetic instructions so values
3755 must be a multiple of (1 << 12), i.e. 4096. */
3756 #define ARITH_FACTOR 4096
3758 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3759 #error Cannot use simple address calculation for stack probing
3760 #endif
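/* Illustrative note (not from the original source): ARITH_FACTOR matches
   the AArch64 ADD/SUB immediate encoding, which takes a 12-bit value
   optionally shifted left by 12.  For example, an adjustment of
   12288 (3 << 12) fits a single instruction, roughly:

       sub   x9, sp, 12288        // imm12 == 3, LSL #12

   whereas 12292 would not and would need an extra instruction to
   materialise.  This is why the #error above rejects a PROBE_INTERVAL
   that is not a multiple of ARITH_FACTOR.  */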
3762 /* The pair of scratch registers used for stack probing. */
3763 #define PROBE_STACK_FIRST_REG 9
3764 #define PROBE_STACK_SECOND_REG 10
3766 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3767 inclusive. These are offsets from the current stack pointer. */
3769 static void
3770 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3772 HOST_WIDE_INT size;
3773 if (!poly_size.is_constant (&size))
3775 sorry ("stack probes for SVE frames");
3776 return;
3779 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3781 /* See the same assertion on PROBE_INTERVAL above. */
3782 gcc_assert ((first % ARITH_FACTOR) == 0);
3784 /* See if we have a constant small number of probes to generate. If so,
3785 that's the easy case. */
3786 if (size <= PROBE_INTERVAL)
3788 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3790 emit_set_insn (reg1,
3791 plus_constant (Pmode,
3792 stack_pointer_rtx, -(first + base)));
3793 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3796 /* The run-time loop is made up of 8 insns in the generic case while the
3797 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3798 else if (size <= 4 * PROBE_INTERVAL)
3800 HOST_WIDE_INT i, rem;
3802 emit_set_insn (reg1,
3803 plus_constant (Pmode,
3804 stack_pointer_rtx,
3805 -(first + PROBE_INTERVAL)));
3806 emit_stack_probe (reg1);
3808 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3809 it exceeds SIZE. If only two probes are needed, this will not
3810 generate any code. Then probe at FIRST + SIZE. */
3811 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3813 emit_set_insn (reg1,
3814 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3815 emit_stack_probe (reg1);
3818 rem = size - (i - PROBE_INTERVAL);
3819 if (rem > 256)
3821 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3823 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3824 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3826 else
3827 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3830 /* Otherwise, do the same as above, but in a loop. Note that we must be
3831 extra careful with variables wrapping around because we might be at
3832 the very top (or the very bottom) of the address space and we have
3833 to be able to handle this case properly; in particular, we use an
3834 equality test for the loop condition. */
3835 else
3837 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3839 /* Step 1: round SIZE to the previous multiple of the interval. */
3841 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3844 /* Step 2: compute initial and final value of the loop counter. */
3846 /* TEST_ADDR = SP + FIRST. */
3847 emit_set_insn (reg1,
3848 plus_constant (Pmode, stack_pointer_rtx, -first));
3850 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3851 HOST_WIDE_INT adjustment = - (first + rounded_size);
3852 if (! aarch64_uimm12_shift (adjustment))
3854 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3855 true, Pmode);
3856 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3858 else
3859 emit_set_insn (reg2,
3860 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3862 /* Step 3: the loop
3866 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3867 probe at TEST_ADDR
3869 while (TEST_ADDR != LAST_ADDR)
3871 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3872 until it is equal to ROUNDED_SIZE. */
3874 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3877 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3878 that SIZE is equal to ROUNDED_SIZE. */
3880 if (size != rounded_size)
3882 HOST_WIDE_INT rem = size - rounded_size;
3884 if (rem > 256)
3886 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3888 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3889 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3891 else
3892 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3896 /* Make sure nothing is scheduled before we are done. */
3897 emit_insn (gen_blockage ());
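/* Worked example (illustrative only, with made-up values): for
   first == 4096 and size == 2048, the "size <= PROBE_INTERVAL" path above
   rounds size up to base == 4096, sets x9 = sp - (first + base) = sp - 8192
   and probes at x9 + (base - size), i.e. at sp - 6144 == sp - (first + size),
   emitting roughly:

       sub   x9, sp, 8192
       str   xzr, [x9, 2048]
   */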
3900 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3901 absolute addresses. */
3903 const char *
3904 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3906 static int labelno = 0;
3907 char loop_lab[32];
3908 rtx xops[2];
3910 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3912 /* Loop. */
3913 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3915 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3916 xops[0] = reg1;
3917 xops[1] = GEN_INT (PROBE_INTERVAL);
3918 output_asm_insn ("sub\t%0, %0, %1", xops);
3920 /* Probe at TEST_ADDR. */
3921 output_asm_insn ("str\txzr, [%0]", xops);
3923 /* Test if TEST_ADDR == LAST_ADDR. */
3924 xops[1] = reg2;
3925 output_asm_insn ("cmp\t%0, %1", xops);
3927 /* Branch. */
3928 fputs ("\tb.ne\t", asm_out_file);
3929 assemble_name_raw (asm_out_file, loop_lab);
3930 fputc ('\n', asm_out_file);
3932 return "";
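/* Illustrative output (assuming the default 4 kB probe interval and the
   scratch registers x9/x10 chosen above), the emitted loop looks roughly
   like:

   .LPSRL0:
       sub   x9, x9, 4096
       str   xzr, [x9]
       cmp   x9, x10
       b.ne  .LPSRL0
   */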
3935 /* Determine whether a frame chain needs to be generated. */
3936 static bool
3937 aarch64_needs_frame_chain (void)
3939 /* Force a frame chain for EH returns so the return address is at FP+8. */
3940 if (frame_pointer_needed || crtl->calls_eh_return)
3941 return true;
3943 /* A leaf function cannot have calls or write LR. */
3944 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
3946 /* Don't use a frame chain in leaf functions if leaf frame pointers
3947 are disabled. */
3948 if (flag_omit_leaf_frame_pointer && is_leaf)
3949 return false;
3951 return aarch64_use_frame_pointer;
3954 /* Mark the registers that need to be saved by the callee and calculate
3955 the size of the callee-saved registers area and frame record (both FP
3956 and LR may be omitted). */
3957 static void
3958 aarch64_layout_frame (void)
3960 HOST_WIDE_INT offset = 0;
3961 int regno, last_fp_reg = INVALID_REGNUM;
3963 if (reload_completed && cfun->machine->frame.laid_out)
3964 return;
3966 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
3968 #define SLOT_NOT_REQUIRED (-2)
3969 #define SLOT_REQUIRED (-1)
3971 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3972 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3974 /* First mark all the registers that really need to be saved... */
3975 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3976 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3978 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3979 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3981 /* ... that includes the eh data registers (if needed)... */
3982 if (crtl->calls_eh_return)
3983 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3984 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3985 = SLOT_REQUIRED;
3987 /* ... and any callee saved register that dataflow says is live. */
3988 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3989 if (df_regs_ever_live_p (regno)
3990 && (regno == R30_REGNUM
3991 || !call_used_regs[regno]))
3992 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3994 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3995 if (df_regs_ever_live_p (regno)
3996 && !call_used_regs[regno])
3998 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3999 last_fp_reg = regno;
4002 if (cfun->machine->frame.emit_frame_chain)
4004 /* FP and LR are placed in the linkage record. */
4005 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4006 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4007 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4008 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4009 offset = 2 * UNITS_PER_WORD;
4012 /* Now assign stack slots for them. */
4013 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4014 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4016 cfun->machine->frame.reg_offset[regno] = offset;
4017 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4018 cfun->machine->frame.wb_candidate1 = regno;
4019 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4020 cfun->machine->frame.wb_candidate2 = regno;
4021 offset += UNITS_PER_WORD;
4024 HOST_WIDE_INT max_int_offset = offset;
4025 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4026 bool has_align_gap = offset != max_int_offset;
4028 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4029 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4031 /* If there is an alignment gap between integer and fp callee-saves,
4032 allocate the last fp register to it if possible. */
4033 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4035 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4036 break;
4039 cfun->machine->frame.reg_offset[regno] = offset;
4040 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4041 cfun->machine->frame.wb_candidate1 = regno;
4042 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4043 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4044 cfun->machine->frame.wb_candidate2 = regno;
4045 offset += UNITS_PER_WORD;
4048 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4050 cfun->machine->frame.saved_regs_size = offset;
4052 HOST_WIDE_INT varargs_and_saved_regs_size
4053 = offset + cfun->machine->frame.saved_varargs_size;
4055 cfun->machine->frame.hard_fp_offset
4056 = aligned_upper_bound (varargs_and_saved_regs_size
4057 + get_frame_size (),
4058 STACK_BOUNDARY / BITS_PER_UNIT);
4060 /* Both these values are already aligned. */
4061 gcc_assert (multiple_p (crtl->outgoing_args_size,
4062 STACK_BOUNDARY / BITS_PER_UNIT));
4063 cfun->machine->frame.frame_size
4064 = (cfun->machine->frame.hard_fp_offset
4065 + crtl->outgoing_args_size);
4067 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4069 cfun->machine->frame.initial_adjust = 0;
4070 cfun->machine->frame.final_adjust = 0;
4071 cfun->machine->frame.callee_adjust = 0;
4072 cfun->machine->frame.callee_offset = 0;
4074 HOST_WIDE_INT max_push_offset = 0;
4075 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4076 max_push_offset = 512;
4077 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4078 max_push_offset = 256;
4080 HOST_WIDE_INT const_size, const_fp_offset;
4081 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4082 && const_size < max_push_offset
4083 && known_eq (crtl->outgoing_args_size, 0))
4085 /* Simple, small frame with no outgoing arguments:
4086 stp reg1, reg2, [sp, -frame_size]!
4087 stp reg3, reg4, [sp, 16] */
4088 cfun->machine->frame.callee_adjust = const_size;
4090 else if (known_lt (crtl->outgoing_args_size
4091 + cfun->machine->frame.saved_regs_size, 512)
4092 && !(cfun->calls_alloca
4093 && known_lt (cfun->machine->frame.hard_fp_offset,
4094 max_push_offset)))
4096 /* Frame with small outgoing arguments:
4097 sub sp, sp, frame_size
4098 stp reg1, reg2, [sp, outgoing_args_size]
4099 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4100 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4101 cfun->machine->frame.callee_offset
4102 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4104 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4105 && const_fp_offset < max_push_offset)
4107 /* Frame with large outgoing arguments but a small local area:
4108 stp reg1, reg2, [sp, -hard_fp_offset]!
4109 stp reg3, reg4, [sp, 16]
4110 sub sp, sp, outgoing_args_size */
4111 cfun->machine->frame.callee_adjust = const_fp_offset;
4112 cfun->machine->frame.final_adjust
4113 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4115 else
4117 /* Frame with large local area and outgoing arguments using frame pointer:
4118 sub sp, sp, hard_fp_offset
4119 stp x29, x30, [sp, 0]
4120 add x29, sp, 0
4121 stp reg3, reg4, [sp, 16]
4122 sub sp, sp, outgoing_args_size */
4123 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4124 cfun->machine->frame.final_adjust
4125 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4128 cfun->machine->frame.laid_out = true;
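/* Worked example (illustrative, not from the original source): a function
   that saves only x19 and x20 (saved_regs_size == 16), has 32 bytes of
   locals and no outgoing arguments gets frame_size == 48.  Since 48 is
   below max_push_offset and outgoing_args_size is zero, the first case
   above applies and callee_adjust == 48, so the prologue can open the
   whole frame with a single "stp x19, x20, [sp, -48]!".  */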
4131 /* Return true if the register REGNO is saved on entry to
4132 the current function. */
4134 static bool
4135 aarch64_register_saved_on_entry (int regno)
4137 return cfun->machine->frame.reg_offset[regno] >= 0;
4140 /* Return the next register, from REGNO up to LIMIT, that the callee
4141 needs to save. */
4143 static unsigned
4144 aarch64_next_callee_save (unsigned regno, unsigned limit)
4146 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4147 regno ++;
4148 return regno;
4151 /* Push the register number REGNO of mode MODE to the stack with write-back
4152 adjusting the stack by ADJUSTMENT. */
4154 static void
4155 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4156 HOST_WIDE_INT adjustment)
4158 rtx base_rtx = stack_pointer_rtx;
4159 rtx insn, reg, mem;
4161 reg = gen_rtx_REG (mode, regno);
4162 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4163 plus_constant (Pmode, base_rtx, -adjustment));
4164 mem = gen_frame_mem (mode, mem);
4166 insn = emit_move_insn (mem, reg);
4167 RTX_FRAME_RELATED_P (insn) = 1;
4170 /* Generate and return an instruction to store the pair of registers
4171 REG and REG2 of mode MODE to location BASE with write-back adjusting
4172 the stack location BASE by ADJUSTMENT. */
4174 static rtx
4175 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4176 HOST_WIDE_INT adjustment)
4178 switch (mode)
4180 case E_DImode:
4181 return gen_storewb_pairdi_di (base, base, reg, reg2,
4182 GEN_INT (-adjustment),
4183 GEN_INT (UNITS_PER_WORD - adjustment));
4184 case E_DFmode:
4185 return gen_storewb_pairdf_di (base, base, reg, reg2,
4186 GEN_INT (-adjustment),
4187 GEN_INT (UNITS_PER_WORD - adjustment));
4188 default:
4189 gcc_unreachable ();
4193 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4194 stack pointer by ADJUSTMENT. */
4196 static void
4197 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4199 rtx_insn *insn;
4200 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4202 if (regno2 == INVALID_REGNUM)
4203 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4205 rtx reg1 = gen_rtx_REG (mode, regno1);
4206 rtx reg2 = gen_rtx_REG (mode, regno2);
4208 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4209 reg2, adjustment));
4210 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4211 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4212 RTX_FRAME_RELATED_P (insn) = 1;
4215 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4216 adjusting it by ADJUSTMENT afterwards. */
4218 static rtx
4219 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4220 HOST_WIDE_INT adjustment)
4222 switch (mode)
4224 case E_DImode:
4225 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4226 GEN_INT (UNITS_PER_WORD));
4227 case E_DFmode:
4228 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4229 GEN_INT (UNITS_PER_WORD));
4230 default:
4231 gcc_unreachable ();
4235 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4236 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4237 into CFI_OPS. */
4239 static void
4240 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4241 rtx *cfi_ops)
4243 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4244 rtx reg1 = gen_rtx_REG (mode, regno1);
4246 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4248 if (regno2 == INVALID_REGNUM)
4250 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4251 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4252 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4254 else
4256 rtx reg2 = gen_rtx_REG (mode, regno2);
4257 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4258 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4259 reg2, adjustment));
4263 /* Generate and return a store pair instruction of mode MODE to store
4264 register REG1 to MEM1 and register REG2 to MEM2. */
4266 static rtx
4267 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4268 rtx reg2)
4270 switch (mode)
4272 case E_DImode:
4273 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4275 case E_DFmode:
4276 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4278 default:
4279 gcc_unreachable ();
4283 /* Generate and return a load pair instruction of mode MODE to load register
4284 REG1 from MEM1 and register REG2 from MEM2. */
4286 static rtx
4287 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4288 rtx mem2)
4290 switch (mode)
4292 case E_DImode:
4293 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4295 case E_DFmode:
4296 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4298 default:
4299 gcc_unreachable ();
4303 /* Return TRUE if return address signing should be enabled for the current
4304 function, otherwise return FALSE. */
4306 bool
4307 aarch64_return_address_signing_enabled (void)
4309 /* This function should only be called after the frame is laid out. */
4310 gcc_assert (cfun->machine->frame.laid_out);
4312 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4313 if its LR is pushed onto the stack. */
4314 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4315 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4316 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4319 /* Emit code to save the callee-saved registers from register number START
4320 to LIMIT to the stack at the location starting at offset START_OFFSET,
4321 skipping any write-back candidates if SKIP_WB is true. */
4323 static void
4324 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4325 unsigned start, unsigned limit, bool skip_wb)
4327 rtx_insn *insn;
4328 unsigned regno;
4329 unsigned regno2;
4331 for (regno = aarch64_next_callee_save (start, limit);
4332 regno <= limit;
4333 regno = aarch64_next_callee_save (regno + 1, limit))
4335 rtx reg, mem;
4336 poly_int64 offset;
4338 if (skip_wb
4339 && (regno == cfun->machine->frame.wb_candidate1
4340 || regno == cfun->machine->frame.wb_candidate2))
4341 continue;
4343 if (cfun->machine->reg_is_wrapped_separately[regno])
4344 continue;
4346 reg = gen_rtx_REG (mode, regno);
4347 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4348 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4349 offset));
4351 regno2 = aarch64_next_callee_save (regno + 1, limit);
4353 if (regno2 <= limit
4354 && !cfun->machine->reg_is_wrapped_separately[regno2]
4355 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4356 == cfun->machine->frame.reg_offset[regno2]))
4359 rtx reg2 = gen_rtx_REG (mode, regno2);
4360 rtx mem2;
4362 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4363 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4364 offset));
4365 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4366 reg2));
4368 /* The first part of a frame-related parallel insn is
4369 always assumed to be relevant to the frame
4370 calculations; subsequent parts are only
4371 frame-related if explicitly marked. */
4372 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4373 regno = regno2;
4375 else
4376 insn = emit_move_insn (mem, reg);
4378 RTX_FRAME_RELATED_P (insn) = 1;
4382 /* Emit code to restore the callee registers of mode MODE from register
4383 number START up to and including LIMIT. Restore from the stack offset
4384 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4385 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4387 static void
4388 aarch64_restore_callee_saves (machine_mode mode,
4389 poly_int64 start_offset, unsigned start,
4390 unsigned limit, bool skip_wb, rtx *cfi_ops)
4392 rtx base_rtx = stack_pointer_rtx;
4393 unsigned regno;
4394 unsigned regno2;
4395 poly_int64 offset;
4397 for (regno = aarch64_next_callee_save (start, limit);
4398 regno <= limit;
4399 regno = aarch64_next_callee_save (regno + 1, limit))
4401 if (cfun->machine->reg_is_wrapped_separately[regno])
4402 continue;
4404 rtx reg, mem;
4406 if (skip_wb
4407 && (regno == cfun->machine->frame.wb_candidate1
4408 || regno == cfun->machine->frame.wb_candidate2))
4409 continue;
4411 reg = gen_rtx_REG (mode, regno);
4412 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4413 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4415 regno2 = aarch64_next_callee_save (regno + 1, limit);
4417 if (regno2 <= limit
4418 && !cfun->machine->reg_is_wrapped_separately[regno2]
4419 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4420 == cfun->machine->frame.reg_offset[regno2]))
4422 rtx reg2 = gen_rtx_REG (mode, regno2);
4423 rtx mem2;
4425 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4426 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4427 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4429 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4430 regno = regno2;
4432 else
4433 emit_move_insn (reg, mem);
4434 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4438 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4439 of MODE. */
4441 static inline bool
4442 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4444 HOST_WIDE_INT multiple;
4445 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4446 && IN_RANGE (multiple, -8, 7));
4449 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4450 of MODE. */
4452 static inline bool
4453 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4455 HOST_WIDE_INT multiple;
4456 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4457 && IN_RANGE (multiple, 0, 63));
4460 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4461 of MODE. */
4463 bool
4464 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4466 HOST_WIDE_INT multiple;
4467 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4468 && IN_RANGE (multiple, -64, 63));
4471 /* Return true if OFFSET is a signed 9-bit value. */
4473 static inline bool
4474 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4475 poly_int64 offset)
4477 HOST_WIDE_INT const_offset;
4478 return (offset.is_constant (&const_offset)
4479 && IN_RANGE (const_offset, -256, 255));
4482 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4483 of MODE. */
4485 static inline bool
4486 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4488 HOST_WIDE_INT multiple;
4489 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4490 && IN_RANGE (multiple, -256, 255));
4493 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4494 of MODE. */
4496 static inline bool
4497 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4499 HOST_WIDE_INT multiple;
4500 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4501 && IN_RANGE (multiple, 0, 4095));
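/* Illustrative summary (not from the original source) of what the
   predicates above accept for a DImode (8-byte) access:
     - 4-bit signed scaled:      multiples of 8 in [-64, 56]
     - 6-bit unsigned scaled:    multiples of 8 in [0, 504]
     - 7-bit signed scaled:      multiples of 8 in [-512, 504]  (LDP/STP range)
     - 9-bit signed unscaled:    any byte offset in [-256, 255]
     - 9-bit signed scaled:      multiples of 8 in [-2048, 2040]
     - 12-bit unsigned scaled:   multiples of 8 in [0, 32760]   (LDR/STR range)
   */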
4504 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4506 static sbitmap
4507 aarch64_get_separate_components (void)
4509 aarch64_layout_frame ();
4511 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4512 bitmap_clear (components);
4514 /* The registers we need saved to the frame. */
4515 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4516 if (aarch64_register_saved_on_entry (regno))
4518 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4519 if (!frame_pointer_needed)
4520 offset += cfun->machine->frame.frame_size
4521 - cfun->machine->frame.hard_fp_offset;
4522 /* Check that we can access the stack slot of the register with one
4523 direct load with no adjustments needed. */
4524 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4525 bitmap_set_bit (components, regno);
4528 /* Don't mess with the hard frame pointer. */
4529 if (frame_pointer_needed)
4530 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4532 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4533 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4534 /* If aarch64_layout_frame has chosen registers to store/restore with
4535 writeback, don't interfere with them, to avoid having to output explicit
4536 stack adjustment instructions. */
4537 if (reg2 != INVALID_REGNUM)
4538 bitmap_clear_bit (components, reg2);
4539 if (reg1 != INVALID_REGNUM)
4540 bitmap_clear_bit (components, reg1);
4542 bitmap_clear_bit (components, LR_REGNUM);
4543 bitmap_clear_bit (components, SP_REGNUM);
4545 return components;
4548 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4550 static sbitmap
4551 aarch64_components_for_bb (basic_block bb)
4553 bitmap in = DF_LIVE_IN (bb);
4554 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4555 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4557 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4558 bitmap_clear (components);
4560 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4561 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4562 if ((!call_used_regs[regno])
4563 && (bitmap_bit_p (in, regno)
4564 || bitmap_bit_p (gen, regno)
4565 || bitmap_bit_p (kill, regno)))
4567 unsigned regno2, offset, offset2;
4568 bitmap_set_bit (components, regno);
4570 /* If there is a callee-save at an adjacent offset, add it too
4571 to increase the use of LDP/STP. */
4572 offset = cfun->machine->frame.reg_offset[regno];
4573 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4575 if (regno2 <= LAST_SAVED_REGNUM)
4577 offset2 = cfun->machine->frame.reg_offset[regno2];
4578 if ((offset & ~8) == (offset2 & ~8))
4579 bitmap_set_bit (components, regno2);
4583 return components;
4586 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4587 Nothing to do for aarch64. */
4589 static void
4590 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4594 /* Return the next set bit in BMP from START onwards. Return the total number
4595 of bits in BMP if no set bit is found at or after START. */
4597 static unsigned int
4598 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4600 unsigned int nbits = SBITMAP_SIZE (bmp);
4601 if (start == nbits)
4602 return start;
4604 gcc_assert (start < nbits);
4605 for (unsigned int i = start; i < nbits; i++)
4606 if (bitmap_bit_p (bmp, i))
4607 return i;
4609 return nbits;
4612 /* Do the work for aarch64_emit_prologue_components and
4613 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4614 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4615 for these components or the epilogue sequence. That is, it determines
4616 whether we should emit stores or loads and what kind of CFA notes to attach
4617 to the insns. Otherwise the logic for the two sequences is very
4618 similar. */
4620 static void
4621 aarch64_process_components (sbitmap components, bool prologue_p)
4623 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4624 ? HARD_FRAME_POINTER_REGNUM
4625 : STACK_POINTER_REGNUM);
4627 unsigned last_regno = SBITMAP_SIZE (components);
4628 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4629 rtx_insn *insn = NULL;
4631 while (regno != last_regno)
4633 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved,
4634 so DFmode for the vector registers is enough. */
4635 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4636 rtx reg = gen_rtx_REG (mode, regno);
4637 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4638 if (!frame_pointer_needed)
4639 offset += cfun->machine->frame.frame_size
4640 - cfun->machine->frame.hard_fp_offset;
4641 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4642 rtx mem = gen_frame_mem (mode, addr);
4644 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4645 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4646 /* No more registers to handle after REGNO.
4647 Emit a single save/restore and exit. */
4648 if (regno2 == last_regno)
4650 insn = emit_insn (set);
4651 RTX_FRAME_RELATED_P (insn) = 1;
4652 if (prologue_p)
4653 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4654 else
4655 add_reg_note (insn, REG_CFA_RESTORE, reg);
4656 break;
4659 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4660 /* The next register is not of the same class or its offset is not
4661 mergeable with the current one into a pair. */
4662 if (!satisfies_constraint_Ump (mem)
4663 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4664 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4665 GET_MODE_SIZE (mode)))
4667 insn = emit_insn (set);
4668 RTX_FRAME_RELATED_P (insn) = 1;
4669 if (prologue_p)
4670 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4671 else
4672 add_reg_note (insn, REG_CFA_RESTORE, reg);
4674 regno = regno2;
4675 continue;
4678 /* REGNO2 can be saved/restored in a pair with REGNO. */
4679 rtx reg2 = gen_rtx_REG (mode, regno2);
4680 if (!frame_pointer_needed)
4681 offset2 += cfun->machine->frame.frame_size
4682 - cfun->machine->frame.hard_fp_offset;
4683 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4684 rtx mem2 = gen_frame_mem (mode, addr2);
4685 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4686 : gen_rtx_SET (reg2, mem2);
4688 if (prologue_p)
4689 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4690 else
4691 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4693 RTX_FRAME_RELATED_P (insn) = 1;
4694 if (prologue_p)
4696 add_reg_note (insn, REG_CFA_OFFSET, set);
4697 add_reg_note (insn, REG_CFA_OFFSET, set2);
4699 else
4701 add_reg_note (insn, REG_CFA_RESTORE, reg);
4702 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4705 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4709 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4711 static void
4712 aarch64_emit_prologue_components (sbitmap components)
4714 aarch64_process_components (components, true);
4717 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4719 static void
4720 aarch64_emit_epilogue_components (sbitmap components)
4722 aarch64_process_components (components, false);
4725 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4727 static void
4728 aarch64_set_handled_components (sbitmap components)
4730 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4731 if (bitmap_bit_p (components, regno))
4732 cfun->machine->reg_is_wrapped_separately[regno] = true;
4735 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4736 is saved at BASE + OFFSET. */
4738 static void
4739 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4740 rtx base, poly_int64 offset)
4742 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4743 add_reg_note (insn, REG_CFA_EXPRESSION,
4744 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4747 /* AArch64 stack frames generated by this compiler look like:
4749 +-------------------------------+
4751 | incoming stack arguments |
4753 +-------------------------------+
4754 | | <-- incoming stack pointer (aligned)
4755 | callee-allocated save area |
4756 | for register varargs |
4758 +-------------------------------+
4759 | local variables | <-- frame_pointer_rtx
4761 +-------------------------------+
4762 | padding0 | \
4763 +-------------------------------+ |
4764 | callee-saved registers | | frame.saved_regs_size
4765 +-------------------------------+ |
4766 | LR' | |
4767 +-------------------------------+ |
4768 | FP' | / <- hard_frame_pointer_rtx (aligned)
4769 +-------------------------------+
4770 | dynamic allocation |
4771 +-------------------------------+
4772 | padding |
4773 +-------------------------------+
4774 | outgoing stack arguments | <-- arg_pointer
4776 +-------------------------------+
4777 | | <-- stack_pointer_rtx (aligned)
4779 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4780 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4781 unchanged. */
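/* Illustrative example (hypothetical function, not from the original
   source): with a frame chain, one extra callee-save (x19) and no
   outgoing arguments, the layout above typically materialises as:

       stp   x29, x30, [sp, -32]!    // frame record, with writeback
       mov   x29, sp                 // establish the frame chain
       str   x19, [sp, 16]           // remaining callee-save
   */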
4783 /* Generate the prologue instructions for entry into a function.
4784 Establish the stack frame by decreasing the stack pointer with a
4785 properly calculated size and, if necessary, create a frame record
4786 filled with the values of LR and previous frame pointer. The
4787 current FP is also set up if it is in use. */
4789 void
4790 aarch64_expand_prologue (void)
4792 aarch64_layout_frame ();
4794 poly_int64 frame_size = cfun->machine->frame.frame_size;
4795 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4796 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4797 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4798 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4799 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4800 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4801 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4802 rtx_insn *insn;
4804 /* Sign return address for functions. */
4805 if (aarch64_return_address_signing_enabled ())
4807 insn = emit_insn (gen_pacisp ());
4808 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4809 RTX_FRAME_RELATED_P (insn) = 1;
4812 if (flag_stack_usage_info)
4813 current_function_static_stack_size = constant_lower_bound (frame_size);
4815 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4817 if (crtl->is_leaf && !cfun->calls_alloca)
4819 if (maybe_gt (frame_size, PROBE_INTERVAL)
4820 && maybe_gt (frame_size, get_stack_check_protect ()))
4821 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4822 (frame_size
4823 - get_stack_check_protect ()));
4825 else if (maybe_gt (frame_size, 0))
4826 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4829 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4830 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4832 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4834 if (callee_adjust != 0)
4835 aarch64_push_regs (reg1, reg2, callee_adjust);
4837 if (emit_frame_chain)
4839 poly_int64 reg_offset = callee_adjust;
4840 if (callee_adjust == 0)
4842 reg1 = R29_REGNUM;
4843 reg2 = R30_REGNUM;
4844 reg_offset = callee_offset;
4845 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4847 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4848 stack_pointer_rtx, callee_offset,
4849 ip1_rtx, ip0_rtx, frame_pointer_needed);
4850 if (frame_pointer_needed && !frame_size.is_constant ())
4852 /* Variable-sized frames need to describe the save slot
4853 address using DW_CFA_expression rather than DW_CFA_offset.
4854 This means that, without taking further action, the
4855 locations of the registers that we've already saved would
4856 remain based on the stack pointer even after we redefine
4857 the CFA based on the frame pointer. We therefore need new
4858 DW_CFA_expressions to re-express the save slots with addresses
4859 based on the frame pointer. */
4860 rtx_insn *insn = get_last_insn ();
4861 gcc_assert (RTX_FRAME_RELATED_P (insn));
4863 /* Add an explicit CFA definition if this was previously
4864 implicit. */
4865 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4867 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4868 callee_offset);
4869 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4870 gen_rtx_SET (hard_frame_pointer_rtx, src));
4873 /* Change the save slot expressions for the registers that
4874 we've already saved. */
4875 reg_offset -= callee_offset;
4876 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4877 reg_offset + UNITS_PER_WORD);
4878 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4879 reg_offset);
4881 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4884 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4885 callee_adjust != 0 || emit_frame_chain);
4886 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4887 callee_adjust != 0 || emit_frame_chain);
4888 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4891 /* Return TRUE if we can use a simple_return insn.
4893 This function checks whether the callee-saved stack area is empty, which
4894 means no restore actions are needed. The pro_and_epilogue pass uses
4895 this to check whether shrink-wrapping is feasible. */
4897 bool
4898 aarch64_use_return_insn_p (void)
4900 if (!reload_completed)
4901 return false;
4903 if (crtl->profile)
4904 return false;
4906 aarch64_layout_frame ();
4908 return known_eq (cfun->machine->frame.frame_size, 0);
4911 /* Generate the epilogue instructions for returning from a function.
4912 This is almost exactly the reverse of the prologue sequence, except
4913 that we need to insert barriers to avoid scheduling loads that read
4914 from a deallocated stack, and we optimize the unwind records by
4915 emitting them all together if possible. */
4916 void
4917 aarch64_expand_epilogue (bool for_sibcall)
4919 aarch64_layout_frame ();
4921 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4922 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4923 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4924 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4925 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4926 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4927 rtx cfi_ops = NULL;
4928 rtx_insn *insn;
4929 /* A stack clash protection prologue may not have left IP0_REGNUM or
4930 IP1_REGNUM in a usable state. The same is true for allocations
4931 with an SVE component, since we then need both temporary registers
4932 for each allocation. */
4933 bool can_inherit_p = (initial_adjust.is_constant ()
4934 && final_adjust.is_constant ()
4935 && !flag_stack_clash_protection);
4937 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4938 bool need_barrier_p
4939 = maybe_ne (get_frame_size ()
4940 + cfun->machine->frame.saved_varargs_size, 0);
4942 /* Emit a barrier to prevent loads from a deallocated stack. */
4943 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4944 || cfun->calls_alloca
4945 || crtl->calls_eh_return)
4947 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4948 need_barrier_p = false;
4951 /* Restore the stack pointer from the frame pointer if it may not
4952 be the same as the stack pointer. */
4953 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4954 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4955 if (frame_pointer_needed
4956 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4957 /* If writeback is used when restoring callee-saves, the CFA
4958 is restored on the instruction doing the writeback. */
4959 aarch64_add_offset (Pmode, stack_pointer_rtx,
4960 hard_frame_pointer_rtx, -callee_offset,
4961 ip1_rtx, ip0_rtx, callee_adjust == 0);
4962 else
4963 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4964 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4966 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4967 callee_adjust != 0, &cfi_ops);
4968 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4969 callee_adjust != 0, &cfi_ops);
4971 if (need_barrier_p)
4972 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4974 if (callee_adjust != 0)
4975 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4977 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4979 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4980 insn = get_last_insn ();
4981 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4982 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4983 RTX_FRAME_RELATED_P (insn) = 1;
4984 cfi_ops = NULL;
4987 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4988 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4990 if (cfi_ops)
4992 /* Emit delayed restores and reset the CFA to be SP. */
4993 insn = get_last_insn ();
4994 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4995 REG_NOTES (insn) = cfi_ops;
4996 RTX_FRAME_RELATED_P (insn) = 1;
4999 /* We prefer to emit the combined return/authenticate instruction RETAA;
5000 however, there are three cases in which we must instead emit an explicit
5001 authentication instruction.
5003 1) Sibcalls don't return in a normal way, so if we're about to call one
5004 we must authenticate.
5006 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5007 generating code for !TARGET_ARMV8_3 we can't use it and must
5008 explicitly authenticate.
5010 3) On an eh_return path we make extra stack adjustments to update the
5011 canonical frame address to be the exception handler's CFA. We want
5012 to authenticate using the CFA of the function which calls eh_return. */
5014 if (aarch64_return_address_signing_enabled ()
5015 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5017 insn = emit_insn (gen_autisp ());
5018 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5019 RTX_FRAME_RELATED_P (insn) = 1;
5022 /* Stack adjustment for exception handler. */
5023 if (crtl->calls_eh_return)
5025 /* We need to unwind the stack by the offset computed by
5026 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5027 to be SP; letting the CFA move during this adjustment
5028 is just as correct as retaining the CFA from the body
5029 of the function. Therefore, do nothing special. */
5030 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5033 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5034 if (!for_sibcall)
5035 emit_jump_insn (ret_rtx);
5038 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5039 normally or return to a previous frame after unwinding.
5041 An EH return uses a single shared return sequence. The epilogue is
5042 exactly like a normal epilogue except that it has an extra input
5043 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5044 that must be applied after the frame has been destroyed. An extra label
5045 is inserted before the epilogue which initializes this register to zero,
5046 and this is the entry point for a normal return.
5048 An actual EH return updates the return address, initializes the stack
5049 adjustment and jumps directly into the epilogue (bypassing the zeroing
5050 of the adjustment). Since the return address is typically saved on the
5051 stack when a function makes a call, the saved LR must be updated outside
5052 the epilogue.
5054 This poses problems as the store is generated well before the epilogue,
5055 so the offset of LR is not known yet. Also optimizations will remove the
5056 store as it appears dead, even after the epilogue is generated (as the
5057 base or offset for loading LR is different in many cases).
5059 To avoid these problems this implementation forces the frame pointer
5060 in eh_return functions so that the location of LR is fixed and known early.
5061 It also marks the store volatile, so no optimization is permitted to
5062 remove the store. */
5063 rtx
5064 aarch64_eh_return_handler_rtx (void)
5066 rtx tmp = gen_frame_mem (Pmode,
5067 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5069 /* Mark the store volatile, so no optimization is permitted to remove it. */
5070 MEM_VOLATILE_P (tmp) = true;
5071 return tmp;
5074 /* Output code to add DELTA to the first argument, and then jump
5075 to FUNCTION. Used for C++ multiple inheritance. */
5076 static void
5077 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5078 HOST_WIDE_INT delta,
5079 HOST_WIDE_INT vcall_offset,
5080 tree function)
5082 /* The this pointer is always in x0. Note that this differs from
5083 Arm where the this pointer may be bumped to r1 if r0 is required
5084 to return a pointer to an aggregate. On AArch64 a result value
5085 pointer will be in x8. */
5086 int this_regno = R0_REGNUM;
5087 rtx this_rtx, temp0, temp1, addr, funexp;
5088 rtx_insn *insn;
5090 reload_completed = 1;
5091 emit_note (NOTE_INSN_PROLOGUE_END);
5093 this_rtx = gen_rtx_REG (Pmode, this_regno);
5094 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5095 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5097 if (vcall_offset == 0)
5098 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5099 else
5101 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5103 addr = this_rtx;
5104 if (delta != 0)
5106 if (delta >= -256 && delta < 256)
5107 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5108 plus_constant (Pmode, this_rtx, delta));
5109 else
5110 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5111 temp1, temp0, false);
5114 if (Pmode == ptr_mode)
5115 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5116 else
5117 aarch64_emit_move (temp0,
5118 gen_rtx_ZERO_EXTEND (Pmode,
5119 gen_rtx_MEM (ptr_mode, addr)));
5121 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5122 addr = plus_constant (Pmode, temp0, vcall_offset);
5123 else
5125 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5126 Pmode);
5127 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5130 if (Pmode == ptr_mode)
5131 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5132 else
5133 aarch64_emit_move (temp1,
5134 gen_rtx_SIGN_EXTEND (Pmode,
5135 gen_rtx_MEM (ptr_mode, addr)));
5137 emit_insn (gen_add2_insn (this_rtx, temp1));
5140 /* Generate a tail call to the target function. */
5141 if (!TREE_USED (function))
5143 assemble_external (function);
5144 TREE_USED (function) = 1;
5146 funexp = XEXP (DECL_RTL (function), 0);
5147 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5148 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5149 SIBLING_CALL_P (insn) = 1;
5151 insn = get_insns ();
5152 shorten_branches (insn);
5153 final_start_function (insn, file, 1);
5154 final (insn, file, 1);
5155 final_end_function ();
5157 /* Stop pretending to be a post-reload pass. */
5158 reload_completed = 0;
5161 static bool
5162 aarch64_tls_referenced_p (rtx x)
5164 if (!TARGET_HAVE_TLS)
5165 return false;
5166 subrtx_iterator::array_type array;
5167 FOR_EACH_SUBRTX (iter, array, x, ALL)
5169 const_rtx x = *iter;
5170 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5171 return true;
5172 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5173 TLS offsets, not real symbol references. */
5174 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5175 iter.skip_subrtxes ();
5177 return false;
5181 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5182 a left shift of 0 or 12 bits. */
5183 bool
5184 aarch64_uimm12_shift (HOST_WIDE_INT val)
5186 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5187 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
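/* Examples (illustrative): 0xabc and 0xabc000 both satisfy this test
   (all set bits lie within bits [0,11] or within bits [12,23]
   respectively), while 0xabc001 does not, since its set bits straddle
   both halves.  */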
5192 /* Return true if val is an immediate that can be loaded into a
5193 register by a MOVZ instruction. */
5194 static bool
5195 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5197 if (GET_MODE_SIZE (mode) > 4)
5199 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5200 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5201 return 1;
5203 else
5205 /* Ignore sign extension. */
5206 val &= (HOST_WIDE_INT) 0xffffffff;
5208 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5209 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
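/* Examples (illustrative): for DImode, 0xbeef, 0xbeef0000,
   0xbeef00000000 and 0xbeef000000000000 are each a single 16-bit chunk
   at an aligned position and so can be built with one MOVZ (LSL #0, #16,
   #32 or #48), whereas 0x0001000000000002 has bits in two chunks and
   needs more than one instruction.  */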
5212 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5213 64-bit (DImode) integer. */
5215 static unsigned HOST_WIDE_INT
5216 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5218 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5219 while (size < 64)
5221 val &= (HOST_WIDE_INT_1U << size) - 1;
5222 val |= val << size;
5223 size *= 2;
5225 return val;
5228 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5230 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5232 0x0000000100000001ull,
5233 0x0001000100010001ull,
5234 0x0101010101010101ull,
5235 0x1111111111111111ull,
5236 0x5555555555555555ull,
5240 /* Return true if val is a valid bitmask immediate. */
5242 bool
5243 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5245 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5246 int bits;
5248 /* Check for a single sequence of one bits and return quickly if so.
5249 The special cases of all ones and all zeroes return false. */
5250 val = aarch64_replicate_bitmask_imm (val_in, mode);
5251 tmp = val + (val & -val);
5253 if (tmp == (tmp & -tmp))
5254 return (val + 1) > 1;
5256 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5257 if (mode == SImode)
5258 val = (val << 32) | (val & 0xffffffff);
5260 /* Invert if the immediate doesn't start with a zero bit - this means we
5261 only need to search for sequences of one bits. */
5262 if (val & 1)
5263 val = ~val;
5265 /* Find the first set bit and set tmp to val with the first sequence of one
5266 bits removed. Return success if there is a single sequence of ones. */
5267 first_one = val & -val;
5268 tmp = val & (val + first_one);
5270 if (tmp == 0)
5271 return true;
5273 /* Find the next set bit and compute the difference in bit position. */
5274 next_one = tmp & -tmp;
5275 bits = clz_hwi (first_one) - clz_hwi (next_one);
5276 mask = val ^ tmp;
5278 /* Check the bit position difference is a power of 2, and that the first
5279 sequence of one bits fits within 'bits' bits. */
5280 if ((mask >> bits) != 0 || bits != (bits & -bits))
5281 return false;
5283 /* Check the sequence of one bits is repeated 64/bits times. */
5284 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
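/* Worked example (illustrative): 0x00ff00ff00ff00ff is accepted.  After
   inverting (the value starts with a one bit) we get 0xff00ff00ff00ff00;
   the first run of ones starts at bit 8 and the next at bit 24, so
   bits == 16, mask == 0xff00, and mask * 0x0001000100010001 reproduces
   the inverted value, i.e. the pattern repeats every 16 bits.  A single
   contiguous run such as 0x0000fff0 is accepted by the quick check at
   the top, while 0 and ~0 are rejected there.  */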
5287 /* Create a mask of ones covering the lowest to highest bits set in VAL_IN.
5288 Assumed precondition: VAL_IN is not zero. */
5290 unsigned HOST_WIDE_INT
5291 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5293 int lowest_bit_set = ctz_hwi (val_in);
5294 int highest_bit_set = floor_log2 (val_in);
5295 gcc_assert (val_in != 0);
5297 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5298 (HOST_WIDE_INT_1U << lowest_bit_set));
5301 /* Create a constant in which all bits outside the range from the lowest
5302 set bit to the highest set bit of VAL_IN are set to 1. */
5304 unsigned HOST_WIDE_INT
5305 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5307 return val_in | ~aarch64_and_split_imm1 (val_in);
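/* Worked example (illustrative): VAL_IN == 0x00fff0f0 is neither a
   bitmask immediate nor a MOV immediate, but imm1 == 0x00fffff0 (the
   full span from bit 4 to bit 23) and imm2 == 0xfffffffffffff0ff
   (everything except the "hole" at bits 8-11) both are, and
   (x & imm1) & imm2 == x & 0x00fff0f0, so the AND can be done with two
   AND-immediate instructions.  */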
5310 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5312 bool
5313 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5315 scalar_int_mode int_mode;
5316 if (!is_a <scalar_int_mode> (mode, &int_mode))
5317 return false;
5319 if (aarch64_bitmask_imm (val_in, int_mode))
5320 return false;
5322 if (aarch64_move_imm (val_in, int_mode))
5323 return false;
5325 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5327 return aarch64_bitmask_imm (imm2, int_mode);
5330 /* Return true if val is an immediate that can be loaded into a
5331 register in a single instruction. */
5332 bool
5333 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5335 scalar_int_mode int_mode;
5336 if (!is_a <scalar_int_mode> (mode, &int_mode))
5337 return false;
5339 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5340 return 1;
5341 return aarch64_bitmask_imm (val, int_mode);
5344 static bool
5345 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5347 rtx base, offset;
5349 if (GET_CODE (x) == HIGH)
5350 return true;
5352 /* There's no way to calculate VL-based values using relocations. */
5353 subrtx_iterator::array_type array;
5354 FOR_EACH_SUBRTX (iter, array, x, ALL)
5355 if (GET_CODE (*iter) == CONST_POLY_INT)
5356 return true;
5358 split_const (x, &base, &offset);
5359 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5361 if (aarch64_classify_symbol (base, INTVAL (offset))
5362 != SYMBOL_FORCE_TO_MEM)
5363 return true;
5364 else
5365 /* Avoid generating a 64-bit relocation in ILP32; leave it
5366 to aarch64_expand_mov_immediate to handle it properly. */
5367 return mode != ptr_mode;
5370 return aarch64_tls_referenced_p (x);
5373 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5374 The expansion for a table switch is quite expensive due to the number
5375 of instructions, the table lookup and the hard-to-predict indirect jump.
5376 When optimizing for speed with -O3 enabled, use the per-core tuning if
5377 set; otherwise use tables for > 16 cases as a tradeoff between size and
5378 performance. When optimizing for size, use the default setting. */
5380 static unsigned int
5381 aarch64_case_values_threshold (void)
5383 /* Use the specified limit for the number of cases before using jump
5384 tables at higher optimization levels. */
5385 if (optimize > 2
5386 && selected_cpu->tune->max_case_values != 0)
5387 return selected_cpu->tune->max_case_values;
5388 else
5389 return optimize_size ? default_case_values_threshold () : 17;
5392 /* Return true if register REGNO is a valid index register.
5393 STRICT_P is true if REG_OK_STRICT is in effect. */
5395 bool
5396 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5398 if (!HARD_REGISTER_NUM_P (regno))
5400 if (!strict_p)
5401 return true;
5403 if (!reg_renumber)
5404 return false;
5406 regno = reg_renumber[regno];
5408 return GP_REGNUM_P (regno);
5411 /* Return true if register REGNO is a valid base register for mode MODE.
5412 STRICT_P is true if REG_OK_STRICT is in effect. */
5414 bool
5415 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5417 if (!HARD_REGISTER_NUM_P (regno))
5419 if (!strict_p)
5420 return true;
5422 if (!reg_renumber)
5423 return false;
5425 regno = reg_renumber[regno];
5428 /* The fake registers will be eliminated to either the stack or
5429 hard frame pointer, both of which are usually valid base registers.
5430 Reload deals with the cases where the eliminated form isn't valid. */
5431 return (GP_REGNUM_P (regno)
5432 || regno == SP_REGNUM
5433 || regno == FRAME_POINTER_REGNUM
5434 || regno == ARG_POINTER_REGNUM);
5437 /* Return true if X is a valid base register for mode MODE.
5438 STRICT_P is true if REG_OK_STRICT is in effect. */
5440 static bool
5441 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5443 if (!strict_p
5444 && GET_CODE (x) == SUBREG
5445 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5446 x = SUBREG_REG (x);
5448 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5451 /* Return true if address offset is a valid index. If it is, fill in INFO
5452 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5454 static bool
5455 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5456 machine_mode mode, bool strict_p)
5458 enum aarch64_address_type type;
5459 rtx index;
5460 int shift;
5462 /* (reg:P) */
5463 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5464 && GET_MODE (x) == Pmode)
5466 type = ADDRESS_REG_REG;
5467 index = x;
5468 shift = 0;
5470 /* (sign_extend:DI (reg:SI)) */
5471 else if ((GET_CODE (x) == SIGN_EXTEND
5472 || GET_CODE (x) == ZERO_EXTEND)
5473 && GET_MODE (x) == DImode
5474 && GET_MODE (XEXP (x, 0)) == SImode)
5476 type = (GET_CODE (x) == SIGN_EXTEND)
5477 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5478 index = XEXP (x, 0);
5479 shift = 0;
5481 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5482 else if (GET_CODE (x) == MULT
5483 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5484 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5485 && GET_MODE (XEXP (x, 0)) == DImode
5486 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5487 && CONST_INT_P (XEXP (x, 1)))
5489 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5490 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5491 index = XEXP (XEXP (x, 0), 0);
5492 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5494 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5495 else if (GET_CODE (x) == ASHIFT
5496 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5497 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5498 && GET_MODE (XEXP (x, 0)) == DImode
5499 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5500 && CONST_INT_P (XEXP (x, 1)))
5502 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5503 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5504 index = XEXP (XEXP (x, 0), 0);
5505 shift = INTVAL (XEXP (x, 1));
5507 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5508 else if ((GET_CODE (x) == SIGN_EXTRACT
5509 || GET_CODE (x) == ZERO_EXTRACT)
5510 && GET_MODE (x) == DImode
5511 && GET_CODE (XEXP (x, 0)) == MULT
5512 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5513 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5515 type = (GET_CODE (x) == SIGN_EXTRACT)
5516 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5517 index = XEXP (XEXP (x, 0), 0);
5518 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5519 if (INTVAL (XEXP (x, 1)) != 32 + shift
5520 || INTVAL (XEXP (x, 2)) != 0)
5521 shift = -1;
5523 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5524 (const_int 0xffffffff<<shift)) */
5525 else if (GET_CODE (x) == AND
5526 && GET_MODE (x) == DImode
5527 && GET_CODE (XEXP (x, 0)) == MULT
5528 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5529 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5530 && CONST_INT_P (XEXP (x, 1)))
5532 type = ADDRESS_REG_UXTW;
5533 index = XEXP (XEXP (x, 0), 0);
5534 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5535 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5536 shift = -1;
5538 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5539 else if ((GET_CODE (x) == SIGN_EXTRACT
5540 || GET_CODE (x) == ZERO_EXTRACT)
5541 && GET_MODE (x) == DImode
5542 && GET_CODE (XEXP (x, 0)) == ASHIFT
5543 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5544 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5546 type = (GET_CODE (x) == SIGN_EXTRACT)
5547 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5548 index = XEXP (XEXP (x, 0), 0);
5549 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5550 if (INTVAL (XEXP (x, 1)) != 32 + shift
5551 || INTVAL (XEXP (x, 2)) != 0)
5552 shift = -1;
5554 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5555 (const_int 0xffffffff<<shift)) */
5556 else if (GET_CODE (x) == AND
5557 && GET_MODE (x) == DImode
5558 && GET_CODE (XEXP (x, 0)) == ASHIFT
5559 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5560 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5561 && CONST_INT_P (XEXP (x, 1)))
5563 type = ADDRESS_REG_UXTW;
5564 index = XEXP (XEXP (x, 0), 0);
5565 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5566 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5567 shift = -1;
5569 /* (mult:P (reg:P) (const_int scale)) */
5570 else if (GET_CODE (x) == MULT
5571 && GET_MODE (x) == Pmode
5572 && GET_MODE (XEXP (x, 0)) == Pmode
5573 && CONST_INT_P (XEXP (x, 1)))
5575 type = ADDRESS_REG_REG;
5576 index = XEXP (x, 0);
5577 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5579 /* (ashift:P (reg:P) (const_int shift)) */
5580 else if (GET_CODE (x) == ASHIFT
5581 && GET_MODE (x) == Pmode
5582 && GET_MODE (XEXP (x, 0)) == Pmode
5583 && CONST_INT_P (XEXP (x, 1)))
5585 type = ADDRESS_REG_REG;
5586 index = XEXP (x, 0);
5587 shift = INTVAL (XEXP (x, 1));
5589 else
5590 return false;
5592 if (!strict_p
5593 && GET_CODE (index) == SUBREG
5594 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5595 index = SUBREG_REG (index);
5597 if (aarch64_sve_data_mode_p (mode))
5599 if (type != ADDRESS_REG_REG
5600 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5601 return false;
5603 else
5605 if (shift != 0
5606 && !(IN_RANGE (shift, 1, 3)
5607 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5608 return false;
5611 if (REG_P (index)
5612 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5614 info->type = type;
5615 info->offset = index;
5616 info->shift = shift;
5617 return true;
5620 return false;
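/* Illustrative sketch (not part of the original source): for a DImode
   access, an index rtx of the form

     (ashift:DI (reg:DI x1) (const_int 3))

   is classified above as ADDRESS_REG_REG with shift 3, since the shift
   amount is in [1, 3] and 1 << 3 matches the 8-byte access size; it
   corresponds to an address operand such as [x0, x1, lsl 3].  A shift
   of 2 with the same DImode access would fail the size check.  */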
5623 /* Return true if MODE is one of the modes for which we
5624 support LDP/STP operations. */
5626 static bool
5627 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5629 return mode == SImode || mode == DImode
5630 || mode == SFmode || mode == DFmode
5631 || (aarch64_vector_mode_supported_p (mode)
5632 && (known_eq (GET_MODE_SIZE (mode), 8)
5633 || (known_eq (GET_MODE_SIZE (mode), 16)
5634 && (aarch64_tune_params.extra_tuning_flags
5635 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5638 /* Return true if REGNO is a virtual pointer register, or an eliminable
5639 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5640 include stack_pointer or hard_frame_pointer. */
5641 static bool
5642 virt_or_elim_regno_p (unsigned regno)
5644 return ((regno >= FIRST_VIRTUAL_REGISTER
5645 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5646 || regno == FRAME_POINTER_REGNUM
5647 || regno == ARG_POINTER_REGNUM);
5650 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5651 If it is, fill in INFO appropriately. STRICT_P is true if
5652 REG_OK_STRICT is in effect. */
5654 static bool
5655 aarch64_classify_address (struct aarch64_address_info *info,
5656 rtx x, machine_mode mode, bool strict_p,
5657 aarch64_addr_query_type type = ADDR_QUERY_M)
5659 enum rtx_code code = GET_CODE (x);
5660 rtx op0, op1;
5661 poly_int64 offset;
5663 HOST_WIDE_INT const_size;
5665 /* On BE, we use load/store pair for all large int mode load/stores.
5666 TI/TFmode may also use a load/store pair. */
5667 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5668 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5669 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5670 || type == ADDR_QUERY_LDP_STP_N
5671 || mode == TImode
5672 || mode == TFmode
5673 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5675 /* If we are dealing with ADDR_QUERY_LDP_STP_N, the incoming mode
5676 corresponds to the actual size of the memory being loaded/stored and
5677 the mode used to check the addressing mode is half of that. */
5678 if (type == ADDR_QUERY_LDP_STP_N
5679 && known_eq (GET_MODE_SIZE (mode), 16))
5680 mode = DFmode;
5682 bool allow_reg_index_p = (!load_store_pair_p
5683 && (known_lt (GET_MODE_SIZE (mode), 16)
5684 || vec_flags == VEC_ADVSIMD
5685 || vec_flags == VEC_SVE_DATA));
5687 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5688 [Rn, #offset, MUL VL]. */
5689 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5690 && (code != REG && code != PLUS))
5691 return false;
5693 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5694 REG addressing. */
5695 if (advsimd_struct_p
5696 && !BYTES_BIG_ENDIAN
5697 && (code != POST_INC && code != REG))
5698 return false;
5700 gcc_checking_assert (GET_MODE (x) == VOIDmode
5701 || SCALAR_INT_MODE_P (GET_MODE (x)));
5703 switch (code)
5705 case REG:
5706 case SUBREG:
5707 info->type = ADDRESS_REG_IMM;
5708 info->base = x;
5709 info->offset = const0_rtx;
5710 info->const_offset = 0;
5711 return aarch64_base_register_rtx_p (x, strict_p);
5713 case PLUS:
5714 op0 = XEXP (x, 0);
5715 op1 = XEXP (x, 1);
5717 if (! strict_p
5718 && REG_P (op0)
5719 && virt_or_elim_regno_p (REGNO (op0))
5720 && poly_int_rtx_p (op1, &offset))
5722 info->type = ADDRESS_REG_IMM;
5723 info->base = op0;
5724 info->offset = op1;
5725 info->const_offset = offset;
5727 return true;
5730 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5731 && aarch64_base_register_rtx_p (op0, strict_p)
5732 && poly_int_rtx_p (op1, &offset))
5734 info->type = ADDRESS_REG_IMM;
5735 info->base = op0;
5736 info->offset = op1;
5737 info->const_offset = offset;
5739 /* TImode and TFmode values are allowed in both pairs of X
5740 registers and individual Q registers. The available
5741 address modes are:
5742 X,X: 7-bit signed scaled offset
5743 Q: 9-bit signed offset
5744 We conservatively require an offset representable in either mode.
5745 When performing the check for pairs of X registers i.e. LDP/STP
5746 pass down DImode since that is the natural size of the LDP/STP
5747 instruction memory accesses. */
5748 if (mode == TImode || mode == TFmode)
5749 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5750 && (offset_9bit_signed_unscaled_p (mode, offset)
5751 || offset_12bit_unsigned_scaled_p (mode, offset)));
5753 /* A 7-bit offset check because OImode will emit an ldp/stp
5754 instruction (only big endian will get here).
5755 For ldp/stp instructions, the offset is scaled for the size of a
5756 single element of the pair. */
5757 if (mode == OImode)
5758 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5760 /* Three 9/12-bit offset checks because CImode will emit three
5761 ldr/str instructions (only big endian will get here). */
5762 if (mode == CImode)
5763 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5764 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5765 || offset_12bit_unsigned_scaled_p (V16QImode,
5766 offset + 32)));
5768 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5769 instructions (only big endian will get here). */
5770 if (mode == XImode)
5771 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5772 && aarch64_offset_7bit_signed_scaled_p (TImode,
5773 offset + 32));
5775 /* Make "m" use the LD1 offset range for SVE data modes, so
5776 that pre-RTL optimizers like ivopts will work to that range
5777 instead of the wider LDR/STR range. */
5778 if (vec_flags == VEC_SVE_DATA)
5779 return (type == ADDR_QUERY_M
5780 ? offset_4bit_signed_scaled_p (mode, offset)
5781 : offset_9bit_signed_scaled_p (mode, offset));
5783 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5785 poly_int64 end_offset = (offset
5786 + GET_MODE_SIZE (mode)
5787 - BYTES_PER_SVE_VECTOR);
5788 return (type == ADDR_QUERY_M
5789 ? offset_4bit_signed_scaled_p (mode, offset)
5790 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5791 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5792 end_offset)));
5795 if (vec_flags == VEC_SVE_PRED)
5796 return offset_9bit_signed_scaled_p (mode, offset);
5798 if (load_store_pair_p)
5799 return ((known_eq (GET_MODE_SIZE (mode), 4)
5800 || known_eq (GET_MODE_SIZE (mode), 8)
5801 || known_eq (GET_MODE_SIZE (mode), 16))
5802 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5803 else
5804 return (offset_9bit_signed_unscaled_p (mode, offset)
5805 || offset_12bit_unsigned_scaled_p (mode, offset));
5808 if (allow_reg_index_p)
5810 /* Look for base + (scaled/extended) index register. */
5811 if (aarch64_base_register_rtx_p (op0, strict_p)
5812 && aarch64_classify_index (info, op1, mode, strict_p))
5814 info->base = op0;
5815 return true;
5817 if (aarch64_base_register_rtx_p (op1, strict_p)
5818 && aarch64_classify_index (info, op0, mode, strict_p))
5820 info->base = op1;
5821 return true;
5825 return false;
5827 case POST_INC:
5828 case POST_DEC:
5829 case PRE_INC:
5830 case PRE_DEC:
5831 info->type = ADDRESS_REG_WB;
5832 info->base = XEXP (x, 0);
5833 info->offset = NULL_RTX;
5834 return aarch64_base_register_rtx_p (info->base, strict_p);
5836 case POST_MODIFY:
5837 case PRE_MODIFY:
5838 info->type = ADDRESS_REG_WB;
5839 info->base = XEXP (x, 0);
5840 if (GET_CODE (XEXP (x, 1)) == PLUS
5841 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5842 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5843 && aarch64_base_register_rtx_p (info->base, strict_p))
5845 info->offset = XEXP (XEXP (x, 1), 1);
5846 info->const_offset = offset;
5848 /* TImode and TFmode values are allowed in both pairs of X
5849 registers and individual Q registers. The available
5850 address modes are:
5851 X,X: 7-bit signed scaled offset
5852 Q: 9-bit signed offset
5853 We conservatively require an offset representable in either mode. */
5855 if (mode == TImode || mode == TFmode)
5856 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5857 && offset_9bit_signed_unscaled_p (mode, offset));
5859 if (load_store_pair_p)
5860 return ((known_eq (GET_MODE_SIZE (mode), 4)
5861 || known_eq (GET_MODE_SIZE (mode), 8)
5862 || known_eq (GET_MODE_SIZE (mode), 16))
5863 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5864 else
5865 return offset_9bit_signed_unscaled_p (mode, offset);
5867 return false;
5869 case CONST:
5870 case SYMBOL_REF:
5871 case LABEL_REF:
5872 /* load literal: pc-relative constant pool entry. Only supported
5873 for SI mode or larger. */
5874 info->type = ADDRESS_SYMBOLIC;
5876 if (!load_store_pair_p
5877 && GET_MODE_SIZE (mode).is_constant (&const_size)
5878 && const_size >= 4)
5880 rtx sym, addend;
5882 split_const (x, &sym, &addend);
5883 return ((GET_CODE (sym) == LABEL_REF
5884 || (GET_CODE (sym) == SYMBOL_REF
5885 && CONSTANT_POOL_ADDRESS_P (sym)
5886 && aarch64_pcrelative_literal_loads)));
5888 return false;
5890 case LO_SUM:
5891 info->type = ADDRESS_LO_SUM;
5892 info->base = XEXP (x, 0);
5893 info->offset = XEXP (x, 1);
5894 if (allow_reg_index_p
5895 && aarch64_base_register_rtx_p (info->base, strict_p))
5897 rtx sym, offs;
5898 split_const (info->offset, &sym, &offs);
5899 if (GET_CODE (sym) == SYMBOL_REF
5900 && (aarch64_classify_symbol (sym, INTVAL (offs))
5901 == SYMBOL_SMALL_ABSOLUTE))
5903 /* The symbol and offset must be aligned to the access size. */
5904 unsigned int align;
5906 if (CONSTANT_POOL_ADDRESS_P (sym))
5907 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5908 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5910 tree exp = SYMBOL_REF_DECL (sym);
5911 align = TYPE_ALIGN (TREE_TYPE (exp));
5912 align = aarch64_constant_alignment (exp, align);
5914 else if (SYMBOL_REF_DECL (sym))
5915 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5916 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5917 && SYMBOL_REF_BLOCK (sym) != NULL)
5918 align = SYMBOL_REF_BLOCK (sym)->alignment;
5919 else
5920 align = BITS_PER_UNIT;
5922 poly_int64 ref_size = GET_MODE_SIZE (mode);
5923 if (known_eq (ref_size, 0))
5924 ref_size = GET_MODE_SIZE (DImode);
5926 return (multiple_p (INTVAL (offs), ref_size)
5927 && multiple_p (align / BITS_PER_UNIT, ref_size));
5930 return false;
5932 default:
5933 return false;
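/* A minimal usage sketch (illustrative only, not part of the original
   source): classifying a simple base-plus-offset address.  For example,
   x0 + 16 with a DImode access lands in the PLUS case above and is
   accepted as ADDRESS_REG_IMM, because 16 lies within both the signed
   9-bit unscaled and the scaled 12-bit unsigned offset ranges:

     struct aarch64_address_info info;
     rtx addr = plus_constant (Pmode, gen_rtx_REG (Pmode, R0_REGNUM), 16);
     if (aarch64_classify_address (&info, addr, DImode, false))
       gcc_checking_assert (info.type == ADDRESS_REG_IMM
			    && known_eq (info.const_offset, 16));  */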
5937 /* Return true if the address X is valid for a PRFM instruction.
5938 STRICT_P is true if we should do strict checking with
5939 aarch64_classify_address. */
5941 bool
5942 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5944 struct aarch64_address_info addr;
5946 /* PRFM accepts the same addresses as DImode... */
5947 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5948 if (!res)
5949 return false;
5951 /* ... except writeback forms. */
5952 return addr.type != ADDRESS_REG_WB;
5955 bool
5956 aarch64_symbolic_address_p (rtx x)
5958 rtx offset;
5960 split_const (x, &x, &offset);
5961 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5964 /* Classify the base of symbolic expression X. */
5966 enum aarch64_symbol_type
5967 aarch64_classify_symbolic_expression (rtx x)
5969 rtx offset;
5971 split_const (x, &x, &offset);
5972 return aarch64_classify_symbol (x, INTVAL (offset));
5976 /* Return TRUE if X is a legitimate address for accessing memory in
5977 mode MODE. */
5978 static bool
5979 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5981 struct aarch64_address_info addr;
5983 return aarch64_classify_address (&addr, x, mode, strict_p);
5986 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5987 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5988 bool
5989 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5990 aarch64_addr_query_type type)
5992 struct aarch64_address_info addr;
5994 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5997 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5999 static bool
6000 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6001 poly_int64 orig_offset,
6002 machine_mode mode)
6004 HOST_WIDE_INT size;
6005 if (GET_MODE_SIZE (mode).is_constant (&size))
6007 HOST_WIDE_INT const_offset, second_offset;
6009 /* A general SVE offset is A * VQ + B. Remove the A component from
6010 coefficient 0 in order to get the constant B. */
6011 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6013 /* Split an out-of-range address displacement into a base and
6014 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6015 range otherwise to increase opportunities for sharing the base
6016 address between accesses of different sizes. Unaligned accesses use
6017 the signed 9-bit range, while TImode/TFmode use the intersection of signed
6018 scaled 7-bit and signed 9-bit offset. */
6019 if (mode == TImode || mode == TFmode)
6020 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6021 else if ((const_offset & (size - 1)) != 0)
6022 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6023 else
6024 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6026 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6027 return false;
6029 /* Split the offset into second_offset and the rest. */
6030 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6031 *offset2 = gen_int_mode (second_offset, Pmode);
6032 return true;
6034 else
6036 /* Get the mode we should use as the basis of the range. For structure
6037 modes this is the mode of one vector. */
6038 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6039 machine_mode step_mode
6040 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6042 /* Get the "mul vl" multiplier we'd like to use. */
6043 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6044 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6045 if (vec_flags & VEC_SVE_DATA)
6046 /* LDR supports a 9-bit range, but the move patterns for
6047 structure modes require all vectors to be in range of the
6048 same base. The simplest way of accommodating that while still
6049 promoting reuse of anchor points between different modes is
6050 to use an 8-bit range unconditionally. */
6051 vnum = ((vnum + 128) & 255) - 128;
6052 else
6053 /* Predicates are only handled singly, so we might as well use
6054 the full range. */
6055 vnum = ((vnum + 256) & 511) - 256;
6056 if (vnum == 0)
6057 return false;
6059 /* Convert the "mul vl" multiplier into a byte offset. */
6060 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6061 if (known_eq (second_offset, orig_offset))
6062 return false;
6064 /* Split the offset into second_offset and the rest. */
6065 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6066 *offset2 = gen_int_mode (second_offset, Pmode);
6067 return true;
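/* Worked example (illustrative only, not part of the original source):
   for a DImode access with a constant offset of 0x4008, the aligned
   branch above uses the 16KB mask, so second_offset = 0x4008 & 0x3ffc
   = 8, and the caller can rebuild the address as (base + 0x4000) + 8,
   keeping the large part in a base register that other accesses with
   nearby offsets can share.  */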
6071 /* Return the binary representation of floating point constant VALUE in INTVAL.
6072 If the value cannot be converted, return false without setting INTVAL.
6073 The conversion is done in the mode of VALUE. */
6074 bool
6075 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6078 /* We make a general exception for 0. */
6079 if (aarch64_float_const_zero_rtx_p (value))
6081 *intval = 0;
6082 return true;
6085 scalar_float_mode mode;
6086 if (GET_CODE (value) != CONST_DOUBLE
6087 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6088 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6089 /* Only support up to DF mode. */
6090 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6091 return false;
6093 unsigned HOST_WIDE_INT ival = 0;
6095 long res[2];
6096 real_to_target (res,
6097 CONST_DOUBLE_REAL_VALUE (value),
6098 REAL_MODE_FORMAT (mode));
6100 if (mode == DFmode)
6102 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6103 ival = zext_hwi (res[order], 32);
6104 ival |= (zext_hwi (res[1 - order], 32) << 32);
6106 else
6107 ival = zext_hwi (res[0], 32);
6109 *intval = ival;
6110 return true;
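/* For instance (illustrative only), the DFmode constant 1.0 is returned
   through INTVAL as its IEEE double bit pattern 0x3ff0000000000000,
   which aarch64_float_const_rtx_p below can then feed to the integer
   move-immediate cost check.  */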
6113 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6114 single MOV(+MOVK) followed by an FMOV. */
6115 bool
6116 aarch64_float_const_rtx_p (rtx x)
6118 machine_mode mode = GET_MODE (x);
6119 if (mode == VOIDmode)
6120 return false;
6122 /* Determine whether it's cheaper to write float constants as
6123 mov/movk pairs over ldr/adrp pairs. */
6124 unsigned HOST_WIDE_INT ival;
6126 if (GET_CODE (x) == CONST_DOUBLE
6127 && SCALAR_FLOAT_MODE_P (mode)
6128 && aarch64_reinterpret_float_as_int (x, &ival))
6130 scalar_int_mode imode = (mode == HFmode
6131 ? SImode
6132 : int_mode_for_mode (mode).require ());
6133 int num_instr = aarch64_internal_mov_immediate
6134 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6135 return num_instr < 3;
6138 return false;
6141 /* Return TRUE if rtx X is the immediate constant 0.0. */
6142 bool
6143 aarch64_float_const_zero_rtx_p (rtx x)
6145 if (GET_MODE (x) == VOIDmode)
6146 return false;
6148 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6149 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6150 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6153 /* Return TRUE if rtx X is an immediate constant that fits in a single
6154 MOVI immediate operation. */
6155 bool
6156 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6158 if (!TARGET_SIMD)
6159 return false;
6161 machine_mode vmode;
6162 scalar_int_mode imode;
6163 unsigned HOST_WIDE_INT ival;
6165 if (GET_CODE (x) == CONST_DOUBLE
6166 && SCALAR_FLOAT_MODE_P (mode))
6168 if (!aarch64_reinterpret_float_as_int (x, &ival))
6169 return false;
6171 /* We make a general exception for 0. */
6172 if (aarch64_float_const_zero_rtx_p (x))
6173 return true;
6175 imode = int_mode_for_mode (mode).require ();
6177 else if (GET_CODE (x) == CONST_INT
6178 && is_a <scalar_int_mode> (mode, &imode))
6179 ival = INTVAL (x);
6180 else
6181 return false;
6183 /* Use a 64-bit mode for everything except DI/DF mode, where we use
6184 a 128-bit vector mode. */
6185 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6187 vmode = aarch64_simd_container_mode (imode, width);
6188 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6190 return aarch64_simd_valid_immediate (v_op, NULL);
6194 /* Return the fixed registers used for condition codes. */
6196 static bool
6197 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6199 *p1 = CC_REGNUM;
6200 *p2 = INVALID_REGNUM;
6201 return true;
6204 /* This function is used by the call expanders of the machine description.
6205 RESULT is the register in which the result is returned. It's NULL for
6206 "call" and "sibcall".
6207 MEM is the location of the function call.
6208 SIBCALL indicates whether this function call is a normal call or a sibling
6209 call; a different pattern is generated accordingly. */
6211 void
6212 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6214 rtx call, callee, tmp;
6215 rtvec vec;
6216 machine_mode mode;
6218 gcc_assert (MEM_P (mem));
6219 callee = XEXP (mem, 0);
6220 mode = GET_MODE (callee);
6221 gcc_assert (mode == Pmode);
6223 /* Decide if we should generate indirect calls by loading the
6224 address of the callee into a register before performing
6225 the branch-and-link. */
6226 if (SYMBOL_REF_P (callee)
6227 ? (aarch64_is_long_call_p (callee)
6228 || aarch64_is_noplt_call_p (callee))
6229 : !REG_P (callee))
6230 XEXP (mem, 0) = force_reg (mode, callee);
6232 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6234 if (result != NULL_RTX)
6235 call = gen_rtx_SET (result, call);
6237 if (sibcall)
6238 tmp = ret_rtx;
6239 else
6240 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6242 vec = gen_rtvec (2, call, tmp);
6243 call = gen_rtx_PARALLEL (VOIDmode, vec);
6245 aarch64_emit_call_insn (call);
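/* Sketch of the emitted RTL (illustrative only): for a normal call that
   returns a value this builds roughly

     (parallel [(set (reg result) (call (mem addr) (const_int 0)))
		(clobber (reg LR_REGNUM))])

   whereas a sibcall replaces the clobber of the link register with
   (return).  */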
6248 /* Emit call insn with PAT and do aarch64-specific handling. */
6250 void
6251 aarch64_emit_call_insn (rtx pat)
6253 rtx insn = emit_call_insn (pat);
6255 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6256 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6257 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6260 machine_mode
6261 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6263 /* Floating-point compares return CCFPE for the comparisons that must
6264 signal on a NaN operand (LT, LE, GT, GE, LTGT), and CCFP otherwise. */
6265 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6267 switch (code)
6269 case EQ:
6270 case NE:
6271 case UNORDERED:
6272 case ORDERED:
6273 case UNLT:
6274 case UNLE:
6275 case UNGT:
6276 case UNGE:
6277 case UNEQ:
6278 return CCFPmode;
6280 case LT:
6281 case LE:
6282 case GT:
6283 case GE:
6284 case LTGT:
6285 return CCFPEmode;
6287 default:
6288 gcc_unreachable ();
6292 /* Equality comparisons of short modes against zero can be performed
6293 using the TST instruction with the appropriate bitmask. */
6294 if (y == const0_rtx && REG_P (x)
6295 && (code == EQ || code == NE)
6296 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6297 return CC_NZmode;
6299 /* Similarly, comparisons of zero_extends from shorter modes can
6300 be performed using an ANDS with an immediate mask. */
6301 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6302 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6303 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6304 && (code == EQ || code == NE))
6305 return CC_NZmode;
6307 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6308 && y == const0_rtx
6309 && (code == EQ || code == NE || code == LT || code == GE)
6310 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6311 || GET_CODE (x) == NEG
6312 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6313 && CONST_INT_P (XEXP (x, 2)))))
6314 return CC_NZmode;
6316 /* A compare with a shifted operand. Because of canonicalization,
6317 the comparison will have to be swapped when we emit the assembly
6318 code. */
6319 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6320 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6321 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6322 || GET_CODE (x) == LSHIFTRT
6323 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6324 return CC_SWPmode;
6326 /* Similarly for a negated operand, but we can only do this for
6327 equalities. */
6328 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6329 && (REG_P (y) || GET_CODE (y) == SUBREG)
6330 && (code == EQ || code == NE)
6331 && GET_CODE (x) == NEG)
6332 return CC_Zmode;
6334 /* A test for unsigned overflow. */
6335 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6336 && code == NE
6337 && GET_CODE (x) == PLUS
6338 && GET_CODE (y) == ZERO_EXTEND)
6339 return CC_Cmode;
6341 /* A test for signed overflow. */
6342 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6343 && code == NE
6344 && GET_CODE (x) == PLUS
6345 && GET_CODE (y) == SIGN_EXTEND)
6346 return CC_Vmode;
6348 /* For everything else, return CCmode. */
6349 return CCmode;
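/* Example (illustrative only): comparing (ashift:SI (reg) (const_int 2))
   against a register selects CC_SWPmode above, recording that the
   operands will be swapped when the comparison is output; the swapped
   condition mapping is then applied in aarch64_get_condition_code_1
   below.  */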
6352 static int
6353 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6356 aarch64_get_condition_code (rtx x)
6358 machine_mode mode = GET_MODE (XEXP (x, 0));
6359 enum rtx_code comp_code = GET_CODE (x);
6361 if (GET_MODE_CLASS (mode) != MODE_CC)
6362 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6363 return aarch64_get_condition_code_1 (mode, comp_code);
6366 static int
6367 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6369 switch (mode)
6371 case E_CCFPmode:
6372 case E_CCFPEmode:
6373 switch (comp_code)
6375 case GE: return AARCH64_GE;
6376 case GT: return AARCH64_GT;
6377 case LE: return AARCH64_LS;
6378 case LT: return AARCH64_MI;
6379 case NE: return AARCH64_NE;
6380 case EQ: return AARCH64_EQ;
6381 case ORDERED: return AARCH64_VC;
6382 case UNORDERED: return AARCH64_VS;
6383 case UNLT: return AARCH64_LT;
6384 case UNLE: return AARCH64_LE;
6385 case UNGT: return AARCH64_HI;
6386 case UNGE: return AARCH64_PL;
6387 default: return -1;
6389 break;
6391 case E_CCmode:
6392 switch (comp_code)
6394 case NE: return AARCH64_NE;
6395 case EQ: return AARCH64_EQ;
6396 case GE: return AARCH64_GE;
6397 case GT: return AARCH64_GT;
6398 case LE: return AARCH64_LE;
6399 case LT: return AARCH64_LT;
6400 case GEU: return AARCH64_CS;
6401 case GTU: return AARCH64_HI;
6402 case LEU: return AARCH64_LS;
6403 case LTU: return AARCH64_CC;
6404 default: return -1;
6406 break;
6408 case E_CC_SWPmode:
6409 switch (comp_code)
6411 case NE: return AARCH64_NE;
6412 case EQ: return AARCH64_EQ;
6413 case GE: return AARCH64_LE;
6414 case GT: return AARCH64_LT;
6415 case LE: return AARCH64_GE;
6416 case LT: return AARCH64_GT;
6417 case GEU: return AARCH64_LS;
6418 case GTU: return AARCH64_CC;
6419 case LEU: return AARCH64_CS;
6420 case LTU: return AARCH64_HI;
6421 default: return -1;
6423 break;
6425 case E_CC_NZmode:
6426 switch (comp_code)
6428 case NE: return AARCH64_NE;
6429 case EQ: return AARCH64_EQ;
6430 case GE: return AARCH64_PL;
6431 case LT: return AARCH64_MI;
6432 default: return -1;
6434 break;
6436 case E_CC_Zmode:
6437 switch (comp_code)
6439 case NE: return AARCH64_NE;
6440 case EQ: return AARCH64_EQ;
6441 default: return -1;
6443 break;
6445 case E_CC_Cmode:
6446 switch (comp_code)
6448 case NE: return AARCH64_CS;
6449 case EQ: return AARCH64_CC;
6450 default: return -1;
6452 break;
6454 case E_CC_Vmode:
6455 switch (comp_code)
6457 case NE: return AARCH64_VS;
6458 case EQ: return AARCH64_VC;
6459 default: return -1;
6461 break;
6463 default:
6464 return -1;
6467 return -1;
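/* For instance (illustrative only), in CC_SWPmode a GE comparison maps
   to the AArch64 "le" condition above, because the comparison operands
   were swapped when the flags were set.  */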
6470 bool
6471 aarch64_const_vec_all_same_in_range_p (rtx x,
6472 HOST_WIDE_INT minval,
6473 HOST_WIDE_INT maxval)
6475 rtx elt;
6476 return (const_vec_duplicate_p (x, &elt)
6477 && CONST_INT_P (elt)
6478 && IN_RANGE (INTVAL (elt), minval, maxval));
6481 bool
6482 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6484 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6487 /* Return true if VEC is a constant in which every element is in the range
6488 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6490 static bool
6491 aarch64_const_vec_all_in_range_p (rtx vec,
6492 HOST_WIDE_INT minval,
6493 HOST_WIDE_INT maxval)
6495 if (GET_CODE (vec) != CONST_VECTOR
6496 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6497 return false;
6499 int nunits;
6500 if (!CONST_VECTOR_STEPPED_P (vec))
6501 nunits = const_vector_encoded_nelts (vec);
6502 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6503 return false;
6505 for (int i = 0; i < nunits; i++)
6507 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6508 if (!CONST_INT_P (vec_elem)
6509 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6510 return false;
6512 return true;
6515 /* N Z C V. */
6516 #define AARCH64_CC_V 1
6517 #define AARCH64_CC_C (1 << 1)
6518 #define AARCH64_CC_Z (1 << 2)
6519 #define AARCH64_CC_N (1 << 3)
6521 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6522 static const int aarch64_nzcv_codes[] =
6524 0, /* EQ, Z == 1. */
6525 AARCH64_CC_Z, /* NE, Z == 0. */
6526 0, /* CS, C == 1. */
6527 AARCH64_CC_C, /* CC, C == 0. */
6528 0, /* MI, N == 1. */
6529 AARCH64_CC_N, /* PL, N == 0. */
6530 0, /* VS, V == 1. */
6531 AARCH64_CC_V, /* VC, V == 0. */
6532 0, /* HI, C == 1 && Z == 0. */
6533 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6534 AARCH64_CC_V, /* GE, N == V. */
6535 0, /* LT, N != V. */
6536 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6537 0, /* LE, !(Z == 0 && N == V). */
6538 0, /* AL, Any. */
6539 0 /* NV, Any. */
6542 /* Print floating-point vector immediate operand X to F, negating it
6543 first if NEGATE is true. Return true on success, false if it isn't
6544 a constant we can handle. */
6546 static bool
6547 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6549 rtx elt;
6551 if (!const_vec_duplicate_p (x, &elt))
6552 return false;
6554 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6555 if (negate)
6556 r = real_value_negate (&r);
6558 /* We only handle the SVE single-bit immediates here. */
6559 if (real_equal (&r, &dconst0))
6560 asm_fprintf (f, "0.0");
6561 else if (real_equal (&r, &dconst1))
6562 asm_fprintf (f, "1.0");
6563 else if (real_equal (&r, &dconsthalf))
6564 asm_fprintf (f, "0.5");
6565 else
6566 return false;
6568 return true;
6571 /* Return the equivalent letter for size. */
6572 static char
6573 sizetochar (int size)
6575 switch (size)
6577 case 64: return 'd';
6578 case 32: return 's';
6579 case 16: return 'h';
6580 case 8 : return 'b';
6581 default: gcc_unreachable ();
6585 /* Print operand X to file F in a target specific manner according to CODE.
6586 The acceptable formatting commands given by CODE are:
6587 'c': An integer or symbol address without a preceding #
6588 sign.
6589 'C': Take the duplicated element in a vector constant
6590 and print it in hex.
6591 'D': Take the duplicated element in a vector constant
6592 and print it as an unsigned integer, in decimal.
6593 'e': Print the sign/zero-extend size as a character 8->b,
6594 16->h, 32->w.
6595 'p': Prints N such that 2^N == X (X must be a power of 2 and
6596 a const_int).
6597 'P': Print the number of non-zero bits in X (a const_int).
6598 'H': Print the higher numbered register of a pair (TImode)
6599 of regs.
6600 'm': Print a condition (eq, ne, etc).
6601 'M': Same as 'm', but invert condition.
6602 'N': Take the duplicated element in a vector constant
6603 and print the negative of it in decimal.
6604 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6605 'S/T/U/V': Print a FP/SIMD register name for a register list.
6606 The register printed is the FP/SIMD register name
6607 of X + 0/1/2/3 for S/T/U/V.
6608 'R': Print a scalar FP/SIMD register name + 1.
6609 'X': Print bottom 16 bits of integer constant in hex.
6610 'w/x': Print a general register name or the zero register
6611 (32-bit or 64-bit).
6612 '0': Print a normal operand; if it's a general register,
6613 we assume DImode.
6614 'k': Print NZCV for conditional compare instructions.
6615 'A': Output address constant representing the first
6616 argument of X, specifying a relocation offset
6617 if appropriate.
6618 'L': Output constant address specified by X
6619 with a relocation offset if appropriate.
6620 'G': Prints address of X, specifying a PC relative
6621 relocation mode if appropriate.
6622 'y': Output address of LDP or STP - this is used for
6623 some LDP/STPs which don't use a PARALLEL in their
6624 pattern (so the mode needs to be adjusted).
6625 'z': Output address of a typical LDP or STP. */
6627 static void
6628 aarch64_print_operand (FILE *f, rtx x, int code)
6630 rtx elt;
6631 switch (code)
6633 case 'c':
6634 switch (GET_CODE (x))
6636 case CONST_INT:
6637 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6638 break;
6640 case SYMBOL_REF:
6641 output_addr_const (f, x);
6642 break;
6644 case CONST:
6645 if (GET_CODE (XEXP (x, 0)) == PLUS
6646 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6648 output_addr_const (f, x);
6649 break;
6651 /* Fall through. */
6653 default:
6654 output_operand_lossage ("unsupported operand for code '%c'", code);
6656 break;
6658 case 'e':
6660 int n;
6662 if (!CONST_INT_P (x)
6663 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6665 output_operand_lossage ("invalid operand for '%%%c'", code);
6666 return;
6669 switch (n)
6671 case 3:
6672 fputc ('b', f);
6673 break;
6674 case 4:
6675 fputc ('h', f);
6676 break;
6677 case 5:
6678 fputc ('w', f);
6679 break;
6680 default:
6681 output_operand_lossage ("invalid operand for '%%%c'", code);
6682 return;
6685 break;
6687 case 'p':
6689 int n;
6691 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6693 output_operand_lossage ("invalid operand for '%%%c'", code);
6694 return;
6697 asm_fprintf (f, "%d", n);
6699 break;
6701 case 'P':
6702 if (!CONST_INT_P (x))
6704 output_operand_lossage ("invalid operand for '%%%c'", code);
6705 return;
6708 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6709 break;
6711 case 'H':
6712 if (x == const0_rtx)
6714 asm_fprintf (f, "xzr");
6715 break;
6718 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6720 output_operand_lossage ("invalid operand for '%%%c'", code);
6721 return;
6724 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6725 break;
6727 case 'M':
6728 case 'm':
6730 int cond_code;
6731 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6732 if (x == const_true_rtx)
6734 if (code == 'M')
6735 fputs ("nv", f);
6736 return;
6739 if (!COMPARISON_P (x))
6741 output_operand_lossage ("invalid operand for '%%%c'", code);
6742 return;
6745 cond_code = aarch64_get_condition_code (x);
6746 gcc_assert (cond_code >= 0);
6747 if (code == 'M')
6748 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6749 fputs (aarch64_condition_codes[cond_code], f);
6751 break;
6753 case 'N':
6754 if (!const_vec_duplicate_p (x, &elt))
6756 output_operand_lossage ("invalid vector constant");
6757 return;
6760 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6761 asm_fprintf (f, "%wd", -INTVAL (elt));
6762 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6763 && aarch64_print_vector_float_operand (f, x, true))
6765 else
6767 output_operand_lossage ("invalid vector constant");
6768 return;
6770 break;
6772 case 'b':
6773 case 'h':
6774 case 's':
6775 case 'd':
6776 case 'q':
6777 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6779 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6780 return;
6782 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6783 break;
6785 case 'S':
6786 case 'T':
6787 case 'U':
6788 case 'V':
6789 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6791 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6792 return;
6794 asm_fprintf (f, "%c%d",
6795 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6796 REGNO (x) - V0_REGNUM + (code - 'S'));
6797 break;
6799 case 'R':
6800 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6802 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6803 return;
6805 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6806 break;
6808 case 'X':
6809 if (!CONST_INT_P (x))
6811 output_operand_lossage ("invalid operand for '%%%c'", code);
6812 return;
6814 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6815 break;
6817 case 'C':
6819 /* Print a replicated constant in hex. */
6820 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6822 output_operand_lossage ("invalid operand for '%%%c'", code);
6823 return;
6825 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6826 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6828 break;
6830 case 'D':
6832 /* Print a replicated constant in decimal, treating it as
6833 unsigned. */
6834 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6836 output_operand_lossage ("invalid operand for '%%%c'", code);
6837 return;
6839 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6840 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6842 break;
6844 case 'w':
6845 case 'x':
6846 if (x == const0_rtx
6847 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6849 asm_fprintf (f, "%czr", code);
6850 break;
6853 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6855 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6856 break;
6859 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6861 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6862 break;
6865 /* Fall through */
6867 case 0:
6868 if (x == NULL)
6870 output_operand_lossage ("missing operand");
6871 return;
6874 switch (GET_CODE (x))
6876 case REG:
6877 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6879 if (REG_NREGS (x) == 1)
6880 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6881 else
6883 char suffix
6884 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6885 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6886 REGNO (x) - V0_REGNUM, suffix,
6887 END_REGNO (x) - V0_REGNUM - 1, suffix);
6890 else
6891 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6892 break;
6894 case MEM:
6895 output_address (GET_MODE (x), XEXP (x, 0));
6896 break;
6898 case LABEL_REF:
6899 case SYMBOL_REF:
6900 output_addr_const (asm_out_file, x);
6901 break;
6903 case CONST_INT:
6904 asm_fprintf (f, "%wd", INTVAL (x));
6905 break;
6907 case CONST:
6908 if (!VECTOR_MODE_P (GET_MODE (x)))
6910 output_addr_const (asm_out_file, x);
6911 break;
6913 /* fall through */
6915 case CONST_VECTOR:
6916 if (!const_vec_duplicate_p (x, &elt))
6918 output_operand_lossage ("invalid vector constant");
6919 return;
6922 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6923 asm_fprintf (f, "%wd", INTVAL (elt));
6924 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6925 && aarch64_print_vector_float_operand (f, x, false))
6927 else
6929 output_operand_lossage ("invalid vector constant");
6930 return;
6932 break;
6934 case CONST_DOUBLE:
6935 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6936 be getting CONST_DOUBLEs holding integers. */
6937 gcc_assert (GET_MODE (x) != VOIDmode);
6938 if (aarch64_float_const_zero_rtx_p (x))
6940 fputc ('0', f);
6941 break;
6943 else if (aarch64_float_const_representable_p (x))
6945 #define buf_size 20
6946 char float_buf[buf_size] = {'\0'};
6947 real_to_decimal_for_mode (float_buf,
6948 CONST_DOUBLE_REAL_VALUE (x),
6949 buf_size, buf_size,
6950 1, GET_MODE (x));
6951 asm_fprintf (asm_out_file, "%s", float_buf);
6952 break;
6953 #undef buf_size
6955 output_operand_lossage ("invalid constant");
6956 return;
6957 default:
6958 output_operand_lossage ("invalid operand");
6959 return;
6961 break;
6963 case 'A':
6964 if (GET_CODE (x) == HIGH)
6965 x = XEXP (x, 0);
6967 switch (aarch64_classify_symbolic_expression (x))
6969 case SYMBOL_SMALL_GOT_4G:
6970 asm_fprintf (asm_out_file, ":got:");
6971 break;
6973 case SYMBOL_SMALL_TLSGD:
6974 asm_fprintf (asm_out_file, ":tlsgd:");
6975 break;
6977 case SYMBOL_SMALL_TLSDESC:
6978 asm_fprintf (asm_out_file, ":tlsdesc:");
6979 break;
6981 case SYMBOL_SMALL_TLSIE:
6982 asm_fprintf (asm_out_file, ":gottprel:");
6983 break;
6985 case SYMBOL_TLSLE24:
6986 asm_fprintf (asm_out_file, ":tprel:");
6987 break;
6989 case SYMBOL_TINY_GOT:
6990 gcc_unreachable ();
6991 break;
6993 default:
6994 break;
6996 output_addr_const (asm_out_file, x);
6997 break;
6999 case 'L':
7000 switch (aarch64_classify_symbolic_expression (x))
7002 case SYMBOL_SMALL_GOT_4G:
7003 asm_fprintf (asm_out_file, ":lo12:");
7004 break;
7006 case SYMBOL_SMALL_TLSGD:
7007 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7008 break;
7010 case SYMBOL_SMALL_TLSDESC:
7011 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7012 break;
7014 case SYMBOL_SMALL_TLSIE:
7015 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7016 break;
7018 case SYMBOL_TLSLE12:
7019 asm_fprintf (asm_out_file, ":tprel_lo12:");
7020 break;
7022 case SYMBOL_TLSLE24:
7023 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7024 break;
7026 case SYMBOL_TINY_GOT:
7027 asm_fprintf (asm_out_file, ":got:");
7028 break;
7030 case SYMBOL_TINY_TLSIE:
7031 asm_fprintf (asm_out_file, ":gottprel:");
7032 break;
7034 default:
7035 break;
7037 output_addr_const (asm_out_file, x);
7038 break;
7040 case 'G':
7041 switch (aarch64_classify_symbolic_expression (x))
7043 case SYMBOL_TLSLE24:
7044 asm_fprintf (asm_out_file, ":tprel_hi12:");
7045 break;
7046 default:
7047 break;
7049 output_addr_const (asm_out_file, x);
7050 break;
7052 case 'k':
7054 HOST_WIDE_INT cond_code;
7056 if (!CONST_INT_P (x))
7058 output_operand_lossage ("invalid operand for '%%%c'", code);
7059 return;
7062 cond_code = INTVAL (x);
7063 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7064 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7066 break;
7068 case 'y':
7069 case 'z':
7071 machine_mode mode = GET_MODE (x);
7073 if (GET_CODE (x) != MEM
7074 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7076 output_operand_lossage ("invalid operand for '%%%c'", code);
7077 return;
7080 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7081 code == 'y'
7082 ? ADDR_QUERY_LDP_STP_N
7083 : ADDR_QUERY_LDP_STP))
7084 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7086 break;
7088 default:
7089 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7090 return;
7094 /* Print address 'x' of a memory access with mode 'mode'.
7095 'type' is the aarch64_addr_query_type context required by
7096 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access. */
7097 static bool
7098 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7099 aarch64_addr_query_type type)
7101 struct aarch64_address_info addr;
7102 unsigned int size;
7104 /* Check all addresses are Pmode - including ILP32. */
7105 if (GET_MODE (x) != Pmode)
7106 output_operand_lossage ("invalid address mode");
7108 if (aarch64_classify_address (&addr, x, mode, true, type))
7109 switch (addr.type)
7111 case ADDRESS_REG_IMM:
7112 if (known_eq (addr.const_offset, 0))
7113 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7114 else if (aarch64_sve_data_mode_p (mode))
7116 HOST_WIDE_INT vnum
7117 = exact_div (addr.const_offset,
7118 BYTES_PER_SVE_VECTOR).to_constant ();
7119 asm_fprintf (f, "[%s, #%wd, mul vl]",
7120 reg_names[REGNO (addr.base)], vnum);
7122 else if (aarch64_sve_pred_mode_p (mode))
7124 HOST_WIDE_INT vnum
7125 = exact_div (addr.const_offset,
7126 BYTES_PER_SVE_PRED).to_constant ();
7127 asm_fprintf (f, "[%s, #%wd, mul vl]",
7128 reg_names[REGNO (addr.base)], vnum);
7130 else
7131 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7132 INTVAL (addr.offset));
7133 return true;
7135 case ADDRESS_REG_REG:
7136 if (addr.shift == 0)
7137 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7138 reg_names [REGNO (addr.offset)]);
7139 else
7140 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7141 reg_names [REGNO (addr.offset)], addr.shift);
7142 return true;
7144 case ADDRESS_REG_UXTW:
7145 if (addr.shift == 0)
7146 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7147 REGNO (addr.offset) - R0_REGNUM);
7148 else
7149 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7150 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7151 return true;
7153 case ADDRESS_REG_SXTW:
7154 if (addr.shift == 0)
7155 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7156 REGNO (addr.offset) - R0_REGNUM);
7157 else
7158 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7159 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7160 return true;
7162 case ADDRESS_REG_WB:
7163 /* Writeback is only supported for fixed-width modes. */
7164 size = GET_MODE_SIZE (mode).to_constant ();
7165 switch (GET_CODE (x))
7167 case PRE_INC:
7168 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7169 return true;
7170 case POST_INC:
7171 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7172 return true;
7173 case PRE_DEC:
7174 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7175 return true;
7176 case POST_DEC:
7177 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7178 return true;
7179 case PRE_MODIFY:
7180 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7181 INTVAL (addr.offset));
7182 return true;
7183 case POST_MODIFY:
7184 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7185 INTVAL (addr.offset));
7186 return true;
7187 default:
7188 break;
7190 break;
7192 case ADDRESS_LO_SUM:
7193 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7194 output_addr_const (f, addr.offset);
7195 asm_fprintf (f, "]");
7196 return true;
7198 case ADDRESS_SYMBOLIC:
7199 output_addr_const (f, x);
7200 return true;
7203 return false;
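/* Printing sketch (illustrative only): an SVE data-mode address whose
   constant offset is twice BYTES_PER_SVE_VECTOR comes out above as
   "[x0, #2, mul vl]", while a DImode pre-increment writeback address
   prints as "[x0, 8]!".  */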
7206 /* Print address 'x' of a memory access with mode 'mode'. */
7207 static void
7208 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7210 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7211 output_addr_const (f, x);
7214 bool
7215 aarch64_label_mentioned_p (rtx x)
7217 const char *fmt;
7218 int i;
7220 if (GET_CODE (x) == LABEL_REF)
7221 return true;
7223 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7224 referencing instruction, but they are constant offsets, not
7225 symbols. */
7226 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7227 return false;
7229 fmt = GET_RTX_FORMAT (GET_CODE (x));
7230 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7232 if (fmt[i] == 'E')
7234 int j;
7236 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7237 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7238 return 1;
7240 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7241 return 1;
7244 return 0;
7247 /* Implement REGNO_REG_CLASS. */
7249 enum reg_class
7250 aarch64_regno_regclass (unsigned regno)
7252 if (GP_REGNUM_P (regno))
7253 return GENERAL_REGS;
7255 if (regno == SP_REGNUM)
7256 return STACK_REG;
7258 if (regno == FRAME_POINTER_REGNUM
7259 || regno == ARG_POINTER_REGNUM)
7260 return POINTER_REGS;
7262 if (FP_REGNUM_P (regno))
7263 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7265 if (PR_REGNUM_P (regno))
7266 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7268 return NO_REGS;
7271 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7272 If OFFSET is out of range, return an offset of an anchor point
7273 that is in range. Return 0 otherwise. */
7275 static HOST_WIDE_INT
7276 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7277 machine_mode mode)
7279 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7280 if (size > 16)
7281 return (offset + 0x400) & ~0x7f0;
7283 /* For offsets that aren't a multiple of the access size, the limit is
7284 -256...255. */
7285 if (offset & (size - 1))
7287 /* BLKmode typically uses LDP of X-registers. */
7288 if (mode == BLKmode)
7289 return (offset + 512) & ~0x3ff;
7290 return (offset + 0x100) & ~0x1ff;
7293 /* Small negative offsets are supported. */
7294 if (IN_RANGE (offset, -256, 0))
7295 return 0;
7297 if (mode == TImode || mode == TFmode)
7298 return (offset + 0x100) & ~0x1ff;
7300 /* Use a 12-bit offset scaled by the access size. */
7301 return offset & (~0xfff * size);
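/* Worked example (illustrative only, not part of the original source):
   a DImode access (size 8) at offset 70000 falls through to the final
   case; ~0xfff * 8 masks the offset down to the 32768-byte boundary
   65536, leaving a residual of 4464 that a 12-bit unsigned offset
   scaled by 8 can still reach.  */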
7304 static rtx
7305 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7307 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7308 where mask is selected by alignment and size of the offset.
7309 We try to pick as large a range for the offset as possible to
7310 maximize the chance of a CSE. However, for aligned addresses
7311 we limit the range to 4k so that structures with different sized
7312 elements are likely to use the same base. We need to be careful
7313 not to split a CONST for some forms of address expression, otherwise
7314 it will generate sub-optimal code. */
7316 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7318 rtx base = XEXP (x, 0);
7319 rtx offset_rtx = XEXP (x, 1);
7320 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7322 if (GET_CODE (base) == PLUS)
7324 rtx op0 = XEXP (base, 0);
7325 rtx op1 = XEXP (base, 1);
7327 /* Force any scaling into a temp for CSE. */
7328 op0 = force_reg (Pmode, op0);
7329 op1 = force_reg (Pmode, op1);
7331 /* Let the pointer register be in op0. */
7332 if (REG_POINTER (op1))
7333 std::swap (op0, op1);
7335 /* If the pointer is virtual or frame related, then we know that
7336 virtual register instantiation or register elimination is going
7337 to apply a second constant. We want the two constants folded
7338 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7339 if (virt_or_elim_regno_p (REGNO (op0)))
7341 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7342 NULL_RTX, true, OPTAB_DIRECT);
7343 return gen_rtx_PLUS (Pmode, base, op1);
7346 /* Otherwise, in order to encourage CSE (and thereby loop strength
7347 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
7348 base = expand_binop (Pmode, add_optab, op0, op1,
7349 NULL_RTX, true, OPTAB_DIRECT);
7350 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7353 HOST_WIDE_INT size;
7354 if (GET_MODE_SIZE (mode).is_constant (&size))
7356 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7357 mode);
7358 if (base_offset != 0)
7360 base = plus_constant (Pmode, base, base_offset);
7361 base = force_operand (base, NULL_RTX);
7362 return plus_constant (Pmode, base, offset - base_offset);
7367 return x;
7370 static reg_class_t
7371 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7372 reg_class_t rclass,
7373 machine_mode mode,
7374 secondary_reload_info *sri)
7376 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7377 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7378 comment at the head of aarch64-sve.md for more details about the
7379 big-endian handling. */
7380 if (BYTES_BIG_ENDIAN
7381 && reg_class_subset_p (rclass, FP_REGS)
7382 && !((REG_P (x) && HARD_REGISTER_P (x))
7383 || aarch64_simd_valid_immediate (x, NULL))
7384 && aarch64_sve_data_mode_p (mode))
7386 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7387 return NO_REGS;
7390 /* If we have to disable direct literal pool loads and stores because the
7391 function is too big, then we need a scratch register. */
7392 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7393 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7394 || targetm.vector_mode_supported_p (GET_MODE (x)))
7395 && !aarch64_pcrelative_literal_loads)
7397 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
7398 return NO_REGS;
7401 /* Without the TARGET_SIMD instructions we cannot move a Q register
7402 to a Q register directly. We need a scratch. */
7403 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7404 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7405 && reg_class_subset_p (rclass, FP_REGS))
7407 sri->icode = code_for_aarch64_reload_mov (mode);
7408 return NO_REGS;
7411 /* A TFmode or TImode memory access should be handled via an FP register
7412 because AArch64 has richer addressing modes for LDR/STR instructions
7413 than for LDP/STP instructions. */
7414 if (TARGET_FLOAT && rclass == GENERAL_REGS
7415 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7416 return FP_REGS;
7418 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7419 return GENERAL_REGS;
7421 return NO_REGS;
7424 static bool
7425 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7427 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7429 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7430 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7431 if (frame_pointer_needed)
7432 return to == HARD_FRAME_POINTER_REGNUM;
7433 return true;
7436 poly_int64
7437 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7439 aarch64_layout_frame ();
7441 if (to == HARD_FRAME_POINTER_REGNUM)
7443 if (from == ARG_POINTER_REGNUM)
7444 return cfun->machine->frame.hard_fp_offset;
7446 if (from == FRAME_POINTER_REGNUM)
7447 return cfun->machine->frame.hard_fp_offset
7448 - cfun->machine->frame.locals_offset;
7451 if (to == STACK_POINTER_REGNUM)
7453 if (from == FRAME_POINTER_REGNUM)
7454 return cfun->machine->frame.frame_size
7455 - cfun->machine->frame.locals_offset;
7458 return cfun->machine->frame.frame_size;
7461 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7462 previous frame. */
7465 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7467 if (count != 0)
7468 return const0_rtx;
7469 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7473 static void
7474 aarch64_asm_trampoline_template (FILE *f)
7476 if (TARGET_ILP32)
7478 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7479 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7481 else
7483 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7484 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7486 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7487 assemble_aligned_integer (4, const0_rtx);
7488 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7489 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7492 static void
7493 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7495 rtx fnaddr, mem, a_tramp;
7496 const int tramp_code_sz = 16;
7498 /* We don't need to copy the trailing D-words; we fill those in below. */
7499 emit_block_move (m_tramp, assemble_trampoline_template (),
7500 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7501 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7502 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7503 if (GET_MODE (fnaddr) != ptr_mode)
7504 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7505 emit_move_insn (mem, fnaddr);
7507 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7508 emit_move_insn (mem, chain_value);
7510 /* XXX We should really define a "clear_cache" pattern and use
7511 gen_clear_cache(). */
7512 a_tramp = XEXP (m_tramp, 0);
7513 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7514 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7515 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7516 ptr_mode);
7519 static unsigned char
7520 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7522 /* ??? Logically we should only need to provide a value when
7523 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7524 can hold MODE, but at the moment we need to handle all modes.
7525 Just ignore any runtime parts for registers that can't store them. */
7526 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7527 unsigned int nregs;
7528 switch (regclass)
7530 case TAILCALL_ADDR_REGS:
7531 case POINTER_REGS:
7532 case GENERAL_REGS:
7533 case ALL_REGS:
7534 case POINTER_AND_FP_REGS:
7535 case FP_REGS:
7536 case FP_LO_REGS:
7537 if (aarch64_sve_data_mode_p (mode)
7538 && constant_multiple_p (GET_MODE_SIZE (mode),
7539 BYTES_PER_SVE_VECTOR, &nregs))
7540 return nregs;
7541 return (aarch64_vector_data_mode_p (mode)
7542 ? CEIL (lowest_size, UNITS_PER_VREG)
7543 : CEIL (lowest_size, UNITS_PER_WORD));
7544 case STACK_REG:
7545 case PR_REGS:
7546 case PR_LO_REGS:
7547 case PR_HI_REGS:
7548 return 1;
7550 case NO_REGS:
7551 return 0;
7553 default:
7554 break;
7556 gcc_unreachable ();
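/* Worked examples, for illustration only: with UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16, DImode in GENERAL_REGS needs CEIL (8, 8) == 1
   register, TImode needs CEIL (16, 8) == 2, and an Advanced SIMD mode
   such as V4SImode needs CEIL (16, 16) == 1 vector register.  An SVE
   data mode whose size is a compile-time multiple of
   BYTES_PER_SVE_VECTOR returns that multiple, while the predicate and
   stack-pointer classes always report a single register.  */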
7559 static reg_class_t
7560 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7562 if (regclass == POINTER_REGS)
7563 return GENERAL_REGS;
7565 if (regclass == STACK_REG)
7567 if (REG_P (x)
7568 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7569 return regclass;
7571 return NO_REGS;
7574 /* Register elimination can result in a request for
7575 SP+constant->FP_REGS. We cannot support such operations, which
7576 use SP as source and an FP_REG as destination, so reject them
7577 outright. */
7578 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7580 rtx lhs = XEXP (x, 0);
7582 /* Look through a possible SUBREG introduced by ILP32. */
7583 if (GET_CODE (lhs) == SUBREG)
7584 lhs = SUBREG_REG (lhs);
7586 gcc_assert (REG_P (lhs));
7587 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7588 POINTER_REGS));
7589 return NO_REGS;
7592 return regclass;
7595 void
7596 aarch64_asm_output_labelref (FILE* f, const char *name)
7598 asm_fprintf (f, "%U%s", name);
7601 static void
7602 aarch64_elf_asm_constructor (rtx symbol, int priority)
7604 if (priority == DEFAULT_INIT_PRIORITY)
7605 default_ctor_section_asm_out_constructor (symbol, priority);
7606 else
7608 section *s;
7609 /* While priority is known to be in the range [0, 65535], so that 18 bytes
7610 would be enough, the compiler might not know that. To avoid a
7611 -Wformat-truncation false positive, use a larger size. */
7612 char buf[23];
7613 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7614 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7615 switch_to_section (s);
7616 assemble_align (POINTER_SIZE);
7617 assemble_aligned_integer (POINTER_BYTES, symbol);
7621 static void
7622 aarch64_elf_asm_destructor (rtx symbol, int priority)
7624 if (priority == DEFAULT_INIT_PRIORITY)
7625 default_dtor_section_asm_out_destructor (symbol, priority);
7626 else
7628 section *s;
7629 /* While priority is known to be in the range [0, 65535], so that 18 bytes
7630 would be enough, the compiler might not know that. To avoid a
7631 -Wformat-truncation false positive, use a larger size. */
7632 char buf[23];
7633 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7634 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7635 switch_to_section (s);
7636 assemble_align (POINTER_SIZE);
7637 assemble_aligned_integer (POINTER_BYTES, symbol);
7641 const char*
7642 aarch64_output_casesi (rtx *operands)
7644 char buf[100];
7645 char label[100];
7646 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7647 int index;
7648 static const char *const patterns[4][2] =
7651 "ldrb\t%w3, [%0,%w1,uxtw]",
7652 "add\t%3, %4, %w3, sxtb #2"
7655 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7656 "add\t%3, %4, %w3, sxth #2"
7659 "ldr\t%w3, [%0,%w1,uxtw #2]",
7660 "add\t%3, %4, %w3, sxtw #2"
7662 /* We assume that DImode is only generated when not optimizing and
7663 that we don't really need 64-bit address offsets. That would
7664 imply an object file with 8GB of code in a single function! */
7666 "ldr\t%w3, [%0,%w1,uxtw #2]",
7667 "add\t%3, %4, %w3, sxtw #2"
7671 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7673 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7674 index = exact_log2 (GET_MODE_SIZE (mode));
7676 gcc_assert (index >= 0 && index <= 3);
7678 /* Need to implement table size reduction by changing the code below. */
7679 output_asm_insn (patterns[index][0], operands);
7680 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7681 snprintf (buf, sizeof (buf),
7682 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7683 output_asm_insn (buf, operands);
7684 output_asm_insn (patterns[index][1], operands);
7685 output_asm_insn ("br\t%3", operands);
7686 assemble_label (asm_out_file, label);
7687 return "";
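/* A rough example of the output for a 4-byte table entry (patterns[2]),
   with operand 0 in x0, operand 1 in w1 and scratch operands 3 and 4 in
   x3 and x4, all chosen purely for illustration:

       ldr  w3, [x0, w1, uxtw #2]
       adr  x4, .Lrtx<N>
       add  x3, x4, w3, sxtw #2
       br   x3
     .Lrtx<N>:

   where each entry of the dispatch table that follows the label holds
   the distance from the label to the case target, pre-scaled to match
   the #2 shift applied by the ADD.  */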
7691 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7692 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7693 operator. */
7696 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7698 if (shift >= 0 && shift <= 3)
7700 int size;
7701 for (size = 8; size <= 32; size *= 2)
7703 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7704 if (mask == bits << shift)
7705 return size;
7708 return 0;
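/* Worked example: aarch64_uxt_size (2, 0x3fc) returns 8, because
   0xff << 2 == 0x3fc, i.e. the operand behaves like a UXTB followed by a
   left shift of 2, as in "add x0, x1, w2, uxtb #2".  Likewise
   aarch64_uxt_size (0, 0xffff) returns 16 (UXTH), while any shift larger
   than 3, or a mask that is not a shifted 8/16/32-bit block, gives 0.  */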
7711 /* Constant pools are per-function only when PC-relative
7712 literal loads are enabled or we are in the large memory
7713 model. */
7715 static inline bool
7716 aarch64_can_use_per_function_literal_pools_p (void)
7718 return (aarch64_pcrelative_literal_loads
7719 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7722 static bool
7723 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7725 /* We can't use blocks for constants when we're using a per-function
7726 constant pool. */
7727 return !aarch64_can_use_per_function_literal_pools_p ();
7730 /* Select appropriate section for constants depending
7731 on where we place literal pools. */
7733 static section *
7734 aarch64_select_rtx_section (machine_mode mode,
7735 rtx x,
7736 unsigned HOST_WIDE_INT align)
7738 if (aarch64_can_use_per_function_literal_pools_p ())
7739 return function_section (current_function_decl);
7741 return default_elf_select_rtx_section (mode, x, align);
7744 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7745 void
7746 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7747 HOST_WIDE_INT offset)
7749 /* When using per-function literal pools, we must ensure that any code
7750 section is aligned to the minimal instruction length, lest we get
7751 errors from the assembler re "unaligned instructions". */
7752 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7753 ASM_OUTPUT_ALIGN (f, 2);
7756 /* Costs. */
7758 /* Helper function for rtx cost calculation. Strip a shift expression
7759 from X. Returns the inner operand if successful, or the original
7760 expression on failure. */
7761 static rtx
7762 aarch64_strip_shift (rtx x)
7764 rtx op = x;
7766 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7767 we can convert both to ROR during final output. */
7768 if ((GET_CODE (op) == ASHIFT
7769 || GET_CODE (op) == ASHIFTRT
7770 || GET_CODE (op) == LSHIFTRT
7771 || GET_CODE (op) == ROTATERT
7772 || GET_CODE (op) == ROTATE)
7773 && CONST_INT_P (XEXP (op, 1)))
7774 return XEXP (op, 0);
7776 if (GET_CODE (op) == MULT
7777 && CONST_INT_P (XEXP (op, 1))
7778 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7779 return XEXP (op, 0);
7781 return x;
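/* For example, (ashift (reg X) (const_int 3)) and
   (mult (reg X) (const_int 8)) both strip down to (reg X), since a
   multiply by a power of two is just a shift in disguise, whereas
   (ashift (reg X) (reg Y)) is returned unchanged because the shift
   amount is not constant.  */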
7784 /* Helper function for rtx cost calculation. Strip an extend
7785 expression from X. Returns the inner operand if successful, or the
7786 original expression on failure. We deal with a number of possible
7787 canonicalization variations here. If STRIP_SHIFT is true, then
7788 we can strip off a shift also. */
7789 static rtx
7790 aarch64_strip_extend (rtx x, bool strip_shift)
7792 scalar_int_mode mode;
7793 rtx op = x;
7795 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7796 return op;
7798 /* Zero and sign extraction of a widened value. */
7799 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7800 && XEXP (op, 2) == const0_rtx
7801 && GET_CODE (XEXP (op, 0)) == MULT
7802 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7803 XEXP (op, 1)))
7804 return XEXP (XEXP (op, 0), 0);
7806 /* It can also be represented (for zero-extend) as an AND with an
7807 immediate. */
7808 if (GET_CODE (op) == AND
7809 && GET_CODE (XEXP (op, 0)) == MULT
7810 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7811 && CONST_INT_P (XEXP (op, 1))
7812 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7813 INTVAL (XEXP (op, 1))) != 0)
7814 return XEXP (XEXP (op, 0), 0);
7816 /* Now handle extended register, as this may also have an optional
7817 left shift by 1..4. */
7818 if (strip_shift
7819 && GET_CODE (op) == ASHIFT
7820 && CONST_INT_P (XEXP (op, 1))
7821 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7822 op = XEXP (op, 0);
7824 if (GET_CODE (op) == ZERO_EXTEND
7825 || GET_CODE (op) == SIGN_EXTEND)
7826 op = XEXP (op, 0);
7828 if (op != x)
7829 return op;
7831 return x;
7834 /* Return true iff CODE is a shift supported in combination
7835 with arithmetic instructions. */
7837 static bool
7838 aarch64_shift_p (enum rtx_code code)
7840 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7844 /* Return true iff X is a cheap shift without a sign extend. */
7846 static bool
7847 aarch64_cheap_mult_shift_p (rtx x)
7849 rtx op0, op1;
7851 op0 = XEXP (x, 0);
7852 op1 = XEXP (x, 1);
7854 if (!(aarch64_tune_params.extra_tuning_flags
7855 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7856 return false;
7858 if (GET_CODE (op0) == SIGN_EXTEND)
7859 return false;
7861 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7862 && UINTVAL (op1) <= 4)
7863 return true;
7865 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7866 return false;
7868 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7870 if (l2 > 0 && l2 <= 4)
7871 return true;
7873 return false;
7876 /* Helper function for rtx cost calculation. Calculate the cost of
7877 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7878 Return the calculated cost of the expression, recursing manually into
7879 operands where needed. */
7881 static int
7882 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7884 rtx op0, op1;
7885 const struct cpu_cost_table *extra_cost
7886 = aarch64_tune_params.insn_extra_cost;
7887 int cost = 0;
7888 bool compound_p = (outer == PLUS || outer == MINUS);
7889 machine_mode mode = GET_MODE (x);
7891 gcc_checking_assert (code == MULT);
7893 op0 = XEXP (x, 0);
7894 op1 = XEXP (x, 1);
7896 if (VECTOR_MODE_P (mode))
7897 mode = GET_MODE_INNER (mode);
7899 /* Integer multiply/fma. */
7900 if (GET_MODE_CLASS (mode) == MODE_INT)
7902 /* The multiply will be canonicalized as a shift, cost it as such. */
7903 if (aarch64_shift_p (GET_CODE (x))
7904 || (CONST_INT_P (op1)
7905 && exact_log2 (INTVAL (op1)) > 0))
7907 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7908 || GET_CODE (op0) == SIGN_EXTEND;
7909 if (speed)
7911 if (compound_p)
7913 /* If the shift is considered cheap,
7914 then don't add any cost. */
7915 if (aarch64_cheap_mult_shift_p (x))
7917 else if (REG_P (op1))
7918 /* ARITH + shift-by-register. */
7919 cost += extra_cost->alu.arith_shift_reg;
7920 else if (is_extend)
7921 /* ARITH + extended register. We don't have a cost field
7922 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7923 cost += extra_cost->alu.extend_arith;
7924 else
7925 /* ARITH + shift-by-immediate. */
7926 cost += extra_cost->alu.arith_shift;
7928 else
7929 /* LSL (immediate). */
7930 cost += extra_cost->alu.shift;
7933 /* Strip extends as we will have costed them in the case above. */
7934 if (is_extend)
7935 op0 = aarch64_strip_extend (op0, true);
7937 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7939 return cost;
7942 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7943 compound and let the below cases handle it. After all, MNEG is a
7944 special-case alias of MSUB. */
7945 if (GET_CODE (op0) == NEG)
7947 op0 = XEXP (op0, 0);
7948 compound_p = true;
7951 /* Integer multiplies or FMAs have zero/sign extending variants. */
7952 if ((GET_CODE (op0) == ZERO_EXTEND
7953 && GET_CODE (op1) == ZERO_EXTEND)
7954 || (GET_CODE (op0) == SIGN_EXTEND
7955 && GET_CODE (op1) == SIGN_EXTEND))
7957 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7958 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7960 if (speed)
7962 if (compound_p)
7963 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7964 cost += extra_cost->mult[0].extend_add;
7965 else
7966 /* MUL/SMULL/UMULL. */
7967 cost += extra_cost->mult[0].extend;
7970 return cost;
7973 /* This is either an integer multiply or a MADD. In both cases
7974 we want to recurse and cost the operands. */
7975 cost += rtx_cost (op0, mode, MULT, 0, speed);
7976 cost += rtx_cost (op1, mode, MULT, 1, speed);
7978 if (speed)
7980 if (compound_p)
7981 /* MADD/MSUB. */
7982 cost += extra_cost->mult[mode == DImode].add;
7983 else
7984 /* MUL. */
7985 cost += extra_cost->mult[mode == DImode].simple;
7988 return cost;
7990 else
7992 if (speed)
7994 /* Floating-point FMA/FMUL can also support negations of the
7995 operands, unless the rounding mode is upward or downward in
7996 which case FNMUL is different from FMUL with operand negation. */
7997 bool neg0 = GET_CODE (op0) == NEG;
7998 bool neg1 = GET_CODE (op1) == NEG;
7999 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8001 if (neg0)
8002 op0 = XEXP (op0, 0);
8003 if (neg1)
8004 op1 = XEXP (op1, 0);
8007 if (compound_p)
8008 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8009 cost += extra_cost->fp[mode == DFmode].fma;
8010 else
8011 /* FMUL/FNMUL. */
8012 cost += extra_cost->fp[mode == DFmode].mult;
8015 cost += rtx_cost (op0, mode, MULT, 0, speed);
8016 cost += rtx_cost (op1, mode, MULT, 1, speed);
8017 return cost;
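/* Illustrative example (the cost fields are the ones used above, the
   rtx shape is hypothetical): costing
   (plus (mult (reg x) (const_int 4)) (reg y)) with OUTER == PLUS treats
   the multiply as a shift by 2, so a speed costing adds
   extra_cost->alu.arith_shift for the combined ADD-with-LSL, or nothing
   at all if the tuning sets AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND, plus
   the recursive cost of (reg x).  The (reg y) operand is costed by the
   caller.  */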
8021 static int
8022 aarch64_address_cost (rtx x,
8023 machine_mode mode,
8024 addr_space_t as ATTRIBUTE_UNUSED,
8025 bool speed)
8027 enum rtx_code c = GET_CODE (x);
8028 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8029 struct aarch64_address_info info;
8030 int cost = 0;
8031 info.shift = 0;
8033 if (!aarch64_classify_address (&info, x, mode, false))
8035 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8037 /* This is a CONST or SYMBOL ref which will be split
8038 in a different way depending on the code model in use.
8039 Cost it through the generic infrastructure. */
8040 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8041 /* Divide through by the cost of one instruction to
8042 bring it to the same units as the address costs. */
8043 cost_symbol_ref /= COSTS_N_INSNS (1);
8044 /* The cost is then the cost of preparing the address,
8045 followed by an immediate (possibly 0) offset. */
8046 return cost_symbol_ref + addr_cost->imm_offset;
8048 else
8050 /* This is most likely a jump table from a case
8051 statement. */
8052 return addr_cost->register_offset;
8056 switch (info.type)
8058 case ADDRESS_LO_SUM:
8059 case ADDRESS_SYMBOLIC:
8060 case ADDRESS_REG_IMM:
8061 cost += addr_cost->imm_offset;
8062 break;
8064 case ADDRESS_REG_WB:
8065 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8066 cost += addr_cost->pre_modify;
8067 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8068 cost += addr_cost->post_modify;
8069 else
8070 gcc_unreachable ();
8072 break;
8074 case ADDRESS_REG_REG:
8075 cost += addr_cost->register_offset;
8076 break;
8078 case ADDRESS_REG_SXTW:
8079 cost += addr_cost->register_sextend;
8080 break;
8082 case ADDRESS_REG_UXTW:
8083 cost += addr_cost->register_zextend;
8084 break;
8086 default:
8087 gcc_unreachable ();
8091 if (info.shift > 0)
8093 /* For the sake of calculating the cost of the shifted register
8094 component, we can treat same sized modes in the same way. */
8095 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8096 cost += addr_cost->addr_scale_costs.hi;
8097 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8098 cost += addr_cost->addr_scale_costs.si;
8099 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8100 cost += addr_cost->addr_scale_costs.di;
8101 else
8102 /* We can't tell, or this is a 128-bit vector. */
8103 cost += addr_cost->addr_scale_costs.ti;
8106 return cost;
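/* Rough mapping from address forms to the buckets used above, with the
   registers chosen purely for illustration:
     [x0, #16]                -> imm_offset
     [x0, x1]                 -> register_offset
     [x0, x1, lsl #3]         -> register_offset plus a scale cost keyed
                                 on the access size
     [x0, w1, sxtw #2]        -> register_sextend
     [x0, w1, uxtw #2]        -> register_zextend
     pre/post inc, dec and modify forms -> pre_modify or post_modify.  */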
8109 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8110 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8111 to be taken. */
8114 aarch64_branch_cost (bool speed_p, bool predictable_p)
8116 /* When optimizing for speed, use the cost of unpredictable branches. */
8117 const struct cpu_branch_cost *branch_costs =
8118 aarch64_tune_params.branch_costs;
8120 if (!speed_p || predictable_p)
8121 return branch_costs->predictable;
8122 else
8123 return branch_costs->unpredictable;
8126 /* Return true if the RTX X in mode MODE is a zero or sign extract
8127 usable in an ADD or SUB (extended register) instruction. */
8128 static bool
8129 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8131 /* Catch add with a sign extract.
8132 This is add_<optab><mode>_multp2. */
8133 if (GET_CODE (x) == SIGN_EXTRACT
8134 || GET_CODE (x) == ZERO_EXTRACT)
8136 rtx op0 = XEXP (x, 0);
8137 rtx op1 = XEXP (x, 1);
8138 rtx op2 = XEXP (x, 2);
8140 if (GET_CODE (op0) == MULT
8141 && CONST_INT_P (op1)
8142 && op2 == const0_rtx
8143 && CONST_INT_P (XEXP (op0, 1))
8144 && aarch64_is_extend_from_extract (mode,
8145 XEXP (op0, 1),
8146 op1))
8148 return true;
8151 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8152 No shift. */
8153 else if (GET_CODE (x) == SIGN_EXTEND
8154 || GET_CODE (x) == ZERO_EXTEND)
8155 return REG_P (XEXP (x, 0));
8157 return false;
8160 static bool
8161 aarch64_frint_unspec_p (unsigned int u)
8163 switch (u)
8165 case UNSPEC_FRINTZ:
8166 case UNSPEC_FRINTP:
8167 case UNSPEC_FRINTM:
8168 case UNSPEC_FRINTA:
8169 case UNSPEC_FRINTN:
8170 case UNSPEC_FRINTX:
8171 case UNSPEC_FRINTI:
8172 return true;
8174 default:
8175 return false;
8179 /* Return true iff X is an rtx that will match an extr instruction
8180 i.e. as described in the *extr<mode>5_insn family of patterns.
8181 OP0 and OP1 will be set to the operands of the shifts involved
8182 on success and will be NULL_RTX otherwise. */
8184 static bool
8185 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8187 rtx op0, op1;
8188 scalar_int_mode mode;
8189 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8190 return false;
8192 *res_op0 = NULL_RTX;
8193 *res_op1 = NULL_RTX;
8195 if (GET_CODE (x) != IOR)
8196 return false;
8198 op0 = XEXP (x, 0);
8199 op1 = XEXP (x, 1);
8201 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8202 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8204 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8205 if (GET_CODE (op1) == ASHIFT)
8206 std::swap (op0, op1);
8208 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8209 return false;
8211 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8212 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8214 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8215 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8217 *res_op0 = XEXP (op0, 0);
8218 *res_op1 = XEXP (op1, 0);
8219 return true;
8223 return false;
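/* Example of the shape this recognizes, in DImode:
     (ior (ashift (reg a) (const_int 16)) (lshiftrt (reg b) (const_int 48)))
   matches because 16 + 48 == 64 and corresponds to
   "extr x0, xa, xb, #48", which computes ((a << 16) | (b >> 48)).
   The register names here are illustrative only.  */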
8226 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8227 storing it in *COST. Result is true if the total cost of the operation
8228 has now been calculated. */
8229 static bool
8230 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8232 rtx inner;
8233 rtx comparator;
8234 enum rtx_code cmpcode;
8236 if (COMPARISON_P (op0))
8238 inner = XEXP (op0, 0);
8239 comparator = XEXP (op0, 1);
8240 cmpcode = GET_CODE (op0);
8242 else
8244 inner = op0;
8245 comparator = const0_rtx;
8246 cmpcode = NE;
8249 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8251 /* Conditional branch. */
8252 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8253 return true;
8254 else
8256 if (cmpcode == NE || cmpcode == EQ)
8258 if (comparator == const0_rtx)
8260 /* TBZ/TBNZ/CBZ/CBNZ. */
8261 if (GET_CODE (inner) == ZERO_EXTRACT)
8262 /* TBZ/TBNZ. */
8263 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8264 ZERO_EXTRACT, 0, speed);
8265 else
8266 /* CBZ/CBNZ. */
8267 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8269 return true;
8272 else if (cmpcode == LT || cmpcode == GE)
8274 /* TBZ/TBNZ. */
8275 if (comparator == const0_rtx)
8276 return true;
8280 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8282 /* CCMP. */
8283 if (GET_CODE (op1) == COMPARE)
8285 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8286 if (XEXP (op1, 1) == const0_rtx)
8287 *cost += 1;
8288 if (speed)
8290 machine_mode mode = GET_MODE (XEXP (op1, 0));
8291 const struct cpu_cost_table *extra_cost
8292 = aarch64_tune_params.insn_extra_cost;
8294 if (GET_MODE_CLASS (mode) == MODE_INT)
8295 *cost += extra_cost->alu.arith;
8296 else
8297 *cost += extra_cost->fp[mode == DFmode].compare;
8299 return true;
8302 /* It's a conditional operation based on the status flags,
8303 so it must be some flavor of CSEL. */
8305 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8306 if (GET_CODE (op1) == NEG
8307 || GET_CODE (op1) == NOT
8308 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8309 op1 = XEXP (op1, 0);
8310 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8312 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8313 op1 = XEXP (op1, 0);
8314 op2 = XEXP (op2, 0);
8317 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8318 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8319 return true;
8322 /* We don't know what this is, cost all operands. */
8323 return false;
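/* For example, a conditional branch such as
     (if_then_else (eq (reg x) (const_int 0)) (label_ref L) (pc))
   is treated as a CBZ above, so only the compared register is costed,
   while a branch on (lt (reg x) (const_int 0)) against zero is treated
   as a TBZ/TBNZ on the sign bit and adds no extra cost at all.  */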
8326 /* Check whether X is a bitfield operation of the form shift + extend that
8327 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8328 operand to which the bitfield operation is applied. Otherwise return
8329 NULL_RTX. */
8331 static rtx
8332 aarch64_extend_bitfield_pattern_p (rtx x)
8334 rtx_code outer_code = GET_CODE (x);
8335 machine_mode outer_mode = GET_MODE (x);
8337 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8338 && outer_mode != SImode && outer_mode != DImode)
8339 return NULL_RTX;
8341 rtx inner = XEXP (x, 0);
8342 rtx_code inner_code = GET_CODE (inner);
8343 machine_mode inner_mode = GET_MODE (inner);
8344 rtx op = NULL_RTX;
8346 switch (inner_code)
8348 case ASHIFT:
8349 if (CONST_INT_P (XEXP (inner, 1))
8350 && (inner_mode == QImode || inner_mode == HImode))
8351 op = XEXP (inner, 0);
8352 break;
8353 case LSHIFTRT:
8354 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8355 && (inner_mode == QImode || inner_mode == HImode))
8356 op = XEXP (inner, 0);
8357 break;
8358 case ASHIFTRT:
8359 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8360 && (inner_mode == QImode || inner_mode == HImode))
8361 op = XEXP (inner, 0);
8362 break;
8363 default:
8364 break;
8367 return op;
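/* For instance, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   returns the inner (reg:HI x); that shape corresponds to a UBFX, while
   the sign-extend/arithmetic-shift variant corresponds to an SBFX and
   the extend-of-ashift variant to UBFIZ/SBFIZ.  */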
8370 /* Return true if the mask and a shift amount from an RTX of the form
8371 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8372 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8374 bool
8375 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8376 rtx shft_amnt)
8378 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8379 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8380 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8381 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
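/* Worked example in SImode: for (x << 8) & 0xff00, the mask shifted
   right by 8 is 0xff, 0xff + 1 is a power of two, and the low 8 bits of
   the mask are clear, so the combination is accepted and maps to
   "ubfiz w0, w1, #8, #8" (deposit the low 8 bits of w1 at bit 8).
   By contrast (x << 8) & 0xff0 is rejected, since bits below the shift
   amount are set in the mask.  The register numbers are illustrative.  */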
8384 /* Calculate the cost of calculating X, storing it in *COST. Result
8385 is true if the total cost of the operation has now been calculated. */
8386 static bool
8387 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8388 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8390 rtx op0, op1, op2;
8391 const struct cpu_cost_table *extra_cost
8392 = aarch64_tune_params.insn_extra_cost;
8393 int code = GET_CODE (x);
8394 scalar_int_mode int_mode;
8396 /* By default, assume that everything has equivalent cost to the
8397 cheapest instruction. Any additional costs are applied as a delta
8398 above this default. */
8399 *cost = COSTS_N_INSNS (1);
8401 switch (code)
8403 case SET:
8404 /* The cost depends entirely on the operands to SET. */
8405 *cost = 0;
8406 op0 = SET_DEST (x);
8407 op1 = SET_SRC (x);
8409 switch (GET_CODE (op0))
8411 case MEM:
8412 if (speed)
8414 rtx address = XEXP (op0, 0);
8415 if (VECTOR_MODE_P (mode))
8416 *cost += extra_cost->ldst.storev;
8417 else if (GET_MODE_CLASS (mode) == MODE_INT)
8418 *cost += extra_cost->ldst.store;
8419 else if (mode == SFmode)
8420 *cost += extra_cost->ldst.storef;
8421 else if (mode == DFmode)
8422 *cost += extra_cost->ldst.stored;
8424 *cost +=
8425 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8426 0, speed));
8429 *cost += rtx_cost (op1, mode, SET, 1, speed);
8430 return true;
8432 case SUBREG:
8433 if (! REG_P (SUBREG_REG (op0)))
8434 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8436 /* Fall through. */
8437 case REG:
8438 /* The cost is one per vector-register copied. */
8439 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8441 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8442 *cost = COSTS_N_INSNS (nregs);
8444 /* const0_rtx is in general free, but we will use an
8445 instruction to set a register to 0. */
8446 else if (REG_P (op1) || op1 == const0_rtx)
8448 /* The cost is 1 per register copied. */
8449 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8450 *cost = COSTS_N_INSNS (nregs);
8452 else
8453 /* Cost is just the cost of the RHS of the set. */
8454 *cost += rtx_cost (op1, mode, SET, 1, speed);
8455 return true;
8457 case ZERO_EXTRACT:
8458 case SIGN_EXTRACT:
8459 /* Bit-field insertion. Strip any redundant widening of
8460 the RHS to meet the width of the target. */
8461 if (GET_CODE (op1) == SUBREG)
8462 op1 = SUBREG_REG (op1);
8463 if ((GET_CODE (op1) == ZERO_EXTEND
8464 || GET_CODE (op1) == SIGN_EXTEND)
8465 && CONST_INT_P (XEXP (op0, 1))
8466 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8467 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8468 op1 = XEXP (op1, 0);
8470 if (CONST_INT_P (op1))
8472 /* MOV immediate is assumed to always be cheap. */
8473 *cost = COSTS_N_INSNS (1);
8475 else
8477 /* BFM. */
8478 if (speed)
8479 *cost += extra_cost->alu.bfi;
8480 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8483 return true;
8485 default:
8486 /* We can't make sense of this, assume default cost. */
8487 *cost = COSTS_N_INSNS (1);
8488 return false;
8490 return false;
8492 case CONST_INT:
8493 /* If an instruction can incorporate a constant within the
8494 instruction, the instruction's expression avoids calling
8495 rtx_cost() on the constant. If rtx_cost() is called on a
8496 constant, then it is usually because the constant must be
8497 moved into a register by one or more instructions.
8499 The exception is constant 0, which can be expressed
8500 as XZR/WZR and is therefore free. The exception to this is
8501 if we have (set (reg) (const0_rtx)) in which case we must cost
8502 the move. However, we can catch that when we cost the SET, so
8503 we don't need to consider that here. */
8504 if (x == const0_rtx)
8505 *cost = 0;
8506 else
8508 /* To an approximation, building any other constant is
8509 proportionally expensive to the number of instructions
8510 required to build that constant. This is true whether we
8511 are compiling for SPEED or otherwise. */
8512 if (!is_a <scalar_int_mode> (mode, &int_mode))
8513 int_mode = word_mode;
8514 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8515 (NULL_RTX, x, false, int_mode));
8517 return true;
8519 case CONST_DOUBLE:
8521 /* First determine number of instructions to do the move
8522 as an integer constant. */
8523 if (!aarch64_float_const_representable_p (x)
8524 && !aarch64_can_const_movi_rtx_p (x, mode)
8525 && aarch64_float_const_rtx_p (x))
8527 unsigned HOST_WIDE_INT ival;
8528 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8529 gcc_assert (succeed);
8531 scalar_int_mode imode = (mode == HFmode
8532 ? SImode
8533 : int_mode_for_mode (mode).require ());
8534 int ncost = aarch64_internal_mov_immediate
8535 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8536 *cost += COSTS_N_INSNS (ncost);
8537 return true;
8540 if (speed)
8542 /* mov[df,sf]_aarch64. */
8543 if (aarch64_float_const_representable_p (x))
8544 /* FMOV (scalar immediate). */
8545 *cost += extra_cost->fp[mode == DFmode].fpconst;
8546 else if (!aarch64_float_const_zero_rtx_p (x))
8548 /* This will be a load from memory. */
8549 if (mode == DFmode)
8550 *cost += extra_cost->ldst.loadd;
8551 else
8552 *cost += extra_cost->ldst.loadf;
8554 else
8555 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8556 or MOV v0.s[0], wzr - neither of which is modeled by the
8557 cost tables. Just use the default cost. */
8562 return true;
8564 case MEM:
8565 if (speed)
8567 /* For loads we want the base cost of a load, plus an
8568 approximation for the additional cost of the addressing
8569 mode. */
8570 rtx address = XEXP (x, 0);
8571 if (VECTOR_MODE_P (mode))
8572 *cost += extra_cost->ldst.loadv;
8573 else if (GET_MODE_CLASS (mode) == MODE_INT)
8574 *cost += extra_cost->ldst.load;
8575 else if (mode == SFmode)
8576 *cost += extra_cost->ldst.loadf;
8577 else if (mode == DFmode)
8578 *cost += extra_cost->ldst.loadd;
8580 *cost +=
8581 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8582 0, speed));
8585 return true;
8587 case NEG:
8588 op0 = XEXP (x, 0);
8590 if (VECTOR_MODE_P (mode))
8592 if (speed)
8594 /* FNEG. */
8595 *cost += extra_cost->vect.alu;
8597 return false;
8600 if (GET_MODE_CLASS (mode) == MODE_INT)
8602 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8603 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8605 /* CSETM. */
8606 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8607 return true;
8610 /* Cost this as SUB wzr, X. */
8611 op0 = CONST0_RTX (mode);
8612 op1 = XEXP (x, 0);
8613 goto cost_minus;
8616 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8618 /* Support (neg(fma...)) as a single instruction only if
8619 sign of zeros is unimportant. This matches the decision
8620 making in aarch64.md. */
8621 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8623 /* FNMADD. */
8624 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8625 return true;
8627 if (GET_CODE (op0) == MULT)
8629 /* FNMUL. */
8630 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8631 return true;
8633 if (speed)
8634 /* FNEG. */
8635 *cost += extra_cost->fp[mode == DFmode].neg;
8636 return false;
8639 return false;
8641 case CLRSB:
8642 case CLZ:
8643 if (speed)
8645 if (VECTOR_MODE_P (mode))
8646 *cost += extra_cost->vect.alu;
8647 else
8648 *cost += extra_cost->alu.clz;
8651 return false;
8653 case COMPARE:
8654 op0 = XEXP (x, 0);
8655 op1 = XEXP (x, 1);
8657 if (op1 == const0_rtx
8658 && GET_CODE (op0) == AND)
8660 x = op0;
8661 mode = GET_MODE (op0);
8662 goto cost_logic;
8665 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8667 /* TODO: A write to the CC flags possibly costs extra; this
8668 needs encoding in the cost tables. */
8670 mode = GET_MODE (op0);
8671 /* ANDS. */
8672 if (GET_CODE (op0) == AND)
8674 x = op0;
8675 goto cost_logic;
8678 if (GET_CODE (op0) == PLUS)
8680 /* ADDS (and CMN alias). */
8681 x = op0;
8682 goto cost_plus;
8685 if (GET_CODE (op0) == MINUS)
8687 /* SUBS. */
8688 x = op0;
8689 goto cost_minus;
8692 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8693 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8694 && CONST_INT_P (XEXP (op0, 2)))
8696 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8697 Handle it here directly rather than going to cost_logic
8698 since we know the immediate generated for the TST is valid
8699 so we can avoid creating an intermediate rtx for it only
8700 for costing purposes. */
8701 if (speed)
8702 *cost += extra_cost->alu.logical;
8704 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8705 ZERO_EXTRACT, 0, speed);
8706 return true;
8709 if (GET_CODE (op1) == NEG)
8711 /* CMN. */
8712 if (speed)
8713 *cost += extra_cost->alu.arith;
8715 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8716 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8717 return true;
8720 /* CMP.
8722 Compare can freely swap the order of operands, and
8723 canonicalization puts the more complex operation first.
8724 But the integer MINUS logic expects the shift/extend
8725 operation in op1. */
8726 if (! (REG_P (op0)
8727 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8729 op0 = XEXP (x, 1);
8730 op1 = XEXP (x, 0);
8732 goto cost_minus;
8735 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8737 /* FCMP. */
8738 if (speed)
8739 *cost += extra_cost->fp[mode == DFmode].compare;
8741 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8743 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8744 /* FCMP supports constant 0.0 for no extra cost. */
8745 return true;
8747 return false;
8750 if (VECTOR_MODE_P (mode))
8752 /* Vector compare. */
8753 if (speed)
8754 *cost += extra_cost->vect.alu;
8756 if (aarch64_float_const_zero_rtx_p (op1))
8758 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8759 cost. */
8760 return true;
8762 return false;
8764 return false;
8766 case MINUS:
8768 op0 = XEXP (x, 0);
8769 op1 = XEXP (x, 1);
8771 cost_minus:
8772 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8774 /* Detect valid immediates. */
8775 if ((GET_MODE_CLASS (mode) == MODE_INT
8776 || (GET_MODE_CLASS (mode) == MODE_CC
8777 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8778 && CONST_INT_P (op1)
8779 && aarch64_uimm12_shift (INTVAL (op1)))
8781 if (speed)
8782 /* SUB(S) (immediate). */
8783 *cost += extra_cost->alu.arith;
8784 return true;
8787 /* Look for SUB (extended register). */
8788 if (is_a <scalar_int_mode> (mode, &int_mode)
8789 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8791 if (speed)
8792 *cost += extra_cost->alu.extend_arith;
8794 op1 = aarch64_strip_extend (op1, true);
8795 *cost += rtx_cost (op1, VOIDmode,
8796 (enum rtx_code) GET_CODE (op1), 0, speed);
8797 return true;
8800 rtx new_op1 = aarch64_strip_extend (op1, false);
8802 /* Cost this as an FMA-alike operation. */
8803 if ((GET_CODE (new_op1) == MULT
8804 || aarch64_shift_p (GET_CODE (new_op1)))
8805 && code != COMPARE)
8807 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8808 (enum rtx_code) code,
8809 speed);
8810 return true;
8813 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8815 if (speed)
8817 if (VECTOR_MODE_P (mode))
8819 /* Vector SUB. */
8820 *cost += extra_cost->vect.alu;
8822 else if (GET_MODE_CLASS (mode) == MODE_INT)
8824 /* SUB(S). */
8825 *cost += extra_cost->alu.arith;
8827 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8829 /* FSUB. */
8830 *cost += extra_cost->fp[mode == DFmode].addsub;
8833 return true;
8836 case PLUS:
8838 rtx new_op0;
8840 op0 = XEXP (x, 0);
8841 op1 = XEXP (x, 1);
8843 cost_plus:
8844 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8845 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8847 /* CSINC. */
8848 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8849 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8850 return true;
8853 if (GET_MODE_CLASS (mode) == MODE_INT
8854 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8855 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8857 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8859 if (speed)
8860 /* ADD (immediate). */
8861 *cost += extra_cost->alu.arith;
8862 return true;
8865 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8867 /* Look for ADD (extended register). */
8868 if (is_a <scalar_int_mode> (mode, &int_mode)
8869 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8871 if (speed)
8872 *cost += extra_cost->alu.extend_arith;
8874 op0 = aarch64_strip_extend (op0, true);
8875 *cost += rtx_cost (op0, VOIDmode,
8876 (enum rtx_code) GET_CODE (op0), 0, speed);
8877 return true;
8880 /* Strip any extend, leave shifts behind as we will
8881 cost them through mult_cost. */
8882 new_op0 = aarch64_strip_extend (op0, false);
8884 if (GET_CODE (new_op0) == MULT
8885 || aarch64_shift_p (GET_CODE (new_op0)))
8887 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8888 speed);
8889 return true;
8892 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8894 if (speed)
8896 if (VECTOR_MODE_P (mode))
8898 /* Vector ADD. */
8899 *cost += extra_cost->vect.alu;
8901 else if (GET_MODE_CLASS (mode) == MODE_INT)
8903 /* ADD. */
8904 *cost += extra_cost->alu.arith;
8906 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8908 /* FADD. */
8909 *cost += extra_cost->fp[mode == DFmode].addsub;
8912 return true;
8915 case BSWAP:
8916 *cost = COSTS_N_INSNS (1);
8918 if (speed)
8920 if (VECTOR_MODE_P (mode))
8921 *cost += extra_cost->vect.alu;
8922 else
8923 *cost += extra_cost->alu.rev;
8925 return false;
8927 case IOR:
8928 if (aarch_rev16_p (x))
8930 *cost = COSTS_N_INSNS (1);
8932 if (speed)
8934 if (VECTOR_MODE_P (mode))
8935 *cost += extra_cost->vect.alu;
8936 else
8937 *cost += extra_cost->alu.rev;
8939 return true;
8942 if (aarch64_extr_rtx_p (x, &op0, &op1))
8944 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8945 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8946 if (speed)
8947 *cost += extra_cost->alu.shift;
8949 return true;
8951 /* Fall through. */
8952 case XOR:
8953 case AND:
8954 cost_logic:
8955 op0 = XEXP (x, 0);
8956 op1 = XEXP (x, 1);
8958 if (VECTOR_MODE_P (mode))
8960 if (speed)
8961 *cost += extra_cost->vect.alu;
8962 return true;
8965 if (code == AND
8966 && GET_CODE (op0) == MULT
8967 && CONST_INT_P (XEXP (op0, 1))
8968 && CONST_INT_P (op1)
8969 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8970 INTVAL (op1)) != 0)
8972 /* This is a UBFM/SBFM. */
8973 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8974 if (speed)
8975 *cost += extra_cost->alu.bfx;
8976 return true;
8979 if (is_int_mode (mode, &int_mode))
8981 if (CONST_INT_P (op1))
8983 /* We have a mask + shift version of a UBFIZ
8984 i.e. the *andim_ashift<mode>_bfiz pattern. */
8985 if (GET_CODE (op0) == ASHIFT
8986 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
8987 XEXP (op0, 1)))
8989 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8990 (enum rtx_code) code, 0, speed);
8991 if (speed)
8992 *cost += extra_cost->alu.bfx;
8994 return true;
8996 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8998 /* We possibly get the immediate for free; this is not
8999 modelled. */
9000 *cost += rtx_cost (op0, int_mode,
9001 (enum rtx_code) code, 0, speed);
9002 if (speed)
9003 *cost += extra_cost->alu.logical;
9005 return true;
9008 else
9010 rtx new_op0 = op0;
9012 /* Handle ORN, EON, or BIC. */
9013 if (GET_CODE (op0) == NOT)
9014 op0 = XEXP (op0, 0);
9016 new_op0 = aarch64_strip_shift (op0);
9018 /* If we had a shift on op0 then this is a logical-shift-
9019 by-register/immediate operation. Otherwise, this is just
9020 a logical operation. */
9021 if (speed)
9023 if (new_op0 != op0)
9025 /* Shift by immediate. */
9026 if (CONST_INT_P (XEXP (op0, 1)))
9027 *cost += extra_cost->alu.log_shift;
9028 else
9029 *cost += extra_cost->alu.log_shift_reg;
9031 else
9032 *cost += extra_cost->alu.logical;
9035 /* In both cases we want to cost both operands. */
9036 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9037 0, speed);
9038 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9039 1, speed);
9041 return true;
9044 return false;
9046 case NOT:
9047 x = XEXP (x, 0);
9048 op0 = aarch64_strip_shift (x);
9050 if (VECTOR_MODE_P (mode))
9052 /* Vector NOT. */
9053 *cost += extra_cost->vect.alu;
9054 return false;
9057 /* MVN-shifted-reg. */
9058 if (op0 != x)
9060 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9062 if (speed)
9063 *cost += extra_cost->alu.log_shift;
9065 return true;
9067 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9068 Handle the second form here taking care that 'a' in the above can
9069 be a shift. */
9070 else if (GET_CODE (op0) == XOR)
9072 rtx newop0 = XEXP (op0, 0);
9073 rtx newop1 = XEXP (op0, 1);
9074 rtx op0_stripped = aarch64_strip_shift (newop0);
9076 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9077 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9079 if (speed)
9081 if (op0_stripped != newop0)
9082 *cost += extra_cost->alu.log_shift;
9083 else
9084 *cost += extra_cost->alu.logical;
9087 return true;
9089 /* MVN. */
9090 if (speed)
9091 *cost += extra_cost->alu.logical;
9093 return false;
9095 case ZERO_EXTEND:
9097 op0 = XEXP (x, 0);
9098 /* If a value is written in SI mode, then zero extended to DI
9099 mode, the operation will in general be free as a write to
9100 a 'w' register implicitly zeroes the upper bits of an 'x'
9101 register. However, if this is
9103 (set (reg) (zero_extend (reg)))
9105 we must cost the explicit register move. */
9106 if (mode == DImode
9107 && GET_MODE (op0) == SImode
9108 && outer == SET)
9110 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9112 /* If OP_COST is non-zero, then the cost of the zero extend
9113 is effectively the cost of the inner operation. Otherwise
9114 we have a MOV instruction and we take the cost from the MOV
9115 itself. This is true independently of whether we are
9116 optimizing for space or time. */
9117 if (op_cost)
9118 *cost = op_cost;
9120 return true;
9122 else if (MEM_P (op0))
9124 /* All loads can zero extend to any size for free. */
9125 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9126 return true;
9129 op0 = aarch64_extend_bitfield_pattern_p (x);
9130 if (op0)
9132 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9133 if (speed)
9134 *cost += extra_cost->alu.bfx;
9135 return true;
9138 if (speed)
9140 if (VECTOR_MODE_P (mode))
9142 /* UMOV. */
9143 *cost += extra_cost->vect.alu;
9145 else
9147 /* We generate an AND instead of UXTB/UXTH. */
9148 *cost += extra_cost->alu.logical;
9151 return false;
9153 case SIGN_EXTEND:
9154 if (MEM_P (XEXP (x, 0)))
9156 /* LDRSH. */
9157 if (speed)
9159 rtx address = XEXP (XEXP (x, 0), 0);
9160 *cost += extra_cost->ldst.load_sign_extend;
9162 *cost +=
9163 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9164 0, speed));
9166 return true;
9169 op0 = aarch64_extend_bitfield_pattern_p (x);
9170 if (op0)
9172 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9173 if (speed)
9174 *cost += extra_cost->alu.bfx;
9175 return true;
9178 if (speed)
9180 if (VECTOR_MODE_P (mode))
9181 *cost += extra_cost->vect.alu;
9182 else
9183 *cost += extra_cost->alu.extend;
9185 return false;
9187 case ASHIFT:
9188 op0 = XEXP (x, 0);
9189 op1 = XEXP (x, 1);
9191 if (CONST_INT_P (op1))
9193 if (speed)
9195 if (VECTOR_MODE_P (mode))
9197 /* Vector shift (immediate). */
9198 *cost += extra_cost->vect.alu;
9200 else
9202 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9203 aliases. */
9204 *cost += extra_cost->alu.shift;
9208 /* We can incorporate zero/sign extend for free. */
9209 if (GET_CODE (op0) == ZERO_EXTEND
9210 || GET_CODE (op0) == SIGN_EXTEND)
9211 op0 = XEXP (op0, 0);
9213 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9214 return true;
9216 else
9218 if (VECTOR_MODE_P (mode))
9220 if (speed)
9221 /* Vector shift (register). */
9222 *cost += extra_cost->vect.alu;
9224 else
9226 if (speed)
9227 /* LSLV. */
9228 *cost += extra_cost->alu.shift_reg;
9230 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9231 && CONST_INT_P (XEXP (op1, 1))
9232 && known_eq (INTVAL (XEXP (op1, 1)),
9233 GET_MODE_BITSIZE (mode) - 1))
9235 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9236 /* We already demanded XEXP (op1, 0) to be REG_P, so
9237 don't recurse into it. */
9238 return true;
9241 return false; /* All arguments need to be in registers. */
9244 case ROTATE:
9245 case ROTATERT:
9246 case LSHIFTRT:
9247 case ASHIFTRT:
9248 op0 = XEXP (x, 0);
9249 op1 = XEXP (x, 1);
9251 if (CONST_INT_P (op1))
9253 /* ASR (immediate) and friends. */
9254 if (speed)
9256 if (VECTOR_MODE_P (mode))
9257 *cost += extra_cost->vect.alu;
9258 else
9259 *cost += extra_cost->alu.shift;
9262 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9263 return true;
9265 else
9267 if (VECTOR_MODE_P (mode))
9269 if (speed)
9270 /* Vector shift (register). */
9271 *cost += extra_cost->vect.alu;
9273 else
9275 if (speed)
9276 /* ASR (register) and friends. */
9277 *cost += extra_cost->alu.shift_reg;
9279 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9280 && CONST_INT_P (XEXP (op1, 1))
9281 && known_eq (INTVAL (XEXP (op1, 1)),
9282 GET_MODE_BITSIZE (mode) - 1))
9284 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9285 /* We already demanded XEXP (op1, 0) to be REG_P, so
9286 don't recurse into it. */
9287 return true;
9290 return false; /* All arguments need to be in registers. */
9293 case SYMBOL_REF:
9295 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9296 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9298 /* LDR. */
9299 if (speed)
9300 *cost += extra_cost->ldst.load;
9302 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9303 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9305 /* ADRP, followed by ADD. */
9306 *cost += COSTS_N_INSNS (1);
9307 if (speed)
9308 *cost += 2 * extra_cost->alu.arith;
9310 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9311 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9313 /* ADR. */
9314 if (speed)
9315 *cost += extra_cost->alu.arith;
9318 if (flag_pic)
9320 /* One extra load instruction, after accessing the GOT. */
9321 *cost += COSTS_N_INSNS (1);
9322 if (speed)
9323 *cost += extra_cost->ldst.load;
9325 return true;
9327 case HIGH:
9328 case LO_SUM:
9329 /* ADRP/ADD (immediate). */
9330 if (speed)
9331 *cost += extra_cost->alu.arith;
9332 return true;
9334 case ZERO_EXTRACT:
9335 case SIGN_EXTRACT:
9336 /* UBFX/SBFX. */
9337 if (speed)
9339 if (VECTOR_MODE_P (mode))
9340 *cost += extra_cost->vect.alu;
9341 else
9342 *cost += extra_cost->alu.bfx;
9345 /* We can trust that the immediates used will be correct (there
9346 are no by-register forms), so we need only cost op0. */
9347 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9348 return true;
9350 case MULT:
9351 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9352 /* aarch64_rtx_mult_cost always handles recursion to its
9353 operands. */
9354 return true;
9356 case MOD:
9357 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
9358 ANDs and a CSNEG. Assume here that CSNEG costs the same as an
9359 unconditional negate. This case should only ever be reached through
9360 the set_smod_pow2_cheap check in expmed.c. */
9361 if (CONST_INT_P (XEXP (x, 1))
9362 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9363 && (mode == SImode || mode == DImode))
9365 /* We expand to 4 instructions. Reset the baseline. */
9366 *cost = COSTS_N_INSNS (4);
9368 if (speed)
9369 *cost += 2 * extra_cost->alu.logical
9370 + 2 * extra_cost->alu.arith;
9372 return true;
9375 /* Fall-through. */
9376 case UMOD:
9377 if (speed)
9379 /* Slightly prefer UMOD over SMOD. */
9380 if (VECTOR_MODE_P (mode))
9381 *cost += extra_cost->vect.alu;
9382 else if (GET_MODE_CLASS (mode) == MODE_INT)
9383 *cost += (extra_cost->mult[mode == DImode].add
9384 + extra_cost->mult[mode == DImode].idiv
9385 + (code == MOD ? 1 : 0));
9387 return false; /* All arguments need to be in registers. */
9389 case DIV:
9390 case UDIV:
9391 case SQRT:
9392 if (speed)
9394 if (VECTOR_MODE_P (mode))
9395 *cost += extra_cost->vect.alu;
9396 else if (GET_MODE_CLASS (mode) == MODE_INT)
9397 /* There is no integer SQRT, so only DIV and UDIV can get
9398 here. */
9399 *cost += (extra_cost->mult[mode == DImode].idiv
9400 /* Slightly prefer UDIV over SDIV. */
9401 + (code == DIV ? 1 : 0));
9402 else
9403 *cost += extra_cost->fp[mode == DFmode].div;
9405 return false; /* All arguments need to be in registers. */
9407 case IF_THEN_ELSE:
9408 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9409 XEXP (x, 2), cost, speed);
9411 case EQ:
9412 case NE:
9413 case GT:
9414 case GTU:
9415 case LT:
9416 case LTU:
9417 case GE:
9418 case GEU:
9419 case LE:
9420 case LEU:
9422 return false; /* All arguments must be in registers. */
9424 case FMA:
9425 op0 = XEXP (x, 0);
9426 op1 = XEXP (x, 1);
9427 op2 = XEXP (x, 2);
9429 if (speed)
9431 if (VECTOR_MODE_P (mode))
9432 *cost += extra_cost->vect.alu;
9433 else
9434 *cost += extra_cost->fp[mode == DFmode].fma;
9437 /* FMSUB, FNMADD, and FNMSUB are free. */
9438 if (GET_CODE (op0) == NEG)
9439 op0 = XEXP (op0, 0);
9441 if (GET_CODE (op2) == NEG)
9442 op2 = XEXP (op2, 0);
9444 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9445 and the by-element operand as operand 0. */
9446 if (GET_CODE (op1) == NEG)
9447 op1 = XEXP (op1, 0);
9449 /* Catch vector-by-element operations. The by-element operand can
9450 either be (vec_duplicate (vec_select (x))) or just
9451 (vec_select (x)), depending on whether we are multiplying by
9452 a vector or a scalar.
9454 Canonicalization is not very good in these cases: FMA4 will put the
9455 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9456 if (GET_CODE (op0) == VEC_DUPLICATE)
9457 op0 = XEXP (op0, 0);
9458 else if (GET_CODE (op1) == VEC_DUPLICATE)
9459 op1 = XEXP (op1, 0);
9461 if (GET_CODE (op0) == VEC_SELECT)
9462 op0 = XEXP (op0, 0);
9463 else if (GET_CODE (op1) == VEC_SELECT)
9464 op1 = XEXP (op1, 0);
9466 /* If the remaining parameters are not registers,
9467 get the cost to put them into registers. */
9468 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9469 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9470 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9471 return true;
9473 case FLOAT:
9474 case UNSIGNED_FLOAT:
9475 if (speed)
9476 *cost += extra_cost->fp[mode == DFmode].fromint;
9477 return false;
9479 case FLOAT_EXTEND:
9480 if (speed)
9482 if (VECTOR_MODE_P (mode))
9484 /* Vector widen. */
9485 *cost += extra_cost->vect.alu;
9487 else
9488 *cost += extra_cost->fp[mode == DFmode].widen;
9490 return false;
9492 case FLOAT_TRUNCATE:
9493 if (speed)
9495 if (VECTOR_MODE_P (mode))
9497 /* Vector conversion. */
9498 *cost += extra_cost->vect.alu;
9500 else
9501 *cost += extra_cost->fp[mode == DFmode].narrow;
9503 return false;
9505 case FIX:
9506 case UNSIGNED_FIX:
9507 x = XEXP (x, 0);
9508 /* Strip the rounding part. They will all be implemented
9509 by the fcvt* family of instructions anyway. */
9510 if (GET_CODE (x) == UNSPEC)
9512 unsigned int uns_code = XINT (x, 1);
9514 if (uns_code == UNSPEC_FRINTA
9515 || uns_code == UNSPEC_FRINTM
9516 || uns_code == UNSPEC_FRINTN
9517 || uns_code == UNSPEC_FRINTP
9518 || uns_code == UNSPEC_FRINTZ)
9519 x = XVECEXP (x, 0, 0);
9522 if (speed)
9524 if (VECTOR_MODE_P (mode))
9525 *cost += extra_cost->vect.alu;
9526 else
9527 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9530 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9531 fixed-point fcvt. */
9532 if (GET_CODE (x) == MULT
9533 && ((VECTOR_MODE_P (mode)
9534 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9535 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9537 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9538 0, speed);
9539 return true;
9542 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9543 return true;
9545 case ABS:
9546 if (VECTOR_MODE_P (mode))
9548 /* ABS (vector). */
9549 if (speed)
9550 *cost += extra_cost->vect.alu;
9552 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9554 op0 = XEXP (x, 0);
9556 /* FABD, which is analogous to FADD. */
9557 if (GET_CODE (op0) == MINUS)
9559 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9560 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9561 if (speed)
9562 *cost += extra_cost->fp[mode == DFmode].addsub;
9564 return true;
9566 /* Simple FABS is analogous to FNEG. */
9567 if (speed)
9568 *cost += extra_cost->fp[mode == DFmode].neg;
9570 else
9572 /* Integer ABS will either be split into
9573 two arithmetic instructions, or will be an ABS
9574 (scalar), which we don't model. */
9575 *cost = COSTS_N_INSNS (2);
9576 if (speed)
9577 *cost += 2 * extra_cost->alu.arith;
9579 return false;
9581 case SMAX:
9582 case SMIN:
9583 if (speed)
9585 if (VECTOR_MODE_P (mode))
9586 *cost += extra_cost->vect.alu;
9587 else
9589 /* FMAXNM/FMINNM/FMAX/FMIN.
9590 TODO: This may not be accurate for all implementations, but
9591 we do not model this in the cost tables. */
9592 *cost += extra_cost->fp[mode == DFmode].addsub;
9595 return false;
9597 case UNSPEC:
9598 /* The floating point round to integer frint* instructions. */
9599 if (aarch64_frint_unspec_p (XINT (x, 1)))
9601 if (speed)
9602 *cost += extra_cost->fp[mode == DFmode].roundint;
9604 return false;
9607 if (XINT (x, 1) == UNSPEC_RBIT)
9609 if (speed)
9610 *cost += extra_cost->alu.rev;
9612 return false;
9614 break;
9616 case TRUNCATE:
9618 /* Decompose <su>muldi3_highpart. */
9619 if (/* (truncate:DI */
9620 mode == DImode
9621 /* (lshiftrt:TI */
9622 && GET_MODE (XEXP (x, 0)) == TImode
9623 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9624 /* (mult:TI */
9625 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9626 /* (ANY_EXTEND:TI (reg:DI))
9627 (ANY_EXTEND:TI (reg:DI))) */
9628 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9629 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9630 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9631 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9632 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9633 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9634 /* (const_int 64) */
9635 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9636 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9638 /* UMULH/SMULH. */
9639 if (speed)
9640 *cost += extra_cost->mult[mode == DImode].extend;
9641 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9642 mode, MULT, 0, speed);
9643 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9644 mode, MULT, 1, speed);
9645 return true;
9648 /* Fall through. */
9649 default:
9650 break;
9653 if (dump_file
9654 && flag_aarch64_verbose_cost)
9655 fprintf (dump_file,
9656 "\nFailed to cost RTX. Assuming default cost.\n");
9658 return true;
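/* A small end-to-end example of how the pieces above combine (register
   and address choices are illustrative): costing
     (set (mem:DI (plus (reg x0) (const_int 16))) (reg x1))
   with SPEED set starts from *cost == 0 for the SET, adds
   extra_cost->ldst.store for the store itself, adds
   COSTS_N_INSNS (aarch64_address_cost (...)) for the reg+immediate
   address, and finally recurses into the stored register, which is
   free.  */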
9661 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9662 calculated for X. This cost is stored in *COST. Returns true
9663 if the total cost of X was calculated. */
9664 static bool
9665 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9666 int param, int *cost, bool speed)
9668 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9670 if (dump_file
9671 && flag_aarch64_verbose_cost)
9673 print_rtl_single (dump_file, x);
9674 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9675 speed ? "Hot" : "Cold",
9676 *cost, result ? "final" : "partial");
9679 return result;
9682 static int
9683 aarch64_register_move_cost (machine_mode mode,
9684 reg_class_t from_i, reg_class_t to_i)
9686 enum reg_class from = (enum reg_class) from_i;
9687 enum reg_class to = (enum reg_class) to_i;
9688 const struct cpu_regmove_cost *regmove_cost
9689 = aarch64_tune_params.regmove_cost;
9691 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9692 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9693 to = GENERAL_REGS;
9695 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9696 from = GENERAL_REGS;
9698 /* Moving between GPR and stack cost is the same as GP2GP. */
9699 if ((from == GENERAL_REGS && to == STACK_REG)
9700 || (to == GENERAL_REGS && from == STACK_REG))
9701 return regmove_cost->GP2GP;
9703 /* To/From the stack register, we move via the gprs. */
9704 if (to == STACK_REG || from == STACK_REG)
9705 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9706 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9708 if (known_eq (GET_MODE_SIZE (mode), 16))
9710 /* 128-bit operations on general registers require 2 instructions. */
9711 if (from == GENERAL_REGS && to == GENERAL_REGS)
9712 return regmove_cost->GP2GP * 2;
9713 else if (from == GENERAL_REGS)
9714 return regmove_cost->GP2FP * 2;
9715 else if (to == GENERAL_REGS)
9716 return regmove_cost->FP2GP * 2;
9718 /* When AdvSIMD instructions are disabled it is not possible to move
9719 a 128-bit value directly between Q registers. This is handled in
9720 secondary reload. A general register is used as a scratch to move
9721 the upper DI value and the lower DI value is moved directly,
9722 hence the cost is the sum of three moves. */
9723 if (! TARGET_SIMD)
9724 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9726 return regmove_cost->FP2FP;
9729 if (from == GENERAL_REGS && to == GENERAL_REGS)
9730 return regmove_cost->GP2GP;
9731 else if (from == GENERAL_REGS)
9732 return regmove_cost->GP2FP;
9733 else if (to == GENERAL_REGS)
9734 return regmove_cost->FP2GP;
9736 return regmove_cost->FP2FP;
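/* Worked example with made-up table values (GP2GP == 1, GP2FP == 5,
   FP2GP == 6, FP2FP == 2): a TImode copy between general registers
   costs 2 * 1 == 2, a general-to-FP copy costs 2 * 5 == 10, and, when
   !TARGET_SIMD, a 128-bit FP-to-FP copy goes through a general register
   and costs 5 + 6 + 2 == 13.  Smaller modes simply use the single-move
   entries.  */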
9739 static int
9740 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9741 reg_class_t rclass ATTRIBUTE_UNUSED,
9742 bool in ATTRIBUTE_UNUSED)
9744 return aarch64_tune_params.memmov_cost;
9747 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9748 to optimize 1.0/sqrt. */
9750 static bool
9751 use_rsqrt_p (machine_mode mode)
9753 return (!flag_trapping_math
9754 && flag_unsafe_math_optimizations
9755 && ((aarch64_tune_params.approx_modes->recip_sqrt
9756 & AARCH64_APPROX_MODE (mode))
9757 || flag_mrecip_low_precision_sqrt));
9760 /* Function to decide when to use the approximate reciprocal square root
9761 builtin. */
9763 static tree
9764 aarch64_builtin_reciprocal (tree fndecl)
9766 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9768 if (!use_rsqrt_p (mode))
9769 return NULL_TREE;
9770 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9773 /* Emit instruction sequence to compute either the approximate square root
9774 or its approximate reciprocal, depending on the flag RECP, and return
9775 whether the sequence was emitted or not. */
9777 bool
9778 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9780 machine_mode mode = GET_MODE (dst);
9782 if (GET_MODE_INNER (mode) == HFmode)
9784 gcc_assert (!recp);
9785 return false;
9788 if (!recp)
9790 if (!(flag_mlow_precision_sqrt
9791 || (aarch64_tune_params.approx_modes->sqrt
9792 & AARCH64_APPROX_MODE (mode))))
9793 return false;
9795 if (flag_finite_math_only
9796 || flag_trapping_math
9797 || !flag_unsafe_math_optimizations
9798 || optimize_function_for_size_p (cfun))
9799 return false;
9801 else
9802 /* Caller assumes we cannot fail. */
9803 gcc_assert (use_rsqrt_p (mode));
9805 machine_mode mmsk = mode_for_int_vector (mode).require ();
9806 rtx xmsk = gen_reg_rtx (mmsk);
9807 if (!recp)
9808 /* When calculating the approximate square root, compare the
9809 argument with 0.0 and create a mask. */
9810 emit_insn (gen_rtx_SET (xmsk,
9811 gen_rtx_NEG (mmsk,
9812 gen_rtx_EQ (mmsk, src,
9813 CONST0_RTX (mode)))));
9815 /* Estimate the approximate reciprocal square root. */
9816 rtx xdst = gen_reg_rtx (mode);
9817 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
9819 /* Iterate over the series twice for SF and thrice for DF. */
9820 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9822 /* Optionally perform one iteration fewer, trading some accuracy for
9823 faster performance. */
9824 if ((recp && flag_mrecip_low_precision_sqrt)
9825 || (!recp && flag_mlow_precision_sqrt))
9826 iterations--;
9828 /* Iterate over the series to calculate the approximate reciprocal square
9829 root. */
9830 rtx x1 = gen_reg_rtx (mode);
9831 while (iterations--)
9833 rtx x2 = gen_reg_rtx (mode);
9834 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9836 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
9838 if (iterations > 0)
9839 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9842 if (!recp)
9844 /* Qualify the approximate reciprocal square root when the argument is
9845 0.0 by squashing the intermediate result to 0.0. */
9846 rtx xtmp = gen_reg_rtx (mmsk);
9847 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9848 gen_rtx_SUBREG (mmsk, xdst, 0)));
9849 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9851 /* Calculate the approximate square root. */
9852 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9855 /* Finalize the approximation. */
9856 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9858 return true;
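/* A note on the math (a sketch of the standard analysis, not something this
   file states explicitly): FRSQRTE supplies an initial estimate x0 of
   1/sqrt (SRC), and each refinement step computes

       x_{n+1} = x_n * (3 - SRC * x_n * x_n) / 2

   since FRSQRTS (a, b) computes (3 - a * b) / 2; the multiply for the last
   step is folded into the finalizing multiplication above.  Each step
   roughly doubles the number of correct bits, hence two iterations for SF
   and three for DF.  For the non-reciprocal case the estimate is also
   multiplied by SRC, because sqrt (x) == x * (1 / sqrt (x)).  */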
9861 /* Emit the instruction sequence to compute the approximation for the division
9862 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9864 bool
9865 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9867 machine_mode mode = GET_MODE (quo);
9869 if (GET_MODE_INNER (mode) == HFmode)
9870 return false;
9872 bool use_approx_division_p = (flag_mlow_precision_div
9873 || (aarch64_tune_params.approx_modes->division
9874 & AARCH64_APPROX_MODE (mode)));
9876 if (!flag_finite_math_only
9877 || flag_trapping_math
9878 || !flag_unsafe_math_optimizations
9879 || optimize_function_for_size_p (cfun)
9880 || !use_approx_division_p)
9881 return false;
9883 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9884 return false;
9886 /* Estimate the approximate reciprocal. */
9887 rtx xrcp = gen_reg_rtx (mode);
9888 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
9890 /* Iterate over the series twice for SF and thrice for DF. */
9891 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9893 /* Optionally perform one iteration fewer, trading some accuracy for
9894 faster performance. */
9895 if (flag_mlow_precision_div)
9896 iterations--;
9898 /* Iterate over the series to calculate the approximate reciprocal. */
9899 rtx xtmp = gen_reg_rtx (mode);
9900 while (iterations--)
9902 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
9904 if (iterations > 0)
9905 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9908 if (num != CONST1_RTX (mode))
9910 /* As the approximate reciprocal of DEN is already calculated, only
9911 calculate the approximate division when NUM is not 1.0. */
9912 rtx xnum = force_reg (mode, num);
9913 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
9916 /* Finalize the approximation. */
9917 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
9918 return true;
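/* A note on the math, with a small model for illustration (not built, and
   not part of the original sequence): FRECPE supplies an initial estimate x0
   of 1/DEN, FRECPS (a, b) computes 2 - a * b, and each refinement step is
   the Newton-Raphson iteration x_{n+1} = x_n * (2 - DEN * x_n).  The
   multiply of the last step is folded into the multiplication by NUM above.
   A plain C sketch of the SF case (two iterations), with X0 standing in for
   the FRECPE result:  */
#if 0
static double
aarch64_approx_div_model (double num, double den, double x0)
{
  double x = x0;
  double step = 2.0 - x * den;	/* First FRECPS.  */
  x = x * step;			/* Refine the reciprocal estimate.  */
  step = 2.0 - x * den;		/* Second FRECPS.  */
  /* The final refinement multiply is fused with the multiply by NUM.  */
  return (num * x) * step;
}
#endif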
9921 /* Return the number of instructions that can be issued per cycle. */
9922 static int
9923 aarch64_sched_issue_rate (void)
9925 return aarch64_tune_params.issue_rate;
9928 static int
9929 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9931 int issue_rate = aarch64_sched_issue_rate ();
9933 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
9937 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9938 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9939 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9941 static int
9942 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
9943 int ready_index)
9945 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
9949 /* Vectorizer cost model target hooks. */
9951 /* Implement targetm.vectorize.builtin_vectorization_cost. */
9952 static int
9953 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
9954 tree vectype,
9955 int misalign ATTRIBUTE_UNUSED)
9957 unsigned elements;
9958 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
9959 bool fp = false;
9961 if (vectype != NULL)
9962 fp = FLOAT_TYPE_P (vectype);
9964 switch (type_of_cost)
9966 case scalar_stmt:
9967 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
9969 case scalar_load:
9970 return costs->scalar_load_cost;
9972 case scalar_store:
9973 return costs->scalar_store_cost;
9975 case vector_stmt:
9976 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9978 case vector_load:
9979 return costs->vec_align_load_cost;
9981 case vector_store:
9982 return costs->vec_store_cost;
9984 case vec_to_scalar:
9985 return costs->vec_to_scalar_cost;
9987 case scalar_to_vec:
9988 return costs->scalar_to_vec_cost;
9990 case unaligned_load:
9991 case vector_gather_load:
9992 return costs->vec_unalign_load_cost;
9994 case unaligned_store:
9995 case vector_scatter_store:
9996 return costs->vec_unalign_store_cost;
9998 case cond_branch_taken:
9999 return costs->cond_taken_branch_cost;
10001 case cond_branch_not_taken:
10002 return costs->cond_not_taken_branch_cost;
10004 case vec_perm:
10005 return costs->vec_permute_cost;
10007 case vec_promote_demote:
10008 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10010 case vec_construct:
10011 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10012 return elements / 2 + 1;
10014 default:
10015 gcc_unreachable ();
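/* For example (reading off the code above), a four-element vec_construct is
   costed as 4 / 2 + 1 = 3 units, while every other kind of statement simply
   returns the per-CPU figure recorded in aarch64_tune_params.vec_costs.  */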
10019 /* Implement targetm.vectorize.add_stmt_cost. */
10020 static unsigned
10021 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10022 struct _stmt_vec_info *stmt_info, int misalign,
10023 enum vect_cost_model_location where)
10025 unsigned *cost = (unsigned *) data;
10026 unsigned retval = 0;
10028 if (flag_vect_cost_model)
10030 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10031 int stmt_cost =
10032 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10034 /* Statements in an inner loop relative to the loop being
10035 vectorized are weighted more heavily. The value here is
10036 arbitrary and could potentially be improved with analysis. */
10037 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10038 count *= 50; /* FIXME */
10040 retval = (unsigned) (count * stmt_cost);
10041 cost[where] += retval;
10044 return retval;
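/* As a worked example of the accumulation above: a vector statement with
   COUNT == 2 whose aarch64_builtin_vectorization_cost is 1 contributes 2 to
   the vect_body bucket, or 100 if it sits in an inner loop relative to the
   loop being vectorized (the provisional 50x weighting).  */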
10047 static void initialize_aarch64_code_model (struct gcc_options *);
10049 /* Parse the TO_PARSE string and put the architecture struct that it
10050 selects into RES and the architectural features into ISA_FLAGS.
10051 Return an aarch64_parse_opt_result describing the parse result.
10052 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10054 static enum aarch64_parse_opt_result
10055 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10056 unsigned long *isa_flags)
10058 char *ext;
10059 const struct processor *arch;
10060 char *str = (char *) alloca (strlen (to_parse) + 1);
10061 size_t len;
10063 strcpy (str, to_parse);
10065 ext = strchr (str, '+');
10067 if (ext != NULL)
10068 len = ext - str;
10069 else
10070 len = strlen (str);
10072 if (len == 0)
10073 return AARCH64_PARSE_MISSING_ARG;
10076 /* Loop through the list of supported ARCHes to find a match. */
10077 for (arch = all_architectures; arch->name != NULL; arch++)
10079 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10081 unsigned long isa_temp = arch->flags;
10083 if (ext != NULL)
10085 /* TO_PARSE string contains at least one extension. */
10086 enum aarch64_parse_opt_result ext_res
10087 = aarch64_parse_extension (ext, &isa_temp);
10089 if (ext_res != AARCH64_PARSE_OK)
10090 return ext_res;
10092 /* Extension parsing was successful. Confirm the result
10093 arch and ISA flags. */
10094 *res = arch;
10095 *isa_flags = isa_temp;
10096 return AARCH64_PARSE_OK;
10100 /* ARCH name not found in list. */
10101 return AARCH64_PARSE_INVALID_ARG;
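/* For example, "-march=armv8-a+crc" (an illustrative architecture and
   extension; the authoritative lists are all_architectures and the table
   consulted by aarch64_parse_extension): the text before the first '+'
   selects the "armv8-a" entry, and the remaining "+crc" string is handed to
   aarch64_parse_extension to adjust the ISA flag set.  */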
10104 /* Parse the TO_PARSE string and put the CPU it selects into RES and the
10105 architectural feature flags into ISA_FLAGS. Return an aarch64_parse_opt_result
10106 describing the parse result. If there is an error parsing, RES and
10107 ISA_FLAGS are left unchanged. */
10109 static enum aarch64_parse_opt_result
10110 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10111 unsigned long *isa_flags)
10113 char *ext;
10114 const struct processor *cpu;
10115 char *str = (char *) alloca (strlen (to_parse) + 1);
10116 size_t len;
10118 strcpy (str, to_parse);
10120 ext = strchr (str, '+');
10122 if (ext != NULL)
10123 len = ext - str;
10124 else
10125 len = strlen (str);
10127 if (len == 0)
10128 return AARCH64_PARSE_MISSING_ARG;
10131 /* Loop through the list of supported CPUs to find a match. */
10132 for (cpu = all_cores; cpu->name != NULL; cpu++)
10134 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10136 unsigned long isa_temp = cpu->flags;
10139 if (ext != NULL)
10141 /* TO_PARSE string contains at least one extension. */
10142 enum aarch64_parse_opt_result ext_res
10143 = aarch64_parse_extension (ext, &isa_temp);
10145 if (ext_res != AARCH64_PARSE_OK)
10146 return ext_res;
10148 /* Extension parsing was successful. Confirm the result
10149 cpu and ISA flags. */
10150 *res = cpu;
10151 *isa_flags = isa_temp;
10152 return AARCH64_PARSE_OK;
10156 /* CPU name not found in list. */
10157 return AARCH64_PARSE_INVALID_ARG;
10160 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10161 Return an aarch64_parse_opt_result describing the parse result.
10162 If parsing fails, RES is left unchanged. */
10164 static enum aarch64_parse_opt_result
10165 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10167 const struct processor *cpu;
10168 char *str = (char *) alloca (strlen (to_parse) + 1);
10170 strcpy (str, to_parse);
10172 /* Loop through the list of supported CPUs to find a match. */
10173 for (cpu = all_cores; cpu->name != NULL; cpu++)
10175 if (strcmp (cpu->name, str) == 0)
10177 *res = cpu;
10178 return AARCH64_PARSE_OK;
10182 /* CPU name not found in list. */
10183 return AARCH64_PARSE_INVALID_ARG;
10186 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10187 described in FLAG. If it is, return the index bit for that fusion type.
10188 If not, error (printing OPTION_NAME) and return zero. */
10190 static unsigned int
10191 aarch64_parse_one_option_token (const char *token,
10192 size_t length,
10193 const struct aarch64_flag_desc *flag,
10194 const char *option_name)
10196 for (; flag->name != NULL; flag++)
10198 if (length == strlen (flag->name)
10199 && !strncmp (flag->name, token, length))
10200 return flag->flag;
10203 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10204 return 0;
10207 /* Parse OPTION, which is a '.'-separated list of flags to enable.
10208 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10209 default state we inherit from the CPU tuning structures. OPTION_NAME
10210 gives the top-level option we are parsing in the -moverride string,
10211 for use in error messages. */
10213 static unsigned int
10214 aarch64_parse_boolean_options (const char *option,
10215 const struct aarch64_flag_desc *flags,
10216 unsigned int initial_state,
10217 const char *option_name)
10219 const char separator = '.';
10220 const char* specs = option;
10221 const char* ntoken = option;
10222 unsigned int found_flags = initial_state;
10224 while ((ntoken = strchr (specs, separator)))
10226 size_t token_length = ntoken - specs;
10227 unsigned token_ops = aarch64_parse_one_option_token (specs,
10228 token_length,
10229 flags,
10230 option_name);
10231 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10232 in the token stream, reset the supported operations. So:
10234 adrp+add.cmp+branch.none.adrp+add
10236 would result in turning on only adrp+add fusion. */
10237 if (!token_ops)
10238 found_flags = 0;
10240 found_flags |= token_ops;
10241 specs = ++ntoken;
10244 /* The string ended with a trailing separator; report it as ill-formed. */
10245 if (!(*specs))
10247 error ("%s string ill-formed\n", option_name);
10248 return 0;
10251 /* We still have one more token to parse. */
10252 size_t token_length = strlen (specs);
10253 unsigned token_ops = aarch64_parse_one_option_token (specs,
10254 token_length,
10255 flags,
10256 option_name);
10257 if (!token_ops)
10258 found_flags = 0;
10260 found_flags |= token_ops;
10261 return found_flags;
10264 /* Support for overriding instruction fusion. */
10266 static void
10267 aarch64_parse_fuse_string (const char *fuse_string,
10268 struct tune_params *tune)
10270 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10271 aarch64_fusible_pairs,
10272 tune->fusible_ops,
10273 "fuse=");
10276 /* Support for overriding other tuning flags. */
10278 static void
10279 aarch64_parse_tune_string (const char *tune_string,
10280 struct tune_params *tune)
10282 tune->extra_tuning_flags
10283 = aarch64_parse_boolean_options (tune_string,
10284 aarch64_tuning_flags,
10285 tune->extra_tuning_flags,
10286 "tune=");
10289 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10290 we understand. If it is, extract the option string and hand it off to
10291 the appropriate function. */
10293 void
10294 aarch64_parse_one_override_token (const char* token,
10295 size_t length,
10296 struct tune_params *tune)
10298 const struct aarch64_tuning_override_function *fn
10299 = aarch64_tuning_override_functions;
10301 const char *option_part = strchr (token, '=');
10302 if (!option_part)
10304 error ("tuning string missing in option (%s)", token);
10305 return;
10308 /* Get the length of the option name. */
10309 length = option_part - token;
10310 /* Skip the '=' to get to the option string. */
10311 option_part++;
10313 for (; fn->name != NULL; fn++)
10315 if (!strncmp (fn->name, token, length))
10317 fn->parse_override (option_part, tune);
10318 return;
10322 error ("unknown tuning option (%s)", token);
10323 return;
10326 /* Validate and clamp the TLS size according to the selected code model. */
10328 static void
10329 initialize_aarch64_tls_size (struct gcc_options *opts)
10331 if (aarch64_tls_size == 0)
10332 aarch64_tls_size = 24;
10334 switch (opts->x_aarch64_cmodel_var)
10336 case AARCH64_CMODEL_TINY:
10337 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10338 needs two instructions to address, so we clamp the size to 24 bits. */
10339 if (aarch64_tls_size > 24)
10340 aarch64_tls_size = 24;
10341 break;
10342 case AARCH64_CMODEL_SMALL:
10343 /* The maximum TLS size allowed under small is 4G. */
10344 if (aarch64_tls_size > 32)
10345 aarch64_tls_size = 32;
10346 break;
10347 case AARCH64_CMODEL_LARGE:
10348 /* The maximum TLS size allowed under large is 16E.
10349 FIXME: 16E implies a 64-bit offset, but we only support 48-bit offsets for now. */
10350 if (aarch64_tls_size > 48)
10351 aarch64_tls_size = 48;
10352 break;
10353 default:
10354 gcc_unreachable ();
10357 return;
10360 /* Parse STRING looking for options in the format:
10361 string :: option:string
10362 option :: name=substring
10363 name :: {a-z}
10364 substring :: defined by option. */
10366 static void
10367 aarch64_parse_override_string (const char* input_string,
10368 struct tune_params* tune)
10370 const char separator = ':';
10371 size_t string_length = strlen (input_string) + 1;
10372 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10373 char *string = string_root;
10374 strncpy (string, input_string, string_length);
10375 string[string_length - 1] = '\0';
10377 char* ntoken = string;
10379 while ((ntoken = strchr (string, separator)))
10381 size_t token_length = ntoken - string;
10382 /* Make this substring look like a string. */
10383 *ntoken = '\0';
10384 aarch64_parse_one_override_token (string, token_length, tune);
10385 string = ++ntoken;
10388 /* One last option to parse. */
10389 aarch64_parse_one_override_token (string, strlen (string), tune);
10390 free (string_root);
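/* As an illustration (using only the sub-option names visible in this file),
   a string such as

       -moverride=fuse=adrp+add.cmp+branch

   is split on ':' into one token, "fuse=adrp+add.cmp+branch";
   aarch64_parse_one_override_token recognizes the "fuse" option and passes
   "adrp+add.cmp+branch" to aarch64_parse_fuse_string, which ORs the two
   fusion bits into the copied tuning structure.  A second sub-option would
   simply be appended after a ':'.  */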
10394 static void
10395 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10397 /* PR 70044: We have to be careful about being called multiple times for the
10398 same function. This means all changes should be repeatable. */
10400 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10401 Disable the frame pointer flag so the mid-end will not use a frame
10402 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10403 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10404 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10405 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10406 if (opts->x_flag_omit_frame_pointer == 0)
10407 opts->x_flag_omit_frame_pointer = 2;
10409 /* If not optimizing for size, set the default
10410 alignment to what the target wants. */
10411 if (!opts->x_optimize_size)
10413 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10414 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10415 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10416 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10417 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10418 opts->x_str_align_functions = aarch64_tune_params.function_align;
10421 /* We default to no pc-relative literal loads. */
10423 aarch64_pcrelative_literal_loads = false;
10425 /* If -mpc-relative-literal-loads is set on the command line, this
10426 implies that the user asked for PC relative literal loads. */
10427 if (opts->x_pcrelative_literal_loads == 1)
10428 aarch64_pcrelative_literal_loads = true;
10430 /* In the tiny memory model it makes no sense to disallow PC relative
10431 literal pool loads. */
10432 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10433 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10434 aarch64_pcrelative_literal_loads = true;
10436 /* When enabling the lower precision Newton series for the square root, also
10437 enable it for the reciprocal square root, since the latter is an
10438 intermediary step for the former. */
10439 if (flag_mlow_precision_sqrt)
10440 flag_mrecip_low_precision_sqrt = true;
10443 /* 'Unpack' the internal tuning structs and update the options
10444 in OPTS. The caller must have set up selected_tune and selected_arch
10445 as all the other target-specific codegen decisions are
10446 derived from them. */
10448 void
10449 aarch64_override_options_internal (struct gcc_options *opts)
10451 aarch64_tune_flags = selected_tune->flags;
10452 aarch64_tune = selected_tune->sched_core;
10453 /* Make a copy of the tuning parameters attached to the core, which
10454 we may later overwrite. */
10455 aarch64_tune_params = *(selected_tune->tune);
10456 aarch64_architecture_version = selected_arch->architecture_version;
10458 if (opts->x_aarch64_override_tune_string)
10459 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10460 &aarch64_tune_params);
10462 /* This target defaults to strict volatile bitfields. */
10463 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10464 opts->x_flag_strict_volatile_bitfields = 1;
10466 initialize_aarch64_code_model (opts);
10467 initialize_aarch64_tls_size (opts);
10469 int queue_depth = 0;
10470 switch (aarch64_tune_params.autoprefetcher_model)
10472 case tune_params::AUTOPREFETCHER_OFF:
10473 queue_depth = -1;
10474 break;
10475 case tune_params::AUTOPREFETCHER_WEAK:
10476 queue_depth = 0;
10477 break;
10478 case tune_params::AUTOPREFETCHER_STRONG:
10479 queue_depth = max_insn_queue_index + 1;
10480 break;
10481 default:
10482 gcc_unreachable ();
10485 /* We don't mind passing in global_options_set here as we don't use
10486 the *options_set structs anyway. */
10487 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10488 queue_depth,
10489 opts->x_param_values,
10490 global_options_set.x_param_values);
10492 /* Set up parameters to be used in prefetching algorithm. Do not
10493 override the defaults unless we are tuning for a core we have
10494 researched values for. */
10495 if (aarch64_tune_params.prefetch->num_slots > 0)
10496 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10497 aarch64_tune_params.prefetch->num_slots,
10498 opts->x_param_values,
10499 global_options_set.x_param_values);
10500 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10501 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10502 aarch64_tune_params.prefetch->l1_cache_size,
10503 opts->x_param_values,
10504 global_options_set.x_param_values);
10505 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10506 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10507 aarch64_tune_params.prefetch->l1_cache_line_size,
10508 opts->x_param_values,
10509 global_options_set.x_param_values);
10510 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10511 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10512 aarch64_tune_params.prefetch->l2_cache_size,
10513 opts->x_param_values,
10514 global_options_set.x_param_values);
10515 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10516 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10518 opts->x_param_values,
10519 global_options_set.x_param_values);
10520 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10521 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10522 aarch64_tune_params.prefetch->minimum_stride,
10523 opts->x_param_values,
10524 global_options_set.x_param_values);
10526 /* Use the alternative scheduling-pressure algorithm by default. */
10527 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10528 opts->x_param_values,
10529 global_options_set.x_param_values);
10531 /* Enable software prefetching at the specified optimization level for
10532 CPUs that have prefetch. Lower optimization level threshold by 1
10533 when profiling is enabled. */
10534 if (opts->x_flag_prefetch_loop_arrays < 0
10535 && !opts->x_optimize_size
10536 && aarch64_tune_params.prefetch->default_opt_level >= 0
10537 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10538 opts->x_flag_prefetch_loop_arrays = 1;
10540 aarch64_override_options_after_change_1 (opts);
10543 /* Print a hint with a suggestion for a core or architecture name that
10544 most closely resembles what the user passed in STR. ARCH is true if
10545 the user is asking for an architecture name. ARCH is false if the user
10546 is asking for a core name. */
10548 static void
10549 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10551 auto_vec<const char *> candidates;
10552 const struct processor *entry = arch ? all_architectures : all_cores;
10553 for (; entry->name != NULL; entry++)
10554 candidates.safe_push (entry->name);
10556 #ifdef HAVE_LOCAL_CPU_DETECT
10557 /* Also add "native" as a possible value. */
10558 if (arch)
10559 candidates.safe_push ("native");
10560 #endif
10562 char *s;
10563 const char *hint = candidates_list_and_hint (str, s, candidates);
10564 if (hint)
10565 inform (input_location, "valid arguments are: %s;"
10566 " did you mean %qs?", s, hint);
10567 else
10568 inform (input_location, "valid arguments are: %s", s);
10570 XDELETEVEC (s);
10573 /* Print a hint with a suggestion for a core name that most closely resembles
10574 what the user passed in STR. */
10576 inline static void
10577 aarch64_print_hint_for_core (const char *str)
10579 aarch64_print_hint_for_core_or_arch (str, false);
10582 /* Print a hint with a suggestion for an architecture name that most closely
10583 resembles what the user passed in STR. */
10585 inline static void
10586 aarch64_print_hint_for_arch (const char *str)
10588 aarch64_print_hint_for_core_or_arch (str, true);
10591 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10592 specified in STR and throw errors if appropriate. Put the results if
10593 they are valid in RES and ISA_FLAGS. Return whether the option is
10594 valid. */
10596 static bool
10597 aarch64_validate_mcpu (const char *str, const struct processor **res,
10598 unsigned long *isa_flags)
10600 enum aarch64_parse_opt_result parse_res
10601 = aarch64_parse_cpu (str, res, isa_flags);
10603 if (parse_res == AARCH64_PARSE_OK)
10604 return true;
10606 switch (parse_res)
10608 case AARCH64_PARSE_MISSING_ARG:
10609 error ("missing cpu name in %<-mcpu=%s%>", str);
10610 break;
10611 case AARCH64_PARSE_INVALID_ARG:
10612 error ("unknown value %qs for -mcpu", str);
10613 aarch64_print_hint_for_core (str);
10614 break;
10615 case AARCH64_PARSE_INVALID_FEATURE:
10616 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10617 break;
10618 default:
10619 gcc_unreachable ();
10622 return false;
10625 /* Validate a command-line -march option. Parse the arch and extensions
10626 (if any) specified in STR and throw errors if appropriate. Put the
10627 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10628 option is valid. */
10630 static bool
10631 aarch64_validate_march (const char *str, const struct processor **res,
10632 unsigned long *isa_flags)
10634 enum aarch64_parse_opt_result parse_res
10635 = aarch64_parse_arch (str, res, isa_flags);
10637 if (parse_res == AARCH64_PARSE_OK)
10638 return true;
10640 switch (parse_res)
10642 case AARCH64_PARSE_MISSING_ARG:
10643 error ("missing arch name in %<-march=%s%>", str);
10644 break;
10645 case AARCH64_PARSE_INVALID_ARG:
10646 error ("unknown value %qs for -march", str);
10647 aarch64_print_hint_for_arch (str);
10648 break;
10649 case AARCH64_PARSE_INVALID_FEATURE:
10650 error ("invalid feature modifier in %<-march=%s%>", str);
10651 break;
10652 default:
10653 gcc_unreachable ();
10656 return false;
10659 /* Validate a command-line -mtune option. Parse the cpu
10660 specified in STR and throw errors if appropriate. Put the
10661 result, if it is valid, in RES. Return whether the option is
10662 valid. */
10664 static bool
10665 aarch64_validate_mtune (const char *str, const struct processor **res)
10667 enum aarch64_parse_opt_result parse_res
10668 = aarch64_parse_tune (str, res);
10670 if (parse_res == AARCH64_PARSE_OK)
10671 return true;
10673 switch (parse_res)
10675 case AARCH64_PARSE_MISSING_ARG:
10676 error ("missing cpu name in %<-mtune=%s%>", str);
10677 break;
10678 case AARCH64_PARSE_INVALID_ARG:
10679 error ("unknown value %qs for -mtune", str);
10680 aarch64_print_hint_for_core (str);
10681 break;
10682 default:
10683 gcc_unreachable ();
10685 return false;
10688 /* Return the CPU corresponding to the enum CPU.
10689 If it doesn't specify a cpu, return the default. */
10691 static const struct processor *
10692 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10694 if (cpu != aarch64_none)
10695 return &all_cores[cpu];
10697 /* The & 0x3f is to extract the bottom 6 bits that encode the
10698 default cpu as selected by the --with-cpu GCC configure option
10699 in config.gcc.
10700 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10701 flags mechanism should be reworked to make it more sane. */
10702 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10705 /* Return the architecture corresponding to the enum ARCH.
10706 If it doesn't specify a valid architecture, return the default. */
10708 static const struct processor *
10709 aarch64_get_arch (enum aarch64_arch arch)
10711 if (arch != aarch64_no_arch)
10712 return &all_architectures[arch];
10714 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10716 return &all_architectures[cpu->arch];
10719 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10721 static poly_uint16
10722 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10724 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10725 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10726 deciding which .md file patterns to use and when deciding whether
10727 something is a legitimate address or constant. */
10728 if (value == SVE_SCALABLE || value == SVE_128)
10729 return poly_uint16 (2, 2);
10730 else
10731 return (int) value / 64;
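/* Worked examples of the mapping above: -msve-vector-bits=256 yields the
   constant VG 256 / 64 = 4 (four 64-bit granules per vector), while both
   -msve-vector-bits=scalable and -msve-vector-bits=128 yield the
   indeterminate poly_uint16 (2, 2), i.e. 2 + 2 * x granules for some
   unknown x >= 0.  */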
10734 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10735 and is used to parse the -m{cpu,tune,arch} strings and set up the initial
10736 tuning structs. In particular it must set selected_tune and
10737 aarch64_isa_flags that define the available ISA features and tuning
10738 decisions. It must also set selected_arch as this will be used to
10739 output the .arch asm tags for each function. */
10741 static void
10742 aarch64_override_options (void)
10744 unsigned long cpu_isa = 0;
10745 unsigned long arch_isa = 0;
10746 aarch64_isa_flags = 0;
10748 bool valid_cpu = true;
10749 bool valid_tune = true;
10750 bool valid_arch = true;
10752 selected_cpu = NULL;
10753 selected_arch = NULL;
10754 selected_tune = NULL;
10756 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10757 If either of -march or -mtune is given, they override their
10758 respective component of -mcpu. */
10759 if (aarch64_cpu_string)
10760 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10761 &cpu_isa);
10763 if (aarch64_arch_string)
10764 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10765 &arch_isa);
10767 if (aarch64_tune_string)
10768 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10770 /* If the user did not specify a processor, choose the default
10771 one for them. This will be the CPU set during configuration using
10772 --with-cpu, otherwise it is "generic". */
10773 if (!selected_cpu)
10775 if (selected_arch)
10777 selected_cpu = &all_cores[selected_arch->ident];
10778 aarch64_isa_flags = arch_isa;
10779 explicit_arch = selected_arch->arch;
10781 else
10783 /* Get default configure-time CPU. */
10784 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10785 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10788 if (selected_tune)
10789 explicit_tune_core = selected_tune->ident;
10791 /* If both -mcpu and -march are specified check that they are architecturally
10792 compatible, warn if they're not and prefer the -march ISA flags. */
10793 else if (selected_arch)
10795 if (selected_arch->arch != selected_cpu->arch)
10797 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10798 all_architectures[selected_cpu->arch].name,
10799 selected_arch->name);
10801 aarch64_isa_flags = arch_isa;
10802 explicit_arch = selected_arch->arch;
10803 explicit_tune_core = selected_tune ? selected_tune->ident
10804 : selected_cpu->ident;
10806 else
10808 /* -mcpu but no -march. */
10809 aarch64_isa_flags = cpu_isa;
10810 explicit_tune_core = selected_tune ? selected_tune->ident
10811 : selected_cpu->ident;
10812 gcc_assert (selected_cpu);
10813 selected_arch = &all_architectures[selected_cpu->arch];
10814 explicit_arch = selected_arch->arch;
10817 /* Set the arch as well, since we will need it when outputting
10818 the .arch directive in assembly. */
10819 if (!selected_arch)
10821 gcc_assert (selected_cpu);
10822 selected_arch = &all_architectures[selected_cpu->arch];
10825 if (!selected_tune)
10826 selected_tune = selected_cpu;
10828 #ifndef HAVE_AS_MABI_OPTION
10829 /* The compiler may have been configured with 2.23.* binutils, which does
10830 not have support for ILP32. */
10831 if (TARGET_ILP32)
10832 error ("assembler does not support -mabi=ilp32");
10833 #endif
10835 /* Convert -msve-vector-bits to a VG count. */
10836 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10838 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10839 sorry ("return address signing is only supported for -mabi=lp64");
10841 /* Make sure we properly set up the explicit options. */
10842 if ((aarch64_cpu_string && valid_cpu)
10843 || (aarch64_tune_string && valid_tune))
10844 gcc_assert (explicit_tune_core != aarch64_none);
10846 if ((aarch64_cpu_string && valid_cpu)
10847 || (aarch64_arch_string && valid_arch))
10848 gcc_assert (explicit_arch != aarch64_no_arch);
10850 aarch64_override_options_internal (&global_options);
10852 /* Save these options as the default ones in case we push and pop them later
10853 while processing functions with potential target attributes. */
10854 target_option_default_node = target_option_current_node
10855 = build_target_option_node (&global_options);
10858 /* Implement targetm.override_options_after_change. */
10860 static void
10861 aarch64_override_options_after_change (void)
10863 aarch64_override_options_after_change_1 (&global_options);
10866 static struct machine_function *
10867 aarch64_init_machine_status (void)
10869 struct machine_function *machine;
10870 machine = ggc_cleared_alloc<machine_function> ();
10871 return machine;
10874 void
10875 aarch64_init_expanders (void)
10877 init_machine_status = aarch64_init_machine_status;
10880 /* A checking mechanism for the implementation of the various code models. */
10881 static void
10882 initialize_aarch64_code_model (struct gcc_options *opts)
10884 if (opts->x_flag_pic)
10886 switch (opts->x_aarch64_cmodel_var)
10888 case AARCH64_CMODEL_TINY:
10889 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10890 break;
10891 case AARCH64_CMODEL_SMALL:
10892 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10893 aarch64_cmodel = (flag_pic == 2
10894 ? AARCH64_CMODEL_SMALL_PIC
10895 : AARCH64_CMODEL_SMALL_SPIC);
10896 #else
10897 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10898 #endif
10899 break;
10900 case AARCH64_CMODEL_LARGE:
10901 sorry ("code model %qs with -f%s", "large",
10902 opts->x_flag_pic > 1 ? "PIC" : "pic");
10903 break;
10904 default:
10905 gcc_unreachable ();
10908 else
10909 aarch64_cmodel = opts->x_aarch64_cmodel_var;
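/* So, for example: -mcmodel=small -fPIC (flag_pic == 2) selects
   AARCH64_CMODEL_SMALL_PIC; -mcmodel=small -fpic selects
   AARCH64_CMODEL_SMALL_SPIC when the assembler supports the small-PIC
   relocations, and AARCH64_CMODEL_SMALL_PIC otherwise; -mcmodel=tiny with
   either flag becomes AARCH64_CMODEL_TINY_PIC; and -mcmodel=large is
   rejected in combination with -fpic/-fPIC.  */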
10912 /* Implement TARGET_OPTION_SAVE. */
10914 static void
10915 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10917 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10920 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10921 using the information saved in PTR. */
10923 static void
10924 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
10926 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
10927 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10928 opts->x_explicit_arch = ptr->x_explicit_arch;
10929 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
10930 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
10932 aarch64_override_options_internal (opts);
10935 /* Implement TARGET_OPTION_PRINT. */
10937 static void
10938 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
10940 const struct processor *cpu
10941 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10942 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
10943 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
10944 std::string extension
10945 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
10947 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
10948 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
10949 arch->name, extension.c_str ());
10952 static GTY(()) tree aarch64_previous_fndecl;
10954 void
10955 aarch64_reset_previous_fndecl (void)
10957 aarch64_previous_fndecl = NULL;
10960 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
10961 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
10962 make sure optab availability predicates are recomputed when necessary. */
10964 void
10965 aarch64_save_restore_target_globals (tree new_tree)
10967 if (TREE_TARGET_GLOBALS (new_tree))
10968 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
10969 else if (new_tree == target_option_default_node)
10970 restore_target_globals (&default_target_globals);
10971 else
10972 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
10975 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
10976 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
10977 of the function, if such exists. This function may be called multiple
10978 times on a single function so use aarch64_previous_fndecl to avoid
10979 setting up identical state. */
10981 static void
10982 aarch64_set_current_function (tree fndecl)
10984 if (!fndecl || fndecl == aarch64_previous_fndecl)
10985 return;
10987 tree old_tree = (aarch64_previous_fndecl
10988 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
10989 : NULL_TREE);
10991 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10993 /* If current function has no attributes but the previous one did,
10994 use the default node. */
10995 if (!new_tree && old_tree)
10996 new_tree = target_option_default_node;
10998 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
10999 the default have been handled by aarch64_save_restore_target_globals from
11000 aarch64_pragma_target_parse. */
11001 if (old_tree == new_tree)
11002 return;
11004 aarch64_previous_fndecl = fndecl;
11006 /* First set the target options. */
11007 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11009 aarch64_save_restore_target_globals (new_tree);
11012 /* Enum describing the various ways we can handle attributes.
11013 In many cases we can reuse the generic option handling machinery. */
11015 enum aarch64_attr_opt_type
11017 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11018 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11019 aarch64_attr_enum, /* Attribute sets an enum variable. */
11020 aarch64_attr_custom /* Attribute requires a custom handling function. */
11023 /* All the information needed to handle a target attribute.
11024 NAME is the name of the attribute.
11025 ATTR_TYPE specifies the type of behavior of the attribute as described
11026 in the definition of enum aarch64_attr_opt_type.
11027 ALLOW_NEG is true if the attribute supports a "no-" form.
11028 HANDLER is the function that takes the attribute string as an argument.
11029 It is needed only when ATTR_TYPE is aarch64_attr_custom.
11030 OPT_NUM is the enum specifying the option that the attribute modifies.
11031 This is needed for attributes that mirror the behavior of a command-line
11032 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool
11033 or aarch64_attr_enum. */
11035 struct aarch64_attribute_info
11037 const char *name;
11038 enum aarch64_attr_opt_type attr_type;
11039 bool allow_neg;
11040 bool (*handler) (const char *);
11041 enum opt_code opt_num;
11044 /* Handle the ARCH_STR argument to the arch= target attribute. */
11046 static bool
11047 aarch64_handle_attr_arch (const char *str)
11049 const struct processor *tmp_arch = NULL;
11050 enum aarch64_parse_opt_result parse_res
11051 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11053 if (parse_res == AARCH64_PARSE_OK)
11055 gcc_assert (tmp_arch);
11056 selected_arch = tmp_arch;
11057 explicit_arch = selected_arch->arch;
11058 return true;
11061 switch (parse_res)
11063 case AARCH64_PARSE_MISSING_ARG:
11064 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11065 break;
11066 case AARCH64_PARSE_INVALID_ARG:
11067 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11068 aarch64_print_hint_for_arch (str);
11069 break;
11070 case AARCH64_PARSE_INVALID_FEATURE:
11071 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11072 break;
11073 default:
11074 gcc_unreachable ();
11077 return false;
11080 /* Handle the argument CPU_STR to the cpu= target attribute. */
11082 static bool
11083 aarch64_handle_attr_cpu (const char *str)
11085 const struct processor *tmp_cpu = NULL;
11086 enum aarch64_parse_opt_result parse_res
11087 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11089 if (parse_res == AARCH64_PARSE_OK)
11091 gcc_assert (tmp_cpu);
11092 selected_tune = tmp_cpu;
11093 explicit_tune_core = selected_tune->ident;
11095 selected_arch = &all_architectures[tmp_cpu->arch];
11096 explicit_arch = selected_arch->arch;
11097 return true;
11100 switch (parse_res)
11102 case AARCH64_PARSE_MISSING_ARG:
11103 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11104 break;
11105 case AARCH64_PARSE_INVALID_ARG:
11106 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11107 aarch64_print_hint_for_core (str);
11108 break;
11109 case AARCH64_PARSE_INVALID_FEATURE:
11110 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11111 break;
11112 default:
11113 gcc_unreachable ();
11116 return false;
11119 /* Handle the argument STR to the tune= target attribute. */
11121 static bool
11122 aarch64_handle_attr_tune (const char *str)
11124 const struct processor *tmp_tune = NULL;
11125 enum aarch64_parse_opt_result parse_res
11126 = aarch64_parse_tune (str, &tmp_tune);
11128 if (parse_res == AARCH64_PARSE_OK)
11130 gcc_assert (tmp_tune);
11131 selected_tune = tmp_tune;
11132 explicit_tune_core = selected_tune->ident;
11133 return true;
11136 switch (parse_res)
11138 case AARCH64_PARSE_INVALID_ARG:
11139 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11140 aarch64_print_hint_for_core (str);
11141 break;
11142 default:
11143 gcc_unreachable ();
11146 return false;
11149 /* Parse an architecture extensions target attribute string specified in STR.
11150 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11151 if successful. Update aarch64_isa_flags to reflect the ISA features
11152 modified. */
11154 static bool
11155 aarch64_handle_attr_isa_flags (char *str)
11157 enum aarch64_parse_opt_result parse_res;
11158 unsigned long isa_flags = aarch64_isa_flags;
11160 /* We allow "+nothing" in the beginning to clear out all architectural
11161 features if the user wants to handpick specific features. */
11162 if (strncmp ("+nothing", str, 8) == 0)
11164 isa_flags = 0;
11165 str += 8;
11168 parse_res = aarch64_parse_extension (str, &isa_flags);
11170 if (parse_res == AARCH64_PARSE_OK)
11172 aarch64_isa_flags = isa_flags;
11173 return true;
11176 switch (parse_res)
11178 case AARCH64_PARSE_MISSING_ARG:
11179 error ("missing value in %<target()%> pragma or attribute");
11180 break;
11182 case AARCH64_PARSE_INVALID_FEATURE:
11183 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11184 break;
11186 default:
11187 gcc_unreachable ();
11190 return false;
11193 /* The target attributes that we support. On top of these we also support just
11194 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11195 handled explicitly in aarch64_process_one_target_attr. */
11197 static const struct aarch64_attribute_info aarch64_attributes[] =
11199 { "general-regs-only", aarch64_attr_mask, false, NULL,
11200 OPT_mgeneral_regs_only },
11201 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11202 OPT_mfix_cortex_a53_835769 },
11203 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11204 OPT_mfix_cortex_a53_843419 },
11205 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11206 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11207 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11208 OPT_momit_leaf_frame_pointer },
11209 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11210 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11211 OPT_march_ },
11212 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11213 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11214 OPT_mtune_ },
11215 { "sign-return-address", aarch64_attr_enum, false, NULL,
11216 OPT_msign_return_address_ },
11217 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
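/* For illustration (attribute names from the table above; the arch and
   extension strings are the same illustrative ones used earlier), these all
   go through aarch64_process_one_target_attr below:

     __attribute__ ((target ("arch=armv8-a+crc"))) void f1 (void);
     __attribute__ ((target ("cmodel=small,no-fix-cortex-a53-835769")))
     void f2 (void);
     __attribute__ ((target ("+nothing+fp"))) void f3 (void);

   The first goes through the custom arch= handler, the third through the
   bare ISA-flags path, and the second shows an enum attribute combined with
   a negated boolean one.  */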
11220 /* Parse ARG_STR which contains the definition of one target attribute.
11221 Show appropriate errors if any or return true if the attribute is valid. */
11223 static bool
11224 aarch64_process_one_target_attr (char *arg_str)
11226 bool invert = false;
11228 size_t len = strlen (arg_str);
11230 if (len == 0)
11232 error ("malformed %<target()%> pragma or attribute");
11233 return false;
11236 char *str_to_check = (char *) alloca (len + 1);
11237 strcpy (str_to_check, arg_str);
11239 /* Skip leading whitespace. */
11240 while (*str_to_check == ' ' || *str_to_check == '\t')
11241 str_to_check++;
11243 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11244 It is easier to detect and handle it explicitly here rather than going
11245 through the machinery for the rest of the target attributes in this
11246 function. */
11247 if (*str_to_check == '+')
11248 return aarch64_handle_attr_isa_flags (str_to_check);
11250 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11252 invert = true;
11253 str_to_check += 3;
11255 char *arg = strchr (str_to_check, '=');
11257 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11258 and point ARG to "foo". */
11259 if (arg)
11261 *arg = '\0';
11262 arg++;
11264 const struct aarch64_attribute_info *p_attr;
11265 bool found = false;
11266 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11268 /* If the names don't match up, or the user has given an argument
11269 to an attribute that doesn't accept one, or didn't give an argument
11270 to an attribute that expects one, fail to match. */
11271 if (strcmp (str_to_check, p_attr->name) != 0)
11272 continue;
11274 found = true;
11275 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11276 || p_attr->attr_type == aarch64_attr_enum;
11278 if (attr_need_arg_p ^ (arg != NULL))
11280 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11281 return false;
11284 /* If the name matches but the attribute does not allow "no-" versions
11285 then we can't match. */
11286 if (invert && !p_attr->allow_neg)
11288 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11289 return false;
11292 switch (p_attr->attr_type)
11294 /* Has a custom handler registered.
11295 For example, cpu=, arch=, tune=. */
11296 case aarch64_attr_custom:
11297 gcc_assert (p_attr->handler);
11298 if (!p_attr->handler (arg))
11299 return false;
11300 break;
11302 /* Either set or unset a boolean option. */
11303 case aarch64_attr_bool:
11305 struct cl_decoded_option decoded;
11307 generate_option (p_attr->opt_num, NULL, !invert,
11308 CL_TARGET, &decoded);
11309 aarch64_handle_option (&global_options, &global_options_set,
11310 &decoded, input_location);
11311 break;
11313 /* Set or unset a bit in the target_flags. aarch64_handle_option
11314 should know what mask to apply given the option number. */
11315 case aarch64_attr_mask:
11317 struct cl_decoded_option decoded;
11318 /* We only need to specify the option number.
11319 aarch64_handle_option will know which mask to apply. */
11320 decoded.opt_index = p_attr->opt_num;
11321 decoded.value = !invert;
11322 aarch64_handle_option (&global_options, &global_options_set,
11323 &decoded, input_location);
11324 break;
11326 /* Use the option setting machinery to set an option to an enum. */
11327 case aarch64_attr_enum:
11329 gcc_assert (arg);
11330 bool valid;
11331 int value;
11332 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11333 &value, CL_TARGET);
11334 if (valid)
11336 set_option (&global_options, NULL, p_attr->opt_num, value,
11337 NULL, DK_UNSPECIFIED, input_location,
11338 global_dc);
11340 else
11342 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11344 break;
11346 default:
11347 gcc_unreachable ();
11351 /* If we reached here we either have found an attribute and validated
11352 it or didn't match any. If we matched an attribute but its arguments
11353 were malformed we will have returned false already. */
11354 return found;
11357 /* Count how many times the character C appears in
11358 NULL-terminated string STR. */
11360 static unsigned int
11361 num_occurences_in_str (char c, char *str)
11363 unsigned int res = 0;
11364 while (*str != '\0')
11366 if (*str == c)
11367 res++;
11369 str++;
11372 return res;
11375 /* Parse the tree in ARGS that contains the target attribute information
11376 and update the global target options space. */
11378 bool
11379 aarch64_process_target_attr (tree args)
11381 if (TREE_CODE (args) == TREE_LIST)
11385 tree head = TREE_VALUE (args);
11386 if (head)
11388 if (!aarch64_process_target_attr (head))
11389 return false;
11391 args = TREE_CHAIN (args);
11392 } while (args);
11394 return true;
11397 if (TREE_CODE (args) != STRING_CST)
11399 error ("attribute %<target%> argument not a string");
11400 return false;
11403 size_t len = strlen (TREE_STRING_POINTER (args));
11404 char *str_to_check = (char *) alloca (len + 1);
11405 strcpy (str_to_check, TREE_STRING_POINTER (args));
11407 if (len == 0)
11409 error ("malformed %<target()%> pragma or attribute");
11410 return false;
11413 /* Used to catch empty entries between commas, e.g.
11414 attribute ((target ("attr1,,attr2"))). */
11415 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11417 /* Handle multiple target attributes separated by ','. */
11418 char *token = strtok (str_to_check, ",");
11420 unsigned int num_attrs = 0;
11421 while (token)
11423 num_attrs++;
11424 if (!aarch64_process_one_target_attr (token))
11426 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11427 return false;
11430 token = strtok (NULL, ",");
11433 if (num_attrs != num_commas + 1)
11435 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11436 return false;
11439 return true;
11442 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11443 process attribute ((target ("..."))). */
11445 static bool
11446 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11448 struct cl_target_option cur_target;
11449 bool ret;
11450 tree old_optimize;
11451 tree new_target, new_optimize;
11452 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11454 /* If what we're processing is the current pragma string then the
11455 target option node is already stored in target_option_current_node
11456 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11457 having to re-parse the string. This is especially useful to keep
11458 arm_neon.h compile times down since that header contains a lot
11459 of intrinsics enclosed in pragmas. */
11460 if (!existing_target && args == current_target_pragma)
11462 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11463 return true;
11465 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11467 old_optimize = build_optimization_node (&global_options);
11468 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11470 /* If the function changed the optimization levels as well as setting
11471 target options, start with the optimizations specified. */
11472 if (func_optimize && func_optimize != old_optimize)
11473 cl_optimization_restore (&global_options,
11474 TREE_OPTIMIZATION (func_optimize));
11476 /* Save the current target options to restore at the end. */
11477 cl_target_option_save (&cur_target, &global_options);
11479 /* If fndecl already has some target attributes applied to it, unpack
11480 them so that we add this attribute on top of them, rather than
11481 overwriting them. */
11482 if (existing_target)
11484 struct cl_target_option *existing_options
11485 = TREE_TARGET_OPTION (existing_target);
11487 if (existing_options)
11488 cl_target_option_restore (&global_options, existing_options);
11490 else
11491 cl_target_option_restore (&global_options,
11492 TREE_TARGET_OPTION (target_option_current_node));
11494 ret = aarch64_process_target_attr (args);
11496 /* Set up any additional state. */
11497 if (ret)
11499 aarch64_override_options_internal (&global_options);
11500 /* Initialize SIMD builtins if we haven't already.
11501 Set current_target_pragma to NULL for the duration so that
11502 the builtin initialization code doesn't try to tag the functions
11503 being built with the attributes specified by any current pragma, thus
11504 going into an infinite recursion. */
11505 if (TARGET_SIMD)
11507 tree saved_current_target_pragma = current_target_pragma;
11508 current_target_pragma = NULL;
11509 aarch64_init_simd_builtins ();
11510 current_target_pragma = saved_current_target_pragma;
11512 new_target = build_target_option_node (&global_options);
11514 else
11515 new_target = NULL;
11517 new_optimize = build_optimization_node (&global_options);
11519 if (fndecl && ret)
11521 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11523 if (old_optimize != new_optimize)
11524 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11527 cl_target_option_restore (&global_options, &cur_target);
11529 if (old_optimize != new_optimize)
11530 cl_optimization_restore (&global_options,
11531 TREE_OPTIMIZATION (old_optimize));
11532 return ret;
11535 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11536 tri-bool options (yes, no, don't care) and the default value is
11537 DEF, determine whether to reject inlining. */
11539 static bool
11540 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11541 int dont_care, int def)
11543 /* If the callee doesn't care, always allow inlining. */
11544 if (callee == dont_care)
11545 return true;
11547 /* If the caller doesn't care, always allow inlining. */
11548 if (caller == dont_care)
11549 return true;
11551 /* Otherwise, allow inlining if either the callee and caller values
11552 agree, or if the callee is using the default value. */
11553 return (callee == caller || callee == def);
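/* For instance, with DONT_CARE == 2 and DEF == 1 (the values used for the
   -momit-leaf-frame-pointer check below), caller == 0 with callee == 1 still
   permits inlining because the callee merely uses the default, whereas
   caller == 1 with callee == 0 rejects it, since the callee explicitly asked
   for the non-default setting.  */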
11556 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11557 to inline CALLEE into CALLER based on target-specific info.
11558 Make sure that the caller and callee have compatible architectural
11559 features. Then go through the other possible target attributes
11560 and see if they can block inlining. Try not to reject always_inline
11561 callees unless they are incompatible architecturally. */
11563 static bool
11564 aarch64_can_inline_p (tree caller, tree callee)
11566 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11567 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11569 struct cl_target_option *caller_opts
11570 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11571 : target_option_default_node);
11573 struct cl_target_option *callee_opts
11574 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11575 : target_option_default_node);
11577 /* Callee's ISA flags should be a subset of the caller's. */
11578 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11579 != callee_opts->x_aarch64_isa_flags)
11580 return false;
11582 /* Allow non-strict-align functions to be inlined into strict-align
11583 ones, but not the reverse. */
11584 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11585 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11586 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11587 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11588 return false;
11590 bool always_inline = lookup_attribute ("always_inline",
11591 DECL_ATTRIBUTES (callee));
11593 /* If the architectural features match up and the callee is always_inline
11594 then the other attributes don't matter. */
11595 if (always_inline)
11596 return true;
11598 if (caller_opts->x_aarch64_cmodel_var
11599 != callee_opts->x_aarch64_cmodel_var)
11600 return false;
11602 if (caller_opts->x_aarch64_tls_dialect
11603 != callee_opts->x_aarch64_tls_dialect)
11604 return false;
11606 /* Honour explicit requests to workaround errata. */
11607 if (!aarch64_tribools_ok_for_inlining_p (
11608 caller_opts->x_aarch64_fix_a53_err835769,
11609 callee_opts->x_aarch64_fix_a53_err835769,
11610 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11611 return false;
11613 if (!aarch64_tribools_ok_for_inlining_p (
11614 caller_opts->x_aarch64_fix_a53_err843419,
11615 callee_opts->x_aarch64_fix_a53_err843419,
11616 2, TARGET_FIX_ERR_A53_843419))
11617 return false;
11619 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11620 caller and callee and they don't match up, reject inlining. */
11621 if (!aarch64_tribools_ok_for_inlining_p (
11622 caller_opts->x_flag_omit_leaf_frame_pointer,
11623 callee_opts->x_flag_omit_leaf_frame_pointer,
11624 2, 1))
11625 return false;
11627 /* If the callee has specific tuning overrides, respect them. */
11628 if (callee_opts->x_aarch64_override_tune_string != NULL
11629 && caller_opts->x_aarch64_override_tune_string == NULL)
11630 return false;
11632 /* If the user specified tuning override strings for the
11633 caller and callee and they don't match up, reject inlining.
11634 We just do a string compare here; we don't analyze the meaning
11635 of the string, as it would be too costly for little gain. */
11636 if (callee_opts->x_aarch64_override_tune_string
11637 && caller_opts->x_aarch64_override_tune_string
11638 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11639 caller_opts->x_aarch64_override_tune_string) != 0))
11640 return false;
11642 return true;
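/* Illustrative example only (not part of the build; function names and the
   chosen extension are hypothetical): with the hook above, the callee below
   cannot be inlined because its ISA flags are not a subset of the caller's:

     __attribute__ ((target ("+crypto")))
     static inline int callee (int x) { return x + 1; }

     __attribute__ ((target ("+nocrypto")))
     int caller (int x) { return callee (x); }

   When the callee is marked always_inline, only the ISA-flag and
   strict-alignment checks apply; the remaining attribute checks are
   skipped.  */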
11645 /* Return true if SYMBOL_REF X binds locally. */
11647 static bool
11648 aarch64_symbol_binds_local_p (const_rtx x)
11650 return (SYMBOL_REF_DECL (x)
11651 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11652 : SYMBOL_REF_LOCAL_P (x));
11655 /* Return true if SYMBOL_REF X is thread-local. */
11656 static bool
11657 aarch64_tls_symbol_p (rtx x)
11659 if (! TARGET_HAVE_TLS)
11660 return false;
11662 if (GET_CODE (x) != SYMBOL_REF)
11663 return false;
11665 return SYMBOL_REF_TLS_MODEL (x) != 0;
11668 /* Classify a TLS symbol into one of the TLS kinds. */
11669 enum aarch64_symbol_type
11670 aarch64_classify_tls_symbol (rtx x)
11672 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11674 switch (tls_kind)
11676 case TLS_MODEL_GLOBAL_DYNAMIC:
11677 case TLS_MODEL_LOCAL_DYNAMIC:
11678 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11680 case TLS_MODEL_INITIAL_EXEC:
11681 switch (aarch64_cmodel)
11683 case AARCH64_CMODEL_TINY:
11684 case AARCH64_CMODEL_TINY_PIC:
11685 return SYMBOL_TINY_TLSIE;
11686 default:
11687 return SYMBOL_SMALL_TLSIE;
11690 case TLS_MODEL_LOCAL_EXEC:
11691 if (aarch64_tls_size == 12)
11692 return SYMBOL_TLSLE12;
11693 else if (aarch64_tls_size == 24)
11694 return SYMBOL_TLSLE24;
11695 else if (aarch64_tls_size == 32)
11696 return SYMBOL_TLSLE32;
11697 else if (aarch64_tls_size == 48)
11698 return SYMBOL_TLSLE48;
11699 else
11700 gcc_unreachable ();
11702 case TLS_MODEL_EMULATED:
11703 case TLS_MODEL_NONE:
11704 return SYMBOL_FORCE_TO_MEM;
11706 default:
11707 gcc_unreachable ();
11711 /* Return the correct method for accessing X + OFFSET, where X is either
11712 a SYMBOL_REF or LABEL_REF. */
11714 enum aarch64_symbol_type
11715 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11717 if (GET_CODE (x) == LABEL_REF)
11719 switch (aarch64_cmodel)
11721 case AARCH64_CMODEL_LARGE:
11722 return SYMBOL_FORCE_TO_MEM;
11724 case AARCH64_CMODEL_TINY_PIC:
11725 case AARCH64_CMODEL_TINY:
11726 return SYMBOL_TINY_ABSOLUTE;
11728 case AARCH64_CMODEL_SMALL_SPIC:
11729 case AARCH64_CMODEL_SMALL_PIC:
11730 case AARCH64_CMODEL_SMALL:
11731 return SYMBOL_SMALL_ABSOLUTE;
11733 default:
11734 gcc_unreachable ();
11738 if (GET_CODE (x) == SYMBOL_REF)
11740 if (aarch64_tls_symbol_p (x))
11741 return aarch64_classify_tls_symbol (x);
11743 switch (aarch64_cmodel)
11745 case AARCH64_CMODEL_TINY:
11746 /* When we retrieve symbol + offset address, we have to make sure
11747 the offset does not cause overflow of the final address. But
11748 we have no way of knowing the address of the symbol at compile time,
11749 so we can't accurately say whether the distance between the PC and
11750 symbol + offset is outside the addressable range of +/-1M in the
11751 TINY code model. So we rely on images not being larger than 1M,
11752 cap the offset at 1M, and require anything beyond that to be
11753 loaded using an alternative mechanism. Furthermore, if the
11754 symbol is a weak reference to something that isn't known to
11755 resolve to a symbol in this module, then force to memory. */
11756 if ((SYMBOL_REF_WEAK (x)
11757 && !aarch64_symbol_binds_local_p (x))
11758 || !IN_RANGE (offset, -1048575, 1048575))
11759 return SYMBOL_FORCE_TO_MEM;
11760 return SYMBOL_TINY_ABSOLUTE;
11762 case AARCH64_CMODEL_SMALL:
11763 /* Same reasoning as the tiny code model, but the offset cap here is
11764 4G. */
11765 if ((SYMBOL_REF_WEAK (x)
11766 && !aarch64_symbol_binds_local_p (x))
11767 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11768 HOST_WIDE_INT_C (4294967264)))
11769 return SYMBOL_FORCE_TO_MEM;
11770 return SYMBOL_SMALL_ABSOLUTE;
11772 case AARCH64_CMODEL_TINY_PIC:
11773 if (!aarch64_symbol_binds_local_p (x))
11774 return SYMBOL_TINY_GOT;
11775 return SYMBOL_TINY_ABSOLUTE;
11777 case AARCH64_CMODEL_SMALL_SPIC:
11778 case AARCH64_CMODEL_SMALL_PIC:
11779 if (!aarch64_symbol_binds_local_p (x))
11780 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11781 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11782 return SYMBOL_SMALL_ABSOLUTE;
11784 case AARCH64_CMODEL_LARGE:
11785 /* This is alright even in PIC code as the constant
11786 pool reference is always PC relative and within
11787 the same translation unit. */
11788 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11789 return SYMBOL_SMALL_ABSOLUTE;
11790 else
11791 return SYMBOL_FORCE_TO_MEM;
11793 default:
11794 gcc_unreachable ();
11798 /* By default push everything into the constant pool. */
11799 return SYMBOL_FORCE_TO_MEM;
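/* For example (illustrative): in the small code model a symbol that binds
   locally and has an in-range offset is classified SYMBOL_SMALL_ABSOLUTE
   and can be materialized as

       adrp    x0, sym            // page address
       add     x0, x0, :lo12:sym  // low 12 bits

   whereas a weak reference that may resolve outside the module is
   SYMBOL_FORCE_TO_MEM and is loaded from the literal pool instead.  */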
11802 bool
11803 aarch64_constant_address_p (rtx x)
11805 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11808 bool
11809 aarch64_legitimate_pic_operand_p (rtx x)
11811 if (GET_CODE (x) == SYMBOL_REF
11812 || (GET_CODE (x) == CONST
11813 && GET_CODE (XEXP (x, 0)) == PLUS
11814 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11815 return false;
11817 return true;
11820 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11821 that should be rematerialized rather than spilled. */
11823 static bool
11824 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11826 /* Support CSE and rematerialization of common constants. */
11827 if (CONST_INT_P (x)
11828 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11829 || GET_CODE (x) == CONST_VECTOR)
11830 return true;
11832 /* Do not allow vector struct mode constants for Advanced SIMD.
11833 We could support 0 and -1 easily, but they need support in
11834 aarch64-simd.md. */
11835 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11836 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11837 return false;
11839 /* Only accept variable-length vector constants if they can be
11840 handled directly.
11842 ??? It would be possible to handle rematerialization of other
11843 constants via secondary reloads. */
11844 if (vec_flags & VEC_ANY_SVE)
11845 return aarch64_simd_valid_immediate (x, NULL);
11847 if (GET_CODE (x) == HIGH)
11848 x = XEXP (x, 0);
11850 /* Accept polynomial constants that can be calculated by using the
11851 destination of a move as the sole temporary. Constants that
11852 require a second temporary cannot be rematerialized (they can't be
11853 forced to memory and also aren't legitimate constants). */
11854 poly_int64 offset;
11855 if (poly_int_rtx_p (x, &offset))
11856 return aarch64_offset_temporaries (false, offset) <= 1;
11858 /* If an offset is being added to something else, we need to allow the
11859 base to be moved into the destination register, meaning that there
11860 are no free temporaries for the offset. */
11861 x = strip_offset (x, &offset);
11862 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11863 return false;
11865 /* Do not allow const (plus (anchor_symbol, const_int)). */
11866 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11867 return false;
11869 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11870 so spilling them is better than rematerialization. */
11871 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11872 return true;
11874 /* Label references are always constant. */
11875 if (GET_CODE (x) == LABEL_REF)
11876 return true;
11878 return false;
11881 static rtx
11882 aarch64_load_tp (rtx target)
11884 if (!target
11885 || GET_MODE (target) != Pmode
11886 || !register_operand (target, Pmode))
11887 target = gen_reg_rtx (Pmode);
11889 /* Can return in any reg. */
11890 emit_insn (gen_aarch64_load_tp_hard (target));
11891 return target;
11894 /* On AAPCS systems, this is the "struct __va_list". */
11895 static GTY(()) tree va_list_type;
11897 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11898 Return the type to use as __builtin_va_list.
11900 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11902 struct __va_list
11904 void *__stack;
11905 void *__gr_top;
11906 void *__vr_top;
11907 int __gr_offs;
11908 int __vr_offs;
11909 }; */
11911 static tree
11912 aarch64_build_builtin_va_list (void)
11914 tree va_list_name;
11915 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11917 /* Create the type. */
11918 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11919 /* Give it the required name. */
11920 va_list_name = build_decl (BUILTINS_LOCATION,
11921 TYPE_DECL,
11922 get_identifier ("__va_list"),
11923 va_list_type);
11924 DECL_ARTIFICIAL (va_list_name) = 1;
11925 TYPE_NAME (va_list_type) = va_list_name;
11926 TYPE_STUB_DECL (va_list_type) = va_list_name;
11928 /* Create the fields. */
11929 f_stack = build_decl (BUILTINS_LOCATION,
11930 FIELD_DECL, get_identifier ("__stack"),
11931 ptr_type_node);
11932 f_grtop = build_decl (BUILTINS_LOCATION,
11933 FIELD_DECL, get_identifier ("__gr_top"),
11934 ptr_type_node);
11935 f_vrtop = build_decl (BUILTINS_LOCATION,
11936 FIELD_DECL, get_identifier ("__vr_top"),
11937 ptr_type_node);
11938 f_groff = build_decl (BUILTINS_LOCATION,
11939 FIELD_DECL, get_identifier ("__gr_offs"),
11940 integer_type_node);
11941 f_vroff = build_decl (BUILTINS_LOCATION,
11942 FIELD_DECL, get_identifier ("__vr_offs"),
11943 integer_type_node);
11945 /* Tell tree-stdarg pass about our internal offset fields.
11946 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
11947 purposes, to identify whether the code updates the va_list internal
11948 offset fields in an irregular way. */
11949 va_list_gpr_counter_field = f_groff;
11950 va_list_fpr_counter_field = f_vroff;
11952 DECL_ARTIFICIAL (f_stack) = 1;
11953 DECL_ARTIFICIAL (f_grtop) = 1;
11954 DECL_ARTIFICIAL (f_vrtop) = 1;
11955 DECL_ARTIFICIAL (f_groff) = 1;
11956 DECL_ARTIFICIAL (f_vroff) = 1;
11958 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
11959 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
11960 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
11961 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
11962 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
11964 TYPE_FIELDS (va_list_type) = f_stack;
11965 DECL_CHAIN (f_stack) = f_grtop;
11966 DECL_CHAIN (f_grtop) = f_vrtop;
11967 DECL_CHAIN (f_vrtop) = f_groff;
11968 DECL_CHAIN (f_groff) = f_vroff;
11970 /* Compute its layout. */
11971 layout_type (va_list_type);
11973 return va_list_type;
11976 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
11977 static void
11978 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
11980 const CUMULATIVE_ARGS *cum;
11981 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11982 tree stack, grtop, vrtop, groff, vroff;
11983 tree t;
11984 int gr_save_area_size = cfun->va_list_gpr_size;
11985 int vr_save_area_size = cfun->va_list_fpr_size;
11986 int vr_offset;
11988 cum = &crtl->args.info;
11989 if (cfun->va_list_gpr_size)
11990 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
11991 cfun->va_list_gpr_size);
11992 if (cfun->va_list_fpr_size)
11993 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
11994 * UNITS_PER_VREG, cfun->va_list_fpr_size);
11996 if (!TARGET_FLOAT)
11998 gcc_assert (cum->aapcs_nvrn == 0);
11999 vr_save_area_size = 0;
12002 f_stack = TYPE_FIELDS (va_list_type_node);
12003 f_grtop = DECL_CHAIN (f_stack);
12004 f_vrtop = DECL_CHAIN (f_grtop);
12005 f_groff = DECL_CHAIN (f_vrtop);
12006 f_vroff = DECL_CHAIN (f_groff);
12008 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12009 NULL_TREE);
12010 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12011 NULL_TREE);
12012 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12013 NULL_TREE);
12014 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12015 NULL_TREE);
12016 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12017 NULL_TREE);
12019 /* Emit code to initialize STACK, which points to the next varargs stack
12020 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12021 by named arguments. STACK is 8-byte aligned. */
12022 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12023 if (cum->aapcs_stack_size > 0)
12024 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12025 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12026 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12028 /* Emit code to initialize GRTOP, the top of the GR save area.
12029 virtual_incoming_args_rtx should have been 16 byte aligned. */
12030 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12031 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12032 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12034 /* Emit code to initialize VRTOP, the top of the VR save area.
12035 This address is gr_save_area_bytes below GRTOP, rounded
12036 down to the next 16-byte boundary. */
12037 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12038 vr_offset = ROUND_UP (gr_save_area_size,
12039 STACK_BOUNDARY / BITS_PER_UNIT);
12041 if (vr_offset)
12042 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12043 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12044 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12046 /* Emit code to initialize GROFF, the offset from GRTOP of the
12047 next GPR argument. */
12048 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12049 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12050 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12052 /* Likewise emit code to initialize VROFF, the offset from VRTOP
12053 of the next VR argument. */
12054 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12055 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12056 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
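/* A worked example (illustrative, assuming the full save areas are live and
   no named arguments were passed on the stack): for a hypothetical
   "int f (int n, ...)" with one named GP argument, NUM_ARG_REGS == 8,
   UNITS_PER_WORD == 8, NUM_FP_ARG_REGS == 8 and UNITS_PER_VREG == 16 give

     __stack   = incoming args pointer
     __gr_top  = incoming args pointer
     __vr_top  = __gr_top - ROUND_UP (7 * 8, 16)  = __gr_top - 64
     __gr_offs = -(7 * 8)   = -56
     __vr_offs = -(8 * 16)  = -128.  */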
12059 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12061 static tree
12062 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12063 gimple_seq *post_p ATTRIBUTE_UNUSED)
12065 tree addr;
12066 bool indirect_p;
12067 bool is_ha; /* is HFA or HVA. */
12068 bool dw_align; /* double-word align. */
12069 machine_mode ag_mode = VOIDmode;
12070 int nregs;
12071 machine_mode mode;
12073 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12074 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12075 HOST_WIDE_INT size, rsize, adjust, align;
12076 tree t, u, cond1, cond2;
12078 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12079 if (indirect_p)
12080 type = build_pointer_type (type);
12082 mode = TYPE_MODE (type);
12084 f_stack = TYPE_FIELDS (va_list_type_node);
12085 f_grtop = DECL_CHAIN (f_stack);
12086 f_vrtop = DECL_CHAIN (f_grtop);
12087 f_groff = DECL_CHAIN (f_vrtop);
12088 f_vroff = DECL_CHAIN (f_groff);
12090 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12091 f_stack, NULL_TREE);
12092 size = int_size_in_bytes (type);
12093 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12095 dw_align = false;
12096 adjust = 0;
12097 if (aarch64_vfp_is_call_or_return_candidate (mode,
12098 type,
12099 &ag_mode,
12100 &nregs,
12101 &is_ha))
12103 /* No frontends can create types with variable-sized modes, so we
12104 shouldn't be asked to pass or return them. */
12105 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12107 /* TYPE passed in fp/simd registers. */
12108 if (!TARGET_FLOAT)
12109 aarch64_err_no_fpadvsimd (mode);
12111 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12112 unshare_expr (valist), f_vrtop, NULL_TREE);
12113 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12114 unshare_expr (valist), f_vroff, NULL_TREE);
12116 rsize = nregs * UNITS_PER_VREG;
12118 if (is_ha)
12120 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12121 adjust = UNITS_PER_VREG - ag_size;
12123 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12124 && size < UNITS_PER_VREG)
12126 adjust = UNITS_PER_VREG - size;
12129 else
12131 /* TYPE passed in general registers. */
12132 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12133 unshare_expr (valist), f_grtop, NULL_TREE);
12134 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12135 unshare_expr (valist), f_groff, NULL_TREE);
12136 rsize = ROUND_UP (size, UNITS_PER_WORD);
12137 nregs = rsize / UNITS_PER_WORD;
12139 if (align > 8)
12140 dw_align = true;
12142 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12143 && size < UNITS_PER_WORD)
12145 adjust = UNITS_PER_WORD - size;
12149 /* Get a local temporary for the field value. */
12150 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12152 /* Emit code to branch if off >= 0. */
12153 t = build2 (GE_EXPR, boolean_type_node, off,
12154 build_int_cst (TREE_TYPE (off), 0));
12155 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12157 if (dw_align)
12159 /* Emit: offs = (offs + 15) & -16. */
12160 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12161 build_int_cst (TREE_TYPE (off), 15));
12162 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12163 build_int_cst (TREE_TYPE (off), -16));
12164 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12166 else
12167 roundup = NULL;
12169 /* Update ap.__[g|v]r_offs */
12170 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12171 build_int_cst (TREE_TYPE (off), rsize));
12172 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12174 /* String up. */
12175 if (roundup)
12176 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12178 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12179 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12180 build_int_cst (TREE_TYPE (f_off), 0));
12181 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12183 /* String up: make sure the assignment happens before the use. */
12184 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12185 COND_EXPR_ELSE (cond1) = t;
12187 /* Prepare the trees handling the argument that is passed on the stack;
12188 the top level node will store in ON_STACK. */
12189 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12190 if (align > 8)
12192 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12193 t = fold_build_pointer_plus_hwi (arg, 15);
12194 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12195 build_int_cst (TREE_TYPE (t), -16));
12196 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12198 else
12199 roundup = NULL;
12200 /* Advance ap.__stack */
12201 t = fold_build_pointer_plus_hwi (arg, size + 7);
12202 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12203 build_int_cst (TREE_TYPE (t), -8));
12204 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12205 /* String up roundup and advance. */
12206 if (roundup)
12207 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12208 /* String up with arg */
12209 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12210 /* Big-endianness related address adjustment. */
12211 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12212 && size < UNITS_PER_WORD)
12214 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12215 size_int (UNITS_PER_WORD - size));
12216 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12219 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12220 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12222 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12223 t = off;
12224 if (adjust)
12225 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12226 build_int_cst (TREE_TYPE (off), adjust));
12228 t = fold_convert (sizetype, t);
12229 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12231 if (is_ha)
12233 /* type ha; // treat as "struct {ftype field[n];}"
12234 ... [computing offs]
12235 for (i = 0; i <nregs; ++i, offs += 16)
12236 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12237 return ha; */
12238 int i;
12239 tree tmp_ha, field_t, field_ptr_t;
12241 /* Declare a local variable. */
12242 tmp_ha = create_tmp_var_raw (type, "ha");
12243 gimple_add_tmp_var (tmp_ha);
12245 /* Establish the base type. */
12246 switch (ag_mode)
12248 case E_SFmode:
12249 field_t = float_type_node;
12250 field_ptr_t = float_ptr_type_node;
12251 break;
12252 case E_DFmode:
12253 field_t = double_type_node;
12254 field_ptr_t = double_ptr_type_node;
12255 break;
12256 case E_TFmode:
12257 field_t = long_double_type_node;
12258 field_ptr_t = long_double_ptr_type_node;
12259 break;
12260 case E_HFmode:
12261 field_t = aarch64_fp16_type_node;
12262 field_ptr_t = aarch64_fp16_ptr_type_node;
12263 break;
12264 case E_V2SImode:
12265 case E_V4SImode:
12267 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12268 field_t = build_vector_type_for_mode (innertype, ag_mode);
12269 field_ptr_t = build_pointer_type (field_t);
12271 break;
12272 default:
12273 gcc_assert (0);
12276 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12277 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12278 addr = t;
12279 t = fold_convert (field_ptr_t, addr);
12280 t = build2 (MODIFY_EXPR, field_t,
12281 build1 (INDIRECT_REF, field_t, tmp_ha),
12282 build1 (INDIRECT_REF, field_t, t));
12284 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12285 for (i = 1; i < nregs; ++i)
12287 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12288 u = fold_convert (field_ptr_t, addr);
12289 u = build2 (MODIFY_EXPR, field_t,
12290 build2 (MEM_REF, field_t, tmp_ha,
12291 build_int_cst (field_ptr_t,
12292 (i *
12293 int_size_in_bytes (field_t)))),
12294 build1 (INDIRECT_REF, field_t, u));
12295 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12298 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12299 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12302 COND_EXPR_ELSE (cond2) = t;
12303 addr = fold_convert (build_pointer_type (type), cond1);
12304 addr = build_va_arg_indirect_ref (addr);
12306 if (indirect_p)
12307 addr = build_va_arg_indirect_ref (addr);
12309 return addr;
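/* A sketch of the sequence gimplified above for an argument passed in
   general registers (pseudo C; accesses go through the __va_list fields):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                   // register save area already used up
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;                   // this argument didn't fit either
     addr = ap.__gr_top + off;          // within the register save area
     goto done;
   on_stack:
     addr = ap.__stack;
     ap.__stack = (addr + size + 7) & -8;
   done:
     result = *(type *) addr;

   with extra rounding steps when the argument needs 16-byte alignment and
   a big-endian address adjustment for small arguments.  */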
12312 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12314 static void
12315 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12316 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12317 int no_rtl)
12319 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12320 CUMULATIVE_ARGS local_cum;
12321 int gr_saved = cfun->va_list_gpr_size;
12322 int vr_saved = cfun->va_list_fpr_size;
12324 /* The caller has advanced CUM up to, but not beyond, the last named
12325 argument. Advance a local copy of CUM past the last "real" named
12326 argument, to find out how many registers are left over. */
12327 local_cum = *cum;
12328 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12330 /* Find out how many registers we need to save.
12331 Honour the tree-stdarg analysis results. */
12332 if (cfun->va_list_gpr_size)
12333 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12334 cfun->va_list_gpr_size / UNITS_PER_WORD);
12335 if (cfun->va_list_fpr_size)
12336 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12337 cfun->va_list_fpr_size / UNITS_PER_VREG);
12339 if (!TARGET_FLOAT)
12341 gcc_assert (local_cum.aapcs_nvrn == 0);
12342 vr_saved = 0;
12345 if (!no_rtl)
12347 if (gr_saved > 0)
12349 rtx ptr, mem;
12351 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12352 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12353 - gr_saved * UNITS_PER_WORD);
12354 mem = gen_frame_mem (BLKmode, ptr);
12355 set_mem_alias_set (mem, get_varargs_alias_set ());
12357 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12358 mem, gr_saved);
12360 if (vr_saved > 0)
12362 /* We can't use move_block_from_reg, because it will use
12363 the wrong mode, storing D regs only. */
12364 machine_mode mode = TImode;
12365 int off, i, vr_start;
12367 /* Set OFF to the offset from virtual_incoming_args_rtx of
12368 the first vector register. The VR save area lies below
12369 the GR one, and is aligned to 16 bytes. */
12370 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12371 STACK_BOUNDARY / BITS_PER_UNIT);
12372 off -= vr_saved * UNITS_PER_VREG;
12374 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12375 for (i = 0; i < vr_saved; ++i)
12377 rtx ptr, mem;
12379 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12380 mem = gen_frame_mem (mode, ptr);
12381 set_mem_alias_set (mem, get_varargs_alias_set ());
12382 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12383 off += UNITS_PER_VREG;
12388 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12389 any complication of having crtl->args.pretend_args_size changed. */
12390 cfun->machine->frame.saved_varargs_size
12391 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12392 STACK_BOUNDARY / BITS_PER_UNIT)
12393 + vr_saved * UNITS_PER_VREG);
12396 static void
12397 aarch64_conditional_register_usage (void)
12399 int i;
12400 if (!TARGET_FLOAT)
12402 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12404 fixed_regs[i] = 1;
12405 call_used_regs[i] = 1;
12408 if (!TARGET_SVE)
12409 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12411 fixed_regs[i] = 1;
12412 call_used_regs[i] = 1;
12415 /* When tracking speculation, we need a couple of call-clobbered registers
12416 to track the speculation state. It would be nice to just use
12417 IP0 and IP1, but currently there are numerous places that just
12418 assume these registers are free for other uses (e.g. pointer
12419 authentication). */
12420 if (aarch64_track_speculation)
12422 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
12423 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
12424 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12425 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12429 /* Walk down the type tree of TYPE counting consecutive base elements.
12430 If *MODEP is VOIDmode, then set it to the first valid floating point
12431 type. If a non-floating point type is found, or if a floating point
12432 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12433 otherwise return the count in the sub-tree. */
12434 static int
12435 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12437 machine_mode mode;
12438 HOST_WIDE_INT size;
12440 switch (TREE_CODE (type))
12442 case REAL_TYPE:
12443 mode = TYPE_MODE (type);
12444 if (mode != DFmode && mode != SFmode
12445 && mode != TFmode && mode != HFmode)
12446 return -1;
12448 if (*modep == VOIDmode)
12449 *modep = mode;
12451 if (*modep == mode)
12452 return 1;
12454 break;
12456 case COMPLEX_TYPE:
12457 mode = TYPE_MODE (TREE_TYPE (type));
12458 if (mode != DFmode && mode != SFmode
12459 && mode != TFmode && mode != HFmode)
12460 return -1;
12462 if (*modep == VOIDmode)
12463 *modep = mode;
12465 if (*modep == mode)
12466 return 2;
12468 break;
12470 case VECTOR_TYPE:
12471 /* Use V2SImode and V4SImode as representatives of all 64-bit
12472 and 128-bit vector types. */
12473 size = int_size_in_bytes (type);
12474 switch (size)
12476 case 8:
12477 mode = V2SImode;
12478 break;
12479 case 16:
12480 mode = V4SImode;
12481 break;
12482 default:
12483 return -1;
12486 if (*modep == VOIDmode)
12487 *modep = mode;
12489 /* Vector modes are considered to be opaque: two vectors are
12490 equivalent for the purposes of being homogeneous aggregates
12491 if they are the same size. */
12492 if (*modep == mode)
12493 return 1;
12495 break;
12497 case ARRAY_TYPE:
12499 int count;
12500 tree index = TYPE_DOMAIN (type);
12502 /* Can't handle incomplete types nor sizes that are not
12503 fixed. */
12504 if (!COMPLETE_TYPE_P (type)
12505 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12506 return -1;
12508 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12509 if (count == -1
12510 || !index
12511 || !TYPE_MAX_VALUE (index)
12512 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12513 || !TYPE_MIN_VALUE (index)
12514 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12515 || count < 0)
12516 return -1;
12518 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12519 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12521 /* There must be no padding. */
12522 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12523 count * GET_MODE_BITSIZE (*modep)))
12524 return -1;
12526 return count;
12529 case RECORD_TYPE:
12531 int count = 0;
12532 int sub_count;
12533 tree field;
12535 /* Can't handle incomplete types nor sizes that are not
12536 fixed. */
12537 if (!COMPLETE_TYPE_P (type)
12538 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12539 return -1;
12541 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12543 if (TREE_CODE (field) != FIELD_DECL)
12544 continue;
12546 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12547 if (sub_count < 0)
12548 return -1;
12549 count += sub_count;
12552 /* There must be no padding. */
12553 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12554 count * GET_MODE_BITSIZE (*modep)))
12555 return -1;
12557 return count;
12560 case UNION_TYPE:
12561 case QUAL_UNION_TYPE:
12563 /* These aren't very interesting except in a degenerate case. */
12564 int count = 0;
12565 int sub_count;
12566 tree field;
12568 /* Can't handle incomplete types nor sizes that are not
12569 fixed. */
12570 if (!COMPLETE_TYPE_P (type)
12571 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12572 return -1;
12574 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12576 if (TREE_CODE (field) != FIELD_DECL)
12577 continue;
12579 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12580 if (sub_count < 0)
12581 return -1;
12582 count = count > sub_count ? count : sub_count;
12585 /* There must be no padding. */
12586 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12587 count * GET_MODE_BITSIZE (*modep)))
12588 return -1;
12590 return count;
12593 default:
12594 break;
12597 return -1;
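/* Some illustrative results of the walk above:

     struct { float x, y, z; }      ->  3, *MODEP == SFmode   (an HFA)
     double a[4]                    ->  4, *MODEP == DFmode   (an HFA)
     struct { float x; double y; }  -> -1                     (mixed base types)
     struct { float x[5]; }         ->  5, but rejected by the caller
                                         because it exceeds HA_MAX_NUM_FLDS.  */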
12600 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12601 type as described in AAPCS64 \S 4.1.2.
12603 See the comment above aarch64_composite_type_p for the notes on MODE. */
12605 static bool
12606 aarch64_short_vector_p (const_tree type,
12607 machine_mode mode)
12609 poly_int64 size = -1;
12611 if (type && TREE_CODE (type) == VECTOR_TYPE)
12612 size = int_size_in_bytes (type);
12613 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12614 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12615 size = GET_MODE_SIZE (mode);
12617 return known_eq (size, 8) || known_eq (size, 16);
12620 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12621 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12622 array types. The C99 floating-point complex types are also considered
12623 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12624 types, which are GCC extensions and out of the scope of AAPCS64, are
12625 treated as composite types here as well.
12627 Note that MODE itself is not sufficient in determining whether a type
12628 is such a composite type or not. This is because
12629 stor-layout.c:compute_record_mode may have already changed the MODE
12630 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12631 structure with only one field may have its MODE set to the mode of the
12632 field. Also an integer mode whose size matches the size of the
12633 RECORD_TYPE type may be used to substitute the original mode
12634 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12635 solely relied on. */
12637 static bool
12638 aarch64_composite_type_p (const_tree type,
12639 machine_mode mode)
12641 if (aarch64_short_vector_p (type, mode))
12642 return false;
12644 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12645 return true;
12647 if (mode == BLKmode
12648 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12649 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12650 return true;
12652 return false;
12655 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12656 shall be passed or returned in simd/fp register(s) (providing these
12657 parameter passing registers are available).
12659 Upon successful return, *COUNT returns the number of needed registers,
12660 *BASE_MODE returns the mode of the individual register and, when IS_HA
12661 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12662 floating-point aggregate or a homogeneous short-vector aggregate. */
12664 static bool
12665 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12666 const_tree type,
12667 machine_mode *base_mode,
12668 int *count,
12669 bool *is_ha)
12671 machine_mode new_mode = VOIDmode;
12672 bool composite_p = aarch64_composite_type_p (type, mode);
12674 if (is_ha != NULL) *is_ha = false;
12676 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12677 || aarch64_short_vector_p (type, mode))
12679 *count = 1;
12680 new_mode = mode;
12682 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12684 if (is_ha != NULL) *is_ha = true;
12685 *count = 2;
12686 new_mode = GET_MODE_INNER (mode);
12688 else if (type && composite_p)
12690 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12692 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12694 if (is_ha != NULL) *is_ha = true;
12695 *count = ag_count;
12697 else
12698 return false;
12700 else
12701 return false;
12703 *base_mode = new_mode;
12704 return true;
12707 /* Implement TARGET_STRUCT_VALUE_RTX. */
12709 static rtx
12710 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12711 int incoming ATTRIBUTE_UNUSED)
12713 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12716 /* Implements target hook vector_mode_supported_p. */
12717 static bool
12718 aarch64_vector_mode_supported_p (machine_mode mode)
12720 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12721 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12724 /* Return appropriate SIMD container
12725 for MODE within a vector of WIDTH bits. */
12726 static machine_mode
12727 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12729 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12730 switch (mode)
12732 case E_DFmode:
12733 return VNx2DFmode;
12734 case E_SFmode:
12735 return VNx4SFmode;
12736 case E_HFmode:
12737 return VNx8HFmode;
12738 case E_DImode:
12739 return VNx2DImode;
12740 case E_SImode:
12741 return VNx4SImode;
12742 case E_HImode:
12743 return VNx8HImode;
12744 case E_QImode:
12745 return VNx16QImode;
12746 default:
12747 return word_mode;
12750 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12751 if (TARGET_SIMD)
12753 if (known_eq (width, 128))
12754 switch (mode)
12756 case E_DFmode:
12757 return V2DFmode;
12758 case E_SFmode:
12759 return V4SFmode;
12760 case E_HFmode:
12761 return V8HFmode;
12762 case E_SImode:
12763 return V4SImode;
12764 case E_HImode:
12765 return V8HImode;
12766 case E_QImode:
12767 return V16QImode;
12768 case E_DImode:
12769 return V2DImode;
12770 default:
12771 break;
12773 else
12774 switch (mode)
12776 case E_SFmode:
12777 return V2SFmode;
12778 case E_HFmode:
12779 return V4HFmode;
12780 case E_SImode:
12781 return V2SImode;
12782 case E_HImode:
12783 return V4HImode;
12784 case E_QImode:
12785 return V8QImode;
12786 default:
12787 break;
12790 return word_mode;
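/* For example (illustrative): SFmode in a 128-bit vector gives V4SFmode,
   SFmode in a 64-bit vector gives V2SFmode, and SFmode at the full SVE
   vector width gives VNx4SFmode when SVE is enabled; modes with no
   suitable container fall back to word_mode.  */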
12793 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12794 static machine_mode
12795 aarch64_preferred_simd_mode (scalar_mode mode)
12797 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12798 return aarch64_simd_container_mode (mode, bits);
12801 /* Return a list of possible vector sizes for the vectorizer
12802 to iterate over. */
12803 static void
12804 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12806 if (TARGET_SVE)
12807 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12808 sizes->safe_push (16);
12809 sizes->safe_push (8);
12812 /* Implement TARGET_MANGLE_TYPE. */
12814 static const char *
12815 aarch64_mangle_type (const_tree type)
12817 /* The AArch64 ABI documents say that "__va_list" has to be
12818 mangled as if it is in the "std" namespace. */
12819 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12820 return "St9__va_list";
12822 /* Half-precision float. */
12823 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12824 return "Dh";
12826 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12827 builtin types. */
12828 if (TYPE_NAME (type) != NULL)
12829 return aarch64_mangle_builtin_type (type);
12831 /* Use the default mangling. */
12832 return NULL;
12835 /* Find the first rtx_insn before insn that will generate an assembly
12836 instruction. */
12838 static rtx_insn *
12839 aarch64_prev_real_insn (rtx_insn *insn)
12841 if (!insn)
12842 return NULL;
12844 do
12846 insn = prev_real_insn (insn);
12848 while (insn && recog_memoized (insn) < 0);
12850 return insn;
12853 static bool
12854 is_madd_op (enum attr_type t1)
12856 unsigned int i;
12857 /* A number of these may be AArch32 only. */
12858 enum attr_type mlatypes[] = {
12859 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12860 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12861 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12864 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12866 if (t1 == mlatypes[i])
12867 return true;
12870 return false;
12873 /* Check if there is a register dependency between a load and the insn
12874 for which we hold recog_data. */
12876 static bool
12877 dep_between_memop_and_curr (rtx memop)
12879 rtx load_reg;
12880 int opno;
12882 gcc_assert (GET_CODE (memop) == SET);
12884 if (!REG_P (SET_DEST (memop)))
12885 return false;
12887 load_reg = SET_DEST (memop);
12888 for (opno = 1; opno < recog_data.n_operands; opno++)
12890 rtx operand = recog_data.operand[opno];
12891 if (REG_P (operand)
12892 && reg_overlap_mentioned_p (load_reg, operand))
12893 return true;
12896 return false;
12900 /* When working around the Cortex-A53 erratum 835769,
12901 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12902 instruction and has a preceding memory instruction such that a NOP
12903 should be inserted between them. */
12905 bool
12906 aarch64_madd_needs_nop (rtx_insn* insn)
12908 enum attr_type attr_type;
12909 rtx_insn *prev;
12910 rtx body;
12912 if (!TARGET_FIX_ERR_A53_835769)
12913 return false;
12915 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12916 return false;
12918 attr_type = get_attr_type (insn);
12919 if (!is_madd_op (attr_type))
12920 return false;
12922 prev = aarch64_prev_real_insn (insn);
12923 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12924 Restore recog state to INSN to avoid state corruption. */
12925 extract_constrain_insn_cached (insn);
12927 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12928 return false;
12930 body = single_set (prev);
12932 /* If the previous insn is a memory op and there is no dependency between
12933 it and the DImode madd, emit a NOP between them. If body is NULL then we
12934 have a complex memory operation, probably a load/store pair.
12935 Be conservative for now and emit a NOP. */
12936 if (GET_MODE (recog_data.operand[0]) == DImode
12937 && (!body || !dep_between_memop_and_curr (body)))
12938 return true;
12940 return false;
12945 /* Implement FINAL_PRESCAN_INSN. */
12947 void
12948 aarch64_final_prescan_insn (rtx_insn *insn)
12950 if (aarch64_madd_needs_nop (insn))
12951 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
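/* For example, with -mfix-cortex-a53-835769 a 64-bit multiply-accumulate
   that directly follows a memory operation gets a NOP inserted between
   them when the conditions above hold (illustrative assembly):

       ldr   x1, [x2]
       nop   // between mem op and mult-accumulate
       madd  x0, x3, x4, x5                                              */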
12955 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
12956 instruction. */
12958 bool
12959 aarch64_sve_index_immediate_p (rtx base_or_step)
12961 return (CONST_INT_P (base_or_step)
12962 && IN_RANGE (INTVAL (base_or_step), -16, 15));
12965 /* Return true if X is a valid immediate for the SVE ADD and SUB
12966 instructions. Negate X first if NEGATE_P is true. */
12968 bool
12969 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
12971 rtx elt;
12973 if (!const_vec_duplicate_p (x, &elt)
12974 || !CONST_INT_P (elt))
12975 return false;
12977 HOST_WIDE_INT val = INTVAL (elt);
12978 if (negate_p)
12979 val = -val;
12980 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
12982 if (val & 0xff)
12983 return IN_RANGE (val, 0, 0xff);
12984 return IN_RANGE (val, 0, 0xff00);
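/* Accepted values (illustrative): 0-255, and multiples of 256 up to
   0xff00 (which map to the immediate form with LSL #8); a value such as
   0x123, which needs both a low and a high byte, is rejected.  */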
12987 /* Return true if X is a valid immediate operand for an SVE logical
12988 instruction such as AND. */
12990 bool
12991 aarch64_sve_bitmask_immediate_p (rtx x)
12993 rtx elt;
12995 return (const_vec_duplicate_p (x, &elt)
12996 && CONST_INT_P (elt)
12997 && aarch64_bitmask_imm (INTVAL (elt),
12998 GET_MODE_INNER (GET_MODE (x))));
13001 /* Return true if X is a valid immediate for the SVE DUP and CPY
13002 instructions. */
13004 bool
13005 aarch64_sve_dup_immediate_p (rtx x)
13007 rtx elt;
13009 if (!const_vec_duplicate_p (x, &elt)
13010 || !CONST_INT_P (elt))
13011 return false;
13013 HOST_WIDE_INT val = INTVAL (elt);
13014 if (val & 0xff)
13015 return IN_RANGE (val, -0x80, 0x7f);
13016 return IN_RANGE (val, -0x8000, 0x7f00);
13019 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13020 SIGNED_P says whether the operand is signed rather than unsigned. */
13022 bool
13023 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13025 rtx elt;
13027 return (const_vec_duplicate_p (x, &elt)
13028 && CONST_INT_P (elt)
13029 && (signed_p
13030 ? IN_RANGE (INTVAL (elt), -16, 15)
13031 : IN_RANGE (INTVAL (elt), 0, 127)));
13034 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13035 instruction. Negate X first if NEGATE_P is true. */
13037 bool
13038 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13040 rtx elt;
13041 REAL_VALUE_TYPE r;
13043 if (!const_vec_duplicate_p (x, &elt)
13044 || GET_CODE (elt) != CONST_DOUBLE)
13045 return false;
13047 r = *CONST_DOUBLE_REAL_VALUE (elt);
13049 if (negate_p)
13050 r = real_value_negate (&r);
13052 if (real_equal (&r, &dconst1))
13053 return true;
13054 if (real_equal (&r, &dconsthalf))
13055 return true;
13056 return false;
13059 /* Return true if X is a valid immediate operand for an SVE FMUL
13060 instruction. */
13062 bool
13063 aarch64_sve_float_mul_immediate_p (rtx x)
13065 rtx elt;
13067 /* GCC will never generate a multiply with an immediate of 2, so there is no
13068 point testing for it (even though it is a valid constant). */
13069 return (const_vec_duplicate_p (x, &elt)
13070 && GET_CODE (elt) == CONST_DOUBLE
13071 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13074 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13075 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13076 is nonnull, use it to describe valid immediates. */
13077 static bool
13078 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13079 simd_immediate_info *info,
13080 enum simd_immediate_check which,
13081 simd_immediate_info::insn_type insn)
13083 /* Try a 4-byte immediate with LSL. */
13084 for (unsigned int shift = 0; shift < 32; shift += 8)
13085 if ((val32 & (0xff << shift)) == val32)
13087 if (info)
13088 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13089 simd_immediate_info::LSL, shift);
13090 return true;
13093 /* Try a 2-byte immediate with LSL. */
13094 unsigned int imm16 = val32 & 0xffff;
13095 if (imm16 == (val32 >> 16))
13096 for (unsigned int shift = 0; shift < 16; shift += 8)
13097 if ((imm16 & (0xff << shift)) == imm16)
13099 if (info)
13100 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13101 simd_immediate_info::LSL, shift);
13102 return true;
13105 /* Try a 4-byte immediate with MSL, except for cases that MVN
13106 can handle. */
13107 if (which == AARCH64_CHECK_MOV)
13108 for (unsigned int shift = 8; shift < 24; shift += 8)
13110 unsigned int low = (1 << shift) - 1;
13111 if (((val32 & (0xff << shift)) | low) == val32)
13113 if (info)
13114 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13115 simd_immediate_info::MSL, shift);
13116 return true;
13120 return false;
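/* Some illustrative encodings of replicated 32-bit values:

     0x00ab0000  ->  MOVI Vd.4S, #0xab, LSL #16
     0x0012ffff  ->  MOVI Vd.4S, #0x12, MSL #16
     0x00012300  ->  not encodable by this routine.  */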
13123 /* Return true if replicating VAL64 is a valid immediate for the
13124 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13125 use it to describe valid immediates. */
13126 static bool
13127 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13128 simd_immediate_info *info,
13129 enum simd_immediate_check which)
13131 unsigned int val32 = val64 & 0xffffffff;
13132 unsigned int val16 = val64 & 0xffff;
13133 unsigned int val8 = val64 & 0xff;
13135 if (val32 == (val64 >> 32))
13137 if ((which & AARCH64_CHECK_ORR) != 0
13138 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13139 simd_immediate_info::MOV))
13140 return true;
13142 if ((which & AARCH64_CHECK_BIC) != 0
13143 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13144 simd_immediate_info::MVN))
13145 return true;
13147 /* Try using a replicated byte. */
13148 if (which == AARCH64_CHECK_MOV
13149 && val16 == (val32 >> 16)
13150 && val8 == (val16 >> 8))
13152 if (info)
13153 *info = simd_immediate_info (QImode, val8);
13154 return true;
13158 /* Try using a bit-to-bytemask. */
13159 if (which == AARCH64_CHECK_MOV)
13161 unsigned int i;
13162 for (i = 0; i < 64; i += 8)
13164 unsigned char byte = (val64 >> i) & 0xff;
13165 if (byte != 0 && byte != 0xff)
13166 break;
13168 if (i == 64)
13170 if (info)
13171 *info = simd_immediate_info (DImode, val64);
13172 return true;
13175 return false;
13178 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13179 instruction. If INFO is nonnull, use it to describe valid immediates. */
13181 static bool
13182 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13183 simd_immediate_info *info)
13185 scalar_int_mode mode = DImode;
13186 unsigned int val32 = val64 & 0xffffffff;
13187 if (val32 == (val64 >> 32))
13189 mode = SImode;
13190 unsigned int val16 = val32 & 0xffff;
13191 if (val16 == (val32 >> 16))
13193 mode = HImode;
13194 unsigned int val8 = val16 & 0xff;
13195 if (val8 == (val16 >> 8))
13196 mode = QImode;
13199 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13200 if (IN_RANGE (val, -0x80, 0x7f))
13202 /* DUP with no shift. */
13203 if (info)
13204 *info = simd_immediate_info (mode, val);
13205 return true;
13207 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13209 /* DUP with LSL #8. */
13210 if (info)
13211 *info = simd_immediate_info (mode, val);
13212 return true;
13214 if (aarch64_bitmask_imm (val64, mode))
13216 /* DUPM. */
13217 if (info)
13218 *info = simd_immediate_info (mode, val);
13219 return true;
13221 return false;
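/* Some illustrative examples: replicating 0xfe into every byte gives a
   QImode value of -2, matching "DUP Zd.B, #-2"; replicating 0x0100 into
   every halfword matches "DUP Zd.H, #1, LSL #8"; other replicated values
   may still be representable as a DUPM bitmask immediate.  */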
13224 /* Return true if OP is a valid SIMD immediate for the operation
13225 described by WHICH. If INFO is nonnull, use it to describe valid
13226 immediates. */
13227 bool
13228 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13229 enum simd_immediate_check which)
13231 machine_mode mode = GET_MODE (op);
13232 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13233 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13234 return false;
13236 scalar_mode elt_mode = GET_MODE_INNER (mode);
13237 rtx base, step;
13238 unsigned int n_elts;
13239 if (GET_CODE (op) == CONST_VECTOR
13240 && CONST_VECTOR_DUPLICATE_P (op))
13241 n_elts = CONST_VECTOR_NPATTERNS (op);
13242 else if ((vec_flags & VEC_SVE_DATA)
13243 && const_vec_series_p (op, &base, &step))
13245 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13246 if (!aarch64_sve_index_immediate_p (base)
13247 || !aarch64_sve_index_immediate_p (step))
13248 return false;
13250 if (info)
13251 *info = simd_immediate_info (elt_mode, base, step);
13252 return true;
13254 else if (GET_CODE (op) == CONST_VECTOR
13255 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13256 /* N_ELTS set above. */;
13257 else
13258 return false;
13260 /* Handle PFALSE and PTRUE. */
13261 if (vec_flags & VEC_SVE_PRED)
13262 return (op == CONST0_RTX (mode)
13263 || op == CONSTM1_RTX (mode));
13265 scalar_float_mode elt_float_mode;
13266 if (n_elts == 1
13267 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13269 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13270 if (aarch64_float_const_zero_rtx_p (elt)
13271 || aarch64_float_const_representable_p (elt))
13273 if (info)
13274 *info = simd_immediate_info (elt_float_mode, elt);
13275 return true;
13279 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13280 if (elt_size > 8)
13281 return false;
13283 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13285 /* Expand the vector constant out into a byte vector, with the least
13286 significant byte of the register first. */
13287 auto_vec<unsigned char, 16> bytes;
13288 bytes.reserve (n_elts * elt_size);
13289 for (unsigned int i = 0; i < n_elts; i++)
13291 /* The vector is provided in gcc endian-neutral fashion.
13292 For aarch64_be Advanced SIMD, it must be laid out in the vector
13293 register in reverse order. */
13294 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13295 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13297 if (elt_mode != elt_int_mode)
13298 elt = gen_lowpart (elt_int_mode, elt);
13300 if (!CONST_INT_P (elt))
13301 return false;
13303 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13304 for (unsigned int byte = 0; byte < elt_size; byte++)
13306 bytes.quick_push (elt_val & 0xff);
13307 elt_val >>= BITS_PER_UNIT;
13311 /* The immediate must repeat every eight bytes. */
13312 unsigned int nbytes = bytes.length ();
13313 for (unsigned i = 8; i < nbytes; ++i)
13314 if (bytes[i] != bytes[i - 8])
13315 return false;
13317 /* Get the repeating 8-byte value as an integer. No endian correction
13318 is needed here because bytes is already in lsb-first order. */
13319 unsigned HOST_WIDE_INT val64 = 0;
13320 for (unsigned int i = 0; i < 8; i++)
13321 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13322 << (i * BITS_PER_UNIT));
13324 if (vec_flags & VEC_SVE_DATA)
13325 return aarch64_sve_valid_immediate (val64, info);
13326 else
13327 return aarch64_advsimd_valid_immediate (val64, info, which);
13330 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13331 has a step in the range of INDEX. Return the index expression if so,
13332 otherwise return null. */
13333 rtx
13334 aarch64_check_zero_based_sve_index_immediate (rtx x)
13336 rtx base, step;
13337 if (const_vec_series_p (x, &base, &step)
13338 && base == const0_rtx
13339 && aarch64_sve_index_immediate_p (step))
13340 return step;
13341 return NULL_RTX;
13344 /* Check whether immediate shift constants are within range. */
13345 bool
13346 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13348 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13349 if (left)
13350 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13351 else
13352 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13355 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13356 operation of width WIDTH at bit position POS. */
13358 rtx
13359 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13361 gcc_assert (CONST_INT_P (width));
13362 gcc_assert (CONST_INT_P (pos));
13364 unsigned HOST_WIDE_INT mask
13365 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13366 return GEN_INT (mask << UINTVAL (pos));
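/* For example, WIDTH == 8 and POS == 16 give ((1 << 8) - 1) << 16,
   i.e. the mask 0x00ff0000.  */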
13369 bool
13370 aarch64_mov_operand_p (rtx x, machine_mode mode)
13372 if (GET_CODE (x) == HIGH
13373 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13374 return true;
13376 if (CONST_INT_P (x))
13377 return true;
13379 if (VECTOR_MODE_P (GET_MODE (x)))
13380 return aarch64_simd_valid_immediate (x, NULL);
13382 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13383 return true;
13385 if (aarch64_sve_cnt_immediate_p (x))
13386 return true;
13388 return aarch64_classify_symbolic_expression (x)
13389 == SYMBOL_TINY_ABSOLUTE;
13392 /* Return a const_int vector of VAL. */
13393 rtx
13394 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13396 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13397 return gen_const_vec_duplicate (mode, c);
13400 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13402 bool
13403 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13405 machine_mode vmode;
13407 vmode = aarch64_simd_container_mode (mode, 64);
13408 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13409 return aarch64_simd_valid_immediate (op_v, NULL);
13412 /* Construct and return a PARALLEL RTX vector with elements numbering the
13413 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13414 the vector - from the perspective of the architecture. This does not
13415 line up with GCC's perspective on lane numbers, so we end up with
13416 different masks depending on our target endian-ness. The diagram
13417 below may help. We must draw the distinction when building masks
13418 which select one half of the vector. An instruction selecting
13419 architectural low-lanes for a big-endian target must be described using
13420 a mask selecting GCC high-lanes.
13422 Big-Endian Little-Endian
13424 GCC 0 1 2 3 3 2 1 0
13425 | x | x | x | x | | x | x | x | x |
13426 Architecture 3 2 1 0 3 2 1 0
13428 Low Mask: { 2, 3 } { 0, 1 }
13429 High Mask: { 0, 1 } { 2, 3 }
13431 MODE is the mode of the vector and NUNITS is the number of units in it. */
13433 rtx
13434 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13436 rtvec v = rtvec_alloc (nunits / 2);
13437 int high_base = nunits / 2;
13438 int low_base = 0;
13439 int base;
13440 rtx t1;
13441 int i;
13443 if (BYTES_BIG_ENDIAN)
13444 base = high ? low_base : high_base;
13445 else
13446 base = high ? high_base : low_base;
13448 for (i = 0; i < nunits / 2; i++)
13449 RTVEC_ELT (v, i) = GEN_INT (base + i);
13451 t1 = gen_rtx_PARALLEL (mode, v);
13452 return t1;
13455 /* Check OP for validity as a PARALLEL RTX vector with elements
13456 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13457 from the perspective of the architecture. See the diagram above
13458 aarch64_simd_vect_par_cnst_half for more details. */
13460 bool
13461 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13462 bool high)
13464 int nelts;
13465 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13466 return false;
13468 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13469 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13470 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13471 int i = 0;
13473 if (count_op != count_ideal)
13474 return false;
13476 for (i = 0; i < count_ideal; i++)
13478 rtx elt_op = XVECEXP (op, 0, i);
13479 rtx elt_ideal = XVECEXP (ideal, 0, i);
13481 if (!CONST_INT_P (elt_op)
13482 || INTVAL (elt_ideal) != INTVAL (elt_op))
13483 return false;
13485 return true;
13488 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13489 HIGH (exclusive). */
13490 void
13491 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13492 const_tree exp)
13494 HOST_WIDE_INT lane;
13495 gcc_assert (CONST_INT_P (operand));
13496 lane = INTVAL (operand);
13498 if (lane < low || lane >= high)
13500 if (exp)
13501 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13502 else
13503 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13507 /* Perform endian correction on lane number N, which indexes a vector
13508 of mode MODE, and return the result as an SImode rtx. */
13511 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13513 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
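/* For illustration, assuming the usual definition of ENDIAN_LANE_N
   (N on little-endian, NUNITS - 1 - N on big-endian): lane 1 of a
   V4SImode vector becomes (const_int 1) on little-endian and
   (const_int 2) on big-endian.  */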
13516 /* Return TRUE if OP is a valid vector addressing mode. */
13518 bool
13519 aarch64_simd_mem_operand_p (rtx op)
13521 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13522 || REG_P (XEXP (op, 0)));
13525 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13527 bool
13528 aarch64_sve_ld1r_operand_p (rtx op)
13530 struct aarch64_address_info addr;
13531 scalar_mode mode;
13533 return (MEM_P (op)
13534 && is_a <scalar_mode> (GET_MODE (op), &mode)
13535 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13536 && addr.type == ADDRESS_REG_IMM
13537 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
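/* Rough guide to the accepted range: offset_6bit_unsigned_scaled_p
   requires a non-negative multiple of the element size that fits in a
   scaled 6-bit field, so for a DImode LD1RD that means byte offsets
   0, 8, ..., 63 * 8 == 504.  */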
13540 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13541 The conditions for STR are the same. */
13542 bool
13543 aarch64_sve_ldr_operand_p (rtx op)
13545 struct aarch64_address_info addr;
13547 return (MEM_P (op)
13548 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13549 false, ADDR_QUERY_ANY)
13550 && addr.type == ADDRESS_REG_IMM);
13553 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13554 We need to be able to access the individual pieces, so the range
13555 is different from LD[234] and ST[234]. */
13556 bool
13557 aarch64_sve_struct_memory_operand_p (rtx op)
13559 if (!MEM_P (op))
13560 return false;
13562 machine_mode mode = GET_MODE (op);
13563 struct aarch64_address_info addr;
13564 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13565 ADDR_QUERY_ANY)
13566 || addr.type != ADDRESS_REG_IMM)
13567 return false;
13569 poly_int64 first = addr.const_offset;
13570 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13571 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13572 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13575 /* Emit a register copy from operand to operand, taking care not to
13576 early-clobber source registers in the process.
13578 COUNT is the number of components into which the copy needs to be
13579 decomposed. */
13580 void
13581 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13582 unsigned int count)
13584 unsigned int i;
13585 int rdest = REGNO (operands[0]);
13586 int rsrc = REGNO (operands[1]);
13588 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13589 || rdest < rsrc)
13590 for (i = 0; i < count; i++)
13591 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13592 gen_rtx_REG (mode, rsrc + i));
13593 else
13594 for (i = 0; i < count; i++)
13595 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13596 gen_rtx_REG (mode, rsrc + count - i - 1));
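/* Example of the ordering above (illustrative): copying an overlapping
   two-register group from registers {1, 2} to {0, 1} has rdest < rsrc, so
   the forward loop copies reg 1 -> 0 before reg 2 -> 1; copying {0, 1} to
   {1, 2} takes the backward loop, copying reg 1 -> 2 before reg 0 -> 1.
   Either way each source register is read before it is overwritten.  */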
13599 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13600 one of VSTRUCT modes: OI, CI, or XI. */
13602 aarch64_simd_attr_length_rglist (machine_mode mode)
13604 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13605 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13608 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13609 alignment of a vector to 128 bits. SVE predicates have an alignment of
13610 16 bits. */
13611 static HOST_WIDE_INT
13612 aarch64_simd_vector_alignment (const_tree type)
13614 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13615 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13616 be set for non-predicate vectors of booleans. Modes are the most
13617 direct way we have of identifying real SVE predicate types. */
13618 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13619 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13620 return MIN (align, 128);
13623 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13624 static HOST_WIDE_INT
13625 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13627 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13629 /* If the length of the vector is fixed, try to align to that length,
13630 otherwise don't try to align at all. */
13631 HOST_WIDE_INT result;
13632 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13633 result = TYPE_ALIGN (TREE_TYPE (type));
13634 return result;
13636 return TYPE_ALIGN (type);
13639 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13640 static bool
13641 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13643 if (is_packed)
13644 return false;
13646 /* For fixed-length vectors, check that the vectorizer will aim for
13647 full-vector alignment. This isn't true for generic GCC vectors
13648 that are wider than the ABI maximum of 128 bits. */
13649 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13650 && (wi::to_widest (TYPE_SIZE (type))
13651 != aarch64_vectorize_preferred_vector_alignment (type)))
13652 return false;
13654 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13655 return true;
13658 /* Return true if the vector misalignment factor is supported by the
13659 target. */
13660 static bool
13661 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13662 const_tree type, int misalignment,
13663 bool is_packed)
13665 if (TARGET_SIMD && STRICT_ALIGNMENT)
13667 /* Return false if the movmisalign pattern is not supported for this mode. */
13668 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13669 return false;
13671 /* Misalignment factor is unknown at compile time. */
13672 if (misalignment == -1)
13673 return false;
13675 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13676 is_packed);
13679 /* If VALS is a vector constant that can be loaded into a register
13680 using DUP, generate instructions to do so and return an RTX to
13681 assign to the register. Otherwise return NULL_RTX. */
13682 static rtx
13683 aarch64_simd_dup_constant (rtx vals)
13685 machine_mode mode = GET_MODE (vals);
13686 machine_mode inner_mode = GET_MODE_INNER (mode);
13687 rtx x;
13689 if (!const_vec_duplicate_p (vals, &x))
13690 return NULL_RTX;
13692 /* We can load this constant by using DUP and a constant in a
13693 single ARM register. This will be cheaper than a vector
13694 load. */
13695 x = copy_to_mode_reg (inner_mode, x);
13696 return gen_vec_duplicate (mode, x);
13700 /* Generate code to load VALS, which is a PARALLEL containing only
13701 constants (for vec_init) or CONST_VECTOR, efficiently into a
13702 register. Returns an RTX to copy into the register, or NULL_RTX
13703 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13704 static rtx
13705 aarch64_simd_make_constant (rtx vals)
13707 machine_mode mode = GET_MODE (vals);
13708 rtx const_dup;
13709 rtx const_vec = NULL_RTX;
13710 int n_const = 0;
13711 int i;
13713 if (GET_CODE (vals) == CONST_VECTOR)
13714 const_vec = vals;
13715 else if (GET_CODE (vals) == PARALLEL)
13717 /* A CONST_VECTOR must contain only CONST_INTs and
13718 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13719 Only store valid constants in a CONST_VECTOR. */
13720 int n_elts = XVECLEN (vals, 0);
13721 for (i = 0; i < n_elts; ++i)
13723 rtx x = XVECEXP (vals, 0, i);
13724 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13725 n_const++;
13727 if (n_const == n_elts)
13728 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13730 else
13731 gcc_unreachable ();
13733 if (const_vec != NULL_RTX
13734 && aarch64_simd_valid_immediate (const_vec, NULL))
13735 /* Load using MOVI/MVNI. */
13736 return const_vec;
13737 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13738 /* Loaded using DUP. */
13739 return const_dup;
13740 else if (const_vec != NULL_RTX)
13741 /* Load from constant pool. We cannot take advantage of single-cycle
13742 LD1 because we need a PC-relative addressing mode. */
13743 return const_vec;
13744 else
13745 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13746 We cannot construct an initializer. */
13747 return NULL_RTX;
13750 /* Expand a vector initialisation sequence, such that TARGET is
13751 initialised to contain VALS. */
13753 void
13754 aarch64_expand_vector_init (rtx target, rtx vals)
13756 machine_mode mode = GET_MODE (target);
13757 scalar_mode inner_mode = GET_MODE_INNER (mode);
13758 /* The number of vector elements. */
13759 int n_elts = XVECLEN (vals, 0);
13760 /* The number of vector elements which are not constant. */
13761 int n_var = 0;
13762 rtx any_const = NULL_RTX;
13763 /* The first element of vals. */
13764 rtx v0 = XVECEXP (vals, 0, 0);
13765 bool all_same = true;
13767 /* Count the number of variable elements to initialise. */
13768 for (int i = 0; i < n_elts; ++i)
13770 rtx x = XVECEXP (vals, 0, i);
13771 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13772 ++n_var;
13773 else
13774 any_const = x;
13776 all_same &= rtx_equal_p (x, v0);
13779 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13780 how best to handle this. */
13781 if (n_var == 0)
13783 rtx constant = aarch64_simd_make_constant (vals);
13784 if (constant != NULL_RTX)
13786 emit_move_insn (target, constant);
13787 return;
13791 /* Splat a single non-constant element if we can. */
13792 if (all_same)
13794 rtx x = copy_to_mode_reg (inner_mode, v0);
13795 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13796 return;
13799 enum insn_code icode = optab_handler (vec_set_optab, mode);
13800 gcc_assert (icode != CODE_FOR_nothing);
13802 /* If there are only variable elements, try to optimize
13803 the insertion using dup for the most common element
13804 followed by insertions. */
13806 /* The algorithm will fill matches[*][0] with the earliest matching element,
13807 and matches[X][1] with the count of duplicate elements (if X is the
13808 earliest element which has duplicates). */
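/* For example (illustrative): for VALS = {x, y, x, x} the loops below
   produce matches[0] = {0, 3}, matches[1] = {1, 1}, matches[2] = {0, 0}
   and matches[3] = {0, 0}, so maxelement == 0 and maxv == 3; x is
   duplicated across the vector and only lane 1 needs a separate
   insertion.  */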
13810 if (n_var == n_elts && n_elts <= 16)
13812 int matches[16][2] = {0};
13813 for (int i = 0; i < n_elts; i++)
13815 for (int j = 0; j <= i; j++)
13817 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13819 matches[i][0] = j;
13820 matches[j][1]++;
13821 break;
13825 int maxelement = 0;
13826 int maxv = 0;
13827 for (int i = 0; i < n_elts; i++)
13828 if (matches[i][1] > maxv)
13830 maxelement = i;
13831 maxv = matches[i][1];
13834 /* Create a duplicate of the most common element, unless all elements
13835 are equally useless to us, in which case just immediately set the
13836 vector register using the first element. */
13838 if (maxv == 1)
13840 /* For vectors of two 64-bit elements, we can do even better. */
13841 if (n_elts == 2
13842 && (inner_mode == E_DImode
13843 || inner_mode == E_DFmode))
13846 rtx x0 = XVECEXP (vals, 0, 0);
13847 rtx x1 = XVECEXP (vals, 0, 1);
13848 /* Combine can pick up this case, but handling it directly
13849 here leaves clearer RTL.
13851 This is load_pair_lanes<mode>, and also gives us a clean-up
13852 for store_pair_lanes<mode>. */
13853 if (memory_operand (x0, inner_mode)
13854 && memory_operand (x1, inner_mode)
13855 && !STRICT_ALIGNMENT
13856 && rtx_equal_p (XEXP (x1, 0),
13857 plus_constant (Pmode,
13858 XEXP (x0, 0),
13859 GET_MODE_SIZE (inner_mode))))
13861 rtx t;
13862 if (inner_mode == DFmode)
13863 t = gen_load_pair_lanesdf (target, x0, x1);
13864 else
13865 t = gen_load_pair_lanesdi (target, x0, x1);
13866 emit_insn (t);
13867 return;
13870 /* The subreg-move sequence below will move into lane zero of the
13871 vector register. For big-endian we want that position to hold
13872 the last element of VALS. */
13873 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13874 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13875 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13877 else
13879 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13880 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13883 /* Insert the rest. */
13884 for (int i = 0; i < n_elts; i++)
13886 rtx x = XVECEXP (vals, 0, i);
13887 if (matches[i][0] == maxelement)
13888 continue;
13889 x = copy_to_mode_reg (inner_mode, x);
13890 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13892 return;
13895 /* Initialise a vector which is part-variable. We want to first try
13896 to build those lanes which are constant in the most efficient way we
13897 can. */
13898 if (n_var != n_elts)
13900 rtx copy = copy_rtx (vals);
13902 /* Load constant part of vector. We really don't care what goes into the
13903 parts we will overwrite, but we're more likely to be able to load the
13904 constant efficiently if it has fewer, larger, repeating parts
13905 (see aarch64_simd_valid_immediate). */
13906 for (int i = 0; i < n_elts; i++)
13908 rtx x = XVECEXP (vals, 0, i);
13909 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13910 continue;
13911 rtx subst = any_const;
13912 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13914 /* Look in the copied vector, as more elements are const. */
13915 rtx test = XVECEXP (copy, 0, i ^ bit);
13916 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13918 subst = test;
13919 break;
13922 XVECEXP (copy, 0, i) = subst;
13924 aarch64_expand_vector_init (target, copy);
13927 /* Insert the variable lanes directly. */
13928 for (int i = 0; i < n_elts; i++)
13930 rtx x = XVECEXP (vals, 0, i);
13931 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13932 continue;
13933 x = copy_to_mode_reg (inner_mode, x);
13934 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13938 static unsigned HOST_WIDE_INT
13939 aarch64_shift_truncation_mask (machine_mode mode)
13941 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13942 return 0;
13943 return GET_MODE_UNIT_BITSIZE (mode) - 1;
13946 /* Select a format to encode pointers in exception handling data. */
13948 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13950 int type;
13951 switch (aarch64_cmodel)
13953 case AARCH64_CMODEL_TINY:
13954 case AARCH64_CMODEL_TINY_PIC:
13955 case AARCH64_CMODEL_SMALL:
13956 case AARCH64_CMODEL_SMALL_PIC:
13957 case AARCH64_CMODEL_SMALL_SPIC:
13958 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13959 for everything. */
13960 type = DW_EH_PE_sdata4;
13961 break;
13962 default:
13963 /* No assumptions here. 8-byte relocs required. */
13964 type = DW_EH_PE_sdata8;
13965 break;
13967 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13970 /* The last .arch and .tune assembly strings that we printed. */
13971 static std::string aarch64_last_printed_arch_string;
13972 static std::string aarch64_last_printed_tune_string;
13974 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
13975 by the function fndecl. */
13977 void
13978 aarch64_declare_function_name (FILE *stream, const char* name,
13979 tree fndecl)
13981 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13983 struct cl_target_option *targ_options;
13984 if (target_parts)
13985 targ_options = TREE_TARGET_OPTION (target_parts);
13986 else
13987 targ_options = TREE_TARGET_OPTION (target_option_current_node);
13988 gcc_assert (targ_options);
13990 const struct processor *this_arch
13991 = aarch64_get_arch (targ_options->x_explicit_arch);
13993 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
13994 std::string extension
13995 = aarch64_get_extension_string_for_isa_flags (isa_flags,
13996 this_arch->flags);
13997 /* Only update the assembler .arch string if it is distinct from the last
13998 such string we printed. */
13999 std::string to_print = this_arch->name + extension;
14000 if (to_print != aarch64_last_printed_arch_string)
14002 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14003 aarch64_last_printed_arch_string = to_print;
14006 /* Print the cpu name we're tuning for in the comments; this might be
14007 useful to readers of the generated asm. Do it only when it changes
14008 from function to function and verbose assembly is requested. */
14009 const struct processor *this_tune
14010 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14012 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14014 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14015 this_tune->name);
14016 aarch64_last_printed_tune_string = this_tune->name;
14019 /* Don't forget the type directive for ELF. */
14020 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14021 ASM_OUTPUT_LABEL (stream, name);
14024 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14026 static void
14027 aarch64_start_file (void)
14029 struct cl_target_option *default_options
14030 = TREE_TARGET_OPTION (target_option_default_node);
14032 const struct processor *default_arch
14033 = aarch64_get_arch (default_options->x_explicit_arch);
14034 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14035 std::string extension
14036 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14037 default_arch->flags);
14039 aarch64_last_printed_arch_string = default_arch->name + extension;
14040 aarch64_last_printed_tune_string = "";
14041 asm_fprintf (asm_out_file, "\t.arch %s\n",
14042 aarch64_last_printed_arch_string.c_str ());
14044 default_file_start ();
14047 /* Emit load exclusive. */
14049 static void
14050 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14051 rtx mem, rtx model_rtx)
14053 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
14056 /* Emit store exclusive. */
14058 static void
14059 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14060 rtx rval, rtx mem, rtx model_rtx)
14062 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
14065 /* Mark the previous jump instruction as unlikely. */
14067 static void
14068 aarch64_emit_unlikely_jump (rtx insn)
14070 rtx_insn *jump = emit_jump_insn (insn);
14071 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14074 /* Expand a compare and swap pattern. */
14076 void
14077 aarch64_expand_compare_and_swap (rtx operands[])
14079 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14080 machine_mode mode, cmp_mode;
14082 bval = operands[0];
14083 rval = operands[1];
14084 mem = operands[2];
14085 oldval = operands[3];
14086 newval = operands[4];
14087 is_weak = operands[5];
14088 mod_s = operands[6];
14089 mod_f = operands[7];
14090 mode = GET_MODE (mem);
14091 cmp_mode = mode;
14093 /* Normally the succ memory model must be stronger than fail, but in the
14094 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14095 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14097 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14098 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14099 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14101 switch (mode)
14103 case E_QImode:
14104 case E_HImode:
14105 /* For short modes, we're going to perform the comparison in SImode,
14106 so do the zero-extension now. */
14107 cmp_mode = SImode;
14108 rval = gen_reg_rtx (SImode);
14109 oldval = convert_modes (SImode, mode, oldval, true);
14110 /* Fall through. */
14112 case E_SImode:
14113 case E_DImode:
14114 /* Force the value into a register if needed. */
14115 if (!aarch64_plus_operand (oldval, mode))
14116 oldval = force_reg (cmp_mode, oldval);
14117 break;
14119 default:
14120 gcc_unreachable ();
14123 if (TARGET_LSE)
14124 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
14125 newval, is_weak, mod_s,
14126 mod_f));
14127 else
14128 emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
14129 is_weak, mod_s, mod_f));
14132 if (mode == QImode || mode == HImode)
14133 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14135 x = gen_rtx_REG (CCmode, CC_REGNUM);
14136 x = gen_rtx_EQ (SImode, x, const0_rtx);
14137 emit_insn (gen_rtx_SET (bval, x));
14140 /* Test whether the target supports using an atomic load-operate instruction.
14141 CODE is the operation and AFTER is TRUE if the data in memory after the
14142 operation should be returned and FALSE if the data before the operation
14143 should be returned. Returns FALSE if the operation isn't supported by the
14144 architecture. */
14146 bool
14147 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14149 if (!TARGET_LSE)
14150 return false;
14152 switch (code)
14154 case SET:
14155 case AND:
14156 case IOR:
14157 case XOR:
14158 case MINUS:
14159 case PLUS:
14160 return true;
14161 default:
14162 return false;
14166 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14167 sequence implementing an atomic operation. */
14169 static void
14170 aarch64_emit_post_barrier (enum memmodel model)
14172 const enum memmodel base_model = memmodel_base (model);
14174 if (is_mm_sync (model)
14175 && (base_model == MEMMODEL_ACQUIRE
14176 || base_model == MEMMODEL_ACQ_REL
14177 || base_model == MEMMODEL_SEQ_CST))
14179 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14183 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14184 for the data in memory. EXPECTED is the value expected to be in memory.
14185 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14186 is the memory ordering to use. */
14188 void
14189 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14190 rtx expected, rtx desired,
14191 rtx model)
14193 machine_mode mode;
14195 mode = GET_MODE (mem);
14197 /* Move the expected value into the CAS destination register. */
14198 emit_insn (gen_rtx_SET (rval, expected));
14200 /* Emit the CAS. */
14201 emit_insn (gen_aarch64_atomic_cas (mode, rval, mem, desired, model));
14203 /* Compare the expected value with the value loaded by the CAS, to establish
14204 whether the swap was made. */
14205 aarch64_gen_compare_reg (EQ, rval, expected);
14208 /* Split a compare and swap pattern. */
14210 void
14211 aarch64_split_compare_and_swap (rtx operands[])
14213 rtx rval, mem, oldval, newval, scratch;
14214 machine_mode mode;
14215 bool is_weak;
14216 rtx_code_label *label1, *label2;
14217 rtx x, cond;
14218 enum memmodel model;
14219 rtx model_rtx;
14221 rval = operands[0];
14222 mem = operands[1];
14223 oldval = operands[2];
14224 newval = operands[3];
14225 is_weak = (operands[4] != const0_rtx);
14226 model_rtx = operands[5];
14227 scratch = operands[7];
14228 mode = GET_MODE (mem);
14229 model = memmodel_from_int (INTVAL (model_rtx));
14231 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14232 loop:
14233 .label1:
14234 LD[A]XR rval, [mem]
14235 CBNZ rval, .label2
14236 ST[L]XR scratch, newval, [mem]
14237 CBNZ scratch, .label1
14238 .label2:
14239 CMP rval, 0. */
14240 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14242 label1 = NULL;
14243 if (!is_weak)
14245 label1 = gen_label_rtx ();
14246 emit_label (label1);
14248 label2 = gen_label_rtx ();
14250 /* The initial load can be relaxed for a __sync operation since a final
14251 barrier will be emitted to stop code hoisting. */
14252 if (is_mm_sync (model))
14253 aarch64_emit_load_exclusive (mode, rval, mem,
14254 GEN_INT (MEMMODEL_RELAXED));
14255 else
14256 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14258 if (strong_zero_p)
14260 if (aarch64_track_speculation)
14262 /* Emit an explicit compare instruction, so that we can correctly
14263 track the condition codes. */
14264 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
14265 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14267 else
14268 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14270 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14271 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14272 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14274 else
14276 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14277 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14278 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14279 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14280 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14283 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14285 if (!is_weak)
14287 if (aarch64_track_speculation)
14289 /* Emit an explicit compare instruction, so that we can correctly
14290 track the condition codes. */
14291 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
14292 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14294 else
14295 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14297 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14298 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14299 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14301 else
14303 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14304 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14305 emit_insn (gen_rtx_SET (cond, x));
14308 emit_label (label2);
14309 /* If we used a CBNZ in the exchange loop, emit an explicit compare with RVAL
14310 to set the condition flags. If this is not used it will be removed by
14311 later passes. */
14312 if (strong_zero_p)
14314 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14315 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14316 emit_insn (gen_rtx_SET (cond, x));
14318 /* Emit any final barrier needed for a __sync operation. */
14319 if (is_mm_sync (model))
14320 aarch64_emit_post_barrier (model);
14323 /* Emit a BIC instruction. */
14325 static void
14326 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14328 rtx shift_rtx = GEN_INT (shift);
14329 rtx (*gen) (rtx, rtx, rtx, rtx);
14331 switch (mode)
14333 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14334 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14335 default:
14336 gcc_unreachable ();
14339 emit_insn (gen (dst, s2, shift_rtx, s1));
14342 /* Emit an atomic swap. */
14344 static void
14345 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14346 rtx mem, rtx model)
14348 emit_insn (gen_aarch64_atomic_swp (mode, dst, mem, value, model));
14351 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14352 location to store the data read from memory. OUT_RESULT is the location to
14353 store the result of the operation. MEM is the memory location to read and
14354 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14355 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14356 be NULL. */
14358 void
14359 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14360 rtx mem, rtx value, rtx model_rtx)
14362 machine_mode mode = GET_MODE (mem);
14363 machine_mode wmode = (mode == DImode ? DImode : SImode);
14364 const bool short_mode = (mode < SImode);
14365 int ldop_code;
14366 rtx src;
14367 rtx x;
14369 if (out_data)
14370 out_data = gen_lowpart (mode, out_data);
14372 if (out_result)
14373 out_result = gen_lowpart (mode, out_result);
14375 /* Make sure the value is in a register, putting it into a destination
14376 register if it needs to be manipulated. */
14377 if (!register_operand (value, mode)
14378 || code == AND || code == MINUS)
14380 src = out_result ? out_result : out_data;
14381 emit_move_insn (src, gen_lowpart (mode, value));
14383 else
14384 src = value;
14385 gcc_assert (register_operand (src, mode));
14387 /* Preprocess the data for the operation as necessary. If the operation is
14388 a SET then emit a swap instruction and finish. */
14389 switch (code)
14391 case SET:
14392 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14393 return;
14395 case MINUS:
14396 /* Negate the value and treat it as a PLUS. */
14398 rtx neg_src;
14400 /* Resize the value if necessary. */
14401 if (short_mode)
14402 src = gen_lowpart (wmode, src);
14404 neg_src = gen_rtx_NEG (wmode, src);
14405 emit_insn (gen_rtx_SET (src, neg_src));
14407 if (short_mode)
14408 src = gen_lowpart (mode, src);
14410 /* Fall-through. */
14411 case PLUS:
14412 ldop_code = UNSPECV_ATOMIC_LDOP_PLUS;
14413 break;
14415 case IOR:
14416 ldop_code = UNSPECV_ATOMIC_LDOP_OR;
14417 break;
14419 case XOR:
14420 ldop_code = UNSPECV_ATOMIC_LDOP_XOR;
14421 break;
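/* Note on the AND case below: the value is complemented first and the
   bit-clear (BIC-style) form of the atomic is used, since clearing the
   bits of ~VALUE in memory is equivalent to ANDing memory with VALUE.  */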
14423 case AND:
14425 rtx not_src;
14427 /* Resize the value if necessary. */
14428 if (short_mode)
14429 src = gen_lowpart (wmode, src);
14431 not_src = gen_rtx_NOT (wmode, src);
14432 emit_insn (gen_rtx_SET (src, not_src));
14434 if (short_mode)
14435 src = gen_lowpart (mode, src);
14437 ldop_code = UNSPECV_ATOMIC_LDOP_BIC;
14438 break;
14440 default:
14441 /* The operation can't be done with atomic instructions. */
14442 gcc_unreachable ();
14445 emit_insn (gen_aarch64_atomic_load (ldop_code, mode,
14446 out_data, mem, src, model_rtx));
14448 /* If necessary, calculate the data in memory after the update by redoing the
14449 operation from values in registers. */
14450 if (!out_result)
14451 return;
14453 if (short_mode)
14455 src = gen_lowpart (wmode, src);
14456 out_data = gen_lowpart (wmode, out_data);
14457 out_result = gen_lowpart (wmode, out_result);
14460 x = NULL_RTX;
14462 switch (code)
14464 case MINUS:
14465 case PLUS:
14466 x = gen_rtx_PLUS (wmode, out_data, src);
14467 break;
14468 case IOR:
14469 x = gen_rtx_IOR (wmode, out_data, src);
14470 break;
14471 case XOR:
14472 x = gen_rtx_XOR (wmode, out_data, src);
14473 break;
14474 case AND:
14475 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14476 return;
14477 default:
14478 gcc_unreachable ();
14481 emit_set_insn (out_result, x);
14483 return;
14486 /* Split an atomic operation. */
14488 void
14489 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14490 rtx value, rtx model_rtx, rtx cond)
14492 machine_mode mode = GET_MODE (mem);
14493 machine_mode wmode = (mode == DImode ? DImode : SImode);
14494 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14495 const bool is_sync = is_mm_sync (model);
14496 rtx_code_label *label;
14497 rtx x;
14499 /* Split the atomic operation into a sequence. */
14500 label = gen_label_rtx ();
14501 emit_label (label);
14503 if (new_out)
14504 new_out = gen_lowpart (wmode, new_out);
14505 if (old_out)
14506 old_out = gen_lowpart (wmode, old_out);
14507 else
14508 old_out = new_out;
14509 value = simplify_gen_subreg (wmode, value, mode, 0);
14511 /* The initial load can be relaxed for a __sync operation since a final
14512 barrier will be emitted to stop code hoisting. */
14513 if (is_sync)
14514 aarch64_emit_load_exclusive (mode, old_out, mem,
14515 GEN_INT (MEMMODEL_RELAXED));
14516 else
14517 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14519 switch (code)
14521 case SET:
14522 new_out = value;
14523 break;
14525 case NOT:
14526 x = gen_rtx_AND (wmode, old_out, value);
14527 emit_insn (gen_rtx_SET (new_out, x));
14528 x = gen_rtx_NOT (wmode, new_out);
14529 emit_insn (gen_rtx_SET (new_out, x));
14530 break;
14532 case MINUS:
14533 if (CONST_INT_P (value))
14535 value = GEN_INT (-INTVAL (value));
14536 code = PLUS;
14538 /* Fall through. */
14540 default:
14541 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14542 emit_insn (gen_rtx_SET (new_out, x));
14543 break;
14546 aarch64_emit_store_exclusive (mode, cond, mem,
14547 gen_lowpart (mode, new_out), model_rtx);
14549 if (aarch64_track_speculation)
14551 /* Emit an explicit compare instruction, so that we can correctly
14552 track the condition codes. */
14553 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
14554 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14556 else
14557 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14559 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14560 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14561 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14563 /* Emit any final barrier needed for a __sync operation. */
14564 if (is_sync)
14565 aarch64_emit_post_barrier (model);
14568 static void
14569 aarch64_init_libfuncs (void)
14571 /* Half-precision float operations. The compiler handles all operations
14572 with NULL libfuncs by converting to SFmode. */
14574 /* Conversions. */
14575 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14576 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14578 /* Arithmetic. */
14579 set_optab_libfunc (add_optab, HFmode, NULL);
14580 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14581 set_optab_libfunc (smul_optab, HFmode, NULL);
14582 set_optab_libfunc (neg_optab, HFmode, NULL);
14583 set_optab_libfunc (sub_optab, HFmode, NULL);
14585 /* Comparisons. */
14586 set_optab_libfunc (eq_optab, HFmode, NULL);
14587 set_optab_libfunc (ne_optab, HFmode, NULL);
14588 set_optab_libfunc (lt_optab, HFmode, NULL);
14589 set_optab_libfunc (le_optab, HFmode, NULL);
14590 set_optab_libfunc (ge_optab, HFmode, NULL);
14591 set_optab_libfunc (gt_optab, HFmode, NULL);
14592 set_optab_libfunc (unord_optab, HFmode, NULL);
14595 /* Target hook for c_mode_for_suffix. */
14596 static machine_mode
14597 aarch64_c_mode_for_suffix (char suffix)
14599 if (suffix == 'q')
14600 return TFmode;
14602 return VOIDmode;
14605 /* We can only represent floating point constants which will fit in
14606 "quarter-precision" values. These values are characterised by
14607 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14610 (-1)^s * (n/16) * 2^r
14612 Where:
14613 's' is the sign bit.
14614 'n' is an integer in the range 16 <= n <= 31.
14615 'r' is an integer in the range -3 <= r <= 4. */
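/* Worked examples (for illustration): 0.25 == (16/16) * 2^-2 with s == 0,
   and 31.0 == (31/16) * 2^4 is the largest representable magnitude; the
   smallest non-zero magnitude is (16/16) * 2^-3 == 0.125.  */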
14617 /* Return true iff X can be represented by a quarter-precision
14618 floating point immediate operand. Note, we cannot represent 0.0. */
14619 bool
14620 aarch64_float_const_representable_p (rtx x)
14622 /* This represents our current view of how many bits
14623 make up the mantissa. */
14624 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14625 int exponent;
14626 unsigned HOST_WIDE_INT mantissa, mask;
14627 REAL_VALUE_TYPE r, m;
14628 bool fail;
14630 if (!CONST_DOUBLE_P (x))
14631 return false;
14633 if (GET_MODE (x) == VOIDmode
14634 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
14635 return false;
14637 r = *CONST_DOUBLE_REAL_VALUE (x);
14639 /* We cannot represent infinities, NaNs or +/-zero. We won't
14640 know if we have +zero until we analyse the mantissa, but we
14641 can reject the other invalid values. */
14642 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14643 || REAL_VALUE_MINUS_ZERO (r))
14644 return false;
14646 /* Extract exponent. */
14647 r = real_value_abs (&r);
14648 exponent = REAL_EXP (&r);
14650 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14651 highest (sign) bit, with a fixed binary point at bit point_pos.
14652 the low and high halves of W hold the low and high parts of the mantissa.
14653 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14654 bits for the mantissa, this can fail (low bits will be lost). */
14655 real_ldexp (&m, &r, point_pos - exponent);
14656 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14658 /* If the low part of the mantissa has bits set we cannot represent
14659 the value. */
14660 if (w.ulow () != 0)
14661 return false;
14662 /* We have rejected the lower HOST_WIDE_INT, so update our
14663 understanding of how many bits lie in the mantissa and
14664 look only at the high HOST_WIDE_INT. */
14665 mantissa = w.elt (1);
14666 point_pos -= HOST_BITS_PER_WIDE_INT;
14668 /* We can only represent values with a mantissa of the form 1.xxxx. */
14669 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14670 if ((mantissa & mask) != 0)
14671 return false;
14673 /* Having filtered unrepresentable values, we may now remove all
14674 but the highest 5 bits. */
14675 mantissa >>= point_pos - 5;
14677 /* We cannot represent the value 0.0, so reject it. This is handled
14678 elsewhere. */
14679 if (mantissa == 0)
14680 return false;
14682 /* Then, as bit 4 is always set, we can mask it off, leaving
14683 the mantissa in the range [0, 15]. */
14684 mantissa &= ~(1 << 4);
14685 gcc_assert (mantissa <= 15);
14687 /* GCC internally does not use IEEE754-like encoding (where normalized
14688 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14689 Our mantissa values are shifted 4 places to the left relative to
14690 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14691 by 5 places to correct for GCC's representation. */
14692 exponent = 5 - exponent;
14694 return (exponent >= 0 && exponent <= 7);
14697 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14698 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14699 output MOVI/MVNI, ORR or BIC immediate. */
14700 char*
14701 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14702 enum simd_immediate_check which)
14704 bool is_valid;
14705 static char templ[40];
14706 const char *mnemonic;
14707 const char *shift_op;
14708 unsigned int lane_count = 0;
14709 char element_char;
14711 struct simd_immediate_info info;
14713 /* This will return true to show const_vector is legal for use as either
14714 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14715 It will also update INFO to show how the immediate should be generated.
14716 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14717 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14718 gcc_assert (is_valid);
14720 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14721 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14723 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14725 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14726 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14727 move immediate path. */
14728 if (aarch64_float_const_zero_rtx_p (info.value))
14729 info.value = GEN_INT (0);
14730 else
14732 const unsigned int buf_size = 20;
14733 char float_buf[buf_size] = {'\0'};
14734 real_to_decimal_for_mode (float_buf,
14735 CONST_DOUBLE_REAL_VALUE (info.value),
14736 buf_size, buf_size, 1, info.elt_mode);
14738 if (lane_count == 1)
14739 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14740 else
14741 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14742 lane_count, element_char, float_buf);
14743 return templ;
14747 gcc_assert (CONST_INT_P (info.value));
14749 if (which == AARCH64_CHECK_MOV)
14751 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14752 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14753 if (lane_count == 1)
14754 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14755 mnemonic, UINTVAL (info.value));
14756 else if (info.shift)
14757 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14758 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14759 element_char, UINTVAL (info.value), shift_op, info.shift);
14760 else
14761 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14762 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14763 element_char, UINTVAL (info.value));
14765 else
14767 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14768 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14769 if (info.shift)
14770 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14771 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14772 element_char, UINTVAL (info.value), "lsl", info.shift);
14773 else
14774 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14775 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14776 element_char, UINTVAL (info.value));
14778 return templ;
14781 char*
14782 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14785 /* If a floating point number was passed and we desire to use it in an
14786 integer mode, do the conversion to integer. */
14787 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14789 unsigned HOST_WIDE_INT ival;
14790 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14791 gcc_unreachable ();
14792 immediate = gen_int_mode (ival, mode);
14795 machine_mode vmode;
14796 /* Use a 64-bit mode for everything except DI/DF mode, where we use
14797 a 128-bit vector mode. */
14798 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14800 vmode = aarch64_simd_container_mode (mode, width);
14801 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14802 return aarch64_output_simd_mov_immediate (v_op, width);
14805 /* Return the output string to use for moving immediate CONST_VECTOR
14806 into an SVE register. */
14808 char *
14809 aarch64_output_sve_mov_immediate (rtx const_vector)
14811 static char templ[40];
14812 struct simd_immediate_info info;
14813 char element_char;
14815 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14816 gcc_assert (is_valid);
14818 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14820 if (info.step)
14822 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14823 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14824 element_char, INTVAL (info.value), INTVAL (info.step));
14825 return templ;
14828 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14830 if (aarch64_float_const_zero_rtx_p (info.value))
14831 info.value = GEN_INT (0);
14832 else
14834 const int buf_size = 20;
14835 char float_buf[buf_size] = {};
14836 real_to_decimal_for_mode (float_buf,
14837 CONST_DOUBLE_REAL_VALUE (info.value),
14838 buf_size, buf_size, 1, info.elt_mode);
14840 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14841 element_char, float_buf);
14842 return templ;
14846 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14847 element_char, INTVAL (info.value));
14848 return templ;
14851 /* Return the asm format for a PTRUE instruction whose destination has
14852 mode MODE. SUFFIX is the element size suffix. */
14854 char *
14855 aarch64_output_ptrue (machine_mode mode, char suffix)
14857 unsigned int nunits;
14858 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14859 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
14860 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
14861 else
14862 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
14863 return buf;
14866 /* Split operands into moves from op[1] + op[2] into op[0]. */
14868 void
14869 aarch64_split_combinev16qi (rtx operands[3])
14871 unsigned int dest = REGNO (operands[0]);
14872 unsigned int src1 = REGNO (operands[1]);
14873 unsigned int src2 = REGNO (operands[2]);
14874 machine_mode halfmode = GET_MODE (operands[1]);
14875 unsigned int halfregs = REG_NREGS (operands[1]);
14876 rtx destlo, desthi;
14878 gcc_assert (halfmode == V16QImode);
14880 if (src1 == dest && src2 == dest + halfregs)
14882 /* No-op move. Can't split to nothing; emit something. */
14883 emit_note (NOTE_INSN_DELETED);
14884 return;
14887 /* Preserve register attributes for variable tracking. */
14888 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
14889 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
14890 GET_MODE_SIZE (halfmode));
14892 /* Special case of reversed high/low parts. */
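/* The three XORs below are the classic in-place register swap: after them
   operands[1] and operands[2] have exchanged contents without needing a
   scratch register.  */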
14893 if (reg_overlap_mentioned_p (operands[2], destlo)
14894 && reg_overlap_mentioned_p (operands[1], desthi))
14896 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14897 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
14898 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14900 else if (!reg_overlap_mentioned_p (operands[2], destlo))
14902 /* Try to avoid unnecessary moves if part of the result
14903 is in the right place already. */
14904 if (src1 != dest)
14905 emit_move_insn (destlo, operands[1]);
14906 if (src2 != dest + halfregs)
14907 emit_move_insn (desthi, operands[2]);
14909 else
14911 if (src2 != dest + halfregs)
14912 emit_move_insn (desthi, operands[2]);
14913 if (src1 != dest)
14914 emit_move_insn (destlo, operands[1]);
14918 /* vec_perm support. */
14920 struct expand_vec_perm_d
14922 rtx target, op0, op1;
14923 vec_perm_indices perm;
14924 machine_mode vmode;
14925 unsigned int vec_flags;
14926 bool one_vector_p;
14927 bool testing_p;
14930 /* Generate a variable permutation. */
14932 static void
14933 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
14935 machine_mode vmode = GET_MODE (target);
14936 bool one_vector_p = rtx_equal_p (op0, op1);
14938 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
14939 gcc_checking_assert (GET_MODE (op0) == vmode);
14940 gcc_checking_assert (GET_MODE (op1) == vmode);
14941 gcc_checking_assert (GET_MODE (sel) == vmode);
14942 gcc_checking_assert (TARGET_SIMD);
14944 if (one_vector_p)
14946 if (vmode == V8QImode)
14948 /* Expand the argument to a V16QI mode by duplicating it. */
14949 rtx pair = gen_reg_rtx (V16QImode);
14950 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
14951 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14953 else
14955 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
14958 else
14960 rtx pair;
14962 if (vmode == V8QImode)
14964 pair = gen_reg_rtx (V16QImode);
14965 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
14966 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14968 else
14970 pair = gen_reg_rtx (OImode);
14971 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
14972 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
14977 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
14978 NELT is the number of elements in the vector. */
14980 void
14981 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
14982 unsigned int nelt)
14984 machine_mode vmode = GET_MODE (target);
14985 bool one_vector_p = rtx_equal_p (op0, op1);
14986 rtx mask;
14988 /* The TBL instruction does not use a modulo index, so we must take care
14989 of that ourselves. */
14990 mask = aarch64_simd_gen_const_vector_dup (vmode,
14991 one_vector_p ? nelt - 1 : 2 * nelt - 1);
14992 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
14994 /* For big-endian, we also need to reverse the index within the vector
14995 (but not which vector). */
14996 if (BYTES_BIG_ENDIAN)
14998 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
14999 if (!one_vector_p)
15000 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15001 sel = expand_simple_binop (vmode, XOR, sel, mask,
15002 NULL, 0, OPTAB_LIB_WIDEN);
15004 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15007 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15009 static void
15010 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15012 emit_insn (gen_rtx_SET (target,
15013 gen_rtx_UNSPEC (GET_MODE (target),
15014 gen_rtvec (2, op0, op1), code)));
15017 /* Expand an SVE vec_perm with the given operands. */
15019 void
15020 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15022 machine_mode data_mode = GET_MODE (target);
15023 machine_mode sel_mode = GET_MODE (sel);
15024 /* Enforced by the pattern condition. */
15025 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15027 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15028 size of the two value vectors, i.e. the upper bits of the indices
15029 are effectively ignored. SVE TBL instead produces 0 for any
15030 out-of-range indices, so we need to modulo all the vec_perm indices
15031 to ensure they are all in range. */
15032 rtx sel_reg = force_reg (sel_mode, sel);
15034 /* Check if the sel only references the first values vector. */
15035 if (GET_CODE (sel) == CONST_VECTOR
15036 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15038 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15039 return;
15042 /* Check if the two values vectors are the same. */
15043 if (rtx_equal_p (op0, op1))
15045 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15046 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15047 NULL, 0, OPTAB_DIRECT);
15048 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15049 return;
15052 /* Run TBL on each value vector and combine the results. */
15054 rtx res0 = gen_reg_rtx (data_mode);
15055 rtx res1 = gen_reg_rtx (data_mode);
15056 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15057 if (GET_CODE (sel) != CONST_VECTOR
15058 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15060 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15061 2 * nunits - 1);
15062 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15063 NULL, 0, OPTAB_DIRECT);
15065 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15066 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15067 NULL, 0, OPTAB_DIRECT);
15068 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15069 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15070 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15071 else
15072 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15075 /* Recognize patterns suitable for the TRN instructions. */
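/* For illustration, assuming the usual TRN element numbering: on a V4SI
   pair {a0 a1 a2 a3} / {b0 b1 b2 b3}, TRN1 corresponds to the selector
   {0, 4, 2, 6} (giving {a0 b0 a2 b2}) and TRN2 to {1, 5, 3, 7}
   (giving {a1 b1 a3 b3}), which is the series checked for below.  */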
15076 static bool
15077 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15079 HOST_WIDE_INT odd;
15080 poly_uint64 nelt = d->perm.length ();
15081 rtx out, in0, in1, x;
15082 machine_mode vmode = d->vmode;
15084 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15085 return false;
15087 /* Note that these are little-endian tests.
15088 We correct for big-endian later. */
15089 if (!d->perm[0].is_constant (&odd)
15090 || (odd != 0 && odd != 1)
15091 || !d->perm.series_p (0, 2, odd, 2)
15092 || !d->perm.series_p (1, 2, nelt + odd, 2))
15093 return false;
15095 /* Success! */
15096 if (d->testing_p)
15097 return true;
15099 in0 = d->op0;
15100 in1 = d->op1;
15101 /* We don't need a big-endian lane correction for SVE; see the comment
15102 at the head of aarch64-sve.md for details. */
15103 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15105 x = in0, in0 = in1, in1 = x;
15106 odd = !odd;
15108 out = d->target;
15110 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15111 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15112 return true;
15115 /* Recognize patterns suitable for the UZP instructions. */
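/* For illustration: on a V4SI pair, UZP1 corresponds to the selector
   {0, 2, 4, 6} (the even-numbered elements of the concatenation) and
   UZP2 to {1, 3, 5, 7}, i.e. a series starting at ODD with step 2.  */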
15116 static bool
15117 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15119 HOST_WIDE_INT odd;
15120 rtx out, in0, in1, x;
15121 machine_mode vmode = d->vmode;
15123 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15124 return false;
15126 /* Note that these are little-endian tests.
15127 We correct for big-endian later. */
15128 if (!d->perm[0].is_constant (&odd)
15129 || (odd != 0 && odd != 1)
15130 || !d->perm.series_p (0, 1, odd, 2))
15131 return false;
15133 /* Success! */
15134 if (d->testing_p)
15135 return true;
15137 in0 = d->op0;
15138 in1 = d->op1;
15139 /* We don't need a big-endian lane correction for SVE; see the comment
15140 at the head of aarch64-sve.md for details. */
15141 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15143 x = in0, in0 = in1, in1 = x;
15144 odd = !odd;
15146 out = d->target;
15148 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15149 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15150 return true;
15153 /* Recognize patterns suitable for the ZIP instructions. */
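/* For illustration: on a V4SI pair, ZIP1 interleaves the low halves
   (selector {0, 4, 1, 5}) and ZIP2 the high halves (selector {2, 6, 3, 7});
   FIRST below is therefore either 0 or nelt / 2.  */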
15154 static bool
15155 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15157 unsigned int high;
15158 poly_uint64 nelt = d->perm.length ();
15159 rtx out, in0, in1, x;
15160 machine_mode vmode = d->vmode;
15162 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15163 return false;
15165 /* Note that these are little-endian tests.
15166 We correct for big-endian later. */
15167 poly_uint64 first = d->perm[0];
15168 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15169 || !d->perm.series_p (0, 2, first, 1)
15170 || !d->perm.series_p (1, 2, first + nelt, 1))
15171 return false;
15172 high = maybe_ne (first, 0U);
15174 /* Success! */
15175 if (d->testing_p)
15176 return true;
15178 in0 = d->op0;
15179 in1 = d->op1;
15180 /* We don't need a big-endian lane correction for SVE; see the comment
15181 at the head of aarch64-sve.md for details. */
15182 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15184 x = in0, in0 = in1, in1 = x;
15185 high = !high;
15187 out = d->target;
15189 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15190 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15191 return true;
15194 /* Recognize patterns for the EXT insn. */
15196 static bool
15197 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15199 HOST_WIDE_INT location;
15200 rtx offset;
15202 /* The first element always refers to the first vector.
15203 Check if the extracted indices are increasing by one. */
15204 if (d->vec_flags == VEC_SVE_PRED
15205 || !d->perm[0].is_constant (&location)
15206 || !d->perm.series_p (0, 1, location, 1))
15207 return false;
15209 /* Success! */
15210 if (d->testing_p)
15211 return true;
15213 /* The case where (location == 0) is a no-op for both big- and little-endian,
15214 and is removed by the mid-end at optimization levels -O1 and higher.
15216 We don't need a big-endian lane correction for SVE; see the comment
15217 at the head of aarch64-sve.md for details. */
15218 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15220 /* After setup, we want the high elements of the first vector (stored
15221 at the LSB end of the register), and the low elements of the second
15222 vector (stored at the MSB end of the register). So swap. */
15223 std::swap (d->op0, d->op1);
15224 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15225 to_constant () is safe since this is restricted to Advanced SIMD
15226 vectors. */
15227 location = d->perm.length ().to_constant () - location;
15230 offset = GEN_INT (location);
15231 emit_set_insn (d->target,
15232 gen_rtx_UNSPEC (d->vmode,
15233 gen_rtvec (3, d->op0, d->op1, offset),
15234 UNSPEC_EXT));
15235 return true;
15238 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15239 within each 64-bit, 32-bit or 16-bit granule. */
15241 static bool
15242 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15244 HOST_WIDE_INT diff;
15245 unsigned int i, size, unspec;
15246 machine_mode pred_mode;
15248 if (d->vec_flags == VEC_SVE_PRED
15249 || !d->one_vector_p
15250 || !d->perm[0].is_constant (&diff))
15251 return false;
15253 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15254 if (size == 8)
15256 unspec = UNSPEC_REV64;
15257 pred_mode = VNx2BImode;
15259 else if (size == 4)
15261 unspec = UNSPEC_REV32;
15262 pred_mode = VNx4BImode;
15264 else if (size == 2)
15266 unspec = UNSPEC_REV16;
15267 pred_mode = VNx8BImode;
15269 else
15270 return false;
15272 unsigned int step = diff + 1;
15273 for (i = 0; i < step; ++i)
15274 if (!d->perm.series_p (i, step, diff - i, step))
15275 return false;
15277 /* Success! */
15278 if (d->testing_p)
15279 return true;
15281 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15282 if (d->vec_flags == VEC_SVE_DATA)
15284 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15285 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15286 UNSPEC_MERGE_PTRUE);
15288 emit_set_insn (d->target, src);
15289 return true;
15292 /* Recognize patterns for the REV insn, which reverses elements within
15293 a full vector. */
15295 static bool
15296 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15298 poly_uint64 nelt = d->perm.length ();
15300 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15301 return false;
15303 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15304 return false;
15306 /* Success! */
15307 if (d->testing_p)
15308 return true;
15310 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15311 emit_set_insn (d->target, src);
15312 return true;
15315 static bool
15316 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15318 rtx out = d->target;
15319 rtx in0;
15320 HOST_WIDE_INT elt;
15321 machine_mode vmode = d->vmode;
15322 rtx lane;
15324 if (d->vec_flags == VEC_SVE_PRED
15325 || d->perm.encoding ().encoded_nelts () != 1
15326 || !d->perm[0].is_constant (&elt))
15327 return false;
15329 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15330 return false;
15332 /* Success! */
15333 if (d->testing_p)
15334 return true;
15336 /* The generic preparation in aarch64_expand_vec_perm_const_1
15337 swaps the operand order and the permute indices if it finds
15338 d->perm[0] to be in the second operand. Thus, we can always
15339 use d->op0 and need not do any extra arithmetic to get the
15340 correct lane number. */
15341 in0 = d->op0;
15342 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15344 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15345 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15346 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15347 return true;
15350 static bool
15351 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15353 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15354 machine_mode vmode = d->vmode;
15356 /* Make sure that the indices are constant. */
15357 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15358 for (unsigned int i = 0; i < encoded_nelts; ++i)
15359 if (!d->perm[i].is_constant ())
15360 return false;
15362 if (d->testing_p)
15363 return true;
15365 /* Generic code will try constant permutation twice. Once with the
15366 original mode and again with the elements lowered to QImode.
15367 So wait and don't do the selector expansion ourselves. */
15368 if (vmode != V8QImode && vmode != V16QImode)
15369 return false;
15371 /* to_constant is safe since this routine is specific to Advanced SIMD
15372 vectors. */
15373 unsigned int nelt = d->perm.length ().to_constant ();
15374 for (unsigned int i = 0; i < nelt; ++i)
15375 /* If big-endian and two vectors we end up with a weird mixed-endian
15376 mode on NEON. Reverse the index within each word but not the word
15377 itself. to_constant is safe because we checked is_constant above. */
15378 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15379 ? d->perm[i].to_constant () ^ (nelt - 1)
15380 : d->perm[i].to_constant ());
15382 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15383 sel = force_reg (vmode, sel);
15385 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15386 return true;
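/* Illustrative sketch only (hypothetical helper): the big-endian index
   correction used in aarch64_evpc_tbl above.  Since nelt is a power of
   two, XORing with nelt - 1 mirrors an index within its input vector
   while leaving the choice of input unchanged; e.g. for V16QI
   (nelt == 16) index 0 becomes 15 and index 16 becomes 31.  */
static unsigned int
aarch64_sketch_tbl_index (unsigned int idx, unsigned int nelt,
                          int big_endian_p)
{
  /* Mirror within each 16-byte register half of the TBL selector.  */
  return big_endian_p ? idx ^ (nelt - 1) : idx;
}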
15389 /* Try to implement D using an SVE TBL instruction. */
15391 static bool
15392 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15394 unsigned HOST_WIDE_INT nelt;
15396 /* Permuting two variable-length vectors could overflow the
15397 index range. */
15398 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15399 return false;
15401 if (d->testing_p)
15402 return true;
15404 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15405 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15406 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15407 return true;
15410 static bool
15411 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15413 /* The pattern matching functions above are written to look for a small
15414 number to begin the sequence (0, 1, N/2). If we begin with an index
15415 from the second operand, we can swap the operands. */
15416 poly_int64 nelt = d->perm.length ();
15417 if (known_ge (d->perm[0], nelt))
15419 d->perm.rotate_inputs (1);
15420 std::swap (d->op0, d->op1);
15423 if ((d->vec_flags == VEC_ADVSIMD
15424 || d->vec_flags == VEC_SVE_DATA
15425 || d->vec_flags == VEC_SVE_PRED)
15426 && known_gt (nelt, 1))
15428 if (aarch64_evpc_rev_local (d))
15429 return true;
15430 else if (aarch64_evpc_rev_global (d))
15431 return true;
15432 else if (aarch64_evpc_ext (d))
15433 return true;
15434 else if (aarch64_evpc_dup (d))
15435 return true;
15436 else if (aarch64_evpc_zip (d))
15437 return true;
15438 else if (aarch64_evpc_uzp (d))
15439 return true;
15440 else if (aarch64_evpc_trn (d))
15441 return true;
15442 if (d->vec_flags == VEC_SVE_DATA)
15443 return aarch64_evpc_sve_tbl (d);
15444 else if (d->vec_flags == VEC_ADVSIMD)
15445 return aarch64_evpc_tbl (d);
15447 return false;
15450 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15452 static bool
15453 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15454 rtx op1, const vec_perm_indices &sel)
15456 struct expand_vec_perm_d d;
15458 /* Check whether the mask can be applied to a single vector. */
15459 if (op0 && rtx_equal_p (op0, op1))
15460 d.one_vector_p = true;
15461 else if (sel.all_from_input_p (0))
15463 d.one_vector_p = true;
15464 op1 = op0;
15466 else if (sel.all_from_input_p (1))
15468 d.one_vector_p = true;
15469 op0 = op1;
15471 else
15472 d.one_vector_p = false;
15474 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15475 sel.nelts_per_input ());
15476 d.vmode = vmode;
15477 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15478 d.target = target;
15479 d.op0 = op0;
15480 d.op1 = op1;
15481 d.testing_p = !target;
15483 if (!d.testing_p)
15484 return aarch64_expand_vec_perm_const_1 (&d);
15486 rtx_insn *last = get_last_insn ();
15487 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15488 gcc_assert (last == get_last_insn ());
15490 return ret;
15493 /* Generate a byte permute mask for a register of mode MODE,
15494 which has NUNITS units. */
15497 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15499 /* We have to reverse each vector because we don't have
15500 a permuted load that can reverse-load according to ABI rules. */
15501 rtx mask;
15502 rtvec v = rtvec_alloc (16);
15503 unsigned int i, j;
15504 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15506 gcc_assert (BYTES_BIG_ENDIAN);
15507 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15509 for (i = 0; i < nunits; i++)
15510 for (j = 0; j < usize; j++)
15511 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15512 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15513 return force_reg (V16QImode, mask);
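/* Illustrative sketch only (hypothetical helper): the byte pattern that
   aarch64_reverse_mask above builds.  For V8HImode (nunits == 8, unit
   size 2) the mask bytes come out as 1, 0, 3, 2, 5, 4, ..., i.e. the
   bytes within each element are mirrored.  */
static void
aarch64_sketch_reverse_mask_bytes (unsigned int nunits, unsigned int usize,
                                   unsigned char *out)
{
  for (unsigned int i = 0; i < nunits; i++)
    for (unsigned int j = 0; j < usize; j++)
      /* Byte j of element i selects the mirrored byte of that element.  */
      out[i * usize + j] = (i + 1) * usize - 1 - j;
}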
15516 /* Return true if X is a valid second operand for the SVE instruction
15517 that implements integer comparison OP_CODE. */
15519 static bool
15520 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15522 if (register_operand (x, VOIDmode))
15523 return true;
15525 switch (op_code)
15527 case LTU:
15528 case LEU:
15529 case GEU:
15530 case GTU:
15531 return aarch64_sve_cmp_immediate_p (x, false);
15532 case LT:
15533 case LE:
15534 case GE:
15535 case GT:
15536 case NE:
15537 case EQ:
15538 return aarch64_sve_cmp_immediate_p (x, true);
15539 default:
15540 gcc_unreachable ();
15544 /* Use predicated SVE instructions to implement the equivalent of:
15546 (set TARGET OP)
15548 given that PTRUE is an all-true predicate of the appropriate mode. */
15550 static void
15551 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15553 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15554 gen_rtvec (2, ptrue, op),
15555 UNSPEC_MERGE_PTRUE);
15556 rtx_insn *insn = emit_set_insn (target, unspec);
15557 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15560 /* Likewise, but also clobber the condition codes. */
15562 static void
15563 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15565 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15566 gen_rtvec (2, ptrue, op),
15567 UNSPEC_MERGE_PTRUE);
15568 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15569 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15572 /* Return the UNSPEC_COND_* code for comparison CODE. */
15574 static unsigned int
15575 aarch64_unspec_cond_code (rtx_code code)
15577 switch (code)
15579 case NE:
15580 return UNSPEC_COND_NE;
15581 case EQ:
15582 return UNSPEC_COND_EQ;
15583 case LT:
15584 return UNSPEC_COND_LT;
15585 case GT:
15586 return UNSPEC_COND_GT;
15587 case LE:
15588 return UNSPEC_COND_LE;
15589 case GE:
15590 return UNSPEC_COND_GE;
15591 default:
15592 gcc_unreachable ();
15596 /* Emit:
15598 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15600 where <X> is the operation associated with comparison CODE. This form
15601 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15602 semantics, such as when PRED might not be all-true and when comparing
15603 inactive lanes could have side effects. */
15605 static void
15606 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15607 rtx pred, rtx op0, rtx op1)
15609 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15610 gen_rtvec (3, pred, op0, op1),
15611 aarch64_unspec_cond_code (code));
15612 emit_set_insn (target, unspec);
15615 /* Expand an SVE integer comparison using the SVE equivalent of:
15617 (set TARGET (CODE OP0 OP1)). */
15619 void
15620 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15622 machine_mode pred_mode = GET_MODE (target);
15623 machine_mode data_mode = GET_MODE (op0);
15625 if (!aarch64_sve_cmp_operand_p (code, op1))
15626 op1 = force_reg (data_mode, op1);
15628 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15629 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15630 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15633 /* Emit the SVE equivalent of:
15635 (set TMP1 (CODE1 OP0 OP1))
15636 (set TMP2 (CODE2 OP0 OP1))
15637 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15639 PTRUE is an all-true predicate with the same mode as TARGET. */
15641 static void
15642 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15643 rtx ptrue, rtx op0, rtx op1)
15645 machine_mode pred_mode = GET_MODE (ptrue);
15646 rtx tmp1 = gen_reg_rtx (pred_mode);
15647 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15648 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15649 rtx tmp2 = gen_reg_rtx (pred_mode);
15650 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15651 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15652 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15655 /* Emit the SVE equivalent of:
15657 (set TMP (CODE OP0 OP1))
15658 (set TARGET (not TMP))
15660 PTRUE is an all-true predicate with the same mode as TARGET. */
15662 static void
15663 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15664 rtx op0, rtx op1)
15666 machine_mode pred_mode = GET_MODE (ptrue);
15667 rtx tmp = gen_reg_rtx (pred_mode);
15668 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15669 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15670 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15673 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15675 (set TARGET (CODE OP0 OP1))
15677 If CAN_INVERT_P is true, the caller can also handle inverted results;
15678 return true if the result is in fact inverted. */
15680 bool
15681 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15682 rtx op0, rtx op1, bool can_invert_p)
15684 machine_mode pred_mode = GET_MODE (target);
15685 machine_mode data_mode = GET_MODE (op0);
15687 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15688 switch (code)
15690 case UNORDERED:
15691 /* UNORDERED has no immediate form. */
15692 op1 = force_reg (data_mode, op1);
15693 /* fall through */
15694 case LT:
15695 case LE:
15696 case GT:
15697 case GE:
15698 case EQ:
15699 case NE:
15701 /* There is native support for the comparison. */
15702 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15703 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15704 return false;
15707 case LTGT:
15708 /* This is a trapping operation (LT or GT). */
15709 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15710 return false;
15712 case UNEQ:
15713 if (!flag_trapping_math)
15715 /* This would trap for signaling NaNs. */
15716 op1 = force_reg (data_mode, op1);
15717 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15718 return false;
15720 /* fall through */
15721 case UNLT:
15722 case UNLE:
15723 case UNGT:
15724 case UNGE:
15725 if (flag_trapping_math)
15727 /* Work out which elements are ordered. */
15728 rtx ordered = gen_reg_rtx (pred_mode);
15729 op1 = force_reg (data_mode, op1);
15730 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15732 /* Test the opposite condition for the ordered elements,
15733 then invert the result. */
15734 if (code == UNEQ)
15735 code = NE;
15736 else
15737 code = reverse_condition_maybe_unordered (code);
15738 if (can_invert_p)
15740 aarch64_emit_sve_predicated_cond (target, code,
15741 ordered, op0, op1);
15742 return true;
15744 rtx tmp = gen_reg_rtx (pred_mode);
15745 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15746 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15747 return false;
15749 break;
15751 case ORDERED:
15752 /* ORDERED has no immediate form. */
15753 op1 = force_reg (data_mode, op1);
15754 break;
15756 default:
15757 gcc_unreachable ();
15760 /* There is native support for the inverse comparison. */
15761 code = reverse_condition_maybe_unordered (code);
15762 if (can_invert_p)
15764 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15765 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15766 return true;
15768 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15769 return false;
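/* Illustrative sketch only (scalar C, hypothetical helper): the
   trapping-math expansion of UNLT used above, on a single lane.  The
   reverse condition (GE) is evaluated only under the "ordered"
   predicate and the result is then inverted, so NaN lanes never reach
   a signalling comparison yet still count as "unordered or less".  */
static int
aarch64_sketch_unlt_lane (double a, double b)
{
  int ordered = !(a != a) && !(b != b);    /* Neither input is a NaN.  */
  int ge_if_ordered = ordered && a >= b;   /* Predicated reverse compare.  */
  return !ge_if_ordered;                   /* UNLT: unordered or a < b.  */
}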
15772 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15773 of the data being selected and CMP_MODE is the mode of the values being
15774 compared. */
15776 void
15777 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15778 rtx *ops)
15780 machine_mode pred_mode
15781 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15782 GET_MODE_SIZE (cmp_mode)).require ();
15783 rtx pred = gen_reg_rtx (pred_mode);
15784 if (FLOAT_MODE_P (cmp_mode))
15786 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15787 ops[4], ops[5], true))
15788 std::swap (ops[1], ops[2]);
15790 else
15791 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15793 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15794 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15797 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15798 true. However due to issues with register allocation it is preferable
15799 to avoid tying integer scalar and FP scalar modes. Executing integer
15800 operations in general registers is better than treating them as scalar
15801 vector operations. This reduces latency and avoids redundant int<->FP
15802 moves. So tie modes if they are either the same class, or vector modes
15803 with other vector modes, vector structs or any scalar mode. */
15805 static bool
15806 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15808 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15809 return true;
15811 /* We specifically want to allow elements of "structure" modes to
15812 be tieable to the structure. This more general condition allows
15813 other rarer situations too. The reason we don't extend this to
15814 predicate modes is that there are no predicate structure modes
15815 nor any specific instructions for extracting part of a predicate
15816 register. */
15817 if (aarch64_vector_data_mode_p (mode1)
15818 && aarch64_vector_data_mode_p (mode2))
15819 return true;
15821 /* Also allow any scalar modes with vectors. */
15822 if (aarch64_vector_mode_supported_p (mode1)
15823 || aarch64_vector_mode_supported_p (mode2))
15824 return true;
15826 return false;
15829 /* Return a new RTX holding the result of moving POINTER forward by
15830 AMOUNT bytes. */
15832 static rtx
15833 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15835 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15837 return adjust_automodify_address (pointer, GET_MODE (pointer),
15838 next, amount);
15841 /* Return a new RTX holding the result of moving POINTER forward by the
15842 size of the mode it points to. */
15844 static rtx
15845 aarch64_progress_pointer (rtx pointer)
15847 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15850 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15851 MODE bytes. */
15853 static void
15854 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15855 machine_mode mode)
15857 rtx reg = gen_reg_rtx (mode);
15859 /* "Cast" the pointers to the correct mode. */
15860 *src = adjust_address (*src, mode, 0);
15861 *dst = adjust_address (*dst, mode, 0);
15862 /* Emit the memcpy. */
15863 emit_move_insn (reg, *src);
15864 emit_move_insn (*dst, reg);
15865 /* Move the pointers forward. */
15866 *src = aarch64_progress_pointer (*src);
15867 *dst = aarch64_progress_pointer (*dst);
15870 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15871 we succeed, otherwise return false. */
15873 bool
15874 aarch64_expand_movmem (rtx *operands)
15876 int n, mode_bits;
15877 rtx dst = operands[0];
15878 rtx src = operands[1];
15879 rtx base;
15880 machine_mode cur_mode = BLKmode, next_mode;
15881 bool speed_p = !optimize_function_for_size_p (cfun);
15883 /* When optimizing for size, give a better estimate of the length of a
15884 memcpy call, but use the default otherwise. Moves larger than 8 bytes
15885 will always require an even number of instructions to do now, and each
15886 operation requires both a load and a store, so divide the max number by 2. */
15887 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
15889 /* We can't do anything smart if the amount to copy is not constant. */
15890 if (!CONST_INT_P (operands[2]))
15891 return false;
15893 n = INTVAL (operands[2]);
15895 /* Try to keep the number of instructions low. For all cases we will do at
15896 most two moves for the residual amount, since we'll always overlap the
15897 remainder. */
15898 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
15899 return false;
15901 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
15902 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
15904 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
15905 src = adjust_automodify_address (src, VOIDmode, base, 0);
15907 /* Convert n to bits to make the rest of the code simpler. */
15908 n = n * BITS_PER_UNIT;
15910 while (n > 0)
15912 /* Find the largest mode in which to do the copy without over-reading
15913 or over-writing. */
15914 opt_scalar_int_mode mode_iter;
15915 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
15916 if (GET_MODE_BITSIZE (mode_iter.require ()) <= n)
15917 cur_mode = mode_iter.require ();
15919 gcc_assert (cur_mode != BLKmode);
15921 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
15922 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
15924 n -= mode_bits;
15926 /* Do certain trailing copies as overlapping if that is going to be
15927 cheaper, i.e. needs fewer instructions. For instance, for a 15
15928 byte copy it is more efficient to do two overlapping 8 byte copies than
15929 separate 8 + 4 + 2 + 1 byte copies. */
15930 next_mode = smallest_mode_for_size (n, MODE_INT);
15931 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
15932 if (n > 0 && n_bits > n && n_bits <= 8 * BITS_PER_UNIT)
15934 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
15935 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
15936 n = n_bits;
15940 return true;
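/* Illustrative sketch only (plain C, hypothetical helper): the chunking
   scheme used by aarch64_expand_movmem above, expressed as byte
   offsets and sizes.  For n == 15 it yields (0, 8) followed by an
   overlapping (7, 8) rather than four separate 8 + 4 + 2 + 1 byte
   copies.  OFFS and SIZES are assumed large enough.  */
static unsigned int
aarch64_sketch_movmem_chunks (unsigned int n, unsigned int *offs,
                              unsigned int *sizes)
{
  unsigned int count = 0, pos = 0;
  while (n > 0)
    {
      /* Largest power-of-two chunk, capped at 16 bytes, that fits.  */
      unsigned int size = 16;
      while (size > n)
        size /= 2;
      offs[count] = pos;
      sizes[count] = size;
      count++;
      pos += size;
      n -= size;
      /* Overlap the trailing copy when the residue would otherwise need
         several small moves: round it up to the next power of two
         (at most 8 bytes) and step back so the copy ends at the end.  */
      if (n > 0)
        {
          unsigned int up = 1;
          while (up < n)
            up *= 2;
          if (up > n && up <= 8)
            {
              pos -= up - n;
              n = up;
            }
        }
    }
  return count;
}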
15943 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
15944 SImode stores. Handle the case when the constant has identical
15945 bottom and top halves. This is beneficial when the two stores can be
15946 merged into an STP and we avoid synthesising potentially expensive
15947 immediates twice. Return true if such a split is possible. */
15949 bool
15950 aarch64_split_dimode_const_store (rtx dst, rtx src)
15952 rtx lo = gen_lowpart (SImode, src);
15953 rtx hi = gen_highpart_mode (SImode, DImode, src);
15955 bool size_p = optimize_function_for_size_p (cfun);
15957 if (!rtx_equal_p (lo, hi))
15958 return false;
15960 unsigned int orig_cost
15961 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
15962 unsigned int lo_cost
15963 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
15965 /* We want to transform:
15966 MOV x1, 49370
15967 MOVK x1, 0x140, lsl 16
15968 MOVK x1, 0xc0da, lsl 32
15969 MOVK x1, 0x140, lsl 48
15970 STR x1, [x0]
15971 into:
15972 MOV w1, 49370
15973 MOVK w1, 0x140, lsl 16
15974 STP w1, w1, [x0]
15975 So we want to perform this only when we save two instructions
15976 or more. When optimizing for size, however, accept any code size
15977 savings we can. */
15978 if (size_p && orig_cost <= lo_cost)
15979 return false;
15981 if (!size_p
15982 && (orig_cost <= lo_cost + 1))
15983 return false;
15985 rtx mem_lo = adjust_address (dst, SImode, 0);
15986 if (!aarch64_mem_pair_operand (mem_lo, SImode))
15987 return false;
15989 rtx tmp_reg = gen_reg_rtx (SImode);
15990 aarch64_expand_mov_immediate (tmp_reg, lo);
15991 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
15992 /* Don't emit an explicit store pair as this may not always be profitable.
15993 Let the sched-fusion logic decide whether to merge them. */
15994 emit_move_insn (mem_lo, tmp_reg);
15995 emit_move_insn (mem_hi, tmp_reg);
15997 return true;
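/* Illustrative sketch only (hypothetical helper, assuming the usual
   32-bit unsigned int): the halves-equal test that gates the split
   above, on a plain 64-bit integer.  The constant from the comment,
   0x0140c0da0140c0daULL, passes; a value with differing halves such as
   0x0123456789abcdefULL does not.  */
static int
aarch64_sketch_dimode_halves_equal_p (unsigned long long val)
{
  unsigned int lo = (unsigned int) val;
  unsigned int hi = (unsigned int) (val >> 32);
  return lo == hi;
}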
16000 /* Generate RTL for a conditional branch with rtx comparison CODE in
16001 mode CC_MODE. The destination of the unlikely conditional branch
16002 is LABEL_REF. */
16004 void
16005 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
16006 rtx label_ref)
16008 rtx x;
16009 x = gen_rtx_fmt_ee (code, VOIDmode,
16010 gen_rtx_REG (cc_mode, CC_REGNUM),
16011 const0_rtx);
16013 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16014 gen_rtx_LABEL_REF (VOIDmode, label_ref),
16015 pc_rtx);
16016 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16019 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16021 OP1 represents the TImode destination operand 1
16022 OP2 represents the TImode destination operand 2
16023 LOW_DEST represents the low half (DImode) of TImode operand 0
16024 LOW_IN1 represents the low half (DImode) of TImode operand 1
16025 LOW_IN2 represents the low half (DImode) of TImode operand 2
16026 HIGH_DEST represents the high half (DImode) of TImode operand 0
16027 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16028 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16030 void
16031 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16032 rtx *low_in1, rtx *low_in2,
16033 rtx *high_dest, rtx *high_in1,
16034 rtx *high_in2)
16036 *low_dest = gen_reg_rtx (DImode);
16037 *low_in1 = gen_lowpart (DImode, op1);
16038 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16039 subreg_lowpart_offset (DImode, TImode));
16040 *high_dest = gen_reg_rtx (DImode);
16041 *high_in1 = gen_highpart (DImode, op1);
16042 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16043 subreg_highpart_offset (DImode, TImode));
16046 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16048 This function differs from 'aarch64_addti_scratch_regs' in that
16049 OP1 can be an immediate constant (zero). We must call
16050 subreg_highpart_offset with DImode and TImode arguments, otherwise
16051 VOIDmode will be used for the const_int which generates an internal
16052 error from subreg_size_highpart_offset which does not expect a size of zero.
16054 OP1 represents the TImode destination operand 1
16055 OP2 represents the TImode destination operand 2
16056 LOW_DEST represents the low half (DImode) of TImode operand 0
16057 LOW_IN1 represents the low half (DImode) of TImode operand 1
16058 LOW_IN2 represents the low half (DImode) of TImode operand 2
16059 HIGH_DEST represents the high half (DImode) of TImode operand 0
16060 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16061 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16064 void
16065 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16066 rtx *low_in1, rtx *low_in2,
16067 rtx *high_dest, rtx *high_in1,
16068 rtx *high_in2)
16070 *low_dest = gen_reg_rtx (DImode);
16071 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16072 subreg_lowpart_offset (DImode, TImode));
16074 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16075 subreg_lowpart_offset (DImode, TImode));
16076 *high_dest = gen_reg_rtx (DImode);
16078 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16079 subreg_highpart_offset (DImode, TImode));
16080 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16081 subreg_highpart_offset (DImode, TImode));
16084 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16086 OP0 represents the TImode destination operand 0
16087 LOW_DEST represents the low half (DImode) of TImode operand 0
16088 LOW_IN1 represents the low half (DImode) of TImode operand 1
16089 LOW_IN2 represents the low half (DImode) of TImode operand 2
16090 HIGH_DEST represents the high half (DImode) of TImode operand 0
16091 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16092 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16094 void
16095 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
16096 rtx low_in2, rtx high_dest, rtx high_in1,
16097 rtx high_in2)
16099 if (low_in2 == const0_rtx)
16101 low_dest = low_in1;
16102 emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
16103 force_reg (DImode, high_in2)));
16105 else
16107 if (CONST_INT_P (low_in2))
16109 low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
16110 high_in2 = force_reg (DImode, high_in2);
16111 emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
16113 else
16114 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
16115 emit_insn (gen_subdi3_carryinCV (high_dest,
16116 force_reg (DImode, high_in1),
16117 high_in2));
16120 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
16121 emit_move_insn (gen_highpart (DImode, op0), high_dest);
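/* Illustrative sketch only (plain C, hypothetical helper, assuming the
   compiler provides unsigned __int128): the double-word subtraction
   that aarch64_expand_subvti above emits, written on ordinary
   integers.  The low halves are subtracted first and the borrow is
   propagated into the high halves, mirroring SUBS followed by SBC.  */
static unsigned __int128
aarch64_sketch_subti (unsigned __int128 x, unsigned __int128 y)
{
  unsigned long long x_lo = (unsigned long long) x;
  unsigned long long x_hi = (unsigned long long) (x >> 64);
  unsigned long long y_lo = (unsigned long long) y;
  unsigned long long y_hi = (unsigned long long) (y >> 64);

  unsigned long long lo = x_lo - y_lo;
  unsigned long long borrow = x_lo < y_lo;   /* Borrow out of the low half.  */
  unsigned long long hi = x_hi - y_hi - borrow;

  return ((unsigned __int128) hi << 64) | lo;
}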
16125 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16127 static unsigned HOST_WIDE_INT
16128 aarch64_asan_shadow_offset (void)
16130 return (HOST_WIDE_INT_1 << 36);
16133 static rtx
16134 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16135 int code, tree treeop0, tree treeop1)
16137 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16138 rtx op0, op1;
16139 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16140 insn_code icode;
16141 struct expand_operand ops[4];
16143 start_sequence ();
16144 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16146 op_mode = GET_MODE (op0);
16147 if (op_mode == VOIDmode)
16148 op_mode = GET_MODE (op1);
16150 switch (op_mode)
16152 case E_QImode:
16153 case E_HImode:
16154 case E_SImode:
16155 cmp_mode = SImode;
16156 icode = CODE_FOR_cmpsi;
16157 break;
16159 case E_DImode:
16160 cmp_mode = DImode;
16161 icode = CODE_FOR_cmpdi;
16162 break;
16164 case E_SFmode:
16165 cmp_mode = SFmode;
16166 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16167 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16168 break;
16170 case E_DFmode:
16171 cmp_mode = DFmode;
16172 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16173 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16174 break;
16176 default:
16177 end_sequence ();
16178 return NULL_RTX;
16181 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16182 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16183 if (!op0 || !op1)
16185 end_sequence ();
16186 return NULL_RTX;
16188 *prep_seq = get_insns ();
16189 end_sequence ();
16191 create_fixed_operand (&ops[0], op0);
16192 create_fixed_operand (&ops[1], op1);
16194 start_sequence ();
16195 if (!maybe_expand_insn (icode, 2, ops))
16197 end_sequence ();
16198 return NULL_RTX;
16200 *gen_seq = get_insns ();
16201 end_sequence ();
16203 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16204 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16207 static rtx
16208 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16209 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16211 rtx op0, op1, target;
16212 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16213 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16214 insn_code icode;
16215 struct expand_operand ops[6];
16216 int aarch64_cond;
16218 push_to_sequence (*prep_seq);
16219 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16221 op_mode = GET_MODE (op0);
16222 if (op_mode == VOIDmode)
16223 op_mode = GET_MODE (op1);
16225 switch (op_mode)
16227 case E_QImode:
16228 case E_HImode:
16229 case E_SImode:
16230 cmp_mode = SImode;
16231 icode = CODE_FOR_ccmpsi;
16232 break;
16234 case E_DImode:
16235 cmp_mode = DImode;
16236 icode = CODE_FOR_ccmpdi;
16237 break;
16239 case E_SFmode:
16240 cmp_mode = SFmode;
16241 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16242 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16243 break;
16245 case E_DFmode:
16246 cmp_mode = DFmode;
16247 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16248 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16249 break;
16251 default:
16252 end_sequence ();
16253 return NULL_RTX;
16256 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16257 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16258 if (!op0 || !op1)
16260 end_sequence ();
16261 return NULL_RTX;
16263 *prep_seq = get_insns ();
16264 end_sequence ();
16266 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16267 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16269 if (bit_code != AND)
16271 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16272 GET_MODE (XEXP (prev, 0))),
16273 VOIDmode, XEXP (prev, 0), const0_rtx);
16274 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16277 create_fixed_operand (&ops[0], XEXP (prev, 0));
16278 create_fixed_operand (&ops[1], target);
16279 create_fixed_operand (&ops[2], op0);
16280 create_fixed_operand (&ops[3], op1);
16281 create_fixed_operand (&ops[4], prev);
16282 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16284 push_to_sequence (*gen_seq);
16285 if (!maybe_expand_insn (icode, 6, ops))
16287 end_sequence ();
16288 return NULL_RTX;
16291 *gen_seq = get_insns ();
16292 end_sequence ();
16294 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16297 #undef TARGET_GEN_CCMP_FIRST
16298 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16300 #undef TARGET_GEN_CCMP_NEXT
16301 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16303 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16304 instruction fusion of some sort. */
16306 static bool
16307 aarch64_macro_fusion_p (void)
16309 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16313 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16314 should be kept together during scheduling. */
16316 static bool
16317 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16319 rtx set_dest;
16320 rtx prev_set = single_set (prev);
16321 rtx curr_set = single_set (curr);
16322 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16323 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16325 if (!aarch64_macro_fusion_p ())
16326 return false;
16328 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16330 /* We are trying to match:
16331 prev (mov) == (set (reg r0) (const_int imm16))
16332 curr (movk) == (set (zero_extract (reg r0)
16333 (const_int 16)
16334 (const_int 16))
16335 (const_int imm16_1)) */
16337 set_dest = SET_DEST (curr_set);
16339 if (GET_CODE (set_dest) == ZERO_EXTRACT
16340 && CONST_INT_P (SET_SRC (curr_set))
16341 && CONST_INT_P (SET_SRC (prev_set))
16342 && CONST_INT_P (XEXP (set_dest, 2))
16343 && INTVAL (XEXP (set_dest, 2)) == 16
16344 && REG_P (XEXP (set_dest, 0))
16345 && REG_P (SET_DEST (prev_set))
16346 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16348 return true;
16352 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16355 /* We're trying to match:
16356 prev (adrp) == (set (reg r1)
16357 (high (symbol_ref ("SYM"))))
16358 curr (add) == (set (reg r0)
16359 (lo_sum (reg r1)
16360 (symbol_ref ("SYM"))))
16361 Note that r0 need not necessarily be the same as r1, especially
16362 during pre-regalloc scheduling. */
16364 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16365 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16367 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16368 && REG_P (XEXP (SET_SRC (curr_set), 0))
16369 && REGNO (XEXP (SET_SRC (curr_set), 0))
16370 == REGNO (SET_DEST (prev_set))
16371 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16372 XEXP (SET_SRC (curr_set), 1)))
16373 return true;
16377 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16380 /* We're trying to match:
16381 prev (movk) == (set (zero_extract (reg r0)
16382 (const_int 16)
16383 (const_int 32))
16384 (const_int imm16_1))
16385 curr (movk) == (set (zero_extract (reg r0)
16386 (const_int 16)
16387 (const_int 48))
16388 (const_int imm16_2)) */
16390 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16391 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16392 && REG_P (XEXP (SET_DEST (prev_set), 0))
16393 && REG_P (XEXP (SET_DEST (curr_set), 0))
16394 && REGNO (XEXP (SET_DEST (prev_set), 0))
16395 == REGNO (XEXP (SET_DEST (curr_set), 0))
16396 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16397 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16398 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16399 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16400 && CONST_INT_P (SET_SRC (prev_set))
16401 && CONST_INT_P (SET_SRC (curr_set)))
16402 return true;
16405 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16407 /* We're trying to match:
16408 prev (adrp) == (set (reg r0)
16409 (high (symbol_ref ("SYM"))))
16410 curr (ldr) == (set (reg r1)
16411 (mem (lo_sum (reg r0)
16412 (symbol_ref ("SYM")))))
16414 curr (ldr) == (set (reg r1)
16415 (zero_extend (mem
16416 (lo_sum (reg r0)
16417 (symbol_ref ("SYM")))))) */
16418 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16419 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16421 rtx curr_src = SET_SRC (curr_set);
16423 if (GET_CODE (curr_src) == ZERO_EXTEND)
16424 curr_src = XEXP (curr_src, 0);
16426 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16427 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16428 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16429 == REGNO (SET_DEST (prev_set))
16430 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16431 XEXP (SET_SRC (prev_set), 0)))
16432 return true;
16436 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16437 && aarch_crypto_can_dual_issue (prev, curr))
16438 return true;
16440 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16441 && any_condjump_p (curr))
16443 enum attr_type prev_type = get_attr_type (prev);
16445 unsigned int condreg1, condreg2;
16446 rtx cc_reg_1;
16447 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16448 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16450 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16451 && prev
16452 && modified_in_p (cc_reg_1, prev))
16454 /* FIXME: this misses some instructions which are considered simple
16455 arithmetic for ThunderX. Simple shifts are missed here. */
16456 if (prev_type == TYPE_ALUS_SREG
16457 || prev_type == TYPE_ALUS_IMM
16458 || prev_type == TYPE_LOGICS_REG
16459 || prev_type == TYPE_LOGICS_IMM)
16460 return true;
16464 if (prev_set
16465 && curr_set
16466 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16467 && any_condjump_p (curr))
16469 /* We're trying to match:
16470 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16471 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16472 (const_int 0))
16473 (label_ref ("SYM"))
16474 (pc)) */
16475 if (SET_DEST (curr_set) == (pc_rtx)
16476 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16477 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16478 && REG_P (SET_DEST (prev_set))
16479 && REGNO (SET_DEST (prev_set))
16480 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16482 /* Fuse ALU operations followed by conditional branch instruction. */
16483 switch (get_attr_type (prev))
16485 case TYPE_ALU_IMM:
16486 case TYPE_ALU_SREG:
16487 case TYPE_ADC_REG:
16488 case TYPE_ADC_IMM:
16489 case TYPE_ADCS_REG:
16490 case TYPE_ADCS_IMM:
16491 case TYPE_LOGIC_REG:
16492 case TYPE_LOGIC_IMM:
16493 case TYPE_CSEL:
16494 case TYPE_ADR:
16495 case TYPE_MOV_IMM:
16496 case TYPE_SHIFT_REG:
16497 case TYPE_SHIFT_IMM:
16498 case TYPE_BFM:
16499 case TYPE_RBIT:
16500 case TYPE_REV:
16501 case TYPE_EXTEND:
16502 return true;
16504 default:;
16509 return false;
16512 /* Return true iff the instruction fusion described by OP is enabled. */
16514 bool
16515 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16517 return (aarch64_tune_params.fusible_ops & op) != 0;
16520 /* If MEM is in the form of [base+offset], extract the two parts
16521 of address and set to BASE and OFFSET, otherwise return false
16522 after clearing BASE and OFFSET. */
16524 bool
16525 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16527 rtx addr;
16529 gcc_assert (MEM_P (mem));
16531 addr = XEXP (mem, 0);
16533 if (REG_P (addr))
16535 *base = addr;
16536 *offset = const0_rtx;
16537 return true;
16540 if (GET_CODE (addr) == PLUS
16541 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16543 *base = XEXP (addr, 0);
16544 *offset = XEXP (addr, 1);
16545 return true;
16548 *base = NULL_RTX;
16549 *offset = NULL_RTX;
16551 return false;
16554 /* Types for scheduling fusion. */
16555 enum sched_fusion_type
16557 SCHED_FUSION_NONE = 0,
16558 SCHED_FUSION_LD_SIGN_EXTEND,
16559 SCHED_FUSION_LD_ZERO_EXTEND,
16560 SCHED_FUSION_LD,
16561 SCHED_FUSION_ST,
16562 SCHED_FUSION_NUM
16565 /* If INSN is a load or store of address in the form of [base+offset],
16566 extract the two parts and set to BASE and OFFSET. Return scheduling
16567 fusion type this INSN is. */
16569 static enum sched_fusion_type
16570 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16572 rtx x, dest, src;
16573 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16575 gcc_assert (INSN_P (insn));
16576 x = PATTERN (insn);
16577 if (GET_CODE (x) != SET)
16578 return SCHED_FUSION_NONE;
16580 src = SET_SRC (x);
16581 dest = SET_DEST (x);
16583 machine_mode dest_mode = GET_MODE (dest);
16585 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16586 return SCHED_FUSION_NONE;
16588 if (GET_CODE (src) == SIGN_EXTEND)
16590 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16591 src = XEXP (src, 0);
16592 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16593 return SCHED_FUSION_NONE;
16595 else if (GET_CODE (src) == ZERO_EXTEND)
16597 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16598 src = XEXP (src, 0);
16599 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16600 return SCHED_FUSION_NONE;
16603 if (GET_CODE (src) == MEM && REG_P (dest))
16604 extract_base_offset_in_addr (src, base, offset);
16605 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16607 fusion = SCHED_FUSION_ST;
16608 extract_base_offset_in_addr (dest, base, offset);
16610 else
16611 return SCHED_FUSION_NONE;
16613 if (*base == NULL_RTX || *offset == NULL_RTX)
16614 fusion = SCHED_FUSION_NONE;
16616 return fusion;
16619 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16621 Currently we only support fusing ldr or str instructions, so FUSION_PRI
16622 and PRI are only calculated for these instructions. For other instructions,
16623 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16624 types of instruction fusion can be added by returning different priorities.
16626 It's important that irrelevant instructions get the largest FUSION_PRI. */
16628 static void
16629 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16630 int *fusion_pri, int *pri)
16632 int tmp, off_val;
16633 rtx base, offset;
16634 enum sched_fusion_type fusion;
16636 gcc_assert (INSN_P (insn));
16638 tmp = max_pri - 1;
16639 fusion = fusion_load_store (insn, &base, &offset);
16640 if (fusion == SCHED_FUSION_NONE)
16642 *pri = tmp;
16643 *fusion_pri = tmp;
16644 return;
16647 /* Set FUSION_PRI according to fusion type and base register. */
16648 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16650 /* Calculate PRI. */
16651 tmp /= 2;
16653 /* INSN with smaller offset goes first. */
16654 off_val = (int)(INTVAL (offset));
16655 if (off_val >= 0)
16656 tmp -= (off_val & 0xfffff);
16657 else
16658 tmp += ((- off_val) & 0xfffff);
16660 *pri = tmp;
16661 return;
16664 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16665 Adjust priority of sha1h instructions so they are scheduled before
16666 other SHA1 instructions. */
16668 static int
16669 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16671 rtx x = PATTERN (insn);
16673 if (GET_CODE (x) == SET)
16675 x = SET_SRC (x);
16677 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16678 return priority + 10;
16681 return priority;
16684 /* Given OPERANDS of consecutive load/store, check if we can merge
16685 them into ldp/stp. LOAD is true if they are load instructions.
16686 MODE is the mode of memory operands. */
16688 bool
16689 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16690 machine_mode mode)
16692 HOST_WIDE_INT offval_1, offval_2, msize;
16693 enum reg_class rclass_1, rclass_2;
16694 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16696 if (load)
16698 mem_1 = operands[1];
16699 mem_2 = operands[3];
16700 reg_1 = operands[0];
16701 reg_2 = operands[2];
16702 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16703 if (REGNO (reg_1) == REGNO (reg_2))
16704 return false;
16706 else
16708 mem_1 = operands[0];
16709 mem_2 = operands[2];
16710 reg_1 = operands[1];
16711 reg_2 = operands[3];
16714 /* The mems cannot be volatile. */
16715 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16716 return false;
16718 /* If we have SImode and slow unaligned ldp,
16719 check that the alignment is at least 8 bytes. */
16720 if (mode == SImode
16721 && (aarch64_tune_params.extra_tuning_flags
16722 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16723 && !optimize_size
16724 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16725 return false;
16727 /* Check if the addresses are in the form of [base+offset]. */
16728 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16729 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16730 return false;
16731 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16732 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16733 return false;
16735 /* Check if the bases are same. */
16736 if (!rtx_equal_p (base_1, base_2))
16737 return false;
16739 /* The operands must be of the same size. */
16740 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16741 GET_MODE_SIZE (GET_MODE (mem_2))));
16743 offval_1 = INTVAL (offset_1);
16744 offval_2 = INTVAL (offset_2);
16745 /* We should only be trying this for fixed-sized modes. There is no
16746 SVE LDP/STP instruction. */
16747 msize = GET_MODE_SIZE (mode).to_constant ();
16748 /* Check if the offsets are consecutive. */
16749 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16750 return false;
16752 /* Check if the addresses are clobbered by load. */
16753 if (load)
16755 if (reg_mentioned_p (reg_1, mem_1))
16756 return false;
16758 /* In increasing order, the last load can clobber the address. */
16759 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16760 return false;
16763 /* One of the memory accesses must be a mempair operand.
16764 If it is not the first one, they need to be swapped by the
16765 peephole. */
16766 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16767 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16768 return false;
16770 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16771 rclass_1 = FP_REGS;
16772 else
16773 rclass_1 = GENERAL_REGS;
16775 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16776 rclass_2 = FP_REGS;
16777 else
16778 rclass_2 = GENERAL_REGS;
16780 /* Check if the registers are of same class. */
16781 if (rclass_1 != rclass_2)
16782 return false;
16784 return true;
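/* Illustrative sketch only (hypothetical helper): the offset test used
   above.  Two accesses of size MSIZE can pair when one immediately
   follows the other, e.g. offsets (8, 16) with msize == 8 pair in
   either order, while (8, 24) leave a gap and do not.  */
static int
aarch64_sketch_offsets_consecutive_p (long off1, long off2, long msize)
{
  return off1 == off2 + msize || off2 == off1 + msize;
}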
16787 /* Given OPERANDS of consecutive load/store that can be merged,
16788 swap them if they are not in ascending order. */
16789 void
16790 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16792 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16793 HOST_WIDE_INT offval_1, offval_2;
16795 if (load)
16797 mem_1 = operands[1];
16798 mem_2 = operands[3];
16800 else
16802 mem_1 = operands[0];
16803 mem_2 = operands[2];
16806 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16807 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16809 offval_1 = INTVAL (offset_1);
16810 offval_2 = INTVAL (offset_2);
16812 if (offval_1 > offval_2)
16814 /* Irrespective of whether this is a load or a store,
16815 we do the same swap. */
16816 std::swap (operands[0], operands[2]);
16817 std::swap (operands[1], operands[3]);
16821 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16822 comparison between the two. */
16824 aarch64_host_wide_int_compare (const void *x, const void *y)
16826 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16827 * ((const HOST_WIDE_INT *) y));
16830 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16831 other pointing to a REG rtx containing an offset, compare the offsets
16832 of the two pairs.
16834 Return:
16836 1 iff offset (X) > offset (Y)
16837 0 iff offset (X) == offset (Y)
16838 -1 iff offset (X) < offset (Y) */
16840 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16842 const rtx * operands_1 = (const rtx *) x;
16843 const rtx * operands_2 = (const rtx *) y;
16844 rtx mem_1, mem_2, base, offset_1, offset_2;
16846 if (MEM_P (operands_1[0]))
16847 mem_1 = operands_1[0];
16848 else
16849 mem_1 = operands_1[1];
16851 if (MEM_P (operands_2[0]))
16852 mem_2 = operands_2[0];
16853 else
16854 mem_2 = operands_2[1];
16856 /* Extract the offsets. */
16857 extract_base_offset_in_addr (mem_1, &base, &offset_1);
16858 extract_base_offset_in_addr (mem_2, &base, &offset_2);
16860 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
16862 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
16865 /* Given OPERANDS of consecutive load/store, check if we can merge
16866 them into ldp/stp by adjusting the offset. LOAD is true if they
16867 are load instructions. MODE is the mode of memory operands.
16869 Given below consecutive stores:
16871 str w1, [xb, 0x100]
16872 str w1, [xb, 0x104]
16873 str w1, [xb, 0x108]
16874 str w1, [xb, 0x10c]
16876 Though the offsets are out of the range supported by stp, we can
16877 still pair them after adjusting the offset, like:
16879 add scratch, xb, 0x100
16880 stp w1, w1, [scratch]
16881 stp w1, w1, [scratch, 0x8]
16883 The peephole patterns detecting this opportunity should guarantee
16884 the scratch register is available. */
16886 bool
16887 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16888 scalar_mode mode)
16890 const int num_insns = 4;
16891 enum reg_class rclass;
16892 HOST_WIDE_INT offvals[num_insns], msize;
16893 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
16895 if (load)
16897 for (int i = 0; i < num_insns; i++)
16899 reg[i] = operands[2 * i];
16900 mem[i] = operands[2 * i + 1];
16902 gcc_assert (REG_P (reg[i]));
16905 /* Do not attempt to merge the loads if the loads clobber each other. */
16906 for (int i = 0; i < 8; i += 2)
16907 for (int j = i + 2; j < 8; j += 2)
16908 if (reg_overlap_mentioned_p (operands[i], operands[j]))
16909 return false;
16911 else
16912 for (int i = 0; i < num_insns; i++)
16914 mem[i] = operands[2 * i];
16915 reg[i] = operands[2 * i + 1];
16918 /* Skip if memory operand is by itself valid for ldp/stp. */
16919 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
16920 return false;
16922 for (int i = 0; i < num_insns; i++)
16924 /* The mems cannot be volatile. */
16925 if (MEM_VOLATILE_P (mem[i]))
16926 return false;
16928 /* Check if the addresses are in the form of [base+offset]. */
16929 extract_base_offset_in_addr (mem[i], base + i, offset + i);
16930 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
16931 return false;
16934 /* Check if the registers are of same class. */
16935 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
16936 ? FP_REGS : GENERAL_REGS;
16938 for (int i = 1; i < num_insns; i++)
16939 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
16941 if (rclass != FP_REGS)
16942 return false;
16944 else
16946 if (rclass != GENERAL_REGS)
16947 return false;
16950 /* Only the last register in the order in which they occur
16951 may be clobbered by the load. */
16952 if (rclass == GENERAL_REGS && load)
16953 for (int i = 0; i < num_insns - 1; i++)
16954 if (reg_mentioned_p (reg[i], mem[i]))
16955 return false;
16957 /* Check if the bases are same. */
16958 for (int i = 0; i < num_insns - 1; i++)
16959 if (!rtx_equal_p (base[i], base[i + 1]))
16960 return false;
16962 for (int i = 0; i < num_insns; i++)
16963 offvals[i] = INTVAL (offset[i]);
16965 msize = GET_MODE_SIZE (mode);
16967 /* Check if the offsets can be put in the right order to do a ldp/stp. */
16968 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
16969 aarch64_host_wide_int_compare);
16971 if (!(offvals[1] == offvals[0] + msize
16972 && offvals[3] == offvals[2] + msize))
16973 return false;
16975 /* Check that offsets are within range of each other. The ldp/stp
16976 instructions have 7-bit immediate offsets, so use 0x80. */
16977 if (offvals[2] - offvals[0] >= msize * 0x80)
16978 return false;
16980 /* The offsets must be aligned with respect to each other. */
16981 if (offvals[0] % msize != offvals[2] % msize)
16982 return false;
16984 /* If we have SImode and slow unaligned ldp,
16985 check that the alignment is at least 8 bytes. */
16986 if (mode == SImode
16987 && (aarch64_tune_params.extra_tuning_flags
16988 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16989 && !optimize_size
16990 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
16991 return false;
16993 return true;
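/* Illustrative sketch only (hypothetical helper): the shape check that
   aarch64_operands_adjust_ok_for_ldpstp above applies to the four
   sorted offsets.  For the str example in its comment (0x100, 0x104,
   0x108, 0x10c with msize == 4) both pairs are consecutive, the pairs
   lie well inside the scaled 7-bit range, and the alignments agree.  */
static int
aarch64_sketch_four_offsets_ok_p (const long *off, long msize)
{
  /* Each LDP/STP needs two consecutive slots of size msize...  */
  if (off[1] != off[0] + msize || off[3] != off[2] + msize)
    return 0;
  /* ...the two pairs must be close enough to share a single base...  */
  if (off[2] - off[0] >= msize * 0x80)
    return 0;
  /* ...and they must agree on their alignment relative to msize.  */
  return off[0] % msize == off[2] % msize;
}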
16996 /* Given OPERANDS of consecutive load/store, this function pairs them
16997 into LDP/STP after adjusting the offset. It depends on the fact
16998 that the operands can be sorted so the offsets are correct for STP.
16999 MODE is the mode of memory operands. CODE is the rtl operator
17000 which should be applied to all memory operands, it's SIGN_EXTEND,
17001 ZERO_EXTEND or UNKNOWN. */
17003 bool
17004 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17005 scalar_mode mode, RTX_CODE code)
17007 rtx base, offset_1, offset_3, t1, t2;
17008 rtx mem_1, mem_2, mem_3, mem_4;
17009 rtx temp_operands[8];
17010 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17011 stp_off_upper_limit, stp_off_lower_limit, msize;
17013 /* We make changes on a copy as we may still bail out. */
17014 for (int i = 0; i < 8; i ++)
17015 temp_operands[i] = operands[i];
17017 /* Sort the operands. */
17018 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17020 if (load)
17022 mem_1 = temp_operands[1];
17023 mem_2 = temp_operands[3];
17024 mem_3 = temp_operands[5];
17025 mem_4 = temp_operands[7];
17027 else
17029 mem_1 = temp_operands[0];
17030 mem_2 = temp_operands[2];
17031 mem_3 = temp_operands[4];
17032 mem_4 = temp_operands[6];
17033 gcc_assert (code == UNKNOWN);
17036 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17037 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17038 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17039 && offset_3 != NULL_RTX);
17041 /* Adjust offset so it can fit in LDP/STP instruction. */
17042 msize = GET_MODE_SIZE (mode);
17043 stp_off_upper_limit = msize * (0x40 - 1);
17044 stp_off_lower_limit = - msize * 0x40;
17046 off_val_1 = INTVAL (offset_1);
17047 off_val_3 = INTVAL (offset_3);
17049 /* The base offset is optimally half way between the two STP/LDP offsets. */
17050 if (msize <= 4)
17051 base_off = (off_val_1 + off_val_3) / 2;
17052 else
17053 /* However, due to issues with negative LDP/STP offset generation for
17054 larger modes (DF, DI and vector modes), we must not use negative
17055 addresses smaller than what 9 signed unadjusted bits can store. This
17056 provides the most range in this case. */
17057 base_off = off_val_1;
17059 /* Adjust the base so that it is aligned with the addresses but still
17060 optimal. */
17061 if (base_off % msize != off_val_1 % msize)
17062 /* Fix the offset, bearing in mind we want to make it bigger not
17063 smaller. */
17064 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17065 else if (msize <= 4)
17066 /* The negative range of LDP/STP is one larger than the positive range. */
17067 base_off += msize;
17069 /* Check if base offset is too big or too small. We can attempt to resolve
17070 this issue by setting it to the maximum value and seeing if the offsets
17071 still fit. */
17072 if (base_off >= 0x1000)
17074 base_off = 0x1000 - 1;
17075 /* We must still make sure that the base offset is aligned with respect
17076 to the address, but it may not be made any bigger. */
17077 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17080 /* Likewise for the case where the base is too small. */
17081 if (base_off <= -0x1000)
17083 base_off = -0x1000 + 1;
17084 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17087 /* Offset of the first STP/LDP. */
17088 new_off_1 = off_val_1 - base_off;
17090 /* Offset of the second STP/LDP. */
17091 new_off_3 = off_val_3 - base_off;
17093 /* The offsets must be within the range of the LDP/STP instructions. */
17094 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17095 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17096 return false;
17098 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17099 new_off_1), true);
17100 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17101 new_off_1 + msize), true);
17102 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17103 new_off_3), true);
17104 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17105 new_off_3 + msize), true);
17107 if (!aarch64_mem_pair_operand (mem_1, mode)
17108 || !aarch64_mem_pair_operand (mem_3, mode))
17109 return false;
17111 if (code == ZERO_EXTEND)
17113 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17114 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17115 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17116 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17118 else if (code == SIGN_EXTEND)
17120 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17121 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17122 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17123 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17126 if (load)
17128 operands[0] = temp_operands[0];
17129 operands[1] = mem_1;
17130 operands[2] = temp_operands[2];
17131 operands[3] = mem_2;
17132 operands[4] = temp_operands[4];
17133 operands[5] = mem_3;
17134 operands[6] = temp_operands[6];
17135 operands[7] = mem_4;
17137 else
17139 operands[0] = mem_1;
17140 operands[1] = temp_operands[1];
17141 operands[2] = mem_2;
17142 operands[3] = temp_operands[3];
17143 operands[4] = mem_3;
17144 operands[5] = temp_operands[5];
17145 operands[6] = mem_4;
17146 operands[7] = temp_operands[7];
17149 /* Emit adjusting instruction. */
17150 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17151 /* Emit ldp/stp instructions. */
17152 t1 = gen_rtx_SET (operands[0], operands[1]);
17153 t2 = gen_rtx_SET (operands[2], operands[3]);
17154 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17155 t1 = gen_rtx_SET (operands[4], operands[5]);
17156 t2 = gen_rtx_SET (operands[6], operands[7]);
17157 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17158 return true;
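/* Illustrative sketch only (hypothetical helper; the +/-0x1000 clamping
   done by the function above is omitted for brevity): the base
   selection used by aarch64_gen_adjusted_ldpstp.  For the str example
   with msize == 4 and pair offsets 0x100 and 0x108 it returns 0x108,
   so the two STPs get immediates -8 and 0, both well inside the
   [-0x100, 0xfc] range for 4-byte accesses.  */
static long
aarch64_sketch_ldpstp_base (long off_val_1, long off_val_3, long msize)
{
  long base_off;

  /* Small modes aim for the midpoint of the two pair offsets; larger
     modes avoid negative immediates and start from the first access.  */
  if (msize <= 4)
    base_off = (off_val_1 + off_val_3) / 2;
  else
    base_off = off_val_1;

  /* Keep the base aligned with the accesses, rounding upwards, or use
     the slightly larger negative immediate range when already aligned
     (this mirrors the adjustment in the function above).  */
  if (base_off % msize != off_val_1 % msize)
    base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
  else if (msize <= 4)
    base_off += msize;

  return base_off;
}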
17161 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17162 it isn't worth branching around empty masked ops (including masked
17163 stores). */
17165 static bool
17166 aarch64_empty_mask_is_expensive (unsigned)
17168 return false;
17171 /* Return 1 if pseudo register should be created and used to hold
17172 GOT address for PIC code. */
17174 bool
17175 aarch64_use_pseudo_pic_reg (void)
17177 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17180 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17182 static int
17183 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17185 switch (XINT (x, 1))
17187 case UNSPEC_GOTSMALLPIC:
17188 case UNSPEC_GOTSMALLPIC28K:
17189 case UNSPEC_GOTTINYPIC:
17190 return 0;
17191 default:
17192 break;
17195 return default_unspec_may_trap_p (x, flags);
17199 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17200 return the log2 of that value. Otherwise return -1. */
17202 int
17203 aarch64_fpconst_pow_of_2 (rtx x)
17205 const REAL_VALUE_TYPE *r;
17207 if (!CONST_DOUBLE_P (x))
17208 return -1;
17210 r = CONST_DOUBLE_REAL_VALUE (x);
17212 if (REAL_VALUE_NEGATIVE (*r)
17213 || REAL_VALUE_ISNAN (*r)
17214 || REAL_VALUE_ISINF (*r)
17215 || !real_isinteger (r, DFmode))
17216 return -1;
17218 return exact_log2 (real_to_integer (r));
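/* Behaviour of the function above on a few hypothetical inputs: a
   CONST_DOUBLE of 8.0 gives 3 (8 == 2^3) and 1.0 gives 0, while 6.0 (not a
   power of two), 0.5 (not an integer), -4.0 (negative) and NaN all
   give -1.  */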
17221 /* If X is a vector of equal CONST_DOUBLE values and that value is
17222 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17224 int
17225 aarch64_vec_fpconst_pow_of_2 (rtx x)
17227 int nelts;
17228 if (GET_CODE (x) != CONST_VECTOR
17229 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17230 return -1;
17232 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17233 return -1;
17235 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17236 if (firstval <= 0)
17237 return -1;
17239 for (int i = 1; i < nelts; i++)
17240 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17241 return -1;
17243 return firstval;
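/* For example (again with hypothetical constants): {4.0, 4.0} yields 2,
   {4.0, 2.0} yields -1 because the elements disagree, and {1.0, 1.0} also
   yields -1 since the check above insists on firstval > 0.  */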
17246 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17247 to float.
17249 __fp16 always promotes through this hook.
17250 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17251 through the generic excess precision logic rather than here. */
17253 static tree
17254 aarch64_promoted_type (const_tree t)
17256 if (SCALAR_FLOAT_TYPE_P (t)
17257 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17258 return float_type_node;
17260 return NULL_TREE;
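/* As an illustration of the hook above, with hypothetical user code:

	__fp16 a, b;
	... a + b ...

   both operands are promoted to float here, so the addition is performed in
   SFmode; _Float16 operands are left alone by this hook and are widened (or
   not) by the excess-precision logic in aarch64_excess_precision below.  */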
17263 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17265 static bool
17266 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17267 optimization_type opt_type)
17269 switch (op)
17271 case rsqrt_optab:
17272 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17274 default:
17275 return true;
17279 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17281 static unsigned int
17282 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17283 int *offset)
17285 /* Polynomial invariant 1 == (VG / 2) - 1. */
17286 gcc_assert (i == 1);
17287 *factor = 2;
17288 *offset = 1;
17289 return AARCH64_DWARF_VG;
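/* A worked example for the hook above, assuming a hypothetical 256-bit SVE
   implementation: VG (the number of 64-bit granules per vector) is 4 at run
   time, so indeterminate 1 evaluates to 4 / 2 - 1 == 1, and a poly_int such
   as (16, 16) -- the byte size of a full SVE data mode -- resolves to
   16 + 16 * 1 == 32 bytes.  */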
17292 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17293 if MODE is HFmode, and punt to the generic implementation otherwise. */
17295 static bool
17296 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17298 return (mode == HFmode
17299 ? true
17300 : default_libgcc_floating_mode_supported_p (mode));
17303 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17304 if MODE is HFmode, and punt to the generic implementation otherwise. */
17306 static bool
17307 aarch64_scalar_mode_supported_p (scalar_mode mode)
17309 return (mode == HFmode
17310 ? true
17311 : default_scalar_mode_supported_p (mode));
17314 /* Set the value of FLT_EVAL_METHOD.
17315 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17317 0: evaluate all operations and constants, whose semantic type has at
17318 most the range and precision of type float, to the range and
17319 precision of float; evaluate all other operations and constants to
17320 the range and precision of the semantic type;
17322 N, where _FloatN is a supported interchange floating type
17323 evaluate all operations and constants, whose semantic type has at
17324 most the range and precision of _FloatN type, to the range and
17325 precision of the _FloatN type; evaluate all other operations and
17326 constants to the range and precision of the semantic type;
17328 If we have the ARMv8.2-A extensions then we support _Float16 in native
17329 precision, so we should set this to 16. Otherwise, we support the type,
17330 but want to evaluate expressions in float precision, so set this to
17331 0. */
17333 static enum flt_eval_method
17334 aarch64_excess_precision (enum excess_precision_type type)
17336 switch (type)
17338 case EXCESS_PRECISION_TYPE_FAST:
17339 case EXCESS_PRECISION_TYPE_STANDARD:
17340 /* We can calculate either in 16-bit range and precision or
17341 32-bit range and precision. Make that decision based on whether
17342 we have native support for the ARMv8.2-A 16-bit floating-point
17343 instructions or not. */
17344 return (TARGET_FP_F16INST
17345 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17346 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17347 case EXCESS_PRECISION_TYPE_IMPLICIT:
17348 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17349 default:
17350 gcc_unreachable ();
17352 return FLT_EVAL_METHOD_UNPREDICTABLE;
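/* For instance, with hypothetical user code:

	_Float16 a, b, c;
	... a * b + c ...

   when TARGET_FP_F16INST is available FLT_EVAL_METHOD is 16 and the
   intermediate product stays in HFmode; otherwise both operations are
   evaluated in SFmode and only the final result is narrowed back to
   _Float16.  */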
17355 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17356 scheduled for speculative execution. Reject the long-running division
17357 and square-root instructions. */
17359 static bool
17360 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17362 switch (get_attr_type (insn))
17364 case TYPE_SDIV:
17365 case TYPE_UDIV:
17366 case TYPE_FDIVS:
17367 case TYPE_FDIVD:
17368 case TYPE_FSQRTS:
17369 case TYPE_FSQRTD:
17370 case TYPE_NEON_FP_SQRT_S:
17371 case TYPE_NEON_FP_SQRT_D:
17372 case TYPE_NEON_FP_SQRT_S_Q:
17373 case TYPE_NEON_FP_SQRT_D_Q:
17374 case TYPE_NEON_FP_DIV_S:
17375 case TYPE_NEON_FP_DIV_D:
17376 case TYPE_NEON_FP_DIV_S_Q:
17377 case TYPE_NEON_FP_DIV_D_Q:
17378 return false;
17379 default:
17380 return true;
17384 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17386 static int
17387 aarch64_compute_pressure_classes (reg_class *classes)
17389 int i = 0;
17390 classes[i++] = GENERAL_REGS;
17391 classes[i++] = FP_REGS;
17392 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17393 registers need to go in PR_LO_REGS at some point during their
17394 lifetime. Splitting it into two halves has the effect of making
17395 all predicates count against PR_LO_REGS, so that we try whenever
17396 possible to restrict the number of live predicates to 8. This
17397 greatly reduces the amount of spilling in certain loops. */
17398 classes[i++] = PR_LO_REGS;
17399 classes[i++] = PR_HI_REGS;
17400 return i;
17403 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17405 static bool
17406 aarch64_can_change_mode_class (machine_mode from,
17407 machine_mode to, reg_class_t)
17409 if (BYTES_BIG_ENDIAN)
17411 bool from_sve_p = aarch64_sve_data_mode_p (from);
17412 bool to_sve_p = aarch64_sve_data_mode_p (to);
17414 /* Don't allow changes between SVE data modes and non-SVE modes.
17415 See the comment at the head of aarch64-sve.md for details. */
17416 if (from_sve_p != to_sve_p)
17417 return false;
17419 /* Don't allow changes in element size: lane 0 of the new vector
17420 would not then be lane 0 of the old vector. See the comment
17421 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17422 description.
17424 In the worst case, this forces a register to be spilled in
17425 one mode and reloaded in the other, which handles the
17426 endianness correctly. */
17427 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17428 return false;
17430 return true;
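/* Illustrative cases for the hook above on big-endian (the mode names are
   just examples): VNx4SI <-> VNx4SF is allowed because both keep 4-byte
   elements, VNx4SI <-> VNx8HI is rejected because the element size changes
   from 4 to 2, and VNx4SI <-> TImode is rejected because it mixes an SVE
   data mode with a non-SVE mode.  On little-endian all of these are
   allowed.  */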
17433 /* Implement TARGET_EARLY_REMAT_MODES. */
17435 static void
17436 aarch64_select_early_remat_modes (sbitmap modes)
17438 /* SVE values are not normally live across a call, so it should be
17439 worth doing early rematerialization even in VL-specific mode. */
17440 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17442 machine_mode mode = (machine_mode) i;
17443 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17444 if (vec_flags & VEC_ANY_SVE)
17445 bitmap_set_bit (modes, i);
17449 /* Override the default target speculation_safe_value. */
17450 static rtx
17451 aarch64_speculation_safe_value (machine_mode mode,
17452 rtx result, rtx val, rtx failval)
17454 /* Maybe we should warn if falling back to hard barriers. They are
17455 likely to be noticeably more expensive than the alternative below. */
17456 if (!aarch64_track_speculation)
17457 return default_speculation_safe_value (mode, result, val, failval);
17459 if (!REG_P (val))
17460 val = copy_to_mode_reg (mode, val);
17462 if (!aarch64_reg_or_zero (failval, mode))
17463 failval = copy_to_mode_reg (mode, failval);
17465 switch (mode)
17467 case E_QImode:
17468 emit_insn (gen_despeculate_copyqi (result, val, failval));
17469 break;
17470 case E_HImode:
17471 emit_insn (gen_despeculate_copyhi (result, val, failval));
17472 break;
17473 case E_SImode:
17474 emit_insn (gen_despeculate_copysi (result, val, failval));
17475 break;
17476 case E_DImode:
17477 emit_insn (gen_despeculate_copydi (result, val, failval));
17478 break;
17479 case E_TImode:
17480 emit_insn (gen_despeculate_copyti (result, val, failval));
17481 break;
17482 default:
17483 gcc_unreachable ();
17485 return result;
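/* Sketch of the intended use, with hypothetical user code (the precise
   instruction sequence comes from the despeculate_copy<mode> patterns):

	int val = __builtin_speculation_safe_value (x, 0);

   compiled with -mtrack-speculation this emits a conditional select against
   the speculation-tracking register followed by a CSDB barrier, so VAL is
   forced to the failval on a mis-speculated path; without the option we
   simply fall back to default_speculation_safe_value and its generic
   speculation barrier.  */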
17488 /* Target-specific selftests. */
17490 #if CHECKING_P
17492 namespace selftest {
17494 /* Selftest for the RTL loader.
17495 Verify that the RTL loader copes with a dump from
17496 print_rtx_function. This is essentially just a test that class
17497 function_reader can handle a real dump, but it also verifies
17498 that lookup_reg_by_dump_name correctly handles hard regs.
17499 The presence of hard reg names in the dump means that the test is
17500 target-specific, hence it is in this file. */
17502 static void
17503 aarch64_test_loading_full_dump ()
17505 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17507 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17509 rtx_insn *insn_1 = get_insn_by_uid (1);
17510 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17512 rtx_insn *insn_15 = get_insn_by_uid (15);
17513 ASSERT_EQ (INSN, GET_CODE (insn_15));
17514 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17516 /* Verify crtl->return_rtx. */
17517 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17518 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17519 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17522 /* Run all target-specific selftests. */
17524 static void
17525 aarch64_run_selftests (void)
17527 aarch64_test_loading_full_dump ();
17530 } // namespace selftest
17532 #endif /* #if CHECKING_P */
17534 #undef TARGET_ADDRESS_COST
17535 #define TARGET_ADDRESS_COST aarch64_address_cost
17537 /* This hook determines whether unnamed bitfields affect the alignment
17538 of the containing structure. The hook returns true if the structure
17539 should inherit the alignment requirements of an unnamed bitfield's
17540 type. */
17541 #undef TARGET_ALIGN_ANON_BITFIELD
17542 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17544 #undef TARGET_ASM_ALIGNED_DI_OP
17545 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17547 #undef TARGET_ASM_ALIGNED_HI_OP
17548 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17550 #undef TARGET_ASM_ALIGNED_SI_OP
17551 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17553 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17554 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17555 hook_bool_const_tree_hwi_hwi_const_tree_true
17557 #undef TARGET_ASM_FILE_START
17558 #define TARGET_ASM_FILE_START aarch64_start_file
17560 #undef TARGET_ASM_OUTPUT_MI_THUNK
17561 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17563 #undef TARGET_ASM_SELECT_RTX_SECTION
17564 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17566 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17567 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17569 #undef TARGET_BUILD_BUILTIN_VA_LIST
17570 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17572 #undef TARGET_CALLEE_COPIES
17573 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17575 #undef TARGET_CAN_ELIMINATE
17576 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17578 #undef TARGET_CAN_INLINE_P
17579 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17581 #undef TARGET_CANNOT_FORCE_CONST_MEM
17582 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17584 #undef TARGET_CASE_VALUES_THRESHOLD
17585 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17587 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17588 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17590 /* Only the least significant bit is used for initialization guard
17591 variables. */
17592 #undef TARGET_CXX_GUARD_MASK_BIT
17593 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17595 #undef TARGET_C_MODE_FOR_SUFFIX
17596 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17598 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17599 #undef TARGET_DEFAULT_TARGET_FLAGS
17600 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17601 #endif
17603 #undef TARGET_CLASS_MAX_NREGS
17604 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17606 #undef TARGET_BUILTIN_DECL
17607 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17609 #undef TARGET_BUILTIN_RECIPROCAL
17610 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17612 #undef TARGET_C_EXCESS_PRECISION
17613 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17615 #undef TARGET_EXPAND_BUILTIN
17616 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17618 #undef TARGET_EXPAND_BUILTIN_VA_START
17619 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17621 #undef TARGET_FOLD_BUILTIN
17622 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17624 #undef TARGET_FUNCTION_ARG
17625 #define TARGET_FUNCTION_ARG aarch64_function_arg
17627 #undef TARGET_FUNCTION_ARG_ADVANCE
17628 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17630 #undef TARGET_FUNCTION_ARG_BOUNDARY
17631 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17633 #undef TARGET_FUNCTION_ARG_PADDING
17634 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17636 #undef TARGET_GET_RAW_RESULT_MODE
17637 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17638 #undef TARGET_GET_RAW_ARG_MODE
17639 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17641 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17642 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17644 #undef TARGET_FUNCTION_VALUE
17645 #define TARGET_FUNCTION_VALUE aarch64_function_value
17647 #undef TARGET_FUNCTION_VALUE_REGNO_P
17648 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17650 #undef TARGET_GIMPLE_FOLD_BUILTIN
17651 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17653 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17654 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17656 #undef TARGET_INIT_BUILTINS
17657 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17659 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17660 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17661 aarch64_ira_change_pseudo_allocno_class
17663 #undef TARGET_LEGITIMATE_ADDRESS_P
17664 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17666 #undef TARGET_LEGITIMATE_CONSTANT_P
17667 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17669 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17670 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17671 aarch64_legitimize_address_displacement
17673 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17674 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17676 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17677 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17678 aarch64_libgcc_floating_mode_supported_p
17680 #undef TARGET_MANGLE_TYPE
17681 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17683 #undef TARGET_MEMORY_MOVE_COST
17684 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17686 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17687 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17689 #undef TARGET_MUST_PASS_IN_STACK
17690 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17692 /* This target hook should return true if accesses to volatile bitfields
17693 should use the narrowest mode possible. It should return false if these
17694 accesses should use the bitfield container type. */
17695 #undef TARGET_NARROW_VOLATILE_BITFIELD
17696 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17698 #undef TARGET_OPTION_OVERRIDE
17699 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17701 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17702 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17703 aarch64_override_options_after_change
17705 #undef TARGET_OPTION_SAVE
17706 #define TARGET_OPTION_SAVE aarch64_option_save
17708 #undef TARGET_OPTION_RESTORE
17709 #define TARGET_OPTION_RESTORE aarch64_option_restore
17711 #undef TARGET_OPTION_PRINT
17712 #define TARGET_OPTION_PRINT aarch64_option_print
17714 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17715 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17717 #undef TARGET_SET_CURRENT_FUNCTION
17718 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17720 #undef TARGET_PASS_BY_REFERENCE
17721 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17723 #undef TARGET_PREFERRED_RELOAD_CLASS
17724 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17726 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17727 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17729 #undef TARGET_PROMOTED_TYPE
17730 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17732 #undef TARGET_SECONDARY_RELOAD
17733 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17735 #undef TARGET_SHIFT_TRUNCATION_MASK
17736 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17738 #undef TARGET_SETUP_INCOMING_VARARGS
17739 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17741 #undef TARGET_STRUCT_VALUE_RTX
17742 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17744 #undef TARGET_REGISTER_MOVE_COST
17745 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17747 #undef TARGET_RETURN_IN_MEMORY
17748 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17750 #undef TARGET_RETURN_IN_MSB
17751 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17753 #undef TARGET_RTX_COSTS
17754 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17756 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17757 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17759 #undef TARGET_SCHED_ISSUE_RATE
17760 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17762 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17763 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17764 aarch64_sched_first_cycle_multipass_dfa_lookahead
17766 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17767 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17768 aarch64_first_cycle_multipass_dfa_lookahead_guard
17770 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17771 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17772 aarch64_get_separate_components
17774 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17775 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17776 aarch64_components_for_bb
17778 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17779 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17780 aarch64_disqualify_components
17782 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17783 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17784 aarch64_emit_prologue_components
17786 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17787 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17788 aarch64_emit_epilogue_components
17790 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17791 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17792 aarch64_set_handled_components
17794 #undef TARGET_TRAMPOLINE_INIT
17795 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17797 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17798 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17800 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17801 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17803 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17804 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17805 aarch64_builtin_support_vector_misalignment
17807 #undef TARGET_ARRAY_MODE
17808 #define TARGET_ARRAY_MODE aarch64_array_mode
17810 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17811 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17813 #undef TARGET_VECTORIZE_ADD_STMT_COST
17814 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17816 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17817 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17818 aarch64_builtin_vectorization_cost
17820 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17821 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17823 #undef TARGET_VECTORIZE_BUILTINS
17824 #define TARGET_VECTORIZE_BUILTINS
17826 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17827 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17828 aarch64_builtin_vectorized_function
17830 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17831 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17832 aarch64_autovectorize_vector_sizes
17834 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17835 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17836 aarch64_atomic_assign_expand_fenv
17838 /* Section anchor support. */
17840 #undef TARGET_MIN_ANCHOR_OFFSET
17841 #define TARGET_MIN_ANCHOR_OFFSET -256
17843 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17844 byte offset; we can do much more for larger data types, but have no way
17845 to determine the size of the access. We assume accesses are aligned. */
17846 #undef TARGET_MAX_ANCHOR_OFFSET
17847 #define TARGET_MAX_ANCHOR_OFFSET 4095
17849 #undef TARGET_VECTOR_ALIGNMENT
17850 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17852 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17853 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17854 aarch64_vectorize_preferred_vector_alignment
17855 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17856 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17857 aarch64_simd_vector_alignment_reachable
17859 /* vec_perm support. */
17861 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17862 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17863 aarch64_vectorize_vec_perm_const
17865 #undef TARGET_VECTORIZE_GET_MASK_MODE
17866 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17867 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17868 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17869 aarch64_empty_mask_is_expensive
17870 #undef TARGET_PREFERRED_ELSE_VALUE
17871 #define TARGET_PREFERRED_ELSE_VALUE \
17872 aarch64_preferred_else_value
17874 #undef TARGET_INIT_LIBFUNCS
17875 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17877 #undef TARGET_FIXED_CONDITION_CODE_REGS
17878 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17880 #undef TARGET_FLAGS_REGNUM
17881 #define TARGET_FLAGS_REGNUM CC_REGNUM
17883 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17884 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17886 #undef TARGET_ASAN_SHADOW_OFFSET
17887 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17889 #undef TARGET_LEGITIMIZE_ADDRESS
17890 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17892 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17893 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17895 #undef TARGET_CAN_USE_DOLOOP_P
17896 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17898 #undef TARGET_SCHED_ADJUST_PRIORITY
17899 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17901 #undef TARGET_SCHED_MACRO_FUSION_P
17902 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17904 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17905 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17907 #undef TARGET_SCHED_FUSION_PRIORITY
17908 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17910 #undef TARGET_UNSPEC_MAY_TRAP_P
17911 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17913 #undef TARGET_USE_PSEUDO_PIC_REG
17914 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17916 #undef TARGET_PRINT_OPERAND
17917 #define TARGET_PRINT_OPERAND aarch64_print_operand
17919 #undef TARGET_PRINT_OPERAND_ADDRESS
17920 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17922 #undef TARGET_OPTAB_SUPPORTED_P
17923 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17925 #undef TARGET_OMIT_STRUCT_RETURN_REG
17926 #define TARGET_OMIT_STRUCT_RETURN_REG true
17928 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17929 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17930 aarch64_dwarf_poly_indeterminate_value
17932 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
17933 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17934 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17936 #undef TARGET_HARD_REGNO_NREGS
17937 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17938 #undef TARGET_HARD_REGNO_MODE_OK
17939 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17941 #undef TARGET_MODES_TIEABLE_P
17942 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17944 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17945 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17946 aarch64_hard_regno_call_part_clobbered
17948 #undef TARGET_CONSTANT_ALIGNMENT
17949 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17951 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17952 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17954 #undef TARGET_CAN_CHANGE_MODE_CLASS
17955 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17957 #undef TARGET_SELECT_EARLY_REMAT_MODES
17958 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17960 #undef TARGET_SPECULATION_SAFE_VALUE
17961 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
17963 #if CHECKING_P
17964 #undef TARGET_RUN_TARGET_SELFTESTS
17965 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17966 #endif /* #if CHECKING_P */
17968 struct gcc_target targetm = TARGET_INITIALIZER;
17970 #include "gt-aarch64.h"