[AArch64] Add SVE support
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
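/* Illustrative examples (editorial sketch, not part of the original source):
   typical assembly forms for each class, assuming standard AArch64 syntax:

     ADDRESS_REG_IMM    ldr  x0, [x1, #16]
     ADDRESS_REG_WB     ldr  x0, [x1, #16]!     or   ldr  x0, [x1], #16
     ADDRESS_REG_REG    ldr  x0, [x1, x2, lsl #3]
     ADDRESS_REG_UXTW   ldr  x0, [x1, w2, uxtw #3]
     ADDRESS_REG_SXTW   ldr  x0, [x1, w2, sxtw #3]
     ADDRESS_LO_SUM     ldr  x0, [x1, #:lo12:foo]
     ADDRESS_SYMBOLIC   ldr  x0, .Lliteral_pool_entry  */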
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
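/* Illustrative usage (editorial note, not part of the original source),
   based on the constructors defined below: an Advanced SIMD V4SI constant
   { 3, 3, 3, 3 } could be summarized as simd_immediate_info (SImode, 3),
   while an SVE "index" series { 0, 1, 2, ... } could be summarized as
   simd_immediate_info (SImode, const0_rtx, const1_rtx).  */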
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
227 const char* name;
228 unsigned int flag;
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
251 /* Tuning parameters. */
253 static const struct cpu_addrcost_table generic_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
317 static const struct cpu_regmove_cost generic_regmove_cost =
319 1, /* GP2GP */
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
322 5, /* GP2FP */
323 5, /* FP2GP */
324 2 /* FP2FP */
327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
332 5, /* GP2FP */
333 5, /* FP2GP */
334 2 /* FP2FP */
337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (actual, 4 and 9). */
352 9, /* GP2FP */
353 9, /* FP2GP */
354 1 /* FP2FP */
357 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 2, /* GP2GP */
360 2, /* GP2FP */
361 6, /* FP2GP */
362 4 /* FP2FP */
365 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 8, /* GP2FP */
371 8, /* FP2GP */
372 2 /* FP2FP */
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 2, /* GP2GP */
378 /* Avoid the use of int<->fp moves for spilling. */
379 6, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of int<->fp moves for spilling. */
388 8, /* GP2FP */
389 8, /* FP2GP */
390 4 /* FP2FP */
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost =
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
433 /* Generic costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
453 static const struct cpu_vector_cost exynosm1_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* Generic costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost =
515 1, /* Predictable. */
516 3 /* Unpredictable. */
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes =
522 AARCH64_APPROX_NONE, /* division */
523 AARCH64_APPROX_NONE, /* sqrt */
524 AARCH64_APPROX_NONE /* recip_sqrt */
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes =
530 AARCH64_APPROX_NONE, /* division */
531 AARCH64_APPROX_ALL, /* sqrt */
532 AARCH64_APPROX_ALL /* recip_sqrt */
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes =
538 AARCH64_APPROX_NONE, /* division */
539 AARCH64_APPROX_NONE, /* sqrt */
540 AARCH64_APPROX_ALL /* recip_sqrt */
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune =
546 0, /* num_slots */
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
564 4, /* num_slots */
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 1024, /* l2_cache_size */
568 -1 /* default_opt_level */
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
573 8, /* num_slots */
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
580 static const cpu_prefetch_tune thunderx_prefetch_tune =
582 8, /* num_slots */
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
591 8, /* num_slots */
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
598 static const struct tune_params generic_tunings =
600 &cortexa57_extra_costs,
601 &generic_addrcost_table,
602 &generic_regmove_cost,
603 &generic_vector_cost,
604 &generic_branch_cost,
605 &generic_approx_modes,
606 4, /* memmov_cost */
607 2, /* issue_rate */
608 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
609 8, /* function_align. */
610 4, /* jump_align. */
611 8, /* loop_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
620 &generic_prefetch_tune
623 static const struct tune_params cortexa35_tunings =
625 &cortexa53_extra_costs,
626 &generic_addrcost_table,
627 &cortexa53_regmove_cost,
628 &generic_vector_cost,
629 &generic_branch_cost,
630 &generic_approx_modes,
631 4, /* memmov_cost */
632 1, /* issue_rate */
633 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
635 16, /* function_align. */
636 4, /* jump_align. */
637 8, /* loop_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
646 &generic_prefetch_tune
649 static const struct tune_params cortexa53_tunings =
651 &cortexa53_extra_costs,
652 &generic_addrcost_table,
653 &cortexa53_regmove_cost,
654 &generic_vector_cost,
655 &generic_branch_cost,
656 &generic_approx_modes,
657 4, /* memmov_cost */
658 2, /* issue_rate */
659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
661 16, /* function_align. */
662 4, /* jump_align. */
663 8, /* loop_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
672 &generic_prefetch_tune
675 static const struct tune_params cortexa57_tunings =
677 &cortexa57_extra_costs,
678 &generic_addrcost_table,
679 &cortexa57_regmove_cost,
680 &cortexa57_vector_cost,
681 &generic_branch_cost,
682 &generic_approx_modes,
683 4, /* memmov_cost */
684 3, /* issue_rate */
685 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
687 16, /* function_align. */
688 4, /* jump_align. */
689 8, /* loop_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
698 &generic_prefetch_tune
701 static const struct tune_params cortexa72_tunings =
703 &cortexa57_extra_costs,
704 &generic_addrcost_table,
705 &cortexa57_regmove_cost,
706 &cortexa57_vector_cost,
707 &generic_branch_cost,
708 &generic_approx_modes,
709 4, /* memmov_cost */
710 3, /* issue_rate */
711 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
713 16, /* function_align. */
714 4, /* jump_align. */
715 8, /* loop_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
724 &generic_prefetch_tune
727 static const struct tune_params cortexa73_tunings =
729 &cortexa57_extra_costs,
730 &generic_addrcost_table,
731 &cortexa57_regmove_cost,
732 &cortexa57_vector_cost,
733 &generic_branch_cost,
734 &generic_approx_modes,
735 4, /* memmov_cost. */
736 2, /* issue_rate. */
737 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
739 16, /* function_align. */
740 4, /* jump_align. */
741 8, /* loop_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
750 &generic_prefetch_tune
755 static const struct tune_params exynosm1_tunings =
757 &exynosm1_extra_costs,
758 &exynosm1_addrcost_table,
759 &exynosm1_regmove_cost,
760 &exynosm1_vector_cost,
761 &generic_branch_cost,
762 &exynosm1_approx_modes,
763 4, /* memmov_cost */
764 3, /* issue_rate */
765 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
766 4, /* function_align. */
767 4, /* jump_align. */
768 4, /* loop_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
777 &exynosm1_prefetch_tune
780 static const struct tune_params thunderxt88_tunings =
782 &thunderx_extra_costs,
783 &generic_addrcost_table,
784 &thunderx_regmove_cost,
785 &thunderx_vector_cost,
786 &generic_branch_cost,
787 &generic_approx_modes,
788 6, /* memmov_cost */
789 2, /* issue_rate */
790 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
791 8, /* function_align. */
792 8, /* jump_align. */
793 8, /* loop_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
802 &thunderxt88_prefetch_tune
805 static const struct tune_params thunderx_tunings =
807 &thunderx_extra_costs,
808 &generic_addrcost_table,
809 &thunderx_regmove_cost,
810 &thunderx_vector_cost,
811 &generic_branch_cost,
812 &generic_approx_modes,
813 6, /* memmov_cost */
814 2, /* issue_rate */
815 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
816 8, /* function_align. */
817 8, /* jump_align. */
818 8, /* loop_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
828 &thunderx_prefetch_tune
831 static const struct tune_params xgene1_tunings =
833 &xgene1_extra_costs,
834 &xgene1_addrcost_table,
835 &xgene1_regmove_cost,
836 &xgene1_vector_cost,
837 &generic_branch_cost,
838 &xgene1_approx_modes,
839 6, /* memmov_cost */
840 4, /* issue_rate */
841 AARCH64_FUSE_NOTHING, /* fusible_ops */
842 16, /* function_align. */
843 8, /* jump_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
853 &generic_prefetch_tune
856 static const struct tune_params qdf24xx_tunings =
858 &qdf24xx_extra_costs,
859 &generic_addrcost_table,
860 &qdf24xx_regmove_cost,
861 &generic_vector_cost,
862 &generic_branch_cost,
863 &generic_approx_modes,
864 4, /* memmov_cost */
865 4, /* issue_rate */
866 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
867        | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
868 16, /* function_align. */
869 8, /* jump_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
879 &qdf24xx_prefetch_tune
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
883 for now. */
884 static const struct tune_params saphira_tunings =
886 &generic_extra_costs,
887 &generic_addrcost_table,
888 &generic_regmove_cost,
889 &generic_vector_cost,
890 &generic_branch_cost,
891 &generic_approx_modes,
892 4, /* memmov_cost */
893 4, /* issue_rate */
894 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
895        | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
896 16, /* function_align. */
897 8, /* jump_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
907 &generic_prefetch_tune
910 static const struct tune_params thunderx2t99_tunings =
912 &thunderx2t99_extra_costs,
913 &thunderx2t99_addrcost_table,
914 &thunderx2t99_regmove_cost,
915 &thunderx2t99_vector_cost,
916 &generic_branch_cost,
917 &generic_approx_modes,
918 4, /* memmov_cost. */
919 4, /* issue_rate. */
920 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
922 16, /* function_align. */
923 8, /* jump_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
939 const char* name;
940 void (*parse_override)(const char*, struct tune_params*);
943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions[] =
949 { "fuse", aarch64_parse_fuse_string },
950 { "tune", aarch64_parse_tune_string },
951 { NULL, NULL }
954 /* A processor implementing AArch64. */
955 struct processor
957 const char *const name;
958 enum aarch64_processor ident;
959 enum aarch64_processor sched_core;
960 enum aarch64_arch arch;
961 unsigned architecture_version;
962 const unsigned long flags;
963 const struct tune_params *const tune;
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures[] =
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores[] =
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
984 AARCH64_FL_FOR_ARCH8, &generic_tunings},
985 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor *selected_arch;
992 static const struct processor *selected_cpu;
993 static const struct processor *selected_tune;
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params = generic_tunings;
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1003 const char *const name;
1004 const unsigned long flags_on;
1005 const unsigned long flags_off;
1008 typedef enum aarch64_cond_code
1010 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1014 aarch64_cc;
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes[] =
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028 const char * branch_format)
1030 rtx_code_label * tmp_label = gen_label_rtx ();
1031 char label_buf[256];
1032 char buffer[128];
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034 CODE_LABEL_NUMBER (tmp_label));
1035 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036 rtx dest_label = operands[pos_label];
1037 operands[pos_label] = tmp_label;
1039 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040 output_asm_insn (buffer, operands);
1042 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043 operands[pos_label] = dest_label;
1044 output_asm_insn (buffer, operands);
1045 return "";
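/* Editorial sketch of the emitted sequence (illustrative only): when the
   target of a conditional branch is out of the short-form range (+/-1 MiB
   for B.cond and CB(N)Z, +/-32 KiB for TB(N)Z), the caller typically passes
   the inverted condition in BRANCH_FORMAT and this routine emits roughly:

       tbnz  x3, #2, .Lfb7     ; BRANCH_FORMAT + generated local label
       b     .Lfar_target      ; unconditional branch, +/-128 MiB range
     .Lfb7:                                                              */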
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1051 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054 else
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespectively of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075 reg_class_t best_class)
1077 machine_mode mode;
1079 if (allocno_class != ALL_REGS)
1080 return allocno_class;
1082 if (best_class != ALL_REGS)
1083 return best_class;
1085 mode = PSEUDO_REGNO_MODE (regno);
1086 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
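/* For example (editorial note): under this hook a DFmode or V4SImode
   pseudo whose allocno class and best class are both ALL_REGS is given
   FP_REGS, while a DImode pseudo in the same situation is given
   GENERAL_REGS.  */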
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1092 if (GET_MODE_UNIT_SIZE (mode) == 4)
1093 return aarch64_tune_params.min_div_recip_mul_sf;
1094 return aarch64_tune_params.min_div_recip_mul_df;
1097 static int
1098 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1099 machine_mode mode)
1101 if (VECTOR_MODE_P (mode))
1102 return aarch64_tune_params.vec_reassoc_width;
1103 if (INTEGRAL_MODE_P (mode))
1104 return aarch64_tune_params.int_reassoc_width;
1105 if (FLOAT_MODE_P (mode))
1106 return aarch64_tune_params.fp_reassoc_width;
1107 return 1;
1110 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1111 unsigned
1112 aarch64_dbx_register_number (unsigned regno)
1114 if (GP_REGNUM_P (regno))
1115 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1116 else if (regno == SP_REGNUM)
1117 return AARCH64_DWARF_SP;
1118 else if (FP_REGNUM_P (regno))
1119 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1120 else if (PR_REGNUM_P (regno))
1121 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1122 else if (regno == VG_REGNUM)
1123 return AARCH64_DWARF_VG;
1125 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1126 equivalent DWARF register. */
1127 return DWARF_FRAME_REGISTERS;
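/* For example (editorial note): x5 maps to AARCH64_DWARF_R0 + 5,
   v3 to AARCH64_DWARF_V0 + 3 and p7 to AARCH64_DWARF_P0 + 7, while a
   register with no DWARF equivalent (such as the condition flags)
   yields DWARF_FRAME_REGISTERS.  */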
1130 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1131 static bool
1132 aarch64_advsimd_struct_mode_p (machine_mode mode)
1134 return (TARGET_SIMD
1135 && (mode == OImode || mode == CImode || mode == XImode));
1138 /* Return true if MODE is an SVE predicate mode. */
1139 static bool
1140 aarch64_sve_pred_mode_p (machine_mode mode)
1142 return (TARGET_SVE
1143 && (mode == VNx16BImode
1144 || mode == VNx8BImode
1145 || mode == VNx4BImode
1146 || mode == VNx2BImode));
1149 /* Three mutually-exclusive flags describing a vector or predicate type. */
1150 const unsigned int VEC_ADVSIMD = 1;
1151 const unsigned int VEC_SVE_DATA = 2;
1152 const unsigned int VEC_SVE_PRED = 4;
1153 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1154 a structure of 2, 3 or 4 vectors. */
1155 const unsigned int VEC_STRUCT = 8;
1156 /* Useful combinations of the above. */
1157 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1158 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1160 /* Return a set of flags describing the vector properties of mode MODE.
1161 Ignore modes that are not supported by the current target. */
1162 static unsigned int
1163 aarch64_classify_vector_mode (machine_mode mode)
1165 if (aarch64_advsimd_struct_mode_p (mode))
1166 return VEC_ADVSIMD | VEC_STRUCT;
1168 if (aarch64_sve_pred_mode_p (mode))
1169 return VEC_SVE_PRED;
1171 scalar_mode inner = GET_MODE_INNER (mode);
1172 if (VECTOR_MODE_P (mode)
1173 && (inner == QImode
1174 || inner == HImode
1175 || inner == HFmode
1176 || inner == SImode
1177 || inner == SFmode
1178 || inner == DImode
1179 || inner == DFmode))
1181 if (TARGET_SVE
1182 && known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1183 return VEC_SVE_DATA;
1185 /* This includes V1DF but not V1DI (which doesn't exist). */
1186 if (TARGET_SIMD
1187 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1188 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1189 return VEC_ADVSIMD;
1192 return 0;
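/* For example (editorial note): V16QImode and V2DFmode classify as
   VEC_ADVSIMD, OImode (a pair of Advanced SIMD Q registers) as
   VEC_ADVSIMD | VEC_STRUCT, VNx4SImode as VEC_SVE_DATA and VNx4BImode
   as VEC_SVE_PRED (when SVE is enabled), while scalar modes and
   unsupported vector modes return 0.  */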
1195 /* Return true if MODE is any of the data vector modes, including
1196 structure modes. */
1197 static bool
1198 aarch64_vector_data_mode_p (machine_mode mode)
1200 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1203 /* Return true if MODE is an SVE data vector mode; either a single vector
1204 or a structure of vectors. */
1205 static bool
1206 aarch64_sve_data_mode_p (machine_mode mode)
1208 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1211 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1212 static bool
1213 aarch64_array_mode_supported_p (machine_mode mode,
1214 unsigned HOST_WIDE_INT nelems)
1216 if (TARGET_SIMD
1217 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1218 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1219 && (nelems >= 2 && nelems <= 4))
1220 return true;
1222 return false;
1225 /* Return the SVE predicate mode to use for elements that have
1226 ELEM_NBYTES bytes, if such a mode exists. */
1228 opt_machine_mode
1229 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1231 if (TARGET_SVE)
1233 if (elem_nbytes == 1)
1234 return VNx16BImode;
1235 if (elem_nbytes == 2)
1236 return VNx8BImode;
1237 if (elem_nbytes == 4)
1238 return VNx4BImode;
1239 if (elem_nbytes == 8)
1240 return VNx2BImode;
1242 return opt_machine_mode ();
1245 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1247 static opt_machine_mode
1248 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1250 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1252 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1253 machine_mode pred_mode;
1254 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1255 return pred_mode;
1258 return default_get_mask_mode (nunits, nbytes);
1261 /* Implement TARGET_HARD_REGNO_NREGS. */
1263 static unsigned int
1264 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1266 /* ??? Logically we should only need to provide a value when
1267 HARD_REGNO_MODE_OK says that the combination is valid,
1268 but at the moment we need to handle all modes. Just ignore
1269 any runtime parts for registers that can't store them. */
1270 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1271 switch (aarch64_regno_regclass (regno))
1273 case FP_REGS:
1274 case FP_LO_REGS:
1275 if (aarch64_sve_data_mode_p (mode))
1276 return exact_div (GET_MODE_SIZE (mode),
1277 BYTES_PER_SVE_VECTOR).to_constant ();
1278 return CEIL (lowest_size, UNITS_PER_VREG);
1279 case PR_REGS:
1280 case PR_LO_REGS:
1281 case PR_HI_REGS:
1282 return 1;
1283 default:
1284 return CEIL (lowest_size, UNITS_PER_WORD);
1286 gcc_unreachable ();
1289 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1291 static bool
1292 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1294 if (GET_MODE_CLASS (mode) == MODE_CC)
1295 return regno == CC_REGNUM;
1297 if (regno == VG_REGNUM)
1298 /* This must have the same size as _Unwind_Word. */
1299 return mode == DImode;
1301 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1302 if (vec_flags & VEC_SVE_PRED)
1303 return PR_REGNUM_P (regno);
1305 if (PR_REGNUM_P (regno))
1306 return 0;
1308 if (regno == SP_REGNUM)
1309 /* The purpose of comparing with ptr_mode is to support the
1310 global register variable associated with the stack pointer
1311 register via the syntax of asm ("wsp") in ILP32. */
1312 return mode == Pmode || mode == ptr_mode;
1314 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1315 return mode == Pmode;
1317 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1318 return true;
1320 if (FP_REGNUM_P (regno))
1322 if (vec_flags & VEC_STRUCT)
1323 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1324 else
1325 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1328 return false;
1331 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1332 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1333 clobbers the top 64 bits when restoring the bottom 64 bits. */
1335 static bool
1336 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1338 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1341 /* Implement REGMODE_NATURAL_SIZE. */
1342 poly_uint64
1343 aarch64_regmode_natural_size (machine_mode mode)
1345 /* The natural size for SVE data modes is one SVE data vector,
1346 and similarly for predicates. We can't independently modify
1347 anything smaller than that. */
1348 /* ??? For now, only do this for variable-width SVE registers.
1349 Doing it for constant-sized registers breaks lower-subreg.c. */
1350 /* ??? And once that's fixed, we should probably have similar
1351 code for Advanced SIMD. */
1352 if (!aarch64_sve_vg.is_constant ())
1354 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1355 if (vec_flags & VEC_SVE_PRED)
1356 return BYTES_PER_SVE_PRED;
1357 if (vec_flags & VEC_SVE_DATA)
1358 return BYTES_PER_SVE_VECTOR;
1360 return UNITS_PER_WORD;
1363 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1364 machine_mode
1365 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1366 machine_mode mode)
1368 /* The predicate mode determines which bits are significant and
1369 which are "don't care". Decreasing the number of lanes would
1370 lose data while increasing the number of lanes would make bits
1371 unnecessarily significant. */
1372 if (PR_REGNUM_P (regno))
1373 return mode;
1374 if (known_ge (GET_MODE_SIZE (mode), 4))
1375 return mode;
1376 else
1377 return SImode;
1380 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1381 that strcpy from constants will be faster. */
1383 static HOST_WIDE_INT
1384 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1386 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1387 return MAX (align, BITS_PER_WORD);
1388 return align;
1391 /* Return true if calls to DECL should be treated as
1392 long-calls (ie called via a register). */
1393 static bool
1394 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1396 return false;
1399 /* Return true if calls to symbol-ref SYM should be treated as
1400 long-calls (ie called via a register). */
1401 bool
1402 aarch64_is_long_call_p (rtx sym)
1404 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1407 /* Return true if calls to symbol-ref SYM should not go through
1408 plt stubs. */
1410 bool
1411 aarch64_is_noplt_call_p (rtx sym)
1413 const_tree decl = SYMBOL_REF_DECL (sym);
1415 if (flag_pic
1416 && decl
1417 && (!flag_plt
1418 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1419 && !targetm.binds_local_p (decl))
1420 return true;
1422 return false;
1425 /* Return true if the offsets to a zero/sign-extract operation
1426 represent an expression that matches an extend operation. The
1427    operands represent the parameters from
1429 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1430 bool
1431 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1432 rtx extract_imm)
1434 HOST_WIDE_INT mult_val, extract_val;
1436 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1437 return false;
1439 mult_val = INTVAL (mult_imm);
1440 extract_val = INTVAL (extract_imm);
1442 if (extract_val > 8
1443 && extract_val < GET_MODE_BITSIZE (mode)
1444 && exact_log2 (extract_val & ~7) > 0
1445 && (extract_val & 7) <= 4
1446 && mult_val == (1 << (extract_val & 7)))
1447 return true;
1449 return false;
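/* Editorial example (illustrative, assuming the usual encoding of these
   operands): with MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34,
   the low three bits of EXTRACT_IMM give the shift amount (2) and the
   remaining bits give the source width (32), and MULT_IMM == 1 << 2, so
   the extract is equivalent to a 32-to-64-bit extend followed by a left
   shift of 2, as in extended-register forms like "add x0, x1, w2, sxtw #2".  */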
1452 /* Emit an insn that's a simple single-set. Both the operands must be
1453 known to be valid. */
1454 inline static rtx_insn *
1455 emit_set_insn (rtx x, rtx y)
1457 return emit_insn (gen_rtx_SET (x, y));
1460 /* X and Y are two things to compare using CODE. Emit the compare insn and
1461 return the rtx for register 0 in the proper mode. */
1462 rtx
1463 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1465 machine_mode mode = SELECT_CC_MODE (code, x, y);
1466 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1468 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1469 return cc_reg;
1472 /* Build the SYMBOL_REF for __tls_get_addr. */
1474 static GTY(()) rtx tls_get_addr_libfunc;
1476 rtx
1477 aarch64_tls_get_addr (void)
1479 if (!tls_get_addr_libfunc)
1480 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1481 return tls_get_addr_libfunc;
1484 /* Return the TLS model to use for ADDR. */
1486 static enum tls_model
1487 tls_symbolic_operand_type (rtx addr)
1489 enum tls_model tls_kind = TLS_MODEL_NONE;
1490 if (GET_CODE (addr) == CONST)
1492 poly_int64 addend;
1493 rtx sym = strip_offset (addr, &addend);
1494 if (GET_CODE (sym) == SYMBOL_REF)
1495 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1497 else if (GET_CODE (addr) == SYMBOL_REF)
1498 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1500 return tls_kind;
1503 /* We allow LO_SUM in our legitimate addresses so that combine can
1504    take care of combining addresses where necessary, but for generation
1505    purposes we generate the address
1506 as :
1507 RTL Absolute
1508 tmp = hi (symbol_ref); adrp x1, foo
1509 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1512 PIC TLS
1513 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1514 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1515 bl __tls_get_addr
1518 Load TLS symbol, depending on TLS mechanism and TLS access model.
1520 Global Dynamic - Traditional TLS:
1521 adrp tmp, :tlsgd:imm
1522 add dest, tmp, #:tlsgd_lo12:imm
1523 bl __tls_get_addr
1525 Global Dynamic - TLS Descriptors:
1526 adrp dest, :tlsdesc:imm
1527 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1528 add dest, dest, #:tlsdesc_lo12:imm
1529 blr tmp
1530 mrs tp, tpidr_el0
1531 add dest, dest, tp
1533 Initial Exec:
1534 mrs tp, tpidr_el0
1535 adrp tmp, :gottprel:imm
1536 ldr dest, [tmp, #:gottprel_lo12:imm]
1537 add dest, dest, tp
1539 Local Exec:
1540 mrs tp, tpidr_el0
1541 add t0, tp, #:tprel_hi12:imm, lsl #12
1542 add t0, t0, #:tprel_lo12_nc:imm
1545 static void
1546 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1547 enum aarch64_symbol_type type)
1549 switch (type)
1551 case SYMBOL_SMALL_ABSOLUTE:
1553 /* In ILP32, the mode of dest can be either SImode or DImode. */
1554 rtx tmp_reg = dest;
1555 machine_mode mode = GET_MODE (dest);
1557 gcc_assert (mode == Pmode || mode == ptr_mode);
1559 if (can_create_pseudo_p ())
1560 tmp_reg = gen_reg_rtx (mode);
1562 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1563 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1564 return;
1567 case SYMBOL_TINY_ABSOLUTE:
1568 emit_insn (gen_rtx_SET (dest, imm));
1569 return;
1571 case SYMBOL_SMALL_GOT_28K:
1573 machine_mode mode = GET_MODE (dest);
1574 rtx gp_rtx = pic_offset_table_rtx;
1575 rtx insn;
1576 rtx mem;
1578 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1579         here before rtl expansion.  Tree IVOPTS will generate rtl patterns to
1580         decide rtx costs, in which case pic_offset_table_rtx is not
1581         initialized.  In that case there is no need to generate the first adrp
1582         instruction, as the final cost for global variable access is
1583         one instruction.  */
1584 if (gp_rtx != NULL)
1586           /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1587              use the page base as the GOT base, the first page may be wasted;
1588              in the worst case there is only 28K of space for the GOT).
1590              The generated instruction sequence for accessing a global variable is:
1593 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1595              Only one instruction is needed.  But we must initialize
1596              pic_offset_table_rtx properly.  We generate an initialization insn for
1597              every global access, and allow CSE to remove all redundant copies.
1599              The final instruction sequence will look like the following
1600              for multiple global variable accesses.
1602 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1604 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1605 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1606 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1607 ... */
1609 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1610 crtl->uses_pic_offset_table = 1;
1611 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1613 if (mode != GET_MODE (gp_rtx))
1614 gp_rtx = gen_lowpart (mode, gp_rtx);
1618 if (mode == ptr_mode)
1620 if (mode == DImode)
1621 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1622 else
1623 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1625 mem = XVECEXP (SET_SRC (insn), 0, 0);
1627 else
1629 gcc_assert (mode == Pmode);
1631 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1632 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1635         /* The operand is expected to be a MEM.  Whenever the related insn
1636            pattern changes, the code above that computes MEM should be
1637            updated.  */
1638 gcc_assert (GET_CODE (mem) == MEM);
1639 MEM_READONLY_P (mem) = 1;
1640 MEM_NOTRAP_P (mem) = 1;
1641 emit_insn (insn);
1642 return;
1645 case SYMBOL_SMALL_GOT_4G:
1647 /* In ILP32, the mode of dest can be either SImode or DImode,
1648 while the got entry is always of SImode size. The mode of
1649 dest depends on how dest is used: if dest is assigned to a
1650 pointer (e.g. in the memory), it has SImode; it may have
1651         DImode if dest is dereferenced to access the memory.
1652 This is why we have to handle three different ldr_got_small
1653 patterns here (two patterns for ILP32). */
1655 rtx insn;
1656 rtx mem;
1657 rtx tmp_reg = dest;
1658 machine_mode mode = GET_MODE (dest);
1660 if (can_create_pseudo_p ())
1661 tmp_reg = gen_reg_rtx (mode);
1663 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1664 if (mode == ptr_mode)
1666 if (mode == DImode)
1667 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1668 else
1669 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1671 mem = XVECEXP (SET_SRC (insn), 0, 0);
1673 else
1675 gcc_assert (mode == Pmode);
1677 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1678 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1681 gcc_assert (GET_CODE (mem) == MEM);
1682 MEM_READONLY_P (mem) = 1;
1683 MEM_NOTRAP_P (mem) = 1;
1684 emit_insn (insn);
1685 return;
1688 case SYMBOL_SMALL_TLSGD:
1690 rtx_insn *insns;
1691 machine_mode mode = GET_MODE (dest);
1692 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1694 start_sequence ();
1695 if (TARGET_ILP32)
1696 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1697 else
1698 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1699 insns = get_insns ();
1700 end_sequence ();
1702 RTL_CONST_CALL_P (insns) = 1;
1703 emit_libcall_block (insns, dest, result, imm);
1704 return;
1707 case SYMBOL_SMALL_TLSDESC:
1709 machine_mode mode = GET_MODE (dest);
1710 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1711 rtx tp;
1713 gcc_assert (mode == Pmode || mode == ptr_mode);
1715 /* In ILP32, the got entry is always of SImode size. Unlike
1716 small GOT, the dest is fixed at reg 0. */
1717 if (TARGET_ILP32)
1718 emit_insn (gen_tlsdesc_small_si (imm));
1719 else
1720 emit_insn (gen_tlsdesc_small_di (imm));
1721 tp = aarch64_load_tp (NULL);
1723 if (mode != Pmode)
1724 tp = gen_lowpart (mode, tp);
1726 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1727 if (REG_P (dest))
1728 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1729 return;
1732 case SYMBOL_SMALL_TLSIE:
1734 /* In ILP32, the mode of dest can be either SImode or DImode,
1735 while the got entry is always of SImode size. The mode of
1736 dest depends on how dest is used: if dest is assigned to a
1737 pointer (e.g. in the memory), it has SImode; it may have
1738            DImode if dest is dereferenced to access the memory.
1739 This is why we have to handle three different tlsie_small
1740 patterns here (two patterns for ILP32). */
1741 machine_mode mode = GET_MODE (dest);
1742 rtx tmp_reg = gen_reg_rtx (mode);
1743 rtx tp = aarch64_load_tp (NULL);
1745 if (mode == ptr_mode)
1747 if (mode == DImode)
1748 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1749 else
1751 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1752 tp = gen_lowpart (mode, tp);
1755 else
1757 gcc_assert (mode == Pmode);
1758 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1761 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1762 if (REG_P (dest))
1763 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1764 return;
1767 case SYMBOL_TLSLE12:
1768 case SYMBOL_TLSLE24:
1769 case SYMBOL_TLSLE32:
1770 case SYMBOL_TLSLE48:
1772 machine_mode mode = GET_MODE (dest);
1773 rtx tp = aarch64_load_tp (NULL);
1775 if (mode != Pmode)
1776 tp = gen_lowpart (mode, tp);
1778 switch (type)
1780 case SYMBOL_TLSLE12:
1781 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1782 (dest, tp, imm));
1783 break;
1784 case SYMBOL_TLSLE24:
1785 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1786 (dest, tp, imm));
1787 break;
1788 case SYMBOL_TLSLE32:
1789 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1790 (dest, imm));
1791 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1792 (dest, dest, tp));
1793 break;
1794 case SYMBOL_TLSLE48:
1795 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1796 (dest, imm));
1797 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1798 (dest, dest, tp));
1799 break;
1800 default:
1801 gcc_unreachable ();
1804 if (REG_P (dest))
1805 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1806 return;
1809 case SYMBOL_TINY_GOT:
1810 emit_insn (gen_ldr_got_tiny (dest, imm));
1811 return;
1813 case SYMBOL_TINY_TLSIE:
1815 machine_mode mode = GET_MODE (dest);
1816 rtx tp = aarch64_load_tp (NULL);
1818 if (mode == ptr_mode)
1820 if (mode == DImode)
1821 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1822 else
1824 tp = gen_lowpart (mode, tp);
1825 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1828 else
1830 gcc_assert (mode == Pmode);
1831 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1834 if (REG_P (dest))
1835 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1836 return;
1839 default:
1840 gcc_unreachable ();
1844 /* Emit a move from SRC to DEST. Assume that the move expanders can
1845 handle all moves if !can_create_pseudo_p (). The distinction is
1846 important because, unlike emit_move_insn, the move expanders know
1847 how to force Pmode objects into the constant pool even when the
1848 constant pool address is not itself legitimate. */
1849 static rtx
1850 aarch64_emit_move (rtx dest, rtx src)
1852 return (can_create_pseudo_p ()
1853 ? emit_move_insn (dest, src)
1854 : emit_move_insn_1 (dest, src));
1857 /* Split a 128-bit move operation into two 64-bit move operations,
1858 taking care to handle partial overlap of register to register
1859 copies. Special cases are needed when moving between GP regs and
1860 FP regs. SRC can be a register, constant or memory; DST a register
1861 or memory. If either operand is memory it must not have any side
1862 effects. */
1863 void
1864 aarch64_split_128bit_move (rtx dst, rtx src)
1866 rtx dst_lo, dst_hi;
1867 rtx src_lo, src_hi;
1869 machine_mode mode = GET_MODE (dst);
1871 gcc_assert (mode == TImode || mode == TFmode);
1872 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1873 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1875 if (REG_P (dst) && REG_P (src))
1877 int src_regno = REGNO (src);
1878 int dst_regno = REGNO (dst);
1880 /* Handle FP <-> GP regs. */
1881 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1883 src_lo = gen_lowpart (word_mode, src);
1884 src_hi = gen_highpart (word_mode, src);
1886 if (mode == TImode)
1888 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1889 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1891 else
1893 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1894 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1896 return;
1898 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1900 dst_lo = gen_lowpart (word_mode, dst);
1901 dst_hi = gen_highpart (word_mode, dst);
1903 if (mode == TImode)
1905 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1906 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1908 else
1910 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1911 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1913 return;
1917 dst_lo = gen_lowpart (word_mode, dst);
1918 dst_hi = gen_highpart (word_mode, dst);
1919 src_lo = gen_lowpart (word_mode, src);
1920 src_hi = gen_highpart_mode (word_mode, mode, src);
1922 /* At most one pairing may overlap. */
1923 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1925 aarch64_emit_move (dst_hi, src_hi);
1926 aarch64_emit_move (dst_lo, src_lo);
1928 else
1930 aarch64_emit_move (dst_lo, src_lo);
1931 aarch64_emit_move (dst_hi, src_hi);
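/* Editorial example (illustrative): splitting a TImode copy whose source
   is the register pair {x0,x1} and whose destination is {x1,x2}: dst_lo
   (x1) overlaps src_hi (x1), so the high halves are copied first
   (x2 <- x1) and then the low halves (x1 <- x0); without the overlap the
   low halves are copied first.  */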
1935 bool
1936 aarch64_split_128bit_move_p (rtx dst, rtx src)
1938 return (! REG_P (src)
1939 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1942 /* Split a complex SIMD combine. */
1944 void
1945 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1947 machine_mode src_mode = GET_MODE (src1);
1948 machine_mode dst_mode = GET_MODE (dst);
1950 gcc_assert (VECTOR_MODE_P (dst_mode));
1951 gcc_assert (register_operand (dst, dst_mode)
1952 && register_operand (src1, src_mode)
1953 && register_operand (src2, src_mode));
1955 rtx (*gen) (rtx, rtx, rtx);
1957 switch (src_mode)
1959 case E_V8QImode:
1960 gen = gen_aarch64_simd_combinev8qi;
1961 break;
1962 case E_V4HImode:
1963 gen = gen_aarch64_simd_combinev4hi;
1964 break;
1965 case E_V2SImode:
1966 gen = gen_aarch64_simd_combinev2si;
1967 break;
1968 case E_V4HFmode:
1969 gen = gen_aarch64_simd_combinev4hf;
1970 break;
1971 case E_V2SFmode:
1972 gen = gen_aarch64_simd_combinev2sf;
1973 break;
1974 case E_DImode:
1975 gen = gen_aarch64_simd_combinedi;
1976 break;
1977 case E_DFmode:
1978 gen = gen_aarch64_simd_combinedf;
1979 break;
1980 default:
1981 gcc_unreachable ();
1984 emit_insn (gen (dst, src1, src2));
1985 return;
1988 /* Split a complex SIMD move. */
1990 void
1991 aarch64_split_simd_move (rtx dst, rtx src)
1993 machine_mode src_mode = GET_MODE (src);
1994 machine_mode dst_mode = GET_MODE (dst);
1996 gcc_assert (VECTOR_MODE_P (dst_mode));
1998 if (REG_P (dst) && REG_P (src))
2000 rtx (*gen) (rtx, rtx);
2002 gcc_assert (VECTOR_MODE_P (src_mode));
2004 switch (src_mode)
2006 case E_V16QImode:
2007 gen = gen_aarch64_split_simd_movv16qi;
2008 break;
2009 case E_V8HImode:
2010 gen = gen_aarch64_split_simd_movv8hi;
2011 break;
2012 case E_V4SImode:
2013 gen = gen_aarch64_split_simd_movv4si;
2014 break;
2015 case E_V2DImode:
2016 gen = gen_aarch64_split_simd_movv2di;
2017 break;
2018 case E_V8HFmode:
2019 gen = gen_aarch64_split_simd_movv8hf;
2020 break;
2021 case E_V4SFmode:
2022 gen = gen_aarch64_split_simd_movv4sf;
2023 break;
2024 case E_V2DFmode:
2025 gen = gen_aarch64_split_simd_movv2df;
2026 break;
2027 default:
2028 gcc_unreachable ();
2031 emit_insn (gen (dst, src));
2032 return;
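/* Return true if constant X, interpreted in mode XMODE, is equal to the
   zero-extension of constant Y from mode YMODE. */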
2036 bool
2037 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2038 machine_mode ymode, rtx y)
2040 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2041 gcc_assert (r != NULL);
2042 return rtx_equal_p (x, r);
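/* Return a register that holds VALUE of mode MODE, using X as the
   register if new pseudo registers cannot be created. */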
2046 static rtx
2047 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2049 if (can_create_pseudo_p ())
2050 return force_reg (mode, value);
2051 else
2053 gcc_assert (x);
2054 aarch64_emit_move (x, value);
2055 return x;
2059 /* Return true if we can move VALUE into a register using a single
2060 CNT[BHWD] instruction. */
2062 static bool
2063 aarch64_sve_cnt_immediate_p (poly_int64 value)
2065 HOST_WIDE_INT factor = value.coeffs[0];
2066 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2067 return (value.coeffs[1] == factor
2068 && IN_RANGE (factor, 2, 16 * 16)
2069 && (factor & 1) == 0
2070 && factor <= 16 * (factor & -factor));
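/* For example, poly_int64 (6, 6) can be loaded with "cntd x0, all, mul #3"
   (three times the number of 64-bit elements), whereas (2, 3) is rejected
   because its two coefficients differ and (34, 34) is rejected because
   the multiplier would have to be 17. */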
2073 /* Likewise for rtx X. */
2075 bool
2076 aarch64_sve_cnt_immediate_p (rtx x)
2078 poly_int64 value;
2079 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2082 /* Return the asm string for an instruction with a CNT-like vector size
2083 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2084 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2085 first part of the operands template (the part that comes before the
2086 vector size itself). FACTOR is the number of quadwords.
2087 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2088 If it is zero, we can use any element size. */
2090 static char *
2091 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2092 unsigned int factor,
2093 unsigned int nelts_per_vq)
2095 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2097 if (nelts_per_vq == 0)
2098 /* There is some overlap in the ranges of the four CNT instructions.
2099 Here we always use the smallest possible element size, so that the
2100 multiplier is 1 wherever possible. */
2101 nelts_per_vq = factor & -factor;
2102 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2103 gcc_assert (IN_RANGE (shift, 1, 4));
2104 char suffix = "dwhb"[shift - 1];
2106 factor >>= shift;
2107 unsigned int written;
2108 if (factor == 1)
2109 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2110 prefix, suffix, operands);
2111 else
2112 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2113 prefix, suffix, operands, factor);
2114 gcc_assert (written < sizeof (buffer));
2115 return buffer;
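/* For example, with PREFIX "cnt" and OPERANDS "%x0", a FACTOR of 16 and
   NELTS_PER_VQ of 16 produce "cntb\t%x0", while a FACTOR of 32 produces
   "cntb\t%x0, all, mul #2". */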
2118 /* Return the asm string for an instruction with a CNT-like vector size
2119 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2120 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2121 first part of the operands template (the part that comes before the
2122 vector size itself). X is the value of the vector size operand,
2123 as a polynomial integer rtx. */
2125 char *
2126 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2127 rtx x)
2129 poly_int64 value = rtx_to_poly_int64 (x);
2130 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2131 return aarch64_output_sve_cnt_immediate (prefix, operands,
2132 value.coeffs[1], 0);
2135 /* Return true if we can add VALUE to a register using a single ADDVL
2136 or ADDPL instruction. */
2138 static bool
2139 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2141 HOST_WIDE_INT factor = value.coeffs[0];
2142 if (factor == 0 || value.coeffs[1] != factor)
2143 return false;
2144 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2145 and a value of 16 is one vector width. */
2146 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2147 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2150 /* Likewise for rtx X. */
2152 bool
2153 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2155 poly_int64 value;
2156 return (poly_int_rtx_p (x, &value)
2157 && aarch64_sve_addvl_addpl_immediate_p (value));
2160 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2161 and storing the result in operand 0. */
2163 char *
2164 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2166 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2167 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2168 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2170 /* Use INC or DEC if possible. */
2171 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2173 if (aarch64_sve_cnt_immediate_p (offset_value))
2174 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2175 offset_value.coeffs[1], 0);
2176 if (aarch64_sve_cnt_immediate_p (-offset_value))
2177 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2178 -offset_value.coeffs[1], 0);
2181 int factor = offset_value.coeffs[1];
2182 if ((factor & 15) == 0)
2183 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2184 else
2185 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2186 return buffer;
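/* For example, when DEST and BASE are distinct, an OFFSET of (16, 16)
   gives "addvl\t%x0, %x1, #1" and (2, 2) gives "addpl\t%x0, %x1, #1";
   when they are the same GP register, (16, 16) is emitted as "incb\t%x0"
   instead. */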
2189 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2190 instruction. If it is, store the number of elements in each vector
2191 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2192 factor in *FACTOR_OUT (if nonnull). */
2194 bool
2195 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2196 unsigned int *nelts_per_vq_out)
2198 rtx elt;
2199 poly_int64 value;
2201 if (!const_vec_duplicate_p (x, &elt)
2202 || !poly_int_rtx_p (elt, &value))
2203 return false;
2205 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2206 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2207 /* There's no vector INCB. */
2208 return false;
2210 HOST_WIDE_INT factor = value.coeffs[0];
2211 if (value.coeffs[1] != factor)
2212 return false;
2214 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2215 if ((factor % nelts_per_vq) != 0
2216 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2217 return false;
2219 if (factor_out)
2220 *factor_out = factor;
2221 if (nelts_per_vq_out)
2222 *nelts_per_vq_out = nelts_per_vq;
2223 return true;
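/* For example, a VNx4SImode vector in which every element is (4, 4) has
   NELTS_PER_VQ 4 and FACTOR 4 (an INCW with an implicit multiplier of 1),
   while (64, 64) in the same mode uses the maximum multiplier of 16. */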
2226 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2227 instruction. */
2229 bool
2230 aarch64_sve_inc_dec_immediate_p (rtx x)
2232 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2235 /* Return the asm template for an SVE vector INC or DEC instruction.
2236 OPERANDS gives the operands before the vector count and X is the
2237 value of the vector count operand itself. */
2239 char *
2240 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2242 int factor;
2243 unsigned int nelts_per_vq;
2244 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2245 gcc_unreachable ();
2246 if (factor < 0)
2247 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2248 nelts_per_vq);
2249 else
2250 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2251 nelts_per_vq);
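/* Emit the instructions needed to move the integer constant IMM into
   register DEST of mode MODE and return the number of instructions used.
   If GENERATE is false, only count the instructions without emitting
   them. */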
2254 static int
2255 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2256 scalar_int_mode mode)
2258 int i;
2259 unsigned HOST_WIDE_INT val, val2, mask;
2260 int one_match, zero_match;
2261 int num_insns;
2263 val = INTVAL (imm);
2265 if (aarch64_move_imm (val, mode))
2267 if (generate)
2268 emit_insn (gen_rtx_SET (dest, imm));
2269 return 1;
2272 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2273 (with XXXX non-zero). In that case check to see if the move can be done in
2274 a smaller mode. */
2275 val2 = val & 0xffffffff;
2276 if (mode == DImode
2277 && aarch64_move_imm (val2, SImode)
2278 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2280 if (generate)
2281 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2283 /* Check if we have to emit a second instruction by checking to see
2284 if any of the upper 32 bits of the original DI mode value is set. */
2285 if (val == val2)
2286 return 1;
2288 i = (val >> 48) ? 48 : 32;
2290 if (generate)
2291 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2292 GEN_INT ((val >> i) & 0xffff)));
2294 return 2;
2297 if ((val >> 32) == 0 || mode == SImode)
2299 if (generate)
2301 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2302 if (mode == SImode)
2303 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2304 GEN_INT ((val >> 16) & 0xffff)));
2305 else
2306 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2307 GEN_INT ((val >> 16) & 0xffff)));
2309 return 2;
2312 /* Remaining cases are all for DImode. */
2314 mask = 0xffff;
2315 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2316 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2317 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2318 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2320 if (zero_match != 2 && one_match != 2)
2322 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2323 For a 64-bit bitmask try whether changing 16 bits to all ones or
2324 zeroes creates a valid bitmask. To check any repeated bitmask,
2325 try using 16 bits from the other 32-bit half of val. */
2327 for (i = 0; i < 64; i += 16, mask <<= 16)
2329 val2 = val & ~mask;
2330 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2331 break;
2332 val2 = val | mask;
2333 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2334 break;
2335 val2 = val2 & ~mask;
2336 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2337 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2338 break;
2340 if (i != 64)
2342 if (generate)
2344 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2345 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2346 GEN_INT ((val >> i) & 0xffff)));
2348 return 2;
2352 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2353 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2354 otherwise skip zero bits. */
2356 num_insns = 1;
2357 mask = 0xffff;
2358 val2 = one_match > zero_match ? ~val : val;
2359 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2361 if (generate)
2362 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2363 ? (val | ~(mask << i))
2364 : (val & (mask << i)))));
2365 for (i += 16; i < 64; i += 16)
2367 if ((val2 & (mask << i)) == 0)
2368 continue;
2369 if (generate)
2370 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2371 GEN_INT ((val >> i) & 0xffff)));
2372 num_insns ++;
2375 return num_insns;
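/* For example, 0x1234000000000000 is a single MOVZ, 0x12345678 takes a
   MOV of 0x5678 followed by a MOVK of 0x1234 at bit position 16, and an
   arbitrary 64-bit constant needs at most a MOV or MOVN plus three
   MOVKs. */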
2378 /* Return the number of temporary registers that aarch64_add_offset_1
2379 would need to add OFFSET to a register. */
2381 static unsigned int
2382 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2384 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2387 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2388 a non-polynomial OFFSET. MODE is the mode of the addition.
2389 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2390 be set and CFA adjustments added to the generated instructions.
2392 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2393 temporary if register allocation is already complete. This temporary
2394 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2395 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2396 the immediate again.
2398 Since this function may be used to adjust the stack pointer, we must
2399 ensure that it cannot cause transient stack deallocation (for example
2400 by first incrementing SP and then decrementing when adjusting by a
2401 large immediate). */
2403 static void
2404 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2405 rtx src, HOST_WIDE_INT offset, rtx temp1,
2406 bool frame_related_p, bool emit_move_imm)
2408 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2409 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2411 HOST_WIDE_INT moffset = abs_hwi (offset);
2412 rtx_insn *insn;
2414 if (!moffset)
2416 if (!rtx_equal_p (dest, src))
2418 insn = emit_insn (gen_rtx_SET (dest, src));
2419 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2421 return;
2424 /* Single instruction adjustment. */
2425 if (aarch64_uimm12_shift (moffset))
2427 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2428 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2429 return;
2432 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2433 and either:
2435 a) the offset cannot be loaded by a 16-bit move or
2436 b) there is no spare register into which we can move it. */
2437 if (moffset < 0x1000000
2438 && ((!temp1 && !can_create_pseudo_p ())
2439 || !aarch64_move_imm (moffset, mode)))
2441 HOST_WIDE_INT low_off = moffset & 0xfff;
2443 low_off = offset < 0 ? -low_off : low_off;
2444 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2445 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2446 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2447 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2448 return;
2451 /* Emit a move immediate if required and an addition/subtraction. */
2452 if (emit_move_imm)
2454 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2455 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2457 insn = emit_insn (offset < 0
2458 ? gen_sub3_insn (dest, src, temp1)
2459 : gen_add3_insn (dest, src, temp1));
2460 if (frame_related_p)
2462 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2463 rtx adj = plus_constant (mode, src, offset);
2464 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
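/* For example, an adjustment of 0x123456 is not a valid 12-bit shifted
   immediate, so it is emitted as ADD #0x456 followed by ADD #0x123000;
   both additions have the same sign, so the stack pointer never moves
   beyond its final value. */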
2468 /* Return the number of temporary registers that aarch64_add_offset
2469 would need to move OFFSET into a register or add OFFSET to a register;
2470 ADD_P is true if we want the latter rather than the former. */
2472 static unsigned int
2473 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2475 /* This follows the same structure as aarch64_add_offset. */
2476 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2477 return 0;
2479 unsigned int count = 0;
2480 HOST_WIDE_INT factor = offset.coeffs[1];
2481 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2482 poly_int64 poly_offset (factor, factor);
2483 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2484 /* Need one register for the ADDVL/ADDPL result. */
2485 count += 1;
2486 else if (factor != 0)
2488 factor = abs (factor);
2489 if (factor > 16 * (factor & -factor))
2490 /* Need one register for the CNT result and one for the multiplication
2491 factor. If necessary, the second temporary can be reused for the
2492 constant part of the offset. */
2493 return 2;
2494 /* Need one register for the CNT result (which might then
2495 be shifted). */
2496 count += 1;
2498 return count + aarch64_add_offset_1_temporaries (constant);
2501 /* If X can be represented as a poly_int64, return the number
2502 of temporaries that are required to add it to a register.
2503 Return -1 otherwise. */
2505 int
2506 aarch64_add_offset_temporaries (rtx x)
2508 poly_int64 offset;
2509 if (!poly_int_rtx_p (x, &offset))
2510 return -1;
2511 return aarch64_offset_temporaries (true, offset);
2514 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2515 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2516 be set and CFA adjustments added to the generated instructions.
2518 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2519 temporary if register allocation is already complete. This temporary
2520 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2521 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2522 false to avoid emitting the immediate again.
2524 TEMP2, if nonnull, is a second temporary register that doesn't
2525 overlap either DEST or REG.
2527 Since this function may be used to adjust the stack pointer, we must
2528 ensure that it cannot cause transient stack deallocation (for example
2529 by first incrementing SP and then decrementing when adjusting by a
2530 large immediate). */
2532 static void
2533 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2534 poly_int64 offset, rtx temp1, rtx temp2,
2535 bool frame_related_p, bool emit_move_imm = true)
2537 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2538 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2539 gcc_assert (temp1 == NULL_RTX
2540 || !frame_related_p
2541 || !reg_overlap_mentioned_p (temp1, dest));
2542 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2544 /* Try using ADDVL or ADDPL to add the whole value. */
2545 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2547 rtx offset_rtx = gen_int_mode (offset, mode);
2548 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2549 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2550 return;
2553 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2554 SVE vector register, over and above the minimum size of 128 bits.
2555 This is equivalent to half the value returned by CNTD with a
2556 vector shape of ALL. */
2557 HOST_WIDE_INT factor = offset.coeffs[1];
2558 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2560 /* Try using ADDVL or ADDPL to add the VG-based part. */
2561 poly_int64 poly_offset (factor, factor);
2562 if (src != const0_rtx
2563 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2565 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2566 if (frame_related_p)
2568 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2569 RTX_FRAME_RELATED_P (insn) = true;
2570 src = dest;
2572 else
2574 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2575 src = aarch64_force_temporary (mode, temp1, addr);
2576 temp1 = temp2;
2577 temp2 = NULL_RTX;
2580 /* Otherwise use a CNT-based sequence. */
2581 else if (factor != 0)
2583 /* Use a subtraction if we have a negative factor. */
2584 rtx_code code = PLUS;
2585 if (factor < 0)
2587 factor = -factor;
2588 code = MINUS;
2591 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2592 into the multiplication. */
2593 rtx val;
2594 int shift = 0;
2595 if (factor & 1)
2596 /* Use a right shift by 1. */
2597 shift = -1;
2598 else
2599 factor /= 2;
2600 HOST_WIDE_INT low_bit = factor & -factor;
2601 if (factor <= 16 * low_bit)
2603 if (factor > 16 * 8)
2605 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2606 the value with the minimum multiplier and shift it into
2607 position. */
2608 int extra_shift = exact_log2 (low_bit);
2609 shift += extra_shift;
2610 factor >>= extra_shift;
2612 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2614 else
2616 /* Use CNTD, then multiply it by FACTOR. */
2617 val = gen_int_mode (poly_int64 (2, 2), mode);
2618 val = aarch64_force_temporary (mode, temp1, val);
2620 /* Go back to using a negative multiplication factor if we have
2621 no register from which to subtract. */
2622 if (code == MINUS && src == const0_rtx)
2624 factor = -factor;
2625 code = PLUS;
2627 rtx coeff1 = gen_int_mode (factor, mode);
2628 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2629 val = gen_rtx_MULT (mode, val, coeff1);
2632 if (shift > 0)
2634 /* Multiply by 1 << SHIFT. */
2635 val = aarch64_force_temporary (mode, temp1, val);
2636 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2638 else if (shift == -1)
2640 /* Divide by 2. */
2641 val = aarch64_force_temporary (mode, temp1, val);
2642 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2645 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2646 if (src != const0_rtx)
2648 val = aarch64_force_temporary (mode, temp1, val);
2649 val = gen_rtx_fmt_ee (code, mode, src, val);
2651 else if (code == MINUS)
2653 val = aarch64_force_temporary (mode, temp1, val);
2654 val = gen_rtx_NEG (mode, val);
2657 if (constant == 0 || frame_related_p)
2659 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2660 if (frame_related_p)
2662 RTX_FRAME_RELATED_P (insn) = true;
2663 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2664 gen_rtx_SET (dest, plus_constant (Pmode, src,
2665 poly_offset)));
2667 src = dest;
2668 if (constant == 0)
2669 return;
2671 else
2673 src = aarch64_force_temporary (mode, temp1, val);
2674 temp1 = temp2;
2675 temp2 = NULL_RTX;
2678 emit_move_imm = true;
2681 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2682 frame_related_p, emit_move_imm);
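/* For example, when SRC is a register, setting DEST to SRC plus one full
   vector plus 32 bytes (an OFFSET of (48, 16)) is emitted as an ADDVL of
   #1 followed by an ADD of #32. */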
2685 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2686 than a poly_int64. */
2688 void
2689 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2690 rtx offset_rtx, rtx temp1, rtx temp2)
2692 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2693 temp1, temp2, false);
2696 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2697 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2698 if TEMP1 already contains abs (DELTA). */
2700 static inline void
2701 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2703 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2704 temp1, temp2, true, emit_move_imm);
2707 /* Subtract DELTA from the stack pointer, marking the instructions
2708 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2709 if nonnull. */
2711 static inline void
2712 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2714 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2715 temp1, temp2, frame_related_p);
2718 /* Set DEST to (vec_series BASE STEP). */
2720 static void
2721 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2723 machine_mode mode = GET_MODE (dest);
2724 scalar_mode inner = GET_MODE_INNER (mode);
2726 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2727 if (!aarch64_sve_index_immediate_p (base))
2728 base = force_reg (inner, base);
2729 if (!aarch64_sve_index_immediate_p (step))
2730 step = force_reg (inner, step);
2732 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
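/* For example, a series with base 0 and step 1 in VNx4SImode becomes
   "index z0.s, #0, #1"; bases or steps outside [-16, 15] are first
   forced into scalar registers and the register form of INDEX is used
   instead. */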
2735 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2736 integer of mode SRC_MODE. Return true on success. */
2738 static bool
2739 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2740 rtx src)
2742 /* If the constant is smaller than 128 bits, we can do the move
2743 using a vector of SRC_MODEs. */
2744 if (src_mode != TImode)
2746 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2747 GET_MODE_SIZE (src_mode));
2748 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2749 emit_move_insn (gen_lowpart (dup_mode, dest),
2750 gen_const_vec_duplicate (dup_mode, src));
2751 return true;
2754 /* The bytes are loaded in little-endian order, so do a byteswap on
2755 big-endian targets. */
2756 if (BYTES_BIG_ENDIAN)
2758 src = simplify_unary_operation (BSWAP, src_mode, src, src_mode);
2759 if (!src)
2760 return false;
2763 /* Use LD1RQ to load the 128 bits from memory. */
2764 src = force_const_mem (src_mode, src);
2765 if (!src)
2766 return false;
2768 /* Make sure that the address is legitimate. */
2769 if (!aarch64_sve_ld1r_operand_p (src))
2771 rtx addr = force_reg (Pmode, XEXP (src, 0));
2772 src = replace_equiv_address (src, addr);
2775 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
2776 emit_insn (gen_sve_ld1rq (gen_lowpart (VNx16QImode, dest), ptrue, src));
2777 return true;
2780 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2781 isn't a simple duplicate or series. */
2783 static void
2784 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2786 machine_mode mode = GET_MODE (src);
2787 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2788 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2789 gcc_assert (npatterns > 1);
2791 if (nelts_per_pattern == 1)
2793 /* The constant is a repeating sequence of at least two elements,
2794 where the repeating elements occupy no more than 128 bits.
2795 Get an integer representation of the replicated value. */
2796 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2797 gcc_assert (int_bits <= 128);
2799 scalar_int_mode int_mode = int_mode_for_size (int_bits, 0).require ();
2800 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2801 if (int_value
2802 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2803 return;
2806 /* Expand each pattern individually. */
2807 rtx_vector_builder builder;
2808 auto_vec<rtx, 16> vectors (npatterns);
2809 for (unsigned int i = 0; i < npatterns; ++i)
2811 builder.new_vector (mode, 1, nelts_per_pattern);
2812 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2813 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2814 vectors.quick_push (force_reg (mode, builder.build ()));
2817 /* Use permutes to interleave the separate vectors. */
2818 while (npatterns > 1)
2820 npatterns /= 2;
2821 for (unsigned int i = 0; i < npatterns; ++i)
2823 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2824 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2825 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2826 vectors[i] = tmp;
2829 gcc_assert (vectors[0] == dest);
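/* For example, with four patterns P0..P3 the loop above first forms
   ZIP1 (P0, P2) and ZIP1 (P1, P3) and then zips those two results
   together, so that the final vector interleaves all four patterns. */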
2832 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2833 is a pattern that can be used to set DEST to a replicated scalar
2834 element. */
2836 void
2837 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2838 rtx (*gen_vec_duplicate) (rtx, rtx))
2840 machine_mode mode = GET_MODE (dest);
2842 /* Check on what type of symbol it is. */
2843 scalar_int_mode int_mode;
2844 if ((GET_CODE (imm) == SYMBOL_REF
2845 || GET_CODE (imm) == LABEL_REF
2846 || GET_CODE (imm) == CONST
2847 || GET_CODE (imm) == CONST_POLY_INT)
2848 && is_a <scalar_int_mode> (mode, &int_mode))
2850 rtx mem;
2851 poly_int64 offset;
2852 HOST_WIDE_INT const_offset;
2853 enum aarch64_symbol_type sty;
2855 /* If we have (const (plus symbol offset)), separate out the offset
2856 before we start classifying the symbol. */
2857 rtx base = strip_offset (imm, &offset);
2859 /* We must always add an offset involving VL separately, rather than
2860 folding it into the relocation. */
2861 if (!offset.is_constant (&const_offset))
2863 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2864 emit_insn (gen_rtx_SET (dest, imm));
2865 else
2867 /* Do arithmetic on 32-bit values if the result is smaller
2868 than that. */
2869 if (partial_subreg_p (int_mode, SImode))
2871 /* It is invalid to do symbol calculations in modes
2872 narrower than SImode. */
2873 gcc_assert (base == const0_rtx);
2874 dest = gen_lowpart (SImode, dest);
2875 int_mode = SImode;
2877 if (base != const0_rtx)
2879 base = aarch64_force_temporary (int_mode, dest, base);
2880 aarch64_add_offset (int_mode, dest, base, offset,
2881 NULL_RTX, NULL_RTX, false);
2883 else
2884 aarch64_add_offset (int_mode, dest, base, offset,
2885 dest, NULL_RTX, false);
2887 return;
2890 sty = aarch64_classify_symbol (base, const_offset);
2891 switch (sty)
2893 case SYMBOL_FORCE_TO_MEM:
2894 if (const_offset != 0
2895 && targetm.cannot_force_const_mem (int_mode, imm))
2897 gcc_assert (can_create_pseudo_p ());
2898 base = aarch64_force_temporary (int_mode, dest, base);
2899 aarch64_add_offset (int_mode, dest, base, const_offset,
2900 NULL_RTX, NULL_RTX, false);
2901 return;
2904 mem = force_const_mem (ptr_mode, imm);
2905 gcc_assert (mem);
2907 /* If we aren't generating PC relative literals, then
2908 we need to expand the literal pool access carefully.
2909 This is something that needs to be done in a number
2910 of places, so could well live as a separate function. */
2911 if (!aarch64_pcrelative_literal_loads)
2913 gcc_assert (can_create_pseudo_p ());
2914 base = gen_reg_rtx (ptr_mode);
2915 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2916 if (ptr_mode != Pmode)
2917 base = convert_memory_address (Pmode, base);
2918 mem = gen_rtx_MEM (ptr_mode, base);
2921 if (int_mode != ptr_mode)
2922 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2924 emit_insn (gen_rtx_SET (dest, mem));
2926 return;
2928 case SYMBOL_SMALL_TLSGD:
2929 case SYMBOL_SMALL_TLSDESC:
2930 case SYMBOL_SMALL_TLSIE:
2931 case SYMBOL_SMALL_GOT_28K:
2932 case SYMBOL_SMALL_GOT_4G:
2933 case SYMBOL_TINY_GOT:
2934 case SYMBOL_TINY_TLSIE:
2935 if (const_offset != 0)
2937 gcc_assert (can_create_pseudo_p ());
2938 base = aarch64_force_temporary (int_mode, dest, base);
2939 aarch64_add_offset (int_mode, dest, base, const_offset,
2940 NULL_RTX, NULL_RTX, false);
2941 return;
2943 /* FALLTHRU */
2945 case SYMBOL_SMALL_ABSOLUTE:
2946 case SYMBOL_TINY_ABSOLUTE:
2947 case SYMBOL_TLSLE12:
2948 case SYMBOL_TLSLE24:
2949 case SYMBOL_TLSLE32:
2950 case SYMBOL_TLSLE48:
2951 aarch64_load_symref_appropriately (dest, imm, sty);
2952 return;
2954 default:
2955 gcc_unreachable ();
2959 if (!CONST_INT_P (imm))
2961 rtx base, step, value;
2962 if (GET_CODE (imm) == HIGH
2963 || aarch64_simd_valid_immediate (imm, NULL))
2964 emit_insn (gen_rtx_SET (dest, imm));
2965 else if (const_vec_series_p (imm, &base, &step))
2966 aarch64_expand_vec_series (dest, base, step);
2967 else if (const_vec_duplicate_p (imm, &value))
2969 /* If the constant is out of range of an SVE vector move,
2970 load it from memory if we can, otherwise move it into
2971 a register and use a DUP. */
2972 scalar_mode inner_mode = GET_MODE_INNER (mode);
2973 rtx op = force_const_mem (inner_mode, value);
2974 if (!op)
2975 op = force_reg (inner_mode, value);
2976 else if (!aarch64_sve_ld1r_operand_p (op))
2978 rtx addr = force_reg (Pmode, XEXP (op, 0));
2979 op = replace_equiv_address (op, addr);
2981 emit_insn (gen_vec_duplicate (dest, op));
2983 else if (GET_CODE (imm) == CONST_VECTOR
2984 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
2985 aarch64_expand_sve_const_vector (dest, imm);
2986 else
2988 rtx mem = force_const_mem (mode, imm);
2989 gcc_assert (mem);
2990 emit_move_insn (dest, mem);
2993 return;
2996 aarch64_internal_mov_immediate (dest, imm, true,
2997 as_a <scalar_int_mode> (mode));
3000 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3001 that is known to contain PTRUE. */
3003 void
3004 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3006 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3007 gen_rtvec (2, pred, src),
3008 UNSPEC_MERGE_PTRUE)));
3011 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3012 operand is in memory. In this case we need to use the predicated LD1
3013 and ST1 instead of LDR and STR, both for correctness on big-endian
3014 targets and because LD1 and ST1 support a wider range of addressing modes.
3015 PRED_MODE is the mode of the predicate.
3017 See the comment at the head of aarch64-sve.md for details about the
3018 big-endian handling. */
3020 void
3021 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3023 machine_mode mode = GET_MODE (dest);
3024 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3025 if (!register_operand (src, mode)
3026 && !register_operand (dest, mode))
3028 rtx tmp = gen_reg_rtx (mode);
3029 if (MEM_P (src))
3030 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3031 else
3032 emit_move_insn (tmp, src);
3033 src = tmp;
3035 aarch64_emit_sve_pred_move (dest, ptrue, src);
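/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL. */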
3038 static bool
3039 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3040 tree exp ATTRIBUTE_UNUSED)
3042 /* Currently, always true. */
3043 return true;
3046 /* Implement TARGET_PASS_BY_REFERENCE. */
3048 static bool
3049 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3050 machine_mode mode,
3051 const_tree type,
3052 bool named ATTRIBUTE_UNUSED)
3054 HOST_WIDE_INT size;
3055 machine_mode dummymode;
3056 int nregs;
3058 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3059 if (mode == BLKmode && type)
3060 size = int_size_in_bytes (type);
3061 else
3062 /* No frontends can create types with variable-sized modes, so we
3063 shouldn't be asked to pass or return them. */
3064 size = GET_MODE_SIZE (mode).to_constant ();
3066 /* Aggregates are passed by reference based on their size. */
3067 if (type && AGGREGATE_TYPE_P (type))
3069 size = int_size_in_bytes (type);
3072 /* Variable sized arguments are always passed by reference. */
3073 if (size < 0)
3074 return true;
3076 /* Can this be a candidate to be passed in fp/simd register(s)? */
3077 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3078 &dummymode, &nregs,
3079 NULL))
3080 return false;
3082 /* Arguments which are variable sized or larger than 2 registers are
3083 passed by reference unless they are a homogeneous floating-point
3084 aggregate. */
3085 return size > 2 * UNITS_PER_WORD;
3088 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3089 static bool
3090 aarch64_return_in_msb (const_tree valtype)
3092 machine_mode dummy_mode;
3093 int dummy_int;
3095 /* Never happens in little-endian mode. */
3096 if (!BYTES_BIG_ENDIAN)
3097 return false;
3099 /* Only composite types no larger than 16 bytes can potentially
3100 be returned in registers. */
3101 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3102 || int_size_in_bytes (valtype) <= 0
3103 || int_size_in_bytes (valtype) > 16)
3104 return false;
3106 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3107 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3108 is always passed/returned in the least significant bits of fp/simd
3109 register(s). */
3110 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3111 &dummy_mode, &dummy_int, NULL))
3112 return false;
3114 return true;
3117 /* Implement TARGET_FUNCTION_VALUE.
3118 Define how to find the value returned by a function. */
3120 static rtx
3121 aarch64_function_value (const_tree type, const_tree func,
3122 bool outgoing ATTRIBUTE_UNUSED)
3124 machine_mode mode;
3125 int unsignedp;
3126 int count;
3127 machine_mode ag_mode;
3129 mode = TYPE_MODE (type);
3130 if (INTEGRAL_TYPE_P (type))
3131 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3133 if (aarch64_return_in_msb (type))
3135 HOST_WIDE_INT size = int_size_in_bytes (type);
3137 if (size % UNITS_PER_WORD != 0)
3139 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3140 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3144 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3145 &ag_mode, &count, NULL))
3147 if (!aarch64_composite_type_p (type, mode))
3149 gcc_assert (count == 1 && mode == ag_mode);
3150 return gen_rtx_REG (mode, V0_REGNUM);
3152 else
3154 int i;
3155 rtx par;
3157 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3158 for (i = 0; i < count; i++)
3160 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3161 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3162 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3163 XVECEXP (par, 0, i) = tmp;
3165 return par;
3168 else
3169 return gen_rtx_REG (mode, R0_REGNUM);
3172 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3173 Return true if REGNO is the number of a hard register in which the value
3174 of a called function may come back. */
3176 static bool
3177 aarch64_function_value_regno_p (const unsigned int regno)
3179 /* A maximum of 16 bytes can be returned in the general registers. Examples
3180 of 16-byte return values are: 128-bit integers and 16-byte small
3181 structures (excluding homogeneous floating-point aggregates). */
3182 if (regno == R0_REGNUM || regno == R1_REGNUM)
3183 return true;
3185 /* Up to four fp/simd registers can return a function value, e.g. a
3186 homogeneous floating-point aggregate having four members. */
3187 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3188 return TARGET_FLOAT;
3190 return false;
3193 /* Implement TARGET_RETURN_IN_MEMORY.
3195 If the type T of the result of a function is such that
3196 void func (T arg)
3197 would require that arg be passed as a value in a register (or set of
3198 registers) according to the parameter passing rules, then the result
3199 is returned in the same registers as would be used for such an
3200 argument. */
3202 static bool
3203 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3205 HOST_WIDE_INT size;
3206 machine_mode ag_mode;
3207 int count;
3209 if (!AGGREGATE_TYPE_P (type)
3210 && TREE_CODE (type) != COMPLEX_TYPE
3211 && TREE_CODE (type) != VECTOR_TYPE)
3212 /* Simple scalar types are always returned in registers. */
3213 return false;
3215 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3216 type,
3217 &ag_mode,
3218 &count,
3219 NULL))
3220 return false;
3222 /* Types larger than 2 registers are returned in memory. */
3223 size = int_size_in_bytes (type);
3224 return (size < 0 || size > 2 * UNITS_PER_WORD);
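/* Return true if an argument with mode MODE and type TYPE is a candidate
   for passing in SIMD/FP registers, storing the number of registers
   needed in *NREGS and recording the element mode in the cumulative
   argument state. */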
3227 static bool
3228 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3229 const_tree type, int *nregs)
3231 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3232 return aarch64_vfp_is_call_or_return_candidate (mode,
3233 type,
3234 &pcum->aapcs_vfp_rmode,
3235 nregs,
3236 NULL);
3239 /* Given MODE and TYPE of a function argument, return the alignment in
3240 bits. The idea is to suppress any stronger alignment requested by
3241 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3242 This is a helper function for local use only. */
3244 static unsigned int
3245 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3247 if (!type)
3248 return GET_MODE_ALIGNMENT (mode);
3250 if (integer_zerop (TYPE_SIZE (type)))
3251 return 0;
3253 gcc_assert (TYPE_MODE (type) == mode);
3255 if (!AGGREGATE_TYPE_P (type))
3256 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3258 if (TREE_CODE (type) == ARRAY_TYPE)
3259 return TYPE_ALIGN (TREE_TYPE (type));
3261 unsigned int alignment = 0;
3262 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3263 if (TREE_CODE (field) == FIELD_DECL)
3264 alignment = std::max (alignment, DECL_ALIGN (field));
3266 return alignment;
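/* For example, a structure with two "int" fields is given 32-bit
   alignment here even if the structure type itself carries a larger
   "aligned" attribute, since only the alignment of the fields is
   considered. */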
3269 /* Layout a function argument according to the AAPCS64 rules. The rule
3270 numbers refer to the rule numbers in the AAPCS64. */
3272 static void
3273 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3274 const_tree type,
3275 bool named ATTRIBUTE_UNUSED)
3277 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3278 int ncrn, nvrn, nregs;
3279 bool allocate_ncrn, allocate_nvrn;
3280 HOST_WIDE_INT size;
3282 /* We need to do this once per argument. */
3283 if (pcum->aapcs_arg_processed)
3284 return;
3286 pcum->aapcs_arg_processed = true;
3288 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
3289 if (type)
3290 size = int_size_in_bytes (type);
3291 else
3292 /* No frontends can create types with variable-sized modes, so we
3293 shouldn't be asked to pass or return them. */
3294 size = GET_MODE_SIZE (mode).to_constant ();
3295 size = ROUND_UP (size, UNITS_PER_WORD);
3297 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3298 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3299 mode,
3300 type,
3301 &nregs);
3303 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3304 The following code thus handles passing by SIMD/FP registers first. */
3306 nvrn = pcum->aapcs_nvrn;
3308 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3309 and homogeneous short-vector aggregates (HVA). */
3310 if (allocate_nvrn)
3312 if (!TARGET_FLOAT)
3313 aarch64_err_no_fpadvsimd (mode, "argument");
3315 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3317 pcum->aapcs_nextnvrn = nvrn + nregs;
3318 if (!aarch64_composite_type_p (type, mode))
3320 gcc_assert (nregs == 1);
3321 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3323 else
3325 rtx par;
3326 int i;
3327 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3328 for (i = 0; i < nregs; i++)
3330 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3331 V0_REGNUM + nvrn + i);
3332 rtx offset = gen_int_mode
3333 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3334 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3335 XVECEXP (par, 0, i) = tmp;
3337 pcum->aapcs_reg = par;
3339 return;
3341 else
3343 /* C.3 NSRN is set to 8. */
3344 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3345 goto on_stack;
3349 ncrn = pcum->aapcs_ncrn;
3350 nregs = size / UNITS_PER_WORD;
3352 /* C6 - C9, though the sign and zero extension semantics are
3353 handled elsewhere. This is the case where the argument fits
3354 entirely in general registers. */
3355 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3358 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3360 /* C.8 if the argument has an alignment of 16 then the NGRN is
3361 rounded up to the next even number. */
3362 if (nregs == 2
3363 && ncrn % 2
3364 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3365 comparison is there because for > 16 * BITS_PER_UNIT
3366 alignment nregs should be > 2 and therefore it should be
3367 passed by reference rather than value. */
3368 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3370 ++ncrn;
3371 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3374 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3375 A reg is still generated for it, but the caller should be smart
3376 enough not to use it. */
3377 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3378 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3379 else
3381 rtx par;
3382 int i;
3384 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3385 for (i = 0; i < nregs; i++)
3387 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3388 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3389 GEN_INT (i * UNITS_PER_WORD));
3390 XVECEXP (par, 0, i) = tmp;
3392 pcum->aapcs_reg = par;
3395 pcum->aapcs_nextncrn = ncrn + nregs;
3396 return;
3399 /* C.11 */
3400 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3402 /* The argument is passed on the stack; record the needed number of words for
3403 this argument and align the total size if necessary. */
3404 on_stack:
3405 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3407 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3408 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3409 16 / UNITS_PER_WORD);
3410 return;
3413 /* Implement TARGET_FUNCTION_ARG. */
3415 static rtx
3416 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3417 const_tree type, bool named)
3419 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3420 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3422 if (mode == VOIDmode)
3423 return NULL_RTX;
3425 aarch64_layout_arg (pcum_v, mode, type, named);
3426 return pcum->aapcs_reg;
3429 void
3430 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3431 const_tree fntype ATTRIBUTE_UNUSED,
3432 rtx libname ATTRIBUTE_UNUSED,
3433 const_tree fndecl ATTRIBUTE_UNUSED,
3434 unsigned n_named ATTRIBUTE_UNUSED)
3436 pcum->aapcs_ncrn = 0;
3437 pcum->aapcs_nvrn = 0;
3438 pcum->aapcs_nextncrn = 0;
3439 pcum->aapcs_nextnvrn = 0;
3440 pcum->pcs_variant = ARM_PCS_AAPCS64;
3441 pcum->aapcs_reg = NULL_RTX;
3442 pcum->aapcs_arg_processed = false;
3443 pcum->aapcs_stack_words = 0;
3444 pcum->aapcs_stack_size = 0;
3446 if (!TARGET_FLOAT
3447 && fndecl && TREE_PUBLIC (fndecl)
3448 && fntype && fntype != error_mark_node)
3450 const_tree type = TREE_TYPE (fntype);
3451 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3452 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3453 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3454 &mode, &nregs, NULL))
3455 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3457 return;
3460 static void
3461 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3462 machine_mode mode,
3463 const_tree type,
3464 bool named)
3466 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3467 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3469 aarch64_layout_arg (pcum_v, mode, type, named);
3470 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3471 != (pcum->aapcs_stack_words != 0));
3472 pcum->aapcs_arg_processed = false;
3473 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3474 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3475 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3476 pcum->aapcs_stack_words = 0;
3477 pcum->aapcs_reg = NULL_RTX;
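/* Return true if REGNO is a register used for passing arguments, i.e.
   one of R0-R7 or V0-V7. */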
3481 bool
3482 aarch64_function_arg_regno_p (unsigned regno)
3484 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3485 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3488 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3489 PARM_BOUNDARY bits of alignment, but will be given anything up
3490 to STACK_BOUNDARY bits if the type requires it. This makes sure
3491 that both before and after the layout of each argument, the Next
3492 Stacked Argument Address (NSAA) will have a minimum alignment of
3493 8 bytes. */
3495 static unsigned int
3496 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3498 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3499 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
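/* On AArch64 PARM_BOUNDARY is 64 and STACK_BOUNDARY is 128, so every
   stacked argument receives at least 8-byte and at most 16-byte
   alignment. */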
3502 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3504 static fixed_size_mode
3505 aarch64_get_reg_raw_mode (int regno)
3507 if (TARGET_SVE && FP_REGNUM_P (regno))
3508 /* Don't use the SVE part of the register for __builtin_apply and
3509 __builtin_return. The SVE registers aren't used by the normal PCS,
3510 so using them there would be a waste of time. The PCS extensions
3511 for SVE types are fundamentally incompatible with the
3512 __builtin_return/__builtin_apply interface. */
3513 return as_a <fixed_size_mode> (V16QImode);
3514 return default_get_reg_raw_mode (regno);
3517 /* Implement TARGET_FUNCTION_ARG_PADDING.
3519 Small aggregate types are placed in the lowest memory address.
3521 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3523 static pad_direction
3524 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3526 /* On little-endian targets, the least significant byte of every stack
3527 argument is passed at the lowest byte address of the stack slot. */
3528 if (!BYTES_BIG_ENDIAN)
3529 return PAD_UPWARD;
3531 /* Otherwise, integral, floating-point and pointer types are padded downward:
3532 the least significant byte of a stack argument is passed at the highest
3533 byte address of the stack slot. */
3534 if (type
3535 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3536 || POINTER_TYPE_P (type))
3537 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3538 return PAD_DOWNWARD;
3540 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3541 return PAD_UPWARD;
3544 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3546 It specifies padding for the last (possibly the only)
3547 element of a block move between registers and memory. Assuming
3548 the block is in memory, padding upward means that the last
3549 element is padded after its most significant byte, while with
3550 downward padding the last element is padded on its least
3551 significant byte side.
3553 Small aggregates and small complex types are always padded
3554 upwards.
3556 We don't need to worry about homogeneous floating-point or
3557 short-vector aggregates; their move is not affected by the
3558 padding direction determined here. Regardless of endianness,
3559 each element of such an aggregate is put in the least
3560 significant bits of a fp/simd register.
3562 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3563 register has useful data, and return the opposite if the most
3564 significant byte does. */
3566 bool
3567 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3568 bool first ATTRIBUTE_UNUSED)
3571 /* Small composite types are always padded upward. */
3572 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3574 HOST_WIDE_INT size;
3575 if (type)
3576 size = int_size_in_bytes (type);
3577 else
3578 /* No frontends can create types with variable-sized modes, so we
3579 shouldn't be asked to pass or return them. */
3580 size = GET_MODE_SIZE (mode).to_constant ();
3581 if (size < 2 * UNITS_PER_WORD)
3582 return true;
3585 /* Otherwise, use the default padding. */
3586 return !BYTES_BIG_ENDIAN;
3589 static scalar_int_mode
3590 aarch64_libgcc_cmp_return_mode (void)
3592 return SImode;
3595 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3597 /* We use the 12-bit shifted immediate arithmetic instructions so values
3598 must be multiple of (1 << 12), i.e. 4096. */
3599 #define ARITH_FACTOR 4096
3601 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3602 #error Cannot use simple address calculation for stack probing
3603 #endif
3605 /* The pair of scratch registers used for stack probing. */
3606 #define PROBE_STACK_FIRST_REG 9
3607 #define PROBE_STACK_SECOND_REG 10
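/* That is, the caller-saved temporary registers x9 and x10. */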
3609 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3610 inclusive. These are offsets from the current stack pointer. */
3612 static void
3613 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3615 HOST_WIDE_INT size;
3616 if (!poly_size.is_constant (&size))
3618 sorry ("stack probes for SVE frames");
3619 return;
3622 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3624 /* See the same assertion on PROBE_INTERVAL above. */
3625 gcc_assert ((first % ARITH_FACTOR) == 0);
3627 /* See if we have a constant small number of probes to generate. If so,
3628 that's the easy case. */
3629 if (size <= PROBE_INTERVAL)
3631 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3633 emit_set_insn (reg1,
3634 plus_constant (Pmode,
3635 stack_pointer_rtx, -(first + base)));
3636 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3639 /* The run-time loop is made up of 8 insns in the generic case while the
3640 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3641 else if (size <= 4 * PROBE_INTERVAL)
3643 HOST_WIDE_INT i, rem;
3645 emit_set_insn (reg1,
3646 plus_constant (Pmode,
3647 stack_pointer_rtx,
3648 -(first + PROBE_INTERVAL)));
3649 emit_stack_probe (reg1);
3651 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3652 it exceeds SIZE. If only two probes are needed, this will not
3653 generate any code. Then probe at FIRST + SIZE. */
3654 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3656 emit_set_insn (reg1,
3657 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3658 emit_stack_probe (reg1);
3661 rem = size - (i - PROBE_INTERVAL);
3662 if (rem > 256)
3664 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3666 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3667 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3669 else
3670 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3673 /* Otherwise, do the same as above, but in a loop. Note that we must be
3674 extra careful with variables wrapping around because we might be at
3675 the very top (or the very bottom) of the address space and we have
3676 to be able to handle this case properly; in particular, we use an
3677 equality test for the loop condition. */
3678 else
3680 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3682 /* Step 1: round SIZE to the previous multiple of the interval. */
3684 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3687 /* Step 2: compute initial and final value of the loop counter. */
3689 /* TEST_ADDR = SP + FIRST. */
3690 emit_set_insn (reg1,
3691 plus_constant (Pmode, stack_pointer_rtx, -first));
3693 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3694 HOST_WIDE_INT adjustment = - (first + rounded_size);
3695 if (! aarch64_uimm12_shift (adjustment))
3697 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3698 true, Pmode);
3699 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3701 else
3703 emit_set_insn (reg2,
3704 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3707 /* Step 3: the loop
3711 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3712 probe at TEST_ADDR
3714 while (TEST_ADDR != LAST_ADDR)
3716 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3717 until it is equal to ROUNDED_SIZE. */
3719 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3722 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3723 that SIZE is equal to ROUNDED_SIZE. */
3725 if (size != rounded_size)
3727 HOST_WIDE_INT rem = size - rounded_size;
3729 if (rem > 256)
3731 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3733 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3734 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3736 else
3737 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3741 /* Make sure nothing is scheduled before we are done. */
3742 emit_insn (gen_blockage ());
3745 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3746 absolute addresses. */
3748 const char *
3749 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3751 static int labelno = 0;
3752 char loop_lab[32];
3753 rtx xops[2];
3755 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3757 /* Loop. */
3758 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3760 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3761 xops[0] = reg1;
3762 xops[1] = GEN_INT (PROBE_INTERVAL);
3763 output_asm_insn ("sub\t%0, %0, %1", xops);
3765 /* Probe at TEST_ADDR. */
3766 output_asm_insn ("str\txzr, [%0]", xops);
3768 /* Test if TEST_ADDR == LAST_ADDR. */
3769 xops[1] = reg2;
3770 output_asm_insn ("cmp\t%0, %1", xops);
3772 /* Branch. */
3773 fputs ("\tb.ne\t", asm_out_file);
3774 assemble_name_raw (asm_out_file, loop_lab);
3775 fputc ('\n', asm_out_file);
3777 return "";
3780 /* Mark the registers that need to be saved by the callee and calculate
3781 the size of the callee-saved registers area and frame record (both FP
3782 and LR may be omitted). */
3783 static void
3784 aarch64_layout_frame (void)
3786 HOST_WIDE_INT offset = 0;
3787 int regno, last_fp_reg = INVALID_REGNUM;
3789 if (reload_completed && cfun->machine->frame.laid_out)
3790 return;
3792 /* Force a frame chain for EH returns so the return address is at FP+8. */
3793 cfun->machine->frame.emit_frame_chain
3794 = frame_pointer_needed || crtl->calls_eh_return;
3796 /* Emit a frame chain if the frame pointer is enabled.
3797 If -momit-leaf-frame-pointer is used, do not use a frame chain
3798 in leaf functions which do not use LR. */
3799 if (flag_omit_frame_pointer == 2
3800 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3801 && !df_regs_ever_live_p (LR_REGNUM)))
3802 cfun->machine->frame.emit_frame_chain = true;
3804 #define SLOT_NOT_REQUIRED (-2)
3805 #define SLOT_REQUIRED (-1)
3807 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3808 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3810 /* First mark all the registers that really need to be saved... */
3811 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3812 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3814 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3815 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3817 /* ... that includes the eh data registers (if needed)... */
3818 if (crtl->calls_eh_return)
3819 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3820 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3821 = SLOT_REQUIRED;
3823 /* ... and any callee saved register that dataflow says is live. */
3824 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3825 if (df_regs_ever_live_p (regno)
3826 && (regno == R30_REGNUM
3827 || !call_used_regs[regno]))
3828 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3830 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3831 if (df_regs_ever_live_p (regno)
3832 && !call_used_regs[regno])
3834 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3835 last_fp_reg = regno;
3838 if (cfun->machine->frame.emit_frame_chain)
3840 /* FP and LR are placed in the linkage record. */
3841 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
3842 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
3843 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
3844 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
3845 offset = 2 * UNITS_PER_WORD;
3848 /* Now assign stack slots for them. */
3849 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3850 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
3852 cfun->machine->frame.reg_offset[regno] = offset;
3853 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
3854 cfun->machine->frame.wb_candidate1 = regno;
3855 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
3856 cfun->machine->frame.wb_candidate2 = regno;
3857 offset += UNITS_PER_WORD;
3860 HOST_WIDE_INT max_int_offset = offset;
3861 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3862 bool has_align_gap = offset != max_int_offset;
3864 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3865 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
3867 /* If there is an alignment gap between integer and fp callee-saves,
3868 allocate the last fp register to it if possible. */
3869 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
3871 cfun->machine->frame.reg_offset[regno] = max_int_offset;
3872 break;
3875 cfun->machine->frame.reg_offset[regno] = offset;
3876 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
3877 cfun->machine->frame.wb_candidate1 = regno;
3878 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
3879 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
3880 cfun->machine->frame.wb_candidate2 = regno;
3881 offset += UNITS_PER_WORD;
3884 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3886 cfun->machine->frame.saved_regs_size = offset;
3888 HOST_WIDE_INT varargs_and_saved_regs_size
3889 = offset + cfun->machine->frame.saved_varargs_size;
3891 cfun->machine->frame.hard_fp_offset
3892 = aligned_upper_bound (varargs_and_saved_regs_size
3893 + get_frame_size (),
3894 STACK_BOUNDARY / BITS_PER_UNIT);
3896 /* Both these values are already aligned. */
3897 gcc_assert (multiple_p (crtl->outgoing_args_size,
3898 STACK_BOUNDARY / BITS_PER_UNIT));
3899 cfun->machine->frame.frame_size
3900 = (cfun->machine->frame.hard_fp_offset
3901 + crtl->outgoing_args_size);
3903 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
3905 cfun->machine->frame.initial_adjust = 0;
3906 cfun->machine->frame.final_adjust = 0;
3907 cfun->machine->frame.callee_adjust = 0;
3908 cfun->machine->frame.callee_offset = 0;
3910 HOST_WIDE_INT max_push_offset = 0;
3911 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
3912 max_push_offset = 512;
3913 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3914 max_push_offset = 256;
3916 HOST_WIDE_INT const_size, const_fp_offset;
3917 if (cfun->machine->frame.frame_size.is_constant (&const_size)
3918 && const_size < max_push_offset
3919 && known_eq (crtl->outgoing_args_size, 0))
3921 /* Simple, small frame with no outgoing arguments:
3922 stp reg1, reg2, [sp, -frame_size]!
3923 stp reg3, reg4, [sp, 16] */
3924 cfun->machine->frame.callee_adjust = const_size;
3926 else if (known_lt (crtl->outgoing_args_size
3927 + cfun->machine->frame.saved_regs_size, 512)
3928 && !(cfun->calls_alloca
3929 && known_lt (cfun->machine->frame.hard_fp_offset,
3930 max_push_offset)))
3932 /* Frame with small outgoing arguments:
3933 sub sp, sp, frame_size
3934 stp reg1, reg2, [sp, outgoing_args_size]
3935 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3936 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3937 cfun->machine->frame.callee_offset
3938 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3940 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
3941 && const_fp_offset < max_push_offset)
3943 /* Frame with large outgoing arguments but a small local area:
3944 stp reg1, reg2, [sp, -hard_fp_offset]!
3945 stp reg3, reg4, [sp, 16]
3946 sub sp, sp, outgoing_args_size */
3947 cfun->machine->frame.callee_adjust = const_fp_offset;
3948 cfun->machine->frame.final_adjust
3949 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3951 else
3953 /* Frame with large local area and outgoing arguments using frame pointer:
3954 sub sp, sp, hard_fp_offset
3955 stp x29, x30, [sp, 0]
3956 add x29, sp, 0
3957 stp reg3, reg4, [sp, 16]
3958 sub sp, sp, outgoing_args_size */
3959 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3960 cfun->machine->frame.final_adjust
3961 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3964 cfun->machine->frame.laid_out = true;
3967 /* Return true if the register REGNO is saved on entry to
3968 the current function. */
3970 static bool
3971 aarch64_register_saved_on_entry (int regno)
3973 return cfun->machine->frame.reg_offset[regno] >= 0;
3976 /* Return the next register, from REGNO up to LIMIT, that the callee
3977    needs to save.  */
3979 static unsigned
3980 aarch64_next_callee_save (unsigned regno, unsigned limit)
3982 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3983 regno ++;
3984 return regno;
3987 /* Push the register number REGNO of mode MODE to the stack with write-back
3988 adjusting the stack by ADJUSTMENT. */
3990 static void
3991 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3992 HOST_WIDE_INT adjustment)
3994 rtx base_rtx = stack_pointer_rtx;
3995 rtx insn, reg, mem;
3997 reg = gen_rtx_REG (mode, regno);
3998 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3999 plus_constant (Pmode, base_rtx, -adjustment));
4000 mem = gen_frame_mem (mode, mem);
4002 insn = emit_move_insn (mem, reg);
4003 RTX_FRAME_RELATED_P (insn) = 1;
4006 /* Generate and return an instruction to store the pair of registers
4007 REG and REG2 of mode MODE to location BASE with write-back adjusting
4008 the stack location BASE by ADJUSTMENT. */
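/* For example, with MODE == DImode, BASE == the stack pointer and
   ADJUSTMENT == 32, this corresponds to an instruction such as
   "stp x19, x20, [sp, -32]!".  */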
4010 static rtx
4011 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4012 HOST_WIDE_INT adjustment)
4014 switch (mode)
4016 case E_DImode:
4017 return gen_storewb_pairdi_di (base, base, reg, reg2,
4018 GEN_INT (-adjustment),
4019 GEN_INT (UNITS_PER_WORD - adjustment));
4020 case E_DFmode:
4021 return gen_storewb_pairdf_di (base, base, reg, reg2,
4022 GEN_INT (-adjustment),
4023 GEN_INT (UNITS_PER_WORD - adjustment));
4024 default:
4025 gcc_unreachable ();
4029 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4030 stack pointer by ADJUSTMENT. */
4032 static void
4033 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4035 rtx_insn *insn;
4036 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4038 if (regno2 == INVALID_REGNUM)
4039 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4041 rtx reg1 = gen_rtx_REG (mode, regno1);
4042 rtx reg2 = gen_rtx_REG (mode, regno2);
4044 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4045 reg2, adjustment));
4046 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4047 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4048 RTX_FRAME_RELATED_P (insn) = 1;
4051 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4052 adjusting it by ADJUSTMENT afterwards. */
4054 static rtx
4055 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4056 HOST_WIDE_INT adjustment)
4058 switch (mode)
4060 case E_DImode:
4061 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4062 GEN_INT (UNITS_PER_WORD));
4063 case E_DFmode:
4064 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4065 GEN_INT (UNITS_PER_WORD));
4066 default:
4067 gcc_unreachable ();
4071 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4072 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4073 into CFI_OPS. */
4075 static void
4076 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4077 rtx *cfi_ops)
4079 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4080 rtx reg1 = gen_rtx_REG (mode, regno1);
4082 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4084 if (regno2 == INVALID_REGNUM)
4086 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4087 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4088 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4090 else
4092 rtx reg2 = gen_rtx_REG (mode, regno2);
4093 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4094 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4095 reg2, adjustment));
4099 /* Generate and return a store pair instruction of mode MODE to store
4100 register REG1 to MEM1 and register REG2 to MEM2. */
4102 static rtx
4103 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4104 rtx reg2)
4106 switch (mode)
4108 case E_DImode:
4109 return gen_store_pairdi (mem1, reg1, mem2, reg2);
4111 case E_DFmode:
4112 return gen_store_pairdf (mem1, reg1, mem2, reg2);
4114 default:
4115 gcc_unreachable ();
4119 /* Generate and return a load pair instruction of mode MODE to load register
4120 REG1 from MEM1 and register REG2 from MEM2. */
4122 static rtx
4123 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4124 rtx mem2)
4126 switch (mode)
4128 case E_DImode:
4129 return gen_load_pairdi (reg1, mem1, reg2, mem2);
4131 case E_DFmode:
4132 return gen_load_pairdf (reg1, mem1, reg2, mem2);
4134 default:
4135 gcc_unreachable ();
4139 /* Return TRUE if return address signing should be enabled for the current
4140 function, otherwise return FALSE. */
4142 bool
4143 aarch64_return_address_signing_enabled (void)
4145 /* This function should only be called after the frame is laid out.  */
4146 gcc_assert (cfun->machine->frame.laid_out);
4148 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4149    if its LR is pushed onto the stack.  */
4150 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4151 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4152 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4155 /* Emit code to save the callee-saved registers of mode MODE from register
4156    number START up to and including LIMIT to the stack, starting at offset
4157    START_OFFSET, skipping any write-back candidates if SKIP_WB is true.  */
4159 static void
4160 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4161 unsigned start, unsigned limit, bool skip_wb)
4163 rtx_insn *insn;
4164 unsigned regno;
4165 unsigned regno2;
4167 for (regno = aarch64_next_callee_save (start, limit);
4168 regno <= limit;
4169 regno = aarch64_next_callee_save (regno + 1, limit))
4171 rtx reg, mem;
4172 poly_int64 offset;
4174 if (skip_wb
4175 && (regno == cfun->machine->frame.wb_candidate1
4176 || regno == cfun->machine->frame.wb_candidate2))
4177 continue;
4179 if (cfun->machine->reg_is_wrapped_separately[regno])
4180 continue;
4182 reg = gen_rtx_REG (mode, regno);
4183 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4184 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4185 offset));
4187 regno2 = aarch64_next_callee_save (regno + 1, limit);
4189 if (regno2 <= limit
4190 && !cfun->machine->reg_is_wrapped_separately[regno2]
4191 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4192 == cfun->machine->frame.reg_offset[regno2]))
4195 rtx reg2 = gen_rtx_REG (mode, regno2);
4196 rtx mem2;
4198 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4199 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4200 offset));
4201 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4202 reg2));
4204 /* The first part of a frame-related parallel insn is
4205 always assumed to be relevant to the frame
4206 calculations; subsequent parts, are only
4207 frame-related if explicitly marked. */
4208 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4209 regno = regno2;
4211 else
4212 insn = emit_move_insn (mem, reg);
4214 RTX_FRAME_RELATED_P (insn) = 1;
4218 /* Emit code to restore the callee-saved registers of mode MODE from register
4219 number START up to and including LIMIT. Restore from the stack offset
4220 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4221 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4223 static void
4224 aarch64_restore_callee_saves (machine_mode mode,
4225 poly_int64 start_offset, unsigned start,
4226 unsigned limit, bool skip_wb, rtx *cfi_ops)
4228 rtx base_rtx = stack_pointer_rtx;
4229 unsigned regno;
4230 unsigned regno2;
4231 poly_int64 offset;
4233 for (regno = aarch64_next_callee_save (start, limit);
4234 regno <= limit;
4235 regno = aarch64_next_callee_save (regno + 1, limit))
4237 if (cfun->machine->reg_is_wrapped_separately[regno])
4238 continue;
4240 rtx reg, mem;
4242 if (skip_wb
4243 && (regno == cfun->machine->frame.wb_candidate1
4244 || regno == cfun->machine->frame.wb_candidate2))
4245 continue;
4247 reg = gen_rtx_REG (mode, regno);
4248 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4249 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4251 regno2 = aarch64_next_callee_save (regno + 1, limit);
4253 if (regno2 <= limit
4254 && !cfun->machine->reg_is_wrapped_separately[regno2]
4255 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4256 == cfun->machine->frame.reg_offset[regno2]))
4258 rtx reg2 = gen_rtx_REG (mode, regno2);
4259 rtx mem2;
4261 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4262 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4263 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4265 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4266 regno = regno2;
4268 else
4269 emit_move_insn (reg, mem);
4270 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4274 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4275 of MODE. */
4277 static inline bool
4278 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4280 HOST_WIDE_INT multiple;
4281 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4282 && IN_RANGE (multiple, -8, 7));
4285 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4286 of MODE. */
4288 static inline bool
4289 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4291 HOST_WIDE_INT multiple;
4292 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4293 && IN_RANGE (multiple, 0, 63));
4296 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4297 of MODE. */
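/* For example, for DImode this accepts byte offsets that are multiples of 8
   in the range [-512, 504], matching the LDP/STP immediate range.  */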
4299 bool
4300 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4302 HOST_WIDE_INT multiple;
4303 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4304 && IN_RANGE (multiple, -64, 63));
4307 /* Return true if OFFSET is a signed 9-bit value. */
4309 static inline bool
4310 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4311 poly_int64 offset)
4313 HOST_WIDE_INT const_offset;
4314 return (offset.is_constant (&const_offset)
4315 && IN_RANGE (const_offset, -256, 255));
4318 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4319 of MODE. */
4321 static inline bool
4322 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4324 HOST_WIDE_INT multiple;
4325 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4326 && IN_RANGE (multiple, -256, 255));
4329 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4330 of MODE. */
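/* For example, for DImode this accepts byte offsets that are multiples of 8
   in the range [0, 32760], matching the LDR/STR unsigned immediate range.  */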
4332 static inline bool
4333 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4335 HOST_WIDE_INT multiple;
4336 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4337 && IN_RANGE (multiple, 0, 4095));
4340 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4342 static sbitmap
4343 aarch64_get_separate_components (void)
4345 aarch64_layout_frame ();
4347 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4348 bitmap_clear (components);
4350 /* The registers we need saved to the frame. */
4351 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4352 if (aarch64_register_saved_on_entry (regno))
4354 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4355 if (!frame_pointer_needed)
4356 offset += cfun->machine->frame.frame_size
4357 - cfun->machine->frame.hard_fp_offset;
4358 /* Check that we can access the stack slot of the register with one
4359 direct load with no adjustments needed. */
4360 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4361 bitmap_set_bit (components, regno);
4364 /* Don't mess with the hard frame pointer. */
4365 if (frame_pointer_needed)
4366 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4368 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4369 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4370 /* If aarch64_layout_frame has chosen registers to store/restore with
4371 writeback don't interfere with them to avoid having to output explicit
4372 stack adjustment instructions. */
4373 if (reg2 != INVALID_REGNUM)
4374 bitmap_clear_bit (components, reg2);
4375 if (reg1 != INVALID_REGNUM)
4376 bitmap_clear_bit (components, reg1);
4378 bitmap_clear_bit (components, LR_REGNUM);
4379 bitmap_clear_bit (components, SP_REGNUM);
4381 return components;
4384 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4386 static sbitmap
4387 aarch64_components_for_bb (basic_block bb)
4389 bitmap in = DF_LIVE_IN (bb);
4390 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4391 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4393 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4394 bitmap_clear (components);
4396 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4397 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4398 if ((!call_used_regs[regno])
4399 && (bitmap_bit_p (in, regno)
4400 || bitmap_bit_p (gen, regno)
4401 || bitmap_bit_p (kill, regno)))
4402 bitmap_set_bit (components, regno);
4404 return components;
4407 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4408 Nothing to do for aarch64. */
4410 static void
4411 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4415 /* Return the next set bit in BMP from START onwards. Return the total number
4416 of bits in BMP if no set bit is found at or after START. */
4418 static unsigned int
4419 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4421 unsigned int nbits = SBITMAP_SIZE (bmp);
4422 if (start == nbits)
4423 return start;
4425 gcc_assert (start < nbits);
4426 for (unsigned int i = start; i < nbits; i++)
4427 if (bitmap_bit_p (bmp, i))
4428 return i;
4430 return nbits;
4433 /* Do the work for aarch64_emit_prologue_components and
4434 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4435 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4436 for these components or the epilogue sequence. That is, it determines
4437 whether we should emit stores or loads and what kind of CFA notes to attach
4438 to the insns. Otherwise the logic for the two sequences is very
4439 similar. */
4441 static void
4442 aarch64_process_components (sbitmap components, bool prologue_p)
4444 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4445 ? HARD_FRAME_POINTER_REGNUM
4446 : STACK_POINTER_REGNUM);
4448 unsigned last_regno = SBITMAP_SIZE (components);
4449 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4450 rtx_insn *insn = NULL;
4452 while (regno != last_regno)
4454 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4455 so DFmode for the vector registers is enough. */
4456 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4457 rtx reg = gen_rtx_REG (mode, regno);
4458 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4459 if (!frame_pointer_needed)
4460 offset += cfun->machine->frame.frame_size
4461 - cfun->machine->frame.hard_fp_offset;
4462 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4463 rtx mem = gen_frame_mem (mode, addr);
4465 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4466 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4467 /* No more registers to handle after REGNO.
4468 Emit a single save/restore and exit. */
4469 if (regno2 == last_regno)
4471 insn = emit_insn (set);
4472 RTX_FRAME_RELATED_P (insn) = 1;
4473 if (prologue_p)
4474 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4475 else
4476 add_reg_note (insn, REG_CFA_RESTORE, reg);
4477 break;
4480 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4481 /* The next register is not of the same class or its offset is not
4482 mergeable with the current one into a pair. */
4483 if (!satisfies_constraint_Ump (mem)
4484 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4485 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4486 GET_MODE_SIZE (mode)))
4488 insn = emit_insn (set);
4489 RTX_FRAME_RELATED_P (insn) = 1;
4490 if (prologue_p)
4491 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4492 else
4493 add_reg_note (insn, REG_CFA_RESTORE, reg);
4495 regno = regno2;
4496 continue;
4499 /* REGNO2 can be saved/restored in a pair with REGNO. */
4500 rtx reg2 = gen_rtx_REG (mode, regno2);
4501 if (!frame_pointer_needed)
4502 offset2 += cfun->machine->frame.frame_size
4503 - cfun->machine->frame.hard_fp_offset;
4504 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4505 rtx mem2 = gen_frame_mem (mode, addr2);
4506 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4507 : gen_rtx_SET (reg2, mem2);
4509 if (prologue_p)
4510 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4511 else
4512 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4514 RTX_FRAME_RELATED_P (insn) = 1;
4515 if (prologue_p)
4517 add_reg_note (insn, REG_CFA_OFFSET, set);
4518 add_reg_note (insn, REG_CFA_OFFSET, set2);
4520 else
4522 add_reg_note (insn, REG_CFA_RESTORE, reg);
4523 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4526 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4530 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4532 static void
4533 aarch64_emit_prologue_components (sbitmap components)
4535 aarch64_process_components (components, true);
4538 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4540 static void
4541 aarch64_emit_epilogue_components (sbitmap components)
4543 aarch64_process_components (components, false);
4546 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4548 static void
4549 aarch64_set_handled_components (sbitmap components)
4551 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4552 if (bitmap_bit_p (components, regno))
4553 cfun->machine->reg_is_wrapped_separately[regno] = true;
4556 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4557 is saved at BASE + OFFSET. */
4559 static void
4560 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4561 rtx base, poly_int64 offset)
4563 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4564 add_reg_note (insn, REG_CFA_EXPRESSION,
4565 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4568 /* AArch64 stack frames generated by this compiler look like:
4570 +-------------------------------+
4572 | incoming stack arguments |
4574 +-------------------------------+
4575 | | <-- incoming stack pointer (aligned)
4576 | callee-allocated save area |
4577 | for register varargs |
4579 +-------------------------------+
4580 | local variables | <-- frame_pointer_rtx
4582 +-------------------------------+
4583 | padding0 | \
4584 +-------------------------------+ |
4585 | callee-saved registers | | frame.saved_regs_size
4586 +-------------------------------+ |
4587 | LR' | |
4588 +-------------------------------+ |
4589 | FP' | / <- hard_frame_pointer_rtx (aligned)
4590 +-------------------------------+
4591 | dynamic allocation |
4592 +-------------------------------+
4593 | padding |
4594 +-------------------------------+
4595 | outgoing stack arguments | <-- arg_pointer
4597 +-------------------------------+
4598 | | <-- stack_pointer_rtx (aligned)
4600 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4601 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4602 unchanged. */
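/* aarch64_layout_frame records how the prologue reaches this layout in
   cfun->machine->frame: initial_adjust is an explicit first decrement of the
   stack pointer, callee_adjust is a decrement folded into the first
   callee-save push with writeback, callee_offset is the offset of the
   callee-save area above the stack pointer at the point the saves are
   emitted, and final_adjust allocates the outgoing argument area.  */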
4604 /* Generate the prologue instructions for entry into a function.
4605 Establish the stack frame by decreasing the stack pointer with a
4606 properly calculated size and, if necessary, create a frame record
4607 filled with the values of LR and previous frame pointer. The
4608 current FP is also set up if it is in use. */
4610 void
4611 aarch64_expand_prologue (void)
4613 aarch64_layout_frame ();
4615 poly_int64 frame_size = cfun->machine->frame.frame_size;
4616 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4617 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4618 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4619 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4620 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4621 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4622 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4623 rtx_insn *insn;
4625 /* Sign return address for functions. */
4626 if (aarch64_return_address_signing_enabled ())
4628 insn = emit_insn (gen_pacisp ());
4629 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4630 RTX_FRAME_RELATED_P (insn) = 1;
4633 if (flag_stack_usage_info)
4634 current_function_static_stack_size = constant_lower_bound (frame_size);
4636 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4638 if (crtl->is_leaf && !cfun->calls_alloca)
4640 if (maybe_gt (frame_size, PROBE_INTERVAL)
4641 && maybe_gt (frame_size, get_stack_check_protect ()))
4642 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4643 (frame_size
4644 - get_stack_check_protect ()));
4646 else if (maybe_gt (frame_size, 0))
4647 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4650 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4651 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4653 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4655 if (callee_adjust != 0)
4656 aarch64_push_regs (reg1, reg2, callee_adjust);
4658 if (emit_frame_chain)
4660 poly_int64 reg_offset = callee_adjust;
4661 if (callee_adjust == 0)
4663 reg1 = R29_REGNUM;
4664 reg2 = R30_REGNUM;
4665 reg_offset = callee_offset;
4666 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4668 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4669 stack_pointer_rtx, callee_offset,
4670 ip1_rtx, ip0_rtx, frame_pointer_needed);
4671 if (frame_pointer_needed && !frame_size.is_constant ())
4673 /* Variable-sized frames need to describe the save slot
4674 address using DW_CFA_expression rather than DW_CFA_offset.
4675 This means that, without taking further action, the
4676 locations of the registers that we've already saved would
4677 remain based on the stack pointer even after we redefine
4678 the CFA based on the frame pointer. We therefore need new
4679 DW_CFA_expressions to re-express the save slots with addresses
4680 based on the frame pointer. */
4681 rtx_insn *insn = get_last_insn ();
4682 gcc_assert (RTX_FRAME_RELATED_P (insn));
4684 /* Add an explicit CFA definition if this was previously
4685 implicit. */
4686 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4688 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4689 callee_offset);
4690 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4691 gen_rtx_SET (hard_frame_pointer_rtx, src));
4694 /* Change the save slot expressions for the registers that
4695 we've already saved. */
4696 reg_offset -= callee_offset;
4697 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4698 reg_offset + UNITS_PER_WORD);
4699 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4700 reg_offset);
4702 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4705 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4706 callee_adjust != 0 || emit_frame_chain);
4707 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4708 callee_adjust != 0 || emit_frame_chain);
4709 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4712 /* Return TRUE if we can use a simple_return insn.
4714    This function checks whether the callee-saved stack is empty, which
4715    means that no restore actions are needed.  The pro_and_epilogue pass
4716    uses this to check whether the shrink-wrapping optimization is feasible.  */
4718 bool
4719 aarch64_use_return_insn_p (void)
4721 if (!reload_completed)
4722 return false;
4724 if (crtl->profile)
4725 return false;
4727 aarch64_layout_frame ();
4729 return known_eq (cfun->machine->frame.frame_size, 0);
4732 /* Generate the epilogue instructions for returning from a function.
4733    This is almost exactly the reverse of the prologue sequence, except
4734 that we need to insert barriers to avoid scheduling loads that read
4735 from a deallocated stack, and we optimize the unwind records by
4736 emitting them all together if possible. */
4737 void
4738 aarch64_expand_epilogue (bool for_sibcall)
4740 aarch64_layout_frame ();
4742 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4743 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4744 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4745 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4746 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4747 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4748 rtx cfi_ops = NULL;
4749 rtx_insn *insn;
4750 /* A stack clash protection prologue may not have left IP0_REGNUM or
4751 IP1_REGNUM in a usable state. The same is true for allocations
4752 with an SVE component, since we then need both temporary registers
4753 for each allocation. */
4754 bool can_inherit_p = (initial_adjust.is_constant ()
4755 && final_adjust.is_constant ()
4756 && !flag_stack_clash_protection);
4758 /* We need to add memory barrier to prevent read from deallocated stack. */
4759 bool need_barrier_p
4760 = maybe_ne (get_frame_size ()
4761 + cfun->machine->frame.saved_varargs_size, 0);
4763 /* Emit a barrier to prevent loads from a deallocated stack. */
4764 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4765 || cfun->calls_alloca
4766 || crtl->calls_eh_return)
4768 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4769 need_barrier_p = false;
4772 /* Restore the stack pointer from the frame pointer if it may not
4773 be the same as the stack pointer. */
4774 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4775 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4776 if (frame_pointer_needed
4777 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4778 /* If writeback is used when restoring callee-saves, the CFA
4779 is restored on the instruction doing the writeback. */
4780 aarch64_add_offset (Pmode, stack_pointer_rtx,
4781 hard_frame_pointer_rtx, -callee_offset,
4782 ip1_rtx, ip0_rtx, callee_adjust == 0);
4783 else
4784 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4785 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4787 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4788 callee_adjust != 0, &cfi_ops);
4789 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4790 callee_adjust != 0, &cfi_ops);
4792 if (need_barrier_p)
4793 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4795 if (callee_adjust != 0)
4796 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4798 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4800 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4801 insn = get_last_insn ();
4802 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4803 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4804 RTX_FRAME_RELATED_P (insn) = 1;
4805 cfi_ops = NULL;
4808 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4809 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4811 if (cfi_ops)
4813 /* Emit delayed restores and reset the CFA to be SP. */
4814 insn = get_last_insn ();
4815 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4816 REG_NOTES (insn) = cfi_ops;
4817 RTX_FRAME_RELATED_P (insn) = 1;
4820 /* We prefer to emit the combined return/authenticate instruction RETAA;
4821    however, there are three cases in which we must instead emit an explicit
4822 authentication instruction.
4824 1) Sibcalls don't return in a normal way, so if we're about to call one
4825 we must authenticate.
4827 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
4828 generating code for !TARGET_ARMV8_3 we can't use it and must
4829 explicitly authenticate.
4831 3) On an eh_return path we make extra stack adjustments to update the
4832 canonical frame address to be the exception handler's CFA. We want
4833      to authenticate using the CFA of the function which calls eh_return.  */
4835 if (aarch64_return_address_signing_enabled ()
4836 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
4838 insn = emit_insn (gen_autisp ());
4839 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4840 RTX_FRAME_RELATED_P (insn) = 1;
4843 /* Stack adjustment for exception handler. */
4844 if (crtl->calls_eh_return)
4846 /* We need to unwind the stack by the offset computed by
4847 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
4848 to be SP; letting the CFA move during this adjustment
4849 is just as correct as retaining the CFA from the body
4850 of the function. Therefore, do nothing special. */
4851 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
4854 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
4855 if (!for_sibcall)
4856 emit_jump_insn (ret_rtx);
4859 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
4860 normally or return to a previous frame after unwinding.
4862 An EH return uses a single shared return sequence. The epilogue is
4863 exactly like a normal epilogue except that it has an extra input
4864 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
4865 that must be applied after the frame has been destroyed. An extra label
4866 is inserted before the epilogue which initializes this register to zero,
4867 and this is the entry point for a normal return.
4869 An actual EH return updates the return address, initializes the stack
4870 adjustment and jumps directly into the epilogue (bypassing the zeroing
4871 of the adjustment). Since the return address is typically saved on the
4872 stack when a function makes a call, the saved LR must be updated outside
4873 the epilogue.
4875 This poses problems as the store is generated well before the epilogue,
4876 so the offset of LR is not known yet. Also optimizations will remove the
4877 store as it appears dead, even after the epilogue is generated (as the
4878 base or offset for loading LR is different in many cases).
4880 To avoid these problems this implementation forces the frame pointer
4881 in eh_return functions so that the location of LR is fixed and known early.
4882 It also marks the store volatile, so no optimization is permitted to
4883 remove the store. */
4885 aarch64_eh_return_handler_rtx (void)
4887 rtx tmp = gen_frame_mem (Pmode,
4888 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
4890 /* Mark the store volatile, so no optimization is permitted to remove it. */
4891 MEM_VOLATILE_P (tmp) = true;
4892 return tmp;
4895 /* Output code to add DELTA to the first argument, and then jump
4896 to FUNCTION. Used for C++ multiple inheritance. */
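/* If VCALL_OFFSET is nonzero, the thunk also loads *(*this + VCALL_OFFSET)
   (where *this is read after DELTA has been added) and adds that value to
   the this pointer before tail-calling FUNCTION.  */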
4897 static void
4898 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
4899 HOST_WIDE_INT delta,
4900 HOST_WIDE_INT vcall_offset,
4901 tree function)
4903 /* The this pointer is always in x0. Note that this differs from
4904    Arm, where the this pointer may be bumped to r1 if r0 is required
4905 to return a pointer to an aggregate. On AArch64 a result value
4906 pointer will be in x8. */
4907 int this_regno = R0_REGNUM;
4908 rtx this_rtx, temp0, temp1, addr, funexp;
4909 rtx_insn *insn;
4911 reload_completed = 1;
4912 emit_note (NOTE_INSN_PROLOGUE_END);
4914 this_rtx = gen_rtx_REG (Pmode, this_regno);
4915 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
4916 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
4918 if (vcall_offset == 0)
4919 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
4920 else
4922 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
4924 addr = this_rtx;
4925 if (delta != 0)
4927 if (delta >= -256 && delta < 256)
4928 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
4929 plus_constant (Pmode, this_rtx, delta));
4930 else
4931 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
4932 temp1, temp0, false);
4935 if (Pmode == ptr_mode)
4936 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
4937 else
4938 aarch64_emit_move (temp0,
4939 gen_rtx_ZERO_EXTEND (Pmode,
4940 gen_rtx_MEM (ptr_mode, addr)));
4942 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
4943 addr = plus_constant (Pmode, temp0, vcall_offset);
4944 else
4946 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
4947 Pmode);
4948 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
4951 if (Pmode == ptr_mode)
4952 	aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
4953 else
4954 aarch64_emit_move (temp1,
4955 gen_rtx_SIGN_EXTEND (Pmode,
4956 gen_rtx_MEM (ptr_mode, addr)));
4958 emit_insn (gen_add2_insn (this_rtx, temp1));
4961 /* Generate a tail call to the target function. */
4962 if (!TREE_USED (function))
4964 assemble_external (function);
4965 TREE_USED (function) = 1;
4967 funexp = XEXP (DECL_RTL (function), 0);
4968 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
4969 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
4970 SIBLING_CALL_P (insn) = 1;
4972 insn = get_insns ();
4973 shorten_branches (insn);
4974 final_start_function (insn, file, 1);
4975 final (insn, file, 1);
4976 final_end_function ();
4978 /* Stop pretending to be a post-reload pass. */
4979 reload_completed = 0;
4982 static bool
4983 aarch64_tls_referenced_p (rtx x)
4985 if (!TARGET_HAVE_TLS)
4986 return false;
4987 subrtx_iterator::array_type array;
4988 FOR_EACH_SUBRTX (iter, array, x, ALL)
4990 const_rtx x = *iter;
4991 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
4992 return true;
4993 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
4994 TLS offsets, not real symbol references. */
4995 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4996 iter.skip_subrtxes ();
4998 return false;
5002 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5003 a left shift of 0 or 12 bits. */
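/* For example, 0xabc and 0xabc000 satisfy this test, while 0xabc00 (which
   would need a shift of 8) does not.  */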
5004 bool
5005 aarch64_uimm12_shift (HOST_WIDE_INT val)
5007 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5008 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5013 /* Return true if val is an immediate that can be loaded into a
5014 register by a MOVZ instruction. */
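/* For example, 0xf000 and 0x12340000 can be loaded with a single MOVZ (with
   shifts of 0 and 16 respectively), whereas 0x12345678 cannot.  */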
5015 static bool
5016 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5018 if (GET_MODE_SIZE (mode) > 4)
5020 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5021 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5022 return 1;
5024 else
5026 /* Ignore sign extension. */
5027 val &= (HOST_WIDE_INT) 0xffffffff;
5029 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5030 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5033 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5034 64-bit (DImode) integer. */
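/* For example, replicating the HImode value 0x00ff gives
   0x00ff00ff00ff00ff.  */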
5036 static unsigned HOST_WIDE_INT
5037 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5039 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5040 while (size < 64)
5042 val &= (HOST_WIDE_INT_1U << size) - 1;
5043 val |= val << size;
5044 size *= 2;
5046 return val;
5049 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5051 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5053 0x0000000100000001ull,
5054 0x0001000100010001ull,
5055 0x0101010101010101ull,
5056 0x1111111111111111ull,
5057 0x5555555555555555ull,
5061 /* Return true if val is a valid bitmask immediate. */
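/* A valid bitmask immediate is a run of contiguous ones, possibly rotated,
   repeated across the register in 2-, 4-, 8-, 16-, 32- or 64-bit elements;
   all-zeros and all-ones are rejected.  For example, 0x0ff00ff00ff00ff0 is
   valid, while 0x0000000000012345 is not.  */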
5063 bool
5064 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5066 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5067 int bits;
5069 /* Check for a single sequence of one bits and return quickly if so.
5070 The special cases of all ones and all zeroes returns false. */
5071 val = aarch64_replicate_bitmask_imm (val_in, mode);
5072 tmp = val + (val & -val);
5074 if (tmp == (tmp & -tmp))
5075 return (val + 1) > 1;
5077 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5078 if (mode == SImode)
5079 val = (val << 32) | (val & 0xffffffff);
5081 /* Invert if the immediate doesn't start with a zero bit - this means we
5082 only need to search for sequences of one bits. */
5083 if (val & 1)
5084 val = ~val;
5086 /* Find the first set bit and set tmp to val with the first sequence of one
5087 bits removed. Return success if there is a single sequence of ones. */
5088 first_one = val & -val;
5089 tmp = val & (val + first_one);
5091 if (tmp == 0)
5092 return true;
5094 /* Find the next set bit and compute the difference in bit position. */
5095 next_one = tmp & -tmp;
5096 bits = clz_hwi (first_one) - clz_hwi (next_one);
5097 mask = val ^ tmp;
5099 /* Check the bit position difference is a power of 2, and that the first
5100 sequence of one bits fits within 'bits' bits. */
5101 if ((mask >> bits) != 0 || bits != (bits & -bits))
5102 return false;
5104 /* Check the sequence of one bits is repeated 64/bits times. */
5105 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
5108 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
5109    Assumed precondition: VAL_IN is not zero.  */
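/* For example, for VAL_IN == 0x00ff0f00 the result is 0x00ffff00.  */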
5111 unsigned HOST_WIDE_INT
5112 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5114 int lowest_bit_set = ctz_hwi (val_in);
5115 int highest_bit_set = floor_log2 (val_in);
5116 gcc_assert (val_in != 0);
5118 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5119 (HOST_WIDE_INT_1U << lowest_bit_set));
5122 /* Create a constant in which all bits outside the range from the lowest set
5123    bit to the highest set bit of VAL_IN are set to 1.  */
5125 unsigned HOST_WIDE_INT
5126 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5128 return val_in | ~aarch64_and_split_imm1 (val_in);
5131 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
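/* Such a constant can instead be applied with two AND instructions: VAL_IN
   equals aarch64_and_split_imm1 (VAL_IN) & aarch64_and_split_imm2 (VAL_IN),
   where the first mask is a contiguous run of ones by construction and the
   check below verifies that the second is itself a bitmask immediate.  */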
5133 bool
5134 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5136 scalar_int_mode int_mode;
5137 if (!is_a <scalar_int_mode> (mode, &int_mode))
5138 return false;
5140 if (aarch64_bitmask_imm (val_in, int_mode))
5141 return false;
5143 if (aarch64_move_imm (val_in, int_mode))
5144 return false;
5146 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5148 return aarch64_bitmask_imm (imm2, int_mode);
5151 /* Return true if val is an immediate that can be loaded into a
5152 register in a single instruction. */
5153 bool
5154 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5156 scalar_int_mode int_mode;
5157 if (!is_a <scalar_int_mode> (mode, &int_mode))
5158 return false;
5160 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5161 return 1;
5162 return aarch64_bitmask_imm (val, int_mode);
5165 static bool
5166 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5168 rtx base, offset;
5170 if (GET_CODE (x) == HIGH)
5171 return true;
5173 /* There's no way to calculate VL-based values using relocations. */
5174 subrtx_iterator::array_type array;
5175 FOR_EACH_SUBRTX (iter, array, x, ALL)
5176 if (GET_CODE (*iter) == CONST_POLY_INT)
5177 return true;
5179 split_const (x, &base, &offset);
5180 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5182 if (aarch64_classify_symbol (base, INTVAL (offset))
5183 != SYMBOL_FORCE_TO_MEM)
5184 return true;
5185 else
5186 /* Avoid generating a 64-bit relocation in ILP32; leave
5187 to aarch64_expand_mov_immediate to handle it properly. */
5188 return mode != ptr_mode;
5191 return aarch64_tls_referenced_p (x);
5194 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5195 The expansion for a table switch is quite expensive due to the number
5196    of instructions, the table lookup and the hard-to-predict indirect jump.
5197    When optimizing for speed with -O3 enabled, use the per-core tuning if
5198 set, otherwise use tables for > 16 cases as a tradeoff between size and
5199 performance. When optimizing for size, use the default setting. */
5201 static unsigned int
5202 aarch64_case_values_threshold (void)
5204 /* Use the specified limit for the number of cases before using jump
5205 tables at higher optimization levels. */
5206 if (optimize > 2
5207 && selected_cpu->tune->max_case_values != 0)
5208 return selected_cpu->tune->max_case_values;
5209 else
5210 return optimize_size ? default_case_values_threshold () : 17;
5213 /* Return true if register REGNO is a valid index register.
5214 STRICT_P is true if REG_OK_STRICT is in effect. */
5216 bool
5217 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5219 if (!HARD_REGISTER_NUM_P (regno))
5221 if (!strict_p)
5222 return true;
5224 if (!reg_renumber)
5225 return false;
5227 regno = reg_renumber[regno];
5229 return GP_REGNUM_P (regno);
5232 /* Return true if register REGNO is a valid base register.
5233 STRICT_P is true if REG_OK_STRICT is in effect. */
5235 bool
5236 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5238 if (!HARD_REGISTER_NUM_P (regno))
5240 if (!strict_p)
5241 return true;
5243 if (!reg_renumber)
5244 return false;
5246 regno = reg_renumber[regno];
5249 /* The fake registers will be eliminated to either the stack or
5250 hard frame pointer, both of which are usually valid base registers.
5251 Reload deals with the cases where the eliminated form isn't valid. */
5252 return (GP_REGNUM_P (regno)
5253 || regno == SP_REGNUM
5254 || regno == FRAME_POINTER_REGNUM
5255 || regno == ARG_POINTER_REGNUM);
5258 /* Return true if X is a valid base register.
5259 STRICT_P is true if REG_OK_STRICT is in effect. */
5261 static bool
5262 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5264 if (!strict_p
5265 && GET_CODE (x) == SUBREG
5266 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5267 x = SUBREG_REG (x);
5269 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5272 /* Return true if address offset is a valid index. If it is, fill in INFO
5273 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
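/* The forms accepted here correspond to addresses such as [x0, x1],
   [x0, x1, lsl #3], [x0, w1, sxtw] and [x0, w1, uxtw #2].  */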
5275 static bool
5276 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5277 machine_mode mode, bool strict_p)
5279 enum aarch64_address_type type;
5280 rtx index;
5281 int shift;
5283 /* (reg:P) */
5284 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5285 && GET_MODE (x) == Pmode)
5287 type = ADDRESS_REG_REG;
5288 index = x;
5289 shift = 0;
5291 /* (sign_extend:DI (reg:SI)) */
5292 else if ((GET_CODE (x) == SIGN_EXTEND
5293 || GET_CODE (x) == ZERO_EXTEND)
5294 && GET_MODE (x) == DImode
5295 && GET_MODE (XEXP (x, 0)) == SImode)
5297 type = (GET_CODE (x) == SIGN_EXTEND)
5298 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5299 index = XEXP (x, 0);
5300 shift = 0;
5302 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5303 else if (GET_CODE (x) == MULT
5304 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5305 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5306 && GET_MODE (XEXP (x, 0)) == DImode
5307 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5308 && CONST_INT_P (XEXP (x, 1)))
5310 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5311 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5312 index = XEXP (XEXP (x, 0), 0);
5313 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5315 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5316 else if (GET_CODE (x) == ASHIFT
5317 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5318 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5319 && GET_MODE (XEXP (x, 0)) == DImode
5320 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5321 && CONST_INT_P (XEXP (x, 1)))
5323 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5324 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5325 index = XEXP (XEXP (x, 0), 0);
5326 shift = INTVAL (XEXP (x, 1));
5328 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5329 else if ((GET_CODE (x) == SIGN_EXTRACT
5330 || GET_CODE (x) == ZERO_EXTRACT)
5331 && GET_MODE (x) == DImode
5332 && GET_CODE (XEXP (x, 0)) == MULT
5333 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5334 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5336 type = (GET_CODE (x) == SIGN_EXTRACT)
5337 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5338 index = XEXP (XEXP (x, 0), 0);
5339 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5340 if (INTVAL (XEXP (x, 1)) != 32 + shift
5341 || INTVAL (XEXP (x, 2)) != 0)
5342 shift = -1;
5344 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5345 (const_int 0xffffffff<<shift)) */
5346 else if (GET_CODE (x) == AND
5347 && GET_MODE (x) == DImode
5348 && GET_CODE (XEXP (x, 0)) == MULT
5349 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5350 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5351 && CONST_INT_P (XEXP (x, 1)))
5353 type = ADDRESS_REG_UXTW;
5354 index = XEXP (XEXP (x, 0), 0);
5355 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5356 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5357 shift = -1;
5359 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5360 else if ((GET_CODE (x) == SIGN_EXTRACT
5361 || GET_CODE (x) == ZERO_EXTRACT)
5362 && GET_MODE (x) == DImode
5363 && GET_CODE (XEXP (x, 0)) == ASHIFT
5364 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5365 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5367 type = (GET_CODE (x) == SIGN_EXTRACT)
5368 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5369 index = XEXP (XEXP (x, 0), 0);
5370 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5371 if (INTVAL (XEXP (x, 1)) != 32 + shift
5372 || INTVAL (XEXP (x, 2)) != 0)
5373 shift = -1;
5375 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5376 (const_int 0xffffffff<<shift)) */
5377 else if (GET_CODE (x) == AND
5378 && GET_MODE (x) == DImode
5379 && GET_CODE (XEXP (x, 0)) == ASHIFT
5380 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5381 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5382 && CONST_INT_P (XEXP (x, 1)))
5384 type = ADDRESS_REG_UXTW;
5385 index = XEXP (XEXP (x, 0), 0);
5386 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5387 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5388 shift = -1;
5390 /* (mult:P (reg:P) (const_int scale)) */
5391 else if (GET_CODE (x) == MULT
5392 && GET_MODE (x) == Pmode
5393 && GET_MODE (XEXP (x, 0)) == Pmode
5394 && CONST_INT_P (XEXP (x, 1)))
5396 type = ADDRESS_REG_REG;
5397 index = XEXP (x, 0);
5398 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5400 /* (ashift:P (reg:P) (const_int shift)) */
5401 else if (GET_CODE (x) == ASHIFT
5402 && GET_MODE (x) == Pmode
5403 && GET_MODE (XEXP (x, 0)) == Pmode
5404 && CONST_INT_P (XEXP (x, 1)))
5406 type = ADDRESS_REG_REG;
5407 index = XEXP (x, 0);
5408 shift = INTVAL (XEXP (x, 1));
5410 else
5411 return false;
5413 if (!strict_p
5414 && GET_CODE (index) == SUBREG
5415 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5416 index = SUBREG_REG (index);
5418 if (aarch64_sve_data_mode_p (mode))
5420 if (type != ADDRESS_REG_REG
5421 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5422 return false;
5424 else
5426 if (shift != 0
5427 && !(IN_RANGE (shift, 1, 3)
5428 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5429 return false;
5432 if (REG_P (index)
5433 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5435 info->type = type;
5436 info->offset = index;
5437 info->shift = shift;
5438 return true;
5441 return false;
5444 /* Return true if MODE is one of the modes for which we
5445 support LDP/STP operations. */
5447 static bool
5448 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5450 return mode == SImode || mode == DImode
5451 || mode == SFmode || mode == DFmode
5452 || (aarch64_vector_mode_supported_p (mode)
5453 && known_eq (GET_MODE_SIZE (mode), 8));
5456 /* Return true if REGNO is a virtual pointer register, or an eliminable
5457 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5458 include stack_pointer or hard_frame_pointer. */
5459 static bool
5460 virt_or_elim_regno_p (unsigned regno)
5462 return ((regno >= FIRST_VIRTUAL_REGISTER
5463 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5464 || regno == FRAME_POINTER_REGNUM
5465 || regno == ARG_POINTER_REGNUM);
5468 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5469 If it is, fill in INFO appropriately. STRICT_P is true if
5470 REG_OK_STRICT is in effect. */
5472 static bool
5473 aarch64_classify_address (struct aarch64_address_info *info,
5474 rtx x, machine_mode mode, bool strict_p,
5475 aarch64_addr_query_type type = ADDR_QUERY_M)
5477 enum rtx_code code = GET_CODE (x);
5478 rtx op0, op1;
5479 poly_int64 offset;
5481 HOST_WIDE_INT const_size;
5483 /* On BE, we use load/store pair for all large int mode load/stores.
5484 TI/TFmode may also use a load/store pair. */
5485 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5486 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5487 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5488 || mode == TImode
5489 || mode == TFmode
5490 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5492 bool allow_reg_index_p = (!load_store_pair_p
5493 && (known_lt (GET_MODE_SIZE (mode), 16)
5494 || vec_flags == VEC_ADVSIMD
5495 || vec_flags == VEC_SVE_DATA));
5497 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5498 [Rn, #offset, MUL VL]. */
5499 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5500 && (code != REG && code != PLUS))
5501 return false;
5503 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5504 REG addressing. */
5505 if (advsimd_struct_p
5506 && !BYTES_BIG_ENDIAN
5507 && (code != POST_INC && code != REG))
5508 return false;
5510 gcc_checking_assert (GET_MODE (x) == VOIDmode
5511 || SCALAR_INT_MODE_P (GET_MODE (x)));
5513 switch (code)
5515 case REG:
5516 case SUBREG:
5517 info->type = ADDRESS_REG_IMM;
5518 info->base = x;
5519 info->offset = const0_rtx;
5520 info->const_offset = 0;
5521 return aarch64_base_register_rtx_p (x, strict_p);
5523 case PLUS:
5524 op0 = XEXP (x, 0);
5525 op1 = XEXP (x, 1);
5527 if (! strict_p
5528 && REG_P (op0)
5529 && virt_or_elim_regno_p (REGNO (op0))
5530 && poly_int_rtx_p (op1, &offset))
5532 info->type = ADDRESS_REG_IMM;
5533 info->base = op0;
5534 info->offset = op1;
5535 info->const_offset = offset;
5537 return true;
5540 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5541 && aarch64_base_register_rtx_p (op0, strict_p)
5542 && poly_int_rtx_p (op1, &offset))
5544 info->type = ADDRESS_REG_IMM;
5545 info->base = op0;
5546 info->offset = op1;
5547 info->const_offset = offset;
5549 /* TImode and TFmode values are allowed in both pairs of X
5550 registers and individual Q registers. The available
5551 address modes are:
5552 X,X: 7-bit signed scaled offset
5553 Q: 9-bit signed offset
5554 We conservatively require an offset representable in either mode.
5555 When performing the check for pairs of X registers i.e. LDP/STP
5556 pass down DImode since that is the natural size of the LDP/STP
5557 instruction memory accesses. */
5558 if (mode == TImode || mode == TFmode)
5559 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5560 && (offset_9bit_signed_unscaled_p (mode, offset)
5561 || offset_12bit_unsigned_scaled_p (mode, offset)));
5563 	  /* A 7-bit offset check because OImode will emit an ldp/stp
5564 instruction (only big endian will get here).
5565 For ldp/stp instructions, the offset is scaled for the size of a
5566 single element of the pair. */
5567 if (mode == OImode)
5568 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5570 /* Three 9/12-bit offset checks because CImode will emit three
5571 ldr/str instructions (only big endian will get here). */
5572 if (mode == CImode)
5573 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5574 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5575 || offset_12bit_unsigned_scaled_p (V16QImode,
5576 offset + 32)));
5578 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5579 instructions (only big endian will get here). */
5580 if (mode == XImode)
5581 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5582 && aarch64_offset_7bit_signed_scaled_p (TImode,
5583 offset + 32));
5585 /* Make "m" use the LD1 offset range for SVE data modes, so
5586 that pre-RTL optimizers like ivopts plan for that range
5587 instead of the wider LDR/STR range. */
5588 if (vec_flags == VEC_SVE_DATA)
5589 return (type == ADDR_QUERY_M
5590 ? offset_4bit_signed_scaled_p (mode, offset)
5591 : offset_9bit_signed_scaled_p (mode, offset));
5593 if (vec_flags == VEC_SVE_PRED)
5594 return offset_9bit_signed_scaled_p (mode, offset);
5596 if (load_store_pair_p)
5597 return ((known_eq (GET_MODE_SIZE (mode), 4)
5598 || known_eq (GET_MODE_SIZE (mode), 8))
5599 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5600 else
5601 return (offset_9bit_signed_unscaled_p (mode, offset)
5602 || offset_12bit_unsigned_scaled_p (mode, offset));
5605 if (allow_reg_index_p)
5607 /* Look for base + (scaled/extended) index register. */
5608 if (aarch64_base_register_rtx_p (op0, strict_p)
5609 && aarch64_classify_index (info, op1, mode, strict_p))
5611 info->base = op0;
5612 return true;
5614 if (aarch64_base_register_rtx_p (op1, strict_p)
5615 && aarch64_classify_index (info, op0, mode, strict_p))
5617 info->base = op1;
5618 return true;
5622 return false;
5624 case POST_INC:
5625 case POST_DEC:
5626 case PRE_INC:
5627 case PRE_DEC:
5628 info->type = ADDRESS_REG_WB;
5629 info->base = XEXP (x, 0);
5630 info->offset = NULL_RTX;
5631 return aarch64_base_register_rtx_p (info->base, strict_p);
5633 case POST_MODIFY:
5634 case PRE_MODIFY:
5635 info->type = ADDRESS_REG_WB;
5636 info->base = XEXP (x, 0);
5637 if (GET_CODE (XEXP (x, 1)) == PLUS
5638 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5639 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5640 && aarch64_base_register_rtx_p (info->base, strict_p))
5642 info->offset = XEXP (XEXP (x, 1), 1);
5643 info->const_offset = offset;
5645 /* TImode and TFmode values are allowed in both pairs of X
5646 registers and individual Q registers. The available
5647 address modes are:
5648 X,X: 7-bit signed scaled offset
5649 Q: 9-bit signed offset
5650 We conservatively require an offset representable in either mode.
5652 if (mode == TImode || mode == TFmode)
5653 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5654 && offset_9bit_signed_unscaled_p (mode, offset));
5656 if (load_store_pair_p)
5657 return ((known_eq (GET_MODE_SIZE (mode), 4)
5658 || known_eq (GET_MODE_SIZE (mode), 8))
5659 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5660 else
5661 return offset_9bit_signed_unscaled_p (mode, offset);
5663 return false;
5665 case CONST:
5666 case SYMBOL_REF:
5667 case LABEL_REF:
5668 /* Load literal: a pc-relative constant pool entry. Only supported
5669 for SImode or larger. */
5670 info->type = ADDRESS_SYMBOLIC;
5672 if (!load_store_pair_p
5673 && GET_MODE_SIZE (mode).is_constant (&const_size)
5674 && const_size >= 4)
5676 rtx sym, addend;
5678 split_const (x, &sym, &addend);
5679 return ((GET_CODE (sym) == LABEL_REF
5680 || (GET_CODE (sym) == SYMBOL_REF
5681 && CONSTANT_POOL_ADDRESS_P (sym)
5682 && aarch64_pcrelative_literal_loads)));
5684 return false;
5686 case LO_SUM:
5687 info->type = ADDRESS_LO_SUM;
5688 info->base = XEXP (x, 0);
5689 info->offset = XEXP (x, 1);
5690 if (allow_reg_index_p
5691 && aarch64_base_register_rtx_p (info->base, strict_p))
5693 rtx sym, offs;
5694 split_const (info->offset, &sym, &offs);
5695 if (GET_CODE (sym) == SYMBOL_REF
5696 && (aarch64_classify_symbol (sym, INTVAL (offs))
5697 == SYMBOL_SMALL_ABSOLUTE))
5699 /* The symbol and offset must be aligned to the access size. */
5700 unsigned int align;
5702 if (CONSTANT_POOL_ADDRESS_P (sym))
5703 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5704 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5706 tree exp = SYMBOL_REF_DECL (sym);
5707 align = TYPE_ALIGN (TREE_TYPE (exp));
5708 align = aarch64_constant_alignment (exp, align);
5710 else if (SYMBOL_REF_DECL (sym))
5711 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5712 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5713 && SYMBOL_REF_BLOCK (sym) != NULL)
5714 align = SYMBOL_REF_BLOCK (sym)->alignment;
5715 else
5716 align = BITS_PER_UNIT;
5718 poly_int64 ref_size = GET_MODE_SIZE (mode);
5719 if (known_eq (ref_size, 0))
5720 ref_size = GET_MODE_SIZE (DImode);
5722 return (multiple_p (INTVAL (offs), ref_size)
5723 && multiple_p (align / BITS_PER_UNIT, ref_size));
5726 return false;
5728 default:
5729 return false;
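/* Illustrative examples (a non-exhaustive sketch; register numbers and the
   symbol name are arbitrary) of RTL addresses accepted above and the
   classification they receive:

     (reg:DI x0)                                    -> ADDRESS_REG_IMM, "[x0]"
     (plus:DI (reg:DI x0) (const_int 16))           -> ADDRESS_REG_IMM, "[x0, 16]"
     (plus:DI (reg:DI x0)
              (mult:DI (reg:DI x1) (const_int 8)))  -> ADDRESS_REG_REG for a
                                                       DImode access, "[x0, x1, lsl 3]"
     (post_inc:DI (reg:DI x0))                      -> ADDRESS_REG_WB
     (lo_sum:DI (reg:DI x0) (symbol_ref "var"))     -> ADDRESS_LO_SUM,
                                                       "[x0, #:lo12:var]"

   The printed forms are those produced by aarch64_print_address_internal
   further down in this file.  */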
5733 /* Return true if the address X is valid for a PRFM instruction.
5734 STRICT_P is true if we should do strict checking with
5735 aarch64_classify_address. */
5737 bool
5738 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5740 struct aarch64_address_info addr;
5742 /* PRFM accepts the same addresses as DImode... */
5743 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5744 if (!res)
5745 return false;
5747 /* ... except writeback forms. */
5748 return addr.type != ADDRESS_REG_WB;
5751 bool
5752 aarch64_symbolic_address_p (rtx x)
5754 rtx offset;
5756 split_const (x, &x, &offset);
5757 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5760 /* Classify the base of symbolic expression X. */
5762 enum aarch64_symbol_type
5763 aarch64_classify_symbolic_expression (rtx x)
5765 rtx offset;
5767 split_const (x, &x, &offset);
5768 return aarch64_classify_symbol (x, INTVAL (offset));
5772 /* Return TRUE if X is a legitimate address for accessing memory in
5773 mode MODE. */
5774 static bool
5775 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5777 struct aarch64_address_info addr;
5779 return aarch64_classify_address (&addr, x, mode, strict_p);
5782 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5783 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5784 bool
5785 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5786 aarch64_addr_query_type type)
5788 struct aarch64_address_info addr;
5790 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5793 /* Split an out-of-range address displacement into a base and offset.
5794 Use a 4KB range for 1- and 2-byte accesses and a 16KB range otherwise,
5795 to increase opportunities for sharing the base address between accesses
5796 of different sizes. Unaligned accesses use the signed 9-bit range;
5797 TImode/TFmode use the intersection of the signed scaled 7-bit and signed 9-bit offset ranges. */
5798 static bool
5799 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
5801 HOST_WIDE_INT size;
5802 if (GET_MODE_SIZE (mode).is_constant (&size))
5804 HOST_WIDE_INT offset = INTVAL (*disp);
5805 HOST_WIDE_INT base;
5807 if (mode == TImode || mode == TFmode)
5808 base = (offset + 0x100) & ~0x1f8;
5809 else if ((offset & (size - 1)) != 0)
5810 base = (offset + 0x100) & ~0x1ff;
5811 else
5812 base = offset & ~(size < 4 ? 0xfff : 0x3ffc);
5814 *off = GEN_INT (base);
5815 *disp = GEN_INT (offset - base);
5816 return true;
5818 return false;
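/* A worked example of the split above (a sketch; the concrete offset is
   arbitrary): for a 4-byte aligned access at offset 0x4010, the offset is a
   multiple of the access size, so the mask is 0x3ffc and we get

     *OFF  = 0x4010 & ~0x3ffc = 0x4000
     *DISP = 0x4010 - 0x4000  = 0x10

   The residual 0x10 fits the scaled unsigned 12-bit range, so the access can
   use "[Xn, 16]" once the anchor 0x4000 has been added to the base.  */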
5821 /* Return the binary representation of floating point constant VALUE in INTVAL.
5822 If the value cannot be converted, return false without setting INTVAL.
5823 The conversion is done in the mode of VALUE. */
5824 bool
5825 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
5828 /* We make a general exception for 0. */
5829 if (aarch64_float_const_zero_rtx_p (value))
5831 *intval = 0;
5832 return true;
5835 scalar_float_mode mode;
5836 if (GET_CODE (value) != CONST_DOUBLE
5837 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
5838 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
5839 /* Only support up to DF mode. */
5840 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
5841 return false;
5843 unsigned HOST_WIDE_INT ival = 0;
5845 long res[2];
5846 real_to_target (res,
5847 CONST_DOUBLE_REAL_VALUE (value),
5848 REAL_MODE_FORMAT (mode));
5850 if (mode == DFmode)
5852 int order = BYTES_BIG_ENDIAN ? 1 : 0;
5853 ival = zext_hwi (res[order], 32);
5854 ival |= (zext_hwi (res[1 - order], 32) << 32);
5856 else
5857 ival = zext_hwi (res[0], 32);
5859 *intval = ival;
5860 return true;
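/* For example (a sketch of the expected results, based on the IEEE
   encodings): a DFmode CONST_DOUBLE holding 1.0 yields
   *INTVAL == 0x3ff0000000000000, and an SFmode CONST_DOUBLE holding 1.0
   yields *INTVAL == 0x3f800000.  The zero special case above returns 0
   regardless of mode.  */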
5863 /* Return TRUE if rtx X is an immediate constant that can be moved using a
5864 single MOV(+MOVK) followed by an FMOV. */
5865 bool
5866 aarch64_float_const_rtx_p (rtx x)
5868 machine_mode mode = GET_MODE (x);
5869 if (mode == VOIDmode)
5870 return false;
5872 /* Determine whether it's cheaper to write float constants as
5873 mov/movk pairs rather than loading them via adrp/ldr pairs. */
5874 unsigned HOST_WIDE_INT ival;
5876 if (GET_CODE (x) == CONST_DOUBLE
5877 && SCALAR_FLOAT_MODE_P (mode)
5878 && aarch64_reinterpret_float_as_int (x, &ival))
5880 scalar_int_mode imode = (mode == HFmode
5881 ? SImode
5882 : int_mode_for_mode (mode).require ());
5883 int num_instr = aarch64_internal_mov_immediate
5884 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
5885 return num_instr < 3;
5888 return false;
5891 /* Return TRUE if rtx X is the immediate constant 0.0. */
5892 bool
5893 aarch64_float_const_zero_rtx_p (rtx x)
5895 if (GET_MODE (x) == VOIDmode)
5896 return false;
5898 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
5899 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
5900 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
5903 /* Return TRUE if rtx X is an immediate constant that fits in a single
5904 MOVI immediate operation. */
5905 bool
5906 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
5908 if (!TARGET_SIMD)
5909 return false;
5911 machine_mode vmode;
5912 scalar_int_mode imode;
5913 unsigned HOST_WIDE_INT ival;
5915 if (GET_CODE (x) == CONST_DOUBLE
5916 && SCALAR_FLOAT_MODE_P (mode))
5918 if (!aarch64_reinterpret_float_as_int (x, &ival))
5919 return false;
5921 /* We make a general exception for 0. */
5922 if (aarch64_float_const_zero_rtx_p (x))
5923 return true;
5925 imode = int_mode_for_mode (mode).require ();
5927 else if (GET_CODE (x) == CONST_INT
5928 && is_a <scalar_int_mode> (mode, &imode))
5929 ival = INTVAL (x);
5930 else
5931 return false;
5933 /* Use a 64-bit vector mode for everything except DImode/DFmode, where we use
5934 a 128-bit vector mode. */
5935 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
5937 vmode = aarch64_simd_container_mode (imode, width);
5938 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
5940 return aarch64_simd_valid_immediate (v_op, NULL);
5944 /* Return the fixed registers used for condition codes. */
5946 static bool
5947 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
5949 *p1 = CC_REGNUM;
5950 *p2 = INVALID_REGNUM;
5951 return true;
5954 /* This function is used by the call expanders of the machine description.
5955 RESULT is the register in which the result is returned. It's NULL for
5956 "call" and "sibcall".
5957 MEM is the location of the function call.
5958 SIBCALL indicates whether this function call is normal call or sibling call.
5959 It will generate different pattern accordingly. */
5961 void
5962 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
5964 rtx call, callee, tmp;
5965 rtvec vec;
5966 machine_mode mode;
5968 gcc_assert (MEM_P (mem));
5969 callee = XEXP (mem, 0);
5970 mode = GET_MODE (callee);
5971 gcc_assert (mode == Pmode);
5973 /* Decide if we should generate indirect calls by loading the
5974 address of the callee into a register before performing
5975 the branch-and-link. */
5976 if (SYMBOL_REF_P (callee)
5977 ? (aarch64_is_long_call_p (callee)
5978 || aarch64_is_noplt_call_p (callee))
5979 : !REG_P (callee))
5980 XEXP (mem, 0) = force_reg (mode, callee);
5982 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
5984 if (result != NULL_RTX)
5985 call = gen_rtx_SET (result, call);
5987 if (sibcall)
5988 tmp = ret_rtx;
5989 else
5990 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
5992 vec = gen_rtvec (2, call, tmp);
5993 call = gen_rtx_PARALLEL (VOIDmode, vec);
5995 aarch64_emit_call_insn (call);
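/* The emitted pattern is therefore a two-element PARALLEL; roughly
   (a sketch, with incidental details elided):

     normal call with a result:
       (parallel [(set (reg RESULT) (call (mem FNADDR) (const_int 0)))
                  (clobber (reg:DI LR_REGNUM))])

     sibling call without a result:
       (parallel [(call (mem FNADDR) (const_int 0))
                  (return)])  */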
5998 /* Emit call insn with PAT and do aarch64-specific handling. */
6000 void
6001 aarch64_emit_call_insn (rtx pat)
6003 rtx insn = emit_call_insn (pat);
6005 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6006 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6007 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6010 machine_mode
6011 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6013 /* All floating point compares return CCFP if it is an equality
6014 comparison, and CCFPE otherwise. */
6015 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6017 switch (code)
6019 case EQ:
6020 case NE:
6021 case UNORDERED:
6022 case ORDERED:
6023 case UNLT:
6024 case UNLE:
6025 case UNGT:
6026 case UNGE:
6027 case UNEQ:
6028 return CCFPmode;
6030 case LT:
6031 case LE:
6032 case GT:
6033 case GE:
6034 case LTGT:
6035 return CCFPEmode;
6037 default:
6038 gcc_unreachable ();
6042 /* Equality comparisons of short modes against zero can be performed
6043 using the TST instruction with the appropriate bitmask. */
6044 if (y == const0_rtx && REG_P (x)
6045 && (code == EQ || code == NE)
6046 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6047 return CC_NZmode;
6049 /* Similarly, comparisons of zero_extends from shorter modes can
6050 be performed using an ANDS with an immediate mask. */
6051 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6052 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6053 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6054 && (code == EQ || code == NE))
6055 return CC_NZmode;
6057 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6058 && y == const0_rtx
6059 && (code == EQ || code == NE || code == LT || code == GE)
6060 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6061 || GET_CODE (x) == NEG
6062 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6063 && CONST_INT_P (XEXP (x, 2)))))
6064 return CC_NZmode;
6066 /* A compare with a shifted operand. Because of canonicalization,
6067 the comparison will have to be swapped when we emit the assembly
6068 code. */
6069 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6070 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6071 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6072 || GET_CODE (x) == LSHIFTRT
6073 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6074 return CC_SWPmode;
6076 /* Similarly for a negated operand, but we can only do this for
6077 equalities. */
6078 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6079 && (REG_P (y) || GET_CODE (y) == SUBREG)
6080 && (code == EQ || code == NE)
6081 && GET_CODE (x) == NEG)
6082 return CC_Zmode;
6084 /* A test for unsigned overflow. */
6085 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6086 && code == NE
6087 && GET_CODE (x) == PLUS
6088 && GET_CODE (y) == ZERO_EXTEND)
6089 return CC_Cmode;
6091 /* For everything else, return CCmode. */
6092 return CCmode;
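/* As an example of the CC_SWPmode case above (a sketch; register numbers are
   arbitrary): the canonical form of "compare x2 with x1 << 3" is

     (compare (ashift (reg x1) (const_int 3)) (reg x2))

   which must be output as "cmp x2, x1, lsl 3", i.e. with the operands
   swapped.  aarch64_get_condition_code_1 below compensates by swapping the
   condition as well, e.g. mapping GT to "lt" under CC_SWPmode.  */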
6095 static int
6096 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6099 aarch64_get_condition_code (rtx x)
6101 machine_mode mode = GET_MODE (XEXP (x, 0));
6102 enum rtx_code comp_code = GET_CODE (x);
6104 if (GET_MODE_CLASS (mode) != MODE_CC)
6105 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6106 return aarch64_get_condition_code_1 (mode, comp_code);
6109 static int
6110 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6112 switch (mode)
6114 case E_CCFPmode:
6115 case E_CCFPEmode:
6116 switch (comp_code)
6118 case GE: return AARCH64_GE;
6119 case GT: return AARCH64_GT;
6120 case LE: return AARCH64_LS;
6121 case LT: return AARCH64_MI;
6122 case NE: return AARCH64_NE;
6123 case EQ: return AARCH64_EQ;
6124 case ORDERED: return AARCH64_VC;
6125 case UNORDERED: return AARCH64_VS;
6126 case UNLT: return AARCH64_LT;
6127 case UNLE: return AARCH64_LE;
6128 case UNGT: return AARCH64_HI;
6129 case UNGE: return AARCH64_PL;
6130 default: return -1;
6132 break;
6134 case E_CCmode:
6135 switch (comp_code)
6137 case NE: return AARCH64_NE;
6138 case EQ: return AARCH64_EQ;
6139 case GE: return AARCH64_GE;
6140 case GT: return AARCH64_GT;
6141 case LE: return AARCH64_LE;
6142 case LT: return AARCH64_LT;
6143 case GEU: return AARCH64_CS;
6144 case GTU: return AARCH64_HI;
6145 case LEU: return AARCH64_LS;
6146 case LTU: return AARCH64_CC;
6147 default: return -1;
6149 break;
6151 case E_CC_SWPmode:
6152 switch (comp_code)
6154 case NE: return AARCH64_NE;
6155 case EQ: return AARCH64_EQ;
6156 case GE: return AARCH64_LE;
6157 case GT: return AARCH64_LT;
6158 case LE: return AARCH64_GE;
6159 case LT: return AARCH64_GT;
6160 case GEU: return AARCH64_LS;
6161 case GTU: return AARCH64_CC;
6162 case LEU: return AARCH64_CS;
6163 case LTU: return AARCH64_HI;
6164 default: return -1;
6166 break;
6168 case E_CC_NZmode:
6169 switch (comp_code)
6171 case NE: return AARCH64_NE;
6172 case EQ: return AARCH64_EQ;
6173 case GE: return AARCH64_PL;
6174 case LT: return AARCH64_MI;
6175 default: return -1;
6177 break;
6179 case E_CC_Zmode:
6180 switch (comp_code)
6182 case NE: return AARCH64_NE;
6183 case EQ: return AARCH64_EQ;
6184 default: return -1;
6186 break;
6188 case E_CC_Cmode:
6189 switch (comp_code)
6191 case NE: return AARCH64_CS;
6192 case EQ: return AARCH64_CC;
6193 default: return -1;
6195 break;
6197 default:
6198 return -1;
6201 return -1;
6204 bool
6205 aarch64_const_vec_all_same_in_range_p (rtx x,
6206 HOST_WIDE_INT minval,
6207 HOST_WIDE_INT maxval)
6209 rtx elt;
6210 return (const_vec_duplicate_p (x, &elt)
6211 && CONST_INT_P (elt)
6212 && IN_RANGE (INTVAL (elt), minval, maxval));
6215 bool
6216 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6218 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6221 /* Return true if VEC is a constant in which every element is in the range
6222 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6224 static bool
6225 aarch64_const_vec_all_in_range_p (rtx vec,
6226 HOST_WIDE_INT minval,
6227 HOST_WIDE_INT maxval)
6229 if (GET_CODE (vec) != CONST_VECTOR
6230 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6231 return false;
6233 int nunits;
6234 if (!CONST_VECTOR_STEPPED_P (vec))
6235 nunits = const_vector_encoded_nelts (vec);
6236 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6237 return false;
6239 for (int i = 0; i < nunits; i++)
6241 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6242 if (!CONST_INT_P (vec_elem)
6243 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6244 return false;
6246 return true;
6249 /* N Z C V. */
6250 #define AARCH64_CC_V 1
6251 #define AARCH64_CC_C (1 << 1)
6252 #define AARCH64_CC_Z (1 << 2)
6253 #define AARCH64_CC_N (1 << 3)
6255 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6256 static const int aarch64_nzcv_codes[] =
6258 0, /* EQ, Z == 1. */
6259 AARCH64_CC_Z, /* NE, Z == 0. */
6260 0, /* CS, C == 1. */
6261 AARCH64_CC_C, /* CC, C == 0. */
6262 0, /* MI, N == 1. */
6263 AARCH64_CC_N, /* PL, N == 0. */
6264 0, /* VS, V == 1. */
6265 AARCH64_CC_V, /* VC, V == 0. */
6266 0, /* HI, C == 1 && Z == 0. */
6267 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6268 AARCH64_CC_V, /* GE, N == V. */
6269 0, /* LT, N != V. */
6270 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6271 0, /* LE, !(Z == 0 && N == V). */
6272 0, /* AL, Any. */
6273 0 /* NV, Any. */
6276 /* Print floating-point vector immediate operand X to F, negating it
6277 first if NEGATE is true. Return true on success, false if it isn't
6278 a constant we can handle. */
6280 static bool
6281 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6283 rtx elt;
6285 if (!const_vec_duplicate_p (x, &elt))
6286 return false;
6288 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6289 if (negate)
6290 r = real_value_negate (&r);
6292 /* We only handle the SVE single-bit immediates here. */
6293 if (real_equal (&r, &dconst0))
6294 asm_fprintf (f, "0.0");
6295 else if (real_equal (&r, &dconst1))
6296 asm_fprintf (f, "1.0");
6297 else if (real_equal (&r, &dconsthalf))
6298 asm_fprintf (f, "0.5");
6299 else
6300 return false;
6302 return true;
6305 /* Print operand X to file F in a target specific manner according to CODE.
6306 The acceptable formatting commands given by CODE are:
6307 'c': An integer or symbol address without a preceding #
6308 sign.
6309 'C': Take the duplicated element in a vector constant
6310 and print it in hex.
6311 'D': Take the duplicated element in a vector constant
6312 and print it as an unsigned integer, in decimal.
6313 'e': Print the sign/zero-extend size as a character 8->b,
6314 16->h, 32->w.
6315 'p': Prints N such that 2^N == X (X must be a power of 2 and
6316 a const_int).
6317 'P': Print the number of non-zero bits in X (a const_int).
6318 'H': Print the higher numbered register of a pair (TImode)
6319 of regs.
6320 'm': Print a condition (eq, ne, etc).
6321 'M': Same as 'm', but invert condition.
6322 'N': Take the duplicated element in a vector constant
6323 and print the negative of it in decimal.
6324 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6325 'S/T/U/V': Print a FP/SIMD register name for a register list.
6326 The register printed is the FP/SIMD register name
6327 of X + 0/1/2/3 for S/T/U/V.
6328 'R': Print a scalar FP/SIMD register name + 1.
6329 'X': Print bottom 16 bits of integer constant in hex.
6330 'w/x': Print a general register name or the zero register
6331 (32-bit or 64-bit).
6332 '0': Print a normal operand; if it's a general register,
6333 we assume DImode.
6334 'k': Print NZCV for conditional compare instructions.
6335 'A': Output address constant representing the first
6336 argument of X, specifying a relocation offset
6337 if appropriate.
6338 'L': Output constant address specified by X
6339 with a relocation offset if appropriate.
6340 'G': Prints address of X, specifying a PC relative
6341 relocation mode if appropriate.
6342 'y': Output address of LDP or STP - this is used for
6343 some LDP/STPs which don't use a PARALLEL in their
6344 pattern (so the mode needs to be adjusted).
6345 'z': Output address of a typical LDP or STP. */
6347 static void
6348 aarch64_print_operand (FILE *f, rtx x, int code)
6350 rtx elt;
6351 switch (code)
6353 case 'c':
6354 switch (GET_CODE (x))
6356 case CONST_INT:
6357 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6358 break;
6360 case SYMBOL_REF:
6361 output_addr_const (f, x);
6362 break;
6364 case CONST:
6365 if (GET_CODE (XEXP (x, 0)) == PLUS
6366 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6368 output_addr_const (f, x);
6369 break;
6371 /* Fall through. */
6373 default:
6374 output_operand_lossage ("unsupported operand for code '%c'", code);
6376 break;
6378 case 'e':
6380 int n;
6382 if (!CONST_INT_P (x)
6383 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6385 output_operand_lossage ("invalid operand for '%%%c'", code);
6386 return;
6389 switch (n)
6391 case 3:
6392 fputc ('b', f);
6393 break;
6394 case 4:
6395 fputc ('h', f);
6396 break;
6397 case 5:
6398 fputc ('w', f);
6399 break;
6400 default:
6401 output_operand_lossage ("invalid operand for '%%%c'", code);
6402 return;
6405 break;
6407 case 'p':
6409 int n;
6411 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6413 output_operand_lossage ("invalid operand for '%%%c'", code);
6414 return;
6417 asm_fprintf (f, "%d", n);
6419 break;
6421 case 'P':
6422 if (!CONST_INT_P (x))
6424 output_operand_lossage ("invalid operand for '%%%c'", code);
6425 return;
6428 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6429 break;
6431 case 'H':
6432 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6434 output_operand_lossage ("invalid operand for '%%%c'", code);
6435 return;
6438 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6439 break;
6441 case 'M':
6442 case 'm':
6444 int cond_code;
6445 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6446 if (x == const_true_rtx)
6448 if (code == 'M')
6449 fputs ("nv", f);
6450 return;
6453 if (!COMPARISON_P (x))
6455 output_operand_lossage ("invalid operand for '%%%c'", code);
6456 return;
6459 cond_code = aarch64_get_condition_code (x);
6460 gcc_assert (cond_code >= 0);
6461 if (code == 'M')
6462 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6463 fputs (aarch64_condition_codes[cond_code], f);
6465 break;
6467 case 'N':
6468 if (!const_vec_duplicate_p (x, &elt))
6470 output_operand_lossage ("invalid vector constant");
6471 return;
6474 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6475 asm_fprintf (f, "%wd", -INTVAL (elt));
6476 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6477 && aarch64_print_vector_float_operand (f, x, true))
6479 else
6481 output_operand_lossage ("invalid vector constant");
6482 return;
6484 break;
6486 case 'b':
6487 case 'h':
6488 case 's':
6489 case 'd':
6490 case 'q':
6491 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6493 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6494 return;
6496 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6497 break;
6499 case 'S':
6500 case 'T':
6501 case 'U':
6502 case 'V':
6503 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6505 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6506 return;
6508 asm_fprintf (f, "%c%d",
6509 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6510 REGNO (x) - V0_REGNUM + (code - 'S'));
6511 break;
6513 case 'R':
6514 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6516 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6517 return;
6519 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6520 break;
6522 case 'X':
6523 if (!CONST_INT_P (x))
6525 output_operand_lossage ("invalid operand for '%%%c'", code);
6526 return;
6528 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6529 break;
6531 case 'C':
6533 /* Print a replicated constant in hex. */
6534 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6536 output_operand_lossage ("invalid operand for '%%%c'", code);
6537 return;
6539 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6540 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6542 break;
6544 case 'D':
6546 /* Print a replicated constant in decimal, treating it as
6547 unsigned. */
6548 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6550 output_operand_lossage ("invalid operand for '%%%c'", code);
6551 return;
6553 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6554 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6556 break;
6558 case 'w':
6559 case 'x':
6560 if (x == const0_rtx
6561 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6563 asm_fprintf (f, "%czr", code);
6564 break;
6567 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6569 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6570 break;
6573 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6575 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6576 break;
6579 /* Fall through */
6581 case 0:
6582 if (x == NULL)
6584 output_operand_lossage ("missing operand");
6585 return;
6588 switch (GET_CODE (x))
6590 case REG:
6591 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6592 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6593 else
6594 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6595 break;
6597 case MEM:
6598 output_address (GET_MODE (x), XEXP (x, 0));
6599 break;
6601 case LABEL_REF:
6602 case SYMBOL_REF:
6603 output_addr_const (asm_out_file, x);
6604 break;
6606 case CONST_INT:
6607 asm_fprintf (f, "%wd", INTVAL (x));
6608 break;
6610 case CONST:
6611 if (!VECTOR_MODE_P (GET_MODE (x)))
6613 output_addr_const (asm_out_file, x);
6614 break;
6616 /* fall through */
6618 case CONST_VECTOR:
6619 if (!const_vec_duplicate_p (x, &elt))
6621 output_operand_lossage ("invalid vector constant");
6622 return;
6625 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6626 asm_fprintf (f, "%wd", INTVAL (elt));
6627 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6628 && aarch64_print_vector_float_operand (f, x, false))
6630 else
6632 output_operand_lossage ("invalid vector constant");
6633 return;
6635 break;
6637 case CONST_DOUBLE:
6638 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6639 be getting CONST_DOUBLEs holding integers. */
6640 gcc_assert (GET_MODE (x) != VOIDmode);
6641 if (aarch64_float_const_zero_rtx_p (x))
6643 fputc ('0', f);
6644 break;
6646 else if (aarch64_float_const_representable_p (x))
6648 #define buf_size 20
6649 char float_buf[buf_size] = {'\0'};
6650 real_to_decimal_for_mode (float_buf,
6651 CONST_DOUBLE_REAL_VALUE (x),
6652 buf_size, buf_size,
6653 1, GET_MODE (x));
6654 asm_fprintf (asm_out_file, "%s", float_buf);
6655 break;
6656 #undef buf_size
6658 output_operand_lossage ("invalid constant");
6659 return;
6660 default:
6661 output_operand_lossage ("invalid operand");
6662 return;
6664 break;
6666 case 'A':
6667 if (GET_CODE (x) == HIGH)
6668 x = XEXP (x, 0);
6670 switch (aarch64_classify_symbolic_expression (x))
6672 case SYMBOL_SMALL_GOT_4G:
6673 asm_fprintf (asm_out_file, ":got:");
6674 break;
6676 case SYMBOL_SMALL_TLSGD:
6677 asm_fprintf (asm_out_file, ":tlsgd:");
6678 break;
6680 case SYMBOL_SMALL_TLSDESC:
6681 asm_fprintf (asm_out_file, ":tlsdesc:");
6682 break;
6684 case SYMBOL_SMALL_TLSIE:
6685 asm_fprintf (asm_out_file, ":gottprel:");
6686 break;
6688 case SYMBOL_TLSLE24:
6689 asm_fprintf (asm_out_file, ":tprel:");
6690 break;
6692 case SYMBOL_TINY_GOT:
6693 gcc_unreachable ();
6694 break;
6696 default:
6697 break;
6699 output_addr_const (asm_out_file, x);
6700 break;
6702 case 'L':
6703 switch (aarch64_classify_symbolic_expression (x))
6705 case SYMBOL_SMALL_GOT_4G:
6706 asm_fprintf (asm_out_file, ":lo12:");
6707 break;
6709 case SYMBOL_SMALL_TLSGD:
6710 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6711 break;
6713 case SYMBOL_SMALL_TLSDESC:
6714 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6715 break;
6717 case SYMBOL_SMALL_TLSIE:
6718 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6719 break;
6721 case SYMBOL_TLSLE12:
6722 asm_fprintf (asm_out_file, ":tprel_lo12:");
6723 break;
6725 case SYMBOL_TLSLE24:
6726 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6727 break;
6729 case SYMBOL_TINY_GOT:
6730 asm_fprintf (asm_out_file, ":got:");
6731 break;
6733 case SYMBOL_TINY_TLSIE:
6734 asm_fprintf (asm_out_file, ":gottprel:");
6735 break;
6737 default:
6738 break;
6740 output_addr_const (asm_out_file, x);
6741 break;
6743 case 'G':
6744 switch (aarch64_classify_symbolic_expression (x))
6746 case SYMBOL_TLSLE24:
6747 asm_fprintf (asm_out_file, ":tprel_hi12:");
6748 break;
6749 default:
6750 break;
6752 output_addr_const (asm_out_file, x);
6753 break;
6755 case 'k':
6757 HOST_WIDE_INT cond_code;
6759 if (!CONST_INT_P (x))
6761 output_operand_lossage ("invalid operand for '%%%c'", code);
6762 return;
6765 cond_code = INTVAL (x);
6766 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
6767 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
6769 break;
6771 case 'y':
6772 case 'z':
6774 machine_mode mode = GET_MODE (x);
6776 if (GET_CODE (x) != MEM
6777 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
6779 output_operand_lossage ("invalid operand for '%%%c'", code);
6780 return;
6783 if (code == 'y')
6784 /* LDP/STP which uses a single double-width memory operand.
6785 Adjust the mode to appear like a typical LDP/STP.
6786 Currently this is supported for 16-byte accesses only. */
6787 mode = DFmode;
6789 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
6790 output_operand_lossage ("invalid operand prefix '%%%c'", code);
6792 break;
6794 default:
6795 output_operand_lossage ("invalid operand prefix '%%%c'", code);
6796 return;
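/* A few illustrative expansions (a sketch; the operand values are arbitrary):

     %w0 / %x0 on a general register print e.g. "w3" / "x3", and print
       "wzr" / "xzr" for a zero constant;
     %X0 on (const_int 0x12345) prints "0x2345";
     %C0 on a vector constant that duplicates 0x1f prints "0x1f";
     %m0 on (ge (reg cc) (const_int 0)) prints "ge", while %M0 prints the
       inverse condition "lt".  */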
6800 /* Print address 'x' of a memory access with mode 'mode'.
6801 'type' is the context required by aarch64_classify_address: ADDR_QUERY_M
6802 for a normal memory access, ADDR_QUERY_LDP_STP for LDP/STP, or ADDR_QUERY_ANY. */
6803 static bool
6804 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
6805 aarch64_addr_query_type type)
6807 struct aarch64_address_info addr;
6808 unsigned int size;
6810 /* Check all addresses are Pmode - including ILP32. */
6811 gcc_assert (GET_MODE (x) == Pmode);
6813 if (aarch64_classify_address (&addr, x, mode, true, type))
6814 switch (addr.type)
6816 case ADDRESS_REG_IMM:
6817 if (known_eq (addr.const_offset, 0))
6818 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
6819 else if (aarch64_sve_data_mode_p (mode))
6821 HOST_WIDE_INT vnum
6822 = exact_div (addr.const_offset,
6823 BYTES_PER_SVE_VECTOR).to_constant ();
6824 asm_fprintf (f, "[%s, #%wd, mul vl]",
6825 reg_names[REGNO (addr.base)], vnum);
6827 else if (aarch64_sve_pred_mode_p (mode))
6829 HOST_WIDE_INT vnum
6830 = exact_div (addr.const_offset,
6831 BYTES_PER_SVE_PRED).to_constant ();
6832 asm_fprintf (f, "[%s, #%wd, mul vl]",
6833 reg_names[REGNO (addr.base)], vnum);
6835 else
6836 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
6837 INTVAL (addr.offset));
6838 return true;
6840 case ADDRESS_REG_REG:
6841 if (addr.shift == 0)
6842 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
6843 reg_names [REGNO (addr.offset)]);
6844 else
6845 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
6846 reg_names [REGNO (addr.offset)], addr.shift);
6847 return true;
6849 case ADDRESS_REG_UXTW:
6850 if (addr.shift == 0)
6851 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
6852 REGNO (addr.offset) - R0_REGNUM);
6853 else
6854 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
6855 REGNO (addr.offset) - R0_REGNUM, addr.shift);
6856 return true;
6858 case ADDRESS_REG_SXTW:
6859 if (addr.shift == 0)
6860 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
6861 REGNO (addr.offset) - R0_REGNUM);
6862 else
6863 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
6864 REGNO (addr.offset) - R0_REGNUM, addr.shift);
6865 return true;
6867 case ADDRESS_REG_WB:
6868 /* Writeback is only supported for fixed-width modes. */
6869 size = GET_MODE_SIZE (mode).to_constant ();
6870 switch (GET_CODE (x))
6872 case PRE_INC:
6873 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
6874 return true;
6875 case POST_INC:
6876 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
6877 return true;
6878 case PRE_DEC:
6879 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
6880 return true;
6881 case POST_DEC:
6882 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
6883 return true;
6884 case PRE_MODIFY:
6885 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
6886 INTVAL (addr.offset));
6887 return true;
6888 case POST_MODIFY:
6889 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
6890 INTVAL (addr.offset));
6891 return true;
6892 default:
6893 break;
6895 break;
6897 case ADDRESS_LO_SUM:
6898 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
6899 output_addr_const (f, addr.offset);
6900 asm_fprintf (f, "]");
6901 return true;
6903 case ADDRESS_SYMBOLIC:
6904 output_addr_const (f, x);
6905 return true;
6908 return false;
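/* For SVE data and predicate modes, the constant offset is expressed as a
   multiple of the vector length.  For example (a sketch; the register is
   arbitrary), an SVE data-mode access at const_offset == 2 * BYTES_PER_SVE_VECTOR
   from x0 is printed as "[x0, #2, mul vl]", matching the ADDRESS_REG_IMM case
   above.  */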
6911 /* Print address 'x' of an LDP/STP with mode 'mode'. */
6912 static bool
6913 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
6915 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
6918 /* Print address 'x' of a memory access with mode 'mode'. */
6919 static void
6920 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
6922 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
6923 output_addr_const (f, x);
6926 bool
6927 aarch64_label_mentioned_p (rtx x)
6929 const char *fmt;
6930 int i;
6932 if (GET_CODE (x) == LABEL_REF)
6933 return true;
6935 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
6936 referencing instruction, but they are constant offsets, not
6937 symbols. */
6938 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
6939 return false;
6941 fmt = GET_RTX_FORMAT (GET_CODE (x));
6942 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
6944 if (fmt[i] == 'E')
6946 int j;
6948 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
6949 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
6950 return 1;
6952 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
6953 return 1;
6956 return 0;
6959 /* Implement REGNO_REG_CLASS. */
6961 enum reg_class
6962 aarch64_regno_regclass (unsigned regno)
6964 if (GP_REGNUM_P (regno))
6965 return GENERAL_REGS;
6967 if (regno == SP_REGNUM)
6968 return STACK_REG;
6970 if (regno == FRAME_POINTER_REGNUM
6971 || regno == ARG_POINTER_REGNUM)
6972 return POINTER_REGS;
6974 if (FP_REGNUM_P (regno))
6975 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
6977 if (PR_REGNUM_P (regno))
6978 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
6980 return NO_REGS;
6983 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
6984 If OFFSET is out of range, return an offset of an anchor point
6985 that is in range. Return 0 otherwise. */
6987 static HOST_WIDE_INT
6988 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
6989 machine_mode mode)
6991 /* Does it look like we'll need a 16-byte load/store-pair operation? */
6992 if (size > 16)
6993 return (offset + 0x400) & ~0x7f0;
6995 /* For offsets that aren't a multiple of the access size, the limit is
6996 -256...255. */
6997 if (offset & (size - 1))
6999 /* BLKmode typically uses LDP of X-registers. */
7000 if (mode == BLKmode)
7001 return (offset + 512) & ~0x3ff;
7002 return (offset + 0x100) & ~0x1ff;
7005 /* Small negative offsets are supported. */
7006 if (IN_RANGE (offset, -256, 0))
7007 return 0;
7009 if (mode == TImode || mode == TFmode)
7010 return (offset + 0x100) & ~0x1ff;
7012 /* Use a 12-bit unsigned offset scaled by the access size. */
7013 return offset & (~0xfff * size);
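/* Two worked examples (a sketch with arbitrary offsets):

     size == 4, offset == 0x1fff (not a multiple of 4):
       anchor = (0x1fff + 0x100) & ~0x1ff = 0x2000, residual = -1,
       which is within the signed 9-bit unscaled range.

     size == 8, offset == 0x9010 (a multiple of 8):
       anchor = 0x9010 & (~0xfff * 8) = 0x9010 & -0x8000 = 0x8000,
       residual = 0x1010, which fits the unsigned 12-bit offset scaled by 8.

   The caller (aarch64_legitimize_address below) adds the anchor to the base
   and keeps the residual as the new displacement.  */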
7016 static rtx
7017 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7019 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7020 where mask is selected by alignment and size of the offset.
7021 We try to pick as large a range for the offset as possible to
7022 maximize the chance of a CSE. However, for aligned addresses
7023 we limit the range to 4k so that structures with different sized
7024 elements are likely to use the same base. We need to be careful
7025 not to split a CONST for some forms of address expression, otherwise
7026 it will generate sub-optimal code. */
7028 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7030 rtx base = XEXP (x, 0);
7031 rtx offset_rtx = XEXP (x, 1);
7032 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7034 if (GET_CODE (base) == PLUS)
7036 rtx op0 = XEXP (base, 0);
7037 rtx op1 = XEXP (base, 1);
7039 /* Force any scaling into a temp for CSE. */
7040 op0 = force_reg (Pmode, op0);
7041 op1 = force_reg (Pmode, op1);
7043 /* Let the pointer register be in op0. */
7044 if (REG_POINTER (op1))
7045 std::swap (op0, op1);
7047 /* If the pointer is virtual or frame related, then we know that
7048 virtual register instantiation or register elimination is going
7049 to apply a second constant. We want the two constants folded
7050 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7051 if (virt_or_elim_regno_p (REGNO (op0)))
7053 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7054 NULL_RTX, true, OPTAB_DIRECT);
7055 return gen_rtx_PLUS (Pmode, base, op1);
7058 /* Otherwise, in order to encourage CSE (and thence loop strength
7059 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7060 base = expand_binop (Pmode, add_optab, op0, op1,
7061 NULL_RTX, true, OPTAB_DIRECT);
7062 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7065 HOST_WIDE_INT size;
7066 if (GET_MODE_SIZE (mode).is_constant (&size))
7068 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7069 mode);
7070 if (base_offset != 0)
7072 base = plus_constant (Pmode, base, base_offset);
7073 base = force_operand (base, NULL_RTX);
7074 return plus_constant (Pmode, base, offset - base_offset);
7079 return x;
7082 /* Return the reload icode required for a constant pool in mode. */
7083 static enum insn_code
7084 aarch64_constant_pool_reload_icode (machine_mode mode)
7086 switch (mode)
7088 case E_SFmode:
7089 return CODE_FOR_aarch64_reload_movcpsfdi;
7091 case E_DFmode:
7092 return CODE_FOR_aarch64_reload_movcpdfdi;
7094 case E_TFmode:
7095 return CODE_FOR_aarch64_reload_movcptfdi;
7097 case E_V8QImode:
7098 return CODE_FOR_aarch64_reload_movcpv8qidi;
7100 case E_V16QImode:
7101 return CODE_FOR_aarch64_reload_movcpv16qidi;
7103 case E_V4HImode:
7104 return CODE_FOR_aarch64_reload_movcpv4hidi;
7106 case E_V8HImode:
7107 return CODE_FOR_aarch64_reload_movcpv8hidi;
7109 case E_V2SImode:
7110 return CODE_FOR_aarch64_reload_movcpv2sidi;
7112 case E_V4SImode:
7113 return CODE_FOR_aarch64_reload_movcpv4sidi;
7115 case E_V2DImode:
7116 return CODE_FOR_aarch64_reload_movcpv2didi;
7118 case E_V2DFmode:
7119 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7121 default:
7122 gcc_unreachable ();
7125 gcc_unreachable ();
7127 static reg_class_t
7128 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7129 reg_class_t rclass,
7130 machine_mode mode,
7131 secondary_reload_info *sri)
7133 if (BYTES_BIG_ENDIAN
7134 && reg_class_subset_p (rclass, FP_REGS)
7135 && (MEM_P (x) || (REG_P (x) && !HARD_REGISTER_P (x)))
7136 && aarch64_sve_data_mode_p (mode))
7138 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7139 return NO_REGS;
7142 /* If we have to disable direct literal pool loads and stores because the
7143 function is too big, then we need a scratch register. */
7144 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7145 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7146 || targetm.vector_mode_supported_p (GET_MODE (x)))
7147 && !aarch64_pcrelative_literal_loads)
7149 sri->icode = aarch64_constant_pool_reload_icode (mode);
7150 return NO_REGS;
7153 /* Without the TARGET_SIMD instructions we cannot move a Q register
7154 to a Q register directly. We need a scratch. */
7155 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7156 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7157 && reg_class_subset_p (rclass, FP_REGS))
7159 if (mode == TFmode)
7160 sri->icode = CODE_FOR_aarch64_reload_movtf;
7161 else if (mode == TImode)
7162 sri->icode = CODE_FOR_aarch64_reload_movti;
7163 return NO_REGS;
7166 /* A TFmode or TImode memory access should be handled via an FP register,
7167 because AArch64 has richer addressing modes for LDR/STR instructions
7168 than for LDP/STP instructions. */
7169 if (TARGET_FLOAT && rclass == GENERAL_REGS
7170 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7171 return FP_REGS;
7173 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P (x))
7174 return GENERAL_REGS;
7176 return NO_REGS;
7179 static bool
7180 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7182 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7184 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7185 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7186 if (frame_pointer_needed)
7187 return to == HARD_FRAME_POINTER_REGNUM;
7188 return true;
7191 poly_int64
7192 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7194 aarch64_layout_frame ();
7196 if (to == HARD_FRAME_POINTER_REGNUM)
7198 if (from == ARG_POINTER_REGNUM)
7199 return cfun->machine->frame.hard_fp_offset;
7201 if (from == FRAME_POINTER_REGNUM)
7202 return cfun->machine->frame.hard_fp_offset
7203 - cfun->machine->frame.locals_offset;
7206 if (to == STACK_POINTER_REGNUM)
7208 if (from == FRAME_POINTER_REGNUM)
7209 return cfun->machine->frame.frame_size
7210 - cfun->machine->frame.locals_offset;
7213 return cfun->machine->frame.frame_size;
7216 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7217 previous frame. */
7220 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7222 if (count != 0)
7223 return const0_rtx;
7224 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7228 static void
7229 aarch64_asm_trampoline_template (FILE *f)
7231 if (TARGET_ILP32)
7233 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7234 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7236 else
7238 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7239 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7241 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7242 assemble_aligned_integer (4, const0_rtx);
7243 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7244 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
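/* For LP64, and assuming the usual register assignment (IP1 is x17 and the
   static chain is x18), the template above expands to roughly:

       ldr     x17, .+16
       ldr     x18, .+20
       br      x17
       .word   0
       .xword  0          // replaced with the function address
       .xword  0          // replaced with the static chain value

   aarch64_trampoline_init below fills in the two trailing pointer slots.
   This is an illustrative sketch, not verbatim assembler output.  */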
7247 static void
7248 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7250 rtx fnaddr, mem, a_tramp;
7251 const int tramp_code_sz = 16;
7253 /* We don't need to copy the trailing D-words; we fill those in below. */
7254 emit_block_move (m_tramp, assemble_trampoline_template (),
7255 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7256 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7257 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7258 if (GET_MODE (fnaddr) != ptr_mode)
7259 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7260 emit_move_insn (mem, fnaddr);
7262 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7263 emit_move_insn (mem, chain_value);
7265 /* XXX We should really define a "clear_cache" pattern and use
7266 gen_clear_cache(). */
7267 a_tramp = XEXP (m_tramp, 0);
7268 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7269 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7270 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7271 ptr_mode);
7274 static unsigned char
7275 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7277 /* ??? Logically we should only need to provide a value when
7278 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7279 can hold MODE, but at the moment we need to handle all modes.
7280 Just ignore any runtime parts for registers that can't store them. */
7281 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7282 unsigned int nregs;
7283 switch (regclass)
7285 case CALLER_SAVE_REGS:
7286 case POINTER_REGS:
7287 case GENERAL_REGS:
7288 case ALL_REGS:
7289 case POINTER_AND_FP_REGS:
7290 case FP_REGS:
7291 case FP_LO_REGS:
7292 if (aarch64_sve_data_mode_p (mode)
7293 && constant_multiple_p (GET_MODE_SIZE (mode),
7294 BYTES_PER_SVE_VECTOR, &nregs))
7295 return nregs;
7296 return (aarch64_vector_data_mode_p (mode)
7297 ? CEIL (lowest_size, UNITS_PER_VREG)
7298 : CEIL (lowest_size, UNITS_PER_WORD));
7299 case STACK_REG:
7300 case PR_REGS:
7301 case PR_LO_REGS:
7302 case PR_HI_REGS:
7303 return 1;
7305 case NO_REGS:
7306 return 0;
7308 default:
7309 break;
7311 gcc_unreachable ();
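/* Some illustrative results (a sketch, assuming 8-byte general registers and
   16-byte Advanced SIMD registers):

     TImode (16 bytes) in GENERAL_REGS     -> CEIL (16, 8)  == 2 registers
     V4SImode (16 bytes) in FP_REGS        -> CEIL (16, 16) == 1 register
     an SVE data mode of N full vectors    -> N registers
     any mode in PR_LO_REGS / PR_HI_REGS   -> 1 register  */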
7314 static reg_class_t
7315 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7317 if (regclass == POINTER_REGS)
7318 return GENERAL_REGS;
7320 if (regclass == STACK_REG)
7322 if (REG_P (x)
7323 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7324 return regclass;
7326 return NO_REGS;
7329 /* Register elimination can result in a request for
7330 SP+constant->FP_REGS. We cannot support such operations, which
7331 use SP as the source and an FP_REG as the destination, so reject
7332 them outright. */
7333 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7335 rtx lhs = XEXP (x, 0);
7337 /* Look through a possible SUBREG introduced by ILP32. */
7338 if (GET_CODE (lhs) == SUBREG)
7339 lhs = SUBREG_REG (lhs);
7341 gcc_assert (REG_P (lhs));
7342 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7343 POINTER_REGS));
7344 return NO_REGS;
7347 return regclass;
7350 void
7351 aarch64_asm_output_labelref (FILE* f, const char *name)
7353 asm_fprintf (f, "%U%s", name);
7356 static void
7357 aarch64_elf_asm_constructor (rtx symbol, int priority)
7359 if (priority == DEFAULT_INIT_PRIORITY)
7360 default_ctor_section_asm_out_constructor (symbol, priority);
7361 else
7363 section *s;
7364 /* Priority is known to be in the range [0, 65535], so 18 bytes
7365 would be enough, but the compiler might not know that. To avoid
7366 a -Wformat-truncation false positive, use a larger size. */
7367 char buf[23];
7368 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7369 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7370 switch_to_section (s);
7371 assemble_align (POINTER_SIZE);
7372 assemble_aligned_integer (POINTER_BYTES, symbol);
7376 static void
7377 aarch64_elf_asm_destructor (rtx symbol, int priority)
7379 if (priority == DEFAULT_INIT_PRIORITY)
7380 default_dtor_section_asm_out_destructor (symbol, priority);
7381 else
7383 section *s;
7384 /* Priority is known to be in the range [0, 65535], so 18 bytes
7385 would be enough, but the compiler might not know that. To avoid
7386 a -Wformat-truncation false positive, use a larger size. */
7387 char buf[23];
7388 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7389 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7390 switch_to_section (s);
7391 assemble_align (POINTER_SIZE);
7392 assemble_aligned_integer (POINTER_BYTES, symbol);
7396 const char*
7397 aarch64_output_casesi (rtx *operands)
7399 char buf[100];
7400 char label[100];
7401 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7402 int index;
7403 static const char *const patterns[4][2] =
7406 "ldrb\t%w3, [%0,%w1,uxtw]",
7407 "add\t%3, %4, %w3, sxtb #2"
7410 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7411 "add\t%3, %4, %w3, sxth #2"
7414 "ldr\t%w3, [%0,%w1,uxtw #2]",
7415 "add\t%3, %4, %w3, sxtw #2"
7417 /* We assume that DImode is only generated when not optimizing and
7418 that we don't really need 64-bit address offsets. That would
7419 imply an object file with 8GB of code in a single function! */
7421 "ldr\t%w3, [%0,%w1,uxtw #2]",
7422 "add\t%3, %4, %w3, sxtw #2"
7426 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7428 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7429 index = exact_log2 (GET_MODE_SIZE (mode));
7431 gcc_assert (index >= 0 && index <= 3);
7433 /* Table size reduction still needs to be implemented, by changing the code below. */
7434 output_asm_insn (patterns[index][0], operands);
7435 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7436 snprintf (buf, sizeof (buf),
7437 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7438 output_asm_insn (buf, operands);
7439 output_asm_insn (patterns[index][1], operands);
7440 output_asm_insn ("br\t%3", operands);
7441 assemble_label (asm_out_file, label);
7442 return "";
7446 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7447 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7448 operator. */
7451 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7453 if (shift >= 0 && shift <= 3)
7455 int size;
7456 for (size = 8; size <= 32; size *= 2)
7458 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7459 if (mask == bits << shift)
7460 return size;
7463 return 0;
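/* Example results (a sketch):

     aarch64_uxt_size (0, 0xff)     == 8   -- UXTB
     aarch64_uxt_size (2, 0x3fc)    == 8   -- UXTB with LSL #2
     aarch64_uxt_size (1, 0x1fffe)  == 16  -- UXTH with LSL #1
     aarch64_uxt_size (0, 0x1ff)    == 0   -- not a UXTB/UXTH/UXTW mask  */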
7466 /* Constant pools are per-function only when PC-relative
7467 literal loads are enabled or we are using the large memory
7468 model. */
7470 static inline bool
7471 aarch64_can_use_per_function_literal_pools_p (void)
7473 return (aarch64_pcrelative_literal_loads
7474 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7477 static bool
7478 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7480 /* FIXME: In an ideal world this would work similarly
7481 to the logic in aarch64_select_rtx_section, but that
7482 breaks bootstrap in gccgo. For now we work around
7483 this by returning false here. */
7484 return false;
7487 /* Select appropriate section for constants depending
7488 on where we place literal pools. */
7490 static section *
7491 aarch64_select_rtx_section (machine_mode mode,
7492 rtx x,
7493 unsigned HOST_WIDE_INT align)
7495 if (aarch64_can_use_per_function_literal_pools_p ())
7496 return function_section (current_function_decl);
7498 return default_elf_select_rtx_section (mode, x, align);
7501 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7502 void
7503 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7504 HOST_WIDE_INT offset)
7506 /* When using per-function literal pools, we must ensure that any code
7507 section is aligned to the minimal instruction length, lest we get
7508 errors from the assembler about "unaligned instructions". */
7509 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7510 ASM_OUTPUT_ALIGN (f, 2);
7513 /* Costs. */
7515 /* Helper function for rtx cost calculation. Strip a shift expression
7516 from X. Returns the inner operand if successful, or the original
7517 expression on failure. */
7518 static rtx
7519 aarch64_strip_shift (rtx x)
7521 rtx op = x;
7523 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7524 we can convert both to ROR during final output. */
7525 if ((GET_CODE (op) == ASHIFT
7526 || GET_CODE (op) == ASHIFTRT
7527 || GET_CODE (op) == LSHIFTRT
7528 || GET_CODE (op) == ROTATERT
7529 || GET_CODE (op) == ROTATE)
7530 && CONST_INT_P (XEXP (op, 1)))
7531 return XEXP (op, 0);
7533 if (GET_CODE (op) == MULT
7534 && CONST_INT_P (XEXP (op, 1))
7535 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7536 return XEXP (op, 0);
7538 return x;
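/* For example (a sketch): (ashift (reg x1) (const_int 3)) and
   (mult (reg x1) (const_int 8)) both strip to (reg x1), since the multiply
   is by a power of two, whereas (mult (reg x1) (const_int 12)) is returned
   unchanged.  */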
7541 /* Helper function for rtx cost calculation. Strip an extend
7542 expression from X. Returns the inner operand if successful, or the
7543 original expression on failure. We deal with a number of possible
7544 canonicalization variations here. If STRIP_SHIFT is true, then
7545 we can strip off a shift also. */
7546 static rtx
7547 aarch64_strip_extend (rtx x, bool strip_shift)
7549 scalar_int_mode mode;
7550 rtx op = x;
7552 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7553 return op;
7555 /* Zero and sign extraction of a widened value. */
7556 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7557 && XEXP (op, 2) == const0_rtx
7558 && GET_CODE (XEXP (op, 0)) == MULT
7559 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7560 XEXP (op, 1)))
7561 return XEXP (XEXP (op, 0), 0);
7563 /* It can also be represented (for zero-extend) as an AND with an
7564 immediate. */
7565 if (GET_CODE (op) == AND
7566 && GET_CODE (XEXP (op, 0)) == MULT
7567 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7568 && CONST_INT_P (XEXP (op, 1))
7569 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7570 INTVAL (XEXP (op, 1))) != 0)
7571 return XEXP (XEXP (op, 0), 0);
7573 /* Now handle extended register, as this may also have an optional
7574 left shift by 1..4. */
7575 if (strip_shift
7576 && GET_CODE (op) == ASHIFT
7577 && CONST_INT_P (XEXP (op, 1))
7578 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7579 op = XEXP (op, 0);
7581 if (GET_CODE (op) == ZERO_EXTEND
7582 || GET_CODE (op) == SIGN_EXTEND)
7583 op = XEXP (op, 0);
7585 if (op != x)
7586 return op;
7588 return x;
7591 /* Return true iff CODE is a shift supported in combination
7592 with arithmetic instructions. */
7594 static bool
7595 aarch64_shift_p (enum rtx_code code)
7597 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7601 /* Return true iff X is a cheap shift without a sign extend. */
7603 static bool
7604 aarch64_cheap_mult_shift_p (rtx x)
7606 rtx op0, op1;
7608 op0 = XEXP (x, 0);
7609 op1 = XEXP (x, 1);
7611 if (!(aarch64_tune_params.extra_tuning_flags
7612 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7613 return false;
7615 if (GET_CODE (op0) == SIGN_EXTEND)
7616 return false;
7618 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7619 && UINTVAL (op1) <= 4)
7620 return true;
7622 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7623 return false;
7625 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7627 if (l2 > 0 && l2 <= 4)
7628 return true;
7630 return false;
7633 /* Helper function for rtx cost calculation. Calculate the cost of
7634 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7635 Return the calculated cost of the expression, recursing manually in to
7636 operands where needed. */
7638 static int
7639 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7641 rtx op0, op1;
7642 const struct cpu_cost_table *extra_cost
7643 = aarch64_tune_params.insn_extra_cost;
7644 int cost = 0;
7645 bool compound_p = (outer == PLUS || outer == MINUS);
7646 machine_mode mode = GET_MODE (x);
7648 gcc_checking_assert (code == MULT);
7650 op0 = XEXP (x, 0);
7651 op1 = XEXP (x, 1);
7653 if (VECTOR_MODE_P (mode))
7654 mode = GET_MODE_INNER (mode);
7656 /* Integer multiply/fma. */
7657 if (GET_MODE_CLASS (mode) == MODE_INT)
7659 /* The multiply will be canonicalized as a shift, so cost it as such. */
7660 if (aarch64_shift_p (GET_CODE (x))
7661 || (CONST_INT_P (op1)
7662 && exact_log2 (INTVAL (op1)) > 0))
7664 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7665 || GET_CODE (op0) == SIGN_EXTEND;
7666 if (speed)
7668 if (compound_p)
7670 /* If the shift is considered cheap,
7671 then don't add any cost. */
7672 if (aarch64_cheap_mult_shift_p (x))
7674 else if (REG_P (op1))
7675 /* ARITH + shift-by-register. */
7676 cost += extra_cost->alu.arith_shift_reg;
7677 else if (is_extend)
7678 /* ARITH + extended register. We don't have a cost field
7679 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7680 cost += extra_cost->alu.extend_arith;
7681 else
7682 /* ARITH + shift-by-immediate. */
7683 cost += extra_cost->alu.arith_shift;
7685 else
7686 /* LSL (immediate). */
7687 cost += extra_cost->alu.shift;
7690 /* Strip extends as we will have costed them in the case above. */
7691 if (is_extend)
7692 op0 = aarch64_strip_extend (op0, true);
7694 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7696 return cost;
7699 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7700 compound and let the below cases handle it. After all, MNEG is a
7701 special-case alias of MSUB. */
7702 if (GET_CODE (op0) == NEG)
7704 op0 = XEXP (op0, 0);
7705 compound_p = true;
7708 /* Integer multiplies or FMAs have zero/sign extending variants. */
7709 if ((GET_CODE (op0) == ZERO_EXTEND
7710 && GET_CODE (op1) == ZERO_EXTEND)
7711 || (GET_CODE (op0) == SIGN_EXTEND
7712 && GET_CODE (op1) == SIGN_EXTEND))
7714 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7715 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7717 if (speed)
7719 if (compound_p)
7720 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7721 cost += extra_cost->mult[0].extend_add;
7722 else
7723 /* MUL/SMULL/UMULL. */
7724 cost += extra_cost->mult[0].extend;
7727 return cost;
7730 /* This is either an integer multiply or a MADD. In both cases
7731 we want to recurse and cost the operands. */
7732 cost += rtx_cost (op0, mode, MULT, 0, speed);
7733 cost += rtx_cost (op1, mode, MULT, 1, speed);
7735 if (speed)
7737 if (compound_p)
7738 /* MADD/MSUB. */
7739 cost += extra_cost->mult[mode == DImode].add;
7740 else
7741 /* MUL. */
7742 cost += extra_cost->mult[mode == DImode].simple;
7745 return cost;
7747 else
7749 if (speed)
7751 /* Floating-point FMA/FMUL can also support negations of the
7752 operands, unless the rounding mode is upward or downward in
7753 which case FNMUL is different from FMUL with operand negation. */
7754 bool neg0 = GET_CODE (op0) == NEG;
7755 bool neg1 = GET_CODE (op1) == NEG;
7756 if (compound_p || !flag_rounding_math || (neg0 && neg1))
7758 if (neg0)
7759 op0 = XEXP (op0, 0);
7760 if (neg1)
7761 op1 = XEXP (op1, 0);
7764 if (compound_p)
7765 /* FMADD/FNMADD/FNMSUB/FMSUB. */
7766 cost += extra_cost->fp[mode == DFmode].fma;
7767 else
7768 /* FMUL/FNMUL. */
7769 cost += extra_cost->fp[mode == DFmode].mult;
7772 cost += rtx_cost (op0, mode, MULT, 0, speed);
7773 cost += rtx_cost (op1, mode, MULT, 1, speed);
7774 return cost;
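/* Implement TARGET_ADDRESS_COST.  Return the cost of addressing mode X
   for a memory access of mode MODE, using the per-tuning address cost
   tables.  */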
7778 static int
7779 aarch64_address_cost (rtx x,
7780 machine_mode mode,
7781 addr_space_t as ATTRIBUTE_UNUSED,
7782 bool speed)
7784 enum rtx_code c = GET_CODE (x);
7785 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
7786 struct aarch64_address_info info;
7787 int cost = 0;
7788 info.shift = 0;
7790 if (!aarch64_classify_address (&info, x, mode, false))
7792 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
7794 /* This is a CONST or SYMBOL ref which will be split
7795 in a different way depending on the code model in use.
7796 Cost it through the generic infrastructure. */
7797 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
7798 /* Divide through by the cost of one instruction to
7799 bring it to the same units as the address costs. */
7800 cost_symbol_ref /= COSTS_N_INSNS (1);
7801 /* The cost is then the cost of preparing the address,
7802 followed by an immediate (possibly 0) offset. */
7803 return cost_symbol_ref + addr_cost->imm_offset;
7805 else
7807 /* This is most likely a jump table from a case
7808 statement. */
7809 return addr_cost->register_offset;
7813 switch (info.type)
7815 case ADDRESS_LO_SUM:
7816 case ADDRESS_SYMBOLIC:
7817 case ADDRESS_REG_IMM:
7818 cost += addr_cost->imm_offset;
7819 break;
7821 case ADDRESS_REG_WB:
7822 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
7823 cost += addr_cost->pre_modify;
7824 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
7825 cost += addr_cost->post_modify;
7826 else
7827 gcc_unreachable ();
7829 break;
7831 case ADDRESS_REG_REG:
7832 cost += addr_cost->register_offset;
7833 break;
7835 case ADDRESS_REG_SXTW:
7836 cost += addr_cost->register_sextend;
7837 break;
7839 case ADDRESS_REG_UXTW:
7840 cost += addr_cost->register_zextend;
7841 break;
7843 default:
7844 gcc_unreachable ();
7848 if (info.shift > 0)
7850 /* For the sake of calculating the cost of the shifted register
7851 component, we can treat same sized modes in the same way. */
7852 if (known_eq (GET_MODE_BITSIZE (mode), 16))
7853 cost += addr_cost->addr_scale_costs.hi;
7854 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
7855 cost += addr_cost->addr_scale_costs.si;
7856 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
7857 cost += addr_cost->addr_scale_costs.di;
7858 else
7859 /* We can't tell, or this is a 128-bit vector. */
7860 cost += addr_cost->addr_scale_costs.ti;
7863 return cost;
7866 /* Return the cost of a branch. If SPEED_P is true then the compiler is
7867 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
7868 to be taken. */
7871 aarch64_branch_cost (bool speed_p, bool predictable_p)
7873 /* When optimizing for speed, use the cost of unpredictable branches. */
7874 const struct cpu_branch_cost *branch_costs =
7875 aarch64_tune_params.branch_costs;
7877 if (!speed_p || predictable_p)
7878 return branch_costs->predictable;
7879 else
7880 return branch_costs->unpredictable;
7883 /* Return true if the RTX X in mode MODE is a zero or sign extract
7884 usable in an ADD or SUB (extended register) instruction. */
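/* For example, (sign_extend:DI (reg:SI w1)) qualifies, matching the
   "add x0, xn, w1, sxtw" form of the instruction.  */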
7885 static bool
7886 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
7888 /* Catch add with a sign extract.
7889 This is add_<optab><mode>_multp2. */
7890 if (GET_CODE (x) == SIGN_EXTRACT
7891 || GET_CODE (x) == ZERO_EXTRACT)
7893 rtx op0 = XEXP (x, 0);
7894 rtx op1 = XEXP (x, 1);
7895 rtx op2 = XEXP (x, 2);
7897 if (GET_CODE (op0) == MULT
7898 && CONST_INT_P (op1)
7899 && op2 == const0_rtx
7900 && CONST_INT_P (XEXP (op0, 1))
7901 && aarch64_is_extend_from_extract (mode,
7902 XEXP (op0, 1),
7903 op1))
7905 return true;
7908 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
7909 No shift. */
7910 else if (GET_CODE (x) == SIGN_EXTEND
7911 || GET_CODE (x) == ZERO_EXTEND)
7912 return REG_P (XEXP (x, 0));
7914 return false;
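/* Return true if U is an UNSPEC number corresponding to one of the
   FRINT* floating-point round-to-integral instructions.  */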
7917 static bool
7918 aarch64_frint_unspec_p (unsigned int u)
7920 switch (u)
7922 case UNSPEC_FRINTZ:
7923 case UNSPEC_FRINTP:
7924 case UNSPEC_FRINTM:
7925 case UNSPEC_FRINTA:
7926 case UNSPEC_FRINTN:
7927 case UNSPEC_FRINTX:
7928 case UNSPEC_FRINTI:
7929 return true;
7931 default:
7932 return false;
7936 /* Return true iff X is an rtx that will match an extr instruction
7937 i.e. as described in the *extr<mode>5_insn family of patterns.
7938 OP0 and OP1 will be set to the operands of the shifts involved
7939 on success and will be NULL_RTX otherwise. */
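/* For example, (ior:DI (ashift:DI (reg:DI x0) (const_int 16))
		(lshiftrt:DI (reg:DI x1) (const_int 48)))
   matches, since the shift amounts sum to the mode bitsize (64).  */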
7941 static bool
7942 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
7944 rtx op0, op1;
7945 scalar_int_mode mode;
7946 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
7947 return false;
7949 *res_op0 = NULL_RTX;
7950 *res_op1 = NULL_RTX;
7952 if (GET_CODE (x) != IOR)
7953 return false;
7955 op0 = XEXP (x, 0);
7956 op1 = XEXP (x, 1);
7958 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
7959 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
7961 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
7962 if (GET_CODE (op1) == ASHIFT)
7963 std::swap (op0, op1);
7965 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
7966 return false;
7968 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
7969 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
7971 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
7972 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
7974 *res_op0 = XEXP (op0, 0);
7975 *res_op1 = XEXP (op1, 0);
7976 return true;
7980 return false;
7983 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
7984 storing it in *COST. Result is true if the total cost of the operation
7985 has now been calculated. */
7986 static bool
7987 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
7989 rtx inner;
7990 rtx comparator;
7991 enum rtx_code cmpcode;
7993 if (COMPARISON_P (op0))
7995 inner = XEXP (op0, 0);
7996 comparator = XEXP (op0, 1);
7997 cmpcode = GET_CODE (op0);
7999 else
8001 inner = op0;
8002 comparator = const0_rtx;
8003 cmpcode = NE;
8006 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8008 /* Conditional branch. */
8009 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8010 return true;
8011 else
8013 if (cmpcode == NE || cmpcode == EQ)
8015 if (comparator == const0_rtx)
8017 /* TBZ/TBNZ/CBZ/CBNZ. */
8018 if (GET_CODE (inner) == ZERO_EXTRACT)
8019 /* TBZ/TBNZ. */
8020 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8021 ZERO_EXTRACT, 0, speed);
8022 else
8023 /* CBZ/CBNZ. */
8024 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8026 return true;
8029 else if (cmpcode == LT || cmpcode == GE)
8031 /* TBZ/TBNZ. */
8032 if (comparator == const0_rtx)
8033 return true;
8037 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8039 /* CCMP. */
8040 if (GET_CODE (op1) == COMPARE)
8042 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8043 if (XEXP (op1, 1) == const0_rtx)
8044 *cost += 1;
8045 if (speed)
8047 machine_mode mode = GET_MODE (XEXP (op1, 0));
8048 const struct cpu_cost_table *extra_cost
8049 = aarch64_tune_params.insn_extra_cost;
8051 if (GET_MODE_CLASS (mode) == MODE_INT)
8052 *cost += extra_cost->alu.arith;
8053 else
8054 *cost += extra_cost->fp[mode == DFmode].compare;
8056 return true;
8059 /* It's a conditional operation based on the status flags,
8060 so it must be some flavor of CSEL. */
8062 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8063 if (GET_CODE (op1) == NEG
8064 || GET_CODE (op1) == NOT
8065 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8066 op1 = XEXP (op1, 0);
8067 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8069 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8070 op1 = XEXP (op1, 0);
8071 op2 = XEXP (op2, 0);
8074 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8075 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8076 return true;
8079 /* We don't know what this is, cost all operands. */
8080 return false;
8083 /* Check whether X is a bitfield operation of the form shift + extend that
8084 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8085 operand to which the bitfield operation is applied. Otherwise return
8086 NULL_RTX. */
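/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI w0) (const_int 3)))
   is recognized and costed as a single UBFX, with (reg:HI w0) returned
   as the operand to recurse into.  */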
8088 static rtx
8089 aarch64_extend_bitfield_pattern_p (rtx x)
8091 rtx_code outer_code = GET_CODE (x);
8092 machine_mode outer_mode = GET_MODE (x);
8094 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8095 && outer_mode != SImode && outer_mode != DImode)
8096 return NULL_RTX;
8098 rtx inner = XEXP (x, 0);
8099 rtx_code inner_code = GET_CODE (inner);
8100 machine_mode inner_mode = GET_MODE (inner);
8101 rtx op = NULL_RTX;
8103 switch (inner_code)
8105 case ASHIFT:
8106 if (CONST_INT_P (XEXP (inner, 1))
8107 && (inner_mode == QImode || inner_mode == HImode))
8108 op = XEXP (inner, 0);
8109 break;
8110 case LSHIFTRT:
8111 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8112 && (inner_mode == QImode || inner_mode == HImode))
8113 op = XEXP (inner, 0);
8114 break;
8115 case ASHIFTRT:
8116 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8117 && (inner_mode == QImode || inner_mode == HImode))
8118 op = XEXP (inner, 0);
8119 break;
8120 default:
8121 break;
8124 return op;
8127 /* Return true if the mask and a shift amount from an RTX of the form
8128 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8129 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
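/* For example, in SImode a mask of 0xff0 together with a shift of 4 is
   accepted: (0xff0 >> 4) + 1 is a power of two and no mask bits overlap
   the low four (shifted-out) bits.  */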
8131 bool
8132 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8133 rtx shft_amnt)
8135 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8136 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8137 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8138 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8141 /* Calculate the cost of calculating X, storing it in *COST. Result
8142 is true if the total cost of the operation has now been calculated. */
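/* The return value follows the TARGET_RTX_COSTS convention: true means
   *COST is final, false means the generic rtx_cost machinery should
   still recurse into the operands.  */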
8143 static bool
8144 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8145 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8147 rtx op0, op1, op2;
8148 const struct cpu_cost_table *extra_cost
8149 = aarch64_tune_params.insn_extra_cost;
8150 int code = GET_CODE (x);
8151 scalar_int_mode int_mode;
8153 /* By default, assume that everything has equivalent cost to the
8154 cheapest instruction. Any additional costs are applied as a delta
8155 above this default. */
8156 *cost = COSTS_N_INSNS (1);
8158 switch (code)
8160 case SET:
8161 /* The cost depends entirely on the operands to SET. */
8162 *cost = 0;
8163 op0 = SET_DEST (x);
8164 op1 = SET_SRC (x);
8166 switch (GET_CODE (op0))
8168 case MEM:
8169 if (speed)
8171 rtx address = XEXP (op0, 0);
8172 if (VECTOR_MODE_P (mode))
8173 *cost += extra_cost->ldst.storev;
8174 else if (GET_MODE_CLASS (mode) == MODE_INT)
8175 *cost += extra_cost->ldst.store;
8176 else if (mode == SFmode)
8177 *cost += extra_cost->ldst.storef;
8178 else if (mode == DFmode)
8179 *cost += extra_cost->ldst.stored;
8181 *cost +=
8182 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8183 0, speed));
8186 *cost += rtx_cost (op1, mode, SET, 1, speed);
8187 return true;
8189 case SUBREG:
8190 if (! REG_P (SUBREG_REG (op0)))
8191 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8193 /* Fall through. */
8194 case REG:
8195 /* The cost is one per vector-register copied. */
8196 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8198 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8199 *cost = COSTS_N_INSNS (nregs);
8201 /* const0_rtx is in general free, but we will use an
8202 instruction to set a register to 0. */
8203 else if (REG_P (op1) || op1 == const0_rtx)
8205 /* The cost is 1 per register copied. */
8206 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8207 *cost = COSTS_N_INSNS (nregs);
8209 else
8210 /* Cost is just the cost of the RHS of the set. */
8211 *cost += rtx_cost (op1, mode, SET, 1, speed);
8212 return true;
8214 case ZERO_EXTRACT:
8215 case SIGN_EXTRACT:
8216 /* Bit-field insertion. Strip any redundant widening of
8217 the RHS to meet the width of the target. */
8218 if (GET_CODE (op1) == SUBREG)
8219 op1 = SUBREG_REG (op1);
8220 if ((GET_CODE (op1) == ZERO_EXTEND
8221 || GET_CODE (op1) == SIGN_EXTEND)
8222 && CONST_INT_P (XEXP (op0, 1))
8223 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8224 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8225 op1 = XEXP (op1, 0);
8227 if (CONST_INT_P (op1))
8229 /* MOV immediate is assumed to always be cheap. */
8230 *cost = COSTS_N_INSNS (1);
8232 else
8234 /* BFM. */
8235 if (speed)
8236 *cost += extra_cost->alu.bfi;
8237 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8240 return true;
8242 default:
8243 /* We can't make sense of this, assume default cost. */
8244 *cost = COSTS_N_INSNS (1);
8245 return false;
8247 return false;
8249 case CONST_INT:
8250 /* If an instruction can incorporate a constant within the
8251 instruction, the instruction's expression avoids calling
8252 rtx_cost() on the constant. If rtx_cost() is called on a
8253 constant, then it is usually because the constant must be
8254 moved into a register by one or more instructions.
8256 The exception is constant 0, which can be expressed
8257 as XZR/WZR and is therefore free. The one case that is not free is
8258 (set (reg) (const0_rtx)), in which case we must cost
8259 the move. However, we can catch that when we cost the SET, so
8260 we don't need to consider that here. */
8261 if (x == const0_rtx)
8262 *cost = 0;
8263 else
8265 /* To an approximation, building any other constant is
8266 proportionally expensive to the number of instructions
8267 required to build that constant. This is true whether we
8268 are compiling for SPEED or otherwise. */
8269 if (!is_a <scalar_int_mode> (mode, &int_mode))
8270 int_mode = word_mode;
8271 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8272 (NULL_RTX, x, false, int_mode));
8274 return true;
8276 case CONST_DOUBLE:
8278 /* First determine number of instructions to do the move
8279 as an integer constant. */
8280 if (!aarch64_float_const_representable_p (x)
8281 && !aarch64_can_const_movi_rtx_p (x, mode)
8282 && aarch64_float_const_rtx_p (x))
8284 unsigned HOST_WIDE_INT ival;
8285 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8286 gcc_assert (succeed);
8288 scalar_int_mode imode = (mode == HFmode
8289 ? SImode
8290 : int_mode_for_mode (mode).require ());
8291 int ncost = aarch64_internal_mov_immediate
8292 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8293 *cost += COSTS_N_INSNS (ncost);
8294 return true;
8297 if (speed)
8299 /* mov[df,sf]_aarch64. */
8300 if (aarch64_float_const_representable_p (x))
8301 /* FMOV (scalar immediate). */
8302 *cost += extra_cost->fp[mode == DFmode].fpconst;
8303 else if (!aarch64_float_const_zero_rtx_p (x))
8305 /* This will be a load from memory. */
8306 if (mode == DFmode)
8307 *cost += extra_cost->ldst.loadd;
8308 else
8309 *cost += extra_cost->ldst.loadf;
8311 else
8312 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8313 or MOV v0.s[0], wzr - neither of which is modeled by the
8314 cost tables. Just use the default cost. */
8319 return true;
8321 case MEM:
8322 if (speed)
8324 /* For loads we want the base cost of a load, plus an
8325 approximation for the additional cost of the addressing
8326 mode. */
8327 rtx address = XEXP (x, 0);
8328 if (VECTOR_MODE_P (mode))
8329 *cost += extra_cost->ldst.loadv;
8330 else if (GET_MODE_CLASS (mode) == MODE_INT)
8331 *cost += extra_cost->ldst.load;
8332 else if (mode == SFmode)
8333 *cost += extra_cost->ldst.loadf;
8334 else if (mode == DFmode)
8335 *cost += extra_cost->ldst.loadd;
8337 *cost +=
8338 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8339 0, speed));
8342 return true;
8344 case NEG:
8345 op0 = XEXP (x, 0);
8347 if (VECTOR_MODE_P (mode))
8349 if (speed)
8351 /* FNEG. */
8352 *cost += extra_cost->vect.alu;
8354 return false;
8357 if (GET_MODE_CLASS (mode) == MODE_INT)
8359 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8360 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8362 /* CSETM. */
8363 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8364 return true;
8367 /* Cost this as SUB wzr, X. */
8368 op0 = CONST0_RTX (mode);
8369 op1 = XEXP (x, 0);
8370 goto cost_minus;
8373 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8375 /* Support (neg(fma...)) as a single instruction only if
8376 sign of zeros is unimportant. This matches the decision
8377 making in aarch64.md. */
8378 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8380 /* FNMADD. */
8381 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8382 return true;
8384 if (GET_CODE (op0) == MULT)
8386 /* FNMUL. */
8387 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8388 return true;
8390 if (speed)
8391 /* FNEG. */
8392 *cost += extra_cost->fp[mode == DFmode].neg;
8393 return false;
8396 return false;
8398 case CLRSB:
8399 case CLZ:
8400 if (speed)
8402 if (VECTOR_MODE_P (mode))
8403 *cost += extra_cost->vect.alu;
8404 else
8405 *cost += extra_cost->alu.clz;
8408 return false;
8410 case COMPARE:
8411 op0 = XEXP (x, 0);
8412 op1 = XEXP (x, 1);
8414 if (op1 == const0_rtx
8415 && GET_CODE (op0) == AND)
8417 x = op0;
8418 mode = GET_MODE (op0);
8419 goto cost_logic;
8422 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8424 /* TODO: A write to the CC flags possibly costs extra, this
8425 needs encoding in the cost tables. */
8427 mode = GET_MODE (op0);
8428 /* ANDS. */
8429 if (GET_CODE (op0) == AND)
8431 x = op0;
8432 goto cost_logic;
8435 if (GET_CODE (op0) == PLUS)
8437 /* ADDS (and CMN alias). */
8438 x = op0;
8439 goto cost_plus;
8442 if (GET_CODE (op0) == MINUS)
8444 /* SUBS. */
8445 x = op0;
8446 goto cost_minus;
8449 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8450 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8451 && CONST_INT_P (XEXP (op0, 2)))
8453 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8454 Handle it here directly rather than going to cost_logic
8455 since we know the immediate generated for the TST is valid
8456 so we can avoid creating an intermediate rtx for it only
8457 for costing purposes. */
8458 if (speed)
8459 *cost += extra_cost->alu.logical;
8461 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8462 ZERO_EXTRACT, 0, speed);
8463 return true;
8466 if (GET_CODE (op1) == NEG)
8468 /* CMN. */
8469 if (speed)
8470 *cost += extra_cost->alu.arith;
8472 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8473 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8474 return true;
8477 /* CMP.
8479 Compare can freely swap the order of operands, and
8480 canonicalization puts the more complex operation first.
8481 But the integer MINUS logic expects the shift/extend
8482 operation in op1. */
8483 if (! (REG_P (op0)
8484 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8486 op0 = XEXP (x, 1);
8487 op1 = XEXP (x, 0);
8489 goto cost_minus;
8492 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8494 /* FCMP. */
8495 if (speed)
8496 *cost += extra_cost->fp[mode == DFmode].compare;
8498 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8500 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8501 /* FCMP supports constant 0.0 for no extra cost. */
8502 return true;
8504 return false;
8507 if (VECTOR_MODE_P (mode))
8509 /* Vector compare. */
8510 if (speed)
8511 *cost += extra_cost->vect.alu;
8513 if (aarch64_float_const_zero_rtx_p (op1))
8515 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8516 cost. */
8517 return true;
8519 return false;
8521 return false;
8523 case MINUS:
8525 op0 = XEXP (x, 0);
8526 op1 = XEXP (x, 1);
8528 cost_minus:
8529 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8531 /* Detect valid immediates. */
8532 if ((GET_MODE_CLASS (mode) == MODE_INT
8533 || (GET_MODE_CLASS (mode) == MODE_CC
8534 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8535 && CONST_INT_P (op1)
8536 && aarch64_uimm12_shift (INTVAL (op1)))
8538 if (speed)
8539 /* SUB(S) (immediate). */
8540 *cost += extra_cost->alu.arith;
8541 return true;
8544 /* Look for SUB (extended register). */
8545 if (is_a <scalar_int_mode> (mode, &int_mode)
8546 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8548 if (speed)
8549 *cost += extra_cost->alu.extend_arith;
8551 op1 = aarch64_strip_extend (op1, true);
8552 *cost += rtx_cost (op1, VOIDmode,
8553 (enum rtx_code) GET_CODE (op1), 0, speed);
8554 return true;
8557 rtx new_op1 = aarch64_strip_extend (op1, false);
8559 /* Cost this as an FMA-alike operation. */
8560 if ((GET_CODE (new_op1) == MULT
8561 || aarch64_shift_p (GET_CODE (new_op1)))
8562 && code != COMPARE)
8564 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8565 (enum rtx_code) code,
8566 speed);
8567 return true;
8570 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8572 if (speed)
8574 if (VECTOR_MODE_P (mode))
8576 /* Vector SUB. */
8577 *cost += extra_cost->vect.alu;
8579 else if (GET_MODE_CLASS (mode) == MODE_INT)
8581 /* SUB(S). */
8582 *cost += extra_cost->alu.arith;
8584 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8586 /* FSUB. */
8587 *cost += extra_cost->fp[mode == DFmode].addsub;
8590 return true;
8593 case PLUS:
8595 rtx new_op0;
8597 op0 = XEXP (x, 0);
8598 op1 = XEXP (x, 1);
8600 cost_plus:
8601 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8602 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8604 /* CSINC. */
8605 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8606 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8607 return true;
8610 if (GET_MODE_CLASS (mode) == MODE_INT
8611 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8612 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8614 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8616 if (speed)
8617 /* ADD (immediate). */
8618 *cost += extra_cost->alu.arith;
8619 return true;
8622 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8624 /* Look for ADD (extended register). */
8625 if (is_a <scalar_int_mode> (mode, &int_mode)
8626 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8628 if (speed)
8629 *cost += extra_cost->alu.extend_arith;
8631 op0 = aarch64_strip_extend (op0, true);
8632 *cost += rtx_cost (op0, VOIDmode,
8633 (enum rtx_code) GET_CODE (op0), 0, speed);
8634 return true;
8637 /* Strip any extend, leave shifts behind as we will
8638 cost them through mult_cost. */
8639 new_op0 = aarch64_strip_extend (op0, false);
8641 if (GET_CODE (new_op0) == MULT
8642 || aarch64_shift_p (GET_CODE (new_op0)))
8644 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8645 speed);
8646 return true;
8649 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8651 if (speed)
8653 if (VECTOR_MODE_P (mode))
8655 /* Vector ADD. */
8656 *cost += extra_cost->vect.alu;
8658 else if (GET_MODE_CLASS (mode) == MODE_INT)
8660 /* ADD. */
8661 *cost += extra_cost->alu.arith;
8663 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8665 /* FADD. */
8666 *cost += extra_cost->fp[mode == DFmode].addsub;
8669 return true;
8672 case BSWAP:
8673 *cost = COSTS_N_INSNS (1);
8675 if (speed)
8677 if (VECTOR_MODE_P (mode))
8678 *cost += extra_cost->vect.alu;
8679 else
8680 *cost += extra_cost->alu.rev;
8682 return false;
8684 case IOR:
8685 if (aarch_rev16_p (x))
8687 *cost = COSTS_N_INSNS (1);
8689 if (speed)
8691 if (VECTOR_MODE_P (mode))
8692 *cost += extra_cost->vect.alu;
8693 else
8694 *cost += extra_cost->alu.rev;
8696 return true;
8699 if (aarch64_extr_rtx_p (x, &op0, &op1))
8701 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8702 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8703 if (speed)
8704 *cost += extra_cost->alu.shift;
8706 return true;
8708 /* Fall through. */
8709 case XOR:
8710 case AND:
8711 cost_logic:
8712 op0 = XEXP (x, 0);
8713 op1 = XEXP (x, 1);
8715 if (VECTOR_MODE_P (mode))
8717 if (speed)
8718 *cost += extra_cost->vect.alu;
8719 return true;
8722 if (code == AND
8723 && GET_CODE (op0) == MULT
8724 && CONST_INT_P (XEXP (op0, 1))
8725 && CONST_INT_P (op1)
8726 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8727 INTVAL (op1)) != 0)
8729 /* This is a UBFM/SBFM. */
8730 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8731 if (speed)
8732 *cost += extra_cost->alu.bfx;
8733 return true;
8736 if (is_int_mode (mode, &int_mode))
8738 if (CONST_INT_P (op1))
8740 /* We have a mask + shift version of a UBFIZ
8741 i.e. the *andim_ashift<mode>_bfiz pattern. */
8742 if (GET_CODE (op0) == ASHIFT
8743 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
8744 XEXP (op0, 1)))
8746 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8747 (enum rtx_code) code, 0, speed);
8748 if (speed)
8749 *cost += extra_cost->alu.bfx;
8751 return true;
8753 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8755 /* We possibly get the immediate for free; this is not
8756 modelled. */
8757 *cost += rtx_cost (op0, int_mode,
8758 (enum rtx_code) code, 0, speed);
8759 if (speed)
8760 *cost += extra_cost->alu.logical;
8762 return true;
8765 else
8767 rtx new_op0 = op0;
8769 /* Handle ORN, EON, or BIC. */
8770 if (GET_CODE (op0) == NOT)
8771 op0 = XEXP (op0, 0);
8773 new_op0 = aarch64_strip_shift (op0);
8775 /* If we had a shift on op0 then this is a logical-shift-
8776 by-register/immediate operation. Otherwise, this is just
8777 a logical operation. */
8778 if (speed)
8780 if (new_op0 != op0)
8782 /* Shift by immediate. */
8783 if (CONST_INT_P (XEXP (op0, 1)))
8784 *cost += extra_cost->alu.log_shift;
8785 else
8786 *cost += extra_cost->alu.log_shift_reg;
8788 else
8789 *cost += extra_cost->alu.logical;
8792 /* In both cases we want to cost both operands. */
8793 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
8794 0, speed);
8795 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
8796 1, speed);
8798 return true;
8801 return false;
8803 case NOT:
8804 x = XEXP (x, 0);
8805 op0 = aarch64_strip_shift (x);
8807 if (VECTOR_MODE_P (mode))
8809 /* Vector NOT. */
8810 *cost += extra_cost->vect.alu;
8811 return false;
8814 /* MVN-shifted-reg. */
8815 if (op0 != x)
8817 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
8819 if (speed)
8820 *cost += extra_cost->alu.log_shift;
8822 return true;
8824 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
8825 Handle the second form here taking care that 'a' in the above can
8826 be a shift. */
8827 else if (GET_CODE (op0) == XOR)
8829 rtx newop0 = XEXP (op0, 0);
8830 rtx newop1 = XEXP (op0, 1);
8831 rtx op0_stripped = aarch64_strip_shift (newop0);
8833 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
8834 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
8836 if (speed)
8838 if (op0_stripped != newop0)
8839 *cost += extra_cost->alu.log_shift;
8840 else
8841 *cost += extra_cost->alu.logical;
8844 return true;
8846 /* MVN. */
8847 if (speed)
8848 *cost += extra_cost->alu.logical;
8850 return false;
8852 case ZERO_EXTEND:
8854 op0 = XEXP (x, 0);
8855 /* If a value is written in SI mode, then zero extended to DI
8856 mode, the operation will in general be free as a write to
8857 a 'w' register implicitly zeroes the upper bits of an 'x'
8858 register. However, if this is
8860 (set (reg) (zero_extend (reg)))
8862 we must cost the explicit register move. */
8863 if (mode == DImode
8864 && GET_MODE (op0) == SImode
8865 && outer == SET)
8867 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
8869 /* If OP_COST is non-zero, then the cost of the zero extend
8870 is effectively the cost of the inner operation. Otherwise
8871 we have a MOV instruction and we take the cost from the MOV
8872 itself. This is true independently of whether we are
8873 optimizing for space or time. */
8874 if (op_cost)
8875 *cost = op_cost;
8877 return true;
8879 else if (MEM_P (op0))
8881 /* All loads can zero extend to any size for free. */
8882 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
8883 return true;
8886 op0 = aarch64_extend_bitfield_pattern_p (x);
8887 if (op0)
8889 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
8890 if (speed)
8891 *cost += extra_cost->alu.bfx;
8892 return true;
8895 if (speed)
8897 if (VECTOR_MODE_P (mode))
8899 /* UMOV. */
8900 *cost += extra_cost->vect.alu;
8902 else
8904 /* We generate an AND instead of UXTB/UXTH. */
8905 *cost += extra_cost->alu.logical;
8908 return false;
8910 case SIGN_EXTEND:
8911 if (MEM_P (XEXP (x, 0)))
8913 /* LDRSH. */
8914 if (speed)
8916 rtx address = XEXP (XEXP (x, 0), 0);
8917 *cost += extra_cost->ldst.load_sign_extend;
8919 *cost +=
8920 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8921 0, speed));
8923 return true;
8926 op0 = aarch64_extend_bitfield_pattern_p (x);
8927 if (op0)
8929 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
8930 if (speed)
8931 *cost += extra_cost->alu.bfx;
8932 return true;
8935 if (speed)
8937 if (VECTOR_MODE_P (mode))
8938 *cost += extra_cost->vect.alu;
8939 else
8940 *cost += extra_cost->alu.extend;
8942 return false;
8944 case ASHIFT:
8945 op0 = XEXP (x, 0);
8946 op1 = XEXP (x, 1);
8948 if (CONST_INT_P (op1))
8950 if (speed)
8952 if (VECTOR_MODE_P (mode))
8954 /* Vector shift (immediate). */
8955 *cost += extra_cost->vect.alu;
8957 else
8959 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
8960 aliases. */
8961 *cost += extra_cost->alu.shift;
8965 /* We can incorporate zero/sign extend for free. */
8966 if (GET_CODE (op0) == ZERO_EXTEND
8967 || GET_CODE (op0) == SIGN_EXTEND)
8968 op0 = XEXP (op0, 0);
8970 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
8971 return true;
8973 else
8975 if (VECTOR_MODE_P (mode))
8977 if (speed)
8978 /* Vector shift (register). */
8979 *cost += extra_cost->vect.alu;
8981 else
8983 if (speed)
8984 /* LSLV. */
8985 *cost += extra_cost->alu.shift_reg;
8987 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
8988 && CONST_INT_P (XEXP (op1, 1))
8989 && known_eq (INTVAL (XEXP (op1, 1)),
8990 GET_MODE_BITSIZE (mode) - 1))
8992 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
8993 /* We already demanded XEXP (op1, 0) to be REG_P, so
8994 don't recurse into it. */
8995 return true;
8998 return false; /* All arguments need to be in registers. */
9001 case ROTATE:
9002 case ROTATERT:
9003 case LSHIFTRT:
9004 case ASHIFTRT:
9005 op0 = XEXP (x, 0);
9006 op1 = XEXP (x, 1);
9008 if (CONST_INT_P (op1))
9010 /* ASR (immediate) and friends. */
9011 if (speed)
9013 if (VECTOR_MODE_P (mode))
9014 *cost += extra_cost->vect.alu;
9015 else
9016 *cost += extra_cost->alu.shift;
9019 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9020 return true;
9022 else
9024 if (VECTOR_MODE_P (mode))
9026 if (speed)
9027 /* Vector shift (register). */
9028 *cost += extra_cost->vect.alu;
9030 else
9032 if (speed)
9033 /* ASR (register) and friends. */
9034 *cost += extra_cost->alu.shift_reg;
9036 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9037 && CONST_INT_P (XEXP (op1, 1))
9038 && known_eq (INTVAL (XEXP (op1, 1)),
9039 GET_MODE_BITSIZE (mode) - 1))
9041 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9042 /* We already demanded XEXP (op1, 0) to be REG_P, so
9043 don't recurse into it. */
9044 return true;
9047 return false; /* All arguments need to be in registers. */
9050 case SYMBOL_REF:
9052 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9053 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9055 /* LDR. */
9056 if (speed)
9057 *cost += extra_cost->ldst.load;
9059 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9060 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9062 /* ADRP, followed by ADD. */
9063 *cost += COSTS_N_INSNS (1);
9064 if (speed)
9065 *cost += 2 * extra_cost->alu.arith;
9067 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9068 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9070 /* ADR. */
9071 if (speed)
9072 *cost += extra_cost->alu.arith;
9075 if (flag_pic)
9077 /* One extra load instruction, after accessing the GOT. */
9078 *cost += COSTS_N_INSNS (1);
9079 if (speed)
9080 *cost += extra_cost->ldst.load;
9082 return true;
9084 case HIGH:
9085 case LO_SUM:
9086 /* ADRP/ADD (immediate). */
9087 if (speed)
9088 *cost += extra_cost->alu.arith;
9089 return true;
9091 case ZERO_EXTRACT:
9092 case SIGN_EXTRACT:
9093 /* UBFX/SBFX. */
9094 if (speed)
9096 if (VECTOR_MODE_P (mode))
9097 *cost += extra_cost->vect.alu;
9098 else
9099 *cost += extra_cost->alu.bfx;
9102 /* We can trust that the immediates used will be correct (there
9103 are no by-register forms), so we need only cost op0. */
9104 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9105 return true;
9107 case MULT:
9108 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9109 /* aarch64_rtx_mult_cost always handles recursion to its
9110 operands. */
9111 return true;
9113 case MOD:
9114 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9115 ANDs and a CSNEG. Assume here that a CSNEG costs the same as an
9116 unconditional negate. This case should only ever be reached through
9117 the set_smod_pow2_cheap check in expmed.c. */
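	/* The expansion is roughly, for x % 8 in SImode:
	     negs  w1, w0
	     and   w0, w0, 7
	     and   w1, w1, 7
	     csneg w0, w0, w1, mi
	   hence the baseline of four instructions below.  */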
9118 if (CONST_INT_P (XEXP (x, 1))
9119 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9120 && (mode == SImode || mode == DImode))
9122 /* We expand to 4 instructions. Reset the baseline. */
9123 *cost = COSTS_N_INSNS (4);
9125 if (speed)
9126 *cost += 2 * extra_cost->alu.logical
9127 + 2 * extra_cost->alu.arith;
9129 return true;
9132 /* Fall-through. */
9133 case UMOD:
9134 if (speed)
9136 /* Slightly prefer UMOD over SMOD. */
9137 if (VECTOR_MODE_P (mode))
9138 *cost += extra_cost->vect.alu;
9139 else if (GET_MODE_CLASS (mode) == MODE_INT)
9140 *cost += (extra_cost->mult[mode == DImode].add
9141 + extra_cost->mult[mode == DImode].idiv
9142 + (code == MOD ? 1 : 0));
9144 return false; /* All arguments need to be in registers. */
9146 case DIV:
9147 case UDIV:
9148 case SQRT:
9149 if (speed)
9151 if (VECTOR_MODE_P (mode))
9152 *cost += extra_cost->vect.alu;
9153 else if (GET_MODE_CLASS (mode) == MODE_INT)
9154 /* There is no integer SQRT, so only DIV and UDIV can get
9155 here. */
9156 *cost += (extra_cost->mult[mode == DImode].idiv
9157 /* Slightly prefer UDIV over SDIV. */
9158 + (code == DIV ? 1 : 0));
9159 else
9160 *cost += extra_cost->fp[mode == DFmode].div;
9162 return false; /* All arguments need to be in registers. */
9164 case IF_THEN_ELSE:
9165 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9166 XEXP (x, 2), cost, speed);
9168 case EQ:
9169 case NE:
9170 case GT:
9171 case GTU:
9172 case LT:
9173 case LTU:
9174 case GE:
9175 case GEU:
9176 case LE:
9177 case LEU:
9179 return false; /* All arguments must be in registers. */
9181 case FMA:
9182 op0 = XEXP (x, 0);
9183 op1 = XEXP (x, 1);
9184 op2 = XEXP (x, 2);
9186 if (speed)
9188 if (VECTOR_MODE_P (mode))
9189 *cost += extra_cost->vect.alu;
9190 else
9191 *cost += extra_cost->fp[mode == DFmode].fma;
9194 /* FMSUB, FNMADD, and FNMSUB are free. */
9195 if (GET_CODE (op0) == NEG)
9196 op0 = XEXP (op0, 0);
9198 if (GET_CODE (op2) == NEG)
9199 op2 = XEXP (op2, 0);
9201 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9202 and the by-element operand as operand 0. */
9203 if (GET_CODE (op1) == NEG)
9204 op1 = XEXP (op1, 0);
9206 /* Catch vector-by-element operations. The by-element operand can
9207 either be (vec_duplicate (vec_select (x))) or just
9208 (vec_select (x)), depending on whether we are multiplying by
9209 a vector or a scalar.
9211 Canonicalization is not very good in these cases, FMA4 will put the
9212 by-element operand as operand 0, FNMA4 will have it as operand 1. */
9213 if (GET_CODE (op0) == VEC_DUPLICATE)
9214 op0 = XEXP (op0, 0);
9215 else if (GET_CODE (op1) == VEC_DUPLICATE)
9216 op1 = XEXP (op1, 0);
9218 if (GET_CODE (op0) == VEC_SELECT)
9219 op0 = XEXP (op0, 0);
9220 else if (GET_CODE (op1) == VEC_SELECT)
9221 op1 = XEXP (op1, 0);
9223 /* If the remaining parameters are not registers,
9224 get the cost to put them into registers. */
9225 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9226 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9227 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9228 return true;
9230 case FLOAT:
9231 case UNSIGNED_FLOAT:
9232 if (speed)
9233 *cost += extra_cost->fp[mode == DFmode].fromint;
9234 return false;
9236 case FLOAT_EXTEND:
9237 if (speed)
9239 if (VECTOR_MODE_P (mode))
9241 /* Vector widening conversion. */
9242 *cost += extra_cost->vect.alu;
9244 else
9245 *cost += extra_cost->fp[mode == DFmode].widen;
9247 return false;
9249 case FLOAT_TRUNCATE:
9250 if (speed)
9252 if (VECTOR_MODE_P (mode))
9254 /* Vector conversion. */
9255 *cost += extra_cost->vect.alu;
9257 else
9258 *cost += extra_cost->fp[mode == DFmode].narrow;
9260 return false;
9262 case FIX:
9263 case UNSIGNED_FIX:
9264 x = XEXP (x, 0);
9265 /* Strip the rounding part. They will all be implemented
9266 by the fcvt* family of instructions anyway. */
9267 if (GET_CODE (x) == UNSPEC)
9269 unsigned int uns_code = XINT (x, 1);
9271 if (uns_code == UNSPEC_FRINTA
9272 || uns_code == UNSPEC_FRINTM
9273 || uns_code == UNSPEC_FRINTN
9274 || uns_code == UNSPEC_FRINTP
9275 || uns_code == UNSPEC_FRINTZ)
9276 x = XVECEXP (x, 0, 0);
9279 if (speed)
9281 if (VECTOR_MODE_P (mode))
9282 *cost += extra_cost->vect.alu;
9283 else
9284 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9287 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9288 fixed-point fcvt. */
9289 if (GET_CODE (x) == MULT
9290 && ((VECTOR_MODE_P (mode)
9291 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9292 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9294 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9295 0, speed);
9296 return true;
9299 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9300 return true;
9302 case ABS:
9303 if (VECTOR_MODE_P (mode))
9305 /* ABS (vector). */
9306 if (speed)
9307 *cost += extra_cost->vect.alu;
9309 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9311 op0 = XEXP (x, 0);
9313 /* FABD, which is analogous to FADD. */
9314 if (GET_CODE (op0) == MINUS)
9316 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9317 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9318 if (speed)
9319 *cost += extra_cost->fp[mode == DFmode].addsub;
9321 return true;
9323 /* Simple FABS is analogous to FNEG. */
9324 if (speed)
9325 *cost += extra_cost->fp[mode == DFmode].neg;
9327 else
9329 /* Integer ABS will either be split into
9330 two arithmetic instructions, or will be an ABS
9331 (scalar), which we don't model. */
9332 *cost = COSTS_N_INSNS (2);
9333 if (speed)
9334 *cost += 2 * extra_cost->alu.arith;
9336 return false;
9338 case SMAX:
9339 case SMIN:
9340 if (speed)
9342 if (VECTOR_MODE_P (mode))
9343 *cost += extra_cost->vect.alu;
9344 else
9346 /* FMAXNM/FMINNM/FMAX/FMIN.
9347 TODO: This may not be accurate for all implementations, but
9348 we do not model this in the cost tables. */
9349 *cost += extra_cost->fp[mode == DFmode].addsub;
9352 return false;
9354 case UNSPEC:
9355 /* The floating point round to integer frint* instructions. */
9356 if (aarch64_frint_unspec_p (XINT (x, 1)))
9358 if (speed)
9359 *cost += extra_cost->fp[mode == DFmode].roundint;
9361 return false;
9364 if (XINT (x, 1) == UNSPEC_RBIT)
9366 if (speed)
9367 *cost += extra_cost->alu.rev;
9369 return false;
9371 break;
9373 case TRUNCATE:
9375 /* Decompose <su>muldi3_highpart. */
9376 if (/* (truncate:DI */
9377 mode == DImode
9378 /* (lshiftrt:TI */
9379 && GET_MODE (XEXP (x, 0)) == TImode
9380 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9381 /* (mult:TI */
9382 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9383 /* (ANY_EXTEND:TI (reg:DI))
9384 (ANY_EXTEND:TI (reg:DI))) */
9385 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9386 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9387 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9388 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9389 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9390 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9391 /* (const_int 64) */
9392 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9393 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9395 /* UMULH/SMULH. */
9396 if (speed)
9397 *cost += extra_cost->mult[mode == DImode].extend;
9398 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9399 mode, MULT, 0, speed);
9400 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9401 mode, MULT, 1, speed);
9402 return true;
9405 /* Fall through. */
9406 default:
9407 break;
9410 if (dump_file
9411 && flag_aarch64_verbose_cost)
9412 fprintf (dump_file,
9413 "\nFailed to cost RTX. Assuming default cost.\n");
9415 return true;
9418 /* Wrapper around aarch64_rtx_costs; dumps the partial or total cost
9419 calculated for X. This cost is stored in *COST. Returns true
9420 if the total cost of X was calculated. */
9421 static bool
9422 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9423 int param, int *cost, bool speed)
9425 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9427 if (dump_file
9428 && flag_aarch64_verbose_cost)
9430 print_rtl_single (dump_file, x);
9431 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9432 speed ? "Hot" : "Cold",
9433 *cost, result ? "final" : "partial");
9436 return result;
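/* Implement TARGET_REGISTER_MOVE_COST.  Return the cost of moving a value
   of mode MODE between register classes FROM_I and TO_I.  */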
9439 static int
9440 aarch64_register_move_cost (machine_mode mode,
9441 reg_class_t from_i, reg_class_t to_i)
9443 enum reg_class from = (enum reg_class) from_i;
9444 enum reg_class to = (enum reg_class) to_i;
9445 const struct cpu_regmove_cost *regmove_cost
9446 = aarch64_tune_params.regmove_cost;
9448 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9449 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
9450 to = GENERAL_REGS;
9452 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
9453 from = GENERAL_REGS;
9455 /* Moving between a GPR and the stack register costs the same as GP2GP. */
9456 if ((from == GENERAL_REGS && to == STACK_REG)
9457 || (to == GENERAL_REGS && from == STACK_REG))
9458 return regmove_cost->GP2GP;
9460 /* To/from the stack register, we move via the GPRs. */
9461 if (to == STACK_REG || from == STACK_REG)
9462 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9463 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9465 if (known_eq (GET_MODE_SIZE (mode), 16))
9467 /* 128-bit operations on general registers require 2 instructions. */
9468 if (from == GENERAL_REGS && to == GENERAL_REGS)
9469 return regmove_cost->GP2GP * 2;
9470 else if (from == GENERAL_REGS)
9471 return regmove_cost->GP2FP * 2;
9472 else if (to == GENERAL_REGS)
9473 return regmove_cost->FP2GP * 2;
9475 /* When AdvSIMD instructions are disabled it is not possible to move
9476 a 128-bit value directly between Q registers. This is handled in
9477 secondary reload. A general register is used as a scratch to move
9478 the upper DI value and the lower DI value is moved directly,
9479 hence the cost is the sum of three moves. */
9480 if (! TARGET_SIMD)
9481 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9483 return regmove_cost->FP2FP;
9486 if (from == GENERAL_REGS && to == GENERAL_REGS)
9487 return regmove_cost->GP2GP;
9488 else if (from == GENERAL_REGS)
9489 return regmove_cost->GP2FP;
9490 else if (to == GENERAL_REGS)
9491 return regmove_cost->FP2GP;
9493 return regmove_cost->FP2FP;
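/* Implement TARGET_MEMORY_MOVE_COST.  A single per-tuning cost is used
   for all modes and register classes.  */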
9496 static int
9497 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9498 reg_class_t rclass ATTRIBUTE_UNUSED,
9499 bool in ATTRIBUTE_UNUSED)
9501 return aarch64_tune_params.memmov_cost;
9504 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9505 to optimize 1.0/sqrt. */
9507 static bool
9508 use_rsqrt_p (machine_mode mode)
9510 return (!flag_trapping_math
9511 && flag_unsafe_math_optimizations
9512 && ((aarch64_tune_params.approx_modes->recip_sqrt
9513 & AARCH64_APPROX_MODE (mode))
9514 || flag_mrecip_low_precision_sqrt));
9517 /* Function to decide when to use the approximate reciprocal square root
9518 builtin. */
9520 static tree
9521 aarch64_builtin_reciprocal (tree fndecl)
9523 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9525 if (!use_rsqrt_p (mode))
9526 return NULL_TREE;
9527 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9530 typedef rtx (*rsqrte_type) (rtx, rtx);
9532 /* Select reciprocal square root initial estimate insn depending on machine
9533 mode. */
9535 static rsqrte_type
9536 get_rsqrte_type (machine_mode mode)
9538 switch (mode)
9540 case E_DFmode: return gen_aarch64_rsqrtedf;
9541 case E_SFmode: return gen_aarch64_rsqrtesf;
9542 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9543 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9544 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9545 default: gcc_unreachable ();
9549 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9551 /* Select reciprocal square root series step insn depending on machine mode. */
9553 static rsqrts_type
9554 get_rsqrts_type (machine_mode mode)
9556 switch (mode)
9558 case E_DFmode: return gen_aarch64_rsqrtsdf;
9559 case E_SFmode: return gen_aarch64_rsqrtssf;
9560 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9561 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9562 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9563 default: gcc_unreachable ();
9567 /* Emit instruction sequence to compute either the approximate square root
9568 or its approximate reciprocal, depending on the flag RECP, and return
9569 whether the sequence was emitted or not. */
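/* The initial FRSQRTE estimate is refined with Newton-Raphson steps of
   the form x' = x * (3 - d * x * x) / 2, where (3 - a * b) / 2 is the
   quantity computed by the FRSQRTS instruction.  */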
9571 bool
9572 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9574 machine_mode mode = GET_MODE (dst);
9576 if (GET_MODE_INNER (mode) == HFmode)
9578 gcc_assert (!recp);
9579 return false;
9582 if (!recp)
9584 if (!(flag_mlow_precision_sqrt
9585 || (aarch64_tune_params.approx_modes->sqrt
9586 & AARCH64_APPROX_MODE (mode))))
9587 return false;
9589 if (flag_finite_math_only
9590 || flag_trapping_math
9591 || !flag_unsafe_math_optimizations
9592 || optimize_function_for_size_p (cfun))
9593 return false;
9595 else
9596 /* Caller assumes we cannot fail. */
9597 gcc_assert (use_rsqrt_p (mode));
9599 machine_mode mmsk = mode_for_int_vector (mode).require ();
9600 rtx xmsk = gen_reg_rtx (mmsk);
9601 if (!recp)
9602 /* When calculating the approximate square root, compare the
9603 argument with 0.0 and create a mask. */
9604 emit_insn (gen_rtx_SET (xmsk,
9605 gen_rtx_NEG (mmsk,
9606 gen_rtx_EQ (mmsk, src,
9607 CONST0_RTX (mode)))));
9609 /* Estimate the approximate reciprocal square root. */
9610 rtx xdst = gen_reg_rtx (mode);
9611 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9613 /* Iterate over the series twice for SF and thrice for DF. */
9614 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9616 /* Optionally iterate over the series once less for faster performance
9617 while sacrificing some accuracy. */
9618 if ((recp && flag_mrecip_low_precision_sqrt)
9619 || (!recp && flag_mlow_precision_sqrt))
9620 iterations--;
9622 /* Iterate over the series to calculate the approximate reciprocal square
9623 root. */
9624 rtx x1 = gen_reg_rtx (mode);
9625 while (iterations--)
9627 rtx x2 = gen_reg_rtx (mode);
9628 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9630 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9632 if (iterations > 0)
9633 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9636 if (!recp)
9638 /* Qualify the approximate reciprocal square root when the argument is
9639 0.0 by squashing the intermediate result to 0.0. */
9640 rtx xtmp = gen_reg_rtx (mmsk);
9641 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9642 gen_rtx_SUBREG (mmsk, xdst, 0)));
9643 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9645 /* Calculate the approximate square root. */
9646 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9649 /* Finalize the approximation. */
9650 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9652 return true;
9655 typedef rtx (*recpe_type) (rtx, rtx);
9657 /* Select reciprocal initial estimate insn depending on machine mode. */
9659 static recpe_type
9660 get_recpe_type (machine_mode mode)
9662 switch (mode)
9664 case E_SFmode: return (gen_aarch64_frecpesf);
9665 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9666 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9667 case E_DFmode: return (gen_aarch64_frecpedf);
9668 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9669 default: gcc_unreachable ();
9673 typedef rtx (*recps_type) (rtx, rtx, rtx);
9675 /* Select reciprocal series step insn depending on machine mode. */
9677 static recps_type
9678 get_recps_type (machine_mode mode)
9680 switch (mode)
9682 case E_SFmode: return (gen_aarch64_frecpssf);
9683 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9684 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9685 case E_DFmode: return (gen_aarch64_frecpsdf);
9686 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9687 default: gcc_unreachable ();
9691 /* Emit the instruction sequence to compute the approximation for the division
9692 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
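/* The initial FRECPE estimate of 1/DEN is refined with Newton-Raphson
   steps of the form x' = x * (2 - d * x), where (2 - a * b) is the
   quantity computed by the FRECPS instruction; the quotient is then
   obtained as NUM * (1/DEN).  */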
9694 bool
9695 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9697 machine_mode mode = GET_MODE (quo);
9699 if (GET_MODE_INNER (mode) == HFmode)
9700 return false;
9702 bool use_approx_division_p = (flag_mlow_precision_div
9703 || (aarch64_tune_params.approx_modes->division
9704 & AARCH64_APPROX_MODE (mode)));
9706 if (!flag_finite_math_only
9707 || flag_trapping_math
9708 || !flag_unsafe_math_optimizations
9709 || optimize_function_for_size_p (cfun)
9710 || !use_approx_division_p)
9711 return false;
9713 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9714 return false;
9716 /* Estimate the approximate reciprocal. */
9717 rtx xrcp = gen_reg_rtx (mode);
9718 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9720 /* Iterate over the series twice for SF and thrice for DF. */
9721 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9723 /* Optionally iterate over the series once less for faster performance,
9724 while sacrificing some accuracy. */
9725 if (flag_mlow_precision_div)
9726 iterations--;
9728 /* Iterate over the series to calculate the approximate reciprocal. */
9729 rtx xtmp = gen_reg_rtx (mode);
9730 while (iterations--)
9732 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
9734 if (iterations > 0)
9735 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9738 if (num != CONST1_RTX (mode))
9740 /* As the approximate reciprocal of DEN is already calculated, only
9741 calculate the approximate division when NUM is not 1.0. */
9742 rtx xnum = force_reg (mode, num);
9743 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
9746 /* Finalize the approximation. */
9747 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
9748 return true;
9751 /* Return the number of instructions that can be issued per cycle. */
9752 static int
9753 aarch64_sched_issue_rate (void)
9755 return aarch64_tune_params.issue_rate;
9758 static int
9759 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9761 int issue_rate = aarch64_sched_issue_rate ();
9763 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
9767 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9768 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9769 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9771 static int
9772 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
9773 int ready_index)
9775 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
9779 /* Vectorizer cost model target hooks. */
9781 /* Implement targetm.vectorize.builtin_vectorization_cost. */
9782 static int
9783 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
9784 tree vectype,
9785 int misalign ATTRIBUTE_UNUSED)
9787 unsigned elements;
9788 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
9789 bool fp = false;
9791 if (vectype != NULL)
9792 fp = FLOAT_TYPE_P (vectype);
9794 switch (type_of_cost)
9796 case scalar_stmt:
9797 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
9799 case scalar_load:
9800 return costs->scalar_load_cost;
9802 case scalar_store:
9803 return costs->scalar_store_cost;
9805 case vector_stmt:
9806 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9808 case vector_load:
9809 return costs->vec_align_load_cost;
9811 case vector_store:
9812 return costs->vec_store_cost;
9814 case vec_to_scalar:
9815 return costs->vec_to_scalar_cost;
9817 case scalar_to_vec:
9818 return costs->scalar_to_vec_cost;
9820 case unaligned_load:
9821 case vector_gather_load:
9822 return costs->vec_unalign_load_cost;
9824 case unaligned_store:
9825 case vector_scatter_store:
9826 return costs->vec_unalign_store_cost;
9828 case cond_branch_taken:
9829 return costs->cond_taken_branch_cost;
9831 case cond_branch_not_taken:
9832 return costs->cond_not_taken_branch_cost;
9834 case vec_perm:
9835 return costs->vec_permute_cost;
9837 case vec_promote_demote:
9838 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9840 case vec_construct:
9841 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
9842 return elements / 2 + 1;
9844 default:
9845 gcc_unreachable ();
9849 /* Implement targetm.vectorize.add_stmt_cost. */
9850 static unsigned
9851 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
9852 struct _stmt_vec_info *stmt_info, int misalign,
9853 enum vect_cost_model_location where)
9855 unsigned *cost = (unsigned *) data;
9856 unsigned retval = 0;
9858 if (flag_vect_cost_model)
9860 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
9861 int stmt_cost =
9862 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
9864 /* Statements in an inner loop relative to the loop being
9865 vectorized are weighted more heavily. The value here is
9866 arbitrary and could potentially be improved with analysis. */
9867 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
9868 count *= 50; /* FIXME */
9870 retval = (unsigned) (count * stmt_cost);
9871 cost[where] += retval;
9874 return retval;
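/* For illustration only: with the hook above, a vector statement of cost 4
   and count 1 that belongs to the vector body (WHERE == vect_body) but sits
   in an inner loop relative to the loop being vectorized is accumulated as
   1 * 50 * 4 = 200 because of the arbitrary times-50 weighting, while the
   same statement outside an inner loop contributes just 4.  */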
9877 static void initialize_aarch64_code_model (struct gcc_options *);
9879 /* Parse the TO_PARSE string and put the architecture struct that it
9880 selects into RES and the architectural features into ISA_FLAGS.
9881 Return an aarch64_parse_opt_result describing the parse result.
9882 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
9884 static enum aarch64_parse_opt_result
9885 aarch64_parse_arch (const char *to_parse, const struct processor **res,
9886 unsigned long *isa_flags)
9888 char *ext;
9889 const struct processor *arch;
9890 char *str = (char *) alloca (strlen (to_parse) + 1);
9891 size_t len;
9893 strcpy (str, to_parse);
9895 ext = strchr (str, '+');
9897 if (ext != NULL)
9898 len = ext - str;
9899 else
9900 len = strlen (str);
9902 if (len == 0)
9903 return AARCH64_PARSE_MISSING_ARG;
9906 /* Loop through the list of supported ARCHes to find a match. */
9907 for (arch = all_architectures; arch->name != NULL; arch++)
9909 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
9911 unsigned long isa_temp = arch->flags;
9913 if (ext != NULL)
9915 /* TO_PARSE string contains at least one extension. */
9916 enum aarch64_parse_opt_result ext_res
9917 = aarch64_parse_extension (ext, &isa_temp);
9919 if (ext_res != AARCH64_PARSE_OK)
9920 return ext_res;
9922 /* Extension parsing was successful. Confirm the result
9923 arch and ISA flags. */
9924 *res = arch;
9925 *isa_flags = isa_temp;
9926 return AARCH64_PARSE_OK;
9930 /* ARCH name not found in list. */
9931 /* ARCH name not found in the list. */
9931 return AARCH64_PARSE_INVALID_ARG;
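/* For illustration only (assuming "armv8.2-a" and the "sve" modifier are
   present in all_architectures and the extension table): given the string
   from

     -march=armv8.2-a+sve

   the parser above splits at the first '+', matches "armv8.2-a" against
   all_architectures and hands "+sve" to aarch64_parse_extension to adjust
   the ISA flags.  An empty architecture name, as in "-march=+sve", results
   in AARCH64_PARSE_MISSING_ARG.  */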
9934 /* Parse the TO_PARSE string and put the result tuning in RES and the
9935 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
9936 describing the parse result. If there is an error parsing, RES and
9937 ISA_FLAGS are left unchanged. */
9939 static enum aarch64_parse_opt_result
9940 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
9941 unsigned long *isa_flags)
9943 char *ext;
9944 const struct processor *cpu;
9945 char *str = (char *) alloca (strlen (to_parse) + 1);
9946 size_t len;
9948 strcpy (str, to_parse);
9950 ext = strchr (str, '+');
9952 if (ext != NULL)
9953 len = ext - str;
9954 else
9955 len = strlen (str);
9957 if (len == 0)
9958 return AARCH64_PARSE_MISSING_ARG;
9961 /* Loop through the list of supported CPUs to find a match. */
9962 for (cpu = all_cores; cpu->name != NULL; cpu++)
9964 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
9966 unsigned long isa_temp = cpu->flags;
9969 if (ext != NULL)
9971 /* TO_PARSE string contains at least one extension. */
9972 enum aarch64_parse_opt_result ext_res
9973 = aarch64_parse_extension (ext, &isa_temp);
9975 if (ext_res != AARCH64_PARSE_OK)
9976 return ext_res;
9978 /* Extension parsing was successful. Confirm the result
9979 cpu and ISA flags. */
9980 *res = cpu;
9981 *isa_flags = isa_temp;
9982 return AARCH64_PARSE_OK;
9986 /* CPU name not found in list. */
9987 return AARCH64_PARSE_INVALID_ARG;
9990 /* Parse the TO_PARSE string and put the cpu it selects into RES.
9991 Return an aarch64_parse_opt_result describing the parse result.
9992 If the parsing fails, RES is left unchanged. */
9994 static enum aarch64_parse_opt_result
9995 aarch64_parse_tune (const char *to_parse, const struct processor **res)
9997 const struct processor *cpu;
9998 char *str = (char *) alloca (strlen (to_parse) + 1);
10000 strcpy (str, to_parse);
10002 /* Loop through the list of supported CPUs to find a match. */
10003 for (cpu = all_cores; cpu->name != NULL; cpu++)
10005 if (strcmp (cpu->name, str) == 0)
10007 *res = cpu;
10008 return AARCH64_PARSE_OK;
10012 /* CPU name not found in list. */
10013 return AARCH64_PARSE_INVALID_ARG;
10016 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10017 described in FLAG. If it is, return the index bit for that fusion type.
10018 If not, error (printing OPTION_NAME) and return zero. */
10020 static unsigned int
10021 aarch64_parse_one_option_token (const char *token,
10022 size_t length,
10023 const struct aarch64_flag_desc *flag,
10024 const char *option_name)
10026 for (; flag->name != NULL; flag++)
10028 if (length == strlen (flag->name)
10029 && !strncmp (flag->name, token, length))
10030 return flag->flag;
10033 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10034 return 0;
10037 /* Parse OPTION, which is a '.'-separated list of flags to enable.
10038 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10039 default state we inherit from the CPU tuning structures. OPTION_NAME
10040 gives the top-level option we are parsing in the -moverride string,
10041 for use in error messages. */
10043 static unsigned int
10044 aarch64_parse_boolean_options (const char *option,
10045 const struct aarch64_flag_desc *flags,
10046 unsigned int initial_state,
10047 const char *option_name)
10049 const char separator = '.';
10050 const char* specs = option;
10051 const char* ntoken = option;
10052 unsigned int found_flags = initial_state;
10054 while ((ntoken = strchr (specs, separator)))
10056 size_t token_length = ntoken - specs;
10057 unsigned token_ops = aarch64_parse_one_option_token (specs,
10058 token_length,
10059 flags,
10060 option_name);
10061 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10062 in the token stream, reset the supported operations. So:
10064 adrp+add.cmp+branch.none.adrp+add
10066 would have the result of turning on only adrp+add fusion. */
10067 if (!token_ops)
10068 found_flags = 0;
10070 found_flags |= token_ops;
10071 specs = ++ntoken;
10074 /* The string ended with a trailing separator; report an error. */
10075 if (!(*specs))
10077 error ("%s string ill-formed\n", option_name);
10078 return 0;
10081 /* We still have one more token to parse. */
10082 size_t token_length = strlen (specs);
10083 unsigned token_ops = aarch64_parse_one_option_token (specs,
10084 token_length,
10085 flags,
10086 option_name);
10087 if (!token_ops)
10088 found_flags = 0;
10090 found_flags |= token_ops;
10091 return found_flags;
10094 /* Support for overriding instruction fusion. */
10096 static void
10097 aarch64_parse_fuse_string (const char *fuse_string,
10098 struct tune_params *tune)
10100 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10101 aarch64_fusible_pairs,
10102 tune->fusible_ops,
10103 "fuse=");
10106 /* Support for overriding other tuning flags. */
10108 static void
10109 aarch64_parse_tune_string (const char *tune_string,
10110 struct tune_params *tune)
10112 tune->extra_tuning_flags
10113 = aarch64_parse_boolean_options (tune_string,
10114 aarch64_tuning_flags,
10115 tune->extra_tuning_flags,
10116 "tune=");
10119 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10120 we understand. If it is, extract the option string and hand it off to
10121 the appropriate function. */
10123 void
10124 aarch64_parse_one_override_token (const char* token,
10125 size_t length,
10126 struct tune_params *tune)
10128 const struct aarch64_tuning_override_function *fn
10129 = aarch64_tuning_override_functions;
10131 const char *option_part = strchr (token, '=');
10132 if (!option_part)
10134 error ("tuning string missing in option (%s)", token);
10135 return;
10138 /* Get the length of the option name. */
10139 length = option_part - token;
10140 /* Skip the '=' to get to the option string. */
10141 option_part++;
10143 for (; fn->name != NULL; fn++)
10145 if (!strncmp (fn->name, token, length))
10147 fn->parse_override (option_part, tune);
10148 return;
10152 error ("unknown tuning option (%s)", token);
10153 return;
10156 /* Validate and clamp the TLS size against the limits imposed by the code model in OPTS. */
10158 static void
10159 initialize_aarch64_tls_size (struct gcc_options *opts)
10161 if (aarch64_tls_size == 0)
10162 aarch64_tls_size = 24;
10164 switch (opts->x_aarch64_cmodel_var)
10166 case AARCH64_CMODEL_TINY:
10167 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
10168 needs two instructions to address, so we clamp the size to 24 bits. */
10169 if (aarch64_tls_size > 24)
10170 aarch64_tls_size = 24;
10171 break;
10172 case AARCH64_CMODEL_SMALL:
10173 /* The maximum TLS size allowed under small is 4G. */
10174 if (aarch64_tls_size > 32)
10175 aarch64_tls_size = 32;
10176 break;
10177 case AARCH64_CMODEL_LARGE:
10178 /* The maximum TLS size allowed under large is 16E.
10179 FIXME: 16E needs a 64-bit offset, but we only support 48-bit offsets for now. */
10180 if (aarch64_tls_size > 48)
10181 aarch64_tls_size = 48;
10182 break;
10183 default:
10184 gcc_unreachable ();
10187 return;
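/* For illustration only: under -mcmodel=tiny a request such as
   -mtls-size=32 is clamped to 24 by the code above, since tiny allows a TLS
   segment of at most 1M; under -mcmodel=small the same request is accepted
   and only values above 32 would be clamped.  */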
10190 /* Parse STRING looking for options in the format:
10191 string :: option:string
10192 option :: name=substring
10193 name :: {a-z}
10194 substring :: defined by option. */
10196 static void
10197 aarch64_parse_override_string (const char* input_string,
10198 struct tune_params* tune)
10200 const char separator = ':';
10201 size_t string_length = strlen (input_string) + 1;
10202 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10203 char *string = string_root;
10204 strncpy (string, input_string, string_length);
10205 string[string_length - 1] = '\0';
10207 char* ntoken = string;
10209 while ((ntoken = strchr (string, separator)))
10211 size_t token_length = ntoken - string;
10212 /* NUL-terminate this substring so it can be treated as a string. */
10213 *ntoken = '\0';
10214 aarch64_parse_one_override_token (string, token_length, tune);
10215 string = ++ntoken;
10218 /* One last option to parse. */
10219 aarch64_parse_one_override_token (string, strlen (string), tune);
10220 free (string_root);
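/* For illustration only: putting the routines above together, an -moverride
   string is first split on ':' into name=value tokens and each value is then
   split on '.' into individual flags, so a string such as

     -moverride=fuse=adrp+add.cmp+branch

   enables the adrp+add and cmp+branch fusion pairs on top of the CPU's
   defaults (the pair names are taken from the fusion example earlier in this
   file; other accepted names come from aarch64_fusible_pairs).  */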
10224 static void
10225 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10227 /* PR 70044: We have to be careful about being called multiple times for the
10228 same function. This means all changes should be repeatable. */
10230 /* If the frame pointer is enabled, set it to a special value that behaves
10231 similar to frame pointer omission. If we don't do this all leaf functions
10232 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10233 If flag_omit_frame_pointer has this special value, we must force the
10234 frame pointer if not in a leaf function. We also need to force it in a
10235 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10236 if (opts->x_flag_omit_frame_pointer == 0)
10237 opts->x_flag_omit_frame_pointer = 2;
10239 /* If not optimizing for size, set the default
10240 alignment to what the target wants. */
10241 if (!opts->x_optimize_size)
10243 if (opts->x_align_loops <= 0)
10244 opts->x_align_loops = aarch64_tune_params.loop_align;
10245 if (opts->x_align_jumps <= 0)
10246 opts->x_align_jumps = aarch64_tune_params.jump_align;
10247 if (opts->x_align_functions <= 0)
10248 opts->x_align_functions = aarch64_tune_params.function_align;
10251 /* We default to no pc-relative literal loads. */
10253 aarch64_pcrelative_literal_loads = false;
10255 /* If -mpc-relative-literal-loads is set on the command line, this
10256 implies that the user asked for PC relative literal loads. */
10257 if (opts->x_pcrelative_literal_loads == 1)
10258 aarch64_pcrelative_literal_loads = true;
10260 /* In the tiny memory model it makes no sense to disallow PC relative
10261 literal pool loads. */
10262 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10263 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10264 aarch64_pcrelative_literal_loads = true;
10266 /* When enabling the lower precision Newton series for the square root, also
10267 enable it for the reciprocal square root, since the latter is an
10268 intermediary step for the former. */
10269 if (flag_mlow_precision_sqrt)
10270 flag_mrecip_low_precision_sqrt = true;
10273 /* 'Unpack' the internal tuning structs and update the options
10274 in OPTS. The caller must have set up selected_tune and selected_arch
10275 as all the other target-specific codegen decisions are
10276 derived from them. */
10278 void
10279 aarch64_override_options_internal (struct gcc_options *opts)
10281 aarch64_tune_flags = selected_tune->flags;
10282 aarch64_tune = selected_tune->sched_core;
10283 /* Make a copy of the tuning parameters attached to the core, which
10284 we may later overwrite. */
10285 aarch64_tune_params = *(selected_tune->tune);
10286 aarch64_architecture_version = selected_arch->architecture_version;
10288 if (opts->x_aarch64_override_tune_string)
10289 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10290 &aarch64_tune_params);
10292 /* This target defaults to strict volatile bitfields. */
10293 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10294 opts->x_flag_strict_volatile_bitfields = 1;
10296 initialize_aarch64_code_model (opts);
10297 initialize_aarch64_tls_size (opts);
10299 int queue_depth = 0;
10300 switch (aarch64_tune_params.autoprefetcher_model)
10302 case tune_params::AUTOPREFETCHER_OFF:
10303 queue_depth = -1;
10304 break;
10305 case tune_params::AUTOPREFETCHER_WEAK:
10306 queue_depth = 0;
10307 break;
10308 case tune_params::AUTOPREFETCHER_STRONG:
10309 queue_depth = max_insn_queue_index + 1;
10310 break;
10311 default:
10312 gcc_unreachable ();
10315 /* We don't mind passing in global_options_set here as we don't use
10316 the *options_set structs anyway. */
10317 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10318 queue_depth,
10319 opts->x_param_values,
10320 global_options_set.x_param_values);
10322 /* Set up parameters to be used in prefetching algorithm. Do not
10323 override the defaults unless we are tuning for a core we have
10324 researched values for. */
10325 if (aarch64_tune_params.prefetch->num_slots > 0)
10326 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10327 aarch64_tune_params.prefetch->num_slots,
10328 opts->x_param_values,
10329 global_options_set.x_param_values);
10330 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10331 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10332 aarch64_tune_params.prefetch->l1_cache_size,
10333 opts->x_param_values,
10334 global_options_set.x_param_values);
10335 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10336 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10337 aarch64_tune_params.prefetch->l1_cache_line_size,
10338 opts->x_param_values,
10339 global_options_set.x_param_values);
10340 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10341 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10342 aarch64_tune_params.prefetch->l2_cache_size,
10343 opts->x_param_values,
10344 global_options_set.x_param_values);
10346 /* Use the alternative scheduling-pressure algorithm by default. */
10347 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10348 opts->x_param_values,
10349 global_options_set.x_param_values);
10351 /* Enable software prefetching at the specified optimization level for
10352 CPUs that have prefetch tuning parameters. Lower the optimization level
10353 threshold by 1 when profiling is enabled. */
10354 if (opts->x_flag_prefetch_loop_arrays < 0
10355 && !opts->x_optimize_size
10356 && aarch64_tune_params.prefetch->default_opt_level >= 0
10357 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10358 opts->x_flag_prefetch_loop_arrays = 1;
10360 aarch64_override_options_after_change_1 (opts);
10363 /* Print a hint with a suggestion for a core or architecture name that
10364 most closely resembles what the user passed in STR. ARCH is true if
10365 the user is asking for an architecture name. ARCH is false if the user
10366 is asking for a core name. */
10368 static void
10369 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10371 auto_vec<const char *> candidates;
10372 const struct processor *entry = arch ? all_architectures : all_cores;
10373 for (; entry->name != NULL; entry++)
10374 candidates.safe_push (entry->name);
10375 char *s;
10376 const char *hint = candidates_list_and_hint (str, s, candidates);
10377 if (hint)
10378 inform (input_location, "valid arguments are: %s;"
10379 " did you mean %qs?", s, hint);
10380 XDELETEVEC (s);
10383 /* Print a hint with a suggestion for a core name that most closely resembles
10384 what the user passed in STR. */
10386 inline static void
10387 aarch64_print_hint_for_core (const char *str)
10389 aarch64_print_hint_for_core_or_arch (str, false);
10392 /* Print a hint with a suggestion for an architecture name that most closely
10393 resembles what the user passed in STR. */
10395 inline static void
10396 aarch64_print_hint_for_arch (const char *str)
10398 aarch64_print_hint_for_core_or_arch (str, true);
10401 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10402 specified in STR and throw errors if appropriate. Put the results, if
10403 they are valid, in RES and ISA_FLAGS. Return whether the option is
10404 valid. */
10406 static bool
10407 aarch64_validate_mcpu (const char *str, const struct processor **res,
10408 unsigned long *isa_flags)
10410 enum aarch64_parse_opt_result parse_res
10411 = aarch64_parse_cpu (str, res, isa_flags);
10413 if (parse_res == AARCH64_PARSE_OK)
10414 return true;
10416 switch (parse_res)
10418 case AARCH64_PARSE_MISSING_ARG:
10419 error ("missing cpu name in %<-mcpu=%s%>", str);
10420 break;
10421 case AARCH64_PARSE_INVALID_ARG:
10422 error ("unknown value %qs for -mcpu", str);
10423 aarch64_print_hint_for_core (str);
10424 break;
10425 case AARCH64_PARSE_INVALID_FEATURE:
10426 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10427 break;
10428 default:
10429 gcc_unreachable ();
10432 return false;
10435 /* Validate a command-line -march option. Parse the arch and extensions
10436 (if any) specified in STR and throw errors if appropriate. Put the
10437 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10438 option is valid. */
10440 static bool
10441 aarch64_validate_march (const char *str, const struct processor **res,
10442 unsigned long *isa_flags)
10444 enum aarch64_parse_opt_result parse_res
10445 = aarch64_parse_arch (str, res, isa_flags);
10447 if (parse_res == AARCH64_PARSE_OK)
10448 return true;
10450 switch (parse_res)
10452 case AARCH64_PARSE_MISSING_ARG:
10453 error ("missing arch name in %<-march=%s%>", str);
10454 break;
10455 case AARCH64_PARSE_INVALID_ARG:
10456 error ("unknown value %qs for -march", str);
10457 aarch64_print_hint_for_arch (str);
10458 break;
10459 case AARCH64_PARSE_INVALID_FEATURE:
10460 error ("invalid feature modifier in %<-march=%s%>", str);
10461 break;
10462 default:
10463 gcc_unreachable ();
10466 return false;
10469 /* Validate a command-line -mtune option. Parse the cpu
10470 specified in STR and throw errors if appropriate. Put the
10471 result, if it is valid, in RES. Return whether the option is
10472 valid. */
10474 static bool
10475 aarch64_validate_mtune (const char *str, const struct processor **res)
10477 enum aarch64_parse_opt_result parse_res
10478 = aarch64_parse_tune (str, res);
10480 if (parse_res == AARCH64_PARSE_OK)
10481 return true;
10483 switch (parse_res)
10485 case AARCH64_PARSE_MISSING_ARG:
10486 error ("missing cpu name in %<-mtune=%s%>", str);
10487 break;
10488 case AARCH64_PARSE_INVALID_ARG:
10489 error ("unknown value %qs for -mtune", str);
10490 aarch64_print_hint_for_core (str);
10491 break;
10492 default:
10493 gcc_unreachable ();
10495 return false;
10498 /* Return the CPU corresponding to the enum CPU.
10499 If it doesn't specify a cpu, return the default. */
10501 static const struct processor *
10502 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10504 if (cpu != aarch64_none)
10505 return &all_cores[cpu];
10507 /* The & 0x3f is to extract the bottom 6 bits that encode the
10508 default cpu as selected by the --with-cpu GCC configure option
10509 in config.gcc.
10510 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10511 flags mechanism should be reworked to make it more sane. */
10512 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10515 /* Return the architecture corresponding to the enum ARCH.
10516 If it doesn't specify a valid architecture, return the default. */
10518 static const struct processor *
10519 aarch64_get_arch (enum aarch64_arch arch)
10521 if (arch != aarch64_no_arch)
10522 return &all_architectures[arch];
10524 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10526 return &all_architectures[cpu->arch];
10529 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10531 static poly_uint16
10532 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10534 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10535 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10536 deciding which .md file patterns to use and when deciding whether
10537 something is a legitimate address or constant. */
10538 if (value == SVE_SCALABLE || value == SVE_128)
10539 return poly_uint16 (2, 2);
10540 else
10541 return (int) value / 64;
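/* For illustration only: with the conversion above, -msve-vector-bits=256
   yields a fixed VG of 256 / 64 = 4 and -msve-vector-bits=512 yields 8,
   while both "scalable" and 128 produce the indeterminate poly_uint16 (2, 2),
   i.e. 2 + 2 * x granules for some unknown runtime value x >= 0.  */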
10544 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10545 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10546 tuning structs. In particular it must set selected_tune and
10547 aarch64_isa_flags that define the available ISA features and tuning
10548 decisions. It must also set selected_arch as this will be used to
10549 output the .arch asm tags for each function. */
10551 static void
10552 aarch64_override_options (void)
10554 unsigned long cpu_isa = 0;
10555 unsigned long arch_isa = 0;
10556 aarch64_isa_flags = 0;
10558 bool valid_cpu = true;
10559 bool valid_tune = true;
10560 bool valid_arch = true;
10562 selected_cpu = NULL;
10563 selected_arch = NULL;
10564 selected_tune = NULL;
10566 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10567 If either of -march or -mtune is given, they override their
10568 respective component of -mcpu. */
10569 if (aarch64_cpu_string)
10570 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10571 &cpu_isa);
10573 if (aarch64_arch_string)
10574 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10575 &arch_isa);
10577 if (aarch64_tune_string)
10578 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10580 /* If the user did not specify a processor, choose the default
10581 one for them. This will be the CPU set during configuration using
10582 --with-cpu, otherwise it is "generic". */
10583 if (!selected_cpu)
10585 if (selected_arch)
10587 selected_cpu = &all_cores[selected_arch->ident];
10588 aarch64_isa_flags = arch_isa;
10589 explicit_arch = selected_arch->arch;
10591 else
10593 /* Get default configure-time CPU. */
10594 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10595 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10598 if (selected_tune)
10599 explicit_tune_core = selected_tune->ident;
10601 /* If both -mcpu and -march are specified, check that they are architecturally
10602 compatible; warn if they are not, and prefer the -march ISA flags. */
10603 else if (selected_arch)
10605 if (selected_arch->arch != selected_cpu->arch)
10607 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10608 all_architectures[selected_cpu->arch].name,
10609 selected_arch->name);
10611 aarch64_isa_flags = arch_isa;
10612 explicit_arch = selected_arch->arch;
10613 explicit_tune_core = selected_tune ? selected_tune->ident
10614 : selected_cpu->ident;
10616 else
10618 /* -mcpu but no -march. */
10619 aarch64_isa_flags = cpu_isa;
10620 explicit_tune_core = selected_tune ? selected_tune->ident
10621 : selected_cpu->ident;
10622 gcc_assert (selected_cpu);
10623 selected_arch = &all_architectures[selected_cpu->arch];
10624 explicit_arch = selected_arch->arch;
10627 /* Set the arch as well, since we will need it when outputting
10628 the .arch directive in assembly. */
10629 if (!selected_arch)
10631 gcc_assert (selected_cpu);
10632 selected_arch = &all_architectures[selected_cpu->arch];
10635 if (!selected_tune)
10636 selected_tune = selected_cpu;
10638 #ifndef HAVE_AS_MABI_OPTION
10639 /* The compiler may have been configured with 2.23.* binutils, which does
10640 not have support for ILP32. */
10641 if (TARGET_ILP32)
10642 error ("assembler does not support -mabi=ilp32");
10643 #endif
10645 /* Convert -msve-vector-bits to a VG count. */
10646 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10648 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10649 sorry ("return address signing is only supported for -mabi=lp64");
10651 /* Make sure we properly set up the explicit options. */
10652 if ((aarch64_cpu_string && valid_cpu)
10653 || (aarch64_tune_string && valid_tune))
10654 gcc_assert (explicit_tune_core != aarch64_none);
10656 if ((aarch64_cpu_string && valid_cpu)
10657 || (aarch64_arch_string && valid_arch))
10658 gcc_assert (explicit_arch != aarch64_no_arch);
10660 aarch64_override_options_internal (&global_options);
10662 /* Save these options as the default ones in case we push and pop them later
10663 while processing functions with potential target attributes. */
10664 target_option_default_node = target_option_current_node
10665 = build_target_option_node (&global_options);
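/* For illustration only (cortex-a53 and armv8.2-a are used purely as example
   entries expected in all_cores and all_architectures): a combination such
   as -mcpu=cortex-a53 -march=armv8.2-a keeps cortex-a53 for tuning but takes
   its ISA flags from -march, and triggers the "conflicts with" warning above
   because cortex-a53 implements armv8-a rather than armv8.2-a.  */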
10668 /* Implement targetm.override_options_after_change. */
10670 static void
10671 aarch64_override_options_after_change (void)
10673 aarch64_override_options_after_change_1 (&global_options);
10676 static struct machine_function *
10677 aarch64_init_machine_status (void)
10679 struct machine_function *machine;
10680 machine = ggc_cleared_alloc<machine_function> ();
10681 return machine;
10684 void
10685 aarch64_init_expanders (void)
10687 init_machine_status = aarch64_init_machine_status;
10690 /* Select the code model to use based on OPTS, taking the PIC flags into account. */
10691 static void
10692 initialize_aarch64_code_model (struct gcc_options *opts)
10694 if (opts->x_flag_pic)
10696 switch (opts->x_aarch64_cmodel_var)
10698 case AARCH64_CMODEL_TINY:
10699 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10700 break;
10701 case AARCH64_CMODEL_SMALL:
10702 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10703 aarch64_cmodel = (flag_pic == 2
10704 ? AARCH64_CMODEL_SMALL_PIC
10705 : AARCH64_CMODEL_SMALL_SPIC);
10706 #else
10707 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10708 #endif
10709 break;
10710 case AARCH64_CMODEL_LARGE:
10711 sorry ("code model %qs with -f%s", "large",
10712 opts->x_flag_pic > 1 ? "PIC" : "pic");
10713 break;
10714 default:
10715 gcc_unreachable ();
10718 else
10719 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10722 /* Implement TARGET_OPTION_SAVE. */
10724 static void
10725 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10727 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10730 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10731 using the information saved in PTR. */
10733 static void
10734 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
10736 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
10737 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10738 opts->x_explicit_arch = ptr->x_explicit_arch;
10739 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
10740 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
10742 aarch64_override_options_internal (opts);
10745 /* Implement TARGET_OPTION_PRINT. */
10747 static void
10748 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
10750 const struct processor *cpu
10751 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10752 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
10753 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
10754 std::string extension
10755 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
10757 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
10758 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
10759 arch->name, extension.c_str ());
10762 static GTY(()) tree aarch64_previous_fndecl;
10764 void
10765 aarch64_reset_previous_fndecl (void)
10767 aarch64_previous_fndecl = NULL;
10770 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
10771 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
10772 make sure optab availability predicates are recomputed when necessary. */
10774 void
10775 aarch64_save_restore_target_globals (tree new_tree)
10777 if (TREE_TARGET_GLOBALS (new_tree))
10778 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
10779 else if (new_tree == target_option_default_node)
10780 restore_target_globals (&default_target_globals);
10781 else
10782 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
10785 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
10786 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
10787 of the function, if such exists. This function may be called multiple
10788 times on a single function so use aarch64_previous_fndecl to avoid
10789 setting up identical state. */
10791 static void
10792 aarch64_set_current_function (tree fndecl)
10794 if (!fndecl || fndecl == aarch64_previous_fndecl)
10795 return;
10797 tree old_tree = (aarch64_previous_fndecl
10798 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
10799 : NULL_TREE);
10801 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10803 /* If current function has no attributes but the previous one did,
10804 use the default node. */
10805 if (!new_tree && old_tree)
10806 new_tree = target_option_default_node;
10808 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
10809 the default have been handled by aarch64_save_restore_target_globals from
10810 aarch64_pragma_target_parse. */
10811 if (old_tree == new_tree)
10812 return;
10814 aarch64_previous_fndecl = fndecl;
10816 /* First set the target options. */
10817 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
10819 aarch64_save_restore_target_globals (new_tree);
10822 /* Enum describing the various ways we can handle attributes.
10823 In many cases we can reuse the generic option handling machinery. */
10825 enum aarch64_attr_opt_type
10827 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
10828 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
10829 aarch64_attr_enum, /* Attribute sets an enum variable. */
10830 aarch64_attr_custom /* Attribute requires a custom handling function. */
10833 /* All the information needed to handle a target attribute.
10834 NAME is the name of the attribute.
10835 ATTR_TYPE specifies the type of behavior of the attribute as described
10836 in the definition of enum aarch64_attr_opt_type.
10837 ALLOW_NEG is true if the attribute supports a "no-" form.
10838 HANDLER is the function that takes the attribute string as an argument.
10839 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
10840 OPT_NUM is the enum specifying the option that the attribute modifies.
10841 This is needed for attributes that mirror the behavior of a command-line
10842 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
10843 aarch64_attr_enum. */
10845 struct aarch64_attribute_info
10847 const char *name;
10848 enum aarch64_attr_opt_type attr_type;
10849 bool allow_neg;
10850 bool (*handler) (const char *);
10851 enum opt_code opt_num;
10854 /* Handle the ARCH_STR argument to the arch= target attribute. */
10856 static bool
10857 aarch64_handle_attr_arch (const char *str)
10859 const struct processor *tmp_arch = NULL;
10860 enum aarch64_parse_opt_result parse_res
10861 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
10863 if (parse_res == AARCH64_PARSE_OK)
10865 gcc_assert (tmp_arch);
10866 selected_arch = tmp_arch;
10867 explicit_arch = selected_arch->arch;
10868 return true;
10871 switch (parse_res)
10873 case AARCH64_PARSE_MISSING_ARG:
10874 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
10875 break;
10876 case AARCH64_PARSE_INVALID_ARG:
10877 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
10878 aarch64_print_hint_for_arch (str);
10879 break;
10880 case AARCH64_PARSE_INVALID_FEATURE:
10881 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
10882 break;
10883 default:
10884 gcc_unreachable ();
10887 return false;
10890 /* Handle the argument CPU_STR to the cpu= target attribute. */
10892 static bool
10893 aarch64_handle_attr_cpu (const char *str)
10895 const struct processor *tmp_cpu = NULL;
10896 enum aarch64_parse_opt_result parse_res
10897 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
10899 if (parse_res == AARCH64_PARSE_OK)
10901 gcc_assert (tmp_cpu);
10902 selected_tune = tmp_cpu;
10903 explicit_tune_core = selected_tune->ident;
10905 selected_arch = &all_architectures[tmp_cpu->arch];
10906 explicit_arch = selected_arch->arch;
10907 return true;
10910 switch (parse_res)
10912 case AARCH64_PARSE_MISSING_ARG:
10913 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
10914 break;
10915 case AARCH64_PARSE_INVALID_ARG:
10916 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
10917 aarch64_print_hint_for_core (str);
10918 break;
10919 case AARCH64_PARSE_INVALID_FEATURE:
10920 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
10921 break;
10922 default:
10923 gcc_unreachable ();
10926 return false;
10929 /* Handle the argument STR to the tune= target attribute. */
10931 static bool
10932 aarch64_handle_attr_tune (const char *str)
10934 const struct processor *tmp_tune = NULL;
10935 enum aarch64_parse_opt_result parse_res
10936 = aarch64_parse_tune (str, &tmp_tune);
10938 if (parse_res == AARCH64_PARSE_OK)
10940 gcc_assert (tmp_tune);
10941 selected_tune = tmp_tune;
10942 explicit_tune_core = selected_tune->ident;
10943 return true;
10946 switch (parse_res)
10948 case AARCH64_PARSE_INVALID_ARG:
10949 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
10950 aarch64_print_hint_for_core (str);
10951 break;
10952 default:
10953 gcc_unreachable ();
10956 return false;
10959 /* Parse an architecture extensions target attribute string specified in STR.
10960 For example "+fp+nosimd". Show any errors if needed. Return TRUE
10961 if successful. Update aarch64_isa_flags to reflect the ISA features
10962 modified. */
10964 static bool
10965 aarch64_handle_attr_isa_flags (char *str)
10967 enum aarch64_parse_opt_result parse_res;
10968 unsigned long isa_flags = aarch64_isa_flags;
10970 /* We allow "+nothing" in the beginning to clear out all architectural
10971 features if the user wants to handpick specific features. */
10972 if (strncmp ("+nothing", str, 8) == 0)
10974 isa_flags = 0;
10975 str += 8;
10978 parse_res = aarch64_parse_extension (str, &isa_flags);
10980 if (parse_res == AARCH64_PARSE_OK)
10982 aarch64_isa_flags = isa_flags;
10983 return true;
10986 switch (parse_res)
10988 case AARCH64_PARSE_MISSING_ARG:
10989 error ("missing value in %<target()%> pragma or attribute");
10990 break;
10992 case AARCH64_PARSE_INVALID_FEATURE:
10993 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
10994 break;
10996 default:
10997 gcc_unreachable ();
11000 return false;
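/* For illustration only: with the handler above,

     __attribute__ ((target ("+nothing+fp")))
     void fp_only (void);

   first clears all architectural feature bits because of the leading
   "+nothing" and then re-enables just the "fp" extension (assuming "fp" is a
   recognised modifier, as in the "+fp+nosimd" example above), whereas a
   plain "+nosimd" starts from the current aarch64_isa_flags and only removes
   the SIMD bit.  */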
11003 /* The target attributes that we support. On top of these we also support just
11004 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11005 handled explicitly in aarch64_process_one_target_attr. */
11007 static const struct aarch64_attribute_info aarch64_attributes[] =
11009 { "general-regs-only", aarch64_attr_mask, false, NULL,
11010 OPT_mgeneral_regs_only },
11011 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11012 OPT_mfix_cortex_a53_835769 },
11013 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11014 OPT_mfix_cortex_a53_843419 },
11015 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11016 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11017 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11018 OPT_momit_leaf_frame_pointer },
11019 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11020 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11021 OPT_march_ },
11022 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11023 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11024 OPT_mtune_ },
11025 { "sign-return-address", aarch64_attr_enum, false, NULL,
11026 OPT_msign_return_address_ },
11027 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
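/* For illustration only, some ways the table above can be exercised
   ("trad" is assumed here to be one of the values accepted by -mtls-dialect):

     __attribute__ ((target ("strict-align"))) void f1 (void);
     __attribute__ ((target ("no-omit-leaf-frame-pointer"))) void f2 (void);
     __attribute__ ((target ("tls-dialect=trad"))) void f3 (void);

   "strict-align" is an aarch64_attr_mask entry and simply sets a bit in
   target_flags, "omit-leaf-frame-pointer" allows the negated "no-" form, and
   "tls-dialect" is an aarch64_attr_enum entry whose argument is validated by
   the generic option machinery.  */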
11030 /* Parse ARG_STR which contains the definition of one target attribute.
11031 Show appropriate errors if any or return true if the attribute is valid. */
11033 static bool
11034 aarch64_process_one_target_attr (char *arg_str)
11036 bool invert = false;
11038 size_t len = strlen (arg_str);
11040 if (len == 0)
11042 error ("malformed %<target()%> pragma or attribute");
11043 return false;
11046 char *str_to_check = (char *) alloca (len + 1);
11047 strcpy (str_to_check, arg_str);
11049 /* Skip leading whitespace. */
11050 while (*str_to_check == ' ' || *str_to_check == '\t')
11051 str_to_check++;
11053 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11054 It is easier to detect and handle it explicitly here rather than going
11055 through the machinery for the rest of the target attributes in this
11056 function. */
11057 if (*str_to_check == '+')
11058 return aarch64_handle_attr_isa_flags (str_to_check);
11060 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11062 invert = true;
11063 str_to_check += 3;
11065 char *arg = strchr (str_to_check, '=');
11067 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11068 and point ARG to "foo". */
11069 if (arg)
11071 *arg = '\0';
11072 arg++;
11074 const struct aarch64_attribute_info *p_attr;
11075 bool found = false;
11076 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11078 /* If the names don't match up, or the user has given an argument
11079 to an attribute that doesn't accept one, or didn't give an argument
11080 to an attribute that expects one, fail to match. */
11081 if (strcmp (str_to_check, p_attr->name) != 0)
11082 continue;
11084 found = true;
11085 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11086 || p_attr->attr_type == aarch64_attr_enum;
11088 if (attr_need_arg_p ^ (arg != NULL))
11090 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11091 return false;
11094 /* If the name matches but the attribute does not allow "no-" versions
11095 then we can't match. */
11096 if (invert && !p_attr->allow_neg)
11098 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11099 return false;
11102 switch (p_attr->attr_type)
11104 /* Has a custom handler registered.
11105 For example, cpu=, arch=, tune=. */
11106 case aarch64_attr_custom:
11107 gcc_assert (p_attr->handler);
11108 if (!p_attr->handler (arg))
11109 return false;
11110 break;
11112 /* Either set or unset a boolean option. */
11113 case aarch64_attr_bool:
11115 struct cl_decoded_option decoded;
11117 generate_option (p_attr->opt_num, NULL, !invert,
11118 CL_TARGET, &decoded);
11119 aarch64_handle_option (&global_options, &global_options_set,
11120 &decoded, input_location);
11121 break;
11123 /* Set or unset a bit in the target_flags. aarch64_handle_option
11124 should know what mask to apply given the option number. */
11125 case aarch64_attr_mask:
11127 struct cl_decoded_option decoded;
11128 /* We only need to specify the option number.
11129 aarch64_handle_option will know which mask to apply. */
11130 decoded.opt_index = p_attr->opt_num;
11131 decoded.value = !invert;
11132 aarch64_handle_option (&global_options, &global_options_set,
11133 &decoded, input_location);
11134 break;
11136 /* Use the option setting machinery to set an option to an enum. */
11137 case aarch64_attr_enum:
11139 gcc_assert (arg);
11140 bool valid;
11141 int value;
11142 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11143 &value, CL_TARGET);
11144 if (valid)
11146 set_option (&global_options, NULL, p_attr->opt_num, value,
11147 NULL, DK_UNSPECIFIED, input_location,
11148 global_dc);
11150 else
11152 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11154 break;
11156 default:
11157 gcc_unreachable ();
11161 /* If we reached here we either have found an attribute and validated
11162 it or didn't match any. If we matched an attribute but its arguments
11163 were malformed we will have returned false already. */
11164 return found;
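/* For illustration only: tracing the function above with the string
   "no-fix-cortex-a53-835769", the leading "no-" sets INVERT, the remainder
   matches the aarch64_attr_bool entry for that erratum workaround (which has
   allow_neg set), and generate_option is called with !invert so the
   workaround is disabled for the function.  A string starting with '+', such
   as "+crc", bypasses the table entirely and is treated as an ISA-flags
   update via aarch64_handle_attr_isa_flags.  */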
11167 /* Count how many times the character C appears in
11168 NULL-terminated string STR. */
11170 static unsigned int
11171 num_occurences_in_str (char c, char *str)
11173 unsigned int res = 0;
11174 while (*str != '\0')
11176 if (*str == c)
11177 res++;
11179 str++;
11182 return res;
11185 /* Parse the tree in ARGS that contains the target attribute information
11186 and update the global target options space. */
11188 bool
11189 aarch64_process_target_attr (tree args)
11191 if (TREE_CODE (args) == TREE_LIST)
11195 tree head = TREE_VALUE (args);
11196 if (head)
11198 if (!aarch64_process_target_attr (head))
11199 return false;
11201 args = TREE_CHAIN (args);
11202 } while (args);
11204 return true;
11207 if (TREE_CODE (args) != STRING_CST)
11209 error ("attribute %<target%> argument not a string");
11210 return false;
11213 size_t len = strlen (TREE_STRING_POINTER (args));
11214 char *str_to_check = (char *) alloca (len + 1);
11215 strcpy (str_to_check, TREE_STRING_POINTER (args));
11217 if (len == 0)
11219 error ("malformed %<target()%> pragma or attribute");
11220 return false;
11223 /* Used to catch empty strings between commas, i.e.
11224 attribute ((target ("attr1,,attr2"))). */
11225 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11227 /* Handle multiple target attributes separated by ','. */
11228 char *token = strtok (str_to_check, ",");
11230 unsigned int num_attrs = 0;
11231 while (token)
11233 num_attrs++;
11234 if (!aarch64_process_one_target_attr (token))
11236 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11237 return false;
11240 token = strtok (NULL, ",");
11243 if (num_attrs != num_commas + 1)
11245 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11246 return false;
11249 return true;
11252 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11253 process attribute ((target ("..."))). */
11255 static bool
11256 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11258 struct cl_target_option cur_target;
11259 bool ret;
11260 tree old_optimize;
11261 tree new_target, new_optimize;
11262 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11264 /* If what we're processing is the current pragma string then the
11265 target option node is already stored in target_option_current_node
11266 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11267 having to re-parse the string. This is especially useful to keep
11268 arm_neon.h compile times down since that header contains a lot
11269 of intrinsics enclosed in pragmas. */
11270 if (!existing_target && args == current_target_pragma)
11272 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11273 return true;
11275 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11277 old_optimize = build_optimization_node (&global_options);
11278 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11280 /* If the function changed the optimization levels as well as setting
11281 target options, start with the optimizations specified. */
11282 if (func_optimize && func_optimize != old_optimize)
11283 cl_optimization_restore (&global_options,
11284 TREE_OPTIMIZATION (func_optimize));
11286 /* Save the current target options to restore at the end. */
11287 cl_target_option_save (&cur_target, &global_options);
11289 /* If fndecl already has some target attributes applied to it, unpack
11290 them so that we add this attribute on top of them, rather than
11291 overwriting them. */
11292 if (existing_target)
11294 struct cl_target_option *existing_options
11295 = TREE_TARGET_OPTION (existing_target);
11297 if (existing_options)
11298 cl_target_option_restore (&global_options, existing_options);
11300 else
11301 cl_target_option_restore (&global_options,
11302 TREE_TARGET_OPTION (target_option_current_node));
11304 ret = aarch64_process_target_attr (args);
11306 /* Set up any additional state. */
11307 if (ret)
11309 aarch64_override_options_internal (&global_options);
11310 /* Initialize SIMD builtins if we haven't already.
11311 Set current_target_pragma to NULL for the duration so that
11312 the builtin initialization code doesn't try to tag the functions
11313 being built with the attributes specified by any current pragma, thus
11314 going into an infinite recursion. */
11315 if (TARGET_SIMD)
11317 tree saved_current_target_pragma = current_target_pragma;
11318 current_target_pragma = NULL;
11319 aarch64_init_simd_builtins ();
11320 current_target_pragma = saved_current_target_pragma;
11322 new_target = build_target_option_node (&global_options);
11324 else
11325 new_target = NULL;
11327 new_optimize = build_optimization_node (&global_options);
11329 if (fndecl && ret)
11331 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11333 if (old_optimize != new_optimize)
11334 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11337 cl_target_option_restore (&global_options, &cur_target);
11339 if (old_optimize != new_optimize)
11340 cl_optimization_restore (&global_options,
11341 TREE_OPTIMIZATION (old_optimize));
11342 return ret;
11345 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11346 tri-bool options (yes, no, don't care) and the default value is
11347 DEF, determine whether to reject inlining. */
11349 static bool
11350 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11351 int dont_care, int def)
11353 /* If the callee doesn't care, always allow inlining. */
11354 if (callee == dont_care)
11355 return true;
11357 /* If the caller doesn't care, always allow inlining. */
11358 if (caller == dont_care)
11359 return true;
11361 /* Otherwise, allow inlining if either the callee and caller values
11362 agree, or if the callee is using the default value. */
11363 return (callee == caller || callee == def);
11366 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11367 to inline CALLEE into CALLER based on target-specific info.
11368 Make sure that the caller and callee have compatible architectural
11369 features. Then go through the other possible target attributes
11370 and see if they can block inlining. Try not to reject always_inline
11371 callees unless they are incompatible architecturally. */
11373 static bool
11374 aarch64_can_inline_p (tree caller, tree callee)
11376 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11377 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11379 /* If callee has no option attributes, then it is ok to inline. */
11380 if (!callee_tree)
11381 return true;
11383 struct cl_target_option *caller_opts
11384 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11385 : target_option_default_node);
11387 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11390 /* Callee's ISA flags should be a subset of the caller's. */
11391 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11392 != callee_opts->x_aarch64_isa_flags)
11393 return false;
11395 /* Allow non-strict aligned functions inlining into strict
11396 aligned ones. */
11397 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11398 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11399 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11400 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11401 return false;
11403 bool always_inline = lookup_attribute ("always_inline",
11404 DECL_ATTRIBUTES (callee));
11406 /* If the architectural features match up and the callee is always_inline
11407 then the other attributes don't matter. */
11408 if (always_inline)
11409 return true;
11411 if (caller_opts->x_aarch64_cmodel_var
11412 != callee_opts->x_aarch64_cmodel_var)
11413 return false;
11415 if (caller_opts->x_aarch64_tls_dialect
11416 != callee_opts->x_aarch64_tls_dialect)
11417 return false;
11419 /* Honour explicit requests to workaround errata. */
11420 if (!aarch64_tribools_ok_for_inlining_p (
11421 caller_opts->x_aarch64_fix_a53_err835769,
11422 callee_opts->x_aarch64_fix_a53_err835769,
11423 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11424 return false;
11426 if (!aarch64_tribools_ok_for_inlining_p (
11427 caller_opts->x_aarch64_fix_a53_err843419,
11428 callee_opts->x_aarch64_fix_a53_err843419,
11429 2, TARGET_FIX_ERR_A53_843419))
11430 return false;
11432 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11433 caller and callee and they don't match up, reject inlining. */
11434 if (!aarch64_tribools_ok_for_inlining_p (
11435 caller_opts->x_flag_omit_leaf_frame_pointer,
11436 callee_opts->x_flag_omit_leaf_frame_pointer,
11437 2, 1))
11438 return false;
11440 /* If the callee has specific tuning overrides, respect them. */
11441 if (callee_opts->x_aarch64_override_tune_string != NULL
11442 && caller_opts->x_aarch64_override_tune_string == NULL)
11443 return false;
11445 /* If the user specified tuning override strings for the
11446 caller and callee and they don't match up, reject inlining.
11447 We just do a string compare here, we don't analyze the meaning
11448 of the string, as it would be too costly for little gain. */
11449 if (callee_opts->x_aarch64_override_tune_string
11450 && caller_opts->x_aarch64_override_tune_string
11451 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11452 caller_opts->x_aarch64_override_tune_string) != 0))
11453 return false;
11455 return true;
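/* For illustration only: a consequence of the checks above is that a callee
   carrying extra ISA bits, e.g.

     __attribute__ ((target ("+crc"))) int crc_helper (int x);

   cannot be inlined into a caller built without the CRC flag, because the
   callee's ISA flags are no longer a subset of the caller's; conversely a
   callee compiled without strict alignment may be inlined into a
   -mstrict-align caller, but not the other way round.  */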
11458 /* Return true if SYMBOL_REF X binds locally. */
11460 static bool
11461 aarch64_symbol_binds_local_p (const_rtx x)
11463 return (SYMBOL_REF_DECL (x)
11464 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11465 : SYMBOL_REF_LOCAL_P (x));
11468 /* Return true if SYMBOL_REF X is thread-local. */
11469 static bool
11470 aarch64_tls_symbol_p (rtx x)
11472 if (! TARGET_HAVE_TLS)
11473 return false;
11475 if (GET_CODE (x) != SYMBOL_REF)
11476 return false;
11478 return SYMBOL_REF_TLS_MODEL (x) != 0;
11481 /* Classify a TLS symbol into one of the TLS kinds. */
11482 enum aarch64_symbol_type
11483 aarch64_classify_tls_symbol (rtx x)
11485 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11487 switch (tls_kind)
11489 case TLS_MODEL_GLOBAL_DYNAMIC:
11490 case TLS_MODEL_LOCAL_DYNAMIC:
11491 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11493 case TLS_MODEL_INITIAL_EXEC:
11494 switch (aarch64_cmodel)
11496 case AARCH64_CMODEL_TINY:
11497 case AARCH64_CMODEL_TINY_PIC:
11498 return SYMBOL_TINY_TLSIE;
11499 default:
11500 return SYMBOL_SMALL_TLSIE;
11503 case TLS_MODEL_LOCAL_EXEC:
11504 if (aarch64_tls_size == 12)
11505 return SYMBOL_TLSLE12;
11506 else if (aarch64_tls_size == 24)
11507 return SYMBOL_TLSLE24;
11508 else if (aarch64_tls_size == 32)
11509 return SYMBOL_TLSLE32;
11510 else if (aarch64_tls_size == 48)
11511 return SYMBOL_TLSLE48;
11512 else
11513 gcc_unreachable ();
11515 case TLS_MODEL_EMULATED:
11516 case TLS_MODEL_NONE:
11517 return SYMBOL_FORCE_TO_MEM;
11519 default:
11520 gcc_unreachable ();
11524 /* Return the correct method for accessing X + OFFSET, where X is either
11525 a SYMBOL_REF or LABEL_REF. */
11527 enum aarch64_symbol_type
11528 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11530 if (GET_CODE (x) == LABEL_REF)
11532 switch (aarch64_cmodel)
11534 case AARCH64_CMODEL_LARGE:
11535 return SYMBOL_FORCE_TO_MEM;
11537 case AARCH64_CMODEL_TINY_PIC:
11538 case AARCH64_CMODEL_TINY:
11539 return SYMBOL_TINY_ABSOLUTE;
11541 case AARCH64_CMODEL_SMALL_SPIC:
11542 case AARCH64_CMODEL_SMALL_PIC:
11543 case AARCH64_CMODEL_SMALL:
11544 return SYMBOL_SMALL_ABSOLUTE;
11546 default:
11547 gcc_unreachable ();
11551 if (GET_CODE (x) == SYMBOL_REF)
11553 if (aarch64_tls_symbol_p (x))
11554 return aarch64_classify_tls_symbol (x);
11556 switch (aarch64_cmodel)
11558 case AARCH64_CMODEL_TINY:
11559 /* When we retrieve symbol + offset address, we have to make sure
11560 the offset does not cause overflow of the final address. But
11561 we have no way of knowing the address of symbol at compile time
11562 so we can't accurately say if the distance between the PC and
11563 symbol + offset is outside the addressable range of +/-1M in the
11564 TINY code model. So we rely on images not being greater than
11565 1M, cap the offset at 1M, and anything beyond 1M will have to
11566 be loaded using an alternative mechanism. Furthermore, if the
11567 symbol is a weak reference to something that isn't known to
11568 resolve to a symbol in this module, then force to memory. */
11569 if ((SYMBOL_REF_WEAK (x)
11570 && !aarch64_symbol_binds_local_p (x))
11571 || !IN_RANGE (offset, -1048575, 1048575))
11572 return SYMBOL_FORCE_TO_MEM;
11573 return SYMBOL_TINY_ABSOLUTE;
11575 case AARCH64_CMODEL_SMALL:
11576 /* Same reasoning as the tiny code model, but the offset cap here is
11577 4G. */
11578 if ((SYMBOL_REF_WEAK (x)
11579 && !aarch64_symbol_binds_local_p (x))
11580 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11581 HOST_WIDE_INT_C (4294967264)))
11582 return SYMBOL_FORCE_TO_MEM;
11583 return SYMBOL_SMALL_ABSOLUTE;
11585 case AARCH64_CMODEL_TINY_PIC:
11586 if (!aarch64_symbol_binds_local_p (x))
11587 return SYMBOL_TINY_GOT;
11588 return SYMBOL_TINY_ABSOLUTE;
11590 case AARCH64_CMODEL_SMALL_SPIC:
11591 case AARCH64_CMODEL_SMALL_PIC:
11592 if (!aarch64_symbol_binds_local_p (x))
11593 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11594 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11595 return SYMBOL_SMALL_ABSOLUTE;
11597 case AARCH64_CMODEL_LARGE:
11598 /* This is alright even in PIC code as the constant
11599 pool reference is always PC relative and within
11600 the same translation unit. */
11601 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11602 return SYMBOL_SMALL_ABSOLUTE;
11603 else
11604 return SYMBOL_FORCE_TO_MEM;
11606 default:
11607 gcc_unreachable ();
11611 /* By default push everything into the constant pool. */
11612 return SYMBOL_FORCE_TO_MEM;
11615 bool
11616 aarch64_constant_address_p (rtx x)
11618 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11621 bool
11622 aarch64_legitimate_pic_operand_p (rtx x)
11624 if (GET_CODE (x) == SYMBOL_REF
11625 || (GET_CODE (x) == CONST
11626 && GET_CODE (XEXP (x, 0)) == PLUS
11627 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11628 return false;
11630 return true;
11633 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11634 that should be rematerialized rather than spilled. */
11636 static bool
11637 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11639 /* Support CSE and rematerialization of common constants. */
11640 if (CONST_INT_P (x) || CONST_DOUBLE_P (x) || GET_CODE (x) == CONST_VECTOR)
11641 return true;
11643 /* Do not allow vector struct mode constants for Advanced SIMD.
11644 We could support 0 and -1 easily, but they need support in
11645 aarch64-simd.md. */
11646 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11647 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11648 return false;
11650 /* Do not allow wide int constants - this requires support in movti. */
11651 if (CONST_WIDE_INT_P (x))
11652 return false;
11654 /* Only accept variable-length vector constants if they can be
11655 handled directly.
11657 ??? It would be possible to handle rematerialization of other
11658 constants via secondary reloads. */
11659 if (vec_flags & VEC_ANY_SVE)
11660 return aarch64_simd_valid_immediate (x, NULL);
11662 if (GET_CODE (x) == HIGH)
11663 x = XEXP (x, 0);
11665 /* Accept polynomial constants that can be calculated by using the
11666 destination of a move as the sole temporary. Constants that
11667 require a second temporary cannot be rematerialized (they can't be
11668 forced to memory and also aren't legitimate constants). */
11669 poly_int64 offset;
11670 if (poly_int_rtx_p (x, &offset))
11671 return aarch64_offset_temporaries (false, offset) <= 1;
11673 /* If an offset is being added to something else, we need to allow the
11674 base to be moved into the destination register, meaning that there
11675 are no free temporaries for the offset. */
11676 x = strip_offset (x, &offset);
11677 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11678 return false;
11680 /* Do not allow const (plus (anchor_symbol, const_int)). */
11681 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11682 return false;
11684 /* Treat symbols as constants.  Avoid TLS symbols, as they are complex;
11685 spilling them is better than rematerializing them.  */
11686 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11687 return true;
11689 /* Label references are always constant. */
11690 if (GET_CODE (x) == LABEL_REF)
11691 return true;
11693 return false;
11697 aarch64_load_tp (rtx target)
11699 if (!target
11700 || GET_MODE (target) != Pmode
11701 || !register_operand (target, Pmode))
11702 target = gen_reg_rtx (Pmode);
11704 /* Can return in any reg. */
11705 emit_insn (gen_aarch64_load_tp_hard (target));
11706 return target;
11709 /* On AAPCS systems, this is the "struct __va_list". */
11710 static GTY(()) tree va_list_type;
11712 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11713 Return the type to use as __builtin_va_list.
11715 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11717 struct __va_list
11719 void *__stack;
11720 void *__gr_top;
11721 void *__vr_top;
11722 int __gr_offs;
11723 int __vr_offs;
11724 }; */
11726 static tree
11727 aarch64_build_builtin_va_list (void)
11729 tree va_list_name;
11730 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11732 /* Create the type. */
11733 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11734 /* Give it the required name. */
11735 va_list_name = build_decl (BUILTINS_LOCATION,
11736 TYPE_DECL,
11737 get_identifier ("__va_list"),
11738 va_list_type);
11739 DECL_ARTIFICIAL (va_list_name) = 1;
11740 TYPE_NAME (va_list_type) = va_list_name;
11741 TYPE_STUB_DECL (va_list_type) = va_list_name;
11743 /* Create the fields. */
11744 f_stack = build_decl (BUILTINS_LOCATION,
11745 FIELD_DECL, get_identifier ("__stack"),
11746 ptr_type_node);
11747 f_grtop = build_decl (BUILTINS_LOCATION,
11748 FIELD_DECL, get_identifier ("__gr_top"),
11749 ptr_type_node);
11750 f_vrtop = build_decl (BUILTINS_LOCATION,
11751 FIELD_DECL, get_identifier ("__vr_top"),
11752 ptr_type_node);
11753 f_groff = build_decl (BUILTINS_LOCATION,
11754 FIELD_DECL, get_identifier ("__gr_offs"),
11755 integer_type_node);
11756 f_vroff = build_decl (BUILTINS_LOCATION,
11757 FIELD_DECL, get_identifier ("__vr_offs"),
11758 integer_type_node);
11760 /* Tell the tree-stdarg pass about our internal offset fields.
11761 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
11762 purposes, to identify whether the code is updating the va_list internal
11763 offset fields in an irregular way.  */
11764 va_list_gpr_counter_field = f_groff;
11765 va_list_fpr_counter_field = f_vroff;
11767 DECL_ARTIFICIAL (f_stack) = 1;
11768 DECL_ARTIFICIAL (f_grtop) = 1;
11769 DECL_ARTIFICIAL (f_vrtop) = 1;
11770 DECL_ARTIFICIAL (f_groff) = 1;
11771 DECL_ARTIFICIAL (f_vroff) = 1;
11773 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
11774 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
11775 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
11776 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
11777 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
11779 TYPE_FIELDS (va_list_type) = f_stack;
11780 DECL_CHAIN (f_stack) = f_grtop;
11781 DECL_CHAIN (f_grtop) = f_vrtop;
11782 DECL_CHAIN (f_vrtop) = f_groff;
11783 DECL_CHAIN (f_groff) = f_vroff;
11785 /* Compute its layout. */
11786 layout_type (va_list_type);
11788 return va_list_type;
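/* Illustrative note: for the LP64 ABI, the record built above is laid out as

     offset 0    void *__stack;
     offset 8    void *__gr_top;
     offset 16   void *__vr_top;
     offset 24   int   __gr_offs;
     offset 28   int   __vr_offs;

   giving sizeof (__va_list) == 32 with 8-byte alignment.  */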
11791 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
11792 static void
11793 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
11795 const CUMULATIVE_ARGS *cum;
11796 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11797 tree stack, grtop, vrtop, groff, vroff;
11798 tree t;
11799 int gr_save_area_size = cfun->va_list_gpr_size;
11800 int vr_save_area_size = cfun->va_list_fpr_size;
11801 int vr_offset;
11803 cum = &crtl->args.info;
11804 if (cfun->va_list_gpr_size)
11805 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
11806 cfun->va_list_gpr_size);
11807 if (cfun->va_list_fpr_size)
11808 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
11809 * UNITS_PER_VREG, cfun->va_list_fpr_size);
11811 if (!TARGET_FLOAT)
11813 gcc_assert (cum->aapcs_nvrn == 0);
11814 vr_save_area_size = 0;
11817 f_stack = TYPE_FIELDS (va_list_type_node);
11818 f_grtop = DECL_CHAIN (f_stack);
11819 f_vrtop = DECL_CHAIN (f_grtop);
11820 f_groff = DECL_CHAIN (f_vrtop);
11821 f_vroff = DECL_CHAIN (f_groff);
11823 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
11824 NULL_TREE);
11825 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
11826 NULL_TREE);
11827 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
11828 NULL_TREE);
11829 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
11830 NULL_TREE);
11831 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
11832 NULL_TREE);
11834 /* Emit code to initialize STACK, which points to the next varargs stack
11835 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
11836 by named arguments. STACK is 8-byte aligned. */
11837 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
11838 if (cum->aapcs_stack_size > 0)
11839 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
11840 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
11841 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11843 /* Emit code to initialize GRTOP, the top of the GR save area.
11844 virtual_incoming_args_rtx should have been 16 byte aligned. */
11845 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
11846 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
11847 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11849 /* Emit code to initialize VRTOP, the top of the VR save area.
11850 This address is gr_save_area_bytes below GRTOP, rounded
11851 down to the next 16-byte boundary. */
11852 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
11853 vr_offset = ROUND_UP (gr_save_area_size,
11854 STACK_BOUNDARY / BITS_PER_UNIT);
11856 if (vr_offset)
11857 t = fold_build_pointer_plus_hwi (t, -vr_offset);
11858 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
11859 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11861 /* Emit code to initialize GROFF, the offset from GRTOP of the
11862 next GPR argument. */
11863 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
11864 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
11865 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11867 /* Likewise emit code to initialize VROFF, the offset from VRTOP
11868 of the next VR argument.  */
11869 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
11870 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
11871 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
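/* A rough C-level sketch of the initialization emitted above (illustrative
   only, using the local variable names of this function):

     ap.__stack   = incoming_args + cum->aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size,
					      STACK_BOUNDARY / BITS_PER_UNIT);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;

   where incoming_args stands for virtual_incoming_args_rtx.  */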
11874 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
11876 static tree
11877 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
11878 gimple_seq *post_p ATTRIBUTE_UNUSED)
11880 tree addr;
11881 bool indirect_p;
11882 bool is_ha; /* is HFA or HVA. */
11883 bool dw_align; /* double-word align. */
11884 machine_mode ag_mode = VOIDmode;
11885 int nregs;
11886 machine_mode mode;
11888 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11889 tree stack, f_top, f_off, off, arg, roundup, on_stack;
11890 HOST_WIDE_INT size, rsize, adjust, align;
11891 tree t, u, cond1, cond2;
11893 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
11894 if (indirect_p)
11895 type = build_pointer_type (type);
11897 mode = TYPE_MODE (type);
11899 f_stack = TYPE_FIELDS (va_list_type_node);
11900 f_grtop = DECL_CHAIN (f_stack);
11901 f_vrtop = DECL_CHAIN (f_grtop);
11902 f_groff = DECL_CHAIN (f_vrtop);
11903 f_vroff = DECL_CHAIN (f_groff);
11905 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
11906 f_stack, NULL_TREE);
11907 size = int_size_in_bytes (type);
11908 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
11910 dw_align = false;
11911 adjust = 0;
11912 if (aarch64_vfp_is_call_or_return_candidate (mode,
11913 type,
11914 &ag_mode,
11915 &nregs,
11916 &is_ha))
11918 /* No frontends can create types with variable-sized modes, so we
11919 shouldn't be asked to pass or return them. */
11920 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
11922 /* TYPE passed in fp/simd registers. */
11923 if (!TARGET_FLOAT)
11924 aarch64_err_no_fpadvsimd (mode, "varargs");
11926 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
11927 unshare_expr (valist), f_vrtop, NULL_TREE);
11928 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
11929 unshare_expr (valist), f_vroff, NULL_TREE);
11931 rsize = nregs * UNITS_PER_VREG;
11933 if (is_ha)
11935 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
11936 adjust = UNITS_PER_VREG - ag_size;
11938 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
11939 && size < UNITS_PER_VREG)
11941 adjust = UNITS_PER_VREG - size;
11944 else
11946 /* TYPE passed in general registers. */
11947 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
11948 unshare_expr (valist), f_grtop, NULL_TREE);
11949 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
11950 unshare_expr (valist), f_groff, NULL_TREE);
11951 rsize = ROUND_UP (size, UNITS_PER_WORD);
11952 nregs = rsize / UNITS_PER_WORD;
11954 if (align > 8)
11955 dw_align = true;
11957 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
11958 && size < UNITS_PER_WORD)
11960 adjust = UNITS_PER_WORD - size;
11964 /* Get a local temporary for the field value. */
11965 off = get_initialized_tmp_var (f_off, pre_p, NULL);
11967 /* Emit code to branch if off >= 0. */
11968 t = build2 (GE_EXPR, boolean_type_node, off,
11969 build_int_cst (TREE_TYPE (off), 0));
11970 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
11972 if (dw_align)
11974 /* Emit: offs = (offs + 15) & -16. */
11975 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
11976 build_int_cst (TREE_TYPE (off), 15));
11977 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
11978 build_int_cst (TREE_TYPE (off), -16));
11979 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
11981 else
11982 roundup = NULL;
11984 /* Update ap.__[g|v]r_offs */
11985 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
11986 build_int_cst (TREE_TYPE (off), rsize));
11987 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
11989 /* String up. */
11990 if (roundup)
11991 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
11993 /* [cond2] if (ap.__[g|v]r_offs > 0) */
11994 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
11995 build_int_cst (TREE_TYPE (f_off), 0));
11996 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
11998 /* String up: make sure the assignment happens before the use. */
11999 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12000 COND_EXPR_ELSE (cond1) = t;
12002 /* Prepare the trees handling the argument that is passed on the stack;
12003 the top level node will store in ON_STACK. */
12004 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12005 if (align > 8)
12007 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12008 t = fold_convert (intDI_type_node, arg);
12009 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12010 build_int_cst (TREE_TYPE (t), 15));
12011 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12012 build_int_cst (TREE_TYPE (t), -16));
12013 t = fold_convert (TREE_TYPE (arg), t);
12014 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12016 else
12017 roundup = NULL;
12018 /* Advance ap.__stack */
12019 t = fold_convert (intDI_type_node, arg);
12020 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12021 build_int_cst (TREE_TYPE (t), size + 7));
12022 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12023 build_int_cst (TREE_TYPE (t), -8));
12024 t = fold_convert (TREE_TYPE (arg), t);
12025 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12026 /* String up roundup and advance. */
12027 if (roundup)
12028 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12029 /* String up with arg */
12030 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12031 /* Big-endianness related address adjustment. */
12032 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12033 && size < UNITS_PER_WORD)
12035 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12036 size_int (UNITS_PER_WORD - size));
12037 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12040 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12041 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12043 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12044 t = off;
12045 if (adjust)
12046 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12047 build_int_cst (TREE_TYPE (off), adjust));
12049 t = fold_convert (sizetype, t);
12050 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12052 if (is_ha)
12054 /* type ha; // treat as "struct {ftype field[n];}"
12055 ... [computing offs]
12056 for (i = 0; i <nregs; ++i, offs += 16)
12057 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12058 return ha; */
12059 int i;
12060 tree tmp_ha, field_t, field_ptr_t;
12062 /* Declare a local variable. */
12063 tmp_ha = create_tmp_var_raw (type, "ha");
12064 gimple_add_tmp_var (tmp_ha);
12066 /* Establish the base type. */
12067 switch (ag_mode)
12069 case E_SFmode:
12070 field_t = float_type_node;
12071 field_ptr_t = float_ptr_type_node;
12072 break;
12073 case E_DFmode:
12074 field_t = double_type_node;
12075 field_ptr_t = double_ptr_type_node;
12076 break;
12077 case E_TFmode:
12078 field_t = long_double_type_node;
12079 field_ptr_t = long_double_ptr_type_node;
12080 break;
12081 case E_HFmode:
12082 field_t = aarch64_fp16_type_node;
12083 field_ptr_t = aarch64_fp16_ptr_type_node;
12084 break;
12085 case E_V2SImode:
12086 case E_V4SImode:
12088 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12089 field_t = build_vector_type_for_mode (innertype, ag_mode);
12090 field_ptr_t = build_pointer_type (field_t);
12092 break;
12093 default:
12094 gcc_assert (0);
12097 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12098 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12099 addr = t;
12100 t = fold_convert (field_ptr_t, addr);
12101 t = build2 (MODIFY_EXPR, field_t,
12102 build1 (INDIRECT_REF, field_t, tmp_ha),
12103 build1 (INDIRECT_REF, field_t, t));
12105 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12106 for (i = 1; i < nregs; ++i)
12108 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12109 u = fold_convert (field_ptr_t, addr);
12110 u = build2 (MODIFY_EXPR, field_t,
12111 build2 (MEM_REF, field_t, tmp_ha,
12112 build_int_cst (field_ptr_t,
12113 (i *
12114 int_size_in_bytes (field_t)))),
12115 build1 (INDIRECT_REF, field_t, u));
12116 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12119 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12120 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12123 COND_EXPR_ELSE (cond2) = t;
12124 addr = fold_convert (build_pointer_type (type), cond1);
12125 addr = build_va_arg_indirect_ref (addr);
12127 if (indirect_p)
12128 addr = build_va_arg_indirect_ref (addr);
12130 return addr;
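/* A rough sketch of the va_arg sequence built above, for an argument that
   would go in general registers (illustrative pseudo-C only; the FP/SIMD
   case uses __vr_top/__vr_offs and 16-byte register slots instead):

     off = ap.__gr_offs;
     if (off >= 0)
       goto on_stack;                   // register save area already used up
     if (alignof (type) > 8)
       off = (off + 15) & -16;
     ap.__gr_offs = off + rsize;
     if (ap.__gr_offs > 0)
       goto on_stack;                   // this argument did not fit
     addr = ap.__gr_top + off;          // plus any big-endian padding adjust
     goto done;

   on_stack:
     arg = ap.__stack;
     if (alignof (type) > 8)
       arg = (arg + 15) & -16;
     ap.__stack = (arg + size + 7) & -8;
     addr = arg;                        // plus any big-endian padding adjust
   done:
     return addr;                       // dereferenced again if indirect_p  */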
12133 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12135 static void
12136 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12137 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12138 int no_rtl)
12140 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12141 CUMULATIVE_ARGS local_cum;
12142 int gr_saved = cfun->va_list_gpr_size;
12143 int vr_saved = cfun->va_list_fpr_size;
12145 /* The caller has advanced CUM up to, but not beyond, the last named
12146 argument. Advance a local copy of CUM past the last "real" named
12147 argument, to find out how many registers are left over. */
12148 local_cum = *cum;
12149 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12151 /* Find out how many registers we need to save.
12152 Honor the tree-stdarg analysis results.  */
12153 if (cfun->va_list_gpr_size)
12154 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12155 cfun->va_list_gpr_size / UNITS_PER_WORD);
12156 if (cfun->va_list_fpr_size)
12157 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12158 cfun->va_list_fpr_size / UNITS_PER_VREG);
12160 if (!TARGET_FLOAT)
12162 gcc_assert (local_cum.aapcs_nvrn == 0);
12163 vr_saved = 0;
12166 if (!no_rtl)
12168 if (gr_saved > 0)
12170 rtx ptr, mem;
12172 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12173 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12174 - gr_saved * UNITS_PER_WORD);
12175 mem = gen_frame_mem (BLKmode, ptr);
12176 set_mem_alias_set (mem, get_varargs_alias_set ());
12178 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12179 mem, gr_saved);
12181 if (vr_saved > 0)
12183 /* We can't use move_block_from_reg, because it will use
12184 the wrong mode, storing D regs only. */
12185 machine_mode mode = TImode;
12186 int off, i, vr_start;
12188 /* Set OFF to the offset from virtual_incoming_args_rtx of
12189 the first vector register. The VR save area lies below
12190 the GR one, and is aligned to 16 bytes. */
12191 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12192 STACK_BOUNDARY / BITS_PER_UNIT);
12193 off -= vr_saved * UNITS_PER_VREG;
12195 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12196 for (i = 0; i < vr_saved; ++i)
12198 rtx ptr, mem;
12200 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12201 mem = gen_frame_mem (mode, ptr);
12202 set_mem_alias_set (mem, get_varargs_alias_set ());
12203 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12204 off += UNITS_PER_VREG;
12209 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12210 any complication of having crtl->args.pretend_args_size changed. */
12211 cfun->machine->frame.saved_varargs_size
12212 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12213 STACK_BOUNDARY / BITS_PER_UNIT)
12214 + vr_saved * UNITS_PER_VREG);
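/* Illustrative sketch of the resulting save-area layout, relative to
   virtual_incoming_args_rtx (higher addresses at the top):

       | stack-passed varargs ...    |
       +-----------------------------+  <- virtual_incoming_args_rtx (__gr_top)
       | x(ncrn) ... x7              |  gr_saved * 8 bytes
       +-----------------------------+  <- 16-byte boundary, with any padding
       | q(nvrn) ... q7              |     needed to reach it (__vr_top)
       +-----------------------------+  vr_saved * 16 bytes

   saved_varargs_size records the total size of both areas.  */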
12217 static void
12218 aarch64_conditional_register_usage (void)
12220 int i;
12221 if (!TARGET_FLOAT)
12223 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12225 fixed_regs[i] = 1;
12226 call_used_regs[i] = 1;
12229 if (!TARGET_SVE)
12230 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12232 fixed_regs[i] = 1;
12233 call_used_regs[i] = 1;
12237 /* Walk down the type tree of TYPE counting consecutive base elements.
12238 If *MODEP is VOIDmode, then set it to the first valid floating point
12239 type. If a non-floating point type is found, or if a floating point
12240 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12241 otherwise return the count in the sub-tree. */
12242 static int
12243 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12245 machine_mode mode;
12246 HOST_WIDE_INT size;
12248 switch (TREE_CODE (type))
12250 case REAL_TYPE:
12251 mode = TYPE_MODE (type);
12252 if (mode != DFmode && mode != SFmode
12253 && mode != TFmode && mode != HFmode)
12254 return -1;
12256 if (*modep == VOIDmode)
12257 *modep = mode;
12259 if (*modep == mode)
12260 return 1;
12262 break;
12264 case COMPLEX_TYPE:
12265 mode = TYPE_MODE (TREE_TYPE (type));
12266 if (mode != DFmode && mode != SFmode
12267 && mode != TFmode && mode != HFmode)
12268 return -1;
12270 if (*modep == VOIDmode)
12271 *modep = mode;
12273 if (*modep == mode)
12274 return 2;
12276 break;
12278 case VECTOR_TYPE:
12279 /* Use V2SImode and V4SImode as representatives of all 64-bit
12280 and 128-bit vector types. */
12281 size = int_size_in_bytes (type);
12282 switch (size)
12284 case 8:
12285 mode = V2SImode;
12286 break;
12287 case 16:
12288 mode = V4SImode;
12289 break;
12290 default:
12291 return -1;
12294 if (*modep == VOIDmode)
12295 *modep = mode;
12297 /* Vector modes are considered to be opaque: two vectors are
12298 equivalent for the purposes of being homogeneous aggregates
12299 if they are the same size. */
12300 if (*modep == mode)
12301 return 1;
12303 break;
12305 case ARRAY_TYPE:
12307 int count;
12308 tree index = TYPE_DOMAIN (type);
12310 /* Can't handle incomplete types nor sizes that are not
12311 fixed. */
12312 if (!COMPLETE_TYPE_P (type)
12313 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12314 return -1;
12316 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12317 if (count == -1
12318 || !index
12319 || !TYPE_MAX_VALUE (index)
12320 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12321 || !TYPE_MIN_VALUE (index)
12322 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12323 || count < 0)
12324 return -1;
12326 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12327 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12329 /* There must be no padding. */
12330 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12331 count * GET_MODE_BITSIZE (*modep)))
12332 return -1;
12334 return count;
12337 case RECORD_TYPE:
12339 int count = 0;
12340 int sub_count;
12341 tree field;
12343 /* Can't handle incomplete types nor sizes that are not
12344 fixed. */
12345 if (!COMPLETE_TYPE_P (type)
12346 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12347 return -1;
12349 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12351 if (TREE_CODE (field) != FIELD_DECL)
12352 continue;
12354 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12355 if (sub_count < 0)
12356 return -1;
12357 count += sub_count;
12360 /* There must be no padding. */
12361 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12362 count * GET_MODE_BITSIZE (*modep)))
12363 return -1;
12365 return count;
12368 case UNION_TYPE:
12369 case QUAL_UNION_TYPE:
12371 /* These aren't very interesting except in a degenerate case. */
12372 int count = 0;
12373 int sub_count;
12374 tree field;
12376 /* Can't handle incomplete types nor sizes that are not
12377 fixed. */
12378 if (!COMPLETE_TYPE_P (type)
12379 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12380 return -1;
12382 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12384 if (TREE_CODE (field) != FIELD_DECL)
12385 continue;
12387 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12388 if (sub_count < 0)
12389 return -1;
12390 count = count > sub_count ? count : sub_count;
12393 /* There must be no padding. */
12394 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12395 count * GET_MODE_BITSIZE (*modep)))
12396 return -1;
12398 return count;
12401 default:
12402 break;
12405 return -1;
12408 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12409 type as described in AAPCS64 \S 4.1.2.
12411 See the comment above aarch64_composite_type_p for the notes on MODE. */
12413 static bool
12414 aarch64_short_vector_p (const_tree type,
12415 machine_mode mode)
12417 poly_int64 size = -1;
12419 if (type && TREE_CODE (type) == VECTOR_TYPE)
12420 size = int_size_in_bytes (type);
12421 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12422 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12423 size = GET_MODE_SIZE (mode);
12425 return known_eq (size, 8) || known_eq (size, 16);
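/* For example (illustrative): int32x2_t and float16x4_t are 8-byte short
   vectors and int32x4_t and float64x2_t are 16-byte short vectors, whereas
   larger GNU vector types and variable-length SVE vectors are not "short"
   in the AAPCS64 sense.  */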
12428 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12429 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12430 array types. The C99 floating-point complex types are also considered
12431 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12432 types, which are GCC extensions and out of the scope of AAPCS64, are
12433 treated as composite types here as well.
12435 Note that MODE itself is not sufficient in determining whether a type
12436 is such a composite type or not. This is because
12437 stor-layout.c:compute_record_mode may have already changed the MODE
12438 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12439 structure with only one field may have its MODE set to the mode of the
12440 field. Also an integer mode whose size matches the size of the
12441 RECORD_TYPE type may be used to substitute the original mode
12442 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12443 solely relied on. */
12445 static bool
12446 aarch64_composite_type_p (const_tree type,
12447 machine_mode mode)
12449 if (aarch64_short_vector_p (type, mode))
12450 return false;
12452 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12453 return true;
12455 if (mode == BLKmode
12456 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12457 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12458 return true;
12460 return false;
12463 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12464 shall be passed or returned in simd/fp register(s) (providing these
12465 parameter passing registers are available).
12467 Upon successful return, *COUNT returns the number of needed registers,
12468 *BASE_MODE returns the mode of the individual register and, when IS_HA
12469 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12470 floating-point aggregate or a homogeneous short-vector aggregate. */
12472 static bool
12473 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12474 const_tree type,
12475 machine_mode *base_mode,
12476 int *count,
12477 bool *is_ha)
12479 machine_mode new_mode = VOIDmode;
12480 bool composite_p = aarch64_composite_type_p (type, mode);
12482 if (is_ha != NULL) *is_ha = false;
12484 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12485 || aarch64_short_vector_p (type, mode))
12487 *count = 1;
12488 new_mode = mode;
12490 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12492 if (is_ha != NULL) *is_ha = true;
12493 *count = 2;
12494 new_mode = GET_MODE_INNER (mode);
12496 else if (type && composite_p)
12498 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12500 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12502 if (is_ha != NULL) *is_ha = true;
12503 *count = ag_count;
12505 else
12506 return false;
12508 else
12509 return false;
12511 *base_mode = new_mode;
12512 return true;
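/* Some illustrative AAPCS64 examples (not part of the code above):

     struct { float x, y, z; };      // HFA of 3 SFmode values -> s0-s2
     _Complex double                 // treated as an HFA of 2 DFmode values
     struct { float32x4_t a, b; };   // HVA of 2 short vectors -> q0-q1
     struct { double d; float f; };  // mixed base types -> not an HFA
     struct { float f[5]; };         // more members than HA_MAX_NUM_FLDS,
                                     // so not an HFA  */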
12515 /* Implement TARGET_STRUCT_VALUE_RTX. */
12517 static rtx
12518 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12519 int incoming ATTRIBUTE_UNUSED)
12521 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12524 /* Implements target hook vector_mode_supported_p. */
12525 static bool
12526 aarch64_vector_mode_supported_p (machine_mode mode)
12528 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12529 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12532 /* Return appropriate SIMD container
12533 for MODE within a vector of WIDTH bits. */
12534 static machine_mode
12535 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12537 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12538 switch (mode)
12540 case E_DFmode:
12541 return VNx2DFmode;
12542 case E_SFmode:
12543 return VNx4SFmode;
12544 case E_HFmode:
12545 return VNx8HFmode;
12546 case E_DImode:
12547 return VNx2DImode;
12548 case E_SImode:
12549 return VNx4SImode;
12550 case E_HImode:
12551 return VNx8HImode;
12552 case E_QImode:
12553 return VNx16QImode;
12554 default:
12555 return word_mode;
12558 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12559 if (TARGET_SIMD)
12561 if (known_eq (width, 128))
12562 switch (mode)
12564 case E_DFmode:
12565 return V2DFmode;
12566 case E_SFmode:
12567 return V4SFmode;
12568 case E_HFmode:
12569 return V8HFmode;
12570 case E_SImode:
12571 return V4SImode;
12572 case E_HImode:
12573 return V8HImode;
12574 case E_QImode:
12575 return V16QImode;
12576 case E_DImode:
12577 return V2DImode;
12578 default:
12579 break;
12581 else
12582 switch (mode)
12584 case E_SFmode:
12585 return V2SFmode;
12586 case E_HFmode:
12587 return V4HFmode;
12588 case E_SImode:
12589 return V2SImode;
12590 case E_HImode:
12591 return V4HImode;
12592 case E_QImode:
12593 return V8QImode;
12594 default:
12595 break;
12598 return word_mode;
12601 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12602 static machine_mode
12603 aarch64_preferred_simd_mode (scalar_mode mode)
12605 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12606 return aarch64_simd_container_mode (mode, bits);
12609 /* Return a list of possible vector sizes for the vectorizer
12610 to iterate over. */
12611 static void
12612 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12614 if (TARGET_SVE)
12615 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12616 sizes->safe_push (16);
12617 sizes->safe_push (8);
12620 /* Implement TARGET_MANGLE_TYPE. */
12622 static const char *
12623 aarch64_mangle_type (const_tree type)
12625 /* The AArch64 ABI documents say that "__va_list" has to be
12626 mangled as if it is in the "std" namespace.  */
12627 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12628 return "St9__va_list";
12630 /* Half-precision float. */
12631 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12632 return "Dh";
12634 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12635 builtin types. */
12636 if (TYPE_NAME (type) != NULL)
12637 return aarch64_mangle_builtin_type (type);
12639 /* Use the default mangling. */
12640 return NULL;
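/* For example (illustrative, following the Itanium C++ ABI conventions):

     void f (__builtin_va_list);   // mangles as _Z1fSt9__va_list
     void g (__fp16);              // mangles as _Z1gDh  */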
12643 /* Find the first rtx_insn before insn that will generate an assembly
12644 instruction. */
12646 static rtx_insn *
12647 aarch64_prev_real_insn (rtx_insn *insn)
12649 if (!insn)
12650 return NULL;
12654 insn = prev_real_insn (insn);
12656 while (insn && recog_memoized (insn) < 0);
12658 return insn;
12661 static bool
12662 is_madd_op (enum attr_type t1)
12664 unsigned int i;
12665 /* A number of these may be AArch32 only. */
12666 enum attr_type mlatypes[] = {
12667 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12668 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12669 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12672 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12674 if (t1 == mlatypes[i])
12675 return true;
12678 return false;
12681 /* Check if there is a register dependency between a load and the insn
12682 for which we hold recog_data. */
12684 static bool
12685 dep_between_memop_and_curr (rtx memop)
12687 rtx load_reg;
12688 int opno;
12690 gcc_assert (GET_CODE (memop) == SET);
12692 if (!REG_P (SET_DEST (memop)))
12693 return false;
12695 load_reg = SET_DEST (memop);
12696 for (opno = 1; opno < recog_data.n_operands; opno++)
12698 rtx operand = recog_data.operand[opno];
12699 if (REG_P (operand)
12700 && reg_overlap_mentioned_p (load_reg, operand))
12701 return true;
12704 return false;
12708 /* When working around the Cortex-A53 erratum 835769,
12709 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12710 instruction and has a preceding memory instruction such that a NOP
12711 should be inserted between them. */
12713 bool
12714 aarch64_madd_needs_nop (rtx_insn* insn)
12716 enum attr_type attr_type;
12717 rtx_insn *prev;
12718 rtx body;
12720 if (!TARGET_FIX_ERR_A53_835769)
12721 return false;
12723 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12724 return false;
12726 attr_type = get_attr_type (insn);
12727 if (!is_madd_op (attr_type))
12728 return false;
12730 prev = aarch64_prev_real_insn (insn);
12731 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12732 Restore recog state to INSN to avoid state corruption. */
12733 extract_constrain_insn_cached (insn);
12735 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12736 return false;
12738 body = single_set (prev);
12740 /* If the previous insn is a memory op and there is no dependency between
12741 it and the DImode madd, emit a NOP between them. If body is NULL then we
12742 have a complex memory operation, probably a load/store pair.
12743 Be conservative for now and emit a NOP. */
12744 if (GET_MODE (recog_data.operand[0]) == DImode
12745 && (!body || !dep_between_memop_and_curr (body)))
12746 return true;
12748 return false;
12753 /* Implement FINAL_PRESCAN_INSN. */
12755 void
12756 aarch64_final_prescan_insn (rtx_insn *insn)
12758 if (aarch64_madd_needs_nop (insn))
12759 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
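/* For example (illustrative), with -mfix-cortex-a53-835769 the sequence

       ldr   x3, [x2]
       madd  x0, x1, x4, x0

   is printed with a "nop // between mem op and mult-accumulate" between the
   two instructions, because the 64-bit multiply-accumulate does not consume
   the loaded value (when it does, no NOP is needed; load/store pairs are
   handled conservatively and always get the NOP).  */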
12763 /* Return the equivalent letter for size. */
12764 static char
12765 sizetochar (int size)
12767 switch (size)
12769 case 64: return 'd';
12770 case 32: return 's';
12771 case 16: return 'h';
12772 case 8 : return 'b';
12773 default: gcc_unreachable ();
12777 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
12778 instruction. */
12780 bool
12781 aarch64_sve_index_immediate_p (rtx base_or_step)
12783 return (CONST_INT_P (base_or_step)
12784 && IN_RANGE (INTVAL (base_or_step), -16, 15));
12787 /* Return true if X is a valid immediate for the SVE ADD and SUB
12788 instructions. Negate X first if NEGATE_P is true. */
12790 bool
12791 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
12793 rtx elt;
12795 if (!const_vec_duplicate_p (x, &elt)
12796 || !CONST_INT_P (elt))
12797 return false;
12799 HOST_WIDE_INT val = INTVAL (elt);
12800 if (negate_p)
12801 val = -val;
12802 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
12804 if (val & 0xff)
12805 return IN_RANGE (val, 0, 0xff);
12806 return IN_RANGE (val, 0, 0xff00);
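/* For example (illustrative): 0...255 are accepted as-is, multiples of 256
   up to 65280 are accepted as "#imm, lsl #8" (e.g. "add z0.s, z0.s, #52,
   lsl #8" for 13312), and a value such as 257 is rejected because it has
   bits set both below and above bit 8.  */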
12809 /* Return true if X is a valid immediate operand for an SVE logical
12810 instruction such as AND. */
12812 bool
12813 aarch64_sve_bitmask_immediate_p (rtx x)
12815 rtx elt;
12817 return (const_vec_duplicate_p (x, &elt)
12818 && CONST_INT_P (elt)
12819 && aarch64_bitmask_imm (INTVAL (elt),
12820 GET_MODE_INNER (GET_MODE (x))));
12823 /* Return true if X is a valid immediate for the SVE DUP and CPY
12824 instructions. */
12826 bool
12827 aarch64_sve_dup_immediate_p (rtx x)
12829 rtx elt;
12831 if (!const_vec_duplicate_p (x, &elt)
12832 || !CONST_INT_P (elt))
12833 return false;
12835 HOST_WIDE_INT val = INTVAL (elt);
12836 if (val & 0xff)
12837 return IN_RANGE (val, -0x80, 0x7f);
12838 return IN_RANGE (val, -0x8000, 0x7f00);
12841 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
12842 SIGNED_P says whether the operand is signed rather than unsigned. */
12844 bool
12845 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
12847 rtx elt;
12849 return (const_vec_duplicate_p (x, &elt)
12850 && CONST_INT_P (elt)
12851 && (signed_p
12852 ? IN_RANGE (INTVAL (elt), -16, 15)
12853 : IN_RANGE (INTVAL (elt), 0, 127)));
12856 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
12857 instruction. Negate X first if NEGATE_P is true. */
12859 bool
12860 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
12862 rtx elt;
12863 REAL_VALUE_TYPE r;
12865 if (!const_vec_duplicate_p (x, &elt)
12866 || GET_CODE (elt) != CONST_DOUBLE)
12867 return false;
12869 r = *CONST_DOUBLE_REAL_VALUE (elt);
12871 if (negate_p)
12872 r = real_value_negate (&r);
12874 if (real_equal (&r, &dconst1))
12875 return true;
12876 if (real_equal (&r, &dconsthalf))
12877 return true;
12878 return false;
12881 /* Return true if X is a valid immediate operand for an SVE FMUL
12882 instruction. */
12884 bool
12885 aarch64_sve_float_mul_immediate_p (rtx x)
12887 rtx elt;
12889 /* GCC will never generate a multiply with an immediate of 2, so there is no
12890 point testing for it (even though it is a valid constant). */
12891 return (const_vec_duplicate_p (x, &elt)
12892 && GET_CODE (elt) == CONST_DOUBLE
12893 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
12896 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
12897 for the Advanced SIMD operation described by WHICH and INSN. If INFO
12898 is nonnull, use it to describe valid immediates. */
12899 static bool
12900 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
12901 simd_immediate_info *info,
12902 enum simd_immediate_check which,
12903 simd_immediate_info::insn_type insn)
12905 /* Try a 4-byte immediate with LSL. */
12906 for (unsigned int shift = 0; shift < 32; shift += 8)
12907 if ((val32 & (0xff << shift)) == val32)
12909 if (info)
12910 *info = simd_immediate_info (SImode, val32 >> shift, insn,
12911 simd_immediate_info::LSL, shift);
12912 return true;
12915 /* Try a 2-byte immediate with LSL. */
12916 unsigned int imm16 = val32 & 0xffff;
12917 if (imm16 == (val32 >> 16))
12918 for (unsigned int shift = 0; shift < 16; shift += 8)
12919 if ((imm16 & (0xff << shift)) == imm16)
12921 if (info)
12922 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
12923 simd_immediate_info::LSL, shift);
12924 return true;
12927 /* Try a 4-byte immediate with MSL, except for cases that MVN
12928 can handle. */
12929 if (which == AARCH64_CHECK_MOV)
12930 for (unsigned int shift = 8; shift < 24; shift += 8)
12932 unsigned int low = (1 << shift) - 1;
12933 if (((val32 & (0xff << shift)) | low) == val32)
12935 if (info)
12936 *info = simd_immediate_info (SImode, val32 >> shift, insn,
12937 simd_immediate_info::MSL, shift);
12938 return true;
12942 return false;
12945 /* Return true if replicating VAL64 is a valid immediate for the
12946 Advanced SIMD operation described by WHICH. If INFO is nonnull,
12947 use it to describe valid immediates. */
12948 static bool
12949 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
12950 simd_immediate_info *info,
12951 enum simd_immediate_check which)
12953 unsigned int val32 = val64 & 0xffffffff;
12954 unsigned int val16 = val64 & 0xffff;
12955 unsigned int val8 = val64 & 0xff;
12957 if (val32 == (val64 >> 32))
12959 if ((which & AARCH64_CHECK_ORR) != 0
12960 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
12961 simd_immediate_info::MOV))
12962 return true;
12964 if ((which & AARCH64_CHECK_BIC) != 0
12965 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
12966 simd_immediate_info::MVN))
12967 return true;
12969 /* Try using a replicated byte. */
12970 if (which == AARCH64_CHECK_MOV
12971 && val16 == (val32 >> 16)
12972 && val8 == (val16 >> 8))
12974 if (info)
12975 *info = simd_immediate_info (QImode, val8);
12976 return true;
12980 /* Try using a bit-to-bytemask. */
12981 if (which == AARCH64_CHECK_MOV)
12983 unsigned int i;
12984 for (i = 0; i < 64; i += 8)
12986 unsigned char byte = (val64 >> i) & 0xff;
12987 if (byte != 0 && byte != 0xff)
12988 break;
12990 if (i == 64)
12992 if (info)
12993 *info = simd_immediate_info (DImode, val64);
12994 return true;
12997 return false;
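/* Some 64-bit patterns accepted above, for illustration:

     0x00ff00ff00ff00ff   movi v0.8h, #255               (2-byte, LSL #0)
     0x0000120000001200   movi v0.4s, #18, lsl #8        (4-byte, LSL #8)
     0x0000ffff0000ffff   movi v0.4s, #255, msl #8       (4-byte, MSL)
     0x2525252525252525   movi v0.16b, #37               (replicated byte)
     0xff00ffffff0000ff   movi d0, #0xff00ffffff0000ff   (bit-to-bytemask)  */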
13000 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13001 instruction. If INFO is nonnull, use it to describe valid immediates. */
13003 static bool
13004 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13005 simd_immediate_info *info)
13007 scalar_int_mode mode = DImode;
13008 unsigned int val32 = val64 & 0xffffffff;
13009 if (val32 == (val64 >> 32))
13011 mode = SImode;
13012 unsigned int val16 = val32 & 0xffff;
13013 if (val16 == (val32 >> 16))
13015 mode = HImode;
13016 unsigned int val8 = val16 & 0xff;
13017 if (val8 == (val16 >> 8))
13018 mode = QImode;
13021 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13022 if (IN_RANGE (val, -0x80, 0x7f))
13024 /* DUP with no shift. */
13025 if (info)
13026 *info = simd_immediate_info (mode, val);
13027 return true;
13029 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13031 /* DUP with LSL #8. */
13032 if (info)
13033 *info = simd_immediate_info (mode, val);
13034 return true;
13036 if (aarch64_bitmask_imm (val64, mode))
13038 /* DUPM. */
13039 if (info)
13040 *info = simd_immediate_info (mode, val);
13041 return true;
13043 return false;
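/* For illustration: 0x0000003f0000003f selects SImode and matches
   "dup z0.s, #63"; 0x3f003f003f003f00 selects HImode and matches
   "dup z0.h, #63, lsl #8"; 0x00ff00ff00ff00ff fails both DUP forms but
   is a valid bitmask immediate, so it is matched as DUPM.  */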
13046 /* Return true if OP is a valid SIMD immediate for the operation
13047 described by WHICH. If INFO is nonnull, use it to describe valid
13048 immediates. */
13049 bool
13050 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13051 enum simd_immediate_check which)
13053 machine_mode mode = GET_MODE (op);
13054 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13055 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13056 return false;
13058 scalar_mode elt_mode = GET_MODE_INNER (mode);
13059 rtx elt = NULL, base, step;
13060 unsigned int n_elts;
13061 if (const_vec_duplicate_p (op, &elt))
13062 n_elts = 1;
13063 else if ((vec_flags & VEC_SVE_DATA)
13064 && const_vec_series_p (op, &base, &step))
13066 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13067 if (!aarch64_sve_index_immediate_p (base)
13068 || !aarch64_sve_index_immediate_p (step))
13069 return false;
13071 if (info)
13072 *info = simd_immediate_info (elt_mode, base, step);
13073 return true;
13075 else if (GET_CODE (op) == CONST_VECTOR
13076 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13077 /* N_ELTS set above. */;
13078 else
13079 return false;
13081 /* Handle PFALSE and PTRUE. */
13082 if (vec_flags & VEC_SVE_PRED)
13083 return (op == CONST0_RTX (mode)
13084 || op == CONSTM1_RTX (mode));
13086 scalar_float_mode elt_float_mode;
13087 if (elt
13088 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode)
13089 && (aarch64_float_const_zero_rtx_p (elt)
13090 || aarch64_float_const_representable_p (elt)))
13092 if (info)
13093 *info = simd_immediate_info (elt_float_mode, elt);
13094 return true;
13097 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13098 if (elt_size > 8)
13099 return false;
13101 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13103 /* Expand the vector constant out into a byte vector, with the least
13104 significant byte of the register first. */
13105 auto_vec<unsigned char, 16> bytes;
13106 bytes.reserve (n_elts * elt_size);
13107 for (unsigned int i = 0; i < n_elts; i++)
13109 if (!elt || n_elts != 1)
13110 /* The vector is provided in gcc endian-neutral fashion.
13111 For aarch64_be, it must be laid out in the vector register
13112 in reverse order. */
13113 elt = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
13115 if (elt_mode != elt_int_mode)
13116 elt = gen_lowpart (elt_int_mode, elt);
13118 if (!CONST_INT_P (elt))
13119 return false;
13121 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13122 for (unsigned int byte = 0; byte < elt_size; byte++)
13124 bytes.quick_push (elt_val & 0xff);
13125 elt_val >>= BITS_PER_UNIT;
13129 /* The immediate must repeat every eight bytes. */
13130 unsigned int nbytes = bytes.length ();
13131 for (unsigned i = 8; i < nbytes; ++i)
13132 if (bytes[i] != bytes[i - 8])
13133 return false;
13135 /* Get the repeating 8-byte value as an integer. No endian correction
13136 is needed here because bytes is already in lsb-first order. */
13137 unsigned HOST_WIDE_INT val64 = 0;
13138 for (unsigned int i = 0; i < 8; i++)
13139 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13140 << (i * BITS_PER_UNIT));
13142 if (vec_flags & VEC_SVE_DATA)
13143 return aarch64_sve_valid_immediate (val64, info);
13144 else
13145 return aarch64_advsimd_valid_immediate (val64, info, which);
13148 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13149 has a step in the range accepted by the SVE INDEX instruction.  Return
13150 the step if so, otherwise return null.  */
13152 aarch64_check_zero_based_sve_index_immediate (rtx x)
13154 rtx base, step;
13155 if (const_vec_series_p (x, &base, &step)
13156 && base == const0_rtx
13157 && aarch64_sve_index_immediate_p (step))
13158 return step;
13159 return NULL_RTX;
13162 /* Check whether immediate shift constants are within range.  */
13163 bool
13164 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13166 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13167 if (left)
13168 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13169 else
13170 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13173 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13174 operation of width WIDTH at bit position POS. */
13177 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13179 gcc_assert (CONST_INT_P (width));
13180 gcc_assert (CONST_INT_P (pos));
13182 unsigned HOST_WIDE_INT mask
13183 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13184 return GEN_INT (mask << UINTVAL (pos));
13187 bool
13188 aarch64_mov_operand_p (rtx x, machine_mode mode)
13190 if (GET_CODE (x) == HIGH
13191 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13192 return true;
13194 if (CONST_INT_P (x))
13195 return true;
13197 if (VECTOR_MODE_P (GET_MODE (x)))
13198 return aarch64_simd_valid_immediate (x, NULL);
13200 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13201 return true;
13203 if (aarch64_sve_cnt_immediate_p (x))
13204 return true;
13206 return aarch64_classify_symbolic_expression (x)
13207 == SYMBOL_TINY_ABSOLUTE;
13210 /* Return a const_int vector of VAL. */
13212 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13214 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13215 return gen_const_vec_duplicate (mode, c);
13218 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13220 bool
13221 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13223 machine_mode vmode;
13225 vmode = aarch64_simd_container_mode (mode, 64);
13226 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13227 return aarch64_simd_valid_immediate (op_v, NULL);
13230 /* Construct and return a PARALLEL RTX vector with elements numbering the
13231 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13232 the vector - from the perspective of the architecture. This does not
13233 line up with GCC's perspective on lane numbers, so we end up with
13234 different masks depending on our target endian-ness. The diagram
13235 below may help. We must draw the distinction when building masks
13236 which select one half of the vector. An instruction selecting
13237 architectural low-lanes for a big-endian target, must be described using
13238 a mask selecting GCC high-lanes.
13240 Big-Endian Little-Endian
13242 GCC 0 1 2 3 3 2 1 0
13243 | x | x | x | x | | x | x | x | x |
13244 Architecture 3 2 1 0 3 2 1 0
13246 Low Mask: { 2, 3 } { 0, 1 }
13247 High Mask: { 0, 1 } { 2, 3 }
13249 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13252 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13254 rtvec v = rtvec_alloc (nunits / 2);
13255 int high_base = nunits / 2;
13256 int low_base = 0;
13257 int base;
13258 rtx t1;
13259 int i;
13261 if (BYTES_BIG_ENDIAN)
13262 base = high ? low_base : high_base;
13263 else
13264 base = high ? high_base : low_base;
13266 for (i = 0; i < nunits / 2; i++)
13267 RTVEC_ELT (v, i) = GEN_INT (base + i);
13269 t1 = gen_rtx_PARALLEL (mode, v);
13270 return t1;
13273 /* Check OP for validity as a PARALLEL RTX vector with elements
13274 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13275 from the perspective of the architecture. See the diagram above
13276 aarch64_simd_vect_par_cnst_half for more details. */
13278 bool
13279 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13280 bool high)
13282 int nelts;
13283 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13284 return false;
13286 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13287 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13288 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13289 int i = 0;
13291 if (count_op != count_ideal)
13292 return false;
13294 for (i = 0; i < count_ideal; i++)
13296 rtx elt_op = XVECEXP (op, 0, i);
13297 rtx elt_ideal = XVECEXP (ideal, 0, i);
13299 if (!CONST_INT_P (elt_op)
13300 || INTVAL (elt_ideal) != INTVAL (elt_op))
13301 return false;
13303 return true;
13306 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13307 HIGH (exclusive). */
13308 void
13309 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13310 const_tree exp)
13312 HOST_WIDE_INT lane;
13313 gcc_assert (CONST_INT_P (operand));
13314 lane = INTVAL (operand);
13316 if (lane < low || lane >= high)
13318 if (exp)
13319 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13320 else
13321 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13325 /* Perform endian correction on lane number N, which indexes a vector
13326 of mode MODE, and return the result as an SImode rtx. */
13329 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13331 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13334 /* Return TRUE if OP is a valid vector addressing mode. */
13336 bool
13337 aarch64_simd_mem_operand_p (rtx op)
13339 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13340 || REG_P (XEXP (op, 0)));
13343 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13345 bool
13346 aarch64_sve_ld1r_operand_p (rtx op)
13348 struct aarch64_address_info addr;
13349 scalar_mode mode;
13351 return (MEM_P (op)
13352 && is_a <scalar_mode> (GET_MODE (op), &mode)
13353 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13354 && addr.type == ADDRESS_REG_IMM
13355 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13358 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13359 The conditions for STR are the same. */
13360 bool
13361 aarch64_sve_ldr_operand_p (rtx op)
13363 struct aarch64_address_info addr;
13365 return (MEM_P (op)
13366 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13367 false, ADDR_QUERY_ANY)
13368 && addr.type == ADDRESS_REG_IMM);
13371 /* Emit a register copy from operand to operand, taking care not to
13372 early-clobber source registers in the process.
13374 COUNT is the number of components into which the copy needs to be
13375 decomposed. */
13376 void
13377 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13378 unsigned int count)
13380 unsigned int i;
13381 int rdest = REGNO (operands[0]);
13382 int rsrc = REGNO (operands[1]);
13384 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13385 || rdest < rsrc)
13386 for (i = 0; i < count; i++)
13387 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13388 gen_rtx_REG (mode, rsrc + i));
13389 else
13390 for (i = 0; i < count; i++)
13391 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13392 gen_rtx_REG (mode, rsrc + count - i - 1));
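/* For example (illustrative), copying a 2-register value from {v1, v2}
   to {v2, v3} must move v2 into v3 before v1 into v2, whereas copying
   from {v2, v3} to {v1, v2} must move v2 into v1 first; the REGNO
   comparison above picks the safe direction.  */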
13395 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13396 one of VSTRUCT modes: OI, CI, or XI. */
13398 aarch64_simd_attr_length_rglist (machine_mode mode)
13400 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13401 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13404 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13405 alignment of a vector to 128 bits. SVE predicates have an alignment of
13406 16 bits. */
13407 static HOST_WIDE_INT
13408 aarch64_simd_vector_alignment (const_tree type)
13410 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13411 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13412 be set for non-predicate vectors of booleans. Modes are the most
13413 direct way we have of identifying real SVE predicate types. */
13414 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13415 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13416 return MIN (align, 128);
13419 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13420 static HOST_WIDE_INT
13421 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13423 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13425 /* If the length of the vector is fixed, try to align to that length,
13426 otherwise don't try to align at all. */
13427 HOST_WIDE_INT result;
13428 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13429 result = TYPE_ALIGN (TREE_TYPE (type));
13430 return result;
13432 return TYPE_ALIGN (type);
13435 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13436 static bool
13437 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13439 if (is_packed)
13440 return false;
13442 /* For fixed-length vectors, check that the vectorizer will aim for
13443 full-vector alignment. This isn't true for generic GCC vectors
13444 that are wider than the ABI maximum of 128 bits. */
13445 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13446 && (wi::to_widest (TYPE_SIZE (type))
13447 != aarch64_vectorize_preferred_vector_alignment (type)))
13448 return false;
13450 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13451 return true;
13454 /* Return true if the vector misalignment factor is supported by the
13455 target. */
13456 static bool
13457 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13458 const_tree type, int misalignment,
13459 bool is_packed)
13461 if (TARGET_SIMD && STRICT_ALIGNMENT)
13463 /* Return false if the movmisalign pattern is not supported for this mode.  */
13464 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13465 return false;
13467 /* Misalignment factor is unknown at compile time. */
13468 if (misalignment == -1)
13469 return false;
13471 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13472 is_packed);
13475 /* If VALS is a vector constant that can be loaded into a register
13476 using DUP, generate instructions to do so and return an RTX to
13477 assign to the register. Otherwise return NULL_RTX. */
13478 static rtx
13479 aarch64_simd_dup_constant (rtx vals)
13481 machine_mode mode = GET_MODE (vals);
13482 machine_mode inner_mode = GET_MODE_INNER (mode);
13483 rtx x;
13485 if (!const_vec_duplicate_p (vals, &x))
13486 return NULL_RTX;
13488 /* We can load this constant by using DUP and a constant in a
13489 single scalar register.  This will be cheaper than a vector
13490 load.  */
13491 x = copy_to_mode_reg (inner_mode, x);
13492 return gen_vec_duplicate (mode, x);
13496 /* Generate code to load VALS, which is a PARALLEL containing only
13497 constants (for vec_init) or CONST_VECTOR, efficiently into a
13498 register. Returns an RTX to copy into the register, or NULL_RTX
13499 for a PARALLEL that can not be converted into a CONST_VECTOR. */
13500 static rtx
13501 aarch64_simd_make_constant (rtx vals)
13503 machine_mode mode = GET_MODE (vals);
13504 rtx const_dup;
13505 rtx const_vec = NULL_RTX;
13506 int n_const = 0;
13507 int i;
13509 if (GET_CODE (vals) == CONST_VECTOR)
13510 const_vec = vals;
13511 else if (GET_CODE (vals) == PARALLEL)
13513 /* A CONST_VECTOR must contain only CONST_INTs and
13514 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13515 Only store valid constants in a CONST_VECTOR. */
13516 int n_elts = XVECLEN (vals, 0);
13517 for (i = 0; i < n_elts; ++i)
13519 rtx x = XVECEXP (vals, 0, i);
13520 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13521 n_const++;
13523 if (n_const == n_elts)
13524 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13526 else
13527 gcc_unreachable ();
13529 if (const_vec != NULL_RTX
13530 && aarch64_simd_valid_immediate (const_vec, NULL))
13531 /* Load using MOVI/MVNI. */
13532 return const_vec;
13533 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13534 /* Loaded using DUP. */
13535 return const_dup;
13536 else if (const_vec != NULL_RTX)
13537 /* Load from constant pool. We can not take advantage of single-cycle
13538 LD1 because we need a PC-relative addressing mode. */
13539 return const_vec;
13540 else
13541 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13542 We can not construct an initializer. */
13543 return NULL_RTX;
13546 /* Expand a vector initialisation sequence, such that TARGET is
13547 initialised to contain VALS. */
13549 void
13550 aarch64_expand_vector_init (rtx target, rtx vals)
13552 machine_mode mode = GET_MODE (target);
13553 scalar_mode inner_mode = GET_MODE_INNER (mode);
13554 /* The number of vector elements. */
13555 int n_elts = XVECLEN (vals, 0);
13556 /* The number of vector elements which are not constant. */
13557 int n_var = 0;
13558 rtx any_const = NULL_RTX;
13559 /* The first element of vals. */
13560 rtx v0 = XVECEXP (vals, 0, 0);
13561 bool all_same = true;
13563 /* Count the number of variable elements to initialise. */
13564 for (int i = 0; i < n_elts; ++i)
13566 rtx x = XVECEXP (vals, 0, i);
13567 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13568 ++n_var;
13569 else
13570 any_const = x;
13572 all_same &= rtx_equal_p (x, v0);
13575 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13576 how best to handle this. */
13577 if (n_var == 0)
13579 rtx constant = aarch64_simd_make_constant (vals);
13580 if (constant != NULL_RTX)
13582 emit_move_insn (target, constant);
13583 return;
13587 /* Splat a single non-constant element if we can. */
13588 if (all_same)
13590 rtx x = copy_to_mode_reg (inner_mode, v0);
13591 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13592 return;
13595 enum insn_code icode = optab_handler (vec_set_optab, mode);
13596 gcc_assert (icode != CODE_FOR_nothing);
13598 /* If there are only variable elements, try to optimize
13599 the insertion using dup for the most common element
13600 followed by insertions. */
13602 /* The algorithm will fill matches[*][0] with the earliest matching element,
13603 and matches[X][1] with the count of duplicate elements (if X is the
13604 earliest element which has duplicates). */
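/* An illustrative walk-through (not generated code): for the all-variable
   initialiser { A, B, A, A } the loop below records
   matches[0] = { 0, 3 }, matches[1] = { 1, 1 }, matches[2] = { 0, 0 } and
   matches[3] = { 0, 0 }, so MAXELEMENT ends up as 0 and the emitted
   sequence is a DUP of A followed by a single INS of B into lane 1.  */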
13606 if (n_var == n_elts && n_elts <= 16)
13608 int matches[16][2] = {0};
13609 for (int i = 0; i < n_elts; i++)
13611 for (int j = 0; j <= i; j++)
13613 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13615 matches[i][0] = j;
13616 matches[j][1]++;
13617 break;
13621 int maxelement = 0;
13622 int maxv = 0;
13623 for (int i = 0; i < n_elts; i++)
13624 if (matches[i][1] > maxv)
13626 maxelement = i;
13627 maxv = matches[i][1];
13630 /* Create a duplicate of the most common element. */
13631 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13632 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13634 /* Insert the rest. */
13635 for (int i = 0; i < n_elts; i++)
13637 rtx x = XVECEXP (vals, 0, i);
13638 if (matches[i][0] == maxelement)
13639 continue;
13640 x = copy_to_mode_reg (inner_mode, x);
13641 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13643 return;
13646 /* Initialise a vector which is part-variable. We want to first try
13647 to build those lanes which are constant in the most efficient way we
13648 can. */
13649 if (n_var != n_elts)
13651 rtx copy = copy_rtx (vals);
13653 /* Load constant part of vector. We really don't care what goes into the
13654 parts we will overwrite, but we're more likely to be able to load the
13655 constant efficiently if it has fewer, larger, repeating parts
13656 (see aarch64_simd_valid_immediate). */
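/* An illustrative walk-through (not generated code): for a four-element
   initialiser { X, 1, Y, 1 } with X and Y variable, the loop below
   replaces lane 0 with the constant found at lane 0 ^ 1 (i.e. 1) and then
   lane 2 with the constant now at lane 2 ^ 2 (also 1), so COPY becomes
   { 1, 1, 1, 1 }, which the recursive call can load with a single MOVI or
   DUP; X and Y are inserted into their lanes afterwards.  */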
13657 for (int i = 0; i < n_elts; i++)
13659 rtx x = XVECEXP (vals, 0, i);
13660 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13661 continue;
13662 rtx subst = any_const;
13663 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13665 /* Look in the copied vector, as more elements are const. */
13666 rtx test = XVECEXP (copy, 0, i ^ bit);
13667 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13669 subst = test;
13670 break;
13673 XVECEXP (copy, 0, i) = subst;
13675 aarch64_expand_vector_init (target, copy);
13678 /* Insert the variable lanes directly. */
13679 for (int i = 0; i < n_elts; i++)
13681 rtx x = XVECEXP (vals, 0, i);
13682 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13683 continue;
13684 x = copy_to_mode_reg (inner_mode, x);
13685 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13689 static unsigned HOST_WIDE_INT
13690 aarch64_shift_truncation_mask (machine_mode mode)
13692 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13693 return 0;
13694 return GET_MODE_UNIT_BITSIZE (mode) - 1;
13697 /* Select a format to encode pointers in exception handling data. */
13699 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13701 int type;
13702 switch (aarch64_cmodel)
13704 case AARCH64_CMODEL_TINY:
13705 case AARCH64_CMODEL_TINY_PIC:
13706 case AARCH64_CMODEL_SMALL:
13707 case AARCH64_CMODEL_SMALL_PIC:
13708 case AARCH64_CMODEL_SMALL_SPIC:
13709 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13710 for everything. */
13711 type = DW_EH_PE_sdata4;
13712 break;
13713 default:
13714 /* No assumptions here. 8-byte relocs required. */
13715 type = DW_EH_PE_sdata8;
13716 break;
13718 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13721 /* The last .arch and .tune assembly strings that we printed. */
13722 static std::string aarch64_last_printed_arch_string;
13723 static std::string aarch64_last_printed_tune_string;
13725 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
13726 by the function fndecl. */
13728 void
13729 aarch64_declare_function_name (FILE *stream, const char* name,
13730 tree fndecl)
13732 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13734 struct cl_target_option *targ_options;
13735 if (target_parts)
13736 targ_options = TREE_TARGET_OPTION (target_parts);
13737 else
13738 targ_options = TREE_TARGET_OPTION (target_option_current_node);
13739 gcc_assert (targ_options);
13741 const struct processor *this_arch
13742 = aarch64_get_arch (targ_options->x_explicit_arch);
13744 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
13745 std::string extension
13746 = aarch64_get_extension_string_for_isa_flags (isa_flags,
13747 this_arch->flags);
13748 /* Only update the assembler .arch string if it is distinct from the last
13749 such string we printed. */
13750 std::string to_print = this_arch->name + extension;
13751 if (to_print != aarch64_last_printed_arch_string)
13753 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
13754 aarch64_last_printed_arch_string = to_print;
13757 /* Print the cpu name we're tuning for in the comments; this might be
13758 useful to readers of the generated asm. Do it only when it changes
13759 from function to function and verbose assembly is requested. */
13760 const struct processor *this_tune
13761 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
13763 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
13765 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
13766 this_tune->name);
13767 aarch64_last_printed_tune_string = this_tune->name;
13770 /* Don't forget the type directive for ELF. */
13771 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
13772 ASM_OUTPUT_LABEL (stream, name);
13775 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
13777 static void
13778 aarch64_start_file (void)
13780 struct cl_target_option *default_options
13781 = TREE_TARGET_OPTION (target_option_default_node);
13783 const struct processor *default_arch
13784 = aarch64_get_arch (default_options->x_explicit_arch);
13785 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
13786 std::string extension
13787 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
13788 default_arch->flags);
13790 aarch64_last_printed_arch_string = default_arch->name + extension;
13791 aarch64_last_printed_tune_string = "";
13792 asm_fprintf (asm_out_file, "\t.arch %s\n",
13793 aarch64_last_printed_arch_string.c_str ());
13795 default_file_start ();
13798 /* Emit load exclusive. */
13800 static void
13801 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
13802 rtx mem, rtx model_rtx)
13804 rtx (*gen) (rtx, rtx, rtx);
13806 switch (mode)
13808 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
13809 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
13810 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
13811 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
13812 default:
13813 gcc_unreachable ();
13816 emit_insn (gen (rval, mem, model_rtx));
13819 /* Emit store exclusive. */
13821 static void
13822 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
13823 rtx rval, rtx mem, rtx model_rtx)
13825 rtx (*gen) (rtx, rtx, rtx, rtx);
13827 switch (mode)
13829 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
13830 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
13831 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
13832 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
13833 default:
13834 gcc_unreachable ();
13837 emit_insn (gen (bval, rval, mem, model_rtx));
13840 /* Mark the previous jump instruction as unlikely. */
13842 static void
13843 aarch64_emit_unlikely_jump (rtx insn)
13845 rtx_insn *jump = emit_jump_insn (insn);
13846 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
13849 /* Expand a compare and swap pattern. */
13851 void
13852 aarch64_expand_compare_and_swap (rtx operands[])
13854 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
13855 machine_mode mode, cmp_mode;
13856 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
13857 int idx;
13858 gen_cas_fn gen;
13859 const gen_cas_fn split_cas[] =
13861 gen_aarch64_compare_and_swapqi,
13862 gen_aarch64_compare_and_swaphi,
13863 gen_aarch64_compare_and_swapsi,
13864 gen_aarch64_compare_and_swapdi
13866 const gen_cas_fn atomic_cas[] =
13868 gen_aarch64_compare_and_swapqi_lse,
13869 gen_aarch64_compare_and_swaphi_lse,
13870 gen_aarch64_compare_and_swapsi_lse,
13871 gen_aarch64_compare_and_swapdi_lse
13874 bval = operands[0];
13875 rval = operands[1];
13876 mem = operands[2];
13877 oldval = operands[3];
13878 newval = operands[4];
13879 is_weak = operands[5];
13880 mod_s = operands[6];
13881 mod_f = operands[7];
13882 mode = GET_MODE (mem);
13883 cmp_mode = mode;
13885 /* Normally the succ memory model must be stronger than fail, but in the
13886 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
13887 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
13889 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
13890 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
13891 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
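/* For instance (illustrative only), a call such as
   __atomic_compare_exchange_n (p, &expected, desired, 0,
   __ATOMIC_RELEASE, __ATOMIC_ACQUIRE)
   arrives here with MOD_S == RELEASE and MOD_F == ACQUIRE, and is
   promoted to ACQ_REL by the check above.  */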
13893 switch (mode)
13895 case E_QImode:
13896 case E_HImode:
13897 /* For short modes, we're going to perform the comparison in SImode,
13898 so do the zero-extension now. */
13899 cmp_mode = SImode;
13900 rval = gen_reg_rtx (SImode);
13901 oldval = convert_modes (SImode, mode, oldval, true);
13902 /* Fall through. */
13904 case E_SImode:
13905 case E_DImode:
13906 /* Force the value into a register if needed. */
13907 if (!aarch64_plus_operand (oldval, mode))
13908 oldval = force_reg (cmp_mode, oldval);
13909 break;
13911 default:
13912 gcc_unreachable ();
13915 switch (mode)
13917 case E_QImode: idx = 0; break;
13918 case E_HImode: idx = 1; break;
13919 case E_SImode: idx = 2; break;
13920 case E_DImode: idx = 3; break;
13921 default:
13922 gcc_unreachable ();
13924 if (TARGET_LSE)
13925 gen = atomic_cas[idx];
13926 else
13927 gen = split_cas[idx];
13929 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
13931 if (mode == QImode || mode == HImode)
13932 emit_move_insn (operands[1], gen_lowpart (mode, rval));
13934 x = gen_rtx_REG (CCmode, CC_REGNUM);
13935 x = gen_rtx_EQ (SImode, x, const0_rtx);
13936 emit_insn (gen_rtx_SET (bval, x));
13939 /* Test whether the target supports using an atomic load-operate instruction.
13940 CODE is the operation and AFTER is TRUE if the data in memory after the
13941 operation should be returned and FALSE if the data before the operation
13942 should be returned. Returns FALSE if the operation isn't supported by the
13943 architecture. */
13945 bool
13946 aarch64_atomic_ldop_supported_p (enum rtx_code code)
13948 if (!TARGET_LSE)
13949 return false;
13951 switch (code)
13953 case SET:
13954 case AND:
13955 case IOR:
13956 case XOR:
13957 case MINUS:
13958 case PLUS:
13959 return true;
13960 default:
13961 return false;
13965 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
13966 sequence implementing an atomic operation. */
13968 static void
13969 aarch64_emit_post_barrier (enum memmodel model)
13971 const enum memmodel base_model = memmodel_base (model);
13973 if (is_mm_sync (model)
13974 && (base_model == MEMMODEL_ACQUIRE
13975 || base_model == MEMMODEL_ACQ_REL
13976 || base_model == MEMMODEL_SEQ_CST))
13978 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
13982 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
13983 for the data in memory. EXPECTED is the value expected to be in memory.
13984 DESIRED is the value to store to memory. MEM is the memory location. MODEL
13985 is the memory ordering to use. */
13987 void
13988 aarch64_gen_atomic_cas (rtx rval, rtx mem,
13989 rtx expected, rtx desired,
13990 rtx model)
13992 rtx (*gen) (rtx, rtx, rtx, rtx);
13993 machine_mode mode;
13995 mode = GET_MODE (mem);
13997 switch (mode)
13999 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14000 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14001 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14002 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14003 default:
14004 gcc_unreachable ();
14007 /* Move the expected value into the CAS destination register. */
14008 emit_insn (gen_rtx_SET (rval, expected));
14010 /* Emit the CAS. */
14011 emit_insn (gen (rval, mem, desired, model));
14013 /* Compare the expected value with the value loaded by the CAS, to establish
14014 whether the swap was made. */
14015 aarch64_gen_compare_reg (EQ, rval, expected);
14018 /* Split a compare and swap pattern. */
14020 void
14021 aarch64_split_compare_and_swap (rtx operands[])
14023 rtx rval, mem, oldval, newval, scratch;
14024 machine_mode mode;
14025 bool is_weak;
14026 rtx_code_label *label1, *label2;
14027 rtx x, cond;
14028 enum memmodel model;
14029 rtx model_rtx;
14031 rval = operands[0];
14032 mem = operands[1];
14033 oldval = operands[2];
14034 newval = operands[3];
14035 is_weak = (operands[4] != const0_rtx);
14036 model_rtx = operands[5];
14037 scratch = operands[7];
14038 mode = GET_MODE (mem);
14039 model = memmodel_from_int (INTVAL (model_rtx));
14041 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14042 loop:
14043 .label1:
14044 LD[A]XR rval, [mem]
14045 CBNZ rval, .label2
14046 ST[L]XR scratch, newval, [mem]
14047 CBNZ scratch, .label1
14048 .label2:
14049 CMP rval, 0. */
14050 bool strong_zero_p = !is_weak && oldval == const0_rtx;
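/* Illustrative only: this shape typically comes from a strong
   compare-and-swap whose expected value is a literal zero, e.g. a
   try-lock such as

     long expected = 0;
     ok = __atomic_compare_exchange_n (&lock, &expected, 1, 0,
                                       __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);

   where OLDVAL expands to const0_rtx.  */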
14052 label1 = NULL;
14053 if (!is_weak)
14055 label1 = gen_label_rtx ();
14056 emit_label (label1);
14058 label2 = gen_label_rtx ();
14060 /* The initial load can be relaxed for a __sync operation since a final
14061 barrier will be emitted to stop code hoisting. */
14062 if (is_mm_sync (model))
14063 aarch64_emit_load_exclusive (mode, rval, mem,
14064 GEN_INT (MEMMODEL_RELAXED));
14065 else
14066 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14068 if (strong_zero_p)
14070 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14071 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14072 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14073 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14075 else
14077 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14078 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14079 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14080 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14081 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14084 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14086 if (!is_weak)
14088 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14089 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14090 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14091 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14093 else
14095 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14096 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14097 emit_insn (gen_rtx_SET (cond, x));
14100 emit_label (label2);
14101 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14102 to set the condition flags. If this is not used it will be removed by
14103 later passes. */
14104 if (strong_zero_p)
14106 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14107 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14108 emit_insn (gen_rtx_SET (cond, x));
14110 /* Emit any final barrier needed for a __sync operation. */
14111 if (is_mm_sync (model))
14112 aarch64_emit_post_barrier (model);
14115 /* Emit a BIC instruction. */
14117 static void
14118 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14120 rtx shift_rtx = GEN_INT (shift);
14121 rtx (*gen) (rtx, rtx, rtx, rtx);
14123 switch (mode)
14125 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14126 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14127 default:
14128 gcc_unreachable ();
14131 emit_insn (gen (dst, s2, shift_rtx, s1));
14134 /* Emit an atomic swap. */
14136 static void
14137 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14138 rtx mem, rtx model)
14140 rtx (*gen) (rtx, rtx, rtx, rtx);
14142 switch (mode)
14144 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14145 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14146 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14147 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14148 default:
14149 gcc_unreachable ();
14152 emit_insn (gen (dst, mem, value, model));
14155 /* Operations supported by aarch64_emit_atomic_load_op. */
14157 enum aarch64_atomic_load_op_code
14159 AARCH64_LDOP_PLUS, /* A + B */
14160 AARCH64_LDOP_XOR, /* A ^ B */
14161 AARCH64_LDOP_OR, /* A | B */
14162 AARCH64_LDOP_BIC /* A & ~B */
14165 /* Emit an atomic load-operate. */
14167 static void
14168 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14169 machine_mode mode, rtx dst, rtx src,
14170 rtx mem, rtx model)
14172 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14173 const aarch64_atomic_load_op_fn plus[] =
14175 gen_aarch64_atomic_loadaddqi,
14176 gen_aarch64_atomic_loadaddhi,
14177 gen_aarch64_atomic_loadaddsi,
14178 gen_aarch64_atomic_loadadddi
14180 const aarch64_atomic_load_op_fn eor[] =
14182 gen_aarch64_atomic_loadeorqi,
14183 gen_aarch64_atomic_loadeorhi,
14184 gen_aarch64_atomic_loadeorsi,
14185 gen_aarch64_atomic_loadeordi
14187 const aarch64_atomic_load_op_fn ior[] =
14189 gen_aarch64_atomic_loadsetqi,
14190 gen_aarch64_atomic_loadsethi,
14191 gen_aarch64_atomic_loadsetsi,
14192 gen_aarch64_atomic_loadsetdi
14194 const aarch64_atomic_load_op_fn bic[] =
14196 gen_aarch64_atomic_loadclrqi,
14197 gen_aarch64_atomic_loadclrhi,
14198 gen_aarch64_atomic_loadclrsi,
14199 gen_aarch64_atomic_loadclrdi
14201 aarch64_atomic_load_op_fn gen;
14202 int idx = 0;
14204 switch (mode)
14206 case E_QImode: idx = 0; break;
14207 case E_HImode: idx = 1; break;
14208 case E_SImode: idx = 2; break;
14209 case E_DImode: idx = 3; break;
14210 default:
14211 gcc_unreachable ();
14214 switch (code)
14216 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14217 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14218 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14219 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14220 default:
14221 gcc_unreachable ();
14224 emit_insn (gen (dst, mem, src, model));
14227 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14228 location to store the data read from memory. OUT_RESULT is the location to
14229 store the result of the operation. MEM is the memory location to read and
14230 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14231 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14232 be NULL. */
14234 void
14235 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14236 rtx mem, rtx value, rtx model_rtx)
14238 machine_mode mode = GET_MODE (mem);
14239 machine_mode wmode = (mode == DImode ? DImode : SImode);
14240 const bool short_mode = (mode < SImode);
14241 aarch64_atomic_load_op_code ldop_code;
14242 rtx src;
14243 rtx x;
14245 if (out_data)
14246 out_data = gen_lowpart (mode, out_data);
14248 if (out_result)
14249 out_result = gen_lowpart (mode, out_result);
14251 /* Make sure the value is in a register, putting it into a destination
14252 register if it needs to be manipulated. */
14253 if (!register_operand (value, mode)
14254 || code == AND || code == MINUS)
14256 src = out_result ? out_result : out_data;
14257 emit_move_insn (src, gen_lowpart (mode, value));
14259 else
14260 src = value;
14261 gcc_assert (register_operand (src, mode));
14263 /* Preprocess the data for the operation as necessary. If the operation is
14264 a SET then emit a swap instruction and finish. */
14265 switch (code)
14267 case SET:
14268 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14269 return;
14271 case MINUS:
14272 /* Negate the value and treat it as a PLUS. */
14274 rtx neg_src;
14276 /* Resize the value if necessary. */
14277 if (short_mode)
14278 src = gen_lowpart (wmode, src);
14280 neg_src = gen_rtx_NEG (wmode, src);
14281 emit_insn (gen_rtx_SET (src, neg_src));
14283 if (short_mode)
14284 src = gen_lowpart (mode, src);
14286 /* Fall-through. */
14287 case PLUS:
14288 ldop_code = AARCH64_LDOP_PLUS;
14289 break;
14291 case IOR:
14292 ldop_code = AARCH64_LDOP_OR;
14293 break;
14295 case XOR:
14296 ldop_code = AARCH64_LDOP_XOR;
14297 break;
14299 case AND:
14301 rtx not_src;
14303 /* Resize the value if necessary. */
14304 if (short_mode)
14305 src = gen_lowpart (wmode, src);
14307 not_src = gen_rtx_NOT (wmode, src);
14308 emit_insn (gen_rtx_SET (src, not_src));
14310 if (short_mode)
14311 src = gen_lowpart (mode, src);
14313 ldop_code = AARCH64_LDOP_BIC;
14314 break;
14316 default:
14317 /* The operation can't be done with atomic instructions. */
14318 gcc_unreachable ();
14321 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14323 /* If necessary, calculate the data in memory after the update by redoing the
14324 operation from values in registers. */
14325 if (!out_result)
14326 return;
14328 if (short_mode)
14330 src = gen_lowpart (wmode, src);
14331 out_data = gen_lowpart (wmode, out_data);
14332 out_result = gen_lowpart (wmode, out_result);
14335 x = NULL_RTX;
14337 switch (code)
14339 case MINUS:
14340 case PLUS:
14341 x = gen_rtx_PLUS (wmode, out_data, src);
14342 break;
14343 case IOR:
14344 x = gen_rtx_IOR (wmode, out_data, src);
14345 break;
14346 case XOR:
14347 x = gen_rtx_XOR (wmode, out_data, src);
14348 break;
14349 case AND:
14350 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14351 return;
14352 default:
14353 gcc_unreachable ();
14356 emit_set_insn (out_result, x);
14358 return;
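/* A minimal, self-contained sketch (illustrative only, not used by the
   compiler) of the two identities exploited above when mapping MINUS and
   AND onto the LSE load-operate instructions: an atomic fetch-subtract is
   a fetch-add of the negated value, and an atomic fetch-and is a
   "load-clear" (old & ~src) of the complemented value.  The sketches model
   only the data operation, not the atomicity.  */

static unsigned int
sketch_ldadd (unsigned int *mem, unsigned int src)
{
  unsigned int old = *mem;
  *mem = old + src;		/* LDADD data operation.  */
  return old;
}

static unsigned int
sketch_ldclr (unsigned int *mem, unsigned int src)
{
  unsigned int old = *mem;
  *mem = old & ~src;		/* LDCLR data operation.  */
  return old;
}

/* fetch_sub (mem, v) == sketch_ldadd (mem, -v);
   fetch_and (mem, v) == sketch_ldclr (mem, ~v).  */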
14361 /* Split an atomic operation. */
14363 void
14364 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14365 rtx value, rtx model_rtx, rtx cond)
14367 machine_mode mode = GET_MODE (mem);
14368 machine_mode wmode = (mode == DImode ? DImode : SImode);
14369 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14370 const bool is_sync = is_mm_sync (model);
14371 rtx_code_label *label;
14372 rtx x;
14374 /* Split the atomic operation into a sequence. */
14375 label = gen_label_rtx ();
14376 emit_label (label);
14378 if (new_out)
14379 new_out = gen_lowpart (wmode, new_out);
14380 if (old_out)
14381 old_out = gen_lowpart (wmode, old_out);
14382 else
14383 old_out = new_out;
14384 value = simplify_gen_subreg (wmode, value, mode, 0);
14386 /* The initial load can be relaxed for a __sync operation since a final
14387 barrier will be emitted to stop code hoisting. */
14388 if (is_sync)
14389 aarch64_emit_load_exclusive (mode, old_out, mem,
14390 GEN_INT (MEMMODEL_RELAXED));
14391 else
14392 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14394 switch (code)
14396 case SET:
14397 new_out = value;
14398 break;
14400 case NOT:
14401 x = gen_rtx_AND (wmode, old_out, value);
14402 emit_insn (gen_rtx_SET (new_out, x));
14403 x = gen_rtx_NOT (wmode, new_out);
14404 emit_insn (gen_rtx_SET (new_out, x));
14405 break;
14407 case MINUS:
14408 if (CONST_INT_P (value))
14410 value = GEN_INT (-INTVAL (value));
14411 code = PLUS;
14413 /* Fall through. */
14415 default:
14416 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14417 emit_insn (gen_rtx_SET (new_out, x));
14418 break;
14421 aarch64_emit_store_exclusive (mode, cond, mem,
14422 gen_lowpart (mode, new_out), model_rtx);
14424 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14425 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14426 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14427 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14429 /* Emit any final barrier needed for a __sync operation. */
14430 if (is_sync)
14431 aarch64_emit_post_barrier (model);
14434 static void
14435 aarch64_init_libfuncs (void)
14437 /* Half-precision float operations. The compiler handles all operations
14438 with NULL libfuncs by converting to SFmode. */
14440 /* Conversions. */
14441 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14442 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14444 /* Arithmetic. */
14445 set_optab_libfunc (add_optab, HFmode, NULL);
14446 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14447 set_optab_libfunc (smul_optab, HFmode, NULL);
14448 set_optab_libfunc (neg_optab, HFmode, NULL);
14449 set_optab_libfunc (sub_optab, HFmode, NULL);
14451 /* Comparisons. */
14452 set_optab_libfunc (eq_optab, HFmode, NULL);
14453 set_optab_libfunc (ne_optab, HFmode, NULL);
14454 set_optab_libfunc (lt_optab, HFmode, NULL);
14455 set_optab_libfunc (le_optab, HFmode, NULL);
14456 set_optab_libfunc (ge_optab, HFmode, NULL);
14457 set_optab_libfunc (gt_optab, HFmode, NULL);
14458 set_optab_libfunc (unord_optab, HFmode, NULL);
14461 /* Target hook for c_mode_for_suffix. */
14462 static machine_mode
14463 aarch64_c_mode_for_suffix (char suffix)
14465 if (suffix == 'q')
14466 return TFmode;
14468 return VOIDmode;
14471 /* We can only represent floating point constants which will fit in
14472 "quarter-precision" values. These values are characterised by
14473 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14476 (-1)^s * (n/16) * 2^r
14478 Where:
14479 's' is the sign bit.
14480 'n' is an integer in the range 16 <= n <= 31.
14481 'r' is an integer in the range -3 <= r <= 4. */
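/* A small, self-contained sketch (illustrative only, independent of the
   bit-level checks below): a value is a valid "quarter-precision"
   immediate exactly when it matches the formula above for some n in
   [16, 31] and r in [-3, 4].  For example 0.5 (n = 16, r = -1) and
   31.0 (n = 31, r = 4) qualify, while 0.1 does not.  */

static int
sketch_quarter_precision_p (double x)
{
  if (x < 0)
    x = -x;
  for (int r = -3; r <= 4; r++)
    for (int n = 16; n <= 31; n++)
      {
        double candidate = (double) n / 16.0;
        for (int i = 0; i < r; i++)
          candidate *= 2.0;
        for (int i = r; i < 0; i++)
          candidate /= 2.0;
        if (x == candidate)
          return 1;
      }
  return 0;
}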
14483 /* Return true iff X can be represented by a quarter-precision
14484 floating point immediate operand X. Note, we cannot represent 0.0. */
14485 bool
14486 aarch64_float_const_representable_p (rtx x)
14488 /* This represents our current view of how many bits
14489 make up the mantissa. */
14490 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14491 int exponent;
14492 unsigned HOST_WIDE_INT mantissa, mask;
14493 REAL_VALUE_TYPE r, m;
14494 bool fail;
14496 if (!CONST_DOUBLE_P (x))
14497 return false;
14499 /* We don't support HFmode constants yet. */
14500 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14501 return false;
14503 r = *CONST_DOUBLE_REAL_VALUE (x);
14505 /* We cannot represent infinities, NaNs or +/-zero. We won't
14506 know if we have +zero until we analyse the mantissa, but we
14507 can reject the other invalid values. */
14508 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14509 || REAL_VALUE_MINUS_ZERO (r))
14510 return false;
14512 /* Extract exponent. */
14513 r = real_value_abs (&r);
14514 exponent = REAL_EXP (&r);
14516 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14517 highest (sign) bit, with a fixed binary point at bit point_pos.
14518 m1 holds the low part of the mantissa, m2 the high part.
14519 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14520 bits for the mantissa, this can fail (low bits will be lost). */
14521 real_ldexp (&m, &r, point_pos - exponent);
14522 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14524 /* If the low part of the mantissa has bits set we cannot represent
14525 the value. */
14526 if (w.ulow () != 0)
14527 return false;
14528 /* We have rejected the lower HOST_WIDE_INT, so update our
14529 understanding of how many bits lie in the mantissa and
14530 look only at the high HOST_WIDE_INT. */
14531 mantissa = w.elt (1);
14532 point_pos -= HOST_BITS_PER_WIDE_INT;
14534 /* We can only represent values with a mantissa of the form 1.xxxx. */
14535 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14536 if ((mantissa & mask) != 0)
14537 return false;
14539 /* Having filtered unrepresentable values, we may now remove all
14540 but the highest 5 bits. */
14541 mantissa >>= point_pos - 5;
14543 /* We cannot represent the value 0.0, so reject it. This is handled
14544 elsewhere. */
14545 if (mantissa == 0)
14546 return false;
14548 /* Then, as bit 4 is always set, we can mask it off, leaving
14549 the mantissa in the range [0, 15]. */
14550 mantissa &= ~(1 << 4);
14551 gcc_assert (mantissa <= 15);
14553 /* GCC internally does not use IEEE754-like encoding (where normalized
14554 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14555 Our mantissa values are shifted 4 places to the left relative to
14556 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14557 by 5 places to correct for GCC's representation. */
14558 exponent = 5 - exponent;
14560 return (exponent >= 0 && exponent <= 7);
14563 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14564 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14565 output MOVI/MVNI, ORR or BIC immediate. */
14566 char*
14567 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14568 enum simd_immediate_check which)
14570 bool is_valid;
14571 static char templ[40];
14572 const char *mnemonic;
14573 const char *shift_op;
14574 unsigned int lane_count = 0;
14575 char element_char;
14577 struct simd_immediate_info info;
14579 /* This will return true to show const_vector is legal for use as either
14580 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14581 It will also update INFO to show how the immediate should be generated.
14582 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14583 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14584 gcc_assert (is_valid);
14586 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14587 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14589 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14591 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14592 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14593 move immediate path. */
14594 if (aarch64_float_const_zero_rtx_p (info.value))
14595 info.value = GEN_INT (0);
14596 else
14598 const unsigned int buf_size = 20;
14599 char float_buf[buf_size] = {'\0'};
14600 real_to_decimal_for_mode (float_buf,
14601 CONST_DOUBLE_REAL_VALUE (info.value),
14602 buf_size, buf_size, 1, info.elt_mode);
14604 if (lane_count == 1)
14605 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14606 else
14607 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14608 lane_count, element_char, float_buf);
14609 return templ;
14613 gcc_assert (CONST_INT_P (info.value));
14615 if (which == AARCH64_CHECK_MOV)
14617 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14618 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14619 if (lane_count == 1)
14620 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14621 mnemonic, UINTVAL (info.value));
14622 else if (info.shift)
14623 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14624 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14625 element_char, UINTVAL (info.value), shift_op, info.shift);
14626 else
14627 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14628 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14629 element_char, UINTVAL (info.value));
14631 else
14633 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14634 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14635 if (info.shift)
14636 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14637 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14638 element_char, UINTVAL (info.value), "lsl", info.shift);
14639 else
14640 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14641 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14642 element_char, UINTVAL (info.value));
14644 return templ;
14647 char*
14648 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14651 /* If a floating point number was passed and we desire to use it in an
14652 integer mode do the conversion to integer. */
14653 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14655 unsigned HOST_WIDE_INT ival;
14656 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14657 gcc_unreachable ();
14658 immediate = gen_int_mode (ival, mode);
14661 machine_mode vmode;
14662 /* Use a 64-bit vector mode for everything except DImode/DFmode, where
14663 we use a 128-bit vector mode. */
14664 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14666 vmode = aarch64_simd_container_mode (mode, width);
14667 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14668 return aarch64_output_simd_mov_immediate (v_op, width);
14671 /* Return the output string to use for moving immediate CONST_VECTOR
14672 into an SVE register. */
14674 char *
14675 aarch64_output_sve_mov_immediate (rtx const_vector)
14677 static char templ[40];
14678 struct simd_immediate_info info;
14679 char element_char;
14681 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14682 gcc_assert (is_valid);
14684 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14686 if (info.step)
14688 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14689 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14690 element_char, INTVAL (info.value), INTVAL (info.step));
14691 return templ;
14694 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14696 if (aarch64_float_const_zero_rtx_p (info.value))
14697 info.value = GEN_INT (0);
14698 else
14700 const int buf_size = 20;
14701 char float_buf[buf_size] = {};
14702 real_to_decimal_for_mode (float_buf,
14703 CONST_DOUBLE_REAL_VALUE (info.value),
14704 buf_size, buf_size, 1, info.elt_mode);
14706 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14707 element_char, float_buf);
14708 return templ;
14712 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14713 element_char, INTVAL (info.value));
14714 return templ;
14717 /* Return the asm format for a PTRUE instruction whose destination has
14718 mode MODE. SUFFIX is the element size suffix. */
14720 char *
14721 aarch64_output_ptrue (machine_mode mode, char suffix)
14723 unsigned int nunits;
14724 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14725 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
14726 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
14727 else
14728 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
14729 return buf;
14732 /* Split operands into moves from op[1] + op[2] into op[0]. */
14734 void
14735 aarch64_split_combinev16qi (rtx operands[3])
14737 unsigned int dest = REGNO (operands[0]);
14738 unsigned int src1 = REGNO (operands[1]);
14739 unsigned int src2 = REGNO (operands[2]);
14740 machine_mode halfmode = GET_MODE (operands[1]);
14741 unsigned int halfregs = REG_NREGS (operands[1]);
14742 rtx destlo, desthi;
14744 gcc_assert (halfmode == V16QImode);
14746 if (src1 == dest && src2 == dest + halfregs)
14748 /* No-op move. Can't split to nothing; emit something. */
14749 emit_note (NOTE_INSN_DELETED);
14750 return;
14753 /* Preserve register attributes for variable tracking. */
14754 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
14755 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
14756 GET_MODE_SIZE (halfmode));
14758 /* Special case of reversed high/low parts. */
14759 if (reg_overlap_mentioned_p (operands[2], destlo)
14760 && reg_overlap_mentioned_p (operands[1], desthi))
14762 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14763 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
14764 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14766 else if (!reg_overlap_mentioned_p (operands[2], destlo))
14768 /* Try to avoid unnecessary moves if part of the result
14769 is in the right place already. */
14770 if (src1 != dest)
14771 emit_move_insn (destlo, operands[1]);
14772 if (src2 != dest + halfregs)
14773 emit_move_insn (desthi, operands[2]);
14775 else
14777 if (src2 != dest + halfregs)
14778 emit_move_insn (desthi, operands[2]);
14779 if (src1 != dest)
14780 emit_move_insn (destlo, operands[1]);
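/* A minimal scalar model (illustrative only) of the three-EOR sequence used
   above for the reversed high/low case: the standard XOR-swap identity
   exchanges two registers without needing a scratch, which is exactly what
   the three gen_xorv16qi3 instructions do for the two vector halves.  */

static void
sketch_xor_swap (unsigned long long *a, unsigned long long *b)
{
  *a ^= *b;	/* a = a0 ^ b0 */
  *b ^= *a;	/* b = b0 ^ (a0 ^ b0) = a0 */
  *a ^= *b;	/* a = (a0 ^ b0) ^ a0 = b0 */
}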
14784 /* vec_perm support. */
14786 struct expand_vec_perm_d
14788 rtx target, op0, op1;
14789 vec_perm_indices perm;
14790 machine_mode vmode;
14791 unsigned int vec_flags;
14792 bool one_vector_p;
14793 bool testing_p;
14796 /* Generate a variable permutation. */
14798 static void
14799 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
14801 machine_mode vmode = GET_MODE (target);
14802 bool one_vector_p = rtx_equal_p (op0, op1);
14804 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
14805 gcc_checking_assert (GET_MODE (op0) == vmode);
14806 gcc_checking_assert (GET_MODE (op1) == vmode);
14807 gcc_checking_assert (GET_MODE (sel) == vmode);
14808 gcc_checking_assert (TARGET_SIMD);
14810 if (one_vector_p)
14812 if (vmode == V8QImode)
14814 /* Expand the argument to a V16QI mode by duplicating it. */
14815 rtx pair = gen_reg_rtx (V16QImode);
14816 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
14817 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14819 else
14821 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
14824 else
14826 rtx pair;
14828 if (vmode == V8QImode)
14830 pair = gen_reg_rtx (V16QImode);
14831 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
14832 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14834 else
14836 pair = gen_reg_rtx (OImode);
14837 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
14838 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
14843 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
14844 NELT is the number of elements in the vector. */
14846 void
14847 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
14848 unsigned int nelt)
14850 machine_mode vmode = GET_MODE (target);
14851 bool one_vector_p = rtx_equal_p (op0, op1);
14852 rtx mask;
14854 /* The TBL instruction does not use a modulo index, so we must take care
14855 of that ourselves. */
14856 mask = aarch64_simd_gen_const_vector_dup (vmode,
14857 one_vector_p ? nelt - 1 : 2 * nelt - 1);
14858 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
14860 /* For big-endian, we also need to reverse the index within the vector
14861 (but not which vector). */
14862 if (BYTES_BIG_ENDIAN)
14864 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
14865 if (!one_vector_p)
14866 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
14867 sel = expand_simple_binop (vmode, XOR, sel, mask,
14868 NULL, 0, OPTAB_LIB_WIDEN);
14870 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
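/* A minimal scalar sketch (illustrative only) of the selector adjustment
   performed above: each index is reduced modulo the number of selectable
   elements with an AND mask, and on big-endian it is additionally reversed
   within its input vector by XORing with NELT - 1.  */

static unsigned int
sketch_tbl_index (unsigned int index, unsigned int nelt,
                  int one_vector_p, int big_endian_p)
{
  unsigned int mask = one_vector_p ? nelt - 1 : 2 * nelt - 1;
  index &= mask;		/* TBL has no modulo wrap of its own.  */
  if (big_endian_p)
    index ^= nelt - 1;		/* Reverse the lane within its vector.  */
  return index;
}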
14873 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
14875 static void
14876 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
14878 emit_insn (gen_rtx_SET (target,
14879 gen_rtx_UNSPEC (GET_MODE (target),
14880 gen_rtvec (2, op0, op1), code)));
14883 /* Expand an SVE vec_perm with the given operands. */
14885 void
14886 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
14888 machine_mode data_mode = GET_MODE (target);
14889 machine_mode sel_mode = GET_MODE (sel);
14890 /* Enforced by the pattern condition. */
14891 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
14893 /* Note: vec_perm indices are supposed to wrap when they go beyond the
14894 size of the two value vectors, i.e. the upper bits of the indices
14895 are effectively ignored. SVE TBL instead produces 0 for any
14896 out-of-range indices, so we need to modulo all the vec_perm indices
14897 to ensure they are all in range. */
14898 rtx sel_reg = force_reg (sel_mode, sel);
14900 /* Check if the sel only references the first values vector. */
14901 if (GET_CODE (sel) == CONST_VECTOR
14902 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
14904 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
14905 return;
14908 /* Check if the two values vectors are the same. */
14909 if (rtx_equal_p (op0, op1))
14911 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
14912 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
14913 NULL, 0, OPTAB_DIRECT);
14914 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
14915 return;
14918 /* Run TBL on each value vector and combine the results. */
14920 rtx res0 = gen_reg_rtx (data_mode);
14921 rtx res1 = gen_reg_rtx (data_mode);
14922 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
14923 if (GET_CODE (sel) != CONST_VECTOR
14924 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
14926 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
14927 2 * nunits - 1);
14928 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
14929 NULL, 0, OPTAB_DIRECT);
14931 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
14932 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
14933 NULL, 0, OPTAB_DIRECT);
14934 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
14935 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
14936 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
14937 else
14938 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
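/* A minimal scalar model (illustrative only) of the general two-vector case
   above.  SVE TBL yields zero for out-of-range indices, so once the
   selector has been reduced modulo 2 * NUNITS, each lane can be formed as
   TBL (op0, i) | TBL (op1, i - NUNITS): exactly one of the two lookups is
   in range and the other contributes zero.  */

static unsigned int
sketch_sve_tbl_lane (const unsigned int *op0, const unsigned int *op1,
                     unsigned int nunits, unsigned int sel_elt)
{
  unsigned int i = sel_elt % (2 * nunits);
  unsigned int from_op0 = i < nunits ? op0[i] : 0;
  unsigned int from_op1 = i >= nunits ? op1[i - nunits] : 0;
  return from_op0 | from_op1;
}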
14941 /* Recognize patterns suitable for the TRN instructions. */
14942 static bool
14943 aarch64_evpc_trn (struct expand_vec_perm_d *d)
14945 HOST_WIDE_INT odd;
14946 poly_uint64 nelt = d->perm.length ();
14947 rtx out, in0, in1, x;
14948 machine_mode vmode = d->vmode;
14950 if (GET_MODE_UNIT_SIZE (vmode) > 8)
14951 return false;
14953 /* Note that these are little-endian tests.
14954 We correct for big-endian later. */
14955 if (!d->perm[0].is_constant (&odd)
14956 || (odd != 0 && odd != 1)
14957 || !d->perm.series_p (0, 2, odd, 2)
14958 || !d->perm.series_p (1, 2, nelt + odd, 2))
14959 return false;
14961 /* Success! */
14962 if (d->testing_p)
14963 return true;
14965 in0 = d->op0;
14966 in1 = d->op1;
14967 /* We don't need a big-endian lane correction for SVE; see the comment
14968 at the head of aarch64-sve.md for details. */
14969 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
14971 x = in0, in0 = in1, in1 = x;
14972 odd = !odd;
14974 out = d->target;
14976 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
14977 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
14978 return true;
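/* Illustrative only: the selector shape recognised above.  For NELT
   elements per vector, TRN1 is { 0, NELT, 2, NELT + 2, ... } and TRN2 is
   { 1, NELT + 1, 3, NELT + 3, ... }; e.g. for NELT == 4 these are
   { 0, 4, 2, 6 } and { 1, 5, 3, 7 }.  A scalar generator for the pattern
   checked by the series_p calls:  */

static unsigned int
sketch_trn_index (unsigned int i, unsigned int nelt, unsigned int odd)
{
  return (i & 1) ? nelt + odd + (i - 1) : odd + i;
}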
14981 /* Recognize patterns suitable for the UZP instructions. */
14982 static bool
14983 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
14985 HOST_WIDE_INT odd;
14986 rtx out, in0, in1, x;
14987 machine_mode vmode = d->vmode;
14989 if (GET_MODE_UNIT_SIZE (vmode) > 8)
14990 return false;
14992 /* Note that these are little-endian tests.
14993 We correct for big-endian later. */
14994 if (!d->perm[0].is_constant (&odd)
14995 || (odd != 0 && odd != 1)
14996 || !d->perm.series_p (0, 1, odd, 2))
14997 return false;
14999 /* Success! */
15000 if (d->testing_p)
15001 return true;
15003 in0 = d->op0;
15004 in1 = d->op1;
15005 /* We don't need a big-endian lane correction for SVE; see the comment
15006 at the head of aarch64-sve.md for details. */
15007 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15009 x = in0, in0 = in1, in1 = x;
15010 odd = !odd;
15012 out = d->target;
15014 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15015 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15016 return true;
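/* Illustrative only: the selector shape recognised above is simply
   { ODD, ODD + 2, ODD + 4, ... }, i.e. every second element; for
   NELT == 4, UZP1 is { 0, 2, 4, 6 } and UZP2 is { 1, 3, 5, 7 }.  */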
15019 /* Recognize patterns suitable for the ZIP instructions. */
15020 static bool
15021 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15023 unsigned int high;
15024 poly_uint64 nelt = d->perm.length ();
15025 rtx out, in0, in1, x;
15026 machine_mode vmode = d->vmode;
15028 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15029 return false;
15031 /* Note that these are little-endian tests.
15032 We correct for big-endian later. */
15033 poly_uint64 first = d->perm[0];
15034 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15035 || !d->perm.series_p (0, 2, first, 1)
15036 || !d->perm.series_p (1, 2, first + nelt, 1))
15037 return false;
15038 high = maybe_ne (first, 0U);
15040 /* Success! */
15041 if (d->testing_p)
15042 return true;
15044 in0 = d->op0;
15045 in1 = d->op1;
15046 /* We don't need a big-endian lane correction for SVE; see the comment
15047 at the head of aarch64-sve.md for details. */
15048 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15050 x = in0, in0 = in1, in1 = x;
15051 high = !high;
15053 out = d->target;
15055 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15056 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15057 return true;
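/* Illustrative only: with FIRST either 0 (ZIP1) or NELT / 2 (ZIP2), the
   selector shape recognised above is
   { FIRST, FIRST + NELT, FIRST + 1, FIRST + NELT + 1, ... }; e.g. for
   NELT == 4, ZIP1 is { 0, 4, 1, 5 } and ZIP2 is { 2, 6, 3, 7 }.  A scalar
   generator for the pattern:  */

static unsigned int
sketch_zip_index (unsigned int i, unsigned int nelt, unsigned int first)
{
  return first + i / 2 + ((i & 1) ? nelt : 0);
}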
15060 /* Recognize patterns for the EXT insn. */
15062 static bool
15063 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15065 HOST_WIDE_INT location;
15066 rtx offset;
15068 /* The first element always refers to the first vector.
15069 Check if the extracted indices are increasing by one. */
15070 if (d->vec_flags == VEC_SVE_PRED
15071 || !d->perm[0].is_constant (&location)
15072 || !d->perm.series_p (0, 1, location, 1))
15073 return false;
15075 /* Success! */
15076 if (d->testing_p)
15077 return true;
15079 /* The case where (location == 0) is a no-op for both big- and little-endian,
15080 and is removed by the mid-end at optimization levels -O1 and higher.
15082 We don't need a big-endian lane correction for SVE; see the comment
15083 at the head of aarch64-sve.md for details. */
15084 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15086 /* After setup, we want the high elements of the first vector (stored
15087 at the LSB end of the register), and the low elements of the second
15088 vector (stored at the MSB end of the register). So swap. */
15089 std::swap (d->op0, d->op1);
15090 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15091 to_constant () is safe since this is restricted to Advanced SIMD
15092 vectors. */
15093 location = d->perm.length ().to_constant () - location;
15096 offset = GEN_INT (location);
15097 emit_set_insn (d->target,
15098 gen_rtx_UNSPEC (d->vmode,
15099 gen_rtvec (3, d->op0, d->op1, offset),
15100 UNSPEC_EXT));
15101 return true;
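/* Illustrative only: the selector recognised above is a run of consecutive
   indices starting at LOCATION, taking the tail of the first vector
   followed by the head of the second; e.g. for NELT == 4 and LOCATION == 1
   the selector is { 1, 2, 3, 4 }.  */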
15104 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15105 within each 64-bit, 32-bit or 16-bit granule. */
15107 static bool
15108 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15110 HOST_WIDE_INT diff;
15111 unsigned int i, size, unspec;
15112 machine_mode pred_mode;
15114 if (d->vec_flags == VEC_SVE_PRED
15115 || !d->one_vector_p
15116 || !d->perm[0].is_constant (&diff))
15117 return false;
15119 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15120 if (size == 8)
15122 unspec = UNSPEC_REV64;
15123 pred_mode = VNx2BImode;
15125 else if (size == 4)
15127 unspec = UNSPEC_REV32;
15128 pred_mode = VNx4BImode;
15130 else if (size == 2)
15132 unspec = UNSPEC_REV16;
15133 pred_mode = VNx8BImode;
15135 else
15136 return false;
15138 unsigned int step = diff + 1;
15139 for (i = 0; i < step; ++i)
15140 if (!d->perm.series_p (i, step, diff - i, step))
15141 return false;
15143 /* Success! */
15144 if (d->testing_p)
15145 return true;
15147 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15148 if (d->vec_flags == VEC_SVE_DATA)
15150 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15151 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15152 UNSPEC_MERGE_PTRUE);
15154 emit_set_insn (d->target, src);
15155 return true;
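/* Illustrative only: for byte elements, REV32 (SIZE == 4) corresponds to
   the selector { 3, 2, 1, 0, 7, 6, 5, 4, ... }: DIFF is 3, STEP is 4, and
   each 4-byte granule is reversed in place.  */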
15158 /* Recognize patterns for the REV insn, which reverses elements within
15159 a full vector. */
15161 static bool
15162 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15164 poly_uint64 nelt = d->perm.length ();
15166 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15167 return false;
15169 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15170 return false;
15172 /* Success! */
15173 if (d->testing_p)
15174 return true;
15176 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15177 emit_set_insn (d->target, src);
15178 return true;
15181 static bool
15182 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15184 rtx out = d->target;
15185 rtx in0;
15186 HOST_WIDE_INT elt;
15187 machine_mode vmode = d->vmode;
15188 rtx lane;
15190 if (d->vec_flags == VEC_SVE_PRED
15191 || d->perm.encoding ().encoded_nelts () != 1
15192 || !d->perm[0].is_constant (&elt))
15193 return false;
15195 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15196 return false;
15198 /* Success! */
15199 if (d->testing_p)
15200 return true;
15202 /* The generic preparation in aarch64_expand_vec_perm_const_1
15203 swaps the operand order and the permute indices if it finds
15204 d->perm[0] to be in the second operand. Thus, we can always
15205 use d->op0 and need not do any extra arithmetic to get the
15206 correct lane number. */
15207 in0 = d->op0;
15208 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15210 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15211 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15212 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15213 return true;
15216 static bool
15217 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15219 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15220 machine_mode vmode = d->vmode;
15222 /* Make sure that the indices are constant. */
15223 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15224 for (unsigned int i = 0; i < encoded_nelts; ++i)
15225 if (!d->perm[i].is_constant ())
15226 return false;
15228 if (d->testing_p)
15229 return true;
15231 /* Generic code will try constant permutation twice. Once with the
15232 original mode and again with the elements lowered to QImode.
15233 So wait and don't do the selector expansion ourselves. */
15234 if (vmode != V8QImode && vmode != V16QImode)
15235 return false;
15237 /* to_constant is safe since this routine is specific to Advanced SIMD
15238 vectors. */
15239 unsigned int nelt = d->perm.length ().to_constant ();
15240 for (unsigned int i = 0; i < nelt; ++i)
15241 /* If big-endian and two vectors we end up with a weird mixed-endian
15242 mode on NEON. Reverse the index within each word but not the word
15243 itself. to_constant is safe because we checked is_constant above. */
15244 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15245 ? d->perm[i].to_constant () ^ (nelt - 1)
15246 : d->perm[i].to_constant ());
15248 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15249 sel = force_reg (vmode, sel);
15251 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15252 return true;
15255 /* Try to implement D using an SVE TBL instruction. */
15257 static bool
15258 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15260 unsigned HOST_WIDE_INT nelt;
15262 /* Permuting two variable-length vectors could overflow the
15263 index range. */
15264 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15265 return false;
15267 if (d->testing_p)
15268 return true;
15270 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15271 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15272 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15273 return true;
15276 static bool
15277 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15279 /* The pattern matching functions above are written to look for a small
15280 number to begin the sequence (0, 1, N/2). If we begin with an index
15281 from the second operand, we can swap the operands. */
15282 poly_int64 nelt = d->perm.length ();
15283 if (known_ge (d->perm[0], nelt))
15285 d->perm.rotate_inputs (1);
15286 std::swap (d->op0, d->op1);
15289 if ((d->vec_flags == VEC_ADVSIMD
15290 || d->vec_flags == VEC_SVE_DATA
15291 || d->vec_flags == VEC_SVE_PRED)
15292 && known_gt (nelt, 1))
15294 if (aarch64_evpc_rev_local (d))
15295 return true;
15296 else if (aarch64_evpc_rev_global (d))
15297 return true;
15298 else if (aarch64_evpc_ext (d))
15299 return true;
15300 else if (aarch64_evpc_dup (d))
15301 return true;
15302 else if (aarch64_evpc_zip (d))
15303 return true;
15304 else if (aarch64_evpc_uzp (d))
15305 return true;
15306 else if (aarch64_evpc_trn (d))
15307 return true;
15308 if (d->vec_flags == VEC_SVE_DATA)
15309 return aarch64_evpc_sve_tbl (d);
15310 else if (d->vec_flags == VEC_ADVSIMD)
15311 return aarch64_evpc_tbl (d);
15313 return false;
15316 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15318 static bool
15319 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15320 rtx op1, const vec_perm_indices &sel)
15322 struct expand_vec_perm_d d;
15324 /* Check whether the mask can be applied to a single vector. */
15325 if (op0 && rtx_equal_p (op0, op1))
15326 d.one_vector_p = true;
15327 else if (sel.all_from_input_p (0))
15329 d.one_vector_p = true;
15330 op1 = op0;
15332 else if (sel.all_from_input_p (1))
15334 d.one_vector_p = true;
15335 op0 = op1;
15337 else
15338 d.one_vector_p = false;
15340 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15341 sel.nelts_per_input ());
15342 d.vmode = vmode;
15343 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15344 d.target = target;
15345 d.op0 = op0;
15346 d.op1 = op1;
15347 d.testing_p = !target;
15349 if (!d.testing_p)
15350 return aarch64_expand_vec_perm_const_1 (&d);
15352 rtx_insn *last = get_last_insn ();
15353 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15354 gcc_assert (last == get_last_insn ());
15356 return ret;
15359 /* Generate a byte permute mask for a register of mode MODE,
15360 which has NUNITS units. */
15363 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15365 /* We have to reverse each vector because we don't have
15366 a permuted load that can reverse-load according to ABI rules. */
15367 rtx mask;
15368 rtvec v = rtvec_alloc (16);
15369 unsigned int i, j;
15370 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15372 gcc_assert (BYTES_BIG_ENDIAN);
15373 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15375 for (i = 0; i < nunits; i++)
15376 for (j = 0; j < usize; j++)
15377 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15378 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15379 return force_reg (V16QImode, mask);
15382 /* Return true if X is a valid second operand for the SVE instruction
15383 that implements integer comparison OP_CODE. */
15385 static bool
15386 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15388 if (register_operand (x, VOIDmode))
15389 return true;
15391 switch (op_code)
15393 case LTU:
15394 case LEU:
15395 case GEU:
15396 case GTU:
15397 return aarch64_sve_cmp_immediate_p (x, false);
15398 case LT:
15399 case LE:
15400 case GE:
15401 case GT:
15402 case NE:
15403 case EQ:
15404 return aarch64_sve_cmp_immediate_p (x, true);
15405 default:
15406 gcc_unreachable ();
15410 /* Return the UNSPEC_COND_* code for comparison CODE. */
15412 static unsigned int
15413 aarch64_unspec_cond_code (rtx_code code)
15415 switch (code)
15417 case NE:
15418 return UNSPEC_COND_NE;
15419 case EQ:
15420 return UNSPEC_COND_EQ;
15421 case LT:
15422 return UNSPEC_COND_LT;
15423 case GT:
15424 return UNSPEC_COND_GT;
15425 case LE:
15426 return UNSPEC_COND_LE;
15427 case GE:
15428 return UNSPEC_COND_GE;
15429 case LTU:
15430 return UNSPEC_COND_LO;
15431 case GTU:
15432 return UNSPEC_COND_HI;
15433 case LEU:
15434 return UNSPEC_COND_LS;
15435 case GEU:
15436 return UNSPEC_COND_HS;
15437 case UNORDERED:
15438 return UNSPEC_COND_UO;
15439 default:
15440 gcc_unreachable ();
15444 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15445 where <X> is the operation associated with comparison CODE. */
15447 static rtx
15448 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15449 rtx pred, rtx op0, rtx op1)
15451 rtvec vec = gen_rtvec (3, pred, op0, op1);
15452 return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15455 /* Expand an SVE integer comparison:
15457 TARGET = CODE (OP0, OP1). */
15459 void
15460 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15462 machine_mode pred_mode = GET_MODE (target);
15463 machine_mode data_mode = GET_MODE (op0);
15465 if (!aarch64_sve_cmp_operand_p (code, op1))
15466 op1 = force_reg (data_mode, op1);
15468 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15469 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15470 emit_insn (gen_set_clobber_cc (target, unspec));
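/* For example (illustrative only, register allocation will vary), a signed
   greater-than comparison of two VNx4SImode vectors expands to an
   UNSPEC_COND_GT governed by an all-true predicate and is eventually
   emitted as something like:

	ptrue	p1.s
	cmpgt	p0.s, p1/z, z0.s, z1.s

   The immediate form of the compare is used instead when OP1 is an
   immediate accepted by aarch64_sve_cmp_operand_p.  */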
15473 /* Emit an instruction:
15475 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15477 where <X> is the operation associated with comparison CODE. */
15479 static void
15480 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15481 rtx pred, rtx op0, rtx op1)
15483 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15484 emit_set_insn (target, unspec);
15487 /* Emit:
15489 (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15490 (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15491 (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15493 where <Xi> is the operation associated with comparison CODEi. */
15495 static void
15496 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15497 machine_mode pred_mode, rtx ptrue,
15498 rtx op0, rtx op1)
15500 rtx tmp1 = gen_reg_rtx (pred_mode);
15501 aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15502 rtx tmp2 = gen_reg_rtx (pred_mode);
15503 aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15504 emit_set_insn (target, gen_rtx_AND (pred_mode,
15505 gen_rtx_IOR (pred_mode, tmp1, tmp2),
15506 ptrue));
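/* A rough sketch of how this is used for an LTGT comparison of two
   VNx4SFmode vectors (illustrative assembly, not verbatim output):

	fcmgt	p1.s, p0/z, z1.s, z0.s	// OP0 < OP1
	fcmgt	p2.s, p0/z, z0.s, z1.s	// OP0 > OP1
	orr	p3.b, p0/z, p1.b, p2.b	// (TMP1 | TMP2) & PTRUE

   where p0 holds the all-true predicate PTRUE.  */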
15509 /* If CAN_INVERT_P, emit an instruction:
15511 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15513 where <X> is the operation associated with comparison CODE. Otherwise
15514 emit:
15516 (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15517 (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15519 where the second instruction sets TARGET to the inverse of TMP. */
15521 static void
15522 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15523 machine_mode pred_mode, rtx ptrue, rtx pred,
15524 rtx op0, rtx op1, bool can_invert_p)
15526 if (can_invert_p)
15527 aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15528 else
15530 rtx tmp = gen_reg_rtx (pred_mode);
15531 aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15532 emit_set_insn (target, gen_rtx_AND (pred_mode,
15533 gen_rtx_NOT (pred_mode, tmp),
15534 ptrue));
15538 /* Expand an SVE floating-point comparison:
15540 TARGET = CODE (OP0, OP1)
15542 If CAN_INVERT_P is true, the caller can also handle inverted results;
15543 return true if the result is in fact inverted. */
15545 bool
15546 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15547 rtx op0, rtx op1, bool can_invert_p)
15549 machine_mode pred_mode = GET_MODE (target);
15550 machine_mode data_mode = GET_MODE (op0);
15552 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15553 switch (code)
15555 case UNORDERED:
15556 /* UNORDERED has no immediate form. */
15557 op1 = force_reg (data_mode, op1);
15558 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15559 return false;
15561 case LT:
15562 case LE:
15563 case GT:
15564 case GE:
15565 case EQ:
15566 case NE:
15567 /* There is native support for the comparison. */
15568 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15569 return false;
15571 case ORDERED:
15572 /* There is native support for the inverse comparison. */
15573 op1 = force_reg (data_mode, op1);
15574 aarch64_emit_inverted_unspec_cond (target, UNORDERED,
15575 pred_mode, ptrue, ptrue, op0, op1,
15576 can_invert_p);
15577 return can_invert_p;
15579 case LTGT:
15580 /* This is a trapping operation (LT or GT). */
15581 aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
15582 return false;
15584 case UNEQ:
15585 if (!flag_trapping_math)
15587 /* This would trap for signaling NaNs. */
15588 op1 = force_reg (data_mode, op1);
15589 aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
15590 pred_mode, ptrue, op0, op1);
15591 return false;
15593 /* fall through */
15595 case UNLT:
15596 case UNLE:
15597 case UNGT:
15598 case UNGE:
15600 rtx ordered = ptrue;
15601 if (flag_trapping_math)
15603 /* Only compare the elements that are known to be ordered. */
15604 ordered = gen_reg_rtx (pred_mode);
15605 op1 = force_reg (data_mode, op1);
15606 aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
15607 ptrue, ptrue, op0, op1, false);
15609 if (code == UNEQ)
15610 code = NE;
15611 else
15612 code = reverse_condition_maybe_unordered (code);
15613 aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
15614 ordered, op0, op1, can_invert_p);
15615 return can_invert_p;
15618 default:
15619 gcc_unreachable ();
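/* As a sketch of the trickiest path above: for UNGE with
   flag_trapping_math, and assuming CAN_INVERT_P, the expansion is
   roughly (illustrative assembly):

	fcmuo	p1.s, p0/z, z0.s, z1.s	// unordered elements
	not	p2.b, p0/z, p1.b	// ordered elements
	fcmgt	p3.s, p2/z, z1.s, z0.s	// ordered && (OP0 < OP1)

   and the caller then uses the inverse of p3, which is exactly
   UNGE (OP0, OP1).  */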
15623 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15624 of the data being selected and CMP_MODE is the mode of the values being
15625 compared. */
15627 void
15628 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15629 rtx *ops)
15631 machine_mode pred_mode
15632 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15633 GET_MODE_SIZE (cmp_mode)).require ();
15634 rtx pred = gen_reg_rtx (pred_mode);
15635 if (FLOAT_MODE_P (cmp_mode))
15637 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15638 ops[4], ops[5], true))
15639 std::swap (ops[1], ops[2]);
15641 else
15642 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15644 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15645 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
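/* For example (a sketch only): for a VNx4SImode vcond such as
   res = (a < b) ? c : d, the comparison above produces a predicate
   register and the UNSPEC_SEL becomes something like:

	cmpgt	p0.s, p1/z, z1.s, z0.s	// a < b
	sel	z4.s, p0, z2.s, z3.s	// res = p0 ? c : d

   If the float comparison had to be inverted, ops[1] and ops[2] have
   already been swapped above to compensate.  */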
15648 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15649 true. However, due to issues with register allocation it is preferable
15650 to avoid tying integer scalar and FP scalar modes. Executing integer
15651 operations in general registers is better than treating them as scalar
15652 vector operations. This reduces latency and avoids redundant int<->FP
15653 moves. So tie modes if they are either the same class, or vector modes
15654 with other vector modes, vector structs or any scalar mode. */
15656 static bool
15657 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15659 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15660 return true;
15662 /* We specifically want to allow elements of "structure" modes to
15663 be tieable to the structure. This more general condition allows
15664 other rarer situations too. The reason we don't extend this to
15665 predicate modes is that there are no predicate structure modes
15666 nor any specific instructions for extracting part of a predicate
15667 register. */
15668 if (aarch64_vector_data_mode_p (mode1)
15669 && aarch64_vector_data_mode_p (mode2))
15670 return true;
15672 /* Also allow any scalar modes with vectors. */
15673 if (aarch64_vector_mode_supported_p (mode1)
15674 || aarch64_vector_mode_supported_p (mode2))
15675 return true;
15677 return false;
15680 /* Return a new RTX holding the result of moving POINTER forward by
15681 AMOUNT bytes. */
15683 static rtx
15684 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15686 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15688 return adjust_automodify_address (pointer, GET_MODE (pointer),
15689 next, amount);
15692 /* Return a new RTX holding the result of moving POINTER forward by the
15693 size of the mode it points to. */
15695 static rtx
15696 aarch64_progress_pointer (rtx pointer)
15698 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15701 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15702 MODE bytes. */
15704 static void
15705 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15706 machine_mode mode)
15708 rtx reg = gen_reg_rtx (mode);
15710 /* "Cast" the pointers to the correct mode. */
15711 *src = adjust_address (*src, mode, 0);
15712 *dst = adjust_address (*dst, mode, 0);
15713 /* Emit the memcpy. */
15714 emit_move_insn (reg, *src);
15715 emit_move_insn (*dst, reg);
15716 /* Move the pointers forward. */
15717 *src = aarch64_progress_pointer (*src);
15718 *dst = aarch64_progress_pointer (*dst);
15721 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15722 we succeed, otherwise return false. */
15724 bool
15725 aarch64_expand_movmem (rtx *operands)
15727 unsigned int n;
15728 rtx dst = operands[0];
15729 rtx src = operands[1];
15730 rtx base;
15731 bool speed_p = !optimize_function_for_size_p (cfun);
15733 /* When optimizing for size, give a better estimate of the length of a
15734 memcpy call, but use the default otherwise. */
15735 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
15737 /* We can't do anything smart if the amount to copy is not constant. */
15738 if (!CONST_INT_P (operands[2]))
15739 return false;
15741 n = UINTVAL (operands[2]);
15743 /* Try to keep the number of instructions low. For cases below 16 bytes we
15744 need to make at most two moves. For cases above 16 bytes it will be one
15745 move for each 16-byte chunk, then at most two additional moves. */
15746 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
15747 return false;
15749 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
15750 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
15752 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
15753 src = adjust_automodify_address (src, VOIDmode, base, 0);
15755 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
15756 1-byte chunk. */
15757 if (n < 4)
15759 if (n >= 2)
15761 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
15762 n -= 2;
15765 if (n == 1)
15766 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
15768 return true;
15771 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
15772 4-byte chunk, partially overlapping with the previously copied chunk. */
15773 if (n < 8)
15775 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15776 n -= 4;
15777 if (n > 0)
15779 int move = n - 4;
15781 src = aarch64_move_pointer (src, move);
15782 dst = aarch64_move_pointer (dst, move);
15783 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15785 return true;
15788 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
15789 them, then (if applicable) an 8-byte chunk. */
15790 while (n >= 8)
15792 if (n / 16)
15794 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
15795 n -= 16;
15797 else
15799 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
15800 n -= 8;
15804 /* Finish the final bytes of the copy. We can always do this in one
15805 instruction. We either copy the exact amount we need, or partially
15806 overlap with the previous chunk we copied and copy 8 bytes. */
15807 if (n == 0)
15808 return true;
15809 else if (n == 1)
15810 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
15811 else if (n == 2)
15812 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
15813 else if (n == 4)
15814 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15815 else
15817 if (n == 3)
15819 src = aarch64_move_pointer (src, -1);
15820 dst = aarch64_move_pointer (dst, -1);
15821 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15823 else
15825 int move = n - 8;
15827 src = aarch64_move_pointer (src, move);
15828 dst = aarch64_move_pointer (dst, move);
15829 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
15833 return true;
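/* As a worked example of the expansion above (illustrative only): for a
   constant 23-byte copy, the heuristic count is 23/16 + 2 == 3, which is
   within the budget, so we emit one 16-byte (TImode) copy for bytes 0-15
   followed by an overlapping 8-byte (DImode) copy for bytes 15-22,
   i.e. roughly:

	ldr	q0, [x1]
	str	q0, [x0]
	ldur	x2, [x1, 15]
	stur	x2, [x0, 15]
  */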
15836 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
15837 SImode stores. Handle the case when the constant has identical
15838 bottom and top halves. This is beneficial when the two stores can be
15839 merged into an STP and we avoid synthesising potentially expensive
15840 immediates twice. Return true if such a split is possible. */
15842 bool
15843 aarch64_split_dimode_const_store (rtx dst, rtx src)
15845 rtx lo = gen_lowpart (SImode, src);
15846 rtx hi = gen_highpart_mode (SImode, DImode, src);
15848 bool size_p = optimize_function_for_size_p (cfun);
15850 if (!rtx_equal_p (lo, hi))
15851 return false;
15853 unsigned int orig_cost
15854 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
15855 unsigned int lo_cost
15856 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
15858 /* We want to transform:
15859 MOV x1, 49370
15860 MOVK x1, 0x140, lsl 16
15861 MOVK x1, 0xc0da, lsl 32
15862 MOVK x1, 0x140, lsl 48
15863 STR x1, [x0]
15864 into:
15865 MOV w1, 49370
15866 MOVK w1, 0x140, lsl 16
15867 STP w1, w1, [x0]
15868 So we want to perform this only when we save two instructions
15869 or more. When optimizing for size, however, accept any code size
15870 savings we can. */
15871 if (size_p && orig_cost <= lo_cost)
15872 return false;
15874 if (!size_p
15875 && (orig_cost <= lo_cost + 1))
15876 return false;
15878 rtx mem_lo = adjust_address (dst, SImode, 0);
15879 if (!aarch64_mem_pair_operand (mem_lo, SImode))
15880 return false;
15882 rtx tmp_reg = gen_reg_rtx (SImode);
15883 aarch64_expand_mov_immediate (tmp_reg, lo);
15884 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
15885 /* Don't emit an explicit store pair as this may not always be profitable.
15886 Let the sched-fusion logic decide whether to merge them. */
15887 emit_move_insn (mem_lo, tmp_reg);
15888 emit_move_insn (mem_hi, tmp_reg);
15890 return true;
15893 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
15895 static unsigned HOST_WIDE_INT
15896 aarch64_asan_shadow_offset (void)
15898 return (HOST_WIDE_INT_1 << 36);
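/* Implement TARGET_GEN_CCMP_FIRST. Generate the first comparison of a
   conditional-compare chain for TREEOP0 <CODE> TREEOP1, putting the
   preparation statements in *PREP_SEQ and the comparison itself in
   *GEN_SEQ. Return the CC-register comparison rtx on success, or
   NULL_RTX if the comparison cannot be handled. */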
15901 static rtx
15902 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
15903 int code, tree treeop0, tree treeop1)
15905 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
15906 rtx op0, op1;
15907 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
15908 insn_code icode;
15909 struct expand_operand ops[4];
15911 start_sequence ();
15912 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
15914 op_mode = GET_MODE (op0);
15915 if (op_mode == VOIDmode)
15916 op_mode = GET_MODE (op1);
15918 switch (op_mode)
15920 case E_QImode:
15921 case E_HImode:
15922 case E_SImode:
15923 cmp_mode = SImode;
15924 icode = CODE_FOR_cmpsi;
15925 break;
15927 case E_DImode:
15928 cmp_mode = DImode;
15929 icode = CODE_FOR_cmpdi;
15930 break;
15932 case E_SFmode:
15933 cmp_mode = SFmode;
15934 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
15935 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
15936 break;
15938 case E_DFmode:
15939 cmp_mode = DFmode;
15940 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
15941 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
15942 break;
15944 default:
15945 end_sequence ();
15946 return NULL_RTX;
15949 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
15950 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
15951 if (!op0 || !op1)
15953 end_sequence ();
15954 return NULL_RTX;
15956 *prep_seq = get_insns ();
15957 end_sequence ();
15959 create_fixed_operand (&ops[0], op0);
15960 create_fixed_operand (&ops[1], op1);
15962 start_sequence ();
15963 if (!maybe_expand_insn (icode, 2, ops))
15965 end_sequence ();
15966 return NULL_RTX;
15968 *gen_seq = get_insns ();
15969 end_sequence ();
15971 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
15972 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
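/* Implement TARGET_GEN_CCMP_NEXT. Generate a conditional compare that
   continues the chain started by PREV, comparing TREEOP0 <CMP_CODE>
   TREEOP1 and combining the result with PREV according to BIT_CODE
   (AND or IOR). As above, preparation statements go in *PREP_SEQ and
   the compare itself in *GEN_SEQ; return the new CC comparison rtx,
   or NULL_RTX on failure. */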
15975 static rtx
15976 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
15977 int cmp_code, tree treeop0, tree treeop1, int bit_code)
15979 rtx op0, op1, target;
15980 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
15981 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
15982 insn_code icode;
15983 struct expand_operand ops[6];
15984 int aarch64_cond;
15986 push_to_sequence (*prep_seq);
15987 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
15989 op_mode = GET_MODE (op0);
15990 if (op_mode == VOIDmode)
15991 op_mode = GET_MODE (op1);
15993 switch (op_mode)
15995 case E_QImode:
15996 case E_HImode:
15997 case E_SImode:
15998 cmp_mode = SImode;
15999 icode = CODE_FOR_ccmpsi;
16000 break;
16002 case E_DImode:
16003 cmp_mode = DImode;
16004 icode = CODE_FOR_ccmpdi;
16005 break;
16007 case E_SFmode:
16008 cmp_mode = SFmode;
16009 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16010 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16011 break;
16013 case E_DFmode:
16014 cmp_mode = DFmode;
16015 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16016 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16017 break;
16019 default:
16020 end_sequence ();
16021 return NULL_RTX;
16024 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16025 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16026 if (!op0 || !op1)
16028 end_sequence ();
16029 return NULL_RTX;
16031 *prep_seq = get_insns ();
16032 end_sequence ();
16034 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16035 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16037 if (bit_code != AND)
16039 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16040 GET_MODE (XEXP (prev, 0))),
16041 VOIDmode, XEXP (prev, 0), const0_rtx);
16042 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16045 create_fixed_operand (&ops[0], XEXP (prev, 0));
16046 create_fixed_operand (&ops[1], target);
16047 create_fixed_operand (&ops[2], op0);
16048 create_fixed_operand (&ops[3], op1);
16049 create_fixed_operand (&ops[4], prev);
16050 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16052 push_to_sequence (*gen_seq);
16053 if (!maybe_expand_insn (icode, 6, ops))
16055 end_sequence ();
16056 return NULL_RTX;
16059 *gen_seq = get_insns ();
16060 end_sequence ();
16062 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
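/* For example (an illustrative sketch, not verbatim output), a condition
   such as "a == 17 && b > 5" can be expanded through these two hooks
   into a compare followed by a conditional compare:

	cmp	w0, 17
	ccmp	w1, 5, 4, eq
	b.gt	.Ltaken

   where the CCMP only performs the second comparison if the first one
   succeeded, and otherwise sets the flags to the immediate NZCV value
   (here 4, i.e. "greater than" is false).  */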
16065 #undef TARGET_GEN_CCMP_FIRST
16066 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16068 #undef TARGET_GEN_CCMP_NEXT
16069 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16071 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16072 instruction fusion of some sort. */
16074 static bool
16075 aarch64_macro_fusion_p (void)
16077 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16081 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16082 should be kept together during scheduling. */
16084 static bool
16085 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16087 rtx set_dest;
16088 rtx prev_set = single_set (prev);
16089 rtx curr_set = single_set (curr);
16090 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
16091 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16093 if (!aarch64_macro_fusion_p ())
16094 return false;
16096 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16098 /* We are trying to match:
16099 prev (mov) == (set (reg r0) (const_int imm16))
16100 curr (movk) == (set (zero_extract (reg r0)
16101 (const_int 16)
16102 (const_int 16))
16103 (const_int imm16_1)) */
16105 set_dest = SET_DEST (curr_set);
16107 if (GET_CODE (set_dest) == ZERO_EXTRACT
16108 && CONST_INT_P (SET_SRC (curr_set))
16109 && CONST_INT_P (SET_SRC (prev_set))
16110 && CONST_INT_P (XEXP (set_dest, 2))
16111 && INTVAL (XEXP (set_dest, 2)) == 16
16112 && REG_P (XEXP (set_dest, 0))
16113 && REG_P (SET_DEST (prev_set))
16114 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16116 return true;
16120 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16123 /* We're trying to match:
16124 prev (adrp) == (set (reg r1)
16125 (high (symbol_ref ("SYM"))))
16126 curr (add) == (set (reg r0)
16127 (lo_sum (reg r1)
16128 (symbol_ref ("SYM"))))
16129 Note that r0 need not necessarily be the same as r1, especially
16130 during pre-regalloc scheduling. */
16132 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16133 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16135 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16136 && REG_P (XEXP (SET_SRC (curr_set), 0))
16137 && REGNO (XEXP (SET_SRC (curr_set), 0))
16138 == REGNO (SET_DEST (prev_set))
16139 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16140 XEXP (SET_SRC (curr_set), 1)))
16141 return true;
16145 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16148 /* We're trying to match:
16149 prev (movk) == (set (zero_extract (reg r0)
16150 (const_int 16)
16151 (const_int 32))
16152 (const_int imm16_1))
16153 curr (movk) == (set (zero_extract (reg r0)
16154 (const_int 16)
16155 (const_int 48))
16156 (const_int imm16_2)) */
16158 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16159 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16160 && REG_P (XEXP (SET_DEST (prev_set), 0))
16161 && REG_P (XEXP (SET_DEST (curr_set), 0))
16162 && REGNO (XEXP (SET_DEST (prev_set), 0))
16163 == REGNO (XEXP (SET_DEST (curr_set), 0))
16164 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16165 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16166 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16167 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16168 && CONST_INT_P (SET_SRC (prev_set))
16169 && CONST_INT_P (SET_SRC (curr_set)))
16170 return true;
16173 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16175 /* We're trying to match:
16176 prev (adrp) == (set (reg r0)
16177 (high (symbol_ref ("SYM"))))
16178 curr (ldr) == (set (reg r1)
16179 (mem (lo_sum (reg r0)
16180 (symbol_ref ("SYM")))))
16182 curr (ldr) == (set (reg r1)
16183 (zero_extend (mem
16184 (lo_sum (reg r0)
16185 (symbol_ref ("SYM")))))) */
16186 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16187 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16189 rtx curr_src = SET_SRC (curr_set);
16191 if (GET_CODE (curr_src) == ZERO_EXTEND)
16192 curr_src = XEXP (curr_src, 0);
16194 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16195 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16196 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16197 == REGNO (SET_DEST (prev_set))
16198 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16199 XEXP (SET_SRC (prev_set), 0)))
16200 return true;
16204 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16205 && aarch_crypto_can_dual_issue (prev, curr))
16206 return true;
16208 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16209 && any_condjump_p (curr))
16211 enum attr_type prev_type = get_attr_type (prev);
16213 unsigned int condreg1, condreg2;
16214 rtx cc_reg_1;
16215 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16216 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16218 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16219 && prev
16220 && modified_in_p (cc_reg_1, prev))
16222 /* FIXME: this misses some instructions that are considered simple
16223 arithmetic for ThunderX. Simple shifts are missed here. */
16224 if (prev_type == TYPE_ALUS_SREG
16225 || prev_type == TYPE_ALUS_IMM
16226 || prev_type == TYPE_LOGICS_REG
16227 || prev_type == TYPE_LOGICS_IMM)
16228 return true;
16232 if (prev_set
16233 && curr_set
16234 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16235 && any_condjump_p (curr))
16237 /* We're trying to match:
16238 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16239 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16240 (const_int 0))
16241 (label_ref ("SYM"))
16242 (pc)) */
16243 if (SET_DEST (curr_set) == (pc_rtx)
16244 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16245 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16246 && REG_P (SET_DEST (prev_set))
16247 && REGNO (SET_DEST (prev_set))
16248 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16250 /* Fuse ALU operations followed by conditional branch instruction. */
16251 switch (get_attr_type (prev))
16253 case TYPE_ALU_IMM:
16254 case TYPE_ALU_SREG:
16255 case TYPE_ADC_REG:
16256 case TYPE_ADC_IMM:
16257 case TYPE_ADCS_REG:
16258 case TYPE_ADCS_IMM:
16259 case TYPE_LOGIC_REG:
16260 case TYPE_LOGIC_IMM:
16261 case TYPE_CSEL:
16262 case TYPE_ADR:
16263 case TYPE_MOV_IMM:
16264 case TYPE_SHIFT_REG:
16265 case TYPE_SHIFT_IMM:
16266 case TYPE_BFM:
16267 case TYPE_RBIT:
16268 case TYPE_REV:
16269 case TYPE_EXTEND:
16270 return true;
16272 default:;
16277 return false;
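/* Two of the pairs recognised above, in (illustrative) assembly form:

	mov	x0, 16384		// AARCH64_FUSE_MOV_MOVK
	movk	x0, 0x1234, lsl 16

	adrp	x1, some_symbol		// AARCH64_FUSE_ADRP_ADD
	add	x1, x1, :lo12:some_symbol

   Keeping such pairs adjacent lets cores that implement the fusion
   execute them as a single macro-op.  */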
16280 /* Return true iff the instruction fusion described by OP is enabled. */
16282 bool
16283 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16285 return (aarch64_tune_params.fusible_ops & op) != 0;
16288 /* If MEM is in the form of [base+offset], extract the two parts
16289 of the address and store them in BASE and OFFSET; otherwise return
16290 false after clearing BASE and OFFSET. */
16292 bool
16293 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16295 rtx addr;
16297 gcc_assert (MEM_P (mem));
16299 addr = XEXP (mem, 0);
16301 if (REG_P (addr))
16303 *base = addr;
16304 *offset = const0_rtx;
16305 return true;
16308 if (GET_CODE (addr) == PLUS
16309 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16311 *base = XEXP (addr, 0);
16312 *offset = XEXP (addr, 1);
16313 return true;
16316 *base = NULL_RTX;
16317 *offset = NULL_RTX;
16319 return false;
16322 /* Types for scheduling fusion. */
16323 enum sched_fusion_type
16325 SCHED_FUSION_NONE = 0,
16326 SCHED_FUSION_LD_SIGN_EXTEND,
16327 SCHED_FUSION_LD_ZERO_EXTEND,
16328 SCHED_FUSION_LD,
16329 SCHED_FUSION_ST,
16330 SCHED_FUSION_NUM
16333 /* If INSN is a load or store whose address is in the form [base+offset],
16334 extract the two parts into BASE and OFFSET. Return the scheduling
16335 fusion type of this INSN. */
16337 static enum sched_fusion_type
16338 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16340 rtx x, dest, src;
16341 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16343 gcc_assert (INSN_P (insn));
16344 x = PATTERN (insn);
16345 if (GET_CODE (x) != SET)
16346 return SCHED_FUSION_NONE;
16348 src = SET_SRC (x);
16349 dest = SET_DEST (x);
16351 machine_mode dest_mode = GET_MODE (dest);
16353 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16354 return SCHED_FUSION_NONE;
16356 if (GET_CODE (src) == SIGN_EXTEND)
16358 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16359 src = XEXP (src, 0);
16360 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16361 return SCHED_FUSION_NONE;
16363 else if (GET_CODE (src) == ZERO_EXTEND)
16365 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16366 src = XEXP (src, 0);
16367 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16368 return SCHED_FUSION_NONE;
16371 if (GET_CODE (src) == MEM && REG_P (dest))
16372 extract_base_offset_in_addr (src, base, offset);
16373 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16375 fusion = SCHED_FUSION_ST;
16376 extract_base_offset_in_addr (dest, base, offset);
16378 else
16379 return SCHED_FUSION_NONE;
16381 if (*base == NULL_RTX || *offset == NULL_RTX)
16382 fusion = SCHED_FUSION_NONE;
16384 return fusion;
16387 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16389 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16390 and PRI are only calculated for these instructions. For other instructions,
16391 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16392 types of instruction fusion can be added by returning different priorities.
16394 It's important that irrelevant instructions get the largest FUSION_PRI. */
16396 static void
16397 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16398 int *fusion_pri, int *pri)
16400 int tmp, off_val;
16401 rtx base, offset;
16402 enum sched_fusion_type fusion;
16404 gcc_assert (INSN_P (insn));
16406 tmp = max_pri - 1;
16407 fusion = fusion_load_store (insn, &base, &offset);
16408 if (fusion == SCHED_FUSION_NONE)
16410 *pri = tmp;
16411 *fusion_pri = tmp;
16412 return;
16415 /* Set FUSION_PRI according to fusion type and base register. */
16416 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16418 /* Calculate PRI. */
16419 tmp /= 2;
16421 /* INSN with smaller offset goes first. */
16422 off_val = (int)(INTVAL (offset));
16423 if (off_val >= 0)
16424 tmp -= (off_val & 0xfffff);
16425 else
16426 tmp += ((- off_val) & 0xfffff);
16428 *pri = tmp;
16429 return;
16432 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16433 Adjust priority of sha1h instructions so they are scheduled before
16434 other SHA1 instructions. */
16436 static int
16437 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16439 rtx x = PATTERN (insn);
16441 if (GET_CODE (x) == SET)
16443 x = SET_SRC (x);
16445 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16446 return priority + 10;
16449 return priority;
16452 /* Given OPERANDS of consecutive load/store, check if we can merge
16453 them into ldp/stp. LOAD is true if they are load instructions.
16454 MODE is the mode of memory operands. */
16456 bool
16457 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16458 machine_mode mode)
16460 HOST_WIDE_INT offval_1, offval_2, msize;
16461 enum reg_class rclass_1, rclass_2;
16462 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16464 if (load)
16466 mem_1 = operands[1];
16467 mem_2 = operands[3];
16468 reg_1 = operands[0];
16469 reg_2 = operands[2];
16470 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16471 if (REGNO (reg_1) == REGNO (reg_2))
16472 return false;
16474 else
16476 mem_1 = operands[0];
16477 mem_2 = operands[2];
16478 reg_1 = operands[1];
16479 reg_2 = operands[3];
16482 /* The mems cannot be volatile. */
16483 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16484 return false;
16486 /* If we have SImode and slow unaligned ldp,
16487 check that the alignment is at least 8 bytes. */
16488 if (mode == SImode
16489 && (aarch64_tune_params.extra_tuning_flags
16490 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16491 && !optimize_size
16492 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16493 return false;
16495 /* Check if the addresses are in the form of [base+offset]. */
16496 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16497 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16498 return false;
16499 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16500 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16501 return false;
16503 /* Check if the bases are same. */
16504 if (!rtx_equal_p (base_1, base_2))
16505 return false;
16507 offval_1 = INTVAL (offset_1);
16508 offval_2 = INTVAL (offset_2);
16509 /* We should only be trying this for fixed-sized modes. There is no
16510 SVE LDP/STP instruction. */
16511 msize = GET_MODE_SIZE (mode).to_constant ();
16512 /* Check if the offsets are consecutive. */
16513 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16514 return false;
16516 /* Check if the addresses are clobbered by load. */
16517 if (load)
16519 if (reg_mentioned_p (reg_1, mem_1))
16520 return false;
16522 /* In increasing order, the last load can clobber the address. */
16523 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16524 return false;
16527 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16528 rclass_1 = FP_REGS;
16529 else
16530 rclass_1 = GENERAL_REGS;
16532 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16533 rclass_2 = FP_REGS;
16534 else
16535 rclass_2 = GENERAL_REGS;
16537 /* Check if the registers are of same class. */
16538 if (rclass_1 != rclass_2)
16539 return false;
16541 return true;
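/* For example (illustrative), the checks above allow

	ldr	w0, [x2]
	ldr	w1, [x2, 4]

   to be merged into

	ldp	w0, w1, [x2]

   but reject the pair if the first destination register also appears in
   the address, or if the two destinations are in different register
   classes (one general, one FP).  */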
16544 /* Given OPERANDS of consecutive load/store, check if we can merge
16545 them into ldp/stp by adjusting the offset. LOAD is true if they
16546 are load instructions. MODE is the mode of memory operands.
16548 Given the following consecutive stores:
16550 str w1, [xb, 0x100]
16551 str w1, [xb, 0x104]
16552 str w1, [xb, 0x108]
16553 str w1, [xb, 0x10c]
16555 Though the offsets are out of the range supported by stp, we can
16556 still pair them after adjusting the offset, like:
16558 add scratch, xb, 0x100
16559 stp w1, w1, [scratch]
16560 stp w1, w1, [scratch, 0x8]
16562 The peephole patterns detecting this opportunity should guarantee
16563 the scratch register is available. */
16565 bool
16566 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16567 scalar_mode mode)
16569 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16570 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16571 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16572 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16574 if (load)
16576 reg_1 = operands[0];
16577 mem_1 = operands[1];
16578 reg_2 = operands[2];
16579 mem_2 = operands[3];
16580 reg_3 = operands[4];
16581 mem_3 = operands[5];
16582 reg_4 = operands[6];
16583 mem_4 = operands[7];
16584 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16585 && REG_P (reg_3) && REG_P (reg_4));
16586 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16587 return false;
16589 else
16591 mem_1 = operands[0];
16592 reg_1 = operands[1];
16593 mem_2 = operands[2];
16594 reg_2 = operands[3];
16595 mem_3 = operands[4];
16596 reg_3 = operands[5];
16597 mem_4 = operands[6];
16598 reg_4 = operands[7];
16600 /* Skip if the memory operand is by itself valid for ldp/stp. */
16601 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16602 return false;
16604 /* The mems cannot be volatile. */
16605 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16606 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
16607 return false;
16609 /* Check if the addresses are in the form of [base+offset]. */
16610 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16611 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16612 return false;
16613 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16614 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16615 return false;
16616 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16617 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16618 return false;
16619 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16620 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16621 return false;
16623 /* Check if the bases are same. */
16624 if (!rtx_equal_p (base_1, base_2)
16625 || !rtx_equal_p (base_2, base_3)
16626 || !rtx_equal_p (base_3, base_4))
16627 return false;
16629 offval_1 = INTVAL (offset_1);
16630 offval_2 = INTVAL (offset_2);
16631 offval_3 = INTVAL (offset_3);
16632 offval_4 = INTVAL (offset_4);
16633 msize = GET_MODE_SIZE (mode);
16634 /* Check if the offsets are consecutive. */
16635 if ((offval_1 != (offval_2 + msize)
16636 || offval_1 != (offval_3 + msize * 2)
16637 || offval_1 != (offval_4 + msize * 3))
16638 && (offval_4 != (offval_3 + msize)
16639 || offval_4 != (offval_2 + msize * 2)
16640 || offval_4 != (offval_1 + msize * 3)))
16641 return false;
16643 /* Check if the addresses are clobbered by load. */
16644 if (load)
16646 if (reg_mentioned_p (reg_1, mem_1)
16647 || reg_mentioned_p (reg_2, mem_2)
16648 || reg_mentioned_p (reg_3, mem_3))
16649 return false;
16651 /* In increasing order, the last load can clobber the address. */
16652 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
16653 return false;
16656 /* If we have SImode and slow unaligned ldp,
16657 check that the alignment is at least 8 bytes. */
16658 if (mode == SImode
16659 && (aarch64_tune_params.extra_tuning_flags
16660 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16661 && !optimize_size
16662 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16663 return false;
16665 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16666 rclass_1 = FP_REGS;
16667 else
16668 rclass_1 = GENERAL_REGS;
16670 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16671 rclass_2 = FP_REGS;
16672 else
16673 rclass_2 = GENERAL_REGS;
16675 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
16676 rclass_3 = FP_REGS;
16677 else
16678 rclass_3 = GENERAL_REGS;
16680 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
16681 rclass_4 = FP_REGS;
16682 else
16683 rclass_4 = GENERAL_REGS;
16685 /* Check if the registers are of same class. */
16686 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
16687 return false;
16689 return true;
16692 /* Given OPERANDS of consecutive load/store, this function pairs them
16693 into ldp/stp after adjusting the offset. It depends on the fact
16694 that addresses of load/store instructions are in increasing order.
16695 MODE is the mode of the memory operands. CODE is the rtl operator
16696 which should be applied to all memory operands; it is SIGN_EXTEND,
16697 ZERO_EXTEND or UNKNOWN. */
16699 bool
16700 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
16701 scalar_mode mode, RTX_CODE code)
16703 rtx base, offset, t1, t2;
16704 rtx mem_1, mem_2, mem_3, mem_4;
16705 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
16707 if (load)
16709 mem_1 = operands[1];
16710 mem_2 = operands[3];
16711 mem_3 = operands[5];
16712 mem_4 = operands[7];
16714 else
16716 mem_1 = operands[0];
16717 mem_2 = operands[2];
16718 mem_3 = operands[4];
16719 mem_4 = operands[6];
16720 gcc_assert (code == UNKNOWN);
16723 extract_base_offset_in_addr (mem_1, &base, &offset);
16724 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
16726 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
16727 msize = GET_MODE_SIZE (mode);
16728 stp_off_limit = msize * 0x40;
16729 off_val = INTVAL (offset);
16730 abs_off = (off_val < 0) ? -off_val : off_val;
16731 new_off = abs_off % stp_off_limit;
16732 adj_off = abs_off - new_off;
16734 /* Further adjust to make sure all offsets are OK. */
16735 if ((new_off + msize * 2) >= stp_off_limit)
16737 adj_off += stp_off_limit;
16738 new_off -= stp_off_limit;
16741 /* Make sure the adjustment can be done with ADD/SUB instructions. */
16742 if (adj_off >= 0x1000)
16743 return false;
16745 if (off_val < 0)
16747 adj_off = -adj_off;
16748 new_off = -new_off;
16751 /* Create new memory references. */
16752 mem_1 = change_address (mem_1, VOIDmode,
16753 plus_constant (DImode, operands[8], new_off));
16755 /* Check if the adjusted address is OK for ldp/stp. */
16756 if (!aarch64_mem_pair_operand (mem_1, mode))
16757 return false;
16759 msize = GET_MODE_SIZE (mode);
16760 mem_2 = change_address (mem_2, VOIDmode,
16761 plus_constant (DImode,
16762 operands[8],
16763 new_off + msize));
16764 mem_3 = change_address (mem_3, VOIDmode,
16765 plus_constant (DImode,
16766 operands[8],
16767 new_off + msize * 2));
16768 mem_4 = change_address (mem_4, VOIDmode,
16769 plus_constant (DImode,
16770 operands[8],
16771 new_off + msize * 3));
16773 if (code == ZERO_EXTEND)
16775 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
16776 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
16777 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
16778 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
16780 else if (code == SIGN_EXTEND)
16782 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
16783 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
16784 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
16785 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
16788 if (load)
16790 operands[1] = mem_1;
16791 operands[3] = mem_2;
16792 operands[5] = mem_3;
16793 operands[7] = mem_4;
16795 else
16797 operands[0] = mem_1;
16798 operands[2] = mem_2;
16799 operands[4] = mem_3;
16800 operands[6] = mem_4;
16803 /* Emit adjusting instruction. */
16804 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
16805 /* Emit ldp/stp instructions. */
16806 t1 = gen_rtx_SET (operands[0], operands[1]);
16807 t2 = gen_rtx_SET (operands[2], operands[3]);
16808 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
16809 t1 = gen_rtx_SET (operands[4], operands[5]);
16810 t2 = gen_rtx_SET (operands[6], operands[7]);
16811 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
16812 return true;
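/* A worked example of the offset arithmetic above (matching the comment
   before aarch64_operands_adjust_ok_for_ldpstp): for SImode stores at
   base + 0x100 .. base + 0x10c, msize == 4 and stp_off_limit == 0x100,
   so new_off == 0 and adj_off == 0x100. adj_off fits in an ADD
   immediate (< 0x1000), so we emit roughly

	add	scratch, base, 0x100
	stp	w1, w1, [scratch]
	stp	w1, w1, [scratch, 8]

   with the four memory references rewritten to scratch + 0, 4, 8, 12.  */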
16815 /* Return true if a pseudo register should be created and used to hold
16816 the GOT address for PIC code. */
16818 bool
16819 aarch64_use_pseudo_pic_reg (void)
16821 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
16824 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
16826 static int
16827 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
16829 switch (XINT (x, 1))
16831 case UNSPEC_GOTSMALLPIC:
16832 case UNSPEC_GOTSMALLPIC28K:
16833 case UNSPEC_GOTTINYPIC:
16834 return 0;
16835 default:
16836 break;
16839 return default_unspec_may_trap_p (x, flags);
16843 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
16844 return the log2 of that value. Otherwise return -1. */
16847 aarch64_fpconst_pow_of_2 (rtx x)
16849 const REAL_VALUE_TYPE *r;
16851 if (!CONST_DOUBLE_P (x))
16852 return -1;
16854 r = CONST_DOUBLE_REAL_VALUE (x);
16856 if (REAL_VALUE_NEGATIVE (*r)
16857 || REAL_VALUE_ISNAN (*r)
16858 || REAL_VALUE_ISINF (*r)
16859 || !real_isinteger (r, DFmode))
16860 return -1;
16862 return exact_log2 (real_to_integer (r));
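/* For example, aarch64_fpconst_pow_of_2 returns 3 for 8.0 and 0 for 1.0,
   but -1 for 0.75, 3.0, -4.0 or NaN.  */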
16865 /* If X is a vector of equal CONST_DOUBLE values and that value is
16866 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
16869 aarch64_vec_fpconst_pow_of_2 (rtx x)
16871 int nelts;
16872 if (GET_CODE (x) != CONST_VECTOR
16873 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
16874 return -1;
16876 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
16877 return -1;
16879 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
16880 if (firstval <= 0)
16881 return -1;
16883 for (int i = 1; i < nelts; i++)
16884 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
16885 return -1;
16887 return firstval;
16890 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
16891 to float.
16893 __fp16 always promotes through this hook.
16894 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
16895 through the generic excess precision logic rather than here. */
16897 static tree
16898 aarch64_promoted_type (const_tree t)
16900 if (SCALAR_FLOAT_TYPE_P (t)
16901 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
16902 return float_type_node;
16904 return NULL_TREE;
16907 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
16909 static bool
16910 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
16911 optimization_type opt_type)
16913 switch (op)
16915 case rsqrt_optab:
16916 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
16918 default:
16919 return true;
16923 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
16925 static unsigned int
16926 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
16927 int *offset)
16929 /* Polynomial invariant 1 == (VG / 2) - 1. */
16930 gcc_assert (i == 1);
16931 *factor = 2;
16932 *offset = 1;
16933 return AARCH64_DWARF_VG;
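/* For example (a sketch): with a 256-bit SVE vector length, VG (the
   vector length in 64-bit granules) is 4, so the runtime value of
   indeterminate 1 is 4/2 - 1 == 1, and a poly_int such as 16 + 16x
   evaluates to 32, the byte size of one SVE data vector.  */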
16936 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
16937 if MODE is HFmode, and punt to the generic implementation otherwise. */
16939 static bool
16940 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
16942 return (mode == HFmode
16943 ? true
16944 : default_libgcc_floating_mode_supported_p (mode));
16947 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
16948 if MODE is HFmode, and punt to the generic implementation otherwise. */
16950 static bool
16951 aarch64_scalar_mode_supported_p (scalar_mode mode)
16953 return (mode == HFmode
16954 ? true
16955 : default_scalar_mode_supported_p (mode));
16958 /* Set the value of FLT_EVAL_METHOD.
16959 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
16961 0: evaluate all operations and constants, whose semantic type has at
16962 most the range and precision of type float, to the range and
16963 precision of float; evaluate all other operations and constants to
16964 the range and precision of the semantic type;
16966 N, where _FloatN is a supported interchange floating type
16967 evaluate all operations and constants, whose semantic type has at
16968 most the range and precision of _FloatN type, to the range and
16969 precision of the _FloatN type; evaluate all other operations and
16970 constants to the range and precision of the semantic type;
16972 If we have the ARMv8.2-A extensions then we support _Float16 in native
16973 precision, so we should set this to 16. Otherwise, we support the type,
16974 but want to evaluate expressions in float precision, so set this to
16975 0. */
16977 static enum flt_eval_method
16978 aarch64_excess_precision (enum excess_precision_type type)
16980 switch (type)
16982 case EXCESS_PRECISION_TYPE_FAST:
16983 case EXCESS_PRECISION_TYPE_STANDARD:
16984 /* We can calculate either in 16-bit range and precision or
16985 32-bit range and precision. Make that decision based on whether
16986 we have native support for the ARMv8.2-A 16-bit floating-point
16987 instructions or not. */
16988 return (TARGET_FP_F16INST
16989 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
16990 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
16991 case EXCESS_PRECISION_TYPE_IMPLICIT:
16992 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
16993 default:
16994 gcc_unreachable ();
16996 return FLT_EVAL_METHOD_UNPREDICTABLE;
16999 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17000 scheduled for speculative execution. Reject the long-running division
17001 and square-root instructions. */
17003 static bool
17004 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17006 switch (get_attr_type (insn))
17008 case TYPE_SDIV:
17009 case TYPE_UDIV:
17010 case TYPE_FDIVS:
17011 case TYPE_FDIVD:
17012 case TYPE_FSQRTS:
17013 case TYPE_FSQRTD:
17014 case TYPE_NEON_FP_SQRT_S:
17015 case TYPE_NEON_FP_SQRT_D:
17016 case TYPE_NEON_FP_SQRT_S_Q:
17017 case TYPE_NEON_FP_SQRT_D_Q:
17018 case TYPE_NEON_FP_DIV_S:
17019 case TYPE_NEON_FP_DIV_D:
17020 case TYPE_NEON_FP_DIV_S_Q:
17021 case TYPE_NEON_FP_DIV_D_Q:
17022 return false;
17023 default:
17024 return true;
17028 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17030 static int
17031 aarch64_compute_pressure_classes (reg_class *classes)
17033 int i = 0;
17034 classes[i++] = GENERAL_REGS;
17035 classes[i++] = FP_REGS;
17036 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17037 registers need to go in PR_LO_REGS at some point during their
17038 lifetime. Splitting it into two halves has the effect of making
17039 all predicates count against PR_LO_REGS, so that we try whenever
17040 possible to restrict the number of live predicates to 8. This
17041 greatly reduces the amount of spilling in certain loops. */
17042 classes[i++] = PR_LO_REGS;
17043 classes[i++] = PR_HI_REGS;
17044 return i;
17047 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17049 static bool
17050 aarch64_can_change_mode_class (machine_mode from,
17051 machine_mode to, reg_class_t)
17053 /* See the comment at the head of aarch64-sve.md for details. */
17054 if (BYTES_BIG_ENDIAN
17055 && (aarch64_sve_data_mode_p (from) != aarch64_sve_data_mode_p (to)))
17056 return false;
17057 return true;
17060 /* Target-specific selftests. */
17062 #if CHECKING_P
17064 namespace selftest {
17066 /* Selftest for the RTL loader.
17067 Verify that the RTL loader copes with a dump from
17068 print_rtx_function. This is essentially just a test that class
17069 function_reader can handle a real dump, but it also verifies
17070 that lookup_reg_by_dump_name correctly handles hard regs.
17071 The presence of hard reg names in the dump means that the test is
17072 target-specific, hence it is in this file. */
17074 static void
17075 aarch64_test_loading_full_dump ()
17077 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17079 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17081 rtx_insn *insn_1 = get_insn_by_uid (1);
17082 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17084 rtx_insn *insn_15 = get_insn_by_uid (15);
17085 ASSERT_EQ (INSN, GET_CODE (insn_15));
17086 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17088 /* Verify crtl->return_rtx. */
17089 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17090 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17091 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17094 /* Run all target-specific selftests. */
17096 static void
17097 aarch64_run_selftests (void)
17099 aarch64_test_loading_full_dump ();
17102 } // namespace selftest
17104 #endif /* #if CHECKING_P */
17106 #undef TARGET_ADDRESS_COST
17107 #define TARGET_ADDRESS_COST aarch64_address_cost
17109 /* This hook determines whether unnamed bitfields affect the alignment
17110 of the containing structure. The hook returns true if the structure
17111 should inherit the alignment requirements of an unnamed bitfield's
17112 type. */
17113 #undef TARGET_ALIGN_ANON_BITFIELD
17114 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17116 #undef TARGET_ASM_ALIGNED_DI_OP
17117 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17119 #undef TARGET_ASM_ALIGNED_HI_OP
17120 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17122 #undef TARGET_ASM_ALIGNED_SI_OP
17123 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17125 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17126 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17127 hook_bool_const_tree_hwi_hwi_const_tree_true
17129 #undef TARGET_ASM_FILE_START
17130 #define TARGET_ASM_FILE_START aarch64_start_file
17132 #undef TARGET_ASM_OUTPUT_MI_THUNK
17133 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17135 #undef TARGET_ASM_SELECT_RTX_SECTION
17136 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17138 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17139 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17141 #undef TARGET_BUILD_BUILTIN_VA_LIST
17142 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17144 #undef TARGET_CALLEE_COPIES
17145 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17147 #undef TARGET_CAN_ELIMINATE
17148 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17150 #undef TARGET_CAN_INLINE_P
17151 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17153 #undef TARGET_CANNOT_FORCE_CONST_MEM
17154 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17156 #undef TARGET_CASE_VALUES_THRESHOLD
17157 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17159 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17160 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17162 /* Only the least significant bit is used for initialization guard
17163 variables. */
17164 #undef TARGET_CXX_GUARD_MASK_BIT
17165 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17167 #undef TARGET_C_MODE_FOR_SUFFIX
17168 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17170 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17171 #undef TARGET_DEFAULT_TARGET_FLAGS
17172 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17173 #endif
17175 #undef TARGET_CLASS_MAX_NREGS
17176 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17178 #undef TARGET_BUILTIN_DECL
17179 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17181 #undef TARGET_BUILTIN_RECIPROCAL
17182 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17184 #undef TARGET_C_EXCESS_PRECISION
17185 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17187 #undef TARGET_EXPAND_BUILTIN
17188 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17190 #undef TARGET_EXPAND_BUILTIN_VA_START
17191 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17193 #undef TARGET_FOLD_BUILTIN
17194 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17196 #undef TARGET_FUNCTION_ARG
17197 #define TARGET_FUNCTION_ARG aarch64_function_arg
17199 #undef TARGET_FUNCTION_ARG_ADVANCE
17200 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17202 #undef TARGET_FUNCTION_ARG_BOUNDARY
17203 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17205 #undef TARGET_FUNCTION_ARG_PADDING
17206 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17208 #undef TARGET_GET_RAW_RESULT_MODE
17209 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17210 #undef TARGET_GET_RAW_ARG_MODE
17211 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17213 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17214 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17216 #undef TARGET_FUNCTION_VALUE
17217 #define TARGET_FUNCTION_VALUE aarch64_function_value
17219 #undef TARGET_FUNCTION_VALUE_REGNO_P
17220 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17222 #undef TARGET_GIMPLE_FOLD_BUILTIN
17223 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17225 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17226 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17228 #undef TARGET_INIT_BUILTINS
17229 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17231 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17232 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17233 aarch64_ira_change_pseudo_allocno_class
17235 #undef TARGET_LEGITIMATE_ADDRESS_P
17236 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17238 #undef TARGET_LEGITIMATE_CONSTANT_P
17239 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17241 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17242 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17243 aarch64_legitimize_address_displacement
17245 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17246 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17248 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17249 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17250 aarch64_libgcc_floating_mode_supported_p
17252 #undef TARGET_MANGLE_TYPE
17253 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17255 #undef TARGET_MEMORY_MOVE_COST
17256 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17258 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17259 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17261 #undef TARGET_MUST_PASS_IN_STACK
17262 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17264 /* This target hook should return true if accesses to volatile bitfields
17265 should use the narrowest mode possible. It should return false if these
17266 accesses should use the bitfield container type. */
17267 #undef TARGET_NARROW_VOLATILE_BITFIELD
17268 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCALAR_MODE_SUPPORTED_P
#define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
#define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
  aarch64_get_separate_components

#undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
#define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
  aarch64_components_for_bb

#undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
#define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
  aarch64_disqualify_components

#undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
  aarch64_emit_prologue_components

#undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
#define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
  aarch64_emit_epilogue_components

#undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
#define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
  aarch64_set_handled_components

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
#define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
  aarch64_builtin_support_vector_misalignment

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095
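
/* For reference: -256 is the most negative offset accepted by the signed
   9-bit unscaled addressing forms (LDUR/STUR), and 4095 is the largest
   unsigned 12-bit scaled offset for byte accesses.  */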

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
  aarch64_vectorize_preferred_vector_alignment
#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST
#define TARGET_VECTORIZE_VEC_PERM_CONST \
  aarch64_vectorize_vec_perm_const

#undef TARGET_VECTORIZE_GET_MASK_MODE
#define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_SCHED_CAN_SPECULATE_INSN
#define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_ADJUST_PRIORITY
#define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

#undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
#define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
  aarch64_dwarf_poly_indeterminate_value

/* The architecture reserves bits 0 and 1 so use bit 2 for descriptors.  */
#undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
#define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
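/* (4 is 1 << 2, i.e. bit 2.)  */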

#undef TARGET_HARD_REGNO_NREGS
#define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
#undef TARGET_HARD_REGNO_MODE_OK
#define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok

#undef TARGET_MODES_TIEABLE_P
#define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p

#undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
#define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
  aarch64_hard_regno_call_part_clobbered

#undef TARGET_CONSTANT_ALIGNMENT
#define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment

#undef TARGET_COMPUTE_PRESSURE_CLASSES
#define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes

#undef TARGET_CAN_CHANGE_MODE_CLASS
#define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class

#if CHECKING_P
#undef TARGET_RUN_TARGET_SELFTESTS
#define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
#endif /* #if CHECKING_P */
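
/* The global target hook table.  TARGET_INITIALIZER (provided via
   target-def.h) expands to an initializer built from the TARGET_* macros
   defined above; hooks that are not overridden here keep their default
   implementations.  */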
struct gcc_target targetm = TARGET_INITIALIZER;
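
/* Include the garbage-collector root tables that gengtype generates for
   this file.  */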
#include "gt-aarch64.h"