[AArch64] Prefer LD1RQ for big-endian SVE
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 6296ffe959f62ac5515a3d32e617a909f829f090
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
227 const char* name;
228 unsigned int flag;
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
251 /* Tuning parameters. */
253 static const struct cpu_addrcost_table generic_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
317 static const struct cpu_regmove_cost generic_regmove_cost =
319 1, /* GP2GP */
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
322 5, /* GP2FP */
323 5, /* FP2GP */
324 2 /* FP2FP */
327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
332 5, /* GP2FP */
333 5, /* FP2GP */
334 2 /* FP2FP */
337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (actual costs are 4 and 9). */
352 9, /* GP2FP */
353 9, /* FP2GP */
354 1 /* FP2FP */
357 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 2, /* GP2GP */
360 2, /* GP2FP */
361 6, /* FP2GP */
362 4 /* FP2FP */
365 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 8, /* GP2FP */
371 8, /* FP2GP */
372 2 /* FP2FP */
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 2, /* GP2GP */
378 /* Avoid the use of int<->fp moves for spilling. */
379 6, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of int<->fp moves for spilling. */
388 8, /* GP2FP */
389 8, /* FP2GP */
390 4 /* FP2FP */
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost =
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
433 /* Generic costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
453 static const struct cpu_vector_cost exynosm1_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* Generic costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost =
515 1, /* Predictable. */
516 3 /* Unpredictable. */
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes =
522 AARCH64_APPROX_NONE, /* division */
523 AARCH64_APPROX_NONE, /* sqrt */
524 AARCH64_APPROX_NONE /* recip_sqrt */
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes =
530 AARCH64_APPROX_NONE, /* division */
531 AARCH64_APPROX_ALL, /* sqrt */
532 AARCH64_APPROX_ALL /* recip_sqrt */
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes =
538 AARCH64_APPROX_NONE, /* division */
539 AARCH64_APPROX_NONE, /* sqrt */
540 AARCH64_APPROX_ALL /* recip_sqrt */
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune =
546 0, /* num_slots */
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
564 4, /* num_slots */
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 1024, /* l2_cache_size */
568 -1 /* default_opt_level */
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
573 8, /* num_slots */
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
580 static const cpu_prefetch_tune thunderx_prefetch_tune =
582 8, /* num_slots */
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
591 8, /* num_slots */
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
598 static const struct tune_params generic_tunings =
600 &cortexa57_extra_costs,
601 &generic_addrcost_table,
602 &generic_regmove_cost,
603 &generic_vector_cost,
604 &generic_branch_cost,
605 &generic_approx_modes,
606 4, /* memmov_cost */
607 2, /* issue_rate */
608 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
609 8, /* function_align. */
610 4, /* jump_align. */
611 8, /* loop_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
620 &generic_prefetch_tune
623 static const struct tune_params cortexa35_tunings =
625 &cortexa53_extra_costs,
626 &generic_addrcost_table,
627 &cortexa53_regmove_cost,
628 &generic_vector_cost,
629 &generic_branch_cost,
630 &generic_approx_modes,
631 4, /* memmov_cost */
632 1, /* issue_rate */
633 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
635 16, /* function_align. */
636 4, /* jump_align. */
637 8, /* loop_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
646 &generic_prefetch_tune
649 static const struct tune_params cortexa53_tunings =
651 &cortexa53_extra_costs,
652 &generic_addrcost_table,
653 &cortexa53_regmove_cost,
654 &generic_vector_cost,
655 &generic_branch_cost,
656 &generic_approx_modes,
657 4, /* memmov_cost */
658 2, /* issue_rate */
659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
661 16, /* function_align. */
662 4, /* jump_align. */
663 8, /* loop_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
672 &generic_prefetch_tune
675 static const struct tune_params cortexa57_tunings =
677 &cortexa57_extra_costs,
678 &generic_addrcost_table,
679 &cortexa57_regmove_cost,
680 &cortexa57_vector_cost,
681 &generic_branch_cost,
682 &generic_approx_modes,
683 4, /* memmov_cost */
684 3, /* issue_rate */
685 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
687 16, /* function_align. */
688 4, /* jump_align. */
689 8, /* loop_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
698 &generic_prefetch_tune
701 static const struct tune_params cortexa72_tunings =
703 &cortexa57_extra_costs,
704 &generic_addrcost_table,
705 &cortexa57_regmove_cost,
706 &cortexa57_vector_cost,
707 &generic_branch_cost,
708 &generic_approx_modes,
709 4, /* memmov_cost */
710 3, /* issue_rate */
711 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
713 16, /* function_align. */
714 4, /* jump_align. */
715 8, /* loop_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
724 &generic_prefetch_tune
727 static const struct tune_params cortexa73_tunings =
729 &cortexa57_extra_costs,
730 &generic_addrcost_table,
731 &cortexa57_regmove_cost,
732 &cortexa57_vector_cost,
733 &generic_branch_cost,
734 &generic_approx_modes,
735 4, /* memmov_cost. */
736 2, /* issue_rate. */
737 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
739 16, /* function_align. */
740 4, /* jump_align. */
741 8, /* loop_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
750 &generic_prefetch_tune
755 static const struct tune_params exynosm1_tunings =
757 &exynosm1_extra_costs,
758 &exynosm1_addrcost_table,
759 &exynosm1_regmove_cost,
760 &exynosm1_vector_cost,
761 &generic_branch_cost,
762 &exynosm1_approx_modes,
763 4, /* memmov_cost */
764 3, /* issue_rate */
765 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
766 4, /* function_align. */
767 4, /* jump_align. */
768 4, /* loop_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
777 &exynosm1_prefetch_tune
780 static const struct tune_params thunderxt88_tunings =
782 &thunderx_extra_costs,
783 &generic_addrcost_table,
784 &thunderx_regmove_cost,
785 &thunderx_vector_cost,
786 &generic_branch_cost,
787 &generic_approx_modes,
788 6, /* memmov_cost */
789 2, /* issue_rate */
790 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
791 8, /* function_align. */
792 8, /* jump_align. */
793 8, /* loop_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
802 &thunderxt88_prefetch_tune
805 static const struct tune_params thunderx_tunings =
807 &thunderx_extra_costs,
808 &generic_addrcost_table,
809 &thunderx_regmove_cost,
810 &thunderx_vector_cost,
811 &generic_branch_cost,
812 &generic_approx_modes,
813 6, /* memmov_cost */
814 2, /* issue_rate */
815 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
816 8, /* function_align. */
817 8, /* jump_align. */
818 8, /* loop_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
828 &thunderx_prefetch_tune
831 static const struct tune_params xgene1_tunings =
833 &xgene1_extra_costs,
834 &xgene1_addrcost_table,
835 &xgene1_regmove_cost,
836 &xgene1_vector_cost,
837 &generic_branch_cost,
838 &xgene1_approx_modes,
839 6, /* memmov_cost */
840 4, /* issue_rate */
841 AARCH64_FUSE_NOTHING, /* fusible_ops */
842 16, /* function_align. */
843 8, /* jump_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
853 &generic_prefetch_tune
856 static const struct tune_params qdf24xx_tunings =
858 &qdf24xx_extra_costs,
859 &generic_addrcost_table,
860 &qdf24xx_regmove_cost,
861 &generic_vector_cost,
862 &generic_branch_cost,
863 &generic_approx_modes,
864 4, /* memmov_cost */
865 4, /* issue_rate */
866 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
867 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
868 16, /* function_align. */
869 8, /* jump_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
879 &qdf24xx_prefetch_tune
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
883 for now. */
884 static const struct tune_params saphira_tunings =
886 &generic_extra_costs,
887 &generic_addrcost_table,
888 &generic_regmove_cost,
889 &generic_vector_cost,
890 &generic_branch_cost,
891 &generic_approx_modes,
892 4, /* memmov_cost */
893 4, /* issue_rate */
894 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
895 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
896 16, /* function_align. */
897 8, /* jump_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
907 &generic_prefetch_tune
910 static const struct tune_params thunderx2t99_tunings =
912 &thunderx2t99_extra_costs,
913 &thunderx2t99_addrcost_table,
914 &thunderx2t99_regmove_cost,
915 &thunderx2t99_vector_cost,
916 &generic_branch_cost,
917 &generic_approx_modes,
918 4, /* memmov_cost. */
919 4, /* issue_rate. */
920 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
922 16, /* function_align. */
923 8, /* jump_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
939 const char* name;
940 void (*parse_override)(const char*, struct tune_params*);
943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions[] =
949 { "fuse", aarch64_parse_fuse_string },
950 { "tune", aarch64_parse_tune_string },
951 { NULL, NULL }
954 /* A processor implementing AArch64. */
955 struct processor
957 const char *const name;
958 enum aarch64_processor ident;
959 enum aarch64_processor sched_core;
960 enum aarch64_arch arch;
961 unsigned architecture_version;
962 const unsigned long flags;
963 const struct tune_params *const tune;
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures[] =
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores[] =
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
984 AARCH64_FL_FOR_ARCH8, &generic_tunings},
985 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor *selected_arch;
992 static const struct processor *selected_cpu;
993 static const struct processor *selected_tune;
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params = generic_tunings;
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1003 const char *const name;
1004 const unsigned long flags_on;
1005 const unsigned long flags_off;
1008 typedef enum aarch64_cond_code
1010 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1014 aarch64_cc;
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes[] =
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028 const char * branch_format)
1030 rtx_code_label * tmp_label = gen_label_rtx ();
1031 char label_buf[256];
1032 char buffer[128];
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034 CODE_LABEL_NUMBER (tmp_label));
1035 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036 rtx dest_label = operands[pos_label];
1037 operands[pos_label] = tmp_label;
1039 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040 output_asm_insn (buffer, operands);
1042 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043 operands[pos_label] = dest_label;
1044 output_asm_insn (buffer, operands);
1045 return "";
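/* A rough illustration (register and label names invented for this sketch):
   for a conditional branch whose target is out of range, the caller is
   expected to pass the inverted condition as BRANCH_FORMAT, so the code
   printed above looks something like

       cbz     x0, .Lcb1       // inverted test skips the far branch
       b       .L2             // unconditional branch, +/-128 MiB range
   .Lcb1:

   which gives the conditional branch the reach of an unconditional B.  */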
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1051 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054 else
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespective of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075 reg_class_t best_class)
1077 machine_mode mode;
1079 if (allocno_class != ALL_REGS)
1080 return allocno_class;
1082 if (best_class != ALL_REGS)
1083 return best_class;
1085 mode = PSEUDO_REGNO_MODE (regno);
1086 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1092 if (GET_MODE_UNIT_SIZE (mode) == 4)
1093 return aarch64_tune_params.min_div_recip_mul_sf;
1094 return aarch64_tune_params.min_div_recip_mul_df;
1097 static int
1098 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
1099 machine_mode mode)
1101 if (VECTOR_MODE_P (mode))
1102 return aarch64_tune_params.vec_reassoc_width;
1103 if (INTEGRAL_MODE_P (mode))
1104 return aarch64_tune_params.int_reassoc_width;
1105 if (FLOAT_MODE_P (mode))
1106 return aarch64_tune_params.fp_reassoc_width;
1107 return 1;
1110 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1111 unsigned
1112 aarch64_dbx_register_number (unsigned regno)
1114 if (GP_REGNUM_P (regno))
1115 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1116 else if (regno == SP_REGNUM)
1117 return AARCH64_DWARF_SP;
1118 else if (FP_REGNUM_P (regno))
1119 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1120 else if (PR_REGNUM_P (regno))
1121 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1122 else if (regno == VG_REGNUM)
1123 return AARCH64_DWARF_VG;
1125 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1126 equivalent DWARF register. */
1127 return DWARF_FRAME_REGISTERS;
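/* Illustrative mapping, assuming the AARCH64_DWARF_* values defined in
   aarch64.h (R0 == 0, SP == 31, VG == 46, P0 == 48, V0 == 64): x0-x30 map
   to DWARF registers 0-30, sp to 31, v0-v31 to 64-95, the SVE predicate
   registers p0-p15 to 48-63 and the vector-granule register VG to 46.  */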
1130 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1131 static bool
1132 aarch64_advsimd_struct_mode_p (machine_mode mode)
1134 return (TARGET_SIMD
1135 && (mode == OImode || mode == CImode || mode == XImode));
1138 /* Return true if MODE is an SVE predicate mode. */
1139 static bool
1140 aarch64_sve_pred_mode_p (machine_mode mode)
1142 return (TARGET_SVE
1143 && (mode == VNx16BImode
1144 || mode == VNx8BImode
1145 || mode == VNx4BImode
1146 || mode == VNx2BImode));
1149 /* Three mutually-exclusive flags describing a vector or predicate type. */
1150 const unsigned int VEC_ADVSIMD = 1;
1151 const unsigned int VEC_SVE_DATA = 2;
1152 const unsigned int VEC_SVE_PRED = 4;
1153 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1154 a structure of 2, 3 or 4 vectors. */
1155 const unsigned int VEC_STRUCT = 8;
1156 /* Useful combinations of the above. */
1157 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1158 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1160 /* Return a set of flags describing the vector properties of mode MODE.
1161 Ignore modes that are not supported by the current target. */
1162 static unsigned int
1163 aarch64_classify_vector_mode (machine_mode mode)
1165 if (aarch64_advsimd_struct_mode_p (mode))
1166 return VEC_ADVSIMD | VEC_STRUCT;
1168 if (aarch64_sve_pred_mode_p (mode))
1169 return VEC_SVE_PRED;
1171 scalar_mode inner = GET_MODE_INNER (mode);
1172 if (VECTOR_MODE_P (mode)
1173 && (inner == QImode
1174 || inner == HImode
1175 || inner == HFmode
1176 || inner == SImode
1177 || inner == SFmode
1178 || inner == DImode
1179 || inner == DFmode))
1181 if (TARGET_SVE)
1183 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1184 return VEC_SVE_DATA;
1185 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1186 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1187 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1188 return VEC_SVE_DATA | VEC_STRUCT;
1191 /* This includes V1DF but not V1DI (which doesn't exist). */
1192 if (TARGET_SIMD
1193 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1194 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1195 return VEC_ADVSIMD;
1198 return 0;
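/* A sketch of the classification above, assuming the relevant target
   features are enabled (illustrative, not exhaustive):

     V4SImode   -> VEC_ADVSIMD                (128-bit Advanced SIMD vector)
     OImode     -> VEC_ADVSIMD | VEC_STRUCT   (pair of Advanced SIMD vectors)
     VNx4SImode -> VEC_SVE_DATA               (single SVE data vector)
     VNx8SImode -> VEC_SVE_DATA | VEC_STRUCT  (tuple of two SVE vectors)
     VNx4BImode -> VEC_SVE_PRED               (SVE predicate)
     SImode     -> 0                          (not a vector mode)  */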
1201 /* Return true if MODE is any of the data vector modes, including
1202 structure modes. */
1203 static bool
1204 aarch64_vector_data_mode_p (machine_mode mode)
1206 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1209 /* Return true if MODE is an SVE data vector mode; either a single vector
1210 or a structure of vectors. */
1211 static bool
1212 aarch64_sve_data_mode_p (machine_mode mode)
1214 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1217 /* Implement target hook TARGET_ARRAY_MODE. */
1218 static opt_machine_mode
1219 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1221 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1222 && IN_RANGE (nelems, 2, 4))
1223 return mode_for_vector (GET_MODE_INNER (mode),
1224 GET_MODE_NUNITS (mode) * nelems);
1226 return opt_machine_mode ();
1229 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1230 static bool
1231 aarch64_array_mode_supported_p (machine_mode mode,
1232 unsigned HOST_WIDE_INT nelems)
1234 if (TARGET_SIMD
1235 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1236 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1237 && (nelems >= 2 && nelems <= 4))
1238 return true;
1240 return false;
1243 /* Return the SVE predicate mode to use for elements that have
1244 ELEM_NBYTES bytes, if such a mode exists. */
1246 opt_machine_mode
1247 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1249 if (TARGET_SVE)
1251 if (elem_nbytes == 1)
1252 return VNx16BImode;
1253 if (elem_nbytes == 2)
1254 return VNx8BImode;
1255 if (elem_nbytes == 4)
1256 return VNx4BImode;
1257 if (elem_nbytes == 8)
1258 return VNx2BImode;
1260 return opt_machine_mode ();
1263 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1265 static opt_machine_mode
1266 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1268 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1270 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1271 machine_mode pred_mode;
1272 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1273 return pred_mode;
1276 return default_get_mask_mode (nunits, nbytes);
1279 /* Implement TARGET_HARD_REGNO_NREGS. */
1281 static unsigned int
1282 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1284 /* ??? Logically we should only need to provide a value when
1285 HARD_REGNO_MODE_OK says that the combination is valid,
1286 but at the moment we need to handle all modes. Just ignore
1287 any runtime parts for registers that can't store them. */
1288 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1289 switch (aarch64_regno_regclass (regno))
1291 case FP_REGS:
1292 case FP_LO_REGS:
1293 if (aarch64_sve_data_mode_p (mode))
1294 return exact_div (GET_MODE_SIZE (mode),
1295 BYTES_PER_SVE_VECTOR).to_constant ();
1296 return CEIL (lowest_size, UNITS_PER_VREG);
1297 case PR_REGS:
1298 case PR_LO_REGS:
1299 case PR_HI_REGS:
1300 return 1;
1301 default:
1302 return CEIL (lowest_size, UNITS_PER_WORD);
1304 gcc_unreachable ();
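/* For example: TImode needs CEIL (16, 8) == 2 general registers, V4SImode
   fits in a single vector register, a single SVE data vector such as
   VNx4SImode counts as one register regardless of the runtime vector
   length, and a two-vector SVE tuple such as VNx8SImode counts as two.  */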
1307 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1309 static bool
1310 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1312 if (GET_MODE_CLASS (mode) == MODE_CC)
1313 return regno == CC_REGNUM;
1315 if (regno == VG_REGNUM)
1316 /* This must have the same size as _Unwind_Word. */
1317 return mode == DImode;
1319 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1320 if (vec_flags & VEC_SVE_PRED)
1321 return PR_REGNUM_P (regno);
1323 if (PR_REGNUM_P (regno))
1324 return 0;
1326 if (regno == SP_REGNUM)
1327 /* The purpose of comparing with ptr_mode is to support the
1328 global register variable associated with the stack pointer
1329 register via the syntax of asm ("wsp") in ILP32. */
1330 return mode == Pmode || mode == ptr_mode;
1332 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1333 return mode == Pmode;
1335 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1336 return true;
1338 if (FP_REGNUM_P (regno))
1340 if (vec_flags & VEC_STRUCT)
1341 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1342 else
1343 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1346 return false;
1349 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1350 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1351 clobbers the top 64 bits when restoring the bottom 64 bits. */
1353 static bool
1354 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1356 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1359 /* Implement REGMODE_NATURAL_SIZE. */
1360 poly_uint64
1361 aarch64_regmode_natural_size (machine_mode mode)
1363 /* The natural size for SVE data modes is one SVE data vector,
1364 and similarly for predicates. We can't independently modify
1365 anything smaller than that. */
1366 /* ??? For now, only do this for variable-width SVE registers.
1367 Doing it for constant-sized registers breaks lower-subreg.c. */
1368 /* ??? And once that's fixed, we should probably have similar
1369 code for Advanced SIMD. */
1370 if (!aarch64_sve_vg.is_constant ())
1372 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1373 if (vec_flags & VEC_SVE_PRED)
1374 return BYTES_PER_SVE_PRED;
1375 if (vec_flags & VEC_SVE_DATA)
1376 return BYTES_PER_SVE_VECTOR;
1378 return UNITS_PER_WORD;
1381 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1382 machine_mode
1383 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1384 machine_mode mode)
1386 /* The predicate mode determines which bits are significant and
1387 which are "don't care". Decreasing the number of lanes would
1388 lose data while increasing the number of lanes would make bits
1389 unnecessarily significant. */
1390 if (PR_REGNUM_P (regno))
1391 return mode;
1392 if (known_ge (GET_MODE_SIZE (mode), 4))
1393 return mode;
1394 else
1395 return SImode;
1398 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1399 that strcpy from constants will be faster. */
1401 static HOST_WIDE_INT
1402 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1404 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1405 return MAX (align, BITS_PER_WORD);
1406 return align;
1409 /* Return true if calls to DECL should be treated as
1410 long-calls (i.e. called via a register). */
1411 static bool
1412 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1414 return false;
1417 /* Return true if calls to symbol-ref SYM should be treated as
1418 long-calls (i.e. called via a register). */
1419 bool
1420 aarch64_is_long_call_p (rtx sym)
1422 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1425 /* Return true if calls to symbol-ref SYM should not go through
1426 plt stubs. */
1428 bool
1429 aarch64_is_noplt_call_p (rtx sym)
1431 const_tree decl = SYMBOL_REF_DECL (sym);
1433 if (flag_pic
1434 && decl
1435 && (!flag_plt
1436 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1437 && !targetm.binds_local_p (decl))
1438 return true;
1440 return false;
1443 /* Return true if the offsets to a zero/sign-extract operation
1444 represent an expression that matches an extend operation. The
1445 operands represent the parameters from
1447 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1448 bool
1449 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1450 rtx extract_imm)
1452 HOST_WIDE_INT mult_val, extract_val;
1454 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1455 return false;
1457 mult_val = INTVAL (mult_imm);
1458 extract_val = INTVAL (extract_imm);
1460 if (extract_val > 8
1461 && extract_val < GET_MODE_BITSIZE (mode)
1462 && exact_log2 (extract_val & ~7) > 0
1463 && (extract_val & 7) <= 4
1464 && mult_val == (1 << (extract_val & 7)))
1465 return true;
1467 return false;
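/* Worked example (illustrative): in DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34 pass all of the checks above: 34 > 8, 34 < 64,
   exact_log2 (34 & ~7) == exact_log2 (32) == 5, (34 & 7) == 2 <= 4 and
   4 == 1 << 2.  Roughly speaking, this matches a 32-bit value that has
   been extended and then shifted left by 2, i.e. an extended-register
   operand with LSL #2.  */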
1470 /* Emit an insn that's a simple single-set. Both the operands must be
1471 known to be valid. */
1472 inline static rtx_insn *
1473 emit_set_insn (rtx x, rtx y)
1475 return emit_insn (gen_rtx_SET (x, y));
1478 /* X and Y are two things to compare using CODE. Emit the compare insn and
1479 return the rtx for register 0 in the proper mode. */
1480 rtx
1481 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1483 machine_mode mode = SELECT_CC_MODE (code, x, y);
1484 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1486 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1487 return cc_reg;
1490 /* Build the SYMBOL_REF for __tls_get_addr. */
1492 static GTY(()) rtx tls_get_addr_libfunc;
1494 rtx
1495 aarch64_tls_get_addr (void)
1497 if (!tls_get_addr_libfunc)
1498 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1499 return tls_get_addr_libfunc;
1502 /* Return the TLS model to use for ADDR. */
1504 static enum tls_model
1505 tls_symbolic_operand_type (rtx addr)
1507 enum tls_model tls_kind = TLS_MODEL_NONE;
1508 if (GET_CODE (addr) == CONST)
1510 poly_int64 addend;
1511 rtx sym = strip_offset (addr, &addend);
1512 if (GET_CODE (sym) == SYMBOL_REF)
1513 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1515 else if (GET_CODE (addr) == SYMBOL_REF)
1516 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1518 return tls_kind;
1521 /* We allow LO_SUMs among our legitimate addresses so that combine can
1522 take care of combining addresses where necessary, but for code
1523 generation purposes we generate the address as:
1525 RTL Absolute
1526 tmp = hi (symbol_ref); adrp x1, foo
1527 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1530 PIC TLS
1531 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1532 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1533 bl __tls_get_addr
1536 Load TLS symbol, depending on TLS mechanism and TLS access model.
1538 Global Dynamic - Traditional TLS:
1539 adrp tmp, :tlsgd:imm
1540 add dest, tmp, #:tlsgd_lo12:imm
1541 bl __tls_get_addr
1543 Global Dynamic - TLS Descriptors:
1544 adrp dest, :tlsdesc:imm
1545 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1546 add dest, dest, #:tlsdesc_lo12:imm
1547 blr tmp
1548 mrs tp, tpidr_el0
1549 add dest, dest, tp
1551 Initial Exec:
1552 mrs tp, tpidr_el0
1553 adrp tmp, :gottprel:imm
1554 ldr dest, [tmp, #:gottprel_lo12:imm]
1555 add dest, dest, tp
1557 Local Exec:
1558 mrs tp, tpidr_el0
1559 add t0, tp, #:tprel_hi12:imm, lsl #12
1560 add t0, t0, #:tprel_lo12_nc:imm
1563 static void
1564 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1565 enum aarch64_symbol_type type)
1567 switch (type)
1569 case SYMBOL_SMALL_ABSOLUTE:
1571 /* In ILP32, the mode of dest can be either SImode or DImode. */
1572 rtx tmp_reg = dest;
1573 machine_mode mode = GET_MODE (dest);
1575 gcc_assert (mode == Pmode || mode == ptr_mode);
1577 if (can_create_pseudo_p ())
1578 tmp_reg = gen_reg_rtx (mode);
1580 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1581 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1582 return;
1585 case SYMBOL_TINY_ABSOLUTE:
1586 emit_insn (gen_rtx_SET (dest, imm));
1587 return;
1589 case SYMBOL_SMALL_GOT_28K:
1591 machine_mode mode = GET_MODE (dest);
1592 rtx gp_rtx = pic_offset_table_rtx;
1593 rtx insn;
1594 rtx mem;
1596 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1597 here before RTL expansion. Tree IVOPTs generates RTL patterns to
1598 estimate rtx costs, in which case pic_offset_table_rtx is not
1599 initialized. In that case there is no need to generate the first
1600 adrp instruction, as the final cost for global variable access is
1601 one instruction. */
1602 if (gp_rtx != NULL)
1604 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1605 use the page base as the GOT base, the first page may be wasted; in
1606 the worst case there is only 28K of space for the GOT).
1608 The generated instruction sequence for accessing a global variable is:
1611 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1613 Only one instruction is needed. But we must initialize
1614 pic_offset_table_rtx properly. We generate an initialization insn for
1615 every global access, and let CSE remove all the redundant copies.
1617 The final instruction sequence will look like the following
1618 for multiple global variable accesses.
1620 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1622 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1623 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1624 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1625 ... */
1627 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1628 crtl->uses_pic_offset_table = 1;
1629 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1631 if (mode != GET_MODE (gp_rtx))
1632 gp_rtx = gen_lowpart (mode, gp_rtx);
1636 if (mode == ptr_mode)
1638 if (mode == DImode)
1639 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1640 else
1641 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1643 mem = XVECEXP (SET_SRC (insn), 0, 0);
1645 else
1647 gcc_assert (mode == Pmode);
1649 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1650 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1653 /* The operand is expected to be MEM. Whenever the related insn
1654 pattern changed, above code which calculate mem should be
1655 updated. */
1656 gcc_assert (GET_CODE (mem) == MEM);
1657 MEM_READONLY_P (mem) = 1;
1658 MEM_NOTRAP_P (mem) = 1;
1659 emit_insn (insn);
1660 return;
1663 case SYMBOL_SMALL_GOT_4G:
1665 /* In ILP32, the mode of dest can be either SImode or DImode,
1666 while the got entry is always of SImode size. The mode of
1667 dest depends on how dest is used: if dest is assigned to a
1668 pointer (e.g. in the memory), it has SImode; it may have
1669 DImode if dest is dereferenced to access the memory.
1670 This is why we have to handle three different ldr_got_small
1671 patterns here (two patterns for ILP32). */
1673 rtx insn;
1674 rtx mem;
1675 rtx tmp_reg = dest;
1676 machine_mode mode = GET_MODE (dest);
1678 if (can_create_pseudo_p ())
1679 tmp_reg = gen_reg_rtx (mode);
1681 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1682 if (mode == ptr_mode)
1684 if (mode == DImode)
1685 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1686 else
1687 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1689 mem = XVECEXP (SET_SRC (insn), 0, 0);
1691 else
1693 gcc_assert (mode == Pmode);
1695 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1696 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1699 gcc_assert (GET_CODE (mem) == MEM);
1700 MEM_READONLY_P (mem) = 1;
1701 MEM_NOTRAP_P (mem) = 1;
1702 emit_insn (insn);
1703 return;
1706 case SYMBOL_SMALL_TLSGD:
1708 rtx_insn *insns;
1709 machine_mode mode = GET_MODE (dest);
1710 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1712 start_sequence ();
1713 if (TARGET_ILP32)
1714 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1715 else
1716 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1717 insns = get_insns ();
1718 end_sequence ();
1720 RTL_CONST_CALL_P (insns) = 1;
1721 emit_libcall_block (insns, dest, result, imm);
1722 return;
1725 case SYMBOL_SMALL_TLSDESC:
1727 machine_mode mode = GET_MODE (dest);
1728 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1729 rtx tp;
1731 gcc_assert (mode == Pmode || mode == ptr_mode);
1733 /* In ILP32, the got entry is always of SImode size. Unlike
1734 small GOT, the dest is fixed at reg 0. */
1735 if (TARGET_ILP32)
1736 emit_insn (gen_tlsdesc_small_si (imm));
1737 else
1738 emit_insn (gen_tlsdesc_small_di (imm));
1739 tp = aarch64_load_tp (NULL);
1741 if (mode != Pmode)
1742 tp = gen_lowpart (mode, tp);
1744 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1745 if (REG_P (dest))
1746 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1747 return;
1750 case SYMBOL_SMALL_TLSIE:
1752 /* In ILP32, the mode of dest can be either SImode or DImode,
1753 while the got entry is always of SImode size. The mode of
1754 dest depends on how dest is used: if dest is assigned to a
1755 pointer (e.g. in the memory), it has SImode; it may have
1756 DImode if dest is dereferenced to access the memory.
1757 This is why we have to handle three different tlsie_small
1758 patterns here (two patterns for ILP32). */
1759 machine_mode mode = GET_MODE (dest);
1760 rtx tmp_reg = gen_reg_rtx (mode);
1761 rtx tp = aarch64_load_tp (NULL);
1763 if (mode == ptr_mode)
1765 if (mode == DImode)
1766 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1767 else
1769 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1770 tp = gen_lowpart (mode, tp);
1773 else
1775 gcc_assert (mode == Pmode);
1776 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1779 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1780 if (REG_P (dest))
1781 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1782 return;
1785 case SYMBOL_TLSLE12:
1786 case SYMBOL_TLSLE24:
1787 case SYMBOL_TLSLE32:
1788 case SYMBOL_TLSLE48:
1790 machine_mode mode = GET_MODE (dest);
1791 rtx tp = aarch64_load_tp (NULL);
1793 if (mode != Pmode)
1794 tp = gen_lowpart (mode, tp);
1796 switch (type)
1798 case SYMBOL_TLSLE12:
1799 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1800 (dest, tp, imm));
1801 break;
1802 case SYMBOL_TLSLE24:
1803 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1804 (dest, tp, imm));
1805 break;
1806 case SYMBOL_TLSLE32:
1807 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1808 (dest, imm));
1809 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1810 (dest, dest, tp));
1811 break;
1812 case SYMBOL_TLSLE48:
1813 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1814 (dest, imm));
1815 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1816 (dest, dest, tp));
1817 break;
1818 default:
1819 gcc_unreachable ();
1822 if (REG_P (dest))
1823 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1824 return;
1827 case SYMBOL_TINY_GOT:
1828 emit_insn (gen_ldr_got_tiny (dest, imm));
1829 return;
1831 case SYMBOL_TINY_TLSIE:
1833 machine_mode mode = GET_MODE (dest);
1834 rtx tp = aarch64_load_tp (NULL);
1836 if (mode == ptr_mode)
1838 if (mode == DImode)
1839 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1840 else
1842 tp = gen_lowpart (mode, tp);
1843 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1846 else
1848 gcc_assert (mode == Pmode);
1849 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1852 if (REG_P (dest))
1853 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1854 return;
1857 default:
1858 gcc_unreachable ();
1862 /* Emit a move from SRC to DEST. Assume that the move expanders can
1863 handle all moves if !can_create_pseudo_p (). The distinction is
1864 important because, unlike emit_move_insn, the move expanders know
1865 how to force Pmode objects into the constant pool even when the
1866 constant pool address is not itself legitimate. */
1867 static rtx
1868 aarch64_emit_move (rtx dest, rtx src)
1870 return (can_create_pseudo_p ()
1871 ? emit_move_insn (dest, src)
1872 : emit_move_insn_1 (dest, src));
1875 /* Split a 128-bit move operation into two 64-bit move operations,
1876 taking care to handle partial overlap of register to register
1877 copies. Special cases are needed when moving between GP regs and
1878 FP regs. SRC can be a register, constant or memory; DST a register
1879 or memory. If either operand is memory it must not have any side
1880 effects. */
1881 void
1882 aarch64_split_128bit_move (rtx dst, rtx src)
1884 rtx dst_lo, dst_hi;
1885 rtx src_lo, src_hi;
1887 machine_mode mode = GET_MODE (dst);
1889 gcc_assert (mode == TImode || mode == TFmode);
1890 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1891 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1893 if (REG_P (dst) && REG_P (src))
1895 int src_regno = REGNO (src);
1896 int dst_regno = REGNO (dst);
1898 /* Handle FP <-> GP regs. */
1899 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1901 src_lo = gen_lowpart (word_mode, src);
1902 src_hi = gen_highpart (word_mode, src);
1904 if (mode == TImode)
1906 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1907 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1909 else
1911 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1912 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1914 return;
1916 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1918 dst_lo = gen_lowpart (word_mode, dst);
1919 dst_hi = gen_highpart (word_mode, dst);
1921 if (mode == TImode)
1923 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1924 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1926 else
1928 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1929 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1931 return;
1935 dst_lo = gen_lowpart (word_mode, dst);
1936 dst_hi = gen_highpart (word_mode, dst);
1937 src_lo = gen_lowpart (word_mode, src);
1938 src_hi = gen_highpart_mode (word_mode, mode, src);
1940 /* At most one pairing may overlap. */
1941 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1943 aarch64_emit_move (dst_hi, src_hi);
1944 aarch64_emit_move (dst_lo, src_lo);
1946 else
1948 aarch64_emit_move (dst_lo, src_lo);
1949 aarch64_emit_move (dst_hi, src_hi);
1953 bool
1954 aarch64_split_128bit_move_p (rtx dst, rtx src)
1956 return (! REG_P (src)
1957 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1960 /* Split a complex SIMD combine. */
1962 void
1963 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1965 machine_mode src_mode = GET_MODE (src1);
1966 machine_mode dst_mode = GET_MODE (dst);
1968 gcc_assert (VECTOR_MODE_P (dst_mode));
1969 gcc_assert (register_operand (dst, dst_mode)
1970 && register_operand (src1, src_mode)
1971 && register_operand (src2, src_mode));
1973 rtx (*gen) (rtx, rtx, rtx);
1975 switch (src_mode)
1977 case E_V8QImode:
1978 gen = gen_aarch64_simd_combinev8qi;
1979 break;
1980 case E_V4HImode:
1981 gen = gen_aarch64_simd_combinev4hi;
1982 break;
1983 case E_V2SImode:
1984 gen = gen_aarch64_simd_combinev2si;
1985 break;
1986 case E_V4HFmode:
1987 gen = gen_aarch64_simd_combinev4hf;
1988 break;
1989 case E_V2SFmode:
1990 gen = gen_aarch64_simd_combinev2sf;
1991 break;
1992 case E_DImode:
1993 gen = gen_aarch64_simd_combinedi;
1994 break;
1995 case E_DFmode:
1996 gen = gen_aarch64_simd_combinedf;
1997 break;
1998 default:
1999 gcc_unreachable ();
2002 emit_insn (gen (dst, src1, src2));
2003 return;
2006 /* Split a complex SIMD move. */
2008 void
2009 aarch64_split_simd_move (rtx dst, rtx src)
2011 machine_mode src_mode = GET_MODE (src);
2012 machine_mode dst_mode = GET_MODE (dst);
2014 gcc_assert (VECTOR_MODE_P (dst_mode));
2016 if (REG_P (dst) && REG_P (src))
2018 rtx (*gen) (rtx, rtx);
2020 gcc_assert (VECTOR_MODE_P (src_mode));
2022 switch (src_mode)
2024 case E_V16QImode:
2025 gen = gen_aarch64_split_simd_movv16qi;
2026 break;
2027 case E_V8HImode:
2028 gen = gen_aarch64_split_simd_movv8hi;
2029 break;
2030 case E_V4SImode:
2031 gen = gen_aarch64_split_simd_movv4si;
2032 break;
2033 case E_V2DImode:
2034 gen = gen_aarch64_split_simd_movv2di;
2035 break;
2036 case E_V8HFmode:
2037 gen = gen_aarch64_split_simd_movv8hf;
2038 break;
2039 case E_V4SFmode:
2040 gen = gen_aarch64_split_simd_movv4sf;
2041 break;
2042 case E_V2DFmode:
2043 gen = gen_aarch64_split_simd_movv2df;
2044 break;
2045 default:
2046 gcc_unreachable ();
2049 emit_insn (gen (dst, src));
2050 return;
2054 bool
2055 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2056 machine_mode ymode, rtx y)
2058 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2059 gcc_assert (r != NULL);
2060 return rtx_equal_p (x, r);
2064 static rtx
2065 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2067 if (can_create_pseudo_p ())
2068 return force_reg (mode, value);
2069 else
2071 gcc_assert (x);
2072 aarch64_emit_move (x, value);
2073 return x;
2077 /* Return true if we can move VALUE into a register using a single
2078 CNT[BHWD] instruction. */
2080 static bool
2081 aarch64_sve_cnt_immediate_p (poly_int64 value)
2083 HOST_WIDE_INT factor = value.coeffs[0];
2084 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2085 return (value.coeffs[1] == factor
2086 && IN_RANGE (factor, 2, 16 * 16)
2087 && (factor & 1) == 0
2088 && factor <= 16 * (factor & -factor));
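/* Illustrative examples of the test above: poly_int64 (2, 2) -- the number
   of .D elements in a vector -- is loadable with a single CNTD, and
   (48, 48) with a single "cntb ..., all, mul #3", whereas an odd value
   such as (3, 3), or anything above 16 * 16, is rejected.  */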
2091 /* Likewise for rtx X. */
2093 bool
2094 aarch64_sve_cnt_immediate_p (rtx x)
2096 poly_int64 value;
2097 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2100 /* Return the asm string for an instruction with a CNT-like vector size
2101 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2102 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2103 first part of the operands template (the part that comes before the
2104 vector size itself). FACTOR is the number of quadwords.
2105 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2106 If it is zero, we can use any element size. */
2108 static char *
2109 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2110 unsigned int factor,
2111 unsigned int nelts_per_vq)
2113 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2115 if (nelts_per_vq == 0)
2116 /* There is some overlap in the ranges of the four CNT instructions.
2117 Here we always use the smallest possible element size, so that the
2118 multiplier is 1 wherever possible. */
2119 nelts_per_vq = factor & -factor;
2120 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2121 gcc_assert (IN_RANGE (shift, 1, 4));
2122 char suffix = "dwhb"[shift - 1];
2124 factor >>= shift;
2125 unsigned int written;
2126 if (factor == 1)
2127 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2128 prefix, suffix, operands);
2129 else
2130 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2131 prefix, suffix, operands, factor);
2132 gcc_assert (written < sizeof (buffer));
2133 return buffer;
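/* Illustrative output of the routine above: a factor of 2 with no fixed
   element size prints as "cntd\t<operands>", while a factor of 48 prints
   as "cntb\t<operands>, all, mul #3" -- the smallest element size is
   chosen so that the multiplier stays as small as possible.  */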
2136 /* Return the asm string for an instruction with a CNT-like vector size
2137 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2138 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2139 first part of the operands template (the part that comes before the
2140 vector size itself). X is the value of the vector size operand,
2141 as a polynomial integer rtx. */
2143 char *
2144 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2145 rtx x)
2147 poly_int64 value = rtx_to_poly_int64 (x);
2148 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2149 return aarch64_output_sve_cnt_immediate (prefix, operands,
2150 value.coeffs[1], 0);
2153 /* Return true if we can add VALUE to a register using a single ADDVL
2154 or ADDPL instruction. */
2156 static bool
2157 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2159 HOST_WIDE_INT factor = value.coeffs[0];
2160 if (factor == 0 || value.coeffs[1] != factor)
2161 return false;
2162 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2163 and a value of 16 is one vector width. */
2164 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2165 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
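/* Illustrative: a factor of 16 corresponds to ADDVL #1 and a factor of 2
   to ADDPL #1, so the ranges above allow ADDVL #-32..#31 (multiples of 16)
   and ADDPL #-32..#31 (multiples of 2).  */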
2168 /* Likewise for rtx X. */
2170 bool
2171 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2173 poly_int64 value;
2174 return (poly_int_rtx_p (x, &value)
2175 && aarch64_sve_addvl_addpl_immediate_p (value));
2178 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2179 and storing the result in operand 0. */
2181 char *
2182 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2184 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2185 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2186 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2188 /* Use INC or DEC if possible. */
2189 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2191 if (aarch64_sve_cnt_immediate_p (offset_value))
2192 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2193 offset_value.coeffs[1], 0);
2194 if (aarch64_sve_cnt_immediate_p (-offset_value))
2195 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2196 -offset_value.coeffs[1], 0);
2199 int factor = offset_value.coeffs[1];
2200 if ((factor & 15) == 0)
2201 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2202 else
2203 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2204 return buffer;
2207 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2208 instruction. If it is, store the number of elements in each vector
2209 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2210 factor in *FACTOR_OUT (if nonnull). */
2212 bool
2213 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2214 unsigned int *nelts_per_vq_out)
2216 rtx elt;
2217 poly_int64 value;
2219 if (!const_vec_duplicate_p (x, &elt)
2220 || !poly_int_rtx_p (elt, &value))
2221 return false;
2223 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2224 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2225 /* There's no vector INCB. */
2226 return false;
2228 HOST_WIDE_INT factor = value.coeffs[0];
2229 if (value.coeffs[1] != factor)
2230 return false;
2232 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2233 if ((factor % nelts_per_vq) != 0
2234 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2235 return false;
2237 if (factor_out)
2238 *factor_out = factor;
2239 if (nelts_per_vq_out)
2240 *nelts_per_vq_out = nelts_per_vq;
2241 return true;
2244 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2245 instruction. */
2247 bool
2248 aarch64_sve_inc_dec_immediate_p (rtx x)
2250 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2253 /* Return the asm template for an SVE vector INC or DEC instruction.
2254 OPERANDS gives the operands before the vector count and X is the
2255 value of the vector count operand itself. */
2257 char *
2258 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2260 int factor;
2261 unsigned int nelts_per_vq;
2262 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2263 gcc_unreachable ();
2264 if (factor < 0)
2265 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2266 nelts_per_vq);
2267 else
2268 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2269 nelts_per_vq);
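/* For example, a VNx4SI duplicate of poly_int64 (4, 4) -- four .S elements
   per 128 bits -- prints as "incw\t<operands>", while the negated value
   prints as "decw\t<operands>".  */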
2272 static int
2273 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2274 scalar_int_mode mode)
2276 int i;
2277 unsigned HOST_WIDE_INT val, val2, mask;
2278 int one_match, zero_match;
2279 int num_insns;
2281 val = INTVAL (imm);
2283 if (aarch64_move_imm (val, mode))
2285 if (generate)
2286 emit_insn (gen_rtx_SET (dest, imm));
2287 return 1;
2290 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2291 (with XXXX non-zero). In that case check to see if the move can be done in
2292 a smaller mode. */
2293 val2 = val & 0xffffffff;
2294 if (mode == DImode
2295 && aarch64_move_imm (val2, SImode)
2296 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2298 if (generate)
2299 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2301 /* Check if we have to emit a second instruction by checking to see
2302 if any of the upper 32 bits of the original DI mode value is set. */
2303 if (val == val2)
2304 return 1;
2306 i = (val >> 48) ? 48 : 32;
2308 if (generate)
2309 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2310 GEN_INT ((val >> i) & 0xffff)));
2312 return 2;
2315 if ((val >> 32) == 0 || mode == SImode)
2317 if (generate)
2319 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2320 if (mode == SImode)
2321 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2322 GEN_INT ((val >> 16) & 0xffff)));
2323 else
2324 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2325 GEN_INT ((val >> 16) & 0xffff)));
2327 return 2;
2330 /* Remaining cases are all for DImode. */
2332 mask = 0xffff;
2333 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2334 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2335 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2336 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2338 if (zero_match != 2 && one_match != 2)
2340 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2341 For a 64-bit bitmask try whether changing 16 bits to all ones or
2342 zeroes creates a valid bitmask. To check any repeated bitmask,
2343 try using 16 bits from the other 32-bit half of val. */
2345 for (i = 0; i < 64; i += 16, mask <<= 16)
2347 val2 = val & ~mask;
2348 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2349 break;
2350 val2 = val | mask;
2351 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2352 break;
2353 val2 = val2 & ~mask;
2354 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2355 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2356 break;
2358 if (i != 64)
2360 if (generate)
2362 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2363 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2364 GEN_INT ((val >> i) & 0xffff)));
2366 return 2;
2370 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2371 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2372 otherwise skip zero bits. */
2374 num_insns = 1;
2375 mask = 0xffff;
2376 val2 = one_match > zero_match ? ~val : val;
2377 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2379 if (generate)
2380 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2381 ? (val | ~(mask << i))
2382 : (val & (mask << i)))));
2383 for (i += 16; i < 64; i += 16)
2385 if ((val2 & (mask << i)) == 0)
2386 continue;
2387 if (generate)
2388 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2389 GEN_INT ((val >> i) & 0xffff)));
2390 num_insns ++;
2393 return num_insns;
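/* A worked example of the code above: 0x123456789abcdef0 has no all-zero
   or all-ones 16-bit chunk and no bitmask variant, so it is built with
   four instructions, roughly:
       mov  x0, #0xdef0
       movk x0, #0x9abc, lsl #16
       movk x0, #0x5678, lsl #32
       movk x0, #0x1234, lsl #48
   (the register x0 is purely for illustration).  */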
2396 /* Return whether imm is a 128-bit immediate which is simple enough to
2397 expand inline. */
2398 bool
2399 aarch64_mov128_immediate (rtx imm)
2401 if (GET_CODE (imm) == CONST_INT)
2402 return true;
2404 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2406 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2407 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2409 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2410 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
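/* For example, a TImode constant whose two 64-bit halves each need a
   single MOV is considered simple (2 <= 4 instructions), whereas one that
   would need four MOV/MOVKs per half is not, so the caller can fall back
   to loading it from memory instead.  */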
2414 /* Return the number of temporary registers that aarch64_add_offset_1
2415 would need to add OFFSET to a register. */
2417 static unsigned int
2418 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2420 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2423 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2424 a non-polynomial OFFSET. MODE is the mode of the addition.
2425 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2426 be set and CFA adjustments added to the generated instructions.
2428 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2429 temporary if register allocation is already complete. This temporary
2430 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2431 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2432 the immediate again.
2434 Since this function may be used to adjust the stack pointer, we must
2435 ensure that it cannot cause transient stack deallocation (for example
2436 by first incrementing SP and then decrementing when adjusting by a
2437 large immediate). */
2439 static void
2440 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2441 rtx src, HOST_WIDE_INT offset, rtx temp1,
2442 bool frame_related_p, bool emit_move_imm)
2444 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2445 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2447 HOST_WIDE_INT moffset = abs_hwi (offset);
2448 rtx_insn *insn;
2450 if (!moffset)
2452 if (!rtx_equal_p (dest, src))
2454 insn = emit_insn (gen_rtx_SET (dest, src));
2455 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2457 return;
2460 /* Single instruction adjustment. */
2461 if (aarch64_uimm12_shift (moffset))
2463 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2464 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2465 return;
2468 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2469 and either:
2471 a) the offset cannot be loaded by a 16-bit move or
2472 b) there is no spare register into which we can move it. */
2473 if (moffset < 0x1000000
2474 && ((!temp1 && !can_create_pseudo_p ())
2475 || !aarch64_move_imm (moffset, mode)))
2477 HOST_WIDE_INT low_off = moffset & 0xfff;
2479 low_off = offset < 0 ? -low_off : low_off;
2480 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2481 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2482 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2483 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2484 return;
2487 /* Emit a move immediate if required and an addition/subtraction. */
2488 if (emit_move_imm)
2490 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2491 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2493 insn = emit_insn (offset < 0
2494 ? gen_sub3_insn (dest, src, temp1)
2495 : gen_add3_insn (dest, src, temp1));
2496 if (frame_related_p)
2498 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2499 rtx adj = plus_constant (mode, src, offset);
2500 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
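/* Illustrative: an offset of 0x123456 is neither a shifted 12-bit
   immediate nor a valid MOV immediate, so the code above emits two
   additions, roughly "add dest, src, #0x456" followed by
   "add dest, dest, #0x123000".  */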
2504 /* Return the number of temporary registers that aarch64_add_offset
2505 would need to move OFFSET into a register or add OFFSET to a register;
2506 ADD_P is true if we want the latter rather than the former. */
2508 static unsigned int
2509 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2511 /* This follows the same structure as aarch64_add_offset. */
2512 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2513 return 0;
2515 unsigned int count = 0;
2516 HOST_WIDE_INT factor = offset.coeffs[1];
2517 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2518 poly_int64 poly_offset (factor, factor);
2519 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2520 /* Need one register for the ADDVL/ADDPL result. */
2521 count += 1;
2522 else if (factor != 0)
2524 factor = abs (factor);
2525 if (factor > 16 * (factor & -factor))
2526 /* Need one register for the CNT result and one for the multiplication
2527 factor. If necessary, the second temporary can be reused for the
2528 constant part of the offset. */
2529 return 2;
2530 /* Need one register for the CNT result (which might then
2531 be shifted). */
2532 count += 1;
2534 return count + aarch64_add_offset_1_temporaries (constant);
2537 /* If X can be represented as a poly_int64, return the number
2538 of temporaries that are required to add it to a register.
2539 Return -1 otherwise. */
2542 aarch64_add_offset_temporaries (rtx x)
2544 poly_int64 offset;
2545 if (!poly_int_rtx_p (x, &offset))
2546 return -1;
2547 return aarch64_offset_temporaries (true, offset);
2550 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2551 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2552 be set and CFA adjustments added to the generated instructions.
2554 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2555 temporary if register allocation is already complete. This temporary
2556 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2557 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2558 false to avoid emitting the immediate again.
2560 TEMP2, if nonnull, is a second temporary register that doesn't
2561 overlap either DEST or SRC.
2563 Since this function may be used to adjust the stack pointer, we must
2564 ensure that it cannot cause transient stack deallocation (for example
2565 by first incrementing SP and then decrementing when adjusting by a
2566 large immediate). */
2568 static void
2569 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2570 poly_int64 offset, rtx temp1, rtx temp2,
2571 bool frame_related_p, bool emit_move_imm = true)
2573 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2574 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2575 gcc_assert (temp1 == NULL_RTX
2576 || !frame_related_p
2577 || !reg_overlap_mentioned_p (temp1, dest));
2578 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2580 /* Try using ADDVL or ADDPL to add the whole value. */
2581 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2583 rtx offset_rtx = gen_int_mode (offset, mode);
2584 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2585 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2586 return;
2589 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2590 SVE vector register, over and above the minimum size of 128 bits.
2591 This is equivalent to half the value returned by CNTD with a
2592 vector shape of ALL. */
2593 HOST_WIDE_INT factor = offset.coeffs[1];
2594 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2596 /* Try using ADDVL or ADDPL to add the VG-based part. */
2597 poly_int64 poly_offset (factor, factor);
2598 if (src != const0_rtx
2599 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2601 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2602 if (frame_related_p)
2604 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2605 RTX_FRAME_RELATED_P (insn) = true;
2606 src = dest;
2608 else
2610 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2611 src = aarch64_force_temporary (mode, temp1, addr);
2612 temp1 = temp2;
2613 temp2 = NULL_RTX;
2616 /* Otherwise use a CNT-based sequence. */
2617 else if (factor != 0)
2619 /* Use a subtraction if we have a negative factor. */
2620 rtx_code code = PLUS;
2621 if (factor < 0)
2623 factor = -factor;
2624 code = MINUS;
2627 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2628 into the multiplication. */
2629 rtx val;
2630 int shift = 0;
2631 if (factor & 1)
2632 /* Use a right shift by 1. */
2633 shift = -1;
2634 else
2635 factor /= 2;
2636 HOST_WIDE_INT low_bit = factor & -factor;
2637 if (factor <= 16 * low_bit)
2639 if (factor > 16 * 8)
2641 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2642 the value with the minimum multiplier and shift it into
2643 position. */
2644 int extra_shift = exact_log2 (low_bit);
2645 shift += extra_shift;
2646 factor >>= extra_shift;
2648 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2650 else
2652 /* Use CNTD, then multiply it by FACTOR. */
2653 val = gen_int_mode (poly_int64 (2, 2), mode);
2654 val = aarch64_force_temporary (mode, temp1, val);
2656 /* Go back to using a negative multiplication factor if we have
2657 no register from which to subtract. */
2658 if (code == MINUS && src == const0_rtx)
2660 factor = -factor;
2661 code = PLUS;
2663 rtx coeff1 = gen_int_mode (factor, mode);
2664 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2665 val = gen_rtx_MULT (mode, val, coeff1);
2668 if (shift > 0)
2670 /* Multiply by 1 << SHIFT. */
2671 val = aarch64_force_temporary (mode, temp1, val);
2672 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2674 else if (shift == -1)
2676 /* Divide by 2. */
2677 val = aarch64_force_temporary (mode, temp1, val);
2678 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2681 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2682 if (src != const0_rtx)
2684 val = aarch64_force_temporary (mode, temp1, val);
2685 val = gen_rtx_fmt_ee (code, mode, src, val);
2687 else if (code == MINUS)
2689 val = aarch64_force_temporary (mode, temp1, val);
2690 val = gen_rtx_NEG (mode, val);
2693 if (constant == 0 || frame_related_p)
2695 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2696 if (frame_related_p)
2698 RTX_FRAME_RELATED_P (insn) = true;
2699 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2700 gen_rtx_SET (dest, plus_constant (Pmode, src,
2701 poly_offset)));
2703 src = dest;
2704 if (constant == 0)
2705 return;
2707 else
2709 src = aarch64_force_temporary (mode, temp1, val);
2710 temp1 = temp2;
2711 temp2 = NULL_RTX;
2714 emit_move_imm = true;
2717 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2718 frame_related_p, emit_move_imm);
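/* Illustrative: with SRC in a register, an offset of one full SVE vector
   (poly_int64 (16, 16) bytes) is handled by the ADDVL branch above as a
   single "addvl dest, src, #1"; a leftover constant part, e.g. for
   poly_int64 (17, 16), is then added by aarch64_add_offset_1.  */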
2721 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2722 than a poly_int64. */
2724 void
2725 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2726 rtx offset_rtx, rtx temp1, rtx temp2)
2728 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2729 temp1, temp2, false);
2732 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2733 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2734 if TEMP1 already contains abs (DELTA). */
2736 static inline void
2737 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2739 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2740 temp1, temp2, true, emit_move_imm);
2743 /* Subtract DELTA from the stack pointer, marking the instructions
2744 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2745 if nonnull. */
2747 static inline void
2748 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2750 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2751 temp1, temp2, frame_related_p);
2754 /* Set DEST to (vec_series BASE STEP). */
2756 static void
2757 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2759 machine_mode mode = GET_MODE (dest);
2760 scalar_mode inner = GET_MODE_INNER (mode);
2762 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2763 if (!aarch64_sve_index_immediate_p (base))
2764 base = force_reg (inner, base);
2765 if (!aarch64_sve_index_immediate_p (step))
2766 step = force_reg (inner, step);
2768 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2771 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2772 integer of mode INT_MODE. Return true on success. */
2774 static bool
2775 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2776 rtx src)
2778 /* If the constant is smaller than 128 bits, we can do the move
2779 using a vector of SRC_MODEs. */
2780 if (src_mode != TImode)
2782 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2783 GET_MODE_SIZE (src_mode));
2784 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2785 emit_move_insn (gen_lowpart (dup_mode, dest),
2786 gen_const_vec_duplicate (dup_mode, src));
2787 return true;
2790 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2791 src = force_const_mem (src_mode, src);
2792 if (!src)
2793 return false;
2795 /* Make sure that the address is legitimate. */
2796 if (!aarch64_sve_ld1r_operand_p (src))
2798 rtx addr = force_reg (Pmode, XEXP (src, 0));
2799 src = replace_equiv_address (src, addr);
2802 machine_mode mode = GET_MODE (dest);
2803 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2804 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2805 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2806 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2807 emit_insn (gen_rtx_SET (dest, src));
2808 return true;
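/* For example, a VNx8HI constant that repeats two halfword values can be
   re-expressed as a VNx4SI duplicate of the combined 32-bit value and
   moved directly, whereas a full 128-bit repeating pattern is spilled to
   the constant pool and broadcast with LD1RQ under an all-true
   predicate.  */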
2811 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2812 isn't a simple duplicate or series. */
2814 static void
2815 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2817 machine_mode mode = GET_MODE (src);
2818 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2819 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2820 gcc_assert (npatterns > 1);
2822 if (nelts_per_pattern == 1)
2824 /* The constant is a repeating sequence of at least two elements,
2825 where the repeating elements occupy no more than 128 bits.
2826 Get an integer representation of the replicated value. */
2827 scalar_int_mode int_mode;
2828 if (BYTES_BIG_ENDIAN)
2829 /* For now, always use LD1RQ to load the value on big-endian
2830 targets, since the handling of smaller integers includes a
2831 subreg that is semantically an element reverse. */
2832 int_mode = TImode;
2833 else
2835 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2836 gcc_assert (int_bits <= 128);
2837 int_mode = int_mode_for_size (int_bits, 0).require ();
2839 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2840 if (int_value
2841 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2842 return;
2845 /* Expand each pattern individually. */
2846 rtx_vector_builder builder;
2847 auto_vec<rtx, 16> vectors (npatterns);
2848 for (unsigned int i = 0; i < npatterns; ++i)
2850 builder.new_vector (mode, 1, nelts_per_pattern);
2851 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2852 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2853 vectors.quick_push (force_reg (mode, builder.build ()));
2856 /* Use permutes to interleave the separate vectors. */
2857 while (npatterns > 1)
2859 npatterns /= 2;
2860 for (unsigned int i = 0; i < npatterns; ++i)
2862 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2863 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2864 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2865 vectors[i] = tmp;
2868 gcc_assert (vectors[0] == dest);
2871 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2872 is a pattern that can be used to set DEST to a replicated scalar
2873 element. */
2875 void
2876 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2877 rtx (*gen_vec_duplicate) (rtx, rtx))
2879 machine_mode mode = GET_MODE (dest);
2881 /* Check on what type of symbol it is. */
2882 scalar_int_mode int_mode;
2883 if ((GET_CODE (imm) == SYMBOL_REF
2884 || GET_CODE (imm) == LABEL_REF
2885 || GET_CODE (imm) == CONST
2886 || GET_CODE (imm) == CONST_POLY_INT)
2887 && is_a <scalar_int_mode> (mode, &int_mode))
2889 rtx mem;
2890 poly_int64 offset;
2891 HOST_WIDE_INT const_offset;
2892 enum aarch64_symbol_type sty;
2894 /* If we have (const (plus symbol offset)), separate out the offset
2895 before we start classifying the symbol. */
2896 rtx base = strip_offset (imm, &offset);
2898 /* We must always add an offset involving VL separately, rather than
2899 folding it into the relocation. */
2900 if (!offset.is_constant (&const_offset))
2902 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2903 emit_insn (gen_rtx_SET (dest, imm));
2904 else
2906 /* Do arithmetic on 32-bit values if the result is smaller
2907 than that. */
2908 if (partial_subreg_p (int_mode, SImode))
2910 /* It is invalid to do symbol calculations in modes
2911 narrower than SImode. */
2912 gcc_assert (base == const0_rtx);
2913 dest = gen_lowpart (SImode, dest);
2914 int_mode = SImode;
2916 if (base != const0_rtx)
2918 base = aarch64_force_temporary (int_mode, dest, base);
2919 aarch64_add_offset (int_mode, dest, base, offset,
2920 NULL_RTX, NULL_RTX, false);
2922 else
2923 aarch64_add_offset (int_mode, dest, base, offset,
2924 dest, NULL_RTX, false);
2926 return;
2929 sty = aarch64_classify_symbol (base, const_offset);
2930 switch (sty)
2932 case SYMBOL_FORCE_TO_MEM:
2933 if (const_offset != 0
2934 && targetm.cannot_force_const_mem (int_mode, imm))
2936 gcc_assert (can_create_pseudo_p ());
2937 base = aarch64_force_temporary (int_mode, dest, base);
2938 aarch64_add_offset (int_mode, dest, base, const_offset,
2939 NULL_RTX, NULL_RTX, false);
2940 return;
2943 mem = force_const_mem (ptr_mode, imm);
2944 gcc_assert (mem);
2946 /* If we aren't generating PC relative literals, then
2947 we need to expand the literal pool access carefully.
2948 This is something that needs to be done in a number
2949 of places, so could well live as a separate function. */
2950 if (!aarch64_pcrelative_literal_loads)
2952 gcc_assert (can_create_pseudo_p ());
2953 base = gen_reg_rtx (ptr_mode);
2954 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2955 if (ptr_mode != Pmode)
2956 base = convert_memory_address (Pmode, base);
2957 mem = gen_rtx_MEM (ptr_mode, base);
2960 if (int_mode != ptr_mode)
2961 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2963 emit_insn (gen_rtx_SET (dest, mem));
2965 return;
2967 case SYMBOL_SMALL_TLSGD:
2968 case SYMBOL_SMALL_TLSDESC:
2969 case SYMBOL_SMALL_TLSIE:
2970 case SYMBOL_SMALL_GOT_28K:
2971 case SYMBOL_SMALL_GOT_4G:
2972 case SYMBOL_TINY_GOT:
2973 case SYMBOL_TINY_TLSIE:
2974 if (const_offset != 0)
2976 gcc_assert (can_create_pseudo_p ());
2977 base = aarch64_force_temporary (int_mode, dest, base);
2978 aarch64_add_offset (int_mode, dest, base, const_offset,
2979 NULL_RTX, NULL_RTX, false);
2980 return;
2982 /* FALLTHRU */
2984 case SYMBOL_SMALL_ABSOLUTE:
2985 case SYMBOL_TINY_ABSOLUTE:
2986 case SYMBOL_TLSLE12:
2987 case SYMBOL_TLSLE24:
2988 case SYMBOL_TLSLE32:
2989 case SYMBOL_TLSLE48:
2990 aarch64_load_symref_appropriately (dest, imm, sty);
2991 return;
2993 default:
2994 gcc_unreachable ();
2998 if (!CONST_INT_P (imm))
3000 rtx base, step, value;
3001 if (GET_CODE (imm) == HIGH
3002 || aarch64_simd_valid_immediate (imm, NULL))
3003 emit_insn (gen_rtx_SET (dest, imm));
3004 else if (const_vec_series_p (imm, &base, &step))
3005 aarch64_expand_vec_series (dest, base, step);
3006 else if (const_vec_duplicate_p (imm, &value))
3008 /* If the constant is out of range of an SVE vector move,
3009 load it from memory if we can, otherwise move it into
3010 a register and use a DUP. */
3011 scalar_mode inner_mode = GET_MODE_INNER (mode);
3012 rtx op = force_const_mem (inner_mode, value);
3013 if (!op)
3014 op = force_reg (inner_mode, value);
3015 else if (!aarch64_sve_ld1r_operand_p (op))
3017 rtx addr = force_reg (Pmode, XEXP (op, 0));
3018 op = replace_equiv_address (op, addr);
3020 emit_insn (gen_vec_duplicate (dest, op));
3022 else if (GET_CODE (imm) == CONST_VECTOR
3023 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3024 aarch64_expand_sve_const_vector (dest, imm);
3025 else
3027 rtx mem = force_const_mem (mode, imm);
3028 gcc_assert (mem);
3029 emit_move_insn (dest, mem);
3032 return;
3035 aarch64_internal_mov_immediate (dest, imm, true,
3036 as_a <scalar_int_mode> (mode));
3039 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3040 that is known to contain PTRUE. */
3042 void
3043 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3045 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3046 gen_rtvec (2, pred, src),
3047 UNSPEC_MERGE_PTRUE)));
3050 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3051 operand is in memory. In this case we need to use the predicated LD1
3052 and ST1 instead of LDR and STR, both for correctness on big-endian
3053 targets and because LD1 and ST1 support a wider range of addressing modes.
3054 PRED_MODE is the mode of the predicate.
3056 See the comment at the head of aarch64-sve.md for details about the
3057 big-endian handling. */
3059 void
3060 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3062 machine_mode mode = GET_MODE (dest);
3063 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3064 if (!register_operand (src, mode)
3065 && !register_operand (dest, mode))
3067 rtx tmp = gen_reg_rtx (mode);
3068 if (MEM_P (src))
3069 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3070 else
3071 emit_move_insn (tmp, src);
3072 src = tmp;
3074 aarch64_emit_sve_pred_move (dest, ptrue, src);
3077 static bool
3078 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3079 tree exp ATTRIBUTE_UNUSED)
3081 /* Currently, always true. */
3082 return true;
3085 /* Implement TARGET_PASS_BY_REFERENCE. */
3087 static bool
3088 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3089 machine_mode mode,
3090 const_tree type,
3091 bool named ATTRIBUTE_UNUSED)
3093 HOST_WIDE_INT size;
3094 machine_mode dummymode;
3095 int nregs;
3097 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3098 if (mode == BLKmode && type)
3099 size = int_size_in_bytes (type);
3100 else
3101 /* No frontends can create types with variable-sized modes, so we
3102 shouldn't be asked to pass or return them. */
3103 size = GET_MODE_SIZE (mode).to_constant ();
3105 /* Aggregates are passed by reference based on their size. */
3106 if (type && AGGREGATE_TYPE_P (type))
3108 size = int_size_in_bytes (type);
3111 /* Variable sized arguments are always returned by reference. */
3112 if (size < 0)
3113 return true;
3115 /* Can this be a candidate to be passed in fp/simd register(s)? */
3116 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3117 &dummymode, &nregs,
3118 NULL))
3119 return false;
3121 /* Arguments which are variable sized or larger than 2 registers are
3122 passed by reference unless they are a homogeneous floating-point
3123 aggregate. */
3124 return size > 2 * UNITS_PER_WORD;
3127 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3128 static bool
3129 aarch64_return_in_msb (const_tree valtype)
3131 machine_mode dummy_mode;
3132 int dummy_int;
3134 /* Never happens in little-endian mode. */
3135 if (!BYTES_BIG_ENDIAN)
3136 return false;
3138 /* Only composite types smaller than or equal to 16 bytes can
3139 potentially be returned in registers. */
3140 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3141 || int_size_in_bytes (valtype) <= 0
3142 || int_size_in_bytes (valtype) > 16)
3143 return false;
3145 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3146 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3147 is always passed/returned in the least significant bits of fp/simd
3148 register(s). */
3149 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3150 &dummy_mode, &dummy_int, NULL))
3151 return false;
3153 return true;
3156 /* Implement TARGET_FUNCTION_VALUE.
3157 Define how to find the value returned by a function. */
3159 static rtx
3160 aarch64_function_value (const_tree type, const_tree func,
3161 bool outgoing ATTRIBUTE_UNUSED)
3163 machine_mode mode;
3164 int unsignedp;
3165 int count;
3166 machine_mode ag_mode;
3168 mode = TYPE_MODE (type);
3169 if (INTEGRAL_TYPE_P (type))
3170 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3172 if (aarch64_return_in_msb (type))
3174 HOST_WIDE_INT size = int_size_in_bytes (type);
3176 if (size % UNITS_PER_WORD != 0)
3178 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3179 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3183 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3184 &ag_mode, &count, NULL))
3186 if (!aarch64_composite_type_p (type, mode))
3188 gcc_assert (count == 1 && mode == ag_mode);
3189 return gen_rtx_REG (mode, V0_REGNUM);
3191 else
3193 int i;
3194 rtx par;
3196 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3197 for (i = 0; i < count; i++)
3199 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3200 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3201 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3202 XVECEXP (par, 0, i) = tmp;
3204 return par;
3207 else
3208 return gen_rtx_REG (mode, R0_REGNUM);
3211 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3212 Return true if REGNO is the number of a hard register in which the values
3213 of called function may come back. */
3215 static bool
3216 aarch64_function_value_regno_p (const unsigned int regno)
3218 /* A maximum of 16 bytes can be returned in the general registers. Examples
3219 of 16-byte return values are: 128-bit integers and 16-byte small
3220 structures (excluding homogeneous floating-point aggregates). */
3221 if (regno == R0_REGNUM || regno == R1_REGNUM)
3222 return true;
3224 /* Up to four fp/simd registers can return a function value, e.g. a
3225 homogeneous floating-point aggregate having four members. */
3226 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3227 return TARGET_FLOAT;
3229 return false;
3232 /* Implement TARGET_RETURN_IN_MEMORY.
3234 If the type T of the result of a function is such that
3235 void func (T arg)
3236 would require that arg be passed as a value in a register (or set of
3237 registers) according to the parameter passing rules, then the result
3238 is returned in the same registers as would be used for such an
3239 argument. */
3241 static bool
3242 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3244 HOST_WIDE_INT size;
3245 machine_mode ag_mode;
3246 int count;
3248 if (!AGGREGATE_TYPE_P (type)
3249 && TREE_CODE (type) != COMPLEX_TYPE
3250 && TREE_CODE (type) != VECTOR_TYPE)
3251 /* Simple scalar types are always returned in registers. */
3252 return false;
3254 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3255 type,
3256 &ag_mode,
3257 &count,
3258 NULL))
3259 return false;
3261 /* Types larger than 2 registers are returned in memory. */
3262 size = int_size_in_bytes (type);
3263 return (size < 0 || size > 2 * UNITS_PER_WORD);
3266 static bool
3267 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3268 const_tree type, int *nregs)
3270 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3271 return aarch64_vfp_is_call_or_return_candidate (mode,
3272 type,
3273 &pcum->aapcs_vfp_rmode,
3274 nregs,
3275 NULL);
3278 /* Given MODE and TYPE of a function argument, return the alignment in
3279 bits. The idea is to suppress any stronger alignment requested by
3280 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3281 This is a helper function for local use only. */
3283 static unsigned int
3284 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3286 if (!type)
3287 return GET_MODE_ALIGNMENT (mode);
3289 if (integer_zerop (TYPE_SIZE (type)))
3290 return 0;
3292 gcc_assert (TYPE_MODE (type) == mode);
3294 if (!AGGREGATE_TYPE_P (type))
3295 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3297 if (TREE_CODE (type) == ARRAY_TYPE)
3298 return TYPE_ALIGN (TREE_TYPE (type));
3300 unsigned int alignment = 0;
3301 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3302 if (TREE_CODE (field) == FIELD_DECL)
3303 alignment = std::max (alignment, DECL_ALIGN (field));
3305 return alignment;
3308 /* Layout a function argument according to the AAPCS64 rules. The rule
3309 numbers refer to the rule numbers in the AAPCS64. */
3311 static void
3312 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3313 const_tree type,
3314 bool named ATTRIBUTE_UNUSED)
3316 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3317 int ncrn, nvrn, nregs;
3318 bool allocate_ncrn, allocate_nvrn;
3319 HOST_WIDE_INT size;
3321 /* We need to do this once per argument. */
3322 if (pcum->aapcs_arg_processed)
3323 return;
3325 pcum->aapcs_arg_processed = true;
3327 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3328 if (type)
3329 size = int_size_in_bytes (type);
3330 else
3331 /* No frontends can create types with variable-sized modes, so we
3332 shouldn't be asked to pass or return them. */
3333 size = GET_MODE_SIZE (mode).to_constant ();
3334 size = ROUND_UP (size, UNITS_PER_WORD);
3336 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3337 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3338 mode,
3339 type,
3340 &nregs);
3342 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3343 The following code thus handles passing by SIMD/FP registers first. */
3345 nvrn = pcum->aapcs_nvrn;
3347 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3348 and homogeneous short-vector aggregates (HVA). */
3349 if (allocate_nvrn)
3351 if (!TARGET_FLOAT)
3352 aarch64_err_no_fpadvsimd (mode, "argument");
3354 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3356 pcum->aapcs_nextnvrn = nvrn + nregs;
3357 if (!aarch64_composite_type_p (type, mode))
3359 gcc_assert (nregs == 1);
3360 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3362 else
3364 rtx par;
3365 int i;
3366 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3367 for (i = 0; i < nregs; i++)
3369 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3370 V0_REGNUM + nvrn + i);
3371 rtx offset = gen_int_mode
3372 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3373 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3374 XVECEXP (par, 0, i) = tmp;
3376 pcum->aapcs_reg = par;
3378 return;
3380 else
3382 /* C.3 NSRN is set to 8. */
3383 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3384 goto on_stack;
3388 ncrn = pcum->aapcs_ncrn;
3389 nregs = size / UNITS_PER_WORD;
3391 /* C6 - C9, though the sign and zero extension semantics are
3392 handled elsewhere. This is the case where the argument fits
3393 entirely in general registers. */
3394 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3397 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3399 /* C.8 if the argument has an alignment of 16 then the NGRN is
3400 rounded up to the next even number. */
3401 if (nregs == 2
3402 && ncrn % 2
3403 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3404 comparison is there because for > 16 * BITS_PER_UNIT
3405 alignment nregs should be > 2 and therefore it should be
3406 passed by reference rather than value. */
3407 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3409 ++ncrn;
3410 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3413 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3414 A reg is still generated for it, but the caller should be smart
3415 enough not to use it. */
3416 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3417 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3418 else
3420 rtx par;
3421 int i;
3423 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3424 for (i = 0; i < nregs; i++)
3426 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3427 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3428 GEN_INT (i * UNITS_PER_WORD));
3429 XVECEXP (par, 0, i) = tmp;
3431 pcum->aapcs_reg = par;
3434 pcum->aapcs_nextncrn = ncrn + nregs;
3435 return;
3438 /* C.11 */
3439 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3441 /* The argument is passed on stack; record the needed number of words for
3442 this argument and align the total size if necessary. */
3443 on_stack:
3444 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3446 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3447 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3448 16 / UNITS_PER_WORD);
3449 return;
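/* Illustrative application of rule C.8 above: a 16-byte-aligned
   two-register argument such as an __int128 that would otherwise start in
   an odd-numbered core register is moved up to the next even register
   pair before being assigned.  */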
3452 /* Implement TARGET_FUNCTION_ARG. */
3454 static rtx
3455 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3456 const_tree type, bool named)
3458 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3459 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3461 if (mode == VOIDmode)
3462 return NULL_RTX;
3464 aarch64_layout_arg (pcum_v, mode, type, named);
3465 return pcum->aapcs_reg;
3468 void
3469 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3470 const_tree fntype ATTRIBUTE_UNUSED,
3471 rtx libname ATTRIBUTE_UNUSED,
3472 const_tree fndecl ATTRIBUTE_UNUSED,
3473 unsigned n_named ATTRIBUTE_UNUSED)
3475 pcum->aapcs_ncrn = 0;
3476 pcum->aapcs_nvrn = 0;
3477 pcum->aapcs_nextncrn = 0;
3478 pcum->aapcs_nextnvrn = 0;
3479 pcum->pcs_variant = ARM_PCS_AAPCS64;
3480 pcum->aapcs_reg = NULL_RTX;
3481 pcum->aapcs_arg_processed = false;
3482 pcum->aapcs_stack_words = 0;
3483 pcum->aapcs_stack_size = 0;
3485 if (!TARGET_FLOAT
3486 && fndecl && TREE_PUBLIC (fndecl)
3487 && fntype && fntype != error_mark_node)
3489 const_tree type = TREE_TYPE (fntype);
3490 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3491 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3492 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3493 &mode, &nregs, NULL))
3494 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3496 return;
3499 static void
3500 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3501 machine_mode mode,
3502 const_tree type,
3503 bool named)
3505 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3506 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3508 aarch64_layout_arg (pcum_v, mode, type, named);
3509 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3510 != (pcum->aapcs_stack_words != 0));
3511 pcum->aapcs_arg_processed = false;
3512 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3513 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3514 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3515 pcum->aapcs_stack_words = 0;
3516 pcum->aapcs_reg = NULL_RTX;
3520 bool
3521 aarch64_function_arg_regno_p (unsigned regno)
3523 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3524 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3527 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3528 PARM_BOUNDARY bits of alignment, but will be given anything up
3529 to STACK_BOUNDARY bits if the type requires it. This makes sure
3530 that both before and after the layout of each argument, the Next
3531 Stacked Argument Address (NSAA) will have a minimum alignment of
3532 8 bytes. */
3534 static unsigned int
3535 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3537 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3538 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
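/* On this target PARM_BOUNDARY is 64 and STACK_BOUNDARY is 128 bits, so
   for example a plain char argument still gets a 64-bit slot boundary,
   while an overaligned type is capped at 128 bits.  */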
3541 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3543 static fixed_size_mode
3544 aarch64_get_reg_raw_mode (int regno)
3546 if (TARGET_SVE && FP_REGNUM_P (regno))
3547 /* Don't use the SVE part of the register for __builtin_apply and
3548 __builtin_return. The SVE registers aren't used by the normal PCS,
3549 so using them there would be a waste of time. The PCS extensions
3550 for SVE types are fundamentally incompatible with the
3551 __builtin_return/__builtin_apply interface. */
3552 return as_a <fixed_size_mode> (V16QImode);
3553 return default_get_reg_raw_mode (regno);
3556 /* Implement TARGET_FUNCTION_ARG_PADDING.
3558 Small aggregate types are placed in the lowest memory address.
3560 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3562 static pad_direction
3563 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3565 /* On little-endian targets, the least significant byte of every stack
3566 argument is passed at the lowest byte address of the stack slot. */
3567 if (!BYTES_BIG_ENDIAN)
3568 return PAD_UPWARD;
3570 /* Otherwise, integral, floating-point and pointer types are padded downward:
3571 the least significant byte of a stack argument is passed at the highest
3572 byte address of the stack slot. */
3573 if (type
3574 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3575 || POINTER_TYPE_P (type))
3576 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3577 return PAD_DOWNWARD;
3579 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3580 return PAD_UPWARD;
3583 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3585 It specifies padding for the last (may also be the only)
3586 element of a block move between registers and memory. Assuming
3587 the block is in memory, padding upward means that
3588 the last element is padded after its most significant byte,
3589 while in downward padding, the last element is padded at
3590 its least significant byte side.
3592 Small aggregates and small complex types are always padded
3593 upwards.
3595 We don't need to worry about homogeneous floating-point or
3596 short-vector aggregates; their move is not affected by the
3597 padding direction determined here. Regardless of endianness,
3598 each element of such an aggregate is put in the least
3599 significant bits of a fp/simd register.
3601 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3602 register has useful data, and return the opposite if the most
3603 significant byte does. */
3605 bool
3606 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3607 bool first ATTRIBUTE_UNUSED)
3610 /* Small composite types are always padded upward. */
3611 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3613 HOST_WIDE_INT size;
3614 if (type)
3615 size = int_size_in_bytes (type);
3616 else
3617 /* No frontends can create types with variable-sized modes, so we
3618 shouldn't be asked to pass or return them. */
3619 size = GET_MODE_SIZE (mode).to_constant ();
3620 if (size < 2 * UNITS_PER_WORD)
3621 return true;
3624 /* Otherwise, use the default padding. */
3625 return !BYTES_BIG_ENDIAN;
3628 static scalar_int_mode
3629 aarch64_libgcc_cmp_return_mode (void)
3631 return SImode;
3634 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3636 /* We use the 12-bit shifted immediate arithmetic instructions so values
3637 must be a multiple of (1 << 12), i.e. 4096. */
3638 #define ARITH_FACTOR 4096
3640 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3641 #error Cannot use simple address calculation for stack probing
3642 #endif
3644 /* The pair of scratch registers used for stack probing. */
3645 #define PROBE_STACK_FIRST_REG 9
3646 #define PROBE_STACK_SECOND_REG 10
3648 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3649 inclusive. These are offsets from the current stack pointer. */
3651 static void
3652 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3654 HOST_WIDE_INT size;
3655 if (!poly_size.is_constant (&size))
3657 sorry ("stack probes for SVE frames");
3658 return;
3661 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3663 /* See the same assertion on PROBE_INTERVAL above. */
3664 gcc_assert ((first % ARITH_FACTOR) == 0);
3666 /* See if we have a constant small number of probes to generate. If so,
3667 that's the easy case. */
3668 if (size <= PROBE_INTERVAL)
3670 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3672 emit_set_insn (reg1,
3673 plus_constant (Pmode,
3674 stack_pointer_rtx, -(first + base)));
3675 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3678 /* The run-time loop is made up of 8 insns in the generic case while the
3679 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
3680 else if (size <= 4 * PROBE_INTERVAL)
3682 HOST_WIDE_INT i, rem;
3684 emit_set_insn (reg1,
3685 plus_constant (Pmode,
3686 stack_pointer_rtx,
3687 -(first + PROBE_INTERVAL)));
3688 emit_stack_probe (reg1);
3690 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3691 it exceeds SIZE. If only two probes are needed, this will not
3692 generate any code. Then probe at FIRST + SIZE. */
3693 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3695 emit_set_insn (reg1,
3696 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3697 emit_stack_probe (reg1);
3700 rem = size - (i - PROBE_INTERVAL);
3701 if (rem > 256)
3703 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3705 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3706 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3708 else
3709 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3712 /* Otherwise, do the same as above, but in a loop. Note that we must be
3713 extra careful with variables wrapping around because we might be at
3714 the very top (or the very bottom) of the address space and we have
3715 to be able to handle this case properly; in particular, we use an
3716 equality test for the loop condition. */
3717 else
3719 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3721 /* Step 1: round SIZE to the previous multiple of the interval. */
3723 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3726 /* Step 2: compute initial and final value of the loop counter. */
3728 /* TEST_ADDR = SP + FIRST. */
3729 emit_set_insn (reg1,
3730 plus_constant (Pmode, stack_pointer_rtx, -first));
3732 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3733 HOST_WIDE_INT adjustment = - (first + rounded_size);
3734 if (! aarch64_uimm12_shift (adjustment))
3736 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3737 true, Pmode);
3738 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3740 else
3742 emit_set_insn (reg2,
3743 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3746 /* Step 3: the loop
3750 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3751 probe at TEST_ADDR
3753 while (TEST_ADDR != LAST_ADDR)
3755 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3756 until it is equal to ROUNDED_SIZE. */
3758 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3761 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3762 that SIZE is equal to ROUNDED_SIZE. */
3764 if (size != rounded_size)
3766 HOST_WIDE_INT rem = size - rounded_size;
3768 if (rem > 256)
3770 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3772 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3773 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3775 else
3776 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3780 /* Make sure nothing is scheduled before we are done. */
3781 emit_insn (gen_blockage ());
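/* Illustrative: with the default 4 KiB PROBE_INTERVAL, an 8 KiB
   constant-size region takes the unrolled path above (probes at offsets
   FIRST + 4K and FIRST + 8K), while larger regions use the
   probe_stack_range loop, with one extra probe when SIZE is not a
   multiple of the interval.  */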
3784 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3785 absolute addresses. */
3787 const char *
3788 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3790 static int labelno = 0;
3791 char loop_lab[32];
3792 rtx xops[2];
3794 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3796 /* Loop. */
3797 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3799 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3800 xops[0] = reg1;
3801 xops[1] = GEN_INT (PROBE_INTERVAL);
3802 output_asm_insn ("sub\t%0, %0, %1", xops);
3804 /* Probe at TEST_ADDR. */
3805 output_asm_insn ("str\txzr, [%0]", xops);
3807 /* Test if TEST_ADDR == LAST_ADDR. */
3808 xops[1] = reg2;
3809 output_asm_insn ("cmp\t%0, %1", xops);
3811 /* Branch. */
3812 fputs ("\tb.ne\t", asm_out_file);
3813 assemble_name_raw (asm_out_file, loop_lab);
3814 fputc ('\n', asm_out_file);
3816 return "";
3819 /* Mark the registers that need to be saved by the callee and calculate
3820 the size of the callee-saved registers area and frame record (both FP
3821 and LR may be omitted). */
3822 static void
3823 aarch64_layout_frame (void)
3825 HOST_WIDE_INT offset = 0;
3826 int regno, last_fp_reg = INVALID_REGNUM;
3828 if (reload_completed && cfun->machine->frame.laid_out)
3829 return;
3831 /* Force a frame chain for EH returns so the return address is at FP+8. */
3832 cfun->machine->frame.emit_frame_chain
3833 = frame_pointer_needed || crtl->calls_eh_return;
3835 /* Emit a frame chain if the frame pointer is enabled.
3836 If -momit-leaf-frame-pointer is used, do not use a frame chain
3837 in leaf functions which do not use LR. */
3838 if (flag_omit_frame_pointer == 2
3839 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3840 && !df_regs_ever_live_p (LR_REGNUM)))
3841 cfun->machine->frame.emit_frame_chain = true;
3843 #define SLOT_NOT_REQUIRED (-2)
3844 #define SLOT_REQUIRED (-1)
3846 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3847 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3849 /* First mark all the registers that really need to be saved... */
3850 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3851 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3853 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3854 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3856 /* ... that includes the eh data registers (if needed)... */
3857 if (crtl->calls_eh_return)
3858 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3859 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3860 = SLOT_REQUIRED;
3862 /* ... and any callee saved register that dataflow says is live. */
3863 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3864 if (df_regs_ever_live_p (regno)
3865 && (regno == R30_REGNUM
3866 || !call_used_regs[regno]))
3867 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3869 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3870 if (df_regs_ever_live_p (regno)
3871 && !call_used_regs[regno])
3873 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
3874 last_fp_reg = regno;
3877 if (cfun->machine->frame.emit_frame_chain)
3879 /* FP and LR are placed in the linkage record. */
3880 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
3881 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
3882 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
3883 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
3884 offset = 2 * UNITS_PER_WORD;
3887 /* Now assign stack slots for them. */
3888 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3889 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
3891 cfun->machine->frame.reg_offset[regno] = offset;
3892 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
3893 cfun->machine->frame.wb_candidate1 = regno;
3894 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
3895 cfun->machine->frame.wb_candidate2 = regno;
3896 offset += UNITS_PER_WORD;
3899 HOST_WIDE_INT max_int_offset = offset;
3900 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3901 bool has_align_gap = offset != max_int_offset;
3903 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3904 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
3906 /* If there is an alignment gap between integer and fp callee-saves,
3907 allocate the last fp register to it if possible. */
3908 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
3910 cfun->machine->frame.reg_offset[regno] = max_int_offset;
3911 break;
3914 cfun->machine->frame.reg_offset[regno] = offset;
3915 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
3916 cfun->machine->frame.wb_candidate1 = regno;
3917 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
3918 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
3919 cfun->machine->frame.wb_candidate2 = regno;
3920 offset += UNITS_PER_WORD;
3923 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
3925 cfun->machine->frame.saved_regs_size = offset;
3927 HOST_WIDE_INT varargs_and_saved_regs_size
3928 = offset + cfun->machine->frame.saved_varargs_size;
3930 cfun->machine->frame.hard_fp_offset
3931 = aligned_upper_bound (varargs_and_saved_regs_size
3932 + get_frame_size (),
3933 STACK_BOUNDARY / BITS_PER_UNIT);
3935 /* Both these values are already aligned. */
3936 gcc_assert (multiple_p (crtl->outgoing_args_size,
3937 STACK_BOUNDARY / BITS_PER_UNIT));
3938 cfun->machine->frame.frame_size
3939 = (cfun->machine->frame.hard_fp_offset
3940 + crtl->outgoing_args_size);
3942 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
3944 cfun->machine->frame.initial_adjust = 0;
3945 cfun->machine->frame.final_adjust = 0;
3946 cfun->machine->frame.callee_adjust = 0;
3947 cfun->machine->frame.callee_offset = 0;
3949 HOST_WIDE_INT max_push_offset = 0;
3950 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
3951 max_push_offset = 512;
3952 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
3953 max_push_offset = 256;
3955 HOST_WIDE_INT const_size, const_fp_offset;
3956 if (cfun->machine->frame.frame_size.is_constant (&const_size)
3957 && const_size < max_push_offset
3958 && known_eq (crtl->outgoing_args_size, 0))
3960 /* Simple, small frame with no outgoing arguments:
3961 stp reg1, reg2, [sp, -frame_size]!
3962 stp reg3, reg4, [sp, 16] */
3963 cfun->machine->frame.callee_adjust = const_size;
3965 else if (known_lt (crtl->outgoing_args_size
3966 + cfun->machine->frame.saved_regs_size, 512)
3967 && !(cfun->calls_alloca
3968 && known_lt (cfun->machine->frame.hard_fp_offset,
3969 max_push_offset)))
3971 /* Frame with small outgoing arguments:
3972 sub sp, sp, frame_size
3973 stp reg1, reg2, [sp, outgoing_args_size]
3974 stp reg3, reg4, [sp, outgoing_args_size + 16] */
3975 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
3976 cfun->machine->frame.callee_offset
3977 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
3979 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
3980 && const_fp_offset < max_push_offset)
3982 /* Frame with large outgoing arguments but a small local area:
3983 stp reg1, reg2, [sp, -hard_fp_offset]!
3984 stp reg3, reg4, [sp, 16]
3985 sub sp, sp, outgoing_args_size */
3986 cfun->machine->frame.callee_adjust = const_fp_offset;
3987 cfun->machine->frame.final_adjust
3988 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
3990 else
3992 /* Frame with large local area and outgoing arguments using frame pointer:
3993 sub sp, sp, hard_fp_offset
3994 stp x29, x30, [sp, 0]
3995 add x29, sp, 0
3996 stp reg3, reg4, [sp, 16]
3997 sub sp, sp, outgoing_args_size */
3998 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3999 cfun->machine->frame.final_adjust
4000 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4003 cfun->machine->frame.laid_out = true;
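/* Worked example (illustrative): a function that emits a frame chain, also
   saves x19/x20, and has 16 bytes of locals with no outgoing arguments gets
   reg_offset x29 = 0, x30 = 8, x19 = 16, x20 = 24, saved_regs_size = 32 and
   hard_fp_offset = frame_size = 48.  Since 48 is below max_push_offset and
   there are no outgoing arguments, the first case above applies:
   callee_adjust = 48 while initial_adjust and final_adjust stay zero.  */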
4006 /* Return true if the register REGNO is saved on entry to
4007 the current function. */
4009 static bool
4010 aarch64_register_saved_on_entry (int regno)
4012 return cfun->machine->frame.reg_offset[regno] >= 0;
4015 /* Return the next register at or above REGNO, up to LIMIT, that the callee
4016 needs to save. */
4018 static unsigned
4019 aarch64_next_callee_save (unsigned regno, unsigned limit)
4021 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4022 regno ++;
4023 return regno;
4026 /* Push the register number REGNO of mode MODE to the stack with write-back
4027 adjusting the stack by ADJUSTMENT. */
4029 static void
4030 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4031 HOST_WIDE_INT adjustment)
4033 rtx base_rtx = stack_pointer_rtx;
4034 rtx insn, reg, mem;
4036 reg = gen_rtx_REG (mode, regno);
4037 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4038 plus_constant (Pmode, base_rtx, -adjustment));
4039 mem = gen_frame_mem (mode, mem);
4041 insn = emit_move_insn (mem, reg);
4042 RTX_FRAME_RELATED_P (insn) = 1;
4045 /* Generate and return an instruction to store the pair of registers
4046 REG and REG2 of mode MODE to location BASE with write-back adjusting
4047 the stack location BASE by ADJUSTMENT. */
4049 static rtx
4050 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4051 HOST_WIDE_INT adjustment)
4053 switch (mode)
4055 case E_DImode:
4056 return gen_storewb_pairdi_di (base, base, reg, reg2,
4057 GEN_INT (-adjustment),
4058 GEN_INT (UNITS_PER_WORD - adjustment));
4059 case E_DFmode:
4060 return gen_storewb_pairdf_di (base, base, reg, reg2,
4061 GEN_INT (-adjustment),
4062 GEN_INT (UNITS_PER_WORD - adjustment));
4063 default:
4064 gcc_unreachable ();
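/* For example (illustrative): with E_DImode and ADJUSTMENT == 32 the insn
   returned above assembles to a pre-indexed store pair such as

	stp	x19, x20, [sp, -32]!

   and the E_DFmode case does the same with the d registers.  */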
4068 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4069 stack pointer by ADJUSTMENT. */
4071 static void
4072 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4074 rtx_insn *insn;
4075 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4077 if (regno2 == INVALID_REGNUM)
4078 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4080 rtx reg1 = gen_rtx_REG (mode, regno1);
4081 rtx reg2 = gen_rtx_REG (mode, regno2);
4083 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4084 reg2, adjustment));
4085 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4086 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4087 RTX_FRAME_RELATED_P (insn) = 1;
4090 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
4091 adjusting it by ADJUSTMENT afterwards. */
4093 static rtx
4094 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4095 HOST_WIDE_INT adjustment)
4097 switch (mode)
4099 case E_DImode:
4100 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4101 GEN_INT (UNITS_PER_WORD));
4102 case E_DFmode:
4103 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4104 GEN_INT (UNITS_PER_WORD));
4105 default:
4106 gcc_unreachable ();
4110 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4111 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4112 into CFI_OPS. */
4114 static void
4115 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4116 rtx *cfi_ops)
4118 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4119 rtx reg1 = gen_rtx_REG (mode, regno1);
4121 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4123 if (regno2 == INVALID_REGNUM)
4125 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4126 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4127 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4129 else
4131 rtx reg2 = gen_rtx_REG (mode, regno2);
4132 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4133 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4134 reg2, adjustment));
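/* Illustrative output for aarch64_pop_regs: the pair case with
   ADJUSTMENT == 32 assembles to a post-indexed load pair such as

	ldp	x19, x20, [sp], 32

   while the single-register case becomes "ldr reg, [sp], 32".  */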
4138 /* Generate and return a store pair instruction of mode MODE to store
4139 register REG1 to MEM1 and register REG2 to MEM2. */
4141 static rtx
4142 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4143 rtx reg2)
4145 switch (mode)
4147 case E_DImode:
4148 return gen_store_pairdi (mem1, reg1, mem2, reg2);
4150 case E_DFmode:
4151 return gen_store_pairdf (mem1, reg1, mem2, reg2);
4153 default:
4154 gcc_unreachable ();
4158 /* Generate and return a load pair instruction of mode MODE to load register
4159 REG1 from MEM1 and register REG2 from MEM2. */
4161 static rtx
4162 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4163 rtx mem2)
4165 switch (mode)
4167 case E_DImode:
4168 return gen_load_pairdi (reg1, mem1, reg2, mem2);
4170 case E_DFmode:
4171 return gen_load_pairdf (reg1, mem1, reg2, mem2);
4173 default:
4174 gcc_unreachable ();
4178 /* Return TRUE if return address signing should be enabled for the current
4179 function, otherwise return FALSE. */
4181 bool
4182 aarch64_return_address_signing_enabled (void)
4184 /* This function should only be called after the frame has been laid out. */
4185 gcc_assert (cfun->machine->frame.laid_out);
4187 /* If the signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf
4188 function if its LR is pushed onto the stack. */
4189 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4190 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4191 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4194 /* Emit code to save the callee-saved registers from register number START
4195 to LIMIT to the stack at the location starting at offset START_OFFSET,
4196 skipping any write-back candidates if SKIP_WB is true. */
4198 static void
4199 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4200 unsigned start, unsigned limit, bool skip_wb)
4202 rtx_insn *insn;
4203 unsigned regno;
4204 unsigned regno2;
4206 for (regno = aarch64_next_callee_save (start, limit);
4207 regno <= limit;
4208 regno = aarch64_next_callee_save (regno + 1, limit))
4210 rtx reg, mem;
4211 poly_int64 offset;
4213 if (skip_wb
4214 && (regno == cfun->machine->frame.wb_candidate1
4215 || regno == cfun->machine->frame.wb_candidate2))
4216 continue;
4218 if (cfun->machine->reg_is_wrapped_separately[regno])
4219 continue;
4221 reg = gen_rtx_REG (mode, regno);
4222 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4223 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4224 offset));
4226 regno2 = aarch64_next_callee_save (regno + 1, limit);
4228 if (regno2 <= limit
4229 && !cfun->machine->reg_is_wrapped_separately[regno2]
4230 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4231 == cfun->machine->frame.reg_offset[regno2]))
4234 rtx reg2 = gen_rtx_REG (mode, regno2);
4235 rtx mem2;
4237 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4238 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4239 offset));
4240 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4241 reg2));
4243 /* The first part of a frame-related parallel insn is
4244 always assumed to be relevant to the frame
4245 calculations; subsequent parts are only
4246 frame-related if explicitly marked. */
4247 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4248 regno = regno2;
4250 else
4251 insn = emit_move_insn (mem, reg);
4253 RTX_FRAME_RELATED_P (insn) = 1;
4257 /* Emit code to restore the callee registers of mode MODE from register
4258 number START up to and including LIMIT. Restore from the stack offset
4259 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4260 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4262 static void
4263 aarch64_restore_callee_saves (machine_mode mode,
4264 poly_int64 start_offset, unsigned start,
4265 unsigned limit, bool skip_wb, rtx *cfi_ops)
4267 rtx base_rtx = stack_pointer_rtx;
4268 unsigned regno;
4269 unsigned regno2;
4270 poly_int64 offset;
4272 for (regno = aarch64_next_callee_save (start, limit);
4273 regno <= limit;
4274 regno = aarch64_next_callee_save (regno + 1, limit))
4276 if (cfun->machine->reg_is_wrapped_separately[regno])
4277 continue;
4279 rtx reg, mem;
4281 if (skip_wb
4282 && (regno == cfun->machine->frame.wb_candidate1
4283 || regno == cfun->machine->frame.wb_candidate2))
4284 continue;
4286 reg = gen_rtx_REG (mode, regno);
4287 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4288 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4290 regno2 = aarch64_next_callee_save (regno + 1, limit);
4292 if (regno2 <= limit
4293 && !cfun->machine->reg_is_wrapped_separately[regno2]
4294 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4295 == cfun->machine->frame.reg_offset[regno2]))
4297 rtx reg2 = gen_rtx_REG (mode, regno2);
4298 rtx mem2;
4300 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4301 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4302 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4304 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4305 regno = regno2;
4307 else
4308 emit_move_insn (reg, mem);
4309 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4313 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4314 of MODE. */
4316 static inline bool
4317 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4319 HOST_WIDE_INT multiple;
4320 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4321 && IN_RANGE (multiple, -8, 7));
4324 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4325 of MODE. */
4327 static inline bool
4328 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4330 HOST_WIDE_INT multiple;
4331 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4332 && IN_RANGE (multiple, 0, 63));
4335 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4336 of MODE. */
4338 bool
4339 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4341 HOST_WIDE_INT multiple;
4342 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4343 && IN_RANGE (multiple, -64, 63));
4346 /* Return true if OFFSET is a signed 9-bit value. */
4348 static inline bool
4349 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4350 poly_int64 offset)
4352 HOST_WIDE_INT const_offset;
4353 return (offset.is_constant (&const_offset)
4354 && IN_RANGE (const_offset, -256, 255));
4357 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4358 of MODE. */
4360 static inline bool
4361 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4363 HOST_WIDE_INT multiple;
4364 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4365 && IN_RANGE (multiple, -256, 255));
4368 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4369 of MODE. */
4371 static inline bool
4372 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4374 HOST_WIDE_INT multiple;
4375 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4376 && IN_RANGE (multiple, 0, 4095));
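/* Worked example of the offset predicates above (illustrative), taking
   DImode with its 8-byte unit size:
   - offset_4bit_signed_scaled_p accepts multiples of 8 in [-64, 56]
   - offset_6bit_unsigned_scaled_p accepts multiples of 8 in [0, 504]
   - aarch64_offset_7bit_signed_scaled_p accepts multiples of 8 in [-512, 504]
   - offset_9bit_signed_unscaled_p accepts any byte offset in [-256, 255]
   - offset_12bit_unsigned_scaled_p accepts multiples of 8 in [0, 32760].  */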
4379 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4381 static sbitmap
4382 aarch64_get_separate_components (void)
4384 aarch64_layout_frame ();
4386 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4387 bitmap_clear (components);
4389 /* The registers we need saved to the frame. */
4390 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4391 if (aarch64_register_saved_on_entry (regno))
4393 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4394 if (!frame_pointer_needed)
4395 offset += cfun->machine->frame.frame_size
4396 - cfun->machine->frame.hard_fp_offset;
4397 /* Check that we can access the stack slot of the register with one
4398 direct load with no adjustments needed. */
4399 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4400 bitmap_set_bit (components, regno);
4403 /* Don't mess with the hard frame pointer. */
4404 if (frame_pointer_needed)
4405 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4407 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4408 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4409 /* If aarch64_layout_frame has chosen registers to store/restore with
4410 writeback don't interfere with them to avoid having to output explicit
4411 stack adjustment instructions. */
4412 if (reg2 != INVALID_REGNUM)
4413 bitmap_clear_bit (components, reg2);
4414 if (reg1 != INVALID_REGNUM)
4415 bitmap_clear_bit (components, reg1);
4417 bitmap_clear_bit (components, LR_REGNUM);
4418 bitmap_clear_bit (components, SP_REGNUM);
4420 return components;
4423 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4425 static sbitmap
4426 aarch64_components_for_bb (basic_block bb)
4428 bitmap in = DF_LIVE_IN (bb);
4429 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4430 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4432 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4433 bitmap_clear (components);
4435 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4436 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4437 if ((!call_used_regs[regno])
4438 && (bitmap_bit_p (in, regno)
4439 || bitmap_bit_p (gen, regno)
4440 || bitmap_bit_p (kill, regno)))
4441 bitmap_set_bit (components, regno);
4443 return components;
4446 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4447 Nothing to do for aarch64. */
4449 static void
4450 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4454 /* Return the next set bit in BMP from START onwards. Return the total number
4455 of bits in BMP if no set bit is found at or after START. */
4457 static unsigned int
4458 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4460 unsigned int nbits = SBITMAP_SIZE (bmp);
4461 if (start == nbits)
4462 return start;
4464 gcc_assert (start < nbits);
4465 for (unsigned int i = start; i < nbits; i++)
4466 if (bitmap_bit_p (bmp, i))
4467 return i;
4469 return nbits;
4472 /* Do the work for aarch64_emit_prologue_components and
4473 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4474 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4475 for these components or the epilogue sequence. That is, it determines
4476 whether we should emit stores or loads and what kind of CFA notes to attach
4477 to the insns. Otherwise the logic for the two sequences is very
4478 similar. */
4480 static void
4481 aarch64_process_components (sbitmap components, bool prologue_p)
4483 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4484 ? HARD_FRAME_POINTER_REGNUM
4485 : STACK_POINTER_REGNUM);
4487 unsigned last_regno = SBITMAP_SIZE (components);
4488 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4489 rtx_insn *insn = NULL;
4491 while (regno != last_regno)
4493 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4494 so DFmode for the vector registers is enough. */
4495 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4496 rtx reg = gen_rtx_REG (mode, regno);
4497 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4498 if (!frame_pointer_needed)
4499 offset += cfun->machine->frame.frame_size
4500 - cfun->machine->frame.hard_fp_offset;
4501 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4502 rtx mem = gen_frame_mem (mode, addr);
4504 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4505 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4506 /* No more registers to handle after REGNO.
4507 Emit a single save/restore and exit. */
4508 if (regno2 == last_regno)
4510 insn = emit_insn (set);
4511 RTX_FRAME_RELATED_P (insn) = 1;
4512 if (prologue_p)
4513 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4514 else
4515 add_reg_note (insn, REG_CFA_RESTORE, reg);
4516 break;
4519 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4520 /* The next register is not of the same class or its offset is not
4521 mergeable with the current one into a pair. */
4522 if (!satisfies_constraint_Ump (mem)
4523 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4524 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4525 GET_MODE_SIZE (mode)))
4527 insn = emit_insn (set);
4528 RTX_FRAME_RELATED_P (insn) = 1;
4529 if (prologue_p)
4530 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4531 else
4532 add_reg_note (insn, REG_CFA_RESTORE, reg);
4534 regno = regno2;
4535 continue;
4538 /* REGNO2 can be saved/restored in a pair with REGNO. */
4539 rtx reg2 = gen_rtx_REG (mode, regno2);
4540 if (!frame_pointer_needed)
4541 offset2 += cfun->machine->frame.frame_size
4542 - cfun->machine->frame.hard_fp_offset;
4543 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4544 rtx mem2 = gen_frame_mem (mode, addr2);
4545 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4546 : gen_rtx_SET (reg2, mem2);
4548 if (prologue_p)
4549 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4550 else
4551 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4553 RTX_FRAME_RELATED_P (insn) = 1;
4554 if (prologue_p)
4556 add_reg_note (insn, REG_CFA_OFFSET, set);
4557 add_reg_note (insn, REG_CFA_OFFSET, set2);
4559 else
4561 add_reg_note (insn, REG_CFA_RESTORE, reg);
4562 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4565 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4569 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4571 static void
4572 aarch64_emit_prologue_components (sbitmap components)
4574 aarch64_process_components (components, true);
4577 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4579 static void
4580 aarch64_emit_epilogue_components (sbitmap components)
4582 aarch64_process_components (components, false);
4585 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4587 static void
4588 aarch64_set_handled_components (sbitmap components)
4590 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4591 if (bitmap_bit_p (components, regno))
4592 cfun->machine->reg_is_wrapped_separately[regno] = true;
4595 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4596 is saved at BASE + OFFSET. */
4598 static void
4599 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4600 rtx base, poly_int64 offset)
4602 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4603 add_reg_note (insn, REG_CFA_EXPRESSION,
4604 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4607 /* AArch64 stack frames generated by this compiler look like:
4609 +-------------------------------+
4611 | incoming stack arguments |
4613 +-------------------------------+
4614 | | <-- incoming stack pointer (aligned)
4615 | callee-allocated save area |
4616 | for register varargs |
4618 +-------------------------------+
4619 | local variables | <-- frame_pointer_rtx
4621 +-------------------------------+
4622 | padding0 | \
4623 +-------------------------------+ |
4624 | callee-saved registers | | frame.saved_regs_size
4625 +-------------------------------+ |
4626 | LR' | |
4627 +-------------------------------+ |
4628 | FP' | / <- hard_frame_pointer_rtx (aligned)
4629 +-------------------------------+
4630 | dynamic allocation |
4631 +-------------------------------+
4632 | padding |
4633 +-------------------------------+
4634 | outgoing stack arguments | <-- arg_pointer
4636 +-------------------------------+
4637 | | <-- stack_pointer_rtx (aligned)
4639 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4640 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4641 unchanged. */
4643 /* Generate the prologue instructions for entry into a function.
4644 Establish the stack frame by decreasing the stack pointer with a
4645 properly calculated size and, if necessary, create a frame record
4646 filled with the values of LR and previous frame pointer. The
4647 current FP is also set up if it is in use. */
4649 void
4650 aarch64_expand_prologue (void)
4652 aarch64_layout_frame ();
4654 poly_int64 frame_size = cfun->machine->frame.frame_size;
4655 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4656 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4657 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4658 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4659 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4660 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4661 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4662 rtx_insn *insn;
4664 /* Sign return address for functions. */
4665 if (aarch64_return_address_signing_enabled ())
4667 insn = emit_insn (gen_pacisp ());
4668 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4669 RTX_FRAME_RELATED_P (insn) = 1;
4672 if (flag_stack_usage_info)
4673 current_function_static_stack_size = constant_lower_bound (frame_size);
4675 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4677 if (crtl->is_leaf && !cfun->calls_alloca)
4679 if (maybe_gt (frame_size, PROBE_INTERVAL)
4680 && maybe_gt (frame_size, get_stack_check_protect ()))
4681 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4682 (frame_size
4683 - get_stack_check_protect ()));
4685 else if (maybe_gt (frame_size, 0))
4686 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4689 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4690 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4692 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4694 if (callee_adjust != 0)
4695 aarch64_push_regs (reg1, reg2, callee_adjust);
4697 if (emit_frame_chain)
4699 poly_int64 reg_offset = callee_adjust;
4700 if (callee_adjust == 0)
4702 reg1 = R29_REGNUM;
4703 reg2 = R30_REGNUM;
4704 reg_offset = callee_offset;
4705 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4707 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4708 stack_pointer_rtx, callee_offset,
4709 ip1_rtx, ip0_rtx, frame_pointer_needed);
4710 if (frame_pointer_needed && !frame_size.is_constant ())
4712 /* Variable-sized frames need to describe the save slot
4713 address using DW_CFA_expression rather than DW_CFA_offset.
4714 This means that, without taking further action, the
4715 locations of the registers that we've already saved would
4716 remain based on the stack pointer even after we redefine
4717 the CFA based on the frame pointer. We therefore need new
4718 DW_CFA_expressions to re-express the save slots with addresses
4719 based on the frame pointer. */
4720 rtx_insn *insn = get_last_insn ();
4721 gcc_assert (RTX_FRAME_RELATED_P (insn));
4723 /* Add an explicit CFA definition if this was previously
4724 implicit. */
4725 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4727 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4728 callee_offset);
4729 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4730 gen_rtx_SET (hard_frame_pointer_rtx, src));
4733 /* Change the save slot expressions for the registers that
4734 we've already saved. */
4735 reg_offset -= callee_offset;
4736 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4737 reg_offset + UNITS_PER_WORD);
4738 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4739 reg_offset);
4741 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4744 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4745 callee_adjust != 0 || emit_frame_chain);
4746 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4747 callee_adjust != 0 || emit_frame_chain);
4748 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
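/* Putting the pieces together (illustrative): for the small constant frame
   used in the layout example above (callee_adjust == 48 with a frame chain
   and x19/x20 also saved), the emitted prologue is roughly

	stp	x29, x30, [sp, -48]!
	add	x29, sp, 0
	stp	x19, x20, [sp, 16]

   with no separate sub-sp instructions, since initial_adjust and
   final_adjust are both zero.  */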
4751 /* Return TRUE if we can use a simple_return insn.
4753 This function checks whether the callee-saved register area is empty, which
4754 means no restore actions are needed.  The pro_and_epilogue pass uses
4755 this to check whether the shrink-wrapping optimization is feasible. */
4757 bool
4758 aarch64_use_return_insn_p (void)
4760 if (!reload_completed)
4761 return false;
4763 if (crtl->profile)
4764 return false;
4766 aarch64_layout_frame ();
4768 return known_eq (cfun->machine->frame.frame_size, 0);
4771 /* Generate the epilogue instructions for returning from a function.
4772 This is almost exactly the reverse of the prologue sequence, except
4773 that we need to insert barriers to avoid scheduling loads that read
4774 from a deallocated stack, and we optimize the unwind records by
4775 emitting them all together if possible. */
4776 void
4777 aarch64_expand_epilogue (bool for_sibcall)
4779 aarch64_layout_frame ();
4781 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4782 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4783 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4784 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4785 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4786 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4787 rtx cfi_ops = NULL;
4788 rtx_insn *insn;
4789 /* A stack clash protection prologue may not have left IP0_REGNUM or
4790 IP1_REGNUM in a usable state. The same is true for allocations
4791 with an SVE component, since we then need both temporary registers
4792 for each allocation. */
4793 bool can_inherit_p = (initial_adjust.is_constant ()
4794 && final_adjust.is_constant ()
4795 && !flag_stack_clash_protection);
4797 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4798 bool need_barrier_p
4799 = maybe_ne (get_frame_size ()
4800 + cfun->machine->frame.saved_varargs_size, 0);
4802 /* Emit a barrier to prevent loads from a deallocated stack. */
4803 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4804 || cfun->calls_alloca
4805 || crtl->calls_eh_return)
4807 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4808 need_barrier_p = false;
4811 /* Restore the stack pointer from the frame pointer if it may not
4812 be the same as the stack pointer. */
4813 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4814 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4815 if (frame_pointer_needed
4816 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4817 /* If writeback is used when restoring callee-saves, the CFA
4818 is restored on the instruction doing the writeback. */
4819 aarch64_add_offset (Pmode, stack_pointer_rtx,
4820 hard_frame_pointer_rtx, -callee_offset,
4821 ip1_rtx, ip0_rtx, callee_adjust == 0);
4822 else
4823 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4824 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4826 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4827 callee_adjust != 0, &cfi_ops);
4828 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4829 callee_adjust != 0, &cfi_ops);
4831 if (need_barrier_p)
4832 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4834 if (callee_adjust != 0)
4835 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4837 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4839 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4840 insn = get_last_insn ();
4841 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4842 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4843 RTX_FRAME_RELATED_P (insn) = 1;
4844 cfi_ops = NULL;
4847 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4848 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4850 if (cfi_ops)
4852 /* Emit delayed restores and reset the CFA to be SP. */
4853 insn = get_last_insn ();
4854 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
4855 REG_NOTES (insn) = cfi_ops;
4856 RTX_FRAME_RELATED_P (insn) = 1;
4859 /* We prefer to emit the combined return/authenticate instruction RETAA,
4860 however there are three cases in which we must instead emit an explicit
4861 authentication instruction.
4863 1) Sibcalls don't return in a normal way, so if we're about to call one
4864 we must authenticate.
4866 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
4867 generating code for !TARGET_ARMV8_3 we can't use it and must
4868 explicitly authenticate.
4870 3) On an eh_return path we make extra stack adjustments to update the
4871 canonical frame address to be the exception handler's CFA. We want
4872 to authenticate using the CFA of the function which calls eh_return.
4874 if (aarch64_return_address_signing_enabled ()
4875 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
4877 insn = emit_insn (gen_autisp ());
4878 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4879 RTX_FRAME_RELATED_P (insn) = 1;
4882 /* Stack adjustment for exception handler. */
4883 if (crtl->calls_eh_return)
4885 /* We need to unwind the stack by the offset computed by
4886 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
4887 to be SP; letting the CFA move during this adjustment
4888 is just as correct as retaining the CFA from the body
4889 of the function. Therefore, do nothing special. */
4890 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
4893 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
4894 if (!for_sibcall)
4895 emit_jump_insn (ret_rtx);
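/* Illustrative epilogue for the same small frame: the callee-saves are
   reloaded and the write-back pop restores SP in one step, roughly

	ldp	x19, x20, [sp, 16]
	ldp	x29, x30, [sp], 48
	ret

   with RETAA (or an explicit AUTIASP before the RET) used instead when
   return address signing is enabled.  */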
4898 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
4899 normally or return to a previous frame after unwinding.
4901 An EH return uses a single shared return sequence. The epilogue is
4902 exactly like a normal epilogue except that it has an extra input
4903 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
4904 that must be applied after the frame has been destroyed. An extra label
4905 is inserted before the epilogue which initializes this register to zero,
4906 and this is the entry point for a normal return.
4908 An actual EH return updates the return address, initializes the stack
4909 adjustment and jumps directly into the epilogue (bypassing the zeroing
4910 of the adjustment). Since the return address is typically saved on the
4911 stack when a function makes a call, the saved LR must be updated outside
4912 the epilogue.
4914 This poses problems as the store is generated well before the epilogue,
4915 so the offset of LR is not known yet. Also optimizations will remove the
4916 store as it appears dead, even after the epilogue is generated (as the
4917 base or offset for loading LR is different in many cases).
4919 To avoid these problems this implementation forces the frame pointer
4920 in eh_return functions so that the location of LR is fixed and known early.
4921 It also marks the store volatile, so no optimization is permitted to
4922 remove the store. */
4924 aarch64_eh_return_handler_rtx (void)
4926 rtx tmp = gen_frame_mem (Pmode,
4927 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
4929 /* Mark the store volatile, so no optimization is permitted to remove it. */
4930 MEM_VOLATILE_P (tmp) = true;
4931 return tmp;
4934 /* Output code to add DELTA to the first argument, and then jump
4935 to FUNCTION. Used for C++ multiple inheritance. */
4936 static void
4937 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
4938 HOST_WIDE_INT delta,
4939 HOST_WIDE_INT vcall_offset,
4940 tree function)
4942 /* The this pointer is always in x0. Note that this differs from
4943 Arm, where the this pointer may be bumped to r1 if r0 is required
4944 to return a pointer to an aggregate. On AArch64 a result value
4945 pointer will be in x8. */
4946 int this_regno = R0_REGNUM;
4947 rtx this_rtx, temp0, temp1, addr, funexp;
4948 rtx_insn *insn;
4950 reload_completed = 1;
4951 emit_note (NOTE_INSN_PROLOGUE_END);
4953 this_rtx = gen_rtx_REG (Pmode, this_regno);
4954 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
4955 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
4957 if (vcall_offset == 0)
4958 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
4959 else
4961 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
4963 addr = this_rtx;
4964 if (delta != 0)
4966 if (delta >= -256 && delta < 256)
4967 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
4968 plus_constant (Pmode, this_rtx, delta));
4969 else
4970 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
4971 temp1, temp0, false);
4974 if (Pmode == ptr_mode)
4975 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
4976 else
4977 aarch64_emit_move (temp0,
4978 gen_rtx_ZERO_EXTEND (Pmode,
4979 gen_rtx_MEM (ptr_mode, addr)));
4981 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
4982 addr = plus_constant (Pmode, temp0, vcall_offset);
4983 else
4985 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
4986 Pmode);
4987 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
4990 if (Pmode == ptr_mode)
4991 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
4992 else
4993 aarch64_emit_move (temp1,
4994 gen_rtx_SIGN_EXTEND (Pmode,
4995 gen_rtx_MEM (ptr_mode, addr)));
4997 emit_insn (gen_add2_insn (this_rtx, temp1));
5000 /* Generate a tail call to the target function. */
5001 if (!TREE_USED (function))
5003 assemble_external (function);
5004 TREE_USED (function) = 1;
5006 funexp = XEXP (DECL_RTL (function), 0);
5007 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5008 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5009 SIBLING_CALL_P (insn) = 1;
5011 insn = get_insns ();
5012 shorten_branches (insn);
5013 final_start_function (insn, file, 1);
5014 final (insn, file, 1);
5015 final_end_function ();
5017 /* Stop pretending to be a post-reload pass. */
5018 reload_completed = 0;
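/* Example thunk output (illustrative): for DELTA == 8 and VCALL_OFFSET == 0
   the whole thunk reduces to

	add	x0, x0, 8
	b	<function>

   while a non-zero VCALL_OFFSET additionally loads the adjustment from the
   vtable through the IP0/IP1 scratch registers before the tail call.  */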
5021 static bool
5022 aarch64_tls_referenced_p (rtx x)
5024 if (!TARGET_HAVE_TLS)
5025 return false;
5026 subrtx_iterator::array_type array;
5027 FOR_EACH_SUBRTX (iter, array, x, ALL)
5029 const_rtx x = *iter;
5030 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5031 return true;
5032 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5033 TLS offsets, not real symbol references. */
5034 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5035 iter.skip_subrtxes ();
5037 return false;
5041 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5042 a left shift of 0 or 12 bits. */
5043 bool
5044 aarch64_uimm12_shift (HOST_WIDE_INT val)
5046 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5047 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
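/* Examples (illustrative): 0xabc and 0xabc000 are accepted (shift 0 and
   shift 12 respectively), whereas 0xabc001 is rejected because its set bits
   straddle both 12-bit halves.  */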
5052 /* Return true if val is an immediate that can be loaded into a
5053 register by a MOVZ instruction. */
5054 static bool
5055 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5057 if (GET_MODE_SIZE (mode) > 4)
5059 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5060 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5061 return 1;
5063 else
5065 /* Ignore sign extension. */
5066 val &= (HOST_WIDE_INT) 0xffffffff;
5068 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5069 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
5072 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5073 64-bit (DImode) integer. */
5075 static unsigned HOST_WIDE_INT
5076 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5078 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5079 while (size < 64)
5081 val &= (HOST_WIDE_INT_1U << size) - 1;
5082 val |= val << size;
5083 size *= 2;
5085 return val;
5088 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5090 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5092 0x0000000100000001ull,
5093 0x0001000100010001ull,
5094 0x0101010101010101ull,
5095 0x1111111111111111ull,
5096 0x5555555555555555ull,
5100 /* Return true if val is a valid bitmask immediate. */
5102 bool
5103 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5105 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5106 int bits;
5108 /* Check for a single sequence of one bits and return quickly if so.
5109 The special cases of all ones and all zeroes return false. */
5110 val = aarch64_replicate_bitmask_imm (val_in, mode);
5111 tmp = val + (val & -val);
5113 if (tmp == (tmp & -tmp))
5114 return (val + 1) > 1;
5116 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5117 if (mode == SImode)
5118 val = (val << 32) | (val & 0xffffffff);
5120 /* Invert if the immediate doesn't start with a zero bit - this means we
5121 only need to search for sequences of one bits. */
5122 if (val & 1)
5123 val = ~val;
5125 /* Find the first set bit and set tmp to val with the first sequence of one
5126 bits removed. Return success if there is a single sequence of ones. */
5127 first_one = val & -val;
5128 tmp = val & (val + first_one);
5130 if (tmp == 0)
5131 return true;
5133 /* Find the next set bit and compute the difference in bit position. */
5134 next_one = tmp & -tmp;
5135 bits = clz_hwi (first_one) - clz_hwi (next_one);
5136 mask = val ^ tmp;
5138 /* Check the bit position difference is a power of 2, and that the first
5139 sequence of one bits fits within 'bits' bits. */
5140 if ((mask >> bits) != 0 || bits != (bits & -bits))
5141 return false;
5143 /* Check the sequence of one bits is repeated 64/bits times. */
5144 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
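/* Worked example (illustrative): 0x00ff00ff00ff00ff is accepted because it
   is the 16-bit element 0x00ff (a single run of eight ones) replicated
   across the register, whereas 0xabc123 is rejected because its ones do not
   form a single run repeated at a power-of-two period.  */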
5147 /* Create a mask of ones covering the range from the lowest to the highest
5148 bit set in VAL_IN.  Assumed precondition: VAL_IN is not zero. */
5150 unsigned HOST_WIDE_INT
5151 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5153 int lowest_bit_set = ctz_hwi (val_in);
5154 int highest_bit_set = floor_log2 (val_in);
5155 gcc_assert (val_in != 0);
5157 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5158 (HOST_WIDE_INT_1U << lowest_bit_set));
5161 /* Create a constant in which all bits outside the range from the lowest set
5162 bit to the highest set bit of VAL_IN are set to 1. */
5164 unsigned HOST_WIDE_INT
5165 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5167 return val_in | ~aarch64_and_split_imm1 (val_in);
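/* Worked example (illustrative): for VAL_IN == 0x000000ffff00ffff,
   aarch64_and_split_imm1 gives 0x000000ffffffffff (ones from bit 0 up to
   the highest set bit, 39) and aarch64_and_split_imm2 gives
   0xffffffffff00ffff.  Both are valid bitmask immediates, and ANDing with
   them in sequence is equivalent to ANDing with the original value, which
   is not itself encodable in a single instruction.  */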
5170 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5172 bool
5173 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5175 scalar_int_mode int_mode;
5176 if (!is_a <scalar_int_mode> (mode, &int_mode))
5177 return false;
5179 if (aarch64_bitmask_imm (val_in, int_mode))
5180 return false;
5182 if (aarch64_move_imm (val_in, int_mode))
5183 return false;
5185 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5187 return aarch64_bitmask_imm (imm2, int_mode);
5190 /* Return true if val is an immediate that can be loaded into a
5191 register in a single instruction. */
5192 bool
5193 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5195 scalar_int_mode int_mode;
5196 if (!is_a <scalar_int_mode> (mode, &int_mode))
5197 return false;
5199 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5200 return 1;
5201 return aarch64_bitmask_imm (val, int_mode);
5204 static bool
5205 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5207 rtx base, offset;
5209 if (GET_CODE (x) == HIGH)
5210 return true;
5212 /* There's no way to calculate VL-based values using relocations. */
5213 subrtx_iterator::array_type array;
5214 FOR_EACH_SUBRTX (iter, array, x, ALL)
5215 if (GET_CODE (*iter) == CONST_POLY_INT)
5216 return true;
5218 split_const (x, &base, &offset);
5219 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5221 if (aarch64_classify_symbol (base, INTVAL (offset))
5222 != SYMBOL_FORCE_TO_MEM)
5223 return true;
5224 else
5225 /* Avoid generating a 64-bit relocation in ILP32; leave
5226 to aarch64_expand_mov_immediate to handle it properly. */
5227 return mode != ptr_mode;
5230 return aarch64_tls_referenced_p (x);
5233 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5234 The expansion for a table switch is quite expensive due to the number
5235 of instructions, the table lookup and the hard-to-predict indirect jump.
5236 When optimizing for speed with -O3, use the per-core tuning if
5237 set, otherwise use tables for > 16 cases as a tradeoff between size and
5238 performance. When optimizing for size, use the default setting. */
5240 static unsigned int
5241 aarch64_case_values_threshold (void)
5243 /* Use the specified limit for the number of cases before using jump
5244 tables at higher optimization levels. */
5245 if (optimize > 2
5246 && selected_cpu->tune->max_case_values != 0)
5247 return selected_cpu->tune->max_case_values;
5248 else
5249 return optimize_size ? default_case_values_threshold () : 17;
5252 /* Return true if register REGNO is a valid index register.
5253 STRICT_P is true if REG_OK_STRICT is in effect. */
5255 bool
5256 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5258 if (!HARD_REGISTER_NUM_P (regno))
5260 if (!strict_p)
5261 return true;
5263 if (!reg_renumber)
5264 return false;
5266 regno = reg_renumber[regno];
5268 return GP_REGNUM_P (regno);
5271 /* Return true if register REGNO is a valid base register.
5272 STRICT_P is true if REG_OK_STRICT is in effect. */
5274 bool
5275 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5277 if (!HARD_REGISTER_NUM_P (regno))
5279 if (!strict_p)
5280 return true;
5282 if (!reg_renumber)
5283 return false;
5285 regno = reg_renumber[regno];
5288 /* The fake registers will be eliminated to either the stack or
5289 hard frame pointer, both of which are usually valid base registers.
5290 Reload deals with the cases where the eliminated form isn't valid. */
5291 return (GP_REGNUM_P (regno)
5292 || regno == SP_REGNUM
5293 || regno == FRAME_POINTER_REGNUM
5294 || regno == ARG_POINTER_REGNUM);
5297 /* Return true if X is a valid base register.
5298 STRICT_P is true if REG_OK_STRICT is in effect. */
5300 static bool
5301 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5303 if (!strict_p
5304 && GET_CODE (x) == SUBREG
5305 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5306 x = SUBREG_REG (x);
5308 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5311 /* Return true if address offset is a valid index. If it is, fill in INFO
5312 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5314 static bool
5315 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5316 machine_mode mode, bool strict_p)
5318 enum aarch64_address_type type;
5319 rtx index;
5320 int shift;
5322 /* (reg:P) */
5323 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5324 && GET_MODE (x) == Pmode)
5326 type = ADDRESS_REG_REG;
5327 index = x;
5328 shift = 0;
5330 /* (sign_extend:DI (reg:SI)) */
5331 else if ((GET_CODE (x) == SIGN_EXTEND
5332 || GET_CODE (x) == ZERO_EXTEND)
5333 && GET_MODE (x) == DImode
5334 && GET_MODE (XEXP (x, 0)) == SImode)
5336 type = (GET_CODE (x) == SIGN_EXTEND)
5337 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5338 index = XEXP (x, 0);
5339 shift = 0;
5341 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5342 else if (GET_CODE (x) == MULT
5343 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5344 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5345 && GET_MODE (XEXP (x, 0)) == DImode
5346 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5347 && CONST_INT_P (XEXP (x, 1)))
5349 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5350 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5351 index = XEXP (XEXP (x, 0), 0);
5352 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5354 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5355 else if (GET_CODE (x) == ASHIFT
5356 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5357 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5358 && GET_MODE (XEXP (x, 0)) == DImode
5359 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5360 && CONST_INT_P (XEXP (x, 1)))
5362 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5363 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5364 index = XEXP (XEXP (x, 0), 0);
5365 shift = INTVAL (XEXP (x, 1));
5367 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5368 else if ((GET_CODE (x) == SIGN_EXTRACT
5369 || GET_CODE (x) == ZERO_EXTRACT)
5370 && GET_MODE (x) == DImode
5371 && GET_CODE (XEXP (x, 0)) == MULT
5372 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5373 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5375 type = (GET_CODE (x) == SIGN_EXTRACT)
5376 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5377 index = XEXP (XEXP (x, 0), 0);
5378 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5379 if (INTVAL (XEXP (x, 1)) != 32 + shift
5380 || INTVAL (XEXP (x, 2)) != 0)
5381 shift = -1;
5383 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5384 (const_int 0xffffffff<<shift)) */
5385 else if (GET_CODE (x) == AND
5386 && GET_MODE (x) == DImode
5387 && GET_CODE (XEXP (x, 0)) == MULT
5388 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5389 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5390 && CONST_INT_P (XEXP (x, 1)))
5392 type = ADDRESS_REG_UXTW;
5393 index = XEXP (XEXP (x, 0), 0);
5394 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5395 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5396 shift = -1;
5398 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5399 else if ((GET_CODE (x) == SIGN_EXTRACT
5400 || GET_CODE (x) == ZERO_EXTRACT)
5401 && GET_MODE (x) == DImode
5402 && GET_CODE (XEXP (x, 0)) == ASHIFT
5403 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5404 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5406 type = (GET_CODE (x) == SIGN_EXTRACT)
5407 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5408 index = XEXP (XEXP (x, 0), 0);
5409 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5410 if (INTVAL (XEXP (x, 1)) != 32 + shift
5411 || INTVAL (XEXP (x, 2)) != 0)
5412 shift = -1;
5414 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5415 (const_int 0xffffffff<<shift)) */
5416 else if (GET_CODE (x) == AND
5417 && GET_MODE (x) == DImode
5418 && GET_CODE (XEXP (x, 0)) == ASHIFT
5419 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5420 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5421 && CONST_INT_P (XEXP (x, 1)))
5423 type = ADDRESS_REG_UXTW;
5424 index = XEXP (XEXP (x, 0), 0);
5425 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5426 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5427 shift = -1;
5429 /* (mult:P (reg:P) (const_int scale)) */
5430 else if (GET_CODE (x) == MULT
5431 && GET_MODE (x) == Pmode
5432 && GET_MODE (XEXP (x, 0)) == Pmode
5433 && CONST_INT_P (XEXP (x, 1)))
5435 type = ADDRESS_REG_REG;
5436 index = XEXP (x, 0);
5437 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5439 /* (ashift:P (reg:P) (const_int shift)) */
5440 else if (GET_CODE (x) == ASHIFT
5441 && GET_MODE (x) == Pmode
5442 && GET_MODE (XEXP (x, 0)) == Pmode
5443 && CONST_INT_P (XEXP (x, 1)))
5445 type = ADDRESS_REG_REG;
5446 index = XEXP (x, 0);
5447 shift = INTVAL (XEXP (x, 1));
5449 else
5450 return false;
5452 if (!strict_p
5453 && GET_CODE (index) == SUBREG
5454 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5455 index = SUBREG_REG (index);
5457 if (aarch64_sve_data_mode_p (mode))
5459 if (type != ADDRESS_REG_REG
5460 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5461 return false;
5463 else
5465 if (shift != 0
5466 && !(IN_RANGE (shift, 1, 3)
5467 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5468 return false;
5471 if (REG_P (index)
5472 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5474 info->type = type;
5475 info->offset = index;
5476 info->shift = shift;
5477 return true;
5480 return false;
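/* Examples of index forms accepted above (illustrative), for an 8-byte MODE:

	[x0, x1]		base + register
	[x0, x1, lsl 3]		base + scaled register
	[x0, w1, sxtw 3]	base + scaled sign-extended 32-bit register
	[x0, w1, uxtw 3]	base + scaled zero-extended 32-bit register

   The shift must be 0 or log2 of the mode size (3 here); for SVE data modes
   it must match the vector element size instead.  */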
5483 /* Return true if MODE is one of the modes for which we
5484 support LDP/STP operations. */
5486 static bool
5487 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5489 return mode == SImode || mode == DImode
5490 || mode == SFmode || mode == DFmode
5491 || (aarch64_vector_mode_supported_p (mode)
5492 && known_eq (GET_MODE_SIZE (mode), 8));
5495 /* Return true if REGNO is a virtual pointer register, or an eliminable
5496 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5497 include stack_pointer or hard_frame_pointer. */
5498 static bool
5499 virt_or_elim_regno_p (unsigned regno)
5501 return ((regno >= FIRST_VIRTUAL_REGISTER
5502 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5503 || regno == FRAME_POINTER_REGNUM
5504 || regno == ARG_POINTER_REGNUM);
5507 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5508 If it is, fill in INFO appropriately. STRICT_P is true if
5509 REG_OK_STRICT is in effect. */
5511 static bool
5512 aarch64_classify_address (struct aarch64_address_info *info,
5513 rtx x, machine_mode mode, bool strict_p,
5514 aarch64_addr_query_type type = ADDR_QUERY_M)
5516 enum rtx_code code = GET_CODE (x);
5517 rtx op0, op1;
5518 poly_int64 offset;
5520 HOST_WIDE_INT const_size;
5522 /* On BE, we use load/store pair for all large int mode load/stores.
5523 TI/TFmode may also use a load/store pair. */
5524 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5525 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5526 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5527 || mode == TImode
5528 || mode == TFmode
5529 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5531 bool allow_reg_index_p = (!load_store_pair_p
5532 && (known_lt (GET_MODE_SIZE (mode), 16)
5533 || vec_flags == VEC_ADVSIMD
5534 || vec_flags == VEC_SVE_DATA));
5536 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5537 [Rn, #offset, MUL VL]. */
5538 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5539 && (code != REG && code != PLUS))
5540 return false;
5542 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5543 REG addressing. */
5544 if (advsimd_struct_p
5545 && !BYTES_BIG_ENDIAN
5546 && (code != POST_INC && code != REG))
5547 return false;
5549 gcc_checking_assert (GET_MODE (x) == VOIDmode
5550 || SCALAR_INT_MODE_P (GET_MODE (x)));
5552 switch (code)
5554 case REG:
5555 case SUBREG:
5556 info->type = ADDRESS_REG_IMM;
5557 info->base = x;
5558 info->offset = const0_rtx;
5559 info->const_offset = 0;
5560 return aarch64_base_register_rtx_p (x, strict_p);
5562 case PLUS:
5563 op0 = XEXP (x, 0);
5564 op1 = XEXP (x, 1);
5566 if (! strict_p
5567 && REG_P (op0)
5568 && virt_or_elim_regno_p (REGNO (op0))
5569 && poly_int_rtx_p (op1, &offset))
5571 info->type = ADDRESS_REG_IMM;
5572 info->base = op0;
5573 info->offset = op1;
5574 info->const_offset = offset;
5576 return true;
5579 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5580 && aarch64_base_register_rtx_p (op0, strict_p)
5581 && poly_int_rtx_p (op1, &offset))
5583 info->type = ADDRESS_REG_IMM;
5584 info->base = op0;
5585 info->offset = op1;
5586 info->const_offset = offset;
5588 /* TImode and TFmode values are allowed in both pairs of X
5589 registers and individual Q registers. The available
5590 address modes are:
5591 X,X: 7-bit signed scaled offset
5592 Q: 9-bit signed offset
5593 We conservatively require an offset representable in either mode.
5594 When performing the check for pairs of X registers i.e. LDP/STP
5595 pass down DImode since that is the natural size of the LDP/STP
5596 instruction memory accesses. */
5597 if (mode == TImode || mode == TFmode)
5598 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5599 && (offset_9bit_signed_unscaled_p (mode, offset)
5600 || offset_12bit_unsigned_scaled_p (mode, offset)));
5602 /* A 7-bit offset check because OImode will emit an ldp/stp
5603 instruction (only big endian will get here).
5604 For ldp/stp instructions, the offset is scaled for the size of a
5605 single element of the pair. */
5606 if (mode == OImode)
5607 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5609 /* A 7-bit scaled offset check plus a 9/12-bit check at offset + 32,
5610 because CImode moves are split into multiple accesses (only big endian will get here).
5611 if (mode == CImode)
5612 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5613 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5614 || offset_12bit_unsigned_scaled_p (V16QImode,
5615 offset + 32)));
5617 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5618 instructions (only big endian will get here). */
5619 if (mode == XImode)
5620 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5621 && aarch64_offset_7bit_signed_scaled_p (TImode,
5622 offset + 32));
5624 /* Make "m" use the LD1 offset range for SVE data modes, so
5625 that pre-RTL optimizers like ivopts will target that range
5626 instead of the wider LDR/STR range. */
5627 if (vec_flags == VEC_SVE_DATA)
5628 return (type == ADDR_QUERY_M
5629 ? offset_4bit_signed_scaled_p (mode, offset)
5630 : offset_9bit_signed_scaled_p (mode, offset));
5632 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5634 poly_int64 end_offset = (offset
5635 + GET_MODE_SIZE (mode)
5636 - BYTES_PER_SVE_VECTOR);
5637 return (type == ADDR_QUERY_M
5638 ? offset_4bit_signed_scaled_p (mode, offset)
5639 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5640 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5641 end_offset)));
5644 if (vec_flags == VEC_SVE_PRED)
5645 return offset_9bit_signed_scaled_p (mode, offset);
5647 if (load_store_pair_p)
5648 return ((known_eq (GET_MODE_SIZE (mode), 4)
5649 || known_eq (GET_MODE_SIZE (mode), 8))
5650 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5651 else
5652 return (offset_9bit_signed_unscaled_p (mode, offset)
5653 || offset_12bit_unsigned_scaled_p (mode, offset));
5656 if (allow_reg_index_p)
5658 /* Look for base + (scaled/extended) index register. */
5659 if (aarch64_base_register_rtx_p (op0, strict_p)
5660 && aarch64_classify_index (info, op1, mode, strict_p))
5662 info->base = op0;
5663 return true;
5665 if (aarch64_base_register_rtx_p (op1, strict_p)
5666 && aarch64_classify_index (info, op0, mode, strict_p))
5668 info->base = op1;
5669 return true;
5673 return false;
5675 case POST_INC:
5676 case POST_DEC:
5677 case PRE_INC:
5678 case PRE_DEC:
5679 info->type = ADDRESS_REG_WB;
5680 info->base = XEXP (x, 0);
5681 info->offset = NULL_RTX;
5682 return aarch64_base_register_rtx_p (info->base, strict_p);
5684 case POST_MODIFY:
5685 case PRE_MODIFY:
5686 info->type = ADDRESS_REG_WB;
5687 info->base = XEXP (x, 0);
5688 if (GET_CODE (XEXP (x, 1)) == PLUS
5689 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5690 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5691 && aarch64_base_register_rtx_p (info->base, strict_p))
5693 info->offset = XEXP (XEXP (x, 1), 1);
5694 info->const_offset = offset;
5696 /* TImode and TFmode values are allowed in both pairs of X
5697 registers and individual Q registers. The available
5698 address modes are:
5699 X,X: 7-bit signed scaled offset
5700 Q: 9-bit signed offset
5701 We conservatively require an offset representable in either mode.
5702 */
5703 if (mode == TImode || mode == TFmode)
5704 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5705 && offset_9bit_signed_unscaled_p (mode, offset));
5707 if (load_store_pair_p)
5708 return ((known_eq (GET_MODE_SIZE (mode), 4)
5709 || known_eq (GET_MODE_SIZE (mode), 8))
5710 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5711 else
5712 return offset_9bit_signed_unscaled_p (mode, offset);
5714 return false;
5716 case CONST:
5717 case SYMBOL_REF:
5718 case LABEL_REF:
5719 /* load literal: pc-relative constant pool entry. Only supported
5720 for SI mode or larger. */
5721 info->type = ADDRESS_SYMBOLIC;
5723 if (!load_store_pair_p
5724 && GET_MODE_SIZE (mode).is_constant (&const_size)
5725 && const_size >= 4)
5727 rtx sym, addend;
5729 split_const (x, &sym, &addend);
5730 return ((GET_CODE (sym) == LABEL_REF
5731 || (GET_CODE (sym) == SYMBOL_REF
5732 && CONSTANT_POOL_ADDRESS_P (sym)
5733 && aarch64_pcrelative_literal_loads)));
5735 return false;
5737 case LO_SUM:
5738 info->type = ADDRESS_LO_SUM;
5739 info->base = XEXP (x, 0);
5740 info->offset = XEXP (x, 1);
5741 if (allow_reg_index_p
5742 && aarch64_base_register_rtx_p (info->base, strict_p))
5744 rtx sym, offs;
5745 split_const (info->offset, &sym, &offs);
5746 if (GET_CODE (sym) == SYMBOL_REF
5747 && (aarch64_classify_symbol (sym, INTVAL (offs))
5748 == SYMBOL_SMALL_ABSOLUTE))
5750 /* The symbol and offset must be aligned to the access size. */
5751 unsigned int align;
5753 if (CONSTANT_POOL_ADDRESS_P (sym))
5754 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5755 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5757 tree exp = SYMBOL_REF_DECL (sym);
5758 align = TYPE_ALIGN (TREE_TYPE (exp));
5759 align = aarch64_constant_alignment (exp, align);
5761 else if (SYMBOL_REF_DECL (sym))
5762 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5763 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5764 && SYMBOL_REF_BLOCK (sym) != NULL)
5765 align = SYMBOL_REF_BLOCK (sym)->alignment;
5766 else
5767 align = BITS_PER_UNIT;
5769 poly_int64 ref_size = GET_MODE_SIZE (mode);
5770 if (known_eq (ref_size, 0))
5771 ref_size = GET_MODE_SIZE (DImode);
5773 return (multiple_p (INTVAL (offs), ref_size)
5774 && multiple_p (align / BITS_PER_UNIT, ref_size));
5777 return false;
5779 default:
5780 return false;
5784 /* Return true if the address X is valid for a PRFM instruction.
5785 STRICT_P is true if we should do strict checking with
5786 aarch64_classify_address. */
5788 bool
5789 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5791 struct aarch64_address_info addr;
5793 /* PRFM accepts the same addresses as DImode... */
5794 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5795 if (!res)
5796 return false;
5798 /* ... except writeback forms. */
5799 return addr.type != ADDRESS_REG_WB;
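/* Return true if X is a symbolic address: a SYMBOL_REF or LABEL_REF,
   possibly offset by a constant.  */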
5802 bool
5803 aarch64_symbolic_address_p (rtx x)
5805 rtx offset;
5807 split_const (x, &x, &offset);
5808 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5811 /* Classify the base of symbolic expression X. */
5813 enum aarch64_symbol_type
5814 aarch64_classify_symbolic_expression (rtx x)
5816 rtx offset;
5818 split_const (x, &x, &offset);
5819 return aarch64_classify_symbol (x, INTVAL (offset));
5823 /* Return TRUE if X is a legitimate address for accessing memory in
5824 mode MODE. */
5825 static bool
5826 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5828 struct aarch64_address_info addr;
5830 return aarch64_classify_address (&addr, x, mode, strict_p);
5833 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5834 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5835 bool
5836 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5837 aarch64_addr_query_type type)
5839 struct aarch64_address_info addr;
5841 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5844 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5846 static bool
5847 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5848 poly_int64 orig_offset,
5849 machine_mode mode)
5851 HOST_WIDE_INT size;
5852 if (GET_MODE_SIZE (mode).is_constant (&size))
5854 HOST_WIDE_INT const_offset, second_offset;
5856 /* A general SVE offset is A * VQ + B. Remove the A component from
5857 coefficient 0 in order to get the constant B. */
5858 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
5860 /* Split an out-of-range address displacement into a base and
5861 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
5862 range otherwise to increase opportunities for sharing the base
5863 address of different sizes. Unaligned accesses use the signed
5864 9-bit range, TImode/TFmode use the intersection of signed
5865 scaled 7-bit and signed 9-bit offset. */
5866 if (mode == TImode || mode == TFmode)
5867 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
5868 else if ((const_offset & (size - 1)) != 0)
5869 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
5870 else
5871 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
5873 if (second_offset == 0 || known_eq (orig_offset, second_offset))
5874 return false;
5876 /* Split the offset into second_offset and the rest. */
5877 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
5878 *offset2 = gen_int_mode (second_offset, Pmode);
5879 return true;
5881 else
5883 /* Get the mode we should use as the basis of the range. For structure
5884 modes this is the mode of one vector. */
5885 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5886 machine_mode step_mode
5887 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
5889 /* Get the "mul vl" multiplier we'd like to use. */
5890 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
5891 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
5892 if (vec_flags & VEC_SVE_DATA)
5893 /* LDR supports a 9-bit range, but the move patterns for
5894 structure modes require all vectors to be in range of the
5895 same base. The simplest way of accommodating that while still
5896 promoting reuse of anchor points between different modes is
5897 to use an 8-bit range unconditionally. */
5898 vnum = ((vnum + 128) & 255) - 128;
5899 else
5900 /* Predicates are only handled singly, so we might as well use
5901 the full range. */
5902 vnum = ((vnum + 256) & 511) - 256;
5903 if (vnum == 0)
5904 return false;
5906 /* Convert the "mul vl" multiplier into a byte offset. */
5907 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
5908 if (known_eq (second_offset, orig_offset))
5909 return false;
5911 /* Split the offset into second_offset and the rest. */
5912 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
5913 *offset2 = gen_int_mode (second_offset, Pmode);
5914 return true;
5918 /* Return the binary representation of floating point constant VALUE in INTVAL.
5919 If the value cannot be converted, return false without setting INTVAL.
5920 The conversion is done in the floating-point mode of VALUE. */
5921 bool
5922 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
5925 /* We make a general exception for 0. */
5926 if (aarch64_float_const_zero_rtx_p (value))
5928 *intval = 0;
5929 return true;
5932 scalar_float_mode mode;
5933 if (GET_CODE (value) != CONST_DOUBLE
5934 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
5935 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
5936 /* Only support up to DF mode. */
5937 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
5938 return false;
5940 unsigned HOST_WIDE_INT ival = 0;
5942 long res[2];
5943 real_to_target (res,
5944 CONST_DOUBLE_REAL_VALUE (value),
5945 REAL_MODE_FORMAT (mode));
5947 if (mode == DFmode)
5949 int order = BYTES_BIG_ENDIAN ? 1 : 0;
5950 ival = zext_hwi (res[order], 32);
5951 ival |= (zext_hwi (res[1 - order], 32) << 32);
5953 else
5954 ival = zext_hwi (res[0], 32);
5956 *intval = ival;
5957 return true;
5960 /* Return TRUE if rtx X is an immediate constant that can be moved using a
5961 single MOV(+MOVK) followed by an FMOV. */
5962 bool
5963 aarch64_float_const_rtx_p (rtx x)
5965 machine_mode mode = GET_MODE (x);
5966 if (mode == VOIDmode)
5967 return false;
5969 /* Determine whether it's cheaper to write float constants as
5970 mov/movk pairs rather than as ldr/adrp pairs. */
5971 unsigned HOST_WIDE_INT ival;
5973 if (GET_CODE (x) == CONST_DOUBLE
5974 && SCALAR_FLOAT_MODE_P (mode)
5975 && aarch64_reinterpret_float_as_int (x, &ival))
5977 scalar_int_mode imode = (mode == HFmode
5978 ? SImode
5979 : int_mode_for_mode (mode).require ());
5980 int num_instr = aarch64_internal_mov_immediate
5981 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
5982 return num_instr < 3;
5985 return false;
5988 /* Return TRUE if rtx X is the immediate constant 0.0. */
5989 bool
5990 aarch64_float_const_zero_rtx_p (rtx x)
5992 if (GET_MODE (x) == VOIDmode)
5993 return false;
5995 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
5996 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
5997 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6000 /* Return TRUE if rtx X is an immediate constant that fits in a single
6001 MOVI immediate operation. */
6002 bool
6003 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6005 if (!TARGET_SIMD)
6006 return false;
6008 machine_mode vmode;
6009 scalar_int_mode imode;
6010 unsigned HOST_WIDE_INT ival;
6012 if (GET_CODE (x) == CONST_DOUBLE
6013 && SCALAR_FLOAT_MODE_P (mode))
6015 if (!aarch64_reinterpret_float_as_int (x, &ival))
6016 return false;
6018 /* We make a general exception for 0. */
6019 if (aarch64_float_const_zero_rtx_p (x))
6020 return true;
6022 imode = int_mode_for_mode (mode).require ();
6024 else if (GET_CODE (x) == CONST_INT
6025 && is_a <scalar_int_mode> (mode, &imode))
6026 ival = INTVAL (x);
6027 else
6028 return false;
6030 /* Use a 64-bit mode for everything except DI/DF mode, where we use
6031 a 128-bit vector mode. */
6032 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6034 vmode = aarch64_simd_container_mode (imode, width);
6035 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6037 return aarch64_simd_valid_immediate (v_op, NULL);
6041 /* Return the fixed registers used for condition codes. */
6043 static bool
6044 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6046 *p1 = CC_REGNUM;
6047 *p2 = INVALID_REGNUM;
6048 return true;
6051 /* This function is used by the call expanders of the machine description.
6052 RESULT is the register in which the result is returned. It's NULL for
6053 "call" and "sibcall".
6054 MEM is the location of the function call.
6055 SIBCALL indicates whether this function call is normal call or sibling call.
6056 It will generate different pattern accordingly. */
6058 void
6059 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6061 rtx call, callee, tmp;
6062 rtvec vec;
6063 machine_mode mode;
6065 gcc_assert (MEM_P (mem));
6066 callee = XEXP (mem, 0);
6067 mode = GET_MODE (callee);
6068 gcc_assert (mode == Pmode);
6070 /* Decide if we should generate indirect calls by loading the
6071 address of the callee into a register before performing
6072 the branch-and-link. */
6073 if (SYMBOL_REF_P (callee)
6074 ? (aarch64_is_long_call_p (callee)
6075 || aarch64_is_noplt_call_p (callee))
6076 : !REG_P (callee))
6077 XEXP (mem, 0) = force_reg (mode, callee);
6079 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6081 if (result != NULL_RTX)
6082 call = gen_rtx_SET (result, call);
6084 if (sibcall)
6085 tmp = ret_rtx;
6086 else
6087 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6089 vec = gen_rtvec (2, call, tmp);
6090 call = gen_rtx_PARALLEL (VOIDmode, vec);
6092 aarch64_emit_call_insn (call);
6095 /* Emit call insn with PAT and do aarch64-specific handling. */
6097 void
6098 aarch64_emit_call_insn (rtx pat)
6100 rtx insn = emit_call_insn (pat);
6102 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6103 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6104 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
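/* Implement SELECT_CC_MODE: return the condition-code mode to use for
   a comparison of X and Y using comparison code CODE.  */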
6107 machine_mode
6108 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6110 /* All floating point compares return CCFP if it is an equality
6111 comparison, and CCFPE otherwise. */
6112 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6114 switch (code)
6116 case EQ:
6117 case NE:
6118 case UNORDERED:
6119 case ORDERED:
6120 case UNLT:
6121 case UNLE:
6122 case UNGT:
6123 case UNGE:
6124 case UNEQ:
6125 return CCFPmode;
6127 case LT:
6128 case LE:
6129 case GT:
6130 case GE:
6131 case LTGT:
6132 return CCFPEmode;
6134 default:
6135 gcc_unreachable ();
6139 /* Equality comparisons of short modes against zero can be performed
6140 using the TST instruction with the appropriate bitmask. */
6141 if (y == const0_rtx && REG_P (x)
6142 && (code == EQ || code == NE)
6143 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6144 return CC_NZmode;
6146 /* Similarly, comparisons of zero_extends from shorter modes can
6147 be performed using an ANDS with an immediate mask. */
6148 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6149 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6150 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6151 && (code == EQ || code == NE))
6152 return CC_NZmode;
6154 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6155 && y == const0_rtx
6156 && (code == EQ || code == NE || code == LT || code == GE)
6157 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6158 || GET_CODE (x) == NEG
6159 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6160 && CONST_INT_P (XEXP (x, 2)))))
6161 return CC_NZmode;
6163 /* A compare with a shifted operand. Because of canonicalization,
6164 the comparison will have to be swapped when we emit the assembly
6165 code. */
6166 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6167 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6168 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6169 || GET_CODE (x) == LSHIFTRT
6170 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6171 return CC_SWPmode;
6173 /* Similarly for a negated operand, but we can only do this for
6174 equalities. */
6175 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6176 && (REG_P (y) || GET_CODE (y) == SUBREG)
6177 && (code == EQ || code == NE)
6178 && GET_CODE (x) == NEG)
6179 return CC_Zmode;
6181 /* A test for unsigned overflow. */
6182 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6183 && code == NE
6184 && GET_CODE (x) == PLUS
6185 && GET_CODE (y) == ZERO_EXTEND)
6186 return CC_Cmode;
6188 /* For everything else, return CCmode. */
6189 return CCmode;
6192 static int
6193 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
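/* Return the AArch64 condition code (AARCH64_EQ, AARCH64_NE, ...) that
   corresponds to comparison rtx X, or -1 if there is no single
   corresponding condition.  */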
6195 int
6196 aarch64_get_condition_code (rtx x)
6198 machine_mode mode = GET_MODE (XEXP (x, 0));
6199 enum rtx_code comp_code = GET_CODE (x);
6201 if (GET_MODE_CLASS (mode) != MODE_CC)
6202 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6203 return aarch64_get_condition_code_1 (mode, comp_code);
6206 static int
6207 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6209 switch (mode)
6211 case E_CCFPmode:
6212 case E_CCFPEmode:
6213 switch (comp_code)
6215 case GE: return AARCH64_GE;
6216 case GT: return AARCH64_GT;
6217 case LE: return AARCH64_LS;
6218 case LT: return AARCH64_MI;
6219 case NE: return AARCH64_NE;
6220 case EQ: return AARCH64_EQ;
6221 case ORDERED: return AARCH64_VC;
6222 case UNORDERED: return AARCH64_VS;
6223 case UNLT: return AARCH64_LT;
6224 case UNLE: return AARCH64_LE;
6225 case UNGT: return AARCH64_HI;
6226 case UNGE: return AARCH64_PL;
6227 default: return -1;
6229 break;
6231 case E_CCmode:
6232 switch (comp_code)
6234 case NE: return AARCH64_NE;
6235 case EQ: return AARCH64_EQ;
6236 case GE: return AARCH64_GE;
6237 case GT: return AARCH64_GT;
6238 case LE: return AARCH64_LE;
6239 case LT: return AARCH64_LT;
6240 case GEU: return AARCH64_CS;
6241 case GTU: return AARCH64_HI;
6242 case LEU: return AARCH64_LS;
6243 case LTU: return AARCH64_CC;
6244 default: return -1;
6246 break;
6248 case E_CC_SWPmode:
6249 switch (comp_code)
6251 case NE: return AARCH64_NE;
6252 case EQ: return AARCH64_EQ;
6253 case GE: return AARCH64_LE;
6254 case GT: return AARCH64_LT;
6255 case LE: return AARCH64_GE;
6256 case LT: return AARCH64_GT;
6257 case GEU: return AARCH64_LS;
6258 case GTU: return AARCH64_CC;
6259 case LEU: return AARCH64_CS;
6260 case LTU: return AARCH64_HI;
6261 default: return -1;
6263 break;
6265 case E_CC_NZmode:
6266 switch (comp_code)
6268 case NE: return AARCH64_NE;
6269 case EQ: return AARCH64_EQ;
6270 case GE: return AARCH64_PL;
6271 case LT: return AARCH64_MI;
6272 default: return -1;
6274 break;
6276 case E_CC_Zmode:
6277 switch (comp_code)
6279 case NE: return AARCH64_NE;
6280 case EQ: return AARCH64_EQ;
6281 default: return -1;
6283 break;
6285 case E_CC_Cmode:
6286 switch (comp_code)
6288 case NE: return AARCH64_CS;
6289 case EQ: return AARCH64_CC;
6290 default: return -1;
6292 break;
6294 default:
6295 return -1;
6298 return -1;
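/* Return true if X is a duplicated constant vector whose replicated
   element is a CONST_INT in the range [MINVAL, MAXVAL].  */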
6301 bool
6302 aarch64_const_vec_all_same_in_range_p (rtx x,
6303 HOST_WIDE_INT minval,
6304 HOST_WIDE_INT maxval)
6306 rtx elt;
6307 return (const_vec_duplicate_p (x, &elt)
6308 && CONST_INT_P (elt)
6309 && IN_RANGE (INTVAL (elt), minval, maxval));
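/* Return true if X is a duplicated constant vector whose replicated
   element equals VAL.  */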
6312 bool
6313 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6315 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6318 /* Return true if VEC is a constant in which every element is in the range
6319 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6321 static bool
6322 aarch64_const_vec_all_in_range_p (rtx vec,
6323 HOST_WIDE_INT minval,
6324 HOST_WIDE_INT maxval)
6326 if (GET_CODE (vec) != CONST_VECTOR
6327 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6328 return false;
6330 int nunits;
6331 if (!CONST_VECTOR_STEPPED_P (vec))
6332 nunits = const_vector_encoded_nelts (vec);
6333 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6334 return false;
6336 for (int i = 0; i < nunits; i++)
6338 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6339 if (!CONST_INT_P (vec_elem)
6340 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6341 return false;
6343 return true;
6346 /* N Z C V. */
6347 #define AARCH64_CC_V 1
6348 #define AARCH64_CC_C (1 << 1)
6349 #define AARCH64_CC_Z (1 << 2)
6350 #define AARCH64_CC_N (1 << 3)
6352 /* N Z C V flags for ccmp. Indexed by aarch64_cond_code. */
6353 static const int aarch64_nzcv_codes[] =
6355 0, /* EQ, Z == 1. */
6356 AARCH64_CC_Z, /* NE, Z == 0. */
6357 0, /* CS, C == 1. */
6358 AARCH64_CC_C, /* CC, C == 0. */
6359 0, /* MI, N == 1. */
6360 AARCH64_CC_N, /* PL, N == 0. */
6361 0, /* VS, V == 1. */
6362 AARCH64_CC_V, /* VC, V == 0. */
6363 0, /* HI, C == 1 && Z == 0. */
6364 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6365 AARCH64_CC_V, /* GE, N == V. */
6366 0, /* LT, N != V. */
6367 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6368 0, /* LE, !(Z == 0 && N == V). */
6369 0, /* AL, Any. */
6370 0 /* NV, Any. */
6373 /* Print floating-point vector immediate operand X to F, negating it
6374 first if NEGATE is true. Return true on success, false if it isn't
6375 a constant we can handle. */
6377 static bool
6378 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6380 rtx elt;
6382 if (!const_vec_duplicate_p (x, &elt))
6383 return false;
6385 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6386 if (negate)
6387 r = real_value_negate (&r);
6389 /* We only handle the SVE single-bit immediates here. */
6390 if (real_equal (&r, &dconst0))
6391 asm_fprintf (f, "0.0");
6392 else if (real_equal (&r, &dconst1))
6393 asm_fprintf (f, "1.0");
6394 else if (real_equal (&r, &dconsthalf))
6395 asm_fprintf (f, "0.5");
6396 else
6397 return false;
6399 return true;
6402 /* Return the equivalent letter for size. */
6403 static char
6404 sizetochar (int size)
6406 switch (size)
6408 case 64: return 'd';
6409 case 32: return 's';
6410 case 16: return 'h';
6411 case 8 : return 'b';
6412 default: gcc_unreachable ();
6416 /* Print operand X to file F in a target specific manner according to CODE.
6417 The acceptable formatting commands given by CODE are:
6418 'c': An integer or symbol address without a preceding #
6419 sign.
6420 'C': Take the duplicated element in a vector constant
6421 and print it in hex.
6422 'D': Take the duplicated element in a vector constant
6423 and print it as an unsigned integer, in decimal.
6424 'e': Print the sign/zero-extend size as a character 8->b,
6425 16->h, 32->w.
6426 'p': Prints N such that 2^N == X (X must be power of 2 and
6427 const int).
6428 'P': Print the number of non-zero bits in X (a const_int).
6429 'H': Print the higher numbered register of a pair (TImode)
6430 of regs.
6431 'm': Print a condition (eq, ne, etc).
6432 'M': Same as 'm', but invert condition.
6433 'N': Take the duplicated element in a vector constant
6434 and print the negative of it in decimal.
6435 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6436 'S/T/U/V': Print a FP/SIMD register name for a register list.
6437 The register printed is the FP/SIMD register name
6438 of X + 0/1/2/3 for S/T/U/V.
6439 'R': Print a scalar FP/SIMD register name + 1.
6440 'X': Print bottom 16 bits of integer constant in hex.
6441 'w/x': Print a general register name or the zero register
6442 (32-bit or 64-bit).
6443 '0': Print a normal operand; if it's a general register,
6444 then we assume DImode.
6445 'k': Print NZCV for conditional compare instructions.
6446 'A': Output address constant representing the first
6447 argument of X, specifying a relocation offset
6448 if appropriate.
6449 'L': Output constant address specified by X
6450 with a relocation offset if appropriate.
6451 'G': Prints address of X, specifying a PC relative
6452 relocation mode if appropriate.
6453 'y': Output address of LDP or STP - this is used for
6454 some LDP/STPs which don't use a PARALLEL in their
6455 pattern (so the mode needs to be adjusted).
6456 'z': Output address of a typical LDP or STP. */
6458 static void
6459 aarch64_print_operand (FILE *f, rtx x, int code)
6461 rtx elt;
6462 switch (code)
6464 case 'c':
6465 switch (GET_CODE (x))
6467 case CONST_INT:
6468 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6469 break;
6471 case SYMBOL_REF:
6472 output_addr_const (f, x);
6473 break;
6475 case CONST:
6476 if (GET_CODE (XEXP (x, 0)) == PLUS
6477 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6479 output_addr_const (f, x);
6480 break;
6482 /* Fall through. */
6484 default:
6485 output_operand_lossage ("unsupported operand for code '%c'", code);
6487 break;
6489 case 'e':
6491 int n;
6493 if (!CONST_INT_P (x)
6494 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6496 output_operand_lossage ("invalid operand for '%%%c'", code);
6497 return;
6500 switch (n)
6502 case 3:
6503 fputc ('b', f);
6504 break;
6505 case 4:
6506 fputc ('h', f);
6507 break;
6508 case 5:
6509 fputc ('w', f);
6510 break;
6511 default:
6512 output_operand_lossage ("invalid operand for '%%%c'", code);
6513 return;
6516 break;
6518 case 'p':
6520 int n;
6522 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6524 output_operand_lossage ("invalid operand for '%%%c'", code);
6525 return;
6528 asm_fprintf (f, "%d", n);
6530 break;
6532 case 'P':
6533 if (!CONST_INT_P (x))
6535 output_operand_lossage ("invalid operand for '%%%c'", code);
6536 return;
6539 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6540 break;
6542 case 'H':
6543 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6545 output_operand_lossage ("invalid operand for '%%%c'", code);
6546 return;
6549 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6550 break;
6552 case 'M':
6553 case 'm':
6555 int cond_code;
6556 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6557 if (x == const_true_rtx)
6559 if (code == 'M')
6560 fputs ("nv", f);
6561 return;
6564 if (!COMPARISON_P (x))
6566 output_operand_lossage ("invalid operand for '%%%c'", code);
6567 return;
6570 cond_code = aarch64_get_condition_code (x);
6571 gcc_assert (cond_code >= 0);
6572 if (code == 'M')
6573 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6574 fputs (aarch64_condition_codes[cond_code], f);
6576 break;
6578 case 'N':
6579 if (!const_vec_duplicate_p (x, &elt))
6581 output_operand_lossage ("invalid vector constant");
6582 return;
6585 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6586 asm_fprintf (f, "%wd", -INTVAL (elt));
6587 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6588 && aarch64_print_vector_float_operand (f, x, true))
6590 else
6592 output_operand_lossage ("invalid vector constant");
6593 return;
6595 break;
6597 case 'b':
6598 case 'h':
6599 case 's':
6600 case 'd':
6601 case 'q':
6602 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6604 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6605 return;
6607 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6608 break;
6610 case 'S':
6611 case 'T':
6612 case 'U':
6613 case 'V':
6614 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6616 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6617 return;
6619 asm_fprintf (f, "%c%d",
6620 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6621 REGNO (x) - V0_REGNUM + (code - 'S'));
6622 break;
6624 case 'R':
6625 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6627 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6628 return;
6630 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6631 break;
6633 case 'X':
6634 if (!CONST_INT_P (x))
6636 output_operand_lossage ("invalid operand for '%%%c'", code);
6637 return;
6639 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6640 break;
6642 case 'C':
6644 /* Print a replicated constant in hex. */
6645 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6647 output_operand_lossage ("invalid operand for '%%%c'", code);
6648 return;
6650 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6651 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6653 break;
6655 case 'D':
6657 /* Print a replicated constant in decimal, treating it as
6658 unsigned. */
6659 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6661 output_operand_lossage ("invalid operand for '%%%c'", code);
6662 return;
6664 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6665 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6667 break;
6669 case 'w':
6670 case 'x':
6671 if (x == const0_rtx
6672 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6674 asm_fprintf (f, "%czr", code);
6675 break;
6678 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6680 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6681 break;
6684 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6686 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6687 break;
6690 /* Fall through */
6692 case 0:
6693 if (x == NULL)
6695 output_operand_lossage ("missing operand");
6696 return;
6699 switch (GET_CODE (x))
6701 case REG:
6702 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6704 if (REG_NREGS (x) == 1)
6705 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6706 else
6708 char suffix
6709 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6710 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6711 REGNO (x) - V0_REGNUM, suffix,
6712 END_REGNO (x) - V0_REGNUM - 1, suffix);
6715 else
6716 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6717 break;
6719 case MEM:
6720 output_address (GET_MODE (x), XEXP (x, 0));
6721 break;
6723 case LABEL_REF:
6724 case SYMBOL_REF:
6725 output_addr_const (asm_out_file, x);
6726 break;
6728 case CONST_INT:
6729 asm_fprintf (f, "%wd", INTVAL (x));
6730 break;
6732 case CONST:
6733 if (!VECTOR_MODE_P (GET_MODE (x)))
6735 output_addr_const (asm_out_file, x);
6736 break;
6738 /* fall through */
6740 case CONST_VECTOR:
6741 if (!const_vec_duplicate_p (x, &elt))
6743 output_operand_lossage ("invalid vector constant");
6744 return;
6747 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6748 asm_fprintf (f, "%wd", INTVAL (elt));
6749 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6750 && aarch64_print_vector_float_operand (f, x, false))
6752 else
6754 output_operand_lossage ("invalid vector constant");
6755 return;
6757 break;
6759 case CONST_DOUBLE:
6760 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6761 be getting CONST_DOUBLEs holding integers. */
6762 gcc_assert (GET_MODE (x) != VOIDmode);
6763 if (aarch64_float_const_zero_rtx_p (x))
6765 fputc ('0', f);
6766 break;
6768 else if (aarch64_float_const_representable_p (x))
6770 #define buf_size 20
6771 char float_buf[buf_size] = {'\0'};
6772 real_to_decimal_for_mode (float_buf,
6773 CONST_DOUBLE_REAL_VALUE (x),
6774 buf_size, buf_size,
6775 1, GET_MODE (x));
6776 asm_fprintf (asm_out_file, "%s", float_buf);
6777 break;
6778 #undef buf_size
6780 output_operand_lossage ("invalid constant");
6781 return;
6782 default:
6783 output_operand_lossage ("invalid operand");
6784 return;
6786 break;
6788 case 'A':
6789 if (GET_CODE (x) == HIGH)
6790 x = XEXP (x, 0);
6792 switch (aarch64_classify_symbolic_expression (x))
6794 case SYMBOL_SMALL_GOT_4G:
6795 asm_fprintf (asm_out_file, ":got:");
6796 break;
6798 case SYMBOL_SMALL_TLSGD:
6799 asm_fprintf (asm_out_file, ":tlsgd:");
6800 break;
6802 case SYMBOL_SMALL_TLSDESC:
6803 asm_fprintf (asm_out_file, ":tlsdesc:");
6804 break;
6806 case SYMBOL_SMALL_TLSIE:
6807 asm_fprintf (asm_out_file, ":gottprel:");
6808 break;
6810 case SYMBOL_TLSLE24:
6811 asm_fprintf (asm_out_file, ":tprel:");
6812 break;
6814 case SYMBOL_TINY_GOT:
6815 gcc_unreachable ();
6816 break;
6818 default:
6819 break;
6821 output_addr_const (asm_out_file, x);
6822 break;
6824 case 'L':
6825 switch (aarch64_classify_symbolic_expression (x))
6827 case SYMBOL_SMALL_GOT_4G:
6828 asm_fprintf (asm_out_file, ":lo12:");
6829 break;
6831 case SYMBOL_SMALL_TLSGD:
6832 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6833 break;
6835 case SYMBOL_SMALL_TLSDESC:
6836 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6837 break;
6839 case SYMBOL_SMALL_TLSIE:
6840 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6841 break;
6843 case SYMBOL_TLSLE12:
6844 asm_fprintf (asm_out_file, ":tprel_lo12:");
6845 break;
6847 case SYMBOL_TLSLE24:
6848 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6849 break;
6851 case SYMBOL_TINY_GOT:
6852 asm_fprintf (asm_out_file, ":got:");
6853 break;
6855 case SYMBOL_TINY_TLSIE:
6856 asm_fprintf (asm_out_file, ":gottprel:");
6857 break;
6859 default:
6860 break;
6862 output_addr_const (asm_out_file, x);
6863 break;
6865 case 'G':
6866 switch (aarch64_classify_symbolic_expression (x))
6868 case SYMBOL_TLSLE24:
6869 asm_fprintf (asm_out_file, ":tprel_hi12:");
6870 break;
6871 default:
6872 break;
6874 output_addr_const (asm_out_file, x);
6875 break;
6877 case 'k':
6879 HOST_WIDE_INT cond_code;
6881 if (!CONST_INT_P (x))
6883 output_operand_lossage ("invalid operand for '%%%c'", code);
6884 return;
6887 cond_code = INTVAL (x);
6888 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
6889 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
6891 break;
6893 case 'y':
6894 case 'z':
6896 machine_mode mode = GET_MODE (x);
6898 if (GET_CODE (x) != MEM
6899 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
6901 output_operand_lossage ("invalid operand for '%%%c'", code);
6902 return;
6905 if (code == 'y')
6906 /* LDP/STP which uses a single double-width memory operand.
6907 Adjust the mode to appear like a typical LDP/STP.
6908 Currently this is supported for 16-byte accesses only. */
6909 mode = DFmode;
6911 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
6912 output_operand_lossage ("invalid operand prefix '%%%c'", code);
6914 break;
6916 default:
6917 output_operand_lossage ("invalid operand prefix '%%%c'", code);
6918 return;
6922 /* Print address 'x' of a memory access with mode 'mode'.
6923 'type' is the query context required by aarch64_classify_address: for example
6924 ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP. */
6925 static bool
6926 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
6927 aarch64_addr_query_type type)
6929 struct aarch64_address_info addr;
6930 unsigned int size;
6932 /* Check all addresses are Pmode - including ILP32. */
6933 gcc_assert (GET_MODE (x) == Pmode);
6935 if (aarch64_classify_address (&addr, x, mode, true, type))
6936 switch (addr.type)
6938 case ADDRESS_REG_IMM:
6939 if (known_eq (addr.const_offset, 0))
6940 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
6941 else if (aarch64_sve_data_mode_p (mode))
6943 HOST_WIDE_INT vnum
6944 = exact_div (addr.const_offset,
6945 BYTES_PER_SVE_VECTOR).to_constant ();
6946 asm_fprintf (f, "[%s, #%wd, mul vl]",
6947 reg_names[REGNO (addr.base)], vnum);
6949 else if (aarch64_sve_pred_mode_p (mode))
6951 HOST_WIDE_INT vnum
6952 = exact_div (addr.const_offset,
6953 BYTES_PER_SVE_PRED).to_constant ();
6954 asm_fprintf (f, "[%s, #%wd, mul vl]",
6955 reg_names[REGNO (addr.base)], vnum);
6957 else
6958 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
6959 INTVAL (addr.offset));
6960 return true;
6962 case ADDRESS_REG_REG:
6963 if (addr.shift == 0)
6964 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
6965 reg_names [REGNO (addr.offset)]);
6966 else
6967 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
6968 reg_names [REGNO (addr.offset)], addr.shift);
6969 return true;
6971 case ADDRESS_REG_UXTW:
6972 if (addr.shift == 0)
6973 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
6974 REGNO (addr.offset) - R0_REGNUM);
6975 else
6976 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
6977 REGNO (addr.offset) - R0_REGNUM, addr.shift);
6978 return true;
6980 case ADDRESS_REG_SXTW:
6981 if (addr.shift == 0)
6982 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
6983 REGNO (addr.offset) - R0_REGNUM);
6984 else
6985 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
6986 REGNO (addr.offset) - R0_REGNUM, addr.shift);
6987 return true;
6989 case ADDRESS_REG_WB:
6990 /* Writeback is only supported for fixed-width modes. */
6991 size = GET_MODE_SIZE (mode).to_constant ();
6992 switch (GET_CODE (x))
6994 case PRE_INC:
6995 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
6996 return true;
6997 case POST_INC:
6998 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
6999 return true;
7000 case PRE_DEC:
7001 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7002 return true;
7003 case POST_DEC:
7004 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7005 return true;
7006 case PRE_MODIFY:
7007 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7008 INTVAL (addr.offset));
7009 return true;
7010 case POST_MODIFY:
7011 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7012 INTVAL (addr.offset));
7013 return true;
7014 default:
7015 break;
7017 break;
7019 case ADDRESS_LO_SUM:
7020 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7021 output_addr_const (f, addr.offset);
7022 asm_fprintf (f, "]");
7023 return true;
7025 case ADDRESS_SYMBOLIC:
7026 output_addr_const (f, x);
7027 return true;
7030 return false;
7033 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7034 static bool
7035 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7037 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7040 /* Print address 'x' of a memory access with mode 'mode'. */
7041 static void
7042 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7044 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7045 output_addr_const (f, x);
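/* Return nonzero if the rtx X mentions a label (LABEL_REF), searching
   recursively through its operands.  Labels inside UNSPEC_TLS operands
   are ignored, since they are constant offsets rather than symbols.  */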
7048 bool
7049 aarch64_label_mentioned_p (rtx x)
7051 const char *fmt;
7052 int i;
7054 if (GET_CODE (x) == LABEL_REF)
7055 return true;
7057 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7058 referencing instruction, but they are constant offsets, not
7059 symbols. */
7060 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7061 return false;
7063 fmt = GET_RTX_FORMAT (GET_CODE (x));
7064 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7066 if (fmt[i] == 'E')
7068 int j;
7070 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7071 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7072 return 1;
7074 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7075 return 1;
7078 return 0;
7081 /* Implement REGNO_REG_CLASS. */
7083 enum reg_class
7084 aarch64_regno_regclass (unsigned regno)
7086 if (GP_REGNUM_P (regno))
7087 return GENERAL_REGS;
7089 if (regno == SP_REGNUM)
7090 return STACK_REG;
7092 if (regno == FRAME_POINTER_REGNUM
7093 || regno == ARG_POINTER_REGNUM)
7094 return POINTER_REGS;
7096 if (FP_REGNUM_P (regno))
7097 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7099 if (PR_REGNUM_P (regno))
7100 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7102 return NO_REGS;
7105 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7106 If OFFSET is out of range, return an offset of an anchor point
7107 that is in range. Return 0 otherwise. */
7109 static HOST_WIDE_INT
7110 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7111 machine_mode mode)
7113 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7114 if (size > 16)
7115 return (offset + 0x400) & ~0x7f0;
7117 /* For offsets that aren't a multiple of the access size, the limit is
7118 -256...255. */
7119 if (offset & (size - 1))
7121 /* BLKmode typically uses LDP of X-registers. */
7122 if (mode == BLKmode)
7123 return (offset + 512) & ~0x3ff;
7124 return (offset + 0x100) & ~0x1ff;
7127 /* Small negative offsets are supported. */
7128 if (IN_RANGE (offset, -256, 0))
7129 return 0;
7131 if (mode == TImode || mode == TFmode)
7132 return (offset + 0x100) & ~0x1ff;
7134 /* Use a 12-bit offset, scaled by the access size. */
7135 return offset & (~0xfff * size);
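/* Implement TARGET_LEGITIMIZE_ADDRESS: try to rewrite address X, used to
   access MODE, into a form that is both legitimate and CSE-friendly.  */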
7138 static rtx
7139 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7141 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7142 where mask is selected by alignment and size of the offset.
7143 We try to pick as large a range for the offset as possible to
7144 maximize the chance of a CSE. However, for aligned addresses
7145 we limit the range to 4k so that structures with different sized
7146 elements are likely to use the same base. We need to be careful
7147 not to split a CONST for some forms of address expression, otherwise
7148 it will generate sub-optimal code. */
7150 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7152 rtx base = XEXP (x, 0);
7153 rtx offset_rtx = XEXP (x, 1);
7154 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7156 if (GET_CODE (base) == PLUS)
7158 rtx op0 = XEXP (base, 0);
7159 rtx op1 = XEXP (base, 1);
7161 /* Force any scaling into a temp for CSE. */
7162 op0 = force_reg (Pmode, op0);
7163 op1 = force_reg (Pmode, op1);
7165 /* Let the pointer register be in op0. */
7166 if (REG_POINTER (op1))
7167 std::swap (op0, op1);
7169 /* If the pointer is virtual or frame related, then we know that
7170 virtual register instantiation or register elimination is going
7171 to apply a second constant. We want the two constants folded
7172 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7173 if (virt_or_elim_regno_p (REGNO (op0)))
7175 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7176 NULL_RTX, true, OPTAB_DIRECT);
7177 return gen_rtx_PLUS (Pmode, base, op1);
7180 /* Otherwise, in order to encourage CSE (and thence loop strength
7181 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7182 base = expand_binop (Pmode, add_optab, op0, op1,
7183 NULL_RTX, true, OPTAB_DIRECT);
7184 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7187 HOST_WIDE_INT size;
7188 if (GET_MODE_SIZE (mode).is_constant (&size))
7190 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7191 mode);
7192 if (base_offset != 0)
7194 base = plus_constant (Pmode, base, base_offset);
7195 base = force_operand (base, NULL_RTX);
7196 return plus_constant (Pmode, base, offset - base_offset);
7201 return x;
7204 /* Return the reload icode required for a constant pool load in mode MODE. */
7205 static enum insn_code
7206 aarch64_constant_pool_reload_icode (machine_mode mode)
7208 switch (mode)
7210 case E_SFmode:
7211 return CODE_FOR_aarch64_reload_movcpsfdi;
7213 case E_DFmode:
7214 return CODE_FOR_aarch64_reload_movcpdfdi;
7216 case E_TFmode:
7217 return CODE_FOR_aarch64_reload_movcptfdi;
7219 case E_V8QImode:
7220 return CODE_FOR_aarch64_reload_movcpv8qidi;
7222 case E_V16QImode:
7223 return CODE_FOR_aarch64_reload_movcpv16qidi;
7225 case E_V4HImode:
7226 return CODE_FOR_aarch64_reload_movcpv4hidi;
7228 case E_V8HImode:
7229 return CODE_FOR_aarch64_reload_movcpv8hidi;
7231 case E_V2SImode:
7232 return CODE_FOR_aarch64_reload_movcpv2sidi;
7234 case E_V4SImode:
7235 return CODE_FOR_aarch64_reload_movcpv4sidi;
7237 case E_V2DImode:
7238 return CODE_FOR_aarch64_reload_movcpv2didi;
7240 case E_V2DFmode:
7241 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7243 default:
7244 gcc_unreachable ();
7247 gcc_unreachable ();
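/* Implement TARGET_SECONDARY_RELOAD: return the register class required
   to copy X of mode MODE into or out of class RCLASS, setting SRI->icode
   when a scratch-based reload pattern is needed instead.  */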
7249 static reg_class_t
7250 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7251 reg_class_t rclass,
7252 machine_mode mode,
7253 secondary_reload_info *sri)
7255 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7256 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7257 comment at the head of aarch64-sve.md for more details about the
7258 big-endian handling. */
7259 if (BYTES_BIG_ENDIAN
7260 && reg_class_subset_p (rclass, FP_REGS)
7261 && !((REG_P (x) && HARD_REGISTER_P (x))
7262 || aarch64_simd_valid_immediate (x, NULL))
7263 && aarch64_sve_data_mode_p (mode))
7265 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7266 return NO_REGS;
7269 /* If we have to disable direct literal pool loads and stores because the
7270 function is too big, then we need a scratch register. */
7271 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7272 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7273 || targetm.vector_mode_supported_p (GET_MODE (x)))
7274 && !aarch64_pcrelative_literal_loads)
7276 sri->icode = aarch64_constant_pool_reload_icode (mode);
7277 return NO_REGS;
7280 /* Without the TARGET_SIMD instructions we cannot move a Q register
7281 to a Q register directly. We need a scratch. */
7282 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7283 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7284 && reg_class_subset_p (rclass, FP_REGS))
7286 if (mode == TFmode)
7287 sri->icode = CODE_FOR_aarch64_reload_movtf;
7288 else if (mode == TImode)
7289 sri->icode = CODE_FOR_aarch64_reload_movti;
7290 return NO_REGS;
7293 /* A TFmode or TImode memory access should be handled via an FP register
7294 because AArch64 has richer addressing modes for LDR/STR instructions
7295 than LDP/STP instructions. */
7296 if (TARGET_FLOAT && rclass == GENERAL_REGS
7297 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7298 return FP_REGS;
7300 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7301 return GENERAL_REGS;
7303 return NO_REGS;
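/* Implement TARGET_CAN_ELIMINATE: return true if register FROM can
   currently be eliminated in favour of register TO.  */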
7306 static bool
7307 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7309 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7311 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7312 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7313 if (frame_pointer_needed)
7314 return to == HARD_FRAME_POINTER_REGNUM;
7315 return true;
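/* Implement INITIAL_ELIMINATION_OFFSET: return the offset to add when
   eliminating register FROM in favour of register TO.  */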
7318 poly_int64
7319 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7321 aarch64_layout_frame ();
7323 if (to == HARD_FRAME_POINTER_REGNUM)
7325 if (from == ARG_POINTER_REGNUM)
7326 return cfun->machine->frame.hard_fp_offset;
7328 if (from == FRAME_POINTER_REGNUM)
7329 return cfun->machine->frame.hard_fp_offset
7330 - cfun->machine->frame.locals_offset;
7333 if (to == STACK_POINTER_REGNUM)
7335 if (from == FRAME_POINTER_REGNUM)
7336 return cfun->machine->frame.frame_size
7337 - cfun->machine->frame.locals_offset;
7340 return cfun->machine->frame.frame_size;
7343 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7344 previous frame. */
7346 rtx
7347 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7349 if (count != 0)
7350 return const0_rtx;
7351 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
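/* Implement TARGET_ASM_TRAMPOLINE_TEMPLATE: output the fixed code part
   of a trampoline to F.  */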
7355 static void
7356 aarch64_asm_trampoline_template (FILE *f)
7358 if (TARGET_ILP32)
7360 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7361 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7363 else
7365 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7366 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7368 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7369 assemble_aligned_integer (4, const0_rtx);
7370 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7371 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
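/* Implement TARGET_TRAMPOLINE_INIT: initialize trampoline M_TRAMP so
   that it calls FNDECL with static chain value CHAIN_VALUE.  */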
7374 static void
7375 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7377 rtx fnaddr, mem, a_tramp;
7378 const int tramp_code_sz = 16;
7380 /* We don't need to copy the trailing D-words; we fill those in below. */
7381 emit_block_move (m_tramp, assemble_trampoline_template (),
7382 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7383 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7384 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7385 if (GET_MODE (fnaddr) != ptr_mode)
7386 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7387 emit_move_insn (mem, fnaddr);
7389 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7390 emit_move_insn (mem, chain_value);
7392 /* XXX We should really define a "clear_cache" pattern and use
7393 gen_clear_cache(). */
7394 a_tramp = XEXP (m_tramp, 0);
7395 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7396 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7397 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7398 ptr_mode);
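/* Implement TARGET_CLASS_MAX_NREGS: return the maximum number of
   registers of class REGCLASS needed to hold a value of mode MODE.  */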
7401 static unsigned char
7402 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7404 /* ??? Logically we should only need to provide a value when
7405 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7406 can hold MODE, but at the moment we need to handle all modes.
7407 Just ignore any runtime parts for registers that can't store them. */
7408 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7409 unsigned int nregs;
7410 switch (regclass)
7412 case CALLER_SAVE_REGS:
7413 case POINTER_REGS:
7414 case GENERAL_REGS:
7415 case ALL_REGS:
7416 case POINTER_AND_FP_REGS:
7417 case FP_REGS:
7418 case FP_LO_REGS:
7419 if (aarch64_sve_data_mode_p (mode)
7420 && constant_multiple_p (GET_MODE_SIZE (mode),
7421 BYTES_PER_SVE_VECTOR, &nregs))
7422 return nregs;
7423 return (aarch64_vector_data_mode_p (mode)
7424 ? CEIL (lowest_size, UNITS_PER_VREG)
7425 : CEIL (lowest_size, UNITS_PER_WORD));
7426 case STACK_REG:
7427 case PR_REGS:
7428 case PR_LO_REGS:
7429 case PR_HI_REGS:
7430 return 1;
7432 case NO_REGS:
7433 return 0;
7435 default:
7436 break;
7438 gcc_unreachable ();
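/* Implement TARGET_PREFERRED_RELOAD_CLASS: return the register class to
   use when reloading X into a register of class REGCLASS.  */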
7441 static reg_class_t
7442 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7444 if (regclass == POINTER_REGS)
7445 return GENERAL_REGS;
7447 if (regclass == STACK_REG)
7449 if (REG_P(x)
7450 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7451 return regclass;
7453 return NO_REGS;
7456 /* Register elimination can result in a request for
7457 SP+constant->FP_REGS. We cannot support such operations, which
7458 use SP as the source and an FP_REG as the destination, so reject
7459 them outright here. */
7460 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7462 rtx lhs = XEXP (x, 0);
7464 /* Look through a possible SUBREG introduced by ILP32. */
7465 if (GET_CODE (lhs) == SUBREG)
7466 lhs = SUBREG_REG (lhs);
7468 gcc_assert (REG_P (lhs));
7469 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7470 POINTER_REGS));
7471 return NO_REGS;
7474 return regclass;
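/* Implement ASM_OUTPUT_LABELREF: print the assembler reference to NAME,
   applying the user label prefix.  */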
7477 void
7478 aarch64_asm_output_labelref (FILE* f, const char *name)
7480 asm_fprintf (f, "%U%s", name);
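/* Implement TARGET_ASM_CONSTRUCTOR: output a constructor entry for
   SYMBOL, using a priority-specific .init_array section when PRIORITY is
   not the default.  */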
7483 static void
7484 aarch64_elf_asm_constructor (rtx symbol, int priority)
7486 if (priority == DEFAULT_INIT_PRIORITY)
7487 default_ctor_section_asm_out_constructor (symbol, priority);
7488 else
7490 section *s;
7491 /* Although priority is known to be in the range [0, 65535], so 18 bytes
7492 would be enough, the compiler might not know that. To avoid a
7493 -Wformat-truncation false positive, use a larger buffer size. */
7494 char buf[23];
7495 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7496 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7497 switch_to_section (s);
7498 assemble_align (POINTER_SIZE);
7499 assemble_aligned_integer (POINTER_BYTES, symbol);
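/* Implement TARGET_ASM_DESTRUCTOR: output a destructor entry for SYMBOL,
   using a priority-specific .fini_array section when PRIORITY is not the
   default.  */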
7503 static void
7504 aarch64_elf_asm_destructor (rtx symbol, int priority)
7506 if (priority == DEFAULT_INIT_PRIORITY)
7507 default_dtor_section_asm_out_destructor (symbol, priority);
7508 else
7510 section *s;
7511 /* Although priority is known to be in the range [0, 65535], so 18 bytes
7512 would be enough, the compiler might not know that. To avoid a
7513 -Wformat-truncation false positive, use a larger buffer size. */
7514 char buf[23];
7515 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7516 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7517 switch_to_section (s);
7518 assemble_align (POINTER_SIZE);
7519 assemble_aligned_integer (POINTER_BYTES, symbol);
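/* Output the assembly for a casesi dispatch sequence: load the jump
   table entry selected by OPERANDS[1] (with the load width taken from
   the mode of the ADDR_DIFF_VEC) and branch to it.  */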
7523 const char*
7524 aarch64_output_casesi (rtx *operands)
7526 char buf[100];
7527 char label[100];
7528 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7529 int index;
7530 static const char *const patterns[4][2] =
7533 "ldrb\t%w3, [%0,%w1,uxtw]",
7534 "add\t%3, %4, %w3, sxtb #2"
7537 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7538 "add\t%3, %4, %w3, sxth #2"
7541 "ldr\t%w3, [%0,%w1,uxtw #2]",
7542 "add\t%3, %4, %w3, sxtw #2"
7544 /* We assume that DImode is only generated when not optimizing and
7545 that we don't really need 64-bit address offsets. That would
7546 imply an object file with 8GB of code in a single function! */
7548 "ldr\t%w3, [%0,%w1,uxtw #2]",
7549 "add\t%3, %4, %w3, sxtw #2"
7553 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7555 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7556 index = exact_log2 (GET_MODE_SIZE (mode));
7558 gcc_assert (index >= 0 && index <= 3);
7560 /* Need to implement table size reduction, by changing the code below. */
7561 output_asm_insn (patterns[index][0], operands);
7562 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7563 snprintf (buf, sizeof (buf),
7564 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7565 output_asm_insn (buf, operands);
7566 output_asm_insn (patterns[index][1], operands);
7567 output_asm_insn ("br\t%3", operands);
7568 assemble_label (asm_out_file, label);
7569 return "";
7573 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7574 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7575 operator. */
7577 int
7578 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7580 if (shift >= 0 && shift <= 3)
7582 int size;
7583 for (size = 8; size <= 32; size *= 2)
7585 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7586 if (mask == bits << shift)
7587 return size;
7590 return 0;
7593 /* Constant pools are per-function only when PC-relative
7594 literal loads are enabled or we are using the large memory
7595 model. */
7597 static inline bool
7598 aarch64_can_use_per_function_literal_pools_p (void)
7600 return (aarch64_pcrelative_literal_loads
7601 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7604 static bool
7605 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7607 /* FIXME: In an ideal world this would work similarly
7608 to the logic in aarch64_select_rtx_section, but that
7609 breaks bootstrap in gccgo. For now we work around
7610 this by returning false here. */
7611 return false;
7614 /* Select appropriate section for constants depending
7615 on where we place literal pools. */
7617 static section *
7618 aarch64_select_rtx_section (machine_mode mode,
7619 rtx x,
7620 unsigned HOST_WIDE_INT align)
7622 if (aarch64_can_use_per_function_literal_pools_p ())
7623 return function_section (current_function_decl);
7625 return default_elf_select_rtx_section (mode, x, align);
7628 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7629 void
7630 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7631 HOST_WIDE_INT offset)
7633 /* When using per-function literal pools, we must ensure that any code
7634 section is aligned to the minimal instruction length, lest we get
7635 errors from the assembler about "unaligned instructions". */
7636 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7637 ASM_OUTPUT_ALIGN (f, 2);
7640 /* Costs. */
7642 /* Helper function for rtx cost calculation. Strip a shift expression
7643 from X. Returns the inner operand if successful, or the original
7644 expression on failure. */
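/* For example, both (ashift:DI (reg:DI x) (const_int 3)) and
   (mult:DI (reg:DI x) (const_int 8)) strip down to (reg:DI x). */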
7645 static rtx
7646 aarch64_strip_shift (rtx x)
7648 rtx op = x;
7650 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7651 we can convert both to ROR during final output. */
7652 if ((GET_CODE (op) == ASHIFT
7653 || GET_CODE (op) == ASHIFTRT
7654 || GET_CODE (op) == LSHIFTRT
7655 || GET_CODE (op) == ROTATERT
7656 || GET_CODE (op) == ROTATE)
7657 && CONST_INT_P (XEXP (op, 1)))
7658 return XEXP (op, 0);
7660 if (GET_CODE (op) == MULT
7661 && CONST_INT_P (XEXP (op, 1))
7662 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7663 return XEXP (op, 0);
7665 return x;
7668 /* Helper function for rtx cost calculation. Strip an extend
7669 expression from X. Returns the inner operand if successful, or the
7670 original expression on failure. We deal with a number of possible
7671 canonicalization variations here. If STRIP_SHIFT is true, then
7672 we can strip off a shift also. */
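/* For example, (zero_extend:DI (reg:SI x)) strips to (reg:SI x), and the
   AND form (and:DI (mult:DI (reg:DI x) (const_int 4)) (const_int 0x3fc)),
   which represents a zero-extended byte scaled by 4, also strips to
   (reg:DI x). */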
7673 static rtx
7674 aarch64_strip_extend (rtx x, bool strip_shift)
7676 scalar_int_mode mode;
7677 rtx op = x;
7679 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7680 return op;
7682 /* Zero and sign extraction of a widened value. */
7683 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7684 && XEXP (op, 2) == const0_rtx
7685 && GET_CODE (XEXP (op, 0)) == MULT
7686 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7687 XEXP (op, 1)))
7688 return XEXP (XEXP (op, 0), 0);
7690 /* It can also be represented (for zero-extend) as an AND with an
7691 immediate. */
7692 if (GET_CODE (op) == AND
7693 && GET_CODE (XEXP (op, 0)) == MULT
7694 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7695 && CONST_INT_P (XEXP (op, 1))
7696 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7697 INTVAL (XEXP (op, 1))) != 0)
7698 return XEXP (XEXP (op, 0), 0);
7700 /* Now handle extended register, as this may also have an optional
7701 left shift by 1..4. */
7702 if (strip_shift
7703 && GET_CODE (op) == ASHIFT
7704 && CONST_INT_P (XEXP (op, 1))
7705 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7706 op = XEXP (op, 0);
7708 if (GET_CODE (op) == ZERO_EXTEND
7709 || GET_CODE (op) == SIGN_EXTEND)
7710 op = XEXP (op, 0);
7712 if (op != x)
7713 return op;
7715 return x;
7718 /* Return true iff CODE is a shift supported in combination
7719 with arithmetic instructions. */
7721 static bool
7722 aarch64_shift_p (enum rtx_code code)
7724 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7728 /* Return true iff X is a cheap shift without a sign extend. */
7730 static bool
7731 aarch64_cheap_mult_shift_p (rtx x)
7733 rtx op0, op1;
7735 op0 = XEXP (x, 0);
7736 op1 = XEXP (x, 1);
7738 if (!(aarch64_tune_params.extra_tuning_flags
7739 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7740 return false;
7742 if (GET_CODE (op0) == SIGN_EXTEND)
7743 return false;
7745 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7746 && UINTVAL (op1) <= 4)
7747 return true;
7749 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7750 return false;
7752 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7754 if (l2 > 0 && l2 <= 4)
7755 return true;
7757 return false;
7760 /* Helper function for rtx cost calculation. Calculate the cost of
7761 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7762 Return the calculated cost of the expression, recursing manually in to
7763 operands where needed. */
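/* For example, for (plus (mult (reg x) (const_int 4)) (reg y)) this is
   called on the MULT with OUTER == PLUS; for an integer mode and SPEED
   costing it adds the ADD-with-LSL cost (alu.arith_shift, unless the
   tuning marks such shifts as cheap) plus the cost of the multiplicand. */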
7765 static int
7766 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7768 rtx op0, op1;
7769 const struct cpu_cost_table *extra_cost
7770 = aarch64_tune_params.insn_extra_cost;
7771 int cost = 0;
7772 bool compound_p = (outer == PLUS || outer == MINUS);
7773 machine_mode mode = GET_MODE (x);
7775 gcc_checking_assert (code == MULT);
7777 op0 = XEXP (x, 0);
7778 op1 = XEXP (x, 1);
7780 if (VECTOR_MODE_P (mode))
7781 mode = GET_MODE_INNER (mode);
7783 /* Integer multiply/fma. */
7784 if (GET_MODE_CLASS (mode) == MODE_INT)
7786 /* The multiply will be canonicalized as a shift, so cost it as such. */
7787 if (aarch64_shift_p (GET_CODE (x))
7788 || (CONST_INT_P (op1)
7789 && exact_log2 (INTVAL (op1)) > 0))
7791 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7792 || GET_CODE (op0) == SIGN_EXTEND;
7793 if (speed)
7795 if (compound_p)
7797 /* If the shift is considered cheap,
7798 then don't add any cost. */
7799 if (aarch64_cheap_mult_shift_p (x))
7801 else if (REG_P (op1))
7802 /* ARITH + shift-by-register. */
7803 cost += extra_cost->alu.arith_shift_reg;
7804 else if (is_extend)
7805 /* ARITH + extended register. We don't have a cost field
7806 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7807 cost += extra_cost->alu.extend_arith;
7808 else
7809 /* ARITH + shift-by-immediate. */
7810 cost += extra_cost->alu.arith_shift;
7812 else
7813 /* LSL (immediate). */
7814 cost += extra_cost->alu.shift;
7817 /* Strip extends as we will have costed them in the case above. */
7818 if (is_extend)
7819 op0 = aarch64_strip_extend (op0, true);
7821 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7823 return cost;
7826 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7827 compound and let the below cases handle it. After all, MNEG is a
7828 special-case alias of MSUB. */
7829 if (GET_CODE (op0) == NEG)
7831 op0 = XEXP (op0, 0);
7832 compound_p = true;
7835 /* Integer multiplies or FMAs have zero/sign extending variants. */
7836 if ((GET_CODE (op0) == ZERO_EXTEND
7837 && GET_CODE (op1) == ZERO_EXTEND)
7838 || (GET_CODE (op0) == SIGN_EXTEND
7839 && GET_CODE (op1) == SIGN_EXTEND))
7841 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7842 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7844 if (speed)
7846 if (compound_p)
7847 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7848 cost += extra_cost->mult[0].extend_add;
7849 else
7850 /* MUL/SMULL/UMULL. */
7851 cost += extra_cost->mult[0].extend;
7854 return cost;
7857 /* This is either an integer multiply or a MADD. In both cases
7858 we want to recurse and cost the operands. */
7859 cost += rtx_cost (op0, mode, MULT, 0, speed);
7860 cost += rtx_cost (op1, mode, MULT, 1, speed);
7862 if (speed)
7864 if (compound_p)
7865 /* MADD/MSUB. */
7866 cost += extra_cost->mult[mode == DImode].add;
7867 else
7868 /* MUL. */
7869 cost += extra_cost->mult[mode == DImode].simple;
7872 return cost;
7874 else
7876 if (speed)
7878 /* Floating-point FMA/FMUL can also support negations of the
7879 operands, unless the rounding mode is upward or downward in
7880 which case FNMUL is different from FMUL with operand negation. */
7881 bool neg0 = GET_CODE (op0) == NEG;
7882 bool neg1 = GET_CODE (op1) == NEG;
7883 if (compound_p || !flag_rounding_math || (neg0 && neg1))
7885 if (neg0)
7886 op0 = XEXP (op0, 0);
7887 if (neg1)
7888 op1 = XEXP (op1, 0);
7891 if (compound_p)
7892 /* FMADD/FNMADD/FNMSUB/FMSUB. */
7893 cost += extra_cost->fp[mode == DFmode].fma;
7894 else
7895 /* FMUL/FNMUL. */
7896 cost += extra_cost->fp[mode == DFmode].mult;
7899 cost += rtx_cost (op0, mode, MULT, 0, speed);
7900 cost += rtx_cost (op1, mode, MULT, 1, speed);
7901 return cost;
7905 static int
7906 aarch64_address_cost (rtx x,
7907 machine_mode mode,
7908 addr_space_t as ATTRIBUTE_UNUSED,
7909 bool speed)
7911 enum rtx_code c = GET_CODE (x);
7912 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
7913 struct aarch64_address_info info;
7914 int cost = 0;
7915 info.shift = 0;
7917 if (!aarch64_classify_address (&info, x, mode, false))
7919 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
7921 /* This is a CONST or SYMBOL ref which will be split
7922 in a different way depending on the code model in use.
7923 Cost it through the generic infrastructure. */
7924 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
7925 /* Divide through by the cost of one instruction to
7926 bring it to the same units as the address costs. */
7927 cost_symbol_ref /= COSTS_N_INSNS (1);
7928 /* The cost is then the cost of preparing the address,
7929 followed by an immediate (possibly 0) offset. */
7930 return cost_symbol_ref + addr_cost->imm_offset;
7932 else
7934 /* This is most likely a jump table from a case
7935 statement. */
7936 return addr_cost->register_offset;
7940 switch (info.type)
7942 case ADDRESS_LO_SUM:
7943 case ADDRESS_SYMBOLIC:
7944 case ADDRESS_REG_IMM:
7945 cost += addr_cost->imm_offset;
7946 break;
7948 case ADDRESS_REG_WB:
7949 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
7950 cost += addr_cost->pre_modify;
7951 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
7952 cost += addr_cost->post_modify;
7953 else
7954 gcc_unreachable ();
7956 break;
7958 case ADDRESS_REG_REG:
7959 cost += addr_cost->register_offset;
7960 break;
7962 case ADDRESS_REG_SXTW:
7963 cost += addr_cost->register_sextend;
7964 break;
7966 case ADDRESS_REG_UXTW:
7967 cost += addr_cost->register_zextend;
7968 break;
7970 default:
7971 gcc_unreachable ();
7975 if (info.shift > 0)
7977 /* For the sake of calculating the cost of the shifted register
7978 component, we can treat same sized modes in the same way. */
7979 if (known_eq (GET_MODE_BITSIZE (mode), 16))
7980 cost += addr_cost->addr_scale_costs.hi;
7981 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
7982 cost += addr_cost->addr_scale_costs.si;
7983 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
7984 cost += addr_cost->addr_scale_costs.di;
7985 else
7986 /* We can't tell, or this is a 128-bit vector. */
7987 cost += addr_cost->addr_scale_costs.ti;
7990 return cost;
7993 /* Return the cost of a branch. If SPEED_P is true then the compiler is
7994 optimizing for speed. If PREDICTABLE_P is true then the branch is expected
7995 to be well predicted. */
7998 aarch64_branch_cost (bool speed_p, bool predictable_p)
8000 /* When optimizing for speed, use the cost of unpredictable branches. */
8001 const struct cpu_branch_cost *branch_costs =
8002 aarch64_tune_params.branch_costs;
8004 if (!speed_p || predictable_p)
8005 return branch_costs->predictable;
8006 else
8007 return branch_costs->unpredictable;
8010 /* Return true if the RTX X in mode MODE is a zero or sign extract
8011 usable in an ADD or SUB (extended register) instruction. */
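/* For example, (sign_extend:DI (reg:SI x)) used as the extended operand
   of a DImode addition corresponds to ADD (extended register) with an
   SXTW operand, e.g. "add x0, x1, w2, sxtw". */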
8012 static bool
8013 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8015 /* Catch add with a sign extract.
8016 This is add_<optab><mode>_multp2. */
8017 if (GET_CODE (x) == SIGN_EXTRACT
8018 || GET_CODE (x) == ZERO_EXTRACT)
8020 rtx op0 = XEXP (x, 0);
8021 rtx op1 = XEXP (x, 1);
8022 rtx op2 = XEXP (x, 2);
8024 if (GET_CODE (op0) == MULT
8025 && CONST_INT_P (op1)
8026 && op2 == const0_rtx
8027 && CONST_INT_P (XEXP (op0, 1))
8028 && aarch64_is_extend_from_extract (mode,
8029 XEXP (op0, 1),
8030 op1))
8032 return true;
8035 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8036 No shift. */
8037 else if (GET_CODE (x) == SIGN_EXTEND
8038 || GET_CODE (x) == ZERO_EXTEND)
8039 return REG_P (XEXP (x, 0));
8041 return false;
8044 static bool
8045 aarch64_frint_unspec_p (unsigned int u)
8047 switch (u)
8049 case UNSPEC_FRINTZ:
8050 case UNSPEC_FRINTP:
8051 case UNSPEC_FRINTM:
8052 case UNSPEC_FRINTA:
8053 case UNSPEC_FRINTN:
8054 case UNSPEC_FRINTX:
8055 case UNSPEC_FRINTI:
8056 return true;
8058 default:
8059 return false;
8063 /* Return true iff X is an rtx that will match an extr instruction
8064 i.e. as described in the *extr<mode>5_insn family of patterns.
8065 OP0 and OP1 will be set to the operands of the shifts involved
8066 on success and will be NULL_RTX otherwise. */
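/* For example, (ior:DI (ashift:DI (reg:DI a) (const_int 10))
   (lshiftrt:DI (reg:DI b) (const_int 54))) matches, since the two shift
   amounts sum to 64, and can be implemented by a single EXTR of A and B. */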
8068 static bool
8069 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8071 rtx op0, op1;
8072 scalar_int_mode mode;
8073 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8074 return false;
8076 *res_op0 = NULL_RTX;
8077 *res_op1 = NULL_RTX;
8079 if (GET_CODE (x) != IOR)
8080 return false;
8082 op0 = XEXP (x, 0);
8083 op1 = XEXP (x, 1);
8085 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8086 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8088 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8089 if (GET_CODE (op1) == ASHIFT)
8090 std::swap (op0, op1);
8092 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8093 return false;
8095 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8096 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8098 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8099 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8101 *res_op0 = XEXP (op0, 0);
8102 *res_op1 = XEXP (op1, 0);
8103 return true;
8107 return false;
8110 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8111 storing it in *COST. Result is true if the total cost of the operation
8112 has now been calculated. */
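/* For example, a branch on (ne (reg x) (const_int 0)) is costed as a
   CBZ/CBNZ, a branch on a single-bit ZERO_EXTRACT or on
   (lt (reg x) (const_int 0)) as a TBZ/TBNZ, and value-selecting forms as
   CSEL, whose CSINC/CSINV/CSNEG aliases are treated as free. */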
8113 static bool
8114 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8116 rtx inner;
8117 rtx comparator;
8118 enum rtx_code cmpcode;
8120 if (COMPARISON_P (op0))
8122 inner = XEXP (op0, 0);
8123 comparator = XEXP (op0, 1);
8124 cmpcode = GET_CODE (op0);
8126 else
8128 inner = op0;
8129 comparator = const0_rtx;
8130 cmpcode = NE;
8133 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8135 /* Conditional branch. */
8136 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8137 return true;
8138 else
8140 if (cmpcode == NE || cmpcode == EQ)
8142 if (comparator == const0_rtx)
8144 /* TBZ/TBNZ/CBZ/CBNZ. */
8145 if (GET_CODE (inner) == ZERO_EXTRACT)
8146 /* TBZ/TBNZ. */
8147 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8148 ZERO_EXTRACT, 0, speed);
8149 else
8150 /* CBZ/CBNZ. */
8151 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8153 return true;
8156 else if (cmpcode == LT || cmpcode == GE)
8158 /* TBZ/TBNZ. */
8159 if (comparator == const0_rtx)
8160 return true;
8164 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8166 /* CCMP. */
8167 if (GET_CODE (op1) == COMPARE)
8169 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8170 if (XEXP (op1, 1) == const0_rtx)
8171 *cost += 1;
8172 if (speed)
8174 machine_mode mode = GET_MODE (XEXP (op1, 0));
8175 const struct cpu_cost_table *extra_cost
8176 = aarch64_tune_params.insn_extra_cost;
8178 if (GET_MODE_CLASS (mode) == MODE_INT)
8179 *cost += extra_cost->alu.arith;
8180 else
8181 *cost += extra_cost->fp[mode == DFmode].compare;
8183 return true;
8186 /* It's a conditional operation based on the status flags,
8187 so it must be some flavor of CSEL. */
8189 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8190 if (GET_CODE (op1) == NEG
8191 || GET_CODE (op1) == NOT
8192 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8193 op1 = XEXP (op1, 0);
8194 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8196 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8197 op1 = XEXP (op1, 0);
8198 op2 = XEXP (op2, 0);
8201 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8202 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8203 return true;
8206 /* We don't know what this is, cost all operands. */
8207 return false;
8210 /* Check whether X is a bitfield operation of the form shift + extend that
8211 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8212 operand to which the bitfield operation is applied. Otherwise return
8213 NULL_RTX. */
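/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3)))
   roughly corresponds to a UBFX of X, while (sign_extend:SI (ashift:QI
   (reg:QI x) (const_int 2))) roughly corresponds to an SBFIZ; in both
   cases X is returned. */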
8215 static rtx
8216 aarch64_extend_bitfield_pattern_p (rtx x)
8218 rtx_code outer_code = GET_CODE (x);
8219 machine_mode outer_mode = GET_MODE (x);
8221 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8222 && outer_mode != SImode && outer_mode != DImode)
8223 return NULL_RTX;
8225 rtx inner = XEXP (x, 0);
8226 rtx_code inner_code = GET_CODE (inner);
8227 machine_mode inner_mode = GET_MODE (inner);
8228 rtx op = NULL_RTX;
8230 switch (inner_code)
8232 case ASHIFT:
8233 if (CONST_INT_P (XEXP (inner, 1))
8234 && (inner_mode == QImode || inner_mode == HImode))
8235 op = XEXP (inner, 0);
8236 break;
8237 case LSHIFTRT:
8238 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8239 && (inner_mode == QImode || inner_mode == HImode))
8240 op = XEXP (inner, 0);
8241 break;
8242 case ASHIFTRT:
8243 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8244 && (inner_mode == QImode || inner_mode == HImode))
8245 op = XEXP (inner, 0);
8246 break;
8247 default:
8248 break;
8251 return op;
8254 /* Return true if the mask and a shift amount from an RTX of the form
8255 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8256 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
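/* For example, in SImode a shift amount of 8 and a mask of 0xff00 are
   valid: (x << 8) & 0xff00 places the low byte of X at bit 8, which is
   UBFIZ w0, w1, #8, #8. */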
8258 bool
8259 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8260 rtx shft_amnt)
8262 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8263 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8264 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8265 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
8268 /* Calculate the cost of calculating X, storing it in *COST. Result
8269 is true if the total cost of the operation has now been calculated. */
8270 static bool
8271 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8272 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8274 rtx op0, op1, op2;
8275 const struct cpu_cost_table *extra_cost
8276 = aarch64_tune_params.insn_extra_cost;
8277 int code = GET_CODE (x);
8278 scalar_int_mode int_mode;
8280 /* By default, assume that everything has equivalent cost to the
8281 cheapest instruction. Any additional costs are applied as a delta
8282 above this default. */
8283 *cost = COSTS_N_INSNS (1);
8285 switch (code)
8287 case SET:
8288 /* The cost depends entirely on the operands to SET. */
8289 *cost = 0;
8290 op0 = SET_DEST (x);
8291 op1 = SET_SRC (x);
8293 switch (GET_CODE (op0))
8295 case MEM:
8296 if (speed)
8298 rtx address = XEXP (op0, 0);
8299 if (VECTOR_MODE_P (mode))
8300 *cost += extra_cost->ldst.storev;
8301 else if (GET_MODE_CLASS (mode) == MODE_INT)
8302 *cost += extra_cost->ldst.store;
8303 else if (mode == SFmode)
8304 *cost += extra_cost->ldst.storef;
8305 else if (mode == DFmode)
8306 *cost += extra_cost->ldst.stored;
8308 *cost +=
8309 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8310 0, speed));
8313 *cost += rtx_cost (op1, mode, SET, 1, speed);
8314 return true;
8316 case SUBREG:
8317 if (! REG_P (SUBREG_REG (op0)))
8318 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8320 /* Fall through. */
8321 case REG:
8322 /* The cost is one per vector-register copied. */
8323 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8325 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8326 *cost = COSTS_N_INSNS (nregs);
8328 /* const0_rtx is in general free, but we will use an
8329 instruction to set a register to 0. */
8330 else if (REG_P (op1) || op1 == const0_rtx)
8332 /* The cost is 1 per register copied. */
8333 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8334 *cost = COSTS_N_INSNS (nregs);
8336 else
8337 /* Cost is just the cost of the RHS of the set. */
8338 *cost += rtx_cost (op1, mode, SET, 1, speed);
8339 return true;
8341 case ZERO_EXTRACT:
8342 case SIGN_EXTRACT:
8343 /* Bit-field insertion. Strip any redundant widening of
8344 the RHS to meet the width of the target. */
8345 if (GET_CODE (op1) == SUBREG)
8346 op1 = SUBREG_REG (op1);
8347 if ((GET_CODE (op1) == ZERO_EXTEND
8348 || GET_CODE (op1) == SIGN_EXTEND)
8349 && CONST_INT_P (XEXP (op0, 1))
8350 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8351 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8352 op1 = XEXP (op1, 0);
8354 if (CONST_INT_P (op1))
8356 /* MOV immediate is assumed to always be cheap. */
8357 *cost = COSTS_N_INSNS (1);
8359 else
8361 /* BFM. */
8362 if (speed)
8363 *cost += extra_cost->alu.bfi;
8364 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8367 return true;
8369 default:
8370 /* We can't make sense of this, assume default cost. */
8371 *cost = COSTS_N_INSNS (1);
8372 return false;
8374 return false;
8376 case CONST_INT:
8377 /* If an instruction can incorporate a constant within the
8378 instruction, the instruction's expression avoids calling
8379 rtx_cost() on the constant. If rtx_cost() is called on a
8380 constant, then it is usually because the constant must be
8381 moved into a register by one or more instructions.
8383 The exception is constant 0, which can be expressed
8384 as XZR/WZR and is therefore free. The one case that is not free is
8385 if we have (set (reg) (const0_rtx)), in which case we must cost
8386 the move. However, we can catch that when we cost the SET, so
8387 we don't need to consider that here. */
8388 if (x == const0_rtx)
8389 *cost = 0;
8390 else
8392 /* To an approximation, the cost of building any other constant
8393 is proportional to the number of instructions required to
8394 build that constant. This is true whether we
8395 are compiling for SPEED or otherwise. */
8396 if (!is_a <scalar_int_mode> (mode, &int_mode))
8397 int_mode = word_mode;
8398 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8399 (NULL_RTX, x, false, int_mode));
8401 return true;
8403 case CONST_DOUBLE:
8405 /* First determine the number of instructions to do the move
8406 as an integer constant. */
8407 if (!aarch64_float_const_representable_p (x)
8408 && !aarch64_can_const_movi_rtx_p (x, mode)
8409 && aarch64_float_const_rtx_p (x))
8411 unsigned HOST_WIDE_INT ival;
8412 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8413 gcc_assert (succeed);
8415 scalar_int_mode imode = (mode == HFmode
8416 ? SImode
8417 : int_mode_for_mode (mode).require ());
8418 int ncost = aarch64_internal_mov_immediate
8419 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8420 *cost += COSTS_N_INSNS (ncost);
8421 return true;
8424 if (speed)
8426 /* mov[df,sf]_aarch64. */
8427 if (aarch64_float_const_representable_p (x))
8428 /* FMOV (scalar immediate). */
8429 *cost += extra_cost->fp[mode == DFmode].fpconst;
8430 else if (!aarch64_float_const_zero_rtx_p (x))
8432 /* This will be a load from memory. */
8433 if (mode == DFmode)
8434 *cost += extra_cost->ldst.loadd;
8435 else
8436 *cost += extra_cost->ldst.loadf;
8438 else
8439 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8440 or MOV v0.s[0], wzr - neither of which is modeled by the
8441 cost tables. Just use the default cost. */
8446 return true;
8448 case MEM:
8449 if (speed)
8451 /* For loads we want the base cost of a load, plus an
8452 approximation for the additional cost of the addressing
8453 mode. */
8454 rtx address = XEXP (x, 0);
8455 if (VECTOR_MODE_P (mode))
8456 *cost += extra_cost->ldst.loadv;
8457 else if (GET_MODE_CLASS (mode) == MODE_INT)
8458 *cost += extra_cost->ldst.load;
8459 else if (mode == SFmode)
8460 *cost += extra_cost->ldst.loadf;
8461 else if (mode == DFmode)
8462 *cost += extra_cost->ldst.loadd;
8464 *cost +=
8465 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8466 0, speed));
8469 return true;
8471 case NEG:
8472 op0 = XEXP (x, 0);
8474 if (VECTOR_MODE_P (mode))
8476 if (speed)
8478 /* FNEG. */
8479 *cost += extra_cost->vect.alu;
8481 return false;
8484 if (GET_MODE_CLASS (mode) == MODE_INT)
8486 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8487 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8489 /* CSETM. */
8490 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8491 return true;
8494 /* Cost this as SUB wzr, X. */
8495 op0 = CONST0_RTX (mode);
8496 op1 = XEXP (x, 0);
8497 goto cost_minus;
8500 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8502 /* Support (neg(fma...)) as a single instruction only if
8503 sign of zeros is unimportant. This matches the decision
8504 making in aarch64.md. */
8505 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8507 /* FNMADD. */
8508 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8509 return true;
8511 if (GET_CODE (op0) == MULT)
8513 /* FNMUL. */
8514 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8515 return true;
8517 if (speed)
8518 /* FNEG. */
8519 *cost += extra_cost->fp[mode == DFmode].neg;
8520 return false;
8523 return false;
8525 case CLRSB:
8526 case CLZ:
8527 if (speed)
8529 if (VECTOR_MODE_P (mode))
8530 *cost += extra_cost->vect.alu;
8531 else
8532 *cost += extra_cost->alu.clz;
8535 return false;
8537 case COMPARE:
8538 op0 = XEXP (x, 0);
8539 op1 = XEXP (x, 1);
8541 if (op1 == const0_rtx
8542 && GET_CODE (op0) == AND)
8544 x = op0;
8545 mode = GET_MODE (op0);
8546 goto cost_logic;
8549 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8551 /* TODO: A write to the CC flags possibly costs extra; this
8552 needs encoding in the cost tables. */
8554 mode = GET_MODE (op0);
8555 /* ANDS. */
8556 if (GET_CODE (op0) == AND)
8558 x = op0;
8559 goto cost_logic;
8562 if (GET_CODE (op0) == PLUS)
8564 /* ADDS (and CMN alias). */
8565 x = op0;
8566 goto cost_plus;
8569 if (GET_CODE (op0) == MINUS)
8571 /* SUBS. */
8572 x = op0;
8573 goto cost_minus;
8576 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8577 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8578 && CONST_INT_P (XEXP (op0, 2)))
8580 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8581 Handle it here directly rather than going to cost_logic
8582 since we know the immediate generated for the TST is valid
8583 so we can avoid creating an intermediate rtx for it only
8584 for costing purposes. */
8585 if (speed)
8586 *cost += extra_cost->alu.logical;
8588 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8589 ZERO_EXTRACT, 0, speed);
8590 return true;
8593 if (GET_CODE (op1) == NEG)
8595 /* CMN. */
8596 if (speed)
8597 *cost += extra_cost->alu.arith;
8599 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8600 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8601 return true;
8604 /* CMP.
8606 Compare can freely swap the order of operands, and
8607 canonicalization puts the more complex operation first.
8608 But the integer MINUS logic expects the shift/extend
8609 operation in op1. */
8610 if (! (REG_P (op0)
8611 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8613 op0 = XEXP (x, 1);
8614 op1 = XEXP (x, 0);
8616 goto cost_minus;
8619 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8621 /* FCMP. */
8622 if (speed)
8623 *cost += extra_cost->fp[mode == DFmode].compare;
8625 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8627 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8628 /* FCMP supports constant 0.0 for no extra cost. */
8629 return true;
8631 return false;
8634 if (VECTOR_MODE_P (mode))
8636 /* Vector compare. */
8637 if (speed)
8638 *cost += extra_cost->vect.alu;
8640 if (aarch64_float_const_zero_rtx_p (op1))
8642 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8643 cost. */
8644 return true;
8646 return false;
8648 return false;
8650 case MINUS:
8652 op0 = XEXP (x, 0);
8653 op1 = XEXP (x, 1);
8655 cost_minus:
8656 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8658 /* Detect valid immediates. */
8659 if ((GET_MODE_CLASS (mode) == MODE_INT
8660 || (GET_MODE_CLASS (mode) == MODE_CC
8661 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8662 && CONST_INT_P (op1)
8663 && aarch64_uimm12_shift (INTVAL (op1)))
8665 if (speed)
8666 /* SUB(S) (immediate). */
8667 *cost += extra_cost->alu.arith;
8668 return true;
8671 /* Look for SUB (extended register). */
8672 if (is_a <scalar_int_mode> (mode, &int_mode)
8673 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8675 if (speed)
8676 *cost += extra_cost->alu.extend_arith;
8678 op1 = aarch64_strip_extend (op1, true);
8679 *cost += rtx_cost (op1, VOIDmode,
8680 (enum rtx_code) GET_CODE (op1), 0, speed);
8681 return true;
8684 rtx new_op1 = aarch64_strip_extend (op1, false);
8686 /* Cost this as an FMA-alike operation. */
8687 if ((GET_CODE (new_op1) == MULT
8688 || aarch64_shift_p (GET_CODE (new_op1)))
8689 && code != COMPARE)
8691 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8692 (enum rtx_code) code,
8693 speed);
8694 return true;
8697 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8699 if (speed)
8701 if (VECTOR_MODE_P (mode))
8703 /* Vector SUB. */
8704 *cost += extra_cost->vect.alu;
8706 else if (GET_MODE_CLASS (mode) == MODE_INT)
8708 /* SUB(S). */
8709 *cost += extra_cost->alu.arith;
8711 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8713 /* FSUB. */
8714 *cost += extra_cost->fp[mode == DFmode].addsub;
8717 return true;
8720 case PLUS:
8722 rtx new_op0;
8724 op0 = XEXP (x, 0);
8725 op1 = XEXP (x, 1);
8727 cost_plus:
8728 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8729 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8731 /* CSINC. */
8732 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8733 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8734 return true;
8737 if (GET_MODE_CLASS (mode) == MODE_INT
8738 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8739 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8741 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8743 if (speed)
8744 /* ADD (immediate). */
8745 *cost += extra_cost->alu.arith;
8746 return true;
8749 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8751 /* Look for ADD (extended register). */
8752 if (is_a <scalar_int_mode> (mode, &int_mode)
8753 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8755 if (speed)
8756 *cost += extra_cost->alu.extend_arith;
8758 op0 = aarch64_strip_extend (op0, true);
8759 *cost += rtx_cost (op0, VOIDmode,
8760 (enum rtx_code) GET_CODE (op0), 0, speed);
8761 return true;
8764 /* Strip any extend, leaving shifts behind, as we will
8765 cost them through mult_cost. */
8766 new_op0 = aarch64_strip_extend (op0, false);
8768 if (GET_CODE (new_op0) == MULT
8769 || aarch64_shift_p (GET_CODE (new_op0)))
8771 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8772 speed);
8773 return true;
8776 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8778 if (speed)
8780 if (VECTOR_MODE_P (mode))
8782 /* Vector ADD. */
8783 *cost += extra_cost->vect.alu;
8785 else if (GET_MODE_CLASS (mode) == MODE_INT)
8787 /* ADD. */
8788 *cost += extra_cost->alu.arith;
8790 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8792 /* FADD. */
8793 *cost += extra_cost->fp[mode == DFmode].addsub;
8796 return true;
8799 case BSWAP:
8800 *cost = COSTS_N_INSNS (1);
8802 if (speed)
8804 if (VECTOR_MODE_P (mode))
8805 *cost += extra_cost->vect.alu;
8806 else
8807 *cost += extra_cost->alu.rev;
8809 return false;
8811 case IOR:
8812 if (aarch_rev16_p (x))
8814 *cost = COSTS_N_INSNS (1);
8816 if (speed)
8818 if (VECTOR_MODE_P (mode))
8819 *cost += extra_cost->vect.alu;
8820 else
8821 *cost += extra_cost->alu.rev;
8823 return true;
8826 if (aarch64_extr_rtx_p (x, &op0, &op1))
8828 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8829 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8830 if (speed)
8831 *cost += extra_cost->alu.shift;
8833 return true;
8835 /* Fall through. */
8836 case XOR:
8837 case AND:
8838 cost_logic:
8839 op0 = XEXP (x, 0);
8840 op1 = XEXP (x, 1);
8842 if (VECTOR_MODE_P (mode))
8844 if (speed)
8845 *cost += extra_cost->vect.alu;
8846 return true;
8849 if (code == AND
8850 && GET_CODE (op0) == MULT
8851 && CONST_INT_P (XEXP (op0, 1))
8852 && CONST_INT_P (op1)
8853 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
8854 INTVAL (op1)) != 0)
8856 /* This is a UBFM/SBFM. */
8857 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
8858 if (speed)
8859 *cost += extra_cost->alu.bfx;
8860 return true;
8863 if (is_int_mode (mode, &int_mode))
8865 if (CONST_INT_P (op1))
8867 /* We have a mask + shift version of a UBFIZ
8868 i.e. the *andim_ashift<mode>_bfiz pattern. */
8869 if (GET_CODE (op0) == ASHIFT
8870 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
8871 XEXP (op0, 1)))
8873 *cost += rtx_cost (XEXP (op0, 0), int_mode,
8874 (enum rtx_code) code, 0, speed);
8875 if (speed)
8876 *cost += extra_cost->alu.bfx;
8878 return true;
8880 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
8882 /* We possibly get the immediate for free; this is not
8883 modelled. */
8884 *cost += rtx_cost (op0, int_mode,
8885 (enum rtx_code) code, 0, speed);
8886 if (speed)
8887 *cost += extra_cost->alu.logical;
8889 return true;
8892 else
8894 rtx new_op0 = op0;
8896 /* Handle ORN, EON, or BIC. */
8897 if (GET_CODE (op0) == NOT)
8898 op0 = XEXP (op0, 0);
8900 new_op0 = aarch64_strip_shift (op0);
8902 /* If we had a shift on op0 then this is a logical-shift-
8903 by-register/immediate operation. Otherwise, this is just
8904 a logical operation. */
8905 if (speed)
8907 if (new_op0 != op0)
8909 /* Shift by immediate. */
8910 if (CONST_INT_P (XEXP (op0, 1)))
8911 *cost += extra_cost->alu.log_shift;
8912 else
8913 *cost += extra_cost->alu.log_shift_reg;
8915 else
8916 *cost += extra_cost->alu.logical;
8919 /* In both cases we want to cost both operands. */
8920 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
8921 0, speed);
8922 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
8923 1, speed);
8925 return true;
8928 return false;
8930 case NOT:
8931 x = XEXP (x, 0);
8932 op0 = aarch64_strip_shift (x);
8934 if (VECTOR_MODE_P (mode))
8936 /* Vector NOT. */
8937 *cost += extra_cost->vect.alu;
8938 return false;
8941 /* MVN-shifted-reg. */
8942 if (op0 != x)
8944 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
8946 if (speed)
8947 *cost += extra_cost->alu.log_shift;
8949 return true;
8951 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
8952 Handle the second form here taking care that 'a' in the above can
8953 be a shift. */
8954 else if (GET_CODE (op0) == XOR)
8956 rtx newop0 = XEXP (op0, 0);
8957 rtx newop1 = XEXP (op0, 1);
8958 rtx op0_stripped = aarch64_strip_shift (newop0);
8960 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
8961 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
8963 if (speed)
8965 if (op0_stripped != newop0)
8966 *cost += extra_cost->alu.log_shift;
8967 else
8968 *cost += extra_cost->alu.logical;
8971 return true;
8973 /* MVN. */
8974 if (speed)
8975 *cost += extra_cost->alu.logical;
8977 return false;
8979 case ZERO_EXTEND:
8981 op0 = XEXP (x, 0);
8982 /* If a value is written in SI mode, then zero extended to DI
8983 mode, the operation will in general be free as a write to
8984 a 'w' register implicitly zeroes the upper bits of an 'x'
8985 register. However, if this is
8987 (set (reg) (zero_extend (reg)))
8989 we must cost the explicit register move. */
8990 if (mode == DImode
8991 && GET_MODE (op0) == SImode
8992 && outer == SET)
8994 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
8996 /* If OP_COST is non-zero, then the cost of the zero extend
8997 is effectively the cost of the inner operation. Otherwise
8998 we have a MOV instruction and we take the cost from the MOV
8999 itself. This is true independently of whether we are
9000 optimizing for space or time. */
9001 if (op_cost)
9002 *cost = op_cost;
9004 return true;
9006 else if (MEM_P (op0))
9008 /* All loads can zero extend to any size for free. */
9009 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9010 return true;
9013 op0 = aarch64_extend_bitfield_pattern_p (x);
9014 if (op0)
9016 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9017 if (speed)
9018 *cost += extra_cost->alu.bfx;
9019 return true;
9022 if (speed)
9024 if (VECTOR_MODE_P (mode))
9026 /* UMOV. */
9027 *cost += extra_cost->vect.alu;
9029 else
9031 /* We generate an AND instead of UXTB/UXTH. */
9032 *cost += extra_cost->alu.logical;
9035 return false;
9037 case SIGN_EXTEND:
9038 if (MEM_P (XEXP (x, 0)))
9040 /* LDRSH. */
9041 if (speed)
9043 rtx address = XEXP (XEXP (x, 0), 0);
9044 *cost += extra_cost->ldst.load_sign_extend;
9046 *cost +=
9047 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9048 0, speed));
9050 return true;
9053 op0 = aarch64_extend_bitfield_pattern_p (x);
9054 if (op0)
9056 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9057 if (speed)
9058 *cost += extra_cost->alu.bfx;
9059 return true;
9062 if (speed)
9064 if (VECTOR_MODE_P (mode))
9065 *cost += extra_cost->vect.alu;
9066 else
9067 *cost += extra_cost->alu.extend;
9069 return false;
9071 case ASHIFT:
9072 op0 = XEXP (x, 0);
9073 op1 = XEXP (x, 1);
9075 if (CONST_INT_P (op1))
9077 if (speed)
9079 if (VECTOR_MODE_P (mode))
9081 /* Vector shift (immediate). */
9082 *cost += extra_cost->vect.alu;
9084 else
9086 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9087 aliases. */
9088 *cost += extra_cost->alu.shift;
9092 /* We can incorporate zero/sign extend for free. */
9093 if (GET_CODE (op0) == ZERO_EXTEND
9094 || GET_CODE (op0) == SIGN_EXTEND)
9095 op0 = XEXP (op0, 0);
9097 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9098 return true;
9100 else
9102 if (VECTOR_MODE_P (mode))
9104 if (speed)
9105 /* Vector shift (register). */
9106 *cost += extra_cost->vect.alu;
9108 else
9110 if (speed)
9111 /* LSLV. */
9112 *cost += extra_cost->alu.shift_reg;
9114 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9115 && CONST_INT_P (XEXP (op1, 1))
9116 && known_eq (INTVAL (XEXP (op1, 1)),
9117 GET_MODE_BITSIZE (mode) - 1))
9119 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9120 /* We already demanded XEXP (op1, 0) to be REG_P, so
9121 don't recurse into it. */
9122 return true;
9125 return false; /* All arguments need to be in registers. */
9128 case ROTATE:
9129 case ROTATERT:
9130 case LSHIFTRT:
9131 case ASHIFTRT:
9132 op0 = XEXP (x, 0);
9133 op1 = XEXP (x, 1);
9135 if (CONST_INT_P (op1))
9137 /* ASR (immediate) and friends. */
9138 if (speed)
9140 if (VECTOR_MODE_P (mode))
9141 *cost += extra_cost->vect.alu;
9142 else
9143 *cost += extra_cost->alu.shift;
9146 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9147 return true;
9149 else
9151 if (VECTOR_MODE_P (mode))
9153 if (speed)
9154 /* Vector shift (register). */
9155 *cost += extra_cost->vect.alu;
9157 else
9159 if (speed)
9160 /* ASR (register) and friends. */
9161 *cost += extra_cost->alu.shift_reg;
9163 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9164 && CONST_INT_P (XEXP (op1, 1))
9165 && known_eq (INTVAL (XEXP (op1, 1)),
9166 GET_MODE_BITSIZE (mode) - 1))
9168 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9169 /* We already demanded XEXP (op1, 0) to be REG_P, so
9170 don't recurse into it. */
9171 return true;
9174 return false; /* All arguments need to be in registers. */
9177 case SYMBOL_REF:
9179 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9180 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9182 /* LDR. */
9183 if (speed)
9184 *cost += extra_cost->ldst.load;
9186 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9187 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9189 /* ADRP, followed by ADD. */
9190 *cost += COSTS_N_INSNS (1);
9191 if (speed)
9192 *cost += 2 * extra_cost->alu.arith;
9194 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9195 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9197 /* ADR. */
9198 if (speed)
9199 *cost += extra_cost->alu.arith;
9202 if (flag_pic)
9204 /* One extra load instruction, after accessing the GOT. */
9205 *cost += COSTS_N_INSNS (1);
9206 if (speed)
9207 *cost += extra_cost->ldst.load;
9209 return true;
9211 case HIGH:
9212 case LO_SUM:
9213 /* ADRP/ADD (immediate). */
9214 if (speed)
9215 *cost += extra_cost->alu.arith;
9216 return true;
9218 case ZERO_EXTRACT:
9219 case SIGN_EXTRACT:
9220 /* UBFX/SBFX. */
9221 if (speed)
9223 if (VECTOR_MODE_P (mode))
9224 *cost += extra_cost->vect.alu;
9225 else
9226 *cost += extra_cost->alu.bfx;
9229 /* We can trust that the immediates used will be correct (there
9230 are no by-register forms), so we need only cost op0. */
9231 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9232 return true;
9234 case MULT:
9235 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9236 /* aarch64_rtx_mult_cost always handles recursion to its
9237 operands. */
9238 return true;
9240 case MOD:
9241 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9242 ANDs and a CSNEG. Assume here that CSNEG costs the same as
9243 an unconditional negate. This case should only ever be reached through
9244 the set_smod_pow2_cheap check in expmed.c. */
9245 if (CONST_INT_P (XEXP (x, 1))
9246 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9247 && (mode == SImode || mode == DImode))
9249 /* We expand to 4 instructions. Reset the baseline. */
9250 *cost = COSTS_N_INSNS (4);
9252 if (speed)
9253 *cost += 2 * extra_cost->alu.logical
9254 + 2 * extra_cost->alu.arith;
9256 return true;
9259 /* Fall-through. */
9260 case UMOD:
9261 if (speed)
9263 /* Slightly prefer UMOD over SMOD. */
9264 if (VECTOR_MODE_P (mode))
9265 *cost += extra_cost->vect.alu;
9266 else if (GET_MODE_CLASS (mode) == MODE_INT)
9267 *cost += (extra_cost->mult[mode == DImode].add
9268 + extra_cost->mult[mode == DImode].idiv
9269 + (code == MOD ? 1 : 0));
9271 return false; /* All arguments need to be in registers. */
9273 case DIV:
9274 case UDIV:
9275 case SQRT:
9276 if (speed)
9278 if (VECTOR_MODE_P (mode))
9279 *cost += extra_cost->vect.alu;
9280 else if (GET_MODE_CLASS (mode) == MODE_INT)
9281 /* There is no integer SQRT, so only DIV and UDIV can get
9282 here. */
9283 *cost += (extra_cost->mult[mode == DImode].idiv
9284 /* Slightly prefer UDIV over SDIV. */
9285 + (code == DIV ? 1 : 0));
9286 else
9287 *cost += extra_cost->fp[mode == DFmode].div;
9289 return false; /* All arguments need to be in registers. */
9291 case IF_THEN_ELSE:
9292 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9293 XEXP (x, 2), cost, speed);
9295 case EQ:
9296 case NE:
9297 case GT:
9298 case GTU:
9299 case LT:
9300 case LTU:
9301 case GE:
9302 case GEU:
9303 case LE:
9304 case LEU:
9306 return false; /* All arguments must be in registers. */
9308 case FMA:
9309 op0 = XEXP (x, 0);
9310 op1 = XEXP (x, 1);
9311 op2 = XEXP (x, 2);
9313 if (speed)
9315 if (VECTOR_MODE_P (mode))
9316 *cost += extra_cost->vect.alu;
9317 else
9318 *cost += extra_cost->fp[mode == DFmode].fma;
9321 /* FMSUB, FNMADD, and FNMSUB are free. */
9322 if (GET_CODE (op0) == NEG)
9323 op0 = XEXP (op0, 0);
9325 if (GET_CODE (op2) == NEG)
9326 op2 = XEXP (op2, 0);
9328 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9329 and the by-element operand as operand 0. */
9330 if (GET_CODE (op1) == NEG)
9331 op1 = XEXP (op1, 0);
9333 /* Catch vector-by-element operations. The by-element operand can
9334 either be (vec_duplicate (vec_select (x))) or just
9335 (vec_select (x)), depending on whether we are multiplying by
9336 a vector or a scalar.
9338 Canonicalization is not very good in these cases: FMA4 will put the
9339 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9340 if (GET_CODE (op0) == VEC_DUPLICATE)
9341 op0 = XEXP (op0, 0);
9342 else if (GET_CODE (op1) == VEC_DUPLICATE)
9343 op1 = XEXP (op1, 0);
9345 if (GET_CODE (op0) == VEC_SELECT)
9346 op0 = XEXP (op0, 0);
9347 else if (GET_CODE (op1) == VEC_SELECT)
9348 op1 = XEXP (op1, 0);
9350 /* If the remaining parameters are not registers,
9351 get the cost to put them into registers. */
9352 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9353 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9354 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9355 return true;
9357 case FLOAT:
9358 case UNSIGNED_FLOAT:
9359 if (speed)
9360 *cost += extra_cost->fp[mode == DFmode].fromint;
9361 return false;
9363 case FLOAT_EXTEND:
9364 if (speed)
9366 if (VECTOR_MODE_P (mode))
9368 /* Vector widening conversion. */
9369 *cost += extra_cost->vect.alu;
9371 else
9372 *cost += extra_cost->fp[mode == DFmode].widen;
9374 return false;
9376 case FLOAT_TRUNCATE:
9377 if (speed)
9379 if (VECTOR_MODE_P (mode))
9381 /* Vector narrowing conversion. */
9382 *cost += extra_cost->vect.alu;
9384 else
9385 *cost += extra_cost->fp[mode == DFmode].narrow;
9387 return false;
9389 case FIX:
9390 case UNSIGNED_FIX:
9391 x = XEXP (x, 0);
9392 /* Strip the rounding part. They will all be implemented
9393 by the fcvt* family of instructions anyway. */
9394 if (GET_CODE (x) == UNSPEC)
9396 unsigned int uns_code = XINT (x, 1);
9398 if (uns_code == UNSPEC_FRINTA
9399 || uns_code == UNSPEC_FRINTM
9400 || uns_code == UNSPEC_FRINTN
9401 || uns_code == UNSPEC_FRINTP
9402 || uns_code == UNSPEC_FRINTZ)
9403 x = XVECEXP (x, 0, 0);
9406 if (speed)
9408 if (VECTOR_MODE_P (mode))
9409 *cost += extra_cost->vect.alu;
9410 else
9411 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9414 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9415 fixed-point fcvt. */
9416 if (GET_CODE (x) == MULT
9417 && ((VECTOR_MODE_P (mode)
9418 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9419 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9421 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9422 0, speed);
9423 return true;
9426 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9427 return true;
9429 case ABS:
9430 if (VECTOR_MODE_P (mode))
9432 /* ABS (vector). */
9433 if (speed)
9434 *cost += extra_cost->vect.alu;
9436 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9438 op0 = XEXP (x, 0);
9440 /* FABD, which is analogous to FADD. */
9441 if (GET_CODE (op0) == MINUS)
9443 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9444 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9445 if (speed)
9446 *cost += extra_cost->fp[mode == DFmode].addsub;
9448 return true;
9450 /* Simple FABS is analogous to FNEG. */
9451 if (speed)
9452 *cost += extra_cost->fp[mode == DFmode].neg;
9454 else
9456 /* Integer ABS will either be split to
9457 two arithmetic instructions, or will be an ABS
9458 (scalar), which we don't model. */
9459 *cost = COSTS_N_INSNS (2);
9460 if (speed)
9461 *cost += 2 * extra_cost->alu.arith;
9463 return false;
9465 case SMAX:
9466 case SMIN:
9467 if (speed)
9469 if (VECTOR_MODE_P (mode))
9470 *cost += extra_cost->vect.alu;
9471 else
9473 /* FMAXNM/FMINNM/FMAX/FMIN.
9474 TODO: This may not be accurate for all implementations, but
9475 we do not model this in the cost tables. */
9476 *cost += extra_cost->fp[mode == DFmode].addsub;
9479 return false;
9481 case UNSPEC:
9482 /* The floating point round to integer frint* instructions. */
9483 if (aarch64_frint_unspec_p (XINT (x, 1)))
9485 if (speed)
9486 *cost += extra_cost->fp[mode == DFmode].roundint;
9488 return false;
9491 if (XINT (x, 1) == UNSPEC_RBIT)
9493 if (speed)
9494 *cost += extra_cost->alu.rev;
9496 return false;
9498 break;
9500 case TRUNCATE:
9502 /* Decompose <su>muldi3_highpart. */
9503 if (/* (truncate:DI */
9504 mode == DImode
9505 /* (lshiftrt:TI */
9506 && GET_MODE (XEXP (x, 0)) == TImode
9507 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9508 /* (mult:TI */
9509 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9510 /* (ANY_EXTEND:TI (reg:DI))
9511 (ANY_EXTEND:TI (reg:DI))) */
9512 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9513 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9514 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9515 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9516 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9517 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9518 /* (const_int 64) */
9519 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9520 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9522 /* UMULH/SMULH. */
9523 if (speed)
9524 *cost += extra_cost->mult[mode == DImode].extend;
9525 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9526 mode, MULT, 0, speed);
9527 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9528 mode, MULT, 1, speed);
9529 return true;
9532 /* Fall through. */
9533 default:
9534 break;
9537 if (dump_file
9538 && flag_aarch64_verbose_cost)
9539 fprintf (dump_file,
9540 "\nFailed to cost RTX. Assuming default cost.\n");
9542 return true;
9545 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
9546 calculated for X. This cost is stored in *COST. Returns true
9547 if the total cost of X was calculated. */
9548 static bool
9549 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9550 int param, int *cost, bool speed)
9552 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9554 if (dump_file
9555 && flag_aarch64_verbose_cost)
9557 print_rtl_single (dump_file, x);
9558 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9559 speed ? "Hot" : "Cold",
9560 *cost, result ? "final" : "partial");
9563 return result;
9566 static int
9567 aarch64_register_move_cost (machine_mode mode,
9568 reg_class_t from_i, reg_class_t to_i)
9570 enum reg_class from = (enum reg_class) from_i;
9571 enum reg_class to = (enum reg_class) to_i;
9572 const struct cpu_regmove_cost *regmove_cost
9573 = aarch64_tune_params.regmove_cost;
9575 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9576 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
9577 to = GENERAL_REGS;
9579 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
9580 from = GENERAL_REGS;
9582 /* Moving between GPR and stack cost is the same as GP2GP. */
9583 if ((from == GENERAL_REGS && to == STACK_REG)
9584 || (to == GENERAL_REGS && from == STACK_REG))
9585 return regmove_cost->GP2GP;
9587 /* To/From the stack register, we move via the gprs. */
9588 if (to == STACK_REG || from == STACK_REG)
9589 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9590 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9592 if (known_eq (GET_MODE_SIZE (mode), 16))
9594 /* 128-bit operations on general registers require 2 instructions. */
9595 if (from == GENERAL_REGS && to == GENERAL_REGS)
9596 return regmove_cost->GP2GP * 2;
9597 else if (from == GENERAL_REGS)
9598 return regmove_cost->GP2FP * 2;
9599 else if (to == GENERAL_REGS)
9600 return regmove_cost->FP2GP * 2;
9602 /* When AdvSIMD instructions are disabled it is not possible to move
9603 a 128-bit value directly between Q registers. This is handled in
9604 secondary reload. A general register is used as a scratch to move
9605 the upper DI value and the lower DI value is moved directly,
9606 hence the cost is the sum of three moves. */
9607 if (! TARGET_SIMD)
9608 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9610 return regmove_cost->FP2FP;
9613 if (from == GENERAL_REGS && to == GENERAL_REGS)
9614 return regmove_cost->GP2GP;
9615 else if (from == GENERAL_REGS)
9616 return regmove_cost->GP2FP;
9617 else if (to == GENERAL_REGS)
9618 return regmove_cost->FP2GP;
9620 return regmove_cost->FP2FP;
9623 static int
9624 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9625 reg_class_t rclass ATTRIBUTE_UNUSED,
9626 bool in ATTRIBUTE_UNUSED)
9628 return aarch64_tune_params.memmov_cost;
9631 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9632 to optimize 1.0/sqrt. */
9634 static bool
9635 use_rsqrt_p (machine_mode mode)
9637 return (!flag_trapping_math
9638 && flag_unsafe_math_optimizations
9639 && ((aarch64_tune_params.approx_modes->recip_sqrt
9640 & AARCH64_APPROX_MODE (mode))
9641 || flag_mrecip_low_precision_sqrt));
9644 /* Function to decide when to use the approximate reciprocal square root
9645 builtin. */
9647 static tree
9648 aarch64_builtin_reciprocal (tree fndecl)
9650 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9652 if (!use_rsqrt_p (mode))
9653 return NULL_TREE;
9654 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9657 typedef rtx (*rsqrte_type) (rtx, rtx);
9659 /* Select reciprocal square root initial estimate insn depending on machine
9660 mode. */
9662 static rsqrte_type
9663 get_rsqrte_type (machine_mode mode)
9665 switch (mode)
9667 case E_DFmode: return gen_aarch64_rsqrtedf;
9668 case E_SFmode: return gen_aarch64_rsqrtesf;
9669 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9670 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9671 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9672 default: gcc_unreachable ();
9676 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9678 /* Select reciprocal square root series step insn depending on machine mode. */
9680 static rsqrts_type
9681 get_rsqrts_type (machine_mode mode)
9683 switch (mode)
9685 case E_DFmode: return gen_aarch64_rsqrtsdf;
9686 case E_SFmode: return gen_aarch64_rsqrtssf;
9687 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9688 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9689 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9690 default: gcc_unreachable ();
9694 /* Emit instruction sequence to compute either the approximate square root
9695 or its approximate reciprocal, depending on the flag RECP, and return
9696 whether the sequence was emitted or not. */
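/* The sequence uses FRSQRTE for an initial estimate E of 1/sqrt(SRC) and
   FRSQRTS for the Newton-Raphson steps: FRSQRTS (a, b) computes
   (3 - a * b) / 2, so each step refines E to E * (3 - SRC * E * E) / 2.
   For the square root itself the result is multiplied by SRC at the end,
   with a mask forcing the result to 0.0 when SRC is 0.0. */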
9698 bool
9699 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9701 machine_mode mode = GET_MODE (dst);
9703 if (GET_MODE_INNER (mode) == HFmode)
9705 gcc_assert (!recp);
9706 return false;
9709 if (!recp)
9711 if (!(flag_mlow_precision_sqrt
9712 || (aarch64_tune_params.approx_modes->sqrt
9713 & AARCH64_APPROX_MODE (mode))))
9714 return false;
9716 if (flag_finite_math_only
9717 || flag_trapping_math
9718 || !flag_unsafe_math_optimizations
9719 || optimize_function_for_size_p (cfun))
9720 return false;
9722 else
9723 /* Caller assumes we cannot fail. */
9724 gcc_assert (use_rsqrt_p (mode));
9726 machine_mode mmsk = mode_for_int_vector (mode).require ();
9727 rtx xmsk = gen_reg_rtx (mmsk);
9728 if (!recp)
9729 /* When calculating the approximate square root, compare the
9730 argument with 0.0 and create a mask. */
9731 emit_insn (gen_rtx_SET (xmsk,
9732 gen_rtx_NEG (mmsk,
9733 gen_rtx_EQ (mmsk, src,
9734 CONST0_RTX (mode)))));
9736 /* Estimate the approximate reciprocal square root. */
9737 rtx xdst = gen_reg_rtx (mode);
9738 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9740 /* Iterate over the series twice for SF and thrice for DF. */
9741 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9743 /* Optionally iterate over the series once less for faster performance
9744 while sacrificing the accuracy. */
9745 if ((recp && flag_mrecip_low_precision_sqrt)
9746 || (!recp && flag_mlow_precision_sqrt))
9747 iterations--;
9749 /* Iterate over the series to calculate the approximate reciprocal square
9750 root. */
9751 rtx x1 = gen_reg_rtx (mode);
9752 while (iterations--)
9754 rtx x2 = gen_reg_rtx (mode);
9755 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9757 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9759 if (iterations > 0)
9760 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9763 if (!recp)
9765 /* Qualify the approximate reciprocal square root when the argument is
9766 0.0 by squashing the intermediate result to 0.0. */
9767 rtx xtmp = gen_reg_rtx (mmsk);
9768 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9769 gen_rtx_SUBREG (mmsk, xdst, 0)));
9770 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9772 /* Calculate the approximate square root. */
9773 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9776 /* Finalize the approximation. */
9777 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9779 return true;
9782 typedef rtx (*recpe_type) (rtx, rtx);
9784 /* Select reciprocal initial estimate insn depending on machine mode. */
9786 static recpe_type
9787 get_recpe_type (machine_mode mode)
9789 switch (mode)
9791 case E_SFmode: return (gen_aarch64_frecpesf);
9792 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9793 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9794 case E_DFmode: return (gen_aarch64_frecpedf);
9795 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9796 default: gcc_unreachable ();
9800 typedef rtx (*recps_type) (rtx, rtx, rtx);
9802 /* Select reciprocal series step insn depending on machine mode. */
9804 static recps_type
9805 get_recps_type (machine_mode mode)
9807 switch (mode)
9809 case E_SFmode: return (gen_aarch64_frecpssf);
9810 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9811 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9812 case E_DFmode: return (gen_aarch64_frecpsdf);
9813 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9814 default: gcc_unreachable ();
9818 /* Emit the instruction sequence to compute the approximation for the division
9819 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9821 bool
9822 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9824 machine_mode mode = GET_MODE (quo);
9826 if (GET_MODE_INNER (mode) == HFmode)
9827 return false;
9829 bool use_approx_division_p = (flag_mlow_precision_div
9830 || (aarch64_tune_params.approx_modes->division
9831 & AARCH64_APPROX_MODE (mode)));
9833 if (!flag_finite_math_only
9834 || flag_trapping_math
9835 || !flag_unsafe_math_optimizations
9836 || optimize_function_for_size_p (cfun)
9837 || !use_approx_division_p)
9838 return false;
9840 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9841 return false;
9843 /* Estimate the approximate reciprocal. */
9844 rtx xrcp = gen_reg_rtx (mode);
9845 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9847 /* Iterate over the series twice for SF and thrice for DF. */
9848 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9850 /* Optionally iterate over the series once less for faster performance,
9851 at the expense of some accuracy. */
9852 if (flag_mlow_precision_div)
9853 iterations--;
9855 /* Iterate over the series to calculate the approximate reciprocal. */
9856 rtx xtmp = gen_reg_rtx (mode);
9857 while (iterations--)
9859 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
9861 if (iterations > 0)
9862 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9865 if (num != CONST1_RTX (mode))
9867 /* As the approximate reciprocal of DEN is already calculated, only
9868 calculate the approximate division when NUM is not 1.0. */
9869 rtx xnum = force_reg (mode, num);
9870 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
9873 /* Finalize the approximation. */
9874 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
9875 return true;
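/* A note on the sequence above (a sketch, assuming FRECPE (d) ~= 1/d and
   FRECPS (a, b) = 2 - a * b): each pass refines the estimate x of 1/DEN
   with the Newton-Raphson step

     x' = x * (2 - DEN * x)

   written as xtmp = FRECPS (x, DEN), x' = x * xtmp.  As for the square
   root above, the last correction factor XTMP is folded into the final
   multiply that produces QUO, and the optional multiply by NUM turns the
   reciprocal into the full quotient NUM / DEN.  */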
9878 /* Return the number of instructions that can be issued per cycle. */
9879 static int
9880 aarch64_sched_issue_rate (void)
9882 return aarch64_tune_params.issue_rate;
9885 static int
9886 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9888 int issue_rate = aarch64_sched_issue_rate ();
9890 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
9894 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9895 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9896 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9898 static int
9899 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
9900 int ready_index)
9902 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
9906 /* Vectorizer cost model target hooks. */
9908 /* Implement targetm.vectorize.builtin_vectorization_cost. */
9909 static int
9910 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
9911 tree vectype,
9912 int misalign ATTRIBUTE_UNUSED)
9914 unsigned elements;
9915 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
9916 bool fp = false;
9918 if (vectype != NULL)
9919 fp = FLOAT_TYPE_P (vectype);
9921 switch (type_of_cost)
9923 case scalar_stmt:
9924 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
9926 case scalar_load:
9927 return costs->scalar_load_cost;
9929 case scalar_store:
9930 return costs->scalar_store_cost;
9932 case vector_stmt:
9933 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9935 case vector_load:
9936 return costs->vec_align_load_cost;
9938 case vector_store:
9939 return costs->vec_store_cost;
9941 case vec_to_scalar:
9942 return costs->vec_to_scalar_cost;
9944 case scalar_to_vec:
9945 return costs->scalar_to_vec_cost;
9947 case unaligned_load:
9948 case vector_gather_load:
9949 return costs->vec_unalign_load_cost;
9951 case unaligned_store:
9952 case vector_scatter_store:
9953 return costs->vec_unalign_store_cost;
9955 case cond_branch_taken:
9956 return costs->cond_taken_branch_cost;
9958 case cond_branch_not_taken:
9959 return costs->cond_not_taken_branch_cost;
9961 case vec_perm:
9962 return costs->vec_permute_cost;
9964 case vec_promote_demote:
9965 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
9967 case vec_construct:
9968 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
9969 return elements / 2 + 1;
9971 default:
9972 gcc_unreachable ();
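/* As a worked example of the vec_construct case above: building a
   four-element vector such as V4SF element by element is costed as
   4 / 2 + 1 = 3, i.e. roughly one statement per pair of elements plus
   one.  */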
9976 /* Implement targetm.vectorize.add_stmt_cost. */
9977 static unsigned
9978 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
9979 struct _stmt_vec_info *stmt_info, int misalign,
9980 enum vect_cost_model_location where)
9982 unsigned *cost = (unsigned *) data;
9983 unsigned retval = 0;
9985 if (flag_vect_cost_model)
9987 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
9988 int stmt_cost =
9989 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
9991 /* Statements in an inner loop relative to the loop being
9992 vectorized are weighted more heavily. The value here is
9993 arbitrary and could potentially be improved with analysis. */
9994 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
9995 count *= 50; /* FIXME */
9997 retval = (unsigned) (count * stmt_cost);
9998 cost[where] += retval;
10001 return retval;
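/* A rough worked example of the hook above: a vector_load statement in the
   body of an inner loop, with COUNT == 1 and (hypothetically) a
   vec_align_load_cost of 1, is accumulated as 1 * 50 * 1 = 50 units into
   COST[vect_body] because of the inner-loop weighting, whereas the same
   statement in the outer loop body would contribute just 1 unit.  */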
10004 static void initialize_aarch64_code_model (struct gcc_options *);
10006 /* Parse the TO_PARSE string and put the architecture struct that it
10007 selects into RES and the architectural features into ISA_FLAGS.
10008 Return an aarch64_parse_opt_result describing the parse result.
10009 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10011 static enum aarch64_parse_opt_result
10012 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10013 unsigned long *isa_flags)
10015 char *ext;
10016 const struct processor *arch;
10017 char *str = (char *) alloca (strlen (to_parse) + 1);
10018 size_t len;
10020 strcpy (str, to_parse);
10022 ext = strchr (str, '+');
10024 if (ext != NULL)
10025 len = ext - str;
10026 else
10027 len = strlen (str);
10029 if (len == 0)
10030 return AARCH64_PARSE_MISSING_ARG;
10033 /* Loop through the list of supported ARCHes to find a match. */
10034 for (arch = all_architectures; arch->name != NULL; arch++)
10036 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10038 unsigned long isa_temp = arch->flags;
10040 if (ext != NULL)
10042 /* TO_PARSE string contains at least one extension. */
10043 enum aarch64_parse_opt_result ext_res
10044 = aarch64_parse_extension (ext, &isa_temp);
10046 if (ext_res != AARCH64_PARSE_OK)
10047 return ext_res;
10049 /* Extension parsing was successful. Confirm the result
10050 arch and ISA flags. */
10051 *res = arch;
10052 *isa_flags = isa_temp;
10053 return AARCH64_PARSE_OK;
10057 /* ARCH name not found in list. */
10058 return AARCH64_PARSE_INVALID_ARG;
10061 /* Parse the TO_PARSE string and put the result tuning in RES and the
10062 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10063 describing the parse result. If there is an error parsing, RES and
10064 ISA_FLAGS are left unchanged. */
10066 static enum aarch64_parse_opt_result
10067 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10068 unsigned long *isa_flags)
10070 char *ext;
10071 const struct processor *cpu;
10072 char *str = (char *) alloca (strlen (to_parse) + 1);
10073 size_t len;
10075 strcpy (str, to_parse);
10077 ext = strchr (str, '+');
10079 if (ext != NULL)
10080 len = ext - str;
10081 else
10082 len = strlen (str);
10084 if (len == 0)
10085 return AARCH64_PARSE_MISSING_ARG;
10088 /* Loop through the list of supported CPUs to find a match. */
10089 for (cpu = all_cores; cpu->name != NULL; cpu++)
10091 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10093 unsigned long isa_temp = cpu->flags;
10096 if (ext != NULL)
10098 /* TO_PARSE string contains at least one extension. */
10099 enum aarch64_parse_opt_result ext_res
10100 = aarch64_parse_extension (ext, &isa_temp);
10102 if (ext_res != AARCH64_PARSE_OK)
10103 return ext_res;
10105 /* Extension parsing was successful. Confirm the result
10106 cpu and ISA flags. */
10107 *res = cpu;
10108 *isa_flags = isa_temp;
10109 return AARCH64_PARSE_OK;
10113 /* CPU name not found in list. */
10114 return AARCH64_PARSE_INVALID_ARG;
10117 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10118 Return an aarch64_parse_opt_result describing the parse result.
10119 If the parsing fails, RES is left unchanged. */
10121 static enum aarch64_parse_opt_result
10122 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10124 const struct processor *cpu;
10125 char *str = (char *) alloca (strlen (to_parse) + 1);
10127 strcpy (str, to_parse);
10129 /* Loop through the list of supported CPUs to find a match. */
10130 for (cpu = all_cores; cpu->name != NULL; cpu++)
10132 if (strcmp (cpu->name, str) == 0)
10134 *res = cpu;
10135 return AARCH64_PARSE_OK;
10139 /* CPU name not found in list. */
10140 return AARCH64_PARSE_INVALID_ARG;
10143 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10144 described in FLAG. If it is, return the index bit for that fusion type.
10145 If not, error (printing OPTION_NAME) and return zero. */
10147 static unsigned int
10148 aarch64_parse_one_option_token (const char *token,
10149 size_t length,
10150 const struct aarch64_flag_desc *flag,
10151 const char *option_name)
10153 for (; flag->name != NULL; flag++)
10155 if (length == strlen (flag->name)
10156 && !strncmp (flag->name, token, length))
10157 return flag->flag;
10160 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10161 return 0;
10164 /* Parse OPTION which is a comma-separated list of flags to enable.
10165 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10166 default state we inherit from the CPU tuning structures. OPTION_NAME
10167 gives the top-level option we are parsing in the -moverride string,
10168 for use in error messages. */
10170 static unsigned int
10171 aarch64_parse_boolean_options (const char *option,
10172 const struct aarch64_flag_desc *flags,
10173 unsigned int initial_state,
10174 const char *option_name)
10176 const char separator = '.';
10177 const char* specs = option;
10178 const char* ntoken = option;
10179 unsigned int found_flags = initial_state;
10181 while ((ntoken = strchr (specs, separator)))
10183 size_t token_length = ntoken - specs;
10184 unsigned token_ops = aarch64_parse_one_option_token (specs,
10185 token_length,
10186 flags,
10187 option_name);
10188 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10189 in the token stream, reset the supported operations. So:
10191 adrp+add.cmp+branch.none.adrp+add
10193 would have the result of turning on only adrp+add fusion. */
10194 if (!token_ops)
10195 found_flags = 0;
10197 found_flags |= token_ops;
10198 specs = ++ntoken;
10201 /* The string ended with a trailing separator; that is ill-formed. */
10202 if (!(*specs))
10204 error ("%s string ill-formed\n", option_name);
10205 return 0;
10208 /* We still have one more token to parse. */
10209 size_t token_length = strlen (specs);
10210 unsigned token_ops = aarch64_parse_one_option_token (specs,
10211 token_length,
10212 flags,
10213 option_name);
10214 if (!token_ops)
10215 found_flags = 0;
10217 found_flags |= token_ops;
10218 return found_flags;
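/* For instance, parsing "adrp+add.cmp+branch" with an INITIAL_STATE of 0
   ORs together the index bits of those two fusion pairs, whereas a string
   such as "none.adrp+add" first resets the inherited state and then enables
   adrp+add alone, matching the example in the comment above.  */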
10221 /* Support for overriding instruction fusion. */
10223 static void
10224 aarch64_parse_fuse_string (const char *fuse_string,
10225 struct tune_params *tune)
10227 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10228 aarch64_fusible_pairs,
10229 tune->fusible_ops,
10230 "fuse=");
10233 /* Support for overriding other tuning flags. */
10235 static void
10236 aarch64_parse_tune_string (const char *tune_string,
10237 struct tune_params *tune)
10239 tune->extra_tuning_flags
10240 = aarch64_parse_boolean_options (tune_string,
10241 aarch64_tuning_flags,
10242 tune->extra_tuning_flags,
10243 "tune=");
10246 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10247 we understand. If it is, extract the option string and hand it off to
10248 the appropriate function. */
10250 void
10251 aarch64_parse_one_override_token (const char* token,
10252 size_t length,
10253 struct tune_params *tune)
10255 const struct aarch64_tuning_override_function *fn
10256 = aarch64_tuning_override_functions;
10258 const char *option_part = strchr (token, '=');
10259 if (!option_part)
10261 error ("tuning string missing in option (%s)", token);
10262 return;
10265 /* Get the length of the option name. */
10266 length = option_part - token;
10267 /* Skip the '=' to get to the option string. */
10268 option_part++;
10270 for (; fn->name != NULL; fn++)
10272 if (!strncmp (fn->name, token, length))
10274 fn->parse_override (option_part, tune);
10275 return;
10279 error ("unknown tuning option (%s)", token);
10280 return;
10283 /* Validate and clamp the TLS offset size for the selected code model. */
10285 static void
10286 initialize_aarch64_tls_size (struct gcc_options *opts)
10288 if (aarch64_tls_size == 0)
10289 aarch64_tls_size = 24;
10291 switch (opts->x_aarch64_cmodel_var)
10293 case AARCH64_CMODEL_TINY:
10294 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
10295 needs two instructions to address, so we clamp the size to 24. */
10296 if (aarch64_tls_size > 24)
10297 aarch64_tls_size = 24;
10298 break;
10299 case AARCH64_CMODEL_SMALL:
10300 /* The maximum TLS size allowed under small is 4G. */
10301 if (aarch64_tls_size > 32)
10302 aarch64_tls_size = 32;
10303 break;
10304 case AARCH64_CMODEL_LARGE:
10305 /* The maximum TLS size allowed under large is 16E.
10306 FIXME: 16E requires a 64-bit offset, but we only support a 48-bit offset for now. */
10307 if (aarch64_tls_size > 48)
10308 aarch64_tls_size = 48;
10309 break;
10310 default:
10311 gcc_unreachable ();
10314 return;
10317 /* Parse STRING looking for options in the format:
10318 string :: option:string
10319 option :: name=substring
10320 name :: {a-z}
10321 substring :: defined by option. */
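/* For example, an override string of the (illustrative) form

     -moverride=fuse=adrp+add.cmp+branch:tune=rename_fma_regs

   is split on ':' below, and each "name=value" token is dispatched through
   aarch64_parse_one_override_token to aarch64_parse_fuse_string or
   aarch64_parse_tune_string; the flag names themselves must of course be
   entries in the fusible-pair and tuning-flag tables.  */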
10323 static void
10324 aarch64_parse_override_string (const char* input_string,
10325 struct tune_params* tune)
10327 const char separator = ':';
10328 size_t string_length = strlen (input_string) + 1;
10329 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10330 char *string = string_root;
10331 strncpy (string, input_string, string_length);
10332 string[string_length - 1] = '\0';
10334 char* ntoken = string;
10336 while ((ntoken = strchr (string, separator)))
10338 size_t token_length = ntoken - string;
10339 /* NUL-terminate this substring so it is a string in its own right. */
10340 *ntoken = '\0';
10341 aarch64_parse_one_override_token (string, token_length, tune);
10342 string = ++ntoken;
10345 /* One last option to parse. */
10346 aarch64_parse_one_override_token (string, strlen (string), tune);
10347 free (string_root);
10351 static void
10352 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10354 /* PR 70044: We have to be careful about being called multiple times for the
10355 same function. This means all changes should be repeatable. */
10357 /* If the frame pointer is enabled, set it to a special value that behaves
10358 similar to frame pointer omission. If we don't do this all leaf functions
10359 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10360 If flag_omit_frame_pointer has this special value, we must force the
10361 frame pointer if not in a leaf function. We also need to force it in a
10362 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10363 if (opts->x_flag_omit_frame_pointer == 0)
10364 opts->x_flag_omit_frame_pointer = 2;
10366 /* If not optimizing for size, set the default
10367 alignment to what the target wants. */
10368 if (!opts->x_optimize_size)
10370 if (opts->x_align_loops <= 0)
10371 opts->x_align_loops = aarch64_tune_params.loop_align;
10372 if (opts->x_align_jumps <= 0)
10373 opts->x_align_jumps = aarch64_tune_params.jump_align;
10374 if (opts->x_align_functions <= 0)
10375 opts->x_align_functions = aarch64_tune_params.function_align;
10378 /* We default to no pc-relative literal loads. */
10380 aarch64_pcrelative_literal_loads = false;
10382 /* If -mpc-relative-literal-loads is set on the command line, this
10383 implies that the user asked for PC relative literal loads. */
10384 if (opts->x_pcrelative_literal_loads == 1)
10385 aarch64_pcrelative_literal_loads = true;
10387 /* In the tiny memory model it makes no sense to disallow PC relative
10388 literal pool loads. */
10389 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10390 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10391 aarch64_pcrelative_literal_loads = true;
10393 /* When enabling the lower precision Newton series for the square root, also
10394 enable it for the reciprocal square root, since the latter is an
10395 intermediate step for the former. */
10396 if (flag_mlow_precision_sqrt)
10397 flag_mrecip_low_precision_sqrt = true;
10400 /* 'Unpack' the internal tuning structs and update the options
10401 in OPTS. The caller must have set up selected_tune and selected_arch
10402 as all the other target-specific codegen decisions are
10403 derived from them. */
10405 void
10406 aarch64_override_options_internal (struct gcc_options *opts)
10408 aarch64_tune_flags = selected_tune->flags;
10409 aarch64_tune = selected_tune->sched_core;
10410 /* Make a copy of the tuning parameters attached to the core, which
10411 we may later overwrite. */
10412 aarch64_tune_params = *(selected_tune->tune);
10413 aarch64_architecture_version = selected_arch->architecture_version;
10415 if (opts->x_aarch64_override_tune_string)
10416 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10417 &aarch64_tune_params);
10419 /* This target defaults to strict volatile bitfields. */
10420 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10421 opts->x_flag_strict_volatile_bitfields = 1;
10423 initialize_aarch64_code_model (opts);
10424 initialize_aarch64_tls_size (opts);
10426 int queue_depth = 0;
10427 switch (aarch64_tune_params.autoprefetcher_model)
10429 case tune_params::AUTOPREFETCHER_OFF:
10430 queue_depth = -1;
10431 break;
10432 case tune_params::AUTOPREFETCHER_WEAK:
10433 queue_depth = 0;
10434 break;
10435 case tune_params::AUTOPREFETCHER_STRONG:
10436 queue_depth = max_insn_queue_index + 1;
10437 break;
10438 default:
10439 gcc_unreachable ();
10442 /* We don't mind passing in global_options_set here as we don't use
10443 the *options_set structs anyway. */
10444 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10445 queue_depth,
10446 opts->x_param_values,
10447 global_options_set.x_param_values);
10449 /* Set up parameters to be used in prefetching algorithm. Do not
10450 override the defaults unless we are tuning for a core we have
10451 researched values for. */
10452 if (aarch64_tune_params.prefetch->num_slots > 0)
10453 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10454 aarch64_tune_params.prefetch->num_slots,
10455 opts->x_param_values,
10456 global_options_set.x_param_values);
10457 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10458 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10459 aarch64_tune_params.prefetch->l1_cache_size,
10460 opts->x_param_values,
10461 global_options_set.x_param_values);
10462 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10463 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10464 aarch64_tune_params.prefetch->l1_cache_line_size,
10465 opts->x_param_values,
10466 global_options_set.x_param_values);
10467 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10468 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10469 aarch64_tune_params.prefetch->l2_cache_size,
10470 opts->x_param_values,
10471 global_options_set.x_param_values);
10473 /* Use the alternative scheduling-pressure algorithm by default. */
10474 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10475 opts->x_param_values,
10476 global_options_set.x_param_values);
10478 /* Enable software prefetching at the specified optimization level for
10479 CPUs that have a prefetch model. Lower the optimization level threshold by 1
10480 when profiling is enabled. */
10481 if (opts->x_flag_prefetch_loop_arrays < 0
10482 && !opts->x_optimize_size
10483 && aarch64_tune_params.prefetch->default_opt_level >= 0
10484 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10485 opts->x_flag_prefetch_loop_arrays = 1;
10487 aarch64_override_options_after_change_1 (opts);
10490 /* Print a hint with a suggestion for a core or architecture name that
10491 most closely resembles what the user passed in STR. ARCH is true if
10492 the user is asking for an architecture name. ARCH is false if the user
10493 is asking for a core name. */
10495 static void
10496 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10498 auto_vec<const char *> candidates;
10499 const struct processor *entry = arch ? all_architectures : all_cores;
10500 for (; entry->name != NULL; entry++)
10501 candidates.safe_push (entry->name);
10502 char *s;
10503 const char *hint = candidates_list_and_hint (str, s, candidates);
10504 if (hint)
10505 inform (input_location, "valid arguments are: %s;"
10506 " did you mean %qs?", s, hint);
10507 XDELETEVEC (s);
10510 /* Print a hint with a suggestion for a core name that most closely resembles
10511 what the user passed in STR. */
10513 inline static void
10514 aarch64_print_hint_for_core (const char *str)
10516 aarch64_print_hint_for_core_or_arch (str, false);
10519 /* Print a hint with a suggestion for an architecture name that most closely
10520 resembles what the user passed in STR. */
10522 inline static void
10523 aarch64_print_hint_for_arch (const char *str)
10525 aarch64_print_hint_for_core_or_arch (str, true);
10528 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10529 specified in STR and throw errors if appropriate. Put the results,
10530 if they are valid, in RES and ISA_FLAGS. Return whether the option is
10531 valid. */
10533 static bool
10534 aarch64_validate_mcpu (const char *str, const struct processor **res,
10535 unsigned long *isa_flags)
10537 enum aarch64_parse_opt_result parse_res
10538 = aarch64_parse_cpu (str, res, isa_flags);
10540 if (parse_res == AARCH64_PARSE_OK)
10541 return true;
10543 switch (parse_res)
10545 case AARCH64_PARSE_MISSING_ARG:
10546 error ("missing cpu name in %<-mcpu=%s%>", str);
10547 break;
10548 case AARCH64_PARSE_INVALID_ARG:
10549 error ("unknown value %qs for -mcpu", str);
10550 aarch64_print_hint_for_core (str);
10551 break;
10552 case AARCH64_PARSE_INVALID_FEATURE:
10553 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10554 break;
10555 default:
10556 gcc_unreachable ();
10559 return false;
10562 /* Validate a command-line -march option. Parse the arch and extensions
10563 (if any) specified in STR and throw errors if appropriate. Put the
10564 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10565 option is valid. */
10567 static bool
10568 aarch64_validate_march (const char *str, const struct processor **res,
10569 unsigned long *isa_flags)
10571 enum aarch64_parse_opt_result parse_res
10572 = aarch64_parse_arch (str, res, isa_flags);
10574 if (parse_res == AARCH64_PARSE_OK)
10575 return true;
10577 switch (parse_res)
10579 case AARCH64_PARSE_MISSING_ARG:
10580 error ("missing arch name in %<-march=%s%>", str);
10581 break;
10582 case AARCH64_PARSE_INVALID_ARG:
10583 error ("unknown value %qs for -march", str);
10584 aarch64_print_hint_for_arch (str);
10585 break;
10586 case AARCH64_PARSE_INVALID_FEATURE:
10587 error ("invalid feature modifier in %<-march=%s%>", str);
10588 break;
10589 default:
10590 gcc_unreachable ();
10593 return false;
10596 /* Validate a command-line -mtune option. Parse the cpu
10597 specified in STR and throw errors if appropriate. Put the
10598 result, if it is valid, in RES. Return whether the option is
10599 valid. */
10601 static bool
10602 aarch64_validate_mtune (const char *str, const struct processor **res)
10604 enum aarch64_parse_opt_result parse_res
10605 = aarch64_parse_tune (str, res);
10607 if (parse_res == AARCH64_PARSE_OK)
10608 return true;
10610 switch (parse_res)
10612 case AARCH64_PARSE_MISSING_ARG:
10613 error ("missing cpu name in %<-mtune=%s%>", str);
10614 break;
10615 case AARCH64_PARSE_INVALID_ARG:
10616 error ("unknown value %qs for -mtune", str);
10617 aarch64_print_hint_for_core (str);
10618 break;
10619 default:
10620 gcc_unreachable ();
10622 return false;
10625 /* Return the CPU corresponding to the enum CPU.
10626 If it doesn't specify a cpu, return the default. */
10628 static const struct processor *
10629 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10631 if (cpu != aarch64_none)
10632 return &all_cores[cpu];
10634 /* The & 0x3f is to extract the bottom 6 bits that encode the
10635 default cpu as selected by the --with-cpu GCC configure option
10636 in config.gcc.
10637 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10638 flags mechanism should be reworked to make it more sane. */
10639 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
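/* That is, TARGET_CPU_DEFAULT is assumed to be packed as
   (default_cpu_index | (default_isa_flags << 6)): the low six bits index
   all_cores here, and aarch64_override_options recovers the ISA flag bits
   with TARGET_CPU_DEFAULT >> 6.  */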
10642 /* Return the architecture corresponding to the enum ARCH.
10643 If it doesn't specify a valid architecture, return the default. */
10645 static const struct processor *
10646 aarch64_get_arch (enum aarch64_arch arch)
10648 if (arch != aarch64_no_arch)
10649 return &all_architectures[arch];
10651 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10653 return &all_architectures[cpu->arch];
10656 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10658 static poly_uint16
10659 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10661 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10662 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10663 deciding which .md file patterns to use and when deciding whether
10664 something is a legitimate address or constant. */
10665 if (value == SVE_SCALABLE || value == SVE_128)
10666 return poly_uint16 (2, 2);
10667 else
10668 return (int) value / 64;
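/* For example, -msve-vector-bits=256 yields the constant VG 256 / 64 = 4
   (four 64-bit granules per vector), while both "scalable" and 128 map to
   the runtime-sized poly_uint16 (2, 2): a minimum of two granules plus two
   more for each additional 128-bit increment of the vector length.  */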
10671 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10672 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10673 tuning structs. In particular it must set selected_tune and
10674 aarch64_isa_flags that define the available ISA features and tuning
10675 decisions. It must also set selected_arch as this will be used to
10676 output the .arch asm tags for each function. */
10678 static void
10679 aarch64_override_options (void)
10681 unsigned long cpu_isa = 0;
10682 unsigned long arch_isa = 0;
10683 aarch64_isa_flags = 0;
10685 bool valid_cpu = true;
10686 bool valid_tune = true;
10687 bool valid_arch = true;
10689 selected_cpu = NULL;
10690 selected_arch = NULL;
10691 selected_tune = NULL;
10693 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10694 If either of -march or -mtune is given, they override their
10695 respective component of -mcpu. */
10696 if (aarch64_cpu_string)
10697 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10698 &cpu_isa);
10700 if (aarch64_arch_string)
10701 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10702 &arch_isa);
10704 if (aarch64_tune_string)
10705 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10707 /* If the user did not specify a processor, choose the default
10708 one for them. This will be the CPU set during configuration using
10709 --with-cpu, otherwise it is "generic". */
10710 if (!selected_cpu)
10712 if (selected_arch)
10714 selected_cpu = &all_cores[selected_arch->ident];
10715 aarch64_isa_flags = arch_isa;
10716 explicit_arch = selected_arch->arch;
10718 else
10720 /* Get default configure-time CPU. */
10721 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10722 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10725 if (selected_tune)
10726 explicit_tune_core = selected_tune->ident;
10728 /* If both -mcpu and -march are specified, check that they are architecturally
10729 compatible; warn if they're not, and prefer the -march ISA flags. */
10730 else if (selected_arch)
10732 if (selected_arch->arch != selected_cpu->arch)
10734 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10735 all_architectures[selected_cpu->arch].name,
10736 selected_arch->name);
10738 aarch64_isa_flags = arch_isa;
10739 explicit_arch = selected_arch->arch;
10740 explicit_tune_core = selected_tune ? selected_tune->ident
10741 : selected_cpu->ident;
10743 else
10745 /* -mcpu but no -march. */
10746 aarch64_isa_flags = cpu_isa;
10747 explicit_tune_core = selected_tune ? selected_tune->ident
10748 : selected_cpu->ident;
10749 gcc_assert (selected_cpu);
10750 selected_arch = &all_architectures[selected_cpu->arch];
10751 explicit_arch = selected_arch->arch;
10754 /* Set the arch as well, as we will need it when outputting
10755 the .arch directive in assembly. */
10756 if (!selected_arch)
10758 gcc_assert (selected_cpu);
10759 selected_arch = &all_architectures[selected_cpu->arch];
10762 if (!selected_tune)
10763 selected_tune = selected_cpu;
10765 #ifndef HAVE_AS_MABI_OPTION
10766 /* The compiler may have been configured with 2.23.* binutils, which does
10767 not have support for ILP32. */
10768 if (TARGET_ILP32)
10769 error ("assembler does not support -mabi=ilp32");
10770 #endif
10772 /* Convert -msve-vector-bits to a VG count. */
10773 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10775 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10776 sorry ("return address signing is only supported for -mabi=lp64");
10778 /* Make sure we properly set up the explicit options. */
10779 if ((aarch64_cpu_string && valid_cpu)
10780 || (aarch64_tune_string && valid_tune))
10781 gcc_assert (explicit_tune_core != aarch64_none);
10783 if ((aarch64_cpu_string && valid_cpu)
10784 || (aarch64_arch_string && valid_arch))
10785 gcc_assert (explicit_arch != aarch64_no_arch);
10787 aarch64_override_options_internal (&global_options);
10789 /* Save these options as the default ones in case we push and pop them later
10790 while processing functions with potential target attributes. */
10791 target_option_default_node = target_option_current_node
10792 = build_target_option_node (&global_options);
10795 /* Implement targetm.override_options_after_change. */
10797 static void
10798 aarch64_override_options_after_change (void)
10800 aarch64_override_options_after_change_1 (&global_options);
10803 static struct machine_function *
10804 aarch64_init_machine_status (void)
10806 struct machine_function *machine;
10807 machine = ggc_cleared_alloc<machine_function> ();
10808 return machine;
10811 void
10812 aarch64_init_expanders (void)
10814 init_machine_status = aarch64_init_machine_status;
10817 /* Validate and finalize the code model selection, adjusting it for PIC. */
10818 static void
10819 initialize_aarch64_code_model (struct gcc_options *opts)
10821 if (opts->x_flag_pic)
10823 switch (opts->x_aarch64_cmodel_var)
10825 case AARCH64_CMODEL_TINY:
10826 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10827 break;
10828 case AARCH64_CMODEL_SMALL:
10829 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10830 aarch64_cmodel = (flag_pic == 2
10831 ? AARCH64_CMODEL_SMALL_PIC
10832 : AARCH64_CMODEL_SMALL_SPIC);
10833 #else
10834 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10835 #endif
10836 break;
10837 case AARCH64_CMODEL_LARGE:
10838 sorry ("code model %qs with -f%s", "large",
10839 opts->x_flag_pic > 1 ? "PIC" : "pic");
10840 break;
10841 default:
10842 gcc_unreachable ();
10845 else
10846 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10849 /* Implement TARGET_OPTION_SAVE. */
10851 static void
10852 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10854 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10857 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10858 using the information saved in PTR. */
10860 static void
10861 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
10863 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
10864 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10865 opts->x_explicit_arch = ptr->x_explicit_arch;
10866 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
10867 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
10869 aarch64_override_options_internal (opts);
10872 /* Implement TARGET_OPTION_PRINT. */
10874 static void
10875 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
10877 const struct processor *cpu
10878 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10879 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
10880 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
10881 std::string extension
10882 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
10884 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
10885 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
10886 arch->name, extension.c_str ());
10889 static GTY(()) tree aarch64_previous_fndecl;
10891 void
10892 aarch64_reset_previous_fndecl (void)
10894 aarch64_previous_fndecl = NULL;
10897 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
10898 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
10899 make sure optab availability predicates are recomputed when necessary. */
10901 void
10902 aarch64_save_restore_target_globals (tree new_tree)
10904 if (TREE_TARGET_GLOBALS (new_tree))
10905 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
10906 else if (new_tree == target_option_default_node)
10907 restore_target_globals (&default_target_globals);
10908 else
10909 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
10912 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
10913 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
10914 of the function, if any. This function may be called multiple
10915 times on a single function so use aarch64_previous_fndecl to avoid
10916 setting up identical state. */
10918 static void
10919 aarch64_set_current_function (tree fndecl)
10921 if (!fndecl || fndecl == aarch64_previous_fndecl)
10922 return;
10924 tree old_tree = (aarch64_previous_fndecl
10925 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
10926 : NULL_TREE);
10928 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
10930 /* If current function has no attributes but the previous one did,
10931 use the default node. */
10932 if (!new_tree && old_tree)
10933 new_tree = target_option_default_node;
10935 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
10936 the default have been handled by aarch64_save_restore_target_globals from
10937 aarch64_pragma_target_parse. */
10938 if (old_tree == new_tree)
10939 return;
10941 aarch64_previous_fndecl = fndecl;
10943 /* First set the target options. */
10944 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
10946 aarch64_save_restore_target_globals (new_tree);
10949 /* Enum describing the various ways we can handle attributes.
10950 In many cases we can reuse the generic option handling machinery. */
10952 enum aarch64_attr_opt_type
10954 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
10955 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
10956 aarch64_attr_enum, /* Attribute sets an enum variable. */
10957 aarch64_attr_custom /* Attribute requires a custom handling function. */
10960 /* All the information needed to handle a target attribute.
10961 NAME is the name of the attribute.
10962 ATTR_TYPE specifies the type of behavior of the attribute as described
10963 in the definition of enum aarch64_attr_opt_type.
10964 ALLOW_NEG is true if the attribute supports a "no-" form.
10965 HANDLER is the function that takes the attribute string as an argument.
10966 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
10967 OPT_NUM is the enum specifying the option that the attribute modifies.
10968 This is needed for attributes that mirror the behavior of a command-line
10969 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
10970 aarch64_attr_enum. */
10972 struct aarch64_attribute_info
10974 const char *name;
10975 enum aarch64_attr_opt_type attr_type;
10976 bool allow_neg;
10977 bool (*handler) (const char *);
10978 enum opt_code opt_num;
10981 /* Handle the ARCH_STR argument to the arch= target attribute. */
10983 static bool
10984 aarch64_handle_attr_arch (const char *str)
10986 const struct processor *tmp_arch = NULL;
10987 enum aarch64_parse_opt_result parse_res
10988 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
10990 if (parse_res == AARCH64_PARSE_OK)
10992 gcc_assert (tmp_arch);
10993 selected_arch = tmp_arch;
10994 explicit_arch = selected_arch->arch;
10995 return true;
10998 switch (parse_res)
11000 case AARCH64_PARSE_MISSING_ARG:
11001 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11002 break;
11003 case AARCH64_PARSE_INVALID_ARG:
11004 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11005 aarch64_print_hint_for_arch (str);
11006 break;
11007 case AARCH64_PARSE_INVALID_FEATURE:
11008 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11009 break;
11010 default:
11011 gcc_unreachable ();
11014 return false;
11017 /* Handle the argument CPU_STR to the cpu= target attribute. */
11019 static bool
11020 aarch64_handle_attr_cpu (const char *str)
11022 const struct processor *tmp_cpu = NULL;
11023 enum aarch64_parse_opt_result parse_res
11024 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11026 if (parse_res == AARCH64_PARSE_OK)
11028 gcc_assert (tmp_cpu);
11029 selected_tune = tmp_cpu;
11030 explicit_tune_core = selected_tune->ident;
11032 selected_arch = &all_architectures[tmp_cpu->arch];
11033 explicit_arch = selected_arch->arch;
11034 return true;
11037 switch (parse_res)
11039 case AARCH64_PARSE_MISSING_ARG:
11040 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11041 break;
11042 case AARCH64_PARSE_INVALID_ARG:
11043 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11044 aarch64_print_hint_for_core (str);
11045 break;
11046 case AARCH64_PARSE_INVALID_FEATURE:
11047 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11048 break;
11049 default:
11050 gcc_unreachable ();
11053 return false;
11056 /* Handle the argument STR to the tune= target attribute. */
11058 static bool
11059 aarch64_handle_attr_tune (const char *str)
11061 const struct processor *tmp_tune = NULL;
11062 enum aarch64_parse_opt_result parse_res
11063 = aarch64_parse_tune (str, &tmp_tune);
11065 if (parse_res == AARCH64_PARSE_OK)
11067 gcc_assert (tmp_tune);
11068 selected_tune = tmp_tune;
11069 explicit_tune_core = selected_tune->ident;
11070 return true;
11073 switch (parse_res)
11075 case AARCH64_PARSE_INVALID_ARG:
11076 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11077 aarch64_print_hint_for_core (str);
11078 break;
11079 default:
11080 gcc_unreachable ();
11083 return false;
11086 /* Parse an architecture extensions target attribute string specified in STR.
11087 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11088 if successful. Update aarch64_isa_flags to reflect the ISA features
11089 modified. */
11091 static bool
11092 aarch64_handle_attr_isa_flags (char *str)
11094 enum aarch64_parse_opt_result parse_res;
11095 unsigned long isa_flags = aarch64_isa_flags;
11097 /* We allow "+nothing" in the beginning to clear out all architectural
11098 features if the user wants to handpick specific features. */
11099 if (strncmp ("+nothing", str, 8) == 0)
11101 isa_flags = 0;
11102 str += 8;
11105 parse_res = aarch64_parse_extension (str, &isa_flags);
11107 if (parse_res == AARCH64_PARSE_OK)
11109 aarch64_isa_flags = isa_flags;
11110 return true;
11113 switch (parse_res)
11115 case AARCH64_PARSE_MISSING_ARG:
11116 error ("missing value in %<target()%> pragma or attribute");
11117 break;
11119 case AARCH64_PARSE_INVALID_FEATURE:
11120 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11121 break;
11123 default:
11124 gcc_unreachable ();
11127 return false;
11130 /* The target attributes that we support. On top of these we also support just
11131 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11132 handled explicitly in aarch64_process_one_target_attr. */
11134 static const struct aarch64_attribute_info aarch64_attributes[] =
11136 { "general-regs-only", aarch64_attr_mask, false, NULL,
11137 OPT_mgeneral_regs_only },
11138 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11139 OPT_mfix_cortex_a53_835769 },
11140 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11141 OPT_mfix_cortex_a53_843419 },
11142 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11143 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11144 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11145 OPT_momit_leaf_frame_pointer },
11146 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11147 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11148 OPT_march_ },
11149 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11150 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11151 OPT_mtune_ },
11152 { "sign-return-address", aarch64_attr_enum, false, NULL,
11153 OPT_msign_return_address_ },
11154 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11157 /* Parse ARG_STR which contains the definition of one target attribute.
11158 Show appropriate errors if any or return true if the attribute is valid. */
11160 static bool
11161 aarch64_process_one_target_attr (char *arg_str)
11163 bool invert = false;
11165 size_t len = strlen (arg_str);
11167 if (len == 0)
11169 error ("malformed %<target()%> pragma or attribute");
11170 return false;
11173 char *str_to_check = (char *) alloca (len + 1);
11174 strcpy (str_to_check, arg_str);
11176 /* Skip leading whitespace. */
11177 while (*str_to_check == ' ' || *str_to_check == '\t')
11178 str_to_check++;
11180 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11181 It is easier to detect and handle it explicitly here rather than going
11182 through the machinery for the rest of the target attributes in this
11183 function. */
11184 if (*str_to_check == '+')
11185 return aarch64_handle_attr_isa_flags (str_to_check);
11187 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11189 invert = true;
11190 str_to_check += 3;
11192 char *arg = strchr (str_to_check, '=');
11194 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11195 and point ARG to "foo". */
11196 if (arg)
11198 *arg = '\0';
11199 arg++;
11201 const struct aarch64_attribute_info *p_attr;
11202 bool found = false;
11203 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11205 /* If the names don't match up, or the user has given an argument
11206 to an attribute that doesn't accept one, or didn't give an argument
11207 to an attribute that expects one, fail to match. */
11208 if (strcmp (str_to_check, p_attr->name) != 0)
11209 continue;
11211 found = true;
11212 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11213 || p_attr->attr_type == aarch64_attr_enum;
11215 if (attr_need_arg_p ^ (arg != NULL))
11217 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11218 return false;
11221 /* If the name matches but the attribute does not allow "no-" versions
11222 then we can't match. */
11223 if (invert && !p_attr->allow_neg)
11225 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11226 return false;
11229 switch (p_attr->attr_type)
11231 /* Has a custom handler registered.
11232 For example, cpu=, arch=, tune=. */
11233 case aarch64_attr_custom:
11234 gcc_assert (p_attr->handler);
11235 if (!p_attr->handler (arg))
11236 return false;
11237 break;
11239 /* Either set or unset a boolean option. */
11240 case aarch64_attr_bool:
11242 struct cl_decoded_option decoded;
11244 generate_option (p_attr->opt_num, NULL, !invert,
11245 CL_TARGET, &decoded);
11246 aarch64_handle_option (&global_options, &global_options_set,
11247 &decoded, input_location);
11248 break;
11250 /* Set or unset a bit in the target_flags. aarch64_handle_option
11251 should know what mask to apply given the option number. */
11252 case aarch64_attr_mask:
11254 struct cl_decoded_option decoded;
11255 /* We only need to specify the option number.
11256 aarch64_handle_option will know which mask to apply. */
11257 decoded.opt_index = p_attr->opt_num;
11258 decoded.value = !invert;
11259 aarch64_handle_option (&global_options, &global_options_set,
11260 &decoded, input_location);
11261 break;
11263 /* Use the option setting machinery to set an option to an enum. */
11264 case aarch64_attr_enum:
11266 gcc_assert (arg);
11267 bool valid;
11268 int value;
11269 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11270 &value, CL_TARGET);
11271 if (valid)
11273 set_option (&global_options, NULL, p_attr->opt_num, value,
11274 NULL, DK_UNSPECIFIED, input_location,
11275 global_dc);
11277 else
11279 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11281 break;
11283 default:
11284 gcc_unreachable ();
11288 /* If we reached here we either have found an attribute and validated
11289 it or didn't match any. If we matched an attribute but its arguments
11290 were malformed we will have returned false already. */
11291 return found;
11294 /* Count how many times the character C appears in
11295 NULL-terminated string STR. */
11297 static unsigned int
11298 num_occurences_in_str (char c, char *str)
11300 unsigned int res = 0;
11301 while (*str != '\0')
11303 if (*str == c)
11304 res++;
11306 str++;
11309 return res;
11312 /* Parse the tree in ARGS that contains the target attribute information
11313 and update the global target options space. */
11315 bool
11316 aarch64_process_target_attr (tree args)
11318 if (TREE_CODE (args) == TREE_LIST)
11322 tree head = TREE_VALUE (args);
11323 if (head)
11325 if (!aarch64_process_target_attr (head))
11326 return false;
11328 args = TREE_CHAIN (args);
11329 } while (args);
11331 return true;
11334 if (TREE_CODE (args) != STRING_CST)
11336 error ("attribute %<target%> argument not a string");
11337 return false;
11340 size_t len = strlen (TREE_STRING_POINTER (args));
11341 char *str_to_check = (char *) alloca (len + 1);
11342 strcpy (str_to_check, TREE_STRING_POINTER (args));
11344 if (len == 0)
11346 error ("malformed %<target()%> pragma or attribute");
11347 return false;
11350 /* Used to catch empty tokens between commas, i.e.
11351 attribute ((target ("attr1,,attr2"))). */
11352 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11354 /* Handle multiple target attributes separated by ','. */
11355 char *token = strtok (str_to_check, ",");
11357 unsigned int num_attrs = 0;
11358 while (token)
11360 num_attrs++;
11361 if (!aarch64_process_one_target_attr (token))
11363 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11364 return false;
11367 token = strtok (NULL, ",");
11370 if (num_attrs != num_commas + 1)
11372 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11373 return false;
11376 return true;
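/* As an illustrative use (assuming the architecture name is one the build
   supports), a declaration carrying

     __attribute__ ((target ("arch=armv8.1-a,fix-cortex-a53-835769")))

   reaches this function as that string; it is split on ',' and each token
   is handled by aarch64_process_one_target_attr, with the comma count check
   above rejecting empty entries such as "attr1,,attr2".  */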
11379 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11380 process attribute ((target ("..."))). */
11382 static bool
11383 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11385 struct cl_target_option cur_target;
11386 bool ret;
11387 tree old_optimize;
11388 tree new_target, new_optimize;
11389 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11391 /* If what we're processing is the current pragma string then the
11392 target option node is already stored in target_option_current_node
11393 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11394 having to re-parse the string. This is especially useful to keep
11395 arm_neon.h compile times down since that header contains a lot
11396 of intrinsics enclosed in pragmas. */
11397 if (!existing_target && args == current_target_pragma)
11399 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11400 return true;
11402 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11404 old_optimize = build_optimization_node (&global_options);
11405 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11407 /* If the function changed the optimization levels as well as setting
11408 target options, start with the optimizations specified. */
11409 if (func_optimize && func_optimize != old_optimize)
11410 cl_optimization_restore (&global_options,
11411 TREE_OPTIMIZATION (func_optimize));
11413 /* Save the current target options to restore at the end. */
11414 cl_target_option_save (&cur_target, &global_options);
11416 /* If fndecl already has some target attributes applied to it, unpack
11417 them so that we add this attribute on top of them, rather than
11418 overwriting them. */
11419 if (existing_target)
11421 struct cl_target_option *existing_options
11422 = TREE_TARGET_OPTION (existing_target);
11424 if (existing_options)
11425 cl_target_option_restore (&global_options, existing_options);
11427 else
11428 cl_target_option_restore (&global_options,
11429 TREE_TARGET_OPTION (target_option_current_node));
11431 ret = aarch64_process_target_attr (args);
11433 /* Set up any additional state. */
11434 if (ret)
11436 aarch64_override_options_internal (&global_options);
11437 /* Initialize SIMD builtins if we haven't already.
11438 Set current_target_pragma to NULL for the duration so that
11439 the builtin initialization code doesn't try to tag the functions
11440 being built with the attributes specified by any current pragma, thus
11441 going into an infinite recursion. */
11442 if (TARGET_SIMD)
11444 tree saved_current_target_pragma = current_target_pragma;
11445 current_target_pragma = NULL;
11446 aarch64_init_simd_builtins ();
11447 current_target_pragma = saved_current_target_pragma;
11449 new_target = build_target_option_node (&global_options);
11451 else
11452 new_target = NULL;
11454 new_optimize = build_optimization_node (&global_options);
11456 if (fndecl && ret)
11458 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11460 if (old_optimize != new_optimize)
11461 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11464 cl_target_option_restore (&global_options, &cur_target);
11466 if (old_optimize != new_optimize)
11467 cl_optimization_restore (&global_options,
11468 TREE_OPTIMIZATION (old_optimize));
11469 return ret;
11472 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11473 tri-bool options (yes, no, don't care) and the default value is
11474 DEF, determine whether to reject inlining. */
11476 static bool
11477 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11478 int dont_care, int def)
11480 /* If the callee doesn't care, always allow inlining. */
11481 if (callee == dont_care)
11482 return true;
11484 /* If the caller doesn't care, always allow inlining. */
11485 if (caller == dont_care)
11486 return true;
11488 /* Otherwise, allow inlining if either the callee and caller values
11489 agree, or if the callee is using the default value. */
11490 return (callee == caller || callee == def);
11493 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11494 to inline CALLEE into CALLER based on target-specific info.
11495 Make sure that the caller and callee have compatible architectural
11496 features. Then go through the other possible target attributes
11497 and see if they can block inlining. Try not to reject always_inline
11498 callees unless they are incompatible architecturally. */
11500 static bool
11501 aarch64_can_inline_p (tree caller, tree callee)
11503 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11504 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11506 /* If callee has no option attributes, then it is ok to inline. */
11507 if (!callee_tree)
11508 return true;
11510 struct cl_target_option *caller_opts
11511 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11512 : target_option_default_node);
11514 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11517 /* Callee's ISA flags should be a subset of the caller's. */
11518 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11519 != callee_opts->x_aarch64_isa_flags)
11520 return false;
11522 /* Allow a non-strict-align function to be inlined into a strict-align
11523 one, but not the reverse. */
11524 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11525 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11526 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11527 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11528 return false;
11530 bool always_inline = lookup_attribute ("always_inline",
11531 DECL_ATTRIBUTES (callee));
11533 /* If the architectural features match up and the callee is always_inline
11534 then the other attributes don't matter. */
11535 if (always_inline)
11536 return true;
11538 if (caller_opts->x_aarch64_cmodel_var
11539 != callee_opts->x_aarch64_cmodel_var)
11540 return false;
11542 if (caller_opts->x_aarch64_tls_dialect
11543 != callee_opts->x_aarch64_tls_dialect)
11544 return false;
11546 /* Honour explicit requests to work around errata. */
11547 if (!aarch64_tribools_ok_for_inlining_p (
11548 caller_opts->x_aarch64_fix_a53_err835769,
11549 callee_opts->x_aarch64_fix_a53_err835769,
11550 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11551 return false;
11553 if (!aarch64_tribools_ok_for_inlining_p (
11554 caller_opts->x_aarch64_fix_a53_err843419,
11555 callee_opts->x_aarch64_fix_a53_err843419,
11556 2, TARGET_FIX_ERR_A53_843419))
11557 return false;
11559 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11560 caller and callee and they don't match up, reject inlining. */
11561 if (!aarch64_tribools_ok_for_inlining_p (
11562 caller_opts->x_flag_omit_leaf_frame_pointer,
11563 callee_opts->x_flag_omit_leaf_frame_pointer,
11564 2, 1))
11565 return false;
11567 /* If the callee has specific tuning overrides, respect them. */
11568 if (callee_opts->x_aarch64_override_tune_string != NULL
11569 && caller_opts->x_aarch64_override_tune_string == NULL)
11570 return false;
11572 /* If the user specified tuning override strings for the
11573 caller and callee and they don't match up, reject inlining.
11574 We just do a string compare here, we don't analyze the meaning
11575 of the string, as it would be too costly for little gain. */
11576 if (callee_opts->x_aarch64_override_tune_string
11577 && caller_opts->x_aarch64_override_tune_string
11578 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11579 caller_opts->x_aarch64_override_tune_string) != 0))
11580 return false;
11582 return true;
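/* For example, under the ISA-subset rule above a callee carrying
   __attribute__ ((target ("+crc"))) cannot be inlined into a caller compiled
   without the CRC extension, since the callee's ISA flags would not be a
   subset of the caller's, while a callee with no target attribute at all is
   always considered inlinable.  */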
11585 /* Return true if SYMBOL_REF X binds locally. */
11587 static bool
11588 aarch64_symbol_binds_local_p (const_rtx x)
11590 return (SYMBOL_REF_DECL (x)
11591 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11592 : SYMBOL_REF_LOCAL_P (x));
11595 /* Return true if SYMBOL_REF X is thread local. */
11596 static bool
11597 aarch64_tls_symbol_p (rtx x)
11599 if (! TARGET_HAVE_TLS)
11600 return false;
11602 if (GET_CODE (x) != SYMBOL_REF)
11603 return false;
11605 return SYMBOL_REF_TLS_MODEL (x) != 0;
11608 /* Classify a TLS symbol into one of the TLS kinds. */
11609 enum aarch64_symbol_type
11610 aarch64_classify_tls_symbol (rtx x)
11612 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11614 switch (tls_kind)
11616 case TLS_MODEL_GLOBAL_DYNAMIC:
11617 case TLS_MODEL_LOCAL_DYNAMIC:
11618 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11620 case TLS_MODEL_INITIAL_EXEC:
11621 switch (aarch64_cmodel)
11623 case AARCH64_CMODEL_TINY:
11624 case AARCH64_CMODEL_TINY_PIC:
11625 return SYMBOL_TINY_TLSIE;
11626 default:
11627 return SYMBOL_SMALL_TLSIE;
11630 case TLS_MODEL_LOCAL_EXEC:
11631 if (aarch64_tls_size == 12)
11632 return SYMBOL_TLSLE12;
11633 else if (aarch64_tls_size == 24)
11634 return SYMBOL_TLSLE24;
11635 else if (aarch64_tls_size == 32)
11636 return SYMBOL_TLSLE32;
11637 else if (aarch64_tls_size == 48)
11638 return SYMBOL_TLSLE48;
11639 else
11640 gcc_unreachable ();
11642 case TLS_MODEL_EMULATED:
11643 case TLS_MODEL_NONE:
11644 return SYMBOL_FORCE_TO_MEM;
11646 default:
11647 gcc_unreachable ();
11651 /* Return the correct method for accessing X + OFFSET, where X is either
11652 a SYMBOL_REF or LABEL_REF. */
11654 enum aarch64_symbol_type
11655 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11657 if (GET_CODE (x) == LABEL_REF)
11659 switch (aarch64_cmodel)
11661 case AARCH64_CMODEL_LARGE:
11662 return SYMBOL_FORCE_TO_MEM;
11664 case AARCH64_CMODEL_TINY_PIC:
11665 case AARCH64_CMODEL_TINY:
11666 return SYMBOL_TINY_ABSOLUTE;
11668 case AARCH64_CMODEL_SMALL_SPIC:
11669 case AARCH64_CMODEL_SMALL_PIC:
11670 case AARCH64_CMODEL_SMALL:
11671 return SYMBOL_SMALL_ABSOLUTE;
11673 default:
11674 gcc_unreachable ();
11678 if (GET_CODE (x) == SYMBOL_REF)
11680 if (aarch64_tls_symbol_p (x))
11681 return aarch64_classify_tls_symbol (x);
11683 switch (aarch64_cmodel)
11685 case AARCH64_CMODEL_TINY:
11686 /* When we retrieve symbol + offset address, we have to make sure
11687 the offset does not cause overflow of the final address. But
11688 we have no way of knowing the address of the symbol at compile time,
11689 so we can't accurately say whether the distance between the PC and
11690 symbol + offset is outside the addressable range of +/-1M in the
11691 TINY code model. So we rely on images not being greater than
11692 1M, cap the offset at 1M, and require anything beyond that to
11693 be loaded using an alternative mechanism. Furthermore, if the
11694 symbol is a weak reference to something that isn't known to
11695 resolve to a symbol in this module, then force to memory. */
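/* For example, under -mcmodel=tiny an address such as symbol + 0x200000
   fails the range check below and is forced to the literal pool, whereas
   symbol + 4096 can still be formed directly with ADR. */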
11696 if ((SYMBOL_REF_WEAK (x)
11697 && !aarch64_symbol_binds_local_p (x))
11698 || !IN_RANGE (offset, -1048575, 1048575))
11699 return SYMBOL_FORCE_TO_MEM;
11700 return SYMBOL_TINY_ABSOLUTE;
11702 case AARCH64_CMODEL_SMALL:
11703 /* Same reasoning as the tiny code model, but the offset cap here is
11704 4G. */
11705 if ((SYMBOL_REF_WEAK (x)
11706 && !aarch64_symbol_binds_local_p (x))
11707 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11708 HOST_WIDE_INT_C (4294967264)))
11709 return SYMBOL_FORCE_TO_MEM;
11710 return SYMBOL_SMALL_ABSOLUTE;
11712 case AARCH64_CMODEL_TINY_PIC:
11713 if (!aarch64_symbol_binds_local_p (x))
11714 return SYMBOL_TINY_GOT;
11715 return SYMBOL_TINY_ABSOLUTE;
11717 case AARCH64_CMODEL_SMALL_SPIC:
11718 case AARCH64_CMODEL_SMALL_PIC:
11719 if (!aarch64_symbol_binds_local_p (x))
11720 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11721 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11722 return SYMBOL_SMALL_ABSOLUTE;
11724 case AARCH64_CMODEL_LARGE:
11725 /* This is alright even in PIC code as the constant
11726 pool reference is always PC relative and within
11727 the same translation unit. */
11728 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11729 return SYMBOL_SMALL_ABSOLUTE;
11730 else
11731 return SYMBOL_FORCE_TO_MEM;
11733 default:
11734 gcc_unreachable ();
11738 /* By default push everything into the constant pool. */
11739 return SYMBOL_FORCE_TO_MEM;
11742 bool
11743 aarch64_constant_address_p (rtx x)
11745 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11748 bool
11749 aarch64_legitimate_pic_operand_p (rtx x)
11751 if (GET_CODE (x) == SYMBOL_REF
11752 || (GET_CODE (x) == CONST
11753 && GET_CODE (XEXP (x, 0)) == PLUS
11754 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11755 return false;
11757 return true;
11760 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11761 that should be rematerialized rather than spilled. */
11763 static bool
11764 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11766 /* Support CSE and rematerialization of common constants. */
11767 if (CONST_INT_P (x)
11768 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11769 || GET_CODE (x) == CONST_VECTOR)
11770 return true;
11772 /* Do not allow vector struct mode constants for Advanced SIMD.
11773 We could support 0 and -1 easily, but they need support in
11774 aarch64-simd.md. */
11775 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11776 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11777 return false;
11779 /* Only accept variable-length vector constants if they can be
11780 handled directly.
11782 ??? It would be possible to handle rematerialization of other
11783 constants via secondary reloads. */
11784 if (vec_flags & VEC_ANY_SVE)
11785 return aarch64_simd_valid_immediate (x, NULL);
11787 if (GET_CODE (x) == HIGH)
11788 x = XEXP (x, 0);
11790 /* Accept polynomial constants that can be calculated by using the
11791 destination of a move as the sole temporary. Constants that
11792 require a second temporary cannot be rematerialized (they can't be
11793 forced to memory and also aren't legitimate constants). */
11794 poly_int64 offset;
11795 if (poly_int_rtx_p (x, &offset))
11796 return aarch64_offset_temporaries (false, offset) <= 1;
11798 /* If an offset is being added to something else, we need to allow the
11799 base to be moved into the destination register, meaning that there
11800 are no free temporaries for the offset. */
11801 x = strip_offset (x, &offset);
11802 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11803 return false;
11805 /* Do not allow const (plus (anchor_symbol, const_int)). */
11806 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11807 return false;
11809 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11810 so spilling them is better than rematerialization. */
11811 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11812 return true;
11814 /* Label references are always constant. */
11815 if (GET_CODE (x) == LABEL_REF)
11816 return true;
11818 return false;
11822 aarch64_load_tp (rtx target)
11824 if (!target
11825 || GET_MODE (target) != Pmode
11826 || !register_operand (target, Pmode))
11827 target = gen_reg_rtx (Pmode);
11829 /* Can return in any reg. */
11830 emit_insn (gen_aarch64_load_tp_hard (target));
11831 return target;
11834 /* On AAPCS systems, this is the "struct __va_list". */
11835 static GTY(()) tree va_list_type;
11837 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11838 Return the type to use as __builtin_va_list.
11840 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11842 struct __va_list
11844 void *__stack;
11845 void *__gr_top;
11846 void *__vr_top;
11847 int __gr_offs;
11848 int __vr_offs;
11849 }; */
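/* As a rough worked example of how these fields cooperate (ignoring the
   tree-stdarg optimization below, which can shrink the save areas): for

	int f (int n, ...);

   whose named arguments use only x0, va_start leaves __gr_offs == -56
   (x1..x7 saved in the 56 bytes below __gr_top) and __vr_offs == -128
   (q0..q7 saved below __vr_top). Each va_arg satisfied from a register
   save area advances the corresponding offset towards zero; once an
   offset is non-negative, further arguments of that class are read from
   __stack. */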
11851 static tree
11852 aarch64_build_builtin_va_list (void)
11854 tree va_list_name;
11855 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11857 /* Create the type. */
11858 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11859 /* Give it the required name. */
11860 va_list_name = build_decl (BUILTINS_LOCATION,
11861 TYPE_DECL,
11862 get_identifier ("__va_list"),
11863 va_list_type);
11864 DECL_ARTIFICIAL (va_list_name) = 1;
11865 TYPE_NAME (va_list_type) = va_list_name;
11866 TYPE_STUB_DECL (va_list_type) = va_list_name;
11868 /* Create the fields. */
11869 f_stack = build_decl (BUILTINS_LOCATION,
11870 FIELD_DECL, get_identifier ("__stack"),
11871 ptr_type_node);
11872 f_grtop = build_decl (BUILTINS_LOCATION,
11873 FIELD_DECL, get_identifier ("__gr_top"),
11874 ptr_type_node);
11875 f_vrtop = build_decl (BUILTINS_LOCATION,
11876 FIELD_DECL, get_identifier ("__vr_top"),
11877 ptr_type_node);
11878 f_groff = build_decl (BUILTINS_LOCATION,
11879 FIELD_DECL, get_identifier ("__gr_offs"),
11880 integer_type_node);
11881 f_vroff = build_decl (BUILTINS_LOCATION,
11882 FIELD_DECL, get_identifier ("__vr_offs"),
11883 integer_type_node);
11885 /* Tell tree-stdarg pass about our internal offset fields.
11886 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
11887 purposes, to identify whether the code is updating the va_list internal
11888 offset fields in an irregular way. */
11889 va_list_gpr_counter_field = f_groff;
11890 va_list_fpr_counter_field = f_vroff;
11892 DECL_ARTIFICIAL (f_stack) = 1;
11893 DECL_ARTIFICIAL (f_grtop) = 1;
11894 DECL_ARTIFICIAL (f_vrtop) = 1;
11895 DECL_ARTIFICIAL (f_groff) = 1;
11896 DECL_ARTIFICIAL (f_vroff) = 1;
11898 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
11899 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
11900 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
11901 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
11902 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
11904 TYPE_FIELDS (va_list_type) = f_stack;
11905 DECL_CHAIN (f_stack) = f_grtop;
11906 DECL_CHAIN (f_grtop) = f_vrtop;
11907 DECL_CHAIN (f_vrtop) = f_groff;
11908 DECL_CHAIN (f_groff) = f_vroff;
11910 /* Compute its layout. */
11911 layout_type (va_list_type);
11913 return va_list_type;
11916 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
11917 static void
11918 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
11920 const CUMULATIVE_ARGS *cum;
11921 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11922 tree stack, grtop, vrtop, groff, vroff;
11923 tree t;
11924 int gr_save_area_size = cfun->va_list_gpr_size;
11925 int vr_save_area_size = cfun->va_list_fpr_size;
11926 int vr_offset;
11928 cum = &crtl->args.info;
11929 if (cfun->va_list_gpr_size)
11930 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
11931 cfun->va_list_gpr_size);
11932 if (cfun->va_list_fpr_size)
11933 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
11934 * UNITS_PER_VREG, cfun->va_list_fpr_size);
11936 if (!TARGET_FLOAT)
11938 gcc_assert (cum->aapcs_nvrn == 0);
11939 vr_save_area_size = 0;
11942 f_stack = TYPE_FIELDS (va_list_type_node);
11943 f_grtop = DECL_CHAIN (f_stack);
11944 f_vrtop = DECL_CHAIN (f_grtop);
11945 f_groff = DECL_CHAIN (f_vrtop);
11946 f_vroff = DECL_CHAIN (f_groff);
11948 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
11949 NULL_TREE);
11950 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
11951 NULL_TREE);
11952 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
11953 NULL_TREE);
11954 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
11955 NULL_TREE);
11956 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
11957 NULL_TREE);
11959 /* Emit code to initialize STACK, which points to the next varargs stack
11960 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
11961 by named arguments. STACK is 8-byte aligned. */
11962 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
11963 if (cum->aapcs_stack_size > 0)
11964 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
11965 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
11966 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11968 /* Emit code to initialize GRTOP, the top of the GR save area.
11969 virtual_incoming_args_rtx should have been 16 byte aligned. */
11970 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
11971 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
11972 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11974 /* Emit code to initialize VRTOP, the top of the VR save area.
11975 This address is gr_save_area_bytes below GRTOP, rounded
11976 down to the next 16-byte boundary. */
11977 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
11978 vr_offset = ROUND_UP (gr_save_area_size,
11979 STACK_BOUNDARY / BITS_PER_UNIT);
11981 if (vr_offset)
11982 t = fold_build_pointer_plus_hwi (t, -vr_offset);
11983 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
11984 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11986 /* Emit code to initialize GROFF, the offset from GRTOP of the
11987 next GPR argument. */
11988 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
11989 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
11990 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11992 /* Likewise emit code to initialize VROFF, the offset from VRTOP
11993 of the next VR argument. */
11994 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
11995 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
11996 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
11999 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12001 static tree
12002 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12003 gimple_seq *post_p ATTRIBUTE_UNUSED)
12005 tree addr;
12006 bool indirect_p;
12007 bool is_ha; /* is HFA or HVA. */
12008 bool dw_align; /* double-word align. */
12009 machine_mode ag_mode = VOIDmode;
12010 int nregs;
12011 machine_mode mode;
12013 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12014 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12015 HOST_WIDE_INT size, rsize, adjust, align;
12016 tree t, u, cond1, cond2;
12018 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12019 if (indirect_p)
12020 type = build_pointer_type (type);
12022 mode = TYPE_MODE (type);
12024 f_stack = TYPE_FIELDS (va_list_type_node);
12025 f_grtop = DECL_CHAIN (f_stack);
12026 f_vrtop = DECL_CHAIN (f_grtop);
12027 f_groff = DECL_CHAIN (f_vrtop);
12028 f_vroff = DECL_CHAIN (f_groff);
12030 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12031 f_stack, NULL_TREE);
12032 size = int_size_in_bytes (type);
12033 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12035 dw_align = false;
12036 adjust = 0;
12037 if (aarch64_vfp_is_call_or_return_candidate (mode,
12038 type,
12039 &ag_mode,
12040 &nregs,
12041 &is_ha))
12043 /* No frontends can create types with variable-sized modes, so we
12044 shouldn't be asked to pass or return them. */
12045 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12047 /* TYPE passed in fp/simd registers. */
12048 if (!TARGET_FLOAT)
12049 aarch64_err_no_fpadvsimd (mode, "varargs");
12051 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12052 unshare_expr (valist), f_vrtop, NULL_TREE);
12053 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12054 unshare_expr (valist), f_vroff, NULL_TREE);
12056 rsize = nregs * UNITS_PER_VREG;
12058 if (is_ha)
12060 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12061 adjust = UNITS_PER_VREG - ag_size;
12063 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12064 && size < UNITS_PER_VREG)
12066 adjust = UNITS_PER_VREG - size;
12069 else
12071 /* TYPE passed in general registers. */
12072 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12073 unshare_expr (valist), f_grtop, NULL_TREE);
12074 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12075 unshare_expr (valist), f_groff, NULL_TREE);
12076 rsize = ROUND_UP (size, UNITS_PER_WORD);
12077 nregs = rsize / UNITS_PER_WORD;
12079 if (align > 8)
12080 dw_align = true;
12082 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12083 && size < UNITS_PER_WORD)
12085 adjust = UNITS_PER_WORD - size;
12089 /* Get a local temporary for the field value. */
12090 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12092 /* Emit code to branch if off >= 0. */
12093 t = build2 (GE_EXPR, boolean_type_node, off,
12094 build_int_cst (TREE_TYPE (off), 0));
12095 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12097 if (dw_align)
12099 /* Emit: offs = (offs + 15) & -16. */
12100 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12101 build_int_cst (TREE_TYPE (off), 15));
12102 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12103 build_int_cst (TREE_TYPE (off), -16));
12104 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12106 else
12107 roundup = NULL;
12109 /* Update ap.__[g|v]r_offs */
12110 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12111 build_int_cst (TREE_TYPE (off), rsize));
12112 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12114 /* String up. */
12115 if (roundup)
12116 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12118 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12119 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12120 build_int_cst (TREE_TYPE (f_off), 0));
12121 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12123 /* String up: make sure the assignment happens before the use. */
12124 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12125 COND_EXPR_ELSE (cond1) = t;
12127 /* Prepare the trees handling the argument that is passed on the stack;
12128 the top level node will store in ON_STACK. */
12129 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12130 if (align > 8)
12132 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12133 t = fold_convert (intDI_type_node, arg);
12134 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12135 build_int_cst (TREE_TYPE (t), 15));
12136 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12137 build_int_cst (TREE_TYPE (t), -16));
12138 t = fold_convert (TREE_TYPE (arg), t);
12139 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12141 else
12142 roundup = NULL;
12143 /* Advance ap.__stack */
12144 t = fold_convert (intDI_type_node, arg);
12145 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
12146 build_int_cst (TREE_TYPE (t), size + 7));
12147 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12148 build_int_cst (TREE_TYPE (t), -8));
12149 t = fold_convert (TREE_TYPE (arg), t);
12150 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12151 /* String up roundup and advance. */
12152 if (roundup)
12153 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12154 /* String up with arg */
12155 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12156 /* Big-endianness related address adjustment. */
12157 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12158 && size < UNITS_PER_WORD)
12160 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12161 size_int (UNITS_PER_WORD - size));
12162 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12165 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12166 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12168 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12169 t = off;
12170 if (adjust)
12171 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12172 build_int_cst (TREE_TYPE (off), adjust));
12174 t = fold_convert (sizetype, t);
12175 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12177 if (is_ha)
12179 /* type ha; // treat as "struct {ftype field[n];}"
12180 ... [computing offs]
12181 for (i = 0; i <nregs; ++i, offs += 16)
12182 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12183 return ha; */
12184 int i;
12185 tree tmp_ha, field_t, field_ptr_t;
12187 /* Declare a local variable. */
12188 tmp_ha = create_tmp_var_raw (type, "ha");
12189 gimple_add_tmp_var (tmp_ha);
12191 /* Establish the base type. */
12192 switch (ag_mode)
12194 case E_SFmode:
12195 field_t = float_type_node;
12196 field_ptr_t = float_ptr_type_node;
12197 break;
12198 case E_DFmode:
12199 field_t = double_type_node;
12200 field_ptr_t = double_ptr_type_node;
12201 break;
12202 case E_TFmode:
12203 field_t = long_double_type_node;
12204 field_ptr_t = long_double_ptr_type_node;
12205 break;
12206 case E_HFmode:
12207 field_t = aarch64_fp16_type_node;
12208 field_ptr_t = aarch64_fp16_ptr_type_node;
12209 break;
12210 case E_V2SImode:
12211 case E_V4SImode:
12213 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12214 field_t = build_vector_type_for_mode (innertype, ag_mode);
12215 field_ptr_t = build_pointer_type (field_t);
12217 break;
12218 default:
12219 gcc_assert (0);
12222 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12223 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12224 addr = t;
12225 t = fold_convert (field_ptr_t, addr);
12226 t = build2 (MODIFY_EXPR, field_t,
12227 build1 (INDIRECT_REF, field_t, tmp_ha),
12228 build1 (INDIRECT_REF, field_t, t));
12230 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12231 for (i = 1; i < nregs; ++i)
12233 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12234 u = fold_convert (field_ptr_t, addr);
12235 u = build2 (MODIFY_EXPR, field_t,
12236 build2 (MEM_REF, field_t, tmp_ha,
12237 build_int_cst (field_ptr_t,
12238 (i *
12239 int_size_in_bytes (field_t)))),
12240 build1 (INDIRECT_REF, field_t, u));
12241 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12244 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12245 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12248 COND_EXPR_ELSE (cond2) = t;
12249 addr = fold_convert (build_pointer_type (type), cond1);
12250 addr = build_va_arg_indirect_ref (addr);
12252 if (indirect_p)
12253 addr = build_va_arg_indirect_ref (addr);
12255 return addr;
12258 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12260 static void
12261 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12262 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12263 int no_rtl)
12265 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12266 CUMULATIVE_ARGS local_cum;
12267 int gr_saved = cfun->va_list_gpr_size;
12268 int vr_saved = cfun->va_list_fpr_size;
12270 /* The caller has advanced CUM up to, but not beyond, the last named
12271 argument. Advance a local copy of CUM past the last "real" named
12272 argument, to find out how many registers are left over. */
12273 local_cum = *cum;
12274 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12276 /* Find out how many registers we need to save.
12277 Honor the tree-stdarg analysis results. */
12278 if (cfun->va_list_gpr_size)
12279 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12280 cfun->va_list_gpr_size / UNITS_PER_WORD);
12281 if (cfun->va_list_fpr_size)
12282 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12283 cfun->va_list_fpr_size / UNITS_PER_VREG);
12285 if (!TARGET_FLOAT)
12287 gcc_assert (local_cum.aapcs_nvrn == 0);
12288 vr_saved = 0;
12291 if (!no_rtl)
12293 if (gr_saved > 0)
12295 rtx ptr, mem;
12297 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12298 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12299 - gr_saved * UNITS_PER_WORD);
12300 mem = gen_frame_mem (BLKmode, ptr);
12301 set_mem_alias_set (mem, get_varargs_alias_set ());
12303 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12304 mem, gr_saved);
12306 if (vr_saved > 0)
12308 /* We can't use move_block_from_reg, because it will use
12309 the wrong mode, storing D regs only. */
12310 machine_mode mode = TImode;
12311 int off, i, vr_start;
12313 /* Set OFF to the offset from virtual_incoming_args_rtx of
12314 the first vector register. The VR save area lies below
12315 the GR one, and is aligned to 16 bytes. */
12316 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12317 STACK_BOUNDARY / BITS_PER_UNIT);
12318 off -= vr_saved * UNITS_PER_VREG;
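/* For example, with gr_saved == 3 and vr_saved == 2, OFF is
   -ROUND_UP (24, 16) - 2 * 16 == -64, so the two Q registers are
   stored at incoming_args - 64 and incoming_args - 48, immediately
   below the GR save area rounded up to 32 bytes. */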
12320 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12321 for (i = 0; i < vr_saved; ++i)
12323 rtx ptr, mem;
12325 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12326 mem = gen_frame_mem (mode, ptr);
12327 set_mem_alias_set (mem, get_varargs_alias_set ());
12328 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12329 off += UNITS_PER_VREG;
12334 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12335 any complication of having crtl->args.pretend_args_size changed. */
12336 cfun->machine->frame.saved_varargs_size
12337 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12338 STACK_BOUNDARY / BITS_PER_UNIT)
12339 + vr_saved * UNITS_PER_VREG);
12342 static void
12343 aarch64_conditional_register_usage (void)
12345 int i;
12346 if (!TARGET_FLOAT)
12348 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12350 fixed_regs[i] = 1;
12351 call_used_regs[i] = 1;
12354 if (!TARGET_SVE)
12355 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12357 fixed_regs[i] = 1;
12358 call_used_regs[i] = 1;
12362 /* Walk down the type tree of TYPE counting consecutive base elements.
12363 If *MODEP is VOIDmode, then set it to the first valid floating point
12364 type. If a non-floating point type is found, or if a floating point
12365 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12366 otherwise return the count in the sub-tree. */
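/* For example, for

	struct hfa { double x, y, z; };

   the walk finds three consecutive DFmode elements and returns 3 with
   *MODEP set to DFmode, whereas

	struct mix { float f; double d; };

   returns -1 because the DFmode field does not match the SFmode
   recorded from the first field. */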
12367 static int
12368 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12370 machine_mode mode;
12371 HOST_WIDE_INT size;
12373 switch (TREE_CODE (type))
12375 case REAL_TYPE:
12376 mode = TYPE_MODE (type);
12377 if (mode != DFmode && mode != SFmode
12378 && mode != TFmode && mode != HFmode)
12379 return -1;
12381 if (*modep == VOIDmode)
12382 *modep = mode;
12384 if (*modep == mode)
12385 return 1;
12387 break;
12389 case COMPLEX_TYPE:
12390 mode = TYPE_MODE (TREE_TYPE (type));
12391 if (mode != DFmode && mode != SFmode
12392 && mode != TFmode && mode != HFmode)
12393 return -1;
12395 if (*modep == VOIDmode)
12396 *modep = mode;
12398 if (*modep == mode)
12399 return 2;
12401 break;
12403 case VECTOR_TYPE:
12404 /* Use V2SImode and V4SImode as representatives of all 64-bit
12405 and 128-bit vector types. */
12406 size = int_size_in_bytes (type);
12407 switch (size)
12409 case 8:
12410 mode = V2SImode;
12411 break;
12412 case 16:
12413 mode = V4SImode;
12414 break;
12415 default:
12416 return -1;
12419 if (*modep == VOIDmode)
12420 *modep = mode;
12422 /* Vector modes are considered to be opaque: two vectors are
12423 equivalent for the purposes of being homogeneous aggregates
12424 if they are the same size. */
12425 if (*modep == mode)
12426 return 1;
12428 break;
12430 case ARRAY_TYPE:
12432 int count;
12433 tree index = TYPE_DOMAIN (type);
12435 /* Can't handle incomplete types nor sizes that are not
12436 fixed. */
12437 if (!COMPLETE_TYPE_P (type)
12438 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12439 return -1;
12441 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12442 if (count == -1
12443 || !index
12444 || !TYPE_MAX_VALUE (index)
12445 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12446 || !TYPE_MIN_VALUE (index)
12447 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12448 || count < 0)
12449 return -1;
12451 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12452 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12454 /* There must be no padding. */
12455 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12456 count * GET_MODE_BITSIZE (*modep)))
12457 return -1;
12459 return count;
12462 case RECORD_TYPE:
12464 int count = 0;
12465 int sub_count;
12466 tree field;
12468 /* Can't handle incomplete types nor sizes that are not
12469 fixed. */
12470 if (!COMPLETE_TYPE_P (type)
12471 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12472 return -1;
12474 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12476 if (TREE_CODE (field) != FIELD_DECL)
12477 continue;
12479 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12480 if (sub_count < 0)
12481 return -1;
12482 count += sub_count;
12485 /* There must be no padding. */
12486 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12487 count * GET_MODE_BITSIZE (*modep)))
12488 return -1;
12490 return count;
12493 case UNION_TYPE:
12494 case QUAL_UNION_TYPE:
12496 /* These aren't very interesting except in a degenerate case. */
12497 int count = 0;
12498 int sub_count;
12499 tree field;
12501 /* Can't handle incomplete types nor sizes that are not
12502 fixed. */
12503 if (!COMPLETE_TYPE_P (type)
12504 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12505 return -1;
12507 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12509 if (TREE_CODE (field) != FIELD_DECL)
12510 continue;
12512 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12513 if (sub_count < 0)
12514 return -1;
12515 count = count > sub_count ? count : sub_count;
12518 /* There must be no padding. */
12519 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12520 count * GET_MODE_BITSIZE (*modep)))
12521 return -1;
12523 return count;
12526 default:
12527 break;
12530 return -1;
12533 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12534 type as described in AAPCS64 \S 4.1.2.
12536 See the comment above aarch64_composite_type_p for the notes on MODE. */
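/* For example, int32x4_t (16 bytes) and uint8x8_t (8 bytes) from
   arm_neon.h are short vector types, while a 32-byte GNU vector is
   not. */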
12538 static bool
12539 aarch64_short_vector_p (const_tree type,
12540 machine_mode mode)
12542 poly_int64 size = -1;
12544 if (type && TREE_CODE (type) == VECTOR_TYPE)
12545 size = int_size_in_bytes (type);
12546 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12547 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12548 size = GET_MODE_SIZE (mode);
12550 return known_eq (size, 8) || known_eq (size, 16);
12553 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12554 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12555 array types. The C99 floating-point complex types are also considered
12556 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12557 types, which are GCC extensions and out of the scope of AAPCS64, are
12558 treated as composite types here as well.
12560 Note that MODE itself is not sufficient in determining whether a type
12561 is such a composite type or not. This is because
12562 stor-layout.c:compute_record_mode may have already changed the MODE
12563 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12564 structure with only one field may have its MODE set to the mode of the
12565 field. Also an integer mode whose size matches the size of the
12566 RECORD_TYPE type may be used to substitute the original mode
12567 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12568 solely relied on. */
12570 static bool
12571 aarch64_composite_type_p (const_tree type,
12572 machine_mode mode)
12574 if (aarch64_short_vector_p (type, mode))
12575 return false;
12577 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12578 return true;
12580 if (mode == BLKmode
12581 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12582 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12583 return true;
12585 return false;
12588 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12589 shall be passed or returned in simd/fp register(s) (providing these
12590 parameter passing registers are available).
12592 Upon successful return, *COUNT returns the number of needed registers,
12593 *BASE_MODE returns the mode of the individual register and when IS_HA
12594 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12595 floating-point aggregate or a homogeneous short-vector aggregate. */
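/* For example, struct { float32x4_t a, b; } is a homogeneous
   short-vector aggregate: *COUNT is 2, *BASE_MODE is V4SImode and
   *IS_HA is set, so it is passed in two consecutive Q registers.
   _Complex double likewise gives *COUNT == 2 with *BASE_MODE ==
   DFmode. */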
12597 static bool
12598 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12599 const_tree type,
12600 machine_mode *base_mode,
12601 int *count,
12602 bool *is_ha)
12604 machine_mode new_mode = VOIDmode;
12605 bool composite_p = aarch64_composite_type_p (type, mode);
12607 if (is_ha != NULL) *is_ha = false;
12609 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12610 || aarch64_short_vector_p (type, mode))
12612 *count = 1;
12613 new_mode = mode;
12615 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12617 if (is_ha != NULL) *is_ha = true;
12618 *count = 2;
12619 new_mode = GET_MODE_INNER (mode);
12621 else if (type && composite_p)
12623 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12625 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12627 if (is_ha != NULL) *is_ha = true;
12628 *count = ag_count;
12630 else
12631 return false;
12633 else
12634 return false;
12636 *base_mode = new_mode;
12637 return true;
12640 /* Implement TARGET_STRUCT_VALUE_RTX. */
12642 static rtx
12643 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12644 int incoming ATTRIBUTE_UNUSED)
12646 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12649 /* Implements target hook vector_mode_supported_p. */
12650 static bool
12651 aarch64_vector_mode_supported_p (machine_mode mode)
12653 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12654 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12657 /* Return appropriate SIMD container
12658 for MODE within a vector of WIDTH bits. */
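/* For example, SImode yields V4SImode for WIDTH == 128, V2SImode for
   WIDTH == 64, and VNx4SImode when SVE is enabled and WIDTH is the SVE
   vector width. */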
12659 static machine_mode
12660 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12662 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12663 switch (mode)
12665 case E_DFmode:
12666 return VNx2DFmode;
12667 case E_SFmode:
12668 return VNx4SFmode;
12669 case E_HFmode:
12670 return VNx8HFmode;
12671 case E_DImode:
12672 return VNx2DImode;
12673 case E_SImode:
12674 return VNx4SImode;
12675 case E_HImode:
12676 return VNx8HImode;
12677 case E_QImode:
12678 return VNx16QImode;
12679 default:
12680 return word_mode;
12683 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12684 if (TARGET_SIMD)
12686 if (known_eq (width, 128))
12687 switch (mode)
12689 case E_DFmode:
12690 return V2DFmode;
12691 case E_SFmode:
12692 return V4SFmode;
12693 case E_HFmode:
12694 return V8HFmode;
12695 case E_SImode:
12696 return V4SImode;
12697 case E_HImode:
12698 return V8HImode;
12699 case E_QImode:
12700 return V16QImode;
12701 case E_DImode:
12702 return V2DImode;
12703 default:
12704 break;
12706 else
12707 switch (mode)
12709 case E_SFmode:
12710 return V2SFmode;
12711 case E_HFmode:
12712 return V4HFmode;
12713 case E_SImode:
12714 return V2SImode;
12715 case E_HImode:
12716 return V4HImode;
12717 case E_QImode:
12718 return V8QImode;
12719 default:
12720 break;
12723 return word_mode;
12726 /* Return the preferred SIMD container mode for MODE: an SVE mode when SVE is enabled, otherwise the 128-bit Advanced SIMD container. */
12727 static machine_mode
12728 aarch64_preferred_simd_mode (scalar_mode mode)
12730 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12731 return aarch64_simd_container_mode (mode, bits);
12734 /* Return a list of possible vector sizes for the vectorizer
12735 to iterate over. */
12736 static void
12737 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12739 if (TARGET_SVE)
12740 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12741 sizes->safe_push (16);
12742 sizes->safe_push (8);
12745 /* Implement TARGET_MANGLE_TYPE. */
12747 static const char *
12748 aarch64_mangle_type (const_tree type)
12750 /* The AArch64 ABI documents say that "__va_list" has to be
12751 mangled as if it were in the "std" namespace. */
12752 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12753 return "St9__va_list";
12755 /* Half-precision float. */
12756 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12757 return "Dh";
12759 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12760 builtin types. */
12761 if (TYPE_NAME (type) != NULL)
12762 return aarch64_mangle_builtin_type (type);
12764 /* Use the default mangling. */
12765 return NULL;
12768 /* Find the first rtx_insn before insn that will generate an assembly
12769 instruction. */
12771 static rtx_insn *
12772 aarch64_prev_real_insn (rtx_insn *insn)
12774 if (!insn)
12775 return NULL;
12779 insn = prev_real_insn (insn);
12781 while (insn && recog_memoized (insn) < 0);
12783 return insn;
12786 static bool
12787 is_madd_op (enum attr_type t1)
12789 unsigned int i;
12790 /* A number of these may be AArch32 only. */
12791 enum attr_type mlatypes[] = {
12792 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12793 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12794 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12797 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12799 if (t1 == mlatypes[i])
12800 return true;
12803 return false;
12806 /* Check if there is a register dependency between a load and the insn
12807 for which we hold recog_data. */
12809 static bool
12810 dep_between_memop_and_curr (rtx memop)
12812 rtx load_reg;
12813 int opno;
12815 gcc_assert (GET_CODE (memop) == SET);
12817 if (!REG_P (SET_DEST (memop)))
12818 return false;
12820 load_reg = SET_DEST (memop);
12821 for (opno = 1; opno < recog_data.n_operands; opno++)
12823 rtx operand = recog_data.operand[opno];
12824 if (REG_P (operand)
12825 && reg_overlap_mentioned_p (load_reg, operand))
12826 return true;
12829 return false;
12833 /* When working around the Cortex-A53 erratum 835769,
12834 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12835 instruction and has a preceding memory instruction such that a NOP
12836 should be inserted between them. */
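/* For instance, for a sequence such as

	ldr	x3, [x1]
	madd	x0, x4, x5, x6

   the workaround causes final to emit

	ldr	x3, [x1]
	nop	// between mem op and mult-accumulate
	madd	x0, x4, x5, x6

   (see aarch64_final_prescan_insn below). */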
12838 bool
12839 aarch64_madd_needs_nop (rtx_insn* insn)
12841 enum attr_type attr_type;
12842 rtx_insn *prev;
12843 rtx body;
12845 if (!TARGET_FIX_ERR_A53_835769)
12846 return false;
12848 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12849 return false;
12851 attr_type = get_attr_type (insn);
12852 if (!is_madd_op (attr_type))
12853 return false;
12855 prev = aarch64_prev_real_insn (insn);
12856 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12857 Restore recog state to INSN to avoid state corruption. */
12858 extract_constrain_insn_cached (insn);
12860 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12861 return false;
12863 body = single_set (prev);
12865 /* If the previous insn is a memory op and there is no dependency between
12866 it and the DImode madd, emit a NOP between them. If body is NULL then we
12867 have a complex memory operation, probably a load/store pair.
12868 Be conservative for now and emit a NOP. */
12869 if (GET_MODE (recog_data.operand[0]) == DImode
12870 && (!body || !dep_between_memop_and_curr (body)))
12871 return true;
12873 return false;
12878 /* Implement FINAL_PRESCAN_INSN. */
12880 void
12881 aarch64_final_prescan_insn (rtx_insn *insn)
12883 if (aarch64_madd_needs_nop (insn))
12884 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
12888 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
12889 instruction. */
12891 bool
12892 aarch64_sve_index_immediate_p (rtx base_or_step)
12894 return (CONST_INT_P (base_or_step)
12895 && IN_RANGE (INTVAL (base_or_step), -16, 15));
12898 /* Return true if X is a valid immediate for the SVE ADD and SUB
12899 instructions. Negate X first if NEGATE_P is true. */
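/* For example, for 32-bit elements, #255 (an unshifted 8-bit immediate)
   and #512 (an 8-bit immediate shifted left by 8) are accepted, but
   #257 is rejected because it has nonzero bits both below and above
   bit 7. */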
12901 bool
12902 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
12904 rtx elt;
12906 if (!const_vec_duplicate_p (x, &elt)
12907 || !CONST_INT_P (elt))
12908 return false;
12910 HOST_WIDE_INT val = INTVAL (elt);
12911 if (negate_p)
12912 val = -val;
12913 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
12915 if (val & 0xff)
12916 return IN_RANGE (val, 0, 0xff);
12917 return IN_RANGE (val, 0, 0xff00);
12920 /* Return true if X is a valid immediate operand for an SVE logical
12921 instruction such as AND. */
12923 bool
12924 aarch64_sve_bitmask_immediate_p (rtx x)
12926 rtx elt;
12928 return (const_vec_duplicate_p (x, &elt)
12929 && CONST_INT_P (elt)
12930 && aarch64_bitmask_imm (INTVAL (elt),
12931 GET_MODE_INNER (GET_MODE (x))));
12934 /* Return true if X is a valid immediate for the SVE DUP and CPY
12935 instructions. */
12937 bool
12938 aarch64_sve_dup_immediate_p (rtx x)
12940 rtx elt;
12942 if (!const_vec_duplicate_p (x, &elt)
12943 || !CONST_INT_P (elt))
12944 return false;
12946 HOST_WIDE_INT val = INTVAL (elt);
12947 if (val & 0xff)
12948 return IN_RANGE (val, -0x80, 0x7f);
12949 return IN_RANGE (val, -0x8000, 0x7f00);
12952 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
12953 SIGNED_P says whether the operand is signed rather than unsigned. */
12955 bool
12956 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
12958 rtx elt;
12960 return (const_vec_duplicate_p (x, &elt)
12961 && CONST_INT_P (elt)
12962 && (signed_p
12963 ? IN_RANGE (INTVAL (elt), -16, 15)
12964 : IN_RANGE (INTVAL (elt), 0, 127)));
12967 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
12968 instruction. Negate X first if NEGATE_P is true. */
12970 bool
12971 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
12973 rtx elt;
12974 REAL_VALUE_TYPE r;
12976 if (!const_vec_duplicate_p (x, &elt)
12977 || GET_CODE (elt) != CONST_DOUBLE)
12978 return false;
12980 r = *CONST_DOUBLE_REAL_VALUE (elt);
12982 if (negate_p)
12983 r = real_value_negate (&r);
12985 if (real_equal (&r, &dconst1))
12986 return true;
12987 if (real_equal (&r, &dconsthalf))
12988 return true;
12989 return false;
12992 /* Return true if X is a valid immediate operand for an SVE FMUL
12993 instruction. */
12995 bool
12996 aarch64_sve_float_mul_immediate_p (rtx x)
12998 rtx elt;
13000 /* GCC will never generate a multiply with an immediate of 2, so there is no
13001 point testing for it (even though it is a valid constant). */
13002 return (const_vec_duplicate_p (x, &elt)
13003 && GET_CODE (elt) == CONST_DOUBLE
13004 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13007 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13008 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13009 is nonnull, use it to describe valid immediates. */
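/* For example, replicating 0x00ab0000 is matched as immediate 0xab with
   LSL #16, and 0x00abffff (low bits all ones) as 0xab with MSL #16,
   whereas 0x00ab00cd matches neither form. */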
13010 static bool
13011 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13012 simd_immediate_info *info,
13013 enum simd_immediate_check which,
13014 simd_immediate_info::insn_type insn)
13016 /* Try a 4-byte immediate with LSL. */
13017 for (unsigned int shift = 0; shift < 32; shift += 8)
13018 if ((val32 & (0xff << shift)) == val32)
13020 if (info)
13021 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13022 simd_immediate_info::LSL, shift);
13023 return true;
13026 /* Try a 2-byte immediate with LSL. */
13027 unsigned int imm16 = val32 & 0xffff;
13028 if (imm16 == (val32 >> 16))
13029 for (unsigned int shift = 0; shift < 16; shift += 8)
13030 if ((imm16 & (0xff << shift)) == imm16)
13032 if (info)
13033 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13034 simd_immediate_info::LSL, shift);
13035 return true;
13038 /* Try a 4-byte immediate with MSL, except for cases that MVN
13039 can handle. */
13040 if (which == AARCH64_CHECK_MOV)
13041 for (unsigned int shift = 8; shift < 24; shift += 8)
13043 unsigned int low = (1 << shift) - 1;
13044 if (((val32 & (0xff << shift)) | low) == val32)
13046 if (info)
13047 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13048 simd_immediate_info::MSL, shift);
13049 return true;
13053 return false;
13056 /* Return true if replicating VAL64 is a valid immediate for the
13057 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13058 use it to describe valid immediates. */
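/* For example, for a MOV-style check, 0x4242424242424242 is matched as
   the replicated byte 0x42, and 0x00ff0000ffff00ff is a valid
   bit-to-bytemask immediate because every byte is either 0x00 or
   0xff. */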
13059 static bool
13060 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13061 simd_immediate_info *info,
13062 enum simd_immediate_check which)
13064 unsigned int val32 = val64 & 0xffffffff;
13065 unsigned int val16 = val64 & 0xffff;
13066 unsigned int val8 = val64 & 0xff;
13068 if (val32 == (val64 >> 32))
13070 if ((which & AARCH64_CHECK_ORR) != 0
13071 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13072 simd_immediate_info::MOV))
13073 return true;
13075 if ((which & AARCH64_CHECK_BIC) != 0
13076 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13077 simd_immediate_info::MVN))
13078 return true;
13080 /* Try using a replicated byte. */
13081 if (which == AARCH64_CHECK_MOV
13082 && val16 == (val32 >> 16)
13083 && val8 == (val16 >> 8))
13085 if (info)
13086 *info = simd_immediate_info (QImode, val8);
13087 return true;
13091 /* Try using a bit-to-bytemask. */
13092 if (which == AARCH64_CHECK_MOV)
13094 unsigned int i;
13095 for (i = 0; i < 64; i += 8)
13097 unsigned char byte = (val64 >> i) & 0xff;
13098 if (byte != 0 && byte != 0xff)
13099 break;
13101 if (i == 64)
13103 if (info)
13104 *info = simd_immediate_info (DImode, val64);
13105 return true;
13108 return false;
13111 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13112 instruction. If INFO is nonnull, use it to describe valid immediates. */
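/* For example, a replicated 0x0000002a narrows to SImode and is matched
   as a DUP of #42, while a replicated 0x00ffff00 is outside the DUP
   range but is accepted as a DUPM bitmask immediate (a contiguous run
   of set bits). */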
13114 static bool
13115 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13116 simd_immediate_info *info)
13118 scalar_int_mode mode = DImode;
13119 unsigned int val32 = val64 & 0xffffffff;
13120 if (val32 == (val64 >> 32))
13122 mode = SImode;
13123 unsigned int val16 = val32 & 0xffff;
13124 if (val16 == (val32 >> 16))
13126 mode = HImode;
13127 unsigned int val8 = val16 & 0xff;
13128 if (val8 == (val16 >> 8))
13129 mode = QImode;
13132 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13133 if (IN_RANGE (val, -0x80, 0x7f))
13135 /* DUP with no shift. */
13136 if (info)
13137 *info = simd_immediate_info (mode, val);
13138 return true;
13140 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13142 /* DUP with LSL #8. */
13143 if (info)
13144 *info = simd_immediate_info (mode, val);
13145 return true;
13147 if (aarch64_bitmask_imm (val64, mode))
13149 /* DUPM. */
13150 if (info)
13151 *info = simd_immediate_info (mode, val);
13152 return true;
13154 return false;
13157 /* Return true if OP is a valid SIMD immediate for the operation
13158 described by WHICH. If INFO is nonnull, use it to describe valid
13159 immediates. */
13160 bool
13161 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13162 enum simd_immediate_check which)
13164 machine_mode mode = GET_MODE (op);
13165 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13166 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13167 return false;
13169 scalar_mode elt_mode = GET_MODE_INNER (mode);
13170 rtx base, step;
13171 unsigned int n_elts;
13172 if (GET_CODE (op) == CONST_VECTOR
13173 && CONST_VECTOR_DUPLICATE_P (op))
13174 n_elts = CONST_VECTOR_NPATTERNS (op);
13175 else if ((vec_flags & VEC_SVE_DATA)
13176 && const_vec_series_p (op, &base, &step))
13178 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13179 if (!aarch64_sve_index_immediate_p (base)
13180 || !aarch64_sve_index_immediate_p (step))
13181 return false;
13183 if (info)
13184 *info = simd_immediate_info (elt_mode, base, step);
13185 return true;
13187 else if (GET_CODE (op) == CONST_VECTOR
13188 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13189 /* N_ELTS set above. */;
13190 else
13191 return false;
13193 /* Handle PFALSE and PTRUE. */
13194 if (vec_flags & VEC_SVE_PRED)
13195 return (op == CONST0_RTX (mode)
13196 || op == CONSTM1_RTX (mode));
13198 scalar_float_mode elt_float_mode;
13199 if (n_elts == 1
13200 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13202 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13203 if (aarch64_float_const_zero_rtx_p (elt)
13204 || aarch64_float_const_representable_p (elt))
13206 if (info)
13207 *info = simd_immediate_info (elt_float_mode, elt);
13208 return true;
13212 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13213 if (elt_size > 8)
13214 return false;
13216 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13218 /* Expand the vector constant out into a byte vector, with the least
13219 significant byte of the register first. */
13220 auto_vec<unsigned char, 16> bytes;
13221 bytes.reserve (n_elts * elt_size);
13222 for (unsigned int i = 0; i < n_elts; i++)
13224 /* The vector is provided in gcc endian-neutral fashion.
13225 For aarch64_be Advanced SIMD, it must be laid out in the vector
13226 register in reverse order. */
13227 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13228 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13230 if (elt_mode != elt_int_mode)
13231 elt = gen_lowpart (elt_int_mode, elt);
13233 if (!CONST_INT_P (elt))
13234 return false;
13236 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13237 for (unsigned int byte = 0; byte < elt_size; byte++)
13239 bytes.quick_push (elt_val & 0xff);
13240 elt_val >>= BITS_PER_UNIT;
13244 /* The immediate must repeat every eight bytes. */
13245 unsigned int nbytes = bytes.length ();
13246 for (unsigned i = 8; i < nbytes; ++i)
13247 if (bytes[i] != bytes[i - 8])
13248 return false;
13250 /* Get the repeating 8-byte value as an integer. No endian correction
13251 is needed here because bytes is already in lsb-first order. */
13252 unsigned HOST_WIDE_INT val64 = 0;
13253 for (unsigned int i = 0; i < 8; i++)
13254 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13255 << (i * BITS_PER_UNIT));
13257 if (vec_flags & VEC_SVE_DATA)
13258 return aarch64_sve_valid_immediate (val64, info);
13259 else
13260 return aarch64_advsimd_valid_immediate (val64, info, which);
13263 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13264 has a step in the range of an SVE INDEX immediate. Return the step if so,
13265 otherwise return null. */
13267 aarch64_check_zero_based_sve_index_immediate (rtx x)
13269 rtx base, step;
13270 if (const_vec_series_p (x, &base, &step)
13271 && base == const0_rtx
13272 && aarch64_sve_index_immediate_p (step))
13273 return step;
13274 return NULL_RTX;
13277 /* Check if immediate shift constants are within range. */
13278 bool
13279 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13281 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13282 if (left)
13283 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13284 else
13285 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13288 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13289 operation of width WIDTH at bit position POS. */
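/* For example, WIDTH == 8 and POS == 16 give the mask 0x00ff0000. */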
13292 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13294 gcc_assert (CONST_INT_P (width));
13295 gcc_assert (CONST_INT_P (pos));
13297 unsigned HOST_WIDE_INT mask
13298 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13299 return GEN_INT (mask << UINTVAL (pos));
13302 bool
13303 aarch64_mov_operand_p (rtx x, machine_mode mode)
13305 if (GET_CODE (x) == HIGH
13306 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13307 return true;
13309 if (CONST_INT_P (x))
13310 return true;
13312 if (VECTOR_MODE_P (GET_MODE (x)))
13313 return aarch64_simd_valid_immediate (x, NULL);
13315 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13316 return true;
13318 if (aarch64_sve_cnt_immediate_p (x))
13319 return true;
13321 return aarch64_classify_symbolic_expression (x)
13322 == SYMBOL_TINY_ABSOLUTE;
13325 /* Return a const_int vector of VAL. */
13327 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13329 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13330 return gen_const_vec_duplicate (mode, c);
13333 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13335 bool
13336 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13338 machine_mode vmode;
13340 vmode = aarch64_simd_container_mode (mode, 64);
13341 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13342 return aarch64_simd_valid_immediate (op_v, NULL);
13345 /* Construct and return a PARALLEL RTX vector with elements numbering the
13346 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13347 the vector - from the perspective of the architecture. This does not
13348 line up with GCC's perspective on lane numbers, so we end up with
13349 different masks depending on our target endian-ness. The diagram
13350 below may help. We must draw the distinction when building masks
13351 which select one half of the vector. An instruction selecting
13352 architectural low-lanes for a big-endian target, must be described using
13353 a mask selecting GCC high-lanes.
13355 Big-Endian Little-Endian
13357 GCC 0 1 2 3 3 2 1 0
13358 | x | x | x | x | | x | x | x | x |
13359 Architecture 3 2 1 0 3 2 1 0
13361 Low Mask: { 2, 3 } { 0, 1 }
13362 High Mask: { 0, 1 } { 2, 3 }
13364 MODE is the mode of the vector and NUNITS is the number of units in it. */
13367 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13369 rtvec v = rtvec_alloc (nunits / 2);
13370 int high_base = nunits / 2;
13371 int low_base = 0;
13372 int base;
13373 rtx t1;
13374 int i;
13376 if (BYTES_BIG_ENDIAN)
13377 base = high ? low_base : high_base;
13378 else
13379 base = high ? high_base : low_base;
13381 for (i = 0; i < nunits / 2; i++)
13382 RTVEC_ELT (v, i) = GEN_INT (base + i);
13384 t1 = gen_rtx_PARALLEL (mode, v);
13385 return t1;
13388 /* Check OP for validity as a PARALLEL RTX vector with elements
13389 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13390 from the perspective of the architecture. See the diagram above
13391 aarch64_simd_vect_par_cnst_half for more details. */
13393 bool
13394 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13395 bool high)
13397 int nelts;
13398 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13399 return false;
13401 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13402 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13403 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13404 int i = 0;
13406 if (count_op != count_ideal)
13407 return false;
13409 for (i = 0; i < count_ideal; i++)
13411 rtx elt_op = XVECEXP (op, 0, i);
13412 rtx elt_ideal = XVECEXP (ideal, 0, i);
13414 if (!CONST_INT_P (elt_op)
13415 || INTVAL (elt_ideal) != INTVAL (elt_op))
13416 return false;
13418 return true;
13421 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13422 HIGH (exclusive). */
13423 void
13424 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13425 const_tree exp)
13427 HOST_WIDE_INT lane;
13428 gcc_assert (CONST_INT_P (operand));
13429 lane = INTVAL (operand);
13431 if (lane < low || lane >= high)
13433 if (exp)
13434 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13435 else
13436 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13440 /* Perform endian correction on lane number N, which indexes a vector
13441 of mode MODE, and return the result as an SImode rtx. */
13444 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13446 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13449 /* Return TRUE if OP is a valid vector addressing mode. */
13451 bool
13452 aarch64_simd_mem_operand_p (rtx op)
13454 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13455 || REG_P (XEXP (op, 0)));
13458 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
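/* For example, for a DImode element this accepts [Xn, #imm] addresses in
   which the immediate is a multiple of 8 in the range [0, 504], matching
   the LD1RD addressing mode. */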
13460 bool
13461 aarch64_sve_ld1r_operand_p (rtx op)
13463 struct aarch64_address_info addr;
13464 scalar_mode mode;
13466 return (MEM_P (op)
13467 && is_a <scalar_mode> (GET_MODE (op), &mode)
13468 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13469 && addr.type == ADDRESS_REG_IMM
13470 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13473 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13474 The conditions for STR are the same. */
13475 bool
13476 aarch64_sve_ldr_operand_p (rtx op)
13478 struct aarch64_address_info addr;
13480 return (MEM_P (op)
13481 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13482 false, ADDR_QUERY_ANY)
13483 && addr.type == ADDRESS_REG_IMM);
13486 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13487 We need to be able to access the individual pieces, so the range
13488 is different from LD[234] and ST[234]. */
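/* For example, for a three-vector tuple both the offset of the first
   vector and the offset of the last vector (first + 2 vectors) must be
   in the range [-8, 7] vectors, so the tuple must start between -8 and
   5 vectors from the base register. */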
13489 bool
13490 aarch64_sve_struct_memory_operand_p (rtx op)
13492 if (!MEM_P (op))
13493 return false;
13495 machine_mode mode = GET_MODE (op);
13496 struct aarch64_address_info addr;
13497 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13498 ADDR_QUERY_ANY)
13499 || addr.type != ADDRESS_REG_IMM)
13500 return false;
13502 poly_int64 first = addr.const_offset;
13503 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13504 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13505 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13508 /* Emit a register copy from operand to operand, taking care not to
13509 early-clobber source registers in the process.
13511 COUNT is the number of components into which the copy needs to be
13512 decomposed. */
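/* For example, copying a two-register value from {V1, V2} to {V2, V3}
   overlaps, and because the destination starts above the source the
   copy is done from the top down: V3 is written from V2 before V2 is
   overwritten with V1. */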
13513 void
13514 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13515 unsigned int count)
13517 unsigned int i;
13518 int rdest = REGNO (operands[0]);
13519 int rsrc = REGNO (operands[1]);
13521 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13522 || rdest < rsrc)
13523 for (i = 0; i < count; i++)
13524 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13525 gen_rtx_REG (mode, rsrc + i));
13526 else
13527 for (i = 0; i < count; i++)
13528 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13529 gen_rtx_REG (mode, rsrc + count - i - 1));
13532 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13533 one of VSTRUCT modes: OI, CI, or XI. */
13535 aarch64_simd_attr_length_rglist (machine_mode mode)
13537 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13538 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13541 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13542 alignment of a vector to 128 bits. SVE predicates have an alignment of
13543 16 bits. */
13544 static HOST_WIDE_INT
13545 aarch64_simd_vector_alignment (const_tree type)
13547 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13548 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13549 be set for non-predicate vectors of booleans. Modes are the most
13550 direct way we have of identifying real SVE predicate types. */
13551 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13552 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13553 return MIN (align, 128);
13556 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13557 static HOST_WIDE_INT
13558 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13560 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13562 /* If the length of the vector is fixed, try to align to that length,
13563 otherwise don't try to align at all. */
13564 HOST_WIDE_INT result;
13565 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13566 result = TYPE_ALIGN (TREE_TYPE (type));
13567 return result;
13569 return TYPE_ALIGN (type);
13572 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13573 static bool
13574 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13576 if (is_packed)
13577 return false;
13579 /* For fixed-length vectors, check that the vectorizer will aim for
13580 full-vector alignment. This isn't true for generic GCC vectors
13581 that are wider than the ABI maximum of 128 bits. */
13582 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13583 && (wi::to_widest (TYPE_SIZE (type))
13584 != aarch64_vectorize_preferred_vector_alignment (type)))
13585 return false;
13587 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13588 return true;
13591 /* Return true if the vector misalignment factor is supported by the
13592 target. */
13593 static bool
13594 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13595 const_tree type, int misalignment,
13596 bool is_packed)
13598 if (TARGET_SIMD && STRICT_ALIGNMENT)
13600 /* Return false if the movmisalign pattern is not supported for this mode. */
13601 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13602 return false;
13604 /* Misalignment factor is unknown at compile time. */
13605 if (misalignment == -1)
13606 return false;
13608 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13609 is_packed);
13612 /* If VALS is a vector constant that can be loaded into a register
13613 using DUP, generate instructions to do so and return an RTX to
13614 assign to the register. Otherwise return NULL_RTX. */
13615 static rtx
13616 aarch64_simd_dup_constant (rtx vals)
13618 machine_mode mode = GET_MODE (vals);
13619 machine_mode inner_mode = GET_MODE_INNER (mode);
13620 rtx x;
13622 if (!const_vec_duplicate_p (vals, &x))
13623 return NULL_RTX;
13625 /* We can load this constant by using DUP and a constant in a
13626 single ARM register. This will be cheaper than a vector
13627 load. */
13628 x = copy_to_mode_reg (inner_mode, x);
13629 return gen_vec_duplicate (mode, x);
13633 /* Generate code to load VALS, which is a PARALLEL containing only
13634 constants (for vec_init) or CONST_VECTOR, efficiently into a
13635 register. Returns an RTX to copy into the register, or NULL_RTX
13636 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13637 static rtx
13638 aarch64_simd_make_constant (rtx vals)
13640 machine_mode mode = GET_MODE (vals);
13641 rtx const_dup;
13642 rtx const_vec = NULL_RTX;
13643 int n_const = 0;
13644 int i;
13646 if (GET_CODE (vals) == CONST_VECTOR)
13647 const_vec = vals;
13648 else if (GET_CODE (vals) == PARALLEL)
13650 /* A CONST_VECTOR must contain only CONST_INTs and
13651 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13652 Only store valid constants in a CONST_VECTOR. */
13653 int n_elts = XVECLEN (vals, 0);
13654 for (i = 0; i < n_elts; ++i)
13656 rtx x = XVECEXP (vals, 0, i);
13657 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13658 n_const++;
13660 if (n_const == n_elts)
13661 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13663 else
13664 gcc_unreachable ();
13666 if (const_vec != NULL_RTX
13667 && aarch64_simd_valid_immediate (const_vec, NULL))
13668 /* Load using MOVI/MVNI. */
13669 return const_vec;
13670 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13671 /* Loaded using DUP. */
13672 return const_dup;
13673 else if (const_vec != NULL_RTX)
13674 /* Load from constant pool. We cannot take advantage of single-cycle
13675 LD1 because we need a PC-relative addressing mode. */
13676 return const_vec;
13677 else
13678 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13679 We cannot construct an initializer. */
13680 return NULL_RTX;
13683 /* Expand a vector initialisation sequence, such that TARGET is
13684 initialised to contain VALS. */
13686 void
13687 aarch64_expand_vector_init (rtx target, rtx vals)
13689 machine_mode mode = GET_MODE (target);
13690 scalar_mode inner_mode = GET_MODE_INNER (mode);
13691 /* The number of vector elements. */
13692 int n_elts = XVECLEN (vals, 0);
13693 /* The number of vector elements which are not constant. */
13694 int n_var = 0;
13695 rtx any_const = NULL_RTX;
13696 /* The first element of vals. */
13697 rtx v0 = XVECEXP (vals, 0, 0);
13698 bool all_same = true;
13700 /* Count the number of variable elements to initialise. */
13701 for (int i = 0; i < n_elts; ++i)
13703 rtx x = XVECEXP (vals, 0, i);
13704 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13705 ++n_var;
13706 else
13707 any_const = x;
13709 all_same &= rtx_equal_p (x, v0);
13712 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13713 how best to handle this. */
13714 if (n_var == 0)
13716 rtx constant = aarch64_simd_make_constant (vals);
13717 if (constant != NULL_RTX)
13719 emit_move_insn (target, constant);
13720 return;
13724 /* Splat a single non-constant element if we can. */
13725 if (all_same)
13727 rtx x = copy_to_mode_reg (inner_mode, v0);
13728 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13729 return;
13732 enum insn_code icode = optab_handler (vec_set_optab, mode);
13733 gcc_assert (icode != CODE_FOR_nothing);
13735 /* If there are only variable elements, try to optimize
13736 the insertion using dup for the most common element
13737 followed by insertions. */
13739 /* The algorithm will fill matches[*][0] with the earliest matching element,
13740 and matches[X][1] with the count of duplicate elements (if X is the
13741 earliest element which has duplicates). */
13743 if (n_var == n_elts && n_elts <= 16)
13745 int matches[16][2] = {0};
13746 for (int i = 0; i < n_elts; i++)
13748 for (int j = 0; j <= i; j++)
13750 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13752 matches[i][0] = j;
13753 matches[j][1]++;
13754 break;
13758 int maxelement = 0;
13759 int maxv = 0;
13760 for (int i = 0; i < n_elts; i++)
13761 if (matches[i][1] > maxv)
13763 maxelement = i;
13764 maxv = matches[i][1];
13767 /* Create a duplicate of the most common element. */
13768 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13769 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13771 /* Insert the rest. */
13772 for (int i = 0; i < n_elts; i++)
13774 rtx x = XVECEXP (vals, 0, i);
13775 if (matches[i][0] == maxelement)
13776 continue;
13777 x = copy_to_mode_reg (inner_mode, x);
13778 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13780 return;
13783 /* Initialise a vector which is part-variable. We want to first try
13784 to build those lanes which are constant in the most efficient way we
13785 can. */
13786 if (n_var != n_elts)
13788 rtx copy = copy_rtx (vals);
13790 /* Load constant part of vector. We really don't care what goes into the
13791 parts we will overwrite, but we're more likely to be able to load the
13792 constant efficiently if it has fewer, larger, repeating parts
13793 (see aarch64_simd_valid_immediate). */
13794 for (int i = 0; i < n_elts; i++)
13796 rtx x = XVECEXP (vals, 0, i);
13797 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13798 continue;
13799 rtx subst = any_const;
13800 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13802 /* Look in the copied vector, as more elements are const. */
13803 rtx test = XVECEXP (copy, 0, i ^ bit);
13804 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13806 subst = test;
13807 break;
13810 XVECEXP (copy, 0, i) = subst;
13812 aarch64_expand_vector_init (target, copy);
13815 /* Insert the variable lanes directly. */
13816 for (int i = 0; i < n_elts; i++)
13818 rtx x = XVECEXP (vals, 0, i);
13819 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13820 continue;
13821 x = copy_to_mode_reg (inner_mode, x);
13822 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
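/* Return a mask of the bits that are significant in a shift count for MODE,
   or zero when shift counts are not truncated (SHIFT_COUNT_TRUNCATED is false
   or MODE is a vector data mode); presumably this backs the
   shift-truncation-mask target hook.  */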
13826 static unsigned HOST_WIDE_INT
13827 aarch64_shift_truncation_mask (machine_mode mode)
13829 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
13830 return 0;
13831 return GET_MODE_UNIT_BITSIZE (mode) - 1;
13834 /* Select a format to encode pointers in exception handling data. */
13836 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
13838 int type;
13839 switch (aarch64_cmodel)
13841 case AARCH64_CMODEL_TINY:
13842 case AARCH64_CMODEL_TINY_PIC:
13843 case AARCH64_CMODEL_SMALL:
13844 case AARCH64_CMODEL_SMALL_PIC:
13845 case AARCH64_CMODEL_SMALL_SPIC:
13846 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
13847 for everything. */
13848 type = DW_EH_PE_sdata4;
13849 break;
13850 default:
13851 /* No assumptions here. 8-byte relocs required. */
13852 type = DW_EH_PE_sdata8;
13853 break;
13855 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
13858 /* The last .arch and .tune assembly strings that we printed. */
13859 static std::string aarch64_last_printed_arch_string;
13860 static std::string aarch64_last_printed_tune_string;
13862 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
13863 by the function fndecl. */
13865 void
13866 aarch64_declare_function_name (FILE *stream, const char* name,
13867 tree fndecl)
13869 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
13871 struct cl_target_option *targ_options;
13872 if (target_parts)
13873 targ_options = TREE_TARGET_OPTION (target_parts);
13874 else
13875 targ_options = TREE_TARGET_OPTION (target_option_current_node);
13876 gcc_assert (targ_options);
13878 const struct processor *this_arch
13879 = aarch64_get_arch (targ_options->x_explicit_arch);
13881 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
13882 std::string extension
13883 = aarch64_get_extension_string_for_isa_flags (isa_flags,
13884 this_arch->flags);
13885 /* Only update the assembler .arch string if it is distinct from the last
13886 such string we printed. */
13887 std::string to_print = this_arch->name + extension;
13888 if (to_print != aarch64_last_printed_arch_string)
13890 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
13891 aarch64_last_printed_arch_string = to_print;
13894 /* Print the cpu name we're tuning for in the comments, might be
13895 useful to readers of the generated asm. Do it only when it changes
13896 from function to function and verbose assembly is requested. */
13897 const struct processor *this_tune
13898 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
13900 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
13902 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
13903 this_tune->name);
13904 aarch64_last_printed_tune_string = this_tune->name;
13907 /* Don't forget the type directive for ELF. */
13908 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
13909 ASM_OUTPUT_LABEL (stream, name);
13912 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
13914 static void
13915 aarch64_start_file (void)
13917 struct cl_target_option *default_options
13918 = TREE_TARGET_OPTION (target_option_default_node);
13920 const struct processor *default_arch
13921 = aarch64_get_arch (default_options->x_explicit_arch);
13922 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
13923 std::string extension
13924 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
13925 default_arch->flags);
13927 aarch64_last_printed_arch_string = default_arch->name + extension;
13928 aarch64_last_printed_tune_string = "";
13929 asm_fprintf (asm_out_file, "\t.arch %s\n",
13930 aarch64_last_printed_arch_string.c_str ());
13932 default_file_start ();
13935 /* Emit load exclusive. */
13937 static void
13938 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
13939 rtx mem, rtx model_rtx)
13941 rtx (*gen) (rtx, rtx, rtx);
13943 switch (mode)
13945 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
13946 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
13947 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
13948 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
13949 default:
13950 gcc_unreachable ();
13953 emit_insn (gen (rval, mem, model_rtx));
13956 /* Emit store exclusive. */
13958 static void
13959 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
13960 rtx rval, rtx mem, rtx model_rtx)
13962 rtx (*gen) (rtx, rtx, rtx, rtx);
13964 switch (mode)
13966 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
13967 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
13968 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
13969 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
13970 default:
13971 gcc_unreachable ();
13974 emit_insn (gen (bval, rval, mem, model_rtx));
13977 /* Mark the previous jump instruction as unlikely. */
13979 static void
13980 aarch64_emit_unlikely_jump (rtx insn)
13982 rtx_insn *jump = emit_jump_insn (insn);
13983 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
13986 /* Expand a compare and swap pattern. */
13988 void
13989 aarch64_expand_compare_and_swap (rtx operands[])
13991 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
13992 machine_mode mode, cmp_mode;
13993 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
13994 int idx;
13995 gen_cas_fn gen;
13996 const gen_cas_fn split_cas[] =
13998 gen_aarch64_compare_and_swapqi,
13999 gen_aarch64_compare_and_swaphi,
14000 gen_aarch64_compare_and_swapsi,
14001 gen_aarch64_compare_and_swapdi
14003 const gen_cas_fn atomic_cas[] =
14005 gen_aarch64_compare_and_swapqi_lse,
14006 gen_aarch64_compare_and_swaphi_lse,
14007 gen_aarch64_compare_and_swapsi_lse,
14008 gen_aarch64_compare_and_swapdi_lse
14011 bval = operands[0];
14012 rval = operands[1];
14013 mem = operands[2];
14014 oldval = operands[3];
14015 newval = operands[4];
14016 is_weak = operands[5];
14017 mod_s = operands[6];
14018 mod_f = operands[7];
14019 mode = GET_MODE (mem);
14020 cmp_mode = mode;
14022 /* Normally the succ memory model must be stronger than fail, but in the
14023 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14024 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14026 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14027 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14028 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14030 switch (mode)
14032 case E_QImode:
14033 case E_HImode:
14034 /* For short modes, we're going to perform the comparison in SImode,
14035 so do the zero-extension now. */
14036 cmp_mode = SImode;
14037 rval = gen_reg_rtx (SImode);
14038 oldval = convert_modes (SImode, mode, oldval, true);
14039 /* Fall through. */
14041 case E_SImode:
14042 case E_DImode:
14043 /* Force the value into a register if needed. */
14044 if (!aarch64_plus_operand (oldval, mode))
14045 oldval = force_reg (cmp_mode, oldval);
14046 break;
14048 default:
14049 gcc_unreachable ();
14052 switch (mode)
14054 case E_QImode: idx = 0; break;
14055 case E_HImode: idx = 1; break;
14056 case E_SImode: idx = 2; break;
14057 case E_DImode: idx = 3; break;
14058 default:
14059 gcc_unreachable ();
14061 if (TARGET_LSE)
14062 gen = atomic_cas[idx];
14063 else
14064 gen = split_cas[idx];
14066 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14068 if (mode == QImode || mode == HImode)
14069 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14071 x = gen_rtx_REG (CCmode, CC_REGNUM);
14072 x = gen_rtx_EQ (SImode, x, const0_rtx);
14073 emit_insn (gen_rtx_SET (bval, x));
14076 /* Test whether the target supports using an atomic load-operate instruction
14077 for operation CODE. Returns FALSE if the operation isn't supported by the
14078 architecture. */
14082 bool
14083 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14085 if (!TARGET_LSE)
14086 return false;
14088 switch (code)
14090 case SET:
14091 case AND:
14092 case IOR:
14093 case XOR:
14094 case MINUS:
14095 case PLUS:
14096 return true;
14097 default:
14098 return false;
14102 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14103 sequence implementing an atomic operation. */
14105 static void
14106 aarch64_emit_post_barrier (enum memmodel model)
14108 const enum memmodel base_model = memmodel_base (model);
14110 if (is_mm_sync (model)
14111 && (base_model == MEMMODEL_ACQUIRE
14112 || base_model == MEMMODEL_ACQ_REL
14113 || base_model == MEMMODEL_SEQ_CST))
14115 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14119 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14120 for the data in memory. EXPECTED is the value expected to be in memory.
14121 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14122 is the memory ordering to use. */
14124 void
14125 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14126 rtx expected, rtx desired,
14127 rtx model)
14129 rtx (*gen) (rtx, rtx, rtx, rtx);
14130 machine_mode mode;
14132 mode = GET_MODE (mem);
14134 switch (mode)
14136 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14137 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14138 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14139 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14140 default:
14141 gcc_unreachable ();
14144 /* Move the expected value into the CAS destination register. */
14145 emit_insn (gen_rtx_SET (rval, expected));
14147 /* Emit the CAS. */
14148 emit_insn (gen (rval, mem, desired, model));
14150 /* Compare the expected value with the value loaded by the CAS, to establish
14151 whether the swap was made. */
14152 aarch64_gen_compare_reg (EQ, rval, expected);
14155 /* Split a compare and swap pattern. */
14157 void
14158 aarch64_split_compare_and_swap (rtx operands[])
14160 rtx rval, mem, oldval, newval, scratch;
14161 machine_mode mode;
14162 bool is_weak;
14163 rtx_code_label *label1, *label2;
14164 rtx x, cond;
14165 enum memmodel model;
14166 rtx model_rtx;
14168 rval = operands[0];
14169 mem = operands[1];
14170 oldval = operands[2];
14171 newval = operands[3];
14172 is_weak = (operands[4] != const0_rtx);
14173 model_rtx = operands[5];
14174 scratch = operands[7];
14175 mode = GET_MODE (mem);
14176 model = memmodel_from_int (INTVAL (model_rtx));
14178 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14179 loop:
14180 .label1:
14181 LD[A]XR rval, [mem]
14182 CBNZ rval, .label2
14183 ST[L]XR scratch, newval, [mem]
14184 CBNZ scratch, .label1
14185 .label2:
14186 CMP rval, 0. */
14187 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14189 label1 = NULL;
14190 if (!is_weak)
14192 label1 = gen_label_rtx ();
14193 emit_label (label1);
14195 label2 = gen_label_rtx ();
14197 /* The initial load can be relaxed for a __sync operation since a final
14198 barrier will be emitted to stop code hoisting. */
14199 if (is_mm_sync (model))
14200 aarch64_emit_load_exclusive (mode, rval, mem,
14201 GEN_INT (MEMMODEL_RELAXED));
14202 else
14203 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14205 if (strong_zero_p)
14207 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14208 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14209 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14210 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14212 else
14214 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14215 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14216 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14217 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14218 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14221 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14223 if (!is_weak)
14225 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14226 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14227 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14228 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14230 else
14232 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14233 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14234 emit_insn (gen_rtx_SET (cond, x));
14237 emit_label (label2);
14238 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14239 to set the condition flags. If this is not used it will be removed by
14240 later passes. */
14241 if (strong_zero_p)
14243 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14244 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14245 emit_insn (gen_rtx_SET (cond, x));
14247 /* Emit any final barrier needed for a __sync operation. */
14248 if (is_mm_sync (model))
14249 aarch64_emit_post_barrier (model);
14252 /* Emit a BIC instruction. */
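/* (Informally, given the operand order used below, this computes
   DST = S1 & ~(S2 >> SHIFT); the atomic AND expansion further down passes
   SHIFT == 0 to get a plain BIC.)  */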
14254 static void
14255 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14257 rtx shift_rtx = GEN_INT (shift);
14258 rtx (*gen) (rtx, rtx, rtx, rtx);
14260 switch (mode)
14262 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14263 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14264 default:
14265 gcc_unreachable ();
14268 emit_insn (gen (dst, s2, shift_rtx, s1));
14271 /* Emit an atomic swap. */
14273 static void
14274 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14275 rtx mem, rtx model)
14277 rtx (*gen) (rtx, rtx, rtx, rtx);
14279 switch (mode)
14281 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14282 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14283 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14284 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14285 default:
14286 gcc_unreachable ();
14289 emit_insn (gen (dst, mem, value, model));
14292 /* Operations supported by aarch64_emit_atomic_load_op. */
14294 enum aarch64_atomic_load_op_code
14296 AARCH64_LDOP_PLUS, /* A + B */
14297 AARCH64_LDOP_XOR, /* A ^ B */
14298 AARCH64_LDOP_OR, /* A | B */
14299 AARCH64_LDOP_BIC /* A & ~B */
14302 /* Emit an atomic load-operate. */
14304 static void
14305 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14306 machine_mode mode, rtx dst, rtx src,
14307 rtx mem, rtx model)
14309 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14310 const aarch64_atomic_load_op_fn plus[] =
14312 gen_aarch64_atomic_loadaddqi,
14313 gen_aarch64_atomic_loadaddhi,
14314 gen_aarch64_atomic_loadaddsi,
14315 gen_aarch64_atomic_loadadddi
14317 const aarch64_atomic_load_op_fn eor[] =
14319 gen_aarch64_atomic_loadeorqi,
14320 gen_aarch64_atomic_loadeorhi,
14321 gen_aarch64_atomic_loadeorsi,
14322 gen_aarch64_atomic_loadeordi
14324 const aarch64_atomic_load_op_fn ior[] =
14326 gen_aarch64_atomic_loadsetqi,
14327 gen_aarch64_atomic_loadsethi,
14328 gen_aarch64_atomic_loadsetsi,
14329 gen_aarch64_atomic_loadsetdi
14331 const aarch64_atomic_load_op_fn bic[] =
14333 gen_aarch64_atomic_loadclrqi,
14334 gen_aarch64_atomic_loadclrhi,
14335 gen_aarch64_atomic_loadclrsi,
14336 gen_aarch64_atomic_loadclrdi
14338 aarch64_atomic_load_op_fn gen;
14339 int idx = 0;
14341 switch (mode)
14343 case E_QImode: idx = 0; break;
14344 case E_HImode: idx = 1; break;
14345 case E_SImode: idx = 2; break;
14346 case E_DImode: idx = 3; break;
14347 default:
14348 gcc_unreachable ();
14351 switch (code)
14353 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14354 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14355 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14356 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14357 default:
14358 gcc_unreachable ();
14361 emit_insn (gen (dst, mem, src, model));
14364 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14365 location to store the data read from memory. OUT_RESULT is the location to
14366 store the result of the operation. MEM is the memory location to read and
14367 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14368 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14369 be NULL. */
14371 void
14372 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14373 rtx mem, rtx value, rtx model_rtx)
14375 machine_mode mode = GET_MODE (mem);
14376 machine_mode wmode = (mode == DImode ? DImode : SImode);
14377 const bool short_mode = (mode < SImode);
14378 aarch64_atomic_load_op_code ldop_code;
14379 rtx src;
14380 rtx x;
14382 if (out_data)
14383 out_data = gen_lowpart (mode, out_data);
14385 if (out_result)
14386 out_result = gen_lowpart (mode, out_result);
14388 /* Make sure the value is in a register, putting it into a destination
14389 register if it needs to be manipulated. */
14390 if (!register_operand (value, mode)
14391 || code == AND || code == MINUS)
14393 src = out_result ? out_result : out_data;
14394 emit_move_insn (src, gen_lowpart (mode, value));
14396 else
14397 src = value;
14398 gcc_assert (register_operand (src, mode));
14400 /* Preprocess the data for the operation as necessary. If the operation is
14401 a SET then emit a swap instruction and finish. */
14402 switch (code)
14404 case SET:
14405 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14406 return;
14408 case MINUS:
14409 /* Negate the value and treat it as a PLUS. */
14411 rtx neg_src;
14413 /* Resize the value if necessary. */
14414 if (short_mode)
14415 src = gen_lowpart (wmode, src);
14417 neg_src = gen_rtx_NEG (wmode, src);
14418 emit_insn (gen_rtx_SET (src, neg_src));
14420 if (short_mode)
14421 src = gen_lowpart (mode, src);
14423 /* Fall-through. */
14424 case PLUS:
14425 ldop_code = AARCH64_LDOP_PLUS;
14426 break;
14428 case IOR:
14429 ldop_code = AARCH64_LDOP_OR;
14430 break;
14432 case XOR:
14433 ldop_code = AARCH64_LDOP_XOR;
14434 break;
14436 case AND:
14438 rtx not_src;
14440 /* Resize the value if necessary. */
14441 if (short_mode)
14442 src = gen_lowpart (wmode, src);
14444 not_src = gen_rtx_NOT (wmode, src);
14445 emit_insn (gen_rtx_SET (src, not_src));
14447 if (short_mode)
14448 src = gen_lowpart (mode, src);
14450 ldop_code = AARCH64_LDOP_BIC;
14451 break;
14453 default:
14454 /* The operation can't be done with atomic instructions. */
14455 gcc_unreachable ();
14458 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14460 /* If necessary, calculate the data in memory after the update by redoing the
14461 operation from values in registers. */
14462 if (!out_result)
14463 return;
14465 if (short_mode)
14467 src = gen_lowpart (wmode, src);
14468 out_data = gen_lowpart (wmode, out_data);
14469 out_result = gen_lowpart (wmode, out_result);
14472 x = NULL_RTX;
14474 switch (code)
14476 case MINUS:
14477 case PLUS:
14478 x = gen_rtx_PLUS (wmode, out_data, src);
14479 break;
14480 case IOR:
14481 x = gen_rtx_IOR (wmode, out_data, src);
14482 break;
14483 case XOR:
14484 x = gen_rtx_XOR (wmode, out_data, src);
14485 break;
14486 case AND:
14487 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14488 return;
14489 default:
14490 gcc_unreachable ();
14493 emit_set_insn (out_result, x);
14495 return;
14498 /* Split an atomic operation. */
14500 void
14501 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14502 rtx value, rtx model_rtx, rtx cond)
14504 machine_mode mode = GET_MODE (mem);
14505 machine_mode wmode = (mode == DImode ? DImode : SImode);
14506 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14507 const bool is_sync = is_mm_sync (model);
14508 rtx_code_label *label;
14509 rtx x;
14511 /* Split the atomic operation into a sequence. */
14512 label = gen_label_rtx ();
14513 emit_label (label);
14515 if (new_out)
14516 new_out = gen_lowpart (wmode, new_out);
14517 if (old_out)
14518 old_out = gen_lowpart (wmode, old_out);
14519 else
14520 old_out = new_out;
14521 value = simplify_gen_subreg (wmode, value, mode, 0);
14523 /* The initial load can be relaxed for a __sync operation since a final
14524 barrier will be emitted to stop code hoisting. */
14525 if (is_sync)
14526 aarch64_emit_load_exclusive (mode, old_out, mem,
14527 GEN_INT (MEMMODEL_RELAXED));
14528 else
14529 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14531 switch (code)
14533 case SET:
14534 new_out = value;
14535 break;
14537 case NOT:
14538 x = gen_rtx_AND (wmode, old_out, value);
14539 emit_insn (gen_rtx_SET (new_out, x));
14540 x = gen_rtx_NOT (wmode, new_out);
14541 emit_insn (gen_rtx_SET (new_out, x));
14542 break;
14544 case MINUS:
14545 if (CONST_INT_P (value))
14547 value = GEN_INT (-INTVAL (value));
14548 code = PLUS;
14550 /* Fall through. */
14552 default:
14553 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14554 emit_insn (gen_rtx_SET (new_out, x));
14555 break;
14558 aarch64_emit_store_exclusive (mode, cond, mem,
14559 gen_lowpart (mode, new_out), model_rtx);
14561 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14562 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14563 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14564 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14566 /* Emit any final barrier needed for a __sync operation. */
14567 if (is_sync)
14568 aarch64_emit_post_barrier (model);
14571 static void
14572 aarch64_init_libfuncs (void)
14574 /* Half-precision float operations. The compiler handles all operations
14575 with NULL libfuncs by converting to SFmode. */
14577 /* Conversions. */
14578 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14579 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14581 /* Arithmetic. */
14582 set_optab_libfunc (add_optab, HFmode, NULL);
14583 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14584 set_optab_libfunc (smul_optab, HFmode, NULL);
14585 set_optab_libfunc (neg_optab, HFmode, NULL);
14586 set_optab_libfunc (sub_optab, HFmode, NULL);
14588 /* Comparisons. */
14589 set_optab_libfunc (eq_optab, HFmode, NULL);
14590 set_optab_libfunc (ne_optab, HFmode, NULL);
14591 set_optab_libfunc (lt_optab, HFmode, NULL);
14592 set_optab_libfunc (le_optab, HFmode, NULL);
14593 set_optab_libfunc (ge_optab, HFmode, NULL);
14594 set_optab_libfunc (gt_optab, HFmode, NULL);
14595 set_optab_libfunc (unord_optab, HFmode, NULL);
14598 /* Target hook for c_mode_for_suffix. */
14599 static machine_mode
14600 aarch64_c_mode_for_suffix (char suffix)
14602 if (suffix == 'q')
14603 return TFmode;
14605 return VOIDmode;
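/* E.g. a literal written with the 'q' suffix, such as 1.0q, is given TFmode,
   AArch64's 128-bit IEEE quad format.  */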
14608 /* We can only represent floating point constants which will fit in
14609 "quarter-precision" values. These values are characterised by
14610 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14613 (-1)^s * (n/16) * 2^r
14615 Where:
14616 's' is the sign bit.
14617 'n' is an integer in the range 16 <= n <= 31.
14618 'r' is an integer in the range -3 <= r <= 4. */
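/* For example, 0.5 = (16/16) * 2^-1 and 1.25 = (20/16) * 2^0 are
   representable, whereas 33.0 would need n = 33 > 31 (or r > 4) and therefore
   is not.  */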
14620 /* Return true iff X can be represented by a quarter-precision
14621 floating-point immediate operand. Note, we cannot represent 0.0. */
14622 bool
14623 aarch64_float_const_representable_p (rtx x)
14625 /* This represents our current view of how many bits
14626 make up the mantissa. */
14627 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14628 int exponent;
14629 unsigned HOST_WIDE_INT mantissa, mask;
14630 REAL_VALUE_TYPE r, m;
14631 bool fail;
14633 if (!CONST_DOUBLE_P (x))
14634 return false;
14636 /* We don't support HFmode constants yet. */
14637 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14638 return false;
14640 r = *CONST_DOUBLE_REAL_VALUE (x);
14642 /* We cannot represent infinities, NaNs or +/-zero. We won't
14643 know if we have +zero until we analyse the mantissa, but we
14644 can reject the other invalid values. */
14645 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14646 || REAL_VALUE_MINUS_ZERO (r))
14647 return false;
14649 /* Extract exponent. */
14650 r = real_value_abs (&r);
14651 exponent = REAL_EXP (&r);
14653 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14654 highest (sign) bit, with a fixed binary point at bit point_pos.
14655 m1 holds the low part of the mantissa, m2 the high part.
14656 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14657 bits for the mantissa, this can fail (low bits will be lost). */
14658 real_ldexp (&m, &r, point_pos - exponent);
14659 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14661 /* If the low part of the mantissa has bits set we cannot represent
14662 the value. */
14663 if (w.ulow () != 0)
14664 return false;
14665 /* We have rejected the lower HOST_WIDE_INT, so update our
14666 understanding of how many bits lie in the mantissa and
14667 look only at the high HOST_WIDE_INT. */
14668 mantissa = w.elt (1);
14669 point_pos -= HOST_BITS_PER_WIDE_INT;
14671 /* We can only represent values with a mantissa of the form 1.xxxx. */
14672 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14673 if ((mantissa & mask) != 0)
14674 return false;
14676 /* Having filtered unrepresentable values, we may now remove all
14677 but the highest 5 bits. */
14678 mantissa >>= point_pos - 5;
14680 /* We cannot represent the value 0.0, so reject it. This is handled
14681 elsewhere. */
14682 if (mantissa == 0)
14683 return false;
14685 /* Then, as bit 4 is always set, we can mask it off, leaving
14686 the mantissa in the range [0, 15]. */
14687 mantissa &= ~(1 << 4);
14688 gcc_assert (mantissa <= 15);
14690 /* GCC internally does not use IEEE754-like encoding (where normalized
14691 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14692 Our mantissa values are shifted 4 places to the left relative to
14693 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14694 by 5 places to correct for GCC's representation. */
14695 exponent = 5 - exponent;
14697 return (exponent >= 0 && exponent <= 7);
14700 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14701 immediate for CONST_VECTOR, a vector constant of WIDTH bits. WHICH selects whether to
14702 output MOVI/MVNI, ORR or BIC immediate. */
14703 char*
14704 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14705 enum simd_immediate_check which)
14707 bool is_valid;
14708 static char templ[40];
14709 const char *mnemonic;
14710 const char *shift_op;
14711 unsigned int lane_count = 0;
14712 char element_char;
14714 struct simd_immediate_info info;
14716 /* This will return true to show const_vector is legal for use as either
14717 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14718 It will also update INFO to show how the immediate should be generated.
14719 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14720 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14721 gcc_assert (is_valid);
14723 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14724 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14726 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14728 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14729 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14730 move immediate path. */
14731 if (aarch64_float_const_zero_rtx_p (info.value))
14732 info.value = GEN_INT (0);
14733 else
14735 const unsigned int buf_size = 20;
14736 char float_buf[buf_size] = {'\0'};
14737 real_to_decimal_for_mode (float_buf,
14738 CONST_DOUBLE_REAL_VALUE (info.value),
14739 buf_size, buf_size, 1, info.elt_mode);
14741 if (lane_count == 1)
14742 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14743 else
14744 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14745 lane_count, element_char, float_buf);
14746 return templ;
14750 gcc_assert (CONST_INT_P (info.value));
14752 if (which == AARCH64_CHECK_MOV)
14754 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14755 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14756 if (lane_count == 1)
14757 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14758 mnemonic, UINTVAL (info.value));
14759 else if (info.shift)
14760 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14761 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14762 element_char, UINTVAL (info.value), shift_op, info.shift);
14763 else
14764 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14765 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14766 element_char, UINTVAL (info.value));
14768 else
14770 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14771 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14772 if (info.shift)
14773 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14774 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14775 element_char, UINTVAL (info.value), "lsl", info.shift);
14776 else
14777 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14778 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14779 element_char, UINTVAL (info.value));
14781 return templ;
14784 char*
14785 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14788 /* If a floating point number was passed and we desire to use it in an
14789 integer mode do the conversion to integer. */
14790 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14792 unsigned HOST_WIDE_INT ival;
14793 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14794 gcc_unreachable ();
14795 immediate = gen_int_mode (ival, mode);
14798 machine_mode vmode;
14799 /* Use a 64-bit mode for everything except DI/DF mode, where we use
14800 a 128-bit vector mode. */
14801 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14803 vmode = aarch64_simd_container_mode (mode, width);
14804 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14805 return aarch64_output_simd_mov_immediate (v_op, width);
14808 /* Return the output string to use for moving immediate CONST_VECTOR
14809 into an SVE register. */
14811 char *
14812 aarch64_output_sve_mov_immediate (rtx const_vector)
14814 static char templ[40];
14815 struct simd_immediate_info info;
14816 char element_char;
14818 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14819 gcc_assert (is_valid);
14821 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14823 if (info.step)
14825 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14826 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14827 element_char, INTVAL (info.value), INTVAL (info.step));
14828 return templ;
14831 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14833 if (aarch64_float_const_zero_rtx_p (info.value))
14834 info.value = GEN_INT (0);
14835 else
14837 const int buf_size = 20;
14838 char float_buf[buf_size] = {};
14839 real_to_decimal_for_mode (float_buf,
14840 CONST_DOUBLE_REAL_VALUE (info.value),
14841 buf_size, buf_size, 1, info.elt_mode);
14843 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14844 element_char, float_buf);
14845 return templ;
14849 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14850 element_char, INTVAL (info.value));
14851 return templ;
14854 /* Return the asm format for a PTRUE instruction whose destination has
14855 mode MODE. SUFFIX is the element size suffix. */
14857 char *
14858 aarch64_output_ptrue (machine_mode mode, char suffix)
14860 unsigned int nunits;
14861 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14862 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
14863 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
14864 else
14865 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
14866 return buf;
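/* E.g. this yields "ptrue\t%0.s, vl4" when the mode is known to have exactly
   four elements, and "ptrue\t%0.s, all" for a variable-length mode.  */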
14869 /* Split operands into moves from op[1] + op[2] into op[0]. */
14871 void
14872 aarch64_split_combinev16qi (rtx operands[3])
14874 unsigned int dest = REGNO (operands[0]);
14875 unsigned int src1 = REGNO (operands[1]);
14876 unsigned int src2 = REGNO (operands[2]);
14877 machine_mode halfmode = GET_MODE (operands[1]);
14878 unsigned int halfregs = REG_NREGS (operands[1]);
14879 rtx destlo, desthi;
14881 gcc_assert (halfmode == V16QImode);
14883 if (src1 == dest && src2 == dest + halfregs)
14885 /* No-op move. Can't split to nothing; emit something. */
14886 emit_note (NOTE_INSN_DELETED);
14887 return;
14890 /* Preserve register attributes for variable tracking. */
14891 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
14892 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
14893 GET_MODE_SIZE (halfmode));
14895 /* Special case of reversed high/low parts. */
14896 if (reg_overlap_mentioned_p (operands[2], destlo)
14897 && reg_overlap_mentioned_p (operands[1], desthi))
14899 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14900 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
14901 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14903 else if (!reg_overlap_mentioned_p (operands[2], destlo))
14905 /* Try to avoid unnecessary moves if part of the result
14906 is in the right place already. */
14907 if (src1 != dest)
14908 emit_move_insn (destlo, operands[1]);
14909 if (src2 != dest + halfregs)
14910 emit_move_insn (desthi, operands[2]);
14912 else
14914 if (src2 != dest + halfregs)
14915 emit_move_insn (desthi, operands[2]);
14916 if (src1 != dest)
14917 emit_move_insn (destlo, operands[1]);
14921 /* vec_perm support. */
14923 struct expand_vec_perm_d
14925 rtx target, op0, op1;
14926 vec_perm_indices perm;
14927 machine_mode vmode;
14928 unsigned int vec_flags;
14929 bool one_vector_p;
14930 bool testing_p;
14933 /* Generate a variable permutation. */
14935 static void
14936 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
14938 machine_mode vmode = GET_MODE (target);
14939 bool one_vector_p = rtx_equal_p (op0, op1);
14941 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
14942 gcc_checking_assert (GET_MODE (op0) == vmode);
14943 gcc_checking_assert (GET_MODE (op1) == vmode);
14944 gcc_checking_assert (GET_MODE (sel) == vmode);
14945 gcc_checking_assert (TARGET_SIMD);
14947 if (one_vector_p)
14949 if (vmode == V8QImode)
14951 /* Expand the argument to a V16QI mode by duplicating it. */
14952 rtx pair = gen_reg_rtx (V16QImode);
14953 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
14954 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14956 else
14958 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
14961 else
14963 rtx pair;
14965 if (vmode == V8QImode)
14967 pair = gen_reg_rtx (V16QImode);
14968 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
14969 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
14971 else
14973 pair = gen_reg_rtx (OImode);
14974 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
14975 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
14980 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
14981 NELT is the number of elements in the vector. */
14983 void
14984 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
14985 unsigned int nelt)
14987 machine_mode vmode = GET_MODE (target);
14988 bool one_vector_p = rtx_equal_p (op0, op1);
14989 rtx mask;
14991 /* The TBL instruction does not use a modulo index, so we must take care
14992 of that ourselves. */
14993 mask = aarch64_simd_gen_const_vector_dup (vmode,
14994 one_vector_p ? nelt - 1 : 2 * nelt - 1);
14995 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
14997 /* For big-endian, we also need to reverse the index within the vector
14998 (but not which vector). */
14999 if (BYTES_BIG_ENDIAN)
15001 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15002 if (!one_vector_p)
15003 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15004 sel = expand_simple_binop (vmode, XOR, sel, mask,
15005 NULL, 0, OPTAB_LIB_WIDEN);
15007 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15010 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15012 static void
15013 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15015 emit_insn (gen_rtx_SET (target,
15016 gen_rtx_UNSPEC (GET_MODE (target),
15017 gen_rtvec (2, op0, op1), code)));
15020 /* Expand an SVE vec_perm with the given operands. */
15022 void
15023 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15025 machine_mode data_mode = GET_MODE (target);
15026 machine_mode sel_mode = GET_MODE (sel);
15027 /* Enforced by the pattern condition. */
15028 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15030 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15031 size of the two value vectors, i.e. the upper bits of the indices
15032 are effectively ignored. SVE TBL instead produces 0 for any
15033 out-of-range indices, so we need to modulo all the vec_perm indices
15034 to ensure they are all in range. */
15035 rtx sel_reg = force_reg (sel_mode, sel);
15037 /* Check if the sel only references the first values vector. */
15038 if (GET_CODE (sel) == CONST_VECTOR
15039 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15041 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15042 return;
15045 /* Check if the two values vectors are the same. */
15046 if (rtx_equal_p (op0, op1))
15048 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15049 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15050 NULL, 0, OPTAB_DIRECT);
15051 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15052 return;
15055 /* Run TBL on each value vector and combine the results. */
15057 rtx res0 = gen_reg_rtx (data_mode);
15058 rtx res1 = gen_reg_rtx (data_mode);
15059 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15060 if (GET_CODE (sel) != CONST_VECTOR
15061 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15063 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15064 2 * nunits - 1);
15065 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15066 NULL, 0, OPTAB_DIRECT);
15068 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15069 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15070 NULL, 0, OPTAB_DIRECT);
15071 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15072 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15073 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15074 else
15075 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15078 /* Recognize patterns suitable for the TRN instructions. */
15079 static bool
15080 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15082 HOST_WIDE_INT odd;
15083 poly_uint64 nelt = d->perm.length ();
15084 rtx out, in0, in1, x;
15085 machine_mode vmode = d->vmode;
15087 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15088 return false;
15090 /* Note that these are little-endian tests.
15091 We correct for big-endian later. */
15092 if (!d->perm[0].is_constant (&odd)
15093 || (odd != 0 && odd != 1)
15094 || !d->perm.series_p (0, 2, odd, 2)
15095 || !d->perm.series_p (1, 2, nelt + odd, 2))
15096 return false;
15098 /* Success! */
15099 if (d->testing_p)
15100 return true;
15102 in0 = d->op0;
15103 in1 = d->op1;
15104 /* We don't need a big-endian lane correction for SVE; see the comment
15105 at the head of aarch64-sve.md for details. */
15106 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15108 x = in0, in0 = in1, in1 = x;
15109 odd = !odd;
15111 out = d->target;
15113 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15114 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15115 return true;
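/* E.g. for V4SI (before any big-endian correction) TRN1 corresponds to the
   permutation { 0, 4, 2, 6 } and TRN2 to { 1, 5, 3, 7 }.  */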
15118 /* Recognize patterns suitable for the UZP instructions. */
15119 static bool
15120 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15122 HOST_WIDE_INT odd;
15123 rtx out, in0, in1, x;
15124 machine_mode vmode = d->vmode;
15126 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15127 return false;
15129 /* Note that these are little-endian tests.
15130 We correct for big-endian later. */
15131 if (!d->perm[0].is_constant (&odd)
15132 || (odd != 0 && odd != 1)
15133 || !d->perm.series_p (0, 1, odd, 2))
15134 return false;
15136 /* Success! */
15137 if (d->testing_p)
15138 return true;
15140 in0 = d->op0;
15141 in1 = d->op1;
15142 /* We don't need a big-endian lane correction for SVE; see the comment
15143 at the head of aarch64-sve.md for details. */
15144 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15146 x = in0, in0 = in1, in1 = x;
15147 odd = !odd;
15149 out = d->target;
15151 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15152 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15153 return true;
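/* E.g. for V4SI, UZP1 corresponds to the permutation { 0, 2, 4, 6 } and UZP2
   to { 1, 3, 5, 7 }.  */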
15156 /* Recognize patterns suitable for the ZIP instructions. */
15157 static bool
15158 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15160 unsigned int high;
15161 poly_uint64 nelt = d->perm.length ();
15162 rtx out, in0, in1, x;
15163 machine_mode vmode = d->vmode;
15165 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15166 return false;
15168 /* Note that these are little-endian tests.
15169 We correct for big-endian later. */
15170 poly_uint64 first = d->perm[0];
15171 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15172 || !d->perm.series_p (0, 2, first, 1)
15173 || !d->perm.series_p (1, 2, first + nelt, 1))
15174 return false;
15175 high = maybe_ne (first, 0U);
15177 /* Success! */
15178 if (d->testing_p)
15179 return true;
15181 in0 = d->op0;
15182 in1 = d->op1;
15183 /* We don't need a big-endian lane correction for SVE; see the comment
15184 at the head of aarch64-sve.md for details. */
15185 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15187 x = in0, in0 = in1, in1 = x;
15188 high = !high;
15190 out = d->target;
15192 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15193 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15194 return true;
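/* E.g. for V4SI, ZIP1 corresponds to the permutation { 0, 4, 1, 5 } and ZIP2
   to { 2, 6, 3, 7 }.  */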
15197 /* Recognize patterns for the EXT insn. */
15199 static bool
15200 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15202 HOST_WIDE_INT location;
15203 rtx offset;
15205 /* The first element always refers to the first vector.
15206 Check if the extracted indices are increasing by one. */
15207 if (d->vec_flags == VEC_SVE_PRED
15208 || !d->perm[0].is_constant (&location)
15209 || !d->perm.series_p (0, 1, location, 1))
15210 return false;
15212 /* Success! */
15213 if (d->testing_p)
15214 return true;
15216 /* The case where (location == 0) is a no-op for both big- and little-endian,
15217 and is removed by the mid-end at optimization levels -O1 and higher.
15219 We don't need a big-endian lane correction for SVE; see the comment
15220 at the head of aarch64-sve.md for details. */
15221 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15223 /* After setup, we want the high elements of the first vector (stored
15224 at the LSB end of the register), and the low elements of the second
15225 vector (stored at the MSB end of the register). So swap. */
15226 std::swap (d->op0, d->op1);
15227 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15228 to_constant () is safe since this is restricted to Advanced SIMD
15229 vectors. */
15230 location = d->perm.length ().to_constant () - location;
15233 offset = GEN_INT (location);
15234 emit_set_insn (d->target,
15235 gen_rtx_UNSPEC (d->vmode,
15236 gen_rtvec (3, d->op0, d->op1, offset),
15237 UNSPEC_EXT));
15238 return true;
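/* E.g. for V4SI the permutation { 1, 2, 3, 4 } is matched here as an EXT of
   the two input vectors with LOCATION == 1.  */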
15241 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15242 within each 64-bit, 32-bit or 16-bit granule. */
15244 static bool
15245 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15247 HOST_WIDE_INT diff;
15248 unsigned int i, size, unspec;
15249 machine_mode pred_mode;
15251 if (d->vec_flags == VEC_SVE_PRED
15252 || !d->one_vector_p
15253 || !d->perm[0].is_constant (&diff))
15254 return false;
15256 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15257 if (size == 8)
15259 unspec = UNSPEC_REV64;
15260 pred_mode = VNx2BImode;
15262 else if (size == 4)
15264 unspec = UNSPEC_REV32;
15265 pred_mode = VNx4BImode;
15267 else if (size == 2)
15269 unspec = UNSPEC_REV16;
15270 pred_mode = VNx8BImode;
15272 else
15273 return false;
15275 unsigned int step = diff + 1;
15276 for (i = 0; i < step; ++i)
15277 if (!d->perm.series_p (i, step, diff - i, step))
15278 return false;
15280 /* Success! */
15281 if (d->testing_p)
15282 return true;
15284 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15285 if (d->vec_flags == VEC_SVE_DATA)
15287 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15288 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15289 UNSPEC_MERGE_PTRUE);
15291 emit_set_insn (d->target, src);
15292 return true;
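/* E.g. the V8HI permutation { 1, 0, 3, 2, 5, 4, 7, 6 } has diff == 1 and a
   2-byte unit size, giving SIZE == 4, and is therefore matched as REV32.  */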
15295 /* Recognize patterns for the REV insn, which reverses elements within
15296 a full vector. */
15298 static bool
15299 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15301 poly_uint64 nelt = d->perm.length ();
15303 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15304 return false;
15306 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15307 return false;
15309 /* Success! */
15310 if (d->testing_p)
15311 return true;
15313 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15314 emit_set_insn (d->target, src);
15315 return true;
15318 static bool
15319 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15321 rtx out = d->target;
15322 rtx in0;
15323 HOST_WIDE_INT elt;
15324 machine_mode vmode = d->vmode;
15325 rtx lane;
15327 if (d->vec_flags == VEC_SVE_PRED
15328 || d->perm.encoding ().encoded_nelts () != 1
15329 || !d->perm[0].is_constant (&elt))
15330 return false;
15332 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15333 return false;
15335 /* Success! */
15336 if (d->testing_p)
15337 return true;
15339 /* The generic preparation in aarch64_expand_vec_perm_const_1
15340 swaps the operand order and the permute indices if it finds
15341 d->perm[0] to be in the second operand. Thus, we can always
15342 use d->op0 and need not do any extra arithmetic to get the
15343 correct lane number. */
15344 in0 = d->op0;
15345 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15347 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15348 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15349 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15350 return true;
15353 static bool
15354 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15356 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15357 machine_mode vmode = d->vmode;
15359 /* Make sure that the indices are constant. */
15360 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15361 for (unsigned int i = 0; i < encoded_nelts; ++i)
15362 if (!d->perm[i].is_constant ())
15363 return false;
15365 if (d->testing_p)
15366 return true;
15368 /* Generic code will try constant permutation twice. Once with the
15369 original mode and again with the elements lowered to QImode.
15370 So wait and don't do the selector expansion ourselves. */
15371 if (vmode != V8QImode && vmode != V16QImode)
15372 return false;
15374 /* to_constant is safe since this routine is specific to Advanced SIMD
15375 vectors. */
15376 unsigned int nelt = d->perm.length ().to_constant ();
15377 for (unsigned int i = 0; i < nelt; ++i)
15378 /* If big-endian and two vectors we end up with a weird mixed-endian
15379 mode on NEON. Reverse the index within each word but not the word
15380 itself. to_constant is safe because we checked is_constant above. */
15381 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15382 ? d->perm[i].to_constant () ^ (nelt - 1)
15383 : d->perm[i].to_constant ());
15385 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15386 sel = force_reg (vmode, sel);
15388 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15389 return true;
15392 /* Try to implement D using an SVE TBL instruction. */
15394 static bool
15395 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15397 unsigned HOST_WIDE_INT nelt;
15399 /* Permuting two variable-length vectors could overflow the
15400 index range. */
15401 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15402 return false;
15404 if (d->testing_p)
15405 return true;
15407 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15408 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15409 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15410 return true;
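/* Try to expand the constant permute described by D.  First normalize the
   operand order, then try each of the special-purpose expanders above
   before falling back to a TBL-based sequence.  Return true on success.  */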
15413 static bool
15414 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15416 /* The pattern matching functions above are written to look for a small
15417 number to begin the sequence (0, 1, N/2). If we begin with an index
15418 from the second operand, we can swap the operands. */
15419 poly_int64 nelt = d->perm.length ();
15420 if (known_ge (d->perm[0], nelt))
15422 d->perm.rotate_inputs (1);
15423 std::swap (d->op0, d->op1);
15426 if ((d->vec_flags == VEC_ADVSIMD
15427 || d->vec_flags == VEC_SVE_DATA
15428 || d->vec_flags == VEC_SVE_PRED)
15429 && known_gt (nelt, 1))
15431 if (aarch64_evpc_rev_local (d))
15432 return true;
15433 else if (aarch64_evpc_rev_global (d))
15434 return true;
15435 else if (aarch64_evpc_ext (d))
15436 return true;
15437 else if (aarch64_evpc_dup (d))
15438 return true;
15439 else if (aarch64_evpc_zip (d))
15440 return true;
15441 else if (aarch64_evpc_uzp (d))
15442 return true;
15443 else if (aarch64_evpc_trn (d))
15444 return true;
15445 if (d->vec_flags == VEC_SVE_DATA)
15446 return aarch64_evpc_sve_tbl (d);
15447 else if (d->vec_flags == VEC_ADVSIMD)
15448 return aarch64_evpc_tbl (d);
15450 return false;
15453 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15455 static bool
15456 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15457 rtx op1, const vec_perm_indices &sel)
15459 struct expand_vec_perm_d d;
15461 /* Check whether the mask can be applied to a single vector. */
15462 if (op0 && rtx_equal_p (op0, op1))
15463 d.one_vector_p = true;
15464 else if (sel.all_from_input_p (0))
15466 d.one_vector_p = true;
15467 op1 = op0;
15469 else if (sel.all_from_input_p (1))
15471 d.one_vector_p = true;
15472 op0 = op1;
15474 else
15475 d.one_vector_p = false;
15477 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15478 sel.nelts_per_input ());
15479 d.vmode = vmode;
15480 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15481 d.target = target;
15482 d.op0 = op0;
15483 d.op1 = op1;
15484 d.testing_p = !target;
15486 if (!d.testing_p)
15487 return aarch64_expand_vec_perm_const_1 (&d);
15489 rtx_insn *last = get_last_insn ();
15490 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15491 gcc_assert (last == get_last_insn ());
15493 return ret;
15496 /* Generate a byte permute mask for a register of mode MODE,
15497 which has NUNITS units. */
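/* For example, for V4SImode (NUNITS == 4, unit size 4) the mask built below
   is { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. it
   reverses the bytes within each element but keeps the elements in place.  */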
15500 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15502 /* We have to reverse each vector because we don't have
15503 a permuted load that can reverse-load according to ABI rules. */
15504 rtx mask;
15505 rtvec v = rtvec_alloc (16);
15506 unsigned int i, j;
15507 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15509 gcc_assert (BYTES_BIG_ENDIAN);
15510 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15512 for (i = 0; i < nunits; i++)
15513 for (j = 0; j < usize; j++)
15514 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15515 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15516 return force_reg (V16QImode, mask);
15519 /* Return true if X is a valid second operand for the SVE instruction
15520 that implements integer comparison OP_CODE. */
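/* Note: the unsigned comparisons accept the SVE unsigned compare-immediate
   range and the signed ones the signed range (roughly a 7-bit unsigned and
   a 5-bit signed immediate respectively); anything outside those ranges is
   forced into a register by the callers.  */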
15522 static bool
15523 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15525 if (register_operand (x, VOIDmode))
15526 return true;
15528 switch (op_code)
15530 case LTU:
15531 case LEU:
15532 case GEU:
15533 case GTU:
15534 return aarch64_sve_cmp_immediate_p (x, false);
15535 case LT:
15536 case LE:
15537 case GE:
15538 case GT:
15539 case NE:
15540 case EQ:
15541 return aarch64_sve_cmp_immediate_p (x, true);
15542 default:
15543 gcc_unreachable ();
15547 /* Return the UNSPEC_COND_* code for comparison CODE. */
15549 static unsigned int
15550 aarch64_unspec_cond_code (rtx_code code)
15552 switch (code)
15554 case NE:
15555 return UNSPEC_COND_NE;
15556 case EQ:
15557 return UNSPEC_COND_EQ;
15558 case LT:
15559 return UNSPEC_COND_LT;
15560 case GT:
15561 return UNSPEC_COND_GT;
15562 case LE:
15563 return UNSPEC_COND_LE;
15564 case GE:
15565 return UNSPEC_COND_GE;
15566 case LTU:
15567 return UNSPEC_COND_LO;
15568 case GTU:
15569 return UNSPEC_COND_HI;
15570 case LEU:
15571 return UNSPEC_COND_LS;
15572 case GEU:
15573 return UNSPEC_COND_HS;
15574 case UNORDERED:
15575 return UNSPEC_COND_UO;
15576 default:
15577 gcc_unreachable ();
15581 /* Return an (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>) expression,
15582 where <X> is the operation associated with comparison CODE. */
15584 static rtx
15585 aarch64_gen_unspec_cond (rtx_code code, machine_mode pred_mode,
15586 rtx pred, rtx op0, rtx op1)
15588 rtvec vec = gen_rtvec (3, pred, op0, op1);
15589 return gen_rtx_UNSPEC (pred_mode, vec, aarch64_unspec_cond_code (code));
15592 /* Expand an SVE integer comparison:
15594 TARGET = CODE (OP0, OP1). */
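/* A sketch of the expansion: for CODE == EQ this emits a predicated compare
   such as "cmpeq p0.s, p1/z, z0.s, z1.s" governed by an all-true predicate,
   with the condition flags clobbered rather than usefully set.  */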
15596 void
15597 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15599 machine_mode pred_mode = GET_MODE (target);
15600 machine_mode data_mode = GET_MODE (op0);
15602 if (!aarch64_sve_cmp_operand_p (code, op1))
15603 op1 = force_reg (data_mode, op1);
15605 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15606 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, ptrue, op0, op1);
15607 emit_insn (gen_set_clobber_cc (target, unspec));
15610 /* Emit an instruction:
15612 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15614 where <X> is the operation associated with comparison CODE. */
15616 static void
15617 aarch64_emit_unspec_cond (rtx target, rtx_code code, machine_mode pred_mode,
15618 rtx pred, rtx op0, rtx op1)
15620 rtx unspec = aarch64_gen_unspec_cond (code, pred_mode, pred, op0, op1);
15621 emit_set_insn (target, unspec);
15624 /* Emit:
15626 (set TMP1 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X1>))
15627 (set TMP2 (unspec:PRED_MODE [PTRUE OP0 OP1] UNSPEC_COND_<X2>))
15628 (set TARGET (and:PRED_MODE (ior:PRED_MODE TMP1 TMP2) PTRUE))
15630 where <Xi> is the operation associated with comparison CODEi. */
15632 static void
15633 aarch64_emit_unspec_cond_or (rtx target, rtx_code code1, rtx_code code2,
15634 machine_mode pred_mode, rtx ptrue,
15635 rtx op0, rtx op1)
15637 rtx tmp1 = gen_reg_rtx (pred_mode);
15638 aarch64_emit_unspec_cond (tmp1, code1, pred_mode, ptrue, op0, op1);
15639 rtx tmp2 = gen_reg_rtx (pred_mode);
15640 aarch64_emit_unspec_cond (tmp2, code2, pred_mode, ptrue, op0, op1);
15641 emit_set_insn (target, gen_rtx_AND (pred_mode,
15642 gen_rtx_IOR (pred_mode, tmp1, tmp2),
15643 ptrue));
15646 /* If CAN_INVERT_P, emit an instruction:
15648 (set TARGET (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15650 where <X> is the operation associated with comparison CODE. Otherwise
15651 emit:
15653 (set TMP (unspec:PRED_MODE [PRED OP0 OP1] UNSPEC_COND_<X>))
15654 (set TARGET (and:PRED_MODE (not:PRED_MODE TMP) PTRUE))
15656 where the second instruction sets TARGET to the inverse of TMP. */
15658 static void
15659 aarch64_emit_inverted_unspec_cond (rtx target, rtx_code code,
15660 machine_mode pred_mode, rtx ptrue, rtx pred,
15661 rtx op0, rtx op1, bool can_invert_p)
15663 if (can_invert_p)
15664 aarch64_emit_unspec_cond (target, code, pred_mode, pred, op0, op1);
15665 else
15667 rtx tmp = gen_reg_rtx (pred_mode);
15668 aarch64_emit_unspec_cond (tmp, code, pred_mode, pred, op0, op1);
15669 emit_set_insn (target, gen_rtx_AND (pred_mode,
15670 gen_rtx_NOT (pred_mode, tmp),
15671 ptrue));
15675 /* Expand an SVE floating-point comparison:
15677 TARGET = CODE (OP0, OP1)
15679 If CAN_INVERT_P is true, the caller can also handle inverted results;
15680 return true if the result is in fact inverted. */
15682 bool
15683 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15684 rtx op0, rtx op1, bool can_invert_p)
15686 machine_mode pred_mode = GET_MODE (target);
15687 machine_mode data_mode = GET_MODE (op0);
15689 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15690 switch (code)
15692 case UNORDERED:
15693 /* UNORDERED has no immediate form. */
15694 op1 = force_reg (data_mode, op1);
15695 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15696 return false;
15698 case LT:
15699 case LE:
15700 case GT:
15701 case GE:
15702 case EQ:
15703 case NE:
15704 /* There is native support for the comparison. */
15705 aarch64_emit_unspec_cond (target, code, pred_mode, ptrue, op0, op1);
15706 return false;
15708 case ORDERED:
15709 /* There is native support for the inverse comparison. */
15710 op1 = force_reg (data_mode, op1);
15711 aarch64_emit_inverted_unspec_cond (target, UNORDERED,
15712 pred_mode, ptrue, ptrue, op0, op1,
15713 can_invert_p);
15714 return can_invert_p;
15716 case LTGT:
15717 /* This is a trapping operation (LT or GT). */
15718 aarch64_emit_unspec_cond_or (target, LT, GT, pred_mode, ptrue, op0, op1);
15719 return false;
15721 case UNEQ:
15722 if (!flag_trapping_math)
15724 /* This would trap for signaling NaNs. */
15725 op1 = force_reg (data_mode, op1);
15726 aarch64_emit_unspec_cond_or (target, UNORDERED, EQ,
15727 pred_mode, ptrue, op0, op1);
15728 return false;
15730 /* fall through */
15732 case UNLT:
15733 case UNLE:
15734 case UNGT:
15735 case UNGE:
15737 rtx ordered = ptrue;
15738 if (flag_trapping_math)
15740 /* Only compare the elements that are known to be ordered. */
15741 ordered = gen_reg_rtx (pred_mode);
15742 op1 = force_reg (data_mode, op1);
15743 aarch64_emit_inverted_unspec_cond (ordered, UNORDERED, pred_mode,
15744 ptrue, ptrue, op0, op1, false);
15746 if (code == UNEQ)
15747 code = NE;
15748 else
15749 code = reverse_condition_maybe_unordered (code);
15750 aarch64_emit_inverted_unspec_cond (target, code, pred_mode, ptrue,
15751 ordered, op0, op1, can_invert_p);
15752 return can_invert_p;
15755 default:
15756 gcc_unreachable ();
15760 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15761 of the data being selected and CMP_MODE is the mode of the values being
15762 compared. */
15764 void
15765 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15766 rtx *ops)
15768 machine_mode pred_mode
15769 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15770 GET_MODE_SIZE (cmp_mode)).require ();
15771 rtx pred = gen_reg_rtx (pred_mode);
15772 if (FLOAT_MODE_P (cmp_mode))
15774 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15775 ops[4], ops[5], true))
15776 std::swap (ops[1], ops[2]);
15778 else
15779 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15781 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15782 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15785 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15786 true. However, due to issues with register allocation, it is preferable
15787 to avoid tying integer scalar and FP scalar modes. Executing integer
15788 operations in general registers is better than treating them as scalar
15789 vector operations. This reduces latency and avoids redundant int<->FP
15790 moves. So tie modes if they are either the same class, or vector modes
15791 with other vector modes, vector structs or any scalar mode. */
15793 static bool
15794 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15796 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15797 return true;
15799 /* We specifically want to allow elements of "structure" modes to
15800 be tieable to the structure. This more general condition allows
15801 other rarer situations too. The reason we don't extend this to
15802 predicate modes is that there are no predicate structure modes
15803 nor any specific instructions for extracting part of a predicate
15804 register. */
15805 if (aarch64_vector_data_mode_p (mode1)
15806 && aarch64_vector_data_mode_p (mode2))
15807 return true;
15809 /* Also allow any scalar modes with vectors. */
15810 if (aarch64_vector_mode_supported_p (mode1)
15811 || aarch64_vector_mode_supported_p (mode2))
15812 return true;
15814 return false;
15817 /* Return a new RTX holding the result of moving POINTER forward by
15818 AMOUNT bytes. */
15820 static rtx
15821 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15823 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15825 return adjust_automodify_address (pointer, GET_MODE (pointer),
15826 next, amount);
15829 /* Return a new RTX holding the result of moving POINTER forward by the
15830 size of the mode it points to. */
15832 static rtx
15833 aarch64_progress_pointer (rtx pointer)
15835 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15838 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
15839 MODE bytes. */
15841 static void
15842 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15843 machine_mode mode)
15845 rtx reg = gen_reg_rtx (mode);
15847 /* "Cast" the pointers to the correct mode. */
15848 *src = adjust_address (*src, mode, 0);
15849 *dst = adjust_address (*dst, mode, 0);
15850 /* Emit the memcpy. */
15851 emit_move_insn (reg, *src);
15852 emit_move_insn (*dst, reg);
15853 /* Move the pointers forward. */
15854 *src = aarch64_progress_pointer (*src);
15855 *dst = aarch64_progress_pointer (*dst);
15858 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15859 we succeed, otherwise return false. */
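/* For example, with a constant size of 23 bytes this emits one 16-byte
   (TImode) copy followed by an 8-byte (DImode) copy whose start is moved
   back by one byte so that it overlaps the first chunk, giving just two
   loads and two stores in total.  */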
15861 bool
15862 aarch64_expand_movmem (rtx *operands)
15864 unsigned int n;
15865 rtx dst = operands[0];
15866 rtx src = operands[1];
15867 rtx base;
15868 bool speed_p = !optimize_function_for_size_p (cfun);
15870 /* When optimizing for size, give a better estimate of the length of a
15871 memcpy call, but use the default otherwise. */
15872 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
15874 /* We can't do anything smart if the amount to copy is not constant. */
15875 if (!CONST_INT_P (operands[2]))
15876 return false;
15878 n = UINTVAL (operands[2]);
15880 /* Try to keep the number of instructions low. For cases below 16 bytes we
15881 need to make at most two moves. For cases above 16 bytes it will be one
15882 move for each 16 byte chunk, then at most two additional moves. */
15883 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
15884 return false;
15886 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
15887 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
15889 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
15890 src = adjust_automodify_address (src, VOIDmode, base, 0);
15892 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
15893 1-byte chunk. */
15894 if (n < 4)
15896 if (n >= 2)
15898 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
15899 n -= 2;
15902 if (n == 1)
15903 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
15905 return true;
15908 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
15909 4-byte chunk, partially overlapping with the previously copied chunk. */
15910 if (n < 8)
15912 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15913 n -= 4;
15914 if (n > 0)
15916 int move = n - 4;
15918 src = aarch64_move_pointer (src, move);
15919 dst = aarch64_move_pointer (dst, move);
15920 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15922 return true;
15925 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
15926 them, then (if applicable) an 8-byte chunk. */
15927 while (n >= 8)
15929 if (n / 16)
15931 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
15932 n -= 16;
15934 else
15936 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
15937 n -= 8;
15941 /* Finish the final bytes of the copy. We can always do this in one
15942 instruction. We either copy the exact amount we need, or partially
15943 overlap with the previous chunk we copied and copy 8 bytes. */
15944 if (n == 0)
15945 return true;
15946 else if (n == 1)
15947 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
15948 else if (n == 2)
15949 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
15950 else if (n == 4)
15951 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15952 else
15954 if (n == 3)
15956 src = aarch64_move_pointer (src, -1);
15957 dst = aarch64_move_pointer (dst, -1);
15958 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
15960 else
15962 int move = n - 8;
15964 src = aarch64_move_pointer (src, move);
15965 dst = aarch64_move_pointer (dst, move);
15966 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
15970 return true;
15973 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
15974 SImode stores. Handle the case when the constant has identical
15975 bottom and top halves. This is beneficial when the two stores can be
15976 merged into an STP and we avoid synthesising potentially expensive
15977 immediates twice. Return true if such a split is possible. */
15979 bool
15980 aarch64_split_dimode_const_store (rtx dst, rtx src)
15982 rtx lo = gen_lowpart (SImode, src);
15983 rtx hi = gen_highpart_mode (SImode, DImode, src);
15985 bool size_p = optimize_function_for_size_p (cfun);
15987 if (!rtx_equal_p (lo, hi))
15988 return false;
15990 unsigned int orig_cost
15991 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
15992 unsigned int lo_cost
15993 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
15995 /* We want to transform:
15996 MOV x1, 49370
15997 MOVK x1, 0x140, lsl 16
15998 MOVK x1, 0xc0da, lsl 32
15999 MOVK x1, 0x140, lsl 48
16000 STR x1, [x0]
16001 into:
16002 MOV w1, 49370
16003 MOVK w1, 0x140, lsl 16
16004 STP w1, w1, [x0]
16005 So we want to perform this only when we save two instructions
16006 or more. When optimizing for size, however, accept any code size
16007 savings we can. */
16008 if (size_p && orig_cost <= lo_cost)
16009 return false;
16011 if (!size_p
16012 && (orig_cost <= lo_cost + 1))
16013 return false;
16015 rtx mem_lo = adjust_address (dst, SImode, 0);
16016 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16017 return false;
16019 rtx tmp_reg = gen_reg_rtx (SImode);
16020 aarch64_expand_mov_immediate (tmp_reg, lo);
16021 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16022 /* Don't emit an explicit store pair as this may not always be profitable.
16023 Let the sched-fusion logic decide whether to merge them. */
16024 emit_move_insn (mem_lo, tmp_reg);
16025 emit_move_insn (mem_hi, tmp_reg);
16027 return true;
16030 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16032 static unsigned HOST_WIDE_INT
16033 aarch64_asan_shadow_offset (void)
16035 return (HOST_WIDE_INT_1 << 36);
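/* Implement TARGET_GEN_CCMP_FIRST.  Generate the first compare of a
   conditional-compare (CCMP) sequence for the comparison CODE (TREEOP0,
   TREEOP1), recording the preparation and compare instructions in *PREP_SEQ
   and *GEN_SEQ.  Return the comparison of the CC register against zero that
   later elements of the sequence should test, or NULL_RTX on failure.  */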
16038 static rtx
16039 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16040 int code, tree treeop0, tree treeop1)
16042 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16043 rtx op0, op1;
16044 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16045 insn_code icode;
16046 struct expand_operand ops[4];
16048 start_sequence ();
16049 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16051 op_mode = GET_MODE (op0);
16052 if (op_mode == VOIDmode)
16053 op_mode = GET_MODE (op1);
16055 switch (op_mode)
16057 case E_QImode:
16058 case E_HImode:
16059 case E_SImode:
16060 cmp_mode = SImode;
16061 icode = CODE_FOR_cmpsi;
16062 break;
16064 case E_DImode:
16065 cmp_mode = DImode;
16066 icode = CODE_FOR_cmpdi;
16067 break;
16069 case E_SFmode:
16070 cmp_mode = SFmode;
16071 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16072 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16073 break;
16075 case E_DFmode:
16076 cmp_mode = DFmode;
16077 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16078 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16079 break;
16081 default:
16082 end_sequence ();
16083 return NULL_RTX;
16086 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16087 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16088 if (!op0 || !op1)
16090 end_sequence ();
16091 return NULL_RTX;
16093 *prep_seq = get_insns ();
16094 end_sequence ();
16096 create_fixed_operand (&ops[0], op0);
16097 create_fixed_operand (&ops[1], op1);
16099 start_sequence ();
16100 if (!maybe_expand_insn (icode, 2, ops))
16102 end_sequence ();
16103 return NULL_RTX;
16105 *gen_seq = get_insns ();
16106 end_sequence ();
16108 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16109 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
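/* Implement TARGET_GEN_CCMP_NEXT.  Generate the next conditional compare in
   a CCMP sequence: PREV is the result of the previous element, CMP_CODE
   (TREEOP0, TREEOP1) is the new comparison, and BIT_CODE says how the two
   are combined (the code below checks for AND and inverts the condition
   otherwise).  Return the new CC comparison, or NULL_RTX on failure.  */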
16112 static rtx
16113 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16114 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16116 rtx op0, op1, target;
16117 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16118 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16119 insn_code icode;
16120 struct expand_operand ops[6];
16121 int aarch64_cond;
16123 push_to_sequence (*prep_seq);
16124 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16126 op_mode = GET_MODE (op0);
16127 if (op_mode == VOIDmode)
16128 op_mode = GET_MODE (op1);
16130 switch (op_mode)
16132 case E_QImode:
16133 case E_HImode:
16134 case E_SImode:
16135 cmp_mode = SImode;
16136 icode = CODE_FOR_ccmpsi;
16137 break;
16139 case E_DImode:
16140 cmp_mode = DImode;
16141 icode = CODE_FOR_ccmpdi;
16142 break;
16144 case E_SFmode:
16145 cmp_mode = SFmode;
16146 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16147 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16148 break;
16150 case E_DFmode:
16151 cmp_mode = DFmode;
16152 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16153 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16154 break;
16156 default:
16157 end_sequence ();
16158 return NULL_RTX;
16161 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16162 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16163 if (!op0 || !op1)
16165 end_sequence ();
16166 return NULL_RTX;
16168 *prep_seq = get_insns ();
16169 end_sequence ();
16171 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16172 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16174 if (bit_code != AND)
16176 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16177 GET_MODE (XEXP (prev, 0))),
16178 VOIDmode, XEXP (prev, 0), const0_rtx);
16179 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16182 create_fixed_operand (&ops[0], XEXP (prev, 0));
16183 create_fixed_operand (&ops[1], target);
16184 create_fixed_operand (&ops[2], op0);
16185 create_fixed_operand (&ops[3], op1);
16186 create_fixed_operand (&ops[4], prev);
16187 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16189 push_to_sequence (*gen_seq);
16190 if (!maybe_expand_insn (icode, 6, ops))
16192 end_sequence ();
16193 return NULL_RTX;
16196 *gen_seq = get_insns ();
16197 end_sequence ();
16199 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16202 #undef TARGET_GEN_CCMP_FIRST
16203 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16205 #undef TARGET_GEN_CCMP_NEXT
16206 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16208 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16209 instruction fusion of some sort. */
16211 static bool
16212 aarch64_macro_fusion_p (void)
16214 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16218 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16219 should be kept together during scheduling. */
16221 static bool
16222 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16224 rtx set_dest;
16225 rtx prev_set = single_set (prev);
16226 rtx curr_set = single_set (curr);
16227 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
16228 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16230 if (!aarch64_macro_fusion_p ())
16231 return false;
16233 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16235 /* We are trying to match:
16236 prev (mov) == (set (reg r0) (const_int imm16))
16237 curr (movk) == (set (zero_extract (reg r0)
16238 (const_int 16)
16239 (const_int 16))
16240 (const_int imm16_1)) */
16242 set_dest = SET_DEST (curr_set);
16244 if (GET_CODE (set_dest) == ZERO_EXTRACT
16245 && CONST_INT_P (SET_SRC (curr_set))
16246 && CONST_INT_P (SET_SRC (prev_set))
16247 && CONST_INT_P (XEXP (set_dest, 2))
16248 && INTVAL (XEXP (set_dest, 2)) == 16
16249 && REG_P (XEXP (set_dest, 0))
16250 && REG_P (SET_DEST (prev_set))
16251 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16253 return true;
16257 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16260 /* We're trying to match:
16261 prev (adrp) == (set (reg r1)
16262 (high (symbol_ref ("SYM"))))
16263 curr (add) == (set (reg r0)
16264 (lo_sum (reg r1)
16265 (symbol_ref ("SYM"))))
16266 Note that r0 need not necessarily be the same as r1, especially
16267 during pre-regalloc scheduling. */
16269 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16270 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16272 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16273 && REG_P (XEXP (SET_SRC (curr_set), 0))
16274 && REGNO (XEXP (SET_SRC (curr_set), 0))
16275 == REGNO (SET_DEST (prev_set))
16276 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16277 XEXP (SET_SRC (curr_set), 1)))
16278 return true;
16282 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16285 /* We're trying to match:
16286 prev (movk) == (set (zero_extract (reg r0)
16287 (const_int 16)
16288 (const_int 32))
16289 (const_int imm16_1))
16290 curr (movk) == (set (zero_extract (reg r0)
16291 (const_int 16)
16292 (const_int 48))
16293 (const_int imm16_2)) */
16295 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16296 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16297 && REG_P (XEXP (SET_DEST (prev_set), 0))
16298 && REG_P (XEXP (SET_DEST (curr_set), 0))
16299 && REGNO (XEXP (SET_DEST (prev_set), 0))
16300 == REGNO (XEXP (SET_DEST (curr_set), 0))
16301 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16302 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16303 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16304 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16305 && CONST_INT_P (SET_SRC (prev_set))
16306 && CONST_INT_P (SET_SRC (curr_set)))
16307 return true;
16310 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16312 /* We're trying to match:
16313 prev (adrp) == (set (reg r0)
16314 (high (symbol_ref ("SYM"))))
16315 curr (ldr) == (set (reg r1)
16316 (mem (lo_sum (reg r0)
16317 (symbol_ref ("SYM")))))
16319 curr (ldr) == (set (reg r1)
16320 (zero_extend (mem
16321 (lo_sum (reg r0)
16322 (symbol_ref ("SYM")))))) */
16323 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16324 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16326 rtx curr_src = SET_SRC (curr_set);
16328 if (GET_CODE (curr_src) == ZERO_EXTEND)
16329 curr_src = XEXP (curr_src, 0);
16331 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16332 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16333 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16334 == REGNO (SET_DEST (prev_set))
16335 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16336 XEXP (SET_SRC (prev_set), 0)))
16337 return true;
16341 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16342 && aarch_crypto_can_dual_issue (prev, curr))
16343 return true;
16345 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16346 && any_condjump_p (curr))
16348 enum attr_type prev_type = get_attr_type (prev);
16350 unsigned int condreg1, condreg2;
16351 rtx cc_reg_1;
16352 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16353 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16355 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16356 && prev
16357 && modified_in_p (cc_reg_1, prev))
16359 /* FIXME: this misses some instructions that ThunderX considers simple
16360 arithmetic; simple shifts are missed here. */
16361 if (prev_type == TYPE_ALUS_SREG
16362 || prev_type == TYPE_ALUS_IMM
16363 || prev_type == TYPE_LOGICS_REG
16364 || prev_type == TYPE_LOGICS_IMM)
16365 return true;
16369 if (prev_set
16370 && curr_set
16371 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16372 && any_condjump_p (curr))
16374 /* We're trying to match:
16375 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16376 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16377 (const_int 0))
16378 (label_ref ("SYM"))
16379 (pc)) */
16380 if (SET_DEST (curr_set) == (pc_rtx)
16381 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16382 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16383 && REG_P (SET_DEST (prev_set))
16384 && REGNO (SET_DEST (prev_set))
16385 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16387 /* Fuse ALU operations followed by conditional branch instruction. */
16388 switch (get_attr_type (prev))
16390 case TYPE_ALU_IMM:
16391 case TYPE_ALU_SREG:
16392 case TYPE_ADC_REG:
16393 case TYPE_ADC_IMM:
16394 case TYPE_ADCS_REG:
16395 case TYPE_ADCS_IMM:
16396 case TYPE_LOGIC_REG:
16397 case TYPE_LOGIC_IMM:
16398 case TYPE_CSEL:
16399 case TYPE_ADR:
16400 case TYPE_MOV_IMM:
16401 case TYPE_SHIFT_REG:
16402 case TYPE_SHIFT_IMM:
16403 case TYPE_BFM:
16404 case TYPE_RBIT:
16405 case TYPE_REV:
16406 case TYPE_EXTEND:
16407 return true;
16409 default:;
16414 return false;
16417 /* Return true iff the instruction fusion described by OP is enabled. */
16419 bool
16420 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16422 return (aarch64_tune_params.fusible_ops & op) != 0;
16425 /* If MEM is in the form [base+offset], extract the two parts of the
16426 address and store them in BASE and OFFSET; otherwise clear BASE and
16427 OFFSET and return false. */
16429 bool
16430 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16432 rtx addr;
16434 gcc_assert (MEM_P (mem));
16436 addr = XEXP (mem, 0);
16438 if (REG_P (addr))
16440 *base = addr;
16441 *offset = const0_rtx;
16442 return true;
16445 if (GET_CODE (addr) == PLUS
16446 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16448 *base = XEXP (addr, 0);
16449 *offset = XEXP (addr, 1);
16450 return true;
16453 *base = NULL_RTX;
16454 *offset = NULL_RTX;
16456 return false;
16459 /* Types for scheduling fusion. */
16460 enum sched_fusion_type
16462 SCHED_FUSION_NONE = 0,
16463 SCHED_FUSION_LD_SIGN_EXTEND,
16464 SCHED_FUSION_LD_ZERO_EXTEND,
16465 SCHED_FUSION_LD,
16466 SCHED_FUSION_ST,
16467 SCHED_FUSION_NUM
16470 /* If INSN is a load or store whose address is in the form [base+offset],
16471 extract the two parts and store them in BASE and OFFSET. Return the
16472 scheduling fusion type of INSN. */
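/* For example, a DImode load whose address is (plus (reg x1) (const_int 8))
   is classified as SCHED_FUSION_LD, with *BASE set to the x1 register and
   *OFFSET to (const_int 8) (illustrative RTL sketch).  */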
16474 static enum sched_fusion_type
16475 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16477 rtx x, dest, src;
16478 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16480 gcc_assert (INSN_P (insn));
16481 x = PATTERN (insn);
16482 if (GET_CODE (x) != SET)
16483 return SCHED_FUSION_NONE;
16485 src = SET_SRC (x);
16486 dest = SET_DEST (x);
16488 machine_mode dest_mode = GET_MODE (dest);
16490 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16491 return SCHED_FUSION_NONE;
16493 if (GET_CODE (src) == SIGN_EXTEND)
16495 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16496 src = XEXP (src, 0);
16497 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16498 return SCHED_FUSION_NONE;
16500 else if (GET_CODE (src) == ZERO_EXTEND)
16502 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16503 src = XEXP (src, 0);
16504 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16505 return SCHED_FUSION_NONE;
16508 if (GET_CODE (src) == MEM && REG_P (dest))
16509 extract_base_offset_in_addr (src, base, offset);
16510 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16512 fusion = SCHED_FUSION_ST;
16513 extract_base_offset_in_addr (dest, base, offset);
16515 else
16516 return SCHED_FUSION_NONE;
16518 if (*base == NULL_RTX || *offset == NULL_RTX)
16519 fusion = SCHED_FUSION_NONE;
16521 return fusion;
16524 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16526 Currently we only support fusing ldr and str instructions, so FUSION_PRI
16527 and PRI are only calculated for these instructions. For other instructions,
16528 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
16529 other instruction types can be added by returning different priorities.
16531 It's important that irrelevant instructions get the largest FUSION_PRI. */
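/* For example, two SCHED_FUSION_LD insns that use the same base register get
   the same FUSION_PRI, and the one with the smaller offset gets the larger
   PRI, so the scheduler tends to place them back to back where an ldp can
   later be formed.  */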
16533 static void
16534 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16535 int *fusion_pri, int *pri)
16537 int tmp, off_val;
16538 rtx base, offset;
16539 enum sched_fusion_type fusion;
16541 gcc_assert (INSN_P (insn));
16543 tmp = max_pri - 1;
16544 fusion = fusion_load_store (insn, &base, &offset);
16545 if (fusion == SCHED_FUSION_NONE)
16547 *pri = tmp;
16548 *fusion_pri = tmp;
16549 return;
16552 /* Set FUSION_PRI according to fusion type and base register. */
16553 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16555 /* Calculate PRI. */
16556 tmp /= 2;
16558 /* INSN with smaller offset goes first. */
16559 off_val = (int)(INTVAL (offset));
16560 if (off_val >= 0)
16561 tmp -= (off_val & 0xfffff);
16562 else
16563 tmp += ((- off_val) & 0xfffff);
16565 *pri = tmp;
16566 return;
16569 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16570 Adjust priority of sha1h instructions so they are scheduled before
16571 other SHA1 instructions. */
16573 static int
16574 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16576 rtx x = PATTERN (insn);
16578 if (GET_CODE (x) == SET)
16580 x = SET_SRC (x);
16582 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16583 return priority + 10;
16586 return priority;
16589 /* Given OPERANDS of consecutive load/store, check if we can merge
16590 them into ldp/stp. LOAD is true if they are load instructions.
16591 MODE is the mode of memory operands. */
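/* For example, the two SImode loads

     ldr w0, [x2]
     ldr w1, [x2, 4]

   satisfy these checks and can then be combined into an ldp such as
   "ldp w0, w1, [x2]" by the ldp/stp peepholes.  */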
16593 bool
16594 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16595 machine_mode mode)
16597 HOST_WIDE_INT offval_1, offval_2, msize;
16598 enum reg_class rclass_1, rclass_2;
16599 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16601 if (load)
16603 mem_1 = operands[1];
16604 mem_2 = operands[3];
16605 reg_1 = operands[0];
16606 reg_2 = operands[2];
16607 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16608 if (REGNO (reg_1) == REGNO (reg_2))
16609 return false;
16611 else
16613 mem_1 = operands[0];
16614 mem_2 = operands[2];
16615 reg_1 = operands[1];
16616 reg_2 = operands[3];
16619 /* The mems cannot be volatile. */
16620 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16621 return false;
16623 /* If we have SImode and a slow unaligned ldp,
16624 check that the alignment is at least 8 bytes. */
16625 if (mode == SImode
16626 && (aarch64_tune_params.extra_tuning_flags
16627 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16628 && !optimize_size
16629 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16630 return false;
16632 /* Check if the addresses are in the form of [base+offset]. */
16633 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16634 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16635 return false;
16636 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16637 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16638 return false;
16640 /* Check if the bases are same. */
16641 if (!rtx_equal_p (base_1, base_2))
16642 return false;
16644 offval_1 = INTVAL (offset_1);
16645 offval_2 = INTVAL (offset_2);
16646 /* We should only be trying this for fixed-sized modes. There is no
16647 SVE LDP/STP instruction. */
16648 msize = GET_MODE_SIZE (mode).to_constant ();
16649 /* Check if the offsets are consecutive. */
16650 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16651 return false;
16653 /* Check if the addresses are clobbered by load. */
16654 if (load)
16656 if (reg_mentioned_p (reg_1, mem_1))
16657 return false;
16659 /* In increasing order, the last load can clobber the address. */
16660 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16661 return false;
16664 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16665 rclass_1 = FP_REGS;
16666 else
16667 rclass_1 = GENERAL_REGS;
16669 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16670 rclass_2 = FP_REGS;
16671 else
16672 rclass_2 = GENERAL_REGS;
16674 /* Check if the registers are of same class. */
16675 if (rclass_1 != rclass_2)
16676 return false;
16678 return true;
16681 /* Given OPERANDS of consecutive load/store, check if we can merge
16682 them into ldp/stp by adjusting the offset. LOAD is true if they
16683 are load instructions. MODE is the mode of memory operands.
16685 Given the consecutive stores below:
16687 str w1, [xb, 0x100]
16688 str w1, [xb, 0x104]
16689 str w1, [xb, 0x108]
16690 str w1, [xb, 0x10c]
16692 Though the offsets are out of the range supported by stp, we can
16693 still pair them after adjusting the offset, like:
16695 add scratch, xb, 0x100
16696 stp w1, w1, [scratch]
16697 stp w1, w1, [scratch, 0x8]
16699 The peephole patterns detecting this opportunity should guarantee
16700 the scratch register is available. */
16702 bool
16703 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16704 scalar_mode mode)
16706 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16707 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16708 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16709 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16711 if (load)
16713 reg_1 = operands[0];
16714 mem_1 = operands[1];
16715 reg_2 = operands[2];
16716 mem_2 = operands[3];
16717 reg_3 = operands[4];
16718 mem_3 = operands[5];
16719 reg_4 = operands[6];
16720 mem_4 = operands[7];
16721 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16722 && REG_P (reg_3) && REG_P (reg_4));
16723 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16724 return false;
16726 else
16728 mem_1 = operands[0];
16729 reg_1 = operands[1];
16730 mem_2 = operands[2];
16731 reg_2 = operands[3];
16732 mem_3 = operands[4];
16733 reg_3 = operands[5];
16734 mem_4 = operands[6];
16735 reg_4 = operands[7];
16737 /* Skip if the memory operand is by itself valid for ldp/stp. */
16738 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16739 return false;
16741 /* The mems cannot be volatile. */
16742 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16743 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
16744 return false;
16746 /* Check if the addresses are in the form of [base+offset]. */
16747 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16748 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16749 return false;
16750 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16751 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16752 return false;
16753 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16754 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16755 return false;
16756 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16757 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16758 return false;
16760 /* Check if the bases are same. */
16761 if (!rtx_equal_p (base_1, base_2)
16762 || !rtx_equal_p (base_2, base_3)
16763 || !rtx_equal_p (base_3, base_4))
16764 return false;
16766 offval_1 = INTVAL (offset_1);
16767 offval_2 = INTVAL (offset_2);
16768 offval_3 = INTVAL (offset_3);
16769 offval_4 = INTVAL (offset_4);
16770 msize = GET_MODE_SIZE (mode);
16771 /* Check if the offsets are consecutive. */
16772 if ((offval_1 != (offval_2 + msize)
16773 || offval_1 != (offval_3 + msize * 2)
16774 || offval_1 != (offval_4 + msize * 3))
16775 && (offval_4 != (offval_3 + msize)
16776 || offval_4 != (offval_2 + msize * 2)
16777 || offval_4 != (offval_1 + msize * 3)))
16778 return false;
16780 /* Check if the addresses are clobbered by load. */
16781 if (load)
16783 if (reg_mentioned_p (reg_1, mem_1)
16784 || reg_mentioned_p (reg_2, mem_2)
16785 || reg_mentioned_p (reg_3, mem_3))
16786 return false;
16788 /* In increasing order, the last load can clobber the address. */
16789 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
16790 return false;
16793 /* If we have SImode and a slow unaligned ldp,
16794 check that the alignment is at least 8 bytes. */
16795 if (mode == SImode
16796 && (aarch64_tune_params.extra_tuning_flags
16797 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16798 && !optimize_size
16799 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16800 return false;
16802 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16803 rclass_1 = FP_REGS;
16804 else
16805 rclass_1 = GENERAL_REGS;
16807 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16808 rclass_2 = FP_REGS;
16809 else
16810 rclass_2 = GENERAL_REGS;
16812 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
16813 rclass_3 = FP_REGS;
16814 else
16815 rclass_3 = GENERAL_REGS;
16817 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
16818 rclass_4 = FP_REGS;
16819 else
16820 rclass_4 = GENERAL_REGS;
16822 /* Check if the registers are of same class. */
16823 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
16824 return false;
16826 return true;
16829 /* Given OPERANDS of consecutive load/store, this function pairs them
16830 into ldp/stp after adjusting the offset. It depends on the fact
16831 that addresses of load/store instructions are in increasing order.
16832 MODE is the mode of memory operands. CODE is the rtl operator
16833 which should be applied to all memory operands, it's SIGN_EXTEND,
16834 ZERO_EXTEND or UNKNOWN. */
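/* For example, for four SImode stores at offsets 0x100, 0x104, 0x108 and
   0x10c (as in the comment above aarch64_operands_adjust_ok_for_ldpstp),
   msize is 4 and stp_off_limit is 0x100, so adj_off becomes 0x100 and
   new_off becomes 0: the scratch register is set to base + 0x100 and the
   four accesses are rewritten to offsets 0, 4, 8 and 12 from it.  */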
16836 bool
16837 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
16838 scalar_mode mode, RTX_CODE code)
16840 rtx base, offset, t1, t2;
16841 rtx mem_1, mem_2, mem_3, mem_4;
16842 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
16844 if (load)
16846 mem_1 = operands[1];
16847 mem_2 = operands[3];
16848 mem_3 = operands[5];
16849 mem_4 = operands[7];
16851 else
16853 mem_1 = operands[0];
16854 mem_2 = operands[2];
16855 mem_3 = operands[4];
16856 mem_4 = operands[6];
16857 gcc_assert (code == UNKNOWN);
16860 extract_base_offset_in_addr (mem_1, &base, &offset);
16861 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
16863 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
16864 msize = GET_MODE_SIZE (mode);
16865 stp_off_limit = msize * 0x40;
16866 off_val = INTVAL (offset);
16867 abs_off = (off_val < 0) ? -off_val : off_val;
16868 new_off = abs_off % stp_off_limit;
16869 adj_off = abs_off - new_off;
16871 /* Further adjust to make sure all offsets are OK. */
16872 if ((new_off + msize * 2) >= stp_off_limit)
16874 adj_off += stp_off_limit;
16875 new_off -= stp_off_limit;
16878 /* Make sure the adjustment can be done with ADD/SUB instructions. */
16879 if (adj_off >= 0x1000)
16880 return false;
16882 if (off_val < 0)
16884 adj_off = -adj_off;
16885 new_off = -new_off;
16888 /* Create new memory references. */
16889 mem_1 = change_address (mem_1, VOIDmode,
16890 plus_constant (DImode, operands[8], new_off));
16892 /* Check if the adjusted address is OK for ldp/stp. */
16893 if (!aarch64_mem_pair_operand (mem_1, mode))
16894 return false;
16896 msize = GET_MODE_SIZE (mode);
16897 mem_2 = change_address (mem_2, VOIDmode,
16898 plus_constant (DImode,
16899 operands[8],
16900 new_off + msize));
16901 mem_3 = change_address (mem_3, VOIDmode,
16902 plus_constant (DImode,
16903 operands[8],
16904 new_off + msize * 2));
16905 mem_4 = change_address (mem_4, VOIDmode,
16906 plus_constant (DImode,
16907 operands[8],
16908 new_off + msize * 3));
16910 if (code == ZERO_EXTEND)
16912 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
16913 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
16914 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
16915 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
16917 else if (code == SIGN_EXTEND)
16919 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
16920 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
16921 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
16922 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
16925 if (load)
16927 operands[1] = mem_1;
16928 operands[3] = mem_2;
16929 operands[5] = mem_3;
16930 operands[7] = mem_4;
16932 else
16934 operands[0] = mem_1;
16935 operands[2] = mem_2;
16936 operands[4] = mem_3;
16937 operands[6] = mem_4;
16940 /* Emit adjusting instruction. */
16941 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
16942 /* Emit ldp/stp instructions. */
16943 t1 = gen_rtx_SET (operands[0], operands[1]);
16944 t2 = gen_rtx_SET (operands[2], operands[3]);
16945 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
16946 t1 = gen_rtx_SET (operands[4], operands[5]);
16947 t2 = gen_rtx_SET (operands[6], operands[7]);
16948 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
16949 return true;
16952 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
16953 it isn't worth branching around empty masked ops (including masked
16954 stores). */
16956 static bool
16957 aarch64_empty_mask_is_expensive (unsigned)
16959 return false;
16962 /* Return true if a pseudo register should be created and used to hold
16963 the GOT address for PIC code. */
16965 bool
16966 aarch64_use_pseudo_pic_reg (void)
16968 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
16971 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
16973 static int
16974 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
16976 switch (XINT (x, 1))
16978 case UNSPEC_GOTSMALLPIC:
16979 case UNSPEC_GOTSMALLPIC28K:
16980 case UNSPEC_GOTTINYPIC:
16981 return 0;
16982 default:
16983 break;
16986 return default_unspec_may_trap_p (x, flags);
16990 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
16991 return the log2 of that value. Otherwise return -1. */
16994 aarch64_fpconst_pow_of_2 (rtx x)
16996 const REAL_VALUE_TYPE *r;
16998 if (!CONST_DOUBLE_P (x))
16999 return -1;
17001 r = CONST_DOUBLE_REAL_VALUE (x);
17003 if (REAL_VALUE_NEGATIVE (*r)
17004 || REAL_VALUE_ISNAN (*r)
17005 || REAL_VALUE_ISINF (*r)
17006 || !real_isinteger (r, DFmode))
17007 return -1;
17009 return exact_log2 (real_to_integer (r));
17012 /* If X is a vector of equal CONST_DOUBLE values and that value is
17013 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17016 aarch64_vec_fpconst_pow_of_2 (rtx x)
17018 int nelts;
17019 if (GET_CODE (x) != CONST_VECTOR
17020 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17021 return -1;
17023 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17024 return -1;
17026 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17027 if (firstval <= 0)
17028 return -1;
17030 for (int i = 1; i < nelts; i++)
17031 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17032 return -1;
17034 return firstval;
17037 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17038 to float.
17040 __fp16 always promotes through this hook.
17041 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17042 through the generic excess precision logic rather than here. */
17044 static tree
17045 aarch64_promoted_type (const_tree t)
17047 if (SCALAR_FLOAT_TYPE_P (t)
17048 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17049 return float_type_node;
17051 return NULL_TREE;
17054 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17056 static bool
17057 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17058 optimization_type opt_type)
17060 switch (op)
17062 case rsqrt_optab:
17063 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17065 default:
17066 return true;
17070 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17072 static unsigned int
17073 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17074 int *offset)
17076 /* Polynomial invariant 1 == (VG / 2) - 1. */
17077 gcc_assert (i == 1);
17078 *factor = 2;
17079 *offset = 1;
17080 return AARCH64_DWARF_VG;
17083 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
17084 if MODE is HFmode, and punt to the generic implementation otherwise. */
17086 static bool
17087 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17089 return (mode == HFmode
17090 ? true
17091 : default_libgcc_floating_mode_supported_p (mode));
17094 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17095 if MODE is HFmode, and punt to the generic implementation otherwise. */
17097 static bool
17098 aarch64_scalar_mode_supported_p (scalar_mode mode)
17100 return (mode == HFmode
17101 ? true
17102 : default_scalar_mode_supported_p (mode));
17105 /* Set the value of FLT_EVAL_METHOD.
17106 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17108 0: evaluate all operations and constants, whose semantic type has at
17109 most the range and precision of type float, to the range and
17110 precision of float; evaluate all other operations and constants to
17111 the range and precision of the semantic type;
17113 N, where _FloatN is a supported interchange floating type
17114 evaluate all operations and constants, whose semantic type has at
17115 most the range and precision of _FloatN type, to the range and
17116 precision of the _FloatN type; evaluate all other operations and
17117 constants to the range and precision of the semantic type;
17119 If we have the ARMv8.2-A extensions then we support _Float16 in native
17120 precision, so we should set this to 16. Otherwise, we support the type,
17121 but want to evaluate expressions in float precision, so set this to
17122 0. */
17124 static enum flt_eval_method
17125 aarch64_excess_precision (enum excess_precision_type type)
17127 switch (type)
17129 case EXCESS_PRECISION_TYPE_FAST:
17130 case EXCESS_PRECISION_TYPE_STANDARD:
17131 /* We can calculate either in 16-bit range and precision or
17132 32-bit range and precision. Make that decision based on whether
17133 we have native support for the ARMv8.2-A 16-bit floating-point
17134 instructions or not. */
17135 return (TARGET_FP_F16INST
17136 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17137 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17138 case EXCESS_PRECISION_TYPE_IMPLICIT:
17139 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17140 default:
17141 gcc_unreachable ();
17143 return FLT_EVAL_METHOD_UNPREDICTABLE;
17146 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17147 scheduled for speculative execution. Reject the long-running division
17148 and square-root instructions. */
17150 static bool
17151 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17153 switch (get_attr_type (insn))
17155 case TYPE_SDIV:
17156 case TYPE_UDIV:
17157 case TYPE_FDIVS:
17158 case TYPE_FDIVD:
17159 case TYPE_FSQRTS:
17160 case TYPE_FSQRTD:
17161 case TYPE_NEON_FP_SQRT_S:
17162 case TYPE_NEON_FP_SQRT_D:
17163 case TYPE_NEON_FP_SQRT_S_Q:
17164 case TYPE_NEON_FP_SQRT_D_Q:
17165 case TYPE_NEON_FP_DIV_S:
17166 case TYPE_NEON_FP_DIV_D:
17167 case TYPE_NEON_FP_DIV_S_Q:
17168 case TYPE_NEON_FP_DIV_D_Q:
17169 return false;
17170 default:
17171 return true;
17175 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17177 static int
17178 aarch64_compute_pressure_classes (reg_class *classes)
17180 int i = 0;
17181 classes[i++] = GENERAL_REGS;
17182 classes[i++] = FP_REGS;
17183 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17184 registers need to go in PR_LO_REGS at some point during their
17185 lifetime. Splitting it into two halves has the effect of making
17186 all predicates count against PR_LO_REGS, so that we try whenever
17187 possible to restrict the number of live predicates to 8. This
17188 greatly reduces the amount of spilling in certain loops. */
17189 classes[i++] = PR_LO_REGS;
17190 classes[i++] = PR_HI_REGS;
17191 return i;
17194 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17196 static bool
17197 aarch64_can_change_mode_class (machine_mode from,
17198 machine_mode to, reg_class_t)
17200 /* See the comment at the head of aarch64-sve.md for details. */
17201 if (BYTES_BIG_ENDIAN
17202 && (aarch64_sve_data_mode_p (from) != aarch64_sve_data_mode_p (to)))
17203 return false;
17204 return true;
17207 /* Implement TARGET_EARLY_REMAT_MODES. */
17209 static void
17210 aarch64_select_early_remat_modes (sbitmap modes)
17212 /* SVE values are not normally live across a call, so it should be
17213 worth doing early rematerialization even in VL-specific mode. */
17214 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17216 machine_mode mode = (machine_mode) i;
17217 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17218 if (vec_flags & VEC_ANY_SVE)
17219 bitmap_set_bit (modes, i);
17223 /* Target-specific selftests. */
17225 #if CHECKING_P
17227 namespace selftest {
17229 /* Selftest for the RTL loader.
17230 Verify that the RTL loader copes with a dump from
17231 print_rtx_function. This is essentially just a test that class
17232 function_reader can handle a real dump, but it also verifies
17233 that lookup_reg_by_dump_name correctly handles hard regs.
17234 The presence of hard reg names in the dump means that the test is
17235 target-specific, hence it is in this file. */
17237 static void
17238 aarch64_test_loading_full_dump ()
17240 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17242 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17244 rtx_insn *insn_1 = get_insn_by_uid (1);
17245 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17247 rtx_insn *insn_15 = get_insn_by_uid (15);
17248 ASSERT_EQ (INSN, GET_CODE (insn_15));
17249 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17251 /* Verify crtl->return_rtx. */
17252 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17253 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17254 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17257 /* Run all target-specific selftests. */
17259 static void
17260 aarch64_run_selftests (void)
17262 aarch64_test_loading_full_dump ();
17265 } // namespace selftest
17267 #endif /* #if CHECKING_P */
17269 #undef TARGET_ADDRESS_COST
17270 #define TARGET_ADDRESS_COST aarch64_address_cost
17272 /* This hook determines whether unnamed bitfields affect the alignment
17273 of the containing structure. The hook returns true if the structure
17274 should inherit the alignment requirements of an unnamed bitfield's
17275 type. */
17276 #undef TARGET_ALIGN_ANON_BITFIELD
17277 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17279 #undef TARGET_ASM_ALIGNED_DI_OP
17280 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17282 #undef TARGET_ASM_ALIGNED_HI_OP
17283 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17285 #undef TARGET_ASM_ALIGNED_SI_OP
17286 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17288 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17289 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17290 hook_bool_const_tree_hwi_hwi_const_tree_true
17292 #undef TARGET_ASM_FILE_START
17293 #define TARGET_ASM_FILE_START aarch64_start_file
17295 #undef TARGET_ASM_OUTPUT_MI_THUNK
17296 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17298 #undef TARGET_ASM_SELECT_RTX_SECTION
17299 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17301 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17302 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17304 #undef TARGET_BUILD_BUILTIN_VA_LIST
17305 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17307 #undef TARGET_CALLEE_COPIES
17308 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17310 #undef TARGET_CAN_ELIMINATE
17311 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17313 #undef TARGET_CAN_INLINE_P
17314 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17316 #undef TARGET_CANNOT_FORCE_CONST_MEM
17317 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17319 #undef TARGET_CASE_VALUES_THRESHOLD
17320 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17322 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17323 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17325 /* Only the least significant bit is used for initialization guard
17326 variables. */
17327 #undef TARGET_CXX_GUARD_MASK_BIT
17328 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17330 #undef TARGET_C_MODE_FOR_SUFFIX
17331 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17333 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17334 #undef TARGET_DEFAULT_TARGET_FLAGS
17335 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17336 #endif
17338 #undef TARGET_CLASS_MAX_NREGS
17339 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17341 #undef TARGET_BUILTIN_DECL
17342 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17344 #undef TARGET_BUILTIN_RECIPROCAL
17345 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17347 #undef TARGET_C_EXCESS_PRECISION
17348 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17350 #undef TARGET_EXPAND_BUILTIN
17351 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17353 #undef TARGET_EXPAND_BUILTIN_VA_START
17354 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17356 #undef TARGET_FOLD_BUILTIN
17357 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17359 #undef TARGET_FUNCTION_ARG
17360 #define TARGET_FUNCTION_ARG aarch64_function_arg
17362 #undef TARGET_FUNCTION_ARG_ADVANCE
17363 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17365 #undef TARGET_FUNCTION_ARG_BOUNDARY
17366 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17368 #undef TARGET_FUNCTION_ARG_PADDING
17369 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17371 #undef TARGET_GET_RAW_RESULT_MODE
17372 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17373 #undef TARGET_GET_RAW_ARG_MODE
17374 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17376 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17377 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17379 #undef TARGET_FUNCTION_VALUE
17380 #define TARGET_FUNCTION_VALUE aarch64_function_value
17382 #undef TARGET_FUNCTION_VALUE_REGNO_P
17383 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17385 #undef TARGET_GIMPLE_FOLD_BUILTIN
17386 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17388 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17389 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17391 #undef TARGET_INIT_BUILTINS
17392 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17394 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17395 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17396 aarch64_ira_change_pseudo_allocno_class
17398 #undef TARGET_LEGITIMATE_ADDRESS_P
17399 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17401 #undef TARGET_LEGITIMATE_CONSTANT_P
17402 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17404 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17405 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17406 aarch64_legitimize_address_displacement
17408 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17409 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17411 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17412 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17413 aarch64_libgcc_floating_mode_supported_p
17415 #undef TARGET_MANGLE_TYPE
17416 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17418 #undef TARGET_MEMORY_MOVE_COST
17419 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17421 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17422 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17424 #undef TARGET_MUST_PASS_IN_STACK
17425 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17427 /* This target hook should return true if accesses to volatile bitfields
17428 should use the narrowest mode possible. It should return false if these
17429 accesses should use the bitfield container type. */
17430 #undef TARGET_NARROW_VOLATILE_BITFIELD
17431 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
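/* For illustration: with the hook above returning false, a volatile
   bit-field such as

     struct s { volatile unsigned int flags : 8; };

   is conceptually accessed through its 32-bit container (a word-sized
   load/store) rather than through the narrowest mode covering the field.  */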
17433 #undef TARGET_OPTION_OVERRIDE
17434 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17436 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17437 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17438 aarch64_override_options_after_change
17440 #undef TARGET_OPTION_SAVE
17441 #define TARGET_OPTION_SAVE aarch64_option_save
17443 #undef TARGET_OPTION_RESTORE
17444 #define TARGET_OPTION_RESTORE aarch64_option_restore
17446 #undef TARGET_OPTION_PRINT
17447 #define TARGET_OPTION_PRINT aarch64_option_print
17449 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17450 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17452 #undef TARGET_SET_CURRENT_FUNCTION
17453 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17455 #undef TARGET_PASS_BY_REFERENCE
17456 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17458 #undef TARGET_PREFERRED_RELOAD_CLASS
17459 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17461 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17462 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17464 #undef TARGET_PROMOTED_TYPE
17465 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17467 #undef TARGET_SECONDARY_RELOAD
17468 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17470 #undef TARGET_SHIFT_TRUNCATION_MASK
17471 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17473 #undef TARGET_SETUP_INCOMING_VARARGS
17474 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17476 #undef TARGET_STRUCT_VALUE_RTX
17477 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17479 #undef TARGET_REGISTER_MOVE_COST
17480 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17482 #undef TARGET_RETURN_IN_MEMORY
17483 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17485 #undef TARGET_RETURN_IN_MSB
17486 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17488 #undef TARGET_RTX_COSTS
17489 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17491 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17492 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17494 #undef TARGET_SCHED_ISSUE_RATE
17495 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17497 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17498 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17499 aarch64_sched_first_cycle_multipass_dfa_lookahead
17501 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17502 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17503 aarch64_first_cycle_multipass_dfa_lookahead_guard
17505 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17506 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17507 aarch64_get_separate_components
17509 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17510 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17511 aarch64_components_for_bb
17513 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17514 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17515 aarch64_disqualify_components
17517 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17518 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17519 aarch64_emit_prologue_components
17521 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17522 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17523 aarch64_emit_epilogue_components
17525 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17526 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17527 aarch64_set_handled_components
17529 #undef TARGET_TRAMPOLINE_INIT
17530 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17532 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17533 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17535 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17536 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17538 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17539 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17540 aarch64_builtin_support_vector_misalignment
17542 #undef TARGET_ARRAY_MODE
17543 #define TARGET_ARRAY_MODE aarch64_array_mode
17545 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17546 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17548 #undef TARGET_VECTORIZE_ADD_STMT_COST
17549 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17551 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17552 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17553 aarch64_builtin_vectorization_cost
17555 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17556 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17558 #undef TARGET_VECTORIZE_BUILTINS
17559 #define TARGET_VECTORIZE_BUILTINS
17561 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17562 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17563 aarch64_builtin_vectorized_function
17565 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17566 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17567 aarch64_autovectorize_vector_sizes
17569 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17570 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17571 aarch64_atomic_assign_expand_fenv
17573 /* Section anchor support. */
17575 #undef TARGET_MIN_ANCHOR_OFFSET
17576 #define TARGET_MIN_ANCHOR_OFFSET -256
17578 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17579 byte offset; we can do much more for larger data types, but have no way
17580 to determine the size of the access. We assume accesses are aligned. */
17581 #undef TARGET_MAX_ANCHOR_OFFSET
17582 #define TARGET_MAX_ANCHOR_OFFSET 4095
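/* For illustration: the [-256, 4095] anchor range above matches what a
   byte access can reach from an anchor register, e.g. (assuming x0 holds
   the section anchor):

     ldurb w1, [x0, #-256]   // smallest reachable signed offset
     ldrb  w1, [x0, #4095]   // largest unsigned immediate offset

   Wider accesses could reach further, but as the comment above says, the
   size of the eventual access is not known when the anchor is chosen.  */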
17584 #undef TARGET_VECTOR_ALIGNMENT
17585 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17587 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17588 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17589 aarch64_vectorize_preferred_vector_alignment
17590 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17591 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17592 aarch64_simd_vector_alignment_reachable
17594 /* vec_perm support. */
17596 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17597 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17598 aarch64_vectorize_vec_perm_const
17600 #undef TARGET_VECTORIZE_GET_MASK_MODE
17601 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17602 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17603 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17604 aarch64_empty_mask_is_expensive
17606 #undef TARGET_INIT_LIBFUNCS
17607 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17609 #undef TARGET_FIXED_CONDITION_CODE_REGS
17610 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17612 #undef TARGET_FLAGS_REGNUM
17613 #define TARGET_FLAGS_REGNUM CC_REGNUM
17615 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17616 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17618 #undef TARGET_ASAN_SHADOW_OFFSET
17619 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17621 #undef TARGET_LEGITIMIZE_ADDRESS
17622 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17624 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17625 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17627 #undef TARGET_CAN_USE_DOLOOP_P
17628 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17630 #undef TARGET_SCHED_ADJUST_PRIORITY
17631 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17633 #undef TARGET_SCHED_MACRO_FUSION_P
17634 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17636 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17637 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17639 #undef TARGET_SCHED_FUSION_PRIORITY
17640 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17642 #undef TARGET_UNSPEC_MAY_TRAP_P
17643 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17645 #undef TARGET_USE_PSEUDO_PIC_REG
17646 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17648 #undef TARGET_PRINT_OPERAND
17649 #define TARGET_PRINT_OPERAND aarch64_print_operand
17651 #undef TARGET_PRINT_OPERAND_ADDRESS
17652 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17654 #undef TARGET_OPTAB_SUPPORTED_P
17655 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17657 #undef TARGET_OMIT_STRUCT_RETURN_REG
17658 #define TARGET_OMIT_STRUCT_RETURN_REG true
17660 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17661 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17662 aarch64_dwarf_poly_indeterminate_value
17664 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors. */
17665 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17666 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
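/* Note: 4 is bit 2 (1 << 2).  A "function pointer" with that bit set is
   treated as a pointer to a descriptor rather than to code, which lets
   nested functions be implemented without executable trampolines; bits 0
   and 1 are unavailable because the architecture/ABI already reserves
   them.  */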
17668 #undef TARGET_HARD_REGNO_NREGS
17669 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17670 #undef TARGET_HARD_REGNO_MODE_OK
17671 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17673 #undef TARGET_MODES_TIEABLE_P
17674 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17676 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17677 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17678 aarch64_hard_regno_call_part_clobbered
17680 #undef TARGET_CONSTANT_ALIGNMENT
17681 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17683 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17684 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17686 #undef TARGET_CAN_CHANGE_MODE_CLASS
17687 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17689 #undef TARGET_SELECT_EARLY_REMAT_MODES
17690 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17692 #if CHECKING_P
17693 #undef TARGET_RUN_TARGET_SELFTESTS
17694 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17695 #endif /* #if CHECKING_P */
17697 struct gcc_target targetm = TARGET_INITIALIZER;
17699 #include "gt-aarch64.h"