[AArch64] Merge stores of D-register values with different modes
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob b75a588eb9aa49b1796161d34494d452a6742e4d
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Classifies an address.
83 ADDRESS_REG_IMM
84 A simple base register plus immediate offset.
86 ADDRESS_REG_WB
87 A base register indexed by immediate offset with writeback.
89 ADDRESS_REG_REG
90 A base register indexed by (optionally scaled) register.
92 ADDRESS_REG_UXTW
93 A base register indexed by (optionally scaled) zero-extended register.
95 ADDRESS_REG_SXTW
96 A base register indexed by (optionally scaled) sign-extended register.
98 ADDRESS_LO_SUM
99 A LO_SUM rtx with a base register and "LO12" symbol relocation.
101 ADDRESS_SYMBOLIC:
102 A constant symbolic address, in pc-relative literal pool. */
104 enum aarch64_address_type {
105 ADDRESS_REG_IMM,
106 ADDRESS_REG_WB,
107 ADDRESS_REG_REG,
108 ADDRESS_REG_UXTW,
109 ADDRESS_REG_SXTW,
110 ADDRESS_LO_SUM,
111 ADDRESS_SYMBOLIC
114 struct aarch64_address_info {
115 enum aarch64_address_type type;
116 rtx base;
117 rtx offset;
118 poly_int64 const_offset;
119 int shift;
120 enum aarch64_symbol_type symbol_type;
123 /* Information about a legitimate vector immediate operand. */
124 struct simd_immediate_info
126 enum insn_type { MOV, MVN };
127 enum modifier_type { LSL, MSL };
129 simd_immediate_info () {}
130 simd_immediate_info (scalar_float_mode, rtx);
131 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
132 insn_type = MOV, modifier_type = LSL,
133 unsigned int = 0);
134 simd_immediate_info (scalar_mode, rtx, rtx);
136 /* The mode of the elements. */
137 scalar_mode elt_mode;
139 /* The value of each element if all elements are the same, or the
140 first value if the constant is a series. */
141 rtx value;
143 /* The value of the step if the constant is a series, null otherwise. */
144 rtx step;
146 /* The instruction to use to move the immediate into a vector. */
147 insn_type insn;
149 /* The kind of shift modifier to use, and the number of bits to shift.
150 This is (LSL, 0) if no shift is needed. */
151 modifier_type modifier;
152 unsigned int shift;
155 /* Construct a floating-point immediate in which each element has mode
156 ELT_MODE_IN and value VALUE_IN. */
157 inline simd_immediate_info
158 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
159 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
160 modifier (LSL), shift (0)
163 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
164 and value VALUE_IN. The other parameters are as for the structure
165 fields. */
166 inline simd_immediate_info
167 ::simd_immediate_info (scalar_int_mode elt_mode_in,
168 unsigned HOST_WIDE_INT value_in,
169 insn_type insn_in, modifier_type modifier_in,
170 unsigned int shift_in)
171 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
172 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
175 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
176 and where element I is equal to VALUE_IN + I * STEP_IN. */
177 inline simd_immediate_info
178 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
179 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
180 modifier (LSL), shift (0)
183 /* The current code model. */
184 enum aarch64_code_model aarch64_cmodel;
186 /* The number of 64-bit elements in an SVE vector. */
187 poly_uint16 aarch64_sve_vg;
189 #ifdef HAVE_AS_TLS
190 #undef TARGET_HAVE_TLS
191 #define TARGET_HAVE_TLS 1
192 #endif
194 static bool aarch64_composite_type_p (const_tree, machine_mode);
195 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
196 const_tree,
197 machine_mode *, int *,
198 bool *);
199 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
200 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
201 static void aarch64_override_options_after_change (void);
202 static bool aarch64_vector_mode_supported_p (machine_mode);
203 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
204 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
205 const_tree type,
206 int misalignment,
207 bool is_packed);
208 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
209 static bool aarch64_print_ldpstp_address (FILE *, machine_mode, rtx);
211 /* Major revision number of the ARM Architecture implemented by the target. */
212 unsigned aarch64_architecture_version;
214 /* The processor for which instructions should be scheduled. */
215 enum aarch64_processor aarch64_tune = cortexa53;
217 /* Mask to specify which instruction scheduling options should be used. */
218 unsigned long aarch64_tune_flags = 0;
220 /* Global flag for PC relative loads. */
221 bool aarch64_pcrelative_literal_loads;
223 /* Support for command line parsing of boolean flags in the tuning
224 structures. */
225 struct aarch64_flag_desc
227 const char* name;
228 unsigned int flag;
231 #define AARCH64_FUSION_PAIR(name, internal_name) \
232 { name, AARCH64_FUSE_##internal_name },
233 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
235 { "none", AARCH64_FUSE_NOTHING },
236 #include "aarch64-fusion-pairs.def"
237 { "all", AARCH64_FUSE_ALL },
238 { NULL, AARCH64_FUSE_NOTHING }
241 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
242 { name, AARCH64_EXTRA_TUNE_##internal_name },
243 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
245 { "none", AARCH64_EXTRA_TUNE_NONE },
246 #include "aarch64-tuning-flags.def"
247 { "all", AARCH64_EXTRA_TUNE_ALL },
248 { NULL, AARCH64_EXTRA_TUNE_NONE }
251 /* Tuning parameters. */
253 static const struct cpu_addrcost_table generic_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table exynosm1_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 1, /* register_offset */
280 1, /* register_sextend */
281 2, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_addrcost_table xgene1_addrcost_table =
288 1, /* hi */
289 0, /* si */
290 0, /* di */
291 1, /* ti */
293 1, /* pre_modify */
294 0, /* post_modify */
295 0, /* register_offset */
296 1, /* register_sextend */
297 1, /* register_zextend */
298 0, /* imm_offset */
301 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
304 1, /* hi */
305 1, /* si */
306 1, /* di */
307 2, /* ti */
309 0, /* pre_modify */
310 0, /* post_modify */
311 2, /* register_offset */
312 3, /* register_sextend */
313 3, /* register_zextend */
314 0, /* imm_offset */
317 static const struct cpu_regmove_cost generic_regmove_cost =
319 1, /* GP2GP */
320 /* Avoid the use of slow int<->fp moves for spilling by setting
321 their cost higher than memmov_cost. */
322 5, /* GP2FP */
323 5, /* FP2GP */
324 2 /* FP2FP */
327 static const struct cpu_regmove_cost cortexa57_regmove_cost =
329 1, /* GP2GP */
330 /* Avoid the use of slow int<->fp moves for spilling by setting
331 their cost higher than memmov_cost. */
332 5, /* GP2FP */
333 5, /* FP2GP */
334 2 /* FP2FP */
337 static const struct cpu_regmove_cost cortexa53_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 5, /* GP2FP */
343 5, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost exynosm1_regmove_cost =
349 1, /* GP2GP */
350 /* Avoid the use of slow int<->fp moves for spilling by setting
351 their cost higher than memmov_cost (actually 4 and 9). */
352 9, /* GP2FP */
353 9, /* FP2GP */
354 1 /* FP2FP */
357 static const struct cpu_regmove_cost thunderx_regmove_cost =
359 2, /* GP2GP */
360 2, /* GP2FP */
361 6, /* FP2GP */
362 4 /* FP2FP */
365 static const struct cpu_regmove_cost xgene1_regmove_cost =
367 1, /* GP2GP */
368 /* Avoid the use of slow int<->fp moves for spilling by setting
369 their cost higher than memmov_cost. */
370 8, /* GP2FP */
371 8, /* FP2GP */
372 2 /* FP2FP */
375 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
377 2, /* GP2GP */
378 /* Avoid the use of int<->fp moves for spilling. */
379 6, /* GP2FP */
380 6, /* FP2GP */
381 4 /* FP2FP */
384 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
386 1, /* GP2GP */
387 /* Avoid the use of int<->fp moves for spilling. */
388 8, /* GP2FP */
389 8, /* FP2GP */
390 4 /* FP2FP */
393 /* Generic costs for vector insn classes. */
394 static const struct cpu_vector_cost generic_vector_cost =
396 1, /* scalar_int_stmt_cost */
397 1, /* scalar_fp_stmt_cost */
398 1, /* scalar_load_cost */
399 1, /* scalar_store_cost */
400 1, /* vec_int_stmt_cost */
401 1, /* vec_fp_stmt_cost */
402 2, /* vec_permute_cost */
403 1, /* vec_to_scalar_cost */
404 1, /* scalar_to_vec_cost */
405 1, /* vec_align_load_cost */
406 1, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 3, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 /* ThunderX costs for vector insn classes. */
414 static const struct cpu_vector_cost thunderx_vector_cost =
416 1, /* scalar_int_stmt_cost */
417 1, /* scalar_fp_stmt_cost */
418 3, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 4, /* vec_int_stmt_cost */
421 1, /* vec_fp_stmt_cost */
422 4, /* vec_permute_cost */
423 2, /* vec_to_scalar_cost */
424 2, /* scalar_to_vec_cost */
425 3, /* vec_align_load_cost */
426 5, /* vec_unalign_load_cost */
427 5, /* vec_unalign_store_cost */
428 1, /* vec_store_cost */
429 3, /* cond_taken_branch_cost */
430 3 /* cond_not_taken_branch_cost */
433 /* Cortex-A57 costs for vector insn classes. */
434 static const struct cpu_vector_cost cortexa57_vector_cost =
436 1, /* scalar_int_stmt_cost */
437 1, /* scalar_fp_stmt_cost */
438 4, /* scalar_load_cost */
439 1, /* scalar_store_cost */
440 2, /* vec_int_stmt_cost */
441 2, /* vec_fp_stmt_cost */
442 3, /* vec_permute_cost */
443 8, /* vec_to_scalar_cost */
444 8, /* scalar_to_vec_cost */
445 4, /* vec_align_load_cost */
446 4, /* vec_unalign_load_cost */
447 1, /* vec_unalign_store_cost */
448 1, /* vec_store_cost */
449 1, /* cond_taken_branch_cost */
450 1 /* cond_not_taken_branch_cost */
453 static const struct cpu_vector_cost exynosm1_vector_cost =
455 1, /* scalar_int_stmt_cost */
456 1, /* scalar_fp_stmt_cost */
457 5, /* scalar_load_cost */
458 1, /* scalar_store_cost */
459 3, /* vec_int_stmt_cost */
460 3, /* vec_fp_stmt_cost */
461 3, /* vec_permute_cost */
462 3, /* vec_to_scalar_cost */
463 3, /* scalar_to_vec_cost */
464 5, /* vec_align_load_cost */
465 5, /* vec_unalign_load_cost */
466 1, /* vec_unalign_store_cost */
467 1, /* vec_store_cost */
468 1, /* cond_taken_branch_cost */
469 1 /* cond_not_taken_branch_cost */
472 /* X-Gene 1 costs for vector insn classes. */
473 static const struct cpu_vector_cost xgene1_vector_cost =
475 1, /* scalar_int_stmt_cost */
476 1, /* scalar_fp_stmt_cost */
477 5, /* scalar_load_cost */
478 1, /* scalar_store_cost */
479 2, /* vec_int_stmt_cost */
480 2, /* vec_fp_stmt_cost */
481 2, /* vec_permute_cost */
482 4, /* vec_to_scalar_cost */
483 4, /* scalar_to_vec_cost */
484 10, /* vec_align_load_cost */
485 10, /* vec_unalign_load_cost */
486 2, /* vec_unalign_store_cost */
487 2, /* vec_store_cost */
488 2, /* cond_taken_branch_cost */
489 1 /* cond_not_taken_branch_cost */
492 /* Costs for vector insn classes for Vulcan. */
493 static const struct cpu_vector_cost thunderx2t99_vector_cost =
495 1, /* scalar_int_stmt_cost */
496 6, /* scalar_fp_stmt_cost */
497 4, /* scalar_load_cost */
498 1, /* scalar_store_cost */
499 5, /* vec_int_stmt_cost */
500 6, /* vec_fp_stmt_cost */
501 3, /* vec_permute_cost */
502 6, /* vec_to_scalar_cost */
503 5, /* scalar_to_vec_cost */
504 8, /* vec_align_load_cost */
505 8, /* vec_unalign_load_cost */
506 4, /* vec_unalign_store_cost */
507 4, /* vec_store_cost */
508 2, /* cond_taken_branch_cost */
509 1 /* cond_not_taken_branch_cost */
512 /* Generic costs for branch instructions. */
513 static const struct cpu_branch_cost generic_branch_cost =
515 1, /* Predictable. */
516 3 /* Unpredictable. */
519 /* Generic approximation modes. */
520 static const cpu_approx_modes generic_approx_modes =
522 AARCH64_APPROX_NONE, /* division */
523 AARCH64_APPROX_NONE, /* sqrt */
524 AARCH64_APPROX_NONE /* recip_sqrt */
527 /* Approximation modes for Exynos M1. */
528 static const cpu_approx_modes exynosm1_approx_modes =
530 AARCH64_APPROX_NONE, /* division */
531 AARCH64_APPROX_ALL, /* sqrt */
532 AARCH64_APPROX_ALL /* recip_sqrt */
535 /* Approximation modes for X-Gene 1. */
536 static const cpu_approx_modes xgene1_approx_modes =
538 AARCH64_APPROX_NONE, /* division */
539 AARCH64_APPROX_NONE, /* sqrt */
540 AARCH64_APPROX_ALL /* recip_sqrt */
543 /* Generic prefetch settings (which disable prefetch). */
544 static const cpu_prefetch_tune generic_prefetch_tune =
546 0, /* num_slots */
547 -1, /* l1_cache_size */
548 -1, /* l1_cache_line_size */
549 -1, /* l2_cache_size */
550 -1 /* default_opt_level */
553 static const cpu_prefetch_tune exynosm1_prefetch_tune =
555 0, /* num_slots */
556 -1, /* l1_cache_size */
557 64, /* l1_cache_line_size */
558 -1, /* l2_cache_size */
559 -1 /* default_opt_level */
562 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
564 4, /* num_slots */
565 32, /* l1_cache_size */
566 64, /* l1_cache_line_size */
567 512, /* l2_cache_size */
568 -1 /* default_opt_level */
571 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
573 8, /* num_slots */
574 32, /* l1_cache_size */
575 128, /* l1_cache_line_size */
576 16*1024, /* l2_cache_size */
577 3 /* default_opt_level */
580 static const cpu_prefetch_tune thunderx_prefetch_tune =
582 8, /* num_slots */
583 32, /* l1_cache_size */
584 128, /* l1_cache_line_size */
585 -1, /* l2_cache_size */
586 -1 /* default_opt_level */
589 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
591 8, /* num_slots */
592 32, /* l1_cache_size */
593 64, /* l1_cache_line_size */
594 256, /* l2_cache_size */
595 -1 /* default_opt_level */
598 static const struct tune_params generic_tunings =
600 &cortexa57_extra_costs,
601 &generic_addrcost_table,
602 &generic_regmove_cost,
603 &generic_vector_cost,
604 &generic_branch_cost,
605 &generic_approx_modes,
606 4, /* memmov_cost */
607 2, /* issue_rate */
608 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
609 8, /* function_align. */
610 4, /* jump_align. */
611 8, /* loop_align. */
612 2, /* int_reassoc_width. */
613 4, /* fp_reassoc_width. */
614 1, /* vec_reassoc_width. */
615 2, /* min_div_recip_mul_sf. */
616 2, /* min_div_recip_mul_df. */
617 0, /* max_case_values. */
618 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
619 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
620 &generic_prefetch_tune
623 static const struct tune_params cortexa35_tunings =
625 &cortexa53_extra_costs,
626 &generic_addrcost_table,
627 &cortexa53_regmove_cost,
628 &generic_vector_cost,
629 &generic_branch_cost,
630 &generic_approx_modes,
631 4, /* memmov_cost */
632 1, /* issue_rate */
633 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
634 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
635 16, /* function_align. */
636 4, /* jump_align. */
637 8, /* loop_align. */
638 2, /* int_reassoc_width. */
639 4, /* fp_reassoc_width. */
640 1, /* vec_reassoc_width. */
641 2, /* min_div_recip_mul_sf. */
642 2, /* min_div_recip_mul_df. */
643 0, /* max_case_values. */
644 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
645 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
646 &generic_prefetch_tune
649 static const struct tune_params cortexa53_tunings =
651 &cortexa53_extra_costs,
652 &generic_addrcost_table,
653 &cortexa53_regmove_cost,
654 &generic_vector_cost,
655 &generic_branch_cost,
656 &generic_approx_modes,
657 4, /* memmov_cost */
658 2, /* issue_rate */
659 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
660 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
661 16, /* function_align. */
662 4, /* jump_align. */
663 8, /* loop_align. */
664 2, /* int_reassoc_width. */
665 4, /* fp_reassoc_width. */
666 1, /* vec_reassoc_width. */
667 2, /* min_div_recip_mul_sf. */
668 2, /* min_div_recip_mul_df. */
669 0, /* max_case_values. */
670 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
671 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
672 &generic_prefetch_tune
675 static const struct tune_params cortexa57_tunings =
677 &cortexa57_extra_costs,
678 &generic_addrcost_table,
679 &cortexa57_regmove_cost,
680 &cortexa57_vector_cost,
681 &generic_branch_cost,
682 &generic_approx_modes,
683 4, /* memmov_cost */
684 3, /* issue_rate */
685 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
686 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
687 16, /* function_align. */
688 4, /* jump_align. */
689 8, /* loop_align. */
690 2, /* int_reassoc_width. */
691 4, /* fp_reassoc_width. */
692 1, /* vec_reassoc_width. */
693 2, /* min_div_recip_mul_sf. */
694 2, /* min_div_recip_mul_df. */
695 0, /* max_case_values. */
696 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
697 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
698 &generic_prefetch_tune
701 static const struct tune_params cortexa72_tunings =
703 &cortexa57_extra_costs,
704 &generic_addrcost_table,
705 &cortexa57_regmove_cost,
706 &cortexa57_vector_cost,
707 &generic_branch_cost,
708 &generic_approx_modes,
709 4, /* memmov_cost */
710 3, /* issue_rate */
711 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
712 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
713 16, /* function_align. */
714 4, /* jump_align. */
715 8, /* loop_align. */
716 2, /* int_reassoc_width. */
717 4, /* fp_reassoc_width. */
718 1, /* vec_reassoc_width. */
719 2, /* min_div_recip_mul_sf. */
720 2, /* min_div_recip_mul_df. */
721 0, /* max_case_values. */
722 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
723 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
724 &generic_prefetch_tune
727 static const struct tune_params cortexa73_tunings =
729 &cortexa57_extra_costs,
730 &generic_addrcost_table,
731 &cortexa57_regmove_cost,
732 &cortexa57_vector_cost,
733 &generic_branch_cost,
734 &generic_approx_modes,
735 4, /* memmov_cost. */
736 2, /* issue_rate. */
737 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
738 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
739 16, /* function_align. */
740 4, /* jump_align. */
741 8, /* loop_align. */
742 2, /* int_reassoc_width. */
743 4, /* fp_reassoc_width. */
744 1, /* vec_reassoc_width. */
745 2, /* min_div_recip_mul_sf. */
746 2, /* min_div_recip_mul_df. */
747 0, /* max_case_values. */
748 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
749 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
750 &generic_prefetch_tune
755 static const struct tune_params exynosm1_tunings =
757 &exynosm1_extra_costs,
758 &exynosm1_addrcost_table,
759 &exynosm1_regmove_cost,
760 &exynosm1_vector_cost,
761 &generic_branch_cost,
762 &exynosm1_approx_modes,
763 4, /* memmov_cost */
764 3, /* issue_rate */
765 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
766 4, /* function_align. */
767 4, /* jump_align. */
768 4, /* loop_align. */
769 2, /* int_reassoc_width. */
770 4, /* fp_reassoc_width. */
771 1, /* vec_reassoc_width. */
772 2, /* min_div_recip_mul_sf. */
773 2, /* min_div_recip_mul_df. */
774 48, /* max_case_values. */
775 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
776 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
777 &exynosm1_prefetch_tune
780 static const struct tune_params thunderxt88_tunings =
782 &thunderx_extra_costs,
783 &generic_addrcost_table,
784 &thunderx_regmove_cost,
785 &thunderx_vector_cost,
786 &generic_branch_cost,
787 &generic_approx_modes,
788 6, /* memmov_cost */
789 2, /* issue_rate */
790 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
791 8, /* function_align. */
792 8, /* jump_align. */
793 8, /* loop_align. */
794 2, /* int_reassoc_width. */
795 4, /* fp_reassoc_width. */
796 1, /* vec_reassoc_width. */
797 2, /* min_div_recip_mul_sf. */
798 2, /* min_div_recip_mul_df. */
799 0, /* max_case_values. */
800 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
801 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
802 &thunderxt88_prefetch_tune
805 static const struct tune_params thunderx_tunings =
807 &thunderx_extra_costs,
808 &generic_addrcost_table,
809 &thunderx_regmove_cost,
810 &thunderx_vector_cost,
811 &generic_branch_cost,
812 &generic_approx_modes,
813 6, /* memmov_cost */
814 2, /* issue_rate */
815 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
816 8, /* function_align. */
817 8, /* jump_align. */
818 8, /* loop_align. */
819 2, /* int_reassoc_width. */
820 4, /* fp_reassoc_width. */
821 1, /* vec_reassoc_width. */
822 2, /* min_div_recip_mul_sf. */
823 2, /* min_div_recip_mul_df. */
824 0, /* max_case_values. */
825 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
826 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
827 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
828 &thunderx_prefetch_tune
831 static const struct tune_params xgene1_tunings =
833 &xgene1_extra_costs,
834 &xgene1_addrcost_table,
835 &xgene1_regmove_cost,
836 &xgene1_vector_cost,
837 &generic_branch_cost,
838 &xgene1_approx_modes,
839 6, /* memmov_cost */
840 4, /* issue_rate */
841 AARCH64_FUSE_NOTHING, /* fusible_ops */
842 16, /* function_align. */
843 8, /* jump_align. */
844 16, /* loop_align. */
845 2, /* int_reassoc_width. */
846 4, /* fp_reassoc_width. */
847 1, /* vec_reassoc_width. */
848 2, /* min_div_recip_mul_sf. */
849 2, /* min_div_recip_mul_df. */
850 0, /* max_case_values. */
851 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
852 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
853 &generic_prefetch_tune
856 static const struct tune_params qdf24xx_tunings =
858 &qdf24xx_extra_costs,
859 &generic_addrcost_table,
860 &qdf24xx_regmove_cost,
861 &generic_vector_cost,
862 &generic_branch_cost,
863 &generic_approx_modes,
864 4, /* memmov_cost */
865 4, /* issue_rate */
866 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
867 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
868 16, /* function_align. */
869 8, /* jump_align. */
870 16, /* loop_align. */
871 2, /* int_reassoc_width. */
872 4, /* fp_reassoc_width. */
873 1, /* vec_reassoc_width. */
874 2, /* min_div_recip_mul_sf. */
875 2, /* min_div_recip_mul_df. */
876 0, /* max_case_values. */
877 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
878 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
879 &qdf24xx_prefetch_tune
882 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
883 for now. */
884 static const struct tune_params saphira_tunings =
886 &generic_extra_costs,
887 &generic_addrcost_table,
888 &generic_regmove_cost,
889 &generic_vector_cost,
890 &generic_branch_cost,
891 &generic_approx_modes,
892 4, /* memmov_cost */
893 4, /* issue_rate */
894 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
895 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
896 16, /* function_align. */
897 8, /* jump_align. */
898 16, /* loop_align. */
899 2, /* int_reassoc_width. */
900 4, /* fp_reassoc_width. */
901 1, /* vec_reassoc_width. */
902 2, /* min_div_recip_mul_sf. */
903 2, /* min_div_recip_mul_df. */
904 0, /* max_case_values. */
905 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
906 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
907 &generic_prefetch_tune
910 static const struct tune_params thunderx2t99_tunings =
912 &thunderx2t99_extra_costs,
913 &thunderx2t99_addrcost_table,
914 &thunderx2t99_regmove_cost,
915 &thunderx2t99_vector_cost,
916 &generic_branch_cost,
917 &generic_approx_modes,
918 4, /* memmov_cost. */
919 4, /* issue_rate. */
920 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
921 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
922 16, /* function_align. */
923 8, /* jump_align. */
924 16, /* loop_align. */
925 3, /* int_reassoc_width. */
926 2, /* fp_reassoc_width. */
927 2, /* vec_reassoc_width. */
928 2, /* min_div_recip_mul_sf. */
929 2, /* min_div_recip_mul_df. */
930 0, /* max_case_values. */
931 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
932 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
933 &thunderx2t99_prefetch_tune
936 /* Support for fine-grained override of the tuning structures. */
937 struct aarch64_tuning_override_function
939 const char* name;
940 void (*parse_override)(const char*, struct tune_params*);
943 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
944 static void aarch64_parse_tune_string (const char*, struct tune_params*);
946 static const struct aarch64_tuning_override_function
947 aarch64_tuning_override_functions[] =
949 { "fuse", aarch64_parse_fuse_string },
950 { "tune", aarch64_parse_tune_string },
951 { NULL, NULL }
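
/* These parsers implement the fine-grained -moverride option; the value
   names they accept are the ones listed in the tables above, so e.g.
   (roughly) -moverride=fuse=all or -moverride=tune=none selects the "all"
   and "none" entries of aarch64_fusible_pairs and aarch64_tuning_flags
   respectively.  */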
954 /* A processor implementing AArch64. */
955 struct processor
957 const char *const name;
958 enum aarch64_processor ident;
959 enum aarch64_processor sched_core;
960 enum aarch64_arch arch;
961 unsigned architecture_version;
962 const unsigned long flags;
963 const struct tune_params *const tune;
966 /* Architectures implementing AArch64. */
967 static const struct processor all_architectures[] =
969 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
970 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
971 #include "aarch64-arches.def"
972 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
975 /* Processor cores implementing AArch64. */
976 static const struct processor all_cores[] =
978 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
979 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
980 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
981 FLAGS, &COSTS##_tunings},
982 #include "aarch64-cores.def"
983 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
984 AARCH64_FL_FOR_ARCH8, &generic_tunings},
985 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
989 /* Target specification. These are populated by the -march, -mtune, -mcpu
990 handling code or by target attributes. */
991 static const struct processor *selected_arch;
992 static const struct processor *selected_cpu;
993 static const struct processor *selected_tune;
995 /* The current tuning set. */
996 struct tune_params aarch64_tune_params = generic_tunings;
998 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1000 /* An ISA extension in the co-processor and main instruction set space. */
1001 struct aarch64_option_extension
1003 const char *const name;
1004 const unsigned long flags_on;
1005 const unsigned long flags_off;
1008 typedef enum aarch64_cond_code
1010 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1011 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1012 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1014 aarch64_cc;
1016 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1018 /* The condition codes of the processor, and the inverse function. */
1019 static const char * const aarch64_condition_codes[] =
1021 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1022 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1025 /* Generate code to enable conditional branches in functions over 1 MiB. */
1026 const char *
1027 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1028 const char * branch_format)
1030 rtx_code_label * tmp_label = gen_label_rtx ();
1031 char label_buf[256];
1032 char buffer[128];
1033 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1034 CODE_LABEL_NUMBER (tmp_label));
1035 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1036 rtx dest_label = operands[pos_label];
1037 operands[pos_label] = tmp_label;
1039 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1040 output_asm_insn (buffer, operands);
1042 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1043 operands[pos_label] = dest_label;
1044 output_asm_insn (buffer, operands);
1045 return "";
1048 void
1049 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
1051 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
1052 if (TARGET_GENERAL_REGS_ONLY)
1053 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
1054 else
1055 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
1058 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1059 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
1060 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
1061 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
1062 cost (in this case the best class is the lowest cost one). Using ALL_REGS
1063 irrespective of its cost results in bad allocations with many redundant
1064 int<->FP moves which are expensive on various cores.
1065 To avoid this we don't allow ALL_REGS as the allocno class, but force a
1066 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
1067 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
1068 Otherwise set the allocno class depending on the mode.
1069 The result of this is that it is no longer inefficient to have a higher
1070 memory move cost than the register move cost.
1073 static reg_class_t
1074 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1075 reg_class_t best_class)
1077 machine_mode mode;
1079 if (allocno_class != ALL_REGS)
1080 return allocno_class;
1082 if (best_class != ALL_REGS)
1083 return best_class;
1085 mode = PSEUDO_REGNO_MODE (regno);
1086 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
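
/* For example, with this hook a DFmode or V4SImode pseudo whose allocno
   and best classes are both ALL_REGS is given FP_REGS as its allocno
   class, while a DImode pseudo in the same situation gets GENERAL_REGS.  */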
1089 static unsigned int
1090 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1092 if (GET_MODE_UNIT_SIZE (mode) == 4)
1093 return aarch64_tune_params.min_div_recip_mul_sf;
1094 return aarch64_tune_params.min_div_recip_mul_df;
1097 /* Return the reassociation width of treeop OPC with mode MODE. */
1098 static int
1099 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1101 if (VECTOR_MODE_P (mode))
1102 return aarch64_tune_params.vec_reassoc_width;
1103 if (INTEGRAL_MODE_P (mode))
1104 return aarch64_tune_params.int_reassoc_width;
1105 /* Avoid reassociating floating point addition so we emit more FMAs. */
1106 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1107 return aarch64_tune_params.fp_reassoc_width;
1108 return 1;
1111 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1112 unsigned
1113 aarch64_dbx_register_number (unsigned regno)
1115 if (GP_REGNUM_P (regno))
1116 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1117 else if (regno == SP_REGNUM)
1118 return AARCH64_DWARF_SP;
1119 else if (FP_REGNUM_P (regno))
1120 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1121 else if (PR_REGNUM_P (regno))
1122 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1123 else if (regno == VG_REGNUM)
1124 return AARCH64_DWARF_VG;
1126 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1127 equivalent DWARF register. */
1128 return DWARF_FRAME_REGISTERS;
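
/* Under the AArch64 DWARF register numbering this maps, for instance,
   x0-x30 to 0-30, sp to 31, v0-v31 to 64-95, the SVE predicate registers
   p0-p15 to 48-63 and the vector-granule register VG to 46; the exact
   values come from the AARCH64_DWARF_* constants in aarch64.h.  */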
1131 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1132 static bool
1133 aarch64_advsimd_struct_mode_p (machine_mode mode)
1135 return (TARGET_SIMD
1136 && (mode == OImode || mode == CImode || mode == XImode));
1139 /* Return true if MODE is an SVE predicate mode. */
1140 static bool
1141 aarch64_sve_pred_mode_p (machine_mode mode)
1143 return (TARGET_SVE
1144 && (mode == VNx16BImode
1145 || mode == VNx8BImode
1146 || mode == VNx4BImode
1147 || mode == VNx2BImode));
1150 /* Three mutually-exclusive flags describing a vector or predicate type. */
1151 const unsigned int VEC_ADVSIMD = 1;
1152 const unsigned int VEC_SVE_DATA = 2;
1153 const unsigned int VEC_SVE_PRED = 4;
1154 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1155 a structure of 2, 3 or 4 vectors. */
1156 const unsigned int VEC_STRUCT = 8;
1157 /* Useful combinations of the above. */
1158 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1159 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1161 /* Return a set of flags describing the vector properties of mode MODE.
1162 Ignore modes that are not supported by the current target. */
1163 static unsigned int
1164 aarch64_classify_vector_mode (machine_mode mode)
1166 if (aarch64_advsimd_struct_mode_p (mode))
1167 return VEC_ADVSIMD | VEC_STRUCT;
1169 if (aarch64_sve_pred_mode_p (mode))
1170 return VEC_SVE_PRED;
1172 scalar_mode inner = GET_MODE_INNER (mode);
1173 if (VECTOR_MODE_P (mode)
1174 && (inner == QImode
1175 || inner == HImode
1176 || inner == HFmode
1177 || inner == SImode
1178 || inner == SFmode
1179 || inner == DImode
1180 || inner == DFmode))
1182 if (TARGET_SVE)
1184 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1185 return VEC_SVE_DATA;
1186 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1187 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1188 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1189 return VEC_SVE_DATA | VEC_STRUCT;
1192 /* This includes V1DF but not V1DI (which doesn't exist). */
1193 if (TARGET_SIMD
1194 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1195 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1196 return VEC_ADVSIMD;
1199 return 0;
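
/* Some example classifications (assuming the usual variable-length SVE
   configuration): V8QImode and V4SImode are VEC_ADVSIMD, OImode (a pair
   of Advanced SIMD vectors) is VEC_ADVSIMD | VEC_STRUCT, VNx4SImode is
   VEC_SVE_DATA, VNx16BImode is VEC_SVE_PRED, and scalar modes such as
   DImode return 0.  */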
1202 /* Return true if MODE is any of the data vector modes, including
1203 structure modes. */
1204 static bool
1205 aarch64_vector_data_mode_p (machine_mode mode)
1207 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1210 /* Return true if MODE is an SVE data vector mode; either a single vector
1211 or a structure of vectors. */
1212 static bool
1213 aarch64_sve_data_mode_p (machine_mode mode)
1215 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1218 /* Implement target hook TARGET_ARRAY_MODE. */
1219 static opt_machine_mode
1220 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1222 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1223 && IN_RANGE (nelems, 2, 4))
1224 return mode_for_vector (GET_MODE_INNER (mode),
1225 GET_MODE_NUNITS (mode) * nelems);
1227 return opt_machine_mode ();
1230 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1231 static bool
1232 aarch64_array_mode_supported_p (machine_mode mode,
1233 unsigned HOST_WIDE_INT nelems)
1235 if (TARGET_SIMD
1236 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1237 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1238 && (nelems >= 2 && nelems <= 4))
1239 return true;
1241 return false;
1244 /* Return the SVE predicate mode to use for elements that have
1245 ELEM_NBYTES bytes, if such a mode exists. */
1247 opt_machine_mode
1248 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1250 if (TARGET_SVE)
1252 if (elem_nbytes == 1)
1253 return VNx16BImode;
1254 if (elem_nbytes == 2)
1255 return VNx8BImode;
1256 if (elem_nbytes == 4)
1257 return VNx4BImode;
1258 if (elem_nbytes == 8)
1259 return VNx2BImode;
1261 return opt_machine_mode ();
1264 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1266 static opt_machine_mode
1267 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1269 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1271 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1272 machine_mode pred_mode;
1273 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1274 return pred_mode;
1277 return default_get_mask_mode (nunits, nbytes);
1280 /* Implement TARGET_HARD_REGNO_NREGS. */
1282 static unsigned int
1283 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1285 /* ??? Logically we should only need to provide a value when
1286 HARD_REGNO_MODE_OK says that the combination is valid,
1287 but at the moment we need to handle all modes. Just ignore
1288 any runtime parts for registers that can't store them. */
1289 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1290 switch (aarch64_regno_regclass (regno))
1292 case FP_REGS:
1293 case FP_LO_REGS:
1294 if (aarch64_sve_data_mode_p (mode))
1295 return exact_div (GET_MODE_SIZE (mode),
1296 BYTES_PER_SVE_VECTOR).to_constant ();
1297 return CEIL (lowest_size, UNITS_PER_VREG);
1298 case PR_REGS:
1299 case PR_LO_REGS:
1300 case PR_HI_REGS:
1301 return 1;
1302 default:
1303 return CEIL (lowest_size, UNITS_PER_WORD);
1305 gcc_unreachable ();
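
/* For example, a TImode value occupies two GENERAL_REGS but only one FP
   register (UNITS_PER_VREG is 16 bytes), and a single SVE data vector such
   as VNx4SImode always occupies exactly one FP register regardless of the
   runtime vector length.  */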
1308 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1310 static bool
1311 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1313 if (GET_MODE_CLASS (mode) == MODE_CC)
1314 return regno == CC_REGNUM;
1316 if (regno == VG_REGNUM)
1317 /* This must have the same size as _Unwind_Word. */
1318 return mode == DImode;
1320 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1321 if (vec_flags & VEC_SVE_PRED)
1322 return PR_REGNUM_P (regno);
1324 if (PR_REGNUM_P (regno))
1325 return 0;
1327 if (regno == SP_REGNUM)
1328 /* The purpose of comparing with ptr_mode is to support the
1329 global register variable associated with the stack pointer
1330 register via the syntax of asm ("wsp") in ILP32. */
1331 return mode == Pmode || mode == ptr_mode;
1333 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1334 return mode == Pmode;
1336 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1337 return true;
1339 if (FP_REGNUM_P (regno))
1341 if (vec_flags & VEC_STRUCT)
1342 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1343 else
1344 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1347 return false;
1350 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1351 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1352 clobbers the top 64 bits when restoring the bottom 64 bits. */
1354 static bool
1355 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1357 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
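
/* For example, a 16-byte V4SFmode value live in a V register across a call
   is treated as partially clobbered (only its low 64 bits survive), whereas
   an 8-byte DFmode value in the same register is preserved.  */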
1360 /* Implement REGMODE_NATURAL_SIZE. */
1361 poly_uint64
1362 aarch64_regmode_natural_size (machine_mode mode)
1364 /* The natural size for SVE data modes is one SVE data vector,
1365 and similarly for predicates. We can't independently modify
1366 anything smaller than that. */
1367 /* ??? For now, only do this for variable-width SVE registers.
1368 Doing it for constant-sized registers breaks lower-subreg.c. */
1369 /* ??? And once that's fixed, we should probably have similar
1370 code for Advanced SIMD. */
1371 if (!aarch64_sve_vg.is_constant ())
1373 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1374 if (vec_flags & VEC_SVE_PRED)
1375 return BYTES_PER_SVE_PRED;
1376 if (vec_flags & VEC_SVE_DATA)
1377 return BYTES_PER_SVE_VECTOR;
1379 return UNITS_PER_WORD;
1382 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1383 machine_mode
1384 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1385 machine_mode mode)
1387 /* The predicate mode determines which bits are significant and
1388 which are "don't care". Decreasing the number of lanes would
1389 lose data while increasing the number of lanes would make bits
1390 unnecessarily significant. */
1391 if (PR_REGNUM_P (regno))
1392 return mode;
1393 if (known_ge (GET_MODE_SIZE (mode), 4))
1394 return mode;
1395 else
1396 return SImode;
1399 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1400 that strcpy from constants will be faster. */
1402 static HOST_WIDE_INT
1403 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1405 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1406 return MAX (align, BITS_PER_WORD);
1407 return align;
1410 /* Return true if calls to DECL should be treated as
1411 long-calls (i.e. called via a register). */
1412 static bool
1413 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1415 return false;
1418 /* Return true if calls to symbol-ref SYM should be treated as
1419 long-calls (i.e. called via a register). */
1420 bool
1421 aarch64_is_long_call_p (rtx sym)
1423 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1426 /* Return true if calls to symbol-ref SYM should not go through
1427 plt stubs. */
1429 bool
1430 aarch64_is_noplt_call_p (rtx sym)
1432 const_tree decl = SYMBOL_REF_DECL (sym);
1434 if (flag_pic
1435 && decl
1436 && (!flag_plt
1437 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1438 && !targetm.binds_local_p (decl))
1439 return true;
1441 return false;
1444 /* Return true if the offsets to a zero/sign-extract operation
1445 represent an expression that matches an extend operation. The
1446 operands represent the parameters from
1448 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1449 bool
1450 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1451 rtx extract_imm)
1453 HOST_WIDE_INT mult_val, extract_val;
1455 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1456 return false;
1458 mult_val = INTVAL (mult_imm);
1459 extract_val = INTVAL (extract_imm);
1461 if (extract_val > 8
1462 && extract_val < GET_MODE_BITSIZE (mode)
1463 && exact_log2 (extract_val & ~7) > 0
1464 && (extract_val & 7) <= 4
1465 && mult_val == (1 << (extract_val & 7)))
1466 return true;
1468 return false;
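
/* A worked example: for MODE == DImode, MULT_IMM == 4 and EXTRACT_IMM == 34
   the test above succeeds (34 & ~7 == 32 is a power of two, 34 & 7 == 2 and
   1 << 2 == 4), corresponding to a 32-bit value extended and then shifted
   left by two bits, which is the form combine produces for the shifted
   sign/zero-extend variants of extended-register operands.  */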
1471 /* Emit an insn that's a simple single-set. Both the operands must be
1472 known to be valid. */
1473 inline static rtx_insn *
1474 emit_set_insn (rtx x, rtx y)
1476 return emit_insn (gen_rtx_SET (x, y));
1479 /* X and Y are two things to compare using CODE. Emit the compare insn and
1480 return the rtx for register 0 in the proper mode. */
1482 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1484 machine_mode mode = SELECT_CC_MODE (code, x, y);
1485 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1487 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1488 return cc_reg;
1491 /* Build the SYMBOL_REF for __tls_get_addr. */
1493 static GTY(()) rtx tls_get_addr_libfunc;
1496 aarch64_tls_get_addr (void)
1498 if (!tls_get_addr_libfunc)
1499 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1500 return tls_get_addr_libfunc;
1503 /* Return the TLS model to use for ADDR. */
1505 static enum tls_model
1506 tls_symbolic_operand_type (rtx addr)
1508 enum tls_model tls_kind = TLS_MODEL_NONE;
1509 if (GET_CODE (addr) == CONST)
1511 poly_int64 addend;
1512 rtx sym = strip_offset (addr, &addend);
1513 if (GET_CODE (sym) == SYMBOL_REF)
1514 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1516 else if (GET_CODE (addr) == SYMBOL_REF)
1517 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1519 return tls_kind;
1522 /* We'll allow lo_sum's in our legitimate addresses so that combine
1523 will take care of combining addresses where necessary, but for
1524 generation purposes we'll generate the address
1525 as:
1526 RTL Absolute
1527 tmp = hi (symbol_ref); adrp x1, foo
1528 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1531 PIC TLS
1532 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1533 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1534 bl __tls_get_addr
1537 Load TLS symbol, depending on TLS mechanism and TLS access model.
1539 Global Dynamic - Traditional TLS:
1540 adrp tmp, :tlsgd:imm
1541 add dest, tmp, #:tlsgd_lo12:imm
1542 bl __tls_get_addr
1544 Global Dynamic - TLS Descriptors:
1545 adrp dest, :tlsdesc:imm
1546 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1547 add dest, dest, #:tlsdesc_lo12:imm
1548 blr tmp
1549 mrs tp, tpidr_el0
1550 add dest, dest, tp
1552 Initial Exec:
1553 mrs tp, tpidr_el0
1554 adrp tmp, :gottprel:imm
1555 ldr dest, [tmp, #:gottprel_lo12:imm]
1556 add dest, dest, tp
1558 Local Exec:
1559 mrs tp, tpidr_el0
1560 add t0, tp, #:tprel_hi12:imm, lsl #12
1561 add t0, t0, #:tprel_lo12_nc:imm
1564 static void
1565 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1566 enum aarch64_symbol_type type)
1568 switch (type)
1570 case SYMBOL_SMALL_ABSOLUTE:
1572 /* In ILP32, the mode of dest can be either SImode or DImode. */
1573 rtx tmp_reg = dest;
1574 machine_mode mode = GET_MODE (dest);
1576 gcc_assert (mode == Pmode || mode == ptr_mode);
1578 if (can_create_pseudo_p ())
1579 tmp_reg = gen_reg_rtx (mode);
1581 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1582 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1583 return;
1586 case SYMBOL_TINY_ABSOLUTE:
1587 emit_insn (gen_rtx_SET (dest, imm));
1588 return;
1590 case SYMBOL_SMALL_GOT_28K:
1592 machine_mode mode = GET_MODE (dest);
1593 rtx gp_rtx = pic_offset_table_rtx;
1594 rtx insn;
1595 rtx mem;
1597 /* NOTE: pic_offset_table_rtx can be NULL_RTX because we can reach
1598 here before RTL expansion. Tree IVOPTS generates RTL patterns to
1599 decide rtx costs, in which case pic_offset_table_rtx is not
1600 initialized. In that case there is no need to generate the first adrp
1601 instruction, as the final cost of a global variable access is
1602 one instruction. */
1603 if (gp_rtx != NULL)
1605 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1606 use the page base as the GOT base, the first page may be wasted;
1607 in the worst case there is only 28K of space for the GOT).
1609 The generated instruction sequence for accessing a global variable is:
1612 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1614 Only one instruction is needed, but we must initialize
1615 pic_offset_table_rtx properly. We generate an initialization insn for
1616 every global access and let CSE remove all redundant copies.
1618 The final instruction sequence will look like the following
1619 when accessing multiple global variables:
1621 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1623 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1624 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1625 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1626 ... */
1628 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1629 crtl->uses_pic_offset_table = 1;
1630 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1632 if (mode != GET_MODE (gp_rtx))
1633 gp_rtx = gen_lowpart (mode, gp_rtx);
1637 if (mode == ptr_mode)
1639 if (mode == DImode)
1640 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1641 else
1642 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1644 mem = XVECEXP (SET_SRC (insn), 0, 0);
1646 else
1648 gcc_assert (mode == Pmode);
1650 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1651 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1654 /* The operand is expected to be a MEM. Whenever the related insn
1655 pattern changes, the code above that calculates MEM should be
1656 updated. */
1657 gcc_assert (GET_CODE (mem) == MEM);
1658 MEM_READONLY_P (mem) = 1;
1659 MEM_NOTRAP_P (mem) = 1;
1660 emit_insn (insn);
1661 return;
1664 case SYMBOL_SMALL_GOT_4G:
1666 /* In ILP32, the mode of dest can be either SImode or DImode,
1667 while the got entry is always of SImode size. The mode of
1668 dest depends on how dest is used: if dest is assigned to a
1669 pointer (e.g. in the memory), it has SImode; it may have
1670 DImode if dest is dereferenced to access the memory.
1671 This is why we have to handle three different ldr_got_small
1672 patterns here (two patterns for ILP32). */
1674 rtx insn;
1675 rtx mem;
1676 rtx tmp_reg = dest;
1677 machine_mode mode = GET_MODE (dest);
1679 if (can_create_pseudo_p ())
1680 tmp_reg = gen_reg_rtx (mode);
1682 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1683 if (mode == ptr_mode)
1685 if (mode == DImode)
1686 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1687 else
1688 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1690 mem = XVECEXP (SET_SRC (insn), 0, 0);
1692 else
1694 gcc_assert (mode == Pmode);
1696 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1697 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1700 gcc_assert (GET_CODE (mem) == MEM);
1701 MEM_READONLY_P (mem) = 1;
1702 MEM_NOTRAP_P (mem) = 1;
1703 emit_insn (insn);
1704 return;
1707 case SYMBOL_SMALL_TLSGD:
1709 rtx_insn *insns;
1710 machine_mode mode = GET_MODE (dest);
1711 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1713 start_sequence ();
1714 if (TARGET_ILP32)
1715 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1716 else
1717 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1718 insns = get_insns ();
1719 end_sequence ();
1721 RTL_CONST_CALL_P (insns) = 1;
1722 emit_libcall_block (insns, dest, result, imm);
1723 return;
1726 case SYMBOL_SMALL_TLSDESC:
1728 machine_mode mode = GET_MODE (dest);
1729 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1730 rtx tp;
1732 gcc_assert (mode == Pmode || mode == ptr_mode);
1734 /* In ILP32, the got entry is always of SImode size. Unlike
1735 small GOT, the dest is fixed at reg 0. */
1736 if (TARGET_ILP32)
1737 emit_insn (gen_tlsdesc_small_si (imm));
1738 else
1739 emit_insn (gen_tlsdesc_small_di (imm));
1740 tp = aarch64_load_tp (NULL);
1742 if (mode != Pmode)
1743 tp = gen_lowpart (mode, tp);
1745 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1746 if (REG_P (dest))
1747 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1748 return;
1751 case SYMBOL_SMALL_TLSIE:
1753 /* In ILP32, the mode of dest can be either SImode or DImode,
1754 while the got entry is always of SImode size. The mode of
1755 dest depends on how dest is used: if dest is assigned to a
1756 pointer (e.g. in the memory), it has SImode; it may have
1757 DImode if dest is dereferenced to access the memory.
1758 This is why we have to handle three different tlsie_small
1759 patterns here (two patterns for ILP32). */
1760 machine_mode mode = GET_MODE (dest);
1761 rtx tmp_reg = gen_reg_rtx (mode);
1762 rtx tp = aarch64_load_tp (NULL);
1764 if (mode == ptr_mode)
1766 if (mode == DImode)
1767 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1768 else
1770 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1771 tp = gen_lowpart (mode, tp);
1774 else
1776 gcc_assert (mode == Pmode);
1777 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1780 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1781 if (REG_P (dest))
1782 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1783 return;
1786 case SYMBOL_TLSLE12:
1787 case SYMBOL_TLSLE24:
1788 case SYMBOL_TLSLE32:
1789 case SYMBOL_TLSLE48:
1791 machine_mode mode = GET_MODE (dest);
1792 rtx tp = aarch64_load_tp (NULL);
1794 if (mode != Pmode)
1795 tp = gen_lowpart (mode, tp);
1797 switch (type)
1799 case SYMBOL_TLSLE12:
1800 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1801 (dest, tp, imm));
1802 break;
1803 case SYMBOL_TLSLE24:
1804 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1805 (dest, tp, imm));
1806 break;
1807 case SYMBOL_TLSLE32:
1808 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1809 (dest, imm));
1810 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1811 (dest, dest, tp));
1812 break;
1813 case SYMBOL_TLSLE48:
1814 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1815 (dest, imm));
1816 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1817 (dest, dest, tp));
1818 break;
1819 default:
1820 gcc_unreachable ();
1823 if (REG_P (dest))
1824 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1825 return;
1828 case SYMBOL_TINY_GOT:
1829 emit_insn (gen_ldr_got_tiny (dest, imm));
1830 return;
1832 case SYMBOL_TINY_TLSIE:
1834 machine_mode mode = GET_MODE (dest);
1835 rtx tp = aarch64_load_tp (NULL);
1837 if (mode == ptr_mode)
1839 if (mode == DImode)
1840 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1841 else
1843 tp = gen_lowpart (mode, tp);
1844 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1847 else
1849 gcc_assert (mode == Pmode);
1850 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1853 if (REG_P (dest))
1854 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1855 return;
1858 default:
1859 gcc_unreachable ();
1863 /* Emit a move from SRC to DEST. Assume that the move expanders can
1864 handle all moves if !can_create_pseudo_p (). The distinction is
1865 important because, unlike emit_move_insn, the move expanders know
1866 how to force Pmode objects into the constant pool even when the
1867 constant pool address is not itself legitimate. */
1868 static rtx
1869 aarch64_emit_move (rtx dest, rtx src)
1871 return (can_create_pseudo_p ()
1872 ? emit_move_insn (dest, src)
1873 : emit_move_insn_1 (dest, src));
1876 /* Apply UNOPTAB to OP and store the result in DEST. */
1878 static void
1879 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
1881 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
1882 if (dest != tmp)
1883 emit_move_insn (dest, tmp);
1886 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
1888 static void
1889 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
1891 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
1892 OPTAB_DIRECT);
1893 if (dest != tmp)
1894 emit_move_insn (dest, tmp);
1897 /* Split a 128-bit move operation into two 64-bit move operations,
1898 taking care to handle partial overlap of register to register
1899 copies. Special cases are needed when moving between GP regs and
1900 FP regs. SRC can be a register, constant or memory; DST a register
1901 or memory. If either operand is memory it must not have any side
1902 effects. */
1903 void
1904 aarch64_split_128bit_move (rtx dst, rtx src)
1906 rtx dst_lo, dst_hi;
1907 rtx src_lo, src_hi;
1909 machine_mode mode = GET_MODE (dst);
1911 gcc_assert (mode == TImode || mode == TFmode);
1912 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1913 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1915 if (REG_P (dst) && REG_P (src))
1917 int src_regno = REGNO (src);
1918 int dst_regno = REGNO (dst);
1920 /* Handle FP <-> GP regs. */
1921 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1923 src_lo = gen_lowpart (word_mode, src);
1924 src_hi = gen_highpart (word_mode, src);
1926 if (mode == TImode)
1928 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1929 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1931 else
1933 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1934 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1936 return;
1938 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1940 dst_lo = gen_lowpart (word_mode, dst);
1941 dst_hi = gen_highpart (word_mode, dst);
1943 if (mode == TImode)
1945 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1946 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1948 else
1950 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1951 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1953 return;
1957 dst_lo = gen_lowpart (word_mode, dst);
1958 dst_hi = gen_highpart (word_mode, dst);
1959 src_lo = gen_lowpart (word_mode, src);
1960 src_hi = gen_highpart_mode (word_mode, mode, src);
1962 /* At most one pairing may overlap. */
1963 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1965 aarch64_emit_move (dst_hi, src_hi);
1966 aarch64_emit_move (dst_lo, src_lo);
1968 else
1970 aarch64_emit_move (dst_lo, src_lo);
1971 aarch64_emit_move (dst_hi, src_hi);
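/* Overlap example (illustrative): splitting a TImode copy from the pair
   x0/x1 into x1/x2 leaves dst_lo (x1) overlapping src_hi (x1), so the
   code above emits the high-half move first (x2 <- x1) and only then
   the low-half move (x1 <- x0), preserving the source value.  */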
1975 bool
1976 aarch64_split_128bit_move_p (rtx dst, rtx src)
1978 return (! REG_P (src)
1979 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1982 /* Split a complex SIMD combine. */
1984 void
1985 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1987 machine_mode src_mode = GET_MODE (src1);
1988 machine_mode dst_mode = GET_MODE (dst);
1990 gcc_assert (VECTOR_MODE_P (dst_mode));
1991 gcc_assert (register_operand (dst, dst_mode)
1992 && register_operand (src1, src_mode)
1993 && register_operand (src2, src_mode));
1995 rtx (*gen) (rtx, rtx, rtx);
1997 switch (src_mode)
1999 case E_V8QImode:
2000 gen = gen_aarch64_simd_combinev8qi;
2001 break;
2002 case E_V4HImode:
2003 gen = gen_aarch64_simd_combinev4hi;
2004 break;
2005 case E_V2SImode:
2006 gen = gen_aarch64_simd_combinev2si;
2007 break;
2008 case E_V4HFmode:
2009 gen = gen_aarch64_simd_combinev4hf;
2010 break;
2011 case E_V2SFmode:
2012 gen = gen_aarch64_simd_combinev2sf;
2013 break;
2014 case E_DImode:
2015 gen = gen_aarch64_simd_combinedi;
2016 break;
2017 case E_DFmode:
2018 gen = gen_aarch64_simd_combinedf;
2019 break;
2020 default:
2021 gcc_unreachable ();
2024 emit_insn (gen (dst, src1, src2));
2025 return;
2028 /* Split a complex SIMD move. */
2030 void
2031 aarch64_split_simd_move (rtx dst, rtx src)
2033 machine_mode src_mode = GET_MODE (src);
2034 machine_mode dst_mode = GET_MODE (dst);
2036 gcc_assert (VECTOR_MODE_P (dst_mode));
2038 if (REG_P (dst) && REG_P (src))
2040 rtx (*gen) (rtx, rtx);
2042 gcc_assert (VECTOR_MODE_P (src_mode));
2044 switch (src_mode)
2046 case E_V16QImode:
2047 gen = gen_aarch64_split_simd_movv16qi;
2048 break;
2049 case E_V8HImode:
2050 gen = gen_aarch64_split_simd_movv8hi;
2051 break;
2052 case E_V4SImode:
2053 gen = gen_aarch64_split_simd_movv4si;
2054 break;
2055 case E_V2DImode:
2056 gen = gen_aarch64_split_simd_movv2di;
2057 break;
2058 case E_V8HFmode:
2059 gen = gen_aarch64_split_simd_movv8hf;
2060 break;
2061 case E_V4SFmode:
2062 gen = gen_aarch64_split_simd_movv4sf;
2063 break;
2064 case E_V2DFmode:
2065 gen = gen_aarch64_split_simd_movv2df;
2066 break;
2067 default:
2068 gcc_unreachable ();
2071 emit_insn (gen (dst, src));
2072 return;
2076 bool
2077 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2078 machine_mode ymode, rtx y)
2080 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2081 gcc_assert (r != NULL);
2082 return rtx_equal_p (x, r);
2086 static rtx
2087 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2089 if (can_create_pseudo_p ())
2090 return force_reg (mode, value);
2091 else
2093 gcc_assert (x);
2094 aarch64_emit_move (x, value);
2095 return x;
2099 /* Return true if we can move VALUE into a register using a single
2100 CNT[BHWD] instruction. */
2102 static bool
2103 aarch64_sve_cnt_immediate_p (poly_int64 value)
2105 HOST_WIDE_INT factor = value.coeffs[0];
2106 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2107 return (value.coeffs[1] == factor
2108 && IN_RANGE (factor, 2, 16 * 16)
2109 && (factor & 1) == 0
2110 && factor <= 16 * (factor & -factor));
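/* Illustrative values: (2, 2) is a plain CNTD and (6, 6) is
   "cntd ..., all, mul #3", so both pass the test above; (34, 34)
   fails, since it would need a multiplier of 17, outside [1, 16].  */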
2113 /* Likewise for rtx X. */
2115 bool
2116 aarch64_sve_cnt_immediate_p (rtx x)
2118 poly_int64 value;
2119 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2122 /* Return the asm string for an instruction with a CNT-like vector size
2123 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2124 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2125 first part of the operands template (the part that comes before the
2126 vector size itself). FACTOR is the number of quadwords.
2127 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2128 If it is zero, we can use any element size. */
2130 static char *
2131 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2132 unsigned int factor,
2133 unsigned int nelts_per_vq)
2135 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2137 if (nelts_per_vq == 0)
2138 /* There is some overlap in the ranges of the four CNT instructions.
2139 Here we always use the smallest possible element size, so that the
2140 multiplier is 1 wherever possible. */
2141 nelts_per_vq = factor & -factor;
2142 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2143 gcc_assert (IN_RANGE (shift, 1, 4));
2144 char suffix = "dwhb"[shift - 1];
2146 factor >>= shift;
2147 unsigned int written;
2148 if (factor == 1)
2149 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2150 prefix, suffix, operands);
2151 else
2152 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2153 prefix, suffix, operands, factor);
2154 gcc_assert (written < sizeof (buffer));
2155 return buffer;
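/* Rough example: called with PREFIX "cnt", OPERANDS "%x0", FACTOR 12
   and NELTS_PER_VQ 0, the code above picks the W element size
   (12 = 4 * 3) and returns "cntw\t%x0, all, mul #3", while FACTOR 16
   collapses to a plain "cntb\t%x0".  */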
2158 /* Return the asm string for an instruction with a CNT-like vector size
2159 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2160 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2161 first part of the operands template (the part that comes before the
2162 vector size itself). X is the value of the vector size operand,
2163 as a polynomial integer rtx. */
2165 char *
2166 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2167 rtx x)
2169 poly_int64 value = rtx_to_poly_int64 (x);
2170 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2171 return aarch64_output_sve_cnt_immediate (prefix, operands,
2172 value.coeffs[1], 0);
2175 /* Return true if we can add VALUE to a register using a single ADDVL
2176 or ADDPL instruction. */
2178 static bool
2179 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2181 HOST_WIDE_INT factor = value.coeffs[0];
2182 if (factor == 0 || value.coeffs[1] != factor)
2183 return false;
2184 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2185 and a value of 16 is one vector width. */
2186 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2187 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
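/* Illustrative values: (16, 16) is one full vector (ADDVL #1),
   (2, 2) is one predicate width (ADDPL #1) and (18, 18) is still
   representable as ADDPL #9.  An odd factor such as (1, 1), or a
   multiple outside ADDVL's [-32, 31] vectors and ADDPL's [-32, 31]
   predicates, is rejected.  */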
2190 /* Likewise for rtx X. */
2192 bool
2193 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2195 poly_int64 value;
2196 return (poly_int_rtx_p (x, &value)
2197 && aarch64_sve_addvl_addpl_immediate_p (value));
2200 /* Return the asm string for adding ADDVL or ADDPL immediate OFFSET to operand 1
2201 and storing the result in operand 0. */
2203 char *
2204 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2206 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2207 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2208 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2210 /* Use INC or DEC if possible. */
2211 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2213 if (aarch64_sve_cnt_immediate_p (offset_value))
2214 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2215 offset_value.coeffs[1], 0);
2216 if (aarch64_sve_cnt_immediate_p (-offset_value))
2217 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2218 -offset_value.coeffs[1], 0);
2221 int factor = offset_value.coeffs[1];
2222 if ((factor & 15) == 0)
2223 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2224 else
2225 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2226 return buffer;
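/* Sketch of the possible outputs (illustrative): with DEST equal to
   BASE, an offset of (32, 32) prints as "incb\t%x0, all, mul #2" and
   (-2, -2) as "decd\t%x0"; with distinct registers, (16, 16) becomes
   "addvl\t%x0, %x1, #1" and (2, 2) becomes "addpl\t%x0, %x1, #1".  */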
2229 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2230 instruction. If it is, store the number of elements in each vector
2231 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2232 factor in *FACTOR_OUT (if nonnull). */
2234 bool
2235 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2236 unsigned int *nelts_per_vq_out)
2238 rtx elt;
2239 poly_int64 value;
2241 if (!const_vec_duplicate_p (x, &elt)
2242 || !poly_int_rtx_p (elt, &value))
2243 return false;
2245 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2246 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2247 /* There's no vector INCB. */
2248 return false;
2250 HOST_WIDE_INT factor = value.coeffs[0];
2251 if (value.coeffs[1] != factor)
2252 return false;
2254 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2255 if ((factor % nelts_per_vq) != 0
2256 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2257 return false;
2259 if (factor_out)
2260 *factor_out = factor;
2261 if (nelts_per_vq_out)
2262 *nelts_per_vq_out = nelts_per_vq;
2263 return true;
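/* Example (illustrative): a VNx4SI duplicate of (8, 8) has four 32-bit
   elements per quadword, so FACTOR 8 = 4 * 2 is accepted and later
   prints as "incw ..., all, mul #2"; a byte vector has 16 elements per
   quadword and is rejected, since there is no vector INCB.  */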
2266 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2267 instruction. */
2269 bool
2270 aarch64_sve_inc_dec_immediate_p (rtx x)
2272 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2275 /* Return the asm template for an SVE vector INC or DEC instruction.
2276 OPERANDS gives the operands before the vector count and X is the
2277 value of the vector count operand itself. */
2279 char *
2280 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2282 int factor;
2283 unsigned int nelts_per_vq;
2284 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2285 gcc_unreachable ();
2286 if (factor < 0)
2287 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2288 nelts_per_vq);
2289 else
2290 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2291 nelts_per_vq);
2294 static int
2295 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2296 scalar_int_mode mode)
2298 int i;
2299 unsigned HOST_WIDE_INT val, val2, mask;
2300 int one_match, zero_match;
2301 int num_insns;
2303 val = INTVAL (imm);
2305 if (aarch64_move_imm (val, mode))
2307 if (generate)
2308 emit_insn (gen_rtx_SET (dest, imm));
2309 return 1;
2312 /* Check to see if the low 32 bits are either 0xffffXXXX or 0xXXXXffff
2313 (with XXXX non-zero). In that case check to see if the move can be done in
2314 a smaller mode. */
2315 val2 = val & 0xffffffff;
2316 if (mode == DImode
2317 && aarch64_move_imm (val2, SImode)
2318 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2320 if (generate)
2321 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2323 /* Check whether we have to emit a second instruction by seeing
2324 if any of the upper 32 bits of the original DImode value are set. */
2325 if (val == val2)
2326 return 1;
2328 i = (val >> 48) ? 48 : 32;
2330 if (generate)
2331 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2332 GEN_INT ((val >> i) & 0xffff)));
2334 return 2;
2337 if ((val >> 32) == 0 || mode == SImode)
2339 if (generate)
2341 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2342 if (mode == SImode)
2343 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2344 GEN_INT ((val >> 16) & 0xffff)));
2345 else
2346 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2347 GEN_INT ((val >> 16) & 0xffff)));
2349 return 2;
2352 /* Remaining cases are all for DImode. */
2354 mask = 0xffff;
2355 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2356 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2357 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2358 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2360 if (zero_match != 2 && one_match != 2)
2362 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2363 For a 64-bit bitmask try whether changing 16 bits to all ones or
2364 zeroes creates a valid bitmask. To check any repeated bitmask,
2365 try using 16 bits from the other 32-bit half of val. */
2367 for (i = 0; i < 64; i += 16, mask <<= 16)
2369 val2 = val & ~mask;
2370 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2371 break;
2372 val2 = val | mask;
2373 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2374 break;
2375 val2 = val2 & ~mask;
2376 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2377 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2378 break;
2380 if (i != 64)
2382 if (generate)
2384 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2385 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2386 GEN_INT ((val >> i) & 0xffff)));
2388 return 2;
2392 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2393 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2394 otherwise skip zero bits. */
2396 num_insns = 1;
2397 mask = 0xffff;
2398 val2 = one_match > zero_match ? ~val : val;
2399 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2401 if (generate)
2402 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2403 ? (val | ~(mask << i))
2404 : (val & (mask << i)))));
2405 for (i += 16; i < 64; i += 16)
2407 if ((val2 & (mask << i)) == 0)
2408 continue;
2409 if (generate)
2410 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2411 GEN_INT ((val >> i) & 0xffff)));
2412 num_insns ++;
2415 return num_insns;
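/* Worked example (illustrative): for the DImode constant
   0x0000123456789abc none of the single-instruction or bitmask
   shortcuts apply, so the final loop emits

       mov  x0, #0x9abc
       movk x0, #0x5678, lsl #16
       movk x0, #0x1234, lsl #32

   skipping the all-zero top chunk, and the function returns 3.  */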
2418 /* Return whether imm is a 128-bit immediate which is simple enough to
2419 expand inline. */
2420 bool
2421 aarch64_mov128_immediate (rtx imm)
2423 if (GET_CODE (imm) == CONST_INT)
2424 return true;
2426 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2428 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2429 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2431 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2432 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2436 /* Return the number of temporary registers that aarch64_add_offset_1
2437 would need to add OFFSET to a register. */
2439 static unsigned int
2440 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2442 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2445 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2446 a non-polynomial OFFSET. MODE is the mode of the addition.
2447 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2448 be set and CFA adjustments added to the generated instructions.
2450 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2451 temporary if register allocation is already complete. This temporary
2452 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2453 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2454 the immediate again.
2456 Since this function may be used to adjust the stack pointer, we must
2457 ensure that it cannot cause transient stack deallocation (for example
2458 by first incrementing SP and then decrementing when adjusting by a
2459 large immediate). */
2461 static void
2462 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2463 rtx src, HOST_WIDE_INT offset, rtx temp1,
2464 bool frame_related_p, bool emit_move_imm)
2466 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2467 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2469 HOST_WIDE_INT moffset = abs_hwi (offset);
2470 rtx_insn *insn;
2472 if (!moffset)
2474 if (!rtx_equal_p (dest, src))
2476 insn = emit_insn (gen_rtx_SET (dest, src));
2477 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2479 return;
2482 /* Single instruction adjustment. */
2483 if (aarch64_uimm12_shift (moffset))
2485 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2486 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2487 return;
2490 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2491 and either:
2493 a) the offset cannot be loaded by a 16-bit move or
2494 b) there is no spare register into which we can move it. */
2495 if (moffset < 0x1000000
2496 && ((!temp1 && !can_create_pseudo_p ())
2497 || !aarch64_move_imm (moffset, mode)))
2499 HOST_WIDE_INT low_off = moffset & 0xfff;
2501 low_off = offset < 0 ? -low_off : low_off;
2502 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2503 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2504 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2505 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2506 return;
2509 /* Emit a move immediate if required and an addition/subtraction. */
2510 if (emit_move_imm)
2512 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2513 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2515 insn = emit_insn (offset < 0
2516 ? gen_sub3_insn (dest, src, temp1)
2517 : gen_add3_insn (dest, src, temp1));
2518 if (frame_related_p)
2520 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2521 rtx adj = plus_constant (mode, src, offset);
2522 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
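/* Sketch of the two-addition case (illustrative): adding 0x123456 after
   register allocation with no spare temporary splits into
   "add dest, src, #0x456" followed by "add dest, dest, #0x123000",
   both of which fit the shifted 12-bit immediate form and neither of
   which transiently deallocates the stack.  */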
2526 /* Return the number of temporary registers that aarch64_add_offset
2527 would need to move OFFSET into a register or add OFFSET to a register;
2528 ADD_P is true if we want the latter rather than the former. */
2530 static unsigned int
2531 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2533 /* This follows the same structure as aarch64_add_offset. */
2534 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2535 return 0;
2537 unsigned int count = 0;
2538 HOST_WIDE_INT factor = offset.coeffs[1];
2539 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2540 poly_int64 poly_offset (factor, factor);
2541 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2542 /* Need one register for the ADDVL/ADDPL result. */
2543 count += 1;
2544 else if (factor != 0)
2546 factor = abs (factor);
2547 if (factor > 16 * (factor & -factor))
2548 /* Need one register for the CNT result and one for the multiplication
2549 factor. If necessary, the second temporary can be reused for the
2550 constant part of the offset. */
2551 return 2;
2552 /* Need one register for the CNT result (which might then
2553 be shifted). */
2554 count += 1;
2556 return count + aarch64_add_offset_1_temporaries (constant);
2559 /* If X can be represented as a poly_int64, return the number
2560 of temporaries that are required to add it to a register.
2561 Return -1 otherwise. */
2563 int
2564 aarch64_add_offset_temporaries (rtx x)
2566 poly_int64 offset;
2567 if (!poly_int_rtx_p (x, &offset))
2568 return -1;
2569 return aarch64_offset_temporaries (true, offset);
2572 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2573 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2574 be set and CFA adjustments added to the generated instructions.
2576 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2577 temporary if register allocation is already complete. This temporary
2578 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2579 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2580 false to avoid emitting the immediate again.
2582 TEMP2, if nonnull, is a second temporary register that doesn't
2583 overlap either DEST or REG.
2585 Since this function may be used to adjust the stack pointer, we must
2586 ensure that it cannot cause transient stack deallocation (for example
2587 by first incrementing SP and then decrementing when adjusting by a
2588 large immediate). */
2590 static void
2591 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2592 poly_int64 offset, rtx temp1, rtx temp2,
2593 bool frame_related_p, bool emit_move_imm = true)
2595 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2596 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2597 gcc_assert (temp1 == NULL_RTX
2598 || !frame_related_p
2599 || !reg_overlap_mentioned_p (temp1, dest));
2600 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2602 /* Try using ADDVL or ADDPL to add the whole value. */
2603 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2605 rtx offset_rtx = gen_int_mode (offset, mode);
2606 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2607 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2608 return;
2611 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2612 SVE vector register, over and above the minimum size of 128 bits.
2613 This is equivalent to half the value returned by CNTD with a
2614 vector shape of ALL. */
2615 HOST_WIDE_INT factor = offset.coeffs[1];
2616 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2618 /* Try using ADDVL or ADDPL to add the VG-based part. */
2619 poly_int64 poly_offset (factor, factor);
2620 if (src != const0_rtx
2621 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2623 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2624 if (frame_related_p)
2626 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2627 RTX_FRAME_RELATED_P (insn) = true;
2628 src = dest;
2630 else
2632 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2633 src = aarch64_force_temporary (mode, temp1, addr);
2634 temp1 = temp2;
2635 temp2 = NULL_RTX;
2638 /* Otherwise use a CNT-based sequence. */
2639 else if (factor != 0)
2641 /* Use a subtraction if we have a negative factor. */
2642 rtx_code code = PLUS;
2643 if (factor < 0)
2645 factor = -factor;
2646 code = MINUS;
2649 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2650 into the multiplication. */
2651 rtx val;
2652 int shift = 0;
2653 if (factor & 1)
2654 /* Use a right shift by 1. */
2655 shift = -1;
2656 else
2657 factor /= 2;
2658 HOST_WIDE_INT low_bit = factor & -factor;
2659 if (factor <= 16 * low_bit)
2661 if (factor > 16 * 8)
2663 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2664 the value with the minimum multiplier and shift it into
2665 position. */
2666 int extra_shift = exact_log2 (low_bit);
2667 shift += extra_shift;
2668 factor >>= extra_shift;
2670 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2672 else
2674 /* Use CNTD, then multiply it by FACTOR. */
2675 val = gen_int_mode (poly_int64 (2, 2), mode);
2676 val = aarch64_force_temporary (mode, temp1, val);
2678 /* Go back to using a negative multiplication factor if we have
2679 no register from which to subtract. */
2680 if (code == MINUS && src == const0_rtx)
2682 factor = -factor;
2683 code = PLUS;
2685 rtx coeff1 = gen_int_mode (factor, mode);
2686 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2687 val = gen_rtx_MULT (mode, val, coeff1);
2690 if (shift > 0)
2692 /* Multiply by 1 << SHIFT. */
2693 val = aarch64_force_temporary (mode, temp1, val);
2694 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2696 else if (shift == -1)
2698 /* Divide by 2. */
2699 val = aarch64_force_temporary (mode, temp1, val);
2700 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2703 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2704 if (src != const0_rtx)
2706 val = aarch64_force_temporary (mode, temp1, val);
2707 val = gen_rtx_fmt_ee (code, mode, src, val);
2709 else if (code == MINUS)
2711 val = aarch64_force_temporary (mode, temp1, val);
2712 val = gen_rtx_NEG (mode, val);
2715 if (constant == 0 || frame_related_p)
2717 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2718 if (frame_related_p)
2720 RTX_FRAME_RELATED_P (insn) = true;
2721 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2722 gen_rtx_SET (dest, plus_constant (Pmode, src,
2723 poly_offset)));
2725 src = dest;
2726 if (constant == 0)
2727 return;
2729 else
2731 src = aarch64_force_temporary (mode, temp1, val);
2732 temp1 = temp2;
2733 temp2 = NULL_RTX;
2736 emit_move_imm = true;
2739 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2740 frame_related_p, emit_move_imm);
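/* Worked example (illustrative): an offset of (80, 64), i.e. four
   vector lengths plus 16 bytes, splits into the VG-based part (64, 64),
   handled by "addvl dest, src, #4", followed by a call to
   aarch64_add_offset_1 for the remaining constant 16.  */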
2743 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2744 than a poly_int64. */
2746 void
2747 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2748 rtx offset_rtx, rtx temp1, rtx temp2)
2750 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2751 temp1, temp2, false);
2754 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2755 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2756 if TEMP1 already contains abs (DELTA). */
2758 static inline void
2759 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2761 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2762 temp1, temp2, true, emit_move_imm);
2765 /* Subtract DELTA from the stack pointer, marking the instructions
2766 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2767 if nonnull. */
2769 static inline void
2770 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2772 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2773 temp1, temp2, frame_related_p);
2776 /* Set DEST to (vec_series BASE STEP). */
2778 static void
2779 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2781 machine_mode mode = GET_MODE (dest);
2782 scalar_mode inner = GET_MODE_INNER (mode);
2784 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2785 if (!aarch64_sve_index_immediate_p (base))
2786 base = force_reg (inner, base);
2787 if (!aarch64_sve_index_immediate_p (step))
2788 step = force_reg (inner, step);
2790 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2793 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2794 integer of mode INT_MODE. Return true on success. */
2796 static bool
2797 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2798 rtx src)
2800 /* If the constant is smaller than 128 bits, we can do the move
2801 using a vector of SRC_MODEs. */
2802 if (src_mode != TImode)
2804 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2805 GET_MODE_SIZE (src_mode));
2806 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2807 emit_move_insn (gen_lowpart (dup_mode, dest),
2808 gen_const_vec_duplicate (dup_mode, src));
2809 return true;
2812 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2813 src = force_const_mem (src_mode, src);
2814 if (!src)
2815 return false;
2817 /* Make sure that the address is legitimate. */
2818 if (!aarch64_sve_ld1r_operand_p (src))
2820 rtx addr = force_reg (Pmode, XEXP (src, 0));
2821 src = replace_equiv_address (src, addr);
2824 machine_mode mode = GET_MODE (dest);
2825 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2826 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2827 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2828 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2829 emit_insn (gen_rtx_SET (dest, src));
2830 return true;
2833 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2834 isn't a simple duplicate or series. */
2836 static void
2837 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2839 machine_mode mode = GET_MODE (src);
2840 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2841 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2842 gcc_assert (npatterns > 1);
2844 if (nelts_per_pattern == 1)
2846 /* The constant is a repeating sequence of at least two elements,
2847 where the repeating elements occupy no more than 128 bits.
2848 Get an integer representation of the replicated value. */
2849 scalar_int_mode int_mode;
2850 if (BYTES_BIG_ENDIAN)
2851 /* For now, always use LD1RQ to load the value on big-endian
2852 targets, since the handling of smaller integers includes a
2853 subreg that is semantically an element reverse. */
2854 int_mode = TImode;
2855 else
2857 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2858 gcc_assert (int_bits <= 128);
2859 int_mode = int_mode_for_size (int_bits, 0).require ();
2861 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2862 if (int_value
2863 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2864 return;
2867 /* Expand each pattern individually. */
2868 rtx_vector_builder builder;
2869 auto_vec<rtx, 16> vectors (npatterns);
2870 for (unsigned int i = 0; i < npatterns; ++i)
2872 builder.new_vector (mode, 1, nelts_per_pattern);
2873 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2874 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2875 vectors.quick_push (force_reg (mode, builder.build ()));
2878 /* Use permutes to interleave the separate vectors. */
2879 while (npatterns > 1)
2881 npatterns /= 2;
2882 for (unsigned int i = 0; i < npatterns; ++i)
2884 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2885 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2886 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2887 vectors[i] = tmp;
2890 gcc_assert (vectors[0] == dest);
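/* Illustration: for a constant with four patterns of more than one
   element each, the loop above first builds four single-pattern
   vectors, then the ZIP1 stage interleaves them pairwise in two
   rounds, with the final round writing directly into DEST.  */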
2893 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2894 is a pattern that can be used to set DEST to a replicated scalar
2895 element. */
2897 void
2898 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2899 rtx (*gen_vec_duplicate) (rtx, rtx))
2901 machine_mode mode = GET_MODE (dest);
2903 /* Check on what type of symbol it is. */
2904 scalar_int_mode int_mode;
2905 if ((GET_CODE (imm) == SYMBOL_REF
2906 || GET_CODE (imm) == LABEL_REF
2907 || GET_CODE (imm) == CONST
2908 || GET_CODE (imm) == CONST_POLY_INT)
2909 && is_a <scalar_int_mode> (mode, &int_mode))
2911 rtx mem;
2912 poly_int64 offset;
2913 HOST_WIDE_INT const_offset;
2914 enum aarch64_symbol_type sty;
2916 /* If we have (const (plus symbol offset)), separate out the offset
2917 before we start classifying the symbol. */
2918 rtx base = strip_offset (imm, &offset);
2920 /* We must always add an offset involving VL separately, rather than
2921 folding it into the relocation. */
2922 if (!offset.is_constant (&const_offset))
2924 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2925 emit_insn (gen_rtx_SET (dest, imm));
2926 else
2928 /* Do arithmetic on 32-bit values if the result is smaller
2929 than that. */
2930 if (partial_subreg_p (int_mode, SImode))
2932 /* It is invalid to do symbol calculations in modes
2933 narrower than SImode. */
2934 gcc_assert (base == const0_rtx);
2935 dest = gen_lowpart (SImode, dest);
2936 int_mode = SImode;
2938 if (base != const0_rtx)
2940 base = aarch64_force_temporary (int_mode, dest, base);
2941 aarch64_add_offset (int_mode, dest, base, offset,
2942 NULL_RTX, NULL_RTX, false);
2944 else
2945 aarch64_add_offset (int_mode, dest, base, offset,
2946 dest, NULL_RTX, false);
2948 return;
2951 sty = aarch64_classify_symbol (base, const_offset);
2952 switch (sty)
2954 case SYMBOL_FORCE_TO_MEM:
2955 if (const_offset != 0
2956 && targetm.cannot_force_const_mem (int_mode, imm))
2958 gcc_assert (can_create_pseudo_p ());
2959 base = aarch64_force_temporary (int_mode, dest, base);
2960 aarch64_add_offset (int_mode, dest, base, const_offset,
2961 NULL_RTX, NULL_RTX, false);
2962 return;
2965 mem = force_const_mem (ptr_mode, imm);
2966 gcc_assert (mem);
2968 /* If we aren't generating PC relative literals, then
2969 we need to expand the literal pool access carefully.
2970 This is something that needs to be done in a number
2971 of places, so could well live as a separate function. */
2972 if (!aarch64_pcrelative_literal_loads)
2974 gcc_assert (can_create_pseudo_p ());
2975 base = gen_reg_rtx (ptr_mode);
2976 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
2977 if (ptr_mode != Pmode)
2978 base = convert_memory_address (Pmode, base);
2979 mem = gen_rtx_MEM (ptr_mode, base);
2982 if (int_mode != ptr_mode)
2983 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
2985 emit_insn (gen_rtx_SET (dest, mem));
2987 return;
2989 case SYMBOL_SMALL_TLSGD:
2990 case SYMBOL_SMALL_TLSDESC:
2991 case SYMBOL_SMALL_TLSIE:
2992 case SYMBOL_SMALL_GOT_28K:
2993 case SYMBOL_SMALL_GOT_4G:
2994 case SYMBOL_TINY_GOT:
2995 case SYMBOL_TINY_TLSIE:
2996 if (const_offset != 0)
2998 gcc_assert (can_create_pseudo_p ());
2999 base = aarch64_force_temporary (int_mode, dest, base);
3000 aarch64_add_offset (int_mode, dest, base, const_offset,
3001 NULL_RTX, NULL_RTX, false);
3002 return;
3004 /* FALLTHRU */
3006 case SYMBOL_SMALL_ABSOLUTE:
3007 case SYMBOL_TINY_ABSOLUTE:
3008 case SYMBOL_TLSLE12:
3009 case SYMBOL_TLSLE24:
3010 case SYMBOL_TLSLE32:
3011 case SYMBOL_TLSLE48:
3012 aarch64_load_symref_appropriately (dest, imm, sty);
3013 return;
3015 default:
3016 gcc_unreachable ();
3020 if (!CONST_INT_P (imm))
3022 rtx base, step, value;
3023 if (GET_CODE (imm) == HIGH
3024 || aarch64_simd_valid_immediate (imm, NULL))
3025 emit_insn (gen_rtx_SET (dest, imm));
3026 else if (const_vec_series_p (imm, &base, &step))
3027 aarch64_expand_vec_series (dest, base, step);
3028 else if (const_vec_duplicate_p (imm, &value))
3030 /* If the constant is out of range of an SVE vector move,
3031 load it from memory if we can, otherwise move it into
3032 a register and use a DUP. */
3033 scalar_mode inner_mode = GET_MODE_INNER (mode);
3034 rtx op = force_const_mem (inner_mode, value);
3035 if (!op)
3036 op = force_reg (inner_mode, value);
3037 else if (!aarch64_sve_ld1r_operand_p (op))
3039 rtx addr = force_reg (Pmode, XEXP (op, 0));
3040 op = replace_equiv_address (op, addr);
3042 emit_insn (gen_vec_duplicate (dest, op));
3044 else if (GET_CODE (imm) == CONST_VECTOR
3045 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3046 aarch64_expand_sve_const_vector (dest, imm);
3047 else
3049 rtx mem = force_const_mem (mode, imm);
3050 gcc_assert (mem);
3051 emit_move_insn (dest, mem);
3054 return;
3057 aarch64_internal_mov_immediate (dest, imm, true,
3058 as_a <scalar_int_mode> (mode));
3061 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3062 that is known to contain PTRUE. */
3064 void
3065 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3067 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3068 gen_rtvec (2, pred, src),
3069 UNSPEC_MERGE_PTRUE)));
3072 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3073 operand is in memory. In this case we need to use the predicated LD1
3074 and ST1 instead of LDR and STR, both for correctness on big-endian
3075 targets and because LD1 and ST1 support a wider range of addressing modes.
3076 PRED_MODE is the mode of the predicate.
3078 See the comment at the head of aarch64-sve.md for details about the
3079 big-endian handling. */
3081 void
3082 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3084 machine_mode mode = GET_MODE (dest);
3085 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3086 if (!register_operand (src, mode)
3087 && !register_operand (dest, mode))
3089 rtx tmp = gen_reg_rtx (mode);
3090 if (MEM_P (src))
3091 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3092 else
3093 emit_move_insn (tmp, src);
3094 src = tmp;
3096 aarch64_emit_sve_pred_move (dest, ptrue, src);
3099 /* Called only on big-endian targets. See whether an SVE vector move
3100 from SRC to DEST is effectively a REV[BHW] instruction, because at
3101 least one operand is a subreg of an SVE vector that has wider or
3102 narrower elements. Return true and emit the instruction if so.
3104 For example:
3106 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3108 represents a VIEW_CONVERT between the following vectors, viewed
3109 in memory order:
3111 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3112 R1: { [0], [1], [2], [3], ... }
3114 The high part of lane X in R2 should therefore correspond to lane X*2
3115 of R1, but the register representations are:
3117 msb lsb
3118 R2: ...... [1].high [1].low [0].high [0].low
3119 R1: ...... [3] [2] [1] [0]
3121 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3122 We therefore need a reverse operation to swap the high and low values
3123 around.
3125 This is purely an optimization. Without it we would spill the
3126 subreg operand to the stack in one mode and reload it in the
3127 other mode, which has the same effect as the REV. */
3129 bool
3130 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3132 gcc_assert (BYTES_BIG_ENDIAN);
3133 if (GET_CODE (dest) == SUBREG)
3134 dest = SUBREG_REG (dest);
3135 if (GET_CODE (src) == SUBREG)
3136 src = SUBREG_REG (src);
3138 /* The optimization handles two single SVE REGs with different element
3139 sizes. */
3140 if (!REG_P (dest)
3141 || !REG_P (src)
3142 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3143 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3144 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3145 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3146 return false;
3148 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3149 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3150 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3151 UNSPEC_REV_SUBREG);
3152 emit_insn (gen_rtx_SET (dest, unspec));
3153 return true;
3156 /* Return a copy of X with mode MODE, without changing its other
3157 attributes. Unlike gen_lowpart, this doesn't care whether the
3158 mode change is valid. */
3160 static rtx
3161 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3163 if (GET_MODE (x) == mode)
3164 return x;
3166 x = shallow_copy_rtx (x);
3167 set_mode_and_regno (x, mode, REGNO (x));
3168 return x;
3171 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3172 operands. */
3174 void
3175 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3177 /* Decide which REV operation we need. The mode with narrower elements
3178 determines the mode of the operands and the mode with the wider
3179 elements determines the reverse width. */
3180 machine_mode mode_with_wider_elts = GET_MODE (dest);
3181 machine_mode mode_with_narrower_elts = GET_MODE (src);
3182 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3183 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3184 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3186 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3187 unsigned int unspec;
3188 if (wider_bytes == 8)
3189 unspec = UNSPEC_REV64;
3190 else if (wider_bytes == 4)
3191 unspec = UNSPEC_REV32;
3192 else if (wider_bytes == 2)
3193 unspec = UNSPEC_REV16;
3194 else
3195 gcc_unreachable ();
3196 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3198 /* Emit:
3200 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3201 UNSPEC_MERGE_PTRUE))
3203 with the appropriate modes. */
3204 ptrue = gen_lowpart (pred_mode, ptrue);
3205 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3206 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3207 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3208 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3209 UNSPEC_MERGE_PTRUE);
3210 emit_insn (gen_rtx_SET (dest, src));
3213 static bool
3214 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3215 tree exp ATTRIBUTE_UNUSED)
3217 /* Currently, always true. */
3218 return true;
3221 /* Implement TARGET_PASS_BY_REFERENCE. */
3223 static bool
3224 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3225 machine_mode mode,
3226 const_tree type,
3227 bool named ATTRIBUTE_UNUSED)
3229 HOST_WIDE_INT size;
3230 machine_mode dummymode;
3231 int nregs;
3233 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3234 if (mode == BLKmode && type)
3235 size = int_size_in_bytes (type);
3236 else
3237 /* No frontends can create types with variable-sized modes, so we
3238 shouldn't be asked to pass or return them. */
3239 size = GET_MODE_SIZE (mode).to_constant ();
3241 /* Aggregates are passed by reference based on their size. */
3242 if (type && AGGREGATE_TYPE_P (type))
3244 size = int_size_in_bytes (type);
3247 /* Variable sized arguments are always returned by reference. */
3248 if (size < 0)
3249 return true;
3251 /* Can this be a candidate to be passed in fp/simd register(s)? */
3252 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3253 &dummymode, &nregs,
3254 NULL))
3255 return false;
3257 /* Arguments which are variable sized or larger than 2 registers are
3258 passed by reference unless they are a homogeneous floating-point
3259 aggregate. */
3260 return size > 2 * UNITS_PER_WORD;
3263 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3264 static bool
3265 aarch64_return_in_msb (const_tree valtype)
3267 machine_mode dummy_mode;
3268 int dummy_int;
3270 /* Never happens in little-endian mode. */
3271 if (!BYTES_BIG_ENDIAN)
3272 return false;
3274 /* Only composite types smaller than or equal to 16 bytes can
3275 be potentially returned in registers. */
3276 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3277 || int_size_in_bytes (valtype) <= 0
3278 || int_size_in_bytes (valtype) > 16)
3279 return false;
3281 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3282 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3283 is always passed/returned in the least significant bits of fp/simd
3284 register(s). */
3285 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3286 &dummy_mode, &dummy_int, NULL))
3287 return false;
3289 return true;
3292 /* Implement TARGET_FUNCTION_VALUE.
3293 Define how to find the value returned by a function. */
3295 static rtx
3296 aarch64_function_value (const_tree type, const_tree func,
3297 bool outgoing ATTRIBUTE_UNUSED)
3299 machine_mode mode;
3300 int unsignedp;
3301 int count;
3302 machine_mode ag_mode;
3304 mode = TYPE_MODE (type);
3305 if (INTEGRAL_TYPE_P (type))
3306 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3308 if (aarch64_return_in_msb (type))
3310 HOST_WIDE_INT size = int_size_in_bytes (type);
3312 if (size % UNITS_PER_WORD != 0)
3314 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3315 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3319 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3320 &ag_mode, &count, NULL))
3322 if (!aarch64_composite_type_p (type, mode))
3324 gcc_assert (count == 1 && mode == ag_mode);
3325 return gen_rtx_REG (mode, V0_REGNUM);
3327 else
3329 int i;
3330 rtx par;
3332 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3333 for (i = 0; i < count; i++)
3335 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3336 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3337 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3338 XVECEXP (par, 0, i) = tmp;
3340 return par;
3343 else
3344 return gen_rtx_REG (mode, R0_REGNUM);
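/* For instance (illustrative): a function returning
   struct { float a, b, c, d; } returns a four-member HFA, so the code
   above builds a PARALLEL of SFmode registers V0..V3 at byte offsets
   0, 4, 8 and 12; a plain int is simply returned in R0.  */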
3347 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
3348 Return true if REGNO is the number of a hard register in which the values
3349 of a called function may come back. */
3351 static bool
3352 aarch64_function_value_regno_p (const unsigned int regno)
3354 /* Maximum of 16 bytes can be returned in the general registers. Examples
3355 of 16-byte return values are: 128-bit integers and 16-byte small
3356 structures (excluding homogeneous floating-point aggregates). */
3357 if (regno == R0_REGNUM || regno == R1_REGNUM)
3358 return true;
3360 /* Up to four fp/simd registers can return a function value, e.g. a
3361 homogeneous floating-point aggregate having four members. */
3362 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3363 return TARGET_FLOAT;
3365 return false;
3368 /* Implement TARGET_RETURN_IN_MEMORY.
3370 If the type T of the result of a function is such that
3371 void func (T arg)
3372 would require that arg be passed as a value in a register (or set of
3373 registers) according to the parameter passing rules, then the result
3374 is returned in the same registers as would be used for such an
3375 argument. */
3377 static bool
3378 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3380 HOST_WIDE_INT size;
3381 machine_mode ag_mode;
3382 int count;
3384 if (!AGGREGATE_TYPE_P (type)
3385 && TREE_CODE (type) != COMPLEX_TYPE
3386 && TREE_CODE (type) != VECTOR_TYPE)
3387 /* Simple scalar types are always returned in registers. */
3388 return false;
3390 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3391 type,
3392 &ag_mode,
3393 &count,
3394 NULL))
3395 return false;
3397 /* Types larger than 2 registers are returned in memory. */
3398 size = int_size_in_bytes (type);
3399 return (size < 0 || size > 2 * UNITS_PER_WORD);
3402 static bool
3403 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3404 const_tree type, int *nregs)
3406 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3407 return aarch64_vfp_is_call_or_return_candidate (mode,
3408 type,
3409 &pcum->aapcs_vfp_rmode,
3410 nregs,
3411 NULL);
3414 /* Given MODE and TYPE of a function argument, return the alignment in
3415 bits. The idea is to suppress any stronger alignment requested by
3416 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3417 This is a helper function for local use only. */
3419 static unsigned int
3420 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3422 if (!type)
3423 return GET_MODE_ALIGNMENT (mode);
3425 if (integer_zerop (TYPE_SIZE (type)))
3426 return 0;
3428 gcc_assert (TYPE_MODE (type) == mode);
3430 if (!AGGREGATE_TYPE_P (type))
3431 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3433 if (TREE_CODE (type) == ARRAY_TYPE)
3434 return TYPE_ALIGN (TREE_TYPE (type));
3436 unsigned int alignment = 0;
3437 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3438 if (TREE_CODE (field) == FIELD_DECL)
3439 alignment = std::max (alignment, DECL_ALIGN (field));
3441 return alignment;
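/* Rough example: for struct { char c; double d; } the loop above takes
   the maximum FIELD_DECL alignment of the members, here
   max (8, 64) = 64 bits, which is the natural AAPCS64 alignment used
   for laying out the argument.  */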
3444 /* Layout a function argument according to the AAPCS64 rules. The rule
3445 numbers refer to the rule numbers in the AAPCS64. */
3447 static void
3448 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3449 const_tree type,
3450 bool named ATTRIBUTE_UNUSED)
3452 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3453 int ncrn, nvrn, nregs;
3454 bool allocate_ncrn, allocate_nvrn;
3455 HOST_WIDE_INT size;
3457 /* We need to do this once per argument. */
3458 if (pcum->aapcs_arg_processed)
3459 return;
3461 pcum->aapcs_arg_processed = true;
3463 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3464 if (type)
3465 size = int_size_in_bytes (type);
3466 else
3467 /* No frontends can create types with variable-sized modes, so we
3468 shouldn't be asked to pass or return them. */
3469 size = GET_MODE_SIZE (mode).to_constant ();
3470 size = ROUND_UP (size, UNITS_PER_WORD);
3472 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3473 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3474 mode,
3475 type,
3476 &nregs);
3478 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3479 The following code thus handles passing by SIMD/FP registers first. */
3481 nvrn = pcum->aapcs_nvrn;
3483 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3484 and homogeneous short-vector aggregates (HVA). */
3485 if (allocate_nvrn)
3487 if (!TARGET_FLOAT)
3488 aarch64_err_no_fpadvsimd (mode, "argument");
3490 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3492 pcum->aapcs_nextnvrn = nvrn + nregs;
3493 if (!aarch64_composite_type_p (type, mode))
3495 gcc_assert (nregs == 1);
3496 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3498 else
3500 rtx par;
3501 int i;
3502 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3503 for (i = 0; i < nregs; i++)
3505 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3506 V0_REGNUM + nvrn + i);
3507 rtx offset = gen_int_mode
3508 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3509 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3510 XVECEXP (par, 0, i) = tmp;
3512 pcum->aapcs_reg = par;
3514 return;
3516 else
3518 /* C.3 NSRN is set to 8. */
3519 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3520 goto on_stack;
3524 ncrn = pcum->aapcs_ncrn;
3525 nregs = size / UNITS_PER_WORD;
3527 /* C6 - C9, though the sign and zero extension semantics are
3528 handled elsewhere. This is the case where the argument fits
3529 entirely in general registers. */
3530 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3533 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3535 /* C.8 if the argument has an alignment of 16 then the NGRN is
3536 rounded up to the next even number. */
3537 if (nregs == 2
3538 && ncrn % 2
3539 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3540 comparison is there because for > 16 * BITS_PER_UNIT
3541 alignment nregs should be > 2 and therefore it should be
3542 passed by reference rather than value. */
3543 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3545 ++ncrn;
3546 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3549 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3550 A reg is still generated for it, but the caller should be smart
3551 enough not to use it. */
3552 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3553 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3554 else
3556 rtx par;
3557 int i;
3559 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3560 for (i = 0; i < nregs; i++)
3562 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3563 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3564 GEN_INT (i * UNITS_PER_WORD));
3565 XVECEXP (par, 0, i) = tmp;
3567 pcum->aapcs_reg = par;
3570 pcum->aapcs_nextncrn = ncrn + nregs;
3571 return;
3574 /* C.11 */
3575 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3577 /* The argument is passed on the stack; record the needed number of words for
3578 this argument and align the total size if necessary. */
3579 on_stack:
3580 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3582 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3583 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3584 16 / UNITS_PER_WORD);
3585 return;
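/* Example of the SIMD/FP path (illustrative): a struct { double x, y; }
   argument is an HFA with NREGS == 2, so while at least two vector
   registers remain it is assigned a PARALLEL of two consecutive DFmode
   V registers; otherwise C.3 sets the NSRN to 8 and the argument is
   laid out on the stack instead.  */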
3588 /* Implement TARGET_FUNCTION_ARG. */
3590 static rtx
3591 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3592 const_tree type, bool named)
3594 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3595 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3597 if (mode == VOIDmode)
3598 return NULL_RTX;
3600 aarch64_layout_arg (pcum_v, mode, type, named);
3601 return pcum->aapcs_reg;
3604 void
3605 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3606 const_tree fntype ATTRIBUTE_UNUSED,
3607 rtx libname ATTRIBUTE_UNUSED,
3608 const_tree fndecl ATTRIBUTE_UNUSED,
3609 unsigned n_named ATTRIBUTE_UNUSED)
3611 pcum->aapcs_ncrn = 0;
3612 pcum->aapcs_nvrn = 0;
3613 pcum->aapcs_nextncrn = 0;
3614 pcum->aapcs_nextnvrn = 0;
3615 pcum->pcs_variant = ARM_PCS_AAPCS64;
3616 pcum->aapcs_reg = NULL_RTX;
3617 pcum->aapcs_arg_processed = false;
3618 pcum->aapcs_stack_words = 0;
3619 pcum->aapcs_stack_size = 0;
3621 if (!TARGET_FLOAT
3622 && fndecl && TREE_PUBLIC (fndecl)
3623 && fntype && fntype != error_mark_node)
3625 const_tree type = TREE_TYPE (fntype);
3626 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3627 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3628 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3629 &mode, &nregs, NULL))
3630 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
3632 return;
3635 static void
3636 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3637 machine_mode mode,
3638 const_tree type,
3639 bool named)
3641 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3642 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3644 aarch64_layout_arg (pcum_v, mode, type, named);
3645 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3646 != (pcum->aapcs_stack_words != 0));
3647 pcum->aapcs_arg_processed = false;
3648 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3649 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3650 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3651 pcum->aapcs_stack_words = 0;
3652 pcum->aapcs_reg = NULL_RTX;
3656 bool
3657 aarch64_function_arg_regno_p (unsigned regno)
3659 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3660 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3663 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3664 PARM_BOUNDARY bits of alignment, but will be given anything up
3665 to STACK_BOUNDARY bits if the type requires it. This makes sure
3666 that both before and after the layout of each argument, the Next
3667 Stacked Argument Address (NSAA) will have a minimum alignment of
3668 8 bytes. */
3670 static unsigned int
3671 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3673 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3674 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3677 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3679 static fixed_size_mode
3680 aarch64_get_reg_raw_mode (int regno)
3682 if (TARGET_SVE && FP_REGNUM_P (regno))
3683 /* Don't use the SVE part of the register for __builtin_apply and
3684 __builtin_return. The SVE registers aren't used by the normal PCS,
3685 so using them there would be a waste of time. The PCS extensions
3686 for SVE types are fundamentally incompatible with the
3687 __builtin_return/__builtin_apply interface. */
3688 return as_a <fixed_size_mode> (V16QImode);
3689 return default_get_reg_raw_mode (regno);
3692 /* Implement TARGET_FUNCTION_ARG_PADDING.
3694 Small aggregate types are placed in the lowest memory address.
3696 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3698 static pad_direction
3699 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3701 /* On little-endian targets, the least significant byte of every stack
3702 argument is passed at the lowest byte address of the stack slot. */
3703 if (!BYTES_BIG_ENDIAN)
3704 return PAD_UPWARD;
3706 /* Otherwise, integral, floating-point and pointer types are padded downward:
3707 the least significant byte of a stack argument is passed at the highest
3708 byte address of the stack slot. */
3709 if (type
3710 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3711 || POINTER_TYPE_P (type))
3712 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3713 return PAD_DOWNWARD;
3715 /* Everything else padded upward, i.e. data in first byte of stack slot. */
3716 return PAD_UPWARD;
3719 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3721 It specifies padding for the last (may also be the only)
3722 element of a block move between registers and memory. Assuming
3723 the block is in memory, padding upward means that the last
3724 element is padded after its most significant byte, while with
3725 downward padding the last element is padded on its least
3726 significant byte side.
3728 Small aggregates and small complex types are always padded
3729 upwards.
3731 We don't need to worry about homogeneous floating-point or
3732 short-vector aggregates; their move is not affected by the
3733 padding direction determined here. Regardless of endianness,
3734 each element of such an aggregate is put in the least
3735 significant bits of a fp/simd register.
3737 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3738 register has useful data, and return the opposite if the most
3739 significant byte does. */
3741 bool
3742 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3743 bool first ATTRIBUTE_UNUSED)
3746 /* Small composite types are always padded upward. */
3747 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3749 HOST_WIDE_INT size;
3750 if (type)
3751 size = int_size_in_bytes (type);
3752 else
3753 /* No frontends can create types with variable-sized modes, so we
3754 shouldn't be asked to pass or return them. */
3755 size = GET_MODE_SIZE (mode).to_constant ();
3756 if (size < 2 * UNITS_PER_WORD)
3757 return true;
3760 /* Otherwise, use the default padding. */
3761 return !BYTES_BIG_ENDIAN;
3764 static scalar_int_mode
3765 aarch64_libgcc_cmp_return_mode (void)
3767 return SImode;
3770 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3772 /* We use the 12-bit shifted immediate arithmetic instructions so values
3773 must be multiple of (1 << 12), i.e. 4096. */
3774 #define ARITH_FACTOR 4096
3776 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3777 #error Cannot use simple address calculation for stack probing
3778 #endif
3780 /* The pair of scratch registers used for stack probing. */
3781 #define PROBE_STACK_FIRST_REG 9
3782 #define PROBE_STACK_SECOND_REG 10
3784 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3785 inclusive. These are offsets from the current stack pointer. */
3787 static void
3788 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3790 HOST_WIDE_INT size;
3791 if (!poly_size.is_constant (&size))
3793 sorry ("stack probes for SVE frames");
3794 return;
3797 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3799 /* See the same assertion on PROBE_INTERVAL above. */
3800 gcc_assert ((first % ARITH_FACTOR) == 0);
3802 /* See if we have a constant small number of probes to generate. If so,
3803 that's the easy case. */
3804 if (size <= PROBE_INTERVAL)
3806 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3808 emit_set_insn (reg1,
3809 plus_constant (Pmode,
3810 stack_pointer_rtx, -(first + base)));
3811 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3814 /* The run-time loop is made up of 8 insns in the generic case while the
3815 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
3816 else if (size <= 4 * PROBE_INTERVAL)
3818 HOST_WIDE_INT i, rem;
3820 emit_set_insn (reg1,
3821 plus_constant (Pmode,
3822 stack_pointer_rtx,
3823 -(first + PROBE_INTERVAL)));
3824 emit_stack_probe (reg1);
3826 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3827 it exceeds SIZE. If only two probes are needed, this will not
3828 generate any code. Then probe at FIRST + SIZE. */
3829 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3831 emit_set_insn (reg1,
3832 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3833 emit_stack_probe (reg1);
3836 rem = size - (i - PROBE_INTERVAL);
3837 if (rem > 256)
3839 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3841 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3842 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3844 else
3845 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3848 /* Otherwise, do the same as above, but in a loop. Note that we must be
3849 extra careful with variables wrapping around because we might be at
3850 the very top (or the very bottom) of the address space and we have
3851 to be able to handle this case properly; in particular, we use an
3852 equality test for the loop condition. */
3853 else
3855 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3857 /* Step 1: round SIZE to the previous multiple of the interval. */
3859 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3862 /* Step 2: compute initial and final value of the loop counter. */
3864 /* TEST_ADDR = SP + FIRST. */
3865 emit_set_insn (reg1,
3866 plus_constant (Pmode, stack_pointer_rtx, -first));
3868 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3869 HOST_WIDE_INT adjustment = - (first + rounded_size);
3870 if (! aarch64_uimm12_shift (adjustment))
3872 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3873 true, Pmode);
3874 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3876 else
3877 emit_set_insn (reg2,
3878 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3880 /* Step 3: the loop
3884	 do  { TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3885	       probe at TEST_ADDR }
3887	 while (TEST_ADDR != LAST_ADDR)
3889	 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3890	 until it is equal to ROUNDED_SIZE. */
3892 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3895 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3896 that SIZE is equal to ROUNDED_SIZE. */
3898 if (size != rounded_size)
3900 HOST_WIDE_INT rem = size - rounded_size;
3902 if (rem > 256)
3904 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3906 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3907 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3909 else
3910 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3914 /* Make sure nothing is scheduled before we are done. */
3915 emit_insn (gen_blockage ());
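/* As a sketch of what the code above emits (illustrative, assuming
   PROBE_INTERVAL == 4096, FIRST == 0 and a constant SIZE of 8192, which
   takes the "compile-time loop" branch):

	sub	x9, sp, 4096
	str	xzr, [x9]
	sub	x9, x9, 4096
	str	xzr, [x9]

   where x9 is PROBE_STACK_FIRST_REG.  */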
3918 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3919 absolute addresses. */
3921 const char *
3922 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3924 static int labelno = 0;
3925 char loop_lab[32];
3926 rtx xops[2];
3928 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3930 /* Loop. */
3931 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3933 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3934 xops[0] = reg1;
3935 xops[1] = GEN_INT (PROBE_INTERVAL);
3936 output_asm_insn ("sub\t%0, %0, %1", xops);
3938 /* Probe at TEST_ADDR. */
3939 output_asm_insn ("str\txzr, [%0]", xops);
3941 /* Test if TEST_ADDR == LAST_ADDR. */
3942 xops[1] = reg2;
3943 output_asm_insn ("cmp\t%0, %1", xops);
3945 /* Branch. */
3946 fputs ("\tb.ne\t", asm_out_file);
3947 assemble_name_raw (asm_out_file, loop_lab);
3948 fputc ('\n', asm_out_file);
3950 return "";
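/* The resulting assembly is roughly (a sketch, assuming REG1 is x9,
   REG2 is x10 and PROBE_INTERVAL == 4096):

	.LPSRL0:
		sub	x9, x9, 4096
		str	xzr, [x9]
		cmp	x9, x10
		b.ne	.LPSRL0
*/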
3953 /* Mark the registers that need to be saved by the callee and calculate
3954 the size of the callee-saved registers area and frame record (both FP
3955 and LR may be omitted). */
3956 static void
3957 aarch64_layout_frame (void)
3959 HOST_WIDE_INT offset = 0;
3960 int regno, last_fp_reg = INVALID_REGNUM;
3962 if (reload_completed && cfun->machine->frame.laid_out)
3963 return;
3965 /* Force a frame chain for EH returns so the return address is at FP+8. */
3966 cfun->machine->frame.emit_frame_chain
3967 = frame_pointer_needed || crtl->calls_eh_return;
3969 /* Emit a frame chain if the frame pointer is enabled.
3970 If -momit-leaf-frame-pointer is used, do not use a frame chain
3971 in leaf functions which do not use LR. */
3972 if (flag_omit_frame_pointer == 2
3973 && !(flag_omit_leaf_frame_pointer && crtl->is_leaf
3974 && !df_regs_ever_live_p (LR_REGNUM)))
3975 cfun->machine->frame.emit_frame_chain = true;
3977 #define SLOT_NOT_REQUIRED (-2)
3978 #define SLOT_REQUIRED (-1)
3980 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
3981 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
3983 /* First mark all the registers that really need to be saved... */
3984 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3985 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3987 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
3988 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
3990 /* ... that includes the eh data registers (if needed)... */
3991 if (crtl->calls_eh_return)
3992 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
3993 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
3994 = SLOT_REQUIRED;
3996 /* ... and any callee saved register that dataflow says is live. */
3997 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
3998 if (df_regs_ever_live_p (regno)
3999 && (regno == R30_REGNUM
4000 || !call_used_regs[regno]))
4001 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4003 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4004 if (df_regs_ever_live_p (regno)
4005 && !call_used_regs[regno])
4007 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4008 last_fp_reg = regno;
4011 if (cfun->machine->frame.emit_frame_chain)
4013 /* FP and LR are placed in the linkage record. */
4014 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4015 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4016 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4017 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4018 offset = 2 * UNITS_PER_WORD;
4021 /* Now assign stack slots for them. */
4022 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4023 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4025 cfun->machine->frame.reg_offset[regno] = offset;
4026 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4027 cfun->machine->frame.wb_candidate1 = regno;
4028 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4029 cfun->machine->frame.wb_candidate2 = regno;
4030 offset += UNITS_PER_WORD;
4033 HOST_WIDE_INT max_int_offset = offset;
4034 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4035 bool has_align_gap = offset != max_int_offset;
4037 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4038 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4040 /* If there is an alignment gap between integer and fp callee-saves,
4041 allocate the last fp register to it if possible. */
4042 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4044 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4045 break;
4048 cfun->machine->frame.reg_offset[regno] = offset;
4049 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4050 cfun->machine->frame.wb_candidate1 = regno;
4051 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4052 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4053 cfun->machine->frame.wb_candidate2 = regno;
4054 offset += UNITS_PER_WORD;
4057 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4059 cfun->machine->frame.saved_regs_size = offset;
4061 HOST_WIDE_INT varargs_and_saved_regs_size
4062 = offset + cfun->machine->frame.saved_varargs_size;
4064 cfun->machine->frame.hard_fp_offset
4065 = aligned_upper_bound (varargs_and_saved_regs_size
4066 + get_frame_size (),
4067 STACK_BOUNDARY / BITS_PER_UNIT);
4069 /* Both these values are already aligned. */
4070 gcc_assert (multiple_p (crtl->outgoing_args_size,
4071 STACK_BOUNDARY / BITS_PER_UNIT));
4072 cfun->machine->frame.frame_size
4073 = (cfun->machine->frame.hard_fp_offset
4074 + crtl->outgoing_args_size);
4076 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4078 cfun->machine->frame.initial_adjust = 0;
4079 cfun->machine->frame.final_adjust = 0;
4080 cfun->machine->frame.callee_adjust = 0;
4081 cfun->machine->frame.callee_offset = 0;
4083 HOST_WIDE_INT max_push_offset = 0;
4084 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4085 max_push_offset = 512;
4086 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4087 max_push_offset = 256;
4089 HOST_WIDE_INT const_size, const_fp_offset;
4090 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4091 && const_size < max_push_offset
4092 && known_eq (crtl->outgoing_args_size, 0))
4094 /* Simple, small frame with no outgoing arguments:
4095 stp reg1, reg2, [sp, -frame_size]!
4096 stp reg3, reg4, [sp, 16] */
4097 cfun->machine->frame.callee_adjust = const_size;
4099 else if (known_lt (crtl->outgoing_args_size
4100 + cfun->machine->frame.saved_regs_size, 512)
4101 && !(cfun->calls_alloca
4102 && known_lt (cfun->machine->frame.hard_fp_offset,
4103 max_push_offset)))
4105 /* Frame with small outgoing arguments:
4106 sub sp, sp, frame_size
4107 stp reg1, reg2, [sp, outgoing_args_size]
4108 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4109 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4110 cfun->machine->frame.callee_offset
4111 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4113 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4114 && const_fp_offset < max_push_offset)
4116 /* Frame with large outgoing arguments but a small local area:
4117 stp reg1, reg2, [sp, -hard_fp_offset]!
4118 stp reg3, reg4, [sp, 16]
4119 sub sp, sp, outgoing_args_size */
4120 cfun->machine->frame.callee_adjust = const_fp_offset;
4121 cfun->machine->frame.final_adjust
4122 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4124 else
4126 /* Frame with large local area and outgoing arguments using frame pointer:
4127 sub sp, sp, hard_fp_offset
4128 stp x29, x30, [sp, 0]
4129 add x29, sp, 0
4130 stp reg3, reg4, [sp, 16]
4131 sub sp, sp, outgoing_args_size */
4132 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4133 cfun->machine->frame.final_adjust
4134 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4137 cfun->machine->frame.laid_out = true;
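/* A worked example of the layout above (illustrative only): a function
   that saves just x29 and x30, uses 24 bytes of locals and has no
   outgoing arguments gets saved_regs_size == 16, hard_fp_offset == 48
   (40 rounded up to the 16-byte stack boundary) and frame_size == 48.
   Since 48 < max_push_offset and there are no outgoing arguments, this
   falls into the first, "simple, small frame" case and the prologue can
   allocate and save with a single "stp x29, x30, [sp, -48]!".  */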
4140 /* Return true if the register REGNO is saved on entry to
4141 the current function. */
4143 static bool
4144 aarch64_register_saved_on_entry (int regno)
4146 return cfun->machine->frame.reg_offset[regno] >= 0;
4149 /* Return the next register, from REGNO up to LIMIT, that the callee
4150 needs to save. */
4152 static unsigned
4153 aarch64_next_callee_save (unsigned regno, unsigned limit)
4155 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4156 regno ++;
4157 return regno;
4160 /* Push the register number REGNO of mode MODE to the stack with write-back
4161 adjusting the stack by ADJUSTMENT. */
4163 static void
4164 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4165 HOST_WIDE_INT adjustment)
4167 rtx base_rtx = stack_pointer_rtx;
4168 rtx insn, reg, mem;
4170 reg = gen_rtx_REG (mode, regno);
4171 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4172 plus_constant (Pmode, base_rtx, -adjustment));
4173 mem = gen_frame_mem (mode, mem);
4175 insn = emit_move_insn (mem, reg);
4176 RTX_FRAME_RELATED_P (insn) = 1;
4179 /* Generate and return an instruction to store the pair of registers
4180 REG and REG2 of mode MODE to location BASE with write-back adjusting
4181 the stack location BASE by ADJUSTMENT. */
4183 static rtx
4184 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4185 HOST_WIDE_INT adjustment)
4187 switch (mode)
4189 case E_DImode:
4190 return gen_storewb_pairdi_di (base, base, reg, reg2,
4191 GEN_INT (-adjustment),
4192 GEN_INT (UNITS_PER_WORD - adjustment));
4193 case E_DFmode:
4194 return gen_storewb_pairdf_di (base, base, reg, reg2,
4195 GEN_INT (-adjustment),
4196 GEN_INT (UNITS_PER_WORD - adjustment));
4197 default:
4198 gcc_unreachable ();
4202 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4203 stack pointer by ADJUSTMENT. */
4205 static void
4206 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4208 rtx_insn *insn;
4209 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4211 if (regno2 == INVALID_REGNUM)
4212 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4214 rtx reg1 = gen_rtx_REG (mode, regno1);
4215 rtx reg2 = gen_rtx_REG (mode, regno2);
4217 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4218 reg2, adjustment));
4219 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4220 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4221 RTX_FRAME_RELATED_P (insn) = 1;
4224 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4225 adjusting it by ADJUSTMENT afterwards. */
4227 static rtx
4228 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4229 HOST_WIDE_INT adjustment)
4231 switch (mode)
4233 case E_DImode:
4234 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4235 GEN_INT (UNITS_PER_WORD));
4236 case E_DFmode:
4237 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4238 GEN_INT (UNITS_PER_WORD));
4239 default:
4240 gcc_unreachable ();
4244 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4245 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4246 into CFI_OPS. */
4248 static void
4249 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4250 rtx *cfi_ops)
4252 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4253 rtx reg1 = gen_rtx_REG (mode, regno1);
4255 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4257 if (regno2 == INVALID_REGNUM)
4259 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4260 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4261 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4263 else
4265 rtx reg2 = gen_rtx_REG (mode, regno2);
4266 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4267 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4268 reg2, adjustment));
4272 /* Generate and return a store pair instruction of mode MODE to store
4273 register REG1 to MEM1 and register REG2 to MEM2. */
4275 static rtx
4276 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4277 rtx reg2)
4279 switch (mode)
4281 case E_DImode:
4282 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4284 case E_DFmode:
4285 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4287 default:
4288 gcc_unreachable ();
4292 /* Generate and return a load pair instruction of mode MODE to load register
4293 REG1 from MEM1 and register REG2 from MEM2. */
4295 static rtx
4296 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4297 rtx mem2)
4299 switch (mode)
4301 case E_DImode:
4302 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4304 case E_DFmode:
4305 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4307 default:
4308 gcc_unreachable ();
4312 /* Return TRUE if return address signing should be enabled for the current
4313 function, otherwise return FALSE. */
4315 bool
4316 aarch64_return_address_signing_enabled (void)
4318 /* This function should only be called after the frame is laid out. */
4319 gcc_assert (cfun->machine->frame.laid_out);
4321 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4322 if its LR is pushed onto the stack. */
4323 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4324 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4325 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
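/* For instance (a sketch of the intended behaviour): with
   -msign-return-address=all every function is signed, whereas with
   -msign-return-address=non-leaf (AARCH64_FUNCTION_NON_LEAF) a function
   is only signed when its LR has been allocated a save slot, i.e.
   reg_offset[LR_REGNUM] >= 0.  */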
4328 /* Emit code to save the callee-saved registers from register number START
4329 to LIMIT to the stack at the location starting at offset START_OFFSET,
4330 skipping any write-back candidates if SKIP_WB is true. */
4332 static void
4333 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4334 unsigned start, unsigned limit, bool skip_wb)
4336 rtx_insn *insn;
4337 unsigned regno;
4338 unsigned regno2;
4340 for (regno = aarch64_next_callee_save (start, limit);
4341 regno <= limit;
4342 regno = aarch64_next_callee_save (regno + 1, limit))
4344 rtx reg, mem;
4345 poly_int64 offset;
4347 if (skip_wb
4348 && (regno == cfun->machine->frame.wb_candidate1
4349 || regno == cfun->machine->frame.wb_candidate2))
4350 continue;
4352 if (cfun->machine->reg_is_wrapped_separately[regno])
4353 continue;
4355 reg = gen_rtx_REG (mode, regno);
4356 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4357 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4358 offset));
4360 regno2 = aarch64_next_callee_save (regno + 1, limit);
4362 if (regno2 <= limit
4363 && !cfun->machine->reg_is_wrapped_separately[regno2]
4364 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4365 == cfun->machine->frame.reg_offset[regno2]))
4368 rtx reg2 = gen_rtx_REG (mode, regno2);
4369 rtx mem2;
4371 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4372 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4373 offset));
4374 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4375 reg2));
4377 /* The first part of a frame-related parallel insn is
4378 always assumed to be relevant to the frame
4379 calculations; subsequent parts are only
4380 frame-related if explicitly marked. */
4381 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4382 regno = regno2;
4384 else
4385 insn = emit_move_insn (mem, reg);
4387 RTX_FRAME_RELATED_P (insn) = 1;
4391 /* Emit code to restore the callee registers of mode MODE from register
4392 number START up to and including LIMIT. Restore from the stack offset
4393 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4394 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4396 static void
4397 aarch64_restore_callee_saves (machine_mode mode,
4398 poly_int64 start_offset, unsigned start,
4399 unsigned limit, bool skip_wb, rtx *cfi_ops)
4401 rtx base_rtx = stack_pointer_rtx;
4402 unsigned regno;
4403 unsigned regno2;
4404 poly_int64 offset;
4406 for (regno = aarch64_next_callee_save (start, limit);
4407 regno <= limit;
4408 regno = aarch64_next_callee_save (regno + 1, limit))
4410 if (cfun->machine->reg_is_wrapped_separately[regno])
4411 continue;
4413 rtx reg, mem;
4415 if (skip_wb
4416 && (regno == cfun->machine->frame.wb_candidate1
4417 || regno == cfun->machine->frame.wb_candidate2))
4418 continue;
4420 reg = gen_rtx_REG (mode, regno);
4421 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4422 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4424 regno2 = aarch64_next_callee_save (regno + 1, limit);
4426 if (regno2 <= limit
4427 && !cfun->machine->reg_is_wrapped_separately[regno2]
4428 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4429 == cfun->machine->frame.reg_offset[regno2]))
4431 rtx reg2 = gen_rtx_REG (mode, regno2);
4432 rtx mem2;
4434 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4435 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4436 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4438 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4439 regno = regno2;
4441 else
4442 emit_move_insn (reg, mem);
4443 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4447 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4448 of MODE. */
4450 static inline bool
4451 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4453 HOST_WIDE_INT multiple;
4454 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4455 && IN_RANGE (multiple, -8, 7));
4458 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4459 of MODE. */
4461 static inline bool
4462 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4464 HOST_WIDE_INT multiple;
4465 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4466 && IN_RANGE (multiple, 0, 63));
4469 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4470 of MODE. */
4472 bool
4473 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4475 HOST_WIDE_INT multiple;
4476 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4477 && IN_RANGE (multiple, -64, 63));
4480 /* Return true if OFFSET is a signed 9-bit value. */
4482 static inline bool
4483 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4484 poly_int64 offset)
4486 HOST_WIDE_INT const_offset;
4487 return (offset.is_constant (&const_offset)
4488 && IN_RANGE (const_offset, -256, 255));
4491 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4492 of MODE. */
4494 static inline bool
4495 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4497 HOST_WIDE_INT multiple;
4498 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4499 && IN_RANGE (multiple, -256, 255));
4502 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4503 of MODE. */
4505 static inline bool
4506 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4508 HOST_WIDE_INT multiple;
4509 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4510 && IN_RANGE (multiple, 0, 4095));
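/* For a DImode (8-byte) access, the predicates above correspond roughly to
   the following byte-offset ranges (illustrative summary):

     offset_4bit_signed_scaled_p           -64 ... 56, multiples of 8
     offset_6bit_unsigned_scaled_p           0 ... 504, multiples of 8
     aarch64_offset_7bit_signed_scaled_p  -512 ... 504, multiples of 8 (LDP/STP)
     offset_9bit_signed_unscaled_p        -256 ... 255, any byte offset (LDUR/STUR)
     offset_9bit_signed_scaled_p         -2048 ... 2040, multiples of 8
     offset_12bit_unsigned_scaled_p          0 ... 32760, multiples of 8 (LDR/STR)  */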
4513 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4515 static sbitmap
4516 aarch64_get_separate_components (void)
4518 aarch64_layout_frame ();
4520 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4521 bitmap_clear (components);
4523 /* The registers that need to be saved to the frame. */
4524 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4525 if (aarch64_register_saved_on_entry (regno))
4527 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4528 if (!frame_pointer_needed)
4529 offset += cfun->machine->frame.frame_size
4530 - cfun->machine->frame.hard_fp_offset;
4531 /* Check that we can access the stack slot of the register with one
4532 direct load with no adjustments needed. */
4533 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4534 bitmap_set_bit (components, regno);
4537 /* Don't mess with the hard frame pointer. */
4538 if (frame_pointer_needed)
4539 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4541 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4542 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4543 /* If aarch64_layout_frame has chosen registers to store/restore with
4544 writeback don't interfere with them to avoid having to output explicit
4545 stack adjustment instructions. */
4546 if (reg2 != INVALID_REGNUM)
4547 bitmap_clear_bit (components, reg2);
4548 if (reg1 != INVALID_REGNUM)
4549 bitmap_clear_bit (components, reg1);
4551 bitmap_clear_bit (components, LR_REGNUM);
4552 bitmap_clear_bit (components, SP_REGNUM);
4554 return components;
4557 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4559 static sbitmap
4560 aarch64_components_for_bb (basic_block bb)
4562 bitmap in = DF_LIVE_IN (bb);
4563 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4564 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4566 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4567 bitmap_clear (components);
4569 /* A register is used in a bb if it is in the IN, GEN, or KILL sets. */
4570 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4571 if ((!call_used_regs[regno])
4572 && (bitmap_bit_p (in, regno)
4573 || bitmap_bit_p (gen, regno)
4574 || bitmap_bit_p (kill, regno)))
4576 unsigned regno2, offset, offset2;
4577 bitmap_set_bit (components, regno);
4579 /* If there is a callee-save register at an adjacent offset, add it too,
4580 to increase the chance of using LDP/STP. */
4581 offset = cfun->machine->frame.reg_offset[regno];
4582 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4584 if (regno2 <= LAST_SAVED_REGNUM)
4586 offset2 = cfun->machine->frame.reg_offset[regno2];
4587 if ((offset & ~8) == (offset2 & ~8))
4588 bitmap_set_bit (components, regno2);
4592 return components;
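/* As an example of the pairing heuristic above (illustrative only): if
   x19 is live in the block and was given the slot at offset 16, then
   regno2 is x20; if x20's slot is at offset 24 the two offsets agree once
   bit 3 is masked out ((16 & ~8) == (24 & ~8)), so x20 is marked as well
   and the pair can later be saved/restored with a single stp/ldp.  */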
4595 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4596 Nothing to do for aarch64. */
4598 static void
4599 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4603 /* Return the next set bit in BMP from START onwards. Return the total number
4604 of bits in BMP if no set bit is found at or after START. */
4606 static unsigned int
4607 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4609 unsigned int nbits = SBITMAP_SIZE (bmp);
4610 if (start == nbits)
4611 return start;
4613 gcc_assert (start < nbits);
4614 for (unsigned int i = start; i < nbits; i++)
4615 if (bitmap_bit_p (bmp, i))
4616 return i;
4618 return nbits;
4621 /* Do the work for aarch64_emit_prologue_components and
4622 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4623 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4624 for these components or the epilogue sequence. That is, it determines
4625 whether we should emit stores or loads and what kind of CFA notes to attach
4626 to the insns. Otherwise the logic for the two sequences is very
4627 similar. */
4629 static void
4630 aarch64_process_components (sbitmap components, bool prologue_p)
4632 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4633 ? HARD_FRAME_POINTER_REGNUM
4634 : STACK_POINTER_REGNUM);
4636 unsigned last_regno = SBITMAP_SIZE (components);
4637 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4638 rtx_insn *insn = NULL;
4640 while (regno != last_regno)
4642 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4643 so DFmode for the vector registers is enough. */
4644 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4645 rtx reg = gen_rtx_REG (mode, regno);
4646 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4647 if (!frame_pointer_needed)
4648 offset += cfun->machine->frame.frame_size
4649 - cfun->machine->frame.hard_fp_offset;
4650 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4651 rtx mem = gen_frame_mem (mode, addr);
4653 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4654 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4655 /* No more registers to handle after REGNO.
4656 Emit a single save/restore and exit. */
4657 if (regno2 == last_regno)
4659 insn = emit_insn (set);
4660 RTX_FRAME_RELATED_P (insn) = 1;
4661 if (prologue_p)
4662 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4663 else
4664 add_reg_note (insn, REG_CFA_RESTORE, reg);
4665 break;
4668 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4669 /* The next register is not of the same class or its offset is not
4670 mergeable with the current one into a pair. */
4671 if (!satisfies_constraint_Ump (mem)
4672 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4673 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4674 GET_MODE_SIZE (mode)))
4676 insn = emit_insn (set);
4677 RTX_FRAME_RELATED_P (insn) = 1;
4678 if (prologue_p)
4679 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4680 else
4681 add_reg_note (insn, REG_CFA_RESTORE, reg);
4683 regno = regno2;
4684 continue;
4687 /* REGNO2 can be saved/restored in a pair with REGNO. */
4688 rtx reg2 = gen_rtx_REG (mode, regno2);
4689 if (!frame_pointer_needed)
4690 offset2 += cfun->machine->frame.frame_size
4691 - cfun->machine->frame.hard_fp_offset;
4692 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4693 rtx mem2 = gen_frame_mem (mode, addr2);
4694 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4695 : gen_rtx_SET (reg2, mem2);
4697 if (prologue_p)
4698 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4699 else
4700 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4702 RTX_FRAME_RELATED_P (insn) = 1;
4703 if (prologue_p)
4705 add_reg_note (insn, REG_CFA_OFFSET, set);
4706 add_reg_note (insn, REG_CFA_OFFSET, set2);
4708 else
4710 add_reg_note (insn, REG_CFA_RESTORE, reg);
4711 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4714 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4718 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4720 static void
4721 aarch64_emit_prologue_components (sbitmap components)
4723 aarch64_process_components (components, true);
4726 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4728 static void
4729 aarch64_emit_epilogue_components (sbitmap components)
4731 aarch64_process_components (components, false);
4734 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4736 static void
4737 aarch64_set_handled_components (sbitmap components)
4739 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4740 if (bitmap_bit_p (components, regno))
4741 cfun->machine->reg_is_wrapped_separately[regno] = true;
4744 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4745 is saved at BASE + OFFSET. */
4747 static void
4748 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4749 rtx base, poly_int64 offset)
4751 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4752 add_reg_note (insn, REG_CFA_EXPRESSION,
4753 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4756 /* AArch64 stack frames generated by this compiler look like:
4758 +-------------------------------+
4760 | incoming stack arguments |
4762 +-------------------------------+
4763 | | <-- incoming stack pointer (aligned)
4764 | callee-allocated save area |
4765 | for register varargs |
4767 +-------------------------------+
4768 | local variables | <-- frame_pointer_rtx
4770 +-------------------------------+
4771 | padding0 | \
4772 +-------------------------------+ |
4773 | callee-saved registers | | frame.saved_regs_size
4774 +-------------------------------+ |
4775 | LR' | |
4776 +-------------------------------+ |
4777 | FP' | / <- hard_frame_pointer_rtx (aligned)
4778 +-------------------------------+
4779 | dynamic allocation |
4780 +-------------------------------+
4781 | padding |
4782 +-------------------------------+
4783 | outgoing stack arguments | <-- arg_pointer
4785 +-------------------------------+
4786 | | <-- stack_pointer_rtx (aligned)
4788 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4789 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4790 unchanged. */
4792 /* Generate the prologue instructions for entry into a function.
4793 Establish the stack frame by decreasing the stack pointer with a
4794 properly calculated size and, if necessary, create a frame record
4795 filled with the values of LR and previous frame pointer. The
4796 current FP is also set up if it is in use. */
4798 void
4799 aarch64_expand_prologue (void)
4801 aarch64_layout_frame ();
4803 poly_int64 frame_size = cfun->machine->frame.frame_size;
4804 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4805 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4806 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4807 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4808 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4809 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4810 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4811 rtx_insn *insn;
4813 /* Sign return address for functions. */
4814 if (aarch64_return_address_signing_enabled ())
4816 insn = emit_insn (gen_pacisp ());
4817 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4818 RTX_FRAME_RELATED_P (insn) = 1;
4821 if (flag_stack_usage_info)
4822 current_function_static_stack_size = constant_lower_bound (frame_size);
4824 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4826 if (crtl->is_leaf && !cfun->calls_alloca)
4828 if (maybe_gt (frame_size, PROBE_INTERVAL)
4829 && maybe_gt (frame_size, get_stack_check_protect ()))
4830 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4831 (frame_size
4832 - get_stack_check_protect ()));
4834 else if (maybe_gt (frame_size, 0))
4835 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4838 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4839 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4841 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4843 if (callee_adjust != 0)
4844 aarch64_push_regs (reg1, reg2, callee_adjust);
4846 if (emit_frame_chain)
4848 poly_int64 reg_offset = callee_adjust;
4849 if (callee_adjust == 0)
4851 reg1 = R29_REGNUM;
4852 reg2 = R30_REGNUM;
4853 reg_offset = callee_offset;
4854 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4856 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4857 stack_pointer_rtx, callee_offset,
4858 ip1_rtx, ip0_rtx, frame_pointer_needed);
4859 if (frame_pointer_needed && !frame_size.is_constant ())
4861 /* Variable-sized frames need to describe the save slot
4862 address using DW_CFA_expression rather than DW_CFA_offset.
4863 This means that, without taking further action, the
4864 locations of the registers that we've already saved would
4865 remain based on the stack pointer even after we redefine
4866 the CFA based on the frame pointer. We therefore need new
4867 DW_CFA_expressions to re-express the save slots with addresses
4868 based on the frame pointer. */
4869 rtx_insn *insn = get_last_insn ();
4870 gcc_assert (RTX_FRAME_RELATED_P (insn));
4872 /* Add an explicit CFA definition if this was previously
4873 implicit. */
4874 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4876 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4877 callee_offset);
4878 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4879 gen_rtx_SET (hard_frame_pointer_rtx, src));
4882 /* Change the save slot expressions for the registers that
4883 we've already saved. */
4884 reg_offset -= callee_offset;
4885 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4886 reg_offset + UNITS_PER_WORD);
4887 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4888 reg_offset);
4890 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4893 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4894 callee_adjust != 0 || emit_frame_chain);
4895 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4896 callee_adjust != 0 || emit_frame_chain);
4897 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
4900 /* Return TRUE if we can use a simple_return insn.
4902 This function checks whether the callee-saved stack area is empty, which
4903 means no restore actions are needed. The pro_and_epilogue pass uses
4904 this to check whether shrink-wrapping is feasible. */
4906 bool
4907 aarch64_use_return_insn_p (void)
4909 if (!reload_completed)
4910 return false;
4912 if (crtl->profile)
4913 return false;
4915 aarch64_layout_frame ();
4917 return known_eq (cfun->machine->frame.frame_size, 0);
4920 /* Generate the epilogue instructions for returning from a function.
4921 This is almost exactly the reverse of the prologue sequence, except
4922 that we need to insert barriers to avoid scheduling loads that read
4923 from a deallocated stack, and we optimize the unwind records by
4924 emitting them all together if possible. */
4925 void
4926 aarch64_expand_epilogue (bool for_sibcall)
4928 aarch64_layout_frame ();
4930 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4931 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4932 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4933 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4934 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4935 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4936 rtx cfi_ops = NULL;
4937 rtx_insn *insn;
4938 /* A stack clash protection prologue may not have left IP0_REGNUM or
4939 IP1_REGNUM in a usable state. The same is true for allocations
4940 with an SVE component, since we then need both temporary registers
4941 for each allocation. */
4942 bool can_inherit_p = (initial_adjust.is_constant ()
4943 && final_adjust.is_constant ()
4944 && !flag_stack_clash_protection);
4946 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4947 bool need_barrier_p
4948 = maybe_ne (get_frame_size ()
4949 + cfun->machine->frame.saved_varargs_size, 0);
4951 /* Emit a barrier to prevent loads from a deallocated stack. */
4952 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
4953 || cfun->calls_alloca
4954 || crtl->calls_eh_return)
4956 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4957 need_barrier_p = false;
4960 /* Restore the stack pointer from the frame pointer if it may not
4961 be the same as the stack pointer. */
4962 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4963 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4964 if (frame_pointer_needed
4965 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
4966 /* If writeback is used when restoring callee-saves, the CFA
4967 is restored on the instruction doing the writeback. */
4968 aarch64_add_offset (Pmode, stack_pointer_rtx,
4969 hard_frame_pointer_rtx, -callee_offset,
4970 ip1_rtx, ip0_rtx, callee_adjust == 0);
4971 else
4972 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
4973 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
4975 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4976 callee_adjust != 0, &cfi_ops);
4977 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4978 callee_adjust != 0, &cfi_ops);
4980 if (need_barrier_p)
4981 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
4983 if (callee_adjust != 0)
4984 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
4986 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
4988 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
4989 insn = get_last_insn ();
4990 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
4991 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
4992 RTX_FRAME_RELATED_P (insn) = 1;
4993 cfi_ops = NULL;
4996 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
4997 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
4999 if (cfi_ops)
5001 /* Emit delayed restores and reset the CFA to be SP. */
5002 insn = get_last_insn ();
5003 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5004 REG_NOTES (insn) = cfi_ops;
5005 RTX_FRAME_RELATED_P (insn) = 1;
5008 /* We prefer to emit the combined return/authenticate instruction RETAA,
5009 however there are three cases in which we must instead emit an explicit
5010 authentication instruction.
5012 1) Sibcalls don't return in a normal way, so if we're about to call one
5013 we must authenticate.
5015 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5016 generating code for !TARGET_ARMV8_3 we can't use it and must
5017 explicitly authenticate.
5019 3) On an eh_return path we make extra stack adjustments to update the
5020 canonical frame address to be the exception handler's CFA. We want
5021 to authenticate using the CFA of the function which calls eh_return. */
5023 if (aarch64_return_address_signing_enabled ()
5024 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5026 insn = emit_insn (gen_autisp ());
5027 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5028 RTX_FRAME_RELATED_P (insn) = 1;
5031 /* Stack adjustment for exception handler. */
5032 if (crtl->calls_eh_return)
5034 /* We need to unwind the stack by the offset computed by
5035 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5036 to be SP; letting the CFA move during this adjustment
5037 is just as correct as retaining the CFA from the body
5038 of the function. Therefore, do nothing special. */
5039 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5042 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5043 if (!for_sibcall)
5044 emit_jump_insn (ret_rtx);
5047 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5048 normally or return to a previous frame after unwinding.
5050 An EH return uses a single shared return sequence. The epilogue is
5051 exactly like a normal epilogue except that it has an extra input
5052 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5053 that must be applied after the frame has been destroyed. An extra label
5054 is inserted before the epilogue which initializes this register to zero,
5055 and this is the entry point for a normal return.
5057 An actual EH return updates the return address, initializes the stack
5058 adjustment and jumps directly into the epilogue (bypassing the zeroing
5059 of the adjustment). Since the return address is typically saved on the
5060 stack when a function makes a call, the saved LR must be updated outside
5061 the epilogue.
5063 This poses problems as the store is generated well before the epilogue,
5064 so the offset of LR is not known yet. Also optimizations will remove the
5065 store as it appears dead, even after the epilogue is generated (as the
5066 base or offset for loading LR is different in many cases).
5068 To avoid these problems this implementation forces the frame pointer
5069 in eh_return functions so that the location of LR is fixed and known early.
5070 It also marks the store volatile, so no optimization is permitted to
5071 remove the store. */
5072 rtx
5073 aarch64_eh_return_handler_rtx (void)
5075 rtx tmp = gen_frame_mem (Pmode,
5076 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5078 /* Mark the store volatile, so no optimization is permitted to remove it. */
5079 MEM_VOLATILE_P (tmp) = true;
5080 return tmp;
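/* In other words (a sketch): because the frame pointer is forced for
   eh_return functions, the handler address is stored through a volatile
   MEM at [x29, 8], i.e. the LR slot of the frame record shown in the
   frame layout diagram above.  */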
5083 /* Output code to add DELTA to the first argument, and then jump
5084 to FUNCTION. Used for C++ multiple inheritance. */
5085 static void
5086 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5087 HOST_WIDE_INT delta,
5088 HOST_WIDE_INT vcall_offset,
5089 tree function)
5091 /* The this pointer is always in x0. Note that this differs from
5092 Arm where the this pointer may be bumped to r1 if r0 is required
5093 to return a pointer to an aggregate. On AArch64 a result value
5094 pointer will be in x8. */
5095 int this_regno = R0_REGNUM;
5096 rtx this_rtx, temp0, temp1, addr, funexp;
5097 rtx_insn *insn;
5099 reload_completed = 1;
5100 emit_note (NOTE_INSN_PROLOGUE_END);
5102 this_rtx = gen_rtx_REG (Pmode, this_regno);
5103 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5104 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5106 if (vcall_offset == 0)
5107 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5108 else
5110 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5112 addr = this_rtx;
5113 if (delta != 0)
5115 if (delta >= -256 && delta < 256)
5116 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5117 plus_constant (Pmode, this_rtx, delta));
5118 else
5119 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5120 temp1, temp0, false);
5123 if (Pmode == ptr_mode)
5124 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5125 else
5126 aarch64_emit_move (temp0,
5127 gen_rtx_ZERO_EXTEND (Pmode,
5128 gen_rtx_MEM (ptr_mode, addr)));
5130 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5131 addr = plus_constant (Pmode, temp0, vcall_offset);
5132 else
5134 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5135 Pmode);
5136 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5139 if (Pmode == ptr_mode)
5140 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5141 else
5142 aarch64_emit_move (temp1,
5143 gen_rtx_SIGN_EXTEND (Pmode,
5144 gen_rtx_MEM (ptr_mode, addr)));
5146 emit_insn (gen_add2_insn (this_rtx, temp1));
5149 /* Generate a tail call to the target function. */
5150 if (!TREE_USED (function))
5152 assemble_external (function);
5153 TREE_USED (function) = 1;
5155 funexp = XEXP (DECL_RTL (function), 0);
5156 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5157 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5158 SIBLING_CALL_P (insn) = 1;
5160 insn = get_insns ();
5161 shorten_branches (insn);
5162 final_start_function (insn, file, 1);
5163 final (insn, file, 1);
5164 final_end_function ();
5166 /* Stop pretending to be a post-reload pass. */
5167 reload_completed = 0;
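/* A typical thunk therefore assembles to something like (illustrative,
   for DELTA == 16, VCALL_OFFSET == 0 and a hypothetical target symbol
   _ZN1B1fEv):

	add	x0, x0, 16
	b	_ZN1B1fEv
*/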
5170 static bool
5171 aarch64_tls_referenced_p (rtx x)
5173 if (!TARGET_HAVE_TLS)
5174 return false;
5175 subrtx_iterator::array_type array;
5176 FOR_EACH_SUBRTX (iter, array, x, ALL)
5178 const_rtx x = *iter;
5179 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5180 return true;
5181 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5182 TLS offsets, not real symbol references. */
5183 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5184 iter.skip_subrtxes ();
5186 return false;
5190 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5191 a left shift of 0 or 12 bits. */
5192 bool
5193 aarch64_uimm12_shift (HOST_WIDE_INT val)
5195 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5196 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
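/* Examples (illustrative): 0xabc and 0xabc000 are accepted (the value
   fits entirely in the low or in the shifted 12-bit field), while
   0xabc001 and 0x1000000 are rejected because they span or lie outside
   both fields.  */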
5201 /* Return true if val is an immediate that can be loaded into a
5202 register by a MOVZ instruction. */
5203 static bool
5204 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5206 if (GET_MODE_SIZE (mode) > 4)
5208 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5209 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5210 return 1;
5212 else
5214 /* Ignore sign extension. */
5215 val &= (HOST_WIDE_INT) 0xffffffff;
5217 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5218 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
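/* Examples (illustrative): in DImode, 0xffff, 0xffff0000, 0xffff00000000
   and 0xffff000000000000 are all accepted (a single 16-bit chunk at an
   even 16-bit boundary), while 0x10001 is not.  In SImode the sign
   extension is masked off first, so 0xffffffffffff0000 reduces to
   0xffff0000 and is accepted.  */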
5221 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5222 64-bit (DImode) integer. */
5224 static unsigned HOST_WIDE_INT
5225 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5227 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5228 while (size < 64)
5230 val &= (HOST_WIDE_INT_1U << size) - 1;
5231 val |= val << size;
5232 size *= 2;
5234 return val;
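/* For example (illustrative): with a 16-bit inner mode and VAL == 0xf0
   the loop produces 0x00f000f000f000f0, and with a 32-bit inner mode and
   VAL == 0x80000001 it produces 0x8000000180000001.  */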
5237 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5239 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5241 0x0000000100000001ull,
5242 0x0001000100010001ull,
5243 0x0101010101010101ull,
5244 0x1111111111111111ull,
5245 0x5555555555555555ull,
5249 /* Return true if val is a valid bitmask immediate. */
5251 bool
5252 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5254 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5255 int bits;
5257 /* Check for a single sequence of one bits and return quickly if so.
5258 The special cases of all ones and all zeroes return false. */
5259 val = aarch64_replicate_bitmask_imm (val_in, mode);
5260 tmp = val + (val & -val);
5262 if (tmp == (tmp & -tmp))
5263 return (val + 1) > 1;
5265 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5266 if (mode == SImode)
5267 val = (val << 32) | (val & 0xffffffff);
5269 /* Invert if the immediate doesn't start with a zero bit - this means we
5270 only need to search for sequences of one bits. */
5271 if (val & 1)
5272 val = ~val;
5274 /* Find the first set bit and set tmp to val with the first sequence of one
5275 bits removed. Return success if there is a single sequence of ones. */
5276 first_one = val & -val;
5277 tmp = val & (val + first_one);
5279 if (tmp == 0)
5280 return true;
5282 /* Find the next set bit and compute the difference in bit position. */
5283 next_one = tmp & -tmp;
5284 bits = clz_hwi (first_one) - clz_hwi (next_one);
5285 mask = val ^ tmp;
5287 /* Check the bit position difference is a power of 2, and that the first
5288 sequence of one bits fits within 'bits' bits. */
5289 if ((mask >> bits) != 0 || bits != (bits & -bits))
5290 return false;
5292 /* Check the sequence of one bits is repeated 64/bits times. */
5293 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
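/* Examples (illustrative): 0x0003fffc (a single run of ones) and
   0x00ff00ff00ff00ff (8 ones repeating every 16 bits) are valid bitmask
   immediates, while 0, ~0 and 0x12345678 are not.  Tracing the general
   path for 0x0f0f0f0f0f0f0f0f: the value is inverted (it starts with a
   one bit), the first run of ones is stripped, the distance between runs
   gives bits == 8, and the final multiply check confirms that the 8-bit
   element repeats across all 64 bits, so the value is accepted.  */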
5296 /* Create a mask of ones covering the lowest to the highest bit set in VAL_IN.
5297 Assumed precondition: VAL_IN is not zero. */
5299 unsigned HOST_WIDE_INT
5300 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5302 int lowest_bit_set = ctz_hwi (val_in);
5303 int highest_bit_set = floor_log2 (val_in);
5304 gcc_assert (val_in != 0);
5306 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5307 (HOST_WIDE_INT_1U << lowest_bit_set));
5310 /* Create a constant in which all bits outside the range from the lowest set
5311 bit to the highest set bit of VAL_IN are set to 1. */
5313 unsigned HOST_WIDE_INT
5314 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5316 return val_in | ~aarch64_and_split_imm1 (val_in);
5319 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
5321 bool
5322 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5324 scalar_int_mode int_mode;
5325 if (!is_a <scalar_int_mode> (mode, &int_mode))
5326 return false;
5328 if (aarch64_bitmask_imm (val_in, int_mode))
5329 return false;
5331 if (aarch64_move_imm (val_in, int_mode))
5332 return false;
5334 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5336 return aarch64_bitmask_imm (imm2, int_mode);
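/* A worked example (illustrative): VAL_IN == 0x00000000ff0000ff is
   neither a bitmask immediate nor a MOV immediate.  Here
   aarch64_and_split_imm1 returns 0x00000000ffffffff (ones from the
   lowest to the highest set bit) and aarch64_and_split_imm2 returns
   0xffffffffff0000ff; both are valid bitmask immediates and their
   bitwise AND equals VAL_IN, so the original AND can be split into two
   AND-immediate instructions.  */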
5339 /* Return true if val is an immediate that can be loaded into a
5340 register in a single instruction. */
5341 bool
5342 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5344 scalar_int_mode int_mode;
5345 if (!is_a <scalar_int_mode> (mode, &int_mode))
5346 return false;
5348 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5349 return 1;
5350 return aarch64_bitmask_imm (val, int_mode);
5353 static bool
5354 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5356 rtx base, offset;
5358 if (GET_CODE (x) == HIGH)
5359 return true;
5361 /* There's no way to calculate VL-based values using relocations. */
5362 subrtx_iterator::array_type array;
5363 FOR_EACH_SUBRTX (iter, array, x, ALL)
5364 if (GET_CODE (*iter) == CONST_POLY_INT)
5365 return true;
5367 split_const (x, &base, &offset);
5368 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5370 if (aarch64_classify_symbol (base, INTVAL (offset))
5371 != SYMBOL_FORCE_TO_MEM)
5372 return true;
5373 else
5374 /* Avoid generating a 64-bit relocation in ILP32; leave it
5375 to aarch64_expand_mov_immediate to handle it properly. */
5376 return mode != ptr_mode;
5379 return aarch64_tls_referenced_p (x);
5382 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5383 The expansion for a table switch is quite expensive due to the number
5384 of instructions, the table lookup and the hard-to-predict indirect jump.
5385 When optimizing for speed with -O3 enabled, use the per-core tuning if
5386 set, otherwise use tables for > 16 cases as a tradeoff between size and
5387 performance. When optimizing for size, use the default setting. */
5389 static unsigned int
5390 aarch64_case_values_threshold (void)
5392 /* Use the specified limit for the number of cases before using jump
5393 tables at higher optimization levels. */
5394 if (optimize > 2
5395 && selected_cpu->tune->max_case_values != 0)
5396 return selected_cpu->tune->max_case_values;
5397 else
5398 return optimize_size ? default_case_values_threshold () : 17;
5401 /* Return true if register REGNO is a valid index register.
5402 STRICT_P is true if REG_OK_STRICT is in effect. */
5404 bool
5405 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5407 if (!HARD_REGISTER_NUM_P (regno))
5409 if (!strict_p)
5410 return true;
5412 if (!reg_renumber)
5413 return false;
5415 regno = reg_renumber[regno];
5417 return GP_REGNUM_P (regno);
5420 /* Return true if register REGNO is a valid base register.
5421 STRICT_P is true if REG_OK_STRICT is in effect. */
5423 bool
5424 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5426 if (!HARD_REGISTER_NUM_P (regno))
5428 if (!strict_p)
5429 return true;
5431 if (!reg_renumber)
5432 return false;
5434 regno = reg_renumber[regno];
5437 /* The fake registers will be eliminated to either the stack or
5438 hard frame pointer, both of which are usually valid base registers.
5439 Reload deals with the cases where the eliminated form isn't valid. */
5440 return (GP_REGNUM_P (regno)
5441 || regno == SP_REGNUM
5442 || regno == FRAME_POINTER_REGNUM
5443 || regno == ARG_POINTER_REGNUM);
5446 /* Return true if X is a valid base register.
5447 STRICT_P is true if REG_OK_STRICT is in effect. */
5449 static bool
5450 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5452 if (!strict_p
5453 && GET_CODE (x) == SUBREG
5454 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5455 x = SUBREG_REG (x);
5457 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5460 /* Return true if address offset is a valid index. If it is, fill in INFO
5461 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5463 static bool
5464 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5465 machine_mode mode, bool strict_p)
5467 enum aarch64_address_type type;
5468 rtx index;
5469 int shift;
5471 /* (reg:P) */
5472 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5473 && GET_MODE (x) == Pmode)
5475 type = ADDRESS_REG_REG;
5476 index = x;
5477 shift = 0;
5479 /* (sign_extend:DI (reg:SI)) */
5480 else if ((GET_CODE (x) == SIGN_EXTEND
5481 || GET_CODE (x) == ZERO_EXTEND)
5482 && GET_MODE (x) == DImode
5483 && GET_MODE (XEXP (x, 0)) == SImode)
5485 type = (GET_CODE (x) == SIGN_EXTEND)
5486 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5487 index = XEXP (x, 0);
5488 shift = 0;
5490 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5491 else if (GET_CODE (x) == MULT
5492 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5493 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5494 && GET_MODE (XEXP (x, 0)) == DImode
5495 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5496 && CONST_INT_P (XEXP (x, 1)))
5498 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5499 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5500 index = XEXP (XEXP (x, 0), 0);
5501 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5503 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5504 else if (GET_CODE (x) == ASHIFT
5505 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5506 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5507 && GET_MODE (XEXP (x, 0)) == DImode
5508 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5509 && CONST_INT_P (XEXP (x, 1)))
5511 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5512 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5513 index = XEXP (XEXP (x, 0), 0);
5514 shift = INTVAL (XEXP (x, 1));
5516 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5517 else if ((GET_CODE (x) == SIGN_EXTRACT
5518 || GET_CODE (x) == ZERO_EXTRACT)
5519 && GET_MODE (x) == DImode
5520 && GET_CODE (XEXP (x, 0)) == MULT
5521 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5522 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5524 type = (GET_CODE (x) == SIGN_EXTRACT)
5525 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5526 index = XEXP (XEXP (x, 0), 0);
5527 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5528 if (INTVAL (XEXP (x, 1)) != 32 + shift
5529 || INTVAL (XEXP (x, 2)) != 0)
5530 shift = -1;
5532 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5533 (const_int 0xffffffff<<shift)) */
5534 else if (GET_CODE (x) == AND
5535 && GET_MODE (x) == DImode
5536 && GET_CODE (XEXP (x, 0)) == MULT
5537 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5538 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5539 && CONST_INT_P (XEXP (x, 1)))
5541 type = ADDRESS_REG_UXTW;
5542 index = XEXP (XEXP (x, 0), 0);
5543 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5544 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5545 shift = -1;
5547 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5548 else if ((GET_CODE (x) == SIGN_EXTRACT
5549 || GET_CODE (x) == ZERO_EXTRACT)
5550 && GET_MODE (x) == DImode
5551 && GET_CODE (XEXP (x, 0)) == ASHIFT
5552 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5553 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5555 type = (GET_CODE (x) == SIGN_EXTRACT)
5556 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5557 index = XEXP (XEXP (x, 0), 0);
5558 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5559 if (INTVAL (XEXP (x, 1)) != 32 + shift
5560 || INTVAL (XEXP (x, 2)) != 0)
5561 shift = -1;
5563 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5564 (const_int 0xffffffff<<shift)) */
5565 else if (GET_CODE (x) == AND
5566 && GET_MODE (x) == DImode
5567 && GET_CODE (XEXP (x, 0)) == ASHIFT
5568 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5569 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5570 && CONST_INT_P (XEXP (x, 1)))
5572 type = ADDRESS_REG_UXTW;
5573 index = XEXP (XEXP (x, 0), 0);
5574 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5575 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5576 shift = -1;
5578 /* (mult:P (reg:P) (const_int scale)) */
5579 else if (GET_CODE (x) == MULT
5580 && GET_MODE (x) == Pmode
5581 && GET_MODE (XEXP (x, 0)) == Pmode
5582 && CONST_INT_P (XEXP (x, 1)))
5584 type = ADDRESS_REG_REG;
5585 index = XEXP (x, 0);
5586 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5588 /* (ashift:P (reg:P) (const_int shift)) */
5589 else if (GET_CODE (x) == ASHIFT
5590 && GET_MODE (x) == Pmode
5591 && GET_MODE (XEXP (x, 0)) == Pmode
5592 && CONST_INT_P (XEXP (x, 1)))
5594 type = ADDRESS_REG_REG;
5595 index = XEXP (x, 0);
5596 shift = INTVAL (XEXP (x, 1));
5598 else
5599 return false;
5601 if (!strict_p
5602 && GET_CODE (index) == SUBREG
5603 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5604 index = SUBREG_REG (index);
5606 if (aarch64_sve_data_mode_p (mode))
5608 if (type != ADDRESS_REG_REG
5609 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5610 return false;
5612 else
5614 if (shift != 0
5615 && !(IN_RANGE (shift, 1, 3)
5616 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5617 return false;
5620 if (REG_P (index)
5621 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5623 info->type = type;
5624 info->offset = index;
5625 info->shift = shift;
5626 return true;
5629 return false;
5632 /* Return true if MODE is one of the modes for which we
5633 support LDP/STP operations. */
5635 static bool
5636 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5638 return mode == SImode || mode == DImode
5639 || mode == SFmode || mode == DFmode
5640 || (aarch64_vector_mode_supported_p (mode)
5641 && known_eq (GET_MODE_SIZE (mode), 8));
5644 /* Return true if REGNO is a virtual pointer register, or an eliminable
5645 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5646 include stack_pointer or hard_frame_pointer. */
5647 static bool
5648 virt_or_elim_regno_p (unsigned regno)
5650 return ((regno >= FIRST_VIRTUAL_REGISTER
5651 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5652 || regno == FRAME_POINTER_REGNUM
5653 || regno == ARG_POINTER_REGNUM);
5656 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5657 If it is, fill in INFO appropriately. STRICT_P is true if
5658 REG_OK_STRICT is in effect. */
5660 static bool
5661 aarch64_classify_address (struct aarch64_address_info *info,
5662 rtx x, machine_mode mode, bool strict_p,
5663 aarch64_addr_query_type type = ADDR_QUERY_M)
5665 enum rtx_code code = GET_CODE (x);
5666 rtx op0, op1;
5667 poly_int64 offset;
5669 HOST_WIDE_INT const_size;
5671 /* On BE, we use load/store pair for all large int mode load/stores.
5672 TI/TFmode may also use a load/store pair. */
5673 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5674 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5675 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5676 || mode == TImode
5677 || mode == TFmode
5678 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5680 bool allow_reg_index_p = (!load_store_pair_p
5681 && (known_lt (GET_MODE_SIZE (mode), 16)
5682 || vec_flags == VEC_ADVSIMD
5683 || vec_flags == VEC_SVE_DATA));
5685 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5686 [Rn, #offset, MUL VL]. */
5687 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5688 && (code != REG && code != PLUS))
5689 return false;
5691 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5692 REG addressing. */
5693 if (advsimd_struct_p
5694 && !BYTES_BIG_ENDIAN
5695 && (code != POST_INC && code != REG))
5696 return false;
5698 gcc_checking_assert (GET_MODE (x) == VOIDmode
5699 || SCALAR_INT_MODE_P (GET_MODE (x)));
5701 switch (code)
5703 case REG:
5704 case SUBREG:
5705 info->type = ADDRESS_REG_IMM;
5706 info->base = x;
5707 info->offset = const0_rtx;
5708 info->const_offset = 0;
5709 return aarch64_base_register_rtx_p (x, strict_p);
5711 case PLUS:
5712 op0 = XEXP (x, 0);
5713 op1 = XEXP (x, 1);
5715 if (! strict_p
5716 && REG_P (op0)
5717 && virt_or_elim_regno_p (REGNO (op0))
5718 && poly_int_rtx_p (op1, &offset))
5720 info->type = ADDRESS_REG_IMM;
5721 info->base = op0;
5722 info->offset = op1;
5723 info->const_offset = offset;
5725 return true;
5728 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5729 && aarch64_base_register_rtx_p (op0, strict_p)
5730 && poly_int_rtx_p (op1, &offset))
5732 info->type = ADDRESS_REG_IMM;
5733 info->base = op0;
5734 info->offset = op1;
5735 info->const_offset = offset;
5737 /* TImode and TFmode values are allowed in both pairs of X
5738 registers and individual Q registers. The available
5739 address modes are:
5740 X,X: 7-bit signed scaled offset
5741 Q: 9-bit signed offset
5742 We conservatively require an offset representable in either mode.
5743 When performing the check for pairs of X registers i.e. LDP/STP
5744 pass down DImode since that is the natural size of the LDP/STP
5745 instruction memory accesses. */
5746 if (mode == TImode || mode == TFmode)
5747 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5748 && (offset_9bit_signed_unscaled_p (mode, offset)
5749 || offset_12bit_unsigned_scaled_p (mode, offset)));
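/* Illustrative summary, assuming the usual AArch64 immediate encodings:
   the combined requirement above accepts a multiple of 8 in [-512, 504]
   (the 7-bit signed scaled LDP/STP range for DImode) that is also either
   in [-256, 255] (the 9-bit signed unscaled range) or a multiple of 16 in
   [0, 65520] (the 12-bit unsigned scaled range for a Q register).  For
   example, offset 256 is accepted, while offset 260 is not, since it is
   not a multiple of 8.  */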
5751 /* A 7-bit offset check because OImode will emit an ldp/stp
5752 instruction (only big endian will get here).
5753 For ldp/stp instructions, the offset is scaled for the size of a
5754 single element of the pair. */
5755 if (mode == OImode)
5756 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5758 /* Three 9/12-bit offset checks because CImode will emit three
5759 ldr/str instructions (only big endian will get here). */
5760 if (mode == CImode)
5761 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5762 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
5763 || offset_12bit_unsigned_scaled_p (V16QImode,
5764 offset + 32)));
5766 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5767 instructions (only big endian will get here). */
5768 if (mode == XImode)
5769 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5770 && aarch64_offset_7bit_signed_scaled_p (TImode,
5771 offset + 32));
5773 /* Make "m" use the LD1 offset range for SVE data modes, so
5774 that pre-RTL optimizers like ivopts will work to that range
5775 instead of the wider LDR/STR range. */
5776 if (vec_flags == VEC_SVE_DATA)
5777 return (type == ADDR_QUERY_M
5778 ? offset_4bit_signed_scaled_p (mode, offset)
5779 : offset_9bit_signed_scaled_p (mode, offset));
5781 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5783 poly_int64 end_offset = (offset
5784 + GET_MODE_SIZE (mode)
5785 - BYTES_PER_SVE_VECTOR);
5786 return (type == ADDR_QUERY_M
5787 ? offset_4bit_signed_scaled_p (mode, offset)
5788 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5789 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5790 end_offset)));
5793 if (vec_flags == VEC_SVE_PRED)
5794 return offset_9bit_signed_scaled_p (mode, offset);
5796 if (load_store_pair_p)
5797 return ((known_eq (GET_MODE_SIZE (mode), 4)
5798 || known_eq (GET_MODE_SIZE (mode), 8))
5799 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5800 else
5801 return (offset_9bit_signed_unscaled_p (mode, offset)
5802 || offset_12bit_unsigned_scaled_p (mode, offset));
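/* Illustrative summary, assuming the usual LDP/STP and LDR/STR encodings:
   the pair case above accepts multiples of 4 in [-256, 252] for 4-byte
   elements and multiples of 8 in [-512, 504] for 8-byte elements, while
   the single-access case accepts any offset in [-256, 255] or a multiple
   of the access size within the scaled 12-bit range.  */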
5805 if (allow_reg_index_p)
5807 /* Look for base + (scaled/extended) index register. */
5808 if (aarch64_base_register_rtx_p (op0, strict_p)
5809 && aarch64_classify_index (info, op1, mode, strict_p))
5811 info->base = op0;
5812 return true;
5814 if (aarch64_base_register_rtx_p (op1, strict_p)
5815 && aarch64_classify_index (info, op0, mode, strict_p))
5817 info->base = op1;
5818 return true;
5822 return false;
5824 case POST_INC:
5825 case POST_DEC:
5826 case PRE_INC:
5827 case PRE_DEC:
5828 info->type = ADDRESS_REG_WB;
5829 info->base = XEXP (x, 0);
5830 info->offset = NULL_RTX;
5831 return aarch64_base_register_rtx_p (info->base, strict_p);
5833 case POST_MODIFY:
5834 case PRE_MODIFY:
5835 info->type = ADDRESS_REG_WB;
5836 info->base = XEXP (x, 0);
5837 if (GET_CODE (XEXP (x, 1)) == PLUS
5838 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5839 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5840 && aarch64_base_register_rtx_p (info->base, strict_p))
5842 info->offset = XEXP (XEXP (x, 1), 1);
5843 info->const_offset = offset;
5845 /* TImode and TFmode values are allowed in both pairs of X
5846 registers and individual Q registers. The available
5847 address modes are:
5848 X,X: 7-bit signed scaled offset
5849 Q: 9-bit signed offset
5850 We conservatively require an offset representable in either mode. */
5852 if (mode == TImode || mode == TFmode)
5853 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5854 && offset_9bit_signed_unscaled_p (mode, offset));
5856 if (load_store_pair_p)
5857 return ((known_eq (GET_MODE_SIZE (mode), 4)
5858 || known_eq (GET_MODE_SIZE (mode), 8))
5859 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5860 else
5861 return offset_9bit_signed_unscaled_p (mode, offset);
5863 return false;
5865 case CONST:
5866 case SYMBOL_REF:
5867 case LABEL_REF:
5868 /* load literal: pc-relative constant pool entry. Only supported
5869 for SI mode or larger. */
5870 info->type = ADDRESS_SYMBOLIC;
5872 if (!load_store_pair_p
5873 && GET_MODE_SIZE (mode).is_constant (&const_size)
5874 && const_size >= 4)
5876 rtx sym, addend;
5878 split_const (x, &sym, &addend);
5879 return ((GET_CODE (sym) == LABEL_REF
5880 || (GET_CODE (sym) == SYMBOL_REF
5881 && CONSTANT_POOL_ADDRESS_P (sym)
5882 && aarch64_pcrelative_literal_loads)));
5884 return false;
5886 case LO_SUM:
5887 info->type = ADDRESS_LO_SUM;
5888 info->base = XEXP (x, 0);
5889 info->offset = XEXP (x, 1);
5890 if (allow_reg_index_p
5891 && aarch64_base_register_rtx_p (info->base, strict_p))
5893 rtx sym, offs;
5894 split_const (info->offset, &sym, &offs);
5895 if (GET_CODE (sym) == SYMBOL_REF
5896 && (aarch64_classify_symbol (sym, INTVAL (offs))
5897 == SYMBOL_SMALL_ABSOLUTE))
5899 /* The symbol and offset must be aligned to the access size. */
5900 unsigned int align;
5902 if (CONSTANT_POOL_ADDRESS_P (sym))
5903 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5904 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5906 tree exp = SYMBOL_REF_DECL (sym);
5907 align = TYPE_ALIGN (TREE_TYPE (exp));
5908 align = aarch64_constant_alignment (exp, align);
5910 else if (SYMBOL_REF_DECL (sym))
5911 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5912 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5913 && SYMBOL_REF_BLOCK (sym) != NULL)
5914 align = SYMBOL_REF_BLOCK (sym)->alignment;
5915 else
5916 align = BITS_PER_UNIT;
5918 poly_int64 ref_size = GET_MODE_SIZE (mode);
5919 if (known_eq (ref_size, 0))
5920 ref_size = GET_MODE_SIZE (DImode);
5922 return (multiple_p (INTVAL (offs), ref_size)
5923 && multiple_p (align / BITS_PER_UNIT, ref_size));
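/* Illustrative example: for a DFmode reference, the offset folded into the
   symbol must be a multiple of 8 and the symbol itself at least 8-byte
   aligned, which (assuming the usual :lo12: scaled-immediate encoding)
   keeps the resulting LDR/STR address within the scaled form.  */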
5926 return false;
5928 default:
5929 return false;
5933 /* Return true if the address X is valid for a PRFM instruction.
5934 STRICT_P is true if we should do strict checking with
5935 aarch64_classify_address. */
5937 bool
5938 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
5940 struct aarch64_address_info addr;
5942 /* PRFM accepts the same addresses as DImode... */
5943 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
5944 if (!res)
5945 return false;
5947 /* ... except writeback forms. */
5948 return addr.type != ADDRESS_REG_WB;
5951 bool
5952 aarch64_symbolic_address_p (rtx x)
5954 rtx offset;
5956 split_const (x, &x, &offset);
5957 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
5960 /* Classify the base of symbolic expression X. */
5962 enum aarch64_symbol_type
5963 aarch64_classify_symbolic_expression (rtx x)
5965 rtx offset;
5967 split_const (x, &x, &offset);
5968 return aarch64_classify_symbol (x, INTVAL (offset));
5972 /* Return TRUE if X is a legitimate address for accessing memory in
5973 mode MODE. */
5974 static bool
5975 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
5977 struct aarch64_address_info addr;
5979 return aarch64_classify_address (&addr, x, mode, strict_p);
5982 /* Return TRUE if X is a legitimate address of type TYPE for accessing
5983 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
5984 bool
5985 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
5986 aarch64_addr_query_type type)
5988 struct aarch64_address_info addr;
5990 return aarch64_classify_address (&addr, x, mode, strict_p, type);
5993 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
5995 static bool
5996 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
5997 poly_int64 orig_offset,
5998 machine_mode mode)
6000 HOST_WIDE_INT size;
6001 if (GET_MODE_SIZE (mode).is_constant (&size))
6003 HOST_WIDE_INT const_offset, second_offset;
6005 /* A general SVE offset is A * VQ + B. Remove the A component from
6006 coefficient 0 in order to get the constant B. */
6007 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6009 /* Split an out-of-range address displacement into a base and
6010 offset. Use 4KB range for 1- and 2-byte accesses and a 16KB
6011 range otherwise to increase opportunities for sharing the base
6012 address of different sizes. Unaligned accesses use the signed
6013 9-bit range, TImode/TFmode use the intersection of signed
6014 scaled 7-bit and signed 9-bit offset. */
6015 if (mode == TImode || mode == TFmode)
6016 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6017 else if ((const_offset & (size - 1)) != 0)
6018 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6019 else
6020 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6022 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6023 return false;
6025 /* Split the offset into second_offset and the rest. */
6026 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6027 *offset2 = gen_int_mode (second_offset, Pmode);
6028 return true;
6030 else
6032 /* Get the mode we should use as the basis of the range. For structure
6033 modes this is the mode of one vector. */
6034 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6035 machine_mode step_mode
6036 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6038 /* Get the "mul vl" multiplier we'd like to use. */
6039 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6040 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6041 if (vec_flags & VEC_SVE_DATA)
6042 /* LDR supports a 9-bit range, but the move patterns for
6043 structure modes require all vectors to be in range of the
6044 same base. The simplest way of accommodating that while still
6045 promoting reuse of anchor points between different modes is
6046 to use an 8-bit range unconditionally. */
6047 vnum = ((vnum + 128) & 255) - 128;
6048 else
6049 /* Predicates are only handled singly, so we might as well use
6050 the full range. */
6051 vnum = ((vnum + 256) & 511) - 256;
6052 if (vnum == 0)
6053 return false;
6055 /* Convert the "mul vl" multiplier into a byte offset. */
6056 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6057 if (known_eq (second_offset, orig_offset))
6058 return false;
6060 /* Split the offset into second_offset and the rest. */
6061 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6062 *offset2 = gen_int_mode (second_offset, Pmode);
6063 return true;
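/* Illustrative worked example of the constant-size path above, assuming
   DImode: for orig_offset = 0x10010 the offset is 8-byte aligned, so
   second_offset = 0x10010 & 0x3ffc = 0x10, giving *offset1 = 0x10000 and
   *offset2 = 0x10; the anchor 0x10000 can then be shared with neighbouring
   accesses whose residual offsets also fit the scaled 12-bit range.  */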
6067 /* Return the binary representation of floating point constant VALUE in INTVAL.
6068 If the value cannot be converted, return false without setting INTVAL.
6069 The conversion is done in the given MODE. */
6070 bool
6071 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6074 /* We make a general exception for 0. */
6075 if (aarch64_float_const_zero_rtx_p (value))
6077 *intval = 0;
6078 return true;
6081 scalar_float_mode mode;
6082 if (GET_CODE (value) != CONST_DOUBLE
6083 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6084 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6085 /* Only support up to DF mode. */
6086 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6087 return false;
6089 unsigned HOST_WIDE_INT ival = 0;
6091 long res[2];
6092 real_to_target (res,
6093 CONST_DOUBLE_REAL_VALUE (value),
6094 REAL_MODE_FORMAT (mode));
6096 if (mode == DFmode)
6098 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6099 ival = zext_hwi (res[order], 32);
6100 ival |= (zext_hwi (res[1 - order], 32) << 32);
6102 else
6103 ival = zext_hwi (res[0], 32);
6105 *intval = ival;
6106 return true;
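/* Illustrative example: the DFmode constant 1.0 is returned as
   0x3ff0000000000000 and the SFmode constant 1.0 as 0x3f800000 (their
   IEEE-754 encodings).  A minimal stand-alone sketch of the DFmode case,
   kept under #if 0 as an illustration only:  */
#if 0
#include <stdio.h>
#include <string.h>
int
main (void)
{
  double d = 1.0;
  unsigned long long bits;
  memcpy (&bits, &d, sizeof bits);	/* Reinterpret the bits, don't convert.  */
  printf ("%llx\n", bits);		/* Prints 3ff0000000000000.  */
  return 0;
}
#endif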
6109 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6110 single MOV(+MOVK) followed by an FMOV. */
6111 bool
6112 aarch64_float_const_rtx_p (rtx x)
6114 machine_mode mode = GET_MODE (x);
6115 if (mode == VOIDmode)
6116 return false;
6118 /* Determine whether it's cheaper to write float constants as
6119 mov/movk pairs over ldr/adrp pairs. */
6120 unsigned HOST_WIDE_INT ival;
6122 if (GET_CODE (x) == CONST_DOUBLE
6123 && SCALAR_FLOAT_MODE_P (mode)
6124 && aarch64_reinterpret_float_as_int (x, &ival))
6126 scalar_int_mode imode = (mode == HFmode
6127 ? SImode
6128 : int_mode_for_mode (mode).require ());
6129 int num_instr = aarch64_internal_mov_immediate
6130 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6131 return num_instr < 3;
6134 return false;
6137 /* Return TRUE if rtx X is the immediate constant 0.0. */
6138 bool
6139 aarch64_float_const_zero_rtx_p (rtx x)
6141 if (GET_MODE (x) == VOIDmode)
6142 return false;
6144 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6145 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6146 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6149 /* Return TRUE if rtx X is an immediate constant that fits in a single
6150 MOVI immediate operation. */
6151 bool
6152 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6154 if (!TARGET_SIMD)
6155 return false;
6157 machine_mode vmode;
6158 scalar_int_mode imode;
6159 unsigned HOST_WIDE_INT ival;
6161 if (GET_CODE (x) == CONST_DOUBLE
6162 && SCALAR_FLOAT_MODE_P (mode))
6164 if (!aarch64_reinterpret_float_as_int (x, &ival))
6165 return false;
6167 /* We make a general exception for 0. */
6168 if (aarch64_float_const_zero_rtx_p (x))
6169 return true;
6171 imode = int_mode_for_mode (mode).require ();
6173 else if (GET_CODE (x) == CONST_INT
6174 && is_a <scalar_int_mode> (mode, &imode))
6175 ival = INTVAL (x);
6176 else
6177 return false;
6179 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
6180 a 128-bit vector mode. */
6181 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6183 vmode = aarch64_simd_container_mode (imode, width);
6184 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6186 return aarch64_simd_valid_immediate (v_op, NULL);
6190 /* Return the fixed registers used for condition codes. */
6192 static bool
6193 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6195 *p1 = CC_REGNUM;
6196 *p2 = INVALID_REGNUM;
6197 return true;
6200 /* This function is used by the call expanders of the machine description.
6201 RESULT is the register in which the result is returned. It's NULL for
6202 "call" and "sibcall".
6203 MEM is the location of the function call.
6204 SIBCALL indicates whether this function call is normal call or sibling call.
6205 It will generate different pattern accordingly. */
6207 void
6208 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6210 rtx call, callee, tmp;
6211 rtvec vec;
6212 machine_mode mode;
6214 gcc_assert (MEM_P (mem));
6215 callee = XEXP (mem, 0);
6216 mode = GET_MODE (callee);
6217 gcc_assert (mode == Pmode);
6219 /* Decide if we should generate indirect calls by loading the
6220 address of the callee into a register before performing
6221 the branch-and-link. */
6222 if (SYMBOL_REF_P (callee)
6223 ? (aarch64_is_long_call_p (callee)
6224 || aarch64_is_noplt_call_p (callee))
6225 : !REG_P (callee))
6226 XEXP (mem, 0) = force_reg (mode, callee);
6228 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6230 if (result != NULL_RTX)
6231 call = gen_rtx_SET (result, call);
6233 if (sibcall)
6234 tmp = ret_rtx;
6235 else
6236 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6238 vec = gen_rtvec (2, call, tmp);
6239 call = gen_rtx_PARALLEL (VOIDmode, vec);
6241 aarch64_emit_call_insn (call);
6244 /* Emit call insn with PAT and do aarch64-specific handling. */
6246 void
6247 aarch64_emit_call_insn (rtx pat)
6249 rtx insn = emit_call_insn (pat);
6251 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6252 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6253 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6256 machine_mode
6257 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6259 /* All floating point compares return CCFP if it is an equality
6260 comparison, and CCFPE otherwise. */
6261 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6263 switch (code)
6265 case EQ:
6266 case NE:
6267 case UNORDERED:
6268 case ORDERED:
6269 case UNLT:
6270 case UNLE:
6271 case UNGT:
6272 case UNGE:
6273 case UNEQ:
6274 return CCFPmode;
6276 case LT:
6277 case LE:
6278 case GT:
6279 case GE:
6280 case LTGT:
6281 return CCFPEmode;
6283 default:
6284 gcc_unreachable ();
6288 /* Equality comparisons of short modes against zero can be performed
6289 using the TST instruction with the appropriate bitmask. */
6290 if (y == const0_rtx && REG_P (x)
6291 && (code == EQ || code == NE)
6292 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6293 return CC_NZmode;
6295 /* Similarly, comparisons of zero_extends from shorter modes can
6296 be performed using an ANDS with an immediate mask. */
6297 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6298 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6299 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6300 && (code == EQ || code == NE))
6301 return CC_NZmode;
6303 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6304 && y == const0_rtx
6305 && (code == EQ || code == NE || code == LT || code == GE)
6306 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6307 || GET_CODE (x) == NEG
6308 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6309 && CONST_INT_P (XEXP (x, 2)))))
6310 return CC_NZmode;
6312 /* A compare with a shifted operand. Because of canonicalization,
6313 the comparison will have to be swapped when we emit the assembly
6314 code. */
6315 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6316 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6317 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6318 || GET_CODE (x) == LSHIFTRT
6319 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6320 return CC_SWPmode;
6322 /* Similarly for a negated operand, but we can only do this for
6323 equalities. */
6324 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6325 && (REG_P (y) || GET_CODE (y) == SUBREG)
6326 && (code == EQ || code == NE)
6327 && GET_CODE (x) == NEG)
6328 return CC_Zmode;
6330 /* A test for unsigned overflow. */
6331 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6332 && code == NE
6333 && GET_CODE (x) == PLUS
6334 && GET_CODE (y) == ZERO_EXTEND)
6335 return CC_Cmode;
6337 /* For everything else, return CCmode. */
6338 return CCmode;
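/* Illustrative examples of the selection above: (eq (plus x y) (const_int 0))
   in SImode selects CC_NZmode so the flags can come from an ADDS; an
   ordered float comparison such as (lt x y) selects CCFPEmode; and a
   comparison of a shifted operand such as (gt (ashift x 2) y) selects
   CC_SWPmode, since the emitted compare swaps the operands.  */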
6341 static int
6342 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6344 int
6345 aarch64_get_condition_code (rtx x)
6347 machine_mode mode = GET_MODE (XEXP (x, 0));
6348 enum rtx_code comp_code = GET_CODE (x);
6350 if (GET_MODE_CLASS (mode) != MODE_CC)
6351 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6352 return aarch64_get_condition_code_1 (mode, comp_code);
6355 static int
6356 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6358 switch (mode)
6360 case E_CCFPmode:
6361 case E_CCFPEmode:
6362 switch (comp_code)
6364 case GE: return AARCH64_GE;
6365 case GT: return AARCH64_GT;
6366 case LE: return AARCH64_LS;
6367 case LT: return AARCH64_MI;
6368 case NE: return AARCH64_NE;
6369 case EQ: return AARCH64_EQ;
6370 case ORDERED: return AARCH64_VC;
6371 case UNORDERED: return AARCH64_VS;
6372 case UNLT: return AARCH64_LT;
6373 case UNLE: return AARCH64_LE;
6374 case UNGT: return AARCH64_HI;
6375 case UNGE: return AARCH64_PL;
6376 default: return -1;
6378 break;
6380 case E_CCmode:
6381 switch (comp_code)
6383 case NE: return AARCH64_NE;
6384 case EQ: return AARCH64_EQ;
6385 case GE: return AARCH64_GE;
6386 case GT: return AARCH64_GT;
6387 case LE: return AARCH64_LE;
6388 case LT: return AARCH64_LT;
6389 case GEU: return AARCH64_CS;
6390 case GTU: return AARCH64_HI;
6391 case LEU: return AARCH64_LS;
6392 case LTU: return AARCH64_CC;
6393 default: return -1;
6395 break;
6397 case E_CC_SWPmode:
6398 switch (comp_code)
6400 case NE: return AARCH64_NE;
6401 case EQ: return AARCH64_EQ;
6402 case GE: return AARCH64_LE;
6403 case GT: return AARCH64_LT;
6404 case LE: return AARCH64_GE;
6405 case LT: return AARCH64_GT;
6406 case GEU: return AARCH64_LS;
6407 case GTU: return AARCH64_CC;
6408 case LEU: return AARCH64_CS;
6409 case LTU: return AARCH64_HI;
6410 default: return -1;
6412 break;
6414 case E_CC_NZmode:
6415 switch (comp_code)
6417 case NE: return AARCH64_NE;
6418 case EQ: return AARCH64_EQ;
6419 case GE: return AARCH64_PL;
6420 case LT: return AARCH64_MI;
6421 default: return -1;
6423 break;
6425 case E_CC_Zmode:
6426 switch (comp_code)
6428 case NE: return AARCH64_NE;
6429 case EQ: return AARCH64_EQ;
6430 default: return -1;
6432 break;
6434 case E_CC_Cmode:
6435 switch (comp_code)
6437 case NE: return AARCH64_CS;
6438 case EQ: return AARCH64_CC;
6439 default: return -1;
6441 break;
6443 default:
6444 return -1;
6447 return -1;
6450 bool
6451 aarch64_const_vec_all_same_in_range_p (rtx x,
6452 HOST_WIDE_INT minval,
6453 HOST_WIDE_INT maxval)
6455 rtx elt;
6456 return (const_vec_duplicate_p (x, &elt)
6457 && CONST_INT_P (elt)
6458 && IN_RANGE (INTVAL (elt), minval, maxval));
6461 bool
6462 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6464 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6467 /* Return true if VEC is a constant in which every element is in the range
6468 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6470 static bool
6471 aarch64_const_vec_all_in_range_p (rtx vec,
6472 HOST_WIDE_INT minval,
6473 HOST_WIDE_INT maxval)
6475 if (GET_CODE (vec) != CONST_VECTOR
6476 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6477 return false;
6479 int nunits;
6480 if (!CONST_VECTOR_STEPPED_P (vec))
6481 nunits = const_vector_encoded_nelts (vec);
6482 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6483 return false;
6485 for (int i = 0; i < nunits; i++)
6487 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6488 if (!CONST_INT_P (vec_elem)
6489 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6490 return false;
6492 return true;
6495 /* N Z C V. */
6496 #define AARCH64_CC_V 1
6497 #define AARCH64_CC_C (1 << 1)
6498 #define AARCH64_CC_Z (1 << 2)
6499 #define AARCH64_CC_N (1 << 3)
6501 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6502 static const int aarch64_nzcv_codes[] =
6504 0, /* EQ, Z == 1. */
6505 AARCH64_CC_Z, /* NE, Z == 0. */
6506 0, /* CS, C == 1. */
6507 AARCH64_CC_C, /* CC, C == 0. */
6508 0, /* MI, N == 1. */
6509 AARCH64_CC_N, /* PL, N == 0. */
6510 0, /* VS, V == 1. */
6511 AARCH64_CC_V, /* VC, V == 0. */
6512 0, /* HI, C == 1 && Z == 0. */
6513 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6514 AARCH64_CC_V, /* GE, N == V. */
6515 0, /* LT, N != V. */
6516 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6517 0, /* LE, !(Z == 0 && N == V). */
6518 0, /* AL, Any. */
6519 0 /* NV, Any. */
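/* Illustratively, each entry above appears to give flag values under which
   the indexed condition evaluates to false; e.g. GE is paired with
   AARCH64_CC_V (V set, N clear), so N != V and GE fails.  */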
6522 /* Print floating-point vector immediate operand X to F, negating it
6523 first if NEGATE is true. Return true on success, false if it isn't
6524 a constant we can handle. */
6526 static bool
6527 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6529 rtx elt;
6531 if (!const_vec_duplicate_p (x, &elt))
6532 return false;
6534 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6535 if (negate)
6536 r = real_value_negate (&r);
6538 /* We only handle the SVE single-bit immediates here. */
6539 if (real_equal (&r, &dconst0))
6540 asm_fprintf (f, "0.0");
6541 else if (real_equal (&r, &dconst1))
6542 asm_fprintf (f, "1.0");
6543 else if (real_equal (&r, &dconsthalf))
6544 asm_fprintf (f, "0.5");
6545 else
6546 return false;
6548 return true;
6551 /* Return the equivalent letter for size. */
6552 static char
6553 sizetochar (int size)
6555 switch (size)
6557 case 64: return 'd';
6558 case 32: return 's';
6559 case 16: return 'h';
6560 case 8 : return 'b';
6561 default: gcc_unreachable ();
6565 /* Print operand X to file F in a target specific manner according to CODE.
6566 The acceptable formatting commands given by CODE are:
6567 'c': An integer or symbol address without a preceding #
6568 sign.
6569 'C': Take the duplicated element in a vector constant
6570 and print it in hex.
6571 'D': Take the duplicated element in a vector constant
6572 and print it as an unsigned integer, in decimal.
6573 'e': Print the sign/zero-extend size as a character 8->b,
6574 16->h, 32->w.
6575 'p': Prints N such that 2^N == X (X must be power of 2 and
6576 const int).
6577 'P': Print the number of non-zero bits in X (a const_int).
6578 'H': Print the higher numbered register of a pair (TImode)
6579 of regs.
6580 'm': Print a condition (eq, ne, etc).
6581 'M': Same as 'm', but invert condition.
6582 'N': Take the duplicated element in a vector constant
6583 and print the negative of it in decimal.
6584 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6585 'S/T/U/V': Print a FP/SIMD register name for a register list.
6586 The register printed is the FP/SIMD register name
6587 of X + 0/1/2/3 for S/T/U/V.
6588 'R': Print a scalar FP/SIMD register name + 1.
6589 'X': Print bottom 16 bits of integer constant in hex.
6590 'w/x': Print a general register name or the zero register
6591 (32-bit or 64-bit).
6592 '0': Print a normal operand; if it's a general register,
6593 then we assume DImode.
6594 'k': Print NZCV for conditional compare instructions.
6595 'A': Output address constant representing the first
6596 argument of X, specifying a relocation offset
6597 if appropriate.
6598 'L': Output constant address specified by X
6599 with a relocation offset if appropriate.
6600 'G': Prints address of X, specifying a PC relative
6601 relocation mode if appropriate.
6602 'y': Output address of LDP or STP - this is used for
6603 some LDP/STPs which don't use a PARALLEL in their
6604 pattern (so the mode needs to be adjusted).
6605 'z': Output address of a typical LDP or STP. */
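/* As a hypothetical example, an output template such as
   "csel\t%w0, %w1, %w2, %m3" would print something like
   "csel w0, w1, w2, eq" when operand 3 is an EQ comparison and operands
   0-2 are 32-bit general registers.  */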
6607 static void
6608 aarch64_print_operand (FILE *f, rtx x, int code)
6610 rtx elt;
6611 switch (code)
6613 case 'c':
6614 switch (GET_CODE (x))
6616 case CONST_INT:
6617 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6618 break;
6620 case SYMBOL_REF:
6621 output_addr_const (f, x);
6622 break;
6624 case CONST:
6625 if (GET_CODE (XEXP (x, 0)) == PLUS
6626 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6628 output_addr_const (f, x);
6629 break;
6631 /* Fall through. */
6633 default:
6634 output_operand_lossage ("unsupported operand for code '%c'", code);
6636 break;
6638 case 'e':
6640 int n;
6642 if (!CONST_INT_P (x)
6643 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6645 output_operand_lossage ("invalid operand for '%%%c'", code);
6646 return;
6649 switch (n)
6651 case 3:
6652 fputc ('b', f);
6653 break;
6654 case 4:
6655 fputc ('h', f);
6656 break;
6657 case 5:
6658 fputc ('w', f);
6659 break;
6660 default:
6661 output_operand_lossage ("invalid operand for '%%%c'", code);
6662 return;
6665 break;
6667 case 'p':
6669 int n;
6671 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6673 output_operand_lossage ("invalid operand for '%%%c'", code);
6674 return;
6677 asm_fprintf (f, "%d", n);
6679 break;
6681 case 'P':
6682 if (!CONST_INT_P (x))
6684 output_operand_lossage ("invalid operand for '%%%c'", code);
6685 return;
6688 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6689 break;
6691 case 'H':
6692 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6694 output_operand_lossage ("invalid operand for '%%%c'", code);
6695 return;
6698 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6699 break;
6701 case 'M':
6702 case 'm':
6704 int cond_code;
6705 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6706 if (x == const_true_rtx)
6708 if (code == 'M')
6709 fputs ("nv", f);
6710 return;
6713 if (!COMPARISON_P (x))
6715 output_operand_lossage ("invalid operand for '%%%c'", code);
6716 return;
6719 cond_code = aarch64_get_condition_code (x);
6720 gcc_assert (cond_code >= 0);
6721 if (code == 'M')
6722 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6723 fputs (aarch64_condition_codes[cond_code], f);
6725 break;
6727 case 'N':
6728 if (!const_vec_duplicate_p (x, &elt))
6730 output_operand_lossage ("invalid vector constant");
6731 return;
6734 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6735 asm_fprintf (f, "%wd", -INTVAL (elt));
6736 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6737 && aarch64_print_vector_float_operand (f, x, true))
6739 else
6741 output_operand_lossage ("invalid vector constant");
6742 return;
6744 break;
6746 case 'b':
6747 case 'h':
6748 case 's':
6749 case 'd':
6750 case 'q':
6751 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6753 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6754 return;
6756 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6757 break;
6759 case 'S':
6760 case 'T':
6761 case 'U':
6762 case 'V':
6763 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6765 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6766 return;
6768 asm_fprintf (f, "%c%d",
6769 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6770 REGNO (x) - V0_REGNUM + (code - 'S'));
6771 break;
6773 case 'R':
6774 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6776 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6777 return;
6779 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6780 break;
6782 case 'X':
6783 if (!CONST_INT_P (x))
6785 output_operand_lossage ("invalid operand for '%%%c'", code);
6786 return;
6788 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6789 break;
6791 case 'C':
6793 /* Print a replicated constant in hex. */
6794 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6796 output_operand_lossage ("invalid operand for '%%%c'", code);
6797 return;
6799 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6800 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6802 break;
6804 case 'D':
6806 /* Print a replicated constant in decimal, treating it as
6807 unsigned. */
6808 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6810 output_operand_lossage ("invalid operand for '%%%c'", code);
6811 return;
6813 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6814 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6816 break;
6818 case 'w':
6819 case 'x':
6820 if (x == const0_rtx
6821 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6823 asm_fprintf (f, "%czr", code);
6824 break;
6827 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6829 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6830 break;
6833 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6835 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6836 break;
6839 /* Fall through */
6841 case 0:
6842 if (x == NULL)
6844 output_operand_lossage ("missing operand");
6845 return;
6848 switch (GET_CODE (x))
6850 case REG:
6851 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6853 if (REG_NREGS (x) == 1)
6854 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6855 else
6857 char suffix
6858 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6859 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6860 REGNO (x) - V0_REGNUM, suffix,
6861 END_REGNO (x) - V0_REGNUM - 1, suffix);
6864 else
6865 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6866 break;
6868 case MEM:
6869 output_address (GET_MODE (x), XEXP (x, 0));
6870 break;
6872 case LABEL_REF:
6873 case SYMBOL_REF:
6874 output_addr_const (asm_out_file, x);
6875 break;
6877 case CONST_INT:
6878 asm_fprintf (f, "%wd", INTVAL (x));
6879 break;
6881 case CONST:
6882 if (!VECTOR_MODE_P (GET_MODE (x)))
6884 output_addr_const (asm_out_file, x);
6885 break;
6887 /* fall through */
6889 case CONST_VECTOR:
6890 if (!const_vec_duplicate_p (x, &elt))
6892 output_operand_lossage ("invalid vector constant");
6893 return;
6896 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6897 asm_fprintf (f, "%wd", INTVAL (elt));
6898 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6899 && aarch64_print_vector_float_operand (f, x, false))
6901 else
6903 output_operand_lossage ("invalid vector constant");
6904 return;
6906 break;
6908 case CONST_DOUBLE:
6909 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6910 be getting CONST_DOUBLEs holding integers. */
6911 gcc_assert (GET_MODE (x) != VOIDmode);
6912 if (aarch64_float_const_zero_rtx_p (x))
6914 fputc ('0', f);
6915 break;
6917 else if (aarch64_float_const_representable_p (x))
6919 #define buf_size 20
6920 char float_buf[buf_size] = {'\0'};
6921 real_to_decimal_for_mode (float_buf,
6922 CONST_DOUBLE_REAL_VALUE (x),
6923 buf_size, buf_size,
6924 1, GET_MODE (x));
6925 asm_fprintf (asm_out_file, "%s", float_buf);
6926 break;
6927 #undef buf_size
6929 output_operand_lossage ("invalid constant");
6930 return;
6931 default:
6932 output_operand_lossage ("invalid operand");
6933 return;
6935 break;
6937 case 'A':
6938 if (GET_CODE (x) == HIGH)
6939 x = XEXP (x, 0);
6941 switch (aarch64_classify_symbolic_expression (x))
6943 case SYMBOL_SMALL_GOT_4G:
6944 asm_fprintf (asm_out_file, ":got:");
6945 break;
6947 case SYMBOL_SMALL_TLSGD:
6948 asm_fprintf (asm_out_file, ":tlsgd:");
6949 break;
6951 case SYMBOL_SMALL_TLSDESC:
6952 asm_fprintf (asm_out_file, ":tlsdesc:");
6953 break;
6955 case SYMBOL_SMALL_TLSIE:
6956 asm_fprintf (asm_out_file, ":gottprel:");
6957 break;
6959 case SYMBOL_TLSLE24:
6960 asm_fprintf (asm_out_file, ":tprel:");
6961 break;
6963 case SYMBOL_TINY_GOT:
6964 gcc_unreachable ();
6965 break;
6967 default:
6968 break;
6970 output_addr_const (asm_out_file, x);
6971 break;
6973 case 'L':
6974 switch (aarch64_classify_symbolic_expression (x))
6976 case SYMBOL_SMALL_GOT_4G:
6977 asm_fprintf (asm_out_file, ":lo12:");
6978 break;
6980 case SYMBOL_SMALL_TLSGD:
6981 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
6982 break;
6984 case SYMBOL_SMALL_TLSDESC:
6985 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
6986 break;
6988 case SYMBOL_SMALL_TLSIE:
6989 asm_fprintf (asm_out_file, ":gottprel_lo12:");
6990 break;
6992 case SYMBOL_TLSLE12:
6993 asm_fprintf (asm_out_file, ":tprel_lo12:");
6994 break;
6996 case SYMBOL_TLSLE24:
6997 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
6998 break;
7000 case SYMBOL_TINY_GOT:
7001 asm_fprintf (asm_out_file, ":got:");
7002 break;
7004 case SYMBOL_TINY_TLSIE:
7005 asm_fprintf (asm_out_file, ":gottprel:");
7006 break;
7008 default:
7009 break;
7011 output_addr_const (asm_out_file, x);
7012 break;
7014 case 'G':
7015 switch (aarch64_classify_symbolic_expression (x))
7017 case SYMBOL_TLSLE24:
7018 asm_fprintf (asm_out_file, ":tprel_hi12:");
7019 break;
7020 default:
7021 break;
7023 output_addr_const (asm_out_file, x);
7024 break;
7026 case 'k':
7028 HOST_WIDE_INT cond_code;
7030 if (!CONST_INT_P (x))
7032 output_operand_lossage ("invalid operand for '%%%c'", code);
7033 return;
7036 cond_code = INTVAL (x);
7037 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7038 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7040 break;
7042 case 'y':
7043 case 'z':
7045 machine_mode mode = GET_MODE (x);
7047 if (GET_CODE (x) != MEM
7048 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7050 output_operand_lossage ("invalid operand for '%%%c'", code);
7051 return;
7054 if (code == 'y')
7055 /* LDP/STP which uses a single double-width memory operand.
7056 Adjust the mode to appear like a typical LDP/STP.
7057 Currently this is supported for 16-byte accesses only. */
7058 mode = DFmode;
7060 if (!aarch64_print_ldpstp_address (f, mode, XEXP (x, 0)))
7061 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7063 break;
7065 default:
7066 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7067 return;
7071 /* Print address 'x' of a memory access with mode 'mode'.
7072 TYPE is the address query type passed to aarch64_classify_address (e.g.
7073 ADDR_QUERY_M for a normal memory access or ADDR_QUERY_LDP_STP for LDP/STP). */
7074 static bool
7075 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7076 aarch64_addr_query_type type)
7078 struct aarch64_address_info addr;
7079 unsigned int size;
7081 /* Check all addresses are Pmode - including ILP32. */
7082 if (GET_MODE (x) != Pmode)
7083 output_operand_lossage ("invalid address mode");
7085 if (aarch64_classify_address (&addr, x, mode, true, type))
7086 switch (addr.type)
7088 case ADDRESS_REG_IMM:
7089 if (known_eq (addr.const_offset, 0))
7090 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7091 else if (aarch64_sve_data_mode_p (mode))
7093 HOST_WIDE_INT vnum
7094 = exact_div (addr.const_offset,
7095 BYTES_PER_SVE_VECTOR).to_constant ();
7096 asm_fprintf (f, "[%s, #%wd, mul vl]",
7097 reg_names[REGNO (addr.base)], vnum);
7099 else if (aarch64_sve_pred_mode_p (mode))
7101 HOST_WIDE_INT vnum
7102 = exact_div (addr.const_offset,
7103 BYTES_PER_SVE_PRED).to_constant ();
7104 asm_fprintf (f, "[%s, #%wd, mul vl]",
7105 reg_names[REGNO (addr.base)], vnum);
7107 else
7108 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7109 INTVAL (addr.offset));
7110 return true;
7112 case ADDRESS_REG_REG:
7113 if (addr.shift == 0)
7114 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7115 reg_names [REGNO (addr.offset)]);
7116 else
7117 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7118 reg_names [REGNO (addr.offset)], addr.shift);
7119 return true;
7121 case ADDRESS_REG_UXTW:
7122 if (addr.shift == 0)
7123 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7124 REGNO (addr.offset) - R0_REGNUM);
7125 else
7126 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7127 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7128 return true;
7130 case ADDRESS_REG_SXTW:
7131 if (addr.shift == 0)
7132 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7133 REGNO (addr.offset) - R0_REGNUM);
7134 else
7135 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7136 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7137 return true;
7139 case ADDRESS_REG_WB:
7140 /* Writeback is only supported for fixed-width modes. */
7141 size = GET_MODE_SIZE (mode).to_constant ();
7142 switch (GET_CODE (x))
7144 case PRE_INC:
7145 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7146 return true;
7147 case POST_INC:
7148 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7149 return true;
7150 case PRE_DEC:
7151 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7152 return true;
7153 case POST_DEC:
7154 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7155 return true;
7156 case PRE_MODIFY:
7157 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7158 INTVAL (addr.offset));
7159 return true;
7160 case POST_MODIFY:
7161 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7162 INTVAL (addr.offset));
7163 return true;
7164 default:
7165 break;
7167 break;
7169 case ADDRESS_LO_SUM:
7170 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7171 output_addr_const (f, addr.offset);
7172 asm_fprintf (f, "]");
7173 return true;
7175 case ADDRESS_SYMBOLIC:
7176 output_addr_const (f, x);
7177 return true;
7180 return false;
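/* Illustratively, the formats above produce operands such as "[x0]",
   "[x0, 16]", "[x0, x1, lsl 3]", "[x0, w1, sxtw 2]", "[x0, 16]!",
   "[x0], 16" and "[x0, #:lo12:sym]", depending on the classified
   address type.  */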
7183 /* Print address 'x' of a LDP/STP with mode 'mode'. */
7184 static bool
7185 aarch64_print_ldpstp_address (FILE *f, machine_mode mode, rtx x)
7187 return aarch64_print_address_internal (f, mode, x, ADDR_QUERY_LDP_STP);
7190 /* Print address 'x' of a memory access with mode 'mode'. */
7191 static void
7192 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7194 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7195 output_addr_const (f, x);
7198 bool
7199 aarch64_label_mentioned_p (rtx x)
7201 const char *fmt;
7202 int i;
7204 if (GET_CODE (x) == LABEL_REF)
7205 return true;
7207 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7208 referencing instruction, but they are constant offsets, not
7209 symbols. */
7210 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7211 return false;
7213 fmt = GET_RTX_FORMAT (GET_CODE (x));
7214 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7216 if (fmt[i] == 'E')
7218 int j;
7220 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7221 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7222 return 1;
7224 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7225 return 1;
7228 return 0;
7231 /* Implement REGNO_REG_CLASS. */
7233 enum reg_class
7234 aarch64_regno_regclass (unsigned regno)
7236 if (GP_REGNUM_P (regno))
7237 return GENERAL_REGS;
7239 if (regno == SP_REGNUM)
7240 return STACK_REG;
7242 if (regno == FRAME_POINTER_REGNUM
7243 || regno == ARG_POINTER_REGNUM)
7244 return POINTER_REGS;
7246 if (FP_REGNUM_P (regno))
7247 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7249 if (PR_REGNUM_P (regno))
7250 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7252 return NO_REGS;
7255 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7256 If OFFSET is out of range, return an offset of an anchor point
7257 that is in range. Return 0 otherwise. */
7259 static HOST_WIDE_INT
7260 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7261 machine_mode mode)
7263 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7264 if (size > 16)
7265 return (offset + 0x400) & ~0x7f0;
7267 /* For offsets that aren't a multiple of the access size, the limit is
7268 -256...255. */
7269 if (offset & (size - 1))
7271 /* BLKmode typically uses LDP of X-registers. */
7272 if (mode == BLKmode)
7273 return (offset + 512) & ~0x3ff;
7274 return (offset + 0x100) & ~0x1ff;
7277 /* Small negative offsets are supported. */
7278 if (IN_RANGE (offset, -256, 0))
7279 return 0;
7281 if (mode == TImode || mode == TFmode)
7282 return (offset + 0x100) & ~0x1ff;
7284 /* Use a 12-bit offset, scaled by the access size. */
7285 return offset & (~0xfff * size);
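/* Two illustrative worked examples of the cases above: an unaligned 4-byte
   access at offset 0x12345 anchors at (0x12345 + 0x100) & ~0x1ff = 0x12400,
   leaving a residual offset of -0xbb in the signed 9-bit range; an aligned
   4-byte access at offset 0x12344 anchors at 0x12344 & (~0xfff * 4)
   = 0x10000, leaving 0x2344 in the scaled 12-bit range 0..0x3ffc.  */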
7288 static rtx
7289 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7291 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7292 where mask is selected by alignment and size of the offset.
7293 We try to pick as large a range for the offset as possible to
7294 maximize the chance of a CSE. However, for aligned addresses
7295 we limit the range to 4k so that structures with different sized
7296 elements are likely to use the same base. We need to be careful
7297 not to split a CONST for some forms of address expression, otherwise
7298 it will generate sub-optimal code. */
7300 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7302 rtx base = XEXP (x, 0);
7303 rtx offset_rtx = XEXP (x, 1);
7304 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7306 if (GET_CODE (base) == PLUS)
7308 rtx op0 = XEXP (base, 0);
7309 rtx op1 = XEXP (base, 1);
7311 /* Force any scaling into a temp for CSE. */
7312 op0 = force_reg (Pmode, op0);
7313 op1 = force_reg (Pmode, op1);
7315 /* Let the pointer register be in op0. */
7316 if (REG_POINTER (op1))
7317 std::swap (op0, op1);
7319 /* If the pointer is virtual or frame related, then we know that
7320 virtual register instantiation or register elimination is going
7321 to apply a second constant. We want the two constants folded
7322 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7323 if (virt_or_elim_regno_p (REGNO (op0)))
7325 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7326 NULL_RTX, true, OPTAB_DIRECT);
7327 return gen_rtx_PLUS (Pmode, base, op1);
7330 /* Otherwise, in order to encourage CSE (and thence loop strength
7331 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7332 base = expand_binop (Pmode, add_optab, op0, op1,
7333 NULL_RTX, true, OPTAB_DIRECT);
7334 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7337 HOST_WIDE_INT size;
7338 if (GET_MODE_SIZE (mode).is_constant (&size))
7340 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7341 mode);
7342 if (base_offset != 0)
7344 base = plus_constant (Pmode, base, base_offset);
7345 base = force_operand (base, NULL_RTX);
7346 return plus_constant (Pmode, base, offset - base_offset);
7351 return x;
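/* Illustratively, with the anchoring above an SImode access to
   (plus (reg X) (const_int 0x12344)) becomes a temporary holding
   X + 0x10000 plus a residual displacement of 0x2344, so that nearby
   accesses can share the same anchored base.  */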
7354 /* Return the reload icode required for a constant pool in mode MODE. */
7355 static enum insn_code
7356 aarch64_constant_pool_reload_icode (machine_mode mode)
7358 switch (mode)
7360 case E_SFmode:
7361 return CODE_FOR_aarch64_reload_movcpsfdi;
7363 case E_DFmode:
7364 return CODE_FOR_aarch64_reload_movcpdfdi;
7366 case E_TFmode:
7367 return CODE_FOR_aarch64_reload_movcptfdi;
7369 case E_V8QImode:
7370 return CODE_FOR_aarch64_reload_movcpv8qidi;
7372 case E_V16QImode:
7373 return CODE_FOR_aarch64_reload_movcpv16qidi;
7375 case E_V4HImode:
7376 return CODE_FOR_aarch64_reload_movcpv4hidi;
7378 case E_V8HImode:
7379 return CODE_FOR_aarch64_reload_movcpv8hidi;
7381 case E_V2SImode:
7382 return CODE_FOR_aarch64_reload_movcpv2sidi;
7384 case E_V4SImode:
7385 return CODE_FOR_aarch64_reload_movcpv4sidi;
7387 case E_V2DImode:
7388 return CODE_FOR_aarch64_reload_movcpv2didi;
7390 case E_V2DFmode:
7391 return CODE_FOR_aarch64_reload_movcpv2dfdi;
7393 default:
7394 gcc_unreachable ();
7397 gcc_unreachable ();
7399 static reg_class_t
7400 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7401 reg_class_t rclass,
7402 machine_mode mode,
7403 secondary_reload_info *sri)
7405 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7406 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7407 comment at the head of aarch64-sve.md for more details about the
7408 big-endian handling. */
7409 if (BYTES_BIG_ENDIAN
7410 && reg_class_subset_p (rclass, FP_REGS)
7411 && !((REG_P (x) && HARD_REGISTER_P (x))
7412 || aarch64_simd_valid_immediate (x, NULL))
7413 && aarch64_sve_data_mode_p (mode))
7415 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7416 return NO_REGS;
7419 /* If we have to disable direct literal pool loads and stores because the
7420 function is too big, then we need a scratch register. */
7421 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7422 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7423 || targetm.vector_mode_supported_p (GET_MODE (x)))
7424 && !aarch64_pcrelative_literal_loads)
7426 sri->icode = aarch64_constant_pool_reload_icode (mode);
7427 return NO_REGS;
7430 /* Without the TARGET_SIMD instructions we cannot move a Q register
7431 to a Q register directly. We need a scratch. */
7432 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7433 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7434 && reg_class_subset_p (rclass, FP_REGS))
7436 if (mode == TFmode)
7437 sri->icode = CODE_FOR_aarch64_reload_movtf;
7438 else if (mode == TImode)
7439 sri->icode = CODE_FOR_aarch64_reload_movti;
7440 return NO_REGS;
7443 /* A TFmode or TImode memory access should be handled via an FP register
7444 because AArch64 has richer addressing modes for LDR/STR instructions
7445 than LDP/STP instructions. */
7446 if (TARGET_FLOAT && rclass == GENERAL_REGS
7447 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7448 return FP_REGS;
7450 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7451 return GENERAL_REGS;
7453 return NO_REGS;
7456 static bool
7457 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7459 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7461 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7462 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7463 if (frame_pointer_needed)
7464 return to == HARD_FRAME_POINTER_REGNUM;
7465 return true;
7468 poly_int64
7469 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7471 aarch64_layout_frame ();
7473 if (to == HARD_FRAME_POINTER_REGNUM)
7475 if (from == ARG_POINTER_REGNUM)
7476 return cfun->machine->frame.hard_fp_offset;
7478 if (from == FRAME_POINTER_REGNUM)
7479 return cfun->machine->frame.hard_fp_offset
7480 - cfun->machine->frame.locals_offset;
7483 if (to == STACK_POINTER_REGNUM)
7485 if (from == FRAME_POINTER_REGNUM)
7486 return cfun->machine->frame.frame_size
7487 - cfun->machine->frame.locals_offset;
7490 return cfun->machine->frame.frame_size;
7493 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7494 previous frame. */
7496 rtx
7497 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7499 if (count != 0)
7500 return const0_rtx;
7501 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7505 static void
7506 aarch64_asm_trampoline_template (FILE *f)
7508 if (TARGET_ILP32)
7510 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7511 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7513 else
7515 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7516 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7518 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7519 assemble_aligned_integer (4, const0_rtx);
7520 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7521 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7524 static void
7525 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7527 rtx fnaddr, mem, a_tramp;
7528 const int tramp_code_sz = 16;
7530 /* Don't need to copy the trailing D-words; we fill those in below. */
7531 emit_block_move (m_tramp, assemble_trampoline_template (),
7532 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7533 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7534 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7535 if (GET_MODE (fnaddr) != ptr_mode)
7536 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7537 emit_move_insn (mem, fnaddr);
7539 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7540 emit_move_insn (mem, chain_value);
7542 /* XXX We should really define a "clear_cache" pattern and use
7543 gen_clear_cache(). */
7544 a_tramp = XEXP (m_tramp, 0);
7545 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7546 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7547 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7548 ptr_mode);
7551 static unsigned char
7552 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7554 /* ??? Logically we should only need to provide a value when
7555 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7556 can hold MODE, but at the moment we need to handle all modes.
7557 Just ignore any runtime parts for registers that can't store them. */
7558 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7559 unsigned int nregs;
7560 switch (regclass)
7562 case TAILCALL_ADDR_REGS:
7563 case POINTER_REGS:
7564 case GENERAL_REGS:
7565 case ALL_REGS:
7566 case POINTER_AND_FP_REGS:
7567 case FP_REGS:
7568 case FP_LO_REGS:
7569 if (aarch64_sve_data_mode_p (mode)
7570 && constant_multiple_p (GET_MODE_SIZE (mode),
7571 BYTES_PER_SVE_VECTOR, &nregs))
7572 return nregs;
7573 return (aarch64_vector_data_mode_p (mode)
7574 ? CEIL (lowest_size, UNITS_PER_VREG)
7575 : CEIL (lowest_size, UNITS_PER_WORD));
7576 case STACK_REG:
7577 case PR_REGS:
7578 case PR_LO_REGS:
7579 case PR_HI_REGS:
7580 return 1;
7582 case NO_REGS:
7583 return 0;
7585 default:
7586 break;
7588 gcc_unreachable ();
7591 static reg_class_t
7592 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7594 if (regclass == POINTER_REGS)
7595 return GENERAL_REGS;
7597 if (regclass == STACK_REG)
7599 if (REG_P(x)
7600 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7601 return regclass;
7603 return NO_REGS;
7606 /* Register elimination can result in a request for
7607 SP+constant->FP_REGS. We cannot support such operations, which
7608 use SP as the source and an FP_REG as the destination, so reject
7609 them outright. */
7610 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7612 rtx lhs = XEXP (x, 0);
7614 /* Look through a possible SUBREG introduced by ILP32. */
7615 if (GET_CODE (lhs) == SUBREG)
7616 lhs = SUBREG_REG (lhs);
7618 gcc_assert (REG_P (lhs));
7619 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7620 POINTER_REGS));
7621 return NO_REGS;
7624 return regclass;
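/* For example (hypothetical RTL), asking to reload
   (plus:DI (reg:DI sp) (const_int 16)) into FP_REGS is rejected with
   NO_REGS above, forcing the address computation into a general
   register first.  */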
7627 void
7628 aarch64_asm_output_labelref (FILE* f, const char *name)
7630 asm_fprintf (f, "%U%s", name);
7633 static void
7634 aarch64_elf_asm_constructor (rtx symbol, int priority)
7636 if (priority == DEFAULT_INIT_PRIORITY)
7637 default_ctor_section_asm_out_constructor (symbol, priority);
7638 else
7640 section *s;
7641 /* While the priority is known to be in the range [0, 65535], so that
7642 18 bytes would be enough, the compiler might not know that. To avoid
7643 a -Wformat-truncation false positive, use a larger size. */
7644 char buf[23];
7645 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7646 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7647 switch_to_section (s);
7648 assemble_align (POINTER_SIZE);
7649 assemble_aligned_integer (POINTER_BYTES, symbol);
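/* For instance, a constructor with priority 101 is emitted into the
   section ".init_array.00101" by the "%.5u" format above.  */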
7653 static void
7654 aarch64_elf_asm_destructor (rtx symbol, int priority)
7656 if (priority == DEFAULT_INIT_PRIORITY)
7657 default_dtor_section_asm_out_destructor (symbol, priority);
7658 else
7660 section *s;
7661 /* While the priority is known to be in the range [0, 65535], so that
7662 18 bytes would be enough, the compiler might not know that. To avoid
7663 a -Wformat-truncation false positive, use a larger size. */
7664 char buf[23];
7665 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7666 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7667 switch_to_section (s);
7668 assemble_align (POINTER_SIZE);
7669 assemble_aligned_integer (POINTER_BYTES, symbol);
7673 const char*
7674 aarch64_output_casesi (rtx *operands)
7676 char buf[100];
7677 char label[100];
7678 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7679 int index;
7680 static const char *const patterns[4][2] =
7683 "ldrb\t%w3, [%0,%w1,uxtw]",
7684 "add\t%3, %4, %w3, sxtb #2"
7687 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7688 "add\t%3, %4, %w3, sxth #2"
7691 "ldr\t%w3, [%0,%w1,uxtw #2]",
7692 "add\t%3, %4, %w3, sxtw #2"
7694 /* We assume that DImode is only generated when not optimizing and
7695 that we don't really need 64-bit address offsets. That would
7696 imply an object file with 8GB of code in a single function! */
7698 "ldr\t%w3, [%0,%w1,uxtw #2]",
7699 "add\t%3, %4, %w3, sxtw #2"
7703 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7705 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7706 index = exact_log2 (GET_MODE_SIZE (mode));
7708 gcc_assert (index >= 0 && index <= 3);
7710 /* Need to implement table size reduction by changing the code below. */
7711 output_asm_insn (patterns[index][0], operands);
7712 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7713 snprintf (buf, sizeof (buf),
7714 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7715 output_asm_insn (buf, operands);
7716 output_asm_insn (patterns[index][1], operands);
7717 output_asm_insn ("br\t%3", operands);
7718 assemble_label (asm_out_file, label);
7719 return "";
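/* For a HImode diff vector the sequence emitted above is roughly:
     ldrh  %w3, [%0, %w1, uxtw #1]
     adr   %4, .Lrtx<N>
     add   %3, %4, %w3, sxth #2
     br    %3
   where operand 0 is the table base address, operand 1 the index,
   operand 2 the table label and operands 3/4 scratch registers.  */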
7723 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7724 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7725 operator. */
7728 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7730 if (shift >= 0 && shift <= 3)
7732 int size;
7733 for (size = 8; size <= 32; size *= 2)
7735 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7736 if (mask == bits << shift)
7737 return size;
7740 return 0;
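/* Worked example: aarch64_uxt_size (1, 0x1fe) returns 8, since
   0x1fe == 0xff << 1 and so matches a UXTB combined with LSL #1;
   aarch64_uxt_size (0, 0xffff) returns 16 (UXTH); a mask that is not a
   shifted 8/16/32-bit block returns 0.  */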
7743 /* Constant pools are per function only when PC relative
7744 literal loads are true or we are in the large memory
7745 model. */
7747 static inline bool
7748 aarch64_can_use_per_function_literal_pools_p (void)
7750 return (aarch64_pcrelative_literal_loads
7751 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7754 static bool
7755 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7757 /* We can't use blocks for constants when we're using a per-function
7758 constant pool. */
7759 return !aarch64_can_use_per_function_literal_pools_p ();
7762 /* Select appropriate section for constants depending
7763 on where we place literal pools. */
7765 static section *
7766 aarch64_select_rtx_section (machine_mode mode,
7767 rtx x,
7768 unsigned HOST_WIDE_INT align)
7770 if (aarch64_can_use_per_function_literal_pools_p ())
7771 return function_section (current_function_decl);
7773 return default_elf_select_rtx_section (mode, x, align);
7776 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7777 void
7778 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7779 HOST_WIDE_INT offset)
7781 /* When using per-function literal pools, we must ensure that any code
7782 section is aligned to the minimal instruction length, lest we get
7783 errors from the assembler re "unaligned instructions". */
7784 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7785 ASM_OUTPUT_ALIGN (f, 2);
7788 /* Costs. */
7790 /* Helper function for rtx cost calculation. Strip a shift expression
7791 from X. Returns the inner operand if successful, or the original
7792 expression on failure. */
7793 static rtx
7794 aarch64_strip_shift (rtx x)
7796 rtx op = x;
7798 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7799 we can convert both to ROR during final output. */
7800 if ((GET_CODE (op) == ASHIFT
7801 || GET_CODE (op) == ASHIFTRT
7802 || GET_CODE (op) == LSHIFTRT
7803 || GET_CODE (op) == ROTATERT
7804 || GET_CODE (op) == ROTATE)
7805 && CONST_INT_P (XEXP (op, 1)))
7806 return XEXP (op, 0);
7808 if (GET_CODE (op) == MULT
7809 && CONST_INT_P (XEXP (op, 1))
7810 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7811 return XEXP (op, 0);
7813 return x;
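/* Illustrative examples (hypothetical operands): both
   (ashift:DI (reg:DI x0) (const_int 2)) and
   (mult:DI (reg:DI x0) (const_int 8)) strip down to (reg:DI x0), since a
   multiply by a power of two is treated as a shift; a shift by a
   non-constant amount is returned unchanged.  */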
7816 /* Helper function for rtx cost calculation. Strip an extend
7817 expression from X. Returns the inner operand if successful, or the
7818 original expression on failure. We deal with a number of possible
7819 canonicalization variations here. If STRIP_SHIFT is true, then
7820 we can strip off a shift also. */
7821 static rtx
7822 aarch64_strip_extend (rtx x, bool strip_shift)
7824 scalar_int_mode mode;
7825 rtx op = x;
7827 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7828 return op;
7830 /* Zero and sign extraction of a widened value. */
7831 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7832 && XEXP (op, 2) == const0_rtx
7833 && GET_CODE (XEXP (op, 0)) == MULT
7834 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7835 XEXP (op, 1)))
7836 return XEXP (XEXP (op, 0), 0);
7838 /* It can also be represented (for zero-extend) as an AND with an
7839 immediate. */
7840 if (GET_CODE (op) == AND
7841 && GET_CODE (XEXP (op, 0)) == MULT
7842 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7843 && CONST_INT_P (XEXP (op, 1))
7844 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7845 INTVAL (XEXP (op, 1))) != 0)
7846 return XEXP (XEXP (op, 0), 0);
7848 /* Now handle extended register, as this may also have an optional
7849 left shift by 1..4. */
7850 if (strip_shift
7851 && GET_CODE (op) == ASHIFT
7852 && CONST_INT_P (XEXP (op, 1))
7853 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7854 op = XEXP (op, 0);
7856 if (GET_CODE (op) == ZERO_EXTEND
7857 || GET_CODE (op) == SIGN_EXTEND)
7858 op = XEXP (op, 0);
7860 if (op != x)
7861 return op;
7863 return x;
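/* Illustrative examples (hypothetical operands):
   (zero_extend:DI (reg:SI w0)) strips to (reg:SI w0);
   (and:DI (mult:DI (reg:DI x0) (const_int 4)) (const_int 0x3fc)), the
   AND form of a zero-extended scaled index, strips to (reg:DI x0);
   with STRIP_SHIFT an outer left shift by 1..4 is removed as well.  */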
7866 /* Return true iff CODE is a shift supported in combination
7867 with arithmetic instructions. */
7869 static bool
7870 aarch64_shift_p (enum rtx_code code)
7872 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7876 /* Return true iff X is a cheap shift without a sign extend. */
7878 static bool
7879 aarch64_cheap_mult_shift_p (rtx x)
7881 rtx op0, op1;
7883 op0 = XEXP (x, 0);
7884 op1 = XEXP (x, 1);
7886 if (!(aarch64_tune_params.extra_tuning_flags
7887 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7888 return false;
7890 if (GET_CODE (op0) == SIGN_EXTEND)
7891 return false;
7893 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7894 && UINTVAL (op1) <= 4)
7895 return true;
7897 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7898 return false;
7900 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7902 if (l2 > 0 && l2 <= 4)
7903 return true;
7905 return false;
7908 /* Helper function for rtx cost calculation. Calculate the cost of
7909 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7910 Return the calculated cost of the expression, recursing manually in to
7911 operands where needed. */
7913 static int
7914 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7916 rtx op0, op1;
7917 const struct cpu_cost_table *extra_cost
7918 = aarch64_tune_params.insn_extra_cost;
7919 int cost = 0;
7920 bool compound_p = (outer == PLUS || outer == MINUS);
7921 machine_mode mode = GET_MODE (x);
7923 gcc_checking_assert (code == MULT);
7925 op0 = XEXP (x, 0);
7926 op1 = XEXP (x, 1);
7928 if (VECTOR_MODE_P (mode))
7929 mode = GET_MODE_INNER (mode);
7931 /* Integer multiply/fma. */
7932 if (GET_MODE_CLASS (mode) == MODE_INT)
7934 /* The multiply will be canonicalized as a shift, cost it as such. */
7935 if (aarch64_shift_p (GET_CODE (x))
7936 || (CONST_INT_P (op1)
7937 && exact_log2 (INTVAL (op1)) > 0))
7939 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7940 || GET_CODE (op0) == SIGN_EXTEND;
7941 if (speed)
7943 if (compound_p)
7945 /* If the shift is considered cheap,
7946 then don't add any cost. */
7947 if (aarch64_cheap_mult_shift_p (x))
7949 else if (REG_P (op1))
7950 /* ARITH + shift-by-register. */
7951 cost += extra_cost->alu.arith_shift_reg;
7952 else if (is_extend)
7953 /* ARITH + extended register. We don't have a cost field
7954 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7955 cost += extra_cost->alu.extend_arith;
7956 else
7957 /* ARITH + shift-by-immediate. */
7958 cost += extra_cost->alu.arith_shift;
7960 else
7961 /* LSL (immediate). */
7962 cost += extra_cost->alu.shift;
7965 /* Strip extends as we will have costed them in the case above. */
7966 if (is_extend)
7967 op0 = aarch64_strip_extend (op0, true);
7969 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7971 return cost;
7974 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7975 compound and let the below cases handle it. After all, MNEG is a
7976 special-case alias of MSUB. */
7977 if (GET_CODE (op0) == NEG)
7979 op0 = XEXP (op0, 0);
7980 compound_p = true;
7983 /* Integer multiplies or FMAs have zero/sign extending variants. */
7984 if ((GET_CODE (op0) == ZERO_EXTEND
7985 && GET_CODE (op1) == ZERO_EXTEND)
7986 || (GET_CODE (op0) == SIGN_EXTEND
7987 && GET_CODE (op1) == SIGN_EXTEND))
7989 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
7990 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
7992 if (speed)
7994 if (compound_p)
7995 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
7996 cost += extra_cost->mult[0].extend_add;
7997 else
7998 /* MUL/SMULL/UMULL. */
7999 cost += extra_cost->mult[0].extend;
8002 return cost;
8005 /* This is either an integer multiply or a MADD. In both cases
8006 we want to recurse and cost the operands. */
8007 cost += rtx_cost (op0, mode, MULT, 0, speed);
8008 cost += rtx_cost (op1, mode, MULT, 1, speed);
8010 if (speed)
8012 if (compound_p)
8013 /* MADD/MSUB. */
8014 cost += extra_cost->mult[mode == DImode].add;
8015 else
8016 /* MUL. */
8017 cost += extra_cost->mult[mode == DImode].simple;
8020 return cost;
8022 else
8024 if (speed)
8026 /* Floating-point FMA/FMUL can also support negations of the
8027 operands, unless the rounding mode is upward or downward in
8028 which case FNMUL is different from FMUL with operand negation. */
8029 bool neg0 = GET_CODE (op0) == NEG;
8030 bool neg1 = GET_CODE (op1) == NEG;
8031 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8033 if (neg0)
8034 op0 = XEXP (op0, 0);
8035 if (neg1)
8036 op1 = XEXP (op1, 0);
8039 if (compound_p)
8040 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8041 cost += extra_cost->fp[mode == DFmode].fma;
8042 else
8043 /* FMUL/FNMUL. */
8044 cost += extra_cost->fp[mode == DFmode].mult;
8047 cost += rtx_cost (op0, mode, MULT, 0, speed);
8048 cost += rtx_cost (op1, mode, MULT, 1, speed);
8049 return cost;
8053 static int
8054 aarch64_address_cost (rtx x,
8055 machine_mode mode,
8056 addr_space_t as ATTRIBUTE_UNUSED,
8057 bool speed)
8059 enum rtx_code c = GET_CODE (x);
8060 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8061 struct aarch64_address_info info;
8062 int cost = 0;
8063 info.shift = 0;
8065 if (!aarch64_classify_address (&info, x, mode, false))
8067 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8069 /* This is a CONST or SYMBOL ref which will be split
8070 in a different way depending on the code model in use.
8071 Cost it through the generic infrastructure. */
8072 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8073 /* Divide through by the cost of one instruction to
8074 bring it to the same units as the address costs. */
8075 cost_symbol_ref /= COSTS_N_INSNS (1);
8076 /* The cost is then the cost of preparing the address,
8077 followed by an immediate (possibly 0) offset. */
8078 return cost_symbol_ref + addr_cost->imm_offset;
8080 else
8082 /* This is most likely a jump table from a case
8083 statement. */
8084 return addr_cost->register_offset;
8088 switch (info.type)
8090 case ADDRESS_LO_SUM:
8091 case ADDRESS_SYMBOLIC:
8092 case ADDRESS_REG_IMM:
8093 cost += addr_cost->imm_offset;
8094 break;
8096 case ADDRESS_REG_WB:
8097 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8098 cost += addr_cost->pre_modify;
8099 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8100 cost += addr_cost->post_modify;
8101 else
8102 gcc_unreachable ();
8104 break;
8106 case ADDRESS_REG_REG:
8107 cost += addr_cost->register_offset;
8108 break;
8110 case ADDRESS_REG_SXTW:
8111 cost += addr_cost->register_sextend;
8112 break;
8114 case ADDRESS_REG_UXTW:
8115 cost += addr_cost->register_zextend;
8116 break;
8118 default:
8119 gcc_unreachable ();
8123 if (info.shift > 0)
8125 /* For the sake of calculating the cost of the shifted register
8126 component, we can treat same sized modes in the same way. */
8127 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8128 cost += addr_cost->addr_scale_costs.hi;
8129 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8130 cost += addr_cost->addr_scale_costs.si;
8131 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8132 cost += addr_cost->addr_scale_costs.di;
8133 else
8134 /* We can't tell, or this is a 128-bit vector. */
8135 cost += addr_cost->addr_scale_costs.ti;
8138 return cost;
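/* For example, costing the SImode address [x0, w1, sxtw #2] classifies
   it as ADDRESS_REG_SXTW with a nonzero shift, so the result is
   register_sextend + addr_scale_costs.si from the tuning's address
   cost table.  */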
8141 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8142 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8143 to be taken. */
8146 aarch64_branch_cost (bool speed_p, bool predictable_p)
8148 /* When optimizing for speed, use the cost of unpredictable branches. */
8149 const struct cpu_branch_cost *branch_costs =
8150 aarch64_tune_params.branch_costs;
8152 if (!speed_p || predictable_p)
8153 return branch_costs->predictable;
8154 else
8155 return branch_costs->unpredictable;
8158 /* Return true if the RTX X in mode MODE is a zero or sign extract
8159 usable in an ADD or SUB (extended register) instruction. */
8160 static bool
8161 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8163 /* Catch add with a sign extract.
8164 This is add_<optab><mode>_multp2. */
8165 if (GET_CODE (x) == SIGN_EXTRACT
8166 || GET_CODE (x) == ZERO_EXTRACT)
8168 rtx op0 = XEXP (x, 0);
8169 rtx op1 = XEXP (x, 1);
8170 rtx op2 = XEXP (x, 2);
8172 if (GET_CODE (op0) == MULT
8173 && CONST_INT_P (op1)
8174 && op2 == const0_rtx
8175 && CONST_INT_P (XEXP (op0, 1))
8176 && aarch64_is_extend_from_extract (mode,
8177 XEXP (op0, 1),
8178 op1))
8180 return true;
8183 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8184 No shift. */
8185 else if (GET_CODE (x) == SIGN_EXTEND
8186 || GET_CODE (x) == ZERO_EXTEND)
8187 return REG_P (XEXP (x, 0));
8189 return false;
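/* For instance, (sign_extend:DI (reg:SI w1)) satisfies the simple case
   above and so can be folded by the callers into an ADD/SUB (extended
   register) instruction.  */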
8192 static bool
8193 aarch64_frint_unspec_p (unsigned int u)
8195 switch (u)
8197 case UNSPEC_FRINTZ:
8198 case UNSPEC_FRINTP:
8199 case UNSPEC_FRINTM:
8200 case UNSPEC_FRINTA:
8201 case UNSPEC_FRINTN:
8202 case UNSPEC_FRINTX:
8203 case UNSPEC_FRINTI:
8204 return true;
8206 default:
8207 return false;
8211 /* Return true iff X is an rtx that will match an extr instruction
8212 i.e. as described in the *extr<mode>5_insn family of patterns.
8213 OP0 and OP1 will be set to the operands of the shifts involved
8214 on success and will be NULL_RTX otherwise. */
8216 static bool
8217 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8219 rtx op0, op1;
8220 scalar_int_mode mode;
8221 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8222 return false;
8224 *res_op0 = NULL_RTX;
8225 *res_op1 = NULL_RTX;
8227 if (GET_CODE (x) != IOR)
8228 return false;
8230 op0 = XEXP (x, 0);
8231 op1 = XEXP (x, 1);
8233 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8234 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8236 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8237 if (GET_CODE (op1) == ASHIFT)
8238 std::swap (op0, op1);
8240 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8241 return false;
8243 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8244 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8246 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8247 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8249 *res_op0 = XEXP (op0, 0);
8250 *res_op1 = XEXP (op1, 0);
8251 return true;
8255 return false;
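/* Example of a match (hypothetical operands, SImode):
     (ior:SI (ashift:SI (reg:SI w0) (const_int 24))
             (lshiftrt:SI (reg:SI w1) (const_int 8)))
   is accepted because 24 + 8 == 32, and corresponds roughly to
   "extr w2, w0, w1, #8".  */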
8258 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8259 storing it in *COST. Result is true if the total cost of the operation
8260 has now been calculated. */
8261 static bool
8262 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8264 rtx inner;
8265 rtx comparator;
8266 enum rtx_code cmpcode;
8268 if (COMPARISON_P (op0))
8270 inner = XEXP (op0, 0);
8271 comparator = XEXP (op0, 1);
8272 cmpcode = GET_CODE (op0);
8274 else
8276 inner = op0;
8277 comparator = const0_rtx;
8278 cmpcode = NE;
8281 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8283 /* Conditional branch. */
8284 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8285 return true;
8286 else
8288 if (cmpcode == NE || cmpcode == EQ)
8290 if (comparator == const0_rtx)
8292 /* TBZ/TBNZ/CBZ/CBNZ. */
8293 if (GET_CODE (inner) == ZERO_EXTRACT)
8294 /* TBZ/TBNZ. */
8295 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8296 ZERO_EXTRACT, 0, speed);
8297 else
8298 /* CBZ/CBNZ. */
8299 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8301 return true;
8304 else if (cmpcode == LT || cmpcode == GE)
8306 /* TBZ/TBNZ. */
8307 if (comparator == const0_rtx)
8308 return true;
8312 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8314 /* CCMP. */
8315 if (GET_CODE (op1) == COMPARE)
8317 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8318 if (XEXP (op1, 1) == const0_rtx)
8319 *cost += 1;
8320 if (speed)
8322 machine_mode mode = GET_MODE (XEXP (op1, 0));
8323 const struct cpu_cost_table *extra_cost
8324 = aarch64_tune_params.insn_extra_cost;
8326 if (GET_MODE_CLASS (mode) == MODE_INT)
8327 *cost += extra_cost->alu.arith;
8328 else
8329 *cost += extra_cost->fp[mode == DFmode].compare;
8331 return true;
8334 /* It's a conditional operation based on the status flags,
8335 so it must be some flavor of CSEL. */
8337 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8338 if (GET_CODE (op1) == NEG
8339 || GET_CODE (op1) == NOT
8340 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8341 op1 = XEXP (op1, 0);
8342 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8344 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8345 op1 = XEXP (op1, 0);
8346 op2 = XEXP (op2, 0);
8349 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8350 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8351 return true;
8354 /* We don't know what this is, cost all operands. */
8355 return false;
8358 /* Check whether X is a bitfield operation of the form shift + extend that
8359 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8360 operand to which the bitfield operation is applied. Otherwise return
8361 NULL_RTX. */
8363 static rtx
8364 aarch64_extend_bitfield_pattern_p (rtx x)
8366 rtx_code outer_code = GET_CODE (x);
8367 machine_mode outer_mode = GET_MODE (x);
8369 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8370 && outer_mode != SImode && outer_mode != DImode)
8371 return NULL_RTX;
8373 rtx inner = XEXP (x, 0);
8374 rtx_code inner_code = GET_CODE (inner);
8375 machine_mode inner_mode = GET_MODE (inner);
8376 rtx op = NULL_RTX;
8378 switch (inner_code)
8380 case ASHIFT:
8381 if (CONST_INT_P (XEXP (inner, 1))
8382 && (inner_mode == QImode || inner_mode == HImode))
8383 op = XEXP (inner, 0);
8384 break;
8385 case LSHIFTRT:
8386 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8387 && (inner_mode == QImode || inner_mode == HImode))
8388 op = XEXP (inner, 0);
8389 break;
8390 case ASHIFTRT:
8391 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8392 && (inner_mode == QImode || inner_mode == HImode))
8393 op = XEXP (inner, 0);
8394 break;
8395 default:
8396 break;
8399 return op;
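/* For example, (zero_extend:SI (lshiftrt:HI (reg:HI w0) (const_int 3)))
   returns the inner register, and the caller then costs the whole
   expression as a single UBFX.  */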
8402 /* Return true if the mask and a shift amount from an RTX of the form
8403 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8404 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8406 bool
8407 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8408 rtx shft_amnt)
8410 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8411 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8412 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8413 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
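/* Worked example for SImode: mask == 0xff0 and shft_amnt == 4 pass all
   the checks above ((0xff0 >> 4) + 1 == 0x100 is a power of two and the
   low 4 bits of the mask are clear), so (x << 4) & 0xff0 can become a
   single UBFIZ.  */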
8416 /* Calculate the cost of calculating X, storing it in *COST. Result
8417 is true if the total cost of the operation has now been calculated. */
8418 static bool
8419 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8420 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8422 rtx op0, op1, op2;
8423 const struct cpu_cost_table *extra_cost
8424 = aarch64_tune_params.insn_extra_cost;
8425 int code = GET_CODE (x);
8426 scalar_int_mode int_mode;
8428 /* By default, assume that everything has equivalent cost to the
8429 cheapest instruction. Any additional costs are applied as a delta
8430 above this default. */
8431 *cost = COSTS_N_INSNS (1);
8433 switch (code)
8435 case SET:
8436 /* The cost depends entirely on the operands to SET. */
8437 *cost = 0;
8438 op0 = SET_DEST (x);
8439 op1 = SET_SRC (x);
8441 switch (GET_CODE (op0))
8443 case MEM:
8444 if (speed)
8446 rtx address = XEXP (op0, 0);
8447 if (VECTOR_MODE_P (mode))
8448 *cost += extra_cost->ldst.storev;
8449 else if (GET_MODE_CLASS (mode) == MODE_INT)
8450 *cost += extra_cost->ldst.store;
8451 else if (mode == SFmode)
8452 *cost += extra_cost->ldst.storef;
8453 else if (mode == DFmode)
8454 *cost += extra_cost->ldst.stored;
8456 *cost +=
8457 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8458 0, speed));
8461 *cost += rtx_cost (op1, mode, SET, 1, speed);
8462 return true;
8464 case SUBREG:
8465 if (! REG_P (SUBREG_REG (op0)))
8466 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8468 /* Fall through. */
8469 case REG:
8470 /* The cost is one per vector-register copied. */
8471 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8473 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8474 *cost = COSTS_N_INSNS (nregs);
8476 /* const0_rtx is in general free, but we will use an
8477 instruction to set a register to 0. */
8478 else if (REG_P (op1) || op1 == const0_rtx)
8480 /* The cost is 1 per register copied. */
8481 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8482 *cost = COSTS_N_INSNS (nregs);
8484 else
8485 /* Cost is just the cost of the RHS of the set. */
8486 *cost += rtx_cost (op1, mode, SET, 1, speed);
8487 return true;
8489 case ZERO_EXTRACT:
8490 case SIGN_EXTRACT:
8491 /* Bit-field insertion. Strip any redundant widening of
8492 the RHS to meet the width of the target. */
8493 if (GET_CODE (op1) == SUBREG)
8494 op1 = SUBREG_REG (op1);
8495 if ((GET_CODE (op1) == ZERO_EXTEND
8496 || GET_CODE (op1) == SIGN_EXTEND)
8497 && CONST_INT_P (XEXP (op0, 1))
8498 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8499 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8500 op1 = XEXP (op1, 0);
8502 if (CONST_INT_P (op1))
8504 /* MOV immediate is assumed to always be cheap. */
8505 *cost = COSTS_N_INSNS (1);
8507 else
8509 /* BFM. */
8510 if (speed)
8511 *cost += extra_cost->alu.bfi;
8512 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8515 return true;
8517 default:
8518 /* We can't make sense of this, assume default cost. */
8519 *cost = COSTS_N_INSNS (1);
8520 return false;
8522 return false;
8524 case CONST_INT:
8525 /* If an instruction can incorporate a constant within the
8526 instruction, the instruction's expression avoids calling
8527 rtx_cost() on the constant. If rtx_cost() is called on a
8528 constant, then it is usually because the constant must be
8529 moved into a register by one or more instructions.
8531 The exception is constant 0, which can be expressed
8532 as XZR/WZR and is therefore free. The exception to this is
8533 if we have (set (reg) (const0_rtx)) in which case we must cost
8534 the move. However, we can catch that when we cost the SET, so
8535 we don't need to consider that here. */
8536 if (x == const0_rtx)
8537 *cost = 0;
8538 else
8540 /* To an approximation, building any other constant is
8541 proportionally expensive to the number of instructions
8542 required to build that constant. This is true whether we
8543 are compiling for SPEED or otherwise. */
8544 if (!is_a <scalar_int_mode> (mode, &int_mode))
8545 int_mode = word_mode;
8546 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8547 (NULL_RTX, x, false, int_mode));
8549 return true;
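/* For instance, a constant such as 0x12345678 that needs roughly a
   MOVZ/MOVK pair is costed here as COSTS_N_INSNS (2), while 0 costs
   nothing.  */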
8551 case CONST_DOUBLE:
8553 /* First determine number of instructions to do the move
8554 as an integer constant. */
8555 if (!aarch64_float_const_representable_p (x)
8556 && !aarch64_can_const_movi_rtx_p (x, mode)
8557 && aarch64_float_const_rtx_p (x))
8559 unsigned HOST_WIDE_INT ival;
8560 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8561 gcc_assert (succeed);
8563 scalar_int_mode imode = (mode == HFmode
8564 ? SImode
8565 : int_mode_for_mode (mode).require ());
8566 int ncost = aarch64_internal_mov_immediate
8567 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8568 *cost += COSTS_N_INSNS (ncost);
8569 return true;
8572 if (speed)
8574 /* mov[df,sf]_aarch64. */
8575 if (aarch64_float_const_representable_p (x))
8576 /* FMOV (scalar immediate). */
8577 *cost += extra_cost->fp[mode == DFmode].fpconst;
8578 else if (!aarch64_float_const_zero_rtx_p (x))
8580 /* This will be a load from memory. */
8581 if (mode == DFmode)
8582 *cost += extra_cost->ldst.loadd;
8583 else
8584 *cost += extra_cost->ldst.loadf;
8586 else
8587 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8588 or MOV v0.s[0], wzr - neither of which is modeled by the
8589 cost tables. Just use the default cost. */
8594 return true;
8596 case MEM:
8597 if (speed)
8599 /* For loads we want the base cost of a load, plus an
8600 approximation for the additional cost of the addressing
8601 mode. */
8602 rtx address = XEXP (x, 0);
8603 if (VECTOR_MODE_P (mode))
8604 *cost += extra_cost->ldst.loadv;
8605 else if (GET_MODE_CLASS (mode) == MODE_INT)
8606 *cost += extra_cost->ldst.load;
8607 else if (mode == SFmode)
8608 *cost += extra_cost->ldst.loadf;
8609 else if (mode == DFmode)
8610 *cost += extra_cost->ldst.loadd;
8612 *cost +=
8613 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8614 0, speed));
8617 return true;
8619 case NEG:
8620 op0 = XEXP (x, 0);
8622 if (VECTOR_MODE_P (mode))
8624 if (speed)
8626 /* FNEG. */
8627 *cost += extra_cost->vect.alu;
8629 return false;
8632 if (GET_MODE_CLASS (mode) == MODE_INT)
8634 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8635 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8637 /* CSETM. */
8638 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8639 return true;
8642 /* Cost this as SUB wzr, X. */
8643 op0 = CONST0_RTX (mode);
8644 op1 = XEXP (x, 0);
8645 goto cost_minus;
8648 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8650 /* Support (neg(fma...)) as a single instruction only if
8651 sign of zeros is unimportant. This matches the decision
8652 making in aarch64.md. */
8653 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8655 /* FNMADD. */
8656 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8657 return true;
8659 if (GET_CODE (op0) == MULT)
8661 /* FNMUL. */
8662 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8663 return true;
8665 if (speed)
8666 /* FNEG. */
8667 *cost += extra_cost->fp[mode == DFmode].neg;
8668 return false;
8671 return false;
8673 case CLRSB:
8674 case CLZ:
8675 if (speed)
8677 if (VECTOR_MODE_P (mode))
8678 *cost += extra_cost->vect.alu;
8679 else
8680 *cost += extra_cost->alu.clz;
8683 return false;
8685 case COMPARE:
8686 op0 = XEXP (x, 0);
8687 op1 = XEXP (x, 1);
8689 if (op1 == const0_rtx
8690 && GET_CODE (op0) == AND)
8692 x = op0;
8693 mode = GET_MODE (op0);
8694 goto cost_logic;
8697 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8699 /* TODO: A write to the CC flags possibly costs extra, this
8700 needs encoding in the cost tables. */
8702 mode = GET_MODE (op0);
8703 /* ANDS. */
8704 if (GET_CODE (op0) == AND)
8706 x = op0;
8707 goto cost_logic;
8710 if (GET_CODE (op0) == PLUS)
8712 /* ADDS (and CMN alias). */
8713 x = op0;
8714 goto cost_plus;
8717 if (GET_CODE (op0) == MINUS)
8719 /* SUBS. */
8720 x = op0;
8721 goto cost_minus;
8724 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8725 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8726 && CONST_INT_P (XEXP (op0, 2)))
8728 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8729 Handle it here directly rather than going to cost_logic
8730 since we know the immediate generated for the TST is valid
8731 so we can avoid creating an intermediate rtx for it only
8732 for costing purposes. */
8733 if (speed)
8734 *cost += extra_cost->alu.logical;
8736 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8737 ZERO_EXTRACT, 0, speed);
8738 return true;
8741 if (GET_CODE (op1) == NEG)
8743 /* CMN. */
8744 if (speed)
8745 *cost += extra_cost->alu.arith;
8747 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8748 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8749 return true;
8752 /* CMP.
8754 Compare can freely swap the order of operands, and
8755 canonicalization puts the more complex operation first.
8756 But the integer MINUS logic expects the shift/extend
8757 operation in op1. */
8758 if (! (REG_P (op0)
8759 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8761 op0 = XEXP (x, 1);
8762 op1 = XEXP (x, 0);
8764 goto cost_minus;
8767 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8769 /* FCMP. */
8770 if (speed)
8771 *cost += extra_cost->fp[mode == DFmode].compare;
8773 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8775 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8776 /* FCMP supports constant 0.0 for no extra cost. */
8777 return true;
8779 return false;
8782 if (VECTOR_MODE_P (mode))
8784 /* Vector compare. */
8785 if (speed)
8786 *cost += extra_cost->vect.alu;
8788 if (aarch64_float_const_zero_rtx_p (op1))
8790 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8791 cost. */
8792 return true;
8794 return false;
8796 return false;
8798 case MINUS:
8800 op0 = XEXP (x, 0);
8801 op1 = XEXP (x, 1);
8803 cost_minus:
8804 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8806 /* Detect valid immediates. */
8807 if ((GET_MODE_CLASS (mode) == MODE_INT
8808 || (GET_MODE_CLASS (mode) == MODE_CC
8809 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8810 && CONST_INT_P (op1)
8811 && aarch64_uimm12_shift (INTVAL (op1)))
8813 if (speed)
8814 /* SUB(S) (immediate). */
8815 *cost += extra_cost->alu.arith;
8816 return true;
8819 /* Look for SUB (extended register). */
8820 if (is_a <scalar_int_mode> (mode, &int_mode)
8821 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8823 if (speed)
8824 *cost += extra_cost->alu.extend_arith;
8826 op1 = aarch64_strip_extend (op1, true);
8827 *cost += rtx_cost (op1, VOIDmode,
8828 (enum rtx_code) GET_CODE (op1), 0, speed);
8829 return true;
8832 rtx new_op1 = aarch64_strip_extend (op1, false);
8834 /* Cost this as an FMA-alike operation. */
8835 if ((GET_CODE (new_op1) == MULT
8836 || aarch64_shift_p (GET_CODE (new_op1)))
8837 && code != COMPARE)
8839 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8840 (enum rtx_code) code,
8841 speed);
8842 return true;
8845 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8847 if (speed)
8849 if (VECTOR_MODE_P (mode))
8851 /* Vector SUB. */
8852 *cost += extra_cost->vect.alu;
8854 else if (GET_MODE_CLASS (mode) == MODE_INT)
8856 /* SUB(S). */
8857 *cost += extra_cost->alu.arith;
8859 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8861 /* FSUB. */
8862 *cost += extra_cost->fp[mode == DFmode].addsub;
8865 return true;
8868 case PLUS:
8870 rtx new_op0;
8872 op0 = XEXP (x, 0);
8873 op1 = XEXP (x, 1);
8875 cost_plus:
8876 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8877 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8879 /* CSINC. */
8880 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8881 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8882 return true;
8885 if (GET_MODE_CLASS (mode) == MODE_INT
8886 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8887 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8889 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8891 if (speed)
8892 /* ADD (immediate). */
8893 *cost += extra_cost->alu.arith;
8894 return true;
8897 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8899 /* Look for ADD (extended register). */
8900 if (is_a <scalar_int_mode> (mode, &int_mode)
8901 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8903 if (speed)
8904 *cost += extra_cost->alu.extend_arith;
8906 op0 = aarch64_strip_extend (op0, true);
8907 *cost += rtx_cost (op0, VOIDmode,
8908 (enum rtx_code) GET_CODE (op0), 0, speed);
8909 return true;
8912 /* Strip any extend, leave shifts behind as we will
8913 cost them through mult_cost. */
8914 new_op0 = aarch64_strip_extend (op0, false);
8916 if (GET_CODE (new_op0) == MULT
8917 || aarch64_shift_p (GET_CODE (new_op0)))
8919 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8920 speed);
8921 return true;
8924 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8926 if (speed)
8928 if (VECTOR_MODE_P (mode))
8930 /* Vector ADD. */
8931 *cost += extra_cost->vect.alu;
8933 else if (GET_MODE_CLASS (mode) == MODE_INT)
8935 /* ADD. */
8936 *cost += extra_cost->alu.arith;
8938 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8940 /* FADD. */
8941 *cost += extra_cost->fp[mode == DFmode].addsub;
8944 return true;
8947 case BSWAP:
8948 *cost = COSTS_N_INSNS (1);
8950 if (speed)
8952 if (VECTOR_MODE_P (mode))
8953 *cost += extra_cost->vect.alu;
8954 else
8955 *cost += extra_cost->alu.rev;
8957 return false;
8959 case IOR:
8960 if (aarch_rev16_p (x))
8962 *cost = COSTS_N_INSNS (1);
8964 if (speed)
8966 if (VECTOR_MODE_P (mode))
8967 *cost += extra_cost->vect.alu;
8968 else
8969 *cost += extra_cost->alu.rev;
8971 return true;
8974 if (aarch64_extr_rtx_p (x, &op0, &op1))
8976 *cost += rtx_cost (op0, mode, IOR, 0, speed);
8977 *cost += rtx_cost (op1, mode, IOR, 1, speed);
8978 if (speed)
8979 *cost += extra_cost->alu.shift;
8981 return true;
8983 /* Fall through. */
8984 case XOR:
8985 case AND:
8986 cost_logic:
8987 op0 = XEXP (x, 0);
8988 op1 = XEXP (x, 1);
8990 if (VECTOR_MODE_P (mode))
8992 if (speed)
8993 *cost += extra_cost->vect.alu;
8994 return true;
8997 if (code == AND
8998 && GET_CODE (op0) == MULT
8999 && CONST_INT_P (XEXP (op0, 1))
9000 && CONST_INT_P (op1)
9001 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9002 INTVAL (op1)) != 0)
9004 /* This is a UBFM/SBFM. */
9005 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9006 if (speed)
9007 *cost += extra_cost->alu.bfx;
9008 return true;
9011 if (is_int_mode (mode, &int_mode))
9013 if (CONST_INT_P (op1))
9015 /* We have a mask + shift version of a UBFIZ
9016 i.e. the *andim_ashift<mode>_bfiz pattern. */
9017 if (GET_CODE (op0) == ASHIFT
9018 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9019 XEXP (op0, 1)))
9021 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9022 (enum rtx_code) code, 0, speed);
9023 if (speed)
9024 *cost += extra_cost->alu.bfx;
9026 return true;
9028 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9030 /* We possibly get the immediate for free, this is not
9031 modelled. */
9032 *cost += rtx_cost (op0, int_mode,
9033 (enum rtx_code) code, 0, speed);
9034 if (speed)
9035 *cost += extra_cost->alu.logical;
9037 return true;
9040 else
9042 rtx new_op0 = op0;
9044 /* Handle ORN, EON, or BIC. */
9045 if (GET_CODE (op0) == NOT)
9046 op0 = XEXP (op0, 0);
9048 new_op0 = aarch64_strip_shift (op0);
9050 /* If we had a shift on op0 then this is a logical-shift-
9051 by-register/immediate operation. Otherwise, this is just
9052 a logical operation. */
9053 if (speed)
9055 if (new_op0 != op0)
9057 /* Shift by immediate. */
9058 if (CONST_INT_P (XEXP (op0, 1)))
9059 *cost += extra_cost->alu.log_shift;
9060 else
9061 *cost += extra_cost->alu.log_shift_reg;
9063 else
9064 *cost += extra_cost->alu.logical;
9067 /* In both cases we want to cost both operands. */
9068 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9069 0, speed);
9070 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9071 1, speed);
9073 return true;
9076 return false;
9078 case NOT:
9079 x = XEXP (x, 0);
9080 op0 = aarch64_strip_shift (x);
9082 if (VECTOR_MODE_P (mode))
9084 /* Vector NOT. */
9085 *cost += extra_cost->vect.alu;
9086 return false;
9089 /* MVN-shifted-reg. */
9090 if (op0 != x)
9092 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9094 if (speed)
9095 *cost += extra_cost->alu.log_shift;
9097 return true;
9099 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9100 Handle the second form here taking care that 'a' in the above can
9101 be a shift. */
9102 else if (GET_CODE (op0) == XOR)
9104 rtx newop0 = XEXP (op0, 0);
9105 rtx newop1 = XEXP (op0, 1);
9106 rtx op0_stripped = aarch64_strip_shift (newop0);
9108 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9109 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9111 if (speed)
9113 if (op0_stripped != newop0)
9114 *cost += extra_cost->alu.log_shift;
9115 else
9116 *cost += extra_cost->alu.logical;
9119 return true;
9121 /* MVN. */
9122 if (speed)
9123 *cost += extra_cost->alu.logical;
9125 return false;
9127 case ZERO_EXTEND:
9129 op0 = XEXP (x, 0);
9130 /* If a value is written in SI mode, then zero extended to DI
9131 mode, the operation will in general be free as a write to
9132 a 'w' register implicitly zeroes the upper bits of an 'x'
9133 register. However, if this is
9135 (set (reg) (zero_extend (reg)))
9137 we must cost the explicit register move. */
9138 if (mode == DImode
9139 && GET_MODE (op0) == SImode
9140 && outer == SET)
9142 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9144 /* If OP_COST is non-zero, then the cost of the zero extend
9145 is effectively the cost of the inner operation. Otherwise
9146 we have a MOV instruction and we take the cost from the MOV
9147 itself. This is true independently of whether we are
9148 optimizing for space or time. */
9149 if (op_cost)
9150 *cost = op_cost;
9152 return true;
9154 else if (MEM_P (op0))
9156 /* All loads can zero extend to any size for free. */
9157 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9158 return true;
9161 op0 = aarch64_extend_bitfield_pattern_p (x);
9162 if (op0)
9164 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9165 if (speed)
9166 *cost += extra_cost->alu.bfx;
9167 return true;
9170 if (speed)
9172 if (VECTOR_MODE_P (mode))
9174 /* UMOV. */
9175 *cost += extra_cost->vect.alu;
9177 else
9179 /* We generate an AND instead of UXTB/UXTH. */
9180 *cost += extra_cost->alu.logical;
9183 return false;
9185 case SIGN_EXTEND:
9186 if (MEM_P (XEXP (x, 0)))
9188 /* LDRSH. */
9189 if (speed)
9191 rtx address = XEXP (XEXP (x, 0), 0);
9192 *cost += extra_cost->ldst.load_sign_extend;
9194 *cost +=
9195 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9196 0, speed));
9198 return true;
9201 op0 = aarch64_extend_bitfield_pattern_p (x);
9202 if (op0)
9204 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9205 if (speed)
9206 *cost += extra_cost->alu.bfx;
9207 return true;
9210 if (speed)
9212 if (VECTOR_MODE_P (mode))
9213 *cost += extra_cost->vect.alu;
9214 else
9215 *cost += extra_cost->alu.extend;
9217 return false;
9219 case ASHIFT:
9220 op0 = XEXP (x, 0);
9221 op1 = XEXP (x, 1);
9223 if (CONST_INT_P (op1))
9225 if (speed)
9227 if (VECTOR_MODE_P (mode))
9229 /* Vector shift (immediate). */
9230 *cost += extra_cost->vect.alu;
9232 else
9234 /* LSL (immediate), UBMF, UBFIZ and friends. These are all
9235 aliases. */
9236 *cost += extra_cost->alu.shift;
9240 /* We can incorporate zero/sign extend for free. */
9241 if (GET_CODE (op0) == ZERO_EXTEND
9242 || GET_CODE (op0) == SIGN_EXTEND)
9243 op0 = XEXP (op0, 0);
9245 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9246 return true;
9248 else
9250 if (VECTOR_MODE_P (mode))
9252 if (speed)
9253 /* Vector shift (register). */
9254 *cost += extra_cost->vect.alu;
9256 else
9258 if (speed)
9259 /* LSLV. */
9260 *cost += extra_cost->alu.shift_reg;
9262 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9263 && CONST_INT_P (XEXP (op1, 1))
9264 && known_eq (INTVAL (XEXP (op1, 1)),
9265 GET_MODE_BITSIZE (mode) - 1))
9267 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9268 /* We already demanded XEXP (op1, 0) to be REG_P, so
9269 don't recurse into it. */
9270 return true;
9273 return false; /* All arguments need to be in registers. */
9276 case ROTATE:
9277 case ROTATERT:
9278 case LSHIFTRT:
9279 case ASHIFTRT:
9280 op0 = XEXP (x, 0);
9281 op1 = XEXP (x, 1);
9283 if (CONST_INT_P (op1))
9285 /* ASR (immediate) and friends. */
9286 if (speed)
9288 if (VECTOR_MODE_P (mode))
9289 *cost += extra_cost->vect.alu;
9290 else
9291 *cost += extra_cost->alu.shift;
9294 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9295 return true;
9297 else
9299 if (VECTOR_MODE_P (mode))
9301 if (speed)
9302 /* Vector shift (register). */
9303 *cost += extra_cost->vect.alu;
9305 else
9307 if (speed)
9308 /* ASR (register) and friends. */
9309 *cost += extra_cost->alu.shift_reg;
9311 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9312 && CONST_INT_P (XEXP (op1, 1))
9313 && known_eq (INTVAL (XEXP (op1, 1)),
9314 GET_MODE_BITSIZE (mode) - 1))
9316 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9317 /* We already demanded XEXP (op1, 0) to be REG_P, so
9318 don't recurse into it. */
9319 return true;
9322 return false; /* All arguments need to be in registers. */
9325 case SYMBOL_REF:
9327 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9328 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9330 /* LDR. */
9331 if (speed)
9332 *cost += extra_cost->ldst.load;
9334 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9335 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9337 /* ADRP, followed by ADD. */
9338 *cost += COSTS_N_INSNS (1);
9339 if (speed)
9340 *cost += 2 * extra_cost->alu.arith;
9342 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9343 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9345 /* ADR. */
9346 if (speed)
9347 *cost += extra_cost->alu.arith;
9350 if (flag_pic)
9352 /* One extra load instruction, after accessing the GOT. */
9353 *cost += COSTS_N_INSNS (1);
9354 if (speed)
9355 *cost += extra_cost->ldst.load;
9357 return true;
9359 case HIGH:
9360 case LO_SUM:
9361 /* ADRP/ADD (immediate). */
9362 if (speed)
9363 *cost += extra_cost->alu.arith;
9364 return true;
9366 case ZERO_EXTRACT:
9367 case SIGN_EXTRACT:
9368 /* UBFX/SBFX. */
9369 if (speed)
9371 if (VECTOR_MODE_P (mode))
9372 *cost += extra_cost->vect.alu;
9373 else
9374 *cost += extra_cost->alu.bfx;
9377 /* We can trust that the immediates used will be correct (there
9378 are no by-register forms), so we need only cost op0. */
9379 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9380 return true;
9382 case MULT:
9383 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9384 /* aarch64_rtx_mult_cost always handles recursion to its
9385 operands. */
9386 return true;
9388 case MOD:
9389 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9390 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
9391 an unconditional negate. This case should only ever be reached through
9392 the set_smod_pow2_cheap check in expmed.c. */
9393 if (CONST_INT_P (XEXP (x, 1))
9394 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9395 && (mode == SImode || mode == DImode))
9397 /* We expand to 4 instructions. Reset the baseline. */
9398 *cost = COSTS_N_INSNS (4);
9400 if (speed)
9401 *cost += 2 * extra_cost->alu.logical
9402 + 2 * extra_cost->alu.arith;
9404 return true;
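/* For example, a signed SImode "x % 8" is expanded as the
   NEGS/AND/AND/CSNEG sequence described above, hence the 4-instruction
   baseline.  */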
9407 /* Fall-through. */
9408 case UMOD:
9409 if (speed)
9411 /* Slightly prefer UMOD over SMOD. */
9412 if (VECTOR_MODE_P (mode))
9413 *cost += extra_cost->vect.alu;
9414 else if (GET_MODE_CLASS (mode) == MODE_INT)
9415 *cost += (extra_cost->mult[mode == DImode].add
9416 + extra_cost->mult[mode == DImode].idiv
9417 + (code == MOD ? 1 : 0));
9419 return false; /* All arguments need to be in registers. */
9421 case DIV:
9422 case UDIV:
9423 case SQRT:
9424 if (speed)
9426 if (VECTOR_MODE_P (mode))
9427 *cost += extra_cost->vect.alu;
9428 else if (GET_MODE_CLASS (mode) == MODE_INT)
9429 /* There is no integer SQRT, so only DIV and UDIV can get
9430 here. */
9431 *cost += (extra_cost->mult[mode == DImode].idiv
9432 /* Slightly prefer UDIV over SDIV. */
9433 + (code == DIV ? 1 : 0));
9434 else
9435 *cost += extra_cost->fp[mode == DFmode].div;
9437 return false; /* All arguments need to be in registers. */
9439 case IF_THEN_ELSE:
9440 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9441 XEXP (x, 2), cost, speed);
9443 case EQ:
9444 case NE:
9445 case GT:
9446 case GTU:
9447 case LT:
9448 case LTU:
9449 case GE:
9450 case GEU:
9451 case LE:
9452 case LEU:
9454 return false; /* All arguments must be in registers. */
9456 case FMA:
9457 op0 = XEXP (x, 0);
9458 op1 = XEXP (x, 1);
9459 op2 = XEXP (x, 2);
9461 if (speed)
9463 if (VECTOR_MODE_P (mode))
9464 *cost += extra_cost->vect.alu;
9465 else
9466 *cost += extra_cost->fp[mode == DFmode].fma;
9469 /* FMSUB, FNMADD, and FNMSUB are free. */
9470 if (GET_CODE (op0) == NEG)
9471 op0 = XEXP (op0, 0);
9473 if (GET_CODE (op2) == NEG)
9474 op2 = XEXP (op2, 0);
9476 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9477 and the by-element operand as operand 0. */
9478 if (GET_CODE (op1) == NEG)
9479 op1 = XEXP (op1, 0);
9481 /* Catch vector-by-element operations. The by-element operand can
9482 either be (vec_duplicate (vec_select (x))) or just
9483 (vec_select (x)), depending on whether we are multiplying by
9484 a vector or a scalar.
9486 Canonicalization is not very good in these cases: FMA4 will put the
9487 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9488 if (GET_CODE (op0) == VEC_DUPLICATE)
9489 op0 = XEXP (op0, 0);
9490 else if (GET_CODE (op1) == VEC_DUPLICATE)
9491 op1 = XEXP (op1, 0);
9493 if (GET_CODE (op0) == VEC_SELECT)
9494 op0 = XEXP (op0, 0);
9495 else if (GET_CODE (op1) == VEC_SELECT)
9496 op1 = XEXP (op1, 0);
9498 /* If the remaining parameters are not registers,
9499 get the cost to put them into registers. */
9500 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9501 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9502 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9503 return true;
9505 case FLOAT:
9506 case UNSIGNED_FLOAT:
9507 if (speed)
9508 *cost += extra_cost->fp[mode == DFmode].fromint;
9509 return false;
9511 case FLOAT_EXTEND:
9512 if (speed)
9514 if (VECTOR_MODE_P (mode))
9516 /* Vector widening conversion. */
9517 *cost += extra_cost->vect.alu;
9519 else
9520 *cost += extra_cost->fp[mode == DFmode].widen;
9522 return false;
9524 case FLOAT_TRUNCATE:
9525 if (speed)
9527 if (VECTOR_MODE_P (mode))
9529 /* Vector narrowing conversion. */
9530 *cost += extra_cost->vect.alu;
9532 else
9533 *cost += extra_cost->fp[mode == DFmode].narrow;
9535 return false;
9537 case FIX:
9538 case UNSIGNED_FIX:
9539 x = XEXP (x, 0);
9540 /* Strip the rounding part. They will all be implemented
9541 by the fcvt* family of instructions anyway. */
9542 if (GET_CODE (x) == UNSPEC)
9544 unsigned int uns_code = XINT (x, 1);
9546 if (uns_code == UNSPEC_FRINTA
9547 || uns_code == UNSPEC_FRINTM
9548 || uns_code == UNSPEC_FRINTN
9549 || uns_code == UNSPEC_FRINTP
9550 || uns_code == UNSPEC_FRINTZ)
9551 x = XVECEXP (x, 0, 0);
9554 if (speed)
9556 if (VECTOR_MODE_P (mode))
9557 *cost += extra_cost->vect.alu;
9558 else
9559 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9562 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9563 fixed-point fcvt. */
9564 if (GET_CODE (x) == MULT
9565 && ((VECTOR_MODE_P (mode)
9566 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9567 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9569 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9570 0, speed);
9571 return true;
9574 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9575 return true;
9577 case ABS:
9578 if (VECTOR_MODE_P (mode))
9580 /* ABS (vector). */
9581 if (speed)
9582 *cost += extra_cost->vect.alu;
9584 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9586 op0 = XEXP (x, 0);
9588 /* FABD, which is analogous to FADD. */
9589 if (GET_CODE (op0) == MINUS)
9591 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9592 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9593 if (speed)
9594 *cost += extra_cost->fp[mode == DFmode].addsub;
9596 return true;
9598 /* Simple FABS is analogous to FNEG. */
9599 if (speed)
9600 *cost += extra_cost->fp[mode == DFmode].neg;
9602 else
9604 /* Integer ABS will either be split into
9605 two arithmetic instructions, or will be an ABS
9606 (scalar), which we don't model. */
9607 *cost = COSTS_N_INSNS (2);
9608 if (speed)
9609 *cost += 2 * extra_cost->alu.arith;
9611 return false;
9613 case SMAX:
9614 case SMIN:
9615 if (speed)
9617 if (VECTOR_MODE_P (mode))
9618 *cost += extra_cost->vect.alu;
9619 else
9621 /* FMAXNM/FMINNM/FMAX/FMIN.
9622 TODO: This may not be accurate for all implementations, but
9623 we do not model this in the cost tables. */
9624 *cost += extra_cost->fp[mode == DFmode].addsub;
9627 return false;
9629 case UNSPEC:
9630 /* The floating point round to integer frint* instructions. */
9631 if (aarch64_frint_unspec_p (XINT (x, 1)))
9633 if (speed)
9634 *cost += extra_cost->fp[mode == DFmode].roundint;
9636 return false;
9639 if (XINT (x, 1) == UNSPEC_RBIT)
9641 if (speed)
9642 *cost += extra_cost->alu.rev;
9644 return false;
9646 break;
9648 case TRUNCATE:
9650 /* Decompose <su>muldi3_highpart. */
9651 if (/* (truncate:DI */
9652 mode == DImode
9653 /* (lshiftrt:TI */
9654 && GET_MODE (XEXP (x, 0)) == TImode
9655 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9656 /* (mult:TI */
9657 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9658 /* (ANY_EXTEND:TI (reg:DI))
9659 (ANY_EXTEND:TI (reg:DI))) */
9660 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9661 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9662 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9663 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9664 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9665 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9666 /* (const_int 64) */
9667 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9668 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9670 /* UMULH/SMULH. */
9671 if (speed)
9672 *cost += extra_cost->mult[mode == DImode].extend;
9673 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9674 mode, MULT, 0, speed);
9675 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9676 mode, MULT, 1, speed);
9677 return true;
9680 /* Fall through. */
9681 default:
9682 break;
9685 if (dump_file
9686 && flag_aarch64_verbose_cost)
9687 fprintf (dump_file,
9688 "\nFailed to cost RTX. Assuming default cost.\n");
9690 return true;
9693 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9694 calculated for X. This cost is stored in *COST. Returns true
9695 if the total cost of X was calculated. */
9696 static bool
9697 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9698 int param, int *cost, bool speed)
9700 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9702 if (dump_file
9703 && flag_aarch64_verbose_cost)
9705 print_rtl_single (dump_file, x);
9706 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9707 speed ? "Hot" : "Cold",
9708 *cost, result ? "final" : "partial");
9711 return result;
9714 static int
9715 aarch64_register_move_cost (machine_mode mode,
9716 reg_class_t from_i, reg_class_t to_i)
9718 enum reg_class from = (enum reg_class) from_i;
9719 enum reg_class to = (enum reg_class) to_i;
9720 const struct cpu_regmove_cost *regmove_cost
9721 = aarch64_tune_params.regmove_cost;
9723 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9724 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9725 to = GENERAL_REGS;
9727 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9728 from = GENERAL_REGS;
9730 /* Moving between GPR and stack cost is the same as GP2GP. */
9731 if ((from == GENERAL_REGS && to == STACK_REG)
9732 || (to == GENERAL_REGS && from == STACK_REG))
9733 return regmove_cost->GP2GP;
9735 /* To/From the stack register, we move via the gprs. */
9736 if (to == STACK_REG || from == STACK_REG)
9737 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9738 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9740 if (known_eq (GET_MODE_SIZE (mode), 16))
9742 /* 128-bit operations on general registers require 2 instructions. */
9743 if (from == GENERAL_REGS && to == GENERAL_REGS)
9744 return regmove_cost->GP2GP * 2;
9745 else if (from == GENERAL_REGS)
9746 return regmove_cost->GP2FP * 2;
9747 else if (to == GENERAL_REGS)
9748 return regmove_cost->FP2GP * 2;
9750 /* When AdvSIMD instructions are disabled it is not possible to move
9751 a 128-bit value directly between Q registers. This is handled in
9752 secondary reload. A general register is used as a scratch to move
9753 the upper DI value and the lower DI value is moved directly,
9754 hence the cost is the sum of three moves. */
9755 if (! TARGET_SIMD)
9756 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9758 return regmove_cost->FP2FP;
9761 if (from == GENERAL_REGS && to == GENERAL_REGS)
9762 return regmove_cost->GP2GP;
9763 else if (from == GENERAL_REGS)
9764 return regmove_cost->GP2FP;
9765 else if (to == GENERAL_REGS)
9766 return regmove_cost->FP2GP;
9768 return regmove_cost->FP2FP;
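/* Illustration: a 16-byte (e.g. TImode) move from GENERAL_REGS to
   FP_REGS is costed as 2 * GP2FP, since it needs two 64-bit transfers,
   while the same 128-bit move between FP registers without TARGET_SIMD
   is GP2FP + FP2GP + FP2FP because it must bounce through a general
   register.  */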
9771 static int
9772 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9773 reg_class_t rclass ATTRIBUTE_UNUSED,
9774 bool in ATTRIBUTE_UNUSED)
9776 return aarch64_tune_params.memmov_cost;
9779 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9780 to optimize 1.0/sqrt. */
9782 static bool
9783 use_rsqrt_p (machine_mode mode)
9785 return (!flag_trapping_math
9786 && flag_unsafe_math_optimizations
9787 && ((aarch64_tune_params.approx_modes->recip_sqrt
9788 & AARCH64_APPROX_MODE (mode))
9789 || flag_mrecip_low_precision_sqrt));
9792 /* Function to decide when to use the approximate reciprocal square root
9793 builtin. */
9795 static tree
9796 aarch64_builtin_reciprocal (tree fndecl)
9798 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9800 if (!use_rsqrt_p (mode))
9801 return NULL_TREE;
9802 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9805 typedef rtx (*rsqrte_type) (rtx, rtx);
9807 /* Select reciprocal square root initial estimate insn depending on machine
9808 mode. */
9810 static rsqrte_type
9811 get_rsqrte_type (machine_mode mode)
9813 switch (mode)
9815 case E_DFmode: return gen_aarch64_rsqrtedf;
9816 case E_SFmode: return gen_aarch64_rsqrtesf;
9817 case E_V2DFmode: return gen_aarch64_rsqrtev2df;
9818 case E_V2SFmode: return gen_aarch64_rsqrtev2sf;
9819 case E_V4SFmode: return gen_aarch64_rsqrtev4sf;
9820 default: gcc_unreachable ();
9824 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
9826 /* Select reciprocal square root series step insn depending on machine mode. */
9828 static rsqrts_type
9829 get_rsqrts_type (machine_mode mode)
9831 switch (mode)
9833 case E_DFmode: return gen_aarch64_rsqrtsdf;
9834 case E_SFmode: return gen_aarch64_rsqrtssf;
9835 case E_V2DFmode: return gen_aarch64_rsqrtsv2df;
9836 case E_V2SFmode: return gen_aarch64_rsqrtsv2sf;
9837 case E_V4SFmode: return gen_aarch64_rsqrtsv4sf;
9838 default: gcc_unreachable ();
9842 /* Emit instruction sequence to compute either the approximate square root
9843 or its approximate reciprocal, depending on the flag RECP, and return
9844 whether the sequence was emitted or not. */
9846 bool
9847 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9849 machine_mode mode = GET_MODE (dst);
9851 if (GET_MODE_INNER (mode) == HFmode)
9853 gcc_assert (!recp);
9854 return false;
9857 if (!recp)
9859 if (!(flag_mlow_precision_sqrt
9860 || (aarch64_tune_params.approx_modes->sqrt
9861 & AARCH64_APPROX_MODE (mode))))
9862 return false;
9864 if (flag_finite_math_only
9865 || flag_trapping_math
9866 || !flag_unsafe_math_optimizations
9867 || optimize_function_for_size_p (cfun))
9868 return false;
9870 else
9871 /* Caller assumes we cannot fail. */
9872 gcc_assert (use_rsqrt_p (mode));
9874 machine_mode mmsk = mode_for_int_vector (mode).require ();
9875 rtx xmsk = gen_reg_rtx (mmsk);
9876 if (!recp)
9877 /* When calculating the approximate square root, compare the
9878 argument with 0.0 and create a mask. */
9879 emit_insn (gen_rtx_SET (xmsk,
9880 gen_rtx_NEG (mmsk,
9881 gen_rtx_EQ (mmsk, src,
9882 CONST0_RTX (mode)))));
9884 /* Estimate the approximate reciprocal square root. */
9885 rtx xdst = gen_reg_rtx (mode);
9886 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
9888 /* Iterate over the series twice for SF and thrice for DF. */
9889 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9891 /* Optionally run one iteration fewer for faster performance at the
9892 cost of some accuracy. */
9893 if ((recp && flag_mrecip_low_precision_sqrt)
9894 || (!recp && flag_mlow_precision_sqrt))
9895 iterations--;
9897 /* Iterate over the series to calculate the approximate reciprocal square
9898 root. */
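/* Each pass of the loop below is, in effect, one Newton-Raphson step for
   1/sqrt(d): assuming the architectural FRSQRTS semantics of
   (3 - a * b) / 2, the step computes x' = x * (3 - d * x * x) / 2 for the
   current estimate x.  The multiply that completes the last step is
   folded into the finalization further down.  */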
9899 rtx x1 = gen_reg_rtx (mode);
9900 while (iterations--)
9902 rtx x2 = gen_reg_rtx (mode);
9903 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9905 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
9907 if (iterations > 0)
9908 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9911 if (!recp)
9913 /* Qualify the approximate reciprocal square root when the argument is
9914 0.0 by squashing the intermediate result to 0.0. */
9915 rtx xtmp = gen_reg_rtx (mmsk);
9916 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9917 gen_rtx_SUBREG (mmsk, xdst, 0)));
9918 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9920 /* Calculate the approximate square root. */
9921 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9924 /* Finalize the approximation. */
9925 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9927 return true;
9930 typedef rtx (*recpe_type) (rtx, rtx);
9932 /* Select reciprocal initial estimate insn depending on machine mode. */
9934 static recpe_type
9935 get_recpe_type (machine_mode mode)
9937 switch (mode)
9939 case E_SFmode: return (gen_aarch64_frecpesf);
9940 case E_V2SFmode: return (gen_aarch64_frecpev2sf);
9941 case E_V4SFmode: return (gen_aarch64_frecpev4sf);
9942 case E_DFmode: return (gen_aarch64_frecpedf);
9943 case E_V2DFmode: return (gen_aarch64_frecpev2df);
9944 default: gcc_unreachable ();
9948 typedef rtx (*recps_type) (rtx, rtx, rtx);
9950 /* Select reciprocal series step insn depending on machine mode. */
9952 static recps_type
9953 get_recps_type (machine_mode mode)
9955 switch (mode)
9957 case E_SFmode: return (gen_aarch64_frecpssf);
9958 case E_V2SFmode: return (gen_aarch64_frecpsv2sf);
9959 case E_V4SFmode: return (gen_aarch64_frecpsv4sf);
9960 case E_DFmode: return (gen_aarch64_frecpsdf);
9961 case E_V2DFmode: return (gen_aarch64_frecpsv2df);
9962 default: gcc_unreachable ();
9966 /* Emit the instruction sequence to compute the approximation for the division
9967 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9969 bool
9970 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9972 machine_mode mode = GET_MODE (quo);
9974 if (GET_MODE_INNER (mode) == HFmode)
9975 return false;
9977 bool use_approx_division_p = (flag_mlow_precision_div
9978 || (aarch64_tune_params.approx_modes->division
9979 & AARCH64_APPROX_MODE (mode)));
9981 if (!flag_finite_math_only
9982 || flag_trapping_math
9983 || !flag_unsafe_math_optimizations
9984 || optimize_function_for_size_p (cfun)
9985 || !use_approx_division_p)
9986 return false;
9988 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9989 return false;
9991 /* Estimate the approximate reciprocal. */
9992 rtx xrcp = gen_reg_rtx (mode);
9993 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
9995 /* Iterate over the series twice for SF and thrice for DF. */
9996 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9998 /* Optionally run one iteration fewer for faster performance at the
9999 cost of some accuracy. */
10000 if (flag_mlow_precision_div)
10001 iterations--;
10003 /* Iterate over the series to calculate the approximate reciprocal. */
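/* Each pass of the loop below is one Newton-Raphson step for 1/den:
   assuming the architectural FRECPS semantics of (2 - a * b), the step
   computes x' = x * (2 - den * x) for the current estimate x.  As with
   the square root sequence above, the multiply that completes the last
   step is folded into the final statement.  */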
10004 rtx xtmp = gen_reg_rtx (mode);
10005 while (iterations--)
10007 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
10009 if (iterations > 0)
10010 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
10013 if (num != CONST1_RTX (mode))
10015 /* As the approximate reciprocal of DEN is already calculated, only
10016 calculate the approximate division when NUM is not 1.0. */
10017 rtx xnum = force_reg (mode, num);
10018 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
10021 /* Finalize the approximation. */
10022 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
10023 return true;
10026 /* Return the number of instructions that can be issued per cycle. */
10027 static int
10028 aarch64_sched_issue_rate (void)
10030 return aarch64_tune_params.issue_rate;
10033 static int
10034 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
10036 int issue_rate = aarch64_sched_issue_rate ();
10038 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
10042 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
10043 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
10044 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
10046 static int
10047 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
10048 int ready_index)
10050 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10054 /* Vectorizer cost model target hooks. */
10056 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10057 static int
10058 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10059 tree vectype,
10060 int misalign ATTRIBUTE_UNUSED)
10062 unsigned elements;
10063 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10064 bool fp = false;
10066 if (vectype != NULL)
10067 fp = FLOAT_TYPE_P (vectype);
10069 switch (type_of_cost)
10071 case scalar_stmt:
10072 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10074 case scalar_load:
10075 return costs->scalar_load_cost;
10077 case scalar_store:
10078 return costs->scalar_store_cost;
10080 case vector_stmt:
10081 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10083 case vector_load:
10084 return costs->vec_align_load_cost;
10086 case vector_store:
10087 return costs->vec_store_cost;
10089 case vec_to_scalar:
10090 return costs->vec_to_scalar_cost;
10092 case scalar_to_vec:
10093 return costs->scalar_to_vec_cost;
10095 case unaligned_load:
10096 case vector_gather_load:
10097 return costs->vec_unalign_load_cost;
10099 case unaligned_store:
10100 case vector_scatter_store:
10101 return costs->vec_unalign_store_cost;
10103 case cond_branch_taken:
10104 return costs->cond_taken_branch_cost;
10106 case cond_branch_not_taken:
10107 return costs->cond_not_taken_branch_cost;
10109 case vec_perm:
10110 return costs->vec_permute_cost;
10112 case vec_promote_demote:
10113 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10115 case vec_construct:
10116 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10117 return elements / 2 + 1;
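/* For example, constructing a V4SF vector from 4 scalar elements is
   costed as 4 / 2 + 1 = 3, and a V2DI construction as 2 / 2 + 1 = 2.
   This is purely the arithmetic of the formula above; no per-CPU cost
   table is consulted for this case.  */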
10119 default:
10120 gcc_unreachable ();
10124 /* Implement targetm.vectorize.add_stmt_cost. */
10125 static unsigned
10126 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10127 struct _stmt_vec_info *stmt_info, int misalign,
10128 enum vect_cost_model_location where)
10130 unsigned *cost = (unsigned *) data;
10131 unsigned retval = 0;
10133 if (flag_vect_cost_model)
10135 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10136 int stmt_cost =
10137 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10139 /* Statements in an inner loop relative to the loop being
10140 vectorized are weighted more heavily. The value here is
10141 arbitrary and could potentially be improved with analysis. */
10142 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10143 count *= 50; /* FIXME */
10145 retval = (unsigned) (count * stmt_cost);
10146 cost[where] += retval;
10149 return retval;
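/* For instance, a vector_load in the body of the loop being vectorized
   contributes count * vec_align_load_cost to the vect_body bucket; if the
   statement lies in an inner loop relative to the vectorized loop, the
   weighting above additionally multiplies that contribution by 50.  */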
10152 static void initialize_aarch64_code_model (struct gcc_options *);
10154 /* Parse the TO_PARSE string and put the architecture struct that it
10155 selects into RES and the architectural features into ISA_FLAGS.
10156 Return an aarch64_parse_opt_result describing the parse result.
10157 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
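/* As an illustration, given "armv8-a+crc" (the value of an -march=
   option), the string is split at the first '+': "armv8-a" is looked up
   in all_architectures and the remaining "+crc" is handed to
   aarch64_parse_extension.  The extension chosen here is only an example;
   any modifier that aarch64_parse_extension accepts is handled the same
   way.  */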
10159 static enum aarch64_parse_opt_result
10160 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10161 unsigned long *isa_flags)
10163 char *ext;
10164 const struct processor *arch;
10165 char *str = (char *) alloca (strlen (to_parse) + 1);
10166 size_t len;
10168 strcpy (str, to_parse);
10170 ext = strchr (str, '+');
10172 if (ext != NULL)
10173 len = ext - str;
10174 else
10175 len = strlen (str);
10177 if (len == 0)
10178 return AARCH64_PARSE_MISSING_ARG;
10181 /* Loop through the list of supported ARCHes to find a match. */
10182 for (arch = all_architectures; arch->name != NULL; arch++)
10184 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10186 unsigned long isa_temp = arch->flags;
10188 if (ext != NULL)
10190 /* TO_PARSE string contains at least one extension. */
10191 enum aarch64_parse_opt_result ext_res
10192 = aarch64_parse_extension (ext, &isa_temp);
10194 if (ext_res != AARCH64_PARSE_OK)
10195 return ext_res;
10197 /* Extension parsing was successful. Confirm the result
10198 arch and ISA flags. */
10199 *res = arch;
10200 *isa_flags = isa_temp;
10201 return AARCH64_PARSE_OK;
10205 /* ARCH name not found in list. */
10206 return AARCH64_PARSE_INVALID_ARG;
10209 /* Parse the TO_PARSE string and put the result tuning in RES and the
10210 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10211 describing the parse result. If there is an error parsing, RES and
10212 ISA_FLAGS are left unchanged. */
10214 static enum aarch64_parse_opt_result
10215 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10216 unsigned long *isa_flags)
10218 char *ext;
10219 const struct processor *cpu;
10220 char *str = (char *) alloca (strlen (to_parse) + 1);
10221 size_t len;
10223 strcpy (str, to_parse);
10225 ext = strchr (str, '+');
10227 if (ext != NULL)
10228 len = ext - str;
10229 else
10230 len = strlen (str);
10232 if (len == 0)
10233 return AARCH64_PARSE_MISSING_ARG;
10236 /* Loop through the list of supported CPUs to find a match. */
10237 for (cpu = all_cores; cpu->name != NULL; cpu++)
10239 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10241 unsigned long isa_temp = cpu->flags;
10244 if (ext != NULL)
10246 /* TO_PARSE string contains at least one extension. */
10247 enum aarch64_parse_opt_result ext_res
10248 = aarch64_parse_extension (ext, &isa_temp);
10250 if (ext_res != AARCH64_PARSE_OK)
10251 return ext_res;
10253 /* Extension parsing was successful. Confirm the result
10254 cpu and ISA flags. */
10255 *res = cpu;
10256 *isa_flags = isa_temp;
10257 return AARCH64_PARSE_OK;
10261 /* CPU name not found in list. */
10262 return AARCH64_PARSE_INVALID_ARG;
10265 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10266 Return an aarch64_parse_opt_result describing the parse result.
10267 If the parsing fails, RES is left unchanged. */
10269 static enum aarch64_parse_opt_result
10270 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10272 const struct processor *cpu;
10273 char *str = (char *) alloca (strlen (to_parse) + 1);
10275 strcpy (str, to_parse);
10277 /* Loop through the list of supported CPUs to find a match. */
10278 for (cpu = all_cores; cpu->name != NULL; cpu++)
10280 if (strcmp (cpu->name, str) == 0)
10282 *res = cpu;
10283 return AARCH64_PARSE_OK;
10287 /* CPU name not found in list. */
10288 return AARCH64_PARSE_INVALID_ARG;
10291 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10292 described in FLAG. If it is, return the index bit for that fusion type.
10293 If not, error (printing OPTION_NAME) and return zero. */
10295 static unsigned int
10296 aarch64_parse_one_option_token (const char *token,
10297 size_t length,
10298 const struct aarch64_flag_desc *flag,
10299 const char *option_name)
10301 for (; flag->name != NULL; flag++)
10303 if (length == strlen (flag->name)
10304 && !strncmp (flag->name, token, length))
10305 return flag->flag;
10308 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10309 return 0;
10312 /* Parse OPTION which is a comma-separated list of flags to enable.
10313 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10314 default state we inherit from the CPU tuning structures. OPTION_NAME
10315 gives the top-level option we are parsing in the -moverride string,
10316 for use in error messages. */
10318 static unsigned int
10319 aarch64_parse_boolean_options (const char *option,
10320 const struct aarch64_flag_desc *flags,
10321 unsigned int initial_state,
10322 const char *option_name)
10324 const char separator = '.';
10325 const char* specs = option;
10326 const char* ntoken = option;
10327 unsigned int found_flags = initial_state;
10329 while ((ntoken = strchr (specs, separator)))
10331 size_t token_length = ntoken - specs;
10332 unsigned token_ops = aarch64_parse_one_option_token (specs,
10333 token_length,
10334 flags,
10335 option_name);
10336 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10337 in the token stream, reset the supported operations. So:
10339 adrp+add.cmp+branch.none.adrp+add
10341 would turn on only adrp+add fusion. */
10342 if (!token_ops)
10343 found_flags = 0;
10345 found_flags |= token_ops;
10346 specs = ++ntoken;
10349 /* The string ended with a trailing separator, which is ill-formed. */
10350 if (!(*specs))
10352 error ("%s string ill-formed\n", option_name);
10353 return 0;
10356 /* We still have one more token to parse. */
10357 size_t token_length = strlen (specs);
10358 unsigned token_ops = aarch64_parse_one_option_token (specs,
10359 token_length,
10360 flags,
10361 option_name);
10362 if (!token_ops)
10363 found_flags = 0;
10365 found_flags |= token_ops;
10366 return found_flags;
10369 /* Support for overriding instruction fusion. */
10371 static void
10372 aarch64_parse_fuse_string (const char *fuse_string,
10373 struct tune_params *tune)
10375 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10376 aarch64_fusible_pairs,
10377 tune->fusible_ops,
10378 "fuse=");
10381 /* Support for overriding other tuning flags. */
10383 static void
10384 aarch64_parse_tune_string (const char *tune_string,
10385 struct tune_params *tune)
10387 tune->extra_tuning_flags
10388 = aarch64_parse_boolean_options (tune_string,
10389 aarch64_tuning_flags,
10390 tune->extra_tuning_flags,
10391 "tune=");
10394 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10395 we understand. If it is, extract the option string and hand it off to
10396 the appropriate function. */
10398 void
10399 aarch64_parse_one_override_token (const char* token,
10400 size_t length,
10401 struct tune_params *tune)
10403 const struct aarch64_tuning_override_function *fn
10404 = aarch64_tuning_override_functions;
10406 const char *option_part = strchr (token, '=');
10407 if (!option_part)
10409 error ("tuning string missing in option (%s)", token);
10410 return;
10413 /* Get the length of the option name. */
10414 length = option_part - token;
10415 /* Skip the '=' to get to the option string. */
10416 option_part++;
10418 for (; fn->name != NULL; fn++)
10420 if (!strncmp (fn->name, token, length))
10422 fn->parse_override (option_part, tune);
10423 return;
10427 error ("unknown tuning option (%s)",token);
10428 return;
10431 /* Validate the chosen TLS size against the selected code model, clamping it where necessary. */
10433 static void
10434 initialize_aarch64_tls_size (struct gcc_options *opts)
10436 if (aarch64_tls_size == 0)
10437 aarch64_tls_size = 24;
10439 switch (opts->x_aarch64_cmodel_var)
10441 case AARCH64_CMODEL_TINY:
10442 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
10443 needs two instructions to address, so we clamp the size to 24. */
10444 if (aarch64_tls_size > 24)
10445 aarch64_tls_size = 24;
10446 break;
10447 case AARCH64_CMODEL_SMALL:
10448 /* The maximum TLS size allowed under small is 4G. */
10449 if (aarch64_tls_size > 32)
10450 aarch64_tls_size = 32;
10451 break;
10452 case AARCH64_CMODEL_LARGE:
10453 /* The maximum TLS size allowed under large is 16E.
10454 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset for now. */
10455 if (aarch64_tls_size > 48)
10456 aarch64_tls_size = 48;
10457 break;
10458 default:
10459 gcc_unreachable ();
10462 return;
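/* For example, with the clamping above, an explicit -mtls-size=48 is kept
   as-is under the large model, clamped to 32 under the small model and
   clamped to 24 under tiny.  */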
10465 /* Parse STRING looking for options in the format:
10466 string :: option:string
10467 option :: name=substring
10468 name :: {a-z}
10469 substring :: defined by option. */
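/* As an illustrative example, a string such as

     fuse=adrp+add.cmp+branch:tune=<flag>

   is split at ':' into two options, "fuse=..." and "tune=...", each of
   which aarch64_parse_one_override_token dispatches to
   aarch64_parse_fuse_string or aarch64_parse_tune_string respectively.
   "<flag>" here is a placeholder for any name listed in
   aarch64_tuning_flags.  */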
10471 static void
10472 aarch64_parse_override_string (const char* input_string,
10473 struct tune_params* tune)
10475 const char separator = ':';
10476 size_t string_length = strlen (input_string) + 1;
10477 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10478 char *string = string_root;
10479 strncpy (string, input_string, string_length);
10480 string[string_length - 1] = '\0';
10482 char* ntoken = string;
10484 while ((ntoken = strchr (string, separator)))
10486 size_t token_length = ntoken - string;
10487 /* Make this substring look like a string. */
10488 *ntoken = '\0';
10489 aarch64_parse_one_override_token (string, token_length, tune);
10490 string = ++ntoken;
10493 /* One last option to parse. */
10494 aarch64_parse_one_override_token (string, strlen (string), tune);
10495 free (string_root);
10499 static void
10500 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10502 /* PR 70044: We have to be careful about being called multiple times for the
10503 same function. This means all changes should be repeatable. */
10505 /* If the frame pointer is enabled, set it to a special value that behaves
10506 similar to frame pointer omission. If we don't do this all leaf functions
10507 will get a frame pointer even if flag_omit_leaf_frame_pointer is set.
10508 If flag_omit_frame_pointer has this special value, we must force the
10509 frame pointer if not in a leaf function. We also need to force it in a
10510 leaf function if flag_omit_frame_pointer is not set or if LR is used. */
10511 if (opts->x_flag_omit_frame_pointer == 0)
10512 opts->x_flag_omit_frame_pointer = 2;
10514 /* If not optimizing for size, set the default
10515 alignment to what the target wants. */
10516 if (!opts->x_optimize_size)
10518 if (opts->x_align_loops <= 0)
10519 opts->x_align_loops = aarch64_tune_params.loop_align;
10520 if (opts->x_align_jumps <= 0)
10521 opts->x_align_jumps = aarch64_tune_params.jump_align;
10522 if (opts->x_align_functions <= 0)
10523 opts->x_align_functions = aarch64_tune_params.function_align;
10526 /* We default to no pc-relative literal loads. */
10528 aarch64_pcrelative_literal_loads = false;
10530 /* If -mpc-relative-literal-loads is set on the command line, this
10531 implies that the user asked for PC relative literal loads. */
10532 if (opts->x_pcrelative_literal_loads == 1)
10533 aarch64_pcrelative_literal_loads = true;
10535 /* In the tiny memory model it makes no sense to disallow PC relative
10536 literal pool loads. */
10537 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10538 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10539 aarch64_pcrelative_literal_loads = true;
10541 /* When enabling the lower precision Newton series for the square root, also
10542 enable it for the reciprocal square root, since the latter is an
10543 intermediary step for the former. */
10544 if (flag_mlow_precision_sqrt)
10545 flag_mrecip_low_precision_sqrt = true;
10548 /* 'Unpack' the internal tuning structs and update the options
10549 in OPTS. The caller must have set up selected_tune and selected_arch
10550 as all the other target-specific codegen decisions are
10551 derived from them. */
10553 void
10554 aarch64_override_options_internal (struct gcc_options *opts)
10556 aarch64_tune_flags = selected_tune->flags;
10557 aarch64_tune = selected_tune->sched_core;
10558 /* Make a copy of the tuning parameters attached to the core, which
10559 we may later overwrite. */
10560 aarch64_tune_params = *(selected_tune->tune);
10561 aarch64_architecture_version = selected_arch->architecture_version;
10563 if (opts->x_aarch64_override_tune_string)
10564 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10565 &aarch64_tune_params);
10567 /* This target defaults to strict volatile bitfields. */
10568 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10569 opts->x_flag_strict_volatile_bitfields = 1;
10571 initialize_aarch64_code_model (opts);
10572 initialize_aarch64_tls_size (opts);
10574 int queue_depth = 0;
10575 switch (aarch64_tune_params.autoprefetcher_model)
10577 case tune_params::AUTOPREFETCHER_OFF:
10578 queue_depth = -1;
10579 break;
10580 case tune_params::AUTOPREFETCHER_WEAK:
10581 queue_depth = 0;
10582 break;
10583 case tune_params::AUTOPREFETCHER_STRONG:
10584 queue_depth = max_insn_queue_index + 1;
10585 break;
10586 default:
10587 gcc_unreachable ();
10590 /* We don't mind passing in global_options_set here as we don't use
10591 the *options_set structs anyway. */
10592 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10593 queue_depth,
10594 opts->x_param_values,
10595 global_options_set.x_param_values);
10597 /* Set up parameters to be used in prefetching algorithm. Do not
10598 override the defaults unless we are tuning for a core we have
10599 researched values for. */
10600 if (aarch64_tune_params.prefetch->num_slots > 0)
10601 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10602 aarch64_tune_params.prefetch->num_slots,
10603 opts->x_param_values,
10604 global_options_set.x_param_values);
10605 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10606 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10607 aarch64_tune_params.prefetch->l1_cache_size,
10608 opts->x_param_values,
10609 global_options_set.x_param_values);
10610 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10611 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10612 aarch64_tune_params.prefetch->l1_cache_line_size,
10613 opts->x_param_values,
10614 global_options_set.x_param_values);
10615 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10616 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10617 aarch64_tune_params.prefetch->l2_cache_size,
10618 opts->x_param_values,
10619 global_options_set.x_param_values);
10621 /* Use the alternative scheduling-pressure algorithm by default. */
10622 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10623 opts->x_param_values,
10624 global_options_set.x_param_values);
10626 /* Enable software prefetching at the specified optimization level for
10627 CPUs that have prefetch tuning parameters. Lower the optimization level
10628 threshold by 1 when profiling is enabled. */
10629 if (opts->x_flag_prefetch_loop_arrays < 0
10630 && !opts->x_optimize_size
10631 && aarch64_tune_params.prefetch->default_opt_level >= 0
10632 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10633 opts->x_flag_prefetch_loop_arrays = 1;
10635 aarch64_override_options_after_change_1 (opts);
10638 /* Print a hint with a suggestion for a core or architecture name that
10639 most closely resembles what the user passed in STR. ARCH is true if
10640 the user is asking for an architecture name. ARCH is false if the user
10641 is asking for a core name. */
10643 static void
10644 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10646 auto_vec<const char *> candidates;
10647 const struct processor *entry = arch ? all_architectures : all_cores;
10648 for (; entry->name != NULL; entry++)
10649 candidates.safe_push (entry->name);
10651 #ifdef HAVE_LOCAL_CPU_DETECT
10652 /* Add also "native" as possible value. */
10653 if (arch)
10654 candidates.safe_push ("native");
10655 #endif
10657 char *s;
10658 const char *hint = candidates_list_and_hint (str, s, candidates);
10659 if (hint)
10660 inform (input_location, "valid arguments are: %s;"
10661 " did you mean %qs?", s, hint);
10662 else
10663 inform (input_location, "valid arguments are: %s", s);
10665 XDELETEVEC (s);
10668 /* Print a hint with a suggestion for a core name that most closely resembles
10669 what the user passed in STR. */
10671 inline static void
10672 aarch64_print_hint_for_core (const char *str)
10674 aarch64_print_hint_for_core_or_arch (str, false);
10677 /* Print a hint with a suggestion for an architecture name that most closely
10678 resembles what the user passed in STR. */
10680 inline static void
10681 aarch64_print_hint_for_arch (const char *str)
10683 aarch64_print_hint_for_core_or_arch (str, true);
10686 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10687 specified in STR and throw errors if appropriate. Put the results if
10688 they are valid in RES and ISA_FLAGS. Return whether the option is
10689 valid. */
10691 static bool
10692 aarch64_validate_mcpu (const char *str, const struct processor **res,
10693 unsigned long *isa_flags)
10695 enum aarch64_parse_opt_result parse_res
10696 = aarch64_parse_cpu (str, res, isa_flags);
10698 if (parse_res == AARCH64_PARSE_OK)
10699 return true;
10701 switch (parse_res)
10703 case AARCH64_PARSE_MISSING_ARG:
10704 error ("missing cpu name in %<-mcpu=%s%>", str);
10705 break;
10706 case AARCH64_PARSE_INVALID_ARG:
10707 error ("unknown value %qs for -mcpu", str);
10708 aarch64_print_hint_for_core (str);
10709 break;
10710 case AARCH64_PARSE_INVALID_FEATURE:
10711 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10712 break;
10713 default:
10714 gcc_unreachable ();
10717 return false;
10720 /* Validate a command-line -march option. Parse the arch and extensions
10721 (if any) specified in STR and throw errors if appropriate. Put the
10722 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10723 option is valid. */
10725 static bool
10726 aarch64_validate_march (const char *str, const struct processor **res,
10727 unsigned long *isa_flags)
10729 enum aarch64_parse_opt_result parse_res
10730 = aarch64_parse_arch (str, res, isa_flags);
10732 if (parse_res == AARCH64_PARSE_OK)
10733 return true;
10735 switch (parse_res)
10737 case AARCH64_PARSE_MISSING_ARG:
10738 error ("missing arch name in %<-march=%s%>", str);
10739 break;
10740 case AARCH64_PARSE_INVALID_ARG:
10741 error ("unknown value %qs for -march", str);
10742 aarch64_print_hint_for_arch (str);
10743 break;
10744 case AARCH64_PARSE_INVALID_FEATURE:
10745 error ("invalid feature modifier in %<-march=%s%>", str);
10746 break;
10747 default:
10748 gcc_unreachable ();
10751 return false;
10754 /* Validate a command-line -mtune option. Parse the cpu
10755 specified in STR and throw errors if appropriate. Put the
10756 result, if it is valid, in RES. Return whether the option is
10757 valid. */
10759 static bool
10760 aarch64_validate_mtune (const char *str, const struct processor **res)
10762 enum aarch64_parse_opt_result parse_res
10763 = aarch64_parse_tune (str, res);
10765 if (parse_res == AARCH64_PARSE_OK)
10766 return true;
10768 switch (parse_res)
10770 case AARCH64_PARSE_MISSING_ARG:
10771 error ("missing cpu name in %<-mtune=%s%>", str);
10772 break;
10773 case AARCH64_PARSE_INVALID_ARG:
10774 error ("unknown value %qs for -mtune", str);
10775 aarch64_print_hint_for_core (str);
10776 break;
10777 default:
10778 gcc_unreachable ();
10780 return false;
10783 /* Return the CPU corresponding to the enum CPU.
10784 If it doesn't specify a cpu, return the default. */
10786 static const struct processor *
10787 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10789 if (cpu != aarch64_none)
10790 return &all_cores[cpu];
10792 /* The & 0x3f is to extract the bottom 6 bits that encode the
10793 default cpu as selected by the --with-cpu GCC configure option
10794 in config.gcc.
10795 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10796 flags mechanism should be reworked to make it more sane. */
10797 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10800 /* Return the architecture corresponding to the enum ARCH.
10801 If it doesn't specify a valid architecture, return the default. */
10803 static const struct processor *
10804 aarch64_get_arch (enum aarch64_arch arch)
10806 if (arch != aarch64_no_arch)
10807 return &all_architectures[arch];
10809 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10811 return &all_architectures[cpu->arch];
10814 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10816 static poly_uint16
10817 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10819 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10820 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10821 deciding which .md file patterns to use and when deciding whether
10822 something is a legitimate address or constant. */
10823 if (value == SVE_SCALABLE || value == SVE_128)
10824 return poly_uint16 (2, 2);
10825 else
10826 return (int) value / 64;
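/* As a rough sketch of the mapping: -msve-vector-bits=256 gives VG == 4
   (four 64-bit granules) and 512 gives VG == 8, while "scalable" -- and,
   per the comment above, 128 -- give the runtime-variable
   poly_uint16 (2, 2), i.e. 2 + 2 * N granules for some non-negative N.  */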
10829 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10830 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10831 tuning structs. In particular it must set selected_tune and
10832 aarch64_isa_flags that define the available ISA features and tuning
10833 decisions. It must also set selected_arch as this will be used to
10834 output the .arch asm tags for each function. */
10836 static void
10837 aarch64_override_options (void)
10839 unsigned long cpu_isa = 0;
10840 unsigned long arch_isa = 0;
10841 aarch64_isa_flags = 0;
10843 bool valid_cpu = true;
10844 bool valid_tune = true;
10845 bool valid_arch = true;
10847 selected_cpu = NULL;
10848 selected_arch = NULL;
10849 selected_tune = NULL;
10851 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10852 If either of -march or -mtune is given, they override their
10853 respective component of -mcpu. */
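/* For instance, the (deliberately conflicting) pair
   "-mcpu=cortex-a53 -march=armv8.1-a" keeps cortex-a53 as the tuning
   target but takes its ISA flags from armv8.1-a, after the mismatch
   warning issued below.  */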
10854 if (aarch64_cpu_string)
10855 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10856 &cpu_isa);
10858 if (aarch64_arch_string)
10859 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10860 &arch_isa);
10862 if (aarch64_tune_string)
10863 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10865 /* If the user did not specify a processor, choose the default
10866 one for them. This will be the CPU set during configuration using
10867 --with-cpu, otherwise it is "generic". */
10868 if (!selected_cpu)
10870 if (selected_arch)
10872 selected_cpu = &all_cores[selected_arch->ident];
10873 aarch64_isa_flags = arch_isa;
10874 explicit_arch = selected_arch->arch;
10876 else
10878 /* Get default configure-time CPU. */
10879 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10880 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10883 if (selected_tune)
10884 explicit_tune_core = selected_tune->ident;
10886 /* If both -mcpu and -march are specified check that they are architecturally
10887 compatible, warn if they're not and prefer the -march ISA flags. */
10888 else if (selected_arch)
10890 if (selected_arch->arch != selected_cpu->arch)
10892 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10893 all_architectures[selected_cpu->arch].name,
10894 selected_arch->name);
10896 aarch64_isa_flags = arch_isa;
10897 explicit_arch = selected_arch->arch;
10898 explicit_tune_core = selected_tune ? selected_tune->ident
10899 : selected_cpu->ident;
10901 else
10903 /* -mcpu but no -march. */
10904 aarch64_isa_flags = cpu_isa;
10905 explicit_tune_core = selected_tune ? selected_tune->ident
10906 : selected_cpu->ident;
10907 gcc_assert (selected_cpu);
10908 selected_arch = &all_architectures[selected_cpu->arch];
10909 explicit_arch = selected_arch->arch;
10912 /* Set the arch as well, as we will need it when outputting
10913 the .arch directive in assembly. */
10914 if (!selected_arch)
10916 gcc_assert (selected_cpu);
10917 selected_arch = &all_architectures[selected_cpu->arch];
10920 if (!selected_tune)
10921 selected_tune = selected_cpu;
10923 #ifndef HAVE_AS_MABI_OPTION
10924 /* The compiler may have been configured with 2.23.* binutils, which does
10925 not have support for ILP32. */
10926 if (TARGET_ILP32)
10927 error ("assembler does not support -mabi=ilp32");
10928 #endif
10930 /* Convert -msve-vector-bits to a VG count. */
10931 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10933 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10934 sorry ("return address signing is only supported for -mabi=lp64");
10936 /* Make sure we properly set up the explicit options. */
10937 if ((aarch64_cpu_string && valid_cpu)
10938 || (aarch64_tune_string && valid_tune))
10939 gcc_assert (explicit_tune_core != aarch64_none);
10941 if ((aarch64_cpu_string && valid_cpu)
10942 || (aarch64_arch_string && valid_arch))
10943 gcc_assert (explicit_arch != aarch64_no_arch);
10945 aarch64_override_options_internal (&global_options);
10947 /* Save these options as the default ones in case we push and pop them later
10948 while processing functions with potential target attributes. */
10949 target_option_default_node = target_option_current_node
10950 = build_target_option_node (&global_options);
10953 /* Implement targetm.override_options_after_change. */
10955 static void
10956 aarch64_override_options_after_change (void)
10958 aarch64_override_options_after_change_1 (&global_options);
10961 static struct machine_function *
10962 aarch64_init_machine_status (void)
10964 struct machine_function *machine;
10965 machine = ggc_cleared_alloc<machine_function> ();
10966 return machine;
10969 void
10970 aarch64_init_expanders (void)
10972 init_machine_status = aarch64_init_machine_status;
10975 /* Work out which code model to use, adjusting for PIC where necessary. */
10976 static void
10977 initialize_aarch64_code_model (struct gcc_options *opts)
10979 if (opts->x_flag_pic)
10981 switch (opts->x_aarch64_cmodel_var)
10983 case AARCH64_CMODEL_TINY:
10984 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10985 break;
10986 case AARCH64_CMODEL_SMALL:
10987 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10988 aarch64_cmodel = (flag_pic == 2
10989 ? AARCH64_CMODEL_SMALL_PIC
10990 : AARCH64_CMODEL_SMALL_SPIC);
10991 #else
10992 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10993 #endif
10994 break;
10995 case AARCH64_CMODEL_LARGE:
10996 sorry ("code model %qs with -f%s", "large",
10997 opts->x_flag_pic > 1 ? "PIC" : "pic");
10998 break;
10999 default:
11000 gcc_unreachable ();
11003 else
11004 aarch64_cmodel = opts->x_aarch64_cmodel_var;
11007 /* Implement TARGET_OPTION_SAVE. */
11009 static void
11010 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
11012 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
11015 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
11016 using the information saved in PTR. */
11018 static void
11019 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
11021 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
11022 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11023 opts->x_explicit_arch = ptr->x_explicit_arch;
11024 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
11025 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
11027 aarch64_override_options_internal (opts);
11030 /* Implement TARGET_OPTION_PRINT. */
11032 static void
11033 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11035 const struct processor *cpu
11036 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11037 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11038 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11039 std::string extension
11040 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11042 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11043 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11044 arch->name, extension.c_str ());
11047 static GTY(()) tree aarch64_previous_fndecl;
11049 void
11050 aarch64_reset_previous_fndecl (void)
11052 aarch64_previous_fndecl = NULL;
11055 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11056 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11057 make sure optab availability predicates are recomputed when necessary. */
11059 void
11060 aarch64_save_restore_target_globals (tree new_tree)
11062 if (TREE_TARGET_GLOBALS (new_tree))
11063 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11064 else if (new_tree == target_option_default_node)
11065 restore_target_globals (&default_target_globals);
11066 else
11067 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11070 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11071 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11072 of the function, if such exists. This function may be called multiple
11073 times on a single function so use aarch64_previous_fndecl to avoid
11074 setting up identical state. */
11076 static void
11077 aarch64_set_current_function (tree fndecl)
11079 if (!fndecl || fndecl == aarch64_previous_fndecl)
11080 return;
11082 tree old_tree = (aarch64_previous_fndecl
11083 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11084 : NULL_TREE);
11086 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11088 /* If current function has no attributes but the previous one did,
11089 use the default node. */
11090 if (!new_tree && old_tree)
11091 new_tree = target_option_default_node;
11093 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11094 the default have been handled by aarch64_save_restore_target_globals from
11095 aarch64_pragma_target_parse. */
11096 if (old_tree == new_tree)
11097 return;
11099 aarch64_previous_fndecl = fndecl;
11101 /* First set the target options. */
11102 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11104 aarch64_save_restore_target_globals (new_tree);
11107 /* Enum describing the various ways we can handle attributes.
11108 In many cases we can reuse the generic option handling machinery. */
11110 enum aarch64_attr_opt_type
11112 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11113 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11114 aarch64_attr_enum, /* Attribute sets an enum variable. */
11115 aarch64_attr_custom /* Attribute requires a custom handling function. */
11118 /* All the information needed to handle a target attribute.
11119 NAME is the name of the attribute.
11120 ATTR_TYPE specifies the type of behavior of the attribute as described
11121 in the definition of enum aarch64_attr_opt_type.
11122 ALLOW_NEG is true if the attribute supports a "no-" form.
11123 HANDLER is the function that takes the attribute string as an argument.
11124 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11125 OPT_NUM is the enum specifying the option that the attribute modifies.
11126 This is needed for attributes that mirror the behavior of a command-line
11127 option, that is, those with ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11128 aarch64_attr_enum. */
11130 struct aarch64_attribute_info
11132 const char *name;
11133 enum aarch64_attr_opt_type attr_type;
11134 bool allow_neg;
11135 bool (*handler) (const char *);
11136 enum opt_code opt_num;
11139 /* Handle the ARCH_STR argument to the arch= target attribute. */
11141 static bool
11142 aarch64_handle_attr_arch (const char *str)
11144 const struct processor *tmp_arch = NULL;
11145 enum aarch64_parse_opt_result parse_res
11146 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11148 if (parse_res == AARCH64_PARSE_OK)
11150 gcc_assert (tmp_arch);
11151 selected_arch = tmp_arch;
11152 explicit_arch = selected_arch->arch;
11153 return true;
11156 switch (parse_res)
11158 case AARCH64_PARSE_MISSING_ARG:
11159 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11160 break;
11161 case AARCH64_PARSE_INVALID_ARG:
11162 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11163 aarch64_print_hint_for_arch (str);
11164 break;
11165 case AARCH64_PARSE_INVALID_FEATURE:
11166 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11167 break;
11168 default:
11169 gcc_unreachable ();
11172 return false;
11175 /* Handle the argument CPU_STR to the cpu= target attribute. */
11177 static bool
11178 aarch64_handle_attr_cpu (const char *str)
11180 const struct processor *tmp_cpu = NULL;
11181 enum aarch64_parse_opt_result parse_res
11182 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11184 if (parse_res == AARCH64_PARSE_OK)
11186 gcc_assert (tmp_cpu);
11187 selected_tune = tmp_cpu;
11188 explicit_tune_core = selected_tune->ident;
11190 selected_arch = &all_architectures[tmp_cpu->arch];
11191 explicit_arch = selected_arch->arch;
11192 return true;
11195 switch (parse_res)
11197 case AARCH64_PARSE_MISSING_ARG:
11198 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11199 break;
11200 case AARCH64_PARSE_INVALID_ARG:
11201 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11202 aarch64_print_hint_for_core (str);
11203 break;
11204 case AARCH64_PARSE_INVALID_FEATURE:
11205 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11206 break;
11207 default:
11208 gcc_unreachable ();
11211 return false;
11214 /* Handle the argument STR to the tune= target attribute. */
11216 static bool
11217 aarch64_handle_attr_tune (const char *str)
11219 const struct processor *tmp_tune = NULL;
11220 enum aarch64_parse_opt_result parse_res
11221 = aarch64_parse_tune (str, &tmp_tune);
11223 if (parse_res == AARCH64_PARSE_OK)
11225 gcc_assert (tmp_tune);
11226 selected_tune = tmp_tune;
11227 explicit_tune_core = selected_tune->ident;
11228 return true;
11231 switch (parse_res)
11233 case AARCH64_PARSE_INVALID_ARG:
11234 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11235 aarch64_print_hint_for_core (str);
11236 break;
11237 default:
11238 gcc_unreachable ();
11241 return false;
11244 /* Parse an architecture extensions target attribute string specified in STR.
11245 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11246 if successful. Update aarch64_isa_flags to reflect the ISA features
11247 modified. */
11249 static bool
11250 aarch64_handle_attr_isa_flags (char *str)
11252 enum aarch64_parse_opt_result parse_res;
11253 unsigned long isa_flags = aarch64_isa_flags;
11255 /* We allow "+nothing" in the beginning to clear out all architectural
11256 features if the user wants to handpick specific features. */
11257 if (strncmp ("+nothing", str, 8) == 0)
11259 isa_flags = 0;
11260 str += 8;
11263 parse_res = aarch64_parse_extension (str, &isa_flags);
11265 if (parse_res == AARCH64_PARSE_OK)
11267 aarch64_isa_flags = isa_flags;
11268 return true;
11271 switch (parse_res)
11273 case AARCH64_PARSE_MISSING_ARG:
11274 error ("missing value in %<target()%> pragma or attribute");
11275 break;
11277 case AARCH64_PARSE_INVALID_FEATURE:
11278 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11279 break;
11281 default:
11282 gcc_unreachable ();
11285 return false;
11288 /* The target attributes that we support. On top of these we also support just
11289 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11290 handled explicitly in aarch64_process_one_target_attr. */
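/* A sketch of how these are typically spelled in source, using attribute
   names from the table below (the arch, cmodel and extension strings are
   only examples):

     __attribute__ ((target ("arch=armv8-a+crc")))
     __attribute__ ((target ("no-omit-leaf-frame-pointer,cmodel=small")))
     __attribute__ ((target ("+fp+nosimd")))

   The last form is the bare ISA-extension case mentioned above.  */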
11292 static const struct aarch64_attribute_info aarch64_attributes[] =
11294 { "general-regs-only", aarch64_attr_mask, false, NULL,
11295 OPT_mgeneral_regs_only },
11296 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11297 OPT_mfix_cortex_a53_835769 },
11298 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11299 OPT_mfix_cortex_a53_843419 },
11300 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11301 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
11302 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11303 OPT_momit_leaf_frame_pointer },
11304 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11305 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11306 OPT_march_ },
11307 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11308 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11309 OPT_mtune_ },
11310 { "sign-return-address", aarch64_attr_enum, false, NULL,
11311 OPT_msign_return_address_ },
11312 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11315 /* Parse ARG_STR which contains the definition of one target attribute.
11316 Show appropriate errors if any or return true if the attribute is valid. */
11318 static bool
11319 aarch64_process_one_target_attr (char *arg_str)
11321 bool invert = false;
11323 size_t len = strlen (arg_str);
11325 if (len == 0)
11327 error ("malformed %<target()%> pragma or attribute");
11328 return false;
11331 char *str_to_check = (char *) alloca (len + 1);
11332 strcpy (str_to_check, arg_str);
11334 /* Skip leading whitespace. */
11335 while (*str_to_check == ' ' || *str_to_check == '\t')
11336 str_to_check++;
11338 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11339 It is easier to detect and handle it explicitly here rather than going
11340 through the machinery for the rest of the target attributes in this
11341 function. */
11342 if (*str_to_check == '+')
11343 return aarch64_handle_attr_isa_flags (str_to_check);
11345 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11347 invert = true;
11348 str_to_check += 3;
11350 char *arg = strchr (str_to_check, '=');
11352 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11353 and point ARG to "foo". */
11354 if (arg)
11356 *arg = '\0';
11357 arg++;
11359 const struct aarch64_attribute_info *p_attr;
11360 bool found = false;
11361 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11363 /* If the names don't match up, or the user has given an argument
11364 to an attribute that doesn't accept one, or didn't give an argument
11365 to an attribute that expects one, fail to match. */
11366 if (strcmp (str_to_check, p_attr->name) != 0)
11367 continue;
11369 found = true;
11370 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11371 || p_attr->attr_type == aarch64_attr_enum;
11373 if (attr_need_arg_p ^ (arg != NULL))
11375 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11376 return false;
11379 /* If the name matches but the attribute does not allow "no-" versions
11380 then we can't match. */
11381 if (invert && !p_attr->allow_neg)
11383 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11384 return false;
11387 switch (p_attr->attr_type)
11389 /* Has a custom handler registered.
11390 For example, cpu=, arch=, tune=. */
11391 case aarch64_attr_custom:
11392 gcc_assert (p_attr->handler);
11393 if (!p_attr->handler (arg))
11394 return false;
11395 break;
11397 /* Either set or unset a boolean option. */
11398 case aarch64_attr_bool:
11400 struct cl_decoded_option decoded;
11402 generate_option (p_attr->opt_num, NULL, !invert,
11403 CL_TARGET, &decoded);
11404 aarch64_handle_option (&global_options, &global_options_set,
11405 &decoded, input_location);
11406 break;
11408 /* Set or unset a bit in the target_flags. aarch64_handle_option
11409 should know what mask to apply given the option number. */
11410 case aarch64_attr_mask:
11412 struct cl_decoded_option decoded;
11413 /* We only need to specify the option number.
11414 aarch64_handle_option will know which mask to apply. */
11415 decoded.opt_index = p_attr->opt_num;
11416 decoded.value = !invert;
11417 aarch64_handle_option (&global_options, &global_options_set,
11418 &decoded, input_location);
11419 break;
11421 /* Use the option setting machinery to set an option to an enum. */
11422 case aarch64_attr_enum:
11424 gcc_assert (arg);
11425 bool valid;
11426 int value;
11427 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11428 &value, CL_TARGET);
11429 if (valid)
11431 set_option (&global_options, NULL, p_attr->opt_num, value,
11432 NULL, DK_UNSPECIFIED, input_location,
11433 global_dc);
11435 else
11437 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11439 break;
11441 default:
11442 gcc_unreachable ();
11446 /* If we reached here we either have found an attribute and validated
11447 it or didn't match any. If we matched an attribute but its arguments
11448 were malformed we will have returned false already. */
11449 return found;
11452 /* Count how many times the character C appears in
11453 NULL-terminated string STR. */
11455 static unsigned int
11456 num_occurences_in_str (char c, char *str)
11458 unsigned int res = 0;
11459 while (*str != '\0')
11461 if (*str == c)
11462 res++;
11464 str++;
11467 return res;
11470 /* Parse the tree in ARGS that contains the target attribute information
11471 and update the global target options space. */
11473 bool
11474 aarch64_process_target_attr (tree args)
11476 if (TREE_CODE (args) == TREE_LIST)
11480 tree head = TREE_VALUE (args);
11481 if (head)
11483 if (!aarch64_process_target_attr (head))
11484 return false;
11486 args = TREE_CHAIN (args);
11487 } while (args);
11489 return true;
11492 if (TREE_CODE (args) != STRING_CST)
11494 error ("attribute %<target%> argument not a string");
11495 return false;
11498 size_t len = strlen (TREE_STRING_POINTER (args));
11499 char *str_to_check = (char *) alloca (len + 1);
11500 strcpy (str_to_check, TREE_STRING_POINTER (args));
11502 if (len == 0)
11504 error ("malformed %<target()%> pragma or attribute");
11505 return false;
11508 /* Used to catch empty entries between commas, e.g.
11509 attribute ((target ("attr1,,attr2"))). */
11510 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11512 /* Handle multiple target attributes separated by ','. */
11513 char *token = strtok (str_to_check, ",");
11515 unsigned int num_attrs = 0;
11516 while (token)
11518 num_attrs++;
11519 if (!aarch64_process_one_target_attr (token))
11521 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11522 return false;
11525 token = strtok (NULL, ",");
11528 if (num_attrs != num_commas + 1)
11530 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11531 return false;
11534 return true;
11537 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11538 process attribute ((target ("..."))). */
11540 static bool
11541 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11543 struct cl_target_option cur_target;
11544 bool ret;
11545 tree old_optimize;
11546 tree new_target, new_optimize;
11547 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11549 /* If what we're processing is the current pragma string then the
11550 target option node is already stored in target_option_current_node
11551 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11552 having to re-parse the string. This is especially useful to keep
11553 arm_neon.h compile times down since that header contains a lot
11554 of intrinsics enclosed in pragmas. */
11555 if (!existing_target && args == current_target_pragma)
11557 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11558 return true;
11560 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11562 old_optimize = build_optimization_node (&global_options);
11563 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11565 /* If the function changed the optimization levels as well as setting
11566 target options, start with the optimizations specified. */
11567 if (func_optimize && func_optimize != old_optimize)
11568 cl_optimization_restore (&global_options,
11569 TREE_OPTIMIZATION (func_optimize));
11571 /* Save the current target options to restore at the end. */
11572 cl_target_option_save (&cur_target, &global_options);
11574 /* If fndecl already has some target attributes applied to it, unpack
11575 them so that we add this attribute on top of them, rather than
11576 overwriting them. */
11577 if (existing_target)
11579 struct cl_target_option *existing_options
11580 = TREE_TARGET_OPTION (existing_target);
11582 if (existing_options)
11583 cl_target_option_restore (&global_options, existing_options);
11585 else
11586 cl_target_option_restore (&global_options,
11587 TREE_TARGET_OPTION (target_option_current_node));
11589 ret = aarch64_process_target_attr (args);
11591 /* Set up any additional state. */
11592 if (ret)
11594 aarch64_override_options_internal (&global_options);
11595 /* Initialize SIMD builtins if we haven't already.
11596 Set current_target_pragma to NULL for the duration so that
11597 the builtin initialization code doesn't try to tag the functions
11598 being built with the attributes specified by any current pragma, thus
11599 going into an infinite recursion. */
11600 if (TARGET_SIMD)
11602 tree saved_current_target_pragma = current_target_pragma;
11603 current_target_pragma = NULL;
11604 aarch64_init_simd_builtins ();
11605 current_target_pragma = saved_current_target_pragma;
11607 new_target = build_target_option_node (&global_options);
11609 else
11610 new_target = NULL;
11612 new_optimize = build_optimization_node (&global_options);
11614 if (fndecl && ret)
11616 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11618 if (old_optimize != new_optimize)
11619 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11622 cl_target_option_restore (&global_options, &cur_target);
11624 if (old_optimize != new_optimize)
11625 cl_optimization_restore (&global_options,
11626 TREE_OPTIMIZATION (old_optimize));
11627 return ret;
11630 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11631 tri-bool options (yes, no, don't care) and the default value is
11632 DEF, determine whether to reject inlining. */
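/* With the tri-bool encoding used by the callers below (0 = off,
   1 = on, 2 = unspecified/don't care), this means for example that a
   callee which leaves an option unspecified can always be inlined,
   while a callee that explicitly enables a workaround can only be
   inlined into a caller that also enables it (or when enabling it is
   the default).  */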
11634 static bool
11635 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11636 int dont_care, int def)
11638 /* If the callee doesn't care, always allow inlining. */
11639 if (callee == dont_care)
11640 return true;
11642 /* If the caller doesn't care, always allow inlining. */
11643 if (caller == dont_care)
11644 return true;
11646 /* Otherwise, allow inlining if either the callee and caller values
11647 agree, or if the callee is using the default value. */
11648 return (callee == caller || callee == def);
11651 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11652 to inline CALLEE into CALLER based on target-specific info.
11653 Make sure that the caller and callee have compatible architectural
11654 features. Then go through the other possible target attributes
11655 and see if they can block inlining. Try not to reject always_inline
11656 callees unless they are incompatible architecturally. */
11658 static bool
11659 aarch64_can_inline_p (tree caller, tree callee)
11661 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11662 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11664 /* If callee has no option attributes, then it is ok to inline. */
11665 if (!callee_tree)
11666 return true;
11668 struct cl_target_option *caller_opts
11669 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11670 : target_option_default_node);
11672 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
11675 /* Callee's ISA flags should be a subset of the caller's. */
11676 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11677 != callee_opts->x_aarch64_isa_flags)
11678 return false;
11680 /* Allow non-strict-aligned functions to be inlined into
11681 strict-aligned ones, but not the other way around. */
11682 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11683 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11684 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11685 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11686 return false;
11688 bool always_inline = lookup_attribute ("always_inline",
11689 DECL_ATTRIBUTES (callee));
11691 /* If the architectural features match up and the callee is always_inline
11692 then the other attributes don't matter. */
11693 if (always_inline)
11694 return true;
11696 if (caller_opts->x_aarch64_cmodel_var
11697 != callee_opts->x_aarch64_cmodel_var)
11698 return false;
11700 if (caller_opts->x_aarch64_tls_dialect
11701 != callee_opts->x_aarch64_tls_dialect)
11702 return false;
11704 /* Honour explicit requests to workaround errata. */
11705 if (!aarch64_tribools_ok_for_inlining_p (
11706 caller_opts->x_aarch64_fix_a53_err835769,
11707 callee_opts->x_aarch64_fix_a53_err835769,
11708 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11709 return false;
11711 if (!aarch64_tribools_ok_for_inlining_p (
11712 caller_opts->x_aarch64_fix_a53_err843419,
11713 callee_opts->x_aarch64_fix_a53_err843419,
11714 2, TARGET_FIX_ERR_A53_843419))
11715 return false;
11717 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11718 caller and callee and they don't match up, reject inlining. */
11719 if (!aarch64_tribools_ok_for_inlining_p (
11720 caller_opts->x_flag_omit_leaf_frame_pointer,
11721 callee_opts->x_flag_omit_leaf_frame_pointer,
11722 2, 1))
11723 return false;
11725 /* If the callee has specific tuning overrides, respect them. */
11726 if (callee_opts->x_aarch64_override_tune_string != NULL
11727 && caller_opts->x_aarch64_override_tune_string == NULL)
11728 return false;
11730 /* If the user specified tuning override strings for the
11731 caller and callee and they don't match up, reject inlining.
11732 We just do a string compare here, we don't analyze the meaning
11733 of the string, as it would be too costly for little gain. */
11734 if (callee_opts->x_aarch64_override_tune_string
11735 && caller_opts->x_aarch64_override_tune_string
11736 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11737 caller_opts->x_aarch64_override_tune_string) != 0))
11738 return false;
11740 return true;
11743 /* Return true if SYMBOL_REF X binds locally. */
11745 static bool
11746 aarch64_symbol_binds_local_p (const_rtx x)
11748 return (SYMBOL_REF_DECL (x)
11749 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11750 : SYMBOL_REF_LOCAL_P (x));
11753 /* Return true if SYMBOL_REF X is thread local */
11754 static bool
11755 aarch64_tls_symbol_p (rtx x)
11757 if (! TARGET_HAVE_TLS)
11758 return false;
11760 if (GET_CODE (x) != SYMBOL_REF)
11761 return false;
11763 return SYMBOL_REF_TLS_MODEL (x) != 0;
11766 /* Classify a TLS symbol into one of the TLS kinds. */
11767 enum aarch64_symbol_type
11768 aarch64_classify_tls_symbol (rtx x)
11770 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11772 switch (tls_kind)
11774 case TLS_MODEL_GLOBAL_DYNAMIC:
11775 case TLS_MODEL_LOCAL_DYNAMIC:
11776 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11778 case TLS_MODEL_INITIAL_EXEC:
11779 switch (aarch64_cmodel)
11781 case AARCH64_CMODEL_TINY:
11782 case AARCH64_CMODEL_TINY_PIC:
11783 return SYMBOL_TINY_TLSIE;
11784 default:
11785 return SYMBOL_SMALL_TLSIE;
11788 case TLS_MODEL_LOCAL_EXEC:
11789 if (aarch64_tls_size == 12)
11790 return SYMBOL_TLSLE12;
11791 else if (aarch64_tls_size == 24)
11792 return SYMBOL_TLSLE24;
11793 else if (aarch64_tls_size == 32)
11794 return SYMBOL_TLSLE32;
11795 else if (aarch64_tls_size == 48)
11796 return SYMBOL_TLSLE48;
11797 else
11798 gcc_unreachable ();
11800 case TLS_MODEL_EMULATED:
11801 case TLS_MODEL_NONE:
11802 return SYMBOL_FORCE_TO_MEM;
11804 default:
11805 gcc_unreachable ();
11809 /* Return the correct method for accessing X + OFFSET, where X is either
11810 a SYMBOL_REF or LABEL_REF. */
11812 enum aarch64_symbol_type
11813 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11815 if (GET_CODE (x) == LABEL_REF)
11817 switch (aarch64_cmodel)
11819 case AARCH64_CMODEL_LARGE:
11820 return SYMBOL_FORCE_TO_MEM;
11822 case AARCH64_CMODEL_TINY_PIC:
11823 case AARCH64_CMODEL_TINY:
11824 return SYMBOL_TINY_ABSOLUTE;
11826 case AARCH64_CMODEL_SMALL_SPIC:
11827 case AARCH64_CMODEL_SMALL_PIC:
11828 case AARCH64_CMODEL_SMALL:
11829 return SYMBOL_SMALL_ABSOLUTE;
11831 default:
11832 gcc_unreachable ();
11836 if (GET_CODE (x) == SYMBOL_REF)
11838 if (aarch64_tls_symbol_p (x))
11839 return aarch64_classify_tls_symbol (x);
11841 switch (aarch64_cmodel)
11843 case AARCH64_CMODEL_TINY:
11844 /* When we retrieve symbol + offset address, we have to make sure
11845 the offset does not cause overflow of the final address. But
11846 we have no way of knowing the address of the symbol at compile time
11847 so we can't accurately say if the distance between the PC and
11848 symbol + offset is outside the addressable range of +/-1M in the
11849 TINY code model. So we rely on images not being greater than
11850 1M, cap the offset at 1M, and require anything beyond 1M to
11851 be loaded using an alternative mechanism. Furthermore if the
11852 symbol is a weak reference to something that isn't known to
11853 resolve to a symbol in this module, then force to memory. */
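/* For example, under -mcmodel=tiny a reference such as sym + 0x80000
   stays within the 1M cap and is classified as SYMBOL_TINY_ABSOLUTE,
   whereas sym + 0x200000 exceeds it and is forced to memory, as is a
   weak symbol that might resolve outside this module.  */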
11854 if ((SYMBOL_REF_WEAK (x)
11855 && !aarch64_symbol_binds_local_p (x))
11856 || !IN_RANGE (offset, -1048575, 1048575))
11857 return SYMBOL_FORCE_TO_MEM;
11858 return SYMBOL_TINY_ABSOLUTE;
11860 case AARCH64_CMODEL_SMALL:
11861 /* Same reasoning as the tiny code model, but the offset cap here is
11862 4G. */
11863 if ((SYMBOL_REF_WEAK (x)
11864 && !aarch64_symbol_binds_local_p (x))
11865 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11866 HOST_WIDE_INT_C (4294967264)))
11867 return SYMBOL_FORCE_TO_MEM;
11868 return SYMBOL_SMALL_ABSOLUTE;
11870 case AARCH64_CMODEL_TINY_PIC:
11871 if (!aarch64_symbol_binds_local_p (x))
11872 return SYMBOL_TINY_GOT;
11873 return SYMBOL_TINY_ABSOLUTE;
11875 case AARCH64_CMODEL_SMALL_SPIC:
11876 case AARCH64_CMODEL_SMALL_PIC:
11877 if (!aarch64_symbol_binds_local_p (x))
11878 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11879 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11880 return SYMBOL_SMALL_ABSOLUTE;
11882 case AARCH64_CMODEL_LARGE:
11883 /* This is alright even in PIC code as the constant
11884 pool reference is always PC relative and within
11885 the same translation unit. */
11886 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11887 return SYMBOL_SMALL_ABSOLUTE;
11888 else
11889 return SYMBOL_FORCE_TO_MEM;
11891 default:
11892 gcc_unreachable ();
11896 /* By default push everything into the constant pool. */
11897 return SYMBOL_FORCE_TO_MEM;
11900 bool
11901 aarch64_constant_address_p (rtx x)
11903 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11906 bool
11907 aarch64_legitimate_pic_operand_p (rtx x)
11909 if (GET_CODE (x) == SYMBOL_REF
11910 || (GET_CODE (x) == CONST
11911 && GET_CODE (XEXP (x, 0)) == PLUS
11912 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11913 return false;
11915 return true;
11918 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11919 that should be rematerialized rather than spilled. */
11921 static bool
11922 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11924 /* Support CSE and rematerialization of common constants. */
11925 if (CONST_INT_P (x)
11926 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11927 || GET_CODE (x) == CONST_VECTOR)
11928 return true;
11930 /* Do not allow vector struct mode constants for Advanced SIMD.
11931 We could support 0 and -1 easily, but they need support in
11932 aarch64-simd.md. */
11933 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11934 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11935 return false;
11937 /* Only accept variable-length vector constants if they can be
11938 handled directly.
11940 ??? It would be possible to handle rematerialization of other
11941 constants via secondary reloads. */
11942 if (vec_flags & VEC_ANY_SVE)
11943 return aarch64_simd_valid_immediate (x, NULL);
11945 if (GET_CODE (x) == HIGH)
11946 x = XEXP (x, 0);
11948 /* Accept polynomial constants that can be calculated by using the
11949 destination of a move as the sole temporary. Constants that
11950 require a second temporary cannot be rematerialized (they can't be
11951 forced to memory and also aren't legitimate constants). */
11952 poly_int64 offset;
11953 if (poly_int_rtx_p (x, &offset))
11954 return aarch64_offset_temporaries (false, offset) <= 1;
11956 /* If an offset is being added to something else, we need to allow the
11957 base to be moved into the destination register, meaning that there
11958 are no free temporaries for the offset. */
11959 x = strip_offset (x, &offset);
11960 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11961 return false;
11963 /* Do not allow const (plus (anchor_symbol, const_int)). */
11964 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11965 return false;
11967 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11968 so spilling them is better than rematerialization. */
11969 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11970 return true;
11972 /* Label references are always constant. */
11973 if (GET_CODE (x) == LABEL_REF)
11974 return true;
11976 return false;
static rtx
11980 aarch64_load_tp (rtx target)
11982 if (!target
11983 || GET_MODE (target) != Pmode
11984 || !register_operand (target, Pmode))
11985 target = gen_reg_rtx (Pmode);
11987 /* Can return in any reg. */
11988 emit_insn (gen_aarch64_load_tp_hard (target));
11989 return target;
11992 /* On AAPCS systems, this is the "struct __va_list". */
11993 static GTY(()) tree va_list_type;
11995 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11996 Return the type to use as __builtin_va_list.
11998 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
12000 struct __va_list
12002 void *__stack;
12003 void *__gr_top;
12004 void *__vr_top;
12005 int __gr_offs;
12006 int __vr_offs;
12007 }; */
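/* Roughly, a va_arg of an integer-class type T is then resolved as
   (simplified sketch; alignment and big-endian adjustments omitted):

     int offs = ap->__gr_offs;
     if (offs >= 0)
       goto on_stack;                  // GP save area already exhausted
     ap->__gr_offs = offs + round_up (sizeof (T), 8);
     if (ap->__gr_offs > 0)
       goto on_stack;                  // T does not fit in the save area
     result = *(T *) (ap->__gr_top + offs);

   The __vr_* fields play the same role for FP/SIMD arguments, using
   16-byte slots.  aarch64_gimplify_va_arg_expr below builds the
   corresponding trees.  */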
12009 static tree
12010 aarch64_build_builtin_va_list (void)
12012 tree va_list_name;
12013 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12015 /* Create the type. */
12016 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
12017 /* Give it the required name. */
12018 va_list_name = build_decl (BUILTINS_LOCATION,
12019 TYPE_DECL,
12020 get_identifier ("__va_list"),
12021 va_list_type);
12022 DECL_ARTIFICIAL (va_list_name) = 1;
12023 TYPE_NAME (va_list_type) = va_list_name;
12024 TYPE_STUB_DECL (va_list_type) = va_list_name;
12026 /* Create the fields. */
12027 f_stack = build_decl (BUILTINS_LOCATION,
12028 FIELD_DECL, get_identifier ("__stack"),
12029 ptr_type_node);
12030 f_grtop = build_decl (BUILTINS_LOCATION,
12031 FIELD_DECL, get_identifier ("__gr_top"),
12032 ptr_type_node);
12033 f_vrtop = build_decl (BUILTINS_LOCATION,
12034 FIELD_DECL, get_identifier ("__vr_top"),
12035 ptr_type_node);
12036 f_groff = build_decl (BUILTINS_LOCATION,
12037 FIELD_DECL, get_identifier ("__gr_offs"),
12038 integer_type_node);
12039 f_vroff = build_decl (BUILTINS_LOCATION,
12040 FIELD_DECL, get_identifier ("__vr_offs"),
12041 integer_type_node);
12043 /* Tell tree-stdarg pass about our internal offset fields.
12044 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12045 purposes, to identify whether the code is updating the va_list internal
12046 offset fields in an irregular way. */
12047 va_list_gpr_counter_field = f_groff;
12048 va_list_fpr_counter_field = f_vroff;
12050 DECL_ARTIFICIAL (f_stack) = 1;
12051 DECL_ARTIFICIAL (f_grtop) = 1;
12052 DECL_ARTIFICIAL (f_vrtop) = 1;
12053 DECL_ARTIFICIAL (f_groff) = 1;
12054 DECL_ARTIFICIAL (f_vroff) = 1;
12056 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12057 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12058 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12059 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12060 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12062 TYPE_FIELDS (va_list_type) = f_stack;
12063 DECL_CHAIN (f_stack) = f_grtop;
12064 DECL_CHAIN (f_grtop) = f_vrtop;
12065 DECL_CHAIN (f_vrtop) = f_groff;
12066 DECL_CHAIN (f_groff) = f_vroff;
12068 /* Compute its layout. */
12069 layout_type (va_list_type);
12071 return va_list_type;
12074 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12075 static void
12076 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12078 const CUMULATIVE_ARGS *cum;
12079 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12080 tree stack, grtop, vrtop, groff, vroff;
12081 tree t;
12082 int gr_save_area_size = cfun->va_list_gpr_size;
12083 int vr_save_area_size = cfun->va_list_fpr_size;
12084 int vr_offset;
12086 cum = &crtl->args.info;
12087 if (cfun->va_list_gpr_size)
12088 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12089 cfun->va_list_gpr_size);
12090 if (cfun->va_list_fpr_size)
12091 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12092 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12094 if (!TARGET_FLOAT)
12096 gcc_assert (cum->aapcs_nvrn == 0);
12097 vr_save_area_size = 0;
12100 f_stack = TYPE_FIELDS (va_list_type_node);
12101 f_grtop = DECL_CHAIN (f_stack);
12102 f_vrtop = DECL_CHAIN (f_grtop);
12103 f_groff = DECL_CHAIN (f_vrtop);
12104 f_vroff = DECL_CHAIN (f_groff);
12106 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12107 NULL_TREE);
12108 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12109 NULL_TREE);
12110 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12111 NULL_TREE);
12112 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12113 NULL_TREE);
12114 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12115 NULL_TREE);
12117 /* Emit code to initialize STACK, which points to the next varargs stack
12118 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12119 by named arguments. STACK is 8-byte aligned. */
12120 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12121 if (cum->aapcs_stack_size > 0)
12122 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12123 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12124 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12126 /* Emit code to initialize GRTOP, the top of the GR save area.
12127 virtual_incoming_args_rtx should have been 16 byte aligned. */
12128 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12129 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12130 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12132 /* Emit code to initialize VRTOP, the top of the VR save area.
12133 This address is gr_save_area_bytes below GRTOP, rounded
12134 down to the next 16-byte boundary. */
12135 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12136 vr_offset = ROUND_UP (gr_save_area_size,
12137 STACK_BOUNDARY / BITS_PER_UNIT);
12139 if (vr_offset)
12140 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12141 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12142 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12144 /* Emit code to initialize GROFF, the offset from GRTOP of the
12145 next GPR argument. */
12146 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12147 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12148 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12150 /* Likewise emit code to initialize VROFF, the offset from FTOP
12151 of the next VR argument. */
12152 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12153 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12154 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
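/* As a concrete illustration, for "void f (int n, ...)" with only the
   named argument passed in registers, va_start ends up with roughly:

     __stack   = address of the first stack-passed vararg
     __gr_top  = top of the 56-byte GP save area (x1..x7)
     __vr_top  = top of the 128-byte FP/SIMD save area (q0..q7)
     __gr_offs = -56
     __vr_offs = -128

   The exact sizes depend on how many argument registers the named
   parameters consumed, on TARGET_FLOAT and on the tree-stdarg limits
   in cfun->va_list_gpr_size/va_list_fpr_size.  */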
12157 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12159 static tree
12160 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12161 gimple_seq *post_p ATTRIBUTE_UNUSED)
12163 tree addr;
12164 bool indirect_p;
12165 bool is_ha; /* is HFA or HVA. */
12166 bool dw_align; /* double-word align. */
12167 machine_mode ag_mode = VOIDmode;
12168 int nregs;
12169 machine_mode mode;
12171 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12172 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12173 HOST_WIDE_INT size, rsize, adjust, align;
12174 tree t, u, cond1, cond2;
12176 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12177 if (indirect_p)
12178 type = build_pointer_type (type);
12180 mode = TYPE_MODE (type);
12182 f_stack = TYPE_FIELDS (va_list_type_node);
12183 f_grtop = DECL_CHAIN (f_stack);
12184 f_vrtop = DECL_CHAIN (f_grtop);
12185 f_groff = DECL_CHAIN (f_vrtop);
12186 f_vroff = DECL_CHAIN (f_groff);
12188 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12189 f_stack, NULL_TREE);
12190 size = int_size_in_bytes (type);
12191 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12193 dw_align = false;
12194 adjust = 0;
12195 if (aarch64_vfp_is_call_or_return_candidate (mode,
12196 type,
12197 &ag_mode,
12198 &nregs,
12199 &is_ha))
12201 /* No frontends can create types with variable-sized modes, so we
12202 shouldn't be asked to pass or return them. */
12203 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12205 /* TYPE passed in fp/simd registers. */
12206 if (!TARGET_FLOAT)
12207 aarch64_err_no_fpadvsimd (mode, "varargs");
12209 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12210 unshare_expr (valist), f_vrtop, NULL_TREE);
12211 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12212 unshare_expr (valist), f_vroff, NULL_TREE);
12214 rsize = nregs * UNITS_PER_VREG;
12216 if (is_ha)
12218 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12219 adjust = UNITS_PER_VREG - ag_size;
12221 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12222 && size < UNITS_PER_VREG)
12224 adjust = UNITS_PER_VREG - size;
12227 else
12229 /* TYPE passed in general registers. */
12230 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12231 unshare_expr (valist), f_grtop, NULL_TREE);
12232 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12233 unshare_expr (valist), f_groff, NULL_TREE);
12234 rsize = ROUND_UP (size, UNITS_PER_WORD);
12235 nregs = rsize / UNITS_PER_WORD;
12237 if (align > 8)
12238 dw_align = true;
12240 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12241 && size < UNITS_PER_WORD)
12243 adjust = UNITS_PER_WORD - size;
12247 /* Get a local temporary for the field value. */
12248 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12250 /* Emit code to branch if off >= 0. */
12251 t = build2 (GE_EXPR, boolean_type_node, off,
12252 build_int_cst (TREE_TYPE (off), 0));
12253 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12255 if (dw_align)
12257 /* Emit: offs = (offs + 15) & -16. */
12258 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12259 build_int_cst (TREE_TYPE (off), 15));
12260 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12261 build_int_cst (TREE_TYPE (off), -16));
12262 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12264 else
12265 roundup = NULL;
12267 /* Update ap.__[g|v]r_offs */
12268 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12269 build_int_cst (TREE_TYPE (off), rsize));
12270 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12272 /* String up. */
12273 if (roundup)
12274 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12276 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12277 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12278 build_int_cst (TREE_TYPE (f_off), 0));
12279 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12281 /* String up: make sure the assignment happens before the use. */
12282 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12283 COND_EXPR_ELSE (cond1) = t;
12285 /* Prepare the trees handling the argument that is passed on the stack;
12286 the top-level node will be stored in ON_STACK. */
12287 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12288 if (align > 8)
12290 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12291 t = fold_build_pointer_plus_hwi (arg, 15);
12292 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12293 build_int_cst (TREE_TYPE (t), -16));
12294 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12296 else
12297 roundup = NULL;
12298 /* Advance ap.__stack */
12299 t = fold_build_pointer_plus_hwi (arg, size + 7);
12300 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12301 build_int_cst (TREE_TYPE (t), -8));
12302 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12303 /* String up roundup and advance. */
12304 if (roundup)
12305 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12306 /* String up with arg */
12307 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12308 /* Big-endianness related address adjustment. */
12309 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12310 && size < UNITS_PER_WORD)
12312 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12313 size_int (UNITS_PER_WORD - size));
12314 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12317 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12318 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12320 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12321 t = off;
12322 if (adjust)
12323 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12324 build_int_cst (TREE_TYPE (off), adjust));
12326 t = fold_convert (sizetype, t);
12327 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12329 if (is_ha)
12331 /* type ha; // treat as "struct {ftype field[n];}"
12332 ... [computing offs]
12333 for (i = 0; i < nregs; ++i, offs += 16)
12334 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12335 return ha; */
12336 int i;
12337 tree tmp_ha, field_t, field_ptr_t;
12339 /* Declare a local variable. */
12340 tmp_ha = create_tmp_var_raw (type, "ha");
12341 gimple_add_tmp_var (tmp_ha);
12343 /* Establish the base type. */
12344 switch (ag_mode)
12346 case E_SFmode:
12347 field_t = float_type_node;
12348 field_ptr_t = float_ptr_type_node;
12349 break;
12350 case E_DFmode:
12351 field_t = double_type_node;
12352 field_ptr_t = double_ptr_type_node;
12353 break;
12354 case E_TFmode:
12355 field_t = long_double_type_node;
12356 field_ptr_t = long_double_ptr_type_node;
12357 break;
12358 case E_HFmode:
12359 field_t = aarch64_fp16_type_node;
12360 field_ptr_t = aarch64_fp16_ptr_type_node;
12361 break;
12362 case E_V2SImode:
12363 case E_V4SImode:
12365 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12366 field_t = build_vector_type_for_mode (innertype, ag_mode);
12367 field_ptr_t = build_pointer_type (field_t);
12369 break;
12370 default:
12371 gcc_assert (0);
12374 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area */
12375 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12376 addr = t;
12377 t = fold_convert (field_ptr_t, addr);
12378 t = build2 (MODIFY_EXPR, field_t,
12379 build1 (INDIRECT_REF, field_t, tmp_ha),
12380 build1 (INDIRECT_REF, field_t, t));
12382 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12383 for (i = 1; i < nregs; ++i)
12385 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12386 u = fold_convert (field_ptr_t, addr);
12387 u = build2 (MODIFY_EXPR, field_t,
12388 build2 (MEM_REF, field_t, tmp_ha,
12389 build_int_cst (field_ptr_t,
12390 (i *
12391 int_size_in_bytes (field_t)))),
12392 build1 (INDIRECT_REF, field_t, u));
12393 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12396 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12397 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12400 COND_EXPR_ELSE (cond2) = t;
12401 addr = fold_convert (build_pointer_type (type), cond1);
12402 addr = build_va_arg_indirect_ref (addr);
12404 if (indirect_p)
12405 addr = build_va_arg_indirect_ref (addr);
12407 return addr;
12410 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12412 static void
12413 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12414 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12415 int no_rtl)
12417 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12418 CUMULATIVE_ARGS local_cum;
12419 int gr_saved = cfun->va_list_gpr_size;
12420 int vr_saved = cfun->va_list_fpr_size;
12422 /* The caller has advanced CUM up to, but not beyond, the last named
12423 argument. Advance a local copy of CUM past the last "real" named
12424 argument, to find out how many registers are left over. */
12425 local_cum = *cum;
12426 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
12428 /* Find out how many registers we need to save.
12429 Honor tree-stdarg analysis results. */
12430 if (cfun->va_list_gpr_size)
12431 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12432 cfun->va_list_gpr_size / UNITS_PER_WORD);
12433 if (cfun->va_list_fpr_size)
12434 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12435 cfun->va_list_fpr_size / UNITS_PER_VREG);
12437 if (!TARGET_FLOAT)
12439 gcc_assert (local_cum.aapcs_nvrn == 0);
12440 vr_saved = 0;
12443 if (!no_rtl)
12445 if (gr_saved > 0)
12447 rtx ptr, mem;
12449 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12450 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12451 - gr_saved * UNITS_PER_WORD);
12452 mem = gen_frame_mem (BLKmode, ptr);
12453 set_mem_alias_set (mem, get_varargs_alias_set ());
12455 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12456 mem, gr_saved);
12458 if (vr_saved > 0)
12460 /* We can't use move_block_from_reg, because it will use
12461 the wrong mode, storing D regs only. */
12462 machine_mode mode = TImode;
12463 int off, i, vr_start;
12465 /* Set OFF to the offset from virtual_incoming_args_rtx of
12466 the first vector register. The VR save area lies below
12467 the GR one, and is aligned to 16 bytes. */
12468 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12469 STACK_BOUNDARY / BITS_PER_UNIT);
12470 off -= vr_saved * UNITS_PER_VREG;
12472 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12473 for (i = 0; i < vr_saved; ++i)
12475 rtx ptr, mem;
12477 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12478 mem = gen_frame_mem (mode, ptr);
12479 set_mem_alias_set (mem, get_varargs_alias_set ());
12480 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12481 off += UNITS_PER_VREG;
12486 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12487 any complication of having crtl->args.pretend_args_size changed. */
12488 cfun->machine->frame.saved_varargs_size
12489 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12490 STACK_BOUNDARY / BITS_PER_UNIT)
12491 + vr_saved * UNITS_PER_VREG);
12494 static void
12495 aarch64_conditional_register_usage (void)
12497 int i;
12498 if (!TARGET_FLOAT)
12500 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12502 fixed_regs[i] = 1;
12503 call_used_regs[i] = 1;
12506 if (!TARGET_SVE)
12507 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12509 fixed_regs[i] = 1;
12510 call_used_regs[i] = 1;
12514 /* Walk down the type tree of TYPE counting consecutive base elements.
12515 If *MODEP is VOIDmode, then set it to the first valid floating point
12516 type. If a non-floating point type is found, or if a floating point
12517 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12518 otherwise return the count in the sub-tree. */
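/* For example:
     struct { float x, y, z; }          -> 3, *MODEP == SFmode
     struct { double r; double i[2]; }  -> 3, *MODEP == DFmode
     _Complex double                    -> 2, *MODEP == DFmode
     struct { float f; double d; }      -> -1 (mixed base types)  */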
12519 static int
12520 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12522 machine_mode mode;
12523 HOST_WIDE_INT size;
12525 switch (TREE_CODE (type))
12527 case REAL_TYPE:
12528 mode = TYPE_MODE (type);
12529 if (mode != DFmode && mode != SFmode
12530 && mode != TFmode && mode != HFmode)
12531 return -1;
12533 if (*modep == VOIDmode)
12534 *modep = mode;
12536 if (*modep == mode)
12537 return 1;
12539 break;
12541 case COMPLEX_TYPE:
12542 mode = TYPE_MODE (TREE_TYPE (type));
12543 if (mode != DFmode && mode != SFmode
12544 && mode != TFmode && mode != HFmode)
12545 return -1;
12547 if (*modep == VOIDmode)
12548 *modep = mode;
12550 if (*modep == mode)
12551 return 2;
12553 break;
12555 case VECTOR_TYPE:
12556 /* Use V2SImode and V4SImode as representatives of all 64-bit
12557 and 128-bit vector types. */
12558 size = int_size_in_bytes (type);
12559 switch (size)
12561 case 8:
12562 mode = V2SImode;
12563 break;
12564 case 16:
12565 mode = V4SImode;
12566 break;
12567 default:
12568 return -1;
12571 if (*modep == VOIDmode)
12572 *modep = mode;
12574 /* Vector modes are considered to be opaque: two vectors are
12575 equivalent for the purposes of being homogeneous aggregates
12576 if they are the same size. */
12577 if (*modep == mode)
12578 return 1;
12580 break;
12582 case ARRAY_TYPE:
12584 int count;
12585 tree index = TYPE_DOMAIN (type);
12587 /* Can't handle incomplete types nor sizes that are not
12588 fixed. */
12589 if (!COMPLETE_TYPE_P (type)
12590 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12591 return -1;
12593 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12594 if (count == -1
12595 || !index
12596 || !TYPE_MAX_VALUE (index)
12597 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12598 || !TYPE_MIN_VALUE (index)
12599 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12600 || count < 0)
12601 return -1;
12603 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12604 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12606 /* There must be no padding. */
12607 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12608 count * GET_MODE_BITSIZE (*modep)))
12609 return -1;
12611 return count;
12614 case RECORD_TYPE:
12616 int count = 0;
12617 int sub_count;
12618 tree field;
12620 /* Can't handle incomplete types nor sizes that are not
12621 fixed. */
12622 if (!COMPLETE_TYPE_P (type)
12623 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12624 return -1;
12626 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12628 if (TREE_CODE (field) != FIELD_DECL)
12629 continue;
12631 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12632 if (sub_count < 0)
12633 return -1;
12634 count += sub_count;
12637 /* There must be no padding. */
12638 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12639 count * GET_MODE_BITSIZE (*modep)))
12640 return -1;
12642 return count;
12645 case UNION_TYPE:
12646 case QUAL_UNION_TYPE:
12648 /* These aren't very interesting except in a degenerate case. */
12649 int count = 0;
12650 int sub_count;
12651 tree field;
12653 /* Can't handle incomplete types nor sizes that are not
12654 fixed. */
12655 if (!COMPLETE_TYPE_P (type)
12656 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12657 return -1;
12659 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12661 if (TREE_CODE (field) != FIELD_DECL)
12662 continue;
12664 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12665 if (sub_count < 0)
12666 return -1;
12667 count = count > sub_count ? count : sub_count;
12670 /* There must be no padding. */
12671 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12672 count * GET_MODE_BITSIZE (*modep)))
12673 return -1;
12675 return count;
12678 default:
12679 break;
12682 return -1;
12685 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12686 type as described in AAPCS64 \S 4.1.2.
12688 See the comment above aarch64_composite_type_p for the notes on MODE. */
12690 static bool
12691 aarch64_short_vector_p (const_tree type,
12692 machine_mode mode)
12694 poly_int64 size = -1;
12696 if (type && TREE_CODE (type) == VECTOR_TYPE)
12697 size = int_size_in_bytes (type);
12698 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12699 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12700 size = GET_MODE_SIZE (mode);
12702 return known_eq (size, 8) || known_eq (size, 16);
12705 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12706 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12707 array types. The C99 floating-point complex types are also considered
12708 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12709 types, which are GCC extensions and out of the scope of AAPCS64, are
12710 treated as composite types here as well.
12712 Note that MODE itself is not sufficient in determining whether a type
12713 is such a composite type or not. This is because
12714 stor-layout.c:compute_record_mode may have already changed the MODE
12715 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12716 structure with only one field may have its MODE set to the mode of the
12717 field. Also an integer mode whose size matches the size of the
12718 RECORD_TYPE type may be used to substitute the original mode
12719 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12720 solely relied on. */
12722 static bool
12723 aarch64_composite_type_p (const_tree type,
12724 machine_mode mode)
12726 if (aarch64_short_vector_p (type, mode))
12727 return false;
12729 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12730 return true;
12732 if (mode == BLKmode
12733 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12734 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12735 return true;
12737 return false;
12740 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12741 shall be passed or returned in simd/fp register(s) (providing these
12742 parameter passing registers are available).
12744 Upon successful return, *COUNT returns the number of needed registers,
12745 *BASE_MODE returns the mode of the individual register and when IS_HA
12746 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12747 floating-point aggregate or a homogeneous short-vector aggregate. */
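/* Examples (illustrative):
     double                        -> *COUNT 1, *BASE_MODE DFmode
     _Complex float                -> *COUNT 2, *BASE_MODE SFmode, *IS_HA true
     struct { float32x4_t v[2]; }  -> *COUNT 2, *BASE_MODE V4SImode, *IS_HA true
     struct { double d; int i; }   -> not a candidate (returns false)  */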
12749 static bool
12750 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12751 const_tree type,
12752 machine_mode *base_mode,
12753 int *count,
12754 bool *is_ha)
12756 machine_mode new_mode = VOIDmode;
12757 bool composite_p = aarch64_composite_type_p (type, mode);
12759 if (is_ha != NULL) *is_ha = false;
12761 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12762 || aarch64_short_vector_p (type, mode))
12764 *count = 1;
12765 new_mode = mode;
12767 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12769 if (is_ha != NULL) *is_ha = true;
12770 *count = 2;
12771 new_mode = GET_MODE_INNER (mode);
12773 else if (type && composite_p)
12775 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12777 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12779 if (is_ha != NULL) *is_ha = true;
12780 *count = ag_count;
12782 else
12783 return false;
12785 else
12786 return false;
12788 *base_mode = new_mode;
12789 return true;
12792 /* Implement TARGET_STRUCT_VALUE_RTX. */
12794 static rtx
12795 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12796 int incoming ATTRIBUTE_UNUSED)
12798 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12801 /* Implements target hook vector_mode_supported_p. */
12802 static bool
12803 aarch64_vector_mode_supported_p (machine_mode mode)
12805 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12806 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12809 /* Return appropriate SIMD container
12810 for MODE within a vector of WIDTH bits. */
12811 static machine_mode
12812 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12814 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12815 switch (mode)
12817 case E_DFmode:
12818 return VNx2DFmode;
12819 case E_SFmode:
12820 return VNx4SFmode;
12821 case E_HFmode:
12822 return VNx8HFmode;
12823 case E_DImode:
12824 return VNx2DImode;
12825 case E_SImode:
12826 return VNx4SImode;
12827 case E_HImode:
12828 return VNx8HImode;
12829 case E_QImode:
12830 return VNx16QImode;
12831 default:
12832 return word_mode;
12835 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12836 if (TARGET_SIMD)
12838 if (known_eq (width, 128))
12839 switch (mode)
12841 case E_DFmode:
12842 return V2DFmode;
12843 case E_SFmode:
12844 return V4SFmode;
12845 case E_HFmode:
12846 return V8HFmode;
12847 case E_SImode:
12848 return V4SImode;
12849 case E_HImode:
12850 return V8HImode;
12851 case E_QImode:
12852 return V16QImode;
12853 case E_DImode:
12854 return V2DImode;
12855 default:
12856 break;
12858 else
12859 switch (mode)
12861 case E_SFmode:
12862 return V2SFmode;
12863 case E_HFmode:
12864 return V4HFmode;
12865 case E_SImode:
12866 return V2SImode;
12867 case E_HImode:
12868 return V4HImode;
12869 case E_QImode:
12870 return V8QImode;
12871 default:
12872 break;
12875 return word_mode;
12878 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12879 static machine_mode
12880 aarch64_preferred_simd_mode (scalar_mode mode)
12882 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12883 return aarch64_simd_container_mode (mode, bits);
12886 /* Return a list of possible vector sizes for the vectorizer
12887 to iterate over. */
12888 static void
12889 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12891 if (TARGET_SVE)
12892 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12893 sizes->safe_push (16);
12894 sizes->safe_push (8);
12897 /* Implement TARGET_MANGLE_TYPE. */
12899 static const char *
12900 aarch64_mangle_type (const_tree type)
12902 /* The AArch64 ABI documents say that "__va_list" has to be
12903 mangled as if it is in the "std" namespace. */
12904 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12905 return "St9__va_list";
12907 /* Half-precision float. */
12908 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12909 return "Dh";
12911 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12912 builtin types. */
12913 if (TYPE_NAME (type) != NULL)
12914 return aarch64_mangle_builtin_type (type);
12916 /* Use the default mangling. */
12917 return NULL;
12920 /* Find the first rtx_insn before insn that will generate an assembly
12921 instruction. */
12923 static rtx_insn *
12924 aarch64_prev_real_insn (rtx_insn *insn)
12926 if (!insn)
12927 return NULL;
12931 insn = prev_real_insn (insn);
12933 while (insn && recog_memoized (insn) < 0);
12935 return insn;
12938 static bool
12939 is_madd_op (enum attr_type t1)
12941 unsigned int i;
12942 /* A number of these may be AArch32 only. */
12943 enum attr_type mlatypes[] = {
12944 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12945 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12946 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12949 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12951 if (t1 == mlatypes[i])
12952 return true;
12955 return false;
12958 /* Check if there is a register dependency between a load and the insn
12959 for which we hold recog_data. */
12961 static bool
12962 dep_between_memop_and_curr (rtx memop)
12964 rtx load_reg;
12965 int opno;
12967 gcc_assert (GET_CODE (memop) == SET);
12969 if (!REG_P (SET_DEST (memop)))
12970 return false;
12972 load_reg = SET_DEST (memop);
12973 for (opno = 1; opno < recog_data.n_operands; opno++)
12975 rtx operand = recog_data.operand[opno];
12976 if (REG_P (operand)
12977 && reg_overlap_mentioned_p (load_reg, operand))
12978 return true;
12981 return false;
12985 /* When working around the Cortex-A53 erratum 835769,
12986 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12987 instruction and has a preceding memory instruction such that a NOP
12988 should be inserted between them. */
12990 bool
12991 aarch64_madd_needs_nop (rtx_insn* insn)
12993 enum attr_type attr_type;
12994 rtx_insn *prev;
12995 rtx body;
12997 if (!TARGET_FIX_ERR_A53_835769)
12998 return false;
13000 if (!INSN_P (insn) || recog_memoized (insn) < 0)
13001 return false;
13003 attr_type = get_attr_type (insn);
13004 if (!is_madd_op (attr_type))
13005 return false;
13007 prev = aarch64_prev_real_insn (insn);
13008 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
13009 Restore recog state to INSN to avoid state corruption. */
13010 extract_constrain_insn_cached (insn);
13012 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
13013 return false;
13015 body = single_set (prev);
13017 /* If the previous insn is a memory op and there is no dependency between
13018 it and the DImode madd, emit a NOP between them. If body is NULL then we
13019 have a complex memory operation, probably a load/store pair.
13020 Be conservative for now and emit a NOP. */
13021 if (GET_MODE (recog_data.operand[0]) == DImode
13022 && (!body || !dep_between_memop_and_curr (body)))
13023 return true;
13025 return false;
13030 /* Implement FINAL_PRESCAN_INSN. */
13032 void
13033 aarch64_final_prescan_insn (rtx_insn *insn)
13035 if (aarch64_madd_needs_nop (insn))
13036 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
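/* With -mfix-cortex-a53-835769 the emitted assembly therefore looks
   roughly like:

       ldr     x1, [x2]
       nop     // between mem op and mult-accumulate
       madd    x0, x3, x4, x0

   so that the 64-bit multiply-accumulate never directly follows the
   memory operation.  */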
13040 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13041 instruction. */
13043 bool
13044 aarch64_sve_index_immediate_p (rtx base_or_step)
13046 return (CONST_INT_P (base_or_step)
13047 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13050 /* Return true if X is a valid immediate for the SVE ADD and SUB
13051 instructions. Negate X first if NEGATE_P is true. */
13053 bool
13054 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13056 rtx elt;
13058 if (!const_vec_duplicate_p (x, &elt)
13059 || !CONST_INT_P (elt))
13060 return false;
13062 HOST_WIDE_INT val = INTVAL (elt);
13063 if (negate_p)
13064 val = -val;
13065 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13067 if (val & 0xff)
13068 return IN_RANGE (val, 0, 0xff);
13069 return IN_RANGE (val, 0, 0xff00);
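/* That is, splatted constants 0..255 and multiples of 256 up to 65280
   are accepted, matching the instruction's 8-bit immediate with an
   optional LSL #8.  */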
13072 /* Return true if X is a valid immediate operand for an SVE logical
13073 instruction such as AND. */
13075 bool
13076 aarch64_sve_bitmask_immediate_p (rtx x)
13078 rtx elt;
13080 return (const_vec_duplicate_p (x, &elt)
13081 && CONST_INT_P (elt)
13082 && aarch64_bitmask_imm (INTVAL (elt),
13083 GET_MODE_INNER (GET_MODE (x))));
13086 /* Return true if X is a valid immediate for the SVE DUP and CPY
13087 instructions. */
13089 bool
13090 aarch64_sve_dup_immediate_p (rtx x)
13092 rtx elt;
13094 if (!const_vec_duplicate_p (x, &elt)
13095 || !CONST_INT_P (elt))
13096 return false;
13098 HOST_WIDE_INT val = INTVAL (elt);
13099 if (val & 0xff)
13100 return IN_RANGE (val, -0x80, 0x7f);
13101 return IN_RANGE (val, -0x8000, 0x7f00);
13104 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13105 SIGNED_P says whether the operand is signed rather than unsigned. */
13107 bool
13108 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13110 rtx elt;
13112 return (const_vec_duplicate_p (x, &elt)
13113 && CONST_INT_P (elt)
13114 && (signed_p
13115 ? IN_RANGE (INTVAL (elt), -16, 15)
13116 : IN_RANGE (INTVAL (elt), 0, 127)));
13119 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13120 instruction. Negate X first if NEGATE_P is true. */
13122 bool
13123 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13125 rtx elt;
13126 REAL_VALUE_TYPE r;
13128 if (!const_vec_duplicate_p (x, &elt)
13129 || GET_CODE (elt) != CONST_DOUBLE)
13130 return false;
13132 r = *CONST_DOUBLE_REAL_VALUE (elt);
13134 if (negate_p)
13135 r = real_value_negate (&r);
13137 if (real_equal (&r, &dconst1))
13138 return true;
13139 if (real_equal (&r, &dconsthalf))
13140 return true;
13141 return false;
13144 /* Return true if X is a valid immediate operand for an SVE FMUL
13145 instruction. */
13147 bool
13148 aarch64_sve_float_mul_immediate_p (rtx x)
13150 rtx elt;
13152 /* GCC will never generate a multiply with an immediate of 2, so there is no
13153 point testing for it (even though it is a valid constant). */
13154 return (const_vec_duplicate_p (x, &elt)
13155 && GET_CODE (elt) == CONST_DOUBLE
13156 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13159 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13160 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13161 is nonnull, use it to describe valid immediates. */
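/* E.g. replicating 0x00004500 is matched as the SImode value 0x45 with
   LSL #8, while 0x000045ff (low bits all ones) is matched as 0x45 with
   MSL #8 when a MOV-class immediate is being checked.  */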
13162 static bool
13163 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13164 simd_immediate_info *info,
13165 enum simd_immediate_check which,
13166 simd_immediate_info::insn_type insn)
13168 /* Try a 4-byte immediate with LSL. */
13169 for (unsigned int shift = 0; shift < 32; shift += 8)
13170 if ((val32 & (0xff << shift)) == val32)
13172 if (info)
13173 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13174 simd_immediate_info::LSL, shift);
13175 return true;
13178 /* Try a 2-byte immediate with LSL. */
13179 unsigned int imm16 = val32 & 0xffff;
13180 if (imm16 == (val32 >> 16))
13181 for (unsigned int shift = 0; shift < 16; shift += 8)
13182 if ((imm16 & (0xff << shift)) == imm16)
13184 if (info)
13185 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13186 simd_immediate_info::LSL, shift);
13187 return true;
13190 /* Try a 4-byte immediate with MSL, except for cases that MVN
13191 can handle. */
13192 if (which == AARCH64_CHECK_MOV)
13193 for (unsigned int shift = 8; shift < 24; shift += 8)
13195 unsigned int low = (1 << shift) - 1;
13196 if (((val32 & (0xff << shift)) | low) == val32)
13198 if (info)
13199 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13200 simd_immediate_info::MSL, shift);
13201 return true;
13205 return false;
13208 /* Return true if replicating VAL64 is a valid immediate for the
13209 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13210 use it to describe valid immediates. */
13211 static bool
13212 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13213 simd_immediate_info *info,
13214 enum simd_immediate_check which)
13216 unsigned int val32 = val64 & 0xffffffff;
13217 unsigned int val16 = val64 & 0xffff;
13218 unsigned int val8 = val64 & 0xff;
13220 if (val32 == (val64 >> 32))
13222 if ((which & AARCH64_CHECK_ORR) != 0
13223 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13224 simd_immediate_info::MOV))
13225 return true;
13227 if ((which & AARCH64_CHECK_BIC) != 0
13228 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13229 simd_immediate_info::MVN))
13230 return true;
13232 /* Try using a replicated byte. */
13233 if (which == AARCH64_CHECK_MOV
13234 && val16 == (val32 >> 16)
13235 && val8 == (val16 >> 8))
13237 if (info)
13238 *info = simd_immediate_info (QImode, val8);
13239 return true;
13243 /* Try using a bit-to-bytemask. */
13244 if (which == AARCH64_CHECK_MOV)
13246 unsigned int i;
13247 for (i = 0; i < 64; i += 8)
13249 unsigned char byte = (val64 >> i) & 0xff;
13250 if (byte != 0 && byte != 0xff)
13251 break;
13253 if (i == 64)
13255 if (info)
13256 *info = simd_immediate_info (DImode, val64);
13257 return true;
13260 return false;
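/* The bit-to-bytemask case above accepts 64-bit patterns whose bytes
   are each either 0x00 or 0xff, e.g. 0xff00ff00ff00ff00, which map to
   the 64-bit MOVI byte-mask immediate encoding.  */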
13263 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13264 instruction. If INFO is nonnull, use it to describe valid immediates. */
13266 static bool
13267 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13268 simd_immediate_info *info)
13270 scalar_int_mode mode = DImode;
13271 unsigned int val32 = val64 & 0xffffffff;
13272 if (val32 == (val64 >> 32))
13274 mode = SImode;
13275 unsigned int val16 = val32 & 0xffff;
13276 if (val16 == (val32 >> 16))
13278 mode = HImode;
13279 unsigned int val8 = val16 & 0xff;
13280 if (val8 == (val16 >> 8))
13281 mode = QImode;
13284 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13285 if (IN_RANGE (val, -0x80, 0x7f))
13287 /* DUP with no shift. */
13288 if (info)
13289 *info = simd_immediate_info (mode, val);
13290 return true;
13292 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13294 /* DUP with LSL #8. */
13295 if (info)
13296 *info = simd_immediate_info (mode, val);
13297 return true;
13299 if (aarch64_bitmask_imm (val64, mode))
13301 /* DUPM. */
13302 if (info)
13303 *info = simd_immediate_info (mode, val);
13304 return true;
13306 return false;
13309 /* Return true if OP is a valid SIMD immediate for the operation
13310 described by WHICH. If INFO is nonnull, use it to describe valid
13311 immediates. */
13312 bool
13313 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13314 enum simd_immediate_check which)
13316 machine_mode mode = GET_MODE (op);
13317 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13318 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13319 return false;
13321 scalar_mode elt_mode = GET_MODE_INNER (mode);
13322 rtx base, step;
13323 unsigned int n_elts;
13324 if (GET_CODE (op) == CONST_VECTOR
13325 && CONST_VECTOR_DUPLICATE_P (op))
13326 n_elts = CONST_VECTOR_NPATTERNS (op);
13327 else if ((vec_flags & VEC_SVE_DATA)
13328 && const_vec_series_p (op, &base, &step))
13330 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13331 if (!aarch64_sve_index_immediate_p (base)
13332 || !aarch64_sve_index_immediate_p (step))
13333 return false;
13335 if (info)
13336 *info = simd_immediate_info (elt_mode, base, step);
13337 return true;
13339 else if (GET_CODE (op) == CONST_VECTOR
13340 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13341 /* N_ELTS set above. */;
13342 else
13343 return false;
13345 /* Handle PFALSE and PTRUE. */
13346 if (vec_flags & VEC_SVE_PRED)
13347 return (op == CONST0_RTX (mode)
13348 || op == CONSTM1_RTX (mode));
13350 scalar_float_mode elt_float_mode;
13351 if (n_elts == 1
13352 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13354 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13355 if (aarch64_float_const_zero_rtx_p (elt)
13356 || aarch64_float_const_representable_p (elt))
13358 if (info)
13359 *info = simd_immediate_info (elt_float_mode, elt);
13360 return true;
13364 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13365 if (elt_size > 8)
13366 return false;
13368 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13370 /* Expand the vector constant out into a byte vector, with the least
13371 significant byte of the register first. */
13372 auto_vec<unsigned char, 16> bytes;
13373 bytes.reserve (n_elts * elt_size);
13374 for (unsigned int i = 0; i < n_elts; i++)
13376 /* The vector is provided in gcc endian-neutral fashion.
13377 For aarch64_be Advanced SIMD, it must be laid out in the vector
13378 register in reverse order. */
13379 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13380 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13382 if (elt_mode != elt_int_mode)
13383 elt = gen_lowpart (elt_int_mode, elt);
13385 if (!CONST_INT_P (elt))
13386 return false;
13388 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13389 for (unsigned int byte = 0; byte < elt_size; byte++)
13391 bytes.quick_push (elt_val & 0xff);
13392 elt_val >>= BITS_PER_UNIT;
13396 /* The immediate must repeat every eight bytes. */
13397 unsigned int nbytes = bytes.length ();
13398 for (unsigned i = 8; i < nbytes; ++i)
13399 if (bytes[i] != bytes[i - 8])
13400 return false;
13402 /* Get the repeating 8-byte value as an integer. No endian correction
13403 is needed here because bytes is already in lsb-first order. */
13404 unsigned HOST_WIDE_INT val64 = 0;
13405 for (unsigned int i = 0; i < 8; i++)
13406 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13407 << (i * BITS_PER_UNIT));
13409 if (vec_flags & VEC_SVE_DATA)
13410 return aarch64_sve_valid_immediate (val64, info);
13411 else
13412 return aarch64_advsimd_valid_immediate (val64, info, which);
13415 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13416 has a step in the range of INDEX. Return the index expression if so,
13417 otherwise return null. */
rtx
13419 aarch64_check_zero_based_sve_index_immediate (rtx x)
13421 rtx base, step;
13422 if (const_vec_series_p (x, &base, &step)
13423 && base == const0_rtx
13424 && aarch64_sve_index_immediate_p (step))
13425 return step;
13426 return NULL_RTX;
13429 /* Check if immediate shift constants are within range. */
13430 bool
13431 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13433 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13434 if (left)
13435 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13436 else
13437 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13440 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13441 operation of width WIDTH at bit position POS. */
rtx
13444 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13446 gcc_assert (CONST_INT_P (width));
13447 gcc_assert (CONST_INT_P (pos));
13449 unsigned HOST_WIDE_INT mask
13450 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13451 return GEN_INT (mask << UINTVAL (pos));
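/* For instance, WIDTH == 8 and POS == 16 give the mask 0x00ff0000,
   selecting the third byte of the source register.  */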
13454 bool
13455 aarch64_mov_operand_p (rtx x, machine_mode mode)
13457 if (GET_CODE (x) == HIGH
13458 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13459 return true;
13461 if (CONST_INT_P (x))
13462 return true;
13464 if (VECTOR_MODE_P (GET_MODE (x)))
13465 return aarch64_simd_valid_immediate (x, NULL);
13467 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13468 return true;
13470 if (aarch64_sve_cnt_immediate_p (x))
13471 return true;
13473 return aarch64_classify_symbolic_expression (x)
13474 == SYMBOL_TINY_ABSOLUTE;
13477 /* Return a const_int vector of VAL. */
rtx
13479 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13481 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13482 return gen_const_vec_duplicate (mode, c);
13485 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13487 bool
13488 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13490 machine_mode vmode;
13492 vmode = aarch64_simd_container_mode (mode, 64);
13493 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13494 return aarch64_simd_valid_immediate (op_v, NULL);
13497 /* Construct and return a PARALLEL RTX vector with elements numbering the
13498 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13499 the vector - from the perspective of the architecture. This does not
13500 line up with GCC's perspective on lane numbers, so we end up with
13501 different masks depending on our target endian-ness. The diagram
13502 below may help. We must draw the distinction when building masks
13503 which select one half of the vector. An instruction selecting
13504 architectural low-lanes for a big-endian target, must be described using
13505 a mask selecting GCC high-lanes.
13507 Big-Endian Little-Endian
13509 GCC 0 1 2 3 3 2 1 0
13510 | x | x | x | x | | x | x | x | x |
13511 Architecture 3 2 1 0 3 2 1 0
13513 Low Mask: { 2, 3 } { 0, 1 }
13514 High Mask: { 0, 1 } { 2, 3 }
13516 MODE Is the mode of the vector and NUNITS is the number of units in it. */
rtx
13519 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13521 rtvec v = rtvec_alloc (nunits / 2);
13522 int high_base = nunits / 2;
13523 int low_base = 0;
13524 int base;
13525 rtx t1;
13526 int i;
13528 if (BYTES_BIG_ENDIAN)
13529 base = high ? low_base : high_base;
13530 else
13531 base = high ? high_base : low_base;
13533 for (i = 0; i < nunits / 2; i++)
13534 RTVEC_ELT (v, i) = GEN_INT (base + i);
13536 t1 = gen_rtx_PARALLEL (mode, v);
13537 return t1;
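/* As a concrete instance of the diagram above: for V4SImode (NUNITS == 4)
   this returns (parallel [0 1]) for the low half and (parallel [2 3]) for
   the high half on little-endian targets, with the two swapped on
   big-endian targets so that the mask still selects the architectural
   half that was asked for.  */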
13540 /* Check OP for validity as a PARALLEL RTX vector with elements
13541 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13542 from the perspective of the architecture. See the diagram above
13543 aarch64_simd_vect_par_cnst_half for more details. */
13545 bool
13546 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13547 bool high)
13549 int nelts;
13550 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13551 return false;
13553 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13554 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13555 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13556 int i = 0;
13558 if (count_op != count_ideal)
13559 return false;
13561 for (i = 0; i < count_ideal; i++)
13563 rtx elt_op = XVECEXP (op, 0, i);
13564 rtx elt_ideal = XVECEXP (ideal, 0, i);
13566 if (!CONST_INT_P (elt_op)
13567 || INTVAL (elt_ideal) != INTVAL (elt_op))
13568 return false;
13570 return true;
13573 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13574 HIGH (exclusive). */
13575 void
13576 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13577 const_tree exp)
13579 HOST_WIDE_INT lane;
13580 gcc_assert (CONST_INT_P (operand));
13581 lane = INTVAL (operand);
13583 if (lane < low || lane >= high)
13585 if (exp)
13586 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13587 else
13588 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
13592 /* Perform endian correction on lane number N, which indexes a vector
13593 of mode MODE, and return the result as an SImode rtx. */
13596 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13598 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
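/* For example, lane 1 of a V4SImode vector stays lane 1 on little-endian
   but becomes lane 2 on big-endian, assuming ENDIAN_LANE_N maps N to
   NUNITS - 1 - N there; this matches the lane-numbering diagram above
   aarch64_simd_vect_par_cnst_half.  */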
13601 /* Return TRUE if OP is a valid vector addressing mode. */
13603 bool
13604 aarch64_simd_mem_operand_p (rtx op)
13606 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13607 || REG_P (XEXP (op, 0)));
13610 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13612 bool
13613 aarch64_sve_ld1r_operand_p (rtx op)
13615 struct aarch64_address_info addr;
13616 scalar_mode mode;
13618 return (MEM_P (op)
13619 && is_a <scalar_mode> (GET_MODE (op), &mode)
13620 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13621 && addr.type == ADDRESS_REG_IMM
13622 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13625 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13626 The conditions for STR are the same. */
13627 bool
13628 aarch64_sve_ldr_operand_p (rtx op)
13630 struct aarch64_address_info addr;
13632 return (MEM_P (op)
13633 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13634 false, ADDR_QUERY_ANY)
13635 && addr.type == ADDRESS_REG_IMM);
13638 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13639 We need to be able to access the individual pieces, so the range
13640 is different from LD[234] and ST[234]. */
13641 bool
13642 aarch64_sve_struct_memory_operand_p (rtx op)
13644 if (!MEM_P (op))
13645 return false;
13647 machine_mode mode = GET_MODE (op);
13648 struct aarch64_address_info addr;
13649 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13650 ADDR_QUERY_ANY)
13651 || addr.type != ADDRESS_REG_IMM)
13652 return false;
13654 poly_int64 first = addr.const_offset;
13655 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13656 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13657 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
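/* A worked example, assuming offset_4bit_signed_scaled_p accepts
   multiples of the vector length in the signed range [-8, 7] vectors:
   a three-vector tuple is only addressable this way when its first
   vector sits at an offset of -8..+5 vector lengths from the base, so
   that its last vector still falls within -8..+7.  */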
13660 /* Emit a register copy from operand to operand, taking care not to
13661 early-clobber source registers in the process.
13663 COUNT is the number of components into which the copy needs to be
13664 decomposed. */
13665 void
13666 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13667 unsigned int count)
13669 unsigned int i;
13670 int rdest = REGNO (operands[0]);
13671 int rsrc = REGNO (operands[1]);
13673 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13674 || rdest < rsrc)
13675 for (i = 0; i < count; i++)
13676 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13677 gen_rtx_REG (mode, rsrc + i));
13678 else
13679 for (i = 0; i < count; i++)
13680 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13681 gen_rtx_REG (mode, rsrc + count - i - 1));
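/* A worked example of the ordering above: copying a two-register group
   from {v1, v2} to {v2, v3} must be emitted as v3 = v2 followed by
   v2 = v1; emitting the copies in the forward order would overwrite v2
   before it had been read.  Non-overlapping copies, or copies that move
   the group downwards, can safely use the forward loop.  */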
13684 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13685    one of the VSTRUCT modes: OI, CI, or XI. */
13687 aarch64_simd_attr_length_rglist (machine_mode mode)
13689 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13690 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13693 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13694 alignment of a vector to 128 bits. SVE predicates have an alignment of
13695 16 bits. */
13696 static HOST_WIDE_INT
13697 aarch64_simd_vector_alignment (const_tree type)
13699 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13700 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13701 be set for non-predicate vectors of booleans. Modes are the most
13702 direct way we have of identifying real SVE predicate types. */
13703 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13704 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13705 return MIN (align, 128);
13708 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13709 static HOST_WIDE_INT
13710 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13712 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13714 /* If the length of the vector is fixed, try to align to that length,
13715 otherwise don't try to align at all. */
13716 HOST_WIDE_INT result;
13717 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13718 result = TYPE_ALIGN (TREE_TYPE (type));
13719 return result;
13721 return TYPE_ALIGN (type);
13724 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13725 static bool
13726 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13728 if (is_packed)
13729 return false;
13731 /* For fixed-length vectors, check that the vectorizer will aim for
13732 full-vector alignment. This isn't true for generic GCC vectors
13733 that are wider than the ABI maximum of 128 bits. */
13734 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13735 && (wi::to_widest (TYPE_SIZE (type))
13736 != aarch64_vectorize_preferred_vector_alignment (type)))
13737 return false;
13739 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13740 return true;
13743 /* Return true if the vector misalignment factor is supported by the
13744 target. */
13745 static bool
13746 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13747 const_tree type, int misalignment,
13748 bool is_packed)
13750 if (TARGET_SIMD && STRICT_ALIGNMENT)
13752 /* Return false if the movmisalign pattern is not supported for this mode. */
13753 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13754 return false;
13756 /* Misalignment factor is unknown at compile time. */
13757 if (misalignment == -1)
13758 return false;
13760 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13761 is_packed);
13764 /* If VALS is a vector constant that can be loaded into a register
13765 using DUP, generate instructions to do so and return an RTX to
13766 assign to the register. Otherwise return NULL_RTX. */
13767 static rtx
13768 aarch64_simd_dup_constant (rtx vals)
13770 machine_mode mode = GET_MODE (vals);
13771 machine_mode inner_mode = GET_MODE_INNER (mode);
13772 rtx x;
13774 if (!const_vec_duplicate_p (vals, &x))
13775 return NULL_RTX;
13777 /* We can load this constant by using DUP and a constant in a
13778 single ARM register. This will be cheaper than a vector
13779 load. */
13780 x = copy_to_mode_reg (inner_mode, x);
13781 return gen_vec_duplicate (mode, x);
13785 /* Generate code to load VALS, which is a PARALLEL containing only
13786 constants (for vec_init) or CONST_VECTOR, efficiently into a
13787 register. Returns an RTX to copy into the register, or NULL_RTX
13788    for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13789 static rtx
13790 aarch64_simd_make_constant (rtx vals)
13792 machine_mode mode = GET_MODE (vals);
13793 rtx const_dup;
13794 rtx const_vec = NULL_RTX;
13795 int n_const = 0;
13796 int i;
13798 if (GET_CODE (vals) == CONST_VECTOR)
13799 const_vec = vals;
13800 else if (GET_CODE (vals) == PARALLEL)
13802 /* A CONST_VECTOR must contain only CONST_INTs and
13803 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13804 Only store valid constants in a CONST_VECTOR. */
13805 int n_elts = XVECLEN (vals, 0);
13806 for (i = 0; i < n_elts; ++i)
13808 rtx x = XVECEXP (vals, 0, i);
13809 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13810 n_const++;
13812 if (n_const == n_elts)
13813 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13815 else
13816 gcc_unreachable ();
13818 if (const_vec != NULL_RTX
13819 && aarch64_simd_valid_immediate (const_vec, NULL))
13820 /* Load using MOVI/MVNI. */
13821 return const_vec;
13822 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13823 /* Loaded using DUP. */
13824 return const_dup;
13825 else if (const_vec != NULL_RTX)
13826 /* Load from constant pool. We cannot take advantage of single-cycle
13827 LD1 because we need a PC-relative addressing mode. */
13828 return const_vec;
13829 else
13830 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13831    We cannot construct an initializer. */
13832 return NULL_RTX;
13835 /* Expand a vector initialisation sequence, such that TARGET is
13836 initialised to contain VALS. */
13838 void
13839 aarch64_expand_vector_init (rtx target, rtx vals)
13841 machine_mode mode = GET_MODE (target);
13842 scalar_mode inner_mode = GET_MODE_INNER (mode);
13843 /* The number of vector elements. */
13844 int n_elts = XVECLEN (vals, 0);
13845 /* The number of vector elements which are not constant. */
13846 int n_var = 0;
13847 rtx any_const = NULL_RTX;
13848 /* The first element of vals. */
13849 rtx v0 = XVECEXP (vals, 0, 0);
13850 bool all_same = true;
13852 /* Count the number of variable elements to initialise. */
13853 for (int i = 0; i < n_elts; ++i)
13855 rtx x = XVECEXP (vals, 0, i);
13856 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13857 ++n_var;
13858 else
13859 any_const = x;
13861 all_same &= rtx_equal_p (x, v0);
13864 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13865 how best to handle this. */
13866 if (n_var == 0)
13868 rtx constant = aarch64_simd_make_constant (vals);
13869 if (constant != NULL_RTX)
13871 emit_move_insn (target, constant);
13872 return;
13876 /* Splat a single non-constant element if we can. */
13877 if (all_same)
13879 rtx x = copy_to_mode_reg (inner_mode, v0);
13880 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13881 return;
13884 enum insn_code icode = optab_handler (vec_set_optab, mode);
13885 gcc_assert (icode != CODE_FOR_nothing);
13887 /* If there are only variable elements, try to optimize
13888 the insertion using dup for the most common element
13889 followed by insertions. */
13891 /* The algorithm will fill matches[*][0] with the earliest matching element,
13892 and matches[X][1] with the count of duplicate elements (if X is the
13893 earliest element which has duplicates). */
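/* For example, for VALS == {a, b, a, a} the loops below produce
   matches[0] == {0, 3}, matches[1] == {1, 1} and matches[2] ==
   matches[3] == {0, 0}: element 0 is the earliest element with the most
   duplicates (maxv == 3), so the expansion becomes a DUP of a followed
   by a single lane insert of b.  */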
13895 if (n_var == n_elts && n_elts <= 16)
13897 int matches[16][2] = {0};
13898 for (int i = 0; i < n_elts; i++)
13900 for (int j = 0; j <= i; j++)
13902 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13904 matches[i][0] = j;
13905 matches[j][1]++;
13906 break;
13910 int maxelement = 0;
13911 int maxv = 0;
13912 for (int i = 0; i < n_elts; i++)
13913 if (matches[i][1] > maxv)
13915 maxelement = i;
13916 maxv = matches[i][1];
13919 /* Create a duplicate of the most common element, unless all elements
13920 are equally useless to us, in which case just immediately set the
13921 vector register using the first element. */
13923 if (maxv == 1)
13925 /* For vectors of two 64-bit elements, we can do even better. */
13926 if (n_elts == 2
13927 && (inner_mode == E_DImode
13928 || inner_mode == E_DFmode))
13931 rtx x0 = XVECEXP (vals, 0, 0);
13932 rtx x1 = XVECEXP (vals, 0, 1);
13933 /* Combine can pick up this case, but handling it directly
13934 here leaves clearer RTL.
13936 This is load_pair_lanes<mode>, and also gives us a clean-up
13937 for store_pair_lanes<mode>. */
13938 if (memory_operand (x0, inner_mode)
13939 && memory_operand (x1, inner_mode)
13940 && !STRICT_ALIGNMENT
13941 && rtx_equal_p (XEXP (x1, 0),
13942 plus_constant (Pmode,
13943 XEXP (x0, 0),
13944 GET_MODE_SIZE (inner_mode))))
13946 rtx t;
13947 if (inner_mode == DFmode)
13948 t = gen_load_pair_lanesdf (target, x0, x1);
13949 else
13950 t = gen_load_pair_lanesdi (target, x0, x1);
13951 emit_insn (t);
13952 return;
13955 /* The subreg-move sequence below will move into lane zero of the
13956 vector register. For big-endian we want that position to hold
13957 the last element of VALS. */
13958 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13959 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13960 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13962 else
13964 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13965 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13968 /* Insert the rest. */
13969 for (int i = 0; i < n_elts; i++)
13971 rtx x = XVECEXP (vals, 0, i);
13972 if (matches[i][0] == maxelement)
13973 continue;
13974 x = copy_to_mode_reg (inner_mode, x);
13975 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13977 return;
13980 /* Initialise a vector which is part-variable. We want to first try
13981 to build those lanes which are constant in the most efficient way we
13982 can. */
13983 if (n_var != n_elts)
13985 rtx copy = copy_rtx (vals);
13987 /* Load constant part of vector. We really don't care what goes into the
13988 parts we will overwrite, but we're more likely to be able to load the
13989 constant efficiently if it has fewer, larger, repeating parts
13990 (see aarch64_simd_valid_immediate). */
13991 for (int i = 0; i < n_elts; i++)
13993 rtx x = XVECEXP (vals, 0, i);
13994 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13995 continue;
13996 rtx subst = any_const;
13997 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13999 /* Look in the copied vector, as more elements are const. */
14000 rtx test = XVECEXP (copy, 0, i ^ bit);
14001 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
14003 subst = test;
14004 break;
14007 XVECEXP (copy, 0, i) = subst;
14009 aarch64_expand_vector_init (target, copy);
14012 /* Insert the variable lanes directly. */
14013 for (int i = 0; i < n_elts; i++)
14015 rtx x = XVECEXP (vals, 0, i);
14016 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
14017 continue;
14018 x = copy_to_mode_reg (inner_mode, x);
14019 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
14023 static unsigned HOST_WIDE_INT
14024 aarch64_shift_truncation_mask (machine_mode mode)
14026 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14027 return 0;
14028 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14031 /* Select a format to encode pointers in exception handling data. */
14033 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14035 int type;
14036 switch (aarch64_cmodel)
14038 case AARCH64_CMODEL_TINY:
14039 case AARCH64_CMODEL_TINY_PIC:
14040 case AARCH64_CMODEL_SMALL:
14041 case AARCH64_CMODEL_SMALL_PIC:
14042 case AARCH64_CMODEL_SMALL_SPIC:
14043 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14044 for everything. */
14045 type = DW_EH_PE_sdata4;
14046 break;
14047 default:
14048 /* No assumptions here. 8-byte relocs required. */
14049 type = DW_EH_PE_sdata8;
14050 break;
14052 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
14055 /* The last .arch and .tune assembly strings that we printed. */
14056 static std::string aarch64_last_printed_arch_string;
14057 static std::string aarch64_last_printed_tune_string;
14059 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14060 by the function fndecl. */
14062 void
14063 aarch64_declare_function_name (FILE *stream, const char* name,
14064 tree fndecl)
14066 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14068 struct cl_target_option *targ_options;
14069 if (target_parts)
14070 targ_options = TREE_TARGET_OPTION (target_parts);
14071 else
14072 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14073 gcc_assert (targ_options);
14075 const struct processor *this_arch
14076 = aarch64_get_arch (targ_options->x_explicit_arch);
14078 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14079 std::string extension
14080 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14081 this_arch->flags);
14082 /* Only update the assembler .arch string if it is distinct from the last
14083 such string we printed. */
14084 std::string to_print = this_arch->name + extension;
14085 if (to_print != aarch64_last_printed_arch_string)
14087 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14088 aarch64_last_printed_arch_string = to_print;
14091 /* Print the cpu name we're tuning for in the comments; this might be
14092    useful to readers of the generated asm. Do it only when it changes
14093 from function to function and verbose assembly is requested. */
14094 const struct processor *this_tune
14095 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14097 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14099 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14100 this_tune->name);
14101 aarch64_last_printed_tune_string = this_tune->name;
14104 /* Don't forget the type directive for ELF. */
14105 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14106 ASM_OUTPUT_LABEL (stream, name);
14109 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14111 static void
14112 aarch64_start_file (void)
14114 struct cl_target_option *default_options
14115 = TREE_TARGET_OPTION (target_option_default_node);
14117 const struct processor *default_arch
14118 = aarch64_get_arch (default_options->x_explicit_arch);
14119 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14120 std::string extension
14121 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14122 default_arch->flags);
14124 aarch64_last_printed_arch_string = default_arch->name + extension;
14125 aarch64_last_printed_tune_string = "";
14126 asm_fprintf (asm_out_file, "\t.arch %s\n",
14127 aarch64_last_printed_arch_string.c_str ());
14129 default_file_start ();
14132 /* Emit load exclusive. */
14134 static void
14135 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14136 rtx mem, rtx model_rtx)
14138 rtx (*gen) (rtx, rtx, rtx);
14140 switch (mode)
14142 case E_QImode: gen = gen_aarch64_load_exclusiveqi; break;
14143 case E_HImode: gen = gen_aarch64_load_exclusivehi; break;
14144 case E_SImode: gen = gen_aarch64_load_exclusivesi; break;
14145 case E_DImode: gen = gen_aarch64_load_exclusivedi; break;
14146 default:
14147 gcc_unreachable ();
14150 emit_insn (gen (rval, mem, model_rtx));
14153 /* Emit store exclusive. */
14155 static void
14156 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14157 rtx rval, rtx mem, rtx model_rtx)
14159 rtx (*gen) (rtx, rtx, rtx, rtx);
14161 switch (mode)
14163 case E_QImode: gen = gen_aarch64_store_exclusiveqi; break;
14164 case E_HImode: gen = gen_aarch64_store_exclusivehi; break;
14165 case E_SImode: gen = gen_aarch64_store_exclusivesi; break;
14166 case E_DImode: gen = gen_aarch64_store_exclusivedi; break;
14167 default:
14168 gcc_unreachable ();
14171 emit_insn (gen (bval, rval, mem, model_rtx));
14174 /* Mark the previous jump instruction as unlikely. */
14176 static void
14177 aarch64_emit_unlikely_jump (rtx insn)
14179 rtx_insn *jump = emit_jump_insn (insn);
14180 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14183 /* Expand a compare and swap pattern. */
14185 void
14186 aarch64_expand_compare_and_swap (rtx operands[])
14188 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14189 machine_mode mode, cmp_mode;
14190 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
14191 int idx;
14192 gen_cas_fn gen;
14193 const gen_cas_fn split_cas[] =
14195 gen_aarch64_compare_and_swapqi,
14196 gen_aarch64_compare_and_swaphi,
14197 gen_aarch64_compare_and_swapsi,
14198 gen_aarch64_compare_and_swapdi
14200 const gen_cas_fn atomic_cas[] =
14202 gen_aarch64_compare_and_swapqi_lse,
14203 gen_aarch64_compare_and_swaphi_lse,
14204 gen_aarch64_compare_and_swapsi_lse,
14205 gen_aarch64_compare_and_swapdi_lse
14208 bval = operands[0];
14209 rval = operands[1];
14210 mem = operands[2];
14211 oldval = operands[3];
14212 newval = operands[4];
14213 is_weak = operands[5];
14214 mod_s = operands[6];
14215 mod_f = operands[7];
14216 mode = GET_MODE (mem);
14217 cmp_mode = mode;
14219 /* Normally the succ memory model must be stronger than fail, but in the
14220 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14221 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14223 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14224 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14225 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14227 switch (mode)
14229 case E_QImode:
14230 case E_HImode:
14231 /* For short modes, we're going to perform the comparison in SImode,
14232 so do the zero-extension now. */
14233 cmp_mode = SImode;
14234 rval = gen_reg_rtx (SImode);
14235 oldval = convert_modes (SImode, mode, oldval, true);
14236 /* Fall through. */
14238 case E_SImode:
14239 case E_DImode:
14240 /* Force the value into a register if needed. */
14241 if (!aarch64_plus_operand (oldval, mode))
14242 oldval = force_reg (cmp_mode, oldval);
14243 break;
14245 default:
14246 gcc_unreachable ();
14249 switch (mode)
14251 case E_QImode: idx = 0; break;
14252 case E_HImode: idx = 1; break;
14253 case E_SImode: idx = 2; break;
14254 case E_DImode: idx = 3; break;
14255 default:
14256 gcc_unreachable ();
14258 if (TARGET_LSE)
14259 gen = atomic_cas[idx];
14260 else
14261 gen = split_cas[idx];
14263 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
14265 if (mode == QImode || mode == HImode)
14266 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14268 x = gen_rtx_REG (CCmode, CC_REGNUM);
14269 x = gen_rtx_EQ (SImode, x, const0_rtx);
14270 emit_insn (gen_rtx_SET (bval, x));
14273 /* Test whether the target supports using an atomic load-operate instruction
14274    for operation CODE. Returns FALSE if the operation isn't supported by the
14275    architecture. */
14279 bool
14280 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14282 if (!TARGET_LSE)
14283 return false;
14285 switch (code)
14287 case SET:
14288 case AND:
14289 case IOR:
14290 case XOR:
14291 case MINUS:
14292 case PLUS:
14293 return true;
14294 default:
14295 return false;
14299 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
14300 sequence implementing an atomic operation. */
14302 static void
14303 aarch64_emit_post_barrier (enum memmodel model)
14305 const enum memmodel base_model = memmodel_base (model);
14307 if (is_mm_sync (model)
14308 && (base_model == MEMMODEL_ACQUIRE
14309 || base_model == MEMMODEL_ACQ_REL
14310 || base_model == MEMMODEL_SEQ_CST))
14312 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14316 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14317 for the data in memory. EXPECTED is the value expected to be in memory.
14318 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14319 is the memory ordering to use. */
14321 void
14322 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14323 rtx expected, rtx desired,
14324 rtx model)
14326 rtx (*gen) (rtx, rtx, rtx, rtx);
14327 machine_mode mode;
14329 mode = GET_MODE (mem);
14331 switch (mode)
14333 case E_QImode: gen = gen_aarch64_atomic_casqi; break;
14334 case E_HImode: gen = gen_aarch64_atomic_cashi; break;
14335 case E_SImode: gen = gen_aarch64_atomic_cassi; break;
14336 case E_DImode: gen = gen_aarch64_atomic_casdi; break;
14337 default:
14338 gcc_unreachable ();
14341 /* Move the expected value into the CAS destination register. */
14342 emit_insn (gen_rtx_SET (rval, expected));
14344 /* Emit the CAS. */
14345 emit_insn (gen (rval, mem, desired, model));
14347 /* Compare the expected value with the value loaded by the CAS, to establish
14348 whether the swap was made. */
14349 aarch64_gen_compare_reg (EQ, rval, expected);
14352 /* Split a compare and swap pattern. */
14354 void
14355 aarch64_split_compare_and_swap (rtx operands[])
14357 rtx rval, mem, oldval, newval, scratch;
14358 machine_mode mode;
14359 bool is_weak;
14360 rtx_code_label *label1, *label2;
14361 rtx x, cond;
14362 enum memmodel model;
14363 rtx model_rtx;
14365 rval = operands[0];
14366 mem = operands[1];
14367 oldval = operands[2];
14368 newval = operands[3];
14369 is_weak = (operands[4] != const0_rtx);
14370 model_rtx = operands[5];
14371 scratch = operands[7];
14372 mode = GET_MODE (mem);
14373 model = memmodel_from_int (INTVAL (model_rtx));
14375 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14376 loop:
14377 .label1:
14378 LD[A]XR rval, [mem]
14379 CBNZ rval, .label2
14380 ST[L]XR scratch, newval, [mem]
14381 CBNZ scratch, .label1
14382 .label2:
14383 CMP rval, 0. */
14384 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14386 label1 = NULL;
14387 if (!is_weak)
14389 label1 = gen_label_rtx ();
14390 emit_label (label1);
14392 label2 = gen_label_rtx ();
14394 /* The initial load can be relaxed for a __sync operation since a final
14395 barrier will be emitted to stop code hoisting. */
14396 if (is_mm_sync (model))
14397 aarch64_emit_load_exclusive (mode, rval, mem,
14398 GEN_INT (MEMMODEL_RELAXED));
14399 else
14400 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14402 if (strong_zero_p)
14404 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14405 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14406 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14407 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14409 else
14411 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14412 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14413 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14414 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14415 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14418 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14420 if (!is_weak)
14422 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14423 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14424 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14425 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14427 else
14429 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14430 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14431 emit_insn (gen_rtx_SET (cond, x));
14434 emit_label (label2);
14435 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14436 to set the condition flags. If this is not used it will be removed by
14437 later passes. */
14438 if (strong_zero_p)
14440 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14441 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14442 emit_insn (gen_rtx_SET (cond, x));
14444 /* Emit any final barrier needed for a __sync operation. */
14445 if (is_mm_sync (model))
14446 aarch64_emit_post_barrier (model);
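/* Schematically, the general (non-zero OLDVAL) strong form of the loop
   emitted above is:
     .label1:
       LD[A]XR  rval, [mem]
       CMP      rval, oldval
       B.NE     .label2
       ST[L]XR  scratch, newval, [mem]
       CBNZ     scratch, .label1
     .label2:
   while the weak form drops label1 and the retry branch and instead
   copies the store-exclusive result into the condition flags.  */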
14449 /* Emit a BIC instruction. */
14451 static void
14452 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14454 rtx shift_rtx = GEN_INT (shift);
14455 rtx (*gen) (rtx, rtx, rtx, rtx);
14457 switch (mode)
14459 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14460 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14461 default:
14462 gcc_unreachable ();
14465 emit_insn (gen (dst, s2, shift_rtx, s1));
14468 /* Emit an atomic swap. */
14470 static void
14471 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14472 rtx mem, rtx model)
14474 rtx (*gen) (rtx, rtx, rtx, rtx);
14476 switch (mode)
14478 case E_QImode: gen = gen_aarch64_atomic_swpqi; break;
14479 case E_HImode: gen = gen_aarch64_atomic_swphi; break;
14480 case E_SImode: gen = gen_aarch64_atomic_swpsi; break;
14481 case E_DImode: gen = gen_aarch64_atomic_swpdi; break;
14482 default:
14483 gcc_unreachable ();
14486 emit_insn (gen (dst, mem, value, model));
14489 /* Operations supported by aarch64_emit_atomic_load_op. */
14491 enum aarch64_atomic_load_op_code
14493 AARCH64_LDOP_PLUS, /* A + B */
14494 AARCH64_LDOP_XOR, /* A ^ B */
14495 AARCH64_LDOP_OR, /* A | B */
14496 AARCH64_LDOP_BIC /* A & ~B */
14499 /* Emit an atomic load-operate. */
14501 static void
14502 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
14503 machine_mode mode, rtx dst, rtx src,
14504 rtx mem, rtx model)
14506 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
14507 const aarch64_atomic_load_op_fn plus[] =
14509 gen_aarch64_atomic_loadaddqi,
14510 gen_aarch64_atomic_loadaddhi,
14511 gen_aarch64_atomic_loadaddsi,
14512 gen_aarch64_atomic_loadadddi
14514 const aarch64_atomic_load_op_fn eor[] =
14516 gen_aarch64_atomic_loadeorqi,
14517 gen_aarch64_atomic_loadeorhi,
14518 gen_aarch64_atomic_loadeorsi,
14519 gen_aarch64_atomic_loadeordi
14521 const aarch64_atomic_load_op_fn ior[] =
14523 gen_aarch64_atomic_loadsetqi,
14524 gen_aarch64_atomic_loadsethi,
14525 gen_aarch64_atomic_loadsetsi,
14526 gen_aarch64_atomic_loadsetdi
14528 const aarch64_atomic_load_op_fn bic[] =
14530 gen_aarch64_atomic_loadclrqi,
14531 gen_aarch64_atomic_loadclrhi,
14532 gen_aarch64_atomic_loadclrsi,
14533 gen_aarch64_atomic_loadclrdi
14535 aarch64_atomic_load_op_fn gen;
14536 int idx = 0;
14538 switch (mode)
14540 case E_QImode: idx = 0; break;
14541 case E_HImode: idx = 1; break;
14542 case E_SImode: idx = 2; break;
14543 case E_DImode: idx = 3; break;
14544 default:
14545 gcc_unreachable ();
14548 switch (code)
14550 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
14551 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
14552 case AARCH64_LDOP_OR: gen = ior[idx]; break;
14553 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
14554 default:
14555 gcc_unreachable ();
14558 emit_insn (gen (dst, mem, src, model));
14561 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14562 location to store the data read from memory. OUT_RESULT is the location to
14563 store the result of the operation. MEM is the memory location to read and
14564 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14565 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14566 be NULL. */
14568 void
14569 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14570 rtx mem, rtx value, rtx model_rtx)
14572 machine_mode mode = GET_MODE (mem);
14573 machine_mode wmode = (mode == DImode ? DImode : SImode);
14574 const bool short_mode = (mode < SImode);
14575 aarch64_atomic_load_op_code ldop_code;
14576 rtx src;
14577 rtx x;
14579 if (out_data)
14580 out_data = gen_lowpart (mode, out_data);
14582 if (out_result)
14583 out_result = gen_lowpart (mode, out_result);
14585 /* Make sure the value is in a register, putting it into a destination
14586 register if it needs to be manipulated. */
14587 if (!register_operand (value, mode)
14588 || code == AND || code == MINUS)
14590 src = out_result ? out_result : out_data;
14591 emit_move_insn (src, gen_lowpart (mode, value));
14593 else
14594 src = value;
14595 gcc_assert (register_operand (src, mode));
14597 /* Preprocess the data for the operation as necessary. If the operation is
14598 a SET then emit a swap instruction and finish. */
14599 switch (code)
14601 case SET:
14602 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14603 return;
14605 case MINUS:
14606 /* Negate the value and treat it as a PLUS. */
14608 rtx neg_src;
14610 /* Resize the value if necessary. */
14611 if (short_mode)
14612 src = gen_lowpart (wmode, src);
14614 neg_src = gen_rtx_NEG (wmode, src);
14615 emit_insn (gen_rtx_SET (src, neg_src));
14617 if (short_mode)
14618 src = gen_lowpart (mode, src);
14620 /* Fall-through. */
14621 case PLUS:
14622 ldop_code = AARCH64_LDOP_PLUS;
14623 break;
14625 case IOR:
14626 ldop_code = AARCH64_LDOP_OR;
14627 break;
14629 case XOR:
14630 ldop_code = AARCH64_LDOP_XOR;
14631 break;
14633 case AND:
14635 rtx not_src;
14637 /* Resize the value if necessary. */
14638 if (short_mode)
14639 src = gen_lowpart (wmode, src);
14641 not_src = gen_rtx_NOT (wmode, src);
14642 emit_insn (gen_rtx_SET (src, not_src));
14644 if (short_mode)
14645 src = gen_lowpart (mode, src);
14647 ldop_code = AARCH64_LDOP_BIC;
14648 break;
14650 default:
14651 /* The operation can't be done with atomic instructions. */
14652 gcc_unreachable ();
14655 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
14657 /* If necessary, calculate the data in memory after the update by redoing the
14658 operation from values in registers. */
14659 if (!out_result)
14660 return;
14662 if (short_mode)
14664 src = gen_lowpart (wmode, src);
14665 out_data = gen_lowpart (wmode, out_data);
14666 out_result = gen_lowpart (wmode, out_result);
14669 x = NULL_RTX;
14671 switch (code)
14673 case MINUS:
14674 case PLUS:
14675 x = gen_rtx_PLUS (wmode, out_data, src);
14676 break;
14677 case IOR:
14678 x = gen_rtx_IOR (wmode, out_data, src);
14679 break;
14680 case XOR:
14681 x = gen_rtx_XOR (wmode, out_data, src);
14682 break;
14683 case AND:
14684 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14685 return;
14686 default:
14687 gcc_unreachable ();
14690 emit_set_insn (out_result, x);
14692 return;
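/* As a concrete illustration of the mapping above, assuming the
   load-operate patterns expand to the LSE LDADD/LDCLR/LDEOR/LDSET
   instructions: an atomic fetch-and-sub of 5 becomes an LDADD of -5,
   and an atomic fetch-and-and with mask M becomes an LDCLR of ~M.
   When the caller also wants the post-update value it is recomputed
   from registers afterwards: an add for PLUS/MINUS, or a BIC with the
   already-complemented mask for AND.  */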
14695 /* Split an atomic operation. */
14697 void
14698 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14699 rtx value, rtx model_rtx, rtx cond)
14701 machine_mode mode = GET_MODE (mem);
14702 machine_mode wmode = (mode == DImode ? DImode : SImode);
14703 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14704 const bool is_sync = is_mm_sync (model);
14705 rtx_code_label *label;
14706 rtx x;
14708 /* Split the atomic operation into a sequence. */
14709 label = gen_label_rtx ();
14710 emit_label (label);
14712 if (new_out)
14713 new_out = gen_lowpart (wmode, new_out);
14714 if (old_out)
14715 old_out = gen_lowpart (wmode, old_out);
14716 else
14717 old_out = new_out;
14718 value = simplify_gen_subreg (wmode, value, mode, 0);
14720 /* The initial load can be relaxed for a __sync operation since a final
14721 barrier will be emitted to stop code hoisting. */
14722 if (is_sync)
14723 aarch64_emit_load_exclusive (mode, old_out, mem,
14724 GEN_INT (MEMMODEL_RELAXED));
14725 else
14726 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14728 switch (code)
14730 case SET:
14731 new_out = value;
14732 break;
14734 case NOT:
14735 x = gen_rtx_AND (wmode, old_out, value);
14736 emit_insn (gen_rtx_SET (new_out, x));
14737 x = gen_rtx_NOT (wmode, new_out);
14738 emit_insn (gen_rtx_SET (new_out, x));
14739 break;
14741 case MINUS:
14742 if (CONST_INT_P (value))
14744 value = GEN_INT (-INTVAL (value));
14745 code = PLUS;
14747 /* Fall through. */
14749 default:
14750 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14751 emit_insn (gen_rtx_SET (new_out, x));
14752 break;
14755 aarch64_emit_store_exclusive (mode, cond, mem,
14756 gen_lowpart (mode, new_out), model_rtx);
14758 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14759 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14760 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14761 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14763 /* Emit any final barrier needed for a __sync operation. */
14764 if (is_sync)
14765 aarch64_emit_post_barrier (model);
14768 static void
14769 aarch64_init_libfuncs (void)
14771 /* Half-precision float operations. The compiler handles all operations
14772 with NULL libfuncs by converting to SFmode. */
14774 /* Conversions. */
14775 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14776 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14778 /* Arithmetic. */
14779 set_optab_libfunc (add_optab, HFmode, NULL);
14780 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14781 set_optab_libfunc (smul_optab, HFmode, NULL);
14782 set_optab_libfunc (neg_optab, HFmode, NULL);
14783 set_optab_libfunc (sub_optab, HFmode, NULL);
14785 /* Comparisons. */
14786 set_optab_libfunc (eq_optab, HFmode, NULL);
14787 set_optab_libfunc (ne_optab, HFmode, NULL);
14788 set_optab_libfunc (lt_optab, HFmode, NULL);
14789 set_optab_libfunc (le_optab, HFmode, NULL);
14790 set_optab_libfunc (ge_optab, HFmode, NULL);
14791 set_optab_libfunc (gt_optab, HFmode, NULL);
14792 set_optab_libfunc (unord_optab, HFmode, NULL);
14795 /* Target hook for c_mode_for_suffix. */
14796 static machine_mode
14797 aarch64_c_mode_for_suffix (char suffix)
14799 if (suffix == 'q')
14800 return TFmode;
14802 return VOIDmode;
14805 /* We can only represent floating point constants which will fit in
14806 "quarter-precision" values. These values are characterised by
14807    a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given a format looking like:
14810 (-1)^s * (n/16) * 2^r
14812 Where:
14813 's' is the sign bit.
14814 'n' is an integer in the range 16 <= n <= 31.
14815 'r' is an integer in the range -3 <= r <= 4. */
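/* For example, 0.5 is representable as (16/16) * 2^-1 and 1.25 as
   (20/16) * 2^0, whereas 0.1 has no such encoding.  The representable
   magnitudes therefore run from (16/16) * 2^-3 == 0.125 up to
   (31/16) * 2^4 == 31.0, i.e. the familiar FMOV-immediate range.  */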
14817 /* Return true iff X can be represented as a quarter-precision
14818    floating point immediate operand. Note, we cannot represent 0.0. */
14819 bool
14820 aarch64_float_const_representable_p (rtx x)
14822 /* This represents our current view of how many bits
14823 make up the mantissa. */
14824 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14825 int exponent;
14826 unsigned HOST_WIDE_INT mantissa, mask;
14827 REAL_VALUE_TYPE r, m;
14828 bool fail;
14830 if (!CONST_DOUBLE_P (x))
14831 return false;
14833 /* We don't support HFmode constants yet. */
14834 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
14835 return false;
14837 r = *CONST_DOUBLE_REAL_VALUE (x);
14839 /* We cannot represent infinities, NaNs or +/-zero. We won't
14840 know if we have +zero until we analyse the mantissa, but we
14841 can reject the other invalid values. */
14842 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14843 || REAL_VALUE_MINUS_ZERO (r))
14844 return false;
14846 /* Extract exponent. */
14847 r = real_value_abs (&r);
14848 exponent = REAL_EXP (&r);
14850 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14851 highest (sign) bit, with a fixed binary point at bit point_pos.
14852 m1 holds the low part of the mantissa, m2 the high part.
14853 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14854 bits for the mantissa, this can fail (low bits will be lost). */
14855 real_ldexp (&m, &r, point_pos - exponent);
14856 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14858 /* If the low part of the mantissa has bits set we cannot represent
14859 the value. */
14860 if (w.ulow () != 0)
14861 return false;
14862 /* We have rejected the lower HOST_WIDE_INT, so update our
14863 understanding of how many bits lie in the mantissa and
14864 look only at the high HOST_WIDE_INT. */
14865 mantissa = w.elt (1);
14866 point_pos -= HOST_BITS_PER_WIDE_INT;
14868 /* We can only represent values with a mantissa of the form 1.xxxx. */
14869 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14870 if ((mantissa & mask) != 0)
14871 return false;
14873 /* Having filtered unrepresentable values, we may now remove all
14874 but the highest 5 bits. */
14875 mantissa >>= point_pos - 5;
14877 /* We cannot represent the value 0.0, so reject it. This is handled
14878 elsewhere. */
14879 if (mantissa == 0)
14880 return false;
14882 /* Then, as bit 4 is always set, we can mask it off, leaving
14883 the mantissa in the range [0, 15]. */
14884 mantissa &= ~(1 << 4);
14885 gcc_assert (mantissa <= 15);
14887 /* GCC internally does not use IEEE754-like encoding (where normalized
14888 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14889 Our mantissa values are shifted 4 places to the left relative to
14890 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14891 by 5 places to correct for GCC's representation. */
14892 exponent = 5 - exponent;
14894 return (exponent >= 0 && exponent <= 7);
14897 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14898 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14899 output MOVI/MVNI, ORR or BIC immediate. */
14900 char*
14901 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14902 enum simd_immediate_check which)
14904 bool is_valid;
14905 static char templ[40];
14906 const char *mnemonic;
14907 const char *shift_op;
14908 unsigned int lane_count = 0;
14909 char element_char;
14911 struct simd_immediate_info info;
14913 /* This will return true to show const_vector is legal for use as either
14914    an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14915 It will also update INFO to show how the immediate should be generated.
14916 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14917 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14918 gcc_assert (is_valid);
14920 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14921 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14923 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14925 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14926 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14927 move immediate path. */
14928 if (aarch64_float_const_zero_rtx_p (info.value))
14929 info.value = GEN_INT (0);
14930 else
14932 const unsigned int buf_size = 20;
14933 char float_buf[buf_size] = {'\0'};
14934 real_to_decimal_for_mode (float_buf,
14935 CONST_DOUBLE_REAL_VALUE (info.value),
14936 buf_size, buf_size, 1, info.elt_mode);
14938 if (lane_count == 1)
14939 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14940 else
14941 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14942 lane_count, element_char, float_buf);
14943 return templ;
14947 gcc_assert (CONST_INT_P (info.value));
14949 if (which == AARCH64_CHECK_MOV)
14951 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14952 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14953 if (lane_count == 1)
14954 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14955 mnemonic, UINTVAL (info.value));
14956 else if (info.shift)
14957 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14958 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14959 element_char, UINTVAL (info.value), shift_op, info.shift);
14960 else
14961 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14962 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14963 element_char, UINTVAL (info.value));
14965 else
14967 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14968 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14969 if (info.shift)
14970 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14971 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14972 element_char, UINTVAL (info.value), "lsl", info.shift);
14973 else
14974 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14975 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14976 element_char, UINTVAL (info.value));
14978 return templ;
14981 char*
14982 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14985 /* If a floating point number was passed and we desire to use it in an
14986    integer mode, do the conversion to integer.
14987 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14989 unsigned HOST_WIDE_INT ival;
14990 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14991 gcc_unreachable ();
14992 immediate = gen_int_mode (ival, mode);
14995 machine_mode vmode;
14996 /* Use a 64-bit vector mode for everything except DI/DF mode, where we use
14997    a 128-bit vector mode. */
14998 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
15000 vmode = aarch64_simd_container_mode (mode, width);
15001 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
15002 return aarch64_output_simd_mov_immediate (v_op, width);
15005 /* Return the output string to use for moving immediate CONST_VECTOR
15006 into an SVE register. */
15008 char *
15009 aarch64_output_sve_mov_immediate (rtx const_vector)
15011 static char templ[40];
15012 struct simd_immediate_info info;
15013 char element_char;
15015 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
15016 gcc_assert (is_valid);
15018 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
15020 if (info.step)
15022 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
15023 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
15024 element_char, INTVAL (info.value), INTVAL (info.step));
15025 return templ;
15028 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
15030 if (aarch64_float_const_zero_rtx_p (info.value))
15031 info.value = GEN_INT (0);
15032 else
15034 const int buf_size = 20;
15035 char float_buf[buf_size] = {};
15036 real_to_decimal_for_mode (float_buf,
15037 CONST_DOUBLE_REAL_VALUE (info.value),
15038 buf_size, buf_size, 1, info.elt_mode);
15040 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
15041 element_char, float_buf);
15042 return templ;
15046 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
15047 element_char, INTVAL (info.value));
15048 return templ;
15051 /* Return the asm format for a PTRUE instruction whose destination has
15052 mode MODE. SUFFIX is the element size suffix. */
15054 char *
15055 aarch64_output_ptrue (machine_mode mode, char suffix)
15057 unsigned int nunits;
15058 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
15059 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
15060 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
15061 else
15062 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
15063 return buf;
15066 /* Split operands into moves from op[1] + op[2] into op[0]. */
15068 void
15069 aarch64_split_combinev16qi (rtx operands[3])
15071 unsigned int dest = REGNO (operands[0]);
15072 unsigned int src1 = REGNO (operands[1]);
15073 unsigned int src2 = REGNO (operands[2]);
15074 machine_mode halfmode = GET_MODE (operands[1]);
15075 unsigned int halfregs = REG_NREGS (operands[1]);
15076 rtx destlo, desthi;
15078 gcc_assert (halfmode == V16QImode);
15080 if (src1 == dest && src2 == dest + halfregs)
15082 /* No-op move. Can't split to nothing; emit something. */
15083 emit_note (NOTE_INSN_DELETED);
15084 return;
15087 /* Preserve register attributes for variable tracking. */
15088 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
15089 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
15090 GET_MODE_SIZE (halfmode));
15092 /* Special case of reversed high/low parts. */
15093 if (reg_overlap_mentioned_p (operands[2], destlo)
15094 && reg_overlap_mentioned_p (operands[1], desthi))
15096 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15097 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
15098 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
15100 else if (!reg_overlap_mentioned_p (operands[2], destlo))
15102 /* Try to avoid unnecessary moves if part of the result
15103 is in the right place already. */
15104 if (src1 != dest)
15105 emit_move_insn (destlo, operands[1]);
15106 if (src2 != dest + halfregs)
15107 emit_move_insn (desthi, operands[2]);
15109 else
15111 if (src2 != dest + halfregs)
15112 emit_move_insn (desthi, operands[2]);
15113 if (src1 != dest)
15114 emit_move_insn (destlo, operands[1]);
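/* The three XORs above are the classic scratch-free swap: after
   a ^= b; b ^= a; a ^= b; the two V16QI halves have exchanged their
   contents without needing a spare vector register.  */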
15118 /* vec_perm support. */
15120 struct expand_vec_perm_d
15122 rtx target, op0, op1;
15123 vec_perm_indices perm;
15124 machine_mode vmode;
15125 unsigned int vec_flags;
15126 bool one_vector_p;
15127 bool testing_p;
15130 /* Generate a variable permutation. */
15132 static void
15133 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
15135 machine_mode vmode = GET_MODE (target);
15136 bool one_vector_p = rtx_equal_p (op0, op1);
15138 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15139 gcc_checking_assert (GET_MODE (op0) == vmode);
15140 gcc_checking_assert (GET_MODE (op1) == vmode);
15141 gcc_checking_assert (GET_MODE (sel) == vmode);
15142 gcc_checking_assert (TARGET_SIMD);
15144 if (one_vector_p)
15146 if (vmode == V8QImode)
15148 /* Expand the argument to a V16QI mode by duplicating it. */
15149 rtx pair = gen_reg_rtx (V16QImode);
15150 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15151 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15153 else
15155 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15158 else
15160 rtx pair;
15162 if (vmode == V8QImode)
15164 pair = gen_reg_rtx (V16QImode);
15165 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15166 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15168 else
15170 pair = gen_reg_rtx (OImode);
15171 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15172 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15177 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15178 NELT is the number of elements in the vector. */
15180 void
15181 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15182 unsigned int nelt)
15184 machine_mode vmode = GET_MODE (target);
15185 bool one_vector_p = rtx_equal_p (op0, op1);
15186 rtx mask;
15188 /* The TBL instruction does not use a modulo index, so we must take care
15189 of that ourselves. */
15190 mask = aarch64_simd_gen_const_vector_dup (vmode,
15191 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15192 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15194 /* For big-endian, we also need to reverse the index within the vector
15195 (but not which vector). */
15196 if (BYTES_BIG_ENDIAN)
15198 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15199 if (!one_vector_p)
15200 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15201 sel = expand_simple_binop (vmode, XOR, sel, mask,
15202 NULL, 0, OPTAB_LIB_WIDEN);
15204 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
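/* A worked example of the masking above: for a two-vector V8QI permute
   each selector byte is ANDed with 15, so TBL's behaviour of returning
   zero for out-of-range indices is never triggered; on big-endian the
   byte is additionally XORed with 7, which reverses the lane numbering
   within each input vector without changing which vector is selected.  */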
15207 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15209 static void
15210 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15212 emit_insn (gen_rtx_SET (target,
15213 gen_rtx_UNSPEC (GET_MODE (target),
15214 gen_rtvec (2, op0, op1), code)));
15217 /* Expand an SVE vec_perm with the given operands. */
15219 void
15220 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15222 machine_mode data_mode = GET_MODE (target);
15223 machine_mode sel_mode = GET_MODE (sel);
15224 /* Enforced by the pattern condition. */
15225 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15227 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15228 size of the two value vectors, i.e. the upper bits of the indices
15229 are effectively ignored. SVE TBL instead produces 0 for any
15230 out-of-range indices, so we need to modulo all the vec_perm indices
15231 to ensure they are all in range. */
15232 rtx sel_reg = force_reg (sel_mode, sel);
15234 /* Check if the sel only references the first values vector. */
15235 if (GET_CODE (sel) == CONST_VECTOR
15236 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15238 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15239 return;
15242 /* Check if the two values vectors are the same. */
15243 if (rtx_equal_p (op0, op1))
15245 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15246 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15247 NULL, 0, OPTAB_DIRECT);
15248 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15249 return;
15252 /* Run TBL on each value vector and combine the results. */
15254 rtx res0 = gen_reg_rtx (data_mode);
15255 rtx res1 = gen_reg_rtx (data_mode);
15256 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15257 if (GET_CODE (sel) != CONST_VECTOR
15258 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15260 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15261 2 * nunits - 1);
15262 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15263 NULL, 0, OPTAB_DIRECT);
15265 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15266 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15267 NULL, 0, OPTAB_DIRECT);
15268 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15269 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15270 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15271 else
15272 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
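/* As a sketch of the general case above: with N elements per vector the
   selector is ANDed with 2N - 1 unless it is already known to be in
   range, one TBL picks the lanes of OP0 that are selected by indices
   0..N-1, a second TBL uses (selector - N) to pick the lanes belonging
   to OP1 (out-of-range indices yield zero), and the two partial results
   are then ORed together.  */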
15275 /* Recognize patterns suitable for the TRN instructions. */
15276 static bool
15277 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15279 HOST_WIDE_INT odd;
15280 poly_uint64 nelt = d->perm.length ();
15281 rtx out, in0, in1, x;
15282 machine_mode vmode = d->vmode;
15284 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15285 return false;
15287 /* Note that these are little-endian tests.
15288 We correct for big-endian later. */
15289 if (!d->perm[0].is_constant (&odd)
15290 || (odd != 0 && odd != 1)
15291 || !d->perm.series_p (0, 2, odd, 2)
15292 || !d->perm.series_p (1, 2, nelt + odd, 2))
15293 return false;
15295 /* Success! */
15296 if (d->testing_p)
15297 return true;
15299 in0 = d->op0;
15300 in1 = d->op1;
15301 /* We don't need a big-endian lane correction for SVE; see the comment
15302 at the head of aarch64-sve.md for details. */
15303 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15305 x = in0, in0 = in1, in1 = x;
15306 odd = !odd;
15308 out = d->target;
15310 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15311 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15312 return true;
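/* For example, on V4SImode inputs {a0 a1 a2 a3} and {b0 b1 b2 b3}
   (little-endian numbering), the selector {0, 4, 2, 6} is recognized
   here as TRN1, giving {a0 b0 a2 b2}, and {1, 5, 3, 7} as TRN2, giving
   {a1 b1 a3 b3}.  */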
15315 /* Recognize patterns suitable for the UZP instructions. */
15316 static bool
15317 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15319 HOST_WIDE_INT odd;
15320 rtx out, in0, in1, x;
15321 machine_mode vmode = d->vmode;
15323 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15324 return false;
15326 /* Note that these are little-endian tests.
15327 We correct for big-endian later. */
15328 if (!d->perm[0].is_constant (&odd)
15329 || (odd != 0 && odd != 1)
15330 || !d->perm.series_p (0, 1, odd, 2))
15331 return false;
15333 /* Success! */
15334 if (d->testing_p)
15335 return true;
15337 in0 = d->op0;
15338 in1 = d->op1;
15339 /* We don't need a big-endian lane correction for SVE; see the comment
15340 at the head of aarch64-sve.md for details. */
15341 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15343 x = in0, in0 = in1, in1 = x;
15344 odd = !odd;
15346 out = d->target;
15348 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15349 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15350 return true;
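/* For example, on V4SImode inputs {a0 a1 a2 a3} and {b0 b1 b2 b3}, the
   selector {0, 2, 4, 6} is recognized here as UZP1, giving
   {a0 a2 b0 b2}, and {1, 3, 5, 7} as UZP2, giving {a1 a3 b1 b3}.  */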
15353 /* Recognize patterns suitable for the ZIP instructions. */
15354 static bool
15355 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15357 unsigned int high;
15358 poly_uint64 nelt = d->perm.length ();
15359 rtx out, in0, in1, x;
15360 machine_mode vmode = d->vmode;
15362 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15363 return false;
15365 /* Note that these are little-endian tests.
15366 We correct for big-endian later. */
15367 poly_uint64 first = d->perm[0];
15368 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15369 || !d->perm.series_p (0, 2, first, 1)
15370 || !d->perm.series_p (1, 2, first + nelt, 1))
15371 return false;
15372 high = maybe_ne (first, 0U);
15374 /* Success! */
15375 if (d->testing_p)
15376 return true;
15378 in0 = d->op0;
15379 in1 = d->op1;
15380 /* We don't need a big-endian lane correction for SVE; see the comment
15381 at the head of aarch64-sve.md for details. */
15382 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15384 x = in0, in0 = in1, in1 = x;
15385 high = !high;
15387 out = d->target;
15389 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15390 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15391 return true;
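/* An illustrative example (little-endian lane numbering assumed): for
   V4SImode, first must be 0 or nelt/2 == 2, so { 0, 4, 1, 5 } interleaves
   the low halves and becomes ZIP1, while { 2, 6, 3, 7 } becomes ZIP2:

     zip1  v0.4s, v1.4s, v2.4s   // v0 = { v1[0], v2[0], v1[1], v2[1] }
     zip2  v0.4s, v1.4s, v2.4s   // v0 = { v1[2], v2[2], v1[3], v2[3] }  */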
15394 /* Recognize patterns for the EXT insn. */
15396 static bool
15397 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15399 HOST_WIDE_INT location;
15400 rtx offset;
15402 /* The first element always refers to the first vector.
15403 Check if the extracted indices are increasing by one. */
15404 if (d->vec_flags == VEC_SVE_PRED
15405 || !d->perm[0].is_constant (&location)
15406 || !d->perm.series_p (0, 1, location, 1))
15407 return false;
15409 /* Success! */
15410 if (d->testing_p)
15411 return true;
15413 /* The case where (location == 0) is a no-op for both big- and little-endian,
15414 and is removed by the mid-end at optimization levels -O1 and higher.
15416 We don't need a big-endian lane correction for SVE; see the comment
15417 at the head of aarch64-sve.md for details. */
15418 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15420 /* After setup, we want the high elements of the first vector (stored
15421 at the LSB end of the register), and the low elements of the second
15422 vector (stored at the MSB end of the register). So swap. */
15423 std::swap (d->op0, d->op1);
15424 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15425 to_constant () is safe since this is restricted to Advanced SIMD
15426 vectors. */
15427 location = d->perm.length ().to_constant () - location;
15430 offset = GEN_INT (location);
15431 emit_set_insn (d->target,
15432 gen_rtx_UNSPEC (d->vmode,
15433 gen_rtvec (3, d->op0, d->op1, offset),
15434 UNSPEC_EXT));
15435 return true;
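/* A worked example (little-endian, illustrative only): for V4SImode the
   selector { 1, 2, 3, 4 } has location == 1 and selects
   { op0[1], op0[2], op0[3], op1[0] }, i.e. an EXT starting one element
   (4 bytes) into the op1:op0 concatenation:

     ext  v0.16b, v1.16b, v2.16b, #4  */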
15438 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15439 within each 64-bit, 32-bit or 16-bit granule. */
15441 static bool
15442 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15444 HOST_WIDE_INT diff;
15445 unsigned int i, size, unspec;
15446 machine_mode pred_mode;
15448 if (d->vec_flags == VEC_SVE_PRED
15449 || !d->one_vector_p
15450 || !d->perm[0].is_constant (&diff))
15451 return false;
15453 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15454 if (size == 8)
15456 unspec = UNSPEC_REV64;
15457 pred_mode = VNx2BImode;
15459 else if (size == 4)
15461 unspec = UNSPEC_REV32;
15462 pred_mode = VNx4BImode;
15464 else if (size == 2)
15466 unspec = UNSPEC_REV16;
15467 pred_mode = VNx8BImode;
15469 else
15470 return false;
15472 unsigned int step = diff + 1;
15473 for (i = 0; i < step; ++i)
15474 if (!d->perm.series_p (i, step, diff - i, step))
15475 return false;
15477 /* Success! */
15478 if (d->testing_p)
15479 return true;
15481 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15482 if (d->vec_flags == VEC_SVE_DATA)
15484 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15485 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15486 UNSPEC_MERGE_PTRUE);
15488 emit_set_insn (d->target, src);
15489 return true;
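/* A worked example (illustrative only): for V8HImode the selector
   { 3, 2, 1, 0, 7, 6, 5, 4 } gives diff == 3, so size == (3 + 1) * 2 == 8
   and the permute reverses the halfwords within each 64-bit granule:

     rev64  v0.8h, v1.8h  */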
15492 /* Recognize patterns for the REV insn, which reverses elements within
15493 a full vector. */
15495 static bool
15496 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15498 poly_uint64 nelt = d->perm.length ();
15500 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15501 return false;
15503 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15504 return false;
15506 /* Success! */
15507 if (d->testing_p)
15508 return true;
15510 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15511 emit_set_insn (d->target, src);
15512 return true;
15515 static bool
15516 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15518 rtx out = d->target;
15519 rtx in0;
15520 HOST_WIDE_INT elt;
15521 machine_mode vmode = d->vmode;
15522 rtx lane;
15524 if (d->vec_flags == VEC_SVE_PRED
15525 || d->perm.encoding ().encoded_nelts () != 1
15526 || !d->perm[0].is_constant (&elt))
15527 return false;
15529 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 / GET_MODE_UNIT_SIZE (vmode))
15530 return false;
15532 /* Success! */
15533 if (d->testing_p)
15534 return true;
15536 /* The generic preparation in aarch64_expand_vec_perm_const_1
15537 swaps the operand order and the permute indices if it finds
15538 d->perm[0] to be in the second operand. Thus, we can always
15539 use d->op0 and need not do any extra arithmetic to get the
15540 correct lane number. */
15541 in0 = d->op0;
15542 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15544 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15545 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15546 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15547 return true;
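/* A small illustrative example: for V4SImode a broadcast of lane 1 is
   encoded as a single repeated selector element { 1, 1, 1, 1 }, so
   encoded_nelts () == 1 and elt == 1, giving:

     dup  v0.4s, v1.s[1]  */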
15550 static bool
15551 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15553 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15554 machine_mode vmode = d->vmode;
15556 /* Make sure that the indices are constant. */
15557 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15558 for (unsigned int i = 0; i < encoded_nelts; ++i)
15559 if (!d->perm[i].is_constant ())
15560 return false;
15562 if (d->testing_p)
15563 return true;
15565 /* Generic code will try constant permutation twice. Once with the
15566 original mode and again with the elements lowered to QImode.
15567 So wait and don't do the selector expansion ourselves. */
15568 if (vmode != V8QImode && vmode != V16QImode)
15569 return false;
15571 /* to_constant is safe since this routine is specific to Advanced SIMD
15572 vectors. */
15573 unsigned int nelt = d->perm.length ().to_constant ();
15574 for (unsigned int i = 0; i < nelt; ++i)
15575 /* If big-endian and two vectors we end up with a weird mixed-endian
15576 mode on NEON. Reverse the index within each word but not the word
15577 itself. to_constant is safe because we checked is_constant above. */
15578 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15579 ? d->perm[i].to_constant () ^ (nelt - 1)
15580 : d->perm[i].to_constant ());
15582 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15583 sel = force_reg (vmode, sel);
15585 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15586 return true;
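/* An illustrative note on the big-endian index fixup above: for a
   two-vector V16QImode permute, nelt == 16, so a selector value of 3
   becomes 3 ^ 15 == 12.  The XOR only touches the low four bits, so it
   reverses the byte index within its 16-byte input vector while leaving
   the choice of input vector unchanged.  */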
15589 /* Try to implement D using an SVE TBL instruction. */
15591 static bool
15592 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15594 unsigned HOST_WIDE_INT nelt;
15596 /* Permuting two variable-length vectors could overflow the
15597 index range. */
15598 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15599 return false;
15601 if (d->testing_p)
15602 return true;
15604 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15605 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15606 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15607 return true;
15610 static bool
15611 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15613 /* The pattern matching functions above are written to look for a small
15614 number to begin the sequence (0, 1, N/2). If we begin with an index
15615 from the second operand, we can swap the operands. */
15616 poly_int64 nelt = d->perm.length ();
15617 if (known_ge (d->perm[0], nelt))
15619 d->perm.rotate_inputs (1);
15620 std::swap (d->op0, d->op1);
15623 if ((d->vec_flags == VEC_ADVSIMD
15624 || d->vec_flags == VEC_SVE_DATA
15625 || d->vec_flags == VEC_SVE_PRED)
15626 && known_gt (nelt, 1))
15628 if (aarch64_evpc_rev_local (d))
15629 return true;
15630 else if (aarch64_evpc_rev_global (d))
15631 return true;
15632 else if (aarch64_evpc_ext (d))
15633 return true;
15634 else if (aarch64_evpc_dup (d))
15635 return true;
15636 else if (aarch64_evpc_zip (d))
15637 return true;
15638 else if (aarch64_evpc_uzp (d))
15639 return true;
15640 else if (aarch64_evpc_trn (d))
15641 return true;
15642 if (d->vec_flags == VEC_SVE_DATA)
15643 return aarch64_evpc_sve_tbl (d);
15644 else if (d->vec_flags == VEC_ADVSIMD)
15645 return aarch64_evpc_tbl (d);
15647 return false;
15650 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15652 static bool
15653 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15654 rtx op1, const vec_perm_indices &sel)
15656 struct expand_vec_perm_d d;
15658 /* Check whether the mask can be applied to a single vector. */
15659 if (op0 && rtx_equal_p (op0, op1))
15660 d.one_vector_p = true;
15661 else if (sel.all_from_input_p (0))
15663 d.one_vector_p = true;
15664 op1 = op0;
15666 else if (sel.all_from_input_p (1))
15668 d.one_vector_p = true;
15669 op0 = op1;
15671 else
15672 d.one_vector_p = false;
15674 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15675 sel.nelts_per_input ());
15676 d.vmode = vmode;
15677 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15678 d.target = target;
15679 d.op0 = op0;
15680 d.op1 = op1;
15681 d.testing_p = !target;
15683 if (!d.testing_p)
15684 return aarch64_expand_vec_perm_const_1 (&d);
15686 rtx_insn *last = get_last_insn ();
15687 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15688 gcc_assert (last == get_last_insn ());
15690 return ret;
15693 /* Generate a byte permute mask for a register of mode MODE,
15694 which has NUNITS units. */
15697 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15699 /* We have to reverse each vector because we don't have
15700 a permuted load that can reverse-load according to ABI rules. */
15701 rtx mask;
15702 rtvec v = rtvec_alloc (16);
15703 unsigned int i, j;
15704 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15706 gcc_assert (BYTES_BIG_ENDIAN);
15707 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15709 for (i = 0; i < nunits; i++)
15710 for (j = 0; j < usize; j++)
15711 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15712 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15713 return force_reg (V16QImode, mask);
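/* A worked example (illustrative only): for V8HImode, usize == 2 and
   nunits == 8, so the mask built above is the byte sequence
   { 1, 0, 3, 2, 5, 4, 7, 6, 9, 8, 11, 10, 13, 12, 15, 14 }, i.e. the two
   bytes of each halfword are swapped while the halfwords themselves stay
   in place.  */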
15716 /* Return true if X is a valid second operand for the SVE instruction
15717 that implements integer comparison OP_CODE. */
15719 static bool
15720 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15722 if (register_operand (x, VOIDmode))
15723 return true;
15725 switch (op_code)
15727 case LTU:
15728 case LEU:
15729 case GEU:
15730 case GTU:
15731 return aarch64_sve_cmp_immediate_p (x, false);
15732 case LT:
15733 case LE:
15734 case GE:
15735 case GT:
15736 case NE:
15737 case EQ:
15738 return aarch64_sve_cmp_immediate_p (x, true);
15739 default:
15740 gcc_unreachable ();
15744 /* Use predicated SVE instructions to implement the equivalent of:
15746 (set TARGET OP)
15748 given that PTRUE is an all-true predicate of the appropriate mode. */
15750 static void
15751 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15753 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15754 gen_rtvec (2, ptrue, op),
15755 UNSPEC_MERGE_PTRUE);
15756 rtx_insn *insn = emit_set_insn (target, unspec);
15757 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15760 /* Likewise, but also clobber the condition codes. */
15762 static void
15763 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15765 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15766 gen_rtvec (2, ptrue, op),
15767 UNSPEC_MERGE_PTRUE);
15768 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15769 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15772 /* Return the UNSPEC_COND_* code for comparison CODE. */
15774 static unsigned int
15775 aarch64_unspec_cond_code (rtx_code code)
15777 switch (code)
15779 case NE:
15780 return UNSPEC_COND_NE;
15781 case EQ:
15782 return UNSPEC_COND_EQ;
15783 case LT:
15784 return UNSPEC_COND_LT;
15785 case GT:
15786 return UNSPEC_COND_GT;
15787 case LE:
15788 return UNSPEC_COND_LE;
15789 case GE:
15790 return UNSPEC_COND_GE;
15791 default:
15792 gcc_unreachable ();
15796 /* Emit:
15798 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15800 where <X> is the operation associated with comparison CODE. This form
15801 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15802 semantics, such as when PRED might not be all-true and when comparing
15803 inactive lanes could have side effects. */
15805 static void
15806 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15807 rtx pred, rtx op0, rtx op1)
15809 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15810 gen_rtvec (3, pred, op0, op1),
15811 aarch64_unspec_cond_code (code));
15812 emit_set_insn (target, unspec);
15815 /* Expand an SVE integer comparison using the SVE equivalent of:
15817 (set TARGET (CODE OP0 OP1)). */
15819 void
15820 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15822 machine_mode pred_mode = GET_MODE (target);
15823 machine_mode data_mode = GET_MODE (op0);
15825 if (!aarch64_sve_cmp_operand_p (code, op1))
15826 op1 = force_reg (data_mode, op1);
15828 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15829 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15830 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15833 /* Emit the SVE equivalent of:
15835 (set TMP1 (CODE1 OP0 OP1))
15836 (set TMP2 (CODE2 OP0 OP1))
15837 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15839 PTRUE is an all-true predicate with the same mode as TARGET. */
15841 static void
15842 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15843 rtx ptrue, rtx op0, rtx op1)
15845 machine_mode pred_mode = GET_MODE (ptrue);
15846 rtx tmp1 = gen_reg_rtx (pred_mode);
15847 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15848 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15849 rtx tmp2 = gen_reg_rtx (pred_mode);
15850 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15851 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15852 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15855 /* Emit the SVE equivalent of:
15857 (set TMP (CODE OP0 OP1))
15858 (set TARGET (not TMP))
15860 PTRUE is an all-true predicate with the same mode as TARGET. */
15862 static void
15863 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15864 rtx op0, rtx op1)
15866 machine_mode pred_mode = GET_MODE (ptrue);
15867 rtx tmp = gen_reg_rtx (pred_mode);
15868 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15869 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15870 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15873 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15875 (set TARGET (CODE OP0 OP1))
15877 If CAN_INVERT_P is true, the caller can also handle inverted results;
15878 return true if the result is in fact inverted. */
15880 bool
15881 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15882 rtx op0, rtx op1, bool can_invert_p)
15884 machine_mode pred_mode = GET_MODE (target);
15885 machine_mode data_mode = GET_MODE (op0);
15887 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15888 switch (code)
15890 case UNORDERED:
15891 /* UNORDERED has no immediate form. */
15892 op1 = force_reg (data_mode, op1);
15893 /* fall through */
15894 case LT:
15895 case LE:
15896 case GT:
15897 case GE:
15898 case EQ:
15899 case NE:
15901 /* There is native support for the comparison. */
15902 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15903 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15904 return false;
15907 case LTGT:
15908 /* This is a trapping operation (LT or GT). */
15909 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15910 return false;
15912 case UNEQ:
15913 if (!flag_trapping_math)
15915 /* This would trap for signaling NaNs. */
15916 op1 = force_reg (data_mode, op1);
15917 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15918 return false;
15920 /* fall through */
15921 case UNLT:
15922 case UNLE:
15923 case UNGT:
15924 case UNGE:
15925 if (flag_trapping_math)
15927 /* Work out which elements are ordered. */
15928 rtx ordered = gen_reg_rtx (pred_mode);
15929 op1 = force_reg (data_mode, op1);
15930 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15932 /* Test the opposite condition for the ordered elements,
15933 then invert the result. */
15934 if (code == UNEQ)
15935 code = NE;
15936 else
15937 code = reverse_condition_maybe_unordered (code);
15938 if (can_invert_p)
15940 aarch64_emit_sve_predicated_cond (target, code,
15941 ordered, op0, op1);
15942 return true;
15944 rtx tmp = gen_reg_rtx (pred_mode);
15945 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15946 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15947 return false;
15949 break;
15951 case ORDERED:
15952 /* ORDERED has no immediate form. */
15953 op1 = force_reg (data_mode, op1);
15954 break;
15956 default:
15957 gcc_unreachable ();
15960 /* There is native support for the inverse comparison. */
15961 code = reverse_condition_maybe_unordered (code);
15962 if (can_invert_p)
15964 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15965 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15966 return true;
15968 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15969 return false;
15972 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15973 of the data being selected and CMP_MODE is the mode of the values being
15974 compared. */
15976 void
15977 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15978 rtx *ops)
15980 machine_mode pred_mode
15981 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15982 GET_MODE_SIZE (cmp_mode)).require ();
15983 rtx pred = gen_reg_rtx (pred_mode);
15984 if (FLOAT_MODE_P (cmp_mode))
15986 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15987 ops[4], ops[5], true))
15988 std::swap (ops[1], ops[2]);
15990 else
15991 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15993 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15994 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15997 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15998 true. However due to issues with register allocation it is preferable
15999 to avoid tying integer scalar and FP scalar modes. Executing integer
16000 operations in general registers is better than treating them as scalar
16001 vector operations. This reduces latency and avoids redundant int<->FP
16002 moves. So tie modes if they are either the same class, or vector modes
16003 with other vector modes, vector structs or any scalar mode. */
16005 static bool
16006 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
16008 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
16009 return true;
16011 /* We specifically want to allow elements of "structure" modes to
16012 be tieable to the structure. This more general condition allows
16013 other rarer situations too. The reason we don't extend this to
16014 predicate modes is that there are no predicate structure modes
16015 nor any specific instructions for extracting part of a predicate
16016 register. */
16017 if (aarch64_vector_data_mode_p (mode1)
16018 && aarch64_vector_data_mode_p (mode2))
16019 return true;
16021 /* Also allow any scalar modes with vectors. */
16022 if (aarch64_vector_mode_supported_p (mode1)
16023 || aarch64_vector_mode_supported_p (mode2))
16024 return true;
16026 return false;
16029 /* Return a new RTX holding the result of moving POINTER forward by
16030 AMOUNT bytes. */
16032 static rtx
16033 aarch64_move_pointer (rtx pointer, poly_int64 amount)
16035 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
16037 return adjust_automodify_address (pointer, GET_MODE (pointer),
16038 next, amount);
16041 /* Return a new RTX holding the result of moving POINTER forward by the
16042 size of the mode it points to. */
16044 static rtx
16045 aarch64_progress_pointer (rtx pointer)
16047 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
16050 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
16051 MODE bytes. */
16053 static void
16054 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
16055 machine_mode mode)
16057 rtx reg = gen_reg_rtx (mode);
16059 /* "Cast" the pointers to the correct mode. */
16060 *src = adjust_address (*src, mode, 0);
16061 *dst = adjust_address (*dst, mode, 0);
16062 /* Emit the memcpy. */
16063 emit_move_insn (reg, *src);
16064 emit_move_insn (*dst, reg);
16065 /* Move the pointers forward. */
16066 *src = aarch64_progress_pointer (*src);
16067 *dst = aarch64_progress_pointer (*dst);
16070 /* Expand movmem, as if from a __builtin_memcpy. Return true if
16071 we succeed, otherwise return false. */
16073 bool
16074 aarch64_expand_movmem (rtx *operands)
16076 unsigned int n;
16077 rtx dst = operands[0];
16078 rtx src = operands[1];
16079 rtx base;
16080 bool speed_p = !optimize_function_for_size_p (cfun);
16082 /* When optimizing for size, give a better estimate of the length of a
16083 memcpy call, but use the default otherwise. */
16084 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
16086 /* We can't do anything smart if the amount to copy is not constant. */
16087 if (!CONST_INT_P (operands[2]))
16088 return false;
16090 n = UINTVAL (operands[2]);
16092 /* Try to keep the number of instructions low. For cases below 16 bytes we
16093 need to make at most two moves. For cases above 16 bytes it will be one
16094 move for each 16 byte chunk, then at most two additional moves. */
16095 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
16096 return false;
16098 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
16099 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
16101 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
16102 src = adjust_automodify_address (src, VOIDmode, base, 0);
16104 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
16105 1-byte chunk. */
16106 if (n < 4)
16108 if (n >= 2)
16110 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16111 n -= 2;
16114 if (n == 1)
16115 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16117 return true;
16120 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
16121 4-byte chunk, partially overlapping with the previously copied chunk. */
16122 if (n < 8)
16124 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16125 n -= 4;
16126 if (n > 0)
16128 int move = n - 4;
16130 src = aarch64_move_pointer (src, move);
16131 dst = aarch64_move_pointer (dst, move);
16132 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16134 return true;
16137 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
16138 them, then (if applicable) an 8-byte chunk. */
16139 while (n >= 8)
16141 if (n / 16)
16143 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
16144 n -= 16;
16146 else
16148 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16149 n -= 8;
16153 /* Finish the final bytes of the copy. We can always do this in one
16154 instruction. We either copy the exact amount we need, or partially
16155 overlap with the previous chunk we copied and copy 8 bytes. */
16156 if (n == 0)
16157 return true;
16158 else if (n == 1)
16159 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
16160 else if (n == 2)
16161 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
16162 else if (n == 4)
16163 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16164 else
16166 if (n == 3)
16168 src = aarch64_move_pointer (src, -1);
16169 dst = aarch64_move_pointer (dst, -1);
16170 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
16172 else
16174 int move = n - 8;
16176 src = aarch64_move_pointer (src, move);
16177 dst = aarch64_move_pointer (dst, move);
16178 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
16182 return true;
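/* A worked example of the logic above (illustrative only): for a 23-byte
   copy when optimizing for speed, max_instructions == 7 and
   23/16 + 2 == 3, so the expansion is accepted.  The loop emits one
   16-byte (TImode) copy leaving n == 7, and the tail then takes the final
   else branch with move == -1, stepping both pointers back by one byte
   and emitting an 8-byte (DImode) copy that overlaps the previous chunk
   by a single byte.  */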
16185 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16186 SImode stores. Handle the case when the constant has identical
16187 bottom and top halves. This is beneficial when the two stores can be
16188 merged into an STP and we avoid synthesising potentially expensive
16189 immediates twice. Return true if such a split is possible. */
16191 bool
16192 aarch64_split_dimode_const_store (rtx dst, rtx src)
16194 rtx lo = gen_lowpart (SImode, src);
16195 rtx hi = gen_highpart_mode (SImode, DImode, src);
16197 bool size_p = optimize_function_for_size_p (cfun);
16199 if (!rtx_equal_p (lo, hi))
16200 return false;
16202 unsigned int orig_cost
16203 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16204 unsigned int lo_cost
16205 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16207 /* We want to transform:
16208 MOV x1, 49370
16209 MOVK x1, 0x140, lsl 16
16210 MOVK x1, 0xc0da, lsl 32
16211 MOVK x1, 0x140, lsl 48
16212 STR x1, [x0]
16213 into:
16214 MOV w1, 49370
16215 MOVK w1, 0x140, lsl 16
16216 STP w1, w1, [x0]
16217 So we want to perform this only when we save two instructions
16218 or more. When optimizing for size, however, accept any code size
16219 savings we can. */
16220 if (size_p && orig_cost <= lo_cost)
16221 return false;
16223 if (!size_p
16224 && (orig_cost <= lo_cost + 1))
16225 return false;
16227 rtx mem_lo = adjust_address (dst, SImode, 0);
16228 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16229 return false;
16231 rtx tmp_reg = gen_reg_rtx (SImode);
16232 aarch64_expand_mov_immediate (tmp_reg, lo);
16233 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16234 /* Don't emit an explicit store pair as this may not be always profitable.
16235 Let the sched-fusion logic decide whether to merge them. */
16236 emit_move_insn (mem_lo, tmp_reg);
16237 emit_move_insn (mem_hi, tmp_reg);
16239 return true;
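/* A worked example of the cost test above (illustrative only): for the
   constant in the comment, the DImode immediate needs 4 MOV/MOVK
   instructions (orig_cost == 4) while the SImode low half needs 2
   (lo_cost == 2), so 4 > 2 + 1 and the split is performed.  By contrast,
   a constant such as 0x0000000100000001 has orig_cost == 2 and
   lo_cost == 1, so 2 <= 1 + 1 rejects the split when optimizing for
   speed.  */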
16242 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16244 static unsigned HOST_WIDE_INT
16245 aarch64_asan_shadow_offset (void)
16247 return (HOST_WIDE_INT_1 << 36);
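/* Background note on the value above (standard ASan behaviour, not
   specific to this file): with the default shadow scale of 3, a shadow
   byte lives at (address >> 3) + (1 << 36), so the shadow region sits at
   a fixed 64GiB offset in the address space.  */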
16250 static rtx
16251 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16252 int code, tree treeop0, tree treeop1)
16254 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16255 rtx op0, op1;
16256 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16257 insn_code icode;
16258 struct expand_operand ops[4];
16260 start_sequence ();
16261 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16263 op_mode = GET_MODE (op0);
16264 if (op_mode == VOIDmode)
16265 op_mode = GET_MODE (op1);
16267 switch (op_mode)
16269 case E_QImode:
16270 case E_HImode:
16271 case E_SImode:
16272 cmp_mode = SImode;
16273 icode = CODE_FOR_cmpsi;
16274 break;
16276 case E_DImode:
16277 cmp_mode = DImode;
16278 icode = CODE_FOR_cmpdi;
16279 break;
16281 case E_SFmode:
16282 cmp_mode = SFmode;
16283 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16284 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16285 break;
16287 case E_DFmode:
16288 cmp_mode = DFmode;
16289 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16290 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16291 break;
16293 default:
16294 end_sequence ();
16295 return NULL_RTX;
16298 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16299 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16300 if (!op0 || !op1)
16302 end_sequence ();
16303 return NULL_RTX;
16305 *prep_seq = get_insns ();
16306 end_sequence ();
16308 create_fixed_operand (&ops[0], op0);
16309 create_fixed_operand (&ops[1], op1);
16311 start_sequence ();
16312 if (!maybe_expand_insn (icode, 2, ops))
16314 end_sequence ();
16315 return NULL_RTX;
16317 *gen_seq = get_insns ();
16318 end_sequence ();
16320 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16321 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
16324 static rtx
16325 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16326 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16328 rtx op0, op1, target;
16329 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16330 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16331 insn_code icode;
16332 struct expand_operand ops[6];
16333 int aarch64_cond;
16335 push_to_sequence (*prep_seq);
16336 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16338 op_mode = GET_MODE (op0);
16339 if (op_mode == VOIDmode)
16340 op_mode = GET_MODE (op1);
16342 switch (op_mode)
16344 case E_QImode:
16345 case E_HImode:
16346 case E_SImode:
16347 cmp_mode = SImode;
16348 icode = CODE_FOR_ccmpsi;
16349 break;
16351 case E_DImode:
16352 cmp_mode = DImode;
16353 icode = CODE_FOR_ccmpdi;
16354 break;
16356 case E_SFmode:
16357 cmp_mode = SFmode;
16358 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16359 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16360 break;
16362 case E_DFmode:
16363 cmp_mode = DFmode;
16364 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16365 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16366 break;
16368 default:
16369 end_sequence ();
16370 return NULL_RTX;
16373 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16374 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16375 if (!op0 || !op1)
16377 end_sequence ();
16378 return NULL_RTX;
16380 *prep_seq = get_insns ();
16381 end_sequence ();
16383 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16384 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16386 if (bit_code != AND)
16388 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16389 GET_MODE (XEXP (prev, 0))),
16390 VOIDmode, XEXP (prev, 0), const0_rtx);
16391 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16394 create_fixed_operand (&ops[0], XEXP (prev, 0));
16395 create_fixed_operand (&ops[1], target);
16396 create_fixed_operand (&ops[2], op0);
16397 create_fixed_operand (&ops[3], op1);
16398 create_fixed_operand (&ops[4], prev);
16399 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16401 push_to_sequence (*gen_seq);
16402 if (!maybe_expand_insn (icode, 6, ops))
16404 end_sequence ();
16405 return NULL_RTX;
16408 *gen_seq = get_insns ();
16409 end_sequence ();
16411 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16414 #undef TARGET_GEN_CCMP_FIRST
16415 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16417 #undef TARGET_GEN_CCMP_NEXT
16418 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16420 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16421 instruction fusion of some sort. */
16423 static bool
16424 aarch64_macro_fusion_p (void)
16426 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16430 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16431 should be kept together during scheduling. */
16433 static bool
16434 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16436 rtx set_dest;
16437 rtx prev_set = single_set (prev);
16438 rtx curr_set = single_set (curr);
16439 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16440 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16442 if (!aarch64_macro_fusion_p ())
16443 return false;
16445 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16447 /* We are trying to match:
16448 prev (mov) == (set (reg r0) (const_int imm16))
16449 curr (movk) == (set (zero_extract (reg r0)
16450 (const_int 16)
16451 (const_int 16))
16452 (const_int imm16_1)) */
16454 set_dest = SET_DEST (curr_set);
16456 if (GET_CODE (set_dest) == ZERO_EXTRACT
16457 && CONST_INT_P (SET_SRC (curr_set))
16458 && CONST_INT_P (SET_SRC (prev_set))
16459 && CONST_INT_P (XEXP (set_dest, 2))
16460 && INTVAL (XEXP (set_dest, 2)) == 16
16461 && REG_P (XEXP (set_dest, 0))
16462 && REG_P (SET_DEST (prev_set))
16463 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16465 return true;
16469 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16472 /* We're trying to match:
16473 prev (adrp) == (set (reg r1)
16474 (high (symbol_ref ("SYM"))))
16475 curr (add) == (set (reg r0)
16476 (lo_sum (reg r1)
16477 (symbol_ref ("SYM"))))
16478 Note that r0 need not necessarily be the same as r1, especially
16479 during pre-regalloc scheduling. */
16481 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16482 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16484 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16485 && REG_P (XEXP (SET_SRC (curr_set), 0))
16486 && REGNO (XEXP (SET_SRC (curr_set), 0))
16487 == REGNO (SET_DEST (prev_set))
16488 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16489 XEXP (SET_SRC (curr_set), 1)))
16490 return true;
16494 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16497 /* We're trying to match:
16498 prev (movk) == (set (zero_extract (reg r0)
16499 (const_int 16)
16500 (const_int 32))
16501 (const_int imm16_1))
16502 curr (movk) == (set (zero_extract (reg r0)
16503 (const_int 16)
16504 (const_int 48))
16505 (const_int imm16_2)) */
16507 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16508 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16509 && REG_P (XEXP (SET_DEST (prev_set), 0))
16510 && REG_P (XEXP (SET_DEST (curr_set), 0))
16511 && REGNO (XEXP (SET_DEST (prev_set), 0))
16512 == REGNO (XEXP (SET_DEST (curr_set), 0))
16513 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16514 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16515 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16516 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16517 && CONST_INT_P (SET_SRC (prev_set))
16518 && CONST_INT_P (SET_SRC (curr_set)))
16519 return true;
16522 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16524 /* We're trying to match:
16525 prev (adrp) == (set (reg r0)
16526 (high (symbol_ref ("SYM"))))
16527 curr (ldr) == (set (reg r1)
16528 (mem (lo_sum (reg r0)
16529 (symbol_ref ("SYM")))))
16531 curr (ldr) == (set (reg r1)
16532 (zero_extend (mem
16533 (lo_sum (reg r0)
16534 (symbol_ref ("SYM")))))) */
16535 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16536 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16538 rtx curr_src = SET_SRC (curr_set);
16540 if (GET_CODE (curr_src) == ZERO_EXTEND)
16541 curr_src = XEXP (curr_src, 0);
16543 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16544 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16545 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16546 == REGNO (SET_DEST (prev_set))
16547 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16548 XEXP (SET_SRC (prev_set), 0)))
16549 return true;
16553 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16554 && aarch_crypto_can_dual_issue (prev, curr))
16555 return true;
16557 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16558 && any_condjump_p (curr))
16560 enum attr_type prev_type = get_attr_type (prev);
16562 unsigned int condreg1, condreg2;
16563 rtx cc_reg_1;
16564 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16565 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16567 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16568 && prev
16569 && modified_in_p (cc_reg_1, prev))
16571 /* FIXME: this misses some instructions which are considered simple
16572 arithmetic for ThunderX. Simple shifts are missed here. */
16573 if (prev_type == TYPE_ALUS_SREG
16574 || prev_type == TYPE_ALUS_IMM
16575 || prev_type == TYPE_LOGICS_REG
16576 || prev_type == TYPE_LOGICS_IMM)
16577 return true;
16581 if (prev_set
16582 && curr_set
16583 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16584 && any_condjump_p (curr))
16586 /* We're trying to match:
16587 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16588 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16589 (const_int 0))
16590 (label_ref ("SYM"))
16591 (pc)) */
16592 if (SET_DEST (curr_set) == (pc_rtx)
16593 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16594 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16595 && REG_P (SET_DEST (prev_set))
16596 && REGNO (SET_DEST (prev_set))
16597 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16599 /* Fuse ALU operations followed by conditional branch instruction. */
16600 switch (get_attr_type (prev))
16602 case TYPE_ALU_IMM:
16603 case TYPE_ALU_SREG:
16604 case TYPE_ADC_REG:
16605 case TYPE_ADC_IMM:
16606 case TYPE_ADCS_REG:
16607 case TYPE_ADCS_IMM:
16608 case TYPE_LOGIC_REG:
16609 case TYPE_LOGIC_IMM:
16610 case TYPE_CSEL:
16611 case TYPE_ADR:
16612 case TYPE_MOV_IMM:
16613 case TYPE_SHIFT_REG:
16614 case TYPE_SHIFT_IMM:
16615 case TYPE_BFM:
16616 case TYPE_RBIT:
16617 case TYPE_REV:
16618 case TYPE_EXTEND:
16619 return true;
16621 default:;
16626 return false;
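/* Some illustrative instruction pairs for the fusion cases above (these
   are examples, not an exhaustive list):

     AARCH64_FUSE_MOV_MOVK:    mov  x0, #0x1234
                               movk x0, #0x5678, lsl 16
     AARCH64_FUSE_ADRP_ADD:    adrp x1, sym
                               add  x0, x1, :lo12:sym
     AARCH64_FUSE_ADRP_LDR:    adrp x0, sym
                               ldr  w1, [x0, :lo12:sym]
     AARCH64_FUSE_ALU_BRANCH:  add  w0, w0, #1
                               cbz  w0, label  */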
16629 /* Return true iff the instruction fusion described by OP is enabled. */
16631 bool
16632 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16634 return (aarch64_tune_params.fusible_ops & op) != 0;
16637 /* If MEM is in the form [base+offset], extract the two parts of the
16638 address into BASE and OFFSET; otherwise return false after clearing
16639 BASE and OFFSET. */
16641 bool
16642 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16644 rtx addr;
16646 gcc_assert (MEM_P (mem));
16648 addr = XEXP (mem, 0);
16650 if (REG_P (addr))
16652 *base = addr;
16653 *offset = const0_rtx;
16654 return true;
16657 if (GET_CODE (addr) == PLUS
16658 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16660 *base = XEXP (addr, 0);
16661 *offset = XEXP (addr, 1);
16662 return true;
16665 *base = NULL_RTX;
16666 *offset = NULL_RTX;
16668 return false;
16671 /* Types for scheduling fusion. */
16672 enum sched_fusion_type
16674 SCHED_FUSION_NONE = 0,
16675 SCHED_FUSION_LD_SIGN_EXTEND,
16676 SCHED_FUSION_LD_ZERO_EXTEND,
16677 SCHED_FUSION_LD,
16678 SCHED_FUSION_ST,
16679 SCHED_FUSION_NUM
16682 /* If INSN is a load or store whose address is in the form [base+offset],
16683 extract the two parts into BASE and OFFSET. Return the scheduling
16684 fusion type of INSN. */
16686 static enum sched_fusion_type
16687 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16689 rtx x, dest, src;
16690 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16692 gcc_assert (INSN_P (insn));
16693 x = PATTERN (insn);
16694 if (GET_CODE (x) != SET)
16695 return SCHED_FUSION_NONE;
16697 src = SET_SRC (x);
16698 dest = SET_DEST (x);
16700 machine_mode dest_mode = GET_MODE (dest);
16702 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16703 return SCHED_FUSION_NONE;
16705 if (GET_CODE (src) == SIGN_EXTEND)
16707 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16708 src = XEXP (src, 0);
16709 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16710 return SCHED_FUSION_NONE;
16712 else if (GET_CODE (src) == ZERO_EXTEND)
16714 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16715 src = XEXP (src, 0);
16716 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16717 return SCHED_FUSION_NONE;
16720 if (GET_CODE (src) == MEM && REG_P (dest))
16721 extract_base_offset_in_addr (src, base, offset);
16722 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16724 fusion = SCHED_FUSION_ST;
16725 extract_base_offset_in_addr (dest, base, offset);
16727 else
16728 return SCHED_FUSION_NONE;
16730 if (*base == NULL_RTX || *offset == NULL_RTX)
16731 fusion = SCHED_FUSION_NONE;
16733 return fusion;
16736 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16738 Currently we only support fusing ldr or str instructions, so FUSION_PRI
16739 and PRI are only calculated for these instructions. For other instructions,
16740 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
16741 kinds of instruction fusion can be added by returning different priorities.
16743 It's important that irrelevant instructions get the largest FUSION_PRI. */
16745 static void
16746 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16747 int *fusion_pri, int *pri)
16749 int tmp, off_val;
16750 rtx base, offset;
16751 enum sched_fusion_type fusion;
16753 gcc_assert (INSN_P (insn));
16755 tmp = max_pri - 1;
16756 fusion = fusion_load_store (insn, &base, &offset);
16757 if (fusion == SCHED_FUSION_NONE)
16759 *pri = tmp;
16760 *fusion_pri = tmp;
16761 return;
16764 /* Set FUSION_PRI according to fusion type and base register. */
16765 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16767 /* Calculate PRI. */
16768 tmp /= 2;
16770 /* INSN with smaller offset goes first. */
16771 off_val = (int)(INTVAL (offset));
16772 if (off_val >= 0)
16773 tmp -= (off_val & 0xfffff);
16774 else
16775 tmp += ((- off_val) & 0xfffff);
16777 *pri = tmp;
16778 return;
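/* A brief example of the effect (illustrative only): two stores
   str w1, [x2, 4] and str w3, [x2, 8] are both SCHED_FUSION_ST with the
   same base, so they receive the same FUSION_PRI; their PRI values then
   differ only by the offset term, so the store at offset 4 is scheduled
   first and the pair ends up adjacent for the ldp/stp peepholes.  */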
16781 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16782 Adjust priority of sha1h instructions so they are scheduled before
16783 other SHA1 instructions. */
16785 static int
16786 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16788 rtx x = PATTERN (insn);
16790 if (GET_CODE (x) == SET)
16792 x = SET_SRC (x);
16794 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16795 return priority + 10;
16798 return priority;
16801 /* Given OPERANDS of consecutive load/store, check if we can merge
16802 them into ldp/stp. LOAD is true if they are load instructions.
16803 MODE is the mode of memory operands. */
16805 bool
16806 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16807 machine_mode mode)
16809 HOST_WIDE_INT offval_1, offval_2, msize;
16810 enum reg_class rclass_1, rclass_2;
16811 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16813 if (load)
16815 mem_1 = operands[1];
16816 mem_2 = operands[3];
16817 reg_1 = operands[0];
16818 reg_2 = operands[2];
16819 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16820 if (REGNO (reg_1) == REGNO (reg_2))
16821 return false;
16823 else
16825 mem_1 = operands[0];
16826 mem_2 = operands[2];
16827 reg_1 = operands[1];
16828 reg_2 = operands[3];
16831 /* The mems cannot be volatile. */
16832 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16833 return false;
16835 /* If we have SImode and slow unaligned ldp,
16836 check that the alignment is at least 8 bytes. */
16837 if (mode == SImode
16838 && (aarch64_tune_params.extra_tuning_flags
16839 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16840 && !optimize_size
16841 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16842 return false;
16844 /* Check if the addresses are in the form of [base+offset]. */
16845 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16846 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16847 return false;
16848 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16849 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16850 return false;
16852 /* Check if the bases are same. */
16853 if (!rtx_equal_p (base_1, base_2))
16854 return false;
16856 /* The operands must be of the same size. */
16857 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16858 GET_MODE_SIZE (GET_MODE (mem_2))));
16860 offval_1 = INTVAL (offset_1);
16861 offval_2 = INTVAL (offset_2);
16862 /* We should only be trying this for fixed-sized modes. There is no
16863 SVE LDP/STP instruction. */
16864 msize = GET_MODE_SIZE (mode).to_constant ();
16865 /* Check if the offsets are consecutive. */
16866 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16867 return false;
16869 /* Check if the addresses are clobbered by load. */
16870 if (load)
16872 if (reg_mentioned_p (reg_1, mem_1))
16873 return false;
16875 /* In increasing order, the last load can clobber the address. */
16876 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16877 return false;
16880 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16881 rclass_1 = FP_REGS;
16882 else
16883 rclass_1 = GENERAL_REGS;
16885 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16886 rclass_2 = FP_REGS;
16887 else
16888 rclass_2 = GENERAL_REGS;
16890 /* Check if the registers are of same class. */
16891 if (rclass_1 != rclass_2)
16892 return false;
16894 return true;
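/* An illustrative example of a pair accepted by the checks above:

     ldr  x0, [x3, 8]
     ldr  x1, [x3, 16]

   Same non-volatile base, offsets differing by the access size, and both
   destinations in GENERAL_REGS, so the peepholes can rewrite the pair as
   ldp x0, x1, [x3, 8].  A pair such as ldr x0, [x3, 8] / ldr x1, [x3, 24]
   would be rejected because the offsets are not consecutive.  */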
16897 /* Given OPERANDS of consecutive load/store, check if we can merge
16898 them into ldp/stp by adjusting the offset. LOAD is true if they
16899 are load instructions. MODE is the mode of memory operands.
16901 Given below consecutive stores:
16903 str w1, [xb, 0x100]
16904 str w1, [xb, 0x104]
16905 str w1, [xb, 0x108]
16906 str w1, [xb, 0x10c]
16908 Though the offsets are out of the range supported by stp, we can
16909 still pair them after adjusting the offset, like:
16911 add scratch, xb, 0x100
16912 stp w1, w1, [scratch]
16913 stp w1, w1, [scratch, 0x8]
16915 The peephole patterns detecting this opportunity should guarantee
16916 the scratch register is available. */
16918 bool
16919 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16920 scalar_mode mode)
16922 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
16923 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
16924 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
16925 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
16927 if (load)
16929 reg_1 = operands[0];
16930 mem_1 = operands[1];
16931 reg_2 = operands[2];
16932 mem_2 = operands[3];
16933 reg_3 = operands[4];
16934 mem_3 = operands[5];
16935 reg_4 = operands[6];
16936 mem_4 = operands[7];
16937 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
16938 && REG_P (reg_3) && REG_P (reg_4));
16939 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
16940 return false;
16942 else
16944 mem_1 = operands[0];
16945 reg_1 = operands[1];
16946 mem_2 = operands[2];
16947 reg_2 = operands[3];
16948 mem_3 = operands[4];
16949 reg_3 = operands[5];
16950 mem_4 = operands[6];
16951 reg_4 = operands[7];
16953 /* Skip if the memory operand is by itself valid for ldp/stp. */
16954 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
16955 return false;
16957 /* The mems cannot be volatile. */
16958 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
16959 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
16960 return false;
16962 /* Check if the addresses are in the form of [base+offset]. */
16963 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16964 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16965 return false;
16966 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16967 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16968 return false;
16969 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
16970 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
16971 return false;
16972 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
16973 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
16974 return false;
16976 /* Check if the bases are same. */
16977 if (!rtx_equal_p (base_1, base_2)
16978 || !rtx_equal_p (base_2, base_3)
16979 || !rtx_equal_p (base_3, base_4))
16980 return false;
16982 offval_1 = INTVAL (offset_1);
16983 offval_2 = INTVAL (offset_2);
16984 offval_3 = INTVAL (offset_3);
16985 offval_4 = INTVAL (offset_4);
16986 msize = GET_MODE_SIZE (mode);
16987 /* Check if the offsets are consecutive. */
16988 if ((offval_1 != (offval_2 + msize)
16989 || offval_1 != (offval_3 + msize * 2)
16990 || offval_1 != (offval_4 + msize * 3))
16991 && (offval_4 != (offval_3 + msize)
16992 || offval_4 != (offval_2 + msize * 2)
16993 || offval_4 != (offval_1 + msize * 3)))
16994 return false;
16996 /* Check if the addresses are clobbered by load. */
16997 if (load)
16999 if (reg_mentioned_p (reg_1, mem_1)
17000 || reg_mentioned_p (reg_2, mem_2)
17001 || reg_mentioned_p (reg_3, mem_3))
17002 return false;
17004 /* In increasing order, the last load can clobber the address. */
17005 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
17006 return false;
17009 /* If we have SImode and slow unaligned ldp,
17010 check that the alignment is at least 8 bytes. */
17011 if (mode == SImode
17012 && (aarch64_tune_params.extra_tuning_flags
17013 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17014 && !optimize_size
17015 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
17016 return false;
17018 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
17019 rclass_1 = FP_REGS;
17020 else
17021 rclass_1 = GENERAL_REGS;
17023 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
17024 rclass_2 = FP_REGS;
17025 else
17026 rclass_2 = GENERAL_REGS;
17028 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
17029 rclass_3 = FP_REGS;
17030 else
17031 rclass_3 = GENERAL_REGS;
17033 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
17034 rclass_4 = FP_REGS;
17035 else
17036 rclass_4 = GENERAL_REGS;
17038 /* Check if the registers are of same class. */
17039 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
17040 return false;
17042 return true;
17045 /* Given OPERANDS of consecutive load/store, this function pairs them
17046 into ldp/stp after adjusting the offset. It depends on the fact
17047 that addresses of load/store instructions are in increasing order.
17048 MODE is the mode of memory operands. CODE is the rtl operator
17049 which should be applied to all memory operands, it's SIGN_EXTEND,
17050 ZERO_EXTEND or UNKNOWN. */
17052 bool
17053 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17054 scalar_mode mode, RTX_CODE code)
17056 rtx base, offset, t1, t2;
17057 rtx mem_1, mem_2, mem_3, mem_4;
17058 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
17060 if (load)
17062 mem_1 = operands[1];
17063 mem_2 = operands[3];
17064 mem_3 = operands[5];
17065 mem_4 = operands[7];
17067 else
17069 mem_1 = operands[0];
17070 mem_2 = operands[2];
17071 mem_3 = operands[4];
17072 mem_4 = operands[6];
17073 gcc_assert (code == UNKNOWN);
17076 extract_base_offset_in_addr (mem_1, &base, &offset);
17077 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
17079 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
17080 msize = GET_MODE_SIZE (mode);
17081 stp_off_limit = msize * 0x40;
17082 off_val = INTVAL (offset);
17083 abs_off = (off_val < 0) ? -off_val : off_val;
17084 new_off = abs_off % stp_off_limit;
17085 adj_off = abs_off - new_off;
17087 /* Further adjust to make sure all offsets are OK. */
17088 if ((new_off + msize * 2) >= stp_off_limit)
17090 adj_off += stp_off_limit;
17091 new_off -= stp_off_limit;
17094 /* Make sure the adjustment can be done with ADD/SUB instructions. */
17095 if (adj_off >= 0x1000)
17096 return false;
17098 if (off_val < 0)
17100 adj_off = -adj_off;
17101 new_off = -new_off;
17104 /* Create new memory references. */
17105 mem_1 = change_address (mem_1, VOIDmode,
17106 plus_constant (DImode, operands[8], new_off));
17108 /* Check if the adjusted address is OK for ldp/stp. */
17109 if (!aarch64_mem_pair_operand (mem_1, mode))
17110 return false;
17112 msize = GET_MODE_SIZE (mode);
17113 mem_2 = change_address (mem_2, VOIDmode,
17114 plus_constant (DImode,
17115 operands[8],
17116 new_off + msize));
17117 mem_3 = change_address (mem_3, VOIDmode,
17118 plus_constant (DImode,
17119 operands[8],
17120 new_off + msize * 2));
17121 mem_4 = change_address (mem_4, VOIDmode,
17122 plus_constant (DImode,
17123 operands[8],
17124 new_off + msize * 3));
17126 if (code == ZERO_EXTEND)
17128 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17129 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17130 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17131 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17133 else if (code == SIGN_EXTEND)
17135 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17136 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17137 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17138 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17141 if (load)
17143 operands[1] = mem_1;
17144 operands[3] = mem_2;
17145 operands[5] = mem_3;
17146 operands[7] = mem_4;
17148 else
17150 operands[0] = mem_1;
17151 operands[2] = mem_2;
17152 operands[4] = mem_3;
17153 operands[6] = mem_4;
17156 /* Emit adjusting instruction. */
17157 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
17158 /* Emit ldp/stp instructions. */
17159 t1 = gen_rtx_SET (operands[0], operands[1]);
17160 t2 = gen_rtx_SET (operands[2], operands[3]);
17161 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17162 t1 = gen_rtx_SET (operands[4], operands[5]);
17163 t2 = gen_rtx_SET (operands[6], operands[7]);
17164 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17165 return true;
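/* A worked example of the offset arithmetic above, using the SImode
   stores at 0x100..0x10c from the earlier comment (illustrative only):
   msize == 4, so stp_off_limit == 0x100; off_val == 0x100 gives
   new_off == 0 and adj_off == 0x100, which fits an ADD immediate, so we
   emit:

     add scratch, xb, 0x100
     stp w1, w1, [scratch]
     stp w1, w1, [scratch, 8]  */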
17168 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17169 it isn't worth branching around empty masked ops (including masked
17170 stores). */
17172 static bool
17173 aarch64_empty_mask_is_expensive (unsigned)
17175 return false;
17178 /* Return true if a pseudo register should be created and used to hold
17179 the GOT address for PIC code. */
17181 bool
17182 aarch64_use_pseudo_pic_reg (void)
17184 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17187 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17189 static int
17190 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17192 switch (XINT (x, 1))
17194 case UNSPEC_GOTSMALLPIC:
17195 case UNSPEC_GOTSMALLPIC28K:
17196 case UNSPEC_GOTTINYPIC:
17197 return 0;
17198 default:
17199 break;
17202 return default_unspec_may_trap_p (x, flags);
17206 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17207 return the log2 of that value. Otherwise return -1. */
17210 aarch64_fpconst_pow_of_2 (rtx x)
17212 const REAL_VALUE_TYPE *r;
17214 if (!CONST_DOUBLE_P (x))
17215 return -1;
17217 r = CONST_DOUBLE_REAL_VALUE (x);
17219 if (REAL_VALUE_NEGATIVE (*r)
17220 || REAL_VALUE_ISNAN (*r)
17221 || REAL_VALUE_ISINF (*r)
17222 || !real_isinteger (r, DFmode))
17223 return -1;
17225 return exact_log2 (real_to_integer (r));
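/* A few illustrative values: 4.0 -> 2, 1.0 -> 0, 0.5 -> -1 (not an
   integer), 3.0 -> -1 (not a power of 2) and -2.0 -> -1 (negative).  */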
17228 /* If X is a vector of equal CONST_DOUBLE values and that value is
17229 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
17232 aarch64_vec_fpconst_pow_of_2 (rtx x)
17234 int nelts;
17235 if (GET_CODE (x) != CONST_VECTOR
17236 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17237 return -1;
17239 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17240 return -1;
17242 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17243 if (firstval <= 0)
17244 return -1;
17246 for (int i = 1; i < nelts; i++)
17247 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17248 return -1;
17250 return firstval;
17253 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17254 to float.
17256 __fp16 always promotes through this hook.
17257 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17258 through the generic excess precision logic rather than here. */
17260 static tree
17261 aarch64_promoted_type (const_tree t)
17263 if (SCALAR_FLOAT_TYPE_P (t)
17264 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17265 return float_type_node;
17267 return NULL_TREE;
17270 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17272 static bool
17273 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17274 optimization_type opt_type)
17276 switch (op)
17278 case rsqrt_optab:
17279 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17281 default:
17282 return true;
17286 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17288 static unsigned int
17289 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17290 int *offset)
17292 /* Polynomial invariant 1 == (VG / 2) - 1. */
17293 gcc_assert (i == 1);
17294 *factor = 2;
17295 *offset = 1;
17296 return AARCH64_DWARF_VG;
17299 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17300 if MODE is HFmode, and punt to the generic implementation otherwise. */
17302 static bool
17303 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17305 return (mode == HFmode
17306 ? true
17307 : default_libgcc_floating_mode_supported_p (mode));
17310 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17311 if MODE is HFmode, and punt to the generic implementation otherwise. */
17313 static bool
17314 aarch64_scalar_mode_supported_p (scalar_mode mode)
17316 return (mode == HFmode
17317 ? true
17318 : default_scalar_mode_supported_p (mode));
17321 /* Set the value of FLT_EVAL_METHOD.
17322 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17324 0: evaluate all operations and constants, whose semantic type has at
17325 most the range and precision of type float, to the range and
17326 precision of float; evaluate all other operations and constants to
17327 the range and precision of the semantic type;
17329 N, where _FloatN is a supported interchange floating type
17330 evaluate all operations and constants, whose semantic type has at
17331 most the range and precision of _FloatN type, to the range and
17332 precision of the _FloatN type; evaluate all other operations and
17333 constants to the range and precision of the semantic type;
17335 If we have the ARMv8.2-A extensions then we support _Float16 in native
17336 precision, so we should set this to 16. Otherwise, we support the type,
17337 but want to evaluate expressions in float precision, so set this to
17338 0. */
17340 static enum flt_eval_method
17341 aarch64_excess_precision (enum excess_precision_type type)
17342 {
17343 switch (type)
17344 {
17345 case EXCESS_PRECISION_TYPE_FAST:
17346 case EXCESS_PRECISION_TYPE_STANDARD:
17347 /* We can calculate either in 16-bit range and precision or
17348 32-bit range and precision. Make that decision based on whether
17349 we have native support for the ARMv8.2-A 16-bit floating-point
17350 instructions or not. */
17351 return (TARGET_FP_F16INST
17352 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17353 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17354 case EXCESS_PRECISION_TYPE_IMPLICIT:
17355 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17356 default:
17357 gcc_unreachable ();
17358 }
17359 return FLT_EVAL_METHOD_UNPREDICTABLE;
17360 }
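/* Illustrative example (not part of the original source): given the
   hypothetical

     _Float16 x, y, z;
     _Float16 fma16 (void) { return x * y + z; }

   the arithmetic stays in _Float16 range and precision
   (FLT_EVAL_METHOD == 16) when the ARMv8.2-A FP16 instructions are
   available, e.g. with -march=armv8.2-a+fp16; otherwise it is evaluated
   in float and rounded back to _Float16 on return.  */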
17362 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17363 scheduled for speculative execution. Reject the long-running division
17364 and square-root instructions. */
17366 static bool
17367 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17368 {
17369 switch (get_attr_type (insn))
17370 {
17371 case TYPE_SDIV:
17372 case TYPE_UDIV:
17373 case TYPE_FDIVS:
17374 case TYPE_FDIVD:
17375 case TYPE_FSQRTS:
17376 case TYPE_FSQRTD:
17377 case TYPE_NEON_FP_SQRT_S:
17378 case TYPE_NEON_FP_SQRT_D:
17379 case TYPE_NEON_FP_SQRT_S_Q:
17380 case TYPE_NEON_FP_SQRT_D_Q:
17381 case TYPE_NEON_FP_DIV_S:
17382 case TYPE_NEON_FP_DIV_D:
17383 case TYPE_NEON_FP_DIV_S_Q:
17384 case TYPE_NEON_FP_DIV_D_Q:
17385 return false;
17386 default:
17387 return true;
17388 }
17389 }
17391 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17393 static int
17394 aarch64_compute_pressure_classes (reg_class *classes)
17395 {
17396 int i = 0;
17397 classes[i++] = GENERAL_REGS;
17398 classes[i++] = FP_REGS;
17399 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17400 registers need to go in PR_LO_REGS at some point during their
17401 lifetime. Splitting it into two halves has the effect of making
17402 all predicates count against PR_LO_REGS, so that we try whenever
17403 possible to restrict the number of live predicates to 8. This
17404 greatly reduces the amount of spilling in certain loops. */
17405 classes[i++] = PR_LO_REGS;
17406 classes[i++] = PR_HI_REGS;
17407 return i;
17408 }
17410 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17412 static bool
17413 aarch64_can_change_mode_class (machine_mode from,
17414 machine_mode to, reg_class_t)
17415 {
17416 if (BYTES_BIG_ENDIAN)
17417 {
17418 bool from_sve_p = aarch64_sve_data_mode_p (from);
17419 bool to_sve_p = aarch64_sve_data_mode_p (to);
17421 /* Don't allow changes between SVE data modes and non-SVE modes.
17422 See the comment at the head of aarch64-sve.md for details. */
17423 if (from_sve_p != to_sve_p)
17424 return false;
17426 /* Don't allow changes in element size: lane 0 of the new vector
17427 would not then be lane 0 of the old vector. See the comment
17428 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17429 description.
17431 In the worst case, this forces a register to be spilled in
17432 one mode and reloaded in the other, which handles the
17433 endianness correctly. */
17434 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17435 return false;
17436 }
17437 return true;
17438 }
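/* Illustrative example (not part of the original source): on a big-endian
   target this rejects, for instance, reinterpreting a VNx4SI value as
   VNx8HI (32-bit vs 16-bit elements), because the register and memory
   layouts of SVE vectors disagree about element order in that case;
   spilling in one mode and reloading in the other performs the reordering
   that a plain subreg cannot express.  */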
17440 /* Implement TARGET_SELECT_EARLY_REMAT_MODES. */
17442 static void
17443 aarch64_select_early_remat_modes (sbitmap modes)
17444 {
17445 /* SVE values are not normally live across a call, so it should be
17446 worth doing early rematerialization even in VL-specific mode. */
17447 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17448 {
17449 machine_mode mode = (machine_mode) i;
17450 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17451 if (vec_flags & VEC_ANY_SVE)
17452 bitmap_set_bit (modes, i);
17453 }
17454 }
17456 /* Target-specific selftests. */
17458 #if CHECKING_P
17460 namespace selftest {
17462 /* Selftest for the RTL loader.
17463 Verify that the RTL loader copes with a dump from
17464 print_rtx_function. This is essentially just a test that class
17465 function_reader can handle a real dump, but it also verifies
17466 that lookup_reg_by_dump_name correctly handles hard regs.
17467 The presence of hard reg names in the dump means that the test is
17468 target-specific, hence it is in this file. */
17470 static void
17471 aarch64_test_loading_full_dump ()
17472 {
17473 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17475 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17477 rtx_insn *insn_1 = get_insn_by_uid (1);
17478 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17480 rtx_insn *insn_15 = get_insn_by_uid (15);
17481 ASSERT_EQ (INSN, GET_CODE (insn_15));
17482 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17484 /* Verify crtl->return_rtx. */
17485 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17486 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17487 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17488 }
17490 /* Run all target-specific selftests. */
17492 static void
17493 aarch64_run_selftests (void)
17494 {
17495 aarch64_test_loading_full_dump ();
17496 }
17498 } // namespace selftest
17500 #endif /* #if CHECKING_P */
17502 #undef TARGET_ADDRESS_COST
17503 #define TARGET_ADDRESS_COST aarch64_address_cost
17505 /* This hook determines whether unnamed bitfields affect the alignment
17506 of the containing structure. The hook returns true if the structure
17507 should inherit the alignment requirements of an unnamed bitfield's
17508 type. */
17509 #undef TARGET_ALIGN_ANON_BITFIELD
17510 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
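/* Illustrative example (not part of the original source): returning true
   means the declared type of an unnamed bit-field still takes part in the
   struct's alignment, as the AAPCS64 layout rules expect.  For the
   hypothetical

     struct s { char c; long long : 1; };

   the unnamed long long bit-field is expected to raise the alignment of
   struct s to 8 bytes even though no named member needs it.  */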
17512 #undef TARGET_ASM_ALIGNED_DI_OP
17513 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17515 #undef TARGET_ASM_ALIGNED_HI_OP
17516 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17518 #undef TARGET_ASM_ALIGNED_SI_OP
17519 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17521 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17522 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17523 hook_bool_const_tree_hwi_hwi_const_tree_true
17525 #undef TARGET_ASM_FILE_START
17526 #define TARGET_ASM_FILE_START aarch64_start_file
17528 #undef TARGET_ASM_OUTPUT_MI_THUNK
17529 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17531 #undef TARGET_ASM_SELECT_RTX_SECTION
17532 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17534 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17535 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17537 #undef TARGET_BUILD_BUILTIN_VA_LIST
17538 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17540 #undef TARGET_CALLEE_COPIES
17541 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17543 #undef TARGET_CAN_ELIMINATE
17544 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17546 #undef TARGET_CAN_INLINE_P
17547 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17549 #undef TARGET_CANNOT_FORCE_CONST_MEM
17550 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17552 #undef TARGET_CASE_VALUES_THRESHOLD
17553 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17555 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17556 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17558 /* Only the least significant bit is used for initialization guard
17559 variables. */
17560 #undef TARGET_CXX_GUARD_MASK_BIT
17561 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
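/* Illustrative sketch (not part of the original source): with only bit 0
   significant, the fast-path test emitted for a function-local static is
   conceptually

     if ((guard & 1) == 0 && __cxa_guard_acquire (&guard))
       {
         // run the constructor
         __cxa_guard_release (&guard);
       }

   rather than a check of the guard variable's whole first byte.  */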
17563 #undef TARGET_C_MODE_FOR_SUFFIX
17564 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17566 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17567 #undef TARGET_DEFAULT_TARGET_FLAGS
17568 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17569 #endif
17571 #undef TARGET_CLASS_MAX_NREGS
17572 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17574 #undef TARGET_BUILTIN_DECL
17575 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17577 #undef TARGET_BUILTIN_RECIPROCAL
17578 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17580 #undef TARGET_C_EXCESS_PRECISION
17581 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17583 #undef TARGET_EXPAND_BUILTIN
17584 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17586 #undef TARGET_EXPAND_BUILTIN_VA_START
17587 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17589 #undef TARGET_FOLD_BUILTIN
17590 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17592 #undef TARGET_FUNCTION_ARG
17593 #define TARGET_FUNCTION_ARG aarch64_function_arg
17595 #undef TARGET_FUNCTION_ARG_ADVANCE
17596 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17598 #undef TARGET_FUNCTION_ARG_BOUNDARY
17599 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17601 #undef TARGET_FUNCTION_ARG_PADDING
17602 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17604 #undef TARGET_GET_RAW_RESULT_MODE
17605 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17606 #undef TARGET_GET_RAW_ARG_MODE
17607 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17609 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17610 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17612 #undef TARGET_FUNCTION_VALUE
17613 #define TARGET_FUNCTION_VALUE aarch64_function_value
17615 #undef TARGET_FUNCTION_VALUE_REGNO_P
17616 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17618 #undef TARGET_GIMPLE_FOLD_BUILTIN
17619 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17621 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17622 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17624 #undef TARGET_INIT_BUILTINS
17625 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17627 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17628 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17629 aarch64_ira_change_pseudo_allocno_class
17631 #undef TARGET_LEGITIMATE_ADDRESS_P
17632 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17634 #undef TARGET_LEGITIMATE_CONSTANT_P
17635 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17637 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17638 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17639 aarch64_legitimize_address_displacement
17641 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17642 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17644 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17645 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17646 aarch64_libgcc_floating_mode_supported_p
17648 #undef TARGET_MANGLE_TYPE
17649 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17651 #undef TARGET_MEMORY_MOVE_COST
17652 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17654 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17655 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17657 #undef TARGET_MUST_PASS_IN_STACK
17658 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17660 /* This target hook should return true if accesses to volatile bitfields
17661 should use the narrowest mode possible. It should return false if these
17662 accesses should use the bitfield container type. */
17663 #undef TARGET_NARROW_VOLATILE_BITFIELD
17664 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
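/* Illustrative example (not part of the original source): because this
   returns false, a volatile bit-field is accessed through its container
   type.  For a hypothetical device structure

     struct dev { volatile unsigned int ready : 1, error : 1; };

   reading the ready field is done with a 32-bit load of the containing
   unsigned int, not with the narrowest (byte-sized) access that would
   cover the bit-field.  */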
17666 #undef TARGET_OPTION_OVERRIDE
17667 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17669 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17670 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17671 aarch64_override_options_after_change
17673 #undef TARGET_OPTION_SAVE
17674 #define TARGET_OPTION_SAVE aarch64_option_save
17676 #undef TARGET_OPTION_RESTORE
17677 #define TARGET_OPTION_RESTORE aarch64_option_restore
17679 #undef TARGET_OPTION_PRINT
17680 #define TARGET_OPTION_PRINT aarch64_option_print
17682 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17683 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17685 #undef TARGET_SET_CURRENT_FUNCTION
17686 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17688 #undef TARGET_PASS_BY_REFERENCE
17689 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17691 #undef TARGET_PREFERRED_RELOAD_CLASS
17692 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17694 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17695 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17697 #undef TARGET_PROMOTED_TYPE
17698 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17700 #undef TARGET_SECONDARY_RELOAD
17701 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17703 #undef TARGET_SHIFT_TRUNCATION_MASK
17704 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17706 #undef TARGET_SETUP_INCOMING_VARARGS
17707 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17709 #undef TARGET_STRUCT_VALUE_RTX
17710 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17712 #undef TARGET_REGISTER_MOVE_COST
17713 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17715 #undef TARGET_RETURN_IN_MEMORY
17716 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17718 #undef TARGET_RETURN_IN_MSB
17719 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17721 #undef TARGET_RTX_COSTS
17722 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17724 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17725 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17727 #undef TARGET_SCHED_ISSUE_RATE
17728 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17730 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17731 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17732 aarch64_sched_first_cycle_multipass_dfa_lookahead
17734 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17735 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17736 aarch64_first_cycle_multipass_dfa_lookahead_guard
17738 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17739 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17740 aarch64_get_separate_components
17742 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17743 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17744 aarch64_components_for_bb
17746 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17747 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17748 aarch64_disqualify_components
17750 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17751 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17752 aarch64_emit_prologue_components
17754 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17755 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17756 aarch64_emit_epilogue_components
17758 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17759 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17760 aarch64_set_handled_components
17762 #undef TARGET_TRAMPOLINE_INIT
17763 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17765 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17766 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17768 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17769 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17771 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17772 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17773 aarch64_builtin_support_vector_misalignment
17775 #undef TARGET_ARRAY_MODE
17776 #define TARGET_ARRAY_MODE aarch64_array_mode
17778 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17779 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17781 #undef TARGET_VECTORIZE_ADD_STMT_COST
17782 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17784 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17785 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17786 aarch64_builtin_vectorization_cost
17788 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17789 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17791 #undef TARGET_VECTORIZE_BUILTINS
17792 #define TARGET_VECTORIZE_BUILTINS
17794 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17795 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17796 aarch64_builtin_vectorized_function
17798 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17799 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17800 aarch64_autovectorize_vector_sizes
17802 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17803 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17804 aarch64_atomic_assign_expand_fenv
17806 /* Section anchor support. */
17808 #undef TARGET_MIN_ANCHOR_OFFSET
17809 #define TARGET_MIN_ANCHOR_OFFSET -256
17811 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17812 byte offset; we can do much more for larger data types, but have no way
17813 to determine the size of the access. We assume accesses are aligned. */
17814 #undef TARGET_MAX_ANCHOR_OFFSET
17815 #define TARGET_MAX_ANCHOR_OFFSET 4095
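/* Illustrative example (not part of the original source): with these
   limits, -fsection-anchors can address several small statics from one
   anchor, e.g. for the hypothetical

     static int a, b, c;
     int sum (void) { return a + b + c; }

   the anchor address can be formed once (adrp/add) and a, b and c then
   loaded with reg+offset addressing, provided each lies within
   [-256, 4095] bytes of the anchor.  */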
17817 #undef TARGET_VECTOR_ALIGNMENT
17818 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17820 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17821 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17822 aarch64_vectorize_preferred_vector_alignment
17823 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17824 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17825 aarch64_simd_vector_alignment_reachable
17827 /* vec_perm support. */
17829 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17830 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17831 aarch64_vectorize_vec_perm_const
17833 #undef TARGET_VECTORIZE_GET_MASK_MODE
17834 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17835 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17836 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17837 aarch64_empty_mask_is_expensive
17839 #undef TARGET_INIT_LIBFUNCS
17840 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17842 #undef TARGET_FIXED_CONDITION_CODE_REGS
17843 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17845 #undef TARGET_FLAGS_REGNUM
17846 #define TARGET_FLAGS_REGNUM CC_REGNUM
17848 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17849 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17851 #undef TARGET_ASAN_SHADOW_OFFSET
17852 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17854 #undef TARGET_LEGITIMIZE_ADDRESS
17855 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17857 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17858 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17860 #undef TARGET_CAN_USE_DOLOOP_P
17861 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17863 #undef TARGET_SCHED_ADJUST_PRIORITY
17864 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17866 #undef TARGET_SCHED_MACRO_FUSION_P
17867 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17869 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17870 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17872 #undef TARGET_SCHED_FUSION_PRIORITY
17873 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17875 #undef TARGET_UNSPEC_MAY_TRAP_P
17876 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17878 #undef TARGET_USE_PSEUDO_PIC_REG
17879 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17881 #undef TARGET_PRINT_OPERAND
17882 #define TARGET_PRINT_OPERAND aarch64_print_operand
17884 #undef TARGET_PRINT_OPERAND_ADDRESS
17885 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17887 #undef TARGET_OPTAB_SUPPORTED_P
17888 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17890 #undef TARGET_OMIT_STRUCT_RETURN_REG
17891 #define TARGET_OMIT_STRUCT_RETURN_REG true
17893 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17894 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
17895 aarch64_dwarf_poly_indeterminate_value
17897 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
17898 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
17899 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
17901 #undef TARGET_HARD_REGNO_NREGS
17902 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
17903 #undef TARGET_HARD_REGNO_MODE_OK
17904 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
17906 #undef TARGET_MODES_TIEABLE_P
17907 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
17909 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
17910 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
17911 aarch64_hard_regno_call_part_clobbered
17913 #undef TARGET_CONSTANT_ALIGNMENT
17914 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
17916 #undef TARGET_COMPUTE_PRESSURE_CLASSES
17917 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
17919 #undef TARGET_CAN_CHANGE_MODE_CLASS
17920 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
17922 #undef TARGET_SELECT_EARLY_REMAT_MODES
17923 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
17925 #if CHECKING_P
17926 #undef TARGET_RUN_TARGET_SELFTESTS
17927 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
17928 #endif /* #if CHECKING_P */
17930 struct gcc_target targetm = TARGET_INITIALIZER;
17932 #include "gt-aarch64.h"