[aarch64] Add HiSilicon tsv110 CPU support
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 9c68025e1f19236305ab41b0f7ed0cbfa039d3e1
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2018 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #define IN_TARGET_CODE 1
23 #include "config.h"
24 #define INCLUDE_STRING
25 #include "system.h"
26 #include "coretypes.h"
27 #include "backend.h"
28 #include "target.h"
29 #include "rtl.h"
30 #include "tree.h"
31 #include "memmodel.h"
32 #include "gimple.h"
33 #include "cfghooks.h"
34 #include "cfgloop.h"
35 #include "df.h"
36 #include "tm_p.h"
37 #include "stringpool.h"
38 #include "attribs.h"
39 #include "optabs.h"
40 #include "regs.h"
41 #include "emit-rtl.h"
42 #include "recog.h"
43 #include "diagnostic.h"
44 #include "insn-attr.h"
45 #include "alias.h"
46 #include "fold-const.h"
47 #include "stor-layout.h"
48 #include "calls.h"
49 #include "varasm.h"
50 #include "output.h"
51 #include "flags.h"
52 #include "explow.h"
53 #include "expr.h"
54 #include "reload.h"
55 #include "langhooks.h"
56 #include "opts.h"
57 #include "params.h"
58 #include "gimplify.h"
59 #include "dwarf2.h"
60 #include "gimple-iterator.h"
61 #include "tree-vectorizer.h"
62 #include "aarch64-cost-tables.h"
63 #include "dumpfile.h"
64 #include "builtins.h"
65 #include "rtl-iter.h"
66 #include "tm-constrs.h"
67 #include "sched-int.h"
68 #include "target-globals.h"
69 #include "common/common-target.h"
70 #include "cfgrtl.h"
71 #include "selftest.h"
72 #include "selftest-rtl.h"
73 #include "rtx-vector-builder.h"
75 /* This file should be included last. */
76 #include "target-def.h"
78 /* Defined for convenience. */
79 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
81 /* Information about a legitimate vector immediate operand. */
82 struct simd_immediate_info
84 enum insn_type { MOV, MVN };
85 enum modifier_type { LSL, MSL };
87 simd_immediate_info () {}
88 simd_immediate_info (scalar_float_mode, rtx);
89 simd_immediate_info (scalar_int_mode, unsigned HOST_WIDE_INT,
90 insn_type = MOV, modifier_type = LSL,
91 unsigned int = 0);
92 simd_immediate_info (scalar_mode, rtx, rtx);
94 /* The mode of the elements. */
95 scalar_mode elt_mode;
97 /* The value of each element if all elements are the same, or the
98 first value if the constant is a series. */
99 rtx value;
101 /* The value of the step if the constant is a series, null otherwise. */
102 rtx step;
104 /* The instruction to use to move the immediate into a vector. */
105 insn_type insn;
107 /* The kind of shift modifier to use, and the number of bits to shift.
108 This is (LSL, 0) if no shift is needed. */
109 modifier_type modifier;
110 unsigned int shift;
113 /* Construct a floating-point immediate in which each element has mode
114 ELT_MODE_IN and value VALUE_IN. */
115 inline simd_immediate_info
116 ::simd_immediate_info (scalar_float_mode elt_mode_in, rtx value_in)
117 : elt_mode (elt_mode_in), value (value_in), step (NULL_RTX), insn (MOV),
118 modifier (LSL), shift (0)
121 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
122 and value VALUE_IN. The other parameters are as for the structure
123 fields. */
124 inline simd_immediate_info
125 ::simd_immediate_info (scalar_int_mode elt_mode_in,
126 unsigned HOST_WIDE_INT value_in,
127 insn_type insn_in, modifier_type modifier_in,
128 unsigned int shift_in)
129 : elt_mode (elt_mode_in), value (gen_int_mode (value_in, elt_mode_in)),
130 step (NULL_RTX), insn (insn_in), modifier (modifier_in), shift (shift_in)
133 /* Construct an integer immediate in which each element has mode ELT_MODE_IN
134 and where element I is equal to VALUE_IN + I * STEP_IN. */
135 inline simd_immediate_info
136 ::simd_immediate_info (scalar_mode elt_mode_in, rtx value_in, rtx step_in)
137 : elt_mode (elt_mode_in), value (value_in), step (step_in), insn (MOV),
138 modifier (LSL), shift (0)
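/* As an example of how these constructors are used (an illustrative
   sketch; the exact encoding is chosen by the immediate-validation
   routines further down), a V8HImode constant whose elements are all
   0x5600 is recorded as an 8-bit payload plus a shift:

     simd_immediate_info info (HImode, 0x56, simd_immediate_info::MOV,
                               simd_immediate_info::LSL, 8);

   while an SVE series constant such as { 1, 3, 5, ... } would use the
   (mode, value, step) form with value 1 and step 2.  */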
141 /* The current code model. */
142 enum aarch64_code_model aarch64_cmodel;
144 /* The number of 64-bit elements in an SVE vector. */
145 poly_uint16 aarch64_sve_vg;
147 #ifdef HAVE_AS_TLS
148 #undef TARGET_HAVE_TLS
149 #define TARGET_HAVE_TLS 1
150 #endif
152 static bool aarch64_composite_type_p (const_tree, machine_mode);
153 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
154 const_tree,
155 machine_mode *, int *,
156 bool *);
157 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
158 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
159 static void aarch64_override_options_after_change (void);
160 static bool aarch64_vector_mode_supported_p (machine_mode);
161 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
162 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
163 const_tree type,
164 int misalignment,
165 bool is_packed);
166 static machine_mode aarch64_simd_container_mode (scalar_mode, poly_int64);
167 static bool aarch64_print_address_internal (FILE*, machine_mode, rtx,
168 aarch64_addr_query_type);
170 /* Major revision number of the ARM Architecture implemented by the target. */
171 unsigned aarch64_architecture_version;
173 /* The processor for which instructions should be scheduled. */
174 enum aarch64_processor aarch64_tune = cortexa53;
176 /* Mask to specify which instruction scheduling options should be used. */
177 unsigned long aarch64_tune_flags = 0;
179 /* Global flag for PC relative loads. */
180 bool aarch64_pcrelative_literal_loads;
182 /* Global flag for whether frame pointer is enabled. */
183 bool aarch64_use_frame_pointer;
185 /* Support for command line parsing of boolean flags in the tuning
186 structures. */
187 struct aarch64_flag_desc
189 const char* name;
190 unsigned int flag;
193 #define AARCH64_FUSION_PAIR(name, internal_name) \
194 { name, AARCH64_FUSE_##internal_name },
195 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
197 { "none", AARCH64_FUSE_NOTHING },
198 #include "aarch64-fusion-pairs.def"
199 { "all", AARCH64_FUSE_ALL },
200 { NULL, AARCH64_FUSE_NOTHING }
203 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
204 { name, AARCH64_EXTRA_TUNE_##internal_name },
205 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
207 { "none", AARCH64_EXTRA_TUNE_NONE },
208 #include "aarch64-tuning-flags.def"
209 { "all", AARCH64_EXTRA_TUNE_ALL },
210 { NULL, AARCH64_EXTRA_TUNE_NONE }
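/* For example, the AARCH64_EXTRA_TUNING_OPTION ("rename_fma_regs",
   RENAME_FMA_REGS) entry in aarch64-tuning-flags.def expands in the
   table above to

     { "rename_fma_regs", AARCH64_EXTRA_TUNE_RENAME_FMA_REGS },

   so each table simply maps the user-visible flag name onto the
   corresponding internal bit, with "none" and "all" handled
   explicitly.  */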
213 /* Tuning parameters. */
215 static const struct cpu_addrcost_table generic_addrcost_table =
218 1, /* hi */
219 0, /* si */
220 0, /* di */
221 1, /* ti */
223 0, /* pre_modify */
224 0, /* post_modify */
225 0, /* register_offset */
226 0, /* register_sextend */
227 0, /* register_zextend */
228 0 /* imm_offset */
231 static const struct cpu_addrcost_table exynosm1_addrcost_table =
234 0, /* hi */
235 0, /* si */
236 0, /* di */
237 2, /* ti */
239 0, /* pre_modify */
240 0, /* post_modify */
241 1, /* register_offset */
242 1, /* register_sextend */
243 2, /* register_zextend */
244 0, /* imm_offset */
247 static const struct cpu_addrcost_table xgene1_addrcost_table =
250 1, /* hi */
251 0, /* si */
252 0, /* di */
253 1, /* ti */
255 1, /* pre_modify */
256 0, /* post_modify */
257 0, /* register_offset */
258 1, /* register_sextend */
259 1, /* register_zextend */
260 0, /* imm_offset */
263 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
266 1, /* hi */
267 1, /* si */
268 1, /* di */
269 2, /* ti */
271 0, /* pre_modify */
272 0, /* post_modify */
273 2, /* register_offset */
274 3, /* register_sextend */
275 3, /* register_zextend */
276 0, /* imm_offset */
279 static const struct cpu_addrcost_table tsv110_addrcost_table =
282 1, /* hi */
283 0, /* si */
284 0, /* di */
285 1, /* ti */
287 0, /* pre_modify */
288 0, /* post_modify */
289 0, /* register_offset */
290 1, /* register_sextend */
291 1, /* register_zextend */
292 0, /* imm_offset */
295 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
298 1, /* hi */
299 1, /* si */
300 1, /* di */
301 2, /* ti */
303 1, /* pre_modify */
304 1, /* post_modify */
305 3, /* register_offset */
306 3, /* register_sextend */
307 3, /* register_zextend */
308 2, /* imm_offset */
311 static const struct cpu_regmove_cost generic_regmove_cost =
313 1, /* GP2GP */
314 /* Avoid the use of slow int<->fp moves for spilling by setting
315 their cost higher than memmov_cost. */
316 5, /* GP2FP */
317 5, /* FP2GP */
318 2 /* FP2FP */
321 static const struct cpu_regmove_cost cortexa57_regmove_cost =
323 1, /* GP2GP */
324 /* Avoid the use of slow int<->fp moves for spilling by setting
325 their cost higher than memmov_cost. */
326 5, /* GP2FP */
327 5, /* FP2GP */
328 2 /* FP2FP */
331 static const struct cpu_regmove_cost cortexa53_regmove_cost =
333 1, /* GP2GP */
334 /* Avoid the use of slow int<->fp moves for spilling by setting
335 their cost higher than memmov_cost. */
336 5, /* GP2FP */
337 5, /* FP2GP */
338 2 /* FP2FP */
341 static const struct cpu_regmove_cost exynosm1_regmove_cost =
343 1, /* GP2GP */
344 /* Avoid the use of slow int<->fp moves for spilling by setting
345 their cost higher than memmov_cost (actual costs: 4 and 9). */
346 9, /* GP2FP */
347 9, /* FP2GP */
348 1 /* FP2FP */
351 static const struct cpu_regmove_cost thunderx_regmove_cost =
353 2, /* GP2GP */
354 2, /* GP2FP */
355 6, /* FP2GP */
356 4 /* FP2FP */
359 static const struct cpu_regmove_cost xgene1_regmove_cost =
361 1, /* GP2GP */
362 /* Avoid the use of slow int<->fp moves for spilling by setting
363 their cost higher than memmov_cost. */
364 8, /* GP2FP */
365 8, /* FP2GP */
366 2 /* FP2FP */
369 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
371 2, /* GP2GP */
372 /* Avoid the use of int<->fp moves for spilling. */
373 6, /* GP2FP */
374 6, /* FP2GP */
375 4 /* FP2FP */
378 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
380 1, /* GP2GP */
381 /* Avoid the use of int<->fp moves for spilling. */
382 8, /* GP2FP */
383 8, /* FP2GP */
384 4 /* FP2FP */
387 static const struct cpu_regmove_cost tsv110_regmove_cost =
389 1, /* GP2GP */
390 /* Avoid the use of slow int<->fp moves for spilling by setting
391 their cost higher than memmov_cost. */
392 2, /* GP2FP */
393 3, /* FP2GP */
394 2 /* FP2FP */
397 /* Generic costs for vector insn classes. */
398 static const struct cpu_vector_cost generic_vector_cost =
400 1, /* scalar_int_stmt_cost */
401 1, /* scalar_fp_stmt_cost */
402 1, /* scalar_load_cost */
403 1, /* scalar_store_cost */
404 1, /* vec_int_stmt_cost */
405 1, /* vec_fp_stmt_cost */
406 2, /* vec_permute_cost */
407 1, /* vec_to_scalar_cost */
408 1, /* scalar_to_vec_cost */
409 1, /* vec_align_load_cost */
410 1, /* vec_unalign_load_cost */
411 1, /* vec_unalign_store_cost */
412 1, /* vec_store_cost */
413 3, /* cond_taken_branch_cost */
414 1 /* cond_not_taken_branch_cost */
417 /* QDF24XX costs for vector insn classes. */
418 static const struct cpu_vector_cost qdf24xx_vector_cost =
420 1, /* scalar_int_stmt_cost */
421 1, /* scalar_fp_stmt_cost */
422 1, /* scalar_load_cost */
423 1, /* scalar_store_cost */
424 1, /* vec_int_stmt_cost */
425 3, /* vec_fp_stmt_cost */
426 2, /* vec_permute_cost */
427 1, /* vec_to_scalar_cost */
428 1, /* scalar_to_vec_cost */
429 1, /* vec_align_load_cost */
430 1, /* vec_unalign_load_cost */
431 1, /* vec_unalign_store_cost */
432 1, /* vec_store_cost */
433 3, /* cond_taken_branch_cost */
434 1 /* cond_not_taken_branch_cost */
437 /* ThunderX costs for vector insn classes. */
438 static const struct cpu_vector_cost thunderx_vector_cost =
440 1, /* scalar_int_stmt_cost */
441 1, /* scalar_fp_stmt_cost */
442 3, /* scalar_load_cost */
443 1, /* scalar_store_cost */
444 4, /* vec_int_stmt_cost */
445 1, /* vec_fp_stmt_cost */
446 4, /* vec_permute_cost */
447 2, /* vec_to_scalar_cost */
448 2, /* scalar_to_vec_cost */
449 3, /* vec_align_load_cost */
450 5, /* vec_unalign_load_cost */
451 5, /* vec_unalign_store_cost */
452 1, /* vec_store_cost */
453 3, /* cond_taken_branch_cost */
454 3 /* cond_not_taken_branch_cost */
457 static const struct cpu_vector_cost tsv110_vector_cost =
459 1, /* scalar_int_stmt_cost */
460 1, /* scalar_fp_stmt_cost */
461 5, /* scalar_load_cost */
462 1, /* scalar_store_cost */
463 2, /* vec_int_stmt_cost */
464 2, /* vec_fp_stmt_cost */
465 2, /* vec_permute_cost */
466 3, /* vec_to_scalar_cost */
467 2, /* scalar_to_vec_cost */
468 5, /* vec_align_load_cost */
469 5, /* vec_unalign_load_cost */
470 1, /* vec_unalign_store_cost */
471 1, /* vec_store_cost */
472 1, /* cond_taken_branch_cost */
473 1 /* cond_not_taken_branch_cost */
476 /* Cortex-A57 costs for vector insn classes. */
477 static const struct cpu_vector_cost cortexa57_vector_cost =
479 1, /* scalar_int_stmt_cost */
480 1, /* scalar_fp_stmt_cost */
481 4, /* scalar_load_cost */
482 1, /* scalar_store_cost */
483 2, /* vec_int_stmt_cost */
484 2, /* vec_fp_stmt_cost */
485 3, /* vec_permute_cost */
486 8, /* vec_to_scalar_cost */
487 8, /* scalar_to_vec_cost */
488 4, /* vec_align_load_cost */
489 4, /* vec_unalign_load_cost */
490 1, /* vec_unalign_store_cost */
491 1, /* vec_store_cost */
492 1, /* cond_taken_branch_cost */
493 1 /* cond_not_taken_branch_cost */
496 static const struct cpu_vector_cost exynosm1_vector_cost =
498 1, /* scalar_int_stmt_cost */
499 1, /* scalar_fp_stmt_cost */
500 5, /* scalar_load_cost */
501 1, /* scalar_store_cost */
502 3, /* vec_int_stmt_cost */
503 3, /* vec_fp_stmt_cost */
504 3, /* vec_permute_cost */
505 3, /* vec_to_scalar_cost */
506 3, /* scalar_to_vec_cost */
507 5, /* vec_align_load_cost */
508 5, /* vec_unalign_load_cost */
509 1, /* vec_unalign_store_cost */
510 1, /* vec_store_cost */
511 1, /* cond_taken_branch_cost */
512 1 /* cond_not_taken_branch_cost */
515 /* X-Gene 1 costs for vector insn classes. */
516 static const struct cpu_vector_cost xgene1_vector_cost =
518 1, /* scalar_int_stmt_cost */
519 1, /* scalar_fp_stmt_cost */
520 5, /* scalar_load_cost */
521 1, /* scalar_store_cost */
522 2, /* vec_int_stmt_cost */
523 2, /* vec_fp_stmt_cost */
524 2, /* vec_permute_cost */
525 4, /* vec_to_scalar_cost */
526 4, /* scalar_to_vec_cost */
527 10, /* vec_align_load_cost */
528 10, /* vec_unalign_load_cost */
529 2, /* vec_unalign_store_cost */
530 2, /* vec_store_cost */
531 2, /* cond_taken_branch_cost */
532 1 /* cond_not_taken_branch_cost */
535 /* Costs for vector insn classes for Vulcan. */
536 static const struct cpu_vector_cost thunderx2t99_vector_cost =
538 1, /* scalar_int_stmt_cost */
539 6, /* scalar_fp_stmt_cost */
540 4, /* scalar_load_cost */
541 1, /* scalar_store_cost */
542 5, /* vec_int_stmt_cost */
543 6, /* vec_fp_stmt_cost */
544 3, /* vec_permute_cost */
545 6, /* vec_to_scalar_cost */
546 5, /* scalar_to_vec_cost */
547 8, /* vec_align_load_cost */
548 8, /* vec_unalign_load_cost */
549 4, /* vec_unalign_store_cost */
550 4, /* vec_store_cost */
551 2, /* cond_taken_branch_cost */
552 1 /* cond_not_taken_branch_cost */
555 /* Generic costs for branch instructions. */
556 static const struct cpu_branch_cost generic_branch_cost =
558 1, /* Predictable. */
559 3 /* Unpredictable. */
562 /* Generic approximation modes. */
563 static const cpu_approx_modes generic_approx_modes =
565 AARCH64_APPROX_NONE, /* division */
566 AARCH64_APPROX_NONE, /* sqrt */
567 AARCH64_APPROX_NONE /* recip_sqrt */
570 /* Approximation modes for Exynos M1. */
571 static const cpu_approx_modes exynosm1_approx_modes =
573 AARCH64_APPROX_NONE, /* division */
574 AARCH64_APPROX_ALL, /* sqrt */
575 AARCH64_APPROX_ALL /* recip_sqrt */
578 /* Approximation modes for X-Gene 1. */
579 static const cpu_approx_modes xgene1_approx_modes =
581 AARCH64_APPROX_NONE, /* division */
582 AARCH64_APPROX_NONE, /* sqrt */
583 AARCH64_APPROX_ALL /* recip_sqrt */
586 /* Generic prefetch settings (which disable prefetch). */
587 static const cpu_prefetch_tune generic_prefetch_tune =
589 0, /* num_slots */
590 -1, /* l1_cache_size */
591 -1, /* l1_cache_line_size */
592 -1, /* l2_cache_size */
593 true, /* prefetch_dynamic_strides */
594 -1, /* minimum_stride */
595 -1 /* default_opt_level */
598 static const cpu_prefetch_tune exynosm1_prefetch_tune =
600 0, /* num_slots */
601 -1, /* l1_cache_size */
602 64, /* l1_cache_line_size */
603 -1, /* l2_cache_size */
604 true, /* prefetch_dynamic_strides */
605 -1, /* minimum_stride */
606 -1 /* default_opt_level */
609 static const cpu_prefetch_tune qdf24xx_prefetch_tune =
611 4, /* num_slots */
612 32, /* l1_cache_size */
613 64, /* l1_cache_line_size */
614 512, /* l2_cache_size */
615 false, /* prefetch_dynamic_strides */
616 2048, /* minimum_stride */
617 3 /* default_opt_level */
620 static const cpu_prefetch_tune thunderxt88_prefetch_tune =
622 8, /* num_slots */
623 32, /* l1_cache_size */
624 128, /* l1_cache_line_size */
625 16*1024, /* l2_cache_size */
626 true, /* prefetch_dynamic_strides */
627 -1, /* minimum_stride */
628 3 /* default_opt_level */
631 static const cpu_prefetch_tune thunderx_prefetch_tune =
633 8, /* num_slots */
634 32, /* l1_cache_size */
635 128, /* l1_cache_line_size */
636 -1, /* l2_cache_size */
637 true, /* prefetch_dynamic_strides */
638 -1, /* minimum_stride */
639 -1 /* default_opt_level */
642 static const cpu_prefetch_tune thunderx2t99_prefetch_tune =
644 8, /* num_slots */
645 32, /* l1_cache_size */
646 64, /* l1_cache_line_size */
647 256, /* l2_cache_size */
648 true, /* prefetch_dynamic_strides */
649 -1, /* minimum_stride */
650 -1 /* default_opt_level */
653 static const cpu_prefetch_tune tsv110_prefetch_tune =
655 0, /* num_slots */
656 64, /* l1_cache_size */
657 64, /* l1_cache_line_size */
658 512, /* l2_cache_size */
659 true, /* prefetch_dynamic_strides */
660 -1, /* minimum_stride */
661 -1 /* default_opt_level */
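/* These prefetch records feed the generic prefetching parameters once a
   CPU has been selected (a rough sketch of the intent rather than of the
   option machinery): num_slots seeds --param simultaneous-prefetches,
   the cache fields seed --param l1-cache-size, l1-cache-line-size and
   l2-cache-size, and default_opt_level, when not -1, is the lowest
   optimization level at which -fprefetch-loop-arrays is enabled by
   default.  So, for example,

     gcc -O2 -mcpu=tsv110 ...

   picks up a 64K L1 with 64-byte lines and a 512K L2, but does not
   implicitly enable -fprefetch-loop-arrays, since tsv110's
   default_opt_level is -1.  */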
664 static const struct tune_params generic_tunings =
666 &cortexa57_extra_costs,
667 &generic_addrcost_table,
668 &generic_regmove_cost,
669 &generic_vector_cost,
670 &generic_branch_cost,
671 &generic_approx_modes,
672 4, /* memmov_cost */
673 2, /* issue_rate */
674 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
675 "8", /* function_align. */
676 "4", /* jump_align. */
677 "8", /* loop_align. */
678 2, /* int_reassoc_width. */
679 4, /* fp_reassoc_width. */
680 1, /* vec_reassoc_width. */
681 2, /* min_div_recip_mul_sf. */
682 2, /* min_div_recip_mul_df. */
683 0, /* max_case_values. */
684 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
685 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
686 &generic_prefetch_tune
689 static const struct tune_params cortexa35_tunings =
691 &cortexa53_extra_costs,
692 &generic_addrcost_table,
693 &cortexa53_regmove_cost,
694 &generic_vector_cost,
695 &generic_branch_cost,
696 &generic_approx_modes,
697 4, /* memmov_cost */
698 1, /* issue_rate */
699 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
700 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
701 "16", /* function_align. */
702 "4", /* jump_align. */
703 "8", /* loop_align. */
704 2, /* int_reassoc_width. */
705 4, /* fp_reassoc_width. */
706 1, /* vec_reassoc_width. */
707 2, /* min_div_recip_mul_sf. */
708 2, /* min_div_recip_mul_df. */
709 0, /* max_case_values. */
710 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
711 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
712 &generic_prefetch_tune
715 static const struct tune_params cortexa53_tunings =
717 &cortexa53_extra_costs,
718 &generic_addrcost_table,
719 &cortexa53_regmove_cost,
720 &generic_vector_cost,
721 &generic_branch_cost,
722 &generic_approx_modes,
723 4, /* memmov_cost */
724 2, /* issue_rate */
725 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
726 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
727 "16", /* function_align. */
728 "4", /* jump_align. */
729 "8", /* loop_align. */
730 2, /* int_reassoc_width. */
731 4, /* fp_reassoc_width. */
732 1, /* vec_reassoc_width. */
733 2, /* min_div_recip_mul_sf. */
734 2, /* min_div_recip_mul_df. */
735 0, /* max_case_values. */
736 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
737 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
738 &generic_prefetch_tune
741 static const struct tune_params cortexa57_tunings =
743 &cortexa57_extra_costs,
744 &generic_addrcost_table,
745 &cortexa57_regmove_cost,
746 &cortexa57_vector_cost,
747 &generic_branch_cost,
748 &generic_approx_modes,
749 4, /* memmov_cost */
750 3, /* issue_rate */
751 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
752 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
753 "16", /* function_align. */
754 "4", /* jump_align. */
755 "8", /* loop_align. */
756 2, /* int_reassoc_width. */
757 4, /* fp_reassoc_width. */
758 1, /* vec_reassoc_width. */
759 2, /* min_div_recip_mul_sf. */
760 2, /* min_div_recip_mul_df. */
761 0, /* max_case_values. */
762 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
763 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS), /* tune_flags. */
764 &generic_prefetch_tune
767 static const struct tune_params cortexa72_tunings =
769 &cortexa57_extra_costs,
770 &generic_addrcost_table,
771 &cortexa57_regmove_cost,
772 &cortexa57_vector_cost,
773 &generic_branch_cost,
774 &generic_approx_modes,
775 4, /* memmov_cost */
776 3, /* issue_rate */
777 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
778 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
779 "16", /* function_align. */
780 "4", /* jump_align. */
781 "8", /* loop_align. */
782 2, /* int_reassoc_width. */
783 4, /* fp_reassoc_width. */
784 1, /* vec_reassoc_width. */
785 2, /* min_div_recip_mul_sf. */
786 2, /* min_div_recip_mul_df. */
787 0, /* max_case_values. */
788 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
789 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
790 &generic_prefetch_tune
793 static const struct tune_params cortexa73_tunings =
795 &cortexa57_extra_costs,
796 &generic_addrcost_table,
797 &cortexa57_regmove_cost,
798 &cortexa57_vector_cost,
799 &generic_branch_cost,
800 &generic_approx_modes,
801 4, /* memmov_cost. */
802 2, /* issue_rate. */
803 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
804 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
805 "16", /* function_align. */
806 "4", /* jump_align. */
807 "8", /* loop_align. */
808 2, /* int_reassoc_width. */
809 4, /* fp_reassoc_width. */
810 1, /* vec_reassoc_width. */
811 2, /* min_div_recip_mul_sf. */
812 2, /* min_div_recip_mul_df. */
813 0, /* max_case_values. */
814 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
815 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
816 &generic_prefetch_tune
821 static const struct tune_params exynosm1_tunings =
823 &exynosm1_extra_costs,
824 &exynosm1_addrcost_table,
825 &exynosm1_regmove_cost,
826 &exynosm1_vector_cost,
827 &generic_branch_cost,
828 &exynosm1_approx_modes,
829 4, /* memmov_cost */
830 3, /* issue_rate */
831 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
832 "4", /* function_align. */
833 "4", /* jump_align. */
834 "4", /* loop_align. */
835 2, /* int_reassoc_width. */
836 4, /* fp_reassoc_width. */
837 1, /* vec_reassoc_width. */
838 2, /* min_div_recip_mul_sf. */
839 2, /* min_div_recip_mul_df. */
840 48, /* max_case_values. */
841 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
842 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
843 &exynosm1_prefetch_tune
846 static const struct tune_params thunderxt88_tunings =
848 &thunderx_extra_costs,
849 &generic_addrcost_table,
850 &thunderx_regmove_cost,
851 &thunderx_vector_cost,
852 &generic_branch_cost,
853 &generic_approx_modes,
854 6, /* memmov_cost */
855 2, /* issue_rate */
856 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
857 "8", /* function_align. */
858 "8", /* jump_align. */
859 "8", /* loop_align. */
860 2, /* int_reassoc_width. */
861 4, /* fp_reassoc_width. */
862 1, /* vec_reassoc_width. */
863 2, /* min_div_recip_mul_sf. */
864 2, /* min_div_recip_mul_df. */
865 0, /* max_case_values. */
866 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
867 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW), /* tune_flags. */
868 &thunderxt88_prefetch_tune
871 static const struct tune_params thunderx_tunings =
873 &thunderx_extra_costs,
874 &generic_addrcost_table,
875 &thunderx_regmove_cost,
876 &thunderx_vector_cost,
877 &generic_branch_cost,
878 &generic_approx_modes,
879 6, /* memmov_cost */
880 2, /* issue_rate */
881 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
882 "8", /* function_align. */
883 "8", /* jump_align. */
884 "8", /* loop_align. */
885 2, /* int_reassoc_width. */
886 4, /* fp_reassoc_width. */
887 1, /* vec_reassoc_width. */
888 2, /* min_div_recip_mul_sf. */
889 2, /* min_div_recip_mul_df. */
890 0, /* max_case_values. */
891 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
892 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW
893 | AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND), /* tune_flags. */
894 &thunderx_prefetch_tune
897 static const struct tune_params tsv110_tunings =
899 &tsv110_extra_costs,
900 &tsv110_addrcost_table,
901 &tsv110_regmove_cost,
902 &tsv110_vector_cost,
903 &generic_branch_cost,
904 &generic_approx_modes,
905 4, /* memmov_cost */
906 4, /* issue_rate */
907 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_CMP_BRANCH
908 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
909 "16", /* function_align. */
910 "4", /* jump_align. */
911 "8", /* loop_align. */
912 2, /* int_reassoc_width. */
913 4, /* fp_reassoc_width. */
914 1, /* vec_reassoc_width. */
915 2, /* min_div_recip_mul_sf. */
916 2, /* min_div_recip_mul_df. */
917 0, /* max_case_values. */
918 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
919 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
920 &tsv110_prefetch_tune
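/* tsv110_tunings is wired up through aarch64-cores.def: the AARCH64_CORE
   entry for "tsv110" passes tsv110 as its COSTS argument, and the
   all_cores[] table further down pastes that into &tsv110_tunings via
   the &COSTS##_tunings token concatenation.  Schematically (the real
   flags and implementer/part IDs live in aarch64-cores.def):

     AARCH64_CORE ("tsv110", tsv110, tsv110, ARCH, FLAGS,
                   tsv110, IMP, PART, VARIANT)

   which is also what makes -mcpu=tsv110 and -mtune=tsv110 select this
   structure.  */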
923 static const struct tune_params xgene1_tunings =
925 &xgene1_extra_costs,
926 &xgene1_addrcost_table,
927 &xgene1_regmove_cost,
928 &xgene1_vector_cost,
929 &generic_branch_cost,
930 &xgene1_approx_modes,
931 6, /* memmov_cost */
932 4, /* issue_rate */
933 AARCH64_FUSE_NOTHING, /* fusible_ops */
934 "16", /* function_align. */
935 "8", /* jump_align. */
936 "16", /* loop_align. */
937 2, /* int_reassoc_width. */
938 4, /* fp_reassoc_width. */
939 1, /* vec_reassoc_width. */
940 2, /* min_div_recip_mul_sf. */
941 2, /* min_div_recip_mul_df. */
942 0, /* max_case_values. */
943 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
944 (AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS), /* tune_flags. */
945 &generic_prefetch_tune
948 static const struct tune_params qdf24xx_tunings =
950 &qdf24xx_extra_costs,
951 &qdf24xx_addrcost_table,
952 &qdf24xx_regmove_cost,
953 &qdf24xx_vector_cost,
954 &generic_branch_cost,
955 &generic_approx_modes,
956 4, /* memmov_cost */
957 4, /* issue_rate */
958 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
959 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
960 "16", /* function_align. */
961 "8", /* jump_align. */
962 "16", /* loop_align. */
963 2, /* int_reassoc_width. */
964 4, /* fp_reassoc_width. */
965 1, /* vec_reassoc_width. */
966 2, /* min_div_recip_mul_sf. */
967 2, /* min_div_recip_mul_df. */
968 0, /* max_case_values. */
969 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
970 AARCH64_EXTRA_TUNE_RENAME_LOAD_REGS, /* tune_flags. */
971 &qdf24xx_prefetch_tune
974 /* Tuning structure for the Qualcomm Saphira core. Default to falkor values
975 for now. */
976 static const struct tune_params saphira_tunings =
978 &generic_extra_costs,
979 &generic_addrcost_table,
980 &generic_regmove_cost,
981 &generic_vector_cost,
982 &generic_branch_cost,
983 &generic_approx_modes,
984 4, /* memmov_cost */
985 4, /* issue_rate */
986 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
987 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
988 "16", /* function_align. */
989 "8", /* jump_align. */
990 "16", /* loop_align. */
991 2, /* int_reassoc_width. */
992 4, /* fp_reassoc_width. */
993 1, /* vec_reassoc_width. */
994 2, /* min_div_recip_mul_sf. */
995 2, /* min_div_recip_mul_df. */
996 0, /* max_case_values. */
997 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
998 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
999 &generic_prefetch_tune
1002 static const struct tune_params thunderx2t99_tunings =
1004 &thunderx2t99_extra_costs,
1005 &thunderx2t99_addrcost_table,
1006 &thunderx2t99_regmove_cost,
1007 &thunderx2t99_vector_cost,
1008 &generic_branch_cost,
1009 &generic_approx_modes,
1010 4, /* memmov_cost. */
1011 4, /* issue_rate. */
1012 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC
1013 | AARCH64_FUSE_ALU_BRANCH), /* fusible_ops */
1014 "16", /* function_align. */
1015 "8", /* jump_align. */
1016 "16", /* loop_align. */
1017 3, /* int_reassoc_width. */
1018 2, /* fp_reassoc_width. */
1019 2, /* vec_reassoc_width. */
1020 2, /* min_div_recip_mul_sf. */
1021 2, /* min_div_recip_mul_df. */
1022 0, /* max_case_values. */
1023 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
1024 (AARCH64_EXTRA_TUNE_NONE), /* tune_flags. */
1025 &thunderx2t99_prefetch_tune
1028 /* Support for fine-grained override of the tuning structures. */
1029 struct aarch64_tuning_override_function
1031 const char* name;
1032 void (*parse_override)(const char*, struct tune_params*);
1035 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
1036 static void aarch64_parse_tune_string (const char*, struct tune_params*);
1038 static const struct aarch64_tuning_override_function
1039 aarch64_tuning_override_functions[] =
1041 { "fuse", aarch64_parse_fuse_string },
1042 { "tune", aarch64_parse_tune_string },
1043 { NULL, NULL }
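/* These parsers back the -moverride option; for instance (using names
   taken from the tables above)

     gcc -mcpu=cortex-a72 -moverride=tune=rename_fma_regs ...

   routes the "rename_fma_regs" string through aarch64_parse_tune_string
   and sets the matching AARCH64_EXTRA_TUNE_* bit in the active
   tune_params, while "fuse=..." strings go through
   aarch64_parse_fuse_string in the same way.  */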
1046 /* A processor implementing AArch64. */
1047 struct processor
1049 const char *const name;
1050 enum aarch64_processor ident;
1051 enum aarch64_processor sched_core;
1052 enum aarch64_arch arch;
1053 unsigned architecture_version;
1054 const unsigned long flags;
1055 const struct tune_params *const tune;
1058 /* Architectures implementing AArch64. */
1059 static const struct processor all_architectures[] =
1061 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
1062 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
1063 #include "aarch64-arches.def"
1064 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1067 /* Processor cores implementing AArch64. */
1068 static const struct processor all_cores[] =
1070 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
1071 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
1072 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
1073 FLAGS, &COSTS##_tunings},
1074 #include "aarch64-cores.def"
1075 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
1076 AARCH64_FL_FOR_ARCH8, &generic_tunings},
1077 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
1081 /* Target specification. These are populated by the -march, -mtune, -mcpu
1082 handling code or by target attributes. */
1083 static const struct processor *selected_arch;
1084 static const struct processor *selected_cpu;
1085 static const struct processor *selected_tune;
1087 /* The current tuning set. */
1088 struct tune_params aarch64_tune_params = generic_tunings;
1090 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
1092 /* An ISA extension in the co-processor and main instruction set space. */
1093 struct aarch64_option_extension
1095 const char *const name;
1096 const unsigned long flags_on;
1097 const unsigned long flags_off;
1100 typedef enum aarch64_cond_code
1102 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
1103 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
1104 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
1106 aarch64_cc;
1108 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
1110 /* The condition codes of the processor, and the inverse function. */
1111 static const char * const aarch64_condition_codes[] =
1113 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
1114 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
1117 /* Generate code to enable conditional branches in functions over 1 MiB. */
1118 const char *
1119 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
1120 const char * branch_format)
1122 rtx_code_label * tmp_label = gen_label_rtx ();
1123 char label_buf[256];
1124 char buffer[128];
1125 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
1126 CODE_LABEL_NUMBER (tmp_label));
1127 const char *label_ptr = targetm.strip_name_encoding (label_buf);
1128 rtx dest_label = operands[pos_label];
1129 operands[pos_label] = tmp_label;
1131 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
1132 output_asm_insn (buffer, operands);
1134 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
1135 operands[pos_label] = dest_label;
1136 output_asm_insn (buffer, operands);
1137 return "";
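/* Schematically, for a conditional branch whose target lies outside the
   +/-1 MiB conditional-branch range, the caller passes the inverted
   condition in BRANCH_FORMAT and this routine emits

       b.<inv_cond>  .Lfar_N
       b             .Ltarget
     .Lfar_N:

   so that only the unconditional branch needs the longer range (the
   label names here are purely illustrative).  */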
1140 void
1141 aarch64_err_no_fpadvsimd (machine_mode mode)
1143 if (TARGET_GENERAL_REGS_ONLY)
1144 if (FLOAT_MODE_P (mode))
1145 error ("%qs is incompatible with the use of floating-point types",
1146 "-mgeneral-regs-only");
1147 else
1148 error ("%qs is incompatible with the use of vector types",
1149 "-mgeneral-regs-only");
1150 else
1151 if (FLOAT_MODE_P (mode))
1152 error ("%qs feature modifier is incompatible with the use of"
1153 " floating-point types", "+nofp");
1154 else
1155 error ("%qs feature modifier is incompatible with the use of"
1156 " vector types", "+nofp");
1159 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
1160 The register allocator chooses POINTER_AND_FP_REGS if FP_REGS and
1161 GENERAL_REGS have the same cost - even if POINTER_AND_FP_REGS has a much
1162 higher cost. POINTER_AND_FP_REGS is also used if the cost of both FP_REGS
1163 and GENERAL_REGS is lower than the memory cost (in this case the best class
1165 is the lowest cost one). Using POINTER_AND_FP_REGS irrespective of its
1165 cost results in bad allocations with many redundant int<->FP moves which
1166 are expensive on various cores.
1167 To avoid this we don't allow POINTER_AND_FP_REGS as the allocno class, but
1168 force a decision between FP_REGS and GENERAL_REGS. We use the allocno class
1169 if it isn't POINTER_AND_FP_REGS. Similarly, use the best class if it isn't
1170 POINTER_AND_FP_REGS. Otherwise set the allocno class depending on the mode.
1171 The result of this is that it is no longer inefficient to have a higher
1172 memory move cost than the register move cost.
1175 static reg_class_t
1176 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
1177 reg_class_t best_class)
1179 machine_mode mode;
1181 if (!reg_class_subset_p (GENERAL_REGS, allocno_class)
1182 || !reg_class_subset_p (FP_REGS, allocno_class))
1183 return allocno_class;
1185 if (!reg_class_subset_p (GENERAL_REGS, best_class)
1186 || !reg_class_subset_p (FP_REGS, best_class))
1187 return best_class;
1189 mode = PSEUDO_REGNO_MODE (regno);
1190 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
1193 static unsigned int
1194 aarch64_min_divisions_for_recip_mul (machine_mode mode)
1196 if (GET_MODE_UNIT_SIZE (mode) == 4)
1197 return aarch64_tune_params.min_div_recip_mul_sf;
1198 return aarch64_tune_params.min_div_recip_mul_df;
1201 /* Return the reassociation width of treeop OPC with mode MODE. */
1202 static int
1203 aarch64_reassociation_width (unsigned opc, machine_mode mode)
1205 if (VECTOR_MODE_P (mode))
1206 return aarch64_tune_params.vec_reassoc_width;
1207 if (INTEGRAL_MODE_P (mode))
1208 return aarch64_tune_params.int_reassoc_width;
1209 /* Avoid reassociating floating point addition so we emit more FMAs. */
1210 if (FLOAT_MODE_P (mode) && opc != PLUS_EXPR)
1211 return aarch64_tune_params.fp_reassoc_width;
1212 return 1;
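/* For instance, with the cortexa57 tunings above (int 2, fp 4, vec 1) a
   reassociable chain of DFmode multiplications can be split into up to
   four independent chains, while FP additions deliberately report a
   width of 1 so that the later FMA-forming code still sees the original
   a + b * c shapes.  */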
1215 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
1216 unsigned
1217 aarch64_dbx_register_number (unsigned regno)
1219 if (GP_REGNUM_P (regno))
1220 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
1221 else if (regno == SP_REGNUM)
1222 return AARCH64_DWARF_SP;
1223 else if (FP_REGNUM_P (regno))
1224 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
1225 else if (PR_REGNUM_P (regno))
1226 return AARCH64_DWARF_P0 + regno - P0_REGNUM;
1227 else if (regno == VG_REGNUM)
1228 return AARCH64_DWARF_VG;
1230 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
1231 equivalent DWARF register. */
1232 return DWARF_FRAME_REGISTERS;
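/* Under this mapping x0-x30 keep their numbers (DWARF 0-30), the stack
   pointer becomes DWARF 31, v0-v31 map onto DWARF 64-95 and the SVE
   predicate registers onto the AARCH64_DWARF_P0 range, following the
   AArch64 DWARF ABI; e.g. v3 maps to AARCH64_DWARF_V0 + 3 = 67.  */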
1235 /* Return true if MODE is any of the Advanced SIMD structure modes. */
1236 static bool
1237 aarch64_advsimd_struct_mode_p (machine_mode mode)
1239 return (TARGET_SIMD
1240 && (mode == OImode || mode == CImode || mode == XImode));
1243 /* Return true if MODE is an SVE predicate mode. */
1244 static bool
1245 aarch64_sve_pred_mode_p (machine_mode mode)
1247 return (TARGET_SVE
1248 && (mode == VNx16BImode
1249 || mode == VNx8BImode
1250 || mode == VNx4BImode
1251 || mode == VNx2BImode));
1254 /* Three mutually-exclusive flags describing a vector or predicate type. */
1255 const unsigned int VEC_ADVSIMD = 1;
1256 const unsigned int VEC_SVE_DATA = 2;
1257 const unsigned int VEC_SVE_PRED = 4;
1258 /* Can be used in combination with VEC_ADVSIMD or VEC_SVE_DATA to indicate
1259 a structure of 2, 3 or 4 vectors. */
1260 const unsigned int VEC_STRUCT = 8;
1261 /* Useful combinations of the above. */
1262 const unsigned int VEC_ANY_SVE = VEC_SVE_DATA | VEC_SVE_PRED;
1263 const unsigned int VEC_ANY_DATA = VEC_ADVSIMD | VEC_SVE_DATA;
1265 /* Return a set of flags describing the vector properties of mode MODE.
1266 Ignore modes that are not supported by the current target. */
1267 static unsigned int
1268 aarch64_classify_vector_mode (machine_mode mode)
1270 if (aarch64_advsimd_struct_mode_p (mode))
1271 return VEC_ADVSIMD | VEC_STRUCT;
1273 if (aarch64_sve_pred_mode_p (mode))
1274 return VEC_SVE_PRED;
1276 scalar_mode inner = GET_MODE_INNER (mode);
1277 if (VECTOR_MODE_P (mode)
1278 && (inner == QImode
1279 || inner == HImode
1280 || inner == HFmode
1281 || inner == SImode
1282 || inner == SFmode
1283 || inner == DImode
1284 || inner == DFmode))
1286 if (TARGET_SVE)
1288 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
1289 return VEC_SVE_DATA;
1290 if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
1291 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
1292 || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
1293 return VEC_SVE_DATA | VEC_STRUCT;
1296 /* This includes V1DF but not V1DI (which doesn't exist). */
1297 if (TARGET_SIMD
1298 && (known_eq (GET_MODE_BITSIZE (mode), 64)
1299 || known_eq (GET_MODE_BITSIZE (mode), 128)))
1300 return VEC_ADVSIMD;
1303 return 0;
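/* Some concrete classifications (assuming the relevant +simd/+sve
   features are enabled): V16QImode -> VEC_ADVSIMD, OImode ->
   VEC_ADVSIMD | VEC_STRUCT, VNx4SImode -> VEC_SVE_DATA, VNx4BImode ->
   VEC_SVE_PRED, and a scalar mode such as DImode -> 0.  */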
1306 /* Return true if MODE is any of the data vector modes, including
1307 structure modes. */
1308 static bool
1309 aarch64_vector_data_mode_p (machine_mode mode)
1311 return aarch64_classify_vector_mode (mode) & VEC_ANY_DATA;
1314 /* Return true if MODE is an SVE data vector mode; either a single vector
1315 or a structure of vectors. */
1316 static bool
1317 aarch64_sve_data_mode_p (machine_mode mode)
1319 return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
1322 /* Implement target hook TARGET_ARRAY_MODE. */
1323 static opt_machine_mode
1324 aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
1326 if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
1327 && IN_RANGE (nelems, 2, 4))
1328 return mode_for_vector (GET_MODE_INNER (mode),
1329 GET_MODE_NUNITS (mode) * nelems);
1331 return opt_machine_mode ();
1334 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1335 static bool
1336 aarch64_array_mode_supported_p (machine_mode mode,
1337 unsigned HOST_WIDE_INT nelems)
1339 if (TARGET_SIMD
1340 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1341 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1342 && (nelems >= 2 && nelems <= 4))
1343 return true;
1345 return false;
1348 /* Return the SVE predicate mode to use for elements that have
1349 ELEM_NBYTES bytes, if such a mode exists. */
1351 opt_machine_mode
1352 aarch64_sve_pred_mode (unsigned int elem_nbytes)
1354 if (TARGET_SVE)
1356 if (elem_nbytes == 1)
1357 return VNx16BImode;
1358 if (elem_nbytes == 2)
1359 return VNx8BImode;
1360 if (elem_nbytes == 4)
1361 return VNx4BImode;
1362 if (elem_nbytes == 8)
1363 return VNx2BImode;
1365 return opt_machine_mode ();
1368 /* Implement TARGET_VECTORIZE_GET_MASK_MODE. */
1370 static opt_machine_mode
1371 aarch64_get_mask_mode (poly_uint64 nunits, poly_uint64 nbytes)
1373 if (TARGET_SVE && known_eq (nbytes, BYTES_PER_SVE_VECTOR))
1375 unsigned int elem_nbytes = vector_element_size (nbytes, nunits);
1376 machine_mode pred_mode;
1377 if (aarch64_sve_pred_mode (elem_nbytes).exists (&pred_mode))
1378 return pred_mode;
1381 return default_get_mask_mode (nunits, nbytes);
1384 /* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
1385 prefer to use the first arithmetic operand as the else value if
1386 the else value doesn't matter, since that exactly matches the SVE
1387 destructive merging form. For ternary operations we could either
1388 pick the first operand and use FMAD-like instructions or the last
1389 operand and use FMLA-like instructions; the latter seems more
1390 natural. */
1392 static tree
1393 aarch64_preferred_else_value (unsigned, tree, unsigned int nops, tree *ops)
1395 return nops == 3 ? ops[2] : ops[0];
1398 /* Implement TARGET_HARD_REGNO_NREGS. */
1400 static unsigned int
1401 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1403 /* ??? Logically we should only need to provide a value when
1404 HARD_REGNO_MODE_OK says that the combination is valid,
1405 but at the moment we need to handle all modes. Just ignore
1406 any runtime parts for registers that can't store them. */
1407 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
1408 switch (aarch64_regno_regclass (regno))
1410 case FP_REGS:
1411 case FP_LO_REGS:
1412 if (aarch64_sve_data_mode_p (mode))
1413 return exact_div (GET_MODE_SIZE (mode),
1414 BYTES_PER_SVE_VECTOR).to_constant ();
1415 return CEIL (lowest_size, UNITS_PER_VREG);
1416 case PR_REGS:
1417 case PR_LO_REGS:
1418 case PR_HI_REGS:
1419 return 1;
1420 default:
1421 return CEIL (lowest_size, UNITS_PER_WORD);
1423 gcc_unreachable ();
1426 /* Implement TARGET_HARD_REGNO_MODE_OK. */
1428 static bool
1429 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1431 if (GET_MODE_CLASS (mode) == MODE_CC)
1432 return regno == CC_REGNUM;
1434 if (regno == VG_REGNUM)
1435 /* This must have the same size as _Unwind_Word. */
1436 return mode == DImode;
1438 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1439 if (vec_flags & VEC_SVE_PRED)
1440 return PR_REGNUM_P (regno);
1442 if (PR_REGNUM_P (regno))
1443 return 0;
1445 if (regno == SP_REGNUM)
1446 /* The purpose of comparing with ptr_mode is to support the
1447 global register variable associated with the stack pointer
1448 register via the syntax of asm ("wsp") in ILP32. */
1449 return mode == Pmode || mode == ptr_mode;
1451 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1452 return mode == Pmode;
1454 if (GP_REGNUM_P (regno) && known_le (GET_MODE_SIZE (mode), 16))
1455 return true;
1457 if (FP_REGNUM_P (regno))
1459 if (vec_flags & VEC_STRUCT)
1460 return end_hard_regno (mode, regno) - 1 <= V31_REGNUM;
1461 else
1462 return !VECTOR_MODE_P (mode) || vec_flags != 0;
1465 return false;
1468 /* Implement TARGET_HARD_REGNO_CALL_PART_CLOBBERED. The callee only saves
1469 the lower 64 bits of a 128-bit register. Tell the compiler the callee
1470 clobbers the top 64 bits when restoring the bottom 64 bits. */
1472 static bool
1473 aarch64_hard_regno_call_part_clobbered (unsigned int regno, machine_mode mode)
1475 return FP_REGNUM_P (regno) && maybe_gt (GET_MODE_SIZE (mode), 8);
1478 /* Implement REGMODE_NATURAL_SIZE. */
1479 poly_uint64
1480 aarch64_regmode_natural_size (machine_mode mode)
1482 /* The natural size for SVE data modes is one SVE data vector,
1483 and similarly for predicates. We can't independently modify
1484 anything smaller than that. */
1485 /* ??? For now, only do this for variable-width SVE registers.
1486 Doing it for constant-sized registers breaks lower-subreg.c. */
1487 /* ??? And once that's fixed, we should probably have similar
1488 code for Advanced SIMD. */
1489 if (!aarch64_sve_vg.is_constant ())
1491 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
1492 if (vec_flags & VEC_SVE_PRED)
1493 return BYTES_PER_SVE_PRED;
1494 if (vec_flags & VEC_SVE_DATA)
1495 return BYTES_PER_SVE_VECTOR;
1497 return UNITS_PER_WORD;
1500 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1501 machine_mode
1502 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned,
1503 machine_mode mode)
1505 /* The predicate mode determines which bits are significant and
1506 which are "don't care". Decreasing the number of lanes would
1507 lose data while increasing the number of lanes would make bits
1508 unnecessarily significant. */
1509 if (PR_REGNUM_P (regno))
1510 return mode;
1511 if (known_ge (GET_MODE_SIZE (mode), 4))
1512 return mode;
1513 else
1514 return SImode;
1517 /* Return true if I's bits are consecutive ones from the MSB. */
1518 bool
1519 aarch64_high_bits_all_ones_p (HOST_WIDE_INT i)
1521 return exact_log2 (-i) != HOST_WIDE_INT_M1;
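/* For example 0xffffffffffff0000 qualifies (-i == 0x10000, a power of
   two), whereas 0xffff00000000ffff does not; the test accepts exactly
   those values whose set bits form one contiguous block ending at the
   MSB.  */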
1524 /* Implement TARGET_CONSTANT_ALIGNMENT. Make strings word-aligned so
1525 that strcpy from constants will be faster. */
1527 static HOST_WIDE_INT
1528 aarch64_constant_alignment (const_tree exp, HOST_WIDE_INT align)
1530 if (TREE_CODE (exp) == STRING_CST && !optimize_size)
1531 return MAX (align, BITS_PER_WORD);
1532 return align;
1535 /* Return true if calls to DECL should be treated as
1536 long-calls (i.e. called via a register). */
1537 static bool
1538 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1540 return false;
1543 /* Return true if calls to symbol-ref SYM should be treated as
1544 long-calls (i.e. called via a register). */
1545 bool
1546 aarch64_is_long_call_p (rtx sym)
1548 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1551 /* Return true if calls to symbol-ref SYM should not go through
1552 plt stubs. */
1554 bool
1555 aarch64_is_noplt_call_p (rtx sym)
1557 const_tree decl = SYMBOL_REF_DECL (sym);
1559 if (flag_pic
1560 && decl
1561 && (!flag_plt
1562 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1563 && !targetm.binds_local_p (decl))
1564 return true;
1566 return false;
1569 /* Return true if the offsets to a zero/sign-extract operation
1570 represent an expression that matches an extend operation. The
1571 operands represent the parameters from
1573 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1574 bool
1575 aarch64_is_extend_from_extract (scalar_int_mode mode, rtx mult_imm,
1576 rtx extract_imm)
1578 HOST_WIDE_INT mult_val, extract_val;
1580 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1581 return false;
1583 mult_val = INTVAL (mult_imm);
1584 extract_val = INTVAL (extract_imm);
1586 if (extract_val > 8
1587 && extract_val < GET_MODE_BITSIZE (mode)
1588 && exact_log2 (extract_val & ~7) > 0
1589 && (extract_val & 7) <= 4
1590 && mult_val == (1 << (extract_val & 7)))
1591 return true;
1593 return false;
1596 /* Emit an insn that's a simple single-set. Both the operands must be
1597 known to be valid. */
1598 inline static rtx_insn *
1599 emit_set_insn (rtx x, rtx y)
1601 return emit_insn (gen_rtx_SET (x, y));
1604 /* X and Y are two things to compare using CODE. Emit the compare insn and
1605 return the rtx for the CC register in the proper mode. */
1607 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1609 machine_mode mode = SELECT_CC_MODE (code, x, y);
1610 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1612 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1613 return cc_reg;
1616 /* Build the SYMBOL_REF for __tls_get_addr. */
1618 static GTY(()) rtx tls_get_addr_libfunc;
1621 aarch64_tls_get_addr (void)
1623 if (!tls_get_addr_libfunc)
1624 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1625 return tls_get_addr_libfunc;
1628 /* Return the TLS model to use for ADDR. */
1630 static enum tls_model
1631 tls_symbolic_operand_type (rtx addr)
1633 enum tls_model tls_kind = TLS_MODEL_NONE;
1634 if (GET_CODE (addr) == CONST)
1636 poly_int64 addend;
1637 rtx sym = strip_offset (addr, &addend);
1638 if (GET_CODE (sym) == SYMBOL_REF)
1639 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1641 else if (GET_CODE (addr) == SYMBOL_REF)
1642 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1644 return tls_kind;
1647 /* We allow lo_sum in our legitimate addresses so that combine can take
1648 care of combining addresses where necessary, but for generation
1649 purposes we generate the address as:
1651 RTL Absolute
1652 tmp = hi (symbol_ref); adrp x1, foo
1653 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1656 PIC TLS
1657 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1658 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1659 bl __tls_get_addr
1662 Load TLS symbol, depending on TLS mechanism and TLS access model.
1664 Global Dynamic - Traditional TLS:
1665 adrp tmp, :tlsgd:imm
1666 add dest, tmp, #:tlsgd_lo12:imm
1667 bl __tls_get_addr
1669 Global Dynamic - TLS Descriptors:
1670 adrp dest, :tlsdesc:imm
1671 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1672 add dest, dest, #:tlsdesc_lo12:imm
1673 blr tmp
1674 mrs tp, tpidr_el0
1675 add dest, dest, tp
1677 Initial Exec:
1678 mrs tp, tpidr_el0
1679 adrp tmp, :gottprel:imm
1680 ldr dest, [tmp, #:gottprel_lo12:imm]
1681 add dest, dest, tp
1683 Local Exec:
1684 mrs tp, tpidr_el0
1685 add t0, tp, #:tprel_hi12:imm, lsl #12
1686 add t0, t0, #:tprel_lo12_nc:imm
1689 static void
1690 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1691 enum aarch64_symbol_type type)
1693 switch (type)
1695 case SYMBOL_SMALL_ABSOLUTE:
1697 /* In ILP32, the mode of dest can be either SImode or DImode. */
1698 rtx tmp_reg = dest;
1699 machine_mode mode = GET_MODE (dest);
1701 gcc_assert (mode == Pmode || mode == ptr_mode);
1703 if (can_create_pseudo_p ())
1704 tmp_reg = gen_reg_rtx (mode);
1706 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1707 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1708 return;
1711 case SYMBOL_TINY_ABSOLUTE:
1712 emit_insn (gen_rtx_SET (dest, imm));
1713 return;
1715 case SYMBOL_SMALL_GOT_28K:
1717 machine_mode mode = GET_MODE (dest);
1718 rtx gp_rtx = pic_offset_table_rtx;
1719 rtx insn;
1720 rtx mem;
1722 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1723 here before rtl expand. Tree IVOPT will generate rtl pattern to
1724 decide rtx costs, in which case pic_offset_table_rtx is not
1725 initialized. For that case no need to generate the first adrp
1726 instruction as the final cost for global variable access is
1727 one instruction. */
1728 if (gp_rtx != NULL)
1730 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1731 use the page base as the GOT base, the first page may be wasted;
1732 in the worst case there is only 28K of space for the GOT).
1734 The generated instruction sequence for accessing a global variable is:
1737 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1739 Only one instruction is needed, but we must initialize
1740 pic_offset_table_rtx properly. We generate an initialization insn for
1741 every global access and let CSE remove the redundant ones.
1743 The final instruction sequence for multiple global variable accesses
1744 will look like the following:
1746 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1748 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1749 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1750 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1751 ... */
1753 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1754 crtl->uses_pic_offset_table = 1;
1755 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1757 if (mode != GET_MODE (gp_rtx))
1758 gp_rtx = gen_lowpart (mode, gp_rtx);
1762 if (mode == ptr_mode)
1764 if (mode == DImode)
1765 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1766 else
1767 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1769 mem = XVECEXP (SET_SRC (insn), 0, 0);
1771 else
1773 gcc_assert (mode == Pmode);
1775 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1776 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1779 /* The operand is expected to be a MEM. Whenever the related insn
1780 pattern changes, the above code which calculates MEM should also be
1781 updated. */
1782 gcc_assert (GET_CODE (mem) == MEM);
1783 MEM_READONLY_P (mem) = 1;
1784 MEM_NOTRAP_P (mem) = 1;
1785 emit_insn (insn);
1786 return;
1789 case SYMBOL_SMALL_GOT_4G:
1791 /* In ILP32, the mode of dest can be either SImode or DImode,
1792 while the got entry is always of SImode size. The mode of
1793 dest depends on how dest is used: if dest is assigned to a
1794 pointer (e.g. in the memory), it has SImode; it may have
1795 DImode if dest is dereferenced to access the memory.
1796 This is why we have to handle three different ldr_got_small
1797 patterns here (two patterns for ILP32). */
1799 rtx insn;
1800 rtx mem;
1801 rtx tmp_reg = dest;
1802 machine_mode mode = GET_MODE (dest);
1804 if (can_create_pseudo_p ())
1805 tmp_reg = gen_reg_rtx (mode);
1807 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1808 if (mode == ptr_mode)
1810 if (mode == DImode)
1811 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1812 else
1813 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1815 mem = XVECEXP (SET_SRC (insn), 0, 0);
1817 else
1819 gcc_assert (mode == Pmode);
1821 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1822 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1825 gcc_assert (GET_CODE (mem) == MEM);
1826 MEM_READONLY_P (mem) = 1;
1827 MEM_NOTRAP_P (mem) = 1;
1828 emit_insn (insn);
1829 return;
1832 case SYMBOL_SMALL_TLSGD:
1834 rtx_insn *insns;
1835 machine_mode mode = GET_MODE (dest);
1836 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1838 start_sequence ();
1839 if (TARGET_ILP32)
1840 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1841 else
1842 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1843 insns = get_insns ();
1844 end_sequence ();
1846 RTL_CONST_CALL_P (insns) = 1;
1847 emit_libcall_block (insns, dest, result, imm);
1848 return;
1851 case SYMBOL_SMALL_TLSDESC:
1853 machine_mode mode = GET_MODE (dest);
1854 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1855 rtx tp;
1857 gcc_assert (mode == Pmode || mode == ptr_mode);
1859 /* In ILP32, the got entry is always of SImode size. Unlike
1860 small GOT, the dest is fixed at reg 0. */
1861 if (TARGET_ILP32)
1862 emit_insn (gen_tlsdesc_small_si (imm));
1863 else
1864 emit_insn (gen_tlsdesc_small_di (imm));
1865 tp = aarch64_load_tp (NULL);
1867 if (mode != Pmode)
1868 tp = gen_lowpart (mode, tp);
1870 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1871 if (REG_P (dest))
1872 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1873 return;
1876 case SYMBOL_SMALL_TLSIE:
1878 /* In ILP32, the mode of dest can be either SImode or DImode,
1879 while the got entry is always of SImode size. The mode of
1880 dest depends on how dest is used: if dest is assigned to a
1881 pointer (e.g. in the memory), it has SImode; it may have
1882 DImode if dest is dereferenced to access the memory.
1883 This is why we have to handle three different tlsie_small
1884 patterns here (two patterns for ILP32). */
1885 machine_mode mode = GET_MODE (dest);
1886 rtx tmp_reg = gen_reg_rtx (mode);
1887 rtx tp = aarch64_load_tp (NULL);
1889 if (mode == ptr_mode)
1891 if (mode == DImode)
1892 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1893 else
1895 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1896 tp = gen_lowpart (mode, tp);
1899 else
1901 gcc_assert (mode == Pmode);
1902 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1905 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1906 if (REG_P (dest))
1907 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1908 return;
1911 case SYMBOL_TLSLE12:
1912 case SYMBOL_TLSLE24:
1913 case SYMBOL_TLSLE32:
1914 case SYMBOL_TLSLE48:
1916 machine_mode mode = GET_MODE (dest);
1917 rtx tp = aarch64_load_tp (NULL);
1919 if (mode != Pmode)
1920 tp = gen_lowpart (mode, tp);
1922 switch (type)
1924 case SYMBOL_TLSLE12:
1925 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1926 (dest, tp, imm));
1927 break;
1928 case SYMBOL_TLSLE24:
1929 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1930 (dest, tp, imm));
1931 break;
1932 case SYMBOL_TLSLE32:
1933 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1934 (dest, imm));
1935 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1936 (dest, dest, tp));
1937 break;
1938 case SYMBOL_TLSLE48:
1939 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1940 (dest, imm));
1941 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1942 (dest, dest, tp));
1943 break;
1944 default:
1945 gcc_unreachable ();
1948 if (REG_P (dest))
1949 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1950 return;
1953 case SYMBOL_TINY_GOT:
1954 emit_insn (gen_ldr_got_tiny (dest, imm));
1955 return;
1957 case SYMBOL_TINY_TLSIE:
1959 machine_mode mode = GET_MODE (dest);
1960 rtx tp = aarch64_load_tp (NULL);
1962 if (mode == ptr_mode)
1964 if (mode == DImode)
1965 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1966 else
1968 tp = gen_lowpart (mode, tp);
1969 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1972 else
1974 gcc_assert (mode == Pmode);
1975 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1978 if (REG_P (dest))
1979 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1980 return;
1983 default:
1984 gcc_unreachable ();
1988 /* Emit a move from SRC to DEST. Assume that the move expanders can
1989 handle all moves if !can_create_pseudo_p (). The distinction is
1990 important because, unlike emit_move_insn, the move expanders know
1991 how to force Pmode objects into the constant pool even when the
1992 constant pool address is not itself legitimate. */
1993 static rtx
1994 aarch64_emit_move (rtx dest, rtx src)
1996 return (can_create_pseudo_p ()
1997 ? emit_move_insn (dest, src)
1998 : emit_move_insn_1 (dest, src));
2001 /* Apply UNOPTAB to OP and store the result in DEST. */
2003 static void
2004 aarch64_emit_unop (rtx dest, optab unoptab, rtx op)
2006 rtx tmp = expand_unop (GET_MODE (dest), unoptab, op, dest, 0);
2007 if (dest != tmp)
2008 emit_move_insn (dest, tmp);
2011 /* Apply BINOPTAB to OP0 and OP1 and store the result in DEST. */
2013 static void
2014 aarch64_emit_binop (rtx dest, optab binoptab, rtx op0, rtx op1)
2016 rtx tmp = expand_binop (GET_MODE (dest), binoptab, op0, op1, dest, 0,
2017 OPTAB_DIRECT);
2018 if (dest != tmp)
2019 emit_move_insn (dest, tmp);
2022 /* Split a 128-bit move operation into two 64-bit move operations,
2023 taking care to handle partial overlap of register to register
2024 copies. Special cases are needed when moving between GP regs and
2025 FP regs. SRC can be a register, constant or memory; DST a register
2026 or memory. If either operand is memory it must not have any side
2027 effects. */
2028 void
2029 aarch64_split_128bit_move (rtx dst, rtx src)
2031 rtx dst_lo, dst_hi;
2032 rtx src_lo, src_hi;
2034 machine_mode mode = GET_MODE (dst);
2036 gcc_assert (mode == TImode || mode == TFmode);
2037 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
2038 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
2040 if (REG_P (dst) && REG_P (src))
2042 int src_regno = REGNO (src);
2043 int dst_regno = REGNO (dst);
2045 /* Handle FP <-> GP regs. */
2046 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
2048 src_lo = gen_lowpart (word_mode, src);
2049 src_hi = gen_highpart (word_mode, src);
2051 emit_insn (gen_aarch64_movlow_di (mode, dst, src_lo));
2052 emit_insn (gen_aarch64_movhigh_di (mode, dst, src_hi));
2053 return;
2055 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
2057 dst_lo = gen_lowpart (word_mode, dst);
2058 dst_hi = gen_highpart (word_mode, dst);
2060 emit_insn (gen_aarch64_movdi_low (mode, dst_lo, src));
2061 emit_insn (gen_aarch64_movdi_high (mode, dst_hi, src));
2062 return;
2066 dst_lo = gen_lowpart (word_mode, dst);
2067 dst_hi = gen_highpart (word_mode, dst);
2068 src_lo = gen_lowpart (word_mode, src);
2069 src_hi = gen_highpart_mode (word_mode, mode, src);
2071 /* At most one pairing may overlap. */
2072 if (reg_overlap_mentioned_p (dst_lo, src_hi))
2074 aarch64_emit_move (dst_hi, src_hi);
2075 aarch64_emit_move (dst_lo, src_lo);
2077 else
2079 aarch64_emit_move (dst_lo, src_lo);
2080 aarch64_emit_move (dst_hi, src_hi);
2084 bool
2085 aarch64_split_128bit_move_p (rtx dst, rtx src)
2087 return (! REG_P (src)
2088 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
2091 /* Split a complex SIMD combine. */
2093 void
2094 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
2096 machine_mode src_mode = GET_MODE (src1);
2097 machine_mode dst_mode = GET_MODE (dst);
2099 gcc_assert (VECTOR_MODE_P (dst_mode));
2100 gcc_assert (register_operand (dst, dst_mode)
2101 && register_operand (src1, src_mode)
2102 && register_operand (src2, src_mode));
2104 emit_insn (gen_aarch64_simd_combine (src_mode, dst, src1, src2));
2105 return;
2108 /* Split a complex SIMD move. */
2110 void
2111 aarch64_split_simd_move (rtx dst, rtx src)
2113 machine_mode src_mode = GET_MODE (src);
2114 machine_mode dst_mode = GET_MODE (dst);
2116 gcc_assert (VECTOR_MODE_P (dst_mode));
2118 if (REG_P (dst) && REG_P (src))
2120 gcc_assert (VECTOR_MODE_P (src_mode));
2121 emit_insn (gen_aarch64_split_simd_mov (src_mode, dst, src));
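/* Return true if X, in mode XMODE, is equal to the zero-extension of Y
   from mode YMODE.  */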
2125 bool
2126 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
2127 machine_mode ymode, rtx y)
2129 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
2130 gcc_assert (r != NULL);
2131 return rtx_equal_p (x, r);
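/* Force VALUE into a register of mode MODE, creating a new pseudo if
   register allocation is still allowed, otherwise using X (which must
   then be nonnull).  */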
2135 static rtx
2136 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
2138 if (can_create_pseudo_p ())
2139 return force_reg (mode, value);
2140 else
2142 gcc_assert (x);
2143 aarch64_emit_move (x, value);
2144 return x;
2148 /* Return true if we can move VALUE into a register using a single
2149 CNT[BHWD] instruction. */
2151 static bool
2152 aarch64_sve_cnt_immediate_p (poly_int64 value)
2154 HOST_WIDE_INT factor = value.coeffs[0];
2155 /* The coefficient must be [1, 16] * {2, 4, 8, 16}. */
2156 return (value.coeffs[1] == factor
2157 && IN_RANGE (factor, 2, 16 * 16)
2158 && (factor & 1) == 0
2159 && factor <= 16 * (factor & -factor));
2162 /* Likewise for rtx X. */
2164 bool
2165 aarch64_sve_cnt_immediate_p (rtx x)
2167 poly_int64 value;
2168 return poly_int_rtx_p (x, &value) && aarch64_sve_cnt_immediate_p (value);
2171 /* Return the asm string for an instruction with a CNT-like vector size
2172 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2173 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2174 first part of the operands template (the part that comes before the
2175 vector size itself). FACTOR is the number of quadwords.
2176 NELTS_PER_VQ, if nonzero, is the number of elements in each quadword.
2177 If it is zero, we can use any element size. */
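/* For example, with PREFIX "cnt" and OPERANDS "%x0", a FACTOR of 2 with
   NELTS_PER_VQ of 0 gives "cntd\t%x0", while a FACTOR of 32 with
   NELTS_PER_VQ of 8 gives "cnth\t%x0, all, mul #4".  */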
2179 static char *
2180 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2181 unsigned int factor,
2182 unsigned int nelts_per_vq)
2184 static char buffer[sizeof ("sqincd\t%x0, %w0, all, mul #16")];
2186 if (nelts_per_vq == 0)
2187 /* There is some overlap in the ranges of the four CNT instructions.
2188 Here we always use the smallest possible element size, so that the
2189 multiplier is 1 wherever possible. */
2190 nelts_per_vq = factor & -factor;
2191 int shift = std::min (exact_log2 (nelts_per_vq), 4);
2192 gcc_assert (IN_RANGE (shift, 1, 4));
2193 char suffix = "dwhb"[shift - 1];
2195 factor >>= shift;
2196 unsigned int written;
2197 if (factor == 1)
2198 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s",
2199 prefix, suffix, operands);
2200 else
2201 written = snprintf (buffer, sizeof (buffer), "%s%c\t%s, all, mul #%d",
2202 prefix, suffix, operands, factor);
2203 gcc_assert (written < sizeof (buffer));
2204 return buffer;
2207 /* Return the asm string for an instruction with a CNT-like vector size
2208 operand (a vector pattern followed by a multiplier in the range [1, 16]).
2209 PREFIX is the mnemonic without the size suffix and OPERANDS is the
2210 first part of the operands template (the part that comes before the
2211 vector size itself). X is the value of the vector size operand,
2212 as a polynomial integer rtx. */
2214 char *
2215 aarch64_output_sve_cnt_immediate (const char *prefix, const char *operands,
2216 rtx x)
2218 poly_int64 value = rtx_to_poly_int64 (x);
2219 gcc_assert (aarch64_sve_cnt_immediate_p (value));
2220 return aarch64_output_sve_cnt_immediate (prefix, operands,
2221 value.coeffs[1], 0);
2224 /* Return true if we can add VALUE to a register using a single ADDVL
2225 or ADDPL instruction. */
2227 static bool
2228 aarch64_sve_addvl_addpl_immediate_p (poly_int64 value)
2230 HOST_WIDE_INT factor = value.coeffs[0];
2231 if (factor == 0 || value.coeffs[1] != factor)
2232 return false;
2233 /* FACTOR counts VG / 2, so a value of 2 is one predicate width
2234 and a value of 16 is one vector width. */
2235 return (((factor & 15) == 0 && IN_RANGE (factor, -32 * 16, 31 * 16))
2236 || ((factor & 1) == 0 && IN_RANGE (factor, -32 * 2, 31 * 2)));
2239 /* Likewise for rtx X. */
2241 bool
2242 aarch64_sve_addvl_addpl_immediate_p (rtx x)
2244 poly_int64 value;
2245 return (poly_int_rtx_p (x, &value)
2246 && aarch64_sve_addvl_addpl_immediate_p (value));
2249 /* Return the asm string for adding ADDVL or ADDPL immediate X to operand 1
2250 and storing the result in operand 0. */
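/* For example, an offset of one vector of bytes (poly_int64 (16, 16))
   gives "addvl\t%x0, %x1, #1" and an offset of one predicate width
   (poly_int64 (2, 2)) gives "addpl\t%x0, %x1, #1"; when DEST equals BASE
   and is a GP register, the INC/DEC forms such as "incb\t%x0" are used
   instead.  */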
2252 char *
2253 aarch64_output_sve_addvl_addpl (rtx dest, rtx base, rtx offset)
2255 static char buffer[sizeof ("addpl\t%x0, %x1, #-") + 3 * sizeof (int)];
2256 poly_int64 offset_value = rtx_to_poly_int64 (offset);
2257 gcc_assert (aarch64_sve_addvl_addpl_immediate_p (offset_value));
2259 /* Use INC or DEC if possible. */
2260 if (rtx_equal_p (dest, base) && GP_REGNUM_P (REGNO (dest)))
2262 if (aarch64_sve_cnt_immediate_p (offset_value))
2263 return aarch64_output_sve_cnt_immediate ("inc", "%x0",
2264 offset_value.coeffs[1], 0);
2265 if (aarch64_sve_cnt_immediate_p (-offset_value))
2266 return aarch64_output_sve_cnt_immediate ("dec", "%x0",
2267 -offset_value.coeffs[1], 0);
2270 int factor = offset_value.coeffs[1];
2271 if ((factor & 15) == 0)
2272 snprintf (buffer, sizeof (buffer), "addvl\t%%x0, %%x1, #%d", factor / 16);
2273 else
2274 snprintf (buffer, sizeof (buffer), "addpl\t%%x0, %%x1, #%d", factor / 2);
2275 return buffer;
2278 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2279 instruction. If it is, store the number of elements in each vector
2280 quadword in *NELTS_PER_VQ_OUT (if nonnull) and store the multiplication
2281 factor in *FACTOR_OUT (if nonnull). */
2283 bool
2284 aarch64_sve_inc_dec_immediate_p (rtx x, int *factor_out,
2285 unsigned int *nelts_per_vq_out)
2287 rtx elt;
2288 poly_int64 value;
2290 if (!const_vec_duplicate_p (x, &elt)
2291 || !poly_int_rtx_p (elt, &value))
2292 return false;
2294 unsigned int nelts_per_vq = 128 / GET_MODE_UNIT_BITSIZE (GET_MODE (x));
2295 if (nelts_per_vq != 8 && nelts_per_vq != 4 && nelts_per_vq != 2)
2296 /* There's no vector INCB. */
2297 return false;
2299 HOST_WIDE_INT factor = value.coeffs[0];
2300 if (value.coeffs[1] != factor)
2301 return false;
2303 /* The coefficient must be [1, 16] * NELTS_PER_VQ. */
2304 if ((factor % nelts_per_vq) != 0
2305 || !IN_RANGE (abs (factor), nelts_per_vq, 16 * nelts_per_vq))
2306 return false;
2308 if (factor_out)
2309 *factor_out = factor;
2310 if (nelts_per_vq_out)
2311 *nelts_per_vq_out = nelts_per_vq;
2312 return true;
2315 /* Return true if X is a valid immediate for an SVE vector INC or DEC
2316 instruction. */
2318 bool
2319 aarch64_sve_inc_dec_immediate_p (rtx x)
2321 return aarch64_sve_inc_dec_immediate_p (x, NULL, NULL);
2324 /* Return the asm template for an SVE vector INC or DEC instruction.
2325 OPERANDS gives the operands before the vector count and X is the
2326 value of the vector count operand itself. */
2328 char *
2329 aarch64_output_sve_inc_dec_immediate (const char *operands, rtx x)
2331 int factor;
2332 unsigned int nelts_per_vq;
2333 if (!aarch64_sve_inc_dec_immediate_p (x, &factor, &nelts_per_vq))
2334 gcc_unreachable ();
2335 if (factor < 0)
2336 return aarch64_output_sve_cnt_immediate ("dec", operands, -factor,
2337 nelts_per_vq);
2338 else
2339 return aarch64_output_sve_cnt_immediate ("inc", operands, factor,
2340 nelts_per_vq);
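/* Return the number of instructions (1 to 4) required to move immediate
   IMM of mode MODE into DEST, emitting them if GENERATE is true.  For
   example, the DImode constant 0x0000cafe00001234 takes two instructions:
   a MOV of 0x1234 followed by a MOVK inserting 0xcafe at bit 32.  */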
2343 static int
2344 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
2345 scalar_int_mode mode)
2347 int i;
2348 unsigned HOST_WIDE_INT val, val2, mask;
2349 int one_match, zero_match;
2350 int num_insns;
2352 val = INTVAL (imm);
2354 if (aarch64_move_imm (val, mode))
2356 if (generate)
2357 emit_insn (gen_rtx_SET (dest, imm));
2358 return 1;
2361 /* Check whether the low 32 bits form a valid SImode move immediate and
2362 at least one 16-bit half of the upper 32 bits is zero; if so, the
2363 constant needs at most a 32-bit move plus a single MOVK. */
2364 val2 = val & 0xffffffff;
2365 if (mode == DImode
2366 && aarch64_move_imm (val2, SImode)
2367 && (((val >> 32) & 0xffff) == 0 || (val >> 48) == 0))
2369 if (generate)
2370 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2372 /* Check whether a second instruction is needed, i.e. whether any of
2373 the upper 32 bits of the original DImode value are set. */
2374 if (val == val2)
2375 return 1;
2377 i = (val >> 48) ? 48 : 32;
2379 if (generate)
2380 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2381 GEN_INT ((val >> i) & 0xffff)));
2383 return 2;
2386 if ((val >> 32) == 0 || mode == SImode)
2388 if (generate)
2390 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
2391 if (mode == SImode)
2392 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
2393 GEN_INT ((val >> 16) & 0xffff)));
2394 else
2395 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
2396 GEN_INT ((val >> 16) & 0xffff)));
2398 return 2;
2401 /* Remaining cases are all for DImode. */
2403 mask = 0xffff;
2404 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
2405 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
2406 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
2407 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
2409 if (zero_match != 2 && one_match != 2)
2411 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
2412 For a 64-bit bitmask try whether changing 16 bits to all ones or
2413 zeroes creates a valid bitmask. To check any repeated bitmask,
2414 try using 16 bits from the other 32-bit half of val. */
2416 for (i = 0; i < 64; i += 16, mask <<= 16)
2418 val2 = val & ~mask;
2419 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2420 break;
2421 val2 = val | mask;
2422 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2423 break;
2424 val2 = val2 & ~mask;
2425 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
2426 if (val2 != val && aarch64_bitmask_imm (val2, mode))
2427 break;
2429 if (i != 64)
2431 if (generate)
2433 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
2434 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2435 GEN_INT ((val >> i) & 0xffff)));
2437 return 2;
2441 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
2442 are emitted by the initial mov. If one_match > zero_match, skip set bits,
2443 otherwise skip zero bits. */
2445 num_insns = 1;
2446 mask = 0xffff;
2447 val2 = one_match > zero_match ? ~val : val;
2448 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
2450 if (generate)
2451 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
2452 ? (val | ~(mask << i))
2453 : (val & (mask << i)))));
2454 for (i += 16; i < 64; i += 16)
2456 if ((val2 & (mask << i)) == 0)
2457 continue;
2458 if (generate)
2459 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
2460 GEN_INT ((val >> i) & 0xffff)));
2461 num_insns ++;
2464 return num_insns;
2467 /* Return whether IMM is a 128-bit immediate which is simple enough to
2468 expand inline. */
2469 bool
2470 aarch64_mov128_immediate (rtx imm)
2472 if (GET_CODE (imm) == CONST_INT)
2473 return true;
2475 gcc_assert (CONST_WIDE_INT_NUNITS (imm) == 2);
2477 rtx lo = GEN_INT (CONST_WIDE_INT_ELT (imm, 0));
2478 rtx hi = GEN_INT (CONST_WIDE_INT_ELT (imm, 1));
2480 return aarch64_internal_mov_immediate (NULL_RTX, lo, false, DImode)
2481 + aarch64_internal_mov_immediate (NULL_RTX, hi, false, DImode) <= 4;
2485 /* Return the number of temporary registers that aarch64_add_offset_1
2486 would need to add OFFSET to a register. */
2488 static unsigned int
2489 aarch64_add_offset_1_temporaries (HOST_WIDE_INT offset)
2491 return abs_hwi (offset) < 0x1000000 ? 0 : 1;
2494 /* A subroutine of aarch64_add_offset. Set DEST to SRC + OFFSET for
2495 a non-polynomial OFFSET. MODE is the mode of the addition.
2496 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2497 be set and CFA adjustments added to the generated instructions.
2499 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2500 temporary if register allocation is already complete. This temporary
2501 register may overlap DEST but must not overlap SRC. If TEMP1 is known
2502 to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
2503 the immediate again.
2505 Since this function may be used to adjust the stack pointer, we must
2506 ensure that it cannot cause transient stack deallocation (for example
2507 by first incrementing SP and then decrementing when adjusting by a
2508 large immediate). */
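/* For example, an OFFSET of 0x123456, which cannot be handled by a single
   ADD immediate or a single move immediate, is split into
   "add dest, src, #0x456" followed by "add dest, dest, #0x123000".  */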
2510 static void
2511 aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
2512 rtx src, HOST_WIDE_INT offset, rtx temp1,
2513 bool frame_related_p, bool emit_move_imm)
2515 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2516 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2518 HOST_WIDE_INT moffset = abs_hwi (offset);
2519 rtx_insn *insn;
2521 if (!moffset)
2523 if (!rtx_equal_p (dest, src))
2525 insn = emit_insn (gen_rtx_SET (dest, src));
2526 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2528 return;
2531 /* Single instruction adjustment. */
2532 if (aarch64_uimm12_shift (moffset))
2534 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
2535 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2536 return;
2539 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
2540 and either:
2542 a) the offset cannot be loaded by a 16-bit move or
2543 b) there is no spare register into which we can move it. */
2544 if (moffset < 0x1000000
2545 && ((!temp1 && !can_create_pseudo_p ())
2546 || !aarch64_move_imm (moffset, mode)))
2548 HOST_WIDE_INT low_off = moffset & 0xfff;
2550 low_off = offset < 0 ? -low_off : low_off;
2551 insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
2552 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2553 insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
2554 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2555 return;
2558 /* Emit a move immediate if required and an addition/subtraction. */
2559 if (emit_move_imm)
2561 gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
2562 temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
2564 insn = emit_insn (offset < 0
2565 ? gen_sub3_insn (dest, src, temp1)
2566 : gen_add3_insn (dest, src, temp1));
2567 if (frame_related_p)
2569 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2570 rtx adj = plus_constant (mode, src, offset);
2571 add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
2575 /* Return the number of temporary registers that aarch64_add_offset
2576 would need to move OFFSET into a register or add OFFSET to a register;
2577 ADD_P is true if we want the latter rather than the former. */
2579 static unsigned int
2580 aarch64_offset_temporaries (bool add_p, poly_int64 offset)
2582 /* This follows the same structure as aarch64_add_offset. */
2583 if (add_p && aarch64_sve_addvl_addpl_immediate_p (offset))
2584 return 0;
2586 unsigned int count = 0;
2587 HOST_WIDE_INT factor = offset.coeffs[1];
2588 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2589 poly_int64 poly_offset (factor, factor);
2590 if (add_p && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2591 /* Need one register for the ADDVL/ADDPL result. */
2592 count += 1;
2593 else if (factor != 0)
2595 factor = abs (factor);
2596 if (factor > 16 * (factor & -factor))
2597 /* Need one register for the CNT result and one for the multiplication
2598 factor. If necessary, the second temporary can be reused for the
2599 constant part of the offset. */
2600 return 2;
2601 /* Need one register for the CNT result (which might then
2602 be shifted). */
2603 count += 1;
2605 return count + aarch64_add_offset_1_temporaries (constant);
2608 /* If X can be represented as a poly_int64, return the number
2609 of temporaries that are required to add it to a register.
2610 Return -1 otherwise. */
2612 int
2613 aarch64_add_offset_temporaries (rtx x)
2615 poly_int64 offset;
2616 if (!poly_int_rtx_p (x, &offset))
2617 return -1;
2618 return aarch64_offset_temporaries (true, offset);
2621 /* Set DEST to SRC + OFFSET. MODE is the mode of the addition.
2622 FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
2623 be set and CFA adjustments added to the generated instructions.
2625 TEMP1, if nonnull, is a register of mode MODE that can be used as a
2626 temporary if register allocation is already complete. This temporary
2627 register may overlap DEST if !FRAME_RELATED_P but must not overlap SRC.
2628 If TEMP1 is known to hold abs (OFFSET), EMIT_MOVE_IMM can be set to
2629 false to avoid emitting the immediate again.
2631 TEMP2, if nonnull, is a second temporary register that doesn't
2632 overlap either DEST or SRC.
2634 Since this function may be used to adjust the stack pointer, we must
2635 ensure that it cannot cause transient stack deallocation (for example
2636 by first incrementing SP and then decrementing when adjusting by a
2637 large immediate). */
2639 static void
2640 aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2641 poly_int64 offset, rtx temp1, rtx temp2,
2642 bool frame_related_p, bool emit_move_imm = true)
2644 gcc_assert (emit_move_imm || temp1 != NULL_RTX);
2645 gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
2646 gcc_assert (temp1 == NULL_RTX
2647 || !frame_related_p
2648 || !reg_overlap_mentioned_p (temp1, dest));
2649 gcc_assert (temp2 == NULL_RTX || !reg_overlap_mentioned_p (dest, temp2));
2651 /* Try using ADDVL or ADDPL to add the whole value. */
2652 if (src != const0_rtx && aarch64_sve_addvl_addpl_immediate_p (offset))
2654 rtx offset_rtx = gen_int_mode (offset, mode);
2655 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2656 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2657 return;
2660 /* Coefficient 1 is multiplied by the number of 128-bit blocks in an
2661 SVE vector register, over and above the minimum size of 128 bits.
2662 This is equivalent to half the value returned by CNTD with a
2663 vector shape of ALL. */
2664 HOST_WIDE_INT factor = offset.coeffs[1];
2665 HOST_WIDE_INT constant = offset.coeffs[0] - factor;
2667 /* Try using ADDVL or ADDPL to add the VG-based part. */
2668 poly_int64 poly_offset (factor, factor);
2669 if (src != const0_rtx
2670 && aarch64_sve_addvl_addpl_immediate_p (poly_offset))
2672 rtx offset_rtx = gen_int_mode (poly_offset, mode);
2673 if (frame_related_p)
2675 rtx_insn *insn = emit_insn (gen_add3_insn (dest, src, offset_rtx));
2676 RTX_FRAME_RELATED_P (insn) = true;
2677 src = dest;
2679 else
2681 rtx addr = gen_rtx_PLUS (mode, src, offset_rtx);
2682 src = aarch64_force_temporary (mode, temp1, addr);
2683 temp1 = temp2;
2684 temp2 = NULL_RTX;
2687 /* Otherwise use a CNT-based sequence. */
2688 else if (factor != 0)
2690 /* Use a subtraction if we have a negative factor. */
2691 rtx_code code = PLUS;
2692 if (factor < 0)
2694 factor = -factor;
2695 code = MINUS;
2698 /* Calculate CNTD * FACTOR / 2. First try to fold the division
2699 into the multiplication. */
2700 rtx val;
2701 int shift = 0;
2702 if (factor & 1)
2703 /* Use a right shift by 1. */
2704 shift = -1;
2705 else
2706 factor /= 2;
2707 HOST_WIDE_INT low_bit = factor & -factor;
2708 if (factor <= 16 * low_bit)
2710 if (factor > 16 * 8)
2712 /* "CNTB Xn, ALL, MUL #FACTOR" is out of range, so calculate
2713 the value with the minimum multiplier and shift it into
2714 position. */
2715 int extra_shift = exact_log2 (low_bit);
2716 shift += extra_shift;
2717 factor >>= extra_shift;
2719 val = gen_int_mode (poly_int64 (factor * 2, factor * 2), mode);
2721 else
2723 /* Use CNTD, then multiply it by FACTOR. */
2724 val = gen_int_mode (poly_int64 (2, 2), mode);
2725 val = aarch64_force_temporary (mode, temp1, val);
2727 /* Go back to using a negative multiplication factor if we have
2728 no register from which to subtract. */
2729 if (code == MINUS && src == const0_rtx)
2731 factor = -factor;
2732 code = PLUS;
2734 rtx coeff1 = gen_int_mode (factor, mode);
2735 coeff1 = aarch64_force_temporary (mode, temp2, coeff1);
2736 val = gen_rtx_MULT (mode, val, coeff1);
2739 if (shift > 0)
2741 /* Multiply by 1 << SHIFT. */
2742 val = aarch64_force_temporary (mode, temp1, val);
2743 val = gen_rtx_ASHIFT (mode, val, GEN_INT (shift));
2745 else if (shift == -1)
2747 /* Divide by 2. */
2748 val = aarch64_force_temporary (mode, temp1, val);
2749 val = gen_rtx_ASHIFTRT (mode, val, const1_rtx);
2752 /* Calculate SRC +/- CNTD * FACTOR / 2. */
2753 if (src != const0_rtx)
2755 val = aarch64_force_temporary (mode, temp1, val);
2756 val = gen_rtx_fmt_ee (code, mode, src, val);
2758 else if (code == MINUS)
2760 val = aarch64_force_temporary (mode, temp1, val);
2761 val = gen_rtx_NEG (mode, val);
2764 if (constant == 0 || frame_related_p)
2766 rtx_insn *insn = emit_insn (gen_rtx_SET (dest, val));
2767 if (frame_related_p)
2769 RTX_FRAME_RELATED_P (insn) = true;
2770 add_reg_note (insn, REG_CFA_ADJUST_CFA,
2771 gen_rtx_SET (dest, plus_constant (Pmode, src,
2772 poly_offset)));
2774 src = dest;
2775 if (constant == 0)
2776 return;
2778 else
2780 src = aarch64_force_temporary (mode, temp1, val);
2781 temp1 = temp2;
2782 temp2 = NULL_RTX;
2785 emit_move_imm = true;
2788 aarch64_add_offset_1 (mode, dest, src, constant, temp1,
2789 frame_related_p, emit_move_imm);
2792 /* Like aarch64_add_offset, but the offset is given as an rtx rather
2793 than a poly_int64. */
2795 void
2796 aarch64_split_add_offset (scalar_int_mode mode, rtx dest, rtx src,
2797 rtx offset_rtx, rtx temp1, rtx temp2)
2799 aarch64_add_offset (mode, dest, src, rtx_to_poly_int64 (offset_rtx),
2800 temp1, temp2, false);
2803 /* Add DELTA to the stack pointer, marking the instructions frame-related.
2804 TEMP1 is available as a temporary if nonnull. EMIT_MOVE_IMM is false
2805 if TEMP1 already contains abs (DELTA). */
2807 static inline void
2808 aarch64_add_sp (rtx temp1, rtx temp2, poly_int64 delta, bool emit_move_imm)
2810 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
2811 temp1, temp2, true, emit_move_imm);
2814 /* Subtract DELTA from the stack pointer, marking the instructions
2815 frame-related if FRAME_RELATED_P. TEMP1 is available as a temporary
2816 if nonnull. */
2818 static inline void
2819 aarch64_sub_sp (rtx temp1, rtx temp2, poly_int64 delta, bool frame_related_p)
2821 aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
2822 temp1, temp2, frame_related_p);
2825 /* Set DEST to (vec_series BASE STEP). */
2827 static void
2828 aarch64_expand_vec_series (rtx dest, rtx base, rtx step)
2830 machine_mode mode = GET_MODE (dest);
2831 scalar_mode inner = GET_MODE_INNER (mode);
2833 /* Each operand can be a register or an immediate in the range [-16, 15]. */
2834 if (!aarch64_sve_index_immediate_p (base))
2835 base = force_reg (inner, base);
2836 if (!aarch64_sve_index_immediate_p (step))
2837 step = force_reg (inner, step);
2839 emit_set_insn (dest, gen_rtx_VEC_SERIES (mode, base, step));
2842 /* Try to duplicate SRC into SVE register DEST, given that SRC is an
2843 integer of mode INT_MODE. Return true on success. */
2845 static bool
2846 aarch64_expand_sve_widened_duplicate (rtx dest, scalar_int_mode src_mode,
2847 rtx src)
2849 /* If the constant is smaller than 128 bits, we can do the move
2850 using a vector of SRC_MODEs. */
2851 if (src_mode != TImode)
2853 poly_uint64 count = exact_div (GET_MODE_SIZE (GET_MODE (dest)),
2854 GET_MODE_SIZE (src_mode));
2855 machine_mode dup_mode = mode_for_vector (src_mode, count).require ();
2856 emit_move_insn (gen_lowpart (dup_mode, dest),
2857 gen_const_vec_duplicate (dup_mode, src));
2858 return true;
2861 /* Use LD1RQ[BHWD] to load the 128 bits from memory. */
2862 src = force_const_mem (src_mode, src);
2863 if (!src)
2864 return false;
2866 /* Make sure that the address is legitimate. */
2867 if (!aarch64_sve_ld1r_operand_p (src))
2869 rtx addr = force_reg (Pmode, XEXP (src, 0));
2870 src = replace_equiv_address (src, addr);
2873 machine_mode mode = GET_MODE (dest);
2874 unsigned int elem_bytes = GET_MODE_UNIT_SIZE (mode);
2875 machine_mode pred_mode = aarch64_sve_pred_mode (elem_bytes).require ();
2876 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
2877 src = gen_rtx_UNSPEC (mode, gen_rtvec (2, ptrue, src), UNSPEC_LD1RQ);
2878 emit_insn (gen_rtx_SET (dest, src));
2879 return true;
2882 /* Expand a move of general CONST_VECTOR SRC into DEST, given that it
2883 isn't a simple duplicate or series. */
2885 static void
2886 aarch64_expand_sve_const_vector (rtx dest, rtx src)
2888 machine_mode mode = GET_MODE (src);
2889 unsigned int npatterns = CONST_VECTOR_NPATTERNS (src);
2890 unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (src);
2891 gcc_assert (npatterns > 1);
2893 if (nelts_per_pattern == 1)
2895 /* The constant is a repeating sequence of at least two elements,
2896 where the repeating elements occupy no more than 128 bits.
2897 Get an integer representation of the replicated value. */
2898 scalar_int_mode int_mode;
2899 if (BYTES_BIG_ENDIAN)
2900 /* For now, always use LD1RQ to load the value on big-endian
2901 targets, since the handling of smaller integers includes a
2902 subreg that is semantically an element reverse. */
2903 int_mode = TImode;
2904 else
2906 unsigned int int_bits = GET_MODE_UNIT_BITSIZE (mode) * npatterns;
2907 gcc_assert (int_bits <= 128);
2908 int_mode = int_mode_for_size (int_bits, 0).require ();
2910 rtx int_value = simplify_gen_subreg (int_mode, src, mode, 0);
2911 if (int_value
2912 && aarch64_expand_sve_widened_duplicate (dest, int_mode, int_value))
2913 return;
2916 /* Expand each pattern individually. */
2917 rtx_vector_builder builder;
2918 auto_vec<rtx, 16> vectors (npatterns);
2919 for (unsigned int i = 0; i < npatterns; ++i)
2921 builder.new_vector (mode, 1, nelts_per_pattern);
2922 for (unsigned int j = 0; j < nelts_per_pattern; ++j)
2923 builder.quick_push (CONST_VECTOR_ELT (src, i + j * npatterns));
2924 vectors.quick_push (force_reg (mode, builder.build ()));
2927 /* Use permutes to interleave the separate vectors. */
2928 while (npatterns > 1)
2930 npatterns /= 2;
2931 for (unsigned int i = 0; i < npatterns; ++i)
2933 rtx tmp = (npatterns == 1 ? dest : gen_reg_rtx (mode));
2934 rtvec v = gen_rtvec (2, vectors[i], vectors[i + npatterns]);
2935 emit_set_insn (tmp, gen_rtx_UNSPEC (mode, v, UNSPEC_ZIP1));
2936 vectors[i] = tmp;
2939 gcc_assert (vectors[0] == dest);
2942 /* Set DEST to immediate IMM. For SVE vector modes, GEN_VEC_DUPLICATE
2943 is a pattern that can be used to set DEST to a replicated scalar
2944 element. */
2946 void
2947 aarch64_expand_mov_immediate (rtx dest, rtx imm,
2948 rtx (*gen_vec_duplicate) (rtx, rtx))
2950 machine_mode mode = GET_MODE (dest);
2952 /* Check on what type of symbol it is. */
2953 scalar_int_mode int_mode;
2954 if ((GET_CODE (imm) == SYMBOL_REF
2955 || GET_CODE (imm) == LABEL_REF
2956 || GET_CODE (imm) == CONST
2957 || GET_CODE (imm) == CONST_POLY_INT)
2958 && is_a <scalar_int_mode> (mode, &int_mode))
2960 rtx mem;
2961 poly_int64 offset;
2962 HOST_WIDE_INT const_offset;
2963 enum aarch64_symbol_type sty;
2965 /* If we have (const (plus symbol offset)), separate out the offset
2966 before we start classifying the symbol. */
2967 rtx base = strip_offset (imm, &offset);
2969 /* We must always add an offset involving VL separately, rather than
2970 folding it into the relocation. */
2971 if (!offset.is_constant (&const_offset))
2973 if (base == const0_rtx && aarch64_sve_cnt_immediate_p (offset))
2974 emit_insn (gen_rtx_SET (dest, imm));
2975 else
2977 /* Do arithmetic on 32-bit values if the result is smaller
2978 than that. */
2979 if (partial_subreg_p (int_mode, SImode))
2981 /* It is invalid to do symbol calculations in modes
2982 narrower than SImode. */
2983 gcc_assert (base == const0_rtx);
2984 dest = gen_lowpart (SImode, dest);
2985 int_mode = SImode;
2987 if (base != const0_rtx)
2989 base = aarch64_force_temporary (int_mode, dest, base);
2990 aarch64_add_offset (int_mode, dest, base, offset,
2991 NULL_RTX, NULL_RTX, false);
2993 else
2994 aarch64_add_offset (int_mode, dest, base, offset,
2995 dest, NULL_RTX, false);
2997 return;
3000 sty = aarch64_classify_symbol (base, const_offset);
3001 switch (sty)
3003 case SYMBOL_FORCE_TO_MEM:
3004 if (const_offset != 0
3005 && targetm.cannot_force_const_mem (int_mode, imm))
3007 gcc_assert (can_create_pseudo_p ());
3008 base = aarch64_force_temporary (int_mode, dest, base);
3009 aarch64_add_offset (int_mode, dest, base, const_offset,
3010 NULL_RTX, NULL_RTX, false);
3011 return;
3014 mem = force_const_mem (ptr_mode, imm);
3015 gcc_assert (mem);
3017 /* If we aren't generating PC relative literals, then
3018 we need to expand the literal pool access carefully.
3019 This is something that needs to be done in a number
3020 of places, so could well live as a separate function. */
3021 if (!aarch64_pcrelative_literal_loads)
3023 gcc_assert (can_create_pseudo_p ());
3024 base = gen_reg_rtx (ptr_mode);
3025 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
3026 if (ptr_mode != Pmode)
3027 base = convert_memory_address (Pmode, base);
3028 mem = gen_rtx_MEM (ptr_mode, base);
3031 if (int_mode != ptr_mode)
3032 mem = gen_rtx_ZERO_EXTEND (int_mode, mem);
3034 emit_insn (gen_rtx_SET (dest, mem));
3036 return;
3038 case SYMBOL_SMALL_TLSGD:
3039 case SYMBOL_SMALL_TLSDESC:
3040 case SYMBOL_SMALL_TLSIE:
3041 case SYMBOL_SMALL_GOT_28K:
3042 case SYMBOL_SMALL_GOT_4G:
3043 case SYMBOL_TINY_GOT:
3044 case SYMBOL_TINY_TLSIE:
3045 if (const_offset != 0)
3047 gcc_assert (can_create_pseudo_p ());
3048 base = aarch64_force_temporary (int_mode, dest, base);
3049 aarch64_add_offset (int_mode, dest, base, const_offset,
3050 NULL_RTX, NULL_RTX, false);
3051 return;
3053 /* FALLTHRU */
3055 case SYMBOL_SMALL_ABSOLUTE:
3056 case SYMBOL_TINY_ABSOLUTE:
3057 case SYMBOL_TLSLE12:
3058 case SYMBOL_TLSLE24:
3059 case SYMBOL_TLSLE32:
3060 case SYMBOL_TLSLE48:
3061 aarch64_load_symref_appropriately (dest, imm, sty);
3062 return;
3064 default:
3065 gcc_unreachable ();
3069 if (!CONST_INT_P (imm))
3071 rtx base, step, value;
3072 if (GET_CODE (imm) == HIGH
3073 || aarch64_simd_valid_immediate (imm, NULL))
3074 emit_insn (gen_rtx_SET (dest, imm));
3075 else if (const_vec_series_p (imm, &base, &step))
3076 aarch64_expand_vec_series (dest, base, step);
3077 else if (const_vec_duplicate_p (imm, &value))
3079 /* If the constant is out of range of an SVE vector move,
3080 load it from memory if we can, otherwise move it into
3081 a register and use a DUP. */
3082 scalar_mode inner_mode = GET_MODE_INNER (mode);
3083 rtx op = force_const_mem (inner_mode, value);
3084 if (!op)
3085 op = force_reg (inner_mode, value);
3086 else if (!aarch64_sve_ld1r_operand_p (op))
3088 rtx addr = force_reg (Pmode, XEXP (op, 0));
3089 op = replace_equiv_address (op, addr);
3091 emit_insn (gen_vec_duplicate (dest, op));
3093 else if (GET_CODE (imm) == CONST_VECTOR
3094 && !GET_MODE_NUNITS (GET_MODE (imm)).is_constant ())
3095 aarch64_expand_sve_const_vector (dest, imm);
3096 else
3098 rtx mem = force_const_mem (mode, imm);
3099 gcc_assert (mem);
3100 emit_move_insn (dest, mem);
3103 return;
3106 aarch64_internal_mov_immediate (dest, imm, true,
3107 as_a <scalar_int_mode> (mode));
3110 /* Emit an SVE predicated move from SRC to DEST. PRED is a predicate
3111 that is known to contain PTRUE. */
3113 void
3114 aarch64_emit_sve_pred_move (rtx dest, rtx pred, rtx src)
3116 emit_insn (gen_rtx_SET (dest, gen_rtx_UNSPEC (GET_MODE (dest),
3117 gen_rtvec (2, pred, src),
3118 UNSPEC_MERGE_PTRUE)));
3121 /* Expand a pre-RA SVE data move from SRC to DEST in which at least one
3122 operand is in memory. In this case we need to use the predicated LD1
3123 and ST1 instead of LDR and STR, both for correctness on big-endian
3124 targets and because LD1 and ST1 support a wider range of addressing modes.
3125 PRED_MODE is the mode of the predicate.
3127 See the comment at the head of aarch64-sve.md for details about the
3128 big-endian handling. */
3130 void
3131 aarch64_expand_sve_mem_move (rtx dest, rtx src, machine_mode pred_mode)
3133 machine_mode mode = GET_MODE (dest);
3134 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
3135 if (!register_operand (src, mode)
3136 && !register_operand (dest, mode))
3138 rtx tmp = gen_reg_rtx (mode);
3139 if (MEM_P (src))
3140 aarch64_emit_sve_pred_move (tmp, ptrue, src);
3141 else
3142 emit_move_insn (tmp, src);
3143 src = tmp;
3145 aarch64_emit_sve_pred_move (dest, ptrue, src);
3148 /* Called only on big-endian targets. See whether an SVE vector move
3149 from SRC to DEST is effectively a REV[BHW] instruction, because at
3150 least one operand is a subreg of an SVE vector that has wider or
3151 narrower elements. Return true and emit the instruction if so.
3153 For example:
3155 (set (reg:VNx8HI R1) (subreg:VNx8HI (reg:VNx16QI R2) 0))
3157 represents a VIEW_CONVERT between the following vectors, viewed
3158 in memory order:
3160 R2: { [0].high, [0].low, [1].high, [1].low, ... }
3161 R1: { [0], [1], [2], [3], ... }
3163 The high part of lane X in R2 should therefore correspond to lane X*2
3164 of R1, but the register representations are:
3166 msb lsb
3167 R2: ...... [1].high [1].low [0].high [0].low
3168 R1: ...... [3] [2] [1] [0]
3170 where the low part of lane X in R2 corresponds to lane X*2 in R1.
3171 We therefore need a reverse operation to swap the high and low values
3172 around.
3174 This is purely an optimization. Without it we would spill the
3175 subreg operand to the stack in one mode and reload it in the
3176 other mode, which has the same effect as the REV. */
3178 bool
3179 aarch64_maybe_expand_sve_subreg_move (rtx dest, rtx src)
3181 gcc_assert (BYTES_BIG_ENDIAN);
3182 if (GET_CODE (dest) == SUBREG)
3183 dest = SUBREG_REG (dest);
3184 if (GET_CODE (src) == SUBREG)
3185 src = SUBREG_REG (src);
3187 /* The optimization handles two single SVE REGs with different element
3188 sizes. */
3189 if (!REG_P (dest)
3190 || !REG_P (src)
3191 || aarch64_classify_vector_mode (GET_MODE (dest)) != VEC_SVE_DATA
3192 || aarch64_classify_vector_mode (GET_MODE (src)) != VEC_SVE_DATA
3193 || (GET_MODE_UNIT_SIZE (GET_MODE (dest))
3194 == GET_MODE_UNIT_SIZE (GET_MODE (src))))
3195 return false;
3197 /* Generate *aarch64_sve_mov<mode>_subreg_be. */
3198 rtx ptrue = force_reg (VNx16BImode, CONSTM1_RTX (VNx16BImode));
3199 rtx unspec = gen_rtx_UNSPEC (GET_MODE (dest), gen_rtvec (2, ptrue, src),
3200 UNSPEC_REV_SUBREG);
3201 emit_insn (gen_rtx_SET (dest, unspec));
3202 return true;
3205 /* Return a copy of X with mode MODE, without changing its other
3206 attributes. Unlike gen_lowpart, this doesn't care whether the
3207 mode change is valid. */
3209 static rtx
3210 aarch64_replace_reg_mode (rtx x, machine_mode mode)
3212 if (GET_MODE (x) == mode)
3213 return x;
3215 x = shallow_copy_rtx (x);
3216 set_mode_and_regno (x, mode, REGNO (x));
3217 return x;
3220 /* Split a *aarch64_sve_mov<mode>_subreg_be pattern with the given
3221 operands. */
3223 void
3224 aarch64_split_sve_subreg_move (rtx dest, rtx ptrue, rtx src)
3226 /* Decide which REV operation we need. The mode with narrower elements
3227 determines the mode of the operands and the mode with the wider
3228 elements determines the reverse width. */
3229 machine_mode mode_with_wider_elts = GET_MODE (dest);
3230 machine_mode mode_with_narrower_elts = GET_MODE (src);
3231 if (GET_MODE_UNIT_SIZE (mode_with_wider_elts)
3232 < GET_MODE_UNIT_SIZE (mode_with_narrower_elts))
3233 std::swap (mode_with_wider_elts, mode_with_narrower_elts);
3235 unsigned int wider_bytes = GET_MODE_UNIT_SIZE (mode_with_wider_elts);
3236 unsigned int unspec;
3237 if (wider_bytes == 8)
3238 unspec = UNSPEC_REV64;
3239 else if (wider_bytes == 4)
3240 unspec = UNSPEC_REV32;
3241 else if (wider_bytes == 2)
3242 unspec = UNSPEC_REV16;
3243 else
3244 gcc_unreachable ();
3245 machine_mode pred_mode = aarch64_sve_pred_mode (wider_bytes).require ();
3247 /* Emit:
3249 (set DEST (unspec [PTRUE (unspec [SRC] UNSPEC_REV<nn>)]
3250 UNSPEC_MERGE_PTRUE))
3252 with the appropriate modes. */
3253 ptrue = gen_lowpart (pred_mode, ptrue);
3254 dest = aarch64_replace_reg_mode (dest, mode_with_narrower_elts);
3255 src = aarch64_replace_reg_mode (src, mode_with_narrower_elts);
3256 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (1, src), unspec);
3257 src = gen_rtx_UNSPEC (mode_with_narrower_elts, gen_rtvec (2, ptrue, src),
3258 UNSPEC_MERGE_PTRUE);
3259 emit_insn (gen_rtx_SET (dest, src));
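/* Implement TARGET_FUNCTION_OK_FOR_SIBCALL.  */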
3262 static bool
3263 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
3264 tree exp ATTRIBUTE_UNUSED)
3266 /* Currently, always true. */
3267 return true;
3270 /* Implement TARGET_PASS_BY_REFERENCE. */
3272 static bool
3273 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
3274 machine_mode mode,
3275 const_tree type,
3276 bool named ATTRIBUTE_UNUSED)
3278 HOST_WIDE_INT size;
3279 machine_mode dummymode;
3280 int nregs;
3282 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
3283 if (mode == BLKmode && type)
3284 size = int_size_in_bytes (type);
3285 else
3286 /* No frontends can create types with variable-sized modes, so we
3287 shouldn't be asked to pass or return them. */
3288 size = GET_MODE_SIZE (mode).to_constant ();
3290 /* Aggregates are passed by reference based on their size. */
3291 if (type && AGGREGATE_TYPE_P (type))
3293 size = int_size_in_bytes (type);
3296 /* Variable sized arguments are always passed by reference. */
3297 if (size < 0)
3298 return true;
3300 /* Can this be a candidate to be passed in fp/simd register(s)? */
3301 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3302 &dummymode, &nregs,
3303 NULL))
3304 return false;
3306 /* Arguments which are variable sized or larger than 2 registers are
3307 passed by reference unless they are a homogeneous floating-point
3308 aggregate. */
3309 return size > 2 * UNITS_PER_WORD;
3312 /* Return TRUE if VALTYPE is padded to its least significant bits. */
3313 static bool
3314 aarch64_return_in_msb (const_tree valtype)
3316 machine_mode dummy_mode;
3317 int dummy_int;
3319 /* Never happens in little-endian mode. */
3320 if (!BYTES_BIG_ENDIAN)
3321 return false;
3323 /* Only composite types smaller than or equal to 16 bytes can
3324 be potentially returned in registers. */
3325 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
3326 || int_size_in_bytes (valtype) <= 0
3327 || int_size_in_bytes (valtype) > 16)
3328 return false;
3330 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
3331 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
3332 is always passed/returned in the least significant bits of fp/simd
3333 register(s). */
3334 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
3335 &dummy_mode, &dummy_int, NULL))
3336 return false;
3338 return true;
3341 /* Implement TARGET_FUNCTION_VALUE.
3342 Define how to find the value returned by a function. */
3344 static rtx
3345 aarch64_function_value (const_tree type, const_tree func,
3346 bool outgoing ATTRIBUTE_UNUSED)
3348 machine_mode mode;
3349 int unsignedp;
3350 int count;
3351 machine_mode ag_mode;
3353 mode = TYPE_MODE (type);
3354 if (INTEGRAL_TYPE_P (type))
3355 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
3357 if (aarch64_return_in_msb (type))
3359 HOST_WIDE_INT size = int_size_in_bytes (type);
3361 if (size % UNITS_PER_WORD != 0)
3363 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
3364 mode = int_mode_for_size (size * BITS_PER_UNIT, 0).require ();
3368 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
3369 &ag_mode, &count, NULL))
3371 if (!aarch64_composite_type_p (type, mode))
3373 gcc_assert (count == 1 && mode == ag_mode);
3374 return gen_rtx_REG (mode, V0_REGNUM);
3376 else
3378 int i;
3379 rtx par;
3381 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
3382 for (i = 0; i < count; i++)
3384 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
3385 rtx offset = gen_int_mode (i * GET_MODE_SIZE (ag_mode), Pmode);
3386 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3387 XVECEXP (par, 0, i) = tmp;
3389 return par;
3392 else
3393 return gen_rtx_REG (mode, R0_REGNUM);
3396 /* Implement TARGET_FUNCTION_VALUE_REGNO_P.
3397 Return true if REGNO is the number of a hard register in which the values
3398 of called function may come back. */
3400 static bool
3401 aarch64_function_value_regno_p (const unsigned int regno)
3403 /* Maximum of 16 bytes can be returned in the general registers. Examples
3404 of 16-byte return values are: 128-bit integers and 16-byte small
3405 structures (excluding homogeneous floating-point aggregates). */
3406 if (regno == R0_REGNUM || regno == R1_REGNUM)
3407 return true;
3409 /* Up to four fp/simd registers can return a function value, e.g. a
3410 homogeneous floating-point aggregate having four members. */
3411 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
3412 return TARGET_FLOAT;
3414 return false;
3417 /* Implement TARGET_RETURN_IN_MEMORY.
3419 If the type T of the result of a function is such that
3420 void func (T arg)
3421 would require that arg be passed as a value in a register (or set of
3422 registers) according to the parameter passing rules, then the result
3423 is returned in the same registers as would be used for such an
3424 argument. */
3426 static bool
3427 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
3429 HOST_WIDE_INT size;
3430 machine_mode ag_mode;
3431 int count;
3433 if (!AGGREGATE_TYPE_P (type)
3434 && TREE_CODE (type) != COMPLEX_TYPE
3435 && TREE_CODE (type) != VECTOR_TYPE)
3436 /* Simple scalar types are always returned in registers. */
3437 return false;
3439 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
3440 type,
3441 &ag_mode,
3442 &count,
3443 NULL))
3444 return false;
3446 /* Types larger than 2 registers are returned in memory. */
3447 size = int_size_in_bytes (type);
3448 return (size < 0 || size > 2 * UNITS_PER_WORD);
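/* Return true if an argument of mode MODE and type TYPE is a candidate
   for passing in SIMD/FP registers, setting *NREGS to the number of
   registers needed and caching the per-register mode in PCUM_V.  */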
3451 static bool
3452 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
3453 const_tree type, int *nregs)
3455 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3456 return aarch64_vfp_is_call_or_return_candidate (mode,
3457 type,
3458 &pcum->aapcs_vfp_rmode,
3459 nregs,
3460 NULL);
3463 /* Given MODE and TYPE of a function argument, return the alignment in
3464 bits. The idea is to suppress any stronger alignment requested by
3465 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
3466 This is a helper function for local use only. */
3468 static unsigned int
3469 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
3471 if (!type)
3472 return GET_MODE_ALIGNMENT (mode);
3474 if (integer_zerop (TYPE_SIZE (type)))
3475 return 0;
3477 gcc_assert (TYPE_MODE (type) == mode);
3479 if (!AGGREGATE_TYPE_P (type))
3480 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
3482 if (TREE_CODE (type) == ARRAY_TYPE)
3483 return TYPE_ALIGN (TREE_TYPE (type));
3485 unsigned int alignment = 0;
3486 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
3487 if (TREE_CODE (field) == FIELD_DECL)
3488 alignment = std::max (alignment, DECL_ALIGN (field));
3490 return alignment;
3493 /* Layout a function argument according to the AAPCS64 rules. The rule
3494 numbers refer to the rule numbers in the AAPCS64. */
3496 static void
3497 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
3498 const_tree type,
3499 bool named ATTRIBUTE_UNUSED)
3501 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3502 int ncrn, nvrn, nregs;
3503 bool allocate_ncrn, allocate_nvrn;
3504 HOST_WIDE_INT size;
3506 /* We need to do this once per argument. */
3507 if (pcum->aapcs_arg_processed)
3508 return;
3510 pcum->aapcs_arg_processed = true;
3512 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
3513 if (type)
3514 size = int_size_in_bytes (type);
3515 else
3516 /* No frontends can create types with variable-sized modes, so we
3517 shouldn't be asked to pass or return them. */
3518 size = GET_MODE_SIZE (mode).to_constant ();
3519 size = ROUND_UP (size, UNITS_PER_WORD);
3521 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
3522 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
3523 mode,
3524 type,
3525 &nregs);
3527 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
3528 The following code thus handles passing by SIMD/FP registers first. */
3530 nvrn = pcum->aapcs_nvrn;
3532 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
3533 and homogeneous short-vector aggregates (HVA). */
3534 if (allocate_nvrn)
3536 if (!TARGET_FLOAT)
3537 aarch64_err_no_fpadvsimd (mode);
3539 if (nvrn + nregs <= NUM_FP_ARG_REGS)
3541 pcum->aapcs_nextnvrn = nvrn + nregs;
3542 if (!aarch64_composite_type_p (type, mode))
3544 gcc_assert (nregs == 1);
3545 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
3547 else
3549 rtx par;
3550 int i;
3551 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3552 for (i = 0; i < nregs; i++)
3554 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
3555 V0_REGNUM + nvrn + i);
3556 rtx offset = gen_int_mode
3557 (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode), Pmode);
3558 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp, offset);
3559 XVECEXP (par, 0, i) = tmp;
3561 pcum->aapcs_reg = par;
3563 return;
3565 else
3567 /* C.3 NSRN is set to 8. */
3568 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
3569 goto on_stack;
3573 ncrn = pcum->aapcs_ncrn;
3574 nregs = size / UNITS_PER_WORD;
3576 /* C6 - C9, though the sign and zero extension semantics are
3577 handled elsewhere. This is the case where the argument fits
3578 entirely in general registers. */
3579 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
3582 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
3584 /* C.8 if the argument has an alignment of 16 bytes then the NGRN is
3585 rounded up to the next even number. */
3586 if (nregs == 2
3587 && ncrn % 2
3588 /* The == 16 * BITS_PER_UNIT instead of >= 16 * BITS_PER_UNIT
3589 comparison is there because for > 16 * BITS_PER_UNIT
3590 alignment nregs should be > 2 and therefore it should be
3591 passed by reference rather than value. */
3592 && aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3594 ++ncrn;
3595 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
3598 /* NREGS can be 0 when e.g. an empty structure is to be passed.
3599 A reg is still generated for it, but the caller should be smart
3600 enough not to use it. */
3601 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
3602 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
3603 else
3605 rtx par;
3606 int i;
3608 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
3609 for (i = 0; i < nregs; i++)
3611 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
3612 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
3613 GEN_INT (i * UNITS_PER_WORD));
3614 XVECEXP (par, 0, i) = tmp;
3616 pcum->aapcs_reg = par;
3619 pcum->aapcs_nextncrn = ncrn + nregs;
3620 return;
3623 /* C.11 */
3624 pcum->aapcs_nextncrn = NUM_ARG_REGS;
3626 /* The argument is passed on stack; record the needed number of words for
3627 this argument and align the total size if necessary. */
3628 on_stack:
3629 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
3631 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
3632 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
3633 16 / UNITS_PER_WORD);
3634 return;
3637 /* Implement TARGET_FUNCTION_ARG. */
3639 static rtx
3640 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
3641 const_tree type, bool named)
3643 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3644 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
3646 if (mode == VOIDmode)
3647 return NULL_RTX;
3649 aarch64_layout_arg (pcum_v, mode, type, named);
3650 return pcum->aapcs_reg;
3653 void
3654 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
3655 const_tree fntype ATTRIBUTE_UNUSED,
3656 rtx libname ATTRIBUTE_UNUSED,
3657 const_tree fndecl ATTRIBUTE_UNUSED,
3658 unsigned n_named ATTRIBUTE_UNUSED)
3660 pcum->aapcs_ncrn = 0;
3661 pcum->aapcs_nvrn = 0;
3662 pcum->aapcs_nextncrn = 0;
3663 pcum->aapcs_nextnvrn = 0;
3664 pcum->pcs_variant = ARM_PCS_AAPCS64;
3665 pcum->aapcs_reg = NULL_RTX;
3666 pcum->aapcs_arg_processed = false;
3667 pcum->aapcs_stack_words = 0;
3668 pcum->aapcs_stack_size = 0;
3670 if (!TARGET_FLOAT
3671 && fndecl && TREE_PUBLIC (fndecl)
3672 && fntype && fntype != error_mark_node)
3674 const_tree type = TREE_TYPE (fntype);
3675 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
3676 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
3677 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
3678 &mode, &nregs, NULL))
3679 aarch64_err_no_fpadvsimd (TYPE_MODE (type));
3681 return;
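/* Implement TARGET_FUNCTION_ARG_ADVANCE.  */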
3684 static void
3685 aarch64_function_arg_advance (cumulative_args_t pcum_v,
3686 machine_mode mode,
3687 const_tree type,
3688 bool named)
3690 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
3691 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
3693 aarch64_layout_arg (pcum_v, mode, type, named);
3694 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
3695 != (pcum->aapcs_stack_words != 0));
3696 pcum->aapcs_arg_processed = false;
3697 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
3698 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
3699 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
3700 pcum->aapcs_stack_words = 0;
3701 pcum->aapcs_reg = NULL_RTX;
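/* Return true if REGNO is a general or SIMD/FP register that can be used
   to pass a function argument.  */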
3705 bool
3706 aarch64_function_arg_regno_p (unsigned regno)
3708 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
3709 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
3712 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
3713 PARM_BOUNDARY bits of alignment, but will be given anything up
3714 to STACK_BOUNDARY bits if the type requires it. This makes sure
3715 that both before and after the layout of each argument, the Next
3716 Stacked Argument Address (NSAA) will have a minimum alignment of
3717 8 bytes. */
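/* For example, with the usual AArch64 values of PARM_BOUNDARY (64) and
   STACK_BOUNDARY (128), a 4-byte-aligned argument is still given 64 bits
   of alignment, while an over-aligned 32-byte type is clamped to 128.  */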
3719 static unsigned int
3720 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
3722 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
3723 return MIN (MAX (alignment, PARM_BOUNDARY), STACK_BOUNDARY);
3726 /* Implement TARGET_GET_RAW_RESULT_MODE and TARGET_GET_RAW_ARG_MODE. */
3728 static fixed_size_mode
3729 aarch64_get_reg_raw_mode (int regno)
3731 if (TARGET_SVE && FP_REGNUM_P (regno))
3732 /* Don't use the SVE part of the register for __builtin_apply and
3733 __builtin_return. The SVE registers aren't used by the normal PCS,
3734 so using them there would be a waste of time. The PCS extensions
3735 for SVE types are fundamentally incompatible with the
3736 __builtin_return/__builtin_apply interface. */
3737 return as_a <fixed_size_mode> (V16QImode);
3738 return default_get_reg_raw_mode (regno);
3741 /* Implement TARGET_FUNCTION_ARG_PADDING.
3743 Small aggregate types are placed in the lowest memory address.
3745 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
3747 static pad_direction
3748 aarch64_function_arg_padding (machine_mode mode, const_tree type)
3750 /* On little-endian targets, the least significant byte of every stack
3751 argument is passed at the lowest byte address of the stack slot. */
3752 if (!BYTES_BIG_ENDIAN)
3753 return PAD_UPWARD;
3755 /* Otherwise, integral, floating-point and pointer types are padded downward:
3756 the least significant byte of a stack argument is passed at the highest
3757 byte address of the stack slot. */
3758 if (type
3759 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
3760 || POINTER_TYPE_P (type))
3761 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
3762 return PAD_DOWNWARD;
3764 /* Everything else is padded upward, i.e. the data starts at the first byte of the stack slot. */
3765 return PAD_UPWARD;
3768 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
3770 It specifies the padding for the last (and possibly the only)
3771 element of a block move between registers and memory.  Viewing
3772 the block as it sits in memory, padding upward means that the
3773 last element is padded after its most significant byte, while
3774 with downward padding the last element is padded on its least
3775 significant byte side.
3777 Small aggregates and small complex types are always padded
3778 upwards.
3780 We don't need to worry about homogeneous floating-point or
3781 short-vector aggregates; their move is not affected by the
3782 padding direction determined here. Regardless of endianness,
3783 each element of such an aggregate is put in the least
3784 significant bits of a fp/simd register.
3786 Return !BYTES_BIG_ENDIAN if the least significant byte of the
3787 register has useful data, and return the opposite if the most
3788 significant byte does. */
3790 bool
3791 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
3792 bool first ATTRIBUTE_UNUSED)
3795 /* Small composite types are always padded upward. */
3796 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
3798 HOST_WIDE_INT size;
3799 if (type)
3800 size = int_size_in_bytes (type);
3801 else
3802 /* No frontends can create types with variable-sized modes, so we
3803 shouldn't be asked to pass or return them. */
3804 size = GET_MODE_SIZE (mode).to_constant ();
3805 if (size < 2 * UNITS_PER_WORD)
3806 return true;
3809 /* Otherwise, use the default padding. */
3810 return !BYTES_BIG_ENDIAN;
3813 static scalar_int_mode
3814 aarch64_libgcc_cmp_return_mode (void)
3816 return SImode;
3819 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
3821 /* We use the 12-bit shifted immediate arithmetic instructions so values
3822 must be multiple of (1 << 12), i.e. 4096. */
3823 #define ARITH_FACTOR 4096
3825 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
3826 #error Cannot use simple address calculation for stack probing
3827 #endif
3829 /* The pair of scratch registers used for stack probing. */
3830 #define PROBE_STACK_FIRST_REG 9
3831 #define PROBE_STACK_SECOND_REG 10
3833 /* Emit code to probe a range of stack addresses from FIRST to FIRST+POLY_SIZE,
3834 inclusive. These are offsets from the current stack pointer. */
3836 static void
3837 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, poly_int64 poly_size)
3839 HOST_WIDE_INT size;
3840 if (!poly_size.is_constant (&size))
3842 sorry ("stack probes for SVE frames");
3843 return;
3846 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
3848 /* See the same assertion on PROBE_INTERVAL above. */
3849 gcc_assert ((first % ARITH_FACTOR) == 0);
3851 /* See if we have a constant small number of probes to generate. If so,
3852 that's the easy case. */
3853 if (size <= PROBE_INTERVAL)
3855 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
3857 emit_set_insn (reg1,
3858 plus_constant (Pmode,
3859 stack_pointer_rtx, -(first + base)));
3860 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
3863 /* The run-time loop is made up of 8 insns in the generic case while the
3864 compile-time loop is made up of 4+2*(n-2) insns, where n is the number of intervals. */
3865 else if (size <= 4 * PROBE_INTERVAL)
3867 HOST_WIDE_INT i, rem;
3869 emit_set_insn (reg1,
3870 plus_constant (Pmode,
3871 stack_pointer_rtx,
3872 -(first + PROBE_INTERVAL)));
3873 emit_stack_probe (reg1);
3875 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
3876 it exceeds SIZE. If only two probes are needed, this will not
3877 generate any code. Then probe at FIRST + SIZE. */
3878 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
3880 emit_set_insn (reg1,
3881 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
3882 emit_stack_probe (reg1);
3885 rem = size - (i - PROBE_INTERVAL);
3886 if (rem > 256)
3888 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3890 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
3891 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
3893 else
3894 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
3897 /* Otherwise, do the same as above, but in a loop. Note that we must be
3898 extra careful with variables wrapping around because we might be at
3899 the very top (or the very bottom) of the address space and we have
3900 to be able to handle this case properly; in particular, we use an
3901 equality test for the loop condition. */
3902 else
3904 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
3906 /* Step 1: round SIZE to the previous multiple of the interval. */
3908 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
3911 /* Step 2: compute initial and final value of the loop counter. */
3913 /* TEST_ADDR = SP + FIRST. */
3914 emit_set_insn (reg1,
3915 plus_constant (Pmode, stack_pointer_rtx, -first));
3917 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
3918 HOST_WIDE_INT adjustment = - (first + rounded_size);
3919 if (! aarch64_uimm12_shift (adjustment))
3921 aarch64_internal_mov_immediate (reg2, GEN_INT (adjustment),
3922 true, Pmode);
3923 emit_set_insn (reg2, gen_rtx_PLUS (Pmode, stack_pointer_rtx, reg2));
3925 else
3926 emit_set_insn (reg2,
3927 plus_constant (Pmode, stack_pointer_rtx, adjustment));
3929 /* Step 3: the loop
3933 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
3934 probe at TEST_ADDR
3936 while (TEST_ADDR != LAST_ADDR)
3938 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
3939 until it is equal to ROUNDED_SIZE. */
3941 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
3944 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
3945 that SIZE is equal to ROUNDED_SIZE. */
3947 if (size != rounded_size)
3949 HOST_WIDE_INT rem = size - rounded_size;
3951 if (rem > 256)
3953 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
3955 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
3956 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
3958 else
3959 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
3963 /* Make sure nothing is scheduled before we are done. */
3964 emit_insn (gen_blockage ());
3967 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
3968 absolute addresses. */
3970 const char *
3971 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
3973 static int labelno = 0;
3974 char loop_lab[32];
3975 rtx xops[2];
3977 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
3979 /* Loop. */
3980 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
3982 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
3983 xops[0] = reg1;
3984 xops[1] = GEN_INT (PROBE_INTERVAL);
3985 output_asm_insn ("sub\t%0, %0, %1", xops);
3987 /* Probe at TEST_ADDR. */
3988 output_asm_insn ("str\txzr, [%0]", xops);
3990 /* Test if TEST_ADDR == LAST_ADDR. */
3991 xops[1] = reg2;
3992 output_asm_insn ("cmp\t%0, %1", xops);
3994 /* Branch. */
3995 fputs ("\tb.ne\t", asm_out_file);
3996 assemble_name_raw (asm_out_file, loop_lab);
3997 fputc ('\n', asm_out_file);
3999 return "";
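/* Editor's sketch (not part of aarch64.c): a plain C model of the loop
   emitted above.  It touches one word every PROBE_INTERVAL bytes, walking
   the addresses downwards, and uses an equality test so that address
   wrap-around cannot terminate the loop early.  */
static void
probe_stack_model (volatile char *test_addr, volatile char *last_addr,
                   long probe_interval)
{
  do
    {
      test_addr -= probe_interval;   /* sub  test, test, #interval */
      *test_addr = 0;                /* str  xzr, [test] */
    }
  while (test_addr != last_addr);    /* cmp + b.ne back to the label */
}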
4002 /* Determine whether a frame chain needs to be generated. */
4003 static bool
4004 aarch64_needs_frame_chain (void)
4006 /* Force a frame chain for EH returns so the return address is at FP+8. */
4007 if (frame_pointer_needed || crtl->calls_eh_return)
4008 return true;
4010 /* A leaf function cannot have calls or write LR. */
4011 bool is_leaf = crtl->is_leaf && !df_regs_ever_live_p (LR_REGNUM);
4013 /* Don't use a frame chain in leaf functions if leaf frame pointers
4014 are disabled. */
4015 if (flag_omit_leaf_frame_pointer && is_leaf)
4016 return false;
4018 return aarch64_use_frame_pointer;
4021 /* Mark the registers that need to be saved by the callee and calculate
4022 the size of the callee-saved registers area and frame record (both FP
4023 and LR may be omitted). */
4024 static void
4025 aarch64_layout_frame (void)
4027 HOST_WIDE_INT offset = 0;
4028 int regno, last_fp_reg = INVALID_REGNUM;
4030 cfun->machine->frame.emit_frame_chain = aarch64_needs_frame_chain ();
4032 #define SLOT_NOT_REQUIRED (-2)
4033 #define SLOT_REQUIRED (-1)
4035 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
4036 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
4038 /* First mark all the registers that really need to be saved... */
4039 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4040 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4042 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4043 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
4045 /* ... that includes the eh data registers (if needed)... */
4046 if (crtl->calls_eh_return)
4047 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
4048 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
4049 = SLOT_REQUIRED;
4051 /* ... and any callee saved register that dataflow says is live. */
4052 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4053 if (df_regs_ever_live_p (regno)
4054 && (regno == R30_REGNUM
4055 || !call_used_regs[regno]))
4056 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4058 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4059 if (df_regs_ever_live_p (regno)
4060 && !call_used_regs[regno])
4062 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
4063 last_fp_reg = regno;
4066 if (cfun->machine->frame.emit_frame_chain)
4068 /* FP and LR are placed in the linkage record. */
4069 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
4070 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
4071 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
4072 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
4073 offset = 2 * UNITS_PER_WORD;
4076 /* Now assign stack slots for them. */
4077 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
4078 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4080 cfun->machine->frame.reg_offset[regno] = offset;
4081 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4082 cfun->machine->frame.wb_candidate1 = regno;
4083 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
4084 cfun->machine->frame.wb_candidate2 = regno;
4085 offset += UNITS_PER_WORD;
4088 HOST_WIDE_INT max_int_offset = offset;
4089 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4090 bool has_align_gap = offset != max_int_offset;
4092 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
4093 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
4095 /* If there is an alignment gap between integer and fp callee-saves,
4096 allocate the last fp register to it if possible. */
4097 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
4099 cfun->machine->frame.reg_offset[regno] = max_int_offset;
4100 break;
4103 cfun->machine->frame.reg_offset[regno] = offset;
4104 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
4105 cfun->machine->frame.wb_candidate1 = regno;
4106 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
4107 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
4108 cfun->machine->frame.wb_candidate2 = regno;
4109 offset += UNITS_PER_WORD;
4112 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
4114 cfun->machine->frame.saved_regs_size = offset;
4116 HOST_WIDE_INT varargs_and_saved_regs_size
4117 = offset + cfun->machine->frame.saved_varargs_size;
4119 cfun->machine->frame.hard_fp_offset
4120 = aligned_upper_bound (varargs_and_saved_regs_size
4121 + get_frame_size (),
4122 STACK_BOUNDARY / BITS_PER_UNIT);
4124 /* Both these values are already aligned. */
4125 gcc_assert (multiple_p (crtl->outgoing_args_size,
4126 STACK_BOUNDARY / BITS_PER_UNIT));
4127 cfun->machine->frame.frame_size
4128 = (cfun->machine->frame.hard_fp_offset
4129 + crtl->outgoing_args_size);
4131 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
4133 cfun->machine->frame.initial_adjust = 0;
4134 cfun->machine->frame.final_adjust = 0;
4135 cfun->machine->frame.callee_adjust = 0;
4136 cfun->machine->frame.callee_offset = 0;
4138 HOST_WIDE_INT max_push_offset = 0;
4139 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
4140 max_push_offset = 512;
4141 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
4142 max_push_offset = 256;
4144 HOST_WIDE_INT const_size, const_fp_offset;
4145 if (cfun->machine->frame.frame_size.is_constant (&const_size)
4146 && const_size < max_push_offset
4147 && known_eq (crtl->outgoing_args_size, 0))
4149 /* Simple, small frame with no outgoing arguments:
4150 stp reg1, reg2, [sp, -frame_size]!
4151 stp reg3, reg4, [sp, 16] */
4152 cfun->machine->frame.callee_adjust = const_size;
4154 else if (known_lt (crtl->outgoing_args_size
4155 + cfun->machine->frame.saved_regs_size, 512)
4156 && !(cfun->calls_alloca
4157 && known_lt (cfun->machine->frame.hard_fp_offset,
4158 max_push_offset)))
4160 /* Frame with small outgoing arguments:
4161 sub sp, sp, frame_size
4162 stp reg1, reg2, [sp, outgoing_args_size]
4163 stp reg3, reg4, [sp, outgoing_args_size + 16] */
4164 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
4165 cfun->machine->frame.callee_offset
4166 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
4168 else if (cfun->machine->frame.hard_fp_offset.is_constant (&const_fp_offset)
4169 && const_fp_offset < max_push_offset)
4171 /* Frame with large outgoing arguments but a small local area:
4172 stp reg1, reg2, [sp, -hard_fp_offset]!
4173 stp reg3, reg4, [sp, 16]
4174 sub sp, sp, outgoing_args_size */
4175 cfun->machine->frame.callee_adjust = const_fp_offset;
4176 cfun->machine->frame.final_adjust
4177 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
4179 else
4181 /* Frame with large local area and outgoing arguments using frame pointer:
4182 sub sp, sp, hard_fp_offset
4183 stp x29, x30, [sp, 0]
4184 add x29, sp, 0
4185 stp reg3, reg4, [sp, 16]
4186 sub sp, sp, outgoing_args_size */
4187 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
4188 cfun->machine->frame.final_adjust
4189 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
4192 cfun->machine->frame.laid_out = true;
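/* Editor's sketch (not part of aarch64.c): the frame-shape choice above,
   restated for constant-sized frames only.  It ignores the alloca special
   case, callee_offset and all poly_int/SVE handling; the struct and
   function names are invented for illustration.  */
struct frame_shape
{
  long initial_adjust;   /* leading "sub sp, sp, #n" */
  long callee_adjust;    /* pre-decrement folded into the first stp */
  long final_adjust;     /* trailing "sub sp, sp, #n" for outgoing args */
};

static struct frame_shape
choose_frame_shape (long frame_size, long hard_fp_offset,
                    long outgoing_args, long saved_regs, long max_push)
{
  struct frame_shape s = { 0, 0, 0 };
  if (frame_size < max_push && outgoing_args == 0)
    /* stp reg1, reg2, [sp, -frame_size]!  */
    s.callee_adjust = frame_size;
  else if (outgoing_args + saved_regs < 512)
    /* sub sp, sp, frame_size; stp reg1, reg2, [sp, outgoing_args]  */
    s.initial_adjust = frame_size;
  else if (hard_fp_offset < max_push)
    {
      /* stp reg1, reg2, [sp, -hard_fp_offset]!; sub sp, sp, outgoing_args  */
      s.callee_adjust = hard_fp_offset;
      s.final_adjust = frame_size - hard_fp_offset;
    }
  else
    {
      /* sub sp, sp, hard_fp_offset; ...; sub sp, sp, outgoing_args  */
      s.initial_adjust = hard_fp_offset;
      s.final_adjust = frame_size - hard_fp_offset;
    }
  return s;
}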
4195 /* Return true if the register REGNO is saved on entry to
4196 the current function. */
4198 static bool
4199 aarch64_register_saved_on_entry (int regno)
4201 return cfun->machine->frame.reg_offset[regno] >= 0;
4204 /* Return the next register up from REGNO up to LIMIT for the callee
4205 to save. */
4207 static unsigned
4208 aarch64_next_callee_save (unsigned regno, unsigned limit)
4210 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
4211 regno ++;
4212 return regno;
4215 /* Push the register number REGNO of mode MODE to the stack with write-back
4216 adjusting the stack by ADJUSTMENT. */
4218 static void
4219 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
4220 HOST_WIDE_INT adjustment)
4222 rtx base_rtx = stack_pointer_rtx;
4223 rtx insn, reg, mem;
4225 reg = gen_rtx_REG (mode, regno);
4226 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
4227 plus_constant (Pmode, base_rtx, -adjustment));
4228 mem = gen_frame_mem (mode, mem);
4230 insn = emit_move_insn (mem, reg);
4231 RTX_FRAME_RELATED_P (insn) = 1;
4234 /* Generate and return an instruction to store the pair of registers
4235 REG and REG2 of mode MODE to location BASE with write-back adjusting
4236 the stack location BASE by ADJUSTMENT. */
4238 static rtx
4239 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4240 HOST_WIDE_INT adjustment)
4242 switch (mode)
4244 case E_DImode:
4245 return gen_storewb_pairdi_di (base, base, reg, reg2,
4246 GEN_INT (-adjustment),
4247 GEN_INT (UNITS_PER_WORD - adjustment));
4248 case E_DFmode:
4249 return gen_storewb_pairdf_di (base, base, reg, reg2,
4250 GEN_INT (-adjustment),
4251 GEN_INT (UNITS_PER_WORD - adjustment));
4252 default:
4253 gcc_unreachable ();
4257 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
4258 stack pointer by ADJUSTMENT. */
4260 static void
4261 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
4263 rtx_insn *insn;
4264 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4266 if (regno2 == INVALID_REGNUM)
4267 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
4269 rtx reg1 = gen_rtx_REG (mode, regno1);
4270 rtx reg2 = gen_rtx_REG (mode, regno2);
4272 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
4273 reg2, adjustment));
4274 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
4275 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4276 RTX_FRAME_RELATED_P (insn) = 1;
4279 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
4280 adjusting it by ADJUSTMENT afterwards. */
4282 static rtx
4283 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
4284 HOST_WIDE_INT adjustment)
4286 switch (mode)
4288 case E_DImode:
4289 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
4290 GEN_INT (UNITS_PER_WORD));
4291 case E_DFmode:
4292 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
4293 GEN_INT (UNITS_PER_WORD));
4294 default:
4295 gcc_unreachable ();
4299 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
4300 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
4301 into CFI_OPS. */
4303 static void
4304 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
4305 rtx *cfi_ops)
4307 machine_mode mode = (regno1 <= R30_REGNUM) ? E_DImode : E_DFmode;
4308 rtx reg1 = gen_rtx_REG (mode, regno1);
4310 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
4312 if (regno2 == INVALID_REGNUM)
4314 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
4315 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
4316 emit_move_insn (reg1, gen_frame_mem (mode, mem));
4318 else
4320 rtx reg2 = gen_rtx_REG (mode, regno2);
4321 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4322 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
4323 reg2, adjustment));
4327 /* Generate and return a store pair instruction of mode MODE to store
4328 register REG1 to MEM1 and register REG2 to MEM2. */
4330 static rtx
4331 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
4332 rtx reg2)
4334 switch (mode)
4336 case E_DImode:
4337 return gen_store_pair_dw_didi (mem1, reg1, mem2, reg2);
4339 case E_DFmode:
4340 return gen_store_pair_dw_dfdf (mem1, reg1, mem2, reg2);
4342 default:
4343 gcc_unreachable ();
4347 /* Generate and return a load pair instruction of mode MODE to load register
4348 REG1 from MEM1 and register REG2 from MEM2. */
4350 static rtx
4351 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
4352 rtx mem2)
4354 switch (mode)
4356 case E_DImode:
4357 return gen_load_pair_dw_didi (reg1, mem1, reg2, mem2);
4359 case E_DFmode:
4360 return gen_load_pair_dw_dfdf (reg1, mem1, reg2, mem2);
4362 default:
4363 gcc_unreachable ();
4367 /* Return TRUE if return address signing should be enabled for the current
4368 function, otherwise return FALSE. */
4370 bool
4371 aarch64_return_address_signing_enabled (void)
4373 /* This function should only be called after the frame is laid out. */
4374 gcc_assert (cfun->machine->frame.laid_out);
4376 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
4377 if its LR is pushed onto the stack. */
4378 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
4379 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
4380 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
4383 /* Emit code to save the callee-saved registers from register number START
4384 to LIMIT to the stack at the location starting at offset START_OFFSET,
4385 skipping any write-back candidates if SKIP_WB is true. */
4387 static void
4388 aarch64_save_callee_saves (machine_mode mode, poly_int64 start_offset,
4389 unsigned start, unsigned limit, bool skip_wb)
4391 rtx_insn *insn;
4392 unsigned regno;
4393 unsigned regno2;
4395 for (regno = aarch64_next_callee_save (start, limit);
4396 regno <= limit;
4397 regno = aarch64_next_callee_save (regno + 1, limit))
4399 rtx reg, mem;
4400 poly_int64 offset;
4402 if (skip_wb
4403 && (regno == cfun->machine->frame.wb_candidate1
4404 || regno == cfun->machine->frame.wb_candidate2))
4405 continue;
4407 if (cfun->machine->reg_is_wrapped_separately[regno])
4408 continue;
4410 reg = gen_rtx_REG (mode, regno);
4411 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4412 mem = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4413 offset));
4415 regno2 = aarch64_next_callee_save (regno + 1, limit);
4417 if (regno2 <= limit
4418 && !cfun->machine->reg_is_wrapped_separately[regno2]
4419 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4420 == cfun->machine->frame.reg_offset[regno2]))
4423 rtx reg2 = gen_rtx_REG (mode, regno2);
4424 rtx mem2;
4426 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4427 mem2 = gen_frame_mem (mode, plus_constant (Pmode, stack_pointer_rtx,
4428 offset));
4429 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
4430 reg2));
4432 /* The first part of a frame-related parallel insn is
4433 always assumed to be relevant to the frame
4434 calculations; subsequent parts are only
4435 frame-related if explicitly marked. */
4436 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
4437 regno = regno2;
4439 else
4440 insn = emit_move_insn (mem, reg);
4442 RTX_FRAME_RELATED_P (insn) = 1;
4446 /* Emit code to restore the callee registers of mode MODE from register
4447 number START up to and including LIMIT. Restore from the stack offset
4448 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
4449 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
4451 static void
4452 aarch64_restore_callee_saves (machine_mode mode,
4453 poly_int64 start_offset, unsigned start,
4454 unsigned limit, bool skip_wb, rtx *cfi_ops)
4456 rtx base_rtx = stack_pointer_rtx;
4457 unsigned regno;
4458 unsigned regno2;
4459 poly_int64 offset;
4461 for (regno = aarch64_next_callee_save (start, limit);
4462 regno <= limit;
4463 regno = aarch64_next_callee_save (regno + 1, limit))
4465 if (cfun->machine->reg_is_wrapped_separately[regno])
4466 continue;
4468 rtx reg, mem;
4470 if (skip_wb
4471 && (regno == cfun->machine->frame.wb_candidate1
4472 || regno == cfun->machine->frame.wb_candidate2))
4473 continue;
4475 reg = gen_rtx_REG (mode, regno);
4476 offset = start_offset + cfun->machine->frame.reg_offset[regno];
4477 mem = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4479 regno2 = aarch64_next_callee_save (regno + 1, limit);
4481 if (regno2 <= limit
4482 && !cfun->machine->reg_is_wrapped_separately[regno2]
4483 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
4484 == cfun->machine->frame.reg_offset[regno2]))
4486 rtx reg2 = gen_rtx_REG (mode, regno2);
4487 rtx mem2;
4489 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
4490 mem2 = gen_frame_mem (mode, plus_constant (Pmode, base_rtx, offset));
4491 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4493 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
4494 regno = regno2;
4496 else
4497 emit_move_insn (reg, mem);
4498 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
4502 /* Return true if OFFSET is a signed 4-bit value multiplied by the size
4503 of MODE. */
4505 static inline bool
4506 offset_4bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4508 HOST_WIDE_INT multiple;
4509 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4510 && IN_RANGE (multiple, -8, 7));
4513 /* Return true if OFFSET is an unsigned 6-bit value multiplied by the size
4514 of MODE. */
4516 static inline bool
4517 offset_6bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4519 HOST_WIDE_INT multiple;
4520 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4521 && IN_RANGE (multiple, 0, 63));
4524 /* Return true if OFFSET is a signed 7-bit value multiplied by the size
4525 of MODE. */
4527 bool
4528 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4530 HOST_WIDE_INT multiple;
4531 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4532 && IN_RANGE (multiple, -64, 63));
4535 /* Return true if OFFSET is a signed 9-bit value. */
4537 bool
4538 aarch64_offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
4539 poly_int64 offset)
4541 HOST_WIDE_INT const_offset;
4542 return (offset.is_constant (&const_offset)
4543 && IN_RANGE (const_offset, -256, 255));
4546 /* Return true if OFFSET is a signed 9-bit value multiplied by the size
4547 of MODE. */
4549 static inline bool
4550 offset_9bit_signed_scaled_p (machine_mode mode, poly_int64 offset)
4552 HOST_WIDE_INT multiple;
4553 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4554 && IN_RANGE (multiple, -256, 255));
4557 /* Return true if OFFSET is an unsigned 12-bit value multiplied by the size
4558 of MODE. */
4560 static inline bool
4561 offset_12bit_unsigned_scaled_p (machine_mode mode, poly_int64 offset)
4563 HOST_WIDE_INT multiple;
4564 return (constant_multiple_p (offset, GET_MODE_SIZE (mode), &multiple)
4565 && IN_RANGE (multiple, 0, 4095));
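/* Editor's sketch (not part of aarch64.c): the common shape of the scaled
   offset predicates above, for constant offsets only.  The offset must be
   an exact multiple of the access size and the multiple must fit in the
   relevant immediate range.  */
#include <stdbool.h>

static bool
offset_in_scaled_range (long offset, long mode_size, long lo, long hi)
{
  if (offset % mode_size != 0)
    return false;
  long multiple = offset / mode_size;
  return multiple >= lo && multiple <= hi;
}

/* For 8-byte (DImode) accesses and the 12-bit unsigned form:
     offset_in_scaled_range (32760, 8, 0, 4095) -> true   (ldr x0, [x1, 32760])
     offset_in_scaled_range (32768, 8, 0, 4095) -> false  (multiple 4096 too big)
     offset_in_scaled_range (12, 8, 0, 4095)    -> false  (not a multiple of 8)  */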
4568 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
4570 static sbitmap
4571 aarch64_get_separate_components (void)
4573 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4574 bitmap_clear (components);
4576 /* The registers we need saved to the frame. */
4577 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4578 if (aarch64_register_saved_on_entry (regno))
4580 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4581 if (!frame_pointer_needed)
4582 offset += cfun->machine->frame.frame_size
4583 - cfun->machine->frame.hard_fp_offset;
4584 /* Check that we can access the stack slot of the register with one
4585 direct load with no adjustments needed. */
4586 if (offset_12bit_unsigned_scaled_p (DImode, offset))
4587 bitmap_set_bit (components, regno);
4590 /* Don't mess with the hard frame pointer. */
4591 if (frame_pointer_needed)
4592 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
4594 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4595 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4596 /* If registers have been chosen to be stored/restored with
4597 writeback, don't interfere with them, to avoid having to output explicit
4598 stack adjustment instructions. */
4599 if (reg2 != INVALID_REGNUM)
4600 bitmap_clear_bit (components, reg2);
4601 if (reg1 != INVALID_REGNUM)
4602 bitmap_clear_bit (components, reg1);
4604 bitmap_clear_bit (components, LR_REGNUM);
4605 bitmap_clear_bit (components, SP_REGNUM);
4607 return components;
4610 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
4612 static sbitmap
4613 aarch64_components_for_bb (basic_block bb)
4615 bitmap in = DF_LIVE_IN (bb);
4616 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
4617 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
4619 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
4620 bitmap_clear (components);
4622 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
4623 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4624 if ((!call_used_regs[regno])
4625 && (bitmap_bit_p (in, regno)
4626 || bitmap_bit_p (gen, regno)
4627 || bitmap_bit_p (kill, regno)))
4629 unsigned regno2, offset, offset2;
4630 bitmap_set_bit (components, regno);
4632 /* If there is a callee-save at an adjacent offset, add it as well
4633 to increase the use of LDP/STP. */
4634 offset = cfun->machine->frame.reg_offset[regno];
4635 regno2 = ((offset & 8) == 0) ? regno + 1 : regno - 1;
4637 if (regno2 <= LAST_SAVED_REGNUM)
4639 offset2 = cfun->machine->frame.reg_offset[regno2];
4640 if ((offset & ~8) == (offset2 & ~8))
4641 bitmap_set_bit (components, regno2);
4645 return components;
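/* Editor's sketch (not part of aarch64.c): the pairing test above.  Two
   8-byte save slots can share one LDP/STP only when they lie in the same
   naturally aligned 16-byte chunk, which is what masking bit 3 out of
   both offsets checks.  */
#include <stdbool.h>

static bool
slots_pairable (unsigned offset1, unsigned offset2)
{
  return (offset1 & ~8u) == (offset2 & ~8u);
}

/* slots_pairable (16, 24) -> true   (one stp covers offsets 16 and 24)
   slots_pairable (24, 32) -> false  (offsets straddle two 16-byte chunks)  */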
4648 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
4649 Nothing to do for aarch64. */
4651 static void
4652 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
4656 /* Return the next set bit in BMP from START onwards. Return the total number
4657 of bits in BMP if no set bit is found at or after START. */
4659 static unsigned int
4660 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
4662 unsigned int nbits = SBITMAP_SIZE (bmp);
4663 if (start == nbits)
4664 return start;
4666 gcc_assert (start < nbits);
4667 for (unsigned int i = start; i < nbits; i++)
4668 if (bitmap_bit_p (bmp, i))
4669 return i;
4671 return nbits;
4674 /* Do the work for aarch64_emit_prologue_components and
4675 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
4676 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
4677 for these components or the epilogue sequence. That is, it determines
4678 whether we should emit stores or loads and what kind of CFA notes to attach
4679 to the insns. Otherwise the logic for the two sequences is very
4680 similar. */
4682 static void
4683 aarch64_process_components (sbitmap components, bool prologue_p)
4685 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
4686 ? HARD_FRAME_POINTER_REGNUM
4687 : STACK_POINTER_REGNUM);
4689 unsigned last_regno = SBITMAP_SIZE (components);
4690 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
4691 rtx_insn *insn = NULL;
4693 while (regno != last_regno)
4695 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
4696 so DFmode for the vector registers is enough. */
4697 machine_mode mode = GP_REGNUM_P (regno) ? E_DImode : E_DFmode;
4698 rtx reg = gen_rtx_REG (mode, regno);
4699 poly_int64 offset = cfun->machine->frame.reg_offset[regno];
4700 if (!frame_pointer_needed)
4701 offset += cfun->machine->frame.frame_size
4702 - cfun->machine->frame.hard_fp_offset;
4703 rtx addr = plus_constant (Pmode, ptr_reg, offset);
4704 rtx mem = gen_frame_mem (mode, addr);
4706 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
4707 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
4708 /* No more registers to handle after REGNO.
4709 Emit a single save/restore and exit. */
4710 if (regno2 == last_regno)
4712 insn = emit_insn (set);
4713 RTX_FRAME_RELATED_P (insn) = 1;
4714 if (prologue_p)
4715 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4716 else
4717 add_reg_note (insn, REG_CFA_RESTORE, reg);
4718 break;
4721 poly_int64 offset2 = cfun->machine->frame.reg_offset[regno2];
4722 /* The next register is not of the same class or its offset is not
4723 mergeable with the current one into a pair. */
4724 if (!satisfies_constraint_Ump (mem)
4725 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
4726 || maybe_ne ((offset2 - cfun->machine->frame.reg_offset[regno]),
4727 GET_MODE_SIZE (mode)))
4729 insn = emit_insn (set);
4730 RTX_FRAME_RELATED_P (insn) = 1;
4731 if (prologue_p)
4732 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
4733 else
4734 add_reg_note (insn, REG_CFA_RESTORE, reg);
4736 regno = regno2;
4737 continue;
4740 /* REGNO2 can be saved/restored in a pair with REGNO. */
4741 rtx reg2 = gen_rtx_REG (mode, regno2);
4742 if (!frame_pointer_needed)
4743 offset2 += cfun->machine->frame.frame_size
4744 - cfun->machine->frame.hard_fp_offset;
4745 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
4746 rtx mem2 = gen_frame_mem (mode, addr2);
4747 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
4748 : gen_rtx_SET (reg2, mem2);
4750 if (prologue_p)
4751 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
4752 else
4753 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
4755 RTX_FRAME_RELATED_P (insn) = 1;
4756 if (prologue_p)
4758 add_reg_note (insn, REG_CFA_OFFSET, set);
4759 add_reg_note (insn, REG_CFA_OFFSET, set2);
4761 else
4763 add_reg_note (insn, REG_CFA_RESTORE, reg);
4764 add_reg_note (insn, REG_CFA_RESTORE, reg2);
4767 regno = aarch64_get_next_set_bit (components, regno2 + 1);
4771 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
4773 static void
4774 aarch64_emit_prologue_components (sbitmap components)
4776 aarch64_process_components (components, true);
4779 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
4781 static void
4782 aarch64_emit_epilogue_components (sbitmap components)
4784 aarch64_process_components (components, false);
4787 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
4789 static void
4790 aarch64_set_handled_components (sbitmap components)
4792 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
4793 if (bitmap_bit_p (components, regno))
4794 cfun->machine->reg_is_wrapped_separately[regno] = true;
4797 /* Add a REG_CFA_EXPRESSION note to INSN to say that register REG
4798 is saved at BASE + OFFSET. */
4800 static void
4801 aarch64_add_cfa_expression (rtx_insn *insn, unsigned int reg,
4802 rtx base, poly_int64 offset)
4804 rtx mem = gen_frame_mem (DImode, plus_constant (Pmode, base, offset));
4805 add_reg_note (insn, REG_CFA_EXPRESSION,
4806 gen_rtx_SET (mem, regno_reg_rtx[reg]));
4809 /* AArch64 stack frames generated by this compiler look like:
4811 +-------------------------------+
4813 | incoming stack arguments |
4815 +-------------------------------+
4816 | | <-- incoming stack pointer (aligned)
4817 | callee-allocated save area |
4818 | for register varargs |
4820 +-------------------------------+
4821 | local variables | <-- frame_pointer_rtx
4823 +-------------------------------+
4824 | padding0 | \
4825 +-------------------------------+ |
4826 | callee-saved registers | | frame.saved_regs_size
4827 +-------------------------------+ |
4828 | LR' | |
4829 +-------------------------------+ |
4830 | FP' | / <- hard_frame_pointer_rtx (aligned)
4831 +-------------------------------+
4832 | dynamic allocation |
4833 +-------------------------------+
4834 | padding |
4835 +-------------------------------+
4836 | outgoing stack arguments | <-- arg_pointer
4838 +-------------------------------+
4839 | | <-- stack_pointer_rtx (aligned)
4841 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
4842 but leave frame_pointer_rtx and hard_frame_pointer_rtx
4843 unchanged. */
4845 /* Generate the prologue instructions for entry into a function.
4846 Establish the stack frame by decreasing the stack pointer with a
4847 properly calculated size and, if necessary, create a frame record
4848 filled with the values of LR and previous frame pointer. The
4849 current FP is also set up if it is in use. */
4851 void
4852 aarch64_expand_prologue (void)
4854 poly_int64 frame_size = cfun->machine->frame.frame_size;
4855 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4856 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4857 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4858 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4859 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4860 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4861 bool emit_frame_chain = cfun->machine->frame.emit_frame_chain;
4862 rtx_insn *insn;
4864 /* Sign return address for functions. */
4865 if (aarch64_return_address_signing_enabled ())
4867 insn = emit_insn (gen_pacisp ());
4868 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
4869 RTX_FRAME_RELATED_P (insn) = 1;
4872 if (flag_stack_usage_info)
4873 current_function_static_stack_size = constant_lower_bound (frame_size);
4875 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
4877 if (crtl->is_leaf && !cfun->calls_alloca)
4879 if (maybe_gt (frame_size, PROBE_INTERVAL)
4880 && maybe_gt (frame_size, get_stack_check_protect ()))
4881 aarch64_emit_probe_stack_range (get_stack_check_protect (),
4882 (frame_size
4883 - get_stack_check_protect ()));
4885 else if (maybe_gt (frame_size, 0))
4886 aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
4889 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
4890 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
4892 aarch64_sub_sp (ip0_rtx, ip1_rtx, initial_adjust, true);
4894 if (callee_adjust != 0)
4895 aarch64_push_regs (reg1, reg2, callee_adjust);
4897 if (emit_frame_chain)
4899 poly_int64 reg_offset = callee_adjust;
4900 if (callee_adjust == 0)
4902 reg1 = R29_REGNUM;
4903 reg2 = R30_REGNUM;
4904 reg_offset = callee_offset;
4905 aarch64_save_callee_saves (DImode, reg_offset, reg1, reg2, false);
4907 aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
4908 stack_pointer_rtx, callee_offset,
4909 ip1_rtx, ip0_rtx, frame_pointer_needed);
4910 if (frame_pointer_needed && !frame_size.is_constant ())
4912 /* Variable-sized frames need to describe the save slot
4913 address using DW_CFA_expression rather than DW_CFA_offset.
4914 This means that, without taking further action, the
4915 locations of the registers that we've already saved would
4916 remain based on the stack pointer even after we redefine
4917 the CFA based on the frame pointer. We therefore need new
4918 DW_CFA_expressions to re-express the save slots with addresses
4919 based on the frame pointer. */
4920 rtx_insn *insn = get_last_insn ();
4921 gcc_assert (RTX_FRAME_RELATED_P (insn));
4923 /* Add an explicit CFA definition if this was previously
4924 implicit. */
4925 if (!find_reg_note (insn, REG_CFA_ADJUST_CFA, NULL_RTX))
4927 rtx src = plus_constant (Pmode, stack_pointer_rtx,
4928 callee_offset);
4929 add_reg_note (insn, REG_CFA_ADJUST_CFA,
4930 gen_rtx_SET (hard_frame_pointer_rtx, src));
4933 /* Change the save slot expressions for the registers that
4934 we've already saved. */
4935 reg_offset -= callee_offset;
4936 aarch64_add_cfa_expression (insn, reg2, hard_frame_pointer_rtx,
4937 reg_offset + UNITS_PER_WORD);
4938 aarch64_add_cfa_expression (insn, reg1, hard_frame_pointer_rtx,
4939 reg_offset);
4941 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
4944 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
4945 callee_adjust != 0 || emit_frame_chain);
4946 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
4947 callee_adjust != 0 || emit_frame_chain);
4948 aarch64_sub_sp (ip1_rtx, ip0_rtx, final_adjust, !frame_pointer_needed);
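/* Editor's note (illustrative, not taken from a real compile): for a small
   constant-sized frame that needs a frame chain, say 32 bytes of locals and
   no outgoing arguments, the sequence above reduces to roughly

     stp	x29, x30, [sp, -48]!	// callee_adjust == frame_size
     add	x29, sp, 0		// establish the frame chain
     ...				// remaining callee-saves at [sp, 16], ...

   with the final_adjust "sub sp, sp, ..." only appearing when there are
   outgoing arguments or an over-large local area.  */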
4951 /* Return TRUE if we can use a simple_return insn.
4953 This function checks whether the callee-saved stack is empty, which
4954 means no restore actions are needed.  The pro_and_epilogue pass uses
4955 this to check whether shrink-wrapping is feasible. */
4957 bool
4958 aarch64_use_return_insn_p (void)
4960 if (!reload_completed)
4961 return false;
4963 if (crtl->profile)
4964 return false;
4966 return known_eq (cfun->machine->frame.frame_size, 0);
4969 /* Generate the epilogue instructions for returning from a function.
4970 This is almost exactly the reverse of the prolog sequence, except
4971 that we need to insert barriers to avoid scheduling loads that read
4972 from a deallocated stack, and we optimize the unwind records by
4973 emitting them all together if possible. */
4974 void
4975 aarch64_expand_epilogue (bool for_sibcall)
4977 poly_int64 initial_adjust = cfun->machine->frame.initial_adjust;
4978 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
4979 poly_int64 final_adjust = cfun->machine->frame.final_adjust;
4980 poly_int64 callee_offset = cfun->machine->frame.callee_offset;
4981 unsigned reg1 = cfun->machine->frame.wb_candidate1;
4982 unsigned reg2 = cfun->machine->frame.wb_candidate2;
4983 rtx cfi_ops = NULL;
4984 rtx_insn *insn;
4985 /* A stack clash protection prologue may not have left IP0_REGNUM or
4986 IP1_REGNUM in a usable state. The same is true for allocations
4987 with an SVE component, since we then need both temporary registers
4988 for each allocation. */
4989 bool can_inherit_p = (initial_adjust.is_constant ()
4990 && final_adjust.is_constant ()
4991 && !flag_stack_clash_protection);
4993 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
4994 bool need_barrier_p
4995 = maybe_ne (get_frame_size ()
4996 + cfun->machine->frame.saved_varargs_size, 0);
4998 /* Emit a barrier to prevent loads from a deallocated stack. */
4999 if (maybe_gt (final_adjust, crtl->outgoing_args_size)
5000 || cfun->calls_alloca
5001 || crtl->calls_eh_return)
5003 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5004 need_barrier_p = false;
5007 /* Restore the stack pointer from the frame pointer if it may not
5008 be the same as the stack pointer. */
5009 rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
5010 rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
5011 if (frame_pointer_needed
5012 && (maybe_ne (final_adjust, 0) || cfun->calls_alloca))
5013 /* If writeback is used when restoring callee-saves, the CFA
5014 is restored on the instruction doing the writeback. */
5015 aarch64_add_offset (Pmode, stack_pointer_rtx,
5016 hard_frame_pointer_rtx, -callee_offset,
5017 ip1_rtx, ip0_rtx, callee_adjust == 0);
5018 else
5019 aarch64_add_sp (ip1_rtx, ip0_rtx, final_adjust,
5020 !can_inherit_p || df_regs_ever_live_p (IP1_REGNUM));
5022 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
5023 callee_adjust != 0, &cfi_ops);
5024 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
5025 callee_adjust != 0, &cfi_ops);
5027 if (need_barrier_p)
5028 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
5030 if (callee_adjust != 0)
5031 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
5033 if (callee_adjust != 0 || maybe_gt (initial_adjust, 65536))
5035 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
5036 insn = get_last_insn ();
5037 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
5038 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
5039 RTX_FRAME_RELATED_P (insn) = 1;
5040 cfi_ops = NULL;
5043 aarch64_add_sp (ip0_rtx, ip1_rtx, initial_adjust,
5044 !can_inherit_p || df_regs_ever_live_p (IP0_REGNUM));
5046 if (cfi_ops)
5048 /* Emit delayed restores and reset the CFA to be SP. */
5049 insn = get_last_insn ();
5050 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
5051 REG_NOTES (insn) = cfi_ops;
5052 RTX_FRAME_RELATED_P (insn) = 1;
5055 /* We prefer to emit the combined return/authenticate instruction RETAA,
5056 however there are three cases in which we must instead emit an explicit
5057 authentication instruction.
5059 1) Sibcalls don't return in a normal way, so if we're about to call one
5060 we must authenticate.
5062 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
5063 generating code for !TARGET_ARMV8_3 we can't use it and must
5064 explicitly authenticate.
5066 3) On an eh_return path we make extra stack adjustments to update the
5067 canonical frame address to be the exception handler's CFA. We want
5068 to authenticate using the CFA of the function which calls eh_return. */
5070 if (aarch64_return_address_signing_enabled ()
5071 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
5073 insn = emit_insn (gen_autisp ());
5074 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
5075 RTX_FRAME_RELATED_P (insn) = 1;
5078 /* Stack adjustment for exception handler. */
5079 if (crtl->calls_eh_return)
5081 /* We need to unwind the stack by the offset computed by
5082 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
5083 to be SP; letting the CFA move during this adjustment
5084 is just as correct as retaining the CFA from the body
5085 of the function. Therefore, do nothing special. */
5086 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
5089 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
5090 if (!for_sibcall)
5091 emit_jump_insn (ret_rtx);
5094 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
5095 normally or return to a previous frame after unwinding.
5097 An EH return uses a single shared return sequence. The epilogue is
5098 exactly like a normal epilogue except that it has an extra input
5099 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
5100 that must be applied after the frame has been destroyed. An extra label
5101 is inserted before the epilogue which initializes this register to zero,
5102 and this is the entry point for a normal return.
5104 An actual EH return updates the return address, initializes the stack
5105 adjustment and jumps directly into the epilogue (bypassing the zeroing
5106 of the adjustment). Since the return address is typically saved on the
5107 stack when a function makes a call, the saved LR must be updated outside
5108 the epilogue.
5110 This poses problems as the store is generated well before the epilogue,
5111 so the offset of LR is not known yet. Also optimizations will remove the
5112 store as it appears dead, even after the epilogue is generated (as the
5113 base or offset for loading LR is different in many cases).
5115 To avoid these problems this implementation forces the frame pointer
5116 in eh_return functions so that the location of LR is fixed and known early.
5117 It also marks the store volatile, so no optimization is permitted to
5118 remove the store. */
5119 rtx
5120 aarch64_eh_return_handler_rtx (void)
5122 rtx tmp = gen_frame_mem (Pmode,
5123 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
5125 /* Mark the store volatile, so no optimization is permitted to remove it. */
5126 MEM_VOLATILE_P (tmp) = true;
5127 return tmp;
5130 /* Output code to add DELTA to the first argument, and then jump
5131 to FUNCTION. Used for C++ multiple inheritance. */
5132 static void
5133 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
5134 HOST_WIDE_INT delta,
5135 HOST_WIDE_INT vcall_offset,
5136 tree function)
5138 /* The this pointer is always in x0. Note that this differs from
5139 Arm where the this pointer may be bumped to r1 if r0 is required
5140 to return a pointer to an aggregate. On AArch64 a result value
5141 pointer will be in x8. */
5142 int this_regno = R0_REGNUM;
5143 rtx this_rtx, temp0, temp1, addr, funexp;
5144 rtx_insn *insn;
5146 reload_completed = 1;
5147 emit_note (NOTE_INSN_PROLOGUE_END);
5149 this_rtx = gen_rtx_REG (Pmode, this_regno);
5150 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
5151 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
5153 if (vcall_offset == 0)
5154 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, temp0, false);
5155 else
5157 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
5159 addr = this_rtx;
5160 if (delta != 0)
5162 if (delta >= -256 && delta < 256)
5163 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
5164 plus_constant (Pmode, this_rtx, delta));
5165 else
5166 aarch64_add_offset (Pmode, this_rtx, this_rtx, delta,
5167 temp1, temp0, false);
5170 if (Pmode == ptr_mode)
5171 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
5172 else
5173 aarch64_emit_move (temp0,
5174 gen_rtx_ZERO_EXTEND (Pmode,
5175 gen_rtx_MEM (ptr_mode, addr)));
5177 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
5178 addr = plus_constant (Pmode, temp0, vcall_offset);
5179 else
5181 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
5182 Pmode);
5183 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
5186 if (Pmode == ptr_mode)
5187 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
5188 else
5189 aarch64_emit_move (temp1,
5190 gen_rtx_SIGN_EXTEND (Pmode,
5191 gen_rtx_MEM (ptr_mode, addr)));
5193 emit_insn (gen_add2_insn (this_rtx, temp1));
5196 /* Generate a tail call to the target function. */
5197 if (!TREE_USED (function))
5199 assemble_external (function);
5200 TREE_USED (function) = 1;
5202 funexp = XEXP (DECL_RTL (function), 0);
5203 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
5204 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
5205 SIBLING_CALL_P (insn) = 1;
5207 insn = get_insns ();
5208 shorten_branches (insn);
5209 final_start_function (insn, file, 1);
5210 final (insn, file, 1);
5211 final_end_function ();
5213 /* Stop pretending to be a post-reload pass. */
5214 reload_completed = 0;
5217 static bool
5218 aarch64_tls_referenced_p (rtx x)
5220 if (!TARGET_HAVE_TLS)
5221 return false;
5222 subrtx_iterator::array_type array;
5223 FOR_EACH_SUBRTX (iter, array, x, ALL)
5225 const_rtx x = *iter;
5226 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
5227 return true;
5228 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
5229 TLS offsets, not real symbol references. */
5230 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5231 iter.skip_subrtxes ();
5233 return false;
5237 /* Return true if val can be encoded as a 12-bit unsigned immediate with
5238 a left shift of 0 or 12 bits. */
5239 bool
5240 aarch64_uimm12_shift (HOST_WIDE_INT val)
5242 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
5243 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
5244 );
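/* Editor's sketch (not part of aarch64.c): the same test restated with
   plain uint64_t, plus example values.  */
#include <stdbool.h>
#include <stdint.h>

static bool
uimm12_shift_ok (uint64_t val)
{
  return (val & 0xfffull) == val
         || (val & (0xfffull << 12)) == val;
}

/* uimm12_shift_ok (0xabc)    -> true   add  x0, x1, #0xabc
   uimm12_shift_ok (0xabc000) -> true   add  x0, x1, #0xabc, lsl #12
   uimm12_shift_ok (0xabc001) -> false  needs more than one add/sub  */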
5248 /* Return true if val is an immediate that can be loaded into a
5249 register by a MOVZ instruction. */
5250 static bool
5251 aarch64_movw_imm (HOST_WIDE_INT val, scalar_int_mode mode)
5253 if (GET_MODE_SIZE (mode) > 4)
5255 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
5256 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
5257 return 1;
5259 else
5261 /* Ignore sign extension. */
5262 val &= (HOST_WIDE_INT) 0xffffffff;
5264 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
5265 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
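/* Editor's sketch (not part of aarch64.c): a MOVZ-loadable value has all of
   its set bits within one aligned 16-bit half-word; the 32-bit case above
   simply discards the sign-extension bits first.  */
#include <stdbool.h>
#include <stdint.h>

static bool
movz_loadable (uint64_t val)
{
  for (int shift = 0; shift < 64; shift += 16)
    if ((val & (0xffffull << shift)) == val)
      return true;
  return false;
}

/* movz_loadable (0x2a)               -> true   movz x0, #0x2a
   movz_loadable (0xbeef000000000000) -> true   movz x0, #0xbeef, lsl #48
   movz_loadable (0x10001)            -> false  spans two half-words  */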
5268 /* VAL is a value with the inner mode of MODE. Replicate it to fill a
5269 64-bit (DImode) integer. */
5271 static unsigned HOST_WIDE_INT
5272 aarch64_replicate_bitmask_imm (unsigned HOST_WIDE_INT val, machine_mode mode)
5274 unsigned int size = GET_MODE_UNIT_PRECISION (mode);
5275 while (size < 64)
5277 val &= (HOST_WIDE_INT_1U << size) - 1;
5278 val |= val << size;
5279 size *= 2;
5281 return val;
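/* Editor's sketch (not part of aarch64.c): the replication step widens a
   narrow element to 64 bits by repeated doubling, e.g. a 32-bit 0x0000ff00
   becomes 0x0000ff000000ff00 and a 16-bit 0x00f0 becomes
   0x00f000f000f000f0.  */
#include <stdint.h>

static uint64_t
replicate_to_64 (uint64_t val, unsigned elt_bits)
{
  for (unsigned size = elt_bits; size < 64; size *= 2)
    {
      val &= (1ull << size) - 1;   /* keep only the low SIZE bits */
      val |= val << size;          /* copy them into the next SIZE bits */
    }
  return val;
}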
5284 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
5286 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
5288 0x0000000100000001ull,
5289 0x0001000100010001ull,
5290 0x0101010101010101ull,
5291 0x1111111111111111ull,
5292 0x5555555555555555ull,
5296 /* Return true if val is a valid bitmask immediate. */
5298 bool
5299 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
5301 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
5302 int bits;
5304 /* Check for a single sequence of one bits and return quickly if so.
5305 The special cases of all ones and all zeroes return false. */
5306 val = aarch64_replicate_bitmask_imm (val_in, mode);
5307 tmp = val + (val & -val);
5309 if (tmp == (tmp & -tmp))
5310 return (val + 1) > 1;
5312 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
5313 if (mode == SImode)
5314 val = (val << 32) | (val & 0xffffffff);
5316 /* Invert if the immediate doesn't start with a zero bit - this means we
5317 only need to search for sequences of one bits. */
5318 if (val & 1)
5319 val = ~val;
5321 /* Find the first set bit and set tmp to val with the first sequence of one
5322 bits removed. Return success if there is a single sequence of ones. */
5323 first_one = val & -val;
5324 tmp = val & (val + first_one);
5326 if (tmp == 0)
5327 return true;
5329 /* Find the next set bit and compute the difference in bit position. */
5330 next_one = tmp & -tmp;
5331 bits = clz_hwi (first_one) - clz_hwi (next_one);
5332 mask = val ^ tmp;
5334 /* Check the bit position difference is a power of 2, and that the first
5335 sequence of one bits fits within 'bits' bits. */
5336 if ((mask >> bits) != 0 || bits != (bits & -bits))
5337 return false;
5339 /* Check the sequence of one bits is repeated 64/bits times. */
5340 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
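/* Editor's sketch (not part of aarch64.c): a slow brute-force cross-check
   of the property tested above.  A valid bitmask immediate is a repetition,
   with element size 2, 4, 8, 16, 32 or 64 bits, of a rotated contiguous run
   of ones that is neither all-zeros nor all-ones; for example
   0x00ff00ff00ff00ff repeats an 8-one run in 16-bit elements.  */
#include <stdbool.h>
#include <stdint.h>

static bool
bitmask_imm_bruteforce (uint64_t val)
{
  for (unsigned esize = 2; esize <= 64; esize *= 2)
    for (unsigned ones = 1; ones < esize; ones++)
      for (unsigned rot = 0; rot < esize; rot++)
        {
          uint64_t elt = (1ull << ones) - 1;     /* ONES low bits set */
          if (rot != 0)                          /* rotate right within the element */
            elt = (elt >> rot) | (elt << (esize - rot));
          if (esize < 64)
            elt &= (1ull << esize) - 1;
          uint64_t pattern = elt;                /* replicate across 64 bits */
          for (unsigned size = esize; size < 64; size *= 2)
            pattern |= pattern << size;
          if (pattern == val)
            return true;
        }
  return false;
}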
5343 /* Create a mask of ones covering the lowest to the highest bit set in VAL_IN.
5344 Assumed precondition: VAL_IN is not zero. */
5346 unsigned HOST_WIDE_INT
5347 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
5349 int lowest_bit_set = ctz_hwi (val_in);
5350 int highest_bit_set = floor_log2 (val_in);
5351 gcc_assert (val_in != 0);
5353 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
5354 (HOST_WIDE_INT_1U << lowest_bit_set));
5357 /* Create a constant in which all bits outside the range from the lowest to
5358 the highest bit set in VAL_IN are set to 1. */
5360 unsigned HOST_WIDE_INT
5361 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
5363 return val_in | ~aarch64_and_split_imm1 (val_in);
5366 /* Return true if VAL_IN is not encodable as a single immediate but can be split into two valid 'and' bitmask immediates. */
5368 bool
5369 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
5371 scalar_int_mode int_mode;
5372 if (!is_a <scalar_int_mode> (mode, &int_mode))
5373 return false;
5375 if (aarch64_bitmask_imm (val_in, int_mode))
5376 return false;
5378 if (aarch64_move_imm (val_in, int_mode))
5379 return false;
5381 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
5383 return aarch64_bitmask_imm (imm2, int_mode);
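/* Editor's sketch (not part of aarch64.c): a worked example of the split
   that the test above enables, using plain uint64_t and GCC builtins.  The
   value 0x00ff000000000f00 is neither a bitmask nor a MOVZ/MOVN immediate,
   but it splits into two encodable AND immediates.  */
#include <stdint.h>
#include <stdio.h>

static uint64_t
split_imm1 (uint64_t val)   /* contiguous mask, lowest..highest set bit */
{
  int lowest = __builtin_ctzll (val);
  int highest = 63 - __builtin_clzll (val);
  return (2ull << highest) - (1ull << lowest);
}

int
main (void)
{
  uint64_t val = 0x00ff000000000f00ull;
  uint64_t imm1 = split_imm1 (val);   /* 0x00ffffffffffff00: one run of ones */
  uint64_t imm2 = val | ~imm1;        /* 0xffff000000000fff: wrap-around run */
  /* and x0, x1, #imm1 ; and x0, x0, #imm2  computes  x1 & val.  */
  printf ("%s\n", (imm1 & imm2) == val ? "ok" : "bug");
  return 0;
}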
5386 /* Return true if val is an immediate that can be loaded into a
5387 register in a single instruction. */
5388 bool
5389 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
5391 scalar_int_mode int_mode;
5392 if (!is_a <scalar_int_mode> (mode, &int_mode))
5393 return false;
5395 if (aarch64_movw_imm (val, int_mode) || aarch64_movw_imm (~val, int_mode))
5396 return 1;
5397 return aarch64_bitmask_imm (val, int_mode);
5400 static bool
5401 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
5403 rtx base, offset;
5405 if (GET_CODE (x) == HIGH)
5406 return true;
5408 /* There's no way to calculate VL-based values using relocations. */
5409 subrtx_iterator::array_type array;
5410 FOR_EACH_SUBRTX (iter, array, x, ALL)
5411 if (GET_CODE (*iter) == CONST_POLY_INT)
5412 return true;
5414 split_const (x, &base, &offset);
5415 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
5417 if (aarch64_classify_symbol (base, INTVAL (offset))
5418 != SYMBOL_FORCE_TO_MEM)
5419 return true;
5420 else
5421 /* Avoid generating a 64-bit relocation in ILP32; leave
5422 to aarch64_expand_mov_immediate to handle it properly. */
5423 return mode != ptr_mode;
5426 return aarch64_tls_referenced_p (x);
5429 /* Implement TARGET_CASE_VALUES_THRESHOLD.
5430 The expansion for a table switch is quite expensive due to the number
5431 of instructions, the table lookup and the hard-to-predict indirect jump.
5432 When optimizing for speed at -O3 or higher, use the per-core tuning if
5433 set, otherwise use tables for > 16 cases as a tradeoff between size and
5434 performance. When optimizing for size, use the default setting. */
5436 static unsigned int
5437 aarch64_case_values_threshold (void)
5439 /* Use the specified limit for the number of cases before using jump
5440 tables at higher optimization levels. */
5441 if (optimize > 2
5442 && selected_cpu->tune->max_case_values != 0)
5443 return selected_cpu->tune->max_case_values;
5444 else
5445 return optimize_size ? default_case_values_threshold () : 17;
5448 /* Return true if register REGNO is a valid index register.
5449 STRICT_P is true if REG_OK_STRICT is in effect. */
5451 bool
5452 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
5454 if (!HARD_REGISTER_NUM_P (regno))
5456 if (!strict_p)
5457 return true;
5459 if (!reg_renumber)
5460 return false;
5462 regno = reg_renumber[regno];
5464 return GP_REGNUM_P (regno);
5467 /* Return true if register REGNO is a valid base register.
5468 STRICT_P is true if REG_OK_STRICT is in effect. */
5470 bool
5471 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
5473 if (!HARD_REGISTER_NUM_P (regno))
5475 if (!strict_p)
5476 return true;
5478 if (!reg_renumber)
5479 return false;
5481 regno = reg_renumber[regno];
5484 /* The fake registers will be eliminated to either the stack or
5485 hard frame pointer, both of which are usually valid base registers.
5486 Reload deals with the cases where the eliminated form isn't valid. */
5487 return (GP_REGNUM_P (regno)
5488 || regno == SP_REGNUM
5489 || regno == FRAME_POINTER_REGNUM
5490 || regno == ARG_POINTER_REGNUM);
5493 /* Return true if X is a valid base register.
5494 STRICT_P is true if REG_OK_STRICT is in effect. */
5496 static bool
5497 aarch64_base_register_rtx_p (rtx x, bool strict_p)
5499 if (!strict_p
5500 && GET_CODE (x) == SUBREG
5501 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (x))])
5502 x = SUBREG_REG (x);
5504 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
5507 /* Return true if address offset is a valid index. If it is, fill in INFO
5508 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
5510 static bool
5511 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
5512 machine_mode mode, bool strict_p)
5514 enum aarch64_address_type type;
5515 rtx index;
5516 int shift;
5518 /* (reg:P) */
5519 if ((REG_P (x) || GET_CODE (x) == SUBREG)
5520 && GET_MODE (x) == Pmode)
5522 type = ADDRESS_REG_REG;
5523 index = x;
5524 shift = 0;
5526 /* (sign_extend:DI (reg:SI)) */
5527 else if ((GET_CODE (x) == SIGN_EXTEND
5528 || GET_CODE (x) == ZERO_EXTEND)
5529 && GET_MODE (x) == DImode
5530 && GET_MODE (XEXP (x, 0)) == SImode)
5532 type = (GET_CODE (x) == SIGN_EXTEND)
5533 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5534 index = XEXP (x, 0);
5535 shift = 0;
5537 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
5538 else if (GET_CODE (x) == MULT
5539 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5540 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5541 && GET_MODE (XEXP (x, 0)) == DImode
5542 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5543 && CONST_INT_P (XEXP (x, 1)))
5545 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5546 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5547 index = XEXP (XEXP (x, 0), 0);
5548 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5550 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
5551 else if (GET_CODE (x) == ASHIFT
5552 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
5553 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
5554 && GET_MODE (XEXP (x, 0)) == DImode
5555 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
5556 && CONST_INT_P (XEXP (x, 1)))
5558 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
5559 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5560 index = XEXP (XEXP (x, 0), 0);
5561 shift = INTVAL (XEXP (x, 1));
5563 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
5564 else if ((GET_CODE (x) == SIGN_EXTRACT
5565 || GET_CODE (x) == ZERO_EXTRACT)
5566 && GET_MODE (x) == DImode
5567 && GET_CODE (XEXP (x, 0)) == MULT
5568 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5569 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5571 type = (GET_CODE (x) == SIGN_EXTRACT)
5572 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5573 index = XEXP (XEXP (x, 0), 0);
5574 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5575 if (INTVAL (XEXP (x, 1)) != 32 + shift
5576 || INTVAL (XEXP (x, 2)) != 0)
5577 shift = -1;
5579 /* (and:DI (mult:DI (reg:DI) (const_int scale))
5580 (const_int 0xffffffff<<shift)) */
5581 else if (GET_CODE (x) == AND
5582 && GET_MODE (x) == DImode
5583 && GET_CODE (XEXP (x, 0)) == MULT
5584 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5585 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5586 && CONST_INT_P (XEXP (x, 1)))
5588 type = ADDRESS_REG_UXTW;
5589 index = XEXP (XEXP (x, 0), 0);
5590 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
5591 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5592 shift = -1;
5594 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
5595 else if ((GET_CODE (x) == SIGN_EXTRACT
5596 || GET_CODE (x) == ZERO_EXTRACT)
5597 && GET_MODE (x) == DImode
5598 && GET_CODE (XEXP (x, 0)) == ASHIFT
5599 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5600 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
5602 type = (GET_CODE (x) == SIGN_EXTRACT)
5603 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
5604 index = XEXP (XEXP (x, 0), 0);
5605 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5606 if (INTVAL (XEXP (x, 1)) != 32 + shift
5607 || INTVAL (XEXP (x, 2)) != 0)
5608 shift = -1;
5610 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
5611 (const_int 0xffffffff<<shift)) */
5612 else if (GET_CODE (x) == AND
5613 && GET_MODE (x) == DImode
5614 && GET_CODE (XEXP (x, 0)) == ASHIFT
5615 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
5616 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
5617 && CONST_INT_P (XEXP (x, 1)))
5619 type = ADDRESS_REG_UXTW;
5620 index = XEXP (XEXP (x, 0), 0);
5621 shift = INTVAL (XEXP (XEXP (x, 0), 1));
5622 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
5623 shift = -1;
5625 /* (mult:P (reg:P) (const_int scale)) */
5626 else if (GET_CODE (x) == MULT
5627 && GET_MODE (x) == Pmode
5628 && GET_MODE (XEXP (x, 0)) == Pmode
5629 && CONST_INT_P (XEXP (x, 1)))
5631 type = ADDRESS_REG_REG;
5632 index = XEXP (x, 0);
5633 shift = exact_log2 (INTVAL (XEXP (x, 1)));
5635 /* (ashift:P (reg:P) (const_int shift)) */
5636 else if (GET_CODE (x) == ASHIFT
5637 && GET_MODE (x) == Pmode
5638 && GET_MODE (XEXP (x, 0)) == Pmode
5639 && CONST_INT_P (XEXP (x, 1)))
5641 type = ADDRESS_REG_REG;
5642 index = XEXP (x, 0);
5643 shift = INTVAL (XEXP (x, 1));
5645 else
5646 return false;
5648 if (!strict_p
5649 && GET_CODE (index) == SUBREG
5650 && contains_reg_of_mode[GENERAL_REGS][GET_MODE (SUBREG_REG (index))])
5651 index = SUBREG_REG (index);
5653 if (aarch64_sve_data_mode_p (mode))
5655 if (type != ADDRESS_REG_REG
5656 || (1 << shift) != GET_MODE_UNIT_SIZE (mode))
5657 return false;
5659 else
5661 if (shift != 0
5662 && !(IN_RANGE (shift, 1, 3)
5663 && known_eq (1 << shift, GET_MODE_SIZE (mode))))
5664 return false;
5667 if (REG_P (index)
5668 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
5670 info->type = type;
5671 info->offset = index;
5672 info->shift = shift;
5673 return true;
5676 return false;
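/* Hedged example (added for illustration, not in the original sources):
   for a 4-byte access whose index rtx is

     (mult:DI (zero_extend:DI (reg:SI w2)) (const_int 4))

   the code above classifies the index as ADDRESS_REG_UXTW with shift 2,
   and the operand printer later emits the "[x1, w2, uxtw 2]" addressing
   form (assuming x1 is the base register supplied by the caller).  */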
5679 /* Return true if MODE is one of the modes for which we
5680 support LDP/STP operations. */
5682 static bool
5683 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
5685 return mode == SImode || mode == DImode
5686 || mode == SFmode || mode == DFmode
5687 || (aarch64_vector_mode_supported_p (mode)
5688 && (known_eq (GET_MODE_SIZE (mode), 8)
5689 || (known_eq (GET_MODE_SIZE (mode), 16)
5690 && (aarch64_tune_params.extra_tuning_flags
5691 & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0)));
5694 /* Return true if REGNO is a virtual pointer register, or an eliminable
5695 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
5696 include stack_pointer or hard_frame_pointer. */
5697 static bool
5698 virt_or_elim_regno_p (unsigned regno)
5700 return ((regno >= FIRST_VIRTUAL_REGISTER
5701 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
5702 || regno == FRAME_POINTER_REGNUM
5703 || regno == ARG_POINTER_REGNUM);
5706 /* Return true if X is a valid address of type TYPE for machine mode MODE.
5707 If it is, fill in INFO appropriately. STRICT_P is true if
5708 REG_OK_STRICT is in effect. */
5710 bool
5711 aarch64_classify_address (struct aarch64_address_info *info,
5712 rtx x, machine_mode mode, bool strict_p,
5713 aarch64_addr_query_type type)
5715 enum rtx_code code = GET_CODE (x);
5716 rtx op0, op1;
5717 poly_int64 offset;
5719 HOST_WIDE_INT const_size;
5721 /* On BE, we use load/store pair for all large int mode load/stores.
5722 TI/TFmode may also use a load/store pair. */
5723 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
5724 bool advsimd_struct_p = (vec_flags == (VEC_ADVSIMD | VEC_STRUCT));
5725 bool load_store_pair_p = (type == ADDR_QUERY_LDP_STP
5726 || type == ADDR_QUERY_LDP_STP_N
5727 || mode == TImode
5728 || mode == TFmode
5729 || (BYTES_BIG_ENDIAN && advsimd_struct_p));
5731 /* An ADDR_QUERY_LDP_STP_N query means that the incoming mode describes
5732 the full size of the memory being loaded/stored, while the mode used
5733 for the addressing calculation is half of that. */
5734 if (type == ADDR_QUERY_LDP_STP_N
5735 && known_eq (GET_MODE_SIZE (mode), 16))
5736 mode = DFmode;
5738 bool allow_reg_index_p = (!load_store_pair_p
5739 && (known_lt (GET_MODE_SIZE (mode), 16)
5740 || vec_flags == VEC_ADVSIMD
5741 || vec_flags == VEC_SVE_DATA));
5743 /* For SVE, only accept [Rn], [Rn, Rm, LSL #shift] and
5744 [Rn, #offset, MUL VL]. */
5745 if ((vec_flags & (VEC_SVE_DATA | VEC_SVE_PRED)) != 0
5746 && (code != REG && code != PLUS))
5747 return false;
5749 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
5750 REG addressing. */
5751 if (advsimd_struct_p
5752 && !BYTES_BIG_ENDIAN
5753 && (code != POST_INC && code != REG))
5754 return false;
5756 gcc_checking_assert (GET_MODE (x) == VOIDmode
5757 || SCALAR_INT_MODE_P (GET_MODE (x)));
5759 switch (code)
5761 case REG:
5762 case SUBREG:
5763 info->type = ADDRESS_REG_IMM;
5764 info->base = x;
5765 info->offset = const0_rtx;
5766 info->const_offset = 0;
5767 return aarch64_base_register_rtx_p (x, strict_p);
5769 case PLUS:
5770 op0 = XEXP (x, 0);
5771 op1 = XEXP (x, 1);
5773 if (! strict_p
5774 && REG_P (op0)
5775 && virt_or_elim_regno_p (REGNO (op0))
5776 && poly_int_rtx_p (op1, &offset))
5778 info->type = ADDRESS_REG_IMM;
5779 info->base = op0;
5780 info->offset = op1;
5781 info->const_offset = offset;
5783 return true;
5786 if (maybe_ne (GET_MODE_SIZE (mode), 0)
5787 && aarch64_base_register_rtx_p (op0, strict_p)
5788 && poly_int_rtx_p (op1, &offset))
5790 info->type = ADDRESS_REG_IMM;
5791 info->base = op0;
5792 info->offset = op1;
5793 info->const_offset = offset;
5795 /* TImode and TFmode values are allowed in both pairs of X
5796 registers and individual Q registers. The available
5797 address modes are:
5798 X,X: 7-bit signed scaled offset
5799 Q: 9-bit signed offset
5800 We conservatively require an offset representable in either mode.
5801 When performing the check for pairs of X registers i.e. LDP/STP
5802 pass down DImode since that is the natural size of the LDP/STP
5803 instruction memory accesses. */
5804 if (mode == TImode || mode == TFmode)
5805 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
5806 && (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
5807 || offset_12bit_unsigned_scaled_p (mode, offset)));
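/* Illustrative note (added, not from the original sources): for a pair of
   X registers the 7-bit signed scaled offset covers -512..504 in steps of 8,
   while the single-Q-register forms cover -256..255 (unscaled 9-bit) or
   0..65520 in steps of 16 (unsigned scaled 12-bit); e.g. offset 40 passes
   both checks above, whereas offset 520 fails the first one.  */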
5809 /* A 7-bit offset check because OImode will emit an ldp/stp
5810 instruction (only big endian will get here).
5811 For ldp/stp instructions, the offset is scaled by the size of a
5812 single element of the pair. */
5813 if (mode == OImode)
5814 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
5816 /* Three 9/12-bit offset checks because CImode will emit three
5817 ldr/str instructions (only big endian will get here). */
5818 if (mode == CImode)
5819 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5820 && (aarch64_offset_9bit_signed_unscaled_p (V16QImode,
5821 offset + 32)
5822 || offset_12bit_unsigned_scaled_p (V16QImode,
5823 offset + 32)));
5825 /* Two 7-bit offset checks because XImode will emit two ldp/stp
5826 instructions (only big endian will get here). */
5827 if (mode == XImode)
5828 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
5829 && aarch64_offset_7bit_signed_scaled_p (TImode,
5830 offset + 32));
5832 /* Make "m" use the LD1 offset range for SVE data modes, so
5833 that pre-RTL optimizers like ivopts work to that range
5834 instead of the wider LDR/STR range. */
5835 if (vec_flags == VEC_SVE_DATA)
5836 return (type == ADDR_QUERY_M
5837 ? offset_4bit_signed_scaled_p (mode, offset)
5838 : offset_9bit_signed_scaled_p (mode, offset));
5840 if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
5842 poly_int64 end_offset = (offset
5843 + GET_MODE_SIZE (mode)
5844 - BYTES_PER_SVE_VECTOR);
5845 return (type == ADDR_QUERY_M
5846 ? offset_4bit_signed_scaled_p (mode, offset)
5847 : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
5848 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
5849 end_offset)));
5852 if (vec_flags == VEC_SVE_PRED)
5853 return offset_9bit_signed_scaled_p (mode, offset);
5855 if (load_store_pair_p)
5856 return ((known_eq (GET_MODE_SIZE (mode), 4)
5857 || known_eq (GET_MODE_SIZE (mode), 8)
5858 || known_eq (GET_MODE_SIZE (mode), 16))
5859 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5860 else
5861 return (aarch64_offset_9bit_signed_unscaled_p (mode, offset)
5862 || offset_12bit_unsigned_scaled_p (mode, offset));
5865 if (allow_reg_index_p)
5867 /* Look for base + (scaled/extended) index register. */
5868 if (aarch64_base_register_rtx_p (op0, strict_p)
5869 && aarch64_classify_index (info, op1, mode, strict_p))
5871 info->base = op0;
5872 return true;
5874 if (aarch64_base_register_rtx_p (op1, strict_p)
5875 && aarch64_classify_index (info, op0, mode, strict_p))
5877 info->base = op1;
5878 return true;
5882 return false;
5884 case POST_INC:
5885 case POST_DEC:
5886 case PRE_INC:
5887 case PRE_DEC:
5888 info->type = ADDRESS_REG_WB;
5889 info->base = XEXP (x, 0);
5890 info->offset = NULL_RTX;
5891 return aarch64_base_register_rtx_p (info->base, strict_p);
5893 case POST_MODIFY:
5894 case PRE_MODIFY:
5895 info->type = ADDRESS_REG_WB;
5896 info->base = XEXP (x, 0);
5897 if (GET_CODE (XEXP (x, 1)) == PLUS
5898 && poly_int_rtx_p (XEXP (XEXP (x, 1), 1), &offset)
5899 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
5900 && aarch64_base_register_rtx_p (info->base, strict_p))
5902 info->offset = XEXP (XEXP (x, 1), 1);
5903 info->const_offset = offset;
5905 /* TImode and TFmode values are allowed in both pairs of X
5906 registers and individual Q registers. The available
5907 address modes are:
5908 X,X: 7-bit signed scaled offset
5909 Q: 9-bit signed offset
5910 We conservatively require an offset representable in either mode. */
5912 if (mode == TImode || mode == TFmode)
5913 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
5914 && aarch64_offset_9bit_signed_unscaled_p (mode, offset));
5916 if (load_store_pair_p)
5917 return ((known_eq (GET_MODE_SIZE (mode), 4)
5918 || known_eq (GET_MODE_SIZE (mode), 8)
5919 || known_eq (GET_MODE_SIZE (mode), 16))
5920 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
5921 else
5922 return aarch64_offset_9bit_signed_unscaled_p (mode, offset);
5924 return false;
5926 case CONST:
5927 case SYMBOL_REF:
5928 case LABEL_REF:
5929 /* load literal: pc-relative constant pool entry. Only supported
5930 for SI mode or larger. */
5931 info->type = ADDRESS_SYMBOLIC;
5933 if (!load_store_pair_p
5934 && GET_MODE_SIZE (mode).is_constant (&const_size)
5935 && const_size >= 4)
5937 rtx sym, addend;
5939 split_const (x, &sym, &addend);
5940 return ((GET_CODE (sym) == LABEL_REF
5941 || (GET_CODE (sym) == SYMBOL_REF
5942 && CONSTANT_POOL_ADDRESS_P (sym)
5943 && aarch64_pcrelative_literal_loads)));
5945 return false;
5947 case LO_SUM:
5948 info->type = ADDRESS_LO_SUM;
5949 info->base = XEXP (x, 0);
5950 info->offset = XEXP (x, 1);
5951 if (allow_reg_index_p
5952 && aarch64_base_register_rtx_p (info->base, strict_p))
5954 rtx sym, offs;
5955 split_const (info->offset, &sym, &offs);
5956 if (GET_CODE (sym) == SYMBOL_REF
5957 && (aarch64_classify_symbol (sym, INTVAL (offs))
5958 == SYMBOL_SMALL_ABSOLUTE))
5960 /* The symbol and offset must be aligned to the access size. */
5961 unsigned int align;
5963 if (CONSTANT_POOL_ADDRESS_P (sym))
5964 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
5965 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
5967 tree exp = SYMBOL_REF_DECL (sym);
5968 align = TYPE_ALIGN (TREE_TYPE (exp));
5969 align = aarch64_constant_alignment (exp, align);
5971 else if (SYMBOL_REF_DECL (sym))
5972 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
5973 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
5974 && SYMBOL_REF_BLOCK (sym) != NULL)
5975 align = SYMBOL_REF_BLOCK (sym)->alignment;
5976 else
5977 align = BITS_PER_UNIT;
5979 poly_int64 ref_size = GET_MODE_SIZE (mode);
5980 if (known_eq (ref_size, 0))
5981 ref_size = GET_MODE_SIZE (DImode);
5983 return (multiple_p (INTVAL (offs), ref_size)
5984 && multiple_p (align / BITS_PER_UNIT, ref_size));
5987 return false;
5989 default:
5990 return false;
5994 /* Return true if the address X is valid for a PRFM instruction.
5995 STRICT_P is true if we should do strict checking with
5996 aarch64_classify_address. */
5998 bool
5999 aarch64_address_valid_for_prefetch_p (rtx x, bool strict_p)
6001 struct aarch64_address_info addr;
6003 /* PRFM accepts the same addresses as DImode... */
6004 bool res = aarch64_classify_address (&addr, x, DImode, strict_p);
6005 if (!res)
6006 return false;
6008 /* ... except writeback forms. */
6009 return addr.type != ADDRESS_REG_WB;
6012 bool
6013 aarch64_symbolic_address_p (rtx x)
6015 rtx offset;
6017 split_const (x, &x, &offset);
6018 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
6021 /* Classify the base of symbolic expression X. */
6023 enum aarch64_symbol_type
6024 aarch64_classify_symbolic_expression (rtx x)
6026 rtx offset;
6028 split_const (x, &x, &offset);
6029 return aarch64_classify_symbol (x, INTVAL (offset));
6033 /* Return TRUE if X is a legitimate address for accessing memory in
6034 mode MODE. */
6035 static bool
6036 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
6038 struct aarch64_address_info addr;
6040 return aarch64_classify_address (&addr, x, mode, strict_p);
6043 /* Return TRUE if X is a legitimate address of type TYPE for accessing
6044 memory in mode MODE. STRICT_P is true if REG_OK_STRICT is in effect. */
6045 bool
6046 aarch64_legitimate_address_p (machine_mode mode, rtx x, bool strict_p,
6047 aarch64_addr_query_type type)
6049 struct aarch64_address_info addr;
6051 return aarch64_classify_address (&addr, x, mode, strict_p, type);
6054 /* Implement TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT. */
6056 static bool
6057 aarch64_legitimize_address_displacement (rtx *offset1, rtx *offset2,
6058 poly_int64 orig_offset,
6059 machine_mode mode)
6061 HOST_WIDE_INT size;
6062 if (GET_MODE_SIZE (mode).is_constant (&size))
6064 HOST_WIDE_INT const_offset, second_offset;
6066 /* A general SVE offset is A * VQ + B. Remove the A component from
6067 coefficient 0 in order to get the constant B. */
6068 const_offset = orig_offset.coeffs[0] - orig_offset.coeffs[1];
6070 /* Split an out-of-range address displacement into a base and
6071 offset. Use a 4KB range for 1- and 2-byte accesses and a 16KB
6072 range otherwise, to increase the chance that accesses of different
6073 sizes share the same base address. Unaligned accesses use the
6074 signed 9-bit range; TImode/TFmode use the intersection of the
6075 signed scaled 7-bit and signed 9-bit offset ranges. */
6076 if (mode == TImode || mode == TFmode)
6077 second_offset = ((const_offset + 0x100) & 0x1f8) - 0x100;
6078 else if ((const_offset & (size - 1)) != 0)
6079 second_offset = ((const_offset + 0x100) & 0x1ff) - 0x100;
6080 else
6081 second_offset = const_offset & (size < 4 ? 0xfff : 0x3ffc);
6083 if (second_offset == 0 || known_eq (orig_offset, second_offset))
6084 return false;
6086 /* Split the offset into second_offset and the rest. */
6087 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6088 *offset2 = gen_int_mode (second_offset, Pmode);
6089 return true;
6091 else
6093 /* Get the mode we should use as the basis of the range. For structure
6094 modes this is the mode of one vector. */
6095 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
6096 machine_mode step_mode
6097 = (vec_flags & VEC_STRUCT) != 0 ? SVE_BYTE_MODE : mode;
6099 /* Get the "mul vl" multiplier we'd like to use. */
6100 HOST_WIDE_INT factor = GET_MODE_SIZE (step_mode).coeffs[1];
6101 HOST_WIDE_INT vnum = orig_offset.coeffs[1] / factor;
6102 if (vec_flags & VEC_SVE_DATA)
6103 /* LDR supports a 9-bit range, but the move patterns for
6104 structure modes require all vectors to be in range of the
6105 same base. The simplest way of accommodating that while still
6106 promoting reuse of anchor points between different modes is
6107 to use an 8-bit range unconditionally. */
6108 vnum = ((vnum + 128) & 255) - 128;
6109 else
6110 /* Predicates are only handled singly, so we might as well use
6111 the full range. */
6112 vnum = ((vnum + 256) & 511) - 256;
6113 if (vnum == 0)
6114 return false;
6116 /* Convert the "mul vl" multiplier into a byte offset. */
6117 poly_int64 second_offset = GET_MODE_SIZE (step_mode) * vnum;
6118 if (known_eq (second_offset, orig_offset))
6119 return false;
6121 /* Split the offset into second_offset and the rest. */
6122 *offset1 = gen_int_mode (orig_offset - second_offset, Pmode);
6123 *offset2 = gen_int_mode (second_offset, Pmode);
6124 return true;
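/* Hedged worked example (added for illustration, not in the original
   sources): for a 4-byte access at constant orig_offset 0x12344, the
   aligned case above computes second_offset = 0x12344 & 0x3ffc = 0x2344,
   so the displacement is split into *offset1 = 0x10000 (added to the base)
   and *offset2 = 0x2344, which fits the unsigned scaled 12-bit LDR/STR
   offset range for a 4-byte access.  */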
6128 /* Return the binary representation of floating point constant VALUE in INTVAL.
6129 If the value cannot be converted, return false without setting INTVAL.
6130 The conversion is done in the given MODE. */
6131 bool
6132 aarch64_reinterpret_float_as_int (rtx value, unsigned HOST_WIDE_INT *intval)
6135 /* We make a general exception for 0. */
6136 if (aarch64_float_const_zero_rtx_p (value))
6138 *intval = 0;
6139 return true;
6142 scalar_float_mode mode;
6143 if (GET_CODE (value) != CONST_DOUBLE
6144 || !is_a <scalar_float_mode> (GET_MODE (value), &mode)
6145 || GET_MODE_BITSIZE (mode) > HOST_BITS_PER_WIDE_INT
6146 /* Only support up to DF mode. */
6147 || GET_MODE_BITSIZE (mode) > GET_MODE_BITSIZE (DFmode))
6148 return false;
6150 unsigned HOST_WIDE_INT ival = 0;
6152 long res[2];
6153 real_to_target (res,
6154 CONST_DOUBLE_REAL_VALUE (value),
6155 REAL_MODE_FORMAT (mode));
6157 if (mode == DFmode)
6159 int order = BYTES_BIG_ENDIAN ? 1 : 0;
6160 ival = zext_hwi (res[order], 32);
6161 ival |= (zext_hwi (res[1 - order], 32) << 32);
6163 else
6164 ival = zext_hwi (res[0], 32);
6166 *intval = ival;
6167 return true;
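/* Hedged example (added for illustration): for the DFmode constant 1.0 the
   routine above stores the IEEE double-precision bit pattern
   0x3ff0000000000000 in *INTVAL, and for SFmode 1.0 it stores 0x3f800000;
   a constant wider than DFmode (e.g. TFmode) is rejected.  */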
6170 /* Return TRUE if rtx X is an immediate constant that can be moved using a
6171 single MOV(+MOVK) followed by an FMOV. */
6172 bool
6173 aarch64_float_const_rtx_p (rtx x)
6175 machine_mode mode = GET_MODE (x);
6176 if (mode == VOIDmode)
6177 return false;
6179 /* Determine whether it's cheaper to write float constants as
6180 mov/movk pairs over ldr/adrp pairs. */
6181 unsigned HOST_WIDE_INT ival;
6183 if (GET_CODE (x) == CONST_DOUBLE
6184 && SCALAR_FLOAT_MODE_P (mode)
6185 && aarch64_reinterpret_float_as_int (x, &ival))
6187 scalar_int_mode imode = (mode == HFmode
6188 ? SImode
6189 : int_mode_for_mode (mode).require ());
6190 int num_instr = aarch64_internal_mov_immediate
6191 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
6192 return num_instr < 3;
6195 return false;
6198 /* Return TRUE if rtx X is immediate constant 0.0 */
6199 bool
6200 aarch64_float_const_zero_rtx_p (rtx x)
6202 if (GET_MODE (x) == VOIDmode)
6203 return false;
6205 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
6206 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
6207 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
6210 /* Return TRUE if rtx X is an immediate constant that fits in a single
6211 MOVI immediate operation. */
6212 bool
6213 aarch64_can_const_movi_rtx_p (rtx x, machine_mode mode)
6215 if (!TARGET_SIMD)
6216 return false;
6218 machine_mode vmode;
6219 scalar_int_mode imode;
6220 unsigned HOST_WIDE_INT ival;
6222 if (GET_CODE (x) == CONST_DOUBLE
6223 && SCALAR_FLOAT_MODE_P (mode))
6225 if (!aarch64_reinterpret_float_as_int (x, &ival))
6226 return false;
6228 /* We make a general exception for 0. */
6229 if (aarch64_float_const_zero_rtx_p (x))
6230 return true;
6232 imode = int_mode_for_mode (mode).require ();
6234 else if (GET_CODE (x) == CONST_INT
6235 && is_a <scalar_int_mode> (mode, &imode))
6236 ival = INTVAL (x);
6237 else
6238 return false;
6240 /* Use a 64-bit container mode for everything except DI/DF mode, where we
6241 use a 128-bit vector mode. */
6242 int width = GET_MODE_BITSIZE (imode) == 64 ? 128 : 64;
6244 vmode = aarch64_simd_container_mode (imode, width);
6245 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, ival);
6247 return aarch64_simd_valid_immediate (v_op, NULL);
6251 /* Return the fixed registers used for condition codes. */
6253 static bool
6254 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
6256 *p1 = CC_REGNUM;
6257 *p2 = INVALID_REGNUM;
6258 return true;
6261 /* This function is used by the call expanders of the machine description.
6262 RESULT is the register in which the result is returned. It's NULL for
6263 "call" and "sibcall".
6264 MEM is the location of the function call.
6265 SIBCALL indicates whether this is a normal call or a sibling call;
6266 a different pattern is generated accordingly. */
6268 void
6269 aarch64_expand_call (rtx result, rtx mem, bool sibcall)
6271 rtx call, callee, tmp;
6272 rtvec vec;
6273 machine_mode mode;
6275 gcc_assert (MEM_P (mem));
6276 callee = XEXP (mem, 0);
6277 mode = GET_MODE (callee);
6278 gcc_assert (mode == Pmode);
6280 /* Decide if we should generate indirect calls by loading the
6281 address of the callee into a register before performing
6282 the branch-and-link. */
6283 if (SYMBOL_REF_P (callee)
6284 ? (aarch64_is_long_call_p (callee)
6285 || aarch64_is_noplt_call_p (callee))
6286 : !REG_P (callee))
6287 XEXP (mem, 0) = force_reg (mode, callee);
6289 call = gen_rtx_CALL (VOIDmode, mem, const0_rtx);
6291 if (result != NULL_RTX)
6292 call = gen_rtx_SET (result, call);
6294 if (sibcall)
6295 tmp = ret_rtx;
6296 else
6297 tmp = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (Pmode, LR_REGNUM));
6299 vec = gen_rtvec (2, call, tmp);
6300 call = gen_rtx_PARALLEL (VOIDmode, vec);
6302 aarch64_emit_call_insn (call);
6305 /* Emit call insn with PAT and do aarch64-specific handling. */
6307 void
6308 aarch64_emit_call_insn (rtx pat)
6310 rtx insn = emit_call_insn (pat);
6312 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
6313 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
6314 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
6317 machine_mode
6318 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
6320 /* All floating point compares return CCFP if it is an equality
6321 comparison, and CCFPE otherwise. */
6322 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
6324 switch (code)
6326 case EQ:
6327 case NE:
6328 case UNORDERED:
6329 case ORDERED:
6330 case UNLT:
6331 case UNLE:
6332 case UNGT:
6333 case UNGE:
6334 case UNEQ:
6335 return CCFPmode;
6337 case LT:
6338 case LE:
6339 case GT:
6340 case GE:
6341 case LTGT:
6342 return CCFPEmode;
6344 default:
6345 gcc_unreachable ();
6349 /* Equality comparisons of short modes against zero can be performed
6350 using the TST instruction with the appropriate bitmask. */
6351 if (y == const0_rtx && REG_P (x)
6352 && (code == EQ || code == NE)
6353 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
6354 return CC_NZmode;
6356 /* Similarly, comparisons of zero_extends from shorter modes can
6357 be performed using an ANDS with an immediate mask. */
6358 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
6359 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6360 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
6361 && (code == EQ || code == NE))
6362 return CC_NZmode;
6364 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6365 && y == const0_rtx
6366 && (code == EQ || code == NE || code == LT || code == GE)
6367 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
6368 || GET_CODE (x) == NEG
6369 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
6370 && CONST_INT_P (XEXP (x, 2)))))
6371 return CC_NZmode;
6373 /* A compare with a shifted operand. Because of canonicalization,
6374 the comparison will have to be swapped when we emit the assembly
6375 code. */
6376 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6377 && (REG_P (y) || GET_CODE (y) == SUBREG || y == const0_rtx)
6378 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
6379 || GET_CODE (x) == LSHIFTRT
6380 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
6381 return CC_SWPmode;
6383 /* Similarly for a negated operand, but we can only do this for
6384 equalities. */
6385 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
6386 && (REG_P (y) || GET_CODE (y) == SUBREG)
6387 && (code == EQ || code == NE)
6388 && GET_CODE (x) == NEG)
6389 return CC_Zmode;
6391 /* A test for unsigned overflow. */
6392 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6393 && code == NE
6394 && GET_CODE (x) == PLUS
6395 && GET_CODE (y) == ZERO_EXTEND)
6396 return CC_Cmode;
6398 /* A test for signed overflow. */
6399 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
6400 && code == NE
6401 && GET_CODE (x) == PLUS
6402 && GET_CODE (y) == SIGN_EXTEND)
6403 return CC_Vmode;
6405 /* For everything else, return CCmode. */
6406 return CCmode;
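/* Hedged example (added for illustration, not in the original sources):
   for CODE GT with X being (ashift:SI (reg) (const_int 2)) and Y a plain
   register, the function above returns CC_SWPmode, recording that the
   operands will be swapped when the comparison is emitted (roughly
   "cmp y, x_reg, lsl 2"), so GT is later mapped to the LT condition by
   aarch64_get_condition_code_1.  */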
6409 static int
6410 aarch64_get_condition_code_1 (machine_mode, enum rtx_code);
6413 aarch64_get_condition_code (rtx x)
6415 machine_mode mode = GET_MODE (XEXP (x, 0));
6416 enum rtx_code comp_code = GET_CODE (x);
6418 if (GET_MODE_CLASS (mode) != MODE_CC)
6419 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
6420 return aarch64_get_condition_code_1 (mode, comp_code);
6423 static int
6424 aarch64_get_condition_code_1 (machine_mode mode, enum rtx_code comp_code)
6426 switch (mode)
6428 case E_CCFPmode:
6429 case E_CCFPEmode:
6430 switch (comp_code)
6432 case GE: return AARCH64_GE;
6433 case GT: return AARCH64_GT;
6434 case LE: return AARCH64_LS;
6435 case LT: return AARCH64_MI;
6436 case NE: return AARCH64_NE;
6437 case EQ: return AARCH64_EQ;
6438 case ORDERED: return AARCH64_VC;
6439 case UNORDERED: return AARCH64_VS;
6440 case UNLT: return AARCH64_LT;
6441 case UNLE: return AARCH64_LE;
6442 case UNGT: return AARCH64_HI;
6443 case UNGE: return AARCH64_PL;
6444 default: return -1;
6446 break;
6448 case E_CCmode:
6449 switch (comp_code)
6451 case NE: return AARCH64_NE;
6452 case EQ: return AARCH64_EQ;
6453 case GE: return AARCH64_GE;
6454 case GT: return AARCH64_GT;
6455 case LE: return AARCH64_LE;
6456 case LT: return AARCH64_LT;
6457 case GEU: return AARCH64_CS;
6458 case GTU: return AARCH64_HI;
6459 case LEU: return AARCH64_LS;
6460 case LTU: return AARCH64_CC;
6461 default: return -1;
6463 break;
6465 case E_CC_SWPmode:
6466 switch (comp_code)
6468 case NE: return AARCH64_NE;
6469 case EQ: return AARCH64_EQ;
6470 case GE: return AARCH64_LE;
6471 case GT: return AARCH64_LT;
6472 case LE: return AARCH64_GE;
6473 case LT: return AARCH64_GT;
6474 case GEU: return AARCH64_LS;
6475 case GTU: return AARCH64_CC;
6476 case LEU: return AARCH64_CS;
6477 case LTU: return AARCH64_HI;
6478 default: return -1;
6480 break;
6482 case E_CC_NZmode:
6483 switch (comp_code)
6485 case NE: return AARCH64_NE;
6486 case EQ: return AARCH64_EQ;
6487 case GE: return AARCH64_PL;
6488 case LT: return AARCH64_MI;
6489 default: return -1;
6491 break;
6493 case E_CC_Zmode:
6494 switch (comp_code)
6496 case NE: return AARCH64_NE;
6497 case EQ: return AARCH64_EQ;
6498 default: return -1;
6500 break;
6502 case E_CC_Cmode:
6503 switch (comp_code)
6505 case NE: return AARCH64_CS;
6506 case EQ: return AARCH64_CC;
6507 default: return -1;
6509 break;
6511 case E_CC_Vmode:
6512 switch (comp_code)
6514 case NE: return AARCH64_VS;
6515 case EQ: return AARCH64_VC;
6516 default: return -1;
6518 break;
6520 default:
6521 return -1;
6524 return -1;
6527 bool
6528 aarch64_const_vec_all_same_in_range_p (rtx x,
6529 HOST_WIDE_INT minval,
6530 HOST_WIDE_INT maxval)
6532 rtx elt;
6533 return (const_vec_duplicate_p (x, &elt)
6534 && CONST_INT_P (elt)
6535 && IN_RANGE (INTVAL (elt), minval, maxval));
6538 bool
6539 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
6541 return aarch64_const_vec_all_same_in_range_p (x, val, val);
6544 /* Return true if VEC is a constant in which every element is in the range
6545 [MINVAL, MAXVAL]. The elements do not need to have the same value. */
6547 static bool
6548 aarch64_const_vec_all_in_range_p (rtx vec,
6549 HOST_WIDE_INT minval,
6550 HOST_WIDE_INT maxval)
6552 if (GET_CODE (vec) != CONST_VECTOR
6553 || GET_MODE_CLASS (GET_MODE (vec)) != MODE_VECTOR_INT)
6554 return false;
6556 int nunits;
6557 if (!CONST_VECTOR_STEPPED_P (vec))
6558 nunits = const_vector_encoded_nelts (vec);
6559 else if (!CONST_VECTOR_NUNITS (vec).is_constant (&nunits))
6560 return false;
6562 for (int i = 0; i < nunits; i++)
6564 rtx vec_elem = CONST_VECTOR_ELT (vec, i);
6565 if (!CONST_INT_P (vec_elem)
6566 || !IN_RANGE (INTVAL (vec_elem), minval, maxval))
6567 return false;
6569 return true;
6572 /* N Z C V. */
6573 #define AARCH64_CC_V 1
6574 #define AARCH64_CC_C (1 << 1)
6575 #define AARCH64_CC_Z (1 << 2)
6576 #define AARCH64_CC_N (1 << 3)
6578 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
6579 static const int aarch64_nzcv_codes[] =
6581 0, /* EQ, Z == 1. */
6582 AARCH64_CC_Z, /* NE, Z == 0. */
6583 0, /* CS, C == 1. */
6584 AARCH64_CC_C, /* CC, C == 0. */
6585 0, /* MI, N == 1. */
6586 AARCH64_CC_N, /* PL, N == 0. */
6587 0, /* VS, V == 1. */
6588 AARCH64_CC_V, /* VC, V == 0. */
6589 0, /* HI, C == 1 && Z == 0. */
6590 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
6591 AARCH64_CC_V, /* GE, N == V. */
6592 0, /* LT, N != V. */
6593 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
6594 0, /* LE, !(Z == 0 && N == V). */
6595 0, /* AL, Any. */
6596 0 /* NV, Any. */
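/* Observation added for illustration (hedged): each entry above encodes an
   NZCV setting under which the condition named in its comment is false,
   e.g. the EQ entry is 0 (Z clear, so EQ does not hold) and the GE entry
   sets only V (giving N != V, so GE does not hold).  These are the values
   printed by the 'k' operand modifier for conditional-compare instructions.  */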
6599 /* Print floating-point vector immediate operand X to F, negating it
6600 first if NEGATE is true. Return true on success, false if it isn't
6601 a constant we can handle. */
6603 static bool
6604 aarch64_print_vector_float_operand (FILE *f, rtx x, bool negate)
6606 rtx elt;
6608 if (!const_vec_duplicate_p (x, &elt))
6609 return false;
6611 REAL_VALUE_TYPE r = *CONST_DOUBLE_REAL_VALUE (elt);
6612 if (negate)
6613 r = real_value_negate (&r);
6615 /* We only handle the SVE single-bit immediates here. */
6616 if (real_equal (&r, &dconst0))
6617 asm_fprintf (f, "0.0");
6618 else if (real_equal (&r, &dconst1))
6619 asm_fprintf (f, "1.0");
6620 else if (real_equal (&r, &dconsthalf))
6621 asm_fprintf (f, "0.5");
6622 else
6623 return false;
6625 return true;
6628 /* Return the equivalent letter for size. */
6629 static char
6630 sizetochar (int size)
6632 switch (size)
6634 case 64: return 'd';
6635 case 32: return 's';
6636 case 16: return 'h';
6637 case 8 : return 'b';
6638 default: gcc_unreachable ();
6642 /* Print operand X to file F in a target specific manner according to CODE.
6643 The acceptable formatting commands given by CODE are:
6644 'c': An integer or symbol address without a preceding #
6645 sign.
6646 'C': Take the duplicated element in a vector constant
6647 and print it in hex.
6648 'D': Take the duplicated element in a vector constant
6649 and print it as an unsigned integer, in decimal.
6650 'e': Print the sign/zero-extend size as a character 8->b,
6651 16->h, 32->w.
6652 'p': Prints N such that 2^N == X (X must be power of 2 and
6653 const int).
6654 'P': Print the number of non-zero bits in X (a const_int).
6655 'H': Print the higher numbered register of a pair (TImode)
6656 of regs.
6657 'm': Print a condition (eq, ne, etc).
6658 'M': Same as 'm', but invert condition.
6659 'N': Take the duplicated element in a vector constant
6660 and print the negative of it in decimal.
6661 'b/h/s/d/q': Print a scalar FP/SIMD register name.
6662 'S/T/U/V': Print a FP/SIMD register name for a register list.
6663 The register printed is the FP/SIMD register name
6664 of X + 0/1/2/3 for S/T/U/V.
6665 'R': Print a scalar FP/SIMD register name + 1.
6666 'X': Print bottom 16 bits of integer constant in hex.
6667 'w/x': Print a general register name or the zero register
6668 (32-bit or 64-bit).
6669 '0': Print a normal operand, if it's a general register,
6670 then we assume DImode.
6671 'k': Print NZCV for conditional compare instructions.
6672 'A': Output address constant representing the first
6673 argument of X, specifying a relocation offset
6674 if appropriate.
6675 'L': Output constant address specified by X
6676 with a relocation offset if appropriate.
6677 'G': Prints address of X, specifying a PC relative
6678 relocation mode if appropriate.
6679 'y': Output address of LDP or STP - this is used for
6680 some LDP/STPs which don't use a PARALLEL in their
6681 pattern (so the mode needs to be adjusted).
6682 'z': Output address of a typical LDP or STP. */
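/* Hedged usage example (added for illustration; the templates below are
   made up, not quotations from aarch64.md): in an output template such as
   "add\t%w0, %w1, %w2" the 'w' modifier above prints operands 0..2 as
   32-bit general registers (or wzr for a zero constant), while
   "fmov\t%s0, %w1" combines the scalar-FP 's' modifier with 'w'.  */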
6684 static void
6685 aarch64_print_operand (FILE *f, rtx x, int code)
6687 rtx elt;
6688 switch (code)
6690 case 'c':
6691 switch (GET_CODE (x))
6693 case CONST_INT:
6694 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
6695 break;
6697 case SYMBOL_REF:
6698 output_addr_const (f, x);
6699 break;
6701 case CONST:
6702 if (GET_CODE (XEXP (x, 0)) == PLUS
6703 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
6705 output_addr_const (f, x);
6706 break;
6708 /* Fall through. */
6710 default:
6711 output_operand_lossage ("unsupported operand for code '%c'", code);
6713 break;
6715 case 'e':
6717 int n;
6719 if (!CONST_INT_P (x)
6720 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
6722 output_operand_lossage ("invalid operand for '%%%c'", code);
6723 return;
6726 switch (n)
6728 case 3:
6729 fputc ('b', f);
6730 break;
6731 case 4:
6732 fputc ('h', f);
6733 break;
6734 case 5:
6735 fputc ('w', f);
6736 break;
6737 default:
6738 output_operand_lossage ("invalid operand for '%%%c'", code);
6739 return;
6742 break;
6744 case 'p':
6746 int n;
6748 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
6750 output_operand_lossage ("invalid operand for '%%%c'", code);
6751 return;
6754 asm_fprintf (f, "%d", n);
6756 break;
6758 case 'P':
6759 if (!CONST_INT_P (x))
6761 output_operand_lossage ("invalid operand for '%%%c'", code);
6762 return;
6765 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
6766 break;
6768 case 'H':
6769 if (x == const0_rtx)
6771 asm_fprintf (f, "xzr");
6772 break;
6775 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
6777 output_operand_lossage ("invalid operand for '%%%c'", code);
6778 return;
6781 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
6782 break;
6784 case 'M':
6785 case 'm':
6787 int cond_code;
6788 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
6789 if (x == const_true_rtx)
6791 if (code == 'M')
6792 fputs ("nv", f);
6793 return;
6796 if (!COMPARISON_P (x))
6798 output_operand_lossage ("invalid operand for '%%%c'", code);
6799 return;
6802 cond_code = aarch64_get_condition_code (x);
6803 gcc_assert (cond_code >= 0);
6804 if (code == 'M')
6805 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
6806 fputs (aarch64_condition_codes[cond_code], f);
6808 break;
6810 case 'N':
6811 if (!const_vec_duplicate_p (x, &elt))
6813 output_operand_lossage ("invalid vector constant");
6814 return;
6817 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6818 asm_fprintf (f, "%wd", -INTVAL (elt));
6819 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6820 && aarch64_print_vector_float_operand (f, x, true))
6822 else
6824 output_operand_lossage ("invalid vector constant");
6825 return;
6827 break;
6829 case 'b':
6830 case 'h':
6831 case 's':
6832 case 'd':
6833 case 'q':
6834 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6836 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6837 return;
6839 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
6840 break;
6842 case 'S':
6843 case 'T':
6844 case 'U':
6845 case 'V':
6846 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6848 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6849 return;
6851 asm_fprintf (f, "%c%d",
6852 aarch64_sve_data_mode_p (GET_MODE (x)) ? 'z' : 'v',
6853 REGNO (x) - V0_REGNUM + (code - 'S'));
6854 break;
6856 case 'R':
6857 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
6859 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
6860 return;
6862 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
6863 break;
6865 case 'X':
6866 if (!CONST_INT_P (x))
6868 output_operand_lossage ("invalid operand for '%%%c'", code);
6869 return;
6871 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
6872 break;
6874 case 'C':
6876 /* Print a replicated constant in hex. */
6877 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6879 output_operand_lossage ("invalid operand for '%%%c'", code);
6880 return;
6882 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6883 asm_fprintf (f, "0x%wx", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6885 break;
6887 case 'D':
6889 /* Print a replicated constant in decimal, treating it as
6890 unsigned. */
6891 if (!const_vec_duplicate_p (x, &elt) || !CONST_INT_P (elt))
6893 output_operand_lossage ("invalid operand for '%%%c'", code);
6894 return;
6896 scalar_mode inner_mode = GET_MODE_INNER (GET_MODE (x));
6897 asm_fprintf (f, "%wd", UINTVAL (elt) & GET_MODE_MASK (inner_mode));
6899 break;
6901 case 'w':
6902 case 'x':
6903 if (x == const0_rtx
6904 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
6906 asm_fprintf (f, "%czr", code);
6907 break;
6910 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
6912 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
6913 break;
6916 if (REG_P (x) && REGNO (x) == SP_REGNUM)
6918 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
6919 break;
6922 /* Fall through */
6924 case 0:
6925 if (x == NULL)
6927 output_operand_lossage ("missing operand");
6928 return;
6931 switch (GET_CODE (x))
6933 case REG:
6934 if (aarch64_sve_data_mode_p (GET_MODE (x)))
6936 if (REG_NREGS (x) == 1)
6937 asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
6938 else
6940 char suffix
6941 = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
6942 asm_fprintf (f, "{z%d.%c - z%d.%c}",
6943 REGNO (x) - V0_REGNUM, suffix,
6944 END_REGNO (x) - V0_REGNUM - 1, suffix);
6947 else
6948 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
6949 break;
6951 case MEM:
6952 output_address (GET_MODE (x), XEXP (x, 0));
6953 break;
6955 case LABEL_REF:
6956 case SYMBOL_REF:
6957 output_addr_const (asm_out_file, x);
6958 break;
6960 case CONST_INT:
6961 asm_fprintf (f, "%wd", INTVAL (x));
6962 break;
6964 case CONST:
6965 if (!VECTOR_MODE_P (GET_MODE (x)))
6967 output_addr_const (asm_out_file, x);
6968 break;
6970 /* fall through */
6972 case CONST_VECTOR:
6973 if (!const_vec_duplicate_p (x, &elt))
6975 output_operand_lossage ("invalid vector constant");
6976 return;
6979 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
6980 asm_fprintf (f, "%wd", INTVAL (elt));
6981 else if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
6982 && aarch64_print_vector_float_operand (f, x, false))
6984 else
6986 output_operand_lossage ("invalid vector constant");
6987 return;
6989 break;
6991 case CONST_DOUBLE:
6992 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
6993 be getting CONST_DOUBLEs holding integers. */
6994 gcc_assert (GET_MODE (x) != VOIDmode);
6995 if (aarch64_float_const_zero_rtx_p (x))
6997 fputc ('0', f);
6998 break;
7000 else if (aarch64_float_const_representable_p (x))
7002 #define buf_size 20
7003 char float_buf[buf_size] = {'\0'};
7004 real_to_decimal_for_mode (float_buf,
7005 CONST_DOUBLE_REAL_VALUE (x),
7006 buf_size, buf_size,
7007 1, GET_MODE (x));
7008 asm_fprintf (asm_out_file, "%s", float_buf);
7009 break;
7010 #undef buf_size
7012 output_operand_lossage ("invalid constant");
7013 return;
7014 default:
7015 output_operand_lossage ("invalid operand");
7016 return;
7018 break;
7020 case 'A':
7021 if (GET_CODE (x) == HIGH)
7022 x = XEXP (x, 0);
7024 switch (aarch64_classify_symbolic_expression (x))
7026 case SYMBOL_SMALL_GOT_4G:
7027 asm_fprintf (asm_out_file, ":got:");
7028 break;
7030 case SYMBOL_SMALL_TLSGD:
7031 asm_fprintf (asm_out_file, ":tlsgd:");
7032 break;
7034 case SYMBOL_SMALL_TLSDESC:
7035 asm_fprintf (asm_out_file, ":tlsdesc:");
7036 break;
7038 case SYMBOL_SMALL_TLSIE:
7039 asm_fprintf (asm_out_file, ":gottprel:");
7040 break;
7042 case SYMBOL_TLSLE24:
7043 asm_fprintf (asm_out_file, ":tprel:");
7044 break;
7046 case SYMBOL_TINY_GOT:
7047 gcc_unreachable ();
7048 break;
7050 default:
7051 break;
7053 output_addr_const (asm_out_file, x);
7054 break;
7056 case 'L':
7057 switch (aarch64_classify_symbolic_expression (x))
7059 case SYMBOL_SMALL_GOT_4G:
7060 asm_fprintf (asm_out_file, ":lo12:");
7061 break;
7063 case SYMBOL_SMALL_TLSGD:
7064 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
7065 break;
7067 case SYMBOL_SMALL_TLSDESC:
7068 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
7069 break;
7071 case SYMBOL_SMALL_TLSIE:
7072 asm_fprintf (asm_out_file, ":gottprel_lo12:");
7073 break;
7075 case SYMBOL_TLSLE12:
7076 asm_fprintf (asm_out_file, ":tprel_lo12:");
7077 break;
7079 case SYMBOL_TLSLE24:
7080 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
7081 break;
7083 case SYMBOL_TINY_GOT:
7084 asm_fprintf (asm_out_file, ":got:");
7085 break;
7087 case SYMBOL_TINY_TLSIE:
7088 asm_fprintf (asm_out_file, ":gottprel:");
7089 break;
7091 default:
7092 break;
7094 output_addr_const (asm_out_file, x);
7095 break;
7097 case 'G':
7098 switch (aarch64_classify_symbolic_expression (x))
7100 case SYMBOL_TLSLE24:
7101 asm_fprintf (asm_out_file, ":tprel_hi12:");
7102 break;
7103 default:
7104 break;
7106 output_addr_const (asm_out_file, x);
7107 break;
7109 case 'k':
7111 HOST_WIDE_INT cond_code;
7113 if (!CONST_INT_P (x))
7115 output_operand_lossage ("invalid operand for '%%%c'", code);
7116 return;
7119 cond_code = INTVAL (x);
7120 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
7121 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
7123 break;
7125 case 'y':
7126 case 'z':
7128 machine_mode mode = GET_MODE (x);
7130 if (GET_CODE (x) != MEM
7131 || (code == 'y' && maybe_ne (GET_MODE_SIZE (mode), 16)))
7133 output_operand_lossage ("invalid operand for '%%%c'", code);
7134 return;
7137 if (!aarch64_print_address_internal (f, mode, XEXP (x, 0),
7138 code == 'y'
7139 ? ADDR_QUERY_LDP_STP_N
7140 : ADDR_QUERY_LDP_STP))
7141 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7143 break;
7145 default:
7146 output_operand_lossage ("invalid operand prefix '%%%c'", code);
7147 return;
7151 /* Print address 'x' of a memory access with mode 'mode'.
7152 'type' is the aarch64_addr_query_type context required by
7153 aarch64_classify_address, e.g. ADDR_QUERY_LDP_STP for an LDP/STP access. */
7154 static bool
7155 aarch64_print_address_internal (FILE *f, machine_mode mode, rtx x,
7156 aarch64_addr_query_type type)
7158 struct aarch64_address_info addr;
7159 unsigned int size;
7161 /* Check all addresses are Pmode - including ILP32. */
7162 if (GET_MODE (x) != Pmode)
7163 output_operand_lossage ("invalid address mode");
7165 if (aarch64_classify_address (&addr, x, mode, true, type))
7166 switch (addr.type)
7168 case ADDRESS_REG_IMM:
7169 if (known_eq (addr.const_offset, 0))
7170 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
7171 else if (aarch64_sve_data_mode_p (mode))
7173 HOST_WIDE_INT vnum
7174 = exact_div (addr.const_offset,
7175 BYTES_PER_SVE_VECTOR).to_constant ();
7176 asm_fprintf (f, "[%s, #%wd, mul vl]",
7177 reg_names[REGNO (addr.base)], vnum);
7179 else if (aarch64_sve_pred_mode_p (mode))
7181 HOST_WIDE_INT vnum
7182 = exact_div (addr.const_offset,
7183 BYTES_PER_SVE_PRED).to_constant ();
7184 asm_fprintf (f, "[%s, #%wd, mul vl]",
7185 reg_names[REGNO (addr.base)], vnum);
7187 else
7188 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
7189 INTVAL (addr.offset));
7190 return true;
7192 case ADDRESS_REG_REG:
7193 if (addr.shift == 0)
7194 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
7195 reg_names [REGNO (addr.offset)]);
7196 else
7197 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
7198 reg_names [REGNO (addr.offset)], addr.shift);
7199 return true;
7201 case ADDRESS_REG_UXTW:
7202 if (addr.shift == 0)
7203 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
7204 REGNO (addr.offset) - R0_REGNUM);
7205 else
7206 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
7207 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7208 return true;
7210 case ADDRESS_REG_SXTW:
7211 if (addr.shift == 0)
7212 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
7213 REGNO (addr.offset) - R0_REGNUM);
7214 else
7215 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
7216 REGNO (addr.offset) - R0_REGNUM, addr.shift);
7217 return true;
7219 case ADDRESS_REG_WB:
7220 /* Writeback is only supported for fixed-width modes. */
7221 size = GET_MODE_SIZE (mode).to_constant ();
7222 switch (GET_CODE (x))
7224 case PRE_INC:
7225 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)], size);
7226 return true;
7227 case POST_INC:
7228 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)], size);
7229 return true;
7230 case PRE_DEC:
7231 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)], size);
7232 return true;
7233 case POST_DEC:
7234 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)], size);
7235 return true;
7236 case PRE_MODIFY:
7237 asm_fprintf (f, "[%s, %wd]!", reg_names[REGNO (addr.base)],
7238 INTVAL (addr.offset));
7239 return true;
7240 case POST_MODIFY:
7241 asm_fprintf (f, "[%s], %wd", reg_names[REGNO (addr.base)],
7242 INTVAL (addr.offset));
7243 return true;
7244 default:
7245 break;
7247 break;
7249 case ADDRESS_LO_SUM:
7250 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
7251 output_addr_const (f, addr.offset);
7252 asm_fprintf (f, "]");
7253 return true;
7255 case ADDRESS_SYMBOLIC:
7256 output_addr_const (f, x);
7257 return true;
7260 return false;
7263 /* Print address 'x' of a memory access with mode 'mode'. */
7264 static void
7265 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
7267 if (!aarch64_print_address_internal (f, mode, x, ADDR_QUERY_ANY))
7268 output_addr_const (f, x);
7271 bool
7272 aarch64_label_mentioned_p (rtx x)
7274 const char *fmt;
7275 int i;
7277 if (GET_CODE (x) == LABEL_REF)
7278 return true;
7280 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
7281 referencing instruction, but they are constant offsets, not
7282 symbols. */
7283 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
7284 return false;
7286 fmt = GET_RTX_FORMAT (GET_CODE (x));
7287 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
7289 if (fmt[i] == 'E')
7291 int j;
7293 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
7294 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
7295 return 1;
7297 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
7298 return 1;
7301 return 0;
7304 /* Implement REGNO_REG_CLASS. */
7306 enum reg_class
7307 aarch64_regno_regclass (unsigned regno)
7309 if (GP_REGNUM_P (regno))
7310 return GENERAL_REGS;
7312 if (regno == SP_REGNUM)
7313 return STACK_REG;
7315 if (regno == FRAME_POINTER_REGNUM
7316 || regno == ARG_POINTER_REGNUM)
7317 return POINTER_REGS;
7319 if (FP_REGNUM_P (regno))
7320 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
7322 if (PR_REGNUM_P (regno))
7323 return PR_LO_REGNUM_P (regno) ? PR_LO_REGS : PR_HI_REGS;
7325 return NO_REGS;
7328 /* OFFSET is an address offset for mode MODE, which has SIZE bytes.
7329 If OFFSET is out of range, return an offset of an anchor point
7330 that is in range. Return 0 otherwise. */
7332 static HOST_WIDE_INT
7333 aarch64_anchor_offset (HOST_WIDE_INT offset, HOST_WIDE_INT size,
7334 machine_mode mode)
7336 /* Does it look like we'll need a 16-byte load/store-pair operation? */
7337 if (size > 16)
7338 return (offset + 0x400) & ~0x7f0;
7340 /* For offsets that aren't a multiple of the access size, the limit is
7341 -256...255. */
7342 if (offset & (size - 1))
7344 /* BLKmode typically uses LDP of X-registers. */
7345 if (mode == BLKmode)
7346 return (offset + 512) & ~0x3ff;
7347 return (offset + 0x100) & ~0x1ff;
7350 /* Small negative offsets are supported. */
7351 if (IN_RANGE (offset, -256, 0))
7352 return 0;
7354 if (mode == TImode || mode == TFmode)
7355 return (offset + 0x100) & ~0x1ff;
7357 /* Use the 12-bit unsigned offset, scaled by the access size. */
7358 return offset & (~0xfff * size);
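/* Hedged worked example (added for illustration): for a 4-byte SImode
   access at offset 0x5678 the final case applies, giving the anchor
   0x5678 & ~0x3fff = 0x4000; the residual 0x1678 is a multiple of 4 and
   no larger than 0x3ffc, so it fits the unsigned scaled 12-bit range.
   An unaligned offset such as 0x1203 instead takes the -256...255 case
   and anchors at (0x1203 + 0x100) & ~0x1ff = 0x1200.  */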
7361 static rtx
7362 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
7364 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
7365 where mask is selected by alignment and size of the offset.
7366 We try to pick as large a range for the offset as possible to
7367 maximize the chance of a CSE. However, for aligned addresses
7368 we limit the range to 4k so that structures with different sized
7369 elements are likely to use the same base. We need to be careful
7370 not to split a CONST for some forms of address expression, otherwise
7371 it will generate sub-optimal code. */
7373 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
7375 rtx base = XEXP (x, 0);
7376 rtx offset_rtx = XEXP (x, 1);
7377 HOST_WIDE_INT offset = INTVAL (offset_rtx);
7379 if (GET_CODE (base) == PLUS)
7381 rtx op0 = XEXP (base, 0);
7382 rtx op1 = XEXP (base, 1);
7384 /* Force any scaling into a temp for CSE. */
7385 op0 = force_reg (Pmode, op0);
7386 op1 = force_reg (Pmode, op1);
7388 /* Let the pointer register be in op0. */
7389 if (REG_POINTER (op1))
7390 std::swap (op0, op1);
7392 /* If the pointer is virtual or frame related, then we know that
7393 virtual register instantiation or register elimination is going
7394 to apply a second constant. We want the two constants folded
7395 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
7396 if (virt_or_elim_regno_p (REGNO (op0)))
7398 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
7399 NULL_RTX, true, OPTAB_DIRECT);
7400 return gen_rtx_PLUS (Pmode, base, op1);
7403 /* Otherwise, in order to encourage CSE (and thence loop strength
7404 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
7405 base = expand_binop (Pmode, add_optab, op0, op1,
7406 NULL_RTX, true, OPTAB_DIRECT);
7407 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
7410 HOST_WIDE_INT size;
7411 if (GET_MODE_SIZE (mode).is_constant (&size))
7413 HOST_WIDE_INT base_offset = aarch64_anchor_offset (offset, size,
7414 mode);
7415 if (base_offset != 0)
7417 base = plus_constant (Pmode, base, base_offset);
7418 base = force_operand (base, NULL_RTX);
7419 return plus_constant (Pmode, base, offset - base_offset);
7424 return x;
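/* Hedged example (added for illustration): legitimizing (plus X1 0x4010)
   for an SImode access anchors at 0x4000 (see aarch64_anchor_offset above),
   so the address is rewritten as a temporary T = X1 + 0x4000 plus the small
   residual 0x10, allowing nearby accesses to CSE the same base T.  */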
7427 static reg_class_t
7428 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
7429 reg_class_t rclass,
7430 machine_mode mode,
7431 secondary_reload_info *sri)
7433 /* Use aarch64_sve_reload_be for SVE reloads that cannot be handled
7434 directly by the *aarch64_sve_mov<mode>_be move pattern. See the
7435 comment at the head of aarch64-sve.md for more details about the
7436 big-endian handling. */
7437 if (BYTES_BIG_ENDIAN
7438 && reg_class_subset_p (rclass, FP_REGS)
7439 && !((REG_P (x) && HARD_REGISTER_P (x))
7440 || aarch64_simd_valid_immediate (x, NULL))
7441 && aarch64_sve_data_mode_p (mode))
7443 sri->icode = CODE_FOR_aarch64_sve_reload_be;
7444 return NO_REGS;
7447 /* If we have to disable direct literal pool loads and stores because the
7448 function is too big, then we need a scratch register. */
7449 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
7450 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
7451 || targetm.vector_mode_supported_p (GET_MODE (x)))
7452 && !aarch64_pcrelative_literal_loads)
7454 sri->icode = code_for_aarch64_reload_movcp (mode, DImode);
7455 return NO_REGS;
7458 /* Without the TARGET_SIMD instructions we cannot move a Q register
7459 to a Q register directly. We need a scratch. */
7460 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
7461 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
7462 && reg_class_subset_p (rclass, FP_REGS))
7464 sri->icode = code_for_aarch64_reload_mov (mode);
7465 return NO_REGS;
7468 /* A TFmode or TImode memory access should be handled via FP_REGS
7469 because AArch64 has richer addressing modes for LDR/STR instructions
7470 than for LDP/STP instructions. */
7471 if (TARGET_FLOAT && rclass == GENERAL_REGS
7472 && known_eq (GET_MODE_SIZE (mode), 16) && MEM_P (x))
7473 return FP_REGS;
7475 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
7476 return GENERAL_REGS;
7478 return NO_REGS;
7481 static bool
7482 aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
7484 gcc_assert (from == ARG_POINTER_REGNUM || from == FRAME_POINTER_REGNUM);
7486 /* If we need a frame pointer, ARG_POINTER_REGNUM and FRAME_POINTER_REGNUM
7487 can only eliminate to HARD_FRAME_POINTER_REGNUM. */
7488 if (frame_pointer_needed)
7489 return to == HARD_FRAME_POINTER_REGNUM;
7490 return true;
7493 poly_int64
7494 aarch64_initial_elimination_offset (unsigned from, unsigned to)
7496 if (to == HARD_FRAME_POINTER_REGNUM)
7498 if (from == ARG_POINTER_REGNUM)
7499 return cfun->machine->frame.hard_fp_offset;
7501 if (from == FRAME_POINTER_REGNUM)
7502 return cfun->machine->frame.hard_fp_offset
7503 - cfun->machine->frame.locals_offset;
7506 if (to == STACK_POINTER_REGNUM)
7508 if (from == FRAME_POINTER_REGNUM)
7509 return cfun->machine->frame.frame_size
7510 - cfun->machine->frame.locals_offset;
7513 return cfun->machine->frame.frame_size;
7516 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
7517 previous frame. */
7520 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
7522 if (count != 0)
7523 return const0_rtx;
7524 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
7528 static void
7529 aarch64_asm_trampoline_template (FILE *f)
7531 if (TARGET_ILP32)
7533 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
7534 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
7536 else
7538 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
7539 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
7541 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
7542 assemble_aligned_integer (4, const0_rtx);
7543 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7544 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
7547 static void
7548 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
7550 rtx fnaddr, mem, a_tramp;
7551 const int tramp_code_sz = 16;
7553 /* Don't need to copy the trailing D-words; we fill those in below. */
7554 emit_block_move (m_tramp, assemble_trampoline_template (),
7555 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
7556 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
7557 fnaddr = XEXP (DECL_RTL (fndecl), 0);
7558 if (GET_MODE (fnaddr) != ptr_mode)
7559 fnaddr = convert_memory_address (ptr_mode, fnaddr);
7560 emit_move_insn (mem, fnaddr);
7562 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
7563 emit_move_insn (mem, chain_value);
7565 /* XXX We should really define a "clear_cache" pattern and use
7566 gen_clear_cache(). */
7567 a_tramp = XEXP (m_tramp, 0);
7568 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
7569 LCT_NORMAL, VOIDmode, a_tramp, ptr_mode,
7570 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
7571 ptr_mode);
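/* For illustration only: assuming the default register choices (IP1 is
   x17 and the static chain register is x18), the LP64 trampoline built
   from the template above and filled in by aarch64_trampoline_init is
   laid out roughly as:

	offset  0:  ldr   x17, .+16	// load target address from offset 16
	offset  4:  ldr   x18, .+20	// load static chain from offset 24
	offset  8:  br    x17
	offset 12:  .word 0		// padding, keeps the code block at 16 bytes
	offset 16:  <function address>	// stored by aarch64_trampoline_init
	offset 24:  <static chain>	// stored by aarch64_trampoline_init

   The ILP32 variant is analogous but uses 4-byte pointer slots, which is
   why both of its loads share the same ".+16" displacement.  */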
7574 static unsigned char
7575 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
7577 /* ??? Logically we should only need to provide a value when
7578 HARD_REGNO_MODE_OK says that at least one register in REGCLASS
7579 can hold MODE, but at the moment we need to handle all modes.
7580 Just ignore any runtime parts for registers that can't store them. */
7581 HOST_WIDE_INT lowest_size = constant_lower_bound (GET_MODE_SIZE (mode));
7582 unsigned int nregs;
7583 switch (regclass)
7585 case TAILCALL_ADDR_REGS:
7586 case POINTER_REGS:
7587 case GENERAL_REGS:
7588 case ALL_REGS:
7589 case POINTER_AND_FP_REGS:
7590 case FP_REGS:
7591 case FP_LO_REGS:
7592 if (aarch64_sve_data_mode_p (mode)
7593 && constant_multiple_p (GET_MODE_SIZE (mode),
7594 BYTES_PER_SVE_VECTOR, &nregs))
7595 return nregs;
7596 return (aarch64_vector_data_mode_p (mode)
7597 ? CEIL (lowest_size, UNITS_PER_VREG)
7598 : CEIL (lowest_size, UNITS_PER_WORD));
7599 case STACK_REG:
7600 case PR_REGS:
7601 case PR_LO_REGS:
7602 case PR_HI_REGS:
7603 return 1;
7605 case NO_REGS:
7606 return 0;
7608 default:
7609 break;
7611 gcc_unreachable ();
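/* Worked examples for the function above, assuming the usual LP64
   parameters (UNITS_PER_WORD == 8, UNITS_PER_VREG == 16):
     - TImode in GENERAL_REGS:   CEIL (16, 8)  == 2 registers;
     - V4SImode in FP_REGS:      CEIL (16, 16) == 1 register;
     - any mode in PR_REGS etc.: always 1.
   SVE data modes instead return the constant multiple of
   BYTES_PER_SVE_VECTOR, independently of the runtime vector length.  */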
7614 static reg_class_t
7615 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
7617 if (regclass == POINTER_REGS)
7618 return GENERAL_REGS;
7620 if (regclass == STACK_REG)
7622 if (REG_P(x)
7623 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
7624 return regclass;
7626 return NO_REGS;
7629 /* Register elimination can result in a request for
7630 SP+constant->FP_REGS. We cannot support such operations, which
7631 use SP as the source and an FP_REG as the destination, so reject
7632 them outright. */
7633 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
7635 rtx lhs = XEXP (x, 0);
7637 /* Look through a possible SUBREG introduced by ILP32. */
7638 if (GET_CODE (lhs) == SUBREG)
7639 lhs = SUBREG_REG (lhs);
7641 gcc_assert (REG_P (lhs));
7642 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
7643 POINTER_REGS));
7644 return NO_REGS;
7647 return regclass;
7650 void
7651 aarch64_asm_output_labelref (FILE* f, const char *name)
7653 asm_fprintf (f, "%U%s", name);
7656 static void
7657 aarch64_elf_asm_constructor (rtx symbol, int priority)
7659 if (priority == DEFAULT_INIT_PRIORITY)
7660 default_ctor_section_asm_out_constructor (symbol, priority);
7661 else
7663 section *s;
7664 /* Although priority is known to be in the range [0, 65535], meaning
7665 18 bytes would be enough, the compiler might not know that. To avoid
7666 a -Wformat-truncation false positive, use a larger size. */
7667 char buf[23];
7668 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
7669 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7670 switch_to_section (s);
7671 assemble_align (POINTER_SIZE);
7672 assemble_aligned_integer (POINTER_BYTES, symbol);
7676 static void
7677 aarch64_elf_asm_destructor (rtx symbol, int priority)
7679 if (priority == DEFAULT_INIT_PRIORITY)
7680 default_dtor_section_asm_out_destructor (symbol, priority);
7681 else
7683 section *s;
7684 /* Although priority is known to be in the range [0, 65535], meaning
7685 18 bytes would be enough, the compiler might not know that. To avoid
7686 a -Wformat-truncation false positive, use a larger size. */
7687 char buf[23];
7688 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
7689 s = get_section (buf, SECTION_WRITE | SECTION_NOTYPE, NULL);
7690 switch_to_section (s);
7691 assemble_align (POINTER_SIZE);
7692 assemble_aligned_integer (POINTER_BYTES, symbol);
7696 const char*
7697 aarch64_output_casesi (rtx *operands)
7699 char buf[100];
7700 char label[100];
7701 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
7702 int index;
7703 static const char *const patterns[4][2] =
7706 "ldrb\t%w3, [%0,%w1,uxtw]",
7707 "add\t%3, %4, %w3, sxtb #2"
7710 "ldrh\t%w3, [%0,%w1,uxtw #1]",
7711 "add\t%3, %4, %w3, sxth #2"
7714 "ldr\t%w3, [%0,%w1,uxtw #2]",
7715 "add\t%3, %4, %w3, sxtw #2"
7717 /* We assume that DImode is only generated when not optimizing and
7718 that we don't really need 64-bit address offsets. That would
7719 imply an object file with 8GB of code in a single function! */
7721 "ldr\t%w3, [%0,%w1,uxtw #2]",
7722 "add\t%3, %4, %w3, sxtw #2"
7726 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
7728 scalar_int_mode mode = as_a <scalar_int_mode> (GET_MODE (diff_vec));
7729 index = exact_log2 (GET_MODE_SIZE (mode));
7731 gcc_assert (index >= 0 && index <= 3);
7733 /* Need to implement table size reduction by changing the code below. */
7734 output_asm_insn (patterns[index][0], operands);
7735 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
7736 snprintf (buf, sizeof (buf),
7737 "adr\t%%4, %s", targetm.strip_name_encoding (label));
7738 output_asm_insn (buf, operands);
7739 output_asm_insn (patterns[index][1], operands);
7740 output_asm_insn ("br\t%3", operands);
7741 assemble_label (asm_out_file, label);
7742 return "";
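/* Purely as an illustration of the output above: for a HImode dispatch
   table the emitted sequence is along the lines of

	ldrh	w3, [x0, w1, uxtw #1]
	adr	x4, .Lrtx<N>
	add	x3, x4, w3, sxth #2
	br	x3

   i.e. load the scaled table entry, materialise the address of the
   label emitted just after the BR (the start of the jump table), add
   the sign-extended entry shifted left by 2, and branch there.  */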
7746 /* Return size in bits of an arithmetic operand which is shifted/scaled and
7747 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
7748 operator. */
7751 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
7753 if (shift >= 0 && shift <= 3)
7755 int size;
7756 for (size = 8; size <= 32; size *= 2)
7758 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
7759 if (mask == bits << shift)
7760 return size;
7763 return 0;
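/* Two concrete inputs for the check above:
     aarch64_uxt_size (1, 0x1fe)   == 8   (0xff   << 1, a UXTB-style operand)
     aarch64_uxt_size (2, 0x3fffc) == 16  (0xffff << 2, a UXTH-style operand)
   while any SHIFT outside 0..3, or a mask that is not a contiguous
   8/16/32-bit field starting at SHIFT, yields 0.  */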
7766 /* Constant pools are per-function only when PC-relative literal
7767 loads are enabled or we are in the large memory
7768 model. */
7770 static inline bool
7771 aarch64_can_use_per_function_literal_pools_p (void)
7773 return (aarch64_pcrelative_literal_loads
7774 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
7777 static bool
7778 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
7780 /* We can't use blocks for constants when we're using a per-function
7781 constant pool. */
7782 return !aarch64_can_use_per_function_literal_pools_p ();
7785 /* Select appropriate section for constants depending
7786 on where we place literal pools. */
7788 static section *
7789 aarch64_select_rtx_section (machine_mode mode,
7790 rtx x,
7791 unsigned HOST_WIDE_INT align)
7793 if (aarch64_can_use_per_function_literal_pools_p ())
7794 return function_section (current_function_decl);
7796 return default_elf_select_rtx_section (mode, x, align);
7799 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
7800 void
7801 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
7802 HOST_WIDE_INT offset)
7804 /* When using per-function literal pools, we must ensure that any code
7805 section is aligned to the minimal instruction length, lest we get
7806 errors from the assembler re "unaligned instructions". */
7807 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
7808 ASM_OUTPUT_ALIGN (f, 2);
7811 /* Costs. */
7813 /* Helper function for rtx cost calculation. Strip a shift expression
7814 from X. Returns the inner operand if successful, or the original
7815 expression on failure. */
7816 static rtx
7817 aarch64_strip_shift (rtx x)
7819 rtx op = x;
7821 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
7822 we can convert both to ROR during final output. */
7823 if ((GET_CODE (op) == ASHIFT
7824 || GET_CODE (op) == ASHIFTRT
7825 || GET_CODE (op) == LSHIFTRT
7826 || GET_CODE (op) == ROTATERT
7827 || GET_CODE (op) == ROTATE)
7828 && CONST_INT_P (XEXP (op, 1)))
7829 return XEXP (op, 0);
7831 if (GET_CODE (op) == MULT
7832 && CONST_INT_P (XEXP (op, 1))
7833 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
7834 return XEXP (op, 0);
7836 return x;
7839 /* Helper function for rtx cost calculation. Strip an extend
7840 expression from X. Returns the inner operand if successful, or the
7841 original expression on failure. We deal with a number of possible
7842 canonicalization variations here. If STRIP_SHIFT is true, then
7843 we can strip off a shift also. */
7844 static rtx
7845 aarch64_strip_extend (rtx x, bool strip_shift)
7847 scalar_int_mode mode;
7848 rtx op = x;
7850 if (!is_a <scalar_int_mode> (GET_MODE (op), &mode))
7851 return op;
7853 /* Zero and sign extraction of a widened value. */
7854 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
7855 && XEXP (op, 2) == const0_rtx
7856 && GET_CODE (XEXP (op, 0)) == MULT
7857 && aarch64_is_extend_from_extract (mode, XEXP (XEXP (op, 0), 1),
7858 XEXP (op, 1)))
7859 return XEXP (XEXP (op, 0), 0);
7861 /* It can also be represented (for zero-extend) as an AND with an
7862 immediate. */
7863 if (GET_CODE (op) == AND
7864 && GET_CODE (XEXP (op, 0)) == MULT
7865 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
7866 && CONST_INT_P (XEXP (op, 1))
7867 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
7868 INTVAL (XEXP (op, 1))) != 0)
7869 return XEXP (XEXP (op, 0), 0);
7871 /* Now handle extended register, as this may also have an optional
7872 left shift by 1..4. */
7873 if (strip_shift
7874 && GET_CODE (op) == ASHIFT
7875 && CONST_INT_P (XEXP (op, 1))
7876 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
7877 op = XEXP (op, 0);
7879 if (GET_CODE (op) == ZERO_EXTEND
7880 || GET_CODE (op) == SIGN_EXTEND)
7881 op = XEXP (op, 0);
7883 if (op != x)
7884 return op;
7886 return x;
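/* As a small example of the stripping above: with STRIP_SHIFT true, an
   operand such as

	(ashift:DI (zero_extend:DI (reg:SI x1)) (const_int 2))

   reduces to the inner SImode register, since both the shift by 1..4
   and the zero_extend can be folded into the extended-register form of
   the enclosing arithmetic instruction.  */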
7889 /* Return true iff CODE is a shift supported in combination
7890 with arithmetic instructions. */
7892 static bool
7893 aarch64_shift_p (enum rtx_code code)
7895 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
7899 /* Return true iff X is a cheap shift without a sign extend. */
7901 static bool
7902 aarch64_cheap_mult_shift_p (rtx x)
7904 rtx op0, op1;
7906 op0 = XEXP (x, 0);
7907 op1 = XEXP (x, 1);
7909 if (!(aarch64_tune_params.extra_tuning_flags
7910 & AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND))
7911 return false;
7913 if (GET_CODE (op0) == SIGN_EXTEND)
7914 return false;
7916 if (GET_CODE (x) == ASHIFT && CONST_INT_P (op1)
7917 && UINTVAL (op1) <= 4)
7918 return true;
7920 if (GET_CODE (x) != MULT || !CONST_INT_P (op1))
7921 return false;
7923 HOST_WIDE_INT l2 = exact_log2 (INTVAL (op1));
7925 if (l2 > 0 && l2 <= 4)
7926 return true;
7928 return false;
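/* For example, when AARCH64_EXTRA_TUNE_CHEAP_SHIFT_EXTEND is set:
     (ashift (reg) (const_int 3))               -> true  (shift amount <= 4)
     (mult   (reg) (const_int 8))               -> true  (power of two, log2 == 3)
     (mult   (sign_extend (reg)) (const_int 8)) -> false (sign extension present)
   so only small-immediate shifts without a sign extension are treated
   as free when fused with an arithmetic instruction.  */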
7931 /* Helper function for rtx cost calculation. Calculate the cost of
7932 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
7933 Return the calculated cost of the expression, recursing manually in to
7934 operands where needed. */
7936 static int
7937 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
7939 rtx op0, op1;
7940 const struct cpu_cost_table *extra_cost
7941 = aarch64_tune_params.insn_extra_cost;
7942 int cost = 0;
7943 bool compound_p = (outer == PLUS || outer == MINUS);
7944 machine_mode mode = GET_MODE (x);
7946 gcc_checking_assert (code == MULT);
7948 op0 = XEXP (x, 0);
7949 op1 = XEXP (x, 1);
7951 if (VECTOR_MODE_P (mode))
7952 mode = GET_MODE_INNER (mode);
7954 /* Integer multiply/fma. */
7955 if (GET_MODE_CLASS (mode) == MODE_INT)
7957 /* The multiply will be canonicalized as a shift, so cost it as such. */
7958 if (aarch64_shift_p (GET_CODE (x))
7959 || (CONST_INT_P (op1)
7960 && exact_log2 (INTVAL (op1)) > 0))
7962 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
7963 || GET_CODE (op0) == SIGN_EXTEND;
7964 if (speed)
7966 if (compound_p)
7968 /* If the shift is considered cheap,
7969 then don't add any cost. */
7970 if (aarch64_cheap_mult_shift_p (x))
7972 else if (REG_P (op1))
7973 /* ARITH + shift-by-register. */
7974 cost += extra_cost->alu.arith_shift_reg;
7975 else if (is_extend)
7976 /* ARITH + extended register. We don't have a cost field
7977 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
7978 cost += extra_cost->alu.extend_arith;
7979 else
7980 /* ARITH + shift-by-immediate. */
7981 cost += extra_cost->alu.arith_shift;
7983 else
7984 /* LSL (immediate). */
7985 cost += extra_cost->alu.shift;
7988 /* Strip extends as we will have costed them in the case above. */
7989 if (is_extend)
7990 op0 = aarch64_strip_extend (op0, true);
7992 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
7994 return cost;
7997 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
7998 compound and let the below cases handle it. After all, MNEG is a
7999 special-case alias of MSUB. */
8000 if (GET_CODE (op0) == NEG)
8002 op0 = XEXP (op0, 0);
8003 compound_p = true;
8006 /* Integer multiplies or FMAs have zero/sign extending variants. */
8007 if ((GET_CODE (op0) == ZERO_EXTEND
8008 && GET_CODE (op1) == ZERO_EXTEND)
8009 || (GET_CODE (op0) == SIGN_EXTEND
8010 && GET_CODE (op1) == SIGN_EXTEND))
8012 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
8013 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
8015 if (speed)
8017 if (compound_p)
8018 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
8019 cost += extra_cost->mult[0].extend_add;
8020 else
8021 /* MUL/SMULL/UMULL. */
8022 cost += extra_cost->mult[0].extend;
8025 return cost;
8028 /* This is either an integer multiply or a MADD. In both cases
8029 we want to recurse and cost the operands. */
8030 cost += rtx_cost (op0, mode, MULT, 0, speed);
8031 cost += rtx_cost (op1, mode, MULT, 1, speed);
8033 if (speed)
8035 if (compound_p)
8036 /* MADD/MSUB. */
8037 cost += extra_cost->mult[mode == DImode].add;
8038 else
8039 /* MUL. */
8040 cost += extra_cost->mult[mode == DImode].simple;
8043 return cost;
8045 else
8047 if (speed)
8049 /* Floating-point FMA/FMUL can also support negations of the
8050 operands, unless the rounding mode is upward or downward in
8051 which case FNMUL is different from FMUL with operand negation. */
8052 bool neg0 = GET_CODE (op0) == NEG;
8053 bool neg1 = GET_CODE (op1) == NEG;
8054 if (compound_p || !flag_rounding_math || (neg0 && neg1))
8056 if (neg0)
8057 op0 = XEXP (op0, 0);
8058 if (neg1)
8059 op1 = XEXP (op1, 0);
8062 if (compound_p)
8063 /* FMADD/FNMADD/FNMSUB/FMSUB. */
8064 cost += extra_cost->fp[mode == DFmode].fma;
8065 else
8066 /* FMUL/FNMUL. */
8067 cost += extra_cost->fp[mode == DFmode].mult;
8070 cost += rtx_cost (op0, mode, MULT, 0, speed);
8071 cost += rtx_cost (op1, mode, MULT, 1, speed);
8072 return cost;
8076 static int
8077 aarch64_address_cost (rtx x,
8078 machine_mode mode,
8079 addr_space_t as ATTRIBUTE_UNUSED,
8080 bool speed)
8082 enum rtx_code c = GET_CODE (x);
8083 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
8084 struct aarch64_address_info info;
8085 int cost = 0;
8086 info.shift = 0;
8088 if (!aarch64_classify_address (&info, x, mode, false))
8090 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
8092 /* This is a CONST or SYMBOL ref which will be split
8093 in a different way depending on the code model in use.
8094 Cost it through the generic infrastructure. */
8095 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
8096 /* Divide through by the cost of one instruction to
8097 bring it to the same units as the address costs. */
8098 cost_symbol_ref /= COSTS_N_INSNS (1);
8099 /* The cost is then the cost of preparing the address,
8100 followed by an immediate (possibly 0) offset. */
8101 return cost_symbol_ref + addr_cost->imm_offset;
8103 else
8105 /* This is most likely a jump table from a case
8106 statement. */
8107 return addr_cost->register_offset;
8111 switch (info.type)
8113 case ADDRESS_LO_SUM:
8114 case ADDRESS_SYMBOLIC:
8115 case ADDRESS_REG_IMM:
8116 cost += addr_cost->imm_offset;
8117 break;
8119 case ADDRESS_REG_WB:
8120 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
8121 cost += addr_cost->pre_modify;
8122 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
8123 cost += addr_cost->post_modify;
8124 else
8125 gcc_unreachable ();
8127 break;
8129 case ADDRESS_REG_REG:
8130 cost += addr_cost->register_offset;
8131 break;
8133 case ADDRESS_REG_SXTW:
8134 cost += addr_cost->register_sextend;
8135 break;
8137 case ADDRESS_REG_UXTW:
8138 cost += addr_cost->register_zextend;
8139 break;
8141 default:
8142 gcc_unreachable ();
8146 if (info.shift > 0)
8148 /* For the sake of calculating the cost of the shifted register
8149 component, we can treat same sized modes in the same way. */
8150 if (known_eq (GET_MODE_BITSIZE (mode), 16))
8151 cost += addr_cost->addr_scale_costs.hi;
8152 else if (known_eq (GET_MODE_BITSIZE (mode), 32))
8153 cost += addr_cost->addr_scale_costs.si;
8154 else if (known_eq (GET_MODE_BITSIZE (mode), 64))
8155 cost += addr_cost->addr_scale_costs.di;
8156 else
8157 /* We can't tell, or this is a 128-bit vector. */
8158 cost += addr_cost->addr_scale_costs.ti;
8161 return cost;
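/* For instance, an SImode access through an address of the form
   [x0, w1, sxtw #2] is classified as ADDRESS_REG_SXTW with a shift, so
   it is costed as register_sextend + addr_scale_costs.si, while a plain
   [x0, #imm] access only pays imm_offset.  The absolute values depend
   entirely on the current tuning's addr_cost table.  */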
8164 /* Return the cost of a branch. If SPEED_P is true then the compiler is
8165 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
8166 to be taken. */
8169 aarch64_branch_cost (bool speed_p, bool predictable_p)
8171 /* When optimizing for speed, use the cost of unpredictable branches. */
8172 const struct cpu_branch_cost *branch_costs =
8173 aarch64_tune_params.branch_costs;
8175 if (!speed_p || predictable_p)
8176 return branch_costs->predictable;
8177 else
8178 return branch_costs->unpredictable;
8181 /* Return true if the RTX X in mode MODE is a zero or sign extract
8182 usable in an ADD or SUB (extended register) instruction. */
8183 static bool
8184 aarch64_rtx_arith_op_extract_p (rtx x, scalar_int_mode mode)
8186 /* Catch add with a sign extract.
8187 This is add_<optab><mode>_multp2. */
8188 if (GET_CODE (x) == SIGN_EXTRACT
8189 || GET_CODE (x) == ZERO_EXTRACT)
8191 rtx op0 = XEXP (x, 0);
8192 rtx op1 = XEXP (x, 1);
8193 rtx op2 = XEXP (x, 2);
8195 if (GET_CODE (op0) == MULT
8196 && CONST_INT_P (op1)
8197 && op2 == const0_rtx
8198 && CONST_INT_P (XEXP (op0, 1))
8199 && aarch64_is_extend_from_extract (mode,
8200 XEXP (op0, 1),
8201 op1))
8203 return true;
8206 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
8207 No shift. */
8208 else if (GET_CODE (x) == SIGN_EXTEND
8209 || GET_CODE (x) == ZERO_EXTEND)
8210 return REG_P (XEXP (x, 0));
8212 return false;
8215 static bool
8216 aarch64_frint_unspec_p (unsigned int u)
8218 switch (u)
8220 case UNSPEC_FRINTZ:
8221 case UNSPEC_FRINTP:
8222 case UNSPEC_FRINTM:
8223 case UNSPEC_FRINTA:
8224 case UNSPEC_FRINTN:
8225 case UNSPEC_FRINTX:
8226 case UNSPEC_FRINTI:
8227 return true;
8229 default:
8230 return false;
8234 /* Return true iff X is an rtx that will match an extr instruction
8235 i.e. as described in the *extr<mode>5_insn family of patterns.
8236 OP0 and OP1 will be set to the operands of the shifts involved
8237 on success and will be NULL_RTX otherwise. */
8239 static bool
8240 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
8242 rtx op0, op1;
8243 scalar_int_mode mode;
8244 if (!is_a <scalar_int_mode> (GET_MODE (x), &mode))
8245 return false;
8247 *res_op0 = NULL_RTX;
8248 *res_op1 = NULL_RTX;
8250 if (GET_CODE (x) != IOR)
8251 return false;
8253 op0 = XEXP (x, 0);
8254 op1 = XEXP (x, 1);
8256 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
8257 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
8259 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
8260 if (GET_CODE (op1) == ASHIFT)
8261 std::swap (op0, op1);
8263 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
8264 return false;
8266 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
8267 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
8269 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
8270 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
8272 *res_op0 = XEXP (op0, 0);
8273 *res_op1 = XEXP (op1, 0);
8274 return true;
8278 return false;
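/* An example of an rtx accepted above, in DImode:

	(ior:DI (ashift:DI (reg:DI x0) (const_int 48))
		(lshiftrt:DI (reg:DI x1) (const_int 16)))

   The shift amounts sum to 64, so *RES_OP0 and *RES_OP1 are set to the
   two inner registers and the whole expression can be emitted as a
   single EXTR (or ROR when both sources are the same register).  */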
8281 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
8282 storing it in *COST. Result is true if the total cost of the operation
8283 has now been calculated. */
8284 static bool
8285 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
8287 rtx inner;
8288 rtx comparator;
8289 enum rtx_code cmpcode;
8291 if (COMPARISON_P (op0))
8293 inner = XEXP (op0, 0);
8294 comparator = XEXP (op0, 1);
8295 cmpcode = GET_CODE (op0);
8297 else
8299 inner = op0;
8300 comparator = const0_rtx;
8301 cmpcode = NE;
8304 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
8306 /* Conditional branch. */
8307 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8308 return true;
8309 else
8311 if (cmpcode == NE || cmpcode == EQ)
8313 if (comparator == const0_rtx)
8315 /* TBZ/TBNZ/CBZ/CBNZ. */
8316 if (GET_CODE (inner) == ZERO_EXTRACT)
8317 /* TBZ/TBNZ. */
8318 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
8319 ZERO_EXTRACT, 0, speed);
8320 else
8321 /* CBZ/CBNZ. */
8322 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
8324 return true;
8327 else if (cmpcode == LT || cmpcode == GE)
8329 /* TBZ/TBNZ. */
8330 if (comparator == const0_rtx)
8331 return true;
8335 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
8337 /* CCMP. */
8338 if (GET_CODE (op1) == COMPARE)
8340 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
8341 if (XEXP (op1, 1) == const0_rtx)
8342 *cost += 1;
8343 if (speed)
8345 machine_mode mode = GET_MODE (XEXP (op1, 0));
8346 const struct cpu_cost_table *extra_cost
8347 = aarch64_tune_params.insn_extra_cost;
8349 if (GET_MODE_CLASS (mode) == MODE_INT)
8350 *cost += extra_cost->alu.arith;
8351 else
8352 *cost += extra_cost->fp[mode == DFmode].compare;
8354 return true;
8357 /* It's a conditional operation based on the status flags,
8358 so it must be some flavor of CSEL. */
8360 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
8361 if (GET_CODE (op1) == NEG
8362 || GET_CODE (op1) == NOT
8363 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
8364 op1 = XEXP (op1, 0);
8365 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
8367 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
8368 op1 = XEXP (op1, 0);
8369 op2 = XEXP (op2, 0);
8372 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
8373 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
8374 return true;
8377 /* We don't know what this is, cost all operands. */
8378 return false;
8381 /* Check whether X is a bitfield operation of the form shift + extend that
8382 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
8383 operand to which the bitfield operation is applied. Otherwise return
8384 NULL_RTX. */
8386 static rtx
8387 aarch64_extend_bitfield_pattern_p (rtx x)
8389 rtx_code outer_code = GET_CODE (x);
8390 machine_mode outer_mode = GET_MODE (x);
8392 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
8393 && outer_mode != SImode && outer_mode != DImode)
8394 return NULL_RTX;
8396 rtx inner = XEXP (x, 0);
8397 rtx_code inner_code = GET_CODE (inner);
8398 machine_mode inner_mode = GET_MODE (inner);
8399 rtx op = NULL_RTX;
8401 switch (inner_code)
8403 case ASHIFT:
8404 if (CONST_INT_P (XEXP (inner, 1))
8405 && (inner_mode == QImode || inner_mode == HImode))
8406 op = XEXP (inner, 0);
8407 break;
8408 case LSHIFTRT:
8409 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
8410 && (inner_mode == QImode || inner_mode == HImode))
8411 op = XEXP (inner, 0);
8412 break;
8413 case ASHIFTRT:
8414 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
8415 && (inner_mode == QImode || inner_mode == HImode))
8416 op = XEXP (inner, 0);
8417 break;
8418 default:
8419 break;
8422 return op;
8425 /* Return true if the mask and a shift amount from an RTX of the form
8426 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
8427 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
8429 bool
8430 aarch64_mask_and_shift_for_ubfiz_p (scalar_int_mode mode, rtx mask,
8431 rtx shft_amnt)
8433 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
8434 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
8435 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
8436 && (INTVAL (mask) & ((HOST_WIDE_INT_1U << INTVAL (shft_amnt)) - 1)) == 0;
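/* A concrete SImode instance of the test above: MASK 0x00ffff00 with
   SHFT_AMNT 8 is accepted, because (0x00ffff00 >> 8) + 1 is a power of
   two and no mask bit lies below the shift amount, so the combination
   maps onto a single UBFIZ.  MASK 0x00ffff01 with the same shift fails
   the low-bits check and is rejected.  */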
8439 /* Calculate the cost of calculating X, storing it in *COST. Result
8440 is true if the total cost of the operation has now been calculated. */
8441 static bool
8442 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
8443 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
8445 rtx op0, op1, op2;
8446 const struct cpu_cost_table *extra_cost
8447 = aarch64_tune_params.insn_extra_cost;
8448 int code = GET_CODE (x);
8449 scalar_int_mode int_mode;
8451 /* By default, assume that everything has equivalent cost to the
8452 cheapest instruction. Any additional costs are applied as a delta
8453 above this default. */
8454 *cost = COSTS_N_INSNS (1);
8456 switch (code)
8458 case SET:
8459 /* The cost depends entirely on the operands to SET. */
8460 *cost = 0;
8461 op0 = SET_DEST (x);
8462 op1 = SET_SRC (x);
8464 switch (GET_CODE (op0))
8466 case MEM:
8467 if (speed)
8469 rtx address = XEXP (op0, 0);
8470 if (VECTOR_MODE_P (mode))
8471 *cost += extra_cost->ldst.storev;
8472 else if (GET_MODE_CLASS (mode) == MODE_INT)
8473 *cost += extra_cost->ldst.store;
8474 else if (mode == SFmode)
8475 *cost += extra_cost->ldst.storef;
8476 else if (mode == DFmode)
8477 *cost += extra_cost->ldst.stored;
8479 *cost +=
8480 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8481 0, speed));
8484 *cost += rtx_cost (op1, mode, SET, 1, speed);
8485 return true;
8487 case SUBREG:
8488 if (! REG_P (SUBREG_REG (op0)))
8489 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
8491 /* Fall through. */
8492 case REG:
8493 /* The cost is one per vector-register copied. */
8494 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
8496 int nregs = aarch64_hard_regno_nregs (V0_REGNUM, GET_MODE (op0));
8497 *cost = COSTS_N_INSNS (nregs);
8499 /* const0_rtx is in general free, but we will use an
8500 instruction to set a register to 0. */
8501 else if (REG_P (op1) || op1 == const0_rtx)
8503 /* The cost is 1 per register copied. */
8504 int nregs = aarch64_hard_regno_nregs (R0_REGNUM, GET_MODE (op0));
8505 *cost = COSTS_N_INSNS (nregs);
8507 else
8508 /* Cost is just the cost of the RHS of the set. */
8509 *cost += rtx_cost (op1, mode, SET, 1, speed);
8510 return true;
8512 case ZERO_EXTRACT:
8513 case SIGN_EXTRACT:
8514 /* Bit-field insertion. Strip any redundant widening of
8515 the RHS to meet the width of the target. */
8516 if (GET_CODE (op1) == SUBREG)
8517 op1 = SUBREG_REG (op1);
8518 if ((GET_CODE (op1) == ZERO_EXTEND
8519 || GET_CODE (op1) == SIGN_EXTEND)
8520 && CONST_INT_P (XEXP (op0, 1))
8521 && is_a <scalar_int_mode> (GET_MODE (XEXP (op1, 0)), &int_mode)
8522 && GET_MODE_BITSIZE (int_mode) >= INTVAL (XEXP (op0, 1)))
8523 op1 = XEXP (op1, 0);
8525 if (CONST_INT_P (op1))
8527 /* MOV immediate is assumed to always be cheap. */
8528 *cost = COSTS_N_INSNS (1);
8530 else
8532 /* BFM. */
8533 if (speed)
8534 *cost += extra_cost->alu.bfi;
8535 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
8538 return true;
8540 default:
8541 /* We can't make sense of this, assume default cost. */
8542 *cost = COSTS_N_INSNS (1);
8543 return false;
8545 return false;
8547 case CONST_INT:
8548 /* If an instruction can incorporate a constant within the
8549 instruction, the instruction's expression avoids calling
8550 rtx_cost() on the constant. If rtx_cost() is called on a
8551 constant, then it is usually because the constant must be
8552 moved into a register by one or more instructions.
8554 The exception is constant 0, which can be expressed
8555 as XZR/WZR and is therefore free. The exception to this is
8556 if we have (set (reg) (const0_rtx)) in which case we must cost
8557 the move. However, we can catch that when we cost the SET, so
8558 we don't need to consider that here. */
8559 if (x == const0_rtx)
8560 *cost = 0;
8561 else
8563 /* To an approximation, building any other constant is
8564 proportionally expensive to the number of instructions
8565 required to build that constant. This is true whether we
8566 are compiling for SPEED or otherwise. */
8567 if (!is_a <scalar_int_mode> (mode, &int_mode))
8568 int_mode = word_mode;
8569 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
8570 (NULL_RTX, x, false, int_mode));
8572 return true;
8574 case CONST_DOUBLE:
8576 /* First determine number of instructions to do the move
8577 as an integer constant. */
8578 if (!aarch64_float_const_representable_p (x)
8579 && !aarch64_can_const_movi_rtx_p (x, mode)
8580 && aarch64_float_const_rtx_p (x))
8582 unsigned HOST_WIDE_INT ival;
8583 bool succeed = aarch64_reinterpret_float_as_int (x, &ival);
8584 gcc_assert (succeed);
8586 scalar_int_mode imode = (mode == HFmode
8587 ? SImode
8588 : int_mode_for_mode (mode).require ());
8589 int ncost = aarch64_internal_mov_immediate
8590 (NULL_RTX, gen_int_mode (ival, imode), false, imode);
8591 *cost += COSTS_N_INSNS (ncost);
8592 return true;
8595 if (speed)
8597 /* mov[df,sf]_aarch64. */
8598 if (aarch64_float_const_representable_p (x))
8599 /* FMOV (scalar immediate). */
8600 *cost += extra_cost->fp[mode == DFmode].fpconst;
8601 else if (!aarch64_float_const_zero_rtx_p (x))
8603 /* This will be a load from memory. */
8604 if (mode == DFmode)
8605 *cost += extra_cost->ldst.loadd;
8606 else
8607 *cost += extra_cost->ldst.loadf;
8609 else
8610 /* Otherwise this is +0.0. We get this using MOVI d0, #0
8611 or MOV v0.s[0], wzr - neither of which is modeled by the
8612 cost tables. Just use the default cost. */
8617 return true;
8619 case MEM:
8620 if (speed)
8622 /* For loads we want the base cost of a load, plus an
8623 approximation for the additional cost of the addressing
8624 mode. */
8625 rtx address = XEXP (x, 0);
8626 if (VECTOR_MODE_P (mode))
8627 *cost += extra_cost->ldst.loadv;
8628 else if (GET_MODE_CLASS (mode) == MODE_INT)
8629 *cost += extra_cost->ldst.load;
8630 else if (mode == SFmode)
8631 *cost += extra_cost->ldst.loadf;
8632 else if (mode == DFmode)
8633 *cost += extra_cost->ldst.loadd;
8635 *cost +=
8636 COSTS_N_INSNS (aarch64_address_cost (address, mode,
8637 0, speed));
8640 return true;
8642 case NEG:
8643 op0 = XEXP (x, 0);
8645 if (VECTOR_MODE_P (mode))
8647 if (speed)
8649 /* FNEG. */
8650 *cost += extra_cost->vect.alu;
8652 return false;
8655 if (GET_MODE_CLASS (mode) == MODE_INT)
8657 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8658 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8660 /* CSETM. */
8661 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
8662 return true;
8665 /* Cost this as SUB wzr, X. */
8666 op0 = CONST0_RTX (mode);
8667 op1 = XEXP (x, 0);
8668 goto cost_minus;
8671 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8673 /* Support (neg(fma...)) as a single instruction only if
8674 sign of zeros is unimportant. This matches the decision
8675 making in aarch64.md. */
8676 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
8678 /* FNMADD. */
8679 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8680 return true;
8682 if (GET_CODE (op0) == MULT)
8684 /* FNMUL. */
8685 *cost = rtx_cost (op0, mode, NEG, 0, speed);
8686 return true;
8688 if (speed)
8689 /* FNEG. */
8690 *cost += extra_cost->fp[mode == DFmode].neg;
8691 return false;
8694 return false;
8696 case CLRSB:
8697 case CLZ:
8698 if (speed)
8700 if (VECTOR_MODE_P (mode))
8701 *cost += extra_cost->vect.alu;
8702 else
8703 *cost += extra_cost->alu.clz;
8706 return false;
8708 case COMPARE:
8709 op0 = XEXP (x, 0);
8710 op1 = XEXP (x, 1);
8712 if (op1 == const0_rtx
8713 && GET_CODE (op0) == AND)
8715 x = op0;
8716 mode = GET_MODE (op0);
8717 goto cost_logic;
8720 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
8722 /* TODO: A write to the CC flags possibly costs extra, this
8723 needs encoding in the cost tables. */
8725 mode = GET_MODE (op0);
8726 /* ANDS. */
8727 if (GET_CODE (op0) == AND)
8729 x = op0;
8730 goto cost_logic;
8733 if (GET_CODE (op0) == PLUS)
8735 /* ADDS (and CMN alias). */
8736 x = op0;
8737 goto cost_plus;
8740 if (GET_CODE (op0) == MINUS)
8742 /* SUBS. */
8743 x = op0;
8744 goto cost_minus;
8747 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
8748 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
8749 && CONST_INT_P (XEXP (op0, 2)))
8751 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
8752 Handle it here directly rather than going to cost_logic
8753 since we know the immediate generated for the TST is valid
8754 so we can avoid creating an intermediate rtx for it only
8755 for costing purposes. */
8756 if (speed)
8757 *cost += extra_cost->alu.logical;
8759 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
8760 ZERO_EXTRACT, 0, speed);
8761 return true;
8764 if (GET_CODE (op1) == NEG)
8766 /* CMN. */
8767 if (speed)
8768 *cost += extra_cost->alu.arith;
8770 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
8771 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
8772 return true;
8775 /* CMP.
8777 Compare can freely swap the order of operands, and
8778 canonicalization puts the more complex operation first.
8779 But the integer MINUS logic expects the shift/extend
8780 operation in op1. */
8781 if (! (REG_P (op0)
8782 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
8784 op0 = XEXP (x, 1);
8785 op1 = XEXP (x, 0);
8787 goto cost_minus;
8790 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
8792 /* FCMP. */
8793 if (speed)
8794 *cost += extra_cost->fp[mode == DFmode].compare;
8796 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
8798 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
8799 /* FCMP supports constant 0.0 for no extra cost. */
8800 return true;
8802 return false;
8805 if (VECTOR_MODE_P (mode))
8807 /* Vector compare. */
8808 if (speed)
8809 *cost += extra_cost->vect.alu;
8811 if (aarch64_float_const_zero_rtx_p (op1))
8813 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
8814 cost. */
8815 return true;
8817 return false;
8819 return false;
8821 case MINUS:
8823 op0 = XEXP (x, 0);
8824 op1 = XEXP (x, 1);
8826 cost_minus:
8827 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
8829 /* Detect valid immediates. */
8830 if ((GET_MODE_CLASS (mode) == MODE_INT
8831 || (GET_MODE_CLASS (mode) == MODE_CC
8832 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
8833 && CONST_INT_P (op1)
8834 && aarch64_uimm12_shift (INTVAL (op1)))
8836 if (speed)
8837 /* SUB(S) (immediate). */
8838 *cost += extra_cost->alu.arith;
8839 return true;
8842 /* Look for SUB (extended register). */
8843 if (is_a <scalar_int_mode> (mode, &int_mode)
8844 && aarch64_rtx_arith_op_extract_p (op1, int_mode))
8846 if (speed)
8847 *cost += extra_cost->alu.extend_arith;
8849 op1 = aarch64_strip_extend (op1, true);
8850 *cost += rtx_cost (op1, VOIDmode,
8851 (enum rtx_code) GET_CODE (op1), 0, speed);
8852 return true;
8855 rtx new_op1 = aarch64_strip_extend (op1, false);
8857 /* Cost this as an FMA-alike operation. */
8858 if ((GET_CODE (new_op1) == MULT
8859 || aarch64_shift_p (GET_CODE (new_op1)))
8860 && code != COMPARE)
8862 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
8863 (enum rtx_code) code,
8864 speed);
8865 return true;
8868 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
8870 if (speed)
8872 if (VECTOR_MODE_P (mode))
8874 /* Vector SUB. */
8875 *cost += extra_cost->vect.alu;
8877 else if (GET_MODE_CLASS (mode) == MODE_INT)
8879 /* SUB(S). */
8880 *cost += extra_cost->alu.arith;
8882 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8884 /* FSUB. */
8885 *cost += extra_cost->fp[mode == DFmode].addsub;
8888 return true;
8891 case PLUS:
8893 rtx new_op0;
8895 op0 = XEXP (x, 0);
8896 op1 = XEXP (x, 1);
8898 cost_plus:
8899 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
8900 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
8902 /* CSINC. */
8903 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
8904 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8905 return true;
8908 if (GET_MODE_CLASS (mode) == MODE_INT
8909 && ((CONST_INT_P (op1) && aarch64_uimm12_shift (INTVAL (op1)))
8910 || aarch64_sve_addvl_addpl_immediate (op1, mode)))
8912 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
8914 if (speed)
8915 /* ADD (immediate). */
8916 *cost += extra_cost->alu.arith;
8917 return true;
8920 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
8922 /* Look for ADD (extended register). */
8923 if (is_a <scalar_int_mode> (mode, &int_mode)
8924 && aarch64_rtx_arith_op_extract_p (op0, int_mode))
8926 if (speed)
8927 *cost += extra_cost->alu.extend_arith;
8929 op0 = aarch64_strip_extend (op0, true);
8930 *cost += rtx_cost (op0, VOIDmode,
8931 (enum rtx_code) GET_CODE (op0), 0, speed);
8932 return true;
8935 /* Strip any extend, leave shifts behind as we will
8936 cost them through mult_cost. */
8937 new_op0 = aarch64_strip_extend (op0, false);
8939 if (GET_CODE (new_op0) == MULT
8940 || aarch64_shift_p (GET_CODE (new_op0)))
8942 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
8943 speed);
8944 return true;
8947 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
8949 if (speed)
8951 if (VECTOR_MODE_P (mode))
8953 /* Vector ADD. */
8954 *cost += extra_cost->vect.alu;
8956 else if (GET_MODE_CLASS (mode) == MODE_INT)
8958 /* ADD. */
8959 *cost += extra_cost->alu.arith;
8961 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
8963 /* FADD. */
8964 *cost += extra_cost->fp[mode == DFmode].addsub;
8967 return true;
8970 case BSWAP:
8971 *cost = COSTS_N_INSNS (1);
8973 if (speed)
8975 if (VECTOR_MODE_P (mode))
8976 *cost += extra_cost->vect.alu;
8977 else
8978 *cost += extra_cost->alu.rev;
8980 return false;
8982 case IOR:
8983 if (aarch_rev16_p (x))
8985 *cost = COSTS_N_INSNS (1);
8987 if (speed)
8989 if (VECTOR_MODE_P (mode))
8990 *cost += extra_cost->vect.alu;
8991 else
8992 *cost += extra_cost->alu.rev;
8994 return true;
8997 if (aarch64_extr_rtx_p (x, &op0, &op1))
8999 *cost += rtx_cost (op0, mode, IOR, 0, speed);
9000 *cost += rtx_cost (op1, mode, IOR, 1, speed);
9001 if (speed)
9002 *cost += extra_cost->alu.shift;
9004 return true;
9006 /* Fall through. */
9007 case XOR:
9008 case AND:
9009 cost_logic:
9010 op0 = XEXP (x, 0);
9011 op1 = XEXP (x, 1);
9013 if (VECTOR_MODE_P (mode))
9015 if (speed)
9016 *cost += extra_cost->vect.alu;
9017 return true;
9020 if (code == AND
9021 && GET_CODE (op0) == MULT
9022 && CONST_INT_P (XEXP (op0, 1))
9023 && CONST_INT_P (op1)
9024 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
9025 INTVAL (op1)) != 0)
9027 /* This is a UBFM/SBFM. */
9028 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
9029 if (speed)
9030 *cost += extra_cost->alu.bfx;
9031 return true;
9034 if (is_int_mode (mode, &int_mode))
9036 if (CONST_INT_P (op1))
9038 /* We have a mask + shift version of a UBFIZ
9039 i.e. the *andim_ashift<mode>_bfiz pattern. */
9040 if (GET_CODE (op0) == ASHIFT
9041 && aarch64_mask_and_shift_for_ubfiz_p (int_mode, op1,
9042 XEXP (op0, 1)))
9044 *cost += rtx_cost (XEXP (op0, 0), int_mode,
9045 (enum rtx_code) code, 0, speed);
9046 if (speed)
9047 *cost += extra_cost->alu.bfx;
9049 return true;
9051 else if (aarch64_bitmask_imm (INTVAL (op1), int_mode))
9053 /* We possibly get the immediate for free, this is not
9054 modelled. */
9055 *cost += rtx_cost (op0, int_mode,
9056 (enum rtx_code) code, 0, speed);
9057 if (speed)
9058 *cost += extra_cost->alu.logical;
9060 return true;
9063 else
9065 rtx new_op0 = op0;
9067 /* Handle ORN, EON, or BIC. */
9068 if (GET_CODE (op0) == NOT)
9069 op0 = XEXP (op0, 0);
9071 new_op0 = aarch64_strip_shift (op0);
9073 /* If we had a shift on op0 then this is a logical-shift-
9074 by-register/immediate operation. Otherwise, this is just
9075 a logical operation. */
9076 if (speed)
9078 if (new_op0 != op0)
9080 /* Shift by immediate. */
9081 if (CONST_INT_P (XEXP (op0, 1)))
9082 *cost += extra_cost->alu.log_shift;
9083 else
9084 *cost += extra_cost->alu.log_shift_reg;
9086 else
9087 *cost += extra_cost->alu.logical;
9090 /* In both cases we want to cost both operands. */
9091 *cost += rtx_cost (new_op0, int_mode, (enum rtx_code) code,
9092 0, speed);
9093 *cost += rtx_cost (op1, int_mode, (enum rtx_code) code,
9094 1, speed);
9096 return true;
9099 return false;
9101 case NOT:
9102 x = XEXP (x, 0);
9103 op0 = aarch64_strip_shift (x);
9105 if (VECTOR_MODE_P (mode))
9107 /* Vector NOT. */
9108 *cost += extra_cost->vect.alu;
9109 return false;
9112 /* MVN-shifted-reg. */
9113 if (op0 != x)
9115 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9117 if (speed)
9118 *cost += extra_cost->alu.log_shift;
9120 return true;
9122 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
9123 Handle the second form here taking care that 'a' in the above can
9124 be a shift. */
9125 else if (GET_CODE (op0) == XOR)
9127 rtx newop0 = XEXP (op0, 0);
9128 rtx newop1 = XEXP (op0, 1);
9129 rtx op0_stripped = aarch64_strip_shift (newop0);
9131 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
9132 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
9134 if (speed)
9136 if (op0_stripped != newop0)
9137 *cost += extra_cost->alu.log_shift;
9138 else
9139 *cost += extra_cost->alu.logical;
9142 return true;
9144 /* MVN. */
9145 if (speed)
9146 *cost += extra_cost->alu.logical;
9148 return false;
9150 case ZERO_EXTEND:
9152 op0 = XEXP (x, 0);
9153 /* If a value is written in SI mode, then zero extended to DI
9154 mode, the operation will in general be free as a write to
9155 a 'w' register implicitly zeroes the upper bits of an 'x'
9156 register. However, if this is
9158 (set (reg) (zero_extend (reg)))
9160 we must cost the explicit register move. */
9161 if (mode == DImode
9162 && GET_MODE (op0) == SImode
9163 && outer == SET)
9165 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
9167 /* If OP_COST is non-zero, then the cost of the zero extend
9168 is effectively the cost of the inner operation. Otherwise
9169 we have a MOV instruction and we take the cost from the MOV
9170 itself. This is true independently of whether we are
9171 optimizing for space or time. */
9172 if (op_cost)
9173 *cost = op_cost;
9175 return true;
9177 else if (MEM_P (op0))
9179 /* All loads can zero extend to any size for free. */
9180 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
9181 return true;
9184 op0 = aarch64_extend_bitfield_pattern_p (x);
9185 if (op0)
9187 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
9188 if (speed)
9189 *cost += extra_cost->alu.bfx;
9190 return true;
9193 if (speed)
9195 if (VECTOR_MODE_P (mode))
9197 /* UMOV. */
9198 *cost += extra_cost->vect.alu;
9200 else
9202 /* We generate an AND instead of UXTB/UXTH. */
9203 *cost += extra_cost->alu.logical;
9206 return false;
9208 case SIGN_EXTEND:
9209 if (MEM_P (XEXP (x, 0)))
9211 /* LDRSH. */
9212 if (speed)
9214 rtx address = XEXP (XEXP (x, 0), 0);
9215 *cost += extra_cost->ldst.load_sign_extend;
9217 *cost +=
9218 COSTS_N_INSNS (aarch64_address_cost (address, mode,
9219 0, speed));
9221 return true;
9224 op0 = aarch64_extend_bitfield_pattern_p (x);
9225 if (op0)
9227 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
9228 if (speed)
9229 *cost += extra_cost->alu.bfx;
9230 return true;
9233 if (speed)
9235 if (VECTOR_MODE_P (mode))
9236 *cost += extra_cost->vect.alu;
9237 else
9238 *cost += extra_cost->alu.extend;
9240 return false;
9242 case ASHIFT:
9243 op0 = XEXP (x, 0);
9244 op1 = XEXP (x, 1);
9246 if (CONST_INT_P (op1))
9248 if (speed)
9250 if (VECTOR_MODE_P (mode))
9252 /* Vector shift (immediate). */
9253 *cost += extra_cost->vect.alu;
9255 else
9257 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
9258 aliases. */
9259 *cost += extra_cost->alu.shift;
9263 /* We can incorporate zero/sign extend for free. */
9264 if (GET_CODE (op0) == ZERO_EXTEND
9265 || GET_CODE (op0) == SIGN_EXTEND)
9266 op0 = XEXP (op0, 0);
9268 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
9269 return true;
9271 else
9273 if (VECTOR_MODE_P (mode))
9275 if (speed)
9276 /* Vector shift (register). */
9277 *cost += extra_cost->vect.alu;
9279 else
9281 if (speed)
9282 /* LSLV. */
9283 *cost += extra_cost->alu.shift_reg;
9285 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9286 && CONST_INT_P (XEXP (op1, 1))
9287 && known_eq (INTVAL (XEXP (op1, 1)),
9288 GET_MODE_BITSIZE (mode) - 1))
9290 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9291 /* We already demanded XEXP (op1, 0) to be REG_P, so
9292 don't recurse into it. */
9293 return true;
9296 return false; /* All arguments need to be in registers. */
9299 case ROTATE:
9300 case ROTATERT:
9301 case LSHIFTRT:
9302 case ASHIFTRT:
9303 op0 = XEXP (x, 0);
9304 op1 = XEXP (x, 1);
9306 if (CONST_INT_P (op1))
9308 /* ASR (immediate) and friends. */
9309 if (speed)
9311 if (VECTOR_MODE_P (mode))
9312 *cost += extra_cost->vect.alu;
9313 else
9314 *cost += extra_cost->alu.shift;
9317 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
9318 return true;
9320 else
9322 if (VECTOR_MODE_P (mode))
9324 if (speed)
9325 /* Vector shift (register). */
9326 *cost += extra_cost->vect.alu;
9328 else
9330 if (speed)
9331 /* ASR (register) and friends. */
9332 *cost += extra_cost->alu.shift_reg;
9334 if (GET_CODE (op1) == AND && REG_P (XEXP (op1, 0))
9335 && CONST_INT_P (XEXP (op1, 1))
9336 && known_eq (INTVAL (XEXP (op1, 1)),
9337 GET_MODE_BITSIZE (mode) - 1))
9339 *cost += rtx_cost (op0, mode, (rtx_code) code, 0, speed);
9340 /* We already demanded XEXP (op1, 0) to be REG_P, so
9341 don't recurse into it. */
9342 return true;
9345 return false; /* All arguments need to be in registers. */
9348 case SYMBOL_REF:
9350 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
9351 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
9353 /* LDR. */
9354 if (speed)
9355 *cost += extra_cost->ldst.load;
9357 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
9358 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
9360 /* ADRP, followed by ADD. */
9361 *cost += COSTS_N_INSNS (1);
9362 if (speed)
9363 *cost += 2 * extra_cost->alu.arith;
9365 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
9366 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
9368 /* ADR. */
9369 if (speed)
9370 *cost += extra_cost->alu.arith;
9373 if (flag_pic)
9375 /* One extra load instruction, after accessing the GOT. */
9376 *cost += COSTS_N_INSNS (1);
9377 if (speed)
9378 *cost += extra_cost->ldst.load;
9380 return true;
9382 case HIGH:
9383 case LO_SUM:
9384 /* ADRP/ADD (immediate). */
9385 if (speed)
9386 *cost += extra_cost->alu.arith;
9387 return true;
9389 case ZERO_EXTRACT:
9390 case SIGN_EXTRACT:
9391 /* UBFX/SBFX. */
9392 if (speed)
9394 if (VECTOR_MODE_P (mode))
9395 *cost += extra_cost->vect.alu;
9396 else
9397 *cost += extra_cost->alu.bfx;
9400 /* We can trust that the immediates used will be correct (there
9401 are no by-register forms), so we need only cost op0. */
9402 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
9403 return true;
9405 case MULT:
9406 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
9407 /* aarch64_rtx_mult_cost always handles recursion to its
9408 operands. */
9409 return true;
9411 case MOD:
9412 /* We can expand signed mod by power of 2 using a NEGS, two parallel
9413 ANDs and a CSNEG. Assume here that CSNEG costs the same as an
9414 unconditional negate. This case should only ever be reached through
9415 the set_smod_pow2_cheap check in expmed.c. */
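/* For reference, the four-instruction expansion being costed here has
   roughly this shape for an SImode "x % 4" (operand allocation and the
   exact condition come from the mod<mode>3 expander; this is only a
   sketch):

	negs	w1, w0
	and	w0, w0, 3
	and	w1, w1, 3
	csneg	w0, w0, w1, mi

   one NEGS, two mutually independent ANDs, and a CSNEG, matching the
   2 * logical + 2 * arith cost added below.  */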
9416 if (CONST_INT_P (XEXP (x, 1))
9417 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
9418 && (mode == SImode || mode == DImode))
9420 /* We expand to 4 instructions. Reset the baseline. */
9421 *cost = COSTS_N_INSNS (4);
9423 if (speed)
9424 *cost += 2 * extra_cost->alu.logical
9425 + 2 * extra_cost->alu.arith;
9427 return true;
9430 /* Fall-through. */
9431 case UMOD:
9432 if (speed)
9434 /* Slightly prefer UMOD over SMOD. */
9435 if (VECTOR_MODE_P (mode))
9436 *cost += extra_cost->vect.alu;
9437 else if (GET_MODE_CLASS (mode) == MODE_INT)
9438 *cost += (extra_cost->mult[mode == DImode].add
9439 + extra_cost->mult[mode == DImode].idiv
9440 + (code == MOD ? 1 : 0));
9442 return false; /* All arguments need to be in registers. */
9444 case DIV:
9445 case UDIV:
9446 case SQRT:
9447 if (speed)
9449 if (VECTOR_MODE_P (mode))
9450 *cost += extra_cost->vect.alu;
9451 else if (GET_MODE_CLASS (mode) == MODE_INT)
9452 /* There is no integer SQRT, so only DIV and UDIV can get
9453 here. */
9454 *cost += (extra_cost->mult[mode == DImode].idiv
9455 /* Slightly prefer UDIV over SDIV. */
9456 + (code == DIV ? 1 : 0));
9457 else
9458 *cost += extra_cost->fp[mode == DFmode].div;
9460 return false; /* All arguments need to be in registers. */
9462 case IF_THEN_ELSE:
9463 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
9464 XEXP (x, 2), cost, speed);
9466 case EQ:
9467 case NE:
9468 case GT:
9469 case GTU:
9470 case LT:
9471 case LTU:
9472 case GE:
9473 case GEU:
9474 case LE:
9475 case LEU:
9477 return false; /* All arguments must be in registers. */
9479 case FMA:
9480 op0 = XEXP (x, 0);
9481 op1 = XEXP (x, 1);
9482 op2 = XEXP (x, 2);
9484 if (speed)
9486 if (VECTOR_MODE_P (mode))
9487 *cost += extra_cost->vect.alu;
9488 else
9489 *cost += extra_cost->fp[mode == DFmode].fma;
9492 /* FMSUB, FNMADD, and FNMSUB are free. */
9493 if (GET_CODE (op0) == NEG)
9494 op0 = XEXP (op0, 0);
9496 if (GET_CODE (op2) == NEG)
9497 op2 = XEXP (op2, 0);
9499 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
9500 and the by-element operand as operand 0. */
9501 if (GET_CODE (op1) == NEG)
9502 op1 = XEXP (op1, 0);
9504 /* Catch vector-by-element operations. The by-element operand can
9505 either be (vec_duplicate (vec_select (x))) or just
9506 (vec_select (x)), depending on whether we are multiplying by
9507 a vector or a scalar.
9509 Canonicalization is not very good in these cases: FMA4 will put the
9510 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
9511 if (GET_CODE (op0) == VEC_DUPLICATE)
9512 op0 = XEXP (op0, 0);
9513 else if (GET_CODE (op1) == VEC_DUPLICATE)
9514 op1 = XEXP (op1, 0);
9516 if (GET_CODE (op0) == VEC_SELECT)
9517 op0 = XEXP (op0, 0);
9518 else if (GET_CODE (op1) == VEC_SELECT)
9519 op1 = XEXP (op1, 0);
9521 /* If the remaining parameters are not registers,
9522 get the cost to put them into registers. */
9523 *cost += rtx_cost (op0, mode, FMA, 0, speed);
9524 *cost += rtx_cost (op1, mode, FMA, 1, speed);
9525 *cost += rtx_cost (op2, mode, FMA, 2, speed);
9526 return true;
9528 case FLOAT:
9529 case UNSIGNED_FLOAT:
9530 if (speed)
9531 *cost += extra_cost->fp[mode == DFmode].fromint;
9532 return false;
9534 case FLOAT_EXTEND:
9535 if (speed)
9537 if (VECTOR_MODE_P (mode))
9539 /* Vector widen. */
9540 *cost += extra_cost->vect.alu;
9542 else
9543 *cost += extra_cost->fp[mode == DFmode].widen;
9545 return false;
9547 case FLOAT_TRUNCATE:
9548 if (speed)
9550 if (VECTOR_MODE_P (mode))
9552 /* Vector conversion. */
9553 *cost += extra_cost->vect.alu;
9555 else
9556 *cost += extra_cost->fp[mode == DFmode].narrow;
9558 return false;
9560 case FIX:
9561 case UNSIGNED_FIX:
9562 x = XEXP (x, 0);
9563 /* Strip the rounding part. They will all be implemented
9564 by the fcvt* family of instructions anyway. */
9565 if (GET_CODE (x) == UNSPEC)
9567 unsigned int uns_code = XINT (x, 1);
9569 if (uns_code == UNSPEC_FRINTA
9570 || uns_code == UNSPEC_FRINTM
9571 || uns_code == UNSPEC_FRINTN
9572 || uns_code == UNSPEC_FRINTP
9573 || uns_code == UNSPEC_FRINTZ)
9574 x = XVECEXP (x, 0, 0);
9577 if (speed)
9579 if (VECTOR_MODE_P (mode))
9580 *cost += extra_cost->vect.alu;
9581 else
9582 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
9585 /* We can combine fmul by a power of 2 followed by a fcvt into a single
9586 fixed-point fcvt. */
9587 if (GET_CODE (x) == MULT
9588 && ((VECTOR_MODE_P (mode)
9589 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
9590 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
9592 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
9593 0, speed);
9594 return true;
9597 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
9598 return true;
9600 case ABS:
9601 if (VECTOR_MODE_P (mode))
9603 /* ABS (vector). */
9604 if (speed)
9605 *cost += extra_cost->vect.alu;
9607 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
9609 op0 = XEXP (x, 0);
9611 /* FABD, which is analogous to FADD. */
9612 if (GET_CODE (op0) == MINUS)
9614 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
9615 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
9616 if (speed)
9617 *cost += extra_cost->fp[mode == DFmode].addsub;
9619 return true;
9621 /* Simple FABS is analogous to FNEG. */
9622 if (speed)
9623 *cost += extra_cost->fp[mode == DFmode].neg;
9625 else
9627 /* Integer ABS will either be split into
9628 two arithmetic instructions, or will be an ABS
9629 (scalar), which we don't model. */
9630 *cost = COSTS_N_INSNS (2);
9631 if (speed)
9632 *cost += 2 * extra_cost->alu.arith;
9634 return false;
9636 case SMAX:
9637 case SMIN:
9638 if (speed)
9640 if (VECTOR_MODE_P (mode))
9641 *cost += extra_cost->vect.alu;
9642 else
9644 /* FMAXNM/FMINNM/FMAX/FMIN.
9645 TODO: This may not be accurate for all implementations, but
9646 we do not model this in the cost tables. */
9647 *cost += extra_cost->fp[mode == DFmode].addsub;
9650 return false;
9652 case UNSPEC:
9653 /* The floating point round to integer frint* instructions. */
9654 if (aarch64_frint_unspec_p (XINT (x, 1)))
9656 if (speed)
9657 *cost += extra_cost->fp[mode == DFmode].roundint;
9659 return false;
9662 if (XINT (x, 1) == UNSPEC_RBIT)
9664 if (speed)
9665 *cost += extra_cost->alu.rev;
9667 return false;
9669 break;
9671 case TRUNCATE:
9673 /* Decompose <su>muldi3_highpart. */
9674 if (/* (truncate:DI */
9675 mode == DImode
9676 /* (lshiftrt:TI */
9677 && GET_MODE (XEXP (x, 0)) == TImode
9678 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
9679 /* (mult:TI */
9680 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
9681 /* (ANY_EXTEND:TI (reg:DI))
9682 (ANY_EXTEND:TI (reg:DI))) */
9683 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
9684 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
9685 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
9686 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
9687 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
9688 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
9689 /* (const_int 64) */
9690 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
9691 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
9693 /* UMULH/SMULH. */
9694 if (speed)
9695 *cost += extra_cost->mult[mode == DImode].extend;
9696 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
9697 mode, MULT, 0, speed);
9698 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
9699 mode, MULT, 1, speed);
9700 return true;
9703 /* Fall through. */
9704 default:
9705 break;
9708 if (dump_file
9709 && flag_aarch64_verbose_cost)
9710 fprintf (dump_file,
9711 "\nFailed to cost RTX. Assuming default cost.\n");
9713 return true;
9716 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
9717 calculated for X. This cost is stored in *COST. Returns true
9718 if the total cost of X was calculated. */
9719 static bool
9720 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
9721 int param, int *cost, bool speed)
9723 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
9725 if (dump_file
9726 && flag_aarch64_verbose_cost)
9728 print_rtl_single (dump_file, x);
9729 fprintf (dump_file, "\n%s cost: %d (%s)\n",
9730 speed ? "Hot" : "Cold",
9731 *cost, result ? "final" : "partial");
9734 return result;
9737 static int
9738 aarch64_register_move_cost (machine_mode mode,
9739 reg_class_t from_i, reg_class_t to_i)
9741 enum reg_class from = (enum reg_class) from_i;
9742 enum reg_class to = (enum reg_class) to_i;
9743 const struct cpu_regmove_cost *regmove_cost
9744 = aarch64_tune_params.regmove_cost;
9746 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
9747 if (to == TAILCALL_ADDR_REGS || to == POINTER_REGS)
9748 to = GENERAL_REGS;
9750 if (from == TAILCALL_ADDR_REGS || from == POINTER_REGS)
9751 from = GENERAL_REGS;
9753 /* The cost of moving between GPRs and the stack is the same as GP2GP. */
9754 if ((from == GENERAL_REGS && to == STACK_REG)
9755 || (to == GENERAL_REGS && from == STACK_REG))
9756 return regmove_cost->GP2GP;
9758 /* To/From the stack register, we move via the gprs. */
9759 if (to == STACK_REG || from == STACK_REG)
9760 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
9761 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
9763 if (known_eq (GET_MODE_SIZE (mode), 16))
9765 /* 128-bit operations on general registers require 2 instructions. */
9766 if (from == GENERAL_REGS && to == GENERAL_REGS)
9767 return regmove_cost->GP2GP * 2;
9768 else if (from == GENERAL_REGS)
9769 return regmove_cost->GP2FP * 2;
9770 else if (to == GENERAL_REGS)
9771 return regmove_cost->FP2GP * 2;
9773 /* When AdvSIMD instructions are disabled it is not possible to move
9774 a 128-bit value directly between Q registers. This is handled in
9775 secondary reload. A general register is used as a scratch to move
9776 the upper DI value and the lower DI value is moved directly,
9777 hence the cost is the sum of three moves. */
9778 if (! TARGET_SIMD)
9779 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
9781 return regmove_cost->FP2FP;
9784 if (from == GENERAL_REGS && to == GENERAL_REGS)
9785 return regmove_cost->GP2GP;
9786 else if (from == GENERAL_REGS)
9787 return regmove_cost->GP2FP;
9788 else if (to == GENERAL_REGS)
9789 return regmove_cost->FP2GP;
9791 return regmove_cost->FP2FP;
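/* Worked example (added for illustration; not in the original source):
   with the cost table of the current tuning target, moving a 16-byte
   (e.g. TImode) value from GENERAL_REGS to FP_REGS is costed as
   2 * GP2FP by the known_eq (GET_MODE_SIZE (mode), 16) branch above,
   reflecting the two 64-bit moves needed, while an 8-byte move costs
   a single GP2FP.  */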
9794 static int
9795 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
9796 reg_class_t rclass ATTRIBUTE_UNUSED,
9797 bool in ATTRIBUTE_UNUSED)
9799 return aarch64_tune_params.memmov_cost;
9802 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
9803 to optimize 1.0/sqrt. */
9805 static bool
9806 use_rsqrt_p (machine_mode mode)
9808 return (!flag_trapping_math
9809 && flag_unsafe_math_optimizations
9810 && ((aarch64_tune_params.approx_modes->recip_sqrt
9811 & AARCH64_APPROX_MODE (mode))
9812 || flag_mrecip_low_precision_sqrt));
9815 /* Function to decide when to use the approximate reciprocal square root
9816 builtin. */
9818 static tree
9819 aarch64_builtin_reciprocal (tree fndecl)
9821 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
9823 if (!use_rsqrt_p (mode))
9824 return NULL_TREE;
9825 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
9828 /* Emit instruction sequence to compute either the approximate square root
9829 or its approximate reciprocal, depending on the flag RECP, and return
9830 whether the sequence was emitted or not. */
9832 bool
9833 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
9835 machine_mode mode = GET_MODE (dst);
9837 if (GET_MODE_INNER (mode) == HFmode)
9839 gcc_assert (!recp);
9840 return false;
9843 if (!recp)
9845 if (!(flag_mlow_precision_sqrt
9846 || (aarch64_tune_params.approx_modes->sqrt
9847 & AARCH64_APPROX_MODE (mode))))
9848 return false;
9850 if (flag_finite_math_only
9851 || flag_trapping_math
9852 || !flag_unsafe_math_optimizations
9853 || optimize_function_for_size_p (cfun))
9854 return false;
9856 else
9857 /* Caller assumes we cannot fail. */
9858 gcc_assert (use_rsqrt_p (mode));
9860 machine_mode mmsk = mode_for_int_vector (mode).require ();
9861 rtx xmsk = gen_reg_rtx (mmsk);
9862 if (!recp)
9863 /* When calculating the approximate square root, compare the
9864 argument with 0.0 and create a mask. */
9865 emit_insn (gen_rtx_SET (xmsk,
9866 gen_rtx_NEG (mmsk,
9867 gen_rtx_EQ (mmsk, src,
9868 CONST0_RTX (mode)))));
9870 /* Estimate the approximate reciprocal square root. */
9871 rtx xdst = gen_reg_rtx (mode);
9872 emit_insn (gen_aarch64_rsqrte (mode, xdst, src));
9874 /* Iterate over the series twice for SF and thrice for DF. */
9875 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9877 /* Optionally iterate over the series once less for faster performance
9878 while sacrificing some accuracy. */
9879 if ((recp && flag_mrecip_low_precision_sqrt)
9880 || (!recp && flag_mlow_precision_sqrt))
9881 iterations--;
9883 /* Iterate over the series to calculate the approximate reciprocal square
9884 root. */
9885 rtx x1 = gen_reg_rtx (mode);
9886 while (iterations--)
9888 rtx x2 = gen_reg_rtx (mode);
9889 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
9891 emit_insn (gen_aarch64_rsqrts (mode, x1, src, x2));
9893 if (iterations > 0)
9894 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
9897 if (!recp)
9899 /* Qualify the approximate reciprocal square root when the argument is
9900 0.0 by squashing the intermediary result to 0.0. */
9901 rtx xtmp = gen_reg_rtx (mmsk);
9902 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
9903 gen_rtx_SUBREG (mmsk, xdst, 0)));
9904 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
9906 /* Calculate the approximate square root. */
9907 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
9910 /* Finalize the approximation. */
9911 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
9913 return true;
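/* Illustrative sketch (not part of the original file): a scalar model of
   the refinement loop emitted above.  FRSQRTE supplies a rough estimate
   of 1/sqrt(d) and each FRSQRTS step computes (3 - a*b)/2, so one pass
   performs the Newton-Raphson update x' = x * (3 - d*x*x)/2.  The seed
   parameter stands in for the hardware estimate.  */
#if 0
static double
example_rsqrt_refine (double seed, double d, int iterations)
{
  double x = seed;                       /* FRSQRTE-style initial estimate.  */
  while (iterations-- > 0)
    x = x * ((3.0 - d * x * x) / 2.0);   /* One FRSQRTS step.  */
  return x;                              /* Approximates 1/sqrt(d).  */
}
#endif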
9916 /* Emit the instruction sequence to compute the approximation for the division
9917 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
9919 bool
9920 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
9922 machine_mode mode = GET_MODE (quo);
9924 if (GET_MODE_INNER (mode) == HFmode)
9925 return false;
9927 bool use_approx_division_p = (flag_mlow_precision_div
9928 || (aarch64_tune_params.approx_modes->division
9929 & AARCH64_APPROX_MODE (mode)));
9931 if (!flag_finite_math_only
9932 || flag_trapping_math
9933 || !flag_unsafe_math_optimizations
9934 || optimize_function_for_size_p (cfun)
9935 || !use_approx_division_p)
9936 return false;
9938 if (!TARGET_SIMD && VECTOR_MODE_P (mode))
9939 return false;
9941 /* Estimate the approximate reciprocal. */
9942 rtx xrcp = gen_reg_rtx (mode);
9943 emit_insn (gen_aarch64_frecpe (mode, xrcp, den));
9945 /* Iterate over the series twice for SF and thrice for DF. */
9946 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
9948 /* Optionally iterate over the series once less for faster performance,
9949 while sacrificing some accuracy. */
9950 if (flag_mlow_precision_div)
9951 iterations--;
9953 /* Iterate over the series to calculate the approximate reciprocal. */
9954 rtx xtmp = gen_reg_rtx (mode);
9955 while (iterations--)
9957 emit_insn (gen_aarch64_frecps (mode, xtmp, xrcp, den));
9959 if (iterations > 0)
9960 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
9963 if (num != CONST1_RTX (mode))
9965 /* As the approximate reciprocal of DEN is already calculated, only
9966 calculate the approximate division when NUM is not 1.0. */
9967 rtx xnum = force_reg (mode, num);
9968 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
9971 /* Finalize the approximation. */
9972 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
9973 return true;
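/* Illustrative sketch (not part of the original file): a scalar model of
   the reciprocal refinement emitted above.  FRECPE gives a rough estimate
   of 1/d and each FRECPS step computes 2 - a*b, so one pass performs the
   Newton-Raphson update x' = x * (2 - d*x); the final quotient is then
   formed as num * (1/den), as in the code above.  */
#if 0
static double
example_recip_refine (double seed, double d, int iterations)
{
  double x = seed;                       /* FRECPE-style initial estimate.  */
  while (iterations-- > 0)
    x = x * (2.0 - d * x);               /* One FRECPS step.  */
  return x;                              /* Approximates 1/d.  */
}
#endif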
9976 /* Return the number of instructions that can be issued per cycle. */
9977 static int
9978 aarch64_sched_issue_rate (void)
9980 return aarch64_tune_params.issue_rate;
9983 static int
9984 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
9986 int issue_rate = aarch64_sched_issue_rate ();
9988 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
9992 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
9993 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
9994 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
9996 static int
9997 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
9998 int ready_index)
10000 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
10004 /* Vectorizer cost model target hooks. */
10006 /* Implement targetm.vectorize.builtin_vectorization_cost. */
10007 static int
10008 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
10009 tree vectype,
10010 int misalign ATTRIBUTE_UNUSED)
10012 unsigned elements;
10013 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
10014 bool fp = false;
10016 if (vectype != NULL)
10017 fp = FLOAT_TYPE_P (vectype);
10019 switch (type_of_cost)
10021 case scalar_stmt:
10022 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
10024 case scalar_load:
10025 return costs->scalar_load_cost;
10027 case scalar_store:
10028 return costs->scalar_store_cost;
10030 case vector_stmt:
10031 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10033 case vector_load:
10034 return costs->vec_align_load_cost;
10036 case vector_store:
10037 return costs->vec_store_cost;
10039 case vec_to_scalar:
10040 return costs->vec_to_scalar_cost;
10042 case scalar_to_vec:
10043 return costs->scalar_to_vec_cost;
10045 case unaligned_load:
10046 case vector_gather_load:
10047 return costs->vec_unalign_load_cost;
10049 case unaligned_store:
10050 case vector_scatter_store:
10051 return costs->vec_unalign_store_cost;
10053 case cond_branch_taken:
10054 return costs->cond_taken_branch_cost;
10056 case cond_branch_not_taken:
10057 return costs->cond_not_taken_branch_cost;
10059 case vec_perm:
10060 return costs->vec_permute_cost;
10062 case vec_promote_demote:
10063 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
10065 case vec_construct:
10066 elements = estimated_poly_value (TYPE_VECTOR_SUBPARTS (vectype));
10067 return elements / 2 + 1;
10069 default:
10070 gcc_unreachable ();
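/* Worked example (added for illustration): for the vec_construct case
   above, a vector with four elements is costed at 4 / 2 + 1 = 3, i.e.
   roughly one insertion per pair of elements plus one extra move.  */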
10074 /* Implement targetm.vectorize.add_stmt_cost. */
10075 static unsigned
10076 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
10077 struct _stmt_vec_info *stmt_info, int misalign,
10078 enum vect_cost_model_location where)
10080 unsigned *cost = (unsigned *) data;
10081 unsigned retval = 0;
10083 if (flag_vect_cost_model)
10085 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
10086 int stmt_cost =
10087 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
10089 /* Statements in an inner loop relative to the loop being
10090 vectorized are weighted more heavily. The value here is
10091 arbitrary and could potentially be improved with analysis. */
10092 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
10093 count *= 50; /* FIXME */
10095 retval = (unsigned) (count * stmt_cost);
10096 cost[where] += retval;
10099 return retval;
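/* Worked example (added for illustration): a vector_load statement in
   the main loop body with COUNT == 1 contributes vec_align_load_cost
   from the tuning table to cost[vect_body]; had the same statement sat
   in an inner loop relative to the loop being vectorized, the FIXME
   above would first scale its count by 50.  */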
10102 static void initialize_aarch64_code_model (struct gcc_options *);
10104 /* Parse the TO_PARSE string and put the architecture struct that it
10105 selects into RES and the architectural features into ISA_FLAGS.
10106 Return an aarch64_parse_opt_result describing the parse result.
10107 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
10109 static enum aarch64_parse_opt_result
10110 aarch64_parse_arch (const char *to_parse, const struct processor **res,
10111 unsigned long *isa_flags)
10113 char *ext;
10114 const struct processor *arch;
10115 char *str = (char *) alloca (strlen (to_parse) + 1);
10116 size_t len;
10118 strcpy (str, to_parse);
10120 ext = strchr (str, '+');
10122 if (ext != NULL)
10123 len = ext - str;
10124 else
10125 len = strlen (str);
10127 if (len == 0)
10128 return AARCH64_PARSE_MISSING_ARG;
10131 /* Loop through the list of supported ARCHes to find a match. */
10132 for (arch = all_architectures; arch->name != NULL; arch++)
10134 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
10136 unsigned long isa_temp = arch->flags;
10138 if (ext != NULL)
10140 /* TO_PARSE string contains at least one extension. */
10141 enum aarch64_parse_opt_result ext_res
10142 = aarch64_parse_extension (ext, &isa_temp);
10144 if (ext_res != AARCH64_PARSE_OK)
10145 return ext_res;
10147 /* Extension parsing was successful. Confirm the result
10148 arch and ISA flags. */
10149 *res = arch;
10150 *isa_flags = isa_temp;
10151 return AARCH64_PARSE_OK;
10155 /* ARCH name not found in list. */
10156 return AARCH64_PARSE_INVALID_ARG;
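/* Example (added for illustration): for -march=armv8.2-a+fp16 the code
   above splits the string at the first '+', matches "armv8.2-a" against
   all_architectures, and hands "+fp16" to aarch64_parse_extension to
   accumulate the extra ISA flags on top of the architecture's own.  */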
10159 /* Parse the TO_PARSE string and put the result tuning in RES and the
10160 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
10161 describing the parse result. If there is an error parsing, RES and
10162 ISA_FLAGS are left unchanged. */
10164 static enum aarch64_parse_opt_result
10165 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
10166 unsigned long *isa_flags)
10168 char *ext;
10169 const struct processor *cpu;
10170 char *str = (char *) alloca (strlen (to_parse) + 1);
10171 size_t len;
10173 strcpy (str, to_parse);
10175 ext = strchr (str, '+');
10177 if (ext != NULL)
10178 len = ext - str;
10179 else
10180 len = strlen (str);
10182 if (len == 0)
10183 return AARCH64_PARSE_MISSING_ARG;
10186 /* Loop through the list of supported CPUs to find a match. */
10187 for (cpu = all_cores; cpu->name != NULL; cpu++)
10189 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
10191 unsigned long isa_temp = cpu->flags;
10194 if (ext != NULL)
10196 /* TO_PARSE string contains at least one extension. */
10197 enum aarch64_parse_opt_result ext_res
10198 = aarch64_parse_extension (ext, &isa_temp);
10200 if (ext_res != AARCH64_PARSE_OK)
10201 return ext_res;
10203 /* Extension parsing was successful. Confirm the result
10204 cpu and ISA flags. */
10205 *res = cpu;
10206 *isa_flags = isa_temp;
10207 return AARCH64_PARSE_OK;
10211 /* CPU name not found in list. */
10212 return AARCH64_PARSE_INVALID_ARG;
10215 /* Parse the TO_PARSE string and put the cpu it selects into RES.
10216 Return an aarch64_parse_opt_result describing the parse result.
10217 If the parsing fails, RES does not change. */
10219 static enum aarch64_parse_opt_result
10220 aarch64_parse_tune (const char *to_parse, const struct processor **res)
10222 const struct processor *cpu;
10223 char *str = (char *) alloca (strlen (to_parse) + 1);
10225 strcpy (str, to_parse);
10227 /* Loop through the list of supported CPUs to find a match. */
10228 for (cpu = all_cores; cpu->name != NULL; cpu++)
10230 if (strcmp (cpu->name, str) == 0)
10232 *res = cpu;
10233 return AARCH64_PARSE_OK;
10237 /* CPU name not found in list. */
10238 return AARCH64_PARSE_INVALID_ARG;
10241 /* Parse TOKEN, which has length LENGTH, to see if it is an option
10242 described in FLAG. If it is, return the index bit for that fusion type.
10243 If not, error (printing OPTION_NAME) and return zero. */
10245 static unsigned int
10246 aarch64_parse_one_option_token (const char *token,
10247 size_t length,
10248 const struct aarch64_flag_desc *flag,
10249 const char *option_name)
10251 for (; flag->name != NULL; flag++)
10253 if (length == strlen (flag->name)
10254 && !strncmp (flag->name, token, length))
10255 return flag->flag;
10258 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
10259 return 0;
10262 /* Parse OPTION which is a comma-separated list of flags to enable.
10263 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
10264 default state we inherit from the CPU tuning structures. OPTION_NAME
10265 gives the top-level option we are parsing in the -moverride string,
10266 for use in error messages. */
10268 static unsigned int
10269 aarch64_parse_boolean_options (const char *option,
10270 const struct aarch64_flag_desc *flags,
10271 unsigned int initial_state,
10272 const char *option_name)
10274 const char separator = '.';
10275 const char* specs = option;
10276 const char* ntoken = option;
10277 unsigned int found_flags = initial_state;
10279 while ((ntoken = strchr (specs, separator)))
10281 size_t token_length = ntoken - specs;
10282 unsigned token_ops = aarch64_parse_one_option_token (specs,
10283 token_length,
10284 flags,
10285 option_name);
10286 /* If we find "none" (or, for simplicity's sake, an error) anywhere
10287 in the token stream, reset the supported operations. So:
10289 adrp+add.cmp+branch.none.adrp+add
10291 would have the result of turning on only adrp+add fusion. */
10292 if (!token_ops)
10293 found_flags = 0;
10295 found_flags |= token_ops;
10296 specs = ++ntoken;
10299 /* The string ended with a trailing separator; report an error. */
10300 if (!(*specs))
10302 error ("%s string ill-formed\n", option_name);
10303 return 0;
10306 /* We still have one more token to parse. */
10307 size_t token_length = strlen (specs);
10308 unsigned token_ops = aarch64_parse_one_option_token (specs,
10309 token_length,
10310 flags,
10311 option_name);
10312 if (!token_ops)
10313 found_flags = 0;
10315 found_flags |= token_ops;
10316 return found_flags;
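/* Example (added for illustration): given the string
   "adrp+add.cmp+branch", the loop above splits it at each '.' and ORs
   the bit for each recognised fusion pair into FOUND_FLAGS on top of
   INITIAL_STATE; a "none" token (or an unrecognised one) resets the
   accumulated set, as described in the comment above.  */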
10319 /* Support for overriding instruction fusion. */
10321 static void
10322 aarch64_parse_fuse_string (const char *fuse_string,
10323 struct tune_params *tune)
10325 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
10326 aarch64_fusible_pairs,
10327 tune->fusible_ops,
10328 "fuse=");
10331 /* Support for overriding other tuning flags. */
10333 static void
10334 aarch64_parse_tune_string (const char *tune_string,
10335 struct tune_params *tune)
10337 tune->extra_tuning_flags
10338 = aarch64_parse_boolean_options (tune_string,
10339 aarch64_tuning_flags,
10340 tune->extra_tuning_flags,
10341 "tune=");
10344 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
10345 we understand. If it is, extract the option string and hand it off to
10346 the appropriate function. */
10348 void
10349 aarch64_parse_one_override_token (const char* token,
10350 size_t length,
10351 struct tune_params *tune)
10353 const struct aarch64_tuning_override_function *fn
10354 = aarch64_tuning_override_functions;
10356 const char *option_part = strchr (token, '=');
10357 if (!option_part)
10359 error ("tuning string missing in option (%s)", token);
10360 return;
10363 /* Get the length of the option name. */
10364 length = option_part - token;
10365 /* Skip the '=' to get to the option string. */
10366 option_part++;
10368 for (; fn->name != NULL; fn++)
10370 if (!strncmp (fn->name, token, length))
10372 fn->parse_override (option_part, tune);
10373 return;
10377 error ("unknown tuning option (%s)", token);
10378 return;
10381 /* A checking mechanism for the implementation of the TLS size. */
10383 static void
10384 initialize_aarch64_tls_size (struct gcc_options *opts)
10386 if (aarch64_tls_size == 0)
10387 aarch64_tls_size = 24;
10389 switch (opts->x_aarch64_cmodel_var)
10391 case AARCH64_CMODEL_TINY:
10392 /* Both the default and maximum TLS size allowed under tiny are 1M, which
10393 needs two instructions to address, so we clamp the size to 24 bits. */
10394 if (aarch64_tls_size > 24)
10395 aarch64_tls_size = 24;
10396 break;
10397 case AARCH64_CMODEL_SMALL:
10398 /* The maximum TLS size allowed under small is 4G. */
10399 if (aarch64_tls_size > 32)
10400 aarch64_tls_size = 32;
10401 break;
10402 case AARCH64_CMODEL_LARGE:
10403 /* The maximum TLS size allowed under large is 16E.
10404 FIXME: 16E should be 64-bit; we only support a 48-bit offset now. */
10405 if (aarch64_tls_size > 48)
10406 aarch64_tls_size = 48;
10407 break;
10408 default:
10409 gcc_unreachable ();
10412 return;
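/* Worked example (added for illustration): -mtls-size=32 combined with
   -mcmodel=tiny is clamped to 24 above, since the tiny model can only
   address a 1M TLS area with its two-instruction sequence.  */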
10415 /* Parse STRING looking for options in the format:
10416 string :: option:string
10417 option :: name=substring
10418 name :: {a-z}
10419 substring :: defined by option. */
10421 static void
10422 aarch64_parse_override_string (const char* input_string,
10423 struct tune_params* tune)
10425 const char separator = ':';
10426 size_t string_length = strlen (input_string) + 1;
10427 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
10428 char *string = string_root;
10429 strncpy (string, input_string, string_length);
10430 string[string_length - 1] = '\0';
10432 char* ntoken = string;
10434 while ((ntoken = strchr (string, separator)))
10436 size_t token_length = ntoken - string;
10438 /* NUL-terminate this substring so it looks like a string. */
10438 *ntoken = '\0';
10439 aarch64_parse_one_override_token (string, token_length, tune);
10440 string = ++ntoken;
10443 /* One last option to parse. */
10444 aarch64_parse_one_override_token (string, strlen (string), tune);
10445 free (string_root);
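/* Example (added for illustration): an option such as
   -moverride=fuse=adrp+add.cmp+branch:tune=... is split at each ':'
   above, and each "name=substring" token is dispatched by
   aarch64_parse_one_override_token to the matching handler
   (aarch64_parse_fuse_string or aarch64_parse_tune_string).  The tune=
   substring is left elided here; the valid flag names are those listed
   in aarch64_tuning_flags.  */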
10449 static void
10450 aarch64_override_options_after_change_1 (struct gcc_options *opts)
10452 /* PR 70044: We have to be careful about being called multiple times for the
10453 same function. This means all changes should be repeatable. */
10455 /* Set aarch64_use_frame_pointer based on -fno-omit-frame-pointer.
10456 Disable the frame pointer flag so the mid-end will not use a frame
10457 pointer in leaf functions in order to support -fomit-leaf-frame-pointer.
10458 Set x_flag_omit_frame_pointer to the special value 2 to differentiate
10459 between -fomit-frame-pointer (1) and -fno-omit-frame-pointer (2). */
10460 aarch64_use_frame_pointer = opts->x_flag_omit_frame_pointer != 1;
10461 if (opts->x_flag_omit_frame_pointer == 0)
10462 opts->x_flag_omit_frame_pointer = 2;
10464 /* If not optimizing for size, set the default
10465 alignment to what the target wants. */
10466 if (!opts->x_optimize_size)
10468 if (opts->x_flag_align_loops && !opts->x_str_align_loops)
10469 opts->x_str_align_loops = aarch64_tune_params.loop_align;
10470 if (opts->x_flag_align_jumps && !opts->x_str_align_jumps)
10471 opts->x_str_align_jumps = aarch64_tune_params.jump_align;
10472 if (opts->x_flag_align_functions && !opts->x_str_align_functions)
10473 opts->x_str_align_functions = aarch64_tune_params.function_align;
10476 /* We default to no pc-relative literal loads. */
10478 aarch64_pcrelative_literal_loads = false;
10480 /* If -mpc-relative-literal-loads is set on the command line, this
10481 implies that the user asked for PC relative literal loads. */
10482 if (opts->x_pcrelative_literal_loads == 1)
10483 aarch64_pcrelative_literal_loads = true;
10485 /* In the tiny memory model it makes no sense to disallow PC relative
10486 literal pool loads. */
10487 if (aarch64_cmodel == AARCH64_CMODEL_TINY
10488 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
10489 aarch64_pcrelative_literal_loads = true;
10491 /* When enabling the lower precision Newton series for the square root, also
10492 enable it for the reciprocal square root, since the latter is an
10493 intermediary step for the former. */
10494 if (flag_mlow_precision_sqrt)
10495 flag_mrecip_low_precision_sqrt = true;
10498 /* 'Unpack' the internal tuning structs and update the options
10499 in OPTS. The caller must have set up selected_tune and selected_arch
10500 as all the other target-specific codegen decisions are
10501 derived from them. */
10503 void
10504 aarch64_override_options_internal (struct gcc_options *opts)
10506 aarch64_tune_flags = selected_tune->flags;
10507 aarch64_tune = selected_tune->sched_core;
10508 /* Make a copy of the tuning parameters attached to the core, which
10509 we may later overwrite. */
10510 aarch64_tune_params = *(selected_tune->tune);
10511 aarch64_architecture_version = selected_arch->architecture_version;
10513 if (opts->x_aarch64_override_tune_string)
10514 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
10515 &aarch64_tune_params);
10517 /* This target defaults to strict volatile bitfields. */
10518 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
10519 opts->x_flag_strict_volatile_bitfields = 1;
10521 initialize_aarch64_code_model (opts);
10522 initialize_aarch64_tls_size (opts);
10524 int queue_depth = 0;
10525 switch (aarch64_tune_params.autoprefetcher_model)
10527 case tune_params::AUTOPREFETCHER_OFF:
10528 queue_depth = -1;
10529 break;
10530 case tune_params::AUTOPREFETCHER_WEAK:
10531 queue_depth = 0;
10532 break;
10533 case tune_params::AUTOPREFETCHER_STRONG:
10534 queue_depth = max_insn_queue_index + 1;
10535 break;
10536 default:
10537 gcc_unreachable ();
10540 /* We don't mind passing in global_options_set here as we don't use
10541 the *options_set structs anyway. */
10542 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
10543 queue_depth,
10544 opts->x_param_values,
10545 global_options_set.x_param_values);
10547 /* Set up parameters to be used in the prefetching algorithm. Do not
10548 override the defaults unless we are tuning for a core we have
10549 researched values for. */
10550 if (aarch64_tune_params.prefetch->num_slots > 0)
10551 maybe_set_param_value (PARAM_SIMULTANEOUS_PREFETCHES,
10552 aarch64_tune_params.prefetch->num_slots,
10553 opts->x_param_values,
10554 global_options_set.x_param_values);
10555 if (aarch64_tune_params.prefetch->l1_cache_size >= 0)
10556 maybe_set_param_value (PARAM_L1_CACHE_SIZE,
10557 aarch64_tune_params.prefetch->l1_cache_size,
10558 opts->x_param_values,
10559 global_options_set.x_param_values);
10560 if (aarch64_tune_params.prefetch->l1_cache_line_size >= 0)
10561 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
10562 aarch64_tune_params.prefetch->l1_cache_line_size,
10563 opts->x_param_values,
10564 global_options_set.x_param_values);
10565 if (aarch64_tune_params.prefetch->l2_cache_size >= 0)
10566 maybe_set_param_value (PARAM_L2_CACHE_SIZE,
10567 aarch64_tune_params.prefetch->l2_cache_size,
10568 opts->x_param_values,
10569 global_options_set.x_param_values);
10570 if (!aarch64_tune_params.prefetch->prefetch_dynamic_strides)
10571 maybe_set_param_value (PARAM_PREFETCH_DYNAMIC_STRIDES,
10572 0,
10573 opts->x_param_values,
10574 global_options_set.x_param_values);
10575 if (aarch64_tune_params.prefetch->minimum_stride >= 0)
10576 maybe_set_param_value (PARAM_PREFETCH_MINIMUM_STRIDE,
10577 aarch64_tune_params.prefetch->minimum_stride,
10578 opts->x_param_values,
10579 global_options_set.x_param_values);
10581 /* Use the alternative scheduling-pressure algorithm by default. */
10582 maybe_set_param_value (PARAM_SCHED_PRESSURE_ALGORITHM, SCHED_PRESSURE_MODEL,
10583 opts->x_param_values,
10584 global_options_set.x_param_values);
10586 /* Enable software prefetching at the specified optimization level for
10587 CPUs that have prefetch. Lower optimization level threshold by 1
10588 when profiling is enabled. */
10589 if (opts->x_flag_prefetch_loop_arrays < 0
10590 && !opts->x_optimize_size
10591 && aarch64_tune_params.prefetch->default_opt_level >= 0
10592 && opts->x_optimize >= aarch64_tune_params.prefetch->default_opt_level)
10593 opts->x_flag_prefetch_loop_arrays = 1;
10595 if (opts->x_aarch64_arch_string == NULL)
10596 opts->x_aarch64_arch_string = selected_arch->name;
10597 if (opts->x_aarch64_cpu_string == NULL)
10598 opts->x_aarch64_cpu_string = selected_cpu->name;
10599 if (opts->x_aarch64_tune_string == NULL)
10600 opts->x_aarch64_tune_string = selected_tune->name;
10602 aarch64_override_options_after_change_1 (opts);
10605 /* Print a hint with a suggestion for a core or architecture name that
10606 most closely resembles what the user passed in STR. ARCH is true if
10607 the user is asking for an architecture name. ARCH is false if the user
10608 is asking for a core name. */
10610 static void
10611 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
10613 auto_vec<const char *> candidates;
10614 const struct processor *entry = arch ? all_architectures : all_cores;
10615 for (; entry->name != NULL; entry++)
10616 candidates.safe_push (entry->name);
10618 #ifdef HAVE_LOCAL_CPU_DETECT
10619 /* Also add "native" as a possible value. */
10620 if (arch)
10621 candidates.safe_push ("native");
10622 #endif
10624 char *s;
10625 const char *hint = candidates_list_and_hint (str, s, candidates);
10626 if (hint)
10627 inform (input_location, "valid arguments are: %s;"
10628 " did you mean %qs?", s, hint);
10629 else
10630 inform (input_location, "valid arguments are: %s", s);
10632 XDELETEVEC (s);
10635 /* Print a hint with a suggestion for a core name that most closely resembles
10636 what the user passed in STR. */
10638 inline static void
10639 aarch64_print_hint_for_core (const char *str)
10641 aarch64_print_hint_for_core_or_arch (str, false);
10644 /* Print a hint with a suggestion for an architecture name that most closely
10645 resembles what the user passed in STR. */
10647 inline static void
10648 aarch64_print_hint_for_arch (const char *str)
10650 aarch64_print_hint_for_core_or_arch (str, true);
10653 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
10654 specified in STR and throw errors if appropriate. Put the results if
10655 they are valid in RES and ISA_FLAGS. Return whether the option is
10656 valid. */
10658 static bool
10659 aarch64_validate_mcpu (const char *str, const struct processor **res,
10660 unsigned long *isa_flags)
10662 enum aarch64_parse_opt_result parse_res
10663 = aarch64_parse_cpu (str, res, isa_flags);
10665 if (parse_res == AARCH64_PARSE_OK)
10666 return true;
10668 switch (parse_res)
10670 case AARCH64_PARSE_MISSING_ARG:
10671 error ("missing cpu name in %<-mcpu=%s%>", str);
10672 break;
10673 case AARCH64_PARSE_INVALID_ARG:
10674 error ("unknown value %qs for -mcpu", str);
10675 aarch64_print_hint_for_core (str);
10676 break;
10677 case AARCH64_PARSE_INVALID_FEATURE:
10678 error ("invalid feature modifier in %<-mcpu=%s%>", str);
10679 break;
10680 default:
10681 gcc_unreachable ();
10684 return false;
10687 /* Validate a command-line -march option. Parse the arch and extensions
10688 (if any) specified in STR and throw errors if appropriate. Put the
10689 results, if they are valid, in RES and ISA_FLAGS. Return whether the
10690 option is valid. */
10692 static bool
10693 aarch64_validate_march (const char *str, const struct processor **res,
10694 unsigned long *isa_flags)
10696 enum aarch64_parse_opt_result parse_res
10697 = aarch64_parse_arch (str, res, isa_flags);
10699 if (parse_res == AARCH64_PARSE_OK)
10700 return true;
10702 switch (parse_res)
10704 case AARCH64_PARSE_MISSING_ARG:
10705 error ("missing arch name in %<-march=%s%>", str);
10706 break;
10707 case AARCH64_PARSE_INVALID_ARG:
10708 error ("unknown value %qs for -march", str);
10709 aarch64_print_hint_for_arch (str);
10710 break;
10711 case AARCH64_PARSE_INVALID_FEATURE:
10712 error ("invalid feature modifier in %<-march=%s%>", str);
10713 break;
10714 default:
10715 gcc_unreachable ();
10718 return false;
10721 /* Validate a command-line -mtune option. Parse the cpu
10722 specified in STR and throw errors if appropriate. Put the
10723 result, if it is valid, in RES. Return whether the option is
10724 valid. */
10726 static bool
10727 aarch64_validate_mtune (const char *str, const struct processor **res)
10729 enum aarch64_parse_opt_result parse_res
10730 = aarch64_parse_tune (str, res);
10732 if (parse_res == AARCH64_PARSE_OK)
10733 return true;
10735 switch (parse_res)
10737 case AARCH64_PARSE_MISSING_ARG:
10738 error ("missing cpu name in %<-mtune=%s%>", str);
10739 break;
10740 case AARCH64_PARSE_INVALID_ARG:
10741 error ("unknown value %qs for -mtune", str);
10742 aarch64_print_hint_for_core (str);
10743 break;
10744 default:
10745 gcc_unreachable ();
10747 return false;
10750 /* Return the CPU corresponding to the enum CPU.
10751 If it doesn't specify a cpu, return the default. */
10753 static const struct processor *
10754 aarch64_get_tune_cpu (enum aarch64_processor cpu)
10756 if (cpu != aarch64_none)
10757 return &all_cores[cpu];
10759 /* The & 0x3f is to extract the bottom 6 bits that encode the
10760 default cpu as selected by the --with-cpu GCC configure option
10761 in config.gcc.
10762 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
10763 flags mechanism should be reworked to make it more sane. */
10764 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10767 /* Return the architecture corresponding to the enum ARCH.
10768 If it doesn't specify a valid architecture, return the default. */
10770 static const struct processor *
10771 aarch64_get_arch (enum aarch64_arch arch)
10773 if (arch != aarch64_no_arch)
10774 return &all_architectures[arch];
10776 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
10778 return &all_architectures[cpu->arch];
10781 /* Return the VG value associated with -msve-vector-bits= value VALUE. */
10783 static poly_uint16
10784 aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits_enum value)
10786 /* For now generate vector-length agnostic code for -msve-vector-bits=128.
10787 This ensures we can clearly distinguish SVE and Advanced SIMD modes when
10788 deciding which .md file patterns to use and when deciding whether
10789 something is a legitimate address or constant. */
10790 if (value == SVE_SCALABLE || value == SVE_128)
10791 return poly_uint16 (2, 2);
10792 else
10793 return (int) value / 64;
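/* Worked example (added for illustration): -msve-vector-bits=256 yields
   a VG of 256 / 64 = 4 (four 64-bit granules per vector), while both
   -msve-vector-bits=scalable and -msve-vector-bits=128 produce the
   runtime-invariant poly_uint16 (2, 2), i.e. vector-length-agnostic
   code, for the reason given in the comment above.  */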
10796 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
10797 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
10798 tuning structs. In particular it must set selected_tune and
10799 aarch64_isa_flags that define the available ISA features and tuning
10800 decisions. It must also set selected_arch as this will be used to
10801 output the .arch asm tags for each function. */
10803 static void
10804 aarch64_override_options (void)
10806 unsigned long cpu_isa = 0;
10807 unsigned long arch_isa = 0;
10808 aarch64_isa_flags = 0;
10810 bool valid_cpu = true;
10811 bool valid_tune = true;
10812 bool valid_arch = true;
10814 selected_cpu = NULL;
10815 selected_arch = NULL;
10816 selected_tune = NULL;
10818 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
10819 If either of -march or -mtune is given, they override their
10820 respective component of -mcpu. */
10821 if (aarch64_cpu_string)
10822 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
10823 &cpu_isa);
10825 if (aarch64_arch_string)
10826 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
10827 &arch_isa);
10829 if (aarch64_tune_string)
10830 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
10832 /* If the user did not specify a processor, choose the default
10833 one for them. This will be the CPU set during configuration using
10834 --with-cpu, otherwise it is "generic". */
10835 if (!selected_cpu)
10837 if (selected_arch)
10839 selected_cpu = &all_cores[selected_arch->ident];
10840 aarch64_isa_flags = arch_isa;
10841 explicit_arch = selected_arch->arch;
10843 else
10845 /* Get default configure-time CPU. */
10846 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
10847 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
10850 if (selected_tune)
10851 explicit_tune_core = selected_tune->ident;
10853 /* If both -mcpu and -march are specified, check that they are architecturally
10854 compatible; warn if they're not, and prefer the -march ISA flags. */
10855 else if (selected_arch)
10857 if (selected_arch->arch != selected_cpu->arch)
10859 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
10860 all_architectures[selected_cpu->arch].name,
10861 selected_arch->name);
10863 aarch64_isa_flags = arch_isa;
10864 explicit_arch = selected_arch->arch;
10865 explicit_tune_core = selected_tune ? selected_tune->ident
10866 : selected_cpu->ident;
10868 else
10870 /* -mcpu but no -march. */
10871 aarch64_isa_flags = cpu_isa;
10872 explicit_tune_core = selected_tune ? selected_tune->ident
10873 : selected_cpu->ident;
10874 gcc_assert (selected_cpu);
10875 selected_arch = &all_architectures[selected_cpu->arch];
10876 explicit_arch = selected_arch->arch;
10879 /* Set the arch as well, as we will need it when outputting
10880 the .arch directive in assembly. */
10881 if (!selected_arch)
10883 gcc_assert (selected_cpu);
10884 selected_arch = &all_architectures[selected_cpu->arch];
10887 if (!selected_tune)
10888 selected_tune = selected_cpu;
10890 #ifndef HAVE_AS_MABI_OPTION
10891 /* The compiler may have been configured with 2.23.* binutils, which does
10892 not have support for ILP32. */
10893 if (TARGET_ILP32)
10894 error ("assembler does not support -mabi=ilp32");
10895 #endif
10897 /* Convert -msve-vector-bits to a VG count. */
10898 aarch64_sve_vg = aarch64_convert_sve_vector_bits (aarch64_sve_vector_bits);
10900 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
10901 sorry ("return address signing is only supported for -mabi=lp64");
10903 /* Make sure we properly set up the explicit options. */
10904 if ((aarch64_cpu_string && valid_cpu)
10905 || (aarch64_tune_string && valid_tune))
10906 gcc_assert (explicit_tune_core != aarch64_none);
10908 if ((aarch64_cpu_string && valid_cpu)
10909 || (aarch64_arch_string && valid_arch))
10910 gcc_assert (explicit_arch != aarch64_no_arch);
10912 aarch64_override_options_internal (&global_options);
10914 /* Save these options as the default ones in case we push and pop them later
10915 while processing functions with potential target attributes. */
10916 target_option_default_node = target_option_current_node
10917 = build_target_option_node (&global_options);
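/* Example (added for illustration): with -mcpu=cortex-a57 -mtune=cortex-a72
   the architecture and ISA flags are taken from cortex-a57 while the
   tuning core becomes cortex-a72; with -mcpu=cortex-a57 -march=armv8.2-a
   the code above warns that the two switches conflict and prefers the
   -march ISA flags, as documented in the comments above.  */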
10920 /* Implement targetm.override_options_after_change. */
10922 static void
10923 aarch64_override_options_after_change (void)
10925 aarch64_override_options_after_change_1 (&global_options);
10928 static struct machine_function *
10929 aarch64_init_machine_status (void)
10931 struct machine_function *machine;
10932 machine = ggc_cleared_alloc<machine_function> ();
10933 return machine;
10936 void
10937 aarch64_init_expanders (void)
10939 init_machine_status = aarch64_init_machine_status;
10942 /* A checking mechanism for the implementation of the various code models. */
10943 static void
10944 initialize_aarch64_code_model (struct gcc_options *opts)
10946 if (opts->x_flag_pic)
10948 switch (opts->x_aarch64_cmodel_var)
10950 case AARCH64_CMODEL_TINY:
10951 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
10952 break;
10953 case AARCH64_CMODEL_SMALL:
10954 #ifdef HAVE_AS_SMALL_PIC_RELOCS
10955 aarch64_cmodel = (flag_pic == 2
10956 ? AARCH64_CMODEL_SMALL_PIC
10957 : AARCH64_CMODEL_SMALL_SPIC);
10958 #else
10959 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
10960 #endif
10961 break;
10962 case AARCH64_CMODEL_LARGE:
10963 sorry ("code model %qs with -f%s", "large",
10964 opts->x_flag_pic > 1 ? "PIC" : "pic");
10965 break;
10966 default:
10967 gcc_unreachable ();
10970 else
10971 aarch64_cmodel = opts->x_aarch64_cmodel_var;
10974 /* Implement TARGET_OPTION_SAVE. */
10976 static void
10977 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
10979 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
10982 /* Implement TARGET_OPTION_RESTORE. Restore the backend codegen decisions
10983 using the information saved in PTR. */
10985 static void
10986 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
10988 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
10989 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
10990 opts->x_explicit_arch = ptr->x_explicit_arch;
10991 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
10992 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
10994 aarch64_override_options_internal (opts);
10997 /* Implement TARGET_OPTION_PRINT. */
10999 static void
11000 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
11002 const struct processor *cpu
11003 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
11004 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
11005 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
11006 std::string extension
11007 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
11009 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
11010 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
11011 arch->name, extension.c_str ());
11014 static GTY(()) tree aarch64_previous_fndecl;
11016 void
11017 aarch64_reset_previous_fndecl (void)
11019 aarch64_previous_fndecl = NULL;
11022 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
11023 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
11024 make sure optab availability predicates are recomputed when necessary. */
11026 void
11027 aarch64_save_restore_target_globals (tree new_tree)
11029 if (TREE_TARGET_GLOBALS (new_tree))
11030 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
11031 else if (new_tree == target_option_default_node)
11032 restore_target_globals (&default_target_globals);
11033 else
11034 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
11037 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
11038 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
11039 of the function, if such exists. This function may be called multiple
11040 times on a single function so use aarch64_previous_fndecl to avoid
11041 setting up identical state. */
11043 static void
11044 aarch64_set_current_function (tree fndecl)
11046 if (!fndecl || fndecl == aarch64_previous_fndecl)
11047 return;
11049 tree old_tree = (aarch64_previous_fndecl
11050 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
11051 : NULL_TREE);
11053 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11055 /* If current function has no attributes but the previous one did,
11056 use the default node. */
11057 if (!new_tree && old_tree)
11058 new_tree = target_option_default_node;
11060 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
11061 the default have been handled by aarch64_save_restore_target_globals from
11062 aarch64_pragma_target_parse. */
11063 if (old_tree == new_tree)
11064 return;
11066 aarch64_previous_fndecl = fndecl;
11068 /* First set the target options. */
11069 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
11071 aarch64_save_restore_target_globals (new_tree);
11074 /* Enum describing the various ways we can handle attributes.
11075 In many cases we can reuse the generic option handling machinery. */
11077 enum aarch64_attr_opt_type
11079 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
11080 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
11081 aarch64_attr_enum, /* Attribute sets an enum variable. */
11082 aarch64_attr_custom /* Attribute requires a custom handling function. */
11085 /* All the information needed to handle a target attribute.
11086 NAME is the name of the attribute.
11087 ATTR_TYPE specifies the type of behavior of the attribute as described
11088 in the definition of enum aarch64_attr_opt_type.
11089 ALLOW_NEG is true if the attribute supports a "no-" form.
11090 HANDLER is the function that takes the attribute string as an argument.
11091 It is needed only when the ATTR_TYPE is aarch64_attr_custom.
11092 OPT_NUM is the enum specifying the option that the attribute modifies.
11093 This is needed for attributes that mirror the behavior of a command-line
11094 option, that is, it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
11095 aarch64_attr_enum. */
11097 struct aarch64_attribute_info
11099 const char *name;
11100 enum aarch64_attr_opt_type attr_type;
11101 bool allow_neg;
11102 bool (*handler) (const char *);
11103 enum opt_code opt_num;
11106 /* Handle the ARCH_STR argument to the arch= target attribute. */
11108 static bool
11109 aarch64_handle_attr_arch (const char *str)
11111 const struct processor *tmp_arch = NULL;
11112 enum aarch64_parse_opt_result parse_res
11113 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
11115 if (parse_res == AARCH64_PARSE_OK)
11117 gcc_assert (tmp_arch);
11118 selected_arch = tmp_arch;
11119 explicit_arch = selected_arch->arch;
11120 return true;
11123 switch (parse_res)
11125 case AARCH64_PARSE_MISSING_ARG:
11126 error ("missing name in %<target(\"arch=\")%> pragma or attribute");
11127 break;
11128 case AARCH64_PARSE_INVALID_ARG:
11129 error ("invalid name (\"%s\") in %<target(\"arch=\")%> pragma or attribute", str);
11130 aarch64_print_hint_for_arch (str);
11131 break;
11132 case AARCH64_PARSE_INVALID_FEATURE:
11133 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11134 break;
11135 default:
11136 gcc_unreachable ();
11139 return false;
11142 /* Handle the argument CPU_STR to the cpu= target attribute. */
11144 static bool
11145 aarch64_handle_attr_cpu (const char *str)
11147 const struct processor *tmp_cpu = NULL;
11148 enum aarch64_parse_opt_result parse_res
11149 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
11151 if (parse_res == AARCH64_PARSE_OK)
11153 gcc_assert (tmp_cpu);
11154 selected_tune = tmp_cpu;
11155 explicit_tune_core = selected_tune->ident;
11157 selected_arch = &all_architectures[tmp_cpu->arch];
11158 explicit_arch = selected_arch->arch;
11159 return true;
11162 switch (parse_res)
11164 case AARCH64_PARSE_MISSING_ARG:
11165 error ("missing name in %<target(\"cpu=\")%> pragma or attribute");
11166 break;
11167 case AARCH64_PARSE_INVALID_ARG:
11168 error ("invalid name (\"%s\") in %<target(\"cpu=\")%> pragma or attribute", str);
11169 aarch64_print_hint_for_core (str);
11170 break;
11171 case AARCH64_PARSE_INVALID_FEATURE:
11172 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11173 break;
11174 default:
11175 gcc_unreachable ();
11178 return false;
11181 /* Handle the argument STR to the tune= target attribute. */
11183 static bool
11184 aarch64_handle_attr_tune (const char *str)
11186 const struct processor *tmp_tune = NULL;
11187 enum aarch64_parse_opt_result parse_res
11188 = aarch64_parse_tune (str, &tmp_tune);
11190 if (parse_res == AARCH64_PARSE_OK)
11192 gcc_assert (tmp_tune);
11193 selected_tune = tmp_tune;
11194 explicit_tune_core = selected_tune->ident;
11195 return true;
11198 switch (parse_res)
11200 case AARCH64_PARSE_INVALID_ARG:
11201 error ("invalid name (\"%s\") in %<target(\"tune=\")%> pragma or attribute", str);
11202 aarch64_print_hint_for_core (str);
11203 break;
11204 default:
11205 gcc_unreachable ();
11208 return false;
11211 /* Parse an architecture extensions target attribute string specified in STR.
11212 For example "+fp+nosimd". Show any errors if needed. Return TRUE
11213 if successful. Update aarch64_isa_flags to reflect the ISA features
11214 modified. */
11216 static bool
11217 aarch64_handle_attr_isa_flags (char *str)
11219 enum aarch64_parse_opt_result parse_res;
11220 unsigned long isa_flags = aarch64_isa_flags;
11222 /* We allow "+nothing" in the beginning to clear out all architectural
11223 features if the user wants to handpick specific features. */
11224 if (strncmp ("+nothing", str, 8) == 0)
11226 isa_flags = 0;
11227 str += 8;
11230 parse_res = aarch64_parse_extension (str, &isa_flags);
11232 if (parse_res == AARCH64_PARSE_OK)
11234 aarch64_isa_flags = isa_flags;
11235 return true;
11238 switch (parse_res)
11240 case AARCH64_PARSE_MISSING_ARG:
11241 error ("missing value in %<target()%> pragma or attribute");
11242 break;
11244 case AARCH64_PARSE_INVALID_FEATURE:
11245 error ("invalid value (\"%s\") in %<target()%> pragma or attribute", str);
11246 break;
11248 default:
11249 gcc_unreachable ();
11252 return false;
11255 /* The target attributes that we support. On top of these we also support just
11256 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
11257 handled explicitly in aarch64_process_one_target_attr. */
11259 static const struct aarch64_attribute_info aarch64_attributes[] =
11261 { "general-regs-only", aarch64_attr_mask, false, NULL,
11262 OPT_mgeneral_regs_only },
11263 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
11264 OPT_mfix_cortex_a53_835769 },
11265 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
11266 OPT_mfix_cortex_a53_843419 },
11267 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
11268 { "strict-align", aarch64_attr_mask, true, NULL, OPT_mstrict_align },
11269 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
11270 OPT_momit_leaf_frame_pointer },
11271 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
11272 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
11273 OPT_march_ },
11274 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
11275 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
11276 OPT_mtune_ },
11277 { "sign-return-address", aarch64_attr_enum, false, NULL,
11278 OPT_msign_return_address_ },
11279 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
11282 /* Parse ARG_STR which contains the definition of one target attribute.
11283 Show appropriate errors if any or return true if the attribute is valid. */
11285 static bool
11286 aarch64_process_one_target_attr (char *arg_str)
11288 bool invert = false;
11290 size_t len = strlen (arg_str);
11292 if (len == 0)
11294 error ("malformed %<target()%> pragma or attribute");
11295 return false;
11298 char *str_to_check = (char *) alloca (len + 1);
11299 strcpy (str_to_check, arg_str);
11301 /* Skip leading whitespace. */
11302 while (*str_to_check == ' ' || *str_to_check == '\t')
11303 str_to_check++;
11305 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
11306 It is easier to detect and handle it explicitly here rather than going
11307 through the machinery for the rest of the target attributes in this
11308 function. */
11309 if (*str_to_check == '+')
11310 return aarch64_handle_attr_isa_flags (str_to_check);
11312 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
11314 invert = true;
11315 str_to_check += 3;
11317 char *arg = strchr (str_to_check, '=');
11319 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
11320 and point ARG to "foo". */
11321 if (arg)
11323 *arg = '\0';
11324 arg++;
11326 const struct aarch64_attribute_info *p_attr;
11327 bool found = false;
11328 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
11330 /* If the names don't match up, or the user has given an argument
11331 to an attribute that doesn't accept one, or didn't give an argument
11332 to an attribute that expects one, fail to match. */
11333 if (strcmp (str_to_check, p_attr->name) != 0)
11334 continue;
11336 found = true;
11337 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
11338 || p_attr->attr_type == aarch64_attr_enum;
11340 if (attr_need_arg_p ^ (arg != NULL))
11342 error ("pragma or attribute %<target(\"%s\")%> does not accept an argument", str_to_check);
11343 return false;
11346 /* If the name matches but the attribute does not allow "no-" versions
11347 then we can't match. */
11348 if (invert && !p_attr->allow_neg)
11350 error ("pragma or attribute %<target(\"%s\")%> does not allow a negated form", str_to_check);
11351 return false;
11354 switch (p_attr->attr_type)
11356 /* Has a custom handler registered.
11357 For example, cpu=, arch=, tune=. */
11358 case aarch64_attr_custom:
11359 gcc_assert (p_attr->handler);
11360 if (!p_attr->handler (arg))
11361 return false;
11362 break;
11364 /* Either set or unset a boolean option. */
11365 case aarch64_attr_bool:
11367 struct cl_decoded_option decoded;
11369 generate_option (p_attr->opt_num, NULL, !invert,
11370 CL_TARGET, &decoded);
11371 aarch64_handle_option (&global_options, &global_options_set,
11372 &decoded, input_location);
11373 break;
11375 /* Set or unset a bit in the target_flags. aarch64_handle_option
11376 should know what mask to apply given the option number. */
11377 case aarch64_attr_mask:
11379 struct cl_decoded_option decoded;
11380 /* We only need to specify the option number.
11381 aarch64_handle_option will know which mask to apply. */
11382 decoded.opt_index = p_attr->opt_num;
11383 decoded.value = !invert;
11384 aarch64_handle_option (&global_options, &global_options_set,
11385 &decoded, input_location);
11386 break;
11388 /* Use the option setting machinery to set an option to an enum. */
11389 case aarch64_attr_enum:
11391 gcc_assert (arg);
11392 bool valid;
11393 int value;
11394 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
11395 &value, CL_TARGET);
11396 if (valid)
11398 set_option (&global_options, NULL, p_attr->opt_num, value,
11399 NULL, DK_UNSPECIFIED, input_location,
11400 global_dc);
11402 else
11404 error ("pragma or attribute %<target(\"%s=%s\")%> is not valid", str_to_check, arg);
11406 break;
11408 default:
11409 gcc_unreachable ();
11413 /* If we reached here we either have found an attribute and validated
11414 it or didn't match any. If we matched an attribute but its arguments
11415 were malformed we will have returned false already. */
11416 return found;
11419 /* Count how many times the character C appears in
11420 NULL-terminated string STR. */
11422 static unsigned int
11423 num_occurences_in_str (char c, char *str)
11425 unsigned int res = 0;
11426 while (*str != '\0')
11428 if (*str == c)
11429 res++;
11431 str++;
11434 return res;
11437 /* Parse the tree in ARGS that contains the target attribute information
11438 and update the global target options space. */
11440 bool
11441 aarch64_process_target_attr (tree args)
11443 if (TREE_CODE (args) == TREE_LIST)
11447 tree head = TREE_VALUE (args);
11448 if (head)
11450 if (!aarch64_process_target_attr (head))
11451 return false;
11453 args = TREE_CHAIN (args);
11454 } while (args);
11456 return true;
11459 if (TREE_CODE (args) != STRING_CST)
11461 error ("attribute %<target%> argument not a string");
11462 return false;
11465 size_t len = strlen (TREE_STRING_POINTER (args));
11466 char *str_to_check = (char *) alloca (len + 1);
11467 strcpy (str_to_check, TREE_STRING_POINTER (args));
11469 if (len == 0)
11471 error ("malformed %<target()%> pragma or attribute");
11472 return false;
11475 /* Used to catch empty strings between commas, i.e.
11476 attribute ((target ("attr1,,attr2"))). */
11477 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
11479 /* Handle multiple target attributes separated by ','. */
11480 char *token = strtok (str_to_check, ",");
11482 unsigned int num_attrs = 0;
11483 while (token)
11485 num_attrs++;
11486 if (!aarch64_process_one_target_attr (token))
11488 error ("pragma or attribute %<target(\"%s\")%> is not valid", token);
11489 return false;
11492 token = strtok (NULL, ",");
11495 if (num_attrs != num_commas + 1)
11497 error ("malformed %<target(\"%s\")%> pragma or attribute", TREE_STRING_POINTER (args));
11498 return false;
11501 return true;
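/* Example (added for illustration): for
   __attribute__ ((target ("arch=armv8.1-a,no-strict-align"))) the string
   is split on ',' above and each token is handled by
   aarch64_process_one_target_attr; the comma count check rejects empty
   entries such as "attr1,,attr2".  "no-strict-align" is accepted because
   the strict-align entry in aarch64_attributes allows a negated form.  */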
11504 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
11505 process attribute ((target ("..."))). */
11507 static bool
11508 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
11510 struct cl_target_option cur_target;
11511 bool ret;
11512 tree old_optimize;
11513 tree new_target, new_optimize;
11514 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11516 /* If what we're processing is the current pragma string then the
11517 target option node is already stored in target_option_current_node
11518 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
11519 having to re-parse the string. This is especially useful to keep
11520 arm_neon.h compile times down since that header contains a lot
11521 of intrinsics enclosed in pragmas. */
11522 if (!existing_target && args == current_target_pragma)
11524 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
11525 return true;
11527 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11529 old_optimize = build_optimization_node (&global_options);
11530 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
11532 /* If the function changed the optimization levels as well as setting
11533 target options, start with the optimizations specified. */
11534 if (func_optimize && func_optimize != old_optimize)
11535 cl_optimization_restore (&global_options,
11536 TREE_OPTIMIZATION (func_optimize));
11538 /* Save the current target options to restore at the end. */
11539 cl_target_option_save (&cur_target, &global_options);
11541 /* If fndecl already has some target attributes applied to it, unpack
11542 them so that we add this attribute on top of them, rather than
11543 overwriting them. */
11544 if (existing_target)
11546 struct cl_target_option *existing_options
11547 = TREE_TARGET_OPTION (existing_target);
11549 if (existing_options)
11550 cl_target_option_restore (&global_options, existing_options);
11552 else
11553 cl_target_option_restore (&global_options,
11554 TREE_TARGET_OPTION (target_option_current_node));
11556 ret = aarch64_process_target_attr (args);
11558 /* Set up any additional state. */
11559 if (ret)
11561 aarch64_override_options_internal (&global_options);
11562 /* Initialize SIMD builtins if we haven't already.
11563 Set current_target_pragma to NULL for the duration so that
11564 the builtin initialization code doesn't try to tag the functions
11565 being built with the attributes specified by any current pragma, thus
11566 going into an infinite recursion. */
11567 if (TARGET_SIMD)
11569 tree saved_current_target_pragma = current_target_pragma;
11570 current_target_pragma = NULL;
11571 aarch64_init_simd_builtins ();
11572 current_target_pragma = saved_current_target_pragma;
11574 new_target = build_target_option_node (&global_options);
11576 else
11577 new_target = NULL;
11579 new_optimize = build_optimization_node (&global_options);
11581 if (fndecl && ret)
11583 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
11585 if (old_optimize != new_optimize)
11586 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
11589 cl_target_option_restore (&global_options, &cur_target);
11591 if (old_optimize != new_optimize)
11592 cl_optimization_restore (&global_options,
11593 TREE_OPTIMIZATION (old_optimize));
11594 return ret;
11597 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
11598 tri-bool options (yes, no, don't care) and the default value is
11599 DEF, determine whether to reject inlining. */
11601 static bool
11602 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
11603 int dont_care, int def)
11605 /* If the callee doesn't care, always allow inlining. */
11606 if (callee == dont_care)
11607 return true;
11609 /* If the caller doesn't care, always allow inlining. */
11610 if (caller == dont_care)
11611 return true;
11613 /* Otherwise, allow inlining if either the callee and caller values
11614 agree, or if the callee is using the default value. */
11615 return (callee == caller || callee == def);
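/* A minimal worked example of the rule above, assuming the usual encoding
   of 0 = no, 1 = yes, 2 = don't care:

     caller = 1, callee = 2, def = 0  -> allow (callee doesn't care)
     caller = 2, callee = 0, def = 1  -> allow (caller doesn't care)
     caller = 0, callee = 0, def = 1  -> allow (values agree)
     caller = 0, callee = 1, def = 1  -> allow (callee uses the default)
     caller = 0, callee = 1, def = 0  -> reject  */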
11618 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
11619 to inline CALLEE into CALLER based on target-specific info.
11620 Make sure that the caller and callee have compatible architectural
11621 features. Then go through the other possible target attributes
11622 and see if they can block inlining. Try not to reject always_inline
11623 callees unless they are incompatible architecturally. */
11625 static bool
11626 aarch64_can_inline_p (tree caller, tree callee)
11628 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
11629 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
11631 struct cl_target_option *caller_opts
11632 = TREE_TARGET_OPTION (caller_tree ? caller_tree
11633 : target_option_default_node);
11635 struct cl_target_option *callee_opts
11636 = TREE_TARGET_OPTION (callee_tree ? callee_tree
11637 : target_option_default_node);
11639 /* Callee's ISA flags should be a subset of the caller's. */
11640 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
11641 != callee_opts->x_aarch64_isa_flags)
11642 return false;
11644 /* Allow non-strict-aligned functions to be inlined into
11645 strict-aligned ones. */
11646 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
11647 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
11648 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
11649 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
11650 return false;
11652 bool always_inline = lookup_attribute ("always_inline",
11653 DECL_ATTRIBUTES (callee));
11655 /* If the architectural features match up and the callee is always_inline
11656 then the other attributes don't matter. */
11657 if (always_inline)
11658 return true;
11660 if (caller_opts->x_aarch64_cmodel_var
11661 != callee_opts->x_aarch64_cmodel_var)
11662 return false;
11664 if (caller_opts->x_aarch64_tls_dialect
11665 != callee_opts->x_aarch64_tls_dialect)
11666 return false;
11668 /* Honour explicit requests to workaround errata. */
11669 if (!aarch64_tribools_ok_for_inlining_p (
11670 caller_opts->x_aarch64_fix_a53_err835769,
11671 callee_opts->x_aarch64_fix_a53_err835769,
11672 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
11673 return false;
11675 if (!aarch64_tribools_ok_for_inlining_p (
11676 caller_opts->x_aarch64_fix_a53_err843419,
11677 callee_opts->x_aarch64_fix_a53_err843419,
11678 2, TARGET_FIX_ERR_A53_843419))
11679 return false;
11681 /* If the user explicitly specified -momit-leaf-frame-pointer for the
11682 caller and callee and they don't match up, reject inlining. */
11683 if (!aarch64_tribools_ok_for_inlining_p (
11684 caller_opts->x_flag_omit_leaf_frame_pointer,
11685 callee_opts->x_flag_omit_leaf_frame_pointer,
11686 2, 1))
11687 return false;
11689 /* If the callee has specific tuning overrides, respect them. */
11690 if (callee_opts->x_aarch64_override_tune_string != NULL
11691 && caller_opts->x_aarch64_override_tune_string == NULL)
11692 return false;
11694 /* If the user specified tuning override strings for the
11695 caller and callee and they don't match up, reject inlining.
11696 We just do a string compare here, we don't analyze the meaning
11697 of the string, as it would be too costly for little gain. */
11698 if (callee_opts->x_aarch64_override_tune_string
11699 && caller_opts->x_aarch64_override_tune_string
11700 && (strcmp (callee_opts->x_aarch64_override_tune_string,
11701 caller_opts->x_aarch64_override_tune_string) != 0))
11702 return false;
11704 return true;
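/* For example (an illustrative sketch, not code from this file), the ISA
   subset check above stops a callee that needs more features than its
   caller from being inlined:

     __attribute__ ((target ("arch=armv8-a+sve")))
     static inline int callee (int *p) { return p[0] + p[1]; }

     int caller (int *p) { return callee (p); }   // compiled without +sve

   Here the callee's ISA flags are not a subset of the caller's, so
   aarch64_can_inline_p returns false even if the callee is marked
   always_inline.  */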
11707 /* Return true if SYMBOL_REF X binds locally. */
11709 static bool
11710 aarch64_symbol_binds_local_p (const_rtx x)
11712 return (SYMBOL_REF_DECL (x)
11713 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
11714 : SYMBOL_REF_LOCAL_P (x));
11717 /* Return true if SYMBOL_REF X is thread local */
11718 static bool
11719 aarch64_tls_symbol_p (rtx x)
11721 if (! TARGET_HAVE_TLS)
11722 return false;
11724 if (GET_CODE (x) != SYMBOL_REF)
11725 return false;
11727 return SYMBOL_REF_TLS_MODEL (x) != 0;
11730 /* Classify a TLS symbol into one of the TLS kinds. */
11731 enum aarch64_symbol_type
11732 aarch64_classify_tls_symbol (rtx x)
11734 enum tls_model tls_kind = tls_symbolic_operand_type (x);
11736 switch (tls_kind)
11738 case TLS_MODEL_GLOBAL_DYNAMIC:
11739 case TLS_MODEL_LOCAL_DYNAMIC:
11740 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
11742 case TLS_MODEL_INITIAL_EXEC:
11743 switch (aarch64_cmodel)
11745 case AARCH64_CMODEL_TINY:
11746 case AARCH64_CMODEL_TINY_PIC:
11747 return SYMBOL_TINY_TLSIE;
11748 default:
11749 return SYMBOL_SMALL_TLSIE;
11752 case TLS_MODEL_LOCAL_EXEC:
11753 if (aarch64_tls_size == 12)
11754 return SYMBOL_TLSLE12;
11755 else if (aarch64_tls_size == 24)
11756 return SYMBOL_TLSLE24;
11757 else if (aarch64_tls_size == 32)
11758 return SYMBOL_TLSLE32;
11759 else if (aarch64_tls_size == 48)
11760 return SYMBOL_TLSLE48;
11761 else
11762 gcc_unreachable ();
11764 case TLS_MODEL_EMULATED:
11765 case TLS_MODEL_NONE:
11766 return SYMBOL_FORCE_TO_MEM;
11768 default:
11769 gcc_unreachable ();
11773 /* Return the correct method for accessing X + OFFSET, where X is either
11774 a SYMBOL_REF or LABEL_REF. */
11776 enum aarch64_symbol_type
11777 aarch64_classify_symbol (rtx x, HOST_WIDE_INT offset)
11779 if (GET_CODE (x) == LABEL_REF)
11781 switch (aarch64_cmodel)
11783 case AARCH64_CMODEL_LARGE:
11784 return SYMBOL_FORCE_TO_MEM;
11786 case AARCH64_CMODEL_TINY_PIC:
11787 case AARCH64_CMODEL_TINY:
11788 return SYMBOL_TINY_ABSOLUTE;
11790 case AARCH64_CMODEL_SMALL_SPIC:
11791 case AARCH64_CMODEL_SMALL_PIC:
11792 case AARCH64_CMODEL_SMALL:
11793 return SYMBOL_SMALL_ABSOLUTE;
11795 default:
11796 gcc_unreachable ();
11800 if (GET_CODE (x) == SYMBOL_REF)
11802 if (aarch64_tls_symbol_p (x))
11803 return aarch64_classify_tls_symbol (x);
11805 switch (aarch64_cmodel)
11807 case AARCH64_CMODEL_TINY:
11808 /* When we retrieve symbol + offset address, we have to make sure
11809 the offset does not cause overflow of the final address. But
11810 we have no way of knowing the address of symbol at compile time
11811 so we can't accurately say if the distance between the PC and
11812 symbol + offset is outside the addressable range of +/-1M in the
11813 TINY code model. So we rely on images not being greater than
11814 1M, cap the offset at 1M, and require anything beyond 1M to be
11815 loaded using an alternative mechanism. Furthermore, if the
11816 symbol is a weak reference to something that isn't known to
11817 resolve to a symbol in this module, then force to memory. */
11818 if ((SYMBOL_REF_WEAK (x)
11819 && !aarch64_symbol_binds_local_p (x))
11820 || !IN_RANGE (offset, -1048575, 1048575))
11821 return SYMBOL_FORCE_TO_MEM;
11822 return SYMBOL_TINY_ABSOLUTE;
11824 case AARCH64_CMODEL_SMALL:
11825 /* Same reasoning as the tiny code model, but the offset cap here is
11826 4G. */
11827 if ((SYMBOL_REF_WEAK (x)
11828 && !aarch64_symbol_binds_local_p (x))
11829 || !IN_RANGE (offset, HOST_WIDE_INT_C (-4294967263),
11830 HOST_WIDE_INT_C (4294967264)))
11831 return SYMBOL_FORCE_TO_MEM;
11832 return SYMBOL_SMALL_ABSOLUTE;
11834 case AARCH64_CMODEL_TINY_PIC:
11835 if (!aarch64_symbol_binds_local_p (x))
11836 return SYMBOL_TINY_GOT;
11837 return SYMBOL_TINY_ABSOLUTE;
11839 case AARCH64_CMODEL_SMALL_SPIC:
11840 case AARCH64_CMODEL_SMALL_PIC:
11841 if (!aarch64_symbol_binds_local_p (x))
11842 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
11843 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
11844 return SYMBOL_SMALL_ABSOLUTE;
11846 case AARCH64_CMODEL_LARGE:
11847 /* This is alright even in PIC code as the constant
11848 pool reference is always PC relative and within
11849 the same translation unit. */
11850 if (!aarch64_pcrelative_literal_loads && CONSTANT_POOL_ADDRESS_P (x))
11851 return SYMBOL_SMALL_ABSOLUTE;
11852 else
11853 return SYMBOL_FORCE_TO_MEM;
11855 default:
11856 gcc_unreachable ();
11860 /* By default push everything into the constant pool. */
11861 return SYMBOL_FORCE_TO_MEM;
11864 bool
11865 aarch64_constant_address_p (rtx x)
11867 return (CONSTANT_P (x) && memory_address_p (DImode, x));
11870 bool
11871 aarch64_legitimate_pic_operand_p (rtx x)
11873 if (GET_CODE (x) == SYMBOL_REF
11874 || (GET_CODE (x) == CONST
11875 && GET_CODE (XEXP (x, 0)) == PLUS
11876 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
11877 return false;
11879 return true;
11882 /* Implement TARGET_LEGITIMATE_CONSTANT_P hook. Return true for constants
11883 that should be rematerialized rather than spilled. */
11885 static bool
11886 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
11888 /* Support CSE and rematerialization of common constants. */
11889 if (CONST_INT_P (x)
11890 || (CONST_DOUBLE_P (x) && GET_MODE_CLASS (mode) == MODE_FLOAT)
11891 || GET_CODE (x) == CONST_VECTOR)
11892 return true;
11894 /* Do not allow vector struct mode constants for Advanced SIMD.
11895 We could support 0 and -1 easily, but they need support in
11896 aarch64-simd.md. */
11897 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
11898 if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
11899 return false;
11901 /* Only accept variable-length vector constants if they can be
11902 handled directly.
11904 ??? It would be possible to handle rematerialization of other
11905 constants via secondary reloads. */
11906 if (vec_flags & VEC_ANY_SVE)
11907 return aarch64_simd_valid_immediate (x, NULL);
11909 if (GET_CODE (x) == HIGH)
11910 x = XEXP (x, 0);
11912 /* Accept polynomial constants that can be calculated by using the
11913 destination of a move as the sole temporary. Constants that
11914 require a second temporary cannot be rematerialized (they can't be
11915 forced to memory and also aren't legitimate constants). */
11916 poly_int64 offset;
11917 if (poly_int_rtx_p (x, &offset))
11918 return aarch64_offset_temporaries (false, offset) <= 1;
11920 /* If an offset is being added to something else, we need to allow the
11921 base to be moved into the destination register, meaning that there
11922 are no free temporaries for the offset. */
11923 x = strip_offset (x, &offset);
11924 if (!offset.is_constant () && aarch64_offset_temporaries (true, offset) > 0)
11925 return false;
11927 /* Do not allow const (plus (anchor_symbol, const_int)). */
11928 if (maybe_ne (offset, 0) && SYMBOL_REF_P (x) && SYMBOL_REF_ANCHOR_P (x))
11929 return false;
11931 /* Treat symbols as constants. Avoid TLS symbols as they are complex,
11932 so spilling them is better than rematerialization. */
11933 if (SYMBOL_REF_P (x) && !SYMBOL_REF_TLS_MODEL (x))
11934 return true;
11936 /* Label references are always constant. */
11937 if (GET_CODE (x) == LABEL_REF)
11938 return true;
11940 return false;
11944 aarch64_load_tp (rtx target)
11946 if (!target
11947 || GET_MODE (target) != Pmode
11948 || !register_operand (target, Pmode))
11949 target = gen_reg_rtx (Pmode);
11951 /* Can return in any reg. */
11952 emit_insn (gen_aarch64_load_tp_hard (target));
11953 return target;
11956 /* On AAPCS systems, this is the "struct __va_list". */
11957 static GTY(()) tree va_list_type;
11959 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
11960 Return the type to use as __builtin_va_list.
11962 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
11964 struct __va_list
11966 void *__stack;
11967 void *__gr_top;
11968 void *__vr_top;
11969 int __gr_offs;
11970 int __vr_offs;
11971 }; */
11973 static tree
11974 aarch64_build_builtin_va_list (void)
11976 tree va_list_name;
11977 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
11979 /* Create the type. */
11980 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
11981 /* Give it the required name. */
11982 va_list_name = build_decl (BUILTINS_LOCATION,
11983 TYPE_DECL,
11984 get_identifier ("__va_list"),
11985 va_list_type);
11986 DECL_ARTIFICIAL (va_list_name) = 1;
11987 TYPE_NAME (va_list_type) = va_list_name;
11988 TYPE_STUB_DECL (va_list_type) = va_list_name;
11990 /* Create the fields. */
11991 f_stack = build_decl (BUILTINS_LOCATION,
11992 FIELD_DECL, get_identifier ("__stack"),
11993 ptr_type_node);
11994 f_grtop = build_decl (BUILTINS_LOCATION,
11995 FIELD_DECL, get_identifier ("__gr_top"),
11996 ptr_type_node);
11997 f_vrtop = build_decl (BUILTINS_LOCATION,
11998 FIELD_DECL, get_identifier ("__vr_top"),
11999 ptr_type_node);
12000 f_groff = build_decl (BUILTINS_LOCATION,
12001 FIELD_DECL, get_identifier ("__gr_offs"),
12002 integer_type_node);
12003 f_vroff = build_decl (BUILTINS_LOCATION,
12004 FIELD_DECL, get_identifier ("__vr_offs"),
12005 integer_type_node);
12007 /* Tell tree-stdarg pass about our internal offset fields.
12008 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
12009 purposes, to identify whether the code is updating va_list internal
12010 offset fields in an irregular way. */
12011 va_list_gpr_counter_field = f_groff;
12012 va_list_fpr_counter_field = f_vroff;
12014 DECL_ARTIFICIAL (f_stack) = 1;
12015 DECL_ARTIFICIAL (f_grtop) = 1;
12016 DECL_ARTIFICIAL (f_vrtop) = 1;
12017 DECL_ARTIFICIAL (f_groff) = 1;
12018 DECL_ARTIFICIAL (f_vroff) = 1;
12020 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
12021 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
12022 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
12023 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
12024 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
12026 TYPE_FIELDS (va_list_type) = f_stack;
12027 DECL_CHAIN (f_stack) = f_grtop;
12028 DECL_CHAIN (f_grtop) = f_vrtop;
12029 DECL_CHAIN (f_vrtop) = f_groff;
12030 DECL_CHAIN (f_groff) = f_vroff;
12032 /* Compute its layout. */
12033 layout_type (va_list_type);
12035 return va_list_type;
12038 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
12039 static void
12040 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
12042 const CUMULATIVE_ARGS *cum;
12043 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12044 tree stack, grtop, vrtop, groff, vroff;
12045 tree t;
12046 int gr_save_area_size = cfun->va_list_gpr_size;
12047 int vr_save_area_size = cfun->va_list_fpr_size;
12048 int vr_offset;
12050 cum = &crtl->args.info;
12051 if (cfun->va_list_gpr_size)
12052 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
12053 cfun->va_list_gpr_size);
12054 if (cfun->va_list_fpr_size)
12055 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
12056 * UNITS_PER_VREG, cfun->va_list_fpr_size);
12058 if (!TARGET_FLOAT)
12060 gcc_assert (cum->aapcs_nvrn == 0);
12061 vr_save_area_size = 0;
12064 f_stack = TYPE_FIELDS (va_list_type_node);
12065 f_grtop = DECL_CHAIN (f_stack);
12066 f_vrtop = DECL_CHAIN (f_grtop);
12067 f_groff = DECL_CHAIN (f_vrtop);
12068 f_vroff = DECL_CHAIN (f_groff);
12070 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
12071 NULL_TREE);
12072 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
12073 NULL_TREE);
12074 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
12075 NULL_TREE);
12076 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
12077 NULL_TREE);
12078 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
12079 NULL_TREE);
12081 /* Emit code to initialize STACK, which points to the next varargs stack
12082 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
12083 by named arguments. STACK is 8-byte aligned. */
12084 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
12085 if (cum->aapcs_stack_size > 0)
12086 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
12087 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
12088 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12090 /* Emit code to initialize GRTOP, the top of the GR save area.
12091 virtual_incoming_args_rtx should have been 16 byte aligned. */
12092 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
12093 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
12094 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12096 /* Emit code to initialize VRTOP, the top of the VR save area.
12097 This address is gr_save_area_bytes below GRTOP, rounded
12098 down to the next 16-byte boundary. */
12099 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
12100 vr_offset = ROUND_UP (gr_save_area_size,
12101 STACK_BOUNDARY / BITS_PER_UNIT);
12103 if (vr_offset)
12104 t = fold_build_pointer_plus_hwi (t, -vr_offset);
12105 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
12106 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12108 /* Emit code to initialize GROFF, the offset from GRTOP of the
12109 next GPR argument. */
12110 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
12111 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
12112 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
12114 /* Likewise emit code to initialize VROFF, the offset from VRTOP
12115 of the next VR argument. */
12116 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
12117 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
12118 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
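/* As a rough C-level sketch of the effect of the expansion above, using the
   field names from the __va_list definition earlier in this file and
   writing "incoming_args" for virtual_incoming_args_rtx:

     ap.__stack   = incoming_args + cum->aapcs_stack_size * UNITS_PER_WORD;
     ap.__gr_top  = incoming_args;
     ap.__vr_top  = incoming_args - ROUND_UP (gr_save_area_size, 16);
     ap.__gr_offs = -gr_save_area_size;
     ap.__vr_offs = -vr_save_area_size;  */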
12121 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
12123 static tree
12124 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
12125 gimple_seq *post_p ATTRIBUTE_UNUSED)
12127 tree addr;
12128 bool indirect_p;
12129 bool is_ha; /* is HFA or HVA. */
12130 bool dw_align; /* double-word align. */
12131 machine_mode ag_mode = VOIDmode;
12132 int nregs;
12133 machine_mode mode;
12135 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
12136 tree stack, f_top, f_off, off, arg, roundup, on_stack;
12137 HOST_WIDE_INT size, rsize, adjust, align;
12138 tree t, u, cond1, cond2;
12140 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
12141 if (indirect_p)
12142 type = build_pointer_type (type);
12144 mode = TYPE_MODE (type);
12146 f_stack = TYPE_FIELDS (va_list_type_node);
12147 f_grtop = DECL_CHAIN (f_stack);
12148 f_vrtop = DECL_CHAIN (f_grtop);
12149 f_groff = DECL_CHAIN (f_vrtop);
12150 f_vroff = DECL_CHAIN (f_groff);
12152 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
12153 f_stack, NULL_TREE);
12154 size = int_size_in_bytes (type);
12155 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
12157 dw_align = false;
12158 adjust = 0;
12159 if (aarch64_vfp_is_call_or_return_candidate (mode,
12160 type,
12161 &ag_mode,
12162 &nregs,
12163 &is_ha))
12165 /* No frontends can create types with variable-sized modes, so we
12166 shouldn't be asked to pass or return them. */
12167 unsigned int ag_size = GET_MODE_SIZE (ag_mode).to_constant ();
12169 /* TYPE passed in fp/simd registers. */
12170 if (!TARGET_FLOAT)
12171 aarch64_err_no_fpadvsimd (mode);
12173 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
12174 unshare_expr (valist), f_vrtop, NULL_TREE);
12175 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
12176 unshare_expr (valist), f_vroff, NULL_TREE);
12178 rsize = nregs * UNITS_PER_VREG;
12180 if (is_ha)
12182 if (BYTES_BIG_ENDIAN && ag_size < UNITS_PER_VREG)
12183 adjust = UNITS_PER_VREG - ag_size;
12185 else if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12186 && size < UNITS_PER_VREG)
12188 adjust = UNITS_PER_VREG - size;
12191 else
12193 /* TYPE passed in general registers. */
12194 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
12195 unshare_expr (valist), f_grtop, NULL_TREE);
12196 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
12197 unshare_expr (valist), f_groff, NULL_TREE);
12198 rsize = ROUND_UP (size, UNITS_PER_WORD);
12199 nregs = rsize / UNITS_PER_WORD;
12201 if (align > 8)
12202 dw_align = true;
12204 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12205 && size < UNITS_PER_WORD)
12207 adjust = UNITS_PER_WORD - size;
12211 /* Get a local temporary for the field value. */
12212 off = get_initialized_tmp_var (f_off, pre_p, NULL);
12214 /* Emit code to branch if off >= 0. */
12215 t = build2 (GE_EXPR, boolean_type_node, off,
12216 build_int_cst (TREE_TYPE (off), 0));
12217 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
12219 if (dw_align)
12221 /* Emit: offs = (offs + 15) & -16. */
12222 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12223 build_int_cst (TREE_TYPE (off), 15));
12224 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
12225 build_int_cst (TREE_TYPE (off), -16));
12226 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
12228 else
12229 roundup = NULL;
12231 /* Update ap.__[g|v]r_offs */
12232 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
12233 build_int_cst (TREE_TYPE (off), rsize));
12234 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
12236 /* String up. */
12237 if (roundup)
12238 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12240 /* [cond2] if (ap.__[g|v]r_offs > 0) */
12241 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
12242 build_int_cst (TREE_TYPE (f_off), 0));
12243 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
12245 /* String up: make sure the assignment happens before the use. */
12246 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
12247 COND_EXPR_ELSE (cond1) = t;
12249 /* Prepare the trees handling the argument that is passed on the stack;
12250 the top-level node will be stored in ON_STACK. */
12251 arg = get_initialized_tmp_var (stack, pre_p, NULL);
12252 if (align > 8)
12254 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
12255 t = fold_build_pointer_plus_hwi (arg, 15);
12256 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12257 build_int_cst (TREE_TYPE (t), -16));
12258 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
12260 else
12261 roundup = NULL;
12262 /* Advance ap.__stack */
12263 t = fold_build_pointer_plus_hwi (arg, size + 7);
12264 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
12265 build_int_cst (TREE_TYPE (t), -8));
12266 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
12267 /* String up roundup and advance. */
12268 if (roundup)
12269 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
12270 /* String up with arg */
12271 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
12272 /* Big-endianness related address adjustment. */
12273 if (BLOCK_REG_PADDING (mode, type, 1) == PAD_DOWNWARD
12274 && size < UNITS_PER_WORD)
12276 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
12277 size_int (UNITS_PER_WORD - size));
12278 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
12281 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
12282 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
12284 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
12285 t = off;
12286 if (adjust)
12287 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
12288 build_int_cst (TREE_TYPE (off), adjust));
12290 t = fold_convert (sizetype, t);
12291 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
12293 if (is_ha)
12295 /* type ha; // treat as "struct {ftype field[n];}"
12296 ... [computing offs]
12297 for (i = 0; i <nregs; ++i, offs += 16)
12298 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
12299 return ha; */
12300 int i;
12301 tree tmp_ha, field_t, field_ptr_t;
12303 /* Declare a local variable. */
12304 tmp_ha = create_tmp_var_raw (type, "ha");
12305 gimple_add_tmp_var (tmp_ha);
12307 /* Establish the base type. */
12308 switch (ag_mode)
12310 case E_SFmode:
12311 field_t = float_type_node;
12312 field_ptr_t = float_ptr_type_node;
12313 break;
12314 case E_DFmode:
12315 field_t = double_type_node;
12316 field_ptr_t = double_ptr_type_node;
12317 break;
12318 case E_TFmode:
12319 field_t = long_double_type_node;
12320 field_ptr_t = long_double_ptr_type_node;
12321 break;
12322 case E_HFmode:
12323 field_t = aarch64_fp16_type_node;
12324 field_ptr_t = aarch64_fp16_ptr_type_node;
12325 break;
12326 case E_V2SImode:
12327 case E_V4SImode:
12329 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
12330 field_t = build_vector_type_for_mode (innertype, ag_mode);
12331 field_ptr_t = build_pointer_type (field_t);
12333 break;
12334 default:
12335 gcc_assert (0);
12338 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
12339 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
12340 addr = t;
12341 t = fold_convert (field_ptr_t, addr);
12342 t = build2 (MODIFY_EXPR, field_t,
12343 build1 (INDIRECT_REF, field_t, tmp_ha),
12344 build1 (INDIRECT_REF, field_t, t));
12346 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
12347 for (i = 1; i < nregs; ++i)
12349 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
12350 u = fold_convert (field_ptr_t, addr);
12351 u = build2 (MODIFY_EXPR, field_t,
12352 build2 (MEM_REF, field_t, tmp_ha,
12353 build_int_cst (field_ptr_t,
12354 (i *
12355 int_size_in_bytes (field_t)))),
12356 build1 (INDIRECT_REF, field_t, u));
12357 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
12360 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
12361 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
12364 COND_EXPR_ELSE (cond2) = t;
12365 addr = fold_convert (build_pointer_type (type), cond1);
12366 addr = build_va_arg_indirect_ref (addr);
12368 if (indirect_p)
12369 addr = build_va_arg_indirect_ref (addr);
12371 return addr;
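/* The tree built above corresponds roughly to the AAPCS64 va_arg algorithm.
   As a hedged pseudo-C sketch for an argument that is a general-register
   candidate (the SIMD/FP case uses the __vr_* fields instead, and the
   homogeneous-aggregate case additionally copies the fields one by one
   into a local temporary):

     offs = ap.__gr_offs;
     if (offs >= 0)                    // register save area already used up
       addr = on_stack;
     else
       {
         if (alignof (type) > 8)
           offs = (offs + 15) & -16;
         ap.__gr_offs = offs + rsize;
         if (ap.__gr_offs > 0)         // this argument overflows the area
           addr = on_stack;
         else
           addr = ap.__gr_top + offs + adjust;
       }  */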
12374 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
12376 static void
12377 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
12378 tree type, int *pretend_size ATTRIBUTE_UNUSED,
12379 int no_rtl)
12381 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
12382 CUMULATIVE_ARGS local_cum;
12383 int gr_saved = cfun->va_list_gpr_size;
12384 int vr_saved = cfun->va_list_fpr_size;
12386 /* The caller has advanced CUM up to, but not beyond, the last named
12387 argument. Advance a local copy of CUM past the last "real" named
12388 argument, to find out how many registers are left over. */
12389 local_cum = *cum;
12390 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
12392 /* Find out how many registers we need to save.
12393 Honor the tree-stdarg analysis results. */
12394 if (cfun->va_list_gpr_size)
12395 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
12396 cfun->va_list_gpr_size / UNITS_PER_WORD);
12397 if (cfun->va_list_fpr_size)
12398 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
12399 cfun->va_list_fpr_size / UNITS_PER_VREG);
12401 if (!TARGET_FLOAT)
12403 gcc_assert (local_cum.aapcs_nvrn == 0);
12404 vr_saved = 0;
12407 if (!no_rtl)
12409 if (gr_saved > 0)
12411 rtx ptr, mem;
12413 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
12414 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
12415 - gr_saved * UNITS_PER_WORD);
12416 mem = gen_frame_mem (BLKmode, ptr);
12417 set_mem_alias_set (mem, get_varargs_alias_set ());
12419 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
12420 mem, gr_saved);
12422 if (vr_saved > 0)
12424 /* We can't use move_block_from_reg, because it will use
12425 the wrong mode, storing D regs only. */
12426 machine_mode mode = TImode;
12427 int off, i, vr_start;
12429 /* Set OFF to the offset from virtual_incoming_args_rtx of
12430 the first vector register. The VR save area lies below
12431 the GR one, and is aligned to 16 bytes. */
12432 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
12433 STACK_BOUNDARY / BITS_PER_UNIT);
12434 off -= vr_saved * UNITS_PER_VREG;
12436 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
12437 for (i = 0; i < vr_saved; ++i)
12439 rtx ptr, mem;
12441 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
12442 mem = gen_frame_mem (mode, ptr);
12443 set_mem_alias_set (mem, get_varargs_alias_set ());
12444 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
12445 off += UNITS_PER_VREG;
12450 /* We don't save the size into *PRETEND_SIZE because we want to avoid
12451 any complication of having crtl->args.pretend_args_size changed. */
12452 cfun->machine->frame.saved_varargs_size
12453 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
12454 STACK_BOUNDARY / BITS_PER_UNIT)
12455 + vr_saved * UNITS_PER_VREG);
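/* Illustrative layout of the save area established above, relative to
   virtual_incoming_args_rtx and growing towards lower addresses:

        named stack arguments
      ------------------------------ <- virtual_incoming_args_rtx (__gr_top)
        GR save area: gr_saved * 8 bytes (x registers)
      ------------------------------ <- 16-byte aligned boundary (__vr_top)
        VR save area: vr_saved * 16 bytes (q registers, saved as TImode)
      ------------------------------

   cfun->machine->frame.saved_varargs_size records the total size of the
   two save areas shown here.  */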
12458 static void
12459 aarch64_conditional_register_usage (void)
12461 int i;
12462 if (!TARGET_FLOAT)
12464 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
12466 fixed_regs[i] = 1;
12467 call_used_regs[i] = 1;
12470 if (!TARGET_SVE)
12471 for (i = P0_REGNUM; i <= P15_REGNUM; i++)
12473 fixed_regs[i] = 1;
12474 call_used_regs[i] = 1;
12477 /* When tracking speculation, we need a couple of call-clobbered registers
12478 to track the speculation state. It would be nice to just use
12479 IP0 and IP1, but currently there are numerous places that just
12480 assume these registers are free for other uses (eg pointer
12481 authentication). */
12482 if (aarch64_track_speculation)
12484 fixed_regs[SPECULATION_TRACKER_REGNUM] = 1;
12485 call_used_regs[SPECULATION_TRACKER_REGNUM] = 1;
12486 fixed_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12487 call_used_regs[SPECULATION_SCRATCH_REGNUM] = 1;
12491 /* Walk down the type tree of TYPE counting consecutive base elements.
12492 If *MODEP is VOIDmode, then set it to the first valid floating point
12493 type. If a non-floating point type is found, or if a floating point
12494 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
12495 otherwise return the count in the sub-tree. */
12496 static int
12497 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
12499 machine_mode mode;
12500 HOST_WIDE_INT size;
12502 switch (TREE_CODE (type))
12504 case REAL_TYPE:
12505 mode = TYPE_MODE (type);
12506 if (mode != DFmode && mode != SFmode
12507 && mode != TFmode && mode != HFmode)
12508 return -1;
12510 if (*modep == VOIDmode)
12511 *modep = mode;
12513 if (*modep == mode)
12514 return 1;
12516 break;
12518 case COMPLEX_TYPE:
12519 mode = TYPE_MODE (TREE_TYPE (type));
12520 if (mode != DFmode && mode != SFmode
12521 && mode != TFmode && mode != HFmode)
12522 return -1;
12524 if (*modep == VOIDmode)
12525 *modep = mode;
12527 if (*modep == mode)
12528 return 2;
12530 break;
12532 case VECTOR_TYPE:
12533 /* Use V2SImode and V4SImode as representatives of all 64-bit
12534 and 128-bit vector types. */
12535 size = int_size_in_bytes (type);
12536 switch (size)
12538 case 8:
12539 mode = V2SImode;
12540 break;
12541 case 16:
12542 mode = V4SImode;
12543 break;
12544 default:
12545 return -1;
12548 if (*modep == VOIDmode)
12549 *modep = mode;
12551 /* Vector modes are considered to be opaque: two vectors are
12552 equivalent for the purposes of being homogeneous aggregates
12553 if they are the same size. */
12554 if (*modep == mode)
12555 return 1;
12557 break;
12559 case ARRAY_TYPE:
12561 int count;
12562 tree index = TYPE_DOMAIN (type);
12564 /* Can't handle incomplete types nor sizes that are not
12565 fixed. */
12566 if (!COMPLETE_TYPE_P (type)
12567 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12568 return -1;
12570 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
12571 if (count == -1
12572 || !index
12573 || !TYPE_MAX_VALUE (index)
12574 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
12575 || !TYPE_MIN_VALUE (index)
12576 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
12577 || count < 0)
12578 return -1;
12580 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
12581 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
12583 /* There must be no padding. */
12584 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12585 count * GET_MODE_BITSIZE (*modep)))
12586 return -1;
12588 return count;
12591 case RECORD_TYPE:
12593 int count = 0;
12594 int sub_count;
12595 tree field;
12597 /* Can't handle incomplete types nor sizes that are not
12598 fixed. */
12599 if (!COMPLETE_TYPE_P (type)
12600 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12601 return -1;
12603 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12605 if (TREE_CODE (field) != FIELD_DECL)
12606 continue;
12608 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12609 if (sub_count < 0)
12610 return -1;
12611 count += sub_count;
12614 /* There must be no padding. */
12615 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12616 count * GET_MODE_BITSIZE (*modep)))
12617 return -1;
12619 return count;
12622 case UNION_TYPE:
12623 case QUAL_UNION_TYPE:
12625 /* These aren't very interesting except in a degenerate case. */
12626 int count = 0;
12627 int sub_count;
12628 tree field;
12630 /* Can't handle incomplete types nor sizes that are not
12631 fixed. */
12632 if (!COMPLETE_TYPE_P (type)
12633 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
12634 return -1;
12636 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
12638 if (TREE_CODE (field) != FIELD_DECL)
12639 continue;
12641 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
12642 if (sub_count < 0)
12643 return -1;
12644 count = count > sub_count ? count : sub_count;
12647 /* There must be no padding. */
12648 if (maybe_ne (wi::to_poly_wide (TYPE_SIZE (type)),
12649 count * GET_MODE_BITSIZE (*modep)))
12650 return -1;
12652 return count;
12655 default:
12656 break;
12659 return -1;
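/* A few illustrative examples (sketches, not code from this file) of how
   the walk above classifies types:

     typedef float v4sf __attribute__ ((vector_size (16)));

     struct hfa { double x, y, z; };     // returns 3, *modep == DFmode
     struct hva { v4sf a, b; };          // returns 2, *modep == V4SImode
     struct mix { float f; double d; };  // returns -1, base types differ  */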
12662 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
12663 type as described in AAPCS64 \S 4.1.2.
12665 See the comment above aarch64_composite_type_p for the notes on MODE. */
12667 static bool
12668 aarch64_short_vector_p (const_tree type,
12669 machine_mode mode)
12671 poly_int64 size = -1;
12673 if (type && TREE_CODE (type) == VECTOR_TYPE)
12674 size = int_size_in_bytes (type);
12675 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
12676 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
12677 size = GET_MODE_SIZE (mode);
12679 return known_eq (size, 8) || known_eq (size, 16);
12682 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
12683 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
12684 array types. The C99 floating-point complex types are also considered
12685 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
12686 types, which are GCC extensions and out of the scope of AAPCS64, are
12687 treated as composite types here as well.
12689 Note that MODE itself is not sufficient in determining whether a type
12690 is such a composite type or not. This is because
12691 stor-layout.c:compute_record_mode may have already changed the MODE
12692 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
12693 structure with only one field may have its MODE set to the mode of the
12694 field. Also an integer mode whose size matches the size of the
12695 RECORD_TYPE type may be used to substitute the original mode
12696 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
12697 solely relied on. */
12699 static bool
12700 aarch64_composite_type_p (const_tree type,
12701 machine_mode mode)
12703 if (aarch64_short_vector_p (type, mode))
12704 return false;
12706 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
12707 return true;
12709 if (mode == BLKmode
12710 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
12711 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
12712 return true;
12714 return false;
12717 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
12718 shall be passed or returned in simd/fp register(s) (providing these
12719 parameter passing registers are available).
12721 Upon successful return, *COUNT returns the number of needed registers,
12722 *BASE_MODE returns the mode of the individual register and when IS_HAF
12723 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
12724 floating-point aggregate or a homogeneous short-vector aggregate. */
12726 static bool
12727 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
12728 const_tree type,
12729 machine_mode *base_mode,
12730 int *count,
12731 bool *is_ha)
12733 machine_mode new_mode = VOIDmode;
12734 bool composite_p = aarch64_composite_type_p (type, mode);
12736 if (is_ha != NULL) *is_ha = false;
12738 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
12739 || aarch64_short_vector_p (type, mode))
12741 *count = 1;
12742 new_mode = mode;
12744 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
12746 if (is_ha != NULL) *is_ha = true;
12747 *count = 2;
12748 new_mode = GET_MODE_INNER (mode);
12750 else if (type && composite_p)
12752 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
12754 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
12756 if (is_ha != NULL) *is_ha = true;
12757 *count = ag_count;
12759 else
12760 return false;
12762 else
12763 return false;
12765 *base_mode = new_mode;
12766 return true;
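/* Illustrative outcomes of the candidate check above (a sketch, assuming
   the FP/SIMD argument registers are available):

     double                       -> true, *count = 1, *base_mode = DFmode
     _Complex float               -> true, *count = 2, *base_mode = SFmode, HA
     struct { double d[4]; }      -> true, *count = 4, *base_mode = DFmode, HA
     struct { double d[5]; }      -> false (more than HA_MAX_NUM_FLDS members)
     struct { float f; int i; }   -> false (not a homogeneous FP aggregate)  */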
12769 /* Implement TARGET_STRUCT_VALUE_RTX. */
12771 static rtx
12772 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
12773 int incoming ATTRIBUTE_UNUSED)
12775 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
12778 /* Implements target hook vector_mode_supported_p. */
12779 static bool
12780 aarch64_vector_mode_supported_p (machine_mode mode)
12782 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
12783 return vec_flags != 0 && (vec_flags & VEC_STRUCT) == 0;
12786 /* Return appropriate SIMD container
12787 for MODE within a vector of WIDTH bits. */
12788 static machine_mode
12789 aarch64_simd_container_mode (scalar_mode mode, poly_int64 width)
12791 if (TARGET_SVE && known_eq (width, BITS_PER_SVE_VECTOR))
12792 switch (mode)
12794 case E_DFmode:
12795 return VNx2DFmode;
12796 case E_SFmode:
12797 return VNx4SFmode;
12798 case E_HFmode:
12799 return VNx8HFmode;
12800 case E_DImode:
12801 return VNx2DImode;
12802 case E_SImode:
12803 return VNx4SImode;
12804 case E_HImode:
12805 return VNx8HImode;
12806 case E_QImode:
12807 return VNx16QImode;
12808 default:
12809 return word_mode;
12812 gcc_assert (known_eq (width, 64) || known_eq (width, 128));
12813 if (TARGET_SIMD)
12815 if (known_eq (width, 128))
12816 switch (mode)
12818 case E_DFmode:
12819 return V2DFmode;
12820 case E_SFmode:
12821 return V4SFmode;
12822 case E_HFmode:
12823 return V8HFmode;
12824 case E_SImode:
12825 return V4SImode;
12826 case E_HImode:
12827 return V8HImode;
12828 case E_QImode:
12829 return V16QImode;
12830 case E_DImode:
12831 return V2DImode;
12832 default:
12833 break;
12835 else
12836 switch (mode)
12838 case E_SFmode:
12839 return V2SFmode;
12840 case E_HFmode:
12841 return V4HFmode;
12842 case E_SImode:
12843 return V2SImode;
12844 case E_HImode:
12845 return V4HImode;
12846 case E_QImode:
12847 return V8QImode;
12848 default:
12849 break;
12852 return word_mode;
12855 /* Return 128-bit container as the preferred SIMD mode for MODE. */
12856 static machine_mode
12857 aarch64_preferred_simd_mode (scalar_mode mode)
12859 poly_int64 bits = TARGET_SVE ? BITS_PER_SVE_VECTOR : 128;
12860 return aarch64_simd_container_mode (mode, bits);
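/* Some illustrative results of the container selection above:

     aarch64_simd_container_mode (SFmode, 128) -> V4SFmode  (with +simd)
     aarch64_simd_container_mode (HImode, 64)  -> V4HImode  (with +simd)
     aarch64_simd_container_mode (SFmode, BITS_PER_SVE_VECTOR)
                                               -> VNx4SFmode (with +sve)

   so aarch64_preferred_simd_mode (SFmode) is VNx4SFmode when SVE is enabled
   and V4SFmode otherwise (falling back to word_mode without SIMD).  */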
12863 /* Return a list of possible vector sizes for the vectorizer
12864 to iterate over. */
12865 static void
12866 aarch64_autovectorize_vector_sizes (vector_sizes *sizes)
12868 if (TARGET_SVE)
12869 sizes->safe_push (BYTES_PER_SVE_VECTOR);
12870 sizes->safe_push (16);
12871 sizes->safe_push (8);
12874 /* Implement TARGET_MANGLE_TYPE. */
12876 static const char *
12877 aarch64_mangle_type (const_tree type)
12879 /* The AArch64 ABI documents say that "__va_list" has to be
12880 mangled as if it were in the "std" namespace.
12881 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
12882 return "St9__va_list";
12884 /* Half-precision float. */
12885 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
12886 return "Dh";
12888 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
12889 builtin types. */
12890 if (TYPE_NAME (type) != NULL)
12891 return aarch64_mangle_builtin_type (type);
12893 /* Use the default mangling. */
12894 return NULL;
12897 /* Find the first rtx_insn before insn that will generate an assembly
12898 instruction. */
12900 static rtx_insn *
12901 aarch64_prev_real_insn (rtx_insn *insn)
12903 if (!insn)
12904 return NULL;
12908 insn = prev_real_insn (insn);
12910 while (insn && recog_memoized (insn) < 0);
12912 return insn;
12915 static bool
12916 is_madd_op (enum attr_type t1)
12918 unsigned int i;
12919 /* A number of these may be AArch32 only. */
12920 enum attr_type mlatypes[] = {
12921 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
12922 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
12923 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS,TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
12926 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
12928 if (t1 == mlatypes[i])
12929 return true;
12932 return false;
12935 /* Check if there is a register dependency between a load and the insn
12936 for which we hold recog_data. */
12938 static bool
12939 dep_between_memop_and_curr (rtx memop)
12941 rtx load_reg;
12942 int opno;
12944 gcc_assert (GET_CODE (memop) == SET);
12946 if (!REG_P (SET_DEST (memop)))
12947 return false;
12949 load_reg = SET_DEST (memop);
12950 for (opno = 1; opno < recog_data.n_operands; opno++)
12952 rtx operand = recog_data.operand[opno];
12953 if (REG_P (operand)
12954 && reg_overlap_mentioned_p (load_reg, operand))
12955 return true;
12958 return false;
12962 /* When working around the Cortex-A53 erratum 835769,
12963 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
12964 instruction and has a preceding memory instruction such that a NOP
12965 should be inserted between them. */
12967 bool
12968 aarch64_madd_needs_nop (rtx_insn* insn)
12970 enum attr_type attr_type;
12971 rtx_insn *prev;
12972 rtx body;
12974 if (!TARGET_FIX_ERR_A53_835769)
12975 return false;
12977 if (!INSN_P (insn) || recog_memoized (insn) < 0)
12978 return false;
12980 attr_type = get_attr_type (insn);
12981 if (!is_madd_op (attr_type))
12982 return false;
12984 prev = aarch64_prev_real_insn (insn);
12985 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
12986 Restore recog state to INSN to avoid state corruption. */
12987 extract_constrain_insn_cached (insn);
12989 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
12990 return false;
12992 body = single_set (prev);
12994 /* If the previous insn is a memory op and there is no dependency between
12995 it and the DImode madd, emit a NOP between them. If body is NULL then we
12996 have a complex memory operation, probably a load/store pair.
12997 Be conservative for now and emit a NOP. */
12998 if (GET_MODE (recog_data.operand[0]) == DImode
12999 && (!body || !dep_between_memop_and_curr (body)))
13000 return true;
13002 return false;
13007 /* Implement FINAL_PRESCAN_INSN. */
13009 void
13010 aarch64_final_prescan_insn (rtx_insn *insn)
13012 if (aarch64_madd_needs_nop (insn))
13013 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
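/* As an illustrative sketch of the workaround: for a sequence such as

     ldr  x1, [x2]
     madd x0, x3, x4, x0

   compiled with -mfix-cortex-a53-835769, the madd is a 64-bit
   multiply-accumulate with no register dependency on the preceding memory
   operation, so aarch64_madd_needs_nop returns true and the output becomes

     ldr  x1, [x2]
     nop // between mem op and mult-accumulate
     madd x0, x3, x4, x0  */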
13017 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
13018 instruction. */
13020 bool
13021 aarch64_sve_index_immediate_p (rtx base_or_step)
13023 return (CONST_INT_P (base_or_step)
13024 && IN_RANGE (INTVAL (base_or_step), -16, 15));
13027 /* Return true if X is a valid immediate for the SVE ADD and SUB
13028 instructions. Negate X first if NEGATE_P is true. */
13030 bool
13031 aarch64_sve_arith_immediate_p (rtx x, bool negate_p)
13033 rtx elt;
13035 if (!const_vec_duplicate_p (x, &elt)
13036 || !CONST_INT_P (elt))
13037 return false;
13039 HOST_WIDE_INT val = INTVAL (elt);
13040 if (negate_p)
13041 val = -val;
13042 val &= GET_MODE_MASK (GET_MODE_INNER (GET_MODE (x)));
13044 if (val & 0xff)
13045 return IN_RANGE (val, 0, 0xff);
13046 return IN_RANGE (val, 0, 0xff00);
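/* In other words (an illustrative summary, where "dup N" stands for a
   const_vec_duplicate of N): after optional negation and truncation to the
   element width, the value must either be in the range 0-255 or be a
   multiple of 256 in the range 0-65280, for example:

     dup 17,     negate_p = false  -> true   (17)
     dup 0x1200, negate_p = false  -> true   (18 << 8)
     dup 0x101,  negate_p = false  -> false  (low byte and high byte both set)
     dup -1,     negate_p = true   -> true   (negated to 1)  */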
13049 /* Return true if X is a valid immediate operand for an SVE logical
13050 instruction such as AND. */
13052 bool
13053 aarch64_sve_bitmask_immediate_p (rtx x)
13055 rtx elt;
13057 return (const_vec_duplicate_p (x, &elt)
13058 && CONST_INT_P (elt)
13059 && aarch64_bitmask_imm (INTVAL (elt),
13060 GET_MODE_INNER (GET_MODE (x))));
13063 /* Return true if X is a valid immediate for the SVE DUP and CPY
13064 instructions. */
13066 bool
13067 aarch64_sve_dup_immediate_p (rtx x)
13069 rtx elt;
13071 if (!const_vec_duplicate_p (x, &elt)
13072 || !CONST_INT_P (elt))
13073 return false;
13075 HOST_WIDE_INT val = INTVAL (elt);
13076 if (val & 0xff)
13077 return IN_RANGE (val, -0x80, 0x7f);
13078 return IN_RANGE (val, -0x8000, 0x7f00);
13081 /* Return true if X is a valid immediate operand for an SVE CMP instruction.
13082 SIGNED_P says whether the operand is signed rather than unsigned. */
13084 bool
13085 aarch64_sve_cmp_immediate_p (rtx x, bool signed_p)
13087 rtx elt;
13089 return (const_vec_duplicate_p (x, &elt)
13090 && CONST_INT_P (elt)
13091 && (signed_p
13092 ? IN_RANGE (INTVAL (elt), -16, 15)
13093 : IN_RANGE (INTVAL (elt), 0, 127)));
13096 /* Return true if X is a valid immediate operand for an SVE FADD or FSUB
13097 instruction. Negate X first if NEGATE_P is true. */
13099 bool
13100 aarch64_sve_float_arith_immediate_p (rtx x, bool negate_p)
13102 rtx elt;
13103 REAL_VALUE_TYPE r;
13105 if (!const_vec_duplicate_p (x, &elt)
13106 || GET_CODE (elt) != CONST_DOUBLE)
13107 return false;
13109 r = *CONST_DOUBLE_REAL_VALUE (elt);
13111 if (negate_p)
13112 r = real_value_negate (&r);
13114 if (real_equal (&r, &dconst1))
13115 return true;
13116 if (real_equal (&r, &dconsthalf))
13117 return true;
13118 return false;
13121 /* Return true if X is a valid immediate operand for an SVE FMUL
13122 instruction. */
13124 bool
13125 aarch64_sve_float_mul_immediate_p (rtx x)
13127 rtx elt;
13129 /* GCC will never generate a multiply with an immediate of 2, so there is no
13130 point testing for it (even though it is a valid constant). */
13131 return (const_vec_duplicate_p (x, &elt)
13132 && GET_CODE (elt) == CONST_DOUBLE
13133 && real_equal (CONST_DOUBLE_REAL_VALUE (elt), &dconsthalf));
13136 /* Return true if replicating VAL32 is a valid 2-byte or 4-byte immediate
13137 for the Advanced SIMD operation described by WHICH and INSN. If INFO
13138 is nonnull, use it to describe valid immediates. */
13139 static bool
13140 aarch64_advsimd_valid_immediate_hs (unsigned int val32,
13141 simd_immediate_info *info,
13142 enum simd_immediate_check which,
13143 simd_immediate_info::insn_type insn)
13145 /* Try a 4-byte immediate with LSL. */
13146 for (unsigned int shift = 0; shift < 32; shift += 8)
13147 if ((val32 & (0xff << shift)) == val32)
13149 if (info)
13150 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13151 simd_immediate_info::LSL, shift);
13152 return true;
13155 /* Try a 2-byte immediate with LSL. */
13156 unsigned int imm16 = val32 & 0xffff;
13157 if (imm16 == (val32 >> 16))
13158 for (unsigned int shift = 0; shift < 16; shift += 8)
13159 if ((imm16 & (0xff << shift)) == imm16)
13161 if (info)
13162 *info = simd_immediate_info (HImode, imm16 >> shift, insn,
13163 simd_immediate_info::LSL, shift);
13164 return true;
13167 /* Try a 4-byte immediate with MSL, except for cases that MVN
13168 can handle. */
13169 if (which == AARCH64_CHECK_MOV)
13170 for (unsigned int shift = 8; shift < 24; shift += 8)
13172 unsigned int low = (1 << shift) - 1;
13173 if (((val32 & (0xff << shift)) | low) == val32)
13175 if (info)
13176 *info = simd_immediate_info (SImode, val32 >> shift, insn,
13177 simd_immediate_info::MSL, shift);
13178 return true;
13182 return false;
13185 /* Return true if replicating VAL64 is a valid immediate for the
13186 Advanced SIMD operation described by WHICH. If INFO is nonnull,
13187 use it to describe valid immediates. */
13188 static bool
13189 aarch64_advsimd_valid_immediate (unsigned HOST_WIDE_INT val64,
13190 simd_immediate_info *info,
13191 enum simd_immediate_check which)
13193 unsigned int val32 = val64 & 0xffffffff;
13194 unsigned int val16 = val64 & 0xffff;
13195 unsigned int val8 = val64 & 0xff;
13197 if (val32 == (val64 >> 32))
13199 if ((which & AARCH64_CHECK_ORR) != 0
13200 && aarch64_advsimd_valid_immediate_hs (val32, info, which,
13201 simd_immediate_info::MOV))
13202 return true;
13204 if ((which & AARCH64_CHECK_BIC) != 0
13205 && aarch64_advsimd_valid_immediate_hs (~val32, info, which,
13206 simd_immediate_info::MVN))
13207 return true;
13209 /* Try using a replicated byte. */
13210 if (which == AARCH64_CHECK_MOV
13211 && val16 == (val32 >> 16)
13212 && val8 == (val16 >> 8))
13214 if (info)
13215 *info = simd_immediate_info (QImode, val8);
13216 return true;
13220 /* Try using a bit-to-bytemask. */
13221 if (which == AARCH64_CHECK_MOV)
13223 unsigned int i;
13224 for (i = 0; i < 64; i += 8)
13226 unsigned char byte = (val64 >> i) & 0xff;
13227 if (byte != 0 && byte != 0xff)
13228 break;
13230 if (i == 64)
13232 if (info)
13233 *info = simd_immediate_info (DImode, val64);
13234 return true;
13237 return false;
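/* A few illustrative 64-bit replicated values and how the checks above
   classify them for AARCH64_CHECK_MOV:

     0x0000005600000056  -> valid: SImode 0x56 with LSL #0
     0x0012ffff0012ffff  -> valid: SImode 0x12 with MSL #16
     0x4242424242424242  -> valid: replicated QImode byte 0x42
     0xff0000ff00ffff00  -> valid: DImode "bit-to-bytemask" immediate
     0x0123456789abcdef  -> not a valid Advanced SIMD immediate  */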
13240 /* Return true if replicating VAL64 gives a valid immediate for an SVE MOV
13241 instruction. If INFO is nonnull, use it to describe valid immediates. */
13243 static bool
13244 aarch64_sve_valid_immediate (unsigned HOST_WIDE_INT val64,
13245 simd_immediate_info *info)
13247 scalar_int_mode mode = DImode;
13248 unsigned int val32 = val64 & 0xffffffff;
13249 if (val32 == (val64 >> 32))
13251 mode = SImode;
13252 unsigned int val16 = val32 & 0xffff;
13253 if (val16 == (val32 >> 16))
13255 mode = HImode;
13256 unsigned int val8 = val16 & 0xff;
13257 if (val8 == (val16 >> 8))
13258 mode = QImode;
13261 HOST_WIDE_INT val = trunc_int_for_mode (val64, mode);
13262 if (IN_RANGE (val, -0x80, 0x7f))
13264 /* DUP with no shift. */
13265 if (info)
13266 *info = simd_immediate_info (mode, val);
13267 return true;
13269 if ((val & 0xff) == 0 && IN_RANGE (val, -0x8000, 0x7f00))
13271 /* DUP with LSL #8. */
13272 if (info)
13273 *info = simd_immediate_info (mode, val);
13274 return true;
13276 if (aarch64_bitmask_imm (val64, mode))
13278 /* DUPM. */
13279 if (info)
13280 *info = simd_immediate_info (mode, val);
13281 return true;
13283 return false;
13286 /* Return true if OP is a valid SIMD immediate for the operation
13287 described by WHICH. If INFO is nonnull, use it to describe valid
13288 immediates. */
13289 bool
13290 aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
13291 enum simd_immediate_check which)
13293 machine_mode mode = GET_MODE (op);
13294 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
13295 if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
13296 return false;
13298 scalar_mode elt_mode = GET_MODE_INNER (mode);
13299 rtx base, step;
13300 unsigned int n_elts;
13301 if (GET_CODE (op) == CONST_VECTOR
13302 && CONST_VECTOR_DUPLICATE_P (op))
13303 n_elts = CONST_VECTOR_NPATTERNS (op);
13304 else if ((vec_flags & VEC_SVE_DATA)
13305 && const_vec_series_p (op, &base, &step))
13307 gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_INT);
13308 if (!aarch64_sve_index_immediate_p (base)
13309 || !aarch64_sve_index_immediate_p (step))
13310 return false;
13312 if (info)
13313 *info = simd_immediate_info (elt_mode, base, step);
13314 return true;
13316 else if (GET_CODE (op) == CONST_VECTOR
13317 && CONST_VECTOR_NUNITS (op).is_constant (&n_elts))
13318 /* N_ELTS set above. */;
13319 else
13320 return false;
13322 /* Handle PFALSE and PTRUE. */
13323 if (vec_flags & VEC_SVE_PRED)
13324 return (op == CONST0_RTX (mode)
13325 || op == CONSTM1_RTX (mode));
13327 scalar_float_mode elt_float_mode;
13328 if (n_elts == 1
13329 && is_a <scalar_float_mode> (elt_mode, &elt_float_mode))
13331 rtx elt = CONST_VECTOR_ENCODED_ELT (op, 0);
13332 if (aarch64_float_const_zero_rtx_p (elt)
13333 || aarch64_float_const_representable_p (elt))
13335 if (info)
13336 *info = simd_immediate_info (elt_float_mode, elt);
13337 return true;
13341 unsigned int elt_size = GET_MODE_SIZE (elt_mode);
13342 if (elt_size > 8)
13343 return false;
13345 scalar_int_mode elt_int_mode = int_mode_for_mode (elt_mode).require ();
13347 /* Expand the vector constant out into a byte vector, with the least
13348 significant byte of the register first. */
13349 auto_vec<unsigned char, 16> bytes;
13350 bytes.reserve (n_elts * elt_size);
13351 for (unsigned int i = 0; i < n_elts; i++)
13353 /* The vector is provided in gcc endian-neutral fashion.
13354 For aarch64_be Advanced SIMD, it must be laid out in the vector
13355 register in reverse order. */
13356 bool swap_p = ((vec_flags & VEC_ADVSIMD) != 0 && BYTES_BIG_ENDIAN);
13357 rtx elt = CONST_VECTOR_ELT (op, swap_p ? (n_elts - 1 - i) : i);
13359 if (elt_mode != elt_int_mode)
13360 elt = gen_lowpart (elt_int_mode, elt);
13362 if (!CONST_INT_P (elt))
13363 return false;
13365 unsigned HOST_WIDE_INT elt_val = INTVAL (elt);
13366 for (unsigned int byte = 0; byte < elt_size; byte++)
13368 bytes.quick_push (elt_val & 0xff);
13369 elt_val >>= BITS_PER_UNIT;
13373 /* The immediate must repeat every eight bytes. */
13374 unsigned int nbytes = bytes.length ();
13375 for (unsigned i = 8; i < nbytes; ++i)
13376 if (bytes[i] != bytes[i - 8])
13377 return false;
13379 /* Get the repeating 8-byte value as an integer. No endian correction
13380 is needed here because bytes is already in lsb-first order. */
13381 unsigned HOST_WIDE_INT val64 = 0;
13382 for (unsigned int i = 0; i < 8; i++)
13383 val64 |= ((unsigned HOST_WIDE_INT) bytes[i % nbytes]
13384 << (i * BITS_PER_UNIT));
13386 if (vec_flags & VEC_SVE_DATA)
13387 return aarch64_sve_valid_immediate (val64, info);
13388 else
13389 return aarch64_advsimd_valid_immediate (val64, info, which);
13392 /* Check whether X is a VEC_SERIES-like constant that starts at 0 and
13393 has a step in the range of INDEX. Return the index expression if so,
13394 otherwise return null. */
13396 aarch64_check_zero_based_sve_index_immediate (rtx x)
13398 rtx base, step;
13399 if (const_vec_series_p (x, &base, &step)
13400 && base == const0_rtx
13401 && aarch64_sve_index_immediate_p (step))
13402 return step;
13403 return NULL_RTX;
13406 /* Check whether immediate shift constants are within range. */
13407 bool
13408 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
13410 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
13411 if (left)
13412 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
13413 else
13414 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
13417 /* Return the bitmask CONST_INT to select the bits required by a zero extract
13418 operation of width WIDTH at bit position POS. */
13421 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
13423 gcc_assert (CONST_INT_P (width));
13424 gcc_assert (CONST_INT_P (pos));
13426 unsigned HOST_WIDE_INT mask
13427 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
13428 return GEN_INT (mask << UINTVAL (pos));
13431 bool
13432 aarch64_mov_operand_p (rtx x, machine_mode mode)
13434 if (GET_CODE (x) == HIGH
13435 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
13436 return true;
13438 if (CONST_INT_P (x))
13439 return true;
13441 if (VECTOR_MODE_P (GET_MODE (x)))
13442 return aarch64_simd_valid_immediate (x, NULL);
13444 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
13445 return true;
13447 if (aarch64_sve_cnt_immediate_p (x))
13448 return true;
13450 return aarch64_classify_symbolic_expression (x)
13451 == SYMBOL_TINY_ABSOLUTE;
13454 /* Return a const_int vector of VAL. */
13456 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
13458 rtx c = gen_int_mode (val, GET_MODE_INNER (mode));
13459 return gen_const_vec_duplicate (mode, c);
13462 /* Check OP is a legal scalar immediate for the MOVI instruction. */
13464 bool
13465 aarch64_simd_scalar_immediate_valid_for_move (rtx op, scalar_int_mode mode)
13467 machine_mode vmode;
13469 vmode = aarch64_simd_container_mode (mode, 64);
13470 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
13471 return aarch64_simd_valid_immediate (op_v, NULL);
13474 /* Construct and return a PARALLEL RTX vector with elements numbering the
13475 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
13476 the vector - from the perspective of the architecture. This does not
13477 line up with GCC's perspective on lane numbers, so we end up with
13478 different masks depending on our target endian-ness. The diagram
13479 below may help. We must draw the distinction when building masks
13480 which select one half of the vector. An instruction selecting
13481 architectural low-lanes for a big-endian target must be described using
13482 a mask selecting GCC high-lanes.
13484 Big-Endian Little-Endian
13486 GCC 0 1 2 3 3 2 1 0
13487 | x | x | x | x | | x | x | x | x |
13488 Architecture 3 2 1 0 3 2 1 0
13490 Low Mask: { 2, 3 } { 0, 1 }
13491 High Mask: { 0, 1 } { 2, 3 }
13493 MODE Is the mode of the vector and NUNITS is the number of units in it. */
13496 aarch64_simd_vect_par_cnst_half (machine_mode mode, int nunits, bool high)
13498 rtvec v = rtvec_alloc (nunits / 2);
13499 int high_base = nunits / 2;
13500 int low_base = 0;
13501 int base;
13502 rtx t1;
13503 int i;
13505 if (BYTES_BIG_ENDIAN)
13506 base = high ? low_base : high_base;
13507 else
13508 base = high ? high_base : low_base;
13510 for (i = 0; i < nunits / 2; i++)
13511 RTVEC_ELT (v, i) = GEN_INT (base + i);
13513 t1 = gen_rtx_PARALLEL (mode, v);
13514 return t1;
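/* As an illustration of the diagram above (a sketch, assuming V4SImode with
   NUNITS == 4): on a little-endian target

     aarch64_simd_vect_par_cnst_half (V4SImode, 4, true)

   returns (parallel [(const_int 2) (const_int 3)]), while on a big-endian
   target the same call returns (parallel [(const_int 0) (const_int 1)]),
   i.e. the GCC lane numbers that correspond to the architectural high
   half.  */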
13517 /* Check OP for validity as a PARALLEL RTX vector with elements
13518 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
13519 from the perspective of the architecture. See the diagram above
13520 aarch64_simd_vect_par_cnst_half for more details. */
13522 bool
13523 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
13524 bool high)
13526 int nelts;
13527 if (!VECTOR_MODE_P (mode) || !GET_MODE_NUNITS (mode).is_constant (&nelts))
13528 return false;
13530 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, nelts, high);
13531 HOST_WIDE_INT count_op = XVECLEN (op, 0);
13532 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
13533 int i = 0;
13535 if (count_op != count_ideal)
13536 return false;
13538 for (i = 0; i < count_ideal; i++)
13540 rtx elt_op = XVECEXP (op, 0, i);
13541 rtx elt_ideal = XVECEXP (ideal, 0, i);
13543 if (!CONST_INT_P (elt_op)
13544 || INTVAL (elt_ideal) != INTVAL (elt_op))
13545 return false;
13547 return true;
13550 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
13551 HIGH (exclusive). */
13552 void
13553 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
13554 const_tree exp)
13556 HOST_WIDE_INT lane;
13557 gcc_assert (CONST_INT_P (operand));
13558 lane = INTVAL (operand);
13560 if (lane < low || lane >= high)
13562 if (exp)
13563 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
13564 else
13565 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
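/* For example (an illustrative sketch), a call such as

     aarch64_simd_lane_bounds (GEN_INT (4), 0, 4, NULL)

   reports "lane 4 out of range 0 - 3", since HIGH is exclusive.  */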
13569 /* Perform endian correction on lane number N, which indexes a vector
13570 of mode MODE, and return the result as an SImode rtx. */
13573 aarch64_endian_lane_rtx (machine_mode mode, unsigned int n)
13575 return gen_int_mode (ENDIAN_LANE_N (GET_MODE_NUNITS (mode), n), SImode);
13578 /* Return TRUE if OP is a valid vector addressing mode. */
13580 bool
13581 aarch64_simd_mem_operand_p (rtx op)
13583 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
13584 || REG_P (XEXP (op, 0)));
13587 /* Return true if OP is a valid MEM operand for an SVE LD1R instruction. */
13589 bool
13590 aarch64_sve_ld1r_operand_p (rtx op)
13592 struct aarch64_address_info addr;
13593 scalar_mode mode;
13595 return (MEM_P (op)
13596 && is_a <scalar_mode> (GET_MODE (op), &mode)
13597 && aarch64_classify_address (&addr, XEXP (op, 0), mode, false)
13598 && addr.type == ADDRESS_REG_IMM
13599 && offset_6bit_unsigned_scaled_p (mode, addr.const_offset));
13602 /* Return true if OP is a valid MEM operand for an SVE LDR instruction.
13603 The conditions for STR are the same. */
13604 bool
13605 aarch64_sve_ldr_operand_p (rtx op)
13607 struct aarch64_address_info addr;
13609 return (MEM_P (op)
13610 && aarch64_classify_address (&addr, XEXP (op, 0), GET_MODE (op),
13611 false, ADDR_QUERY_ANY)
13612 && addr.type == ADDRESS_REG_IMM);
13615 /* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
13616 We need to be able to access the individual pieces, so the range
13617 is different from LD[234] and ST[234]. */
13618 bool
13619 aarch64_sve_struct_memory_operand_p (rtx op)
13621 if (!MEM_P (op))
13622 return false;
13624 machine_mode mode = GET_MODE (op);
13625 struct aarch64_address_info addr;
13626 if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
13627 ADDR_QUERY_ANY)
13628 || addr.type != ADDRESS_REG_IMM)
13629 return false;
13631 poly_int64 first = addr.const_offset;
13632 poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
13633 return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
13634 && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
13637 /* Emit a register copy from operand to operand, taking care not to
13638 early-clobber source registers in the process.
13640 COUNT is the number of components into which the copy needs to be
13641 decomposed. */
13642 void
13643 aarch64_simd_emit_reg_reg_move (rtx *operands, machine_mode mode,
13644 unsigned int count)
13646 unsigned int i;
13647 int rdest = REGNO (operands[0]);
13648 int rsrc = REGNO (operands[1]);
13650 if (!reg_overlap_mentioned_p (operands[0], operands[1])
13651 || rdest < rsrc)
13652 for (i = 0; i < count; i++)
13653 emit_move_insn (gen_rtx_REG (mode, rdest + i),
13654 gen_rtx_REG (mode, rsrc + i));
13655 else
13656 for (i = 0; i < count; i++)
13657 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
13658 gen_rtx_REG (mode, rsrc + count - i - 1));
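/* For instance (purely descriptive of the loops above), with COUNT == 2 and
   an overlapping destination that starts one register above the source, the
   second component is moved before the first, so no source register is
   clobbered before it has been read.  */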
13661 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
13662 one of VSTRUCT modes: OI, CI, or XI. */
13664 aarch64_simd_attr_length_rglist (machine_mode mode)
13666 /* This is only used (and only meaningful) for Advanced SIMD, not SVE. */
13667 return (GET_MODE_SIZE (mode).to_constant () / UNITS_PER_VREG) * 4;
13670 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
13671 alignment of a vector to 128 bits. SVE predicates have an alignment of
13672 16 bits. */
13673 static HOST_WIDE_INT
13674 aarch64_simd_vector_alignment (const_tree type)
13676 if (TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
13677 /* ??? Checking the mode isn't ideal, but VECTOR_BOOLEAN_TYPE_P can
13678 be set for non-predicate vectors of booleans. Modes are the most
13679 direct way we have of identifying real SVE predicate types. */
13680 return GET_MODE_CLASS (TYPE_MODE (type)) == MODE_VECTOR_BOOL ? 16 : 128;
13681 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
13682 return MIN (align, 128);
13685 /* Implement target hook TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT. */
13686 static HOST_WIDE_INT
13687 aarch64_vectorize_preferred_vector_alignment (const_tree type)
13689 if (aarch64_sve_data_mode_p (TYPE_MODE (type)))
13691 /* If the length of the vector is fixed, try to align to that length,
13692 otherwise don't try to align at all. */
13693 HOST_WIDE_INT result;
13694 if (!BITS_PER_SVE_VECTOR.is_constant (&result))
13695 result = TYPE_ALIGN (TREE_TYPE (type));
13696 return result;
13698 return TYPE_ALIGN (type);
13701 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
13702 static bool
13703 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
13705 if (is_packed)
13706 return false;
13708 /* For fixed-length vectors, check that the vectorizer will aim for
13709 full-vector alignment. This isn't true for generic GCC vectors
13710 that are wider than the ABI maximum of 128 bits. */
13711 if (TREE_CODE (TYPE_SIZE (type)) == INTEGER_CST
13712 && (wi::to_widest (TYPE_SIZE (type))
13713 != aarch64_vectorize_preferred_vector_alignment (type)))
13714 return false;
13716 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
13717 return true;
13720 /* Return true if the vector misalignment factor is supported by the
13721 target. */
13722 static bool
13723 aarch64_builtin_support_vector_misalignment (machine_mode mode,
13724 const_tree type, int misalignment,
13725 bool is_packed)
13727 if (TARGET_SIMD && STRICT_ALIGNMENT)
13730 /* Return false if the movmisalign pattern is not supported for this mode. */
13730 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
13731 return false;
13733 /* Misalignment factor is unknown at compile time. */
13734 if (misalignment == -1)
13735 return false;
13737 return default_builtin_support_vector_misalignment (mode, type, misalignment,
13738 is_packed);
13741 /* If VALS is a vector constant that can be loaded into a register
13742 using DUP, generate instructions to do so and return an RTX to
13743 assign to the register. Otherwise return NULL_RTX. */
13744 static rtx
13745 aarch64_simd_dup_constant (rtx vals)
13747 machine_mode mode = GET_MODE (vals);
13748 machine_mode inner_mode = GET_MODE_INNER (mode);
13749 rtx x;
13751 if (!const_vec_duplicate_p (vals, &x))
13752 return NULL_RTX;
13754 /* We can load this constant by using DUP and a constant in a
13755 single ARM register. This will be cheaper than a vector
13756 load. */
13757 x = copy_to_mode_reg (inner_mode, x);
13758 return gen_vec_duplicate (mode, x);
13762 /* Generate code to load VALS, which is a PARALLEL containing only
13763 constants (for vec_init) or CONST_VECTOR, efficiently into a
13764 register. Returns an RTX to copy into the register, or NULL_RTX
13765 for a PARALLEL that cannot be converted into a CONST_VECTOR. */
13766 static rtx
13767 aarch64_simd_make_constant (rtx vals)
13769 machine_mode mode = GET_MODE (vals);
13770 rtx const_dup;
13771 rtx const_vec = NULL_RTX;
13772 int n_const = 0;
13773 int i;
13775 if (GET_CODE (vals) == CONST_VECTOR)
13776 const_vec = vals;
13777 else if (GET_CODE (vals) == PARALLEL)
13779 /* A CONST_VECTOR must contain only CONST_INTs and
13780 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
13781 Only store valid constants in a CONST_VECTOR. */
13782 int n_elts = XVECLEN (vals, 0);
13783 for (i = 0; i < n_elts; ++i)
13785 rtx x = XVECEXP (vals, 0, i);
13786 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13787 n_const++;
13789 if (n_const == n_elts)
13790 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
13792 else
13793 gcc_unreachable ();
13795 if (const_vec != NULL_RTX
13796 && aarch64_simd_valid_immediate (const_vec, NULL))
13797 /* Load using MOVI/MVNI. */
13798 return const_vec;
13799 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
13800 /* Loaded using DUP. */
13801 return const_dup;
13802 else if (const_vec != NULL_RTX)
13803 /* Load from constant pool. We cannot take advantage of single-cycle
13804 LD1 because we need a PC-relative addressing mode. */
13805 return const_vec;
13806 else
13807 /* A PARALLEL containing something not valid inside CONST_VECTOR.
13808 We cannot construct an initializer. */
13809 return NULL_RTX;
13812 /* Expand a vector initialisation sequence, such that TARGET is
13813 initialised to contain VALS. */
13815 void
13816 aarch64_expand_vector_init (rtx target, rtx vals)
13818 machine_mode mode = GET_MODE (target);
13819 scalar_mode inner_mode = GET_MODE_INNER (mode);
13820 /* The number of vector elements. */
13821 int n_elts = XVECLEN (vals, 0);
13822 /* The number of vector elements which are not constant. */
13823 int n_var = 0;
13824 rtx any_const = NULL_RTX;
13825 /* The first element of vals. */
13826 rtx v0 = XVECEXP (vals, 0, 0);
13827 bool all_same = true;
13829 /* Count the number of variable elements to initialise. */
13830 for (int i = 0; i < n_elts; ++i)
13832 rtx x = XVECEXP (vals, 0, i);
13833 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
13834 ++n_var;
13835 else
13836 any_const = x;
13838 all_same &= rtx_equal_p (x, v0);
13841 /* No variable elements, hand off to aarch64_simd_make_constant which knows
13842 how best to handle this. */
13843 if (n_var == 0)
13845 rtx constant = aarch64_simd_make_constant (vals);
13846 if (constant != NULL_RTX)
13848 emit_move_insn (target, constant);
13849 return;
13853 /* Splat a single non-constant element if we can. */
13854 if (all_same)
13856 rtx x = copy_to_mode_reg (inner_mode, v0);
13857 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13858 return;
13861 enum insn_code icode = optab_handler (vec_set_optab, mode);
13862 gcc_assert (icode != CODE_FOR_nothing);
13864 /* If there are only variable elements, try to optimize
13865 the insertion using dup for the most common element
13866 followed by insertions. */
13868 /* The algorithm will fill matches[*][0] with the earliest matching element,
13869 and matches[X][1] with the count of duplicate elements (if X is the
13870 earliest element which has duplicates). */
13872 if (n_var == n_elts && n_elts <= 16)
13874 int matches[16][2] = {0};
13875 for (int i = 0; i < n_elts; i++)
13877 for (int j = 0; j <= i; j++)
13879 if (rtx_equal_p (XVECEXP (vals, 0, i), XVECEXP (vals, 0, j)))
13881 matches[i][0] = j;
13882 matches[j][1]++;
13883 break;
13887 int maxelement = 0;
13888 int maxv = 0;
13889 for (int i = 0; i < n_elts; i++)
13890 if (matches[i][1] > maxv)
13892 maxelement = i;
13893 maxv = matches[i][1];
13896 /* Create a duplicate of the most common element, unless all elements
13897 are equally useless to us, in which case just immediately set the
13898 vector register using the first element. */
13900 if (maxv == 1)
13902 /* For vectors of two 64-bit elements, we can do even better. */
13903 if (n_elts == 2
13904 && (inner_mode == E_DImode
13905 || inner_mode == E_DFmode))
13908 rtx x0 = XVECEXP (vals, 0, 0);
13909 rtx x1 = XVECEXP (vals, 0, 1);
13910 /* Combine can pick up this case, but handling it directly
13911 here leaves clearer RTL.
13913 This is load_pair_lanes<mode>, and also gives us a clean-up
13914 for store_pair_lanes<mode>. */
13915 if (memory_operand (x0, inner_mode)
13916 && memory_operand (x1, inner_mode)
13917 && !STRICT_ALIGNMENT
13918 && rtx_equal_p (XEXP (x1, 0),
13919 plus_constant (Pmode,
13920 XEXP (x0, 0),
13921 GET_MODE_SIZE (inner_mode))))
13923 rtx t;
13924 if (inner_mode == DFmode)
13925 t = gen_load_pair_lanesdf (target, x0, x1);
13926 else
13927 t = gen_load_pair_lanesdi (target, x0, x1);
13928 emit_insn (t);
13929 return;
13932 /* The subreg-move sequence below will move into lane zero of the
13933 vector register. For big-endian we want that position to hold
13934 the last element of VALS. */
13935 maxelement = BYTES_BIG_ENDIAN ? n_elts - 1 : 0;
13936 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13937 aarch64_emit_move (target, lowpart_subreg (mode, x, inner_mode));
13939 else
13941 rtx x = copy_to_mode_reg (inner_mode, XVECEXP (vals, 0, maxelement));
13942 aarch64_emit_move (target, gen_vec_duplicate (mode, x));
13945 /* Insert the rest. */
13946 for (int i = 0; i < n_elts; i++)
13948 rtx x = XVECEXP (vals, 0, i);
13949 if (matches[i][0] == maxelement)
13950 continue;
13951 x = copy_to_mode_reg (inner_mode, x);
13952 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
13954 return;
13957 /* Initialise a vector which is part-variable. We want to first try
13958 to build those lanes which are constant in the most efficient way we
13959 can. */
13960 if (n_var != n_elts)
13962 rtx copy = copy_rtx (vals);
13964 /* Load constant part of vector. We really don't care what goes into the
13965 parts we will overwrite, but we're more likely to be able to load the
13966 constant efficiently if it has fewer, larger, repeating parts
13967 (see aarch64_simd_valid_immediate). */
13968 for (int i = 0; i < n_elts; i++)
13970 rtx x = XVECEXP (vals, 0, i);
13971 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13972 continue;
13973 rtx subst = any_const;
13974 for (int bit = n_elts / 2; bit > 0; bit /= 2)
13976 /* Look in the copied vector, as more elements are const. */
13977 rtx test = XVECEXP (copy, 0, i ^ bit);
13978 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
13980 subst = test;
13981 break;
13984 XVECEXP (copy, 0, i) = subst;
13986 aarch64_expand_vector_init (target, copy);
13989 /* Insert the variable lanes directly. */
13990 for (int i = 0; i < n_elts; i++)
13992 rtx x = XVECEXP (vals, 0, i);
13993 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
13994 continue;
13995 x = copy_to_mode_reg (inner_mode, x);
13996 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
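/* A small worked example of the strategy above (illustrative only): for a
   V4SImode initialiser { x, y, x, x } where x and y are registers, element 0
   is the most common (matches[0][1] == 3), so the code emits one DUP of x
   into all four lanes followed by a single vec_set inserting y into lane 1,
   rather than four separate inserts.  */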
14000 static unsigned HOST_WIDE_INT
14001 aarch64_shift_truncation_mask (machine_mode mode)
14003 if (!SHIFT_COUNT_TRUNCATED || aarch64_vector_data_mode_p (mode))
14004 return 0;
14005 return GET_MODE_UNIT_BITSIZE (mode) - 1;
14008 /* Select a format to encode pointers in exception handling data. */
14010 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
14012 int type;
14013 switch (aarch64_cmodel)
14015 case AARCH64_CMODEL_TINY:
14016 case AARCH64_CMODEL_TINY_PIC:
14017 case AARCH64_CMODEL_SMALL:
14018 case AARCH64_CMODEL_SMALL_PIC:
14019 case AARCH64_CMODEL_SMALL_SPIC:
14020 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
14021 for everything. */
14022 type = DW_EH_PE_sdata4;
14023 break;
14024 default:
14025 /* No assumptions here. 8-byte relocs required. */
14026 type = DW_EH_PE_sdata8;
14027 break;
14029 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
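/* For example, under the default small code model a global symbol ends up
   encoded as (DW_EH_PE_indirect | DW_EH_PE_pcrel | DW_EH_PE_sdata4), while
   the default case (e.g. the large code model) falls back to sdata8.  This
   only restates two arms of the switch above.  */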
14032 /* The last .arch and .tune assembly strings that we printed. */
14033 static std::string aarch64_last_printed_arch_string;
14034 static std::string aarch64_last_printed_tune_string;
14036 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
14037 by the function fndecl. */
14039 void
14040 aarch64_declare_function_name (FILE *stream, const char* name,
14041 tree fndecl)
14043 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
14045 struct cl_target_option *targ_options;
14046 if (target_parts)
14047 targ_options = TREE_TARGET_OPTION (target_parts);
14048 else
14049 targ_options = TREE_TARGET_OPTION (target_option_current_node);
14050 gcc_assert (targ_options);
14052 const struct processor *this_arch
14053 = aarch64_get_arch (targ_options->x_explicit_arch);
14055 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
14056 std::string extension
14057 = aarch64_get_extension_string_for_isa_flags (isa_flags,
14058 this_arch->flags);
14059 /* Only update the assembler .arch string if it is distinct from the last
14060 such string we printed. */
14061 std::string to_print = this_arch->name + extension;
14062 if (to_print != aarch64_last_printed_arch_string)
14064 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
14065 aarch64_last_printed_arch_string = to_print;
14068 /* Print the cpu name we're tuning for in the comments; it might be
14069 useful to readers of the generated asm. Do it only when it changes
14070 from function to function and verbose assembly is requested. */
14071 const struct processor *this_tune
14072 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
14074 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
14076 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
14077 this_tune->name);
14078 aarch64_last_printed_tune_string = this_tune->name;
14081 /* Don't forget the type directive for ELF. */
14082 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
14083 ASM_OUTPUT_LABEL (stream, name);
14086 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
14088 static void
14089 aarch64_start_file (void)
14091 struct cl_target_option *default_options
14092 = TREE_TARGET_OPTION (target_option_default_node);
14094 const struct processor *default_arch
14095 = aarch64_get_arch (default_options->x_explicit_arch);
14096 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
14097 std::string extension
14098 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
14099 default_arch->flags);
14101 aarch64_last_printed_arch_string = default_arch->name + extension;
14102 aarch64_last_printed_tune_string = "";
14103 asm_fprintf (asm_out_file, "\t.arch %s\n",
14104 aarch64_last_printed_arch_string.c_str ());
14106 default_file_start ();
14109 /* Emit load exclusive. */
14111 static void
14112 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
14113 rtx mem, rtx model_rtx)
14115 emit_insn (gen_aarch64_load_exclusive (mode, rval, mem, model_rtx));
14118 /* Emit store exclusive. */
14120 static void
14121 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
14122 rtx rval, rtx mem, rtx model_rtx)
14124 emit_insn (gen_aarch64_store_exclusive (mode, bval, rval, mem, model_rtx));
14127 /* Mark the previous jump instruction as unlikely. */
14129 static void
14130 aarch64_emit_unlikely_jump (rtx insn)
14132 rtx_insn *jump = emit_jump_insn (insn);
14133 add_reg_br_prob_note (jump, profile_probability::very_unlikely ());
14136 /* Expand a compare and swap pattern. */
14138 void
14139 aarch64_expand_compare_and_swap (rtx operands[])
14141 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
14142 machine_mode mode, cmp_mode;
14144 bval = operands[0];
14145 rval = operands[1];
14146 mem = operands[2];
14147 oldval = operands[3];
14148 newval = operands[4];
14149 is_weak = operands[5];
14150 mod_s = operands[6];
14151 mod_f = operands[7];
14152 mode = GET_MODE (mem);
14153 cmp_mode = mode;
14155 /* Normally the succ memory model must be stronger than fail, but in the
14156 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
14157 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
14159 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
14160 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
14161 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
14163 switch (mode)
14165 case E_QImode:
14166 case E_HImode:
14167 /* For short modes, we're going to perform the comparison in SImode,
14168 so do the zero-extension now. */
14169 cmp_mode = SImode;
14170 rval = gen_reg_rtx (SImode);
14171 oldval = convert_modes (SImode, mode, oldval, true);
14172 /* Fall through. */
14174 case E_SImode:
14175 case E_DImode:
14176 /* Force the value into a register if needed. */
14177 if (!aarch64_plus_operand (oldval, mode))
14178 oldval = force_reg (cmp_mode, oldval);
14179 break;
14181 default:
14182 gcc_unreachable ();
14185 if (TARGET_LSE)
14186 emit_insn (gen_aarch64_compare_and_swap_lse (mode, rval, mem, oldval,
14187 newval, is_weak, mod_s,
14188 mod_f));
14189 else
14190 emit_insn (gen_aarch64_compare_and_swap (mode, rval, mem, oldval, newval,
14191 is_weak, mod_s, mod_f));
14194 if (mode == QImode || mode == HImode)
14195 emit_move_insn (operands[1], gen_lowpart (mode, rval));
14197 x = gen_rtx_REG (CCmode, CC_REGNUM);
14198 x = gen_rtx_EQ (SImode, x, const0_rtx);
14199 emit_insn (gen_rtx_SET (bval, x));
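/* As a rough illustration of how this expander is reached (a sketch, not
   target code): a source-level call such as

     bool ok = __atomic_compare_exchange_n (&v, &expected, desired, 0,
                                            __ATOMIC_SEQ_CST,
                                            __ATOMIC_SEQ_CST);

   arrives here with MEM = &v, OLDVAL = expected and NEWVAL = desired.  With
   LSE enabled a CAS-based pattern is emitted directly; otherwise the generic
   pattern is emitted and later split by aarch64_split_compare_and_swap
   below.  */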
14202 /* Test whether the target supports using an atomic load-operate instruction.
14203 CODE is the operation. Returns FALSE if the operation isn't supported by the
14206 architecture. */
14208 bool
14209 aarch64_atomic_ldop_supported_p (enum rtx_code code)
14211 if (!TARGET_LSE)
14212 return false;
14214 switch (code)
14216 case SET:
14217 case AND:
14218 case IOR:
14219 case XOR:
14220 case MINUS:
14221 case PLUS:
14222 return true;
14223 default:
14224 return false;
14228 /* Emit a barrier that is appropriate for memory model MODEL, at the end of a
14229 sequence implementing an atomic operation. */
14231 static void
14232 aarch64_emit_post_barrier (enum memmodel model)
14234 const enum memmodel base_model = memmodel_base (model);
14236 if (is_mm_sync (model)
14237 && (base_model == MEMMODEL_ACQUIRE
14238 || base_model == MEMMODEL_ACQ_REL
14239 || base_model == MEMMODEL_SEQ_CST))
14241 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
14245 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
14246 for the data in memory. EXPECTED is the value expected to be in memory.
14247 DESIRED is the value to store to memory. MEM is the memory location. MODEL
14248 is the memory ordering to use. */
14250 void
14251 aarch64_gen_atomic_cas (rtx rval, rtx mem,
14252 rtx expected, rtx desired,
14253 rtx model)
14255 machine_mode mode;
14257 mode = GET_MODE (mem);
14259 /* Move the expected value into the CAS destination register. */
14260 emit_insn (gen_rtx_SET (rval, expected));
14262 /* Emit the CAS. */
14263 emit_insn (gen_aarch64_atomic_cas (mode, rval, mem, desired, model));
14265 /* Compare the expected value with the value loaded by the CAS, to establish
14266 whether the swap was made. */
14267 aarch64_gen_compare_reg (EQ, rval, expected);
14270 /* Split a compare and swap pattern. */
14272 void
14273 aarch64_split_compare_and_swap (rtx operands[])
14275 rtx rval, mem, oldval, newval, scratch;
14276 machine_mode mode;
14277 bool is_weak;
14278 rtx_code_label *label1, *label2;
14279 rtx x, cond;
14280 enum memmodel model;
14281 rtx model_rtx;
14283 rval = operands[0];
14284 mem = operands[1];
14285 oldval = operands[2];
14286 newval = operands[3];
14287 is_weak = (operands[4] != const0_rtx);
14288 model_rtx = operands[5];
14289 scratch = operands[7];
14290 mode = GET_MODE (mem);
14291 model = memmodel_from_int (INTVAL (model_rtx));
14293 /* When OLDVAL is zero and we want the strong version we can emit a tighter
14294 loop:
14295 .label1:
14296 LD[A]XR rval, [mem]
14297 CBNZ rval, .label2
14298 ST[L]XR scratch, newval, [mem]
14299 CBNZ scratch, .label1
14300 .label2:
14301 CMP rval, 0. */
14302 bool strong_zero_p = !is_weak && oldval == const0_rtx;
14304 label1 = NULL;
14305 if (!is_weak)
14307 label1 = gen_label_rtx ();
14308 emit_label (label1);
14310 label2 = gen_label_rtx ();
14312 /* The initial load can be relaxed for a __sync operation since a final
14313 barrier will be emitted to stop code hoisting. */
14314 if (is_mm_sync (model))
14315 aarch64_emit_load_exclusive (mode, rval, mem,
14316 GEN_INT (MEMMODEL_RELAXED));
14317 else
14318 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
14320 if (strong_zero_p)
14322 if (aarch64_track_speculation)
14324 /* Emit an explicit compare instruction, so that we can correctly
14325 track the condition codes. */
14326 rtx cc_reg = aarch64_gen_compare_reg (NE, rval, const0_rtx);
14327 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14329 else
14330 x = gen_rtx_NE (VOIDmode, rval, const0_rtx);
14332 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14333 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14334 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14336 else
14338 cond = aarch64_gen_compare_reg (NE, rval, oldval);
14339 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14340 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14341 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
14342 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14345 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
14347 if (!is_weak)
14349 if (aarch64_track_speculation)
14351 /* Emit an explicit compare instruction, so that we can correctly
14352 track the condition codes. */
14353 rtx cc_reg = aarch64_gen_compare_reg (NE, scratch, const0_rtx);
14354 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14356 else
14357 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
14359 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14360 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
14361 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14363 else
14365 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14366 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
14367 emit_insn (gen_rtx_SET (cond, x));
14370 emit_label (label2);
14371 /* If we used a CBNZ in the exchange loop emit an explicit compare with RVAL
14372 to set the condition flags. If this is not used it will be removed by
14373 later passes. */
14374 if (strong_zero_p)
14376 cond = gen_rtx_REG (CCmode, CC_REGNUM);
14377 x = gen_rtx_COMPARE (CCmode, rval, const0_rtx);
14378 emit_insn (gen_rtx_SET (cond, x));
14380 /* Emit any final barrier needed for a __sync operation. */
14381 if (is_mm_sync (model))
14382 aarch64_emit_post_barrier (model);
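/* For the general strong case (OLDVAL not known to be zero), the split
   above produces a loop of roughly the following shape (illustrative,
   modulo the memory-model dependent acquire/release variants):

     .label1:
        LD[A]XR  rval, [mem]
        CMP      rval, oldval
        B.NE     .label2
        ST[L]XR  scratch, newval, [mem]
        CBNZ     scratch, .label1
     .label2:                                                            */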
14385 /* Emit a BIC instruction. */
14387 static void
14388 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
14390 rtx shift_rtx = GEN_INT (shift);
14391 rtx (*gen) (rtx, rtx, rtx, rtx);
14393 switch (mode)
14395 case E_SImode: gen = gen_and_one_cmpl_lshrsi3; break;
14396 case E_DImode: gen = gen_and_one_cmpl_lshrdi3; break;
14397 default:
14398 gcc_unreachable ();
14401 emit_insn (gen (dst, s2, shift_rtx, s1));
14404 /* Emit an atomic swap. */
14406 static void
14407 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
14408 rtx mem, rtx model)
14410 emit_insn (gen_aarch64_atomic_swp (mode, dst, mem, value, model));
14413 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
14414 location to store the data read from memory. OUT_RESULT is the location to
14415 store the result of the operation. MEM is the memory location to read and
14416 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
14417 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
14418 be NULL. */
14420 void
14421 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
14422 rtx mem, rtx value, rtx model_rtx)
14424 machine_mode mode = GET_MODE (mem);
14425 machine_mode wmode = (mode == DImode ? DImode : SImode);
14426 const bool short_mode = (mode < SImode);
14427 int ldop_code;
14428 rtx src;
14429 rtx x;
14431 if (out_data)
14432 out_data = gen_lowpart (mode, out_data);
14434 if (out_result)
14435 out_result = gen_lowpart (mode, out_result);
14437 /* Make sure the value is in a register, putting it into a destination
14438 register if it needs to be manipulated. */
14439 if (!register_operand (value, mode)
14440 || code == AND || code == MINUS)
14442 src = out_result ? out_result : out_data;
14443 emit_move_insn (src, gen_lowpart (mode, value));
14445 else
14446 src = value;
14447 gcc_assert (register_operand (src, mode));
14449 /* Preprocess the data for the operation as necessary. If the operation is
14450 a SET then emit a swap instruction and finish. */
14451 switch (code)
14453 case SET:
14454 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
14455 return;
14457 case MINUS:
14458 /* Negate the value and treat it as a PLUS. */
14460 rtx neg_src;
14462 /* Resize the value if necessary. */
14463 if (short_mode)
14464 src = gen_lowpart (wmode, src);
14466 neg_src = gen_rtx_NEG (wmode, src);
14467 emit_insn (gen_rtx_SET (src, neg_src));
14469 if (short_mode)
14470 src = gen_lowpart (mode, src);
14472 /* Fall-through. */
14473 case PLUS:
14474 ldop_code = UNSPECV_ATOMIC_LDOP_PLUS;
14475 break;
14477 case IOR:
14478 ldop_code = UNSPECV_ATOMIC_LDOP_OR;
14479 break;
14481 case XOR:
14482 ldop_code = UNSPECV_ATOMIC_LDOP_XOR;
14483 break;
14485 case AND:
14487 rtx not_src;
14489 /* Resize the value if necessary. */
14490 if (short_mode)
14491 src = gen_lowpart (wmode, src);
14493 not_src = gen_rtx_NOT (wmode, src);
14494 emit_insn (gen_rtx_SET (src, not_src));
14496 if (short_mode)
14497 src = gen_lowpart (mode, src);
14499 ldop_code = UNSPECV_ATOMIC_LDOP_BIC;
14500 break;
14502 default:
14503 /* The operation can't be done with atomic instructions. */
14504 gcc_unreachable ();
14507 emit_insn (gen_aarch64_atomic_load (ldop_code, mode,
14508 out_data, mem, src, model_rtx));
14510 /* If necessary, calculate the data in memory after the update by redoing the
14511 operation from values in registers. */
14512 if (!out_result)
14513 return;
14515 if (short_mode)
14517 src = gen_lowpart (wmode, src);
14518 out_data = gen_lowpart (wmode, out_data);
14519 out_result = gen_lowpart (wmode, out_result);
14522 x = NULL_RTX;
14524 switch (code)
14526 case MINUS:
14527 case PLUS:
14528 x = gen_rtx_PLUS (wmode, out_data, src);
14529 break;
14530 case IOR:
14531 x = gen_rtx_IOR (wmode, out_data, src);
14532 break;
14533 case XOR:
14534 x = gen_rtx_XOR (wmode, out_data, src);
14535 break;
14536 case AND:
14537 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
14538 return;
14539 default:
14540 gcc_unreachable ();
14543 emit_set_insn (out_result, x);
14545 return;
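/* A sketch of the flow above: a fetch-and-sub with only OUT_DATA requested
   negates SRC and then emits a single LDADD-style atomic load-operate, while
   a fetch-and-and inverts SRC and uses the BIC form; if OUT_RESULT is also
   wanted, it is recomputed afterwards from OUT_DATA and SRC (via
   aarch64_emit_bic for AND, or the plain RTL operation otherwise).  */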
14548 /* Split an atomic operation. */
14550 void
14551 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
14552 rtx value, rtx model_rtx, rtx cond)
14554 machine_mode mode = GET_MODE (mem);
14555 machine_mode wmode = (mode == DImode ? DImode : SImode);
14556 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
14557 const bool is_sync = is_mm_sync (model);
14558 rtx_code_label *label;
14559 rtx x;
14561 /* Split the atomic operation into a sequence. */
14562 label = gen_label_rtx ();
14563 emit_label (label);
14565 if (new_out)
14566 new_out = gen_lowpart (wmode, new_out);
14567 if (old_out)
14568 old_out = gen_lowpart (wmode, old_out);
14569 else
14570 old_out = new_out;
14571 value = simplify_gen_subreg (wmode, value, mode, 0);
14573 /* The initial load can be relaxed for a __sync operation since a final
14574 barrier will be emitted to stop code hoisting. */
14575 if (is_sync)
14576 aarch64_emit_load_exclusive (mode, old_out, mem,
14577 GEN_INT (MEMMODEL_RELAXED));
14578 else
14579 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
14581 switch (code)
14583 case SET:
14584 new_out = value;
14585 break;
14587 case NOT:
14588 x = gen_rtx_AND (wmode, old_out, value);
14589 emit_insn (gen_rtx_SET (new_out, x));
14590 x = gen_rtx_NOT (wmode, new_out);
14591 emit_insn (gen_rtx_SET (new_out, x));
14592 break;
14594 case MINUS:
14595 if (CONST_INT_P (value))
14597 value = GEN_INT (-INTVAL (value));
14598 code = PLUS;
14600 /* Fall through. */
14602 default:
14603 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
14604 emit_insn (gen_rtx_SET (new_out, x));
14605 break;
14608 aarch64_emit_store_exclusive (mode, cond, mem,
14609 gen_lowpart (mode, new_out), model_rtx);
14611 if (aarch64_track_speculation)
14613 /* Emit an explicit compare instruction, so that we can correctly
14614 track the condition codes. */
14615 rtx cc_reg = aarch64_gen_compare_reg (NE, cond, const0_rtx);
14616 x = gen_rtx_NE (GET_MODE (cc_reg), cc_reg, const0_rtx);
14618 else
14619 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
14621 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
14622 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
14623 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
14625 /* Emit any final barrier needed for a __sync operation. */
14626 if (is_sync)
14627 aarch64_emit_post_barrier (model);
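/* When the LSE load-operate forms cannot be used, an operation such as a
   fetch-and-add on an SImode object is split by the function above into a
   load/store-exclusive loop of roughly this shape (illustrative only,
   modulo the memory-model dependent acquire/release variants):

     .label:
        LDXR   old, [mem]
        ADD    new, old, value
        STXR   cond, new, [mem]
        CBNZ   cond, .label                                              */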
14630 static void
14631 aarch64_init_libfuncs (void)
14633 /* Half-precision float operations. The compiler handles all operations
14634 with NULL libfuncs by converting to SFmode. */
14636 /* Conversions. */
14637 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
14638 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
14640 /* Arithmetic. */
14641 set_optab_libfunc (add_optab, HFmode, NULL);
14642 set_optab_libfunc (sdiv_optab, HFmode, NULL);
14643 set_optab_libfunc (smul_optab, HFmode, NULL);
14644 set_optab_libfunc (neg_optab, HFmode, NULL);
14645 set_optab_libfunc (sub_optab, HFmode, NULL);
14647 /* Comparisons. */
14648 set_optab_libfunc (eq_optab, HFmode, NULL);
14649 set_optab_libfunc (ne_optab, HFmode, NULL);
14650 set_optab_libfunc (lt_optab, HFmode, NULL);
14651 set_optab_libfunc (le_optab, HFmode, NULL);
14652 set_optab_libfunc (ge_optab, HFmode, NULL);
14653 set_optab_libfunc (gt_optab, HFmode, NULL);
14654 set_optab_libfunc (unord_optab, HFmode, NULL);
14657 /* Target hook for c_mode_for_suffix. */
14658 static machine_mode
14659 aarch64_c_mode_for_suffix (char suffix)
14661 if (suffix == 'q')
14662 return TFmode;
14664 return VOIDmode;
14667 /* We can only represent floating point constants which will fit in
14668 "quarter-precision" values. These values are characterised by
14669 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
14672 (-1)^s * (n/16) * 2^r
14674 Where:
14675 's' is the sign bit.
14676 'n' is an integer in the range 16 <= n <= 31.
14677 'r' is an integer in the range -3 <= r <= 4. */
14679 /* Return true iff X can be represented by a quarter-precision
14680 floating point immediate operand. Note, we cannot represent 0.0. */
14681 bool
14682 aarch64_float_const_representable_p (rtx x)
14684 /* This represents our current view of how many bits
14685 make up the mantissa. */
14686 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
14687 int exponent;
14688 unsigned HOST_WIDE_INT mantissa, mask;
14689 REAL_VALUE_TYPE r, m;
14690 bool fail;
14692 if (!CONST_DOUBLE_P (x))
14693 return false;
14695 if (GET_MODE (x) == VOIDmode
14696 || (GET_MODE (x) == HFmode && !TARGET_FP_F16INST))
14697 return false;
14699 r = *CONST_DOUBLE_REAL_VALUE (x);
14701 /* We cannot represent infinities, NaNs or +/-zero. We won't
14702 know if we have +zero until we analyse the mantissa, but we
14703 can reject the other invalid values. */
14704 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
14705 || REAL_VALUE_MINUS_ZERO (r))
14706 return false;
14708 /* Extract exponent. */
14709 r = real_value_abs (&r);
14710 exponent = REAL_EXP (&r);
14712 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
14713 highest (sign) bit, with a fixed binary point at bit point_pos.
14714 m1 holds the low part of the mantissa, m2 the high part.
14715 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
14716 bits for the mantissa, this can fail (low bits will be lost). */
14717 real_ldexp (&m, &r, point_pos - exponent);
14718 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
14720 /* If the low part of the mantissa has bits set we cannot represent
14721 the value. */
14722 if (w.ulow () != 0)
14723 return false;
14724 /* We have rejected the lower HOST_WIDE_INT, so update our
14725 understanding of how many bits lie in the mantissa and
14726 look only at the high HOST_WIDE_INT. */
14727 mantissa = w.elt (1);
14728 point_pos -= HOST_BITS_PER_WIDE_INT;
14730 /* We can only represent values with a mantissa of the form 1.xxxx. */
14731 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
14732 if ((mantissa & mask) != 0)
14733 return false;
14735 /* Having filtered unrepresentable values, we may now remove all
14736 but the highest 5 bits. */
14737 mantissa >>= point_pos - 5;
14739 /* We cannot represent the value 0.0, so reject it. This is handled
14740 elsewhere. */
14741 if (mantissa == 0)
14742 return false;
14744 /* Then, as bit 4 is always set, we can mask it off, leaving
14745 the mantissa in the range [0, 15]. */
14746 mantissa &= ~(1 << 4);
14747 gcc_assert (mantissa <= 15);
14749 /* GCC internally does not use IEEE754-like encoding (where normalized
14750 significands are in the range [1, 2). GCC uses [0.5, 1) (see real.c).
14751 Our mantissa values are shifted 4 places to the left relative to
14752 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
14753 by 5 places to correct for GCC's representation. */
14754 exponent = 5 - exponent;
14756 return (exponent >= 0 && exponent <= 7);
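/* Two worked examples of the check above (illustrative): 2.5 is
   representable, since 2.5 = (-1)^0 * (20/16) * 2^1 with n = 20 and r = 1
   both inside the ranges given earlier, whereas 1/3 cannot be written as
   (n/16) * 2^r at all, and 0.0 is rejected explicitly above.  */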
14759 /* Returns the string with the instruction for AdvSIMD MOVI, MVNI, ORR or BIC
14760 immediate with a CONST_VECTOR of MODE and WIDTH. WHICH selects whether to
14761 output MOVI/MVNI, ORR or BIC immediate. */
14762 char*
14763 aarch64_output_simd_mov_immediate (rtx const_vector, unsigned width,
14764 enum simd_immediate_check which)
14766 bool is_valid;
14767 static char templ[40];
14768 const char *mnemonic;
14769 const char *shift_op;
14770 unsigned int lane_count = 0;
14771 char element_char;
14773 struct simd_immediate_info info;
14775 /* This will return true to show const_vector is legal for use as either
14776 an AdvSIMD MOVI instruction (or, implicitly, MVNI), ORR or BIC immediate.
14777 It will also update INFO to show how the immediate should be generated.
14778 WHICH selects whether to check for MOVI/MVNI, ORR or BIC. */
14779 is_valid = aarch64_simd_valid_immediate (const_vector, &info, which);
14780 gcc_assert (is_valid);
14782 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14783 lane_count = width / GET_MODE_BITSIZE (info.elt_mode);
14785 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14787 gcc_assert (info.shift == 0 && info.insn == simd_immediate_info::MOV);
14788 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
14789 move immediate path. */
14790 if (aarch64_float_const_zero_rtx_p (info.value))
14791 info.value = GEN_INT (0);
14792 else
14794 const unsigned int buf_size = 20;
14795 char float_buf[buf_size] = {'\0'};
14796 real_to_decimal_for_mode (float_buf,
14797 CONST_DOUBLE_REAL_VALUE (info.value),
14798 buf_size, buf_size, 1, info.elt_mode);
14800 if (lane_count == 1)
14801 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
14802 else
14803 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
14804 lane_count, element_char, float_buf);
14805 return templ;
14809 gcc_assert (CONST_INT_P (info.value));
14811 if (which == AARCH64_CHECK_MOV)
14813 mnemonic = info.insn == simd_immediate_info::MVN ? "mvni" : "movi";
14814 shift_op = info.modifier == simd_immediate_info::MSL ? "msl" : "lsl";
14815 if (lane_count == 1)
14816 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
14817 mnemonic, UINTVAL (info.value));
14818 else if (info.shift)
14819 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14820 HOST_WIDE_INT_PRINT_HEX ", %s %d", mnemonic, lane_count,
14821 element_char, UINTVAL (info.value), shift_op, info.shift);
14822 else
14823 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, "
14824 HOST_WIDE_INT_PRINT_HEX, mnemonic, lane_count,
14825 element_char, UINTVAL (info.value));
14827 else
14829 /* For AARCH64_CHECK_BIC and AARCH64_CHECK_ORR. */
14830 mnemonic = info.insn == simd_immediate_info::MVN ? "bic" : "orr";
14831 if (info.shift)
14832 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14833 HOST_WIDE_INT_PRINT_DEC ", %s #%d", mnemonic, lane_count,
14834 element_char, UINTVAL (info.value), "lsl", info.shift);
14835 else
14836 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, #"
14837 HOST_WIDE_INT_PRINT_DEC, mnemonic, lane_count,
14838 element_char, UINTVAL (info.value));
14840 return templ;
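/* As an illustration of the templates above (a sketch): a V4SImode constant
   whose lanes are all 0x0000ab00 is printed for AARCH64_CHECK_MOV as

     movi    %0.4s, 0xab, lsl 8

   and the same payload checked as AARCH64_CHECK_ORR prints

     orr     %0.4s, #171, lsl #8

   matching the MOVI/MVNI and ORR/BIC format strings above.  */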
14843 char*
14844 aarch64_output_scalar_simd_mov_immediate (rtx immediate, scalar_int_mode mode)
14847 /* If a floating point number was passed and we desire to use it in an
14848 integer mode, do the conversion to integer. */
14849 if (CONST_DOUBLE_P (immediate) && GET_MODE_CLASS (mode) == MODE_INT)
14851 unsigned HOST_WIDE_INT ival;
14852 if (!aarch64_reinterpret_float_as_int (immediate, &ival))
14853 gcc_unreachable ();
14854 immediate = gen_int_mode (ival, mode);
14857 machine_mode vmode;
14858 /* Use a 64-bit mode for everything except for DI/DF mode, where we use
14859 a 128-bit vector mode. */
14860 int width = GET_MODE_BITSIZE (mode) == 64 ? 128 : 64;
14862 vmode = aarch64_simd_container_mode (mode, width);
14863 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
14864 return aarch64_output_simd_mov_immediate (v_op, width);
14867 /* Return the output string to use for moving immediate CONST_VECTOR
14868 into an SVE register. */
14870 char *
14871 aarch64_output_sve_mov_immediate (rtx const_vector)
14873 static char templ[40];
14874 struct simd_immediate_info info;
14875 char element_char;
14877 bool is_valid = aarch64_simd_valid_immediate (const_vector, &info);
14878 gcc_assert (is_valid);
14880 element_char = sizetochar (GET_MODE_BITSIZE (info.elt_mode));
14882 if (info.step)
14884 snprintf (templ, sizeof (templ), "index\t%%0.%c, #"
14885 HOST_WIDE_INT_PRINT_DEC ", #" HOST_WIDE_INT_PRINT_DEC,
14886 element_char, INTVAL (info.value), INTVAL (info.step));
14887 return templ;
14890 if (GET_MODE_CLASS (info.elt_mode) == MODE_FLOAT)
14892 if (aarch64_float_const_zero_rtx_p (info.value))
14893 info.value = GEN_INT (0);
14894 else
14896 const int buf_size = 20;
14897 char float_buf[buf_size] = {};
14898 real_to_decimal_for_mode (float_buf,
14899 CONST_DOUBLE_REAL_VALUE (info.value),
14900 buf_size, buf_size, 1, info.elt_mode);
14902 snprintf (templ, sizeof (templ), "fmov\t%%0.%c, #%s",
14903 element_char, float_buf);
14904 return templ;
14908 snprintf (templ, sizeof (templ), "mov\t%%0.%c, #" HOST_WIDE_INT_PRINT_DEC,
14909 element_char, INTVAL (info.value));
14910 return templ;
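/* For example (an illustrative sketch): an SVE integer constant forming the
   series 0, 2, 4, ... with 32-bit elements is printed as

     index   %0.s, #0, #2

   a vector of all-5 32-bit elements as "mov %0.s, #5", and non-zero
   floating-point duplicates use the FMOV form above.  */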
14913 /* Return the asm format for a PTRUE instruction whose destination has
14914 mode MODE. SUFFIX is the element size suffix. */
14916 char *
14917 aarch64_output_ptrue (machine_mode mode, char suffix)
14919 unsigned int nunits;
14920 static char buf[sizeof ("ptrue\t%0.N, vlNNNNN")];
14921 if (GET_MODE_NUNITS (mode).is_constant (&nunits))
14922 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, vl%d", suffix, nunits);
14923 else
14924 snprintf (buf, sizeof (buf), "ptrue\t%%0.%c, all", suffix);
14925 return buf;
14928 /* Split operands into moves from op[1] + op[2] into op[0]. */
14930 void
14931 aarch64_split_combinev16qi (rtx operands[3])
14933 unsigned int dest = REGNO (operands[0]);
14934 unsigned int src1 = REGNO (operands[1]);
14935 unsigned int src2 = REGNO (operands[2]);
14936 machine_mode halfmode = GET_MODE (operands[1]);
14937 unsigned int halfregs = REG_NREGS (operands[1]);
14938 rtx destlo, desthi;
14940 gcc_assert (halfmode == V16QImode);
14942 if (src1 == dest && src2 == dest + halfregs)
14944 /* No-op move. Can't split to nothing; emit something. */
14945 emit_note (NOTE_INSN_DELETED);
14946 return;
14949 /* Preserve register attributes for variable tracking. */
14950 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
14951 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
14952 GET_MODE_SIZE (halfmode));
14954 /* Special case of reversed high/low parts. */
14955 if (reg_overlap_mentioned_p (operands[2], destlo)
14956 && reg_overlap_mentioned_p (operands[1], desthi))
14958 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14959 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
14960 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
14962 else if (!reg_overlap_mentioned_p (operands[2], destlo))
14964 /* Try to avoid unnecessary moves if part of the result
14965 is in the right place already. */
14966 if (src1 != dest)
14967 emit_move_insn (destlo, operands[1]);
14968 if (src2 != dest + halfregs)
14969 emit_move_insn (desthi, operands[2]);
14971 else
14973 if (src2 != dest + halfregs)
14974 emit_move_insn (desthi, operands[2]);
14975 if (src1 != dest)
14976 emit_move_insn (destlo, operands[1]);
14980 /* vec_perm support. */
14982 struct expand_vec_perm_d
14984 rtx target, op0, op1;
14985 vec_perm_indices perm;
14986 machine_mode vmode;
14987 unsigned int vec_flags;
14988 bool one_vector_p;
14989 bool testing_p;
14992 /* Generate a variable permutation. */
14994 static void
14995 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
14997 machine_mode vmode = GET_MODE (target);
14998 bool one_vector_p = rtx_equal_p (op0, op1);
15000 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
15001 gcc_checking_assert (GET_MODE (op0) == vmode);
15002 gcc_checking_assert (GET_MODE (op1) == vmode);
15003 gcc_checking_assert (GET_MODE (sel) == vmode);
15004 gcc_checking_assert (TARGET_SIMD);
15006 if (one_vector_p)
15008 if (vmode == V8QImode)
15010 /* Expand the argument to a V16QI mode by duplicating it. */
15011 rtx pair = gen_reg_rtx (V16QImode);
15012 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
15013 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15015 else
15017 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
15020 else
15022 rtx pair;
15024 if (vmode == V8QImode)
15026 pair = gen_reg_rtx (V16QImode);
15027 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
15028 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
15030 else
15032 pair = gen_reg_rtx (OImode);
15033 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
15034 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
15039 /* Expand a vec_perm with the operands given by TARGET, OP0, OP1 and SEL.
15040 NELT is the number of elements in the vector. */
15042 void
15043 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel,
15044 unsigned int nelt)
15046 machine_mode vmode = GET_MODE (target);
15047 bool one_vector_p = rtx_equal_p (op0, op1);
15048 rtx mask;
15050 /* The TBL instruction does not use a modulo index, so we must take care
15051 of that ourselves. */
15052 mask = aarch64_simd_gen_const_vector_dup (vmode,
15053 one_vector_p ? nelt - 1 : 2 * nelt - 1);
15054 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
15056 /* For big-endian, we also need to reverse the index within the vector
15057 (but not which vector). */
15058 if (BYTES_BIG_ENDIAN)
15060 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
15061 if (!one_vector_p)
15062 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
15063 sel = expand_simple_binop (vmode, XOR, sel, mask,
15064 NULL, 0, OPTAB_LIB_WIDEN);
15066 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
15069 /* Generate (set TARGET (unspec [OP0 OP1] CODE)). */
15071 static void
15072 emit_unspec2 (rtx target, int code, rtx op0, rtx op1)
15074 emit_insn (gen_rtx_SET (target,
15075 gen_rtx_UNSPEC (GET_MODE (target),
15076 gen_rtvec (2, op0, op1), code)));
15079 /* Expand an SVE vec_perm with the given operands. */
15081 void
15082 aarch64_expand_sve_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
15084 machine_mode data_mode = GET_MODE (target);
15085 machine_mode sel_mode = GET_MODE (sel);
15086 /* Enforced by the pattern condition. */
15087 int nunits = GET_MODE_NUNITS (sel_mode).to_constant ();
15089 /* Note: vec_perm indices are supposed to wrap when they go beyond the
15090 size of the two value vectors, i.e. the upper bits of the indices
15091 are effectively ignored. SVE TBL instead produces 0 for any
15092 out-of-range indices, so we need to modulo all the vec_perm indices
15093 to ensure they are all in range. */
15094 rtx sel_reg = force_reg (sel_mode, sel);
15096 /* Check if the sel only references the first values vector. */
15097 if (GET_CODE (sel) == CONST_VECTOR
15098 && aarch64_const_vec_all_in_range_p (sel, 0, nunits - 1))
15100 emit_unspec2 (target, UNSPEC_TBL, op0, sel_reg);
15101 return;
15104 /* Check if the two values vectors are the same. */
15105 if (rtx_equal_p (op0, op1))
15107 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode, nunits - 1);
15108 rtx sel_mod = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15109 NULL, 0, OPTAB_DIRECT);
15110 emit_unspec2 (target, UNSPEC_TBL, op0, sel_mod);
15111 return;
15114 /* Run TBL on each value vector and combine the results. */
15116 rtx res0 = gen_reg_rtx (data_mode);
15117 rtx res1 = gen_reg_rtx (data_mode);
15118 rtx neg_num_elems = aarch64_simd_gen_const_vector_dup (sel_mode, -nunits);
15119 if (GET_CODE (sel) != CONST_VECTOR
15120 || !aarch64_const_vec_all_in_range_p (sel, 0, 2 * nunits - 1))
15122 rtx max_sel = aarch64_simd_gen_const_vector_dup (sel_mode,
15123 2 * nunits - 1);
15124 sel_reg = expand_simple_binop (sel_mode, AND, sel_reg, max_sel,
15125 NULL, 0, OPTAB_DIRECT);
15127 emit_unspec2 (res0, UNSPEC_TBL, op0, sel_reg);
15128 rtx sel_sub = expand_simple_binop (sel_mode, PLUS, sel_reg, neg_num_elems,
15129 NULL, 0, OPTAB_DIRECT);
15130 emit_unspec2 (res1, UNSPEC_TBL, op1, sel_sub);
15131 if (GET_MODE_CLASS (data_mode) == MODE_VECTOR_INT)
15132 emit_insn (gen_rtx_SET (target, gen_rtx_IOR (data_mode, res0, res1)));
15133 else
15134 emit_unspec2 (target, UNSPEC_IORF, res0, res1);
15137 /* Recognize patterns suitable for the TRN instructions. */
15138 static bool
15139 aarch64_evpc_trn (struct expand_vec_perm_d *d)
15141 HOST_WIDE_INT odd;
15142 poly_uint64 nelt = d->perm.length ();
15143 rtx out, in0, in1, x;
15144 machine_mode vmode = d->vmode;
15146 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15147 return false;
15149 /* Note that these are little-endian tests.
15150 We correct for big-endian later. */
15151 if (!d->perm[0].is_constant (&odd)
15152 || (odd != 0 && odd != 1)
15153 || !d->perm.series_p (0, 2, odd, 2)
15154 || !d->perm.series_p (1, 2, nelt + odd, 2))
15155 return false;
15157 /* Success! */
15158 if (d->testing_p)
15159 return true;
15161 in0 = d->op0;
15162 in1 = d->op1;
15163 /* We don't need a big-endian lane correction for SVE; see the comment
15164 at the head of aarch64-sve.md for details. */
15165 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15167 x = in0, in0 = in1, in1 = x;
15168 odd = !odd;
15170 out = d->target;
15172 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15173 odd ? UNSPEC_TRN2 : UNSPEC_TRN1));
15174 return true;
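/* A small worked example of the test above (illustrative): for V4SImode,
   the two-operand permutation { 0, 4, 2, 6 } has odd == 0 and passes both
   series checks, so it is matched as TRN1; { 1, 5, 3, 7 } is matched as
   TRN2.  */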
15177 /* Recognize patterns suitable for the UZP instructions. */
15178 static bool
15179 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
15181 HOST_WIDE_INT odd;
15182 rtx out, in0, in1, x;
15183 machine_mode vmode = d->vmode;
15185 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15186 return false;
15188 /* Note that these are little-endian tests.
15189 We correct for big-endian later. */
15190 if (!d->perm[0].is_constant (&odd)
15191 || (odd != 0 && odd != 1)
15192 || !d->perm.series_p (0, 1, odd, 2))
15193 return false;
15195 /* Success! */
15196 if (d->testing_p)
15197 return true;
15199 in0 = d->op0;
15200 in1 = d->op1;
15201 /* We don't need a big-endian lane correction for SVE; see the comment
15202 at the head of aarch64-sve.md for details. */
15203 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15205 x = in0, in0 = in1, in1 = x;
15206 odd = !odd;
15208 out = d->target;
15210 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15211 odd ? UNSPEC_UZP2 : UNSPEC_UZP1));
15212 return true;
15215 /* Recognize patterns suitable for the ZIP instructions. */
15216 static bool
15217 aarch64_evpc_zip (struct expand_vec_perm_d *d)
15219 unsigned int high;
15220 poly_uint64 nelt = d->perm.length ();
15221 rtx out, in0, in1, x;
15222 machine_mode vmode = d->vmode;
15224 if (GET_MODE_UNIT_SIZE (vmode) > 8)
15225 return false;
15227 /* Note that these are little-endian tests.
15228 We correct for big-endian later. */
15229 poly_uint64 first = d->perm[0];
15230 if ((maybe_ne (first, 0U) && maybe_ne (first * 2, nelt))
15231 || !d->perm.series_p (0, 2, first, 1)
15232 || !d->perm.series_p (1, 2, first + nelt, 1))
15233 return false;
15234 high = maybe_ne (first, 0U);
15236 /* Success! */
15237 if (d->testing_p)
15238 return true;
15240 in0 = d->op0;
15241 in1 = d->op1;
15242 /* We don't need a big-endian lane correction for SVE; see the comment
15243 at the head of aarch64-sve.md for details. */
15244 if (BYTES_BIG_ENDIAN && d->vec_flags == VEC_ADVSIMD)
15246 x = in0, in0 = in1, in1 = x;
15247 high = !high;
15249 out = d->target;
15251 emit_set_insn (out, gen_rtx_UNSPEC (vmode, gen_rtvec (2, in0, in1),
15252 high ? UNSPEC_ZIP2 : UNSPEC_ZIP1));
15253 return true;
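/* Similarly (an illustrative example), for V4SImode the permutation
   { 0, 4, 1, 5 } interleaves the low halves of the two inputs and is
   matched as ZIP1, while { 2, 6, 3, 7 } interleaves the high halves and is
   matched as ZIP2.  */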
15256 /* Recognize patterns for the EXT insn. */
15258 static bool
15259 aarch64_evpc_ext (struct expand_vec_perm_d *d)
15261 HOST_WIDE_INT location;
15262 rtx offset;
15264 /* The first element always refers to the first vector.
15265 Check if the extracted indices are increasing by one. */
15266 if (d->vec_flags == VEC_SVE_PRED
15267 || !d->perm[0].is_constant (&location)
15268 || !d->perm.series_p (0, 1, location, 1))
15269 return false;
15271 /* Success! */
15272 if (d->testing_p)
15273 return true;
15275 /* The case where (location == 0) is a no-op for both big- and little-endian,
15276 and is removed by the mid-end at optimization levels -O1 and higher.
15278 We don't need a big-endian lane correction for SVE; see the comment
15279 at the head of aarch64-sve.md for details. */
15280 if (BYTES_BIG_ENDIAN && location != 0 && d->vec_flags == VEC_ADVSIMD)
15282 /* After setup, we want the high elements of the first vector (stored
15283 at the LSB end of the register), and the low elements of the second
15284 vector (stored at the MSB end of the register). So swap. */
15285 std::swap (d->op0, d->op1);
15286 /* location != 0 (above), so safe to assume (nelt - location) < nelt.
15287 to_constant () is safe since this is restricted to Advanced SIMD
15288 vectors. */
15289 location = d->perm.length ().to_constant () - location;
15292 offset = GEN_INT (location);
15293 emit_set_insn (d->target,
15294 gen_rtx_UNSPEC (d->vmode,
15295 gen_rtvec (3, d->op0, d->op1, offset),
15296 UNSPEC_EXT));
15297 return true;
15300 /* Recognize patterns for the REV{64,32,16} insns, which reverse elements
15301 within each 64-bit, 32-bit or 16-bit granule. */
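/* For example, on V8QImode the REV32 permutation is { 3, 2, 1, 0, 7, 6, 5, 4 }:
   the bytes within each 32-bit word are reversed, but the words keep their
   order. */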
15303 static bool
15304 aarch64_evpc_rev_local (struct expand_vec_perm_d *d)
15306 HOST_WIDE_INT diff;
15307 unsigned int i, size, unspec;
15308 machine_mode pred_mode;
15310 if (d->vec_flags == VEC_SVE_PRED
15311 || !d->one_vector_p
15312 || !d->perm[0].is_constant (&diff))
15313 return false;
15315 size = (diff + 1) * GET_MODE_UNIT_SIZE (d->vmode);
15316 if (size == 8)
15318 unspec = UNSPEC_REV64;
15319 pred_mode = VNx2BImode;
15321 else if (size == 4)
15323 unspec = UNSPEC_REV32;
15324 pred_mode = VNx4BImode;
15326 else if (size == 2)
15328 unspec = UNSPEC_REV16;
15329 pred_mode = VNx8BImode;
15331 else
15332 return false;
15334 unsigned int step = diff + 1;
15335 for (i = 0; i < step; ++i)
15336 if (!d->perm.series_p (i, step, diff - i, step))
15337 return false;
15339 /* Success! */
15340 if (d->testing_p)
15341 return true;
15343 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), unspec);
15344 if (d->vec_flags == VEC_SVE_DATA)
15346 rtx pred = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15347 src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (2, pred, src),
15348 UNSPEC_MERGE_PTRUE);
15350 emit_set_insn (d->target, src);
15351 return true;
15354 /* Recognize patterns for the REV insn, which reverses elements within
15355 a full vector. */
15357 static bool
15358 aarch64_evpc_rev_global (struct expand_vec_perm_d *d)
15360 poly_uint64 nelt = d->perm.length ();
15362 if (!d->one_vector_p || d->vec_flags != VEC_SVE_DATA)
15363 return false;
15365 if (!d->perm.series_p (0, 1, nelt - 1, -1))
15366 return false;
15368 /* Success! */
15369 if (d->testing_p)
15370 return true;
15372 rtx src = gen_rtx_UNSPEC (d->vmode, gen_rtvec (1, d->op0), UNSPEC_REV);
15373 emit_set_insn (d->target, src);
15374 return true;
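/* Recognize patterns in which every element of the result is a copy of a
   single, constant-indexed element of the first input vector, which we can
   implement with a DUP instruction. */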
15377 static bool
15378 aarch64_evpc_dup (struct expand_vec_perm_d *d)
15380 rtx out = d->target;
15381 rtx in0;
15382 HOST_WIDE_INT elt;
15383 machine_mode vmode = d->vmode;
15384 rtx lane;
15386 if (d->vec_flags == VEC_SVE_PRED
15387 || d->perm.encoding ().encoded_nelts () != 1
15388 || !d->perm[0].is_constant (&elt))
15389 return false;
15391 if (d->vec_flags == VEC_SVE_DATA && elt >= 64 * GET_MODE_UNIT_SIZE (vmode))
15392 return false;
15394 /* Success! */
15395 if (d->testing_p)
15396 return true;
15398 /* The generic preparation in aarch64_expand_vec_perm_const_1
15399 swaps the operand order and the permute indices if it finds
15400 d->perm[0] to be in the second operand. Thus, we can always
15401 use d->op0 and need not do any extra arithmetic to get the
15402 correct lane number. */
15403 in0 = d->op0;
15404 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
15406 rtx parallel = gen_rtx_PARALLEL (vmode, gen_rtvec (1, lane));
15407 rtx select = gen_rtx_VEC_SELECT (GET_MODE_INNER (vmode), in0, parallel);
15408 emit_set_insn (out, gen_rtx_VEC_DUPLICATE (vmode, select));
15409 return true;
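/* Try to implement D using an Advanced SIMD TBL instruction, with the
   constant permute indices loaded into a register as the selector. */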
15412 static bool
15413 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
15415 rtx rperm[MAX_COMPILE_TIME_VEC_BYTES], sel;
15416 machine_mode vmode = d->vmode;
15418 /* Make sure that the indices are constant. */
15419 unsigned int encoded_nelts = d->perm.encoding ().encoded_nelts ();
15420 for (unsigned int i = 0; i < encoded_nelts; ++i)
15421 if (!d->perm[i].is_constant ())
15422 return false;
15424 if (d->testing_p)
15425 return true;
15427 /* Generic code will try constant permutation twice. Once with the
15428 original mode and again with the elements lowered to QImode.
15429 So wait and don't do the selector expansion ourselves. */
15430 if (vmode != V8QImode && vmode != V16QImode)
15431 return false;
15433 /* to_constant is safe since this routine is specific to Advanced SIMD
15434 vectors. */
15435 unsigned int nelt = d->perm.length ().to_constant ();
15436 for (unsigned int i = 0; i < nelt; ++i)
15437 /* If big-endian and two vectors we end up with a weird mixed-endian
15438 mode on NEON. Reverse the index within each word but not the word
15439 itself. to_constant is safe because we checked is_constant above. */
15440 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN
15441 ? d->perm[i].to_constant () ^ (nelt - 1)
15442 : d->perm[i].to_constant ());
15444 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
15445 sel = force_reg (vmode, sel);
15447 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
15448 return true;
15451 /* Try to implement D using an SVE TBL instruction. */
15453 static bool
15454 aarch64_evpc_sve_tbl (struct expand_vec_perm_d *d)
15456 unsigned HOST_WIDE_INT nelt;
15458 /* Permuting two variable-length vectors could overflow the
15459 index range. */
15460 if (!d->one_vector_p && !d->perm.length ().is_constant (&nelt))
15461 return false;
15463 if (d->testing_p)
15464 return true;
15466 machine_mode sel_mode = mode_for_int_vector (d->vmode).require ();
15467 rtx sel = vec_perm_indices_to_rtx (sel_mode, d->perm);
15468 if (d->one_vector_p)
15469 emit_unspec2 (d->target, UNSPEC_TBL, d->op0, force_reg (sel_mode, sel));
15470 else
15471 aarch64_expand_sve_vec_perm (d->target, d->op0, d->op1, sel);
15472 return true;
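/* Try to expand the constant permute described by D, using the
   pattern-specific expanders above where possible and falling back to a
   TBL-based expansion. Return true on success. */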
15475 static bool
15476 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
15478 /* The pattern matching functions above are written to look for a small
15479 number to begin the sequence (0, 1, N/2). If we begin with an index
15480 from the second operand, we can swap the operands. */
15481 poly_int64 nelt = d->perm.length ();
15482 if (known_ge (d->perm[0], nelt))
15484 d->perm.rotate_inputs (1);
15485 std::swap (d->op0, d->op1);
15488 if ((d->vec_flags == VEC_ADVSIMD
15489 || d->vec_flags == VEC_SVE_DATA
15490 || d->vec_flags == VEC_SVE_PRED)
15491 && known_gt (nelt, 1))
15493 if (aarch64_evpc_rev_local (d))
15494 return true;
15495 else if (aarch64_evpc_rev_global (d))
15496 return true;
15497 else if (aarch64_evpc_ext (d))
15498 return true;
15499 else if (aarch64_evpc_dup (d))
15500 return true;
15501 else if (aarch64_evpc_zip (d))
15502 return true;
15503 else if (aarch64_evpc_uzp (d))
15504 return true;
15505 else if (aarch64_evpc_trn (d))
15506 return true;
15507 if (d->vec_flags == VEC_SVE_DATA)
15508 return aarch64_evpc_sve_tbl (d);
15509 else if (d->vec_flags == VEC_ADVSIMD)
15510 return aarch64_evpc_tbl (d);
15512 return false;
15515 /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
15517 static bool
15518 aarch64_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
15519 rtx op1, const vec_perm_indices &sel)
15521 struct expand_vec_perm_d d;
15523 /* Check whether the mask can be applied to a single vector. */
15524 if (sel.ninputs () == 1
15525 || (op0 && rtx_equal_p (op0, op1)))
15526 d.one_vector_p = true;
15527 else if (sel.all_from_input_p (0))
15529 d.one_vector_p = true;
15530 op1 = op0;
15532 else if (sel.all_from_input_p (1))
15534 d.one_vector_p = true;
15535 op0 = op1;
15537 else
15538 d.one_vector_p = false;
15540 d.perm.new_vector (sel.encoding (), d.one_vector_p ? 1 : 2,
15541 sel.nelts_per_input ());
15542 d.vmode = vmode;
15543 d.vec_flags = aarch64_classify_vector_mode (d.vmode);
15544 d.target = target;
15545 d.op0 = op0;
15546 d.op1 = op1;
15547 d.testing_p = !target;
15549 if (!d.testing_p)
15550 return aarch64_expand_vec_perm_const_1 (&d);
15552 rtx_insn *last = get_last_insn ();
15553 bool ret = aarch64_expand_vec_perm_const_1 (&d);
15554 gcc_assert (last == get_last_insn ());
15556 return ret;
15559 /* Generate a byte permute mask for a register of mode MODE,
15560 which has NUNITS units. */
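/* For example, for V4SImode the mask is { 3, 2, 1, 0, 7, 6, 5, 4,
   11, 10, 9, 8, 15, 14, 13, 12 }: the bytes within each 32-bit element
   are reversed. */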
15562 rtx
15563 aarch64_reverse_mask (machine_mode mode, unsigned int nunits)
15565 /* We have to reverse each vector because we don't have
15566 a permuted load that can reverse-load according to ABI rules. */
15567 rtx mask;
15568 rtvec v = rtvec_alloc (16);
15569 unsigned int i, j;
15570 unsigned int usize = GET_MODE_UNIT_SIZE (mode);
15572 gcc_assert (BYTES_BIG_ENDIAN);
15573 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
15575 for (i = 0; i < nunits; i++)
15576 for (j = 0; j < usize; j++)
15577 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
15578 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
15579 return force_reg (V16QImode, mask);
15582 /* Return true if X is a valid second operand for the SVE instruction
15583 that implements integer comparison OP_CODE. */
15585 static bool
15586 aarch64_sve_cmp_operand_p (rtx_code op_code, rtx x)
15588 if (register_operand (x, VOIDmode))
15589 return true;
15591 switch (op_code)
15593 case LTU:
15594 case LEU:
15595 case GEU:
15596 case GTU:
15597 return aarch64_sve_cmp_immediate_p (x, false);
15598 case LT:
15599 case LE:
15600 case GE:
15601 case GT:
15602 case NE:
15603 case EQ:
15604 return aarch64_sve_cmp_immediate_p (x, true);
15605 default:
15606 gcc_unreachable ();
15610 /* Use predicated SVE instructions to implement the equivalent of:
15612 (set TARGET OP)
15614 given that PTRUE is an all-true predicate of the appropriate mode. */
15616 static void
15617 aarch64_emit_sve_ptrue_op (rtx target, rtx ptrue, rtx op)
15619 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15620 gen_rtvec (2, ptrue, op),
15621 UNSPEC_MERGE_PTRUE);
15622 rtx_insn *insn = emit_set_insn (target, unspec);
15623 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15626 /* Likewise, but also clobber the condition codes. */
15628 static void
15629 aarch64_emit_sve_ptrue_op_cc (rtx target, rtx ptrue, rtx op)
15631 rtx unspec = gen_rtx_UNSPEC (GET_MODE (target),
15632 gen_rtvec (2, ptrue, op),
15633 UNSPEC_MERGE_PTRUE);
15634 rtx_insn *insn = emit_insn (gen_set_clobber_cc (target, unspec));
15635 set_unique_reg_note (insn, REG_EQUAL, copy_rtx (op));
15638 /* Return the UNSPEC_COND_* code for comparison CODE. */
15640 static unsigned int
15641 aarch64_unspec_cond_code (rtx_code code)
15643 switch (code)
15645 case NE:
15646 return UNSPEC_COND_NE;
15647 case EQ:
15648 return UNSPEC_COND_EQ;
15649 case LT:
15650 return UNSPEC_COND_LT;
15651 case GT:
15652 return UNSPEC_COND_GT;
15653 case LE:
15654 return UNSPEC_COND_LE;
15655 case GE:
15656 return UNSPEC_COND_GE;
15657 default:
15658 gcc_unreachable ();
15662 /* Emit:
15664 (set TARGET (unspec [PRED OP0 OP1] UNSPEC_COND_<X>))
15666 where <X> is the operation associated with comparison CODE. This form
15667 of instruction is used when (and (CODE OP0 OP1) PRED) would have different
15668 semantics, such as when PRED might not be all-true and when comparing
15669 inactive lanes could have side effects. */
15671 static void
15672 aarch64_emit_sve_predicated_cond (rtx target, rtx_code code,
15673 rtx pred, rtx op0, rtx op1)
15675 rtx unspec = gen_rtx_UNSPEC (GET_MODE (pred),
15676 gen_rtvec (3, pred, op0, op1),
15677 aarch64_unspec_cond_code (code));
15678 emit_set_insn (target, unspec);
15681 /* Expand an SVE integer comparison using the SVE equivalent of:
15683 (set TARGET (CODE OP0 OP1)). */
15685 void
15686 aarch64_expand_sve_vec_cmp_int (rtx target, rtx_code code, rtx op0, rtx op1)
15688 machine_mode pred_mode = GET_MODE (target);
15689 machine_mode data_mode = GET_MODE (op0);
15691 if (!aarch64_sve_cmp_operand_p (code, op1))
15692 op1 = force_reg (data_mode, op1);
15694 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15695 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15696 aarch64_emit_sve_ptrue_op_cc (target, ptrue, cond);
15699 /* Emit the SVE equivalent of:
15701 (set TMP1 (CODE1 OP0 OP1))
15702 (set TMP2 (CODE2 OP0 OP1))
15703 (set TARGET (ior:PRED_MODE TMP1 TMP2))
15705 PTRUE is an all-true predicate with the same mode as TARGET. */
15707 static void
15708 aarch64_emit_sve_or_conds (rtx target, rtx_code code1, rtx_code code2,
15709 rtx ptrue, rtx op0, rtx op1)
15711 machine_mode pred_mode = GET_MODE (ptrue);
15712 rtx tmp1 = gen_reg_rtx (pred_mode);
15713 aarch64_emit_sve_ptrue_op (tmp1, ptrue,
15714 gen_rtx_fmt_ee (code1, pred_mode, op0, op1));
15715 rtx tmp2 = gen_reg_rtx (pred_mode);
15716 aarch64_emit_sve_ptrue_op (tmp2, ptrue,
15717 gen_rtx_fmt_ee (code2, pred_mode, op0, op1));
15718 aarch64_emit_binop (target, ior_optab, tmp1, tmp2);
15721 /* Emit the SVE equivalent of:
15723 (set TMP (CODE OP0 OP1))
15724 (set TARGET (not TMP))
15726 PTRUE is an all-true predicate with the same mode as TARGET. */
15728 static void
15729 aarch64_emit_sve_inverted_cond (rtx target, rtx ptrue, rtx_code code,
15730 rtx op0, rtx op1)
15732 machine_mode pred_mode = GET_MODE (ptrue);
15733 rtx tmp = gen_reg_rtx (pred_mode);
15734 aarch64_emit_sve_ptrue_op (tmp, ptrue,
15735 gen_rtx_fmt_ee (code, pred_mode, op0, op1));
15736 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15739 /* Expand an SVE floating-point comparison using the SVE equivalent of:
15741 (set TARGET (CODE OP0 OP1))
15743 If CAN_INVERT_P is true, the caller can also handle inverted results;
15744 return true if the result is in fact inverted. */
15746 bool
15747 aarch64_expand_sve_vec_cmp_float (rtx target, rtx_code code,
15748 rtx op0, rtx op1, bool can_invert_p)
15750 machine_mode pred_mode = GET_MODE (target);
15751 machine_mode data_mode = GET_MODE (op0);
15753 rtx ptrue = force_reg (pred_mode, CONSTM1_RTX (pred_mode));
15754 switch (code)
15756 case UNORDERED:
15757 /* UNORDERED has no immediate form. */
15758 op1 = force_reg (data_mode, op1);
15759 /* fall through */
15760 case LT:
15761 case LE:
15762 case GT:
15763 case GE:
15764 case EQ:
15765 case NE:
15767 /* There is native support for the comparison. */
15768 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15769 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15770 return false;
15773 case LTGT:
15774 /* This is a trapping operation (LT or GT). */
15775 aarch64_emit_sve_or_conds (target, LT, GT, ptrue, op0, op1);
15776 return false;
15778 case UNEQ:
15779 if (!flag_trapping_math)
15781 /* This would trap for signaling NaNs. */
15782 op1 = force_reg (data_mode, op1);
15783 aarch64_emit_sve_or_conds (target, UNORDERED, EQ, ptrue, op0, op1);
15784 return false;
15786 /* fall through */
15787 case UNLT:
15788 case UNLE:
15789 case UNGT:
15790 case UNGE:
15791 if (flag_trapping_math)
15793 /* Work out which elements are ordered. */
15794 rtx ordered = gen_reg_rtx (pred_mode);
15795 op1 = force_reg (data_mode, op1);
15796 aarch64_emit_sve_inverted_cond (ordered, ptrue, UNORDERED, op0, op1);
15798 /* Test the opposite condition for the ordered elements,
15799 then invert the result. */
15800 if (code == UNEQ)
15801 code = NE;
15802 else
15803 code = reverse_condition_maybe_unordered (code);
15804 if (can_invert_p)
15806 aarch64_emit_sve_predicated_cond (target, code,
15807 ordered, op0, op1);
15808 return true;
15810 rtx tmp = gen_reg_rtx (pred_mode);
15811 aarch64_emit_sve_predicated_cond (tmp, code, ordered, op0, op1);
15812 aarch64_emit_unop (target, one_cmpl_optab, tmp);
15813 return false;
15815 break;
15817 case ORDERED:
15818 /* ORDERED has no immediate form. */
15819 op1 = force_reg (data_mode, op1);
15820 break;
15822 default:
15823 gcc_unreachable ();
15826 /* There is native support for the inverse comparison. */
15827 code = reverse_condition_maybe_unordered (code);
15828 if (can_invert_p)
15830 rtx cond = gen_rtx_fmt_ee (code, pred_mode, op0, op1);
15831 aarch64_emit_sve_ptrue_op (target, ptrue, cond);
15832 return true;
15834 aarch64_emit_sve_inverted_cond (target, ptrue, code, op0, op1);
15835 return false;
15838 /* Expand an SVE vcond pattern with operands OPS. DATA_MODE is the mode
15839 of the data being selected and CMP_MODE is the mode of the values being
15840 compared. */
15842 void
15843 aarch64_expand_sve_vcond (machine_mode data_mode, machine_mode cmp_mode,
15844 rtx *ops)
15846 machine_mode pred_mode
15847 = aarch64_get_mask_mode (GET_MODE_NUNITS (cmp_mode),
15848 GET_MODE_SIZE (cmp_mode)).require ();
15849 rtx pred = gen_reg_rtx (pred_mode);
15850 if (FLOAT_MODE_P (cmp_mode))
15852 if (aarch64_expand_sve_vec_cmp_float (pred, GET_CODE (ops[3]),
15853 ops[4], ops[5], true))
15854 std::swap (ops[1], ops[2]);
15856 else
15857 aarch64_expand_sve_vec_cmp_int (pred, GET_CODE (ops[3]), ops[4], ops[5]);
15859 rtvec vec = gen_rtvec (3, pred, ops[1], ops[2]);
15860 emit_set_insn (ops[0], gen_rtx_UNSPEC (data_mode, vec, UNSPEC_SEL));
15863 /* Implement TARGET_MODES_TIEABLE_P. In principle we should always return
15864 true. However, due to issues with register allocation it is preferable
15865 to avoid tying integer scalar and FP scalar modes. Executing integer
15866 operations in general registers is better than treating them as scalar
15867 vector operations. This reduces latency and avoids redundant int<->FP
15868 moves. So tie modes if they are either the same class, or vector modes
15869 with other vector modes, vector structs or any scalar mode. */
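/* For example, DImode and DFmode are not tied (different classes and
   neither is a vector mode), whereas V4SImode and V2DFmode are tied
   because both are vector data modes. */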
15871 static bool
15872 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
15874 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
15875 return true;
15877 /* We specifically want to allow elements of "structure" modes to
15878 be tieable to the structure. This more general condition allows
15879 other rarer situations too. The reason we don't extend this to
15880 predicate modes is that there are no predicate structure modes
15881 nor any specific instructions for extracting part of a predicate
15882 register. */
15883 if (aarch64_vector_data_mode_p (mode1)
15884 && aarch64_vector_data_mode_p (mode2))
15885 return true;
15887 /* Also allow any scalar modes with vectors. */
15888 if (aarch64_vector_mode_supported_p (mode1)
15889 || aarch64_vector_mode_supported_p (mode2))
15890 return true;
15892 return false;
15895 /* Return a new RTX holding the result of moving POINTER forward by
15896 AMOUNT bytes. */
15898 static rtx
15899 aarch64_move_pointer (rtx pointer, poly_int64 amount)
15901 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
15903 return adjust_automodify_address (pointer, GET_MODE (pointer),
15904 next, amount);
15907 /* Return a new RTX holding the result of moving POINTER forward by the
15908 size of the mode it points to. */
15910 static rtx
15911 aarch64_progress_pointer (rtx pointer)
15913 return aarch64_move_pointer (pointer, GET_MODE_SIZE (GET_MODE (pointer)));
15916 /* Copy one MODE-sized block from SRC to DST, then advance SRC and DST by
15917 the size of MODE. */
15919 static void
15920 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
15921 machine_mode mode)
15923 rtx reg = gen_reg_rtx (mode);
15925 /* "Cast" the pointers to the correct mode. */
15926 *src = adjust_address (*src, mode, 0);
15927 *dst = adjust_address (*dst, mode, 0);
15928 /* Emit the memcpy. */
15929 emit_move_insn (reg, *src);
15930 emit_move_insn (*dst, reg);
15931 /* Move the pointers forward. */
15932 *src = aarch64_progress_pointer (*src);
15933 *dst = aarch64_progress_pointer (*dst);
15936 /* Expand movmem, as if from a __builtin_memcpy. Return true if
15937 we succeed, otherwise return false. */
15939 bool
15940 aarch64_expand_movmem (rtx *operands)
15942 int n, mode_bits;
15943 rtx dst = operands[0];
15944 rtx src = operands[1];
15945 rtx base;
15946 machine_mode cur_mode = BLKmode, next_mode;
15947 bool speed_p = !optimize_function_for_size_p (cfun);
15949 /* When optimizing for size, give a better estimate of the length of a
15950 memcpy call, but use the default otherwise. Moves larger than 8 bytes
15951 will always require an even number of instructions to do now, and each
15952 operation requires both a load and a store, so divide the max number by 2. */
15953 int max_num_moves = (speed_p ? 16 : AARCH64_CALL_RATIO) / 2;
15955 /* We can't do anything smart if the amount to copy is not constant. */
15956 if (!CONST_INT_P (operands[2]))
15957 return false;
15959 n = INTVAL (operands[2]);
15961 /* Try to keep the number of instructions low. For all cases we will do at
15962 most two moves for the residual amount, since we'll always overlap the
15963 remainder. */
15964 if (((n / 16) + (n % 16 ? 2 : 0)) > max_num_moves)
15965 return false;
15967 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
15968 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
15970 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
15971 src = adjust_automodify_address (src, VOIDmode, base, 0);
15973 /* Convert n to bits to make the rest of the code simpler. */
15974 n = n * BITS_PER_UNIT;
15976 /* Maximum amount to copy in one go. The AArch64 back-end has integer modes
15977 larger than TImode, but we should not use them for loads/stores here. */
15978 const int copy_limit = GET_MODE_BITSIZE (TImode);
15980 while (n > 0)
15982 /* Find the largest mode in which to do the copy without over-reading
15983 or writing. */
15984 opt_scalar_int_mode mode_iter;
15985 FOR_EACH_MODE_IN_CLASS (mode_iter, MODE_INT)
15986 if (GET_MODE_BITSIZE (mode_iter.require ()) <= MIN (n, copy_limit))
15987 cur_mode = mode_iter.require ();
15989 gcc_assert (cur_mode != BLKmode);
15991 mode_bits = GET_MODE_BITSIZE (cur_mode).to_constant ();
15992 aarch64_copy_one_block_and_progress_pointers (&src, &dst, cur_mode);
15994 n -= mode_bits;
15996 /* Do certain trailing copies as overlapping if it's going to be
15997 cheaper, i.e. fewer instructions. For instance, for a 15 byte copy
15998 it's more efficient to do two overlapping 8 byte copies than copies
15999 of 8 + 4 + 2 + 1 bytes. */
16000 if (n > 0 && n <= 8 * BITS_PER_UNIT)
16002 next_mode = smallest_mode_for_size (n, MODE_INT);
16003 int n_bits = GET_MODE_BITSIZE (next_mode).to_constant ();
16004 src = aarch64_move_pointer (src, (n - n_bits) / BITS_PER_UNIT);
16005 dst = aarch64_move_pointer (dst, (n - n_bits) / BITS_PER_UNIT);
16006 n = n_bits;
16010 return true;
16013 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
16014 SImode stores. Handle the case when the constant has identical
16015 bottom and top halves. This is beneficial when the two stores can be
16016 merged into an STP and we avoid synthesising potentially expensive
16017 immediates twice. Return true if such a split is possible. */
16019 bool
16020 aarch64_split_dimode_const_store (rtx dst, rtx src)
16022 rtx lo = gen_lowpart (SImode, src);
16023 rtx hi = gen_highpart_mode (SImode, DImode, src);
16025 bool size_p = optimize_function_for_size_p (cfun);
16027 if (!rtx_equal_p (lo, hi))
16028 return false;
16030 unsigned int orig_cost
16031 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
16032 unsigned int lo_cost
16033 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
16035 /* We want to transform:
16036 MOV x1, 49370
16037 MOVK x1, 0x140, lsl 16
16038 MOVK x1, 0xc0da, lsl 32
16039 MOVK x1, 0x140, lsl 48
16040 STR x1, [x0]
16041 into:
16042 MOV w1, 49370
16043 MOVK w1, 0x140, lsl 16
16044 STP w1, w1, [x0]
16045 So we want to perform this only when we save two instructions
16046 or more. When optimizing for size, however, accept any code size
16047 savings we can. */
16048 if (size_p && orig_cost <= lo_cost)
16049 return false;
16051 if (!size_p
16052 && (orig_cost <= lo_cost + 1))
16053 return false;
16055 rtx mem_lo = adjust_address (dst, SImode, 0);
16056 if (!aarch64_mem_pair_operand (mem_lo, SImode))
16057 return false;
16059 rtx tmp_reg = gen_reg_rtx (SImode);
16060 aarch64_expand_mov_immediate (tmp_reg, lo);
16061 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
16062 /* Don't emit an explicit store pair as this may not always be profitable.
16063 Let the sched-fusion logic decide whether to merge them. */
16064 emit_move_insn (mem_lo, tmp_reg);
16065 emit_move_insn (mem_hi, tmp_reg);
16067 return true;
16070 /* Generate RTL for a conditional branch with rtx comparison CODE in
16071 mode CC_MODE. The destination of the unlikely conditional branch
16072 is LABEL_REF. */
16074 void
16075 aarch64_gen_unlikely_cbranch (enum rtx_code code, machine_mode cc_mode,
16076 rtx label_ref)
16078 rtx x;
16079 x = gen_rtx_fmt_ee (code, VOIDmode,
16080 gen_rtx_REG (cc_mode, CC_REGNUM),
16081 const0_rtx);
16083 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
16084 gen_rtx_LABEL_REF (VOIDmode, label_ref),
16085 pc_rtx);
16086 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
16089 /* Generate DImode scratch registers for 128-bit (TImode) addition.
16091 OP1 represents the TImode source operand 1
16092 OP2 represents the TImode source operand 2
16093 LOW_DEST represents the low half (DImode) of TImode operand 0
16094 LOW_IN1 represents the low half (DImode) of TImode operand 1
16095 LOW_IN2 represents the low half (DImode) of TImode operand 2
16096 HIGH_DEST represents the high half (DImode) of TImode operand 0
16097 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16098 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16100 void
16101 aarch64_addti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16102 rtx *low_in1, rtx *low_in2,
16103 rtx *high_dest, rtx *high_in1,
16104 rtx *high_in2)
16106 *low_dest = gen_reg_rtx (DImode);
16107 *low_in1 = gen_lowpart (DImode, op1);
16108 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16109 subreg_lowpart_offset (DImode, TImode));
16110 *high_dest = gen_reg_rtx (DImode);
16111 *high_in1 = gen_highpart (DImode, op1);
16112 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16113 subreg_highpart_offset (DImode, TImode));
16116 /* Generate DImode scratch registers for 128-bit (TImode) subtraction.
16118 This function differs from 'aarch64_addti_scratch_regs' in that
16119 OP1 can be an immediate constant (zero). We must call
16120 subreg_highpart_offset with DImode and TImode arguments, otherwise
16121 VOIDmode will be used for the const_int, which generates an internal
16122 error from subreg_size_highpart_offset, which does not expect a size of zero.
16124 OP1 represents the TImode source operand 1
16125 OP2 represents the TImode source operand 2
16126 LOW_DEST represents the low half (DImode) of TImode operand 0
16127 LOW_IN1 represents the low half (DImode) of TImode operand 1
16128 LOW_IN2 represents the low half (DImode) of TImode operand 2
16129 HIGH_DEST represents the high half (DImode) of TImode operand 0
16130 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16131 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16134 void
16135 aarch64_subvti_scratch_regs (rtx op1, rtx op2, rtx *low_dest,
16136 rtx *low_in1, rtx *low_in2,
16137 rtx *high_dest, rtx *high_in1,
16138 rtx *high_in2)
16140 *low_dest = gen_reg_rtx (DImode);
16141 *low_in1 = simplify_gen_subreg (DImode, op1, TImode,
16142 subreg_lowpart_offset (DImode, TImode));
16144 *low_in2 = simplify_gen_subreg (DImode, op2, TImode,
16145 subreg_lowpart_offset (DImode, TImode));
16146 *high_dest = gen_reg_rtx (DImode);
16148 *high_in1 = simplify_gen_subreg (DImode, op1, TImode,
16149 subreg_highpart_offset (DImode, TImode));
16150 *high_in2 = simplify_gen_subreg (DImode, op2, TImode,
16151 subreg_highpart_offset (DImode, TImode));
16154 /* Generate RTL for 128-bit (TImode) subtraction with overflow.
16156 OP0 represents the TImode destination operand 0
16157 LOW_DEST represents the low half (DImode) of TImode operand 0
16158 LOW_IN1 represents the low half (DImode) of TImode operand 1
16159 LOW_IN2 represents the low half (DImode) of TImode operand 2
16160 HIGH_DEST represents the high half (DImode) of TImode operand 0
16161 HIGH_IN1 represents the high half (DImode) of TImode operand 1
16162 HIGH_IN2 represents the high half (DImode) of TImode operand 2. */
16164 void
16165 aarch64_expand_subvti (rtx op0, rtx low_dest, rtx low_in1,
16166 rtx low_in2, rtx high_dest, rtx high_in1,
16167 rtx high_in2)
16169 if (low_in2 == const0_rtx)
16171 low_dest = low_in1;
16172 emit_insn (gen_subdi3_compare1 (high_dest, high_in1,
16173 force_reg (DImode, high_in2)));
16175 else
16177 if (CONST_INT_P (low_in2))
16179 low_in2 = force_reg (DImode, GEN_INT (-UINTVAL (low_in2)));
16180 high_in2 = force_reg (DImode, high_in2);
16181 emit_insn (gen_adddi3_compareC (low_dest, low_in1, low_in2));
16183 else
16184 emit_insn (gen_subdi3_compare1 (low_dest, low_in1, low_in2));
16185 emit_insn (gen_subdi3_carryinCV (high_dest,
16186 force_reg (DImode, high_in1),
16187 high_in2));
16190 emit_move_insn (gen_lowpart (DImode, op0), low_dest);
16191 emit_move_insn (gen_highpart (DImode, op0), high_dest);
16195 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
16197 static unsigned HOST_WIDE_INT
16198 aarch64_asan_shadow_offset (void)
16200 return (HOST_WIDE_INT_1 << 36);
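/* Implement TARGET_GEN_CCMP_FIRST. Expand the first comparison of a
   conditional-compare sequence: put the preparation statements in *PREP_SEQ
   and the compare itself in *GEN_SEQ, and return the comparison of the CC
   register against zero, or NULL_RTX if the comparison cannot be handled. */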
16203 static rtx
16204 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
16205 int code, tree treeop0, tree treeop1)
16207 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16208 rtx op0, op1;
16209 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16210 insn_code icode;
16211 struct expand_operand ops[4];
16213 start_sequence ();
16214 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16216 op_mode = GET_MODE (op0);
16217 if (op_mode == VOIDmode)
16218 op_mode = GET_MODE (op1);
16220 switch (op_mode)
16222 case E_QImode:
16223 case E_HImode:
16224 case E_SImode:
16225 cmp_mode = SImode;
16226 icode = CODE_FOR_cmpsi;
16227 break;
16229 case E_DImode:
16230 cmp_mode = DImode;
16231 icode = CODE_FOR_cmpdi;
16232 break;
16234 case E_SFmode:
16235 cmp_mode = SFmode;
16236 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16237 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
16238 break;
16240 case E_DFmode:
16241 cmp_mode = DFmode;
16242 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
16243 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
16244 break;
16246 default:
16247 end_sequence ();
16248 return NULL_RTX;
16251 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
16252 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
16253 if (!op0 || !op1)
16255 end_sequence ();
16256 return NULL_RTX;
16258 *prep_seq = get_insns ();
16259 end_sequence ();
16261 create_fixed_operand (&ops[0], op0);
16262 create_fixed_operand (&ops[1], op1);
16264 start_sequence ();
16265 if (!maybe_expand_insn (icode, 2, ops))
16267 end_sequence ();
16268 return NULL_RTX;
16270 *gen_seq = get_insns ();
16271 end_sequence ();
16273 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
16274 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
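/* Implement TARGET_GEN_CCMP_NEXT. Expand a conditional compare that chains
   the comparison CMP_CODE of TREEOP0 and TREEOP1 onto the result of the
   previous comparison PREV under the AND/IOR operation BIT_CODE, appending
   to *PREP_SEQ and *GEN_SEQ. Return the new comparison, or NULL_RTX on
   failure. */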
16277 static rtx
16278 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
16279 int cmp_code, tree treeop0, tree treeop1, int bit_code)
16281 rtx op0, op1, target;
16282 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
16283 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
16284 insn_code icode;
16285 struct expand_operand ops[6];
16286 int aarch64_cond;
16288 push_to_sequence (*prep_seq);
16289 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
16291 op_mode = GET_MODE (op0);
16292 if (op_mode == VOIDmode)
16293 op_mode = GET_MODE (op1);
16295 switch (op_mode)
16297 case E_QImode:
16298 case E_HImode:
16299 case E_SImode:
16300 cmp_mode = SImode;
16301 icode = CODE_FOR_ccmpsi;
16302 break;
16304 case E_DImode:
16305 cmp_mode = DImode;
16306 icode = CODE_FOR_ccmpdi;
16307 break;
16309 case E_SFmode:
16310 cmp_mode = SFmode;
16311 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16312 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
16313 break;
16315 case E_DFmode:
16316 cmp_mode = DFmode;
16317 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
16318 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
16319 break;
16321 default:
16322 end_sequence ();
16323 return NULL_RTX;
16326 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
16327 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
16328 if (!op0 || !op1)
16330 end_sequence ();
16331 return NULL_RTX;
16333 *prep_seq = get_insns ();
16334 end_sequence ();
16336 target = gen_rtx_REG (cc_mode, CC_REGNUM);
16337 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
16339 if (bit_code != AND)
16341 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
16342 GET_MODE (XEXP (prev, 0))),
16343 VOIDmode, XEXP (prev, 0), const0_rtx);
16344 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
16347 create_fixed_operand (&ops[0], XEXP (prev, 0));
16348 create_fixed_operand (&ops[1], target);
16349 create_fixed_operand (&ops[2], op0);
16350 create_fixed_operand (&ops[3], op1);
16351 create_fixed_operand (&ops[4], prev);
16352 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
16354 push_to_sequence (*gen_seq);
16355 if (!maybe_expand_insn (icode, 6, ops))
16357 end_sequence ();
16358 return NULL_RTX;
16361 *gen_seq = get_insns ();
16362 end_sequence ();
16364 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
16367 #undef TARGET_GEN_CCMP_FIRST
16368 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
16370 #undef TARGET_GEN_CCMP_NEXT
16371 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
16373 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
16374 instruction fusion of some sort. */
16376 static bool
16377 aarch64_macro_fusion_p (void)
16379 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
16383 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
16384 should be kept together during scheduling. */
16386 static bool
16387 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
16389 rtx set_dest;
16390 rtx prev_set = single_set (prev);
16391 rtx curr_set = single_set (curr);
16392 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
16393 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
16395 if (!aarch64_macro_fusion_p ())
16396 return false;
16398 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
16400 /* We are trying to match:
16401 prev (mov) == (set (reg r0) (const_int imm16))
16402 curr (movk) == (set (zero_extract (reg r0)
16403 (const_int 16)
16404 (const_int 16))
16405 (const_int imm16_1)) */
16407 set_dest = SET_DEST (curr_set);
16409 if (GET_CODE (set_dest) == ZERO_EXTRACT
16410 && CONST_INT_P (SET_SRC (curr_set))
16411 && CONST_INT_P (SET_SRC (prev_set))
16412 && CONST_INT_P (XEXP (set_dest, 2))
16413 && INTVAL (XEXP (set_dest, 2)) == 16
16414 && REG_P (XEXP (set_dest, 0))
16415 && REG_P (SET_DEST (prev_set))
16416 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
16418 return true;
16422 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
16425 /* We're trying to match:
16426 prev (adrp) == (set (reg r1)
16427 (high (symbol_ref ("SYM"))))
16428 curr (add) == (set (reg r0)
16429 (lo_sum (reg r1)
16430 (symbol_ref ("SYM"))))
16431 Note that r0 need not necessarily be the same as r1, especially
16432 during pre-regalloc scheduling. */
16434 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16435 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16437 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
16438 && REG_P (XEXP (SET_SRC (curr_set), 0))
16439 && REGNO (XEXP (SET_SRC (curr_set), 0))
16440 == REGNO (SET_DEST (prev_set))
16441 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
16442 XEXP (SET_SRC (curr_set), 1)))
16443 return true;
16447 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
16450 /* We're trying to match:
16451 prev (movk) == (set (zero_extract (reg r0)
16452 (const_int 16)
16453 (const_int 32))
16454 (const_int imm16_1))
16455 curr (movk) == (set (zero_extract (reg r0)
16456 (const_int 16)
16457 (const_int 48))
16458 (const_int imm16_2)) */
16460 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
16461 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
16462 && REG_P (XEXP (SET_DEST (prev_set), 0))
16463 && REG_P (XEXP (SET_DEST (curr_set), 0))
16464 && REGNO (XEXP (SET_DEST (prev_set), 0))
16465 == REGNO (XEXP (SET_DEST (curr_set), 0))
16466 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
16467 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
16468 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
16469 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
16470 && CONST_INT_P (SET_SRC (prev_set))
16471 && CONST_INT_P (SET_SRC (curr_set)))
16472 return true;
16475 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
16477 /* We're trying to match:
16478 prev (adrp) == (set (reg r0)
16479 (high (symbol_ref ("SYM"))))
16480 curr (ldr) == (set (reg r1)
16481 (mem (lo_sum (reg r0)
16482 (symbol_ref ("SYM")))))
16484 curr (ldr) == (set (reg r1)
16485 (zero_extend (mem
16486 (lo_sum (reg r0)
16487 (symbol_ref ("SYM")))))) */
16488 if (satisfies_constraint_Ush (SET_SRC (prev_set))
16489 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
16491 rtx curr_src = SET_SRC (curr_set);
16493 if (GET_CODE (curr_src) == ZERO_EXTEND)
16494 curr_src = XEXP (curr_src, 0);
16496 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
16497 && REG_P (XEXP (XEXP (curr_src, 0), 0))
16498 && REGNO (XEXP (XEXP (curr_src, 0), 0))
16499 == REGNO (SET_DEST (prev_set))
16500 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
16501 XEXP (SET_SRC (prev_set), 0)))
16502 return true;
16506 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
16507 && aarch_crypto_can_dual_issue (prev, curr))
16508 return true;
16510 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
16511 && any_condjump_p (curr))
16513 enum attr_type prev_type = get_attr_type (prev);
16515 unsigned int condreg1, condreg2;
16516 rtx cc_reg_1;
16517 aarch64_fixed_condition_code_regs (&condreg1, &condreg2);
16518 cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
16520 if (reg_referenced_p (cc_reg_1, PATTERN (curr))
16521 && prev
16522 && modified_in_p (cc_reg_1, prev))
16524 /* FIXME: this misses some instructions which are considered simple
16525 arithmetic for ThunderX. Simple shifts are missed here. */
16526 if (prev_type == TYPE_ALUS_SREG
16527 || prev_type == TYPE_ALUS_IMM
16528 || prev_type == TYPE_LOGICS_REG
16529 || prev_type == TYPE_LOGICS_IMM)
16530 return true;
16534 if (prev_set
16535 && curr_set
16536 && aarch64_fusion_enabled_p (AARCH64_FUSE_ALU_BRANCH)
16537 && any_condjump_p (curr))
16539 /* We're trying to match:
16540 prev (alu_insn) == (set (r0) plus ((r0) (r1/imm)))
16541 curr (cbz) == (set (pc) (if_then_else (eq/ne) (r0)
16542 (const_int 0))
16543 (label_ref ("SYM"))
16544 (pc)) */
16545 if (SET_DEST (curr_set) == (pc_rtx)
16546 && GET_CODE (SET_SRC (curr_set)) == IF_THEN_ELSE
16547 && REG_P (XEXP (XEXP (SET_SRC (curr_set), 0), 0))
16548 && REG_P (SET_DEST (prev_set))
16549 && REGNO (SET_DEST (prev_set))
16550 == REGNO (XEXP (XEXP (SET_SRC (curr_set), 0), 0)))
16552 /* Fuse ALU operations followed by a conditional branch instruction. */
16553 switch (get_attr_type (prev))
16555 case TYPE_ALU_IMM:
16556 case TYPE_ALU_SREG:
16557 case TYPE_ADC_REG:
16558 case TYPE_ADC_IMM:
16559 case TYPE_ADCS_REG:
16560 case TYPE_ADCS_IMM:
16561 case TYPE_LOGIC_REG:
16562 case TYPE_LOGIC_IMM:
16563 case TYPE_CSEL:
16564 case TYPE_ADR:
16565 case TYPE_MOV_IMM:
16566 case TYPE_SHIFT_REG:
16567 case TYPE_SHIFT_IMM:
16568 case TYPE_BFM:
16569 case TYPE_RBIT:
16570 case TYPE_REV:
16571 case TYPE_EXTEND:
16572 return true;
16574 default:;
16579 return false;
16582 /* Return true iff the instruction fusion described by OP is enabled. */
16584 bool
16585 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
16587 return (aarch64_tune_params.fusible_ops & op) != 0;
16590 /* If MEM is in the form of [base+offset], extract the two parts of the
16591 address and store them in BASE and OFFSET; otherwise return false
16592 after clearing BASE and OFFSET. */
16594 bool
16595 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
16597 rtx addr;
16599 gcc_assert (MEM_P (mem));
16601 addr = XEXP (mem, 0);
16603 if (REG_P (addr))
16605 *base = addr;
16606 *offset = const0_rtx;
16607 return true;
16610 if (GET_CODE (addr) == PLUS
16611 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
16613 *base = XEXP (addr, 0);
16614 *offset = XEXP (addr, 1);
16615 return true;
16618 *base = NULL_RTX;
16619 *offset = NULL_RTX;
16621 return false;
16624 /* Types for scheduling fusion. */
16625 enum sched_fusion_type
16627 SCHED_FUSION_NONE = 0,
16628 SCHED_FUSION_LD_SIGN_EXTEND,
16629 SCHED_FUSION_LD_ZERO_EXTEND,
16630 SCHED_FUSION_LD,
16631 SCHED_FUSION_ST,
16632 SCHED_FUSION_NUM
16635 /* If INSN is a load or store whose address is in the form [base+offset],
16636 extract the two parts and store them in BASE and OFFSET. Return the
16637 scheduling fusion type of INSN. */
16639 static enum sched_fusion_type
16640 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
16642 rtx x, dest, src;
16643 enum sched_fusion_type fusion = SCHED_FUSION_LD;
16645 gcc_assert (INSN_P (insn));
16646 x = PATTERN (insn);
16647 if (GET_CODE (x) != SET)
16648 return SCHED_FUSION_NONE;
16650 src = SET_SRC (x);
16651 dest = SET_DEST (x);
16653 machine_mode dest_mode = GET_MODE (dest);
16655 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
16656 return SCHED_FUSION_NONE;
16658 if (GET_CODE (src) == SIGN_EXTEND)
16660 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
16661 src = XEXP (src, 0);
16662 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16663 return SCHED_FUSION_NONE;
16665 else if (GET_CODE (src) == ZERO_EXTEND)
16667 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
16668 src = XEXP (src, 0);
16669 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
16670 return SCHED_FUSION_NONE;
16673 if (GET_CODE (src) == MEM && REG_P (dest))
16674 extract_base_offset_in_addr (src, base, offset);
16675 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
16677 fusion = SCHED_FUSION_ST;
16678 extract_base_offset_in_addr (dest, base, offset);
16680 else
16681 return SCHED_FUSION_NONE;
16683 if (*base == NULL_RTX || *offset == NULL_RTX)
16684 fusion = SCHED_FUSION_NONE;
16686 return fusion;
16689 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
16691 Currently we only support fusing ldr or str instructions, so FUSION_PRI
16692 and PRI are only calculated for these instructions. For other instructions,
16693 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, fusion of
16694 other instruction types can be added by returning different priorities.
16696 It's important that irrelevant instructions get the largest FUSION_PRI. */
16698 static void
16699 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
16700 int *fusion_pri, int *pri)
16702 int tmp, off_val;
16703 rtx base, offset;
16704 enum sched_fusion_type fusion;
16706 gcc_assert (INSN_P (insn));
16708 tmp = max_pri - 1;
16709 fusion = fusion_load_store (insn, &base, &offset);
16710 if (fusion == SCHED_FUSION_NONE)
16712 *pri = tmp;
16713 *fusion_pri = tmp;
16714 return;
16717 /* Set FUSION_PRI according to fusion type and base register. */
16718 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
16720 /* Calculate PRI. */
16721 tmp /= 2;
16723 /* INSN with smaller offset goes first. */
16724 off_val = (int)(INTVAL (offset));
16725 if (off_val >= 0)
16726 tmp -= (off_val & 0xfffff);
16727 else
16728 tmp += ((- off_val) & 0xfffff);
16730 *pri = tmp;
16731 return;
16734 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
16735 Adjust priority of sha1h instructions so they are scheduled before
16736 other SHA1 instructions. */
16738 static int
16739 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
16741 rtx x = PATTERN (insn);
16743 if (GET_CODE (x) == SET)
16745 x = SET_SRC (x);
16747 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
16748 return priority + 10;
16751 return priority;
16754 /* Given OPERANDS of consecutive load/store, check if we can merge
16755 them into ldp/stp. LOAD is true if they are load instructions.
16756 MODE is the mode of memory operands. */
16758 bool
16759 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
16760 machine_mode mode)
16762 HOST_WIDE_INT offval_1, offval_2, msize;
16763 enum reg_class rclass_1, rclass_2;
16764 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
16766 if (load)
16768 mem_1 = operands[1];
16769 mem_2 = operands[3];
16770 reg_1 = operands[0];
16771 reg_2 = operands[2];
16772 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
16773 if (REGNO (reg_1) == REGNO (reg_2))
16774 return false;
16776 else
16778 mem_1 = operands[0];
16779 mem_2 = operands[2];
16780 reg_1 = operands[1];
16781 reg_2 = operands[3];
16784 /* The mems cannot be volatile. */
16785 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
16786 return false;
16788 /* If we have SImode and slow unaligned ldp,
16789 check that the alignment is at least 8 bytes. */
16790 if (mode == SImode
16791 && (aarch64_tune_params.extra_tuning_flags
16792 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
16793 && !optimize_size
16794 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
16795 return false;
16797 /* Check if the addresses are in the form of [base+offset]. */
16798 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16799 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
16800 return false;
16801 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16802 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
16803 return false;
16805 /* Check if the bases are the same. */
16806 if (!rtx_equal_p (base_1, base_2))
16807 return false;
16809 /* The operands must be of the same size. */
16810 gcc_assert (known_eq (GET_MODE_SIZE (GET_MODE (mem_1)),
16811 GET_MODE_SIZE (GET_MODE (mem_2))));
16813 offval_1 = INTVAL (offset_1);
16814 offval_2 = INTVAL (offset_2);
16815 /* We should only be trying this for fixed-sized modes. There is no
16816 SVE LDP/STP instruction. */
16817 msize = GET_MODE_SIZE (mode).to_constant ();
16818 /* Check if the offsets are consecutive. */
16819 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
16820 return false;
16822 /* Check if the addresses are clobbered by the load. */
16823 if (load)
16825 if (reg_mentioned_p (reg_1, mem_1))
16826 return false;
16828 /* In increasing order, the last load can clobber the address. */
16829 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
16830 return false;
16833 /* One of the memory accesses must be a mempair operand.
16834 If it is not the first one, they need to be swapped by the
16835 peephole. */
16836 if (!aarch64_mem_pair_operand (mem_1, GET_MODE (mem_1))
16837 && !aarch64_mem_pair_operand (mem_2, GET_MODE (mem_2)))
16838 return false;
16840 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
16841 rclass_1 = FP_REGS;
16842 else
16843 rclass_1 = GENERAL_REGS;
16845 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
16846 rclass_2 = FP_REGS;
16847 else
16848 rclass_2 = GENERAL_REGS;
16850 /* Check if the registers are of the same class. */
16851 if (rclass_1 != rclass_2)
16852 return false;
16854 return true;
16857 /* Given OPERANDS of consecutive load/store that can be merged,
16858 swap them if they are not in ascending order. */
16859 void
16860 aarch64_swap_ldrstr_operands (rtx* operands, bool load)
16862 rtx mem_1, mem_2, base_1, base_2, offset_1, offset_2;
16863 HOST_WIDE_INT offval_1, offval_2;
16865 if (load)
16867 mem_1 = operands[1];
16868 mem_2 = operands[3];
16870 else
16872 mem_1 = operands[0];
16873 mem_2 = operands[2];
16876 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
16877 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
16879 offval_1 = INTVAL (offset_1);
16880 offval_2 = INTVAL (offset_2);
16882 if (offval_1 > offval_2)
16884 /* Irrespective of whether this is a load or a store,
16885 we do the same swap. */
16886 std::swap (operands[0], operands[2]);
16887 std::swap (operands[1], operands[3]);
16891 /* Taking X and Y to be HOST_WIDE_INT pointers, return the result of a
16892 comparison between the two. */
16893 int
16894 aarch64_host_wide_int_compare (const void *x, const void *y)
16896 return wi::cmps (* ((const HOST_WIDE_INT *) x),
16897 * ((const HOST_WIDE_INT *) y));
16900 /* Taking X and Y to be pairs of RTX, one pointing to a MEM rtx and the
16901 other pointing to a REG rtx containing an offset, compare the offsets
16902 of the two pairs.
16904 Return:
16906 1 iff offset (X) > offset (Y)
16907 0 iff offset (X) == offset (Y)
16908 -1 iff offset (X) < offset (Y) */
16909 int
16910 aarch64_ldrstr_offset_compare (const void *x, const void *y)
16912 const rtx * operands_1 = (const rtx *) x;
16913 const rtx * operands_2 = (const rtx *) y;
16914 rtx mem_1, mem_2, base, offset_1, offset_2;
16916 if (MEM_P (operands_1[0]))
16917 mem_1 = operands_1[0];
16918 else
16919 mem_1 = operands_1[1];
16921 if (MEM_P (operands_2[0]))
16922 mem_2 = operands_2[0];
16923 else
16924 mem_2 = operands_2[1];
16926 /* Extract the offsets. */
16927 extract_base_offset_in_addr (mem_1, &base, &offset_1);
16928 extract_base_offset_in_addr (mem_2, &base, &offset_2);
16930 gcc_assert (offset_1 != NULL_RTX && offset_2 != NULL_RTX);
16932 return wi::cmps (INTVAL (offset_1), INTVAL (offset_2));
16935 /* Given OPERANDS of consecutive load/store, check if we can merge
16936 them into ldp/stp by adjusting the offset. LOAD is true if they
16937 are load instructions. MODE is the mode of memory operands.
16939 Given below consecutive stores:
16941 str w1, [xb, 0x100]
16942 str w1, [xb, 0x104]
16943 str w1, [xb, 0x108]
16944 str w1, [xb, 0x10c]
16946 Though the offsets are out of the range supported by stp, we can
16947 still pair them after adjusting the offset, like:
16949 add scratch, xb, 0x100
16950 stp w1, w1, [scratch]
16951 stp w1, w1, [scratch, 0x8]
16953 The peephole patterns detecting this opportunity should guarantee
16954 the scratch register is available. */
16956 bool
16957 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
16958 scalar_mode mode)
16960 const int num_insns = 4;
16961 enum reg_class rclass;
16962 HOST_WIDE_INT offvals[num_insns], msize;
16963 rtx mem[num_insns], reg[num_insns], base[num_insns], offset[num_insns];
16965 if (load)
16967 for (int i = 0; i < num_insns; i++)
16969 reg[i] = operands[2 * i];
16970 mem[i] = operands[2 * i + 1];
16972 gcc_assert (REG_P (reg[i]));
16975 /* Do not attempt to merge the loads if the loads clobber each other. */
16976 for (int i = 0; i < 8; i += 2)
16977 for (int j = i + 2; j < 8; j += 2)
16978 if (reg_overlap_mentioned_p (operands[i], operands[j]))
16979 return false;
16981 else
16982 for (int i = 0; i < num_insns; i++)
16984 mem[i] = operands[2 * i];
16985 reg[i] = operands[2 * i + 1];
16988 /* Skip if memory operand is by itself valid for ldp/stp. */
16989 if (!MEM_P (mem[0]) || aarch64_mem_pair_operand (mem[0], mode))
16990 return false;
16992 for (int i = 0; i < num_insns; i++)
16994 /* The mems cannot be volatile. */
16995 if (MEM_VOLATILE_P (mem[i]))
16996 return false;
16998 /* Check if the addresses are in the form of [base+offset]. */
16999 extract_base_offset_in_addr (mem[i], base + i, offset + i);
17000 if (base[i] == NULL_RTX || offset[i] == NULL_RTX)
17001 return false;
17004 /* Check if the registers are of the same class. */
17005 rclass = REG_P (reg[0]) && FP_REGNUM_P (REGNO (reg[0]))
17006 ? FP_REGS : GENERAL_REGS;
17008 for (int i = 1; i < num_insns; i++)
17009 if (REG_P (reg[i]) && FP_REGNUM_P (REGNO (reg[i])))
17011 if (rclass != FP_REGS)
17012 return false;
17014 else
17016 if (rclass != GENERAL_REGS)
17017 return false;
17020 /* Only the last register in the order in which they occur
17021 may be clobbered by the load. */
17022 if (rclass == GENERAL_REGS && load)
17023 for (int i = 0; i < num_insns - 1; i++)
17024 if (reg_mentioned_p (reg[i], mem[i]))
17025 return false;
17027 /* Check if the bases are the same. */
17028 for (int i = 0; i < num_insns - 1; i++)
17029 if (!rtx_equal_p (base[i], base[i + 1]))
17030 return false;
17032 for (int i = 0; i < num_insns; i++)
17033 offvals[i] = INTVAL (offset[i]);
17035 msize = GET_MODE_SIZE (mode);
17037 /* Check if the offsets can be put in the right order to do a ldp/stp. */
17038 qsort (offvals, num_insns, sizeof (HOST_WIDE_INT),
17039 aarch64_host_wide_int_compare);
17041 if (!(offvals[1] == offvals[0] + msize
17042 && offvals[3] == offvals[2] + msize))
17043 return false;
17045 /* Check that offsets are within range of each other. The ldp/stp
17046 instructions have 7 bit immediate offsets, so use 0x80. */
17047 if (offvals[2] - offvals[0] >= msize * 0x80)
17048 return false;
17050 /* The offsets must be aligned with respect to each other. */
17051 if (offvals[0] % msize != offvals[2] % msize)
17052 return false;
17054 /* If we have SImode and slow unaligned ldp,
17055 check that the alignment is at least 8 bytes. */
17056 if (mode == SImode
17057 && (aarch64_tune_params.extra_tuning_flags
17058 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
17059 && !optimize_size
17060 && MEM_ALIGN (mem[0]) < 8 * BITS_PER_UNIT)
17061 return false;
17063 return true;
17066 /* Given OPERANDS of consecutive load/store, this function pairs them
17067 into LDP/STP after adjusting the offset. It depends on the fact
17068 that the operands can be sorted so the offsets are correct for STP.
17069 MODE is the mode of memory operands. CODE is the rtl operator
17070 which should be applied to all memory operands; it is SIGN_EXTEND,
17071 ZERO_EXTEND or UNKNOWN. */
17073 bool
17074 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
17075 scalar_mode mode, RTX_CODE code)
17077 rtx base, offset_1, offset_3, t1, t2;
17078 rtx mem_1, mem_2, mem_3, mem_4;
17079 rtx temp_operands[8];
17080 HOST_WIDE_INT off_val_1, off_val_3, base_off, new_off_1, new_off_3,
17081 stp_off_upper_limit, stp_off_lower_limit, msize;
17083 /* We make changes on a copy as we may still bail out. */
17084 for (int i = 0; i < 8; i ++)
17085 temp_operands[i] = operands[i];
17087 /* Sort the operands. */
17088 qsort (temp_operands, 4, 2 * sizeof (rtx *), aarch64_ldrstr_offset_compare);
17090 if (load)
17092 mem_1 = temp_operands[1];
17093 mem_2 = temp_operands[3];
17094 mem_3 = temp_operands[5];
17095 mem_4 = temp_operands[7];
17097 else
17099 mem_1 = temp_operands[0];
17100 mem_2 = temp_operands[2];
17101 mem_3 = temp_operands[4];
17102 mem_4 = temp_operands[6];
17103 gcc_assert (code == UNKNOWN);
17106 extract_base_offset_in_addr (mem_1, &base, &offset_1);
17107 extract_base_offset_in_addr (mem_3, &base, &offset_3);
17108 gcc_assert (base != NULL_RTX && offset_1 != NULL_RTX
17109 && offset_3 != NULL_RTX);
17111 /* Adjust the offset so that it can fit in an LDP/STP instruction. */
17112 msize = GET_MODE_SIZE (mode);
17113 stp_off_upper_limit = msize * (0x40 - 1);
17114 stp_off_lower_limit = - msize * 0x40;
17116 off_val_1 = INTVAL (offset_1);
17117 off_val_3 = INTVAL (offset_3);
17119 /* The base offset is optimally half way between the two STP/LDP offsets. */
17120 if (msize <= 4)
17121 base_off = (off_val_1 + off_val_3) / 2;
17122 else
17123 /* However, due to issues with negative LDP/STP offset generation for
17124 larger modes (DF, DI and vector modes), we must not use negative
17125 addresses smaller than 9 signed unadjusted bits can store. This
17126 provides the most range in this case. */
17127 base_off = off_val_1;
17129 /* Adjust the base so that it is aligned with the addresses but still
17130 optimal. */
17131 if (base_off % msize != off_val_1 % msize)
17132 /* Fix the offset, bearing in mind we want to make it bigger not
17133 smaller. */
17134 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17135 else if (msize <= 4)
17136 /* The negative range of LDP/STP is one larger than the positive range. */
17137 base_off += msize;
17139 /* Check if base offset is too big or too small. We can attempt to resolve
17140 this issue by setting it to the maximum value and seeing if the offsets
17141 still fit. */
17142 if (base_off >= 0x1000)
17144 base_off = 0x1000 - 1;
17145 /* We must still make sure that the base offset is aligned with respect
17146 to the address. But it may not be made any bigger. */
17147 base_off -= (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17150 /* Likewise for the case where the base is too small. */
17151 if (base_off <= -0x1000)
17153 base_off = -0x1000 + 1;
17154 base_off += (((base_off % msize) - (off_val_1 % msize)) + msize) % msize;
17157 /* Offset of the first STP/LDP. */
17158 new_off_1 = off_val_1 - base_off;
17160 /* Offset of the second STP/LDP. */
17161 new_off_3 = off_val_3 - base_off;
17163 /* The offsets must be within the range of the LDP/STP instructions. */
17164 if (new_off_1 > stp_off_upper_limit || new_off_1 < stp_off_lower_limit
17165 || new_off_3 > stp_off_upper_limit || new_off_3 < stp_off_lower_limit)
17166 return false;
17168 replace_equiv_address_nv (mem_1, plus_constant (Pmode, operands[8],
17169 new_off_1), true);
17170 replace_equiv_address_nv (mem_2, plus_constant (Pmode, operands[8],
17171 new_off_1 + msize), true);
17172 replace_equiv_address_nv (mem_3, plus_constant (Pmode, operands[8],
17173 new_off_3), true);
17174 replace_equiv_address_nv (mem_4, plus_constant (Pmode, operands[8],
17175 new_off_3 + msize), true);
17177 if (!aarch64_mem_pair_operand (mem_1, mode)
17178 || !aarch64_mem_pair_operand (mem_3, mode))
17179 return false;
17181 if (code == ZERO_EXTEND)
17183 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
17184 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
17185 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
17186 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
17188 else if (code == SIGN_EXTEND)
17190 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
17191 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
17192 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
17193 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
17196 if (load)
17198 operands[0] = temp_operands[0];
17199 operands[1] = mem_1;
17200 operands[2] = temp_operands[2];
17201 operands[3] = mem_2;
17202 operands[4] = temp_operands[4];
17203 operands[5] = mem_3;
17204 operands[6] = temp_operands[6];
17205 operands[7] = mem_4;
17207 else
17209 operands[0] = mem_1;
17210 operands[1] = temp_operands[1];
17211 operands[2] = mem_2;
17212 operands[3] = temp_operands[3];
17213 operands[4] = mem_3;
17214 operands[5] = temp_operands[5];
17215 operands[6] = mem_4;
17216 operands[7] = temp_operands[7];
17219 /* Emit adjusting instruction. */
17220 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, base_off)));
17221 /* Emit ldp/stp instructions. */
17222 t1 = gen_rtx_SET (operands[0], operands[1]);
17223 t2 = gen_rtx_SET (operands[2], operands[3]);
17224 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17225 t1 = gen_rtx_SET (operands[4], operands[5]);
17226 t2 = gen_rtx_SET (operands[6], operands[7]);
17227 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
17228 return true;
17231 /* Implement TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE. Assume for now that
17232 it isn't worth branching around empty masked ops (including masked
17233 stores). */
17235 static bool
17236 aarch64_empty_mask_is_expensive (unsigned)
17238 return false;
17241 /* Return true if a pseudo register should be created and used to hold
17242    the GOT address for PIC code.  */
17244 bool
17245 aarch64_use_pseudo_pic_reg (void)
17247 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
17250 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
17252 static int
17253 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
17255 switch (XINT (x, 1))
17257 case UNSPEC_GOTSMALLPIC:
17258 case UNSPEC_GOTSMALLPIC28K:
17259 case UNSPEC_GOTTINYPIC:
17260 return 0;
17261 default:
17262 break;
17265 return default_unspec_may_trap_p (x, flags);
17269 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
17270 return the log2 of that value. Otherwise return -1. */
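/* For example, 4.0 yields 2 and 1.0 yields 0, while 3.0, 0.5 and -4.0 all
   yield -1.  */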
17273 aarch64_fpconst_pow_of_2 (rtx x)
17275 const REAL_VALUE_TYPE *r;
17277 if (!CONST_DOUBLE_P (x))
17278 return -1;
17280 r = CONST_DOUBLE_REAL_VALUE (x);
17282 if (REAL_VALUE_NEGATIVE (*r)
17283 || REAL_VALUE_ISNAN (*r)
17284 || REAL_VALUE_ISINF (*r)
17285 || !real_isinteger (r, DFmode))
17286 return -1;
17288 return exact_log2 (real_to_integer (r));
17291 /* If X is a vector of equal CONST_DOUBLE values and that value is
17292 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
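/* For example, a V2DF constant of { 8.0, 8.0 } yields 3, while
   { 8.0, 4.0 } and { 3.0, 3.0 } both yield -1.  */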
17295 aarch64_vec_fpconst_pow_of_2 (rtx x)
17297 int nelts;
17298 if (GET_CODE (x) != CONST_VECTOR
17299 || !CONST_VECTOR_NUNITS (x).is_constant (&nelts))
17300 return -1;
17302 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
17303 return -1;
17305 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
17306 if (firstval <= 0)
17307 return -1;
17309 for (int i = 1; i < nelts; i++)
17310 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
17311 return -1;
17313 return firstval;
17316 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
17317 to float.
17319 __fp16 always promotes through this hook.
17320 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
17321 through the generic excess precision logic rather than here. */
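/* For example, given __fp16 a and b, the sum a + b is computed in float and
   is only converted back to __fp16 when the result is stored in an __fp16
   object.  */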
17323 static tree
17324 aarch64_promoted_type (const_tree t)
17326 if (SCALAR_FLOAT_TYPE_P (t)
17327 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
17328 return float_type_node;
17330 return NULL_TREE;
17333 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
17335 static bool
17336 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
17337 optimization_type opt_type)
17339 switch (op)
17341 case rsqrt_optab:
17342 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
17344 default:
17345 return true;
17349 /* Implement the TARGET_DWARF_POLY_INDETERMINATE_VALUE hook. */
17351 static unsigned int
17352 aarch64_dwarf_poly_indeterminate_value (unsigned int i, unsigned int *factor,
17353 int *offset)
17355 /* Polynomial invariant 1 == (VG / 2) - 1. */
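  /* For instance, on a hypothetical 256-bit SVE implementation VG (the
     vector length in 64-bit granules) is 4, so the indeterminate is
     4 / 2 - 1 == 1 and a poly_int size such as 16 + 16x evaluates to
     32 bytes.  */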
17356 gcc_assert (i == 1);
17357 *factor = 2;
17358 *offset = 1;
17359 return AARCH64_DWARF_VG;
17362 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
17363 if MODE is HFmode, and punt to the generic implementation otherwise. */
17365 static bool
17366 aarch64_libgcc_floating_mode_supported_p (scalar_float_mode mode)
17368 return (mode == HFmode
17369 ? true
17370 : default_libgcc_floating_mode_supported_p (mode));
17373 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
17374 if MODE is HFmode, and punt to the generic implementation otherwise. */
17376 static bool
17377 aarch64_scalar_mode_supported_p (scalar_mode mode)
17379 return (mode == HFmode
17380 ? true
17381 : default_scalar_mode_supported_p (mode));
17384 /* Set the value of FLT_EVAL_METHOD.
17385 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
17387 0: evaluate all operations and constants, whose semantic type has at
17388 most the range and precision of type float, to the range and
17389 precision of float; evaluate all other operations and constants to
17390 the range and precision of the semantic type;
17392 N, where _FloatN is a supported interchange floating type
17393 evaluate all operations and constants, whose semantic type has at
17394 most the range and precision of _FloatN type, to the range and
17395 precision of the _FloatN type; evaluate all other operations and
17396 constants to the range and precision of the semantic type;
17398 If we have the ARMv8.2-A extensions then we support _Float16 in native
17399 precision, so we should set this to 16. Otherwise, we support the type,
17400 but want to evaluate expressions in float precision, so set this to
17401 0. */
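/* For example, given _Float16 x and y, x * y is evaluated directly in
   _Float16 when TARGET_FP_F16INST is available (FLT_EVAL_METHOD == 16);
   otherwise the operands are converted to float, the multiplication is done
   in float, and the result is only narrowed back to _Float16 on assignment
   or conversion.  */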
17403 static enum flt_eval_method
17404 aarch64_excess_precision (enum excess_precision_type type)
17406 switch (type)
17408 case EXCESS_PRECISION_TYPE_FAST:
17409 case EXCESS_PRECISION_TYPE_STANDARD:
17410 /* We can calculate either in 16-bit range and precision or
17411 32-bit range and precision. Make that decision based on whether
17412 we have native support for the ARMv8.2-A 16-bit floating-point
17413 instructions or not. */
17414 return (TARGET_FP_F16INST
17415 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
17416 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
17417 case EXCESS_PRECISION_TYPE_IMPLICIT:
17418 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
17419 default:
17420 gcc_unreachable ();
17422 return FLT_EVAL_METHOD_UNPREDICTABLE;
17425 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
17426 scheduled for speculative execution. Reject the long-running division
17427 and square-root instructions. */
17429 static bool
17430 aarch64_sched_can_speculate_insn (rtx_insn *insn)
17432 switch (get_attr_type (insn))
17434 case TYPE_SDIV:
17435 case TYPE_UDIV:
17436 case TYPE_FDIVS:
17437 case TYPE_FDIVD:
17438 case TYPE_FSQRTS:
17439 case TYPE_FSQRTD:
17440 case TYPE_NEON_FP_SQRT_S:
17441 case TYPE_NEON_FP_SQRT_D:
17442 case TYPE_NEON_FP_SQRT_S_Q:
17443 case TYPE_NEON_FP_SQRT_D_Q:
17444 case TYPE_NEON_FP_DIV_S:
17445 case TYPE_NEON_FP_DIV_D:
17446 case TYPE_NEON_FP_DIV_S_Q:
17447 case TYPE_NEON_FP_DIV_D_Q:
17448 return false;
17449 default:
17450 return true;
17454 /* Implement TARGET_COMPUTE_PRESSURE_CLASSES. */
17456 static int
17457 aarch64_compute_pressure_classes (reg_class *classes)
17459 int i = 0;
17460 classes[i++] = GENERAL_REGS;
17461 classes[i++] = FP_REGS;
17462 /* PR_REGS isn't a useful pressure class because many predicate pseudo
17463 registers need to go in PR_LO_REGS at some point during their
17464 lifetime. Splitting it into two halves has the effect of making
17465 all predicates count against PR_LO_REGS, so that we try whenever
17466 possible to restrict the number of live predicates to 8. This
17467 greatly reduces the amount of spilling in certain loops. */
17468 classes[i++] = PR_LO_REGS;
17469 classes[i++] = PR_HI_REGS;
17470 return i;
17473 /* Implement TARGET_CAN_CHANGE_MODE_CLASS. */
17475 static bool
17476 aarch64_can_change_mode_class (machine_mode from,
17477 machine_mode to, reg_class_t)
17479 if (BYTES_BIG_ENDIAN)
17481 bool from_sve_p = aarch64_sve_data_mode_p (from);
17482 bool to_sve_p = aarch64_sve_data_mode_p (to);
17484 /* Don't allow changes between SVE data modes and non-SVE modes.
17485 See the comment at the head of aarch64-sve.md for details. */
17486 if (from_sve_p != to_sve_p)
17487 return false;
17489 /* Don't allow changes in element size: lane 0 of the new vector
17490 would not then be lane 0 of the old vector. See the comment
17491 above aarch64_maybe_expand_sve_subreg_move for a more detailed
17492 description.
17494 In the worst case, this forces a register to be spilled in
17495 one mode and reloaded in the other, which handles the
17496 endianness correctly. */
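      /* For example, a subreg between VNx8HI and VNx4SI changes the element
         size from 2 to 4 bytes and is therefore rejected here for
         big-endian targets.  */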
17497 if (from_sve_p && GET_MODE_UNIT_SIZE (from) != GET_MODE_UNIT_SIZE (to))
17498 return false;
17500 return true;
17503 /* Implement TARGET_SELECT_EARLY_REMAT_MODES.  */
17505 static void
17506 aarch64_select_early_remat_modes (sbitmap modes)
17508 /* SVE values are not normally live across a call, so it should be
17509 worth doing early rematerialization even in VL-specific mode. */
17510 for (int i = 0; i < NUM_MACHINE_MODES; ++i)
17512 machine_mode mode = (machine_mode) i;
17513 unsigned int vec_flags = aarch64_classify_vector_mode (mode);
17514 if (vec_flags & VEC_ANY_SVE)
17515 bitmap_set_bit (modes, i);
17519 /* Override the default target speculation_safe_value. */
17520 static rtx
17521 aarch64_speculation_safe_value (machine_mode mode,
17522 rtx result, rtx val, rtx failval)
17524 /* Maybe we should warn if falling back to hard barriers. They are
17525    likely to be noticeably more expensive than the alternative below.  */
17526 if (!aarch64_track_speculation)
17527 return default_speculation_safe_value (mode, result, val, failval);
17529 if (!REG_P (val))
17530 val = copy_to_mode_reg (mode, val);
17532 if (!aarch64_reg_or_zero (failval, mode))
17533 failval = copy_to_mode_reg (mode, failval);
17535 switch (mode)
17537 case E_QImode:
17538 emit_insn (gen_despeculate_copyqi (result, val, failval));
17539 break;
17540 case E_HImode:
17541 emit_insn (gen_despeculate_copyhi (result, val, failval));
17542 break;
17543 case E_SImode:
17544 emit_insn (gen_despeculate_copysi (result, val, failval));
17545 break;
17546 case E_DImode:
17547 emit_insn (gen_despeculate_copydi (result, val, failval));
17548 break;
17549 case E_TImode:
17550 emit_insn (gen_despeculate_copyti (result, val, failval));
17551 break;
17552 default:
17553 gcc_unreachable ();
17555 return result;
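/* For example, a source-level call such as
   __builtin_speculation_safe_value (idx, 0) reaches this hook; with
   -mtrack-speculation it expands to one of the despeculate_copy patterns
   used above, and otherwise it falls back to the generic hard-barrier
   implementation.  */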
17558 /* Target-specific selftests. */
17560 #if CHECKING_P
17562 namespace selftest {
17564 /* Selftest for the RTL loader.
17565 Verify that the RTL loader copes with a dump from
17566 print_rtx_function. This is essentially just a test that class
17567 function_reader can handle a real dump, but it also verifies
17568 that lookup_reg_by_dump_name correctly handles hard regs.
17569 The presence of hard reg names in the dump means that the test is
17570 target-specific, hence it is in this file. */
17572 static void
17573 aarch64_test_loading_full_dump ()
17575 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
17577 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
17579 rtx_insn *insn_1 = get_insn_by_uid (1);
17580 ASSERT_EQ (NOTE, GET_CODE (insn_1));
17582 rtx_insn *insn_15 = get_insn_by_uid (15);
17583 ASSERT_EQ (INSN, GET_CODE (insn_15));
17584 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
17586 /* Verify crtl->return_rtx. */
17587 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
17588 ASSERT_EQ (0, REGNO (crtl->return_rtx));
17589 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
17592 /* Run all target-specific selftests. */
17594 static void
17595 aarch64_run_selftests (void)
17597 aarch64_test_loading_full_dump ();
17600 } // namespace selftest
17602 #endif /* #if CHECKING_P */
17604 #undef TARGET_ADDRESS_COST
17605 #define TARGET_ADDRESS_COST aarch64_address_cost
17607 /* This hook determines whether unnamed bitfields affect the alignment
17608 of the containing structure. The hook returns true if the structure
17609 should inherit the alignment requirements of an unnamed bitfield's
17610 type. */
17611 #undef TARGET_ALIGN_ANON_BITFIELD
17612 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
17614 #undef TARGET_ASM_ALIGNED_DI_OP
17615 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
17617 #undef TARGET_ASM_ALIGNED_HI_OP
17618 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
17620 #undef TARGET_ASM_ALIGNED_SI_OP
17621 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
17623 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
17624 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
17625 hook_bool_const_tree_hwi_hwi_const_tree_true
17627 #undef TARGET_ASM_FILE_START
17628 #define TARGET_ASM_FILE_START aarch64_start_file
17630 #undef TARGET_ASM_OUTPUT_MI_THUNK
17631 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
17633 #undef TARGET_ASM_SELECT_RTX_SECTION
17634 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
17636 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
17637 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
17639 #undef TARGET_BUILD_BUILTIN_VA_LIST
17640 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
17642 #undef TARGET_CALLEE_COPIES
17643 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
17645 #undef TARGET_CAN_ELIMINATE
17646 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
17648 #undef TARGET_CAN_INLINE_P
17649 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
17651 #undef TARGET_CANNOT_FORCE_CONST_MEM
17652 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
17654 #undef TARGET_CASE_VALUES_THRESHOLD
17655 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
17657 #undef TARGET_CONDITIONAL_REGISTER_USAGE
17658 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
17660 /* Only the least significant bit is used for initialization guard
17661 variables. */
17662 #undef TARGET_CXX_GUARD_MASK_BIT
17663 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
17665 #undef TARGET_C_MODE_FOR_SUFFIX
17666 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
17668 #ifdef TARGET_BIG_ENDIAN_DEFAULT
17669 #undef TARGET_DEFAULT_TARGET_FLAGS
17670 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
17671 #endif
17673 #undef TARGET_CLASS_MAX_NREGS
17674 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
17676 #undef TARGET_BUILTIN_DECL
17677 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
17679 #undef TARGET_BUILTIN_RECIPROCAL
17680 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
17682 #undef TARGET_C_EXCESS_PRECISION
17683 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
17685 #undef TARGET_EXPAND_BUILTIN
17686 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
17688 #undef TARGET_EXPAND_BUILTIN_VA_START
17689 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
17691 #undef TARGET_FOLD_BUILTIN
17692 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
17694 #undef TARGET_FUNCTION_ARG
17695 #define TARGET_FUNCTION_ARG aarch64_function_arg
17697 #undef TARGET_FUNCTION_ARG_ADVANCE
17698 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
17700 #undef TARGET_FUNCTION_ARG_BOUNDARY
17701 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
17703 #undef TARGET_FUNCTION_ARG_PADDING
17704 #define TARGET_FUNCTION_ARG_PADDING aarch64_function_arg_padding
17706 #undef TARGET_GET_RAW_RESULT_MODE
17707 #define TARGET_GET_RAW_RESULT_MODE aarch64_get_reg_raw_mode
17708 #undef TARGET_GET_RAW_ARG_MODE
17709 #define TARGET_GET_RAW_ARG_MODE aarch64_get_reg_raw_mode
17711 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
17712 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
17714 #undef TARGET_FUNCTION_VALUE
17715 #define TARGET_FUNCTION_VALUE aarch64_function_value
17717 #undef TARGET_FUNCTION_VALUE_REGNO_P
17718 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
17720 #undef TARGET_GIMPLE_FOLD_BUILTIN
17721 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
17723 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
17724 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
17726 #undef TARGET_INIT_BUILTINS
17727 #define TARGET_INIT_BUILTINS aarch64_init_builtins
17729 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
17730 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
17731 aarch64_ira_change_pseudo_allocno_class
17733 #undef TARGET_LEGITIMATE_ADDRESS_P
17734 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
17736 #undef TARGET_LEGITIMATE_CONSTANT_P
17737 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
17739 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
17740 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
17741 aarch64_legitimize_address_displacement
17743 #undef TARGET_LIBGCC_CMP_RETURN_MODE
17744 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
17746 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
17747 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
17748 aarch64_libgcc_floating_mode_supported_p
17750 #undef TARGET_MANGLE_TYPE
17751 #define TARGET_MANGLE_TYPE aarch64_mangle_type
17753 #undef TARGET_MEMORY_MOVE_COST
17754 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
17756 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
17757 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
17759 #undef TARGET_MUST_PASS_IN_STACK
17760 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
17762 /* This target hook should return true if accesses to volatile bitfields
17763 should use the narrowest mode possible. It should return false if these
17764 accesses should use the bitfield container type. */
17765 #undef TARGET_NARROW_VOLATILE_BITFIELD
17766 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
17768 #undef TARGET_OPTION_OVERRIDE
17769 #define TARGET_OPTION_OVERRIDE aarch64_override_options
17771 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
17772 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
17773 aarch64_override_options_after_change
17775 #undef TARGET_OPTION_SAVE
17776 #define TARGET_OPTION_SAVE aarch64_option_save
17778 #undef TARGET_OPTION_RESTORE
17779 #define TARGET_OPTION_RESTORE aarch64_option_restore
17781 #undef TARGET_OPTION_PRINT
17782 #define TARGET_OPTION_PRINT aarch64_option_print
17784 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
17785 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
17787 #undef TARGET_SET_CURRENT_FUNCTION
17788 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
17790 #undef TARGET_PASS_BY_REFERENCE
17791 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
17793 #undef TARGET_PREFERRED_RELOAD_CLASS
17794 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
17796 #undef TARGET_SCHED_REASSOCIATION_WIDTH
17797 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
17799 #undef TARGET_PROMOTED_TYPE
17800 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
17802 #undef TARGET_SECONDARY_RELOAD
17803 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
17805 #undef TARGET_SHIFT_TRUNCATION_MASK
17806 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
17808 #undef TARGET_SETUP_INCOMING_VARARGS
17809 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
17811 #undef TARGET_STRUCT_VALUE_RTX
17812 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
17814 #undef TARGET_REGISTER_MOVE_COST
17815 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
17817 #undef TARGET_RETURN_IN_MEMORY
17818 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
17820 #undef TARGET_RETURN_IN_MSB
17821 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
17823 #undef TARGET_RTX_COSTS
17824 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
17826 #undef TARGET_SCALAR_MODE_SUPPORTED_P
17827 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
17829 #undef TARGET_SCHED_ISSUE_RATE
17830 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
17832 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
17833 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
17834 aarch64_sched_first_cycle_multipass_dfa_lookahead
17836 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
17837 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
17838 aarch64_first_cycle_multipass_dfa_lookahead_guard
17840 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
17841 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
17842 aarch64_get_separate_components
17844 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
17845 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
17846 aarch64_components_for_bb
17848 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
17849 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
17850 aarch64_disqualify_components
17852 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
17853 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
17854 aarch64_emit_prologue_components
17856 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
17857 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
17858 aarch64_emit_epilogue_components
17860 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
17861 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
17862 aarch64_set_handled_components
17864 #undef TARGET_TRAMPOLINE_INIT
17865 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
17867 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
17868 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
17870 #undef TARGET_VECTOR_MODE_SUPPORTED_P
17871 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
17873 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
17874 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
17875 aarch64_builtin_support_vector_misalignment
17877 #undef TARGET_ARRAY_MODE
17878 #define TARGET_ARRAY_MODE aarch64_array_mode
17880 #undef TARGET_ARRAY_MODE_SUPPORTED_P
17881 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
17883 #undef TARGET_VECTORIZE_ADD_STMT_COST
17884 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
17886 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
17887 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
17888 aarch64_builtin_vectorization_cost
17890 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
17891 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
17893 #undef TARGET_VECTORIZE_BUILTINS
17894 #define TARGET_VECTORIZE_BUILTINS
17896 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
17897 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
17898 aarch64_builtin_vectorized_function
17900 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
17901 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
17902 aarch64_autovectorize_vector_sizes
17904 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
17905 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
17906 aarch64_atomic_assign_expand_fenv
17908 /* Section anchor support. */
17910 #undef TARGET_MIN_ANCHOR_OFFSET
17911 #define TARGET_MIN_ANCHOR_OFFSET -256
17913 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
17914 byte offset; we can do much more for larger data types, but have no way
17915 to determine the size of the access. We assume accesses are aligned. */
17916 #undef TARGET_MAX_ANCHOR_OFFSET
17917 #define TARGET_MAX_ANCHOR_OFFSET 4095
17919 #undef TARGET_VECTOR_ALIGNMENT
17920 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
17922 #undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
17923 #define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
17924 aarch64_vectorize_preferred_vector_alignment
17925 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
17926 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
17927 aarch64_simd_vector_alignment_reachable
17929 /* vec_perm support. */
17931 #undef TARGET_VECTORIZE_VEC_PERM_CONST
17932 #define TARGET_VECTORIZE_VEC_PERM_CONST \
17933 aarch64_vectorize_vec_perm_const
17935 #undef TARGET_VECTORIZE_GET_MASK_MODE
17936 #define TARGET_VECTORIZE_GET_MASK_MODE aarch64_get_mask_mode
17937 #undef TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE
17938 #define TARGET_VECTORIZE_EMPTY_MASK_IS_EXPENSIVE \
17939 aarch64_empty_mask_is_expensive
17940 #undef TARGET_PREFERRED_ELSE_VALUE
17941 #define TARGET_PREFERRED_ELSE_VALUE \
17942 aarch64_preferred_else_value
17944 #undef TARGET_INIT_LIBFUNCS
17945 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
17947 #undef TARGET_FIXED_CONDITION_CODE_REGS
17948 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
17950 #undef TARGET_FLAGS_REGNUM
17951 #define TARGET_FLAGS_REGNUM CC_REGNUM
17953 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
17954 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
17956 #undef TARGET_ASAN_SHADOW_OFFSET
17957 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
17959 #undef TARGET_LEGITIMIZE_ADDRESS
17960 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
17962 #undef TARGET_SCHED_CAN_SPECULATE_INSN
17963 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
17965 #undef TARGET_CAN_USE_DOLOOP_P
17966 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
17968 #undef TARGET_SCHED_ADJUST_PRIORITY
17969 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
17971 #undef TARGET_SCHED_MACRO_FUSION_P
17972 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
17974 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
17975 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
17977 #undef TARGET_SCHED_FUSION_PRIORITY
17978 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
17980 #undef TARGET_UNSPEC_MAY_TRAP_P
17981 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
17983 #undef TARGET_USE_PSEUDO_PIC_REG
17984 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
17986 #undef TARGET_PRINT_OPERAND
17987 #define TARGET_PRINT_OPERAND aarch64_print_operand
17989 #undef TARGET_PRINT_OPERAND_ADDRESS
17990 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
17992 #undef TARGET_OPTAB_SUPPORTED_P
17993 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
17995 #undef TARGET_OMIT_STRUCT_RETURN_REG
17996 #define TARGET_OMIT_STRUCT_RETURN_REG true
17998 #undef TARGET_DWARF_POLY_INDETERMINATE_VALUE
17999 #define TARGET_DWARF_POLY_INDETERMINATE_VALUE \
18000 aarch64_dwarf_poly_indeterminate_value
18002 /* The architecture reserves bits 0 and 1, so use bit 2 for descriptors.  */
18003 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
18004 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
18006 #undef TARGET_HARD_REGNO_NREGS
18007 #define TARGET_HARD_REGNO_NREGS aarch64_hard_regno_nregs
18008 #undef TARGET_HARD_REGNO_MODE_OK
18009 #define TARGET_HARD_REGNO_MODE_OK aarch64_hard_regno_mode_ok
18011 #undef TARGET_MODES_TIEABLE_P
18012 #define TARGET_MODES_TIEABLE_P aarch64_modes_tieable_p
18014 #undef TARGET_HARD_REGNO_CALL_PART_CLOBBERED
18015 #define TARGET_HARD_REGNO_CALL_PART_CLOBBERED \
18016 aarch64_hard_regno_call_part_clobbered
18018 #undef TARGET_CONSTANT_ALIGNMENT
18019 #define TARGET_CONSTANT_ALIGNMENT aarch64_constant_alignment
18021 #undef TARGET_COMPUTE_PRESSURE_CLASSES
18022 #define TARGET_COMPUTE_PRESSURE_CLASSES aarch64_compute_pressure_classes
18024 #undef TARGET_CAN_CHANGE_MODE_CLASS
18025 #define TARGET_CAN_CHANGE_MODE_CLASS aarch64_can_change_mode_class
18027 #undef TARGET_SELECT_EARLY_REMAT_MODES
18028 #define TARGET_SELECT_EARLY_REMAT_MODES aarch64_select_early_remat_modes
18030 #undef TARGET_SPECULATION_SAFE_VALUE
18031 #define TARGET_SPECULATION_SAFE_VALUE aarch64_speculation_safe_value
18033 #if CHECKING_P
18034 #undef TARGET_RUN_TARGET_SELFTESTS
18035 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
18036 #endif /* #if CHECKING_P */
18038 struct gcc_target targetm = TARGET_INITIALIZER;
18040 #include "gt-aarch64.h"