[AArch64] Separate shrink wrapping hooks implementation
gcc/config/aarch64/aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 /* This file should be included last. */
69 #include "target-def.h"
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
74 /* Classifies an address.
76 ADDRESS_REG_IMM
77 A simple base register plus immediate offset.
79 ADDRESS_REG_WB
80 A base register indexed by immediate offset with writeback.
82 ADDRESS_REG_REG
83 A base register indexed by (optionally scaled) register.
85 ADDRESS_REG_UXTW
86 A base register indexed by (optionally scaled) zero-extended register.
88 ADDRESS_REG_SXTW
89 A base register indexed by (optionally scaled) sign-extended register.
91 ADDRESS_LO_SUM
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
94 ADDRESS_SYMBOLIC:
95 A constant symbolic address, in pc-relative literal pool. */
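/* Purely for illustration (these are assumed typical assembly spellings,
   not anything checked by the enum below), the classes roughly correspond
   to the following addressing forms:

     ADDRESS_REG_IMM    [x0, #16]
     ADDRESS_REG_WB     [x0, #16]!  or  [x0], #16
     ADDRESS_REG_REG    [x0, x1, lsl #3]
     ADDRESS_REG_UXTW   [x0, w1, uxtw #2]
     ADDRESS_REG_SXTW   [x0, w1, sxtw #2]
     ADDRESS_LO_SUM     [x0, #:lo12:symbol]
     ADDRESS_SYMBOLIC   a pc-relative literal-pool reference.  */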
97 enum aarch64_address_type {
98 ADDRESS_REG_IMM,
99 ADDRESS_REG_WB,
100 ADDRESS_REG_REG,
101 ADDRESS_REG_UXTW,
102 ADDRESS_REG_SXTW,
103 ADDRESS_LO_SUM,
104 ADDRESS_SYMBOLIC
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
109 rtx base;
110 rtx offset;
111 int shift;
112 enum aarch64_symbol_type symbol_type;
115 struct simd_immediate_info
117 rtx value;
118 int shift;
119 int element_width;
120 bool mvn;
121 bool msl;
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
127 #ifdef HAVE_AS_TLS
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
130 #endif
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
134 const_tree,
135 machine_mode *, int *,
136 bool *);
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
154 /* Global flag for PC relative loads. */
155 bool aarch64_pcrelative_literal_loads;
157 /* Support for command line parsing of boolean flags in the tuning
158 structures. */
159 struct aarch64_flag_desc
161 const char* name;
162 unsigned int flag;
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
175 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
176 { name, AARCH64_EXTRA_TUNE_##internal_name },
177 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
179 { "none", AARCH64_EXTRA_TUNE_NONE },
180 #include "aarch64-tuning-flags.def"
181 { "all", AARCH64_EXTRA_TUNE_ALL },
182 { NULL, AARCH64_EXTRA_TUNE_NONE }
185 /* Tuning parameters. */
187 static const struct cpu_addrcost_table generic_addrcost_table =
190 0, /* hi */
191 0, /* si */
192 0, /* di */
193 0, /* ti */
195 0, /* pre_modify */
196 0, /* post_modify */
197 0, /* register_offset */
198 0, /* register_sextend */
199 0, /* register_zextend */
200 0 /* imm_offset */
203 static const struct cpu_addrcost_table cortexa57_addrcost_table =
206 1, /* hi */
207 0, /* si */
208 0, /* di */
209 1, /* ti */
211 0, /* pre_modify */
212 0, /* post_modify */
213 0, /* register_offset */
214 0, /* register_sextend */
215 0, /* register_zextend */
216 0, /* imm_offset */
219 static const struct cpu_addrcost_table exynosm1_addrcost_table =
222 0, /* hi */
223 0, /* si */
224 0, /* di */
225 2, /* ti */
227 0, /* pre_modify */
228 0, /* post_modify */
229 1, /* register_offset */
230 1, /* register_sextend */
231 2, /* register_zextend */
232 0, /* imm_offset */
235 static const struct cpu_addrcost_table xgene1_addrcost_table =
238 1, /* hi */
239 0, /* si */
240 0, /* di */
241 1, /* ti */
243 1, /* pre_modify */
244 0, /* post_modify */
245 0, /* register_offset */
246 1, /* register_sextend */
247 1, /* register_zextend */
248 0, /* imm_offset */
251 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
254 1, /* hi */
255 0, /* si */
256 0, /* di */
257 1, /* ti */
259 0, /* pre_modify */
260 0, /* post_modify */
261 0, /* register_offset */
262 0, /* register_sextend */
263 0, /* register_zextend */
264 0 /* imm_offset */
267 static const struct cpu_addrcost_table vulcan_addrcost_table =
270 0, /* hi */
271 0, /* si */
272 0, /* di */
273 2, /* ti */
275 0, /* pre_modify */
276 0, /* post_modify */
277 2, /* register_offset */
278 3, /* register_sextend */
279 3, /* register_zextend */
280 0, /* imm_offset */
283 static const struct cpu_regmove_cost generic_regmove_cost =
285 1, /* GP2GP */
286 /* Avoid the use of slow int<->fp moves for spilling by setting
287 their cost higher than memmov_cost. */
288 5, /* GP2FP */
289 5, /* FP2GP */
290 2 /* FP2FP */
293 static const struct cpu_regmove_cost cortexa57_regmove_cost =
295 1, /* GP2GP */
296 /* Avoid the use of slow int<->fp moves for spilling by setting
297 their cost higher than memmov_cost. */
298 5, /* GP2FP */
299 5, /* FP2GP */
300 2 /* FP2FP */
303 static const struct cpu_regmove_cost cortexa53_regmove_cost =
305 1, /* GP2GP */
306 /* Avoid the use of slow int<->fp moves for spilling by setting
307 their cost higher than memmov_cost. */
308 5, /* GP2FP */
309 5, /* FP2GP */
310 2 /* FP2FP */
313 static const struct cpu_regmove_cost exynosm1_regmove_cost =
315 1, /* GP2GP */
316 /* Avoid the use of slow int<->fp moves for spilling by setting
317 their cost higher than memmov_cost (the actual costs are 4 and 9). */
318 9, /* GP2FP */
319 9, /* FP2GP */
320 1 /* FP2FP */
323 static const struct cpu_regmove_cost thunderx_regmove_cost =
325 2, /* GP2GP */
326 2, /* GP2FP */
327 6, /* FP2GP */
328 4 /* FP2FP */
331 static const struct cpu_regmove_cost xgene1_regmove_cost =
333 1, /* GP2GP */
334 /* Avoid the use of slow int<->fp moves for spilling by setting
335 their cost higher than memmov_cost. */
336 8, /* GP2FP */
337 8, /* FP2GP */
338 2 /* FP2FP */
341 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
343 2, /* GP2GP */
344 /* Avoid the use of int<->fp moves for spilling. */
345 6, /* GP2FP */
346 6, /* FP2GP */
347 4 /* FP2FP */
350 static const struct cpu_regmove_cost vulcan_regmove_cost =
352 1, /* GP2GP */
353 /* Avoid the use of int<->fp moves for spilling. */
354 8, /* GP2FP */
355 8, /* FP2GP */
356 4 /* FP2FP */
359 /* Generic costs for vector insn classes. */
360 static const struct cpu_vector_cost generic_vector_cost =
362 1, /* scalar_stmt_cost */
363 1, /* scalar_load_cost */
364 1, /* scalar_store_cost */
365 1, /* vec_stmt_cost */
366 2, /* vec_permute_cost */
367 1, /* vec_to_scalar_cost */
368 1, /* scalar_to_vec_cost */
369 1, /* vec_align_load_cost */
370 1, /* vec_unalign_load_cost */
371 1, /* vec_unalign_store_cost */
372 1, /* vec_store_cost */
373 3, /* cond_taken_branch_cost */
374 1 /* cond_not_taken_branch_cost */
377 /* ThunderX costs for vector insn classes. */
378 static const struct cpu_vector_cost thunderx_vector_cost =
380 1, /* scalar_stmt_cost */
381 3, /* scalar_load_cost */
382 1, /* scalar_store_cost */
383 4, /* vec_stmt_cost */
384 4, /* vec_permute_cost */
385 2, /* vec_to_scalar_cost */
386 2, /* scalar_to_vec_cost */
387 3, /* vec_align_load_cost */
388 10, /* vec_unalign_load_cost */
389 10, /* vec_unalign_store_cost */
390 1, /* vec_store_cost */
391 3, /* cond_taken_branch_cost */
392 3 /* cond_not_taken_branch_cost */
395 /* Generic costs for vector insn classes. */
396 static const struct cpu_vector_cost cortexa57_vector_cost =
398 1, /* scalar_stmt_cost */
399 4, /* scalar_load_cost */
400 1, /* scalar_store_cost */
401 2, /* vec_stmt_cost */
402 3, /* vec_permute_cost */
403 8, /* vec_to_scalar_cost */
404 8, /* scalar_to_vec_cost */
405 4, /* vec_align_load_cost */
406 4, /* vec_unalign_load_cost */
407 1, /* vec_unalign_store_cost */
408 1, /* vec_store_cost */
409 1, /* cond_taken_branch_cost */
410 1 /* cond_not_taken_branch_cost */
413 static const struct cpu_vector_cost exynosm1_vector_cost =
415 1, /* scalar_stmt_cost */
416 5, /* scalar_load_cost */
417 1, /* scalar_store_cost */
418 3, /* vec_stmt_cost */
419 3, /* vec_permute_cost */
420 3, /* vec_to_scalar_cost */
421 3, /* scalar_to_vec_cost */
422 5, /* vec_align_load_cost */
423 5, /* vec_unalign_load_cost */
424 1, /* vec_unalign_store_cost */
425 1, /* vec_store_cost */
426 1, /* cond_taken_branch_cost */
427 1 /* cond_not_taken_branch_cost */
430 /* Generic costs for vector insn classes. */
431 static const struct cpu_vector_cost xgene1_vector_cost =
433 1, /* scalar_stmt_cost */
434 5, /* scalar_load_cost */
435 1, /* scalar_store_cost */
436 2, /* vec_stmt_cost */
437 2, /* vec_permute_cost */
438 4, /* vec_to_scalar_cost */
439 4, /* scalar_to_vec_cost */
440 10, /* vec_align_load_cost */
441 10, /* vec_unalign_load_cost */
442 2, /* vec_unalign_store_cost */
443 2, /* vec_store_cost */
444 2, /* cond_taken_branch_cost */
445 1 /* cond_not_taken_branch_cost */
448 /* Costs for vector insn classes for Vulcan. */
449 static const struct cpu_vector_cost vulcan_vector_cost =
451 6, /* scalar_stmt_cost */
452 4, /* scalar_load_cost */
453 1, /* scalar_store_cost */
454 6, /* vec_stmt_cost */
455 3, /* vec_permute_cost */
456 6, /* vec_to_scalar_cost */
457 5, /* scalar_to_vec_cost */
458 8, /* vec_align_load_cost */
459 8, /* vec_unalign_load_cost */
460 4, /* vec_unalign_store_cost */
461 4, /* vec_store_cost */
462 2, /* cond_taken_branch_cost */
463 1 /* cond_not_taken_branch_cost */
466 /* Generic costs for branch instructions. */
467 static const struct cpu_branch_cost generic_branch_cost =
469 2, /* Predictable. */
470 2 /* Unpredictable. */
473 /* Branch costs for Cortex-A57. */
474 static const struct cpu_branch_cost cortexa57_branch_cost =
476 1, /* Predictable. */
477 3 /* Unpredictable. */
480 /* Branch costs for Vulcan. */
481 static const struct cpu_branch_cost vulcan_branch_cost =
483 1, /* Predictable. */
484 3 /* Unpredictable. */
487 /* Generic approximation modes. */
488 static const cpu_approx_modes generic_approx_modes =
490 AARCH64_APPROX_NONE, /* division */
491 AARCH64_APPROX_NONE, /* sqrt */
492 AARCH64_APPROX_NONE /* recip_sqrt */
495 /* Approximation modes for Exynos M1. */
496 static const cpu_approx_modes exynosm1_approx_modes =
498 AARCH64_APPROX_NONE, /* division */
499 AARCH64_APPROX_ALL, /* sqrt */
500 AARCH64_APPROX_ALL /* recip_sqrt */
503 /* Approximation modes for X-Gene 1. */
504 static const cpu_approx_modes xgene1_approx_modes =
506 AARCH64_APPROX_NONE, /* division */
507 AARCH64_APPROX_NONE, /* sqrt */
508 AARCH64_APPROX_ALL /* recip_sqrt */
511 static const struct tune_params generic_tunings =
513 &cortexa57_extra_costs,
514 &generic_addrcost_table,
515 &generic_regmove_cost,
516 &generic_vector_cost,
517 &generic_branch_cost,
518 &generic_approx_modes,
519 4, /* memmov_cost */
520 2, /* issue_rate */
521 AARCH64_FUSE_NOTHING, /* fusible_ops */
522 8, /* function_align. */
523 8, /* jump_align. */
524 4, /* loop_align. */
525 2, /* int_reassoc_width. */
526 4, /* fp_reassoc_width. */
527 1, /* vec_reassoc_width. */
528 2, /* min_div_recip_mul_sf. */
529 2, /* min_div_recip_mul_df. */
530 0, /* max_case_values. */
531 0, /* cache_line_size. */
532 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
533 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
536 static const struct tune_params cortexa35_tunings =
538 &cortexa53_extra_costs,
539 &generic_addrcost_table,
540 &cortexa53_regmove_cost,
541 &generic_vector_cost,
542 &cortexa57_branch_cost,
543 &generic_approx_modes,
544 4, /* memmov_cost */
545 1, /* issue_rate */
546 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
547 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
548 16, /* function_align. */
549 8, /* jump_align. */
550 8, /* loop_align. */
551 2, /* int_reassoc_width. */
552 4, /* fp_reassoc_width. */
553 1, /* vec_reassoc_width. */
554 2, /* min_div_recip_mul_sf. */
555 2, /* min_div_recip_mul_df. */
556 0, /* max_case_values. */
557 0, /* cache_line_size. */
558 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
559 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
562 static const struct tune_params cortexa53_tunings =
564 &cortexa53_extra_costs,
565 &generic_addrcost_table,
566 &cortexa53_regmove_cost,
567 &generic_vector_cost,
568 &cortexa57_branch_cost,
569 &generic_approx_modes,
570 4, /* memmov_cost */
571 2, /* issue_rate */
572 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
573 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
574 16, /* function_align. */
575 8, /* jump_align. */
576 8, /* loop_align. */
577 2, /* int_reassoc_width. */
578 4, /* fp_reassoc_width. */
579 1, /* vec_reassoc_width. */
580 2, /* min_div_recip_mul_sf. */
581 2, /* min_div_recip_mul_df. */
582 0, /* max_case_values. */
583 0, /* cache_line_size. */
584 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
585 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
588 static const struct tune_params cortexa57_tunings =
590 &cortexa57_extra_costs,
591 &cortexa57_addrcost_table,
592 &cortexa57_regmove_cost,
593 &cortexa57_vector_cost,
594 &cortexa57_branch_cost,
595 &generic_approx_modes,
596 4, /* memmov_cost */
597 3, /* issue_rate */
598 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
599 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
600 16, /* function_align. */
601 8, /* jump_align. */
602 8, /* loop_align. */
603 2, /* int_reassoc_width. */
604 4, /* fp_reassoc_width. */
605 1, /* vec_reassoc_width. */
606 2, /* min_div_recip_mul_sf. */
607 2, /* min_div_recip_mul_df. */
608 0, /* max_case_values. */
609 0, /* cache_line_size. */
610 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
611 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
614 static const struct tune_params cortexa72_tunings =
616 &cortexa57_extra_costs,
617 &cortexa57_addrcost_table,
618 &cortexa57_regmove_cost,
619 &cortexa57_vector_cost,
620 &cortexa57_branch_cost,
621 &generic_approx_modes,
622 4, /* memmov_cost */
623 3, /* issue_rate */
624 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
625 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
626 16, /* function_align. */
627 8, /* jump_align. */
628 8, /* loop_align. */
629 2, /* int_reassoc_width. */
630 4, /* fp_reassoc_width. */
631 1, /* vec_reassoc_width. */
632 2, /* min_div_recip_mul_sf. */
633 2, /* min_div_recip_mul_df. */
634 0, /* max_case_values. */
635 0, /* cache_line_size. */
636 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
637 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
640 static const struct tune_params cortexa73_tunings =
642 &cortexa57_extra_costs,
643 &cortexa57_addrcost_table,
644 &cortexa57_regmove_cost,
645 &cortexa57_vector_cost,
646 &cortexa57_branch_cost,
647 &generic_approx_modes,
648 4, /* memmov_cost. */
649 2, /* issue_rate. */
650 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
651 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
652 16, /* function_align. */
653 8, /* jump_align. */
654 8, /* loop_align. */
655 2, /* int_reassoc_width. */
656 4, /* fp_reassoc_width. */
657 1, /* vec_reassoc_width. */
658 2, /* min_div_recip_mul_sf. */
659 2, /* min_div_recip_mul_df. */
660 0, /* max_case_values. */
661 0, /* cache_line_size. */
662 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
663 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
666 static const struct tune_params exynosm1_tunings =
668 &exynosm1_extra_costs,
669 &exynosm1_addrcost_table,
670 &exynosm1_regmove_cost,
671 &exynosm1_vector_cost,
672 &generic_branch_cost,
673 &exynosm1_approx_modes,
674 4, /* memmov_cost */
675 3, /* issue_rate */
676 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
677 4, /* function_align. */
678 4, /* jump_align. */
679 4, /* loop_align. */
680 2, /* int_reassoc_width. */
681 4, /* fp_reassoc_width. */
682 1, /* vec_reassoc_width. */
683 2, /* min_div_recip_mul_sf. */
684 2, /* min_div_recip_mul_df. */
685 48, /* max_case_values. */
686 64, /* cache_line_size. */
687 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
688 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
691 static const struct tune_params thunderx_tunings =
693 &thunderx_extra_costs,
694 &generic_addrcost_table,
695 &thunderx_regmove_cost,
696 &thunderx_vector_cost,
697 &generic_branch_cost,
698 &generic_approx_modes,
699 6, /* memmov_cost */
700 2, /* issue_rate */
701 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
702 8, /* function_align. */
703 8, /* jump_align. */
704 8, /* loop_align. */
705 2, /* int_reassoc_width. */
706 4, /* fp_reassoc_width. */
707 1, /* vec_reassoc_width. */
708 2, /* min_div_recip_mul_sf. */
709 2, /* min_div_recip_mul_df. */
710 0, /* max_case_values. */
711 0, /* cache_line_size. */
712 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
713 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
716 static const struct tune_params xgene1_tunings =
718 &xgene1_extra_costs,
719 &xgene1_addrcost_table,
720 &xgene1_regmove_cost,
721 &xgene1_vector_cost,
722 &generic_branch_cost,
723 &xgene1_approx_modes,
724 6, /* memmov_cost */
725 4, /* issue_rate */
726 AARCH64_FUSE_NOTHING, /* fusible_ops */
727 16, /* function_align. */
728 8, /* jump_align. */
729 16, /* loop_align. */
730 2, /* int_reassoc_width. */
731 4, /* fp_reassoc_width. */
732 1, /* vec_reassoc_width. */
733 2, /* min_div_recip_mul_sf. */
734 2, /* min_div_recip_mul_df. */
735 0, /* max_case_values. */
736 0, /* cache_line_size. */
737 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
738 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
741 static const struct tune_params qdf24xx_tunings =
743 &qdf24xx_extra_costs,
744 &qdf24xx_addrcost_table,
745 &qdf24xx_regmove_cost,
746 &generic_vector_cost,
747 &generic_branch_cost,
748 &generic_approx_modes,
749 4, /* memmov_cost */
750 4, /* issue_rate */
751 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
752 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
753 16, /* function_align. */
754 8, /* jump_align. */
755 16, /* loop_align. */
756 2, /* int_reassoc_width. */
757 4, /* fp_reassoc_width. */
758 1, /* vec_reassoc_width. */
759 2, /* min_div_recip_mul_sf. */
760 2, /* min_div_recip_mul_df. */
761 0, /* max_case_values. */
762 64, /* cache_line_size. */
763 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
764 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
767 static const struct tune_params vulcan_tunings =
769 &vulcan_extra_costs,
770 &vulcan_addrcost_table,
771 &vulcan_regmove_cost,
772 &vulcan_vector_cost,
773 &vulcan_branch_cost,
774 &generic_approx_modes,
775 4, /* memmov_cost. */
776 4, /* issue_rate. */
777 AARCH64_FUSE_NOTHING, /* fusible_ops. */
778 16, /* function_align. */
779 8, /* jump_align. */
780 16, /* loop_align. */
781 3, /* int_reassoc_width. */
782 2, /* fp_reassoc_width. */
783 2, /* vec_reassoc_width. */
784 2, /* min_div_recip_mul_sf. */
785 2, /* min_div_recip_mul_df. */
786 0, /* max_case_values. */
787 64, /* cache_line_size. */
788 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
789 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
792 /* Support for fine-grained override of the tuning structures. */
793 struct aarch64_tuning_override_function
795 const char* name;
796 void (*parse_override)(const char*, struct tune_params*);
799 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
800 static void aarch64_parse_tune_string (const char*, struct tune_params*);
802 static const struct aarch64_tuning_override_function
803 aarch64_tuning_override_functions[] =
805 { "fuse", aarch64_parse_fuse_string },
806 { "tune", aarch64_parse_tune_string },
807 { NULL, NULL }
810 /* A processor implementing AArch64. */
811 struct processor
813 const char *const name;
814 enum aarch64_processor ident;
815 enum aarch64_processor sched_core;
816 enum aarch64_arch arch;
817 unsigned architecture_version;
818 const unsigned long flags;
819 const struct tune_params *const tune;
822 /* Architectures implementing AArch64. */
823 static const struct processor all_architectures[] =
825 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
826 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
827 #include "aarch64-arches.def"
828 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
831 /* Processor cores implementing AArch64. */
832 static const struct processor all_cores[] =
834 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
835 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
836 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
837 FLAGS, &COSTS##_tunings},
838 #include "aarch64-cores.def"
839 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
840 AARCH64_FL_FOR_ARCH8, &generic_tunings},
841 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
845 /* Target specification. These are populated by the -march, -mtune, -mcpu
846 handling code or by target attributes. */
847 static const struct processor *selected_arch;
848 static const struct processor *selected_cpu;
849 static const struct processor *selected_tune;
851 /* The current tuning set. */
852 struct tune_params aarch64_tune_params = generic_tunings;
854 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
856 /* An ISA extension in the co-processor and main instruction set space. */
857 struct aarch64_option_extension
859 const char *const name;
860 const unsigned long flags_on;
861 const unsigned long flags_off;
864 typedef enum aarch64_cond_code
866 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
867 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
868 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
870 aarch64_cc;
872 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
874 /* The condition codes of the processor, and the inverse function. */
875 static const char * const aarch64_condition_codes[] =
877 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
878 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
881 /* Generate code to enable conditional branches in functions over 1 MiB. */
882 const char *
883 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
884 const char * branch_format)
886 rtx_code_label * tmp_label = gen_label_rtx ();
887 char label_buf[256];
888 char buffer[128];
889 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
890 CODE_LABEL_NUMBER (tmp_label));
891 const char *label_ptr = targetm.strip_name_encoding (label_buf);
892 rtx dest_label = operands[pos_label];
893 operands[pos_label] = tmp_label;
895 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
896 output_asm_insn (buffer, operands);
898 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
899 operands[pos_label] = dest_label;
900 output_asm_insn (buffer, operands);
901 return "";
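/* Illustrative sketch only (the label names here are invented; the real
   local label comes from ASM_GENERATE_INTERNAL_LABEL).  A conditional
   branch such as

     cbz  x0, .Lfar_target

   whose target lies outside the +/-1 MiB conditional-branch range can be
   rewritten, with the caller passing the inverted condition as
   BRANCH_FORMAT, as

     cbnz x0, .Lbcond0
     b    .Lfar_target
   .Lbcond0:

   where the unconditional B has a +/-128 MiB range.  */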
904 void
905 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
907 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
908 if (TARGET_GENERAL_REGS_ONLY)
909 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
910 else
911 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
914 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
915 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
916 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
917 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
918 cost (in this case the best class is the lowest cost one). Using ALL_REGS
919 irrespective of its cost results in bad allocations with many redundant
920 int<->FP moves which are expensive on various cores.
921 To avoid this we don't allow ALL_REGS as the allocno class, but force a
922 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
923 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
924 Otherwise set the allocno class depending on the mode.
925 The result of this is that it is no longer inefficient to have a higher
926 memory move cost than the register move cost.
929 static reg_class_t
930 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
931 reg_class_t best_class)
933 enum machine_mode mode;
935 if (allocno_class != ALL_REGS)
936 return allocno_class;
938 if (best_class != ALL_REGS)
939 return best_class;
941 mode = PSEUDO_REGNO_MODE (regno);
942 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
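/* For example: a DFmode or V4SImode pseudo whose allocno class and best
   class both come back as ALL_REGS is forced into FP_REGS by the code
   above, while an SImode pseudo in the same situation is forced into
   GENERAL_REGS, so ALL_REGS is never used as an allocno class.  */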
945 static unsigned int
946 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
948 if (GET_MODE_UNIT_SIZE (mode) == 4)
949 return aarch64_tune_params.min_div_recip_mul_sf;
950 return aarch64_tune_params.min_div_recip_mul_df;
953 static int
954 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
955 enum machine_mode mode)
957 if (VECTOR_MODE_P (mode))
958 return aarch64_tune_params.vec_reassoc_width;
959 if (INTEGRAL_MODE_P (mode))
960 return aarch64_tune_params.int_reassoc_width;
961 if (FLOAT_MODE_P (mode))
962 return aarch64_tune_params.fp_reassoc_width;
963 return 1;
966 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
967 unsigned
968 aarch64_dbx_register_number (unsigned regno)
970 if (GP_REGNUM_P (regno))
971 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
972 else if (regno == SP_REGNUM)
973 return AARCH64_DWARF_SP;
974 else if (FP_REGNUM_P (regno))
975 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
977 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
978 equivalent DWARF register. */
979 return DWARF_FRAME_REGISTERS;
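/* A sketch of the resulting mapping, assuming the usual AArch64 DWARF
   numbering (AARCH64_DWARF_R0 == 0, AARCH64_DWARF_SP == 31 and
   AARCH64_DWARF_V0 == 64):

     x0..x30 -> 0..30,  sp -> 31,  v0..v31 -> 64..95,

   and every other register maps to DWARF_FRAME_REGISTERS, meaning "no
   equivalent DWARF register".  */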
982 /* Return TRUE if MODE is any of the large INT modes. */
983 static bool
984 aarch64_vect_struct_mode_p (machine_mode mode)
986 return mode == OImode || mode == CImode || mode == XImode;
989 /* Return TRUE if MODE is any of the vector modes. */
990 static bool
991 aarch64_vector_mode_p (machine_mode mode)
993 return aarch64_vector_mode_supported_p (mode)
994 || aarch64_vect_struct_mode_p (mode);
997 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
998 static bool
999 aarch64_array_mode_supported_p (machine_mode mode,
1000 unsigned HOST_WIDE_INT nelems)
1002 if (TARGET_SIMD
1003 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1004 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1005 && (nelems >= 2 && nelems <= 4))
1006 return true;
1008 return false;
1011 /* Implement HARD_REGNO_NREGS. */
1014 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1016 switch (aarch64_regno_regclass (regno))
1018 case FP_REGS:
1019 case FP_LO_REGS:
1020 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1021 default:
1022 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1024 gcc_unreachable ();
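/* Illustrative numbers only, assuming UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16: a TImode value (16 bytes) occupies two general
   registers but a single vector register, while an OImode vector-struct
   value (32 bytes) occupies two vector registers.  */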
1027 /* Implement HARD_REGNO_MODE_OK. */
1030 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1032 if (GET_MODE_CLASS (mode) == MODE_CC)
1033 return regno == CC_REGNUM;
1035 if (regno == SP_REGNUM)
1036 /* The purpose of comparing with ptr_mode is to support the
1037 global register variable associated with the stack pointer
1038 register via the syntax of asm ("wsp") in ILP32. */
1039 return mode == Pmode || mode == ptr_mode;
1041 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1042 return mode == Pmode;
1044 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1045 return 1;
1047 if (FP_REGNUM_P (regno))
1049 if (aarch64_vect_struct_mode_p (mode))
1050 return
1051 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1052 else
1053 return 1;
1056 return 0;
1059 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1060 machine_mode
1061 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1062 machine_mode mode)
1064 /* Handle modes that fit within single registers. */
1065 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1067 if (GET_MODE_SIZE (mode) >= 4)
1068 return mode;
1069 else
1070 return SImode;
1072 /* Fall back to generic for multi-reg and very large modes. */
1073 else
1074 return choose_hard_reg_mode (regno, nregs, false);
1077 /* Return true if calls to DECL should be treated as
1078 long-calls (ie called via a register). */
1079 static bool
1080 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1082 return false;
1085 /* Return true if calls to symbol-ref SYM should be treated as
1086 long-calls (ie called via a register). */
1087 bool
1088 aarch64_is_long_call_p (rtx sym)
1090 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1093 /* Return true if calls to symbol-ref SYM should not go through
1094 plt stubs. */
1096 bool
1097 aarch64_is_noplt_call_p (rtx sym)
1099 const_tree decl = SYMBOL_REF_DECL (sym);
1101 if (flag_pic
1102 && decl
1103 && (!flag_plt
1104 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1105 && !targetm.binds_local_p (decl))
1106 return true;
1108 return false;
1111 /* Return true if the offsets to a zero/sign-extract operation
1112 represent an expression that matches an extend operation. The
1113 operands represent the parameters from
1115 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1116 bool
1117 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1118 rtx extract_imm)
1120 HOST_WIDE_INT mult_val, extract_val;
1122 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1123 return false;
1125 mult_val = INTVAL (mult_imm);
1126 extract_val = INTVAL (extract_imm);
1128 if (extract_val > 8
1129 && extract_val < GET_MODE_BITSIZE (mode)
1130 && exact_log2 (extract_val & ~7) > 0
1131 && (extract_val & 7) <= 4
1132 && mult_val == (1 << (extract_val & 7)))
1133 return true;
1135 return false;
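/* Worked example (illustrative): with MODE == DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34 this returns true, since 34 > 8, 34 & ~7 == 32 is a
   power of two, 34 & 7 == 2 <= 4, and MULT_IMM == 1 << 2.  Such an
   extract behaves like an extension of the low 32 bits scaled by 4,
   i.e. an extend followed by a left shift by 2.  */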
1138 /* Emit an insn that's a simple single-set. Both the operands must be
1139 known to be valid. */
1140 inline static rtx_insn *
1141 emit_set_insn (rtx x, rtx y)
1143 return emit_insn (gen_rtx_SET (x, y));
1146 /* X and Y are two things to compare using CODE. Emit the compare insn and
1147 return the rtx for register 0 in the proper mode. */
1149 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1151 machine_mode mode = SELECT_CC_MODE (code, x, y);
1152 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1154 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1155 return cc_reg;
1158 /* Build the SYMBOL_REF for __tls_get_addr. */
1160 static GTY(()) rtx tls_get_addr_libfunc;
1163 aarch64_tls_get_addr (void)
1165 if (!tls_get_addr_libfunc)
1166 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1167 return tls_get_addr_libfunc;
1170 /* Return the TLS model to use for ADDR. */
1172 static enum tls_model
1173 tls_symbolic_operand_type (rtx addr)
1175 enum tls_model tls_kind = TLS_MODEL_NONE;
1176 rtx sym, addend;
1178 if (GET_CODE (addr) == CONST)
1180 split_const (addr, &sym, &addend);
1181 if (GET_CODE (sym) == SYMBOL_REF)
1182 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1184 else if (GET_CODE (addr) == SYMBOL_REF)
1185 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1187 return tls_kind;
1190 /* We allow lo_sum expressions in our legitimate addresses
1191 so that combine can take care of combining addresses where
1192 necessary, but for generation purposes we generate the address
1193 as follows:
1194 RTL Absolute
1195 tmp = hi (symbol_ref); adrp x1, foo
1196 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1199 PIC TLS
1200 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1201 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1202 bl __tls_get_addr
1205 Load TLS symbol, depending on TLS mechanism and TLS access model.
1207 Global Dynamic - Traditional TLS:
1208 adrp tmp, :tlsgd:imm
1209 add dest, tmp, #:tlsgd_lo12:imm
1210 bl __tls_get_addr
1212 Global Dynamic - TLS Descriptors:
1213 adrp dest, :tlsdesc:imm
1214 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1215 add dest, dest, #:tlsdesc_lo12:imm
1216 blr tmp
1217 mrs tp, tpidr_el0
1218 add dest, dest, tp
1220 Initial Exec:
1221 mrs tp, tpidr_el0
1222 adrp tmp, :gottprel:imm
1223 ldr dest, [tmp, #:gottprel_lo12:imm]
1224 add dest, dest, tp
1226 Local Exec:
1227 mrs tp, tpidr_el0
1228 add t0, tp, #:tprel_hi12:imm, lsl #12
1229 add t0, t0, #:tprel_lo12_nc:imm
1232 static void
1233 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1234 enum aarch64_symbol_type type)
1236 switch (type)
1238 case SYMBOL_SMALL_ABSOLUTE:
1240 /* In ILP32, the mode of dest can be either SImode or DImode. */
1241 rtx tmp_reg = dest;
1242 machine_mode mode = GET_MODE (dest);
1244 gcc_assert (mode == Pmode || mode == ptr_mode);
1246 if (can_create_pseudo_p ())
1247 tmp_reg = gen_reg_rtx (mode);
1249 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1250 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1251 return;
1254 case SYMBOL_TINY_ABSOLUTE:
1255 emit_insn (gen_rtx_SET (dest, imm));
1256 return;
1258 case SYMBOL_SMALL_GOT_28K:
1260 machine_mode mode = GET_MODE (dest);
1261 rtx gp_rtx = pic_offset_table_rtx;
1262 rtx insn;
1263 rtx mem;
1265 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1266 here before RTL expansion. The tree IVOPTs pass generates RTL
1267 patterns to decide rtx costs, in which case pic_offset_table_rtx is
1268 not initialized. In that case there is no need to generate the first
1269 adrp instruction, as the final cost of a global variable access is
1270 one instruction. */
1271 if (gp_rtx != NULL)
1273 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1274 use the page base as the GOT base, the first page may be wasted;
1275 in the worst case only 28K of space is left for the GOT).
1277 The generated instruction sequence for accessing a global variable is
1280 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1282 Only one instruction is needed. But we must initialize
1283 pic_offset_table_rtx properly. We generate the initialization insn for
1284 every global access, and let CSE remove all redundant copies.
1286 The final instruction sequence will look like the following
1287 when accessing multiple global variables.
1289 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1291 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1292 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1293 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1294 ... */
1296 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1297 crtl->uses_pic_offset_table = 1;
1298 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1300 if (mode != GET_MODE (gp_rtx))
1301 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1304 if (mode == ptr_mode)
1306 if (mode == DImode)
1307 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1308 else
1309 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1311 mem = XVECEXP (SET_SRC (insn), 0, 0);
1313 else
1315 gcc_assert (mode == Pmode);
1317 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1318 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1321 /* The operand is expected to be MEM. Whenever the related insn
1322 pattern is changed, the code above which calculates MEM should be
1323 updated. */
1324 gcc_assert (GET_CODE (mem) == MEM);
1325 MEM_READONLY_P (mem) = 1;
1326 MEM_NOTRAP_P (mem) = 1;
1327 emit_insn (insn);
1328 return;
1331 case SYMBOL_SMALL_GOT_4G:
1333 /* In ILP32, the mode of dest can be either SImode or DImode,
1334 while the got entry is always of SImode size. The mode of
1335 dest depends on how dest is used: if dest is assigned to a
1336 pointer (e.g. stored in memory), it has SImode; it may have
1337 DImode if dest is dereferenced to access memory.
1338 This is why we have to handle three different ldr_got_small
1339 patterns here (two patterns for ILP32). */
1341 rtx insn;
1342 rtx mem;
1343 rtx tmp_reg = dest;
1344 machine_mode mode = GET_MODE (dest);
1346 if (can_create_pseudo_p ())
1347 tmp_reg = gen_reg_rtx (mode);
1349 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1350 if (mode == ptr_mode)
1352 if (mode == DImode)
1353 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1354 else
1355 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1357 mem = XVECEXP (SET_SRC (insn), 0, 0);
1359 else
1361 gcc_assert (mode == Pmode);
1363 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1364 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1367 gcc_assert (GET_CODE (mem) == MEM);
1368 MEM_READONLY_P (mem) = 1;
1369 MEM_NOTRAP_P (mem) = 1;
1370 emit_insn (insn);
1371 return;
1374 case SYMBOL_SMALL_TLSGD:
1376 rtx_insn *insns;
1377 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1379 start_sequence ();
1380 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1381 insns = get_insns ();
1382 end_sequence ();
1384 RTL_CONST_CALL_P (insns) = 1;
1385 emit_libcall_block (insns, dest, result, imm);
1386 return;
1389 case SYMBOL_SMALL_TLSDESC:
1391 machine_mode mode = GET_MODE (dest);
1392 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1393 rtx tp;
1395 gcc_assert (mode == Pmode || mode == ptr_mode);
1397 /* In ILP32, the got entry is always of SImode size. Unlike
1398 small GOT, the dest is fixed at reg 0. */
1399 if (TARGET_ILP32)
1400 emit_insn (gen_tlsdesc_small_si (imm));
1401 else
1402 emit_insn (gen_tlsdesc_small_di (imm));
1403 tp = aarch64_load_tp (NULL);
1405 if (mode != Pmode)
1406 tp = gen_lowpart (mode, tp);
1408 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1409 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1410 return;
1413 case SYMBOL_SMALL_TLSIE:
1415 /* In ILP32, the mode of dest can be either SImode or DImode,
1416 while the got entry is always of SImode size. The mode of
1417 dest depends on how dest is used: if dest is assigned to a
1418 pointer (e.g. stored in memory), it has SImode; it may have
1419 DImode if dest is dereferenced to access memory.
1420 This is why we have to handle three different tlsie_small
1421 patterns here (two patterns for ILP32). */
1422 machine_mode mode = GET_MODE (dest);
1423 rtx tmp_reg = gen_reg_rtx (mode);
1424 rtx tp = aarch64_load_tp (NULL);
1426 if (mode == ptr_mode)
1428 if (mode == DImode)
1429 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1430 else
1432 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1433 tp = gen_lowpart (mode, tp);
1436 else
1438 gcc_assert (mode == Pmode);
1439 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1442 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1443 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1444 return;
1447 case SYMBOL_TLSLE12:
1448 case SYMBOL_TLSLE24:
1449 case SYMBOL_TLSLE32:
1450 case SYMBOL_TLSLE48:
1452 machine_mode mode = GET_MODE (dest);
1453 rtx tp = aarch64_load_tp (NULL);
1455 if (mode != Pmode)
1456 tp = gen_lowpart (mode, tp);
1458 switch (type)
1460 case SYMBOL_TLSLE12:
1461 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1462 (dest, tp, imm));
1463 break;
1464 case SYMBOL_TLSLE24:
1465 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1466 (dest, tp, imm));
1467 break;
1468 case SYMBOL_TLSLE32:
1469 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1470 (dest, imm));
1471 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1472 (dest, dest, tp));
1473 break;
1474 case SYMBOL_TLSLE48:
1475 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1476 (dest, imm));
1477 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1478 (dest, dest, tp));
1479 break;
1480 default:
1481 gcc_unreachable ();
1484 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1485 return;
1488 case SYMBOL_TINY_GOT:
1489 emit_insn (gen_ldr_got_tiny (dest, imm));
1490 return;
1492 case SYMBOL_TINY_TLSIE:
1494 machine_mode mode = GET_MODE (dest);
1495 rtx tp = aarch64_load_tp (NULL);
1497 if (mode == ptr_mode)
1499 if (mode == DImode)
1500 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1501 else
1503 tp = gen_lowpart (mode, tp);
1504 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1507 else
1509 gcc_assert (mode == Pmode);
1510 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1513 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1514 return;
1517 default:
1518 gcc_unreachable ();
1522 /* Emit a move from SRC to DEST. Assume that the move expanders can
1523 handle all moves if !can_create_pseudo_p (). The distinction is
1524 important because, unlike emit_move_insn, the move expanders know
1525 how to force Pmode objects into the constant pool even when the
1526 constant pool address is not itself legitimate. */
1527 static rtx
1528 aarch64_emit_move (rtx dest, rtx src)
1530 return (can_create_pseudo_p ()
1531 ? emit_move_insn (dest, src)
1532 : emit_move_insn_1 (dest, src));
1535 /* Split a 128-bit move operation into two 64-bit move operations,
1536 taking care to handle partial overlap of register to register
1537 copies. Special cases are needed when moving between GP regs and
1538 FP regs. SRC can be a register, constant or memory; DST a register
1539 or memory. If either operand is memory it must not have any side
1540 effects. */
1541 void
1542 aarch64_split_128bit_move (rtx dst, rtx src)
1544 rtx dst_lo, dst_hi;
1545 rtx src_lo, src_hi;
1547 machine_mode mode = GET_MODE (dst);
1549 gcc_assert (mode == TImode || mode == TFmode);
1550 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1551 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1553 if (REG_P (dst) && REG_P (src))
1555 int src_regno = REGNO (src);
1556 int dst_regno = REGNO (dst);
1558 /* Handle FP <-> GP regs. */
1559 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1561 src_lo = gen_lowpart (word_mode, src);
1562 src_hi = gen_highpart (word_mode, src);
1564 if (mode == TImode)
1566 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1567 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1569 else
1571 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1572 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1574 return;
1576 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1578 dst_lo = gen_lowpart (word_mode, dst);
1579 dst_hi = gen_highpart (word_mode, dst);
1581 if (mode == TImode)
1583 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1584 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1586 else
1588 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1589 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1591 return;
1595 dst_lo = gen_lowpart (word_mode, dst);
1596 dst_hi = gen_highpart (word_mode, dst);
1597 src_lo = gen_lowpart (word_mode, src);
1598 src_hi = gen_highpart_mode (word_mode, mode, src);
1600 /* At most one pairing may overlap. */
1601 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1603 aarch64_emit_move (dst_hi, src_hi);
1604 aarch64_emit_move (dst_lo, src_lo);
1606 else
1608 aarch64_emit_move (dst_lo, src_lo);
1609 aarch64_emit_move (dst_hi, src_hi);
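/* Purely as an illustration of the overlap handling above: splitting a
   TImode copy whose destination pair is {x1, x2} and whose source pair is
   {x0, x1} must move the high halves first (x2 <- x1, then x1 <- x0),
   whereas copying {x1, x2} into {x0, x1} must move the low halves first,
   otherwise one source half would be clobbered before it is read.  */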
1613 bool
1614 aarch64_split_128bit_move_p (rtx dst, rtx src)
1616 return (! REG_P (src)
1617 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1620 /* Split a complex SIMD combine. */
1622 void
1623 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1625 machine_mode src_mode = GET_MODE (src1);
1626 machine_mode dst_mode = GET_MODE (dst);
1628 gcc_assert (VECTOR_MODE_P (dst_mode));
1630 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1632 rtx (*gen) (rtx, rtx, rtx);
1634 switch (src_mode)
1636 case V8QImode:
1637 gen = gen_aarch64_simd_combinev8qi;
1638 break;
1639 case V4HImode:
1640 gen = gen_aarch64_simd_combinev4hi;
1641 break;
1642 case V2SImode:
1643 gen = gen_aarch64_simd_combinev2si;
1644 break;
1645 case V4HFmode:
1646 gen = gen_aarch64_simd_combinev4hf;
1647 break;
1648 case V2SFmode:
1649 gen = gen_aarch64_simd_combinev2sf;
1650 break;
1651 case DImode:
1652 gen = gen_aarch64_simd_combinedi;
1653 break;
1654 case DFmode:
1655 gen = gen_aarch64_simd_combinedf;
1656 break;
1657 default:
1658 gcc_unreachable ();
1661 emit_insn (gen (dst, src1, src2));
1662 return;
1666 /* Split a complex SIMD move. */
1668 void
1669 aarch64_split_simd_move (rtx dst, rtx src)
1671 machine_mode src_mode = GET_MODE (src);
1672 machine_mode dst_mode = GET_MODE (dst);
1674 gcc_assert (VECTOR_MODE_P (dst_mode));
1676 if (REG_P (dst) && REG_P (src))
1678 rtx (*gen) (rtx, rtx);
1680 gcc_assert (VECTOR_MODE_P (src_mode));
1682 switch (src_mode)
1684 case V16QImode:
1685 gen = gen_aarch64_split_simd_movv16qi;
1686 break;
1687 case V8HImode:
1688 gen = gen_aarch64_split_simd_movv8hi;
1689 break;
1690 case V4SImode:
1691 gen = gen_aarch64_split_simd_movv4si;
1692 break;
1693 case V2DImode:
1694 gen = gen_aarch64_split_simd_movv2di;
1695 break;
1696 case V8HFmode:
1697 gen = gen_aarch64_split_simd_movv8hf;
1698 break;
1699 case V4SFmode:
1700 gen = gen_aarch64_split_simd_movv4sf;
1701 break;
1702 case V2DFmode:
1703 gen = gen_aarch64_split_simd_movv2df;
1704 break;
1705 default:
1706 gcc_unreachable ();
1709 emit_insn (gen (dst, src));
1710 return;
1714 bool
1715 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1716 machine_mode ymode, rtx y)
1718 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1719 gcc_assert (r != NULL);
1720 return rtx_equal_p (x, r);
1724 static rtx
1725 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1727 if (can_create_pseudo_p ())
1728 return force_reg (mode, value);
1729 else
1731 x = aarch64_emit_move (x, value);
1732 return x;
1737 static rtx
1738 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1740 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1742 rtx high;
1743 /* Load the full offset into a register. This
1744 might be improvable in the future. */
1745 high = GEN_INT (offset);
1746 offset = 0;
1747 high = aarch64_force_temporary (mode, temp, high);
1748 reg = aarch64_force_temporary (mode, temp,
1749 gen_rtx_PLUS (mode, high, reg));
1751 return plus_constant (mode, reg, offset);
1754 static int
1755 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1756 machine_mode mode)
1758 int i;
1759 unsigned HOST_WIDE_INT val, val2, mask;
1760 int one_match, zero_match;
1761 int num_insns;
1763 val = INTVAL (imm);
1765 if (aarch64_move_imm (val, mode))
1767 if (generate)
1768 emit_insn (gen_rtx_SET (dest, imm));
1769 return 1;
1772 if ((val >> 32) == 0 || mode == SImode)
1774 if (generate)
1776 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1777 if (mode == SImode)
1778 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1779 GEN_INT ((val >> 16) & 0xffff)));
1780 else
1781 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1782 GEN_INT ((val >> 16) & 0xffff)));
1784 return 2;
1787 /* Remaining cases are all for DImode. */
1789 mask = 0xffff;
1790 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1791 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1792 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1793 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1795 if (zero_match != 2 && one_match != 2)
1797 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1798 For a 64-bit bitmask try whether changing 16 bits to all ones or
1799 zeroes creates a valid bitmask. To check any repeated bitmask,
1800 try using 16 bits from the other 32-bit half of val. */
1802 for (i = 0; i < 64; i += 16, mask <<= 16)
1804 val2 = val & ~mask;
1805 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1806 break;
1807 val2 = val | mask;
1808 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1809 break;
1810 val2 = val2 & ~mask;
1811 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1812 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1813 break;
1815 if (i != 64)
1817 if (generate)
1819 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1820 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1821 GEN_INT ((val >> i) & 0xffff)));
1823 return 2;
1827 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1828 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1829 otherwise skip zero bits. */
1831 num_insns = 1;
1832 mask = 0xffff;
1833 val2 = one_match > zero_match ? ~val : val;
1834 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1836 if (generate)
1837 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1838 ? (val | ~(mask << i))
1839 : (val & (mask << i)))));
1840 for (i += 16; i < 64; i += 16)
1842 if ((val2 & (mask << i)) == 0)
1843 continue;
1844 if (generate)
1845 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1846 GEN_INT ((val >> i) & 0xffff)));
1847 num_insns ++;
1850 return num_insns;
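/* Two illustrative expansions (a sketch, not an exhaustive list of the
   cases handled above):

     0x0000000000001234    mov  dest, #0x1234                  1 insn
     0x0000000012345678    mov  dest, #0x5678
                           movk dest, #0x1234, lsl #16         2 insns

   Wider DImode constants are built from one leading mov, with the skipped
   16-bit chunks forced to all-zeroes or all-ones, followed by a movk for
   each remaining chunk.  */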
1854 void
1855 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1857 machine_mode mode = GET_MODE (dest);
1859 gcc_assert (mode == SImode || mode == DImode);
1861 /* Check on what type of symbol it is. */
1862 if (GET_CODE (imm) == SYMBOL_REF
1863 || GET_CODE (imm) == LABEL_REF
1864 || GET_CODE (imm) == CONST)
1866 rtx mem, base, offset;
1867 enum aarch64_symbol_type sty;
1869 /* If we have (const (plus symbol offset)), separate out the offset
1870 before we start classifying the symbol. */
1871 split_const (imm, &base, &offset);
1873 sty = aarch64_classify_symbol (base, offset);
1874 switch (sty)
1876 case SYMBOL_FORCE_TO_MEM:
1877 if (offset != const0_rtx
1878 && targetm.cannot_force_const_mem (mode, imm))
1880 gcc_assert (can_create_pseudo_p ());
1881 base = aarch64_force_temporary (mode, dest, base);
1882 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1883 aarch64_emit_move (dest, base);
1884 return;
1887 mem = force_const_mem (ptr_mode, imm);
1888 gcc_assert (mem);
1890 /* If we aren't generating PC relative literals, then
1891 we need to expand the literal pool access carefully.
1892 This is something that needs to be done in a number
1893 of places, so could well live as a separate function. */
1894 if (!aarch64_pcrelative_literal_loads)
1896 gcc_assert (can_create_pseudo_p ());
1897 base = gen_reg_rtx (ptr_mode);
1898 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1899 mem = gen_rtx_MEM (ptr_mode, base);
1902 if (mode != ptr_mode)
1903 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1905 emit_insn (gen_rtx_SET (dest, mem));
1907 return;
1909 case SYMBOL_SMALL_TLSGD:
1910 case SYMBOL_SMALL_TLSDESC:
1911 case SYMBOL_SMALL_TLSIE:
1912 case SYMBOL_SMALL_GOT_28K:
1913 case SYMBOL_SMALL_GOT_4G:
1914 case SYMBOL_TINY_GOT:
1915 case SYMBOL_TINY_TLSIE:
1916 if (offset != const0_rtx)
1918 gcc_assert(can_create_pseudo_p ());
1919 base = aarch64_force_temporary (mode, dest, base);
1920 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1921 aarch64_emit_move (dest, base);
1922 return;
1924 /* FALLTHRU */
1926 case SYMBOL_SMALL_ABSOLUTE:
1927 case SYMBOL_TINY_ABSOLUTE:
1928 case SYMBOL_TLSLE12:
1929 case SYMBOL_TLSLE24:
1930 case SYMBOL_TLSLE32:
1931 case SYMBOL_TLSLE48:
1932 aarch64_load_symref_appropriately (dest, imm, sty);
1933 return;
1935 default:
1936 gcc_unreachable ();
1940 if (!CONST_INT_P (imm))
1942 if (GET_CODE (imm) == HIGH)
1943 emit_insn (gen_rtx_SET (dest, imm));
1944 else
1946 rtx mem = force_const_mem (mode, imm);
1947 gcc_assert (mem);
1948 emit_insn (gen_rtx_SET (dest, mem));
1951 return;
1954 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1957 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1958 temporary value if necessary. FRAME_RELATED_P should be true if
1959 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1960 to the generated instructions. If SCRATCHREG is known to hold
1961 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1962 immediate again.
1964 Since this function may be used to adjust the stack pointer, we must
1965 ensure that it cannot cause transient stack deallocation (for example
1966 by first incrementing SP and then decrementing when adjusting by a
1967 large immediate). */
1969 static void
1970 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1971 HOST_WIDE_INT delta, bool frame_related_p,
1972 bool emit_move_imm)
1974 HOST_WIDE_INT mdelta = abs_hwi (delta);
1975 rtx this_rtx = gen_rtx_REG (mode, regnum);
1976 rtx_insn *insn;
1978 if (!mdelta)
1979 return;
1981 /* Single instruction adjustment. */
1982 if (aarch64_uimm12_shift (mdelta))
1984 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1985 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1986 return;
1989 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
1990 Only do this if mdelta is not a 16-bit move immediate; when it is,
1991 adjusting using a move is better. */
1992 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
1994 HOST_WIDE_INT low_off = mdelta & 0xfff;
1996 low_off = delta < 0 ? -low_off : low_off;
1997 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
1998 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1999 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2000 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2001 return;
2004 /* Emit a move immediate if required and an addition/subtraction. */
2005 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2006 if (emit_move_imm)
2007 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2008 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2009 : gen_add2_insn (this_rtx, scratch_rtx));
2010 if (frame_related_p)
2012 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2013 rtx adj = plus_constant (mode, this_rtx, delta);
2014 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
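/* Illustrative only: an adjustment by 0x123456 is not a single add/sub
   immediate and not a single move immediate, but fits in 24 bits, so the
   middle case above emits

     add  reg, reg, #0x456
     add  reg, reg, #0x123, lsl #12

   while a larger delta such as 0x2000010 falls through to loading the
   absolute delta into the scratch register and then emitting a single
   add or sub of that register.  */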
2018 static inline void
2019 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2020 HOST_WIDE_INT delta)
2022 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2025 static inline void
2026 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2028 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2029 true, emit_move_imm);
2032 static inline void
2033 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2035 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2036 frame_related_p, true);
2039 static bool
2040 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2041 tree exp ATTRIBUTE_UNUSED)
2043 /* Currently, always true. */
2044 return true;
2047 /* Implement TARGET_PASS_BY_REFERENCE. */
2049 static bool
2050 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2051 machine_mode mode,
2052 const_tree type,
2053 bool named ATTRIBUTE_UNUSED)
2055 HOST_WIDE_INT size;
2056 machine_mode dummymode;
2057 int nregs;
2059 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2060 size = (mode == BLKmode && type)
2061 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2063 /* Aggregates are passed by reference based on their size. */
2064 if (type && AGGREGATE_TYPE_P (type))
2066 size = int_size_in_bytes (type);
2069 /* Variable sized arguments are always returned by reference. */
2070 if (size < 0)
2071 return true;
2073 /* Can this be a candidate to be passed in fp/simd register(s)? */
2074 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2075 &dummymode, &nregs,
2076 NULL))
2077 return false;
2079 /* Arguments which are variable sized or larger than 2 registers are
2080 passed by reference unless they are a homogeneous floating-point
2081 aggregate. */
2082 return size > 2 * UNITS_PER_WORD;
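/* Editorial sketch, not part of the original source: the decision above
   reduced to plain C, with the HFA/HVA candidate test (the call to
   aarch64_vfp_is_call_or_return_candidate) abstracted into a flag and
   2 * UNITS_PER_WORD written out as 16 bytes.  */

static int
aapcs64_pass_by_reference_p (long long size_in_bytes, int is_hfa_or_hva)
{
  if (size_in_bytes < 0)       /* variable-sized: always by reference */
    return 1;
  if (is_hfa_or_hva)           /* candidate for fp/simd registers */
    return 0;
  return size_in_bytes > 16;   /* larger than two 8-byte registers */
}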
2085 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2086 static bool
2087 aarch64_return_in_msb (const_tree valtype)
2089 machine_mode dummy_mode;
2090 int dummy_int;
2092 /* Never happens in little-endian mode. */
2093 if (!BYTES_BIG_ENDIAN)
2094 return false;
2096 /* Only composite types smaller than or equal to 16 bytes can
2097 be potentially returned in registers. */
2098 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2099 || int_size_in_bytes (valtype) <= 0
2100 || int_size_in_bytes (valtype) > 16)
2101 return false;
2103 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2104 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2105 is always passed/returned in the least significant bits of fp/simd
2106 register(s). */
2107 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2108 &dummy_mode, &dummy_int, NULL))
2109 return false;
2111 return true;
2114 /* Implement TARGET_FUNCTION_VALUE.
2115 Define how to find the value returned by a function. */
2117 static rtx
2118 aarch64_function_value (const_tree type, const_tree func,
2119 bool outgoing ATTRIBUTE_UNUSED)
2121 machine_mode mode;
2122 int unsignedp;
2123 int count;
2124 machine_mode ag_mode;
2126 mode = TYPE_MODE (type);
2127 if (INTEGRAL_TYPE_P (type))
2128 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2130 if (aarch64_return_in_msb (type))
2132 HOST_WIDE_INT size = int_size_in_bytes (type);
2134 if (size % UNITS_PER_WORD != 0)
2136 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2137 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2141 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2142 &ag_mode, &count, NULL))
2144 if (!aarch64_composite_type_p (type, mode))
2146 gcc_assert (count == 1 && mode == ag_mode);
2147 return gen_rtx_REG (mode, V0_REGNUM);
2149 else
2151 int i;
2152 rtx par;
2154 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2155 for (i = 0; i < count; i++)
2157 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2158 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2159 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2160 XVECEXP (par, 0, i) = tmp;
2162 return par;
2165 else
2166 return gen_rtx_REG (mode, R0_REGNUM);
2169 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2170 Return true if REGNO is the number of a hard register in which the values
2171 of called function may come back. */
2173 static bool
2174 aarch64_function_value_regno_p (const unsigned int regno)
2176 /* Maximum of 16 bytes can be returned in the general registers. Examples
2177 of 16-byte return values are: 128-bit integers and 16-byte small
2178 structures (excluding homogeneous floating-point aggregates). */
2179 if (regno == R0_REGNUM || regno == R1_REGNUM)
2180 return true;
2182 /* Up to four fp/simd registers can return a function value, e.g. a
2183 homogeneous floating-point aggregate having four members. */
2184 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2185 return TARGET_FLOAT;
2187 return false;
2190 /* Implement TARGET_RETURN_IN_MEMORY.
2192 If the type T of the result of a function is such that
2193 void func (T arg)
2194 would require that arg be passed as a value in a register (or set of
2195 registers) according to the parameter passing rules, then the result
2196 is returned in the same registers as would be used for such an
2197 argument. */
2199 static bool
2200 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2202 HOST_WIDE_INT size;
2203 machine_mode ag_mode;
2204 int count;
2206 if (!AGGREGATE_TYPE_P (type)
2207 && TREE_CODE (type) != COMPLEX_TYPE
2208 && TREE_CODE (type) != VECTOR_TYPE)
2209 /* Simple scalar types are always returned in registers. */
2210 return false;
2212 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2213 type,
2214 &ag_mode,
2215 &count,
2216 NULL))
2217 return false;
2219 /* Types larger than 2 registers are returned in memory. */
2220 size = int_size_in_bytes (type);
2221 return (size < 0 || size > 2 * UNITS_PER_WORD);
2224 static bool
2225 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2226 const_tree type, int *nregs)
2228 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2229 return aarch64_vfp_is_call_or_return_candidate (mode,
2230 type,
2231 &pcum->aapcs_vfp_rmode,
2232 nregs,
2233 NULL);
2236 /* Given MODE and TYPE of a function argument, return the alignment in
2237 bits. The idea is to suppress any stronger alignment requested by
2238 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2239 This is a helper function for local use only. */
2241 static unsigned int
2242 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2244 if (!type)
2245 return GET_MODE_ALIGNMENT (mode);
2246 if (integer_zerop (TYPE_SIZE (type)))
2247 return 0;
2249 gcc_assert (TYPE_MODE (type) == mode);
2251 if (!AGGREGATE_TYPE_P (type))
2252 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2254 if (TREE_CODE (type) == ARRAY_TYPE)
2255 return TYPE_ALIGN (TREE_TYPE (type));
2257 unsigned int alignment = 0;
2259 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2260 alignment = std::max (alignment, DECL_ALIGN (field));
2262 return alignment;
2265 /* Layout a function argument according to the AAPCS64 rules. The rule
2266 numbers refer to the rule numbers in the AAPCS64. */
2268 static void
2269 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2270 const_tree type,
2271 bool named ATTRIBUTE_UNUSED)
2273 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2274 int ncrn, nvrn, nregs;
2275 bool allocate_ncrn, allocate_nvrn;
2276 HOST_WIDE_INT size;
2278 /* We need to do this once per argument. */
2279 if (pcum->aapcs_arg_processed)
2280 return;
2282 pcum->aapcs_arg_processed = true;
2284 /* Size in bytes, rounded to the nearest multiple of 8 bytes. */
2285 size
2286 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2287 UNITS_PER_WORD);
2289 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2290 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2291 mode,
2292 type,
2293 &nregs);
2295 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2296 The following code thus handles passing by SIMD/FP registers first. */
2298 nvrn = pcum->aapcs_nvrn;
2300 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2301 and homogeneous short-vector aggregates (HVA). */
2302 if (allocate_nvrn)
2304 if (!TARGET_FLOAT)
2305 aarch64_err_no_fpadvsimd (mode, "argument");
2307 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2309 pcum->aapcs_nextnvrn = nvrn + nregs;
2310 if (!aarch64_composite_type_p (type, mode))
2312 gcc_assert (nregs == 1);
2313 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2315 else
2317 rtx par;
2318 int i;
2319 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2320 for (i = 0; i < nregs; i++)
2322 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2323 V0_REGNUM + nvrn + i);
2324 tmp = gen_rtx_EXPR_LIST
2325 (VOIDmode, tmp,
2326 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2327 XVECEXP (par, 0, i) = tmp;
2329 pcum->aapcs_reg = par;
2331 return;
2333 else
2335 /* C.3 NSRN is set to 8. */
2336 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2337 goto on_stack;
2341 ncrn = pcum->aapcs_ncrn;
2342 nregs = size / UNITS_PER_WORD;
2344 /* C6 - C9, though the sign and zero extension semantics are
2345 handled elsewhere. This is the case where the argument fits
2346 entirely in general registers. */
2347 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2349 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2351 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2353 /* C.8 if the argument has an alignment of 16 then the NGRN is
2354 rounded up to the next even number. */
2355 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2357 ++ncrn;
2358 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2360 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2361 A reg is still generated for it, but the caller should be smart
2362 enough not to use it. */
2363 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2365 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2367 else
2369 rtx par;
2370 int i;
2372 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2373 for (i = 0; i < nregs; i++)
2375 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2376 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2377 GEN_INT (i * UNITS_PER_WORD));
2378 XVECEXP (par, 0, i) = tmp;
2380 pcum->aapcs_reg = par;
2383 pcum->aapcs_nextncrn = ncrn + nregs;
2384 return;
2387 /* C.11 */
2388 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2390 /* The argument is passed on stack; record the needed number of words for
2391 this argument and align the total size if necessary. */
2392 on_stack:
2393 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2394 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2395 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2396 16 / UNITS_PER_WORD);
2397 return;
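/* Editorial sketch, not part of the original source: rule C.8 above in
   isolation.  A two-register argument with 16-byte alignment must start in
   an even-numbered core register, so an odd NGRN is rounded up and the
   skipped register goes unused.  */

static int
round_ngrn_for_alignment (int ncrn, int nregs, unsigned int alignment_bits)
{
  if (nregs == 2 && alignment_bits == 16 * 8 && (ncrn % 2) != 0)
    ncrn++;
  return ncrn;
}

/* For example, an __int128 argument arriving at NGRN == 3 is passed in
   x4/x5, leaving x3 unused.  */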
2400 /* Implement TARGET_FUNCTION_ARG. */
2402 static rtx
2403 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2404 const_tree type, bool named)
2406 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2407 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2409 if (mode == VOIDmode)
2410 return NULL_RTX;
2412 aarch64_layout_arg (pcum_v, mode, type, named);
2413 return pcum->aapcs_reg;
2416 void
2417 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2418 const_tree fntype ATTRIBUTE_UNUSED,
2419 rtx libname ATTRIBUTE_UNUSED,
2420 const_tree fndecl ATTRIBUTE_UNUSED,
2421 unsigned n_named ATTRIBUTE_UNUSED)
2423 pcum->aapcs_ncrn = 0;
2424 pcum->aapcs_nvrn = 0;
2425 pcum->aapcs_nextncrn = 0;
2426 pcum->aapcs_nextnvrn = 0;
2427 pcum->pcs_variant = ARM_PCS_AAPCS64;
2428 pcum->aapcs_reg = NULL_RTX;
2429 pcum->aapcs_arg_processed = false;
2430 pcum->aapcs_stack_words = 0;
2431 pcum->aapcs_stack_size = 0;
2433 if (!TARGET_FLOAT
2434 && fndecl && TREE_PUBLIC (fndecl)
2435 && fntype && fntype != error_mark_node)
2437 const_tree type = TREE_TYPE (fntype);
2438 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2439 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2440 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2441 &mode, &nregs, NULL))
2442 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2444 return;
2447 static void
2448 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2449 machine_mode mode,
2450 const_tree type,
2451 bool named)
2453 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2454 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2456 aarch64_layout_arg (pcum_v, mode, type, named);
2457 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2458 != (pcum->aapcs_stack_words != 0));
2459 pcum->aapcs_arg_processed = false;
2460 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2461 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2462 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2463 pcum->aapcs_stack_words = 0;
2464 pcum->aapcs_reg = NULL_RTX;
2468 bool
2469 aarch64_function_arg_regno_p (unsigned regno)
2471 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2472 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2475 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2476 PARM_BOUNDARY bits of alignment, but will be given anything up
2477 to STACK_BOUNDARY bits if the type requires it. This makes sure
2478 that both before and after the layout of each argument, the Next
2479 Stacked Argument Address (NSAA) will have a minimum alignment of
2480 8 bytes. */
2482 static unsigned int
2483 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2485 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2487 if (alignment < PARM_BOUNDARY)
2488 alignment = PARM_BOUNDARY;
2489 if (alignment > STACK_BOUNDARY)
2490 alignment = STACK_BOUNDARY;
2491 return alignment;
2494 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2496 Return true if an argument passed on the stack should be padded upwards,
2497 i.e. if the least-significant byte of the stack slot has useful data.
2499 Small aggregate types are placed in the lowest memory address.
2501 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2503 bool
2504 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2506 /* On little-endian targets, the least significant byte of every stack
2507 argument is passed at the lowest byte address of the stack slot. */
2508 if (!BYTES_BIG_ENDIAN)
2509 return true;
2511 /* Otherwise, integral, floating-point and pointer types are padded downward:
2512 the least significant byte of a stack argument is passed at the highest
2513 byte address of the stack slot. */
2514 if (type
2515 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2516 || POINTER_TYPE_P (type))
2517 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2518 return false;
2520 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2521 return true;
2524 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2526 It specifies padding for the last (may also be the only)
2527 element of a block move between registers and memory. If the
2528 block is assumed to be in memory, padding upward means that
2529 the last element is padded after its most significant byte,
2530 while in downward padding, the last element is padded at
2531 its least significant byte side.
2533 Small aggregates and small complex types are always padded
2534 upwards.
2536 We don't need to worry about homogeneous floating-point or
2537 short-vector aggregates; their move is not affected by the
2538 padding direction determined here. Regardless of endianness,
2539 each element of such an aggregate is put in the least
2540 significant bits of a fp/simd register.
2542 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2543 register has useful data, and return the opposite if the most
2544 significant byte does. */
2546 bool
2547 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2548 bool first ATTRIBUTE_UNUSED)
2551 /* Small composite types are always padded upward. */
2552 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2554 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2555 : GET_MODE_SIZE (mode));
2556 if (size < 2 * UNITS_PER_WORD)
2557 return true;
2560 /* Otherwise, use the default padding. */
2561 return !BYTES_BIG_ENDIAN;
2564 static machine_mode
2565 aarch64_libgcc_cmp_return_mode (void)
2567 return SImode;
2570 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2572 /* We use the 12-bit shifted immediate arithmetic instructions so values
2573 must be multiple of (1 << 12), i.e. 4096. */
2574 #define ARITH_FACTOR 4096
2576 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2577 #error Cannot use simple address calculation for stack probing
2578 #endif
2580 /* The pair of scratch registers used for stack probing. */
2581 #define PROBE_STACK_FIRST_REG 9
2582 #define PROBE_STACK_SECOND_REG 10
2584 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2585 inclusive. These are offsets from the current stack pointer. */
2587 static void
2588 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2590 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2592 /* See the same assertion on PROBE_INTERVAL above. */
2593 gcc_assert ((first % ARITH_FACTOR) == 0);
2595 /* See if we have a constant small number of probes to generate. If so,
2596 that's the easy case. */
2597 if (size <= PROBE_INTERVAL)
2599 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2601 emit_set_insn (reg1,
2602 plus_constant (ptr_mode,
2603 stack_pointer_rtx, -(first + base)));
2604 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2607 /* The run-time loop is made up of 8 insns in the generic case while the
2608 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2609 else if (size <= 4 * PROBE_INTERVAL)
2611 HOST_WIDE_INT i, rem;
2613 emit_set_insn (reg1,
2614 plus_constant (ptr_mode,
2615 stack_pointer_rtx,
2616 -(first + PROBE_INTERVAL)));
2617 emit_stack_probe (reg1);
2619 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2620 it exceeds SIZE. If only two probes are needed, this will not
2621 generate any code. Then probe at FIRST + SIZE. */
2622 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2624 emit_set_insn (reg1,
2625 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2626 emit_stack_probe (reg1);
2629 rem = size - (i - PROBE_INTERVAL);
2630 if (rem > 256)
2632 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2634 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2635 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2637 else
2638 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2641 /* Otherwise, do the same as above, but in a loop. Note that we must be
2642 extra careful with variables wrapping around because we might be at
2643 the very top (or the very bottom) of the address space and we have
2644 to be able to handle this case properly; in particular, we use an
2645 equality test for the loop condition. */
2646 else
2648 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2650 /* Step 1: round SIZE to the previous multiple of the interval. */
2652 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2655 /* Step 2: compute initial and final value of the loop counter. */
2657 /* TEST_ADDR = SP + FIRST. */
2658 emit_set_insn (reg1,
2659 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2661 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2662 emit_set_insn (reg2,
2663 plus_constant (ptr_mode, stack_pointer_rtx,
2664 -(first + rounded_size)));
2667 /* Step 3: the loop
2671 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2672 probe at TEST_ADDR
2674 while (TEST_ADDR != LAST_ADDR)
2676 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2677 until it is equal to ROUNDED_SIZE. */
2679 if (ptr_mode == DImode)
2680 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2681 else
2682 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2685 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2686 that SIZE is equal to ROUNDED_SIZE. */
2688 if (size != rounded_size)
2690 HOST_WIDE_INT rem = size - rounded_size;
2692 if (rem > 256)
2694 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2696 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2697 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2699 else
2700 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2704 /* Make sure nothing is scheduled before we are done. */
2705 emit_insn (gen_blockage ());
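/* Editorial sketch, not part of the original source: the bookkeeping behind
   the loop case above, assuming the default 4096-byte probe interval.  The
   loop covers SIZE rounded down to a multiple of the interval; any residual
   is probed separately at FIRST + SIZE.  */

static void
probe_plan (long long size, long long *rounded_size, long long *residual)
{
  const long long interval = 4096;
  *rounded_size = size & -interval;   /* round down to a multiple */
  *residual = size - *rounded_size;
}

/* For example, size = 10000 gives rounded_size = 8192 (loop probes at 4096
   and 8192) plus one residual probe at FIRST + 10000.  */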
2708 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2709 absolute addresses. */
2711 const char *
2712 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2714 static int labelno = 0;
2715 char loop_lab[32];
2716 rtx xops[2];
2718 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2720 /* Loop. */
2721 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2723 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2724 xops[0] = reg1;
2725 xops[1] = GEN_INT (PROBE_INTERVAL);
2726 output_asm_insn ("sub\t%0, %0, %1", xops);
2728 /* Probe at TEST_ADDR. */
2729 output_asm_insn ("str\txzr, [%0]", xops);
2731 /* Test if TEST_ADDR == LAST_ADDR. */
2732 xops[1] = reg2;
2733 output_asm_insn ("cmp\t%0, %1", xops);
2735 /* Branch. */
2736 fputs ("\tb.ne\t", asm_out_file);
2737 assemble_name_raw (asm_out_file, loop_lab);
2738 fputc ('\n', asm_out_file);
2740 return "";
2743 static bool
2744 aarch64_frame_pointer_required (void)
2746 /* In aarch64_override_options_after_change
2747 flag_omit_leaf_frame_pointer turns off the frame pointer by
2748 default. Turn it back on now if we've not got a leaf
2749 function. */
2750 if (flag_omit_leaf_frame_pointer
2751 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2752 return true;
2754 return false;
2757 /* Mark the registers that need to be saved by the callee and calculate
2758 the size of the callee-saved registers area and frame record (both FP
2759 and LR may be omitted). */
2760 static void
2761 aarch64_layout_frame (void)
2763 HOST_WIDE_INT offset = 0;
2764 int regno, last_fp_reg = INVALID_REGNUM;
2766 if (reload_completed && cfun->machine->frame.laid_out)
2767 return;
2769 #define SLOT_NOT_REQUIRED (-2)
2770 #define SLOT_REQUIRED (-1)
2772 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2773 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2775 /* First mark all the registers that really need to be saved... */
2776 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2777 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2779 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2780 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2782 /* ... that includes the eh data registers (if needed)... */
2783 if (crtl->calls_eh_return)
2784 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2785 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2786 = SLOT_REQUIRED;
2788 /* ... and any callee saved register that dataflow says is live. */
2789 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2790 if (df_regs_ever_live_p (regno)
2791 && (regno == R30_REGNUM
2792 || !call_used_regs[regno]))
2793 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2795 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2796 if (df_regs_ever_live_p (regno)
2797 && !call_used_regs[regno])
2799 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2800 last_fp_reg = regno;
2803 if (frame_pointer_needed)
2805 /* FP and LR are placed in the linkage record. */
2806 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2807 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2808 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2809 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2810 offset += 2 * UNITS_PER_WORD;
2813 /* Now assign stack slots for them. */
2814 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2815 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2817 cfun->machine->frame.reg_offset[regno] = offset;
2818 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2819 cfun->machine->frame.wb_candidate1 = regno;
2820 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2821 cfun->machine->frame.wb_candidate2 = regno;
2822 offset += UNITS_PER_WORD;
2825 HOST_WIDE_INT max_int_offset = offset;
2826 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2827 bool has_align_gap = offset != max_int_offset;
2829 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2830 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2832 /* If there is an alignment gap between integer and fp callee-saves,
2833 allocate the last fp register to it if possible. */
2834 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2836 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2837 break;
2840 cfun->machine->frame.reg_offset[regno] = offset;
2841 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2842 cfun->machine->frame.wb_candidate1 = regno;
2843 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2844 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2845 cfun->machine->frame.wb_candidate2 = regno;
2846 offset += UNITS_PER_WORD;
2849 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2851 cfun->machine->frame.saved_regs_size = offset;
2853 HOST_WIDE_INT varargs_and_saved_regs_size
2854 = offset + cfun->machine->frame.saved_varargs_size;
2856 cfun->machine->frame.hard_fp_offset
2857 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2858 STACK_BOUNDARY / BITS_PER_UNIT);
2860 cfun->machine->frame.frame_size
2861 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2862 + crtl->outgoing_args_size,
2863 STACK_BOUNDARY / BITS_PER_UNIT);
2865 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2867 cfun->machine->frame.initial_adjust = 0;
2868 cfun->machine->frame.final_adjust = 0;
2869 cfun->machine->frame.callee_adjust = 0;
2870 cfun->machine->frame.callee_offset = 0;
2872 HOST_WIDE_INT max_push_offset = 0;
2873 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2874 max_push_offset = 512;
2875 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2876 max_push_offset = 256;
2878 if (cfun->machine->frame.frame_size < max_push_offset
2879 && crtl->outgoing_args_size == 0)
2881 /* Simple, small frame with no outgoing arguments:
2882 stp reg1, reg2, [sp, -frame_size]!
2883 stp reg3, reg4, [sp, 16] */
2884 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2886 else if ((crtl->outgoing_args_size
2887 + cfun->machine->frame.saved_regs_size < 512)
2888 && !(cfun->calls_alloca
2889 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2891 /* Frame with small outgoing arguments:
2892 sub sp, sp, frame_size
2893 stp reg1, reg2, [sp, outgoing_args_size]
2894 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2895 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2896 cfun->machine->frame.callee_offset
2897 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2899 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2901 /* Frame with large outgoing arguments but a small local area:
2902 stp reg1, reg2, [sp, -hard_fp_offset]!
2903 stp reg3, reg4, [sp, 16]
2904 sub sp, sp, outgoing_args_size */
2905 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2906 cfun->machine->frame.final_adjust
2907 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2909 else if (!frame_pointer_needed
2910 && varargs_and_saved_regs_size < max_push_offset)
2912 /* Frame with large local area and outgoing arguments (this pushes the
2913 callee-saves first, followed by the locals and outgoing area):
2914 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2915 stp reg3, reg4, [sp, 16]
2916 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2917 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2918 cfun->machine->frame.final_adjust
2919 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2920 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2921 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2923 else
2925 /* Frame with large local area and outgoing arguments using frame pointer:
2926 sub sp, sp, hard_fp_offset
2927 stp x29, x30, [sp, 0]
2928 add x29, sp, 0
2929 stp reg3, reg4, [sp, 16]
2930 sub sp, sp, outgoing_args_size */
2931 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2932 cfun->machine->frame.final_adjust
2933 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2936 cfun->machine->frame.laid_out = true;
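/* Editorial sketch, not part of the original source: the strategy selection
   above reduced to its tests.  Sizes are in bytes; the enumerators are
   illustrative names for the commented code shapes, not GCC identifiers.  */

enum frame_shape
{
  SHAPE_PUSH_ALL,             /* stp reg1, reg2, [sp, -frame_size]!  */
  SHAPE_SUB_THEN_SAVE,        /* sub sp; stp at small offsets  */
  SHAPE_PUSH_THEN_SUB,        /* stp ..., [sp, -hard_fp_offset]!; sub sp  */
  SHAPE_PUSH_SAVES_THEN_SUB,  /* push callee-saves, then sub sp  */
  SHAPE_FRAME_POINTER         /* sub sp; set up x29; sub sp  */
};

static enum frame_shape
choose_frame_shape (long long frame_size, long long outgoing_args,
		    long long saved_regs, long long hard_fp_offset,
		    long long varargs_and_saves, int calls_alloca,
		    int fp_needed, long long max_push_offset)
{
  if (frame_size < max_push_offset && outgoing_args == 0)
    return SHAPE_PUSH_ALL;
  if (outgoing_args + saved_regs < 512
      && !(calls_alloca && hard_fp_offset < max_push_offset))
    return SHAPE_SUB_THEN_SAVE;
  if (hard_fp_offset < max_push_offset)
    return SHAPE_PUSH_THEN_SUB;
  if (!fp_needed && varargs_and_saves < max_push_offset)
    return SHAPE_PUSH_SAVES_THEN_SUB;
  return SHAPE_FRAME_POINTER;
}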
2939 /* Return true if the register REGNO is saved on entry to
2940 the current function. */
2942 static bool
2943 aarch64_register_saved_on_entry (int regno)
2945 return cfun->machine->frame.reg_offset[regno] >= 0;
2948 /* Return the next register, from REGNO up to LIMIT, that the callee needs
2949 to save. */
2951 static unsigned
2952 aarch64_next_callee_save (unsigned regno, unsigned limit)
2954 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2955 regno ++;
2956 return regno;
2959 /* Push the register number REGNO of mode MODE to the stack with write-back
2960 adjusting the stack by ADJUSTMENT. */
2962 static void
2963 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2964 HOST_WIDE_INT adjustment)
2966 rtx base_rtx = stack_pointer_rtx;
2967 rtx insn, reg, mem;
2969 reg = gen_rtx_REG (mode, regno);
2970 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2971 plus_constant (Pmode, base_rtx, -adjustment));
2972 mem = gen_rtx_MEM (mode, mem);
2974 insn = emit_move_insn (mem, reg);
2975 RTX_FRAME_RELATED_P (insn) = 1;
2978 /* Generate and return an instruction to store the pair of registers
2979 REG and REG2 of mode MODE to location BASE with write-back adjusting
2980 the stack location BASE by ADJUSTMENT. */
2982 static rtx
2983 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2984 HOST_WIDE_INT adjustment)
2986 switch (mode)
2988 case DImode:
2989 return gen_storewb_pairdi_di (base, base, reg, reg2,
2990 GEN_INT (-adjustment),
2991 GEN_INT (UNITS_PER_WORD - adjustment));
2992 case DFmode:
2993 return gen_storewb_pairdf_di (base, base, reg, reg2,
2994 GEN_INT (-adjustment),
2995 GEN_INT (UNITS_PER_WORD - adjustment));
2996 default:
2997 gcc_unreachable ();
3001 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3002 stack pointer by ADJUSTMENT. */
3004 static void
3005 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3007 rtx_insn *insn;
3008 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3010 if (regno2 == INVALID_REGNUM)
3011 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3013 rtx reg1 = gen_rtx_REG (mode, regno1);
3014 rtx reg2 = gen_rtx_REG (mode, regno2);
3016 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3017 reg2, adjustment));
3018 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3019 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3020 RTX_FRAME_RELATED_P (insn) = 1;
3023 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3024 adjusting it by ADJUSTMENT afterwards. */
3026 static rtx
3027 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3028 HOST_WIDE_INT adjustment)
3030 switch (mode)
3032 case DImode:
3033 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3034 GEN_INT (UNITS_PER_WORD));
3035 case DFmode:
3036 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3037 GEN_INT (UNITS_PER_WORD));
3038 default:
3039 gcc_unreachable ();
3043 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3044 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3045 into CFI_OPS. */
3047 static void
3048 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3049 rtx *cfi_ops)
3051 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3052 rtx reg1 = gen_rtx_REG (mode, regno1);
3054 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3056 if (regno2 == INVALID_REGNUM)
3058 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3059 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3060 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3062 else
3064 rtx reg2 = gen_rtx_REG (mode, regno2);
3065 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3066 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3067 reg2, adjustment));
3071 /* Generate and return a store pair instruction of mode MODE to store
3072 register REG1 to MEM1 and register REG2 to MEM2. */
3074 static rtx
3075 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3076 rtx reg2)
3078 switch (mode)
3080 case DImode:
3081 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3083 case DFmode:
3084 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3086 default:
3087 gcc_unreachable ();
3091 /* Generate and return a load pair instruction of mode MODE to load register
3092 REG1 from MEM1 and register REG2 from MEM2. */
3094 static rtx
3095 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3096 rtx mem2)
3098 switch (mode)
3100 case DImode:
3101 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3103 case DFmode:
3104 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3106 default:
3107 gcc_unreachable ();
3111 /* Emit code to save the callee-saved registers from register number START
3112 to LIMIT to the stack at the location starting at offset START_OFFSET,
3113 skipping any write-back candidates if SKIP_WB is true. */
3115 static void
3116 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3117 unsigned start, unsigned limit, bool skip_wb)
3119 rtx_insn *insn;
3120 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3121 ? gen_frame_mem : gen_rtx_MEM);
3122 unsigned regno;
3123 unsigned regno2;
3125 for (regno = aarch64_next_callee_save (start, limit);
3126 regno <= limit;
3127 regno = aarch64_next_callee_save (regno + 1, limit))
3129 rtx reg, mem;
3130 HOST_WIDE_INT offset;
3132 if (skip_wb
3133 && (regno == cfun->machine->frame.wb_candidate1
3134 || regno == cfun->machine->frame.wb_candidate2))
3135 continue;
3137 if (cfun->machine->reg_is_wrapped_separately[regno])
3138 continue;
3140 reg = gen_rtx_REG (mode, regno);
3141 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3142 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3143 offset));
3145 regno2 = aarch64_next_callee_save (regno + 1, limit);
3147 if (regno2 <= limit
3148 && !cfun->machine->reg_is_wrapped_separately[regno2]
3149 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3150 == cfun->machine->frame.reg_offset[regno2]))
3153 rtx reg2 = gen_rtx_REG (mode, regno2);
3154 rtx mem2;
3156 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3157 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3158 offset));
3159 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3160 reg2));
3162 /* The first part of a frame-related parallel insn is
3163 always assumed to be relevant to the frame
3164 calculations; subsequent parts are only
3165 frame-related if explicitly marked. */
3166 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3167 regno = regno2;
3169 else
3170 insn = emit_move_insn (mem, reg);
3172 RTX_FRAME_RELATED_P (insn) = 1;
3176 /* Emit code to restore the callee registers of mode MODE from register
3177 number START up to and including LIMIT. Restore from the stack offset
3178 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3179 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3181 static void
3182 aarch64_restore_callee_saves (machine_mode mode,
3183 HOST_WIDE_INT start_offset, unsigned start,
3184 unsigned limit, bool skip_wb, rtx *cfi_ops)
3186 rtx base_rtx = stack_pointer_rtx;
3187 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3188 ? gen_frame_mem : gen_rtx_MEM);
3189 unsigned regno;
3190 unsigned regno2;
3191 HOST_WIDE_INT offset;
3193 for (regno = aarch64_next_callee_save (start, limit);
3194 regno <= limit;
3195 regno = aarch64_next_callee_save (regno + 1, limit))
3197 if (cfun->machine->reg_is_wrapped_separately[regno])
3198 continue;
3200 rtx reg, mem;
3202 if (skip_wb
3203 && (regno == cfun->machine->frame.wb_candidate1
3204 || regno == cfun->machine->frame.wb_candidate2))
3205 continue;
3207 reg = gen_rtx_REG (mode, regno);
3208 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3209 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3211 regno2 = aarch64_next_callee_save (regno + 1, limit);
3213 if (regno2 <= limit
3214 && !cfun->machine->reg_is_wrapped_separately[regno2]
3215 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3216 == cfun->machine->frame.reg_offset[regno2]))
3218 rtx reg2 = gen_rtx_REG (mode, regno2);
3219 rtx mem2;
3221 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3222 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3223 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3225 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3226 regno = regno2;
3228 else
3229 emit_move_insn (reg, mem);
3230 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3234 static inline bool
3235 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3236 HOST_WIDE_INT offset)
3238 return offset >= -256 && offset < 256;
3241 static inline bool
3242 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3244 return (offset >= 0
3245 && offset < 4096 * GET_MODE_SIZE (mode)
3246 && offset % GET_MODE_SIZE (mode) == 0);
3249 bool
3250 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3252 return (offset >= -64 * GET_MODE_SIZE (mode)
3253 && offset < 64 * GET_MODE_SIZE (mode)
3254 && offset % GET_MODE_SIZE (mode) == 0);
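/* Editorial sketch, not part of the original source: the three predicates
   above match the three scalar addressing ranges.  For an 8-byte access:

     9-bit signed unscaled    offsets in [-256, 255]
     12-bit unsigned scaled   offsets in [0, 4095 * 8], multiples of 8
     7-bit signed scaled      LDP/STP offsets in [-64 * 8, 63 * 8], multiples of 8

   A standalone copy of the LDP/STP test, parameterised by the access size:  */

static int
ldp_stp_offset_p (long long offset, long long mode_size)
{
  return (offset >= -64 * mode_size
	  && offset < 64 * mode_size
	  && offset % mode_size == 0);
}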
3257 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3259 static sbitmap
3260 aarch64_get_separate_components (void)
3262 aarch64_layout_frame ();
3264 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3265 bitmap_clear (components);
3267 /* The registers we need saved to the frame. */
3268 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3269 if (aarch64_register_saved_on_entry (regno))
3271 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3272 if (!frame_pointer_needed)
3273 offset += cfun->machine->frame.frame_size
3274 - cfun->machine->frame.hard_fp_offset;
3275 /* Check that we can access the stack slot of the register with one
3276 direct load with no adjustments needed. */
3277 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3278 bitmap_set_bit (components, regno);
3281 /* Don't mess with the hard frame pointer. */
3282 if (frame_pointer_needed)
3283 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3285 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3286 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3287 /* If aarch64_layout_frame has chosen registers to store/restore with
3288 writeback, don't interfere with them to avoid having to output explicit
3289 stack adjustment instructions. */
3290 if (reg2 != INVALID_REGNUM)
3291 bitmap_clear_bit (components, reg2);
3292 if (reg1 != INVALID_REGNUM)
3293 bitmap_clear_bit (components, reg1);
3295 bitmap_clear_bit (components, LR_REGNUM);
3296 bitmap_clear_bit (components, SP_REGNUM);
3298 return components;
3301 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3303 static sbitmap
3304 aarch64_components_for_bb (basic_block bb)
3306 bitmap in = DF_LIVE_IN (bb);
3307 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3308 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3310 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3311 bitmap_clear (components);
3313 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3314 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3315 if ((!call_used_regs[regno])
3316 && (bitmap_bit_p (in, regno)
3317 || bitmap_bit_p (gen, regno)
3318 || bitmap_bit_p (kill, regno)))
3319 bitmap_set_bit (components, regno);
3321 return components;
3324 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3325 Nothing to do for aarch64. */
3327 static void
3328 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3332 /* Return the next set bit in BMP from START onwards. Return the total number
3333 of bits in BMP if no set bit is found at or after START. */
3335 static unsigned int
3336 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3338 unsigned int nbits = SBITMAP_SIZE (bmp);
3339 if (start == nbits)
3340 return start;
3342 gcc_assert (start < nbits);
3343 for (unsigned int i = start; i < nbits; i++)
3344 if (bitmap_bit_p (bmp, i))
3345 return i;
3347 return nbits;
3350 /* Do the work for aarch64_emit_prologue_components and
3351 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3352 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3353 for these components or the epilogue sequence. That is, it determines
3354 whether we should emit stores or loads and what kind of CFA notes to attach
3355 to the insns. Otherwise the logic for the two sequences is very
3356 similar. */
3358 static void
3359 aarch64_process_components (sbitmap components, bool prologue_p)
3361 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3362 ? HARD_FRAME_POINTER_REGNUM
3363 : STACK_POINTER_REGNUM);
3365 unsigned last_regno = SBITMAP_SIZE (components);
3366 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3367 rtx_insn *insn = NULL;
3369 while (regno != last_regno)
3371 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3372 so DFmode for the vector registers is enough. */
3373 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3374 rtx reg = gen_rtx_REG (mode, regno);
3375 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3376 if (!frame_pointer_needed)
3377 offset += cfun->machine->frame.frame_size
3378 - cfun->machine->frame.hard_fp_offset;
3379 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3380 rtx mem = gen_frame_mem (mode, addr);
3382 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3383 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3384 /* No more registers to handle after REGNO.
3385 Emit a single save/restore and exit. */
3386 if (regno2 == last_regno)
3388 insn = emit_insn (set);
3389 RTX_FRAME_RELATED_P (insn) = 1;
3390 if (prologue_p)
3391 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3392 else
3393 add_reg_note (insn, REG_CFA_RESTORE, reg);
3394 break;
3397 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3398 /* The next register is not of the same class or its offset is not
3399 mergeable with the current one into a pair. */
3400 if (!satisfies_constraint_Ump (mem)
3401 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3402 || (offset2 - cfun->machine->frame.reg_offset[regno])
3403 != GET_MODE_SIZE (mode))
3405 insn = emit_insn (set);
3406 RTX_FRAME_RELATED_P (insn) = 1;
3407 if (prologue_p)
3408 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3409 else
3410 add_reg_note (insn, REG_CFA_RESTORE, reg);
3412 regno = regno2;
3413 continue;
3416 /* REGNO2 can be saved/restored in a pair with REGNO. */
3417 rtx reg2 = gen_rtx_REG (mode, regno2);
3418 if (!frame_pointer_needed)
3419 offset2 += cfun->machine->frame.frame_size
3420 - cfun->machine->frame.hard_fp_offset;
3421 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3422 rtx mem2 = gen_frame_mem (mode, addr2);
3423 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3424 : gen_rtx_SET (reg2, mem2);
3426 if (prologue_p)
3427 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3428 else
3429 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3431 RTX_FRAME_RELATED_P (insn) = 1;
3432 if (prologue_p)
3434 add_reg_note (insn, REG_CFA_OFFSET, set);
3435 add_reg_note (insn, REG_CFA_OFFSET, set2);
3437 else
3439 add_reg_note (insn, REG_CFA_RESTORE, reg);
3440 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3443 regno = aarch64_get_next_set_bit (components, regno2 + 1);
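/* Editorial sketch, not part of the original source: the pairing test used
   above, in isolation.  Two component saves become one STP/LDP only when
   both registers are in the same class and their slots are exactly one
   register apart; the real code additionally checks that the address is
   valid for a pair access (constraint Ump).  */

static int
components_pairable_p (int first_is_gp, int second_is_gp,
		       long long first_offset, long long second_offset,
		       long long reg_size)
{
  return (first_is_gp == second_is_gp
	  && second_offset - first_offset == reg_size);
}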
3447 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3449 static void
3450 aarch64_emit_prologue_components (sbitmap components)
3452 aarch64_process_components (components, true);
3455 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3457 static void
3458 aarch64_emit_epilogue_components (sbitmap components)
3460 aarch64_process_components (components, false);
3463 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3465 static void
3466 aarch64_set_handled_components (sbitmap components)
3468 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3469 if (bitmap_bit_p (components, regno))
3470 cfun->machine->reg_is_wrapped_separately[regno] = true;
3473 /* AArch64 stack frames generated by this compiler look like:
3475 +-------------------------------+
3477 | incoming stack arguments |
3479 +-------------------------------+
3480 | | <-- incoming stack pointer (aligned)
3481 | callee-allocated save area |
3482 | for register varargs |
3484 +-------------------------------+
3485 | local variables | <-- frame_pointer_rtx
3487 +-------------------------------+
3488 | padding0 | \
3489 +-------------------------------+ |
3490 | callee-saved registers | | frame.saved_regs_size
3491 +-------------------------------+ |
3492 | LR' | |
3493 +-------------------------------+ |
3494 | FP' | / <- hard_frame_pointer_rtx (aligned)
3495 +-------------------------------+
3496 | dynamic allocation |
3497 +-------------------------------+
3498 | padding |
3499 +-------------------------------+
3500 | outgoing stack arguments | <-- arg_pointer
3502 +-------------------------------+
3503 | | <-- stack_pointer_rtx (aligned)
3505 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3506 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3507 unchanged. */
3509 /* Generate the prologue instructions for entry into a function.
3510 Establish the stack frame by decreasing the stack pointer with a
3511 properly calculated size and, if necessary, create a frame record
3512 filled with the values of LR and previous frame pointer. The
3513 current FP is also set up if it is in use. */
3515 void
3516 aarch64_expand_prologue (void)
3518 aarch64_layout_frame ();
3520 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3521 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3522 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3523 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3524 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3525 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3526 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3527 rtx_insn *insn;
3529 if (flag_stack_usage_info)
3530 current_function_static_stack_size = frame_size;
3532 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3534 if (crtl->is_leaf && !cfun->calls_alloca)
3536 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3537 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3538 frame_size - STACK_CHECK_PROTECT);
3540 else if (frame_size > 0)
3541 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3544 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3546 if (callee_adjust != 0)
3547 aarch64_push_regs (reg1, reg2, callee_adjust);
3549 if (frame_pointer_needed)
3551 if (callee_adjust == 0)
3552 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3553 R30_REGNUM, false);
3554 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3555 stack_pointer_rtx,
3556 GEN_INT (callee_offset)));
3557 RTX_FRAME_RELATED_P (insn) = 1;
3558 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3561 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3562 callee_adjust != 0 || frame_pointer_needed);
3563 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3564 callee_adjust != 0 || frame_pointer_needed);
3565 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3568 /* Return TRUE if we can use a simple_return insn.
3570 This function checks whether the callee-saved stack is empty, which
3571 means no restore actions are needed. The pro_and_epilogue pass will use
3572 this to check whether the shrink-wrapping optimization is feasible. */
3574 bool
3575 aarch64_use_return_insn_p (void)
3577 if (!reload_completed)
3578 return false;
3580 if (crtl->profile)
3581 return false;
3583 aarch64_layout_frame ();
3585 return cfun->machine->frame.frame_size == 0;
3588 /* Generate the epilogue instructions for returning from a function.
3589 This is almost exactly the reverse of the prolog sequence, except
3590 that we need to insert barriers to avoid scheduling loads that read
3591 from a deallocated stack, and we optimize the unwind records by
3592 emitting them all together if possible. */
3593 void
3594 aarch64_expand_epilogue (bool for_sibcall)
3596 aarch64_layout_frame ();
3598 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3599 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3600 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3601 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3602 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3603 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3604 rtx cfi_ops = NULL;
3605 rtx_insn *insn;
3607 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3608 bool need_barrier_p = (get_frame_size ()
3609 + cfun->machine->frame.saved_varargs_size) != 0;
3611 /* Emit a barrier to prevent loads from a deallocated stack. */
3612 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
3614 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3615 need_barrier_p = false;
3618 /* Restore the stack pointer from the frame pointer if it may not
3619 be the same as the stack pointer. */
3620 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3622 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3623 hard_frame_pointer_rtx,
3624 GEN_INT (-callee_offset)));
3625 /* If writeback is used when restoring callee-saves, the CFA
3626 is restored on the instruction doing the writeback. */
3627 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3629 else
3630 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3632 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3633 callee_adjust != 0, &cfi_ops);
3634 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3635 callee_adjust != 0, &cfi_ops);
3637 if (need_barrier_p)
3638 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3640 if (callee_adjust != 0)
3641 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3643 if (callee_adjust != 0 || initial_adjust > 65536)
3645 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3646 insn = get_last_insn ();
3647 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3648 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3649 RTX_FRAME_RELATED_P (insn) = 1;
3650 cfi_ops = NULL;
3653 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3655 if (cfi_ops)
3657 /* Emit delayed restores and reset the CFA to be SP. */
3658 insn = get_last_insn ();
3659 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3660 REG_NOTES (insn) = cfi_ops;
3661 RTX_FRAME_RELATED_P (insn) = 1;
3664 /* Stack adjustment for exception handler. */
3665 if (crtl->calls_eh_return)
3667 /* We need to unwind the stack by the offset computed by
3668 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3669 to be SP; letting the CFA move during this adjustment
3670 is just as correct as retaining the CFA from the body
3671 of the function. Therefore, do nothing special. */
3672 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3675 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3676 if (!for_sibcall)
3677 emit_jump_insn (ret_rtx);
3680 /* Return the place to copy the exception unwinding return address to.
3681 This will probably be a stack slot, but could (in theory) be the
3682 return register). */
3683 rtx
3684 aarch64_final_eh_return_addr (void)
3686 HOST_WIDE_INT fp_offset;
3688 aarch64_layout_frame ();
3690 fp_offset = cfun->machine->frame.frame_size
3691 - cfun->machine->frame.hard_fp_offset;
3693 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3694 return gen_rtx_REG (DImode, LR_REGNUM);
3696 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3697 result in a store to save LR introduced by builtin_eh_return () being
3698 incorrectly deleted because the alias is not detected.
3699 So in the calculation of the address to copy the exception unwinding
3700 return address to, we note 2 cases.
3701 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3702 we return a SP-relative location since all the addresses are SP-relative
3703 in this case. This prevents the store from being optimized away.
3704 If the fp_offset is not 0, then the addresses will be FP-relative and
3705 therefore we return a FP-relative location. */
3707 if (frame_pointer_needed)
3709 if (fp_offset)
3710 return gen_frame_mem (DImode,
3711 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3712 else
3713 return gen_frame_mem (DImode,
3714 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3717 /* If FP is not needed, we calculate the location of LR, which would be
3718 at the top of the saved registers block. */
3720 return gen_frame_mem (DImode,
3721 plus_constant (Pmode,
3722 stack_pointer_rtx,
3723 fp_offset
3724 + cfun->machine->frame.saved_regs_size
3725 - 2 * UNITS_PER_WORD));
3728 /* Output code to add DELTA to the first argument, and then jump
3729 to FUNCTION. Used for C++ multiple inheritance. */
3730 static void
3731 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3732 HOST_WIDE_INT delta,
3733 HOST_WIDE_INT vcall_offset,
3734 tree function)
3736 /* The this pointer is always in x0. Note that this differs from
3737 Arm where the this pointer may be bumped to r1 if r0 is required
3738 to return a pointer to an aggregate. On AArch64 a result value
3739 pointer will be in x8. */
3740 int this_regno = R0_REGNUM;
3741 rtx this_rtx, temp0, temp1, addr, funexp;
3742 rtx_insn *insn;
3744 reload_completed = 1;
3745 emit_note (NOTE_INSN_PROLOGUE_END);
3747 if (vcall_offset == 0)
3748 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3749 else
3751 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3753 this_rtx = gen_rtx_REG (Pmode, this_regno);
3754 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3755 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3757 addr = this_rtx;
3758 if (delta != 0)
3760 if (delta >= -256 && delta < 256)
3761 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3762 plus_constant (Pmode, this_rtx, delta));
3763 else
3764 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3767 if (Pmode == ptr_mode)
3768 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3769 else
3770 aarch64_emit_move (temp0,
3771 gen_rtx_ZERO_EXTEND (Pmode,
3772 gen_rtx_MEM (ptr_mode, addr)));
3774 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3775 addr = plus_constant (Pmode, temp0, vcall_offset);
3776 else
3778 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3779 Pmode);
3780 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3783 if (Pmode == ptr_mode)
3784 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3785 else
3786 aarch64_emit_move (temp1,
3787 gen_rtx_SIGN_EXTEND (Pmode,
3788 gen_rtx_MEM (ptr_mode, addr)));
3790 emit_insn (gen_add2_insn (this_rtx, temp1));
3793 /* Generate a tail call to the target function. */
3794 if (!TREE_USED (function))
3796 assemble_external (function);
3797 TREE_USED (function) = 1;
3799 funexp = XEXP (DECL_RTL (function), 0);
3800 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3801 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3802 SIBLING_CALL_P (insn) = 1;
3804 insn = get_insns ();
3805 shorten_branches (insn);
3806 final_start_function (insn, file, 1);
3807 final (insn, file, 1);
3808 final_end_function ();
3810 /* Stop pretending to be a post-reload pass. */
3811 reload_completed = 0;
3814 static bool
3815 aarch64_tls_referenced_p (rtx x)
3817 if (!TARGET_HAVE_TLS)
3818 return false;
3819 subrtx_iterator::array_type array;
3820 FOR_EACH_SUBRTX (iter, array, x, ALL)
3822 const_rtx x = *iter;
3823 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3824 return true;
3825 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3826 TLS offsets, not real symbol references. */
3827 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3828 iter.skip_subrtxes ();
3830 return false;
3834 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3835 a left shift of 0 or 12 bits. */
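/* For example, 0xabc and (0xabc << 12) satisfy this, but 0xabc0 does not,
   since its set bits straddle the two 12-bit fields. */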
3836 bool
3837 aarch64_uimm12_shift (HOST_WIDE_INT val)
3839 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3840 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
3845 /* Return true if val is an immediate that can be loaded into a
3846 register by a MOVZ instruction. */
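/* For example, 0x2a and 0x12340000 can be loaded with a single MOVZ,
   whereas 0x12345 cannot. */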
3847 static bool
3848 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3850 if (GET_MODE_SIZE (mode) > 4)
3852 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3853 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3854 return 1;
3856 else
3858 /* Ignore sign extension. */
3859 val &= (HOST_WIDE_INT) 0xffffffff;
3861 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3862 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
3865 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3867 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3869 0x0000000100000001ull,
3870 0x0001000100010001ull,
3871 0x0101010101010101ull,
3872 0x1111111111111111ull,
3873 0x5555555555555555ull,
3877 /* Return true if val is a valid bitmask immediate. */
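/* For example, 0x00ff00ff00ff00ff (a run of eight ones replicated every
   16 bits) is a valid bitmask immediate, whereas 0x12345678 is not. */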
3879 bool
3880 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3882 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3883 int bits;
3885 /* Check for a single sequence of one bits and return quickly if so.
3886 The special cases of all ones and all zeroes return false. */
3887 val = (unsigned HOST_WIDE_INT) val_in;
3888 tmp = val + (val & -val);
3890 if (tmp == (tmp & -tmp))
3891 return (val + 1) > 1;
3893 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3894 if (mode == SImode)
3895 val = (val << 32) | (val & 0xffffffff);
3897 /* Invert if the immediate doesn't start with a zero bit - this means we
3898 only need to search for sequences of one bits. */
3899 if (val & 1)
3900 val = ~val;
3902 /* Find the first set bit and set tmp to val with the first sequence of one
3903 bits removed. Return success if there is a single sequence of ones. */
3904 first_one = val & -val;
3905 tmp = val & (val + first_one);
3907 if (tmp == 0)
3908 return true;
3910 /* Find the next set bit and compute the difference in bit position. */
3911 next_one = tmp & -tmp;
3912 bits = clz_hwi (first_one) - clz_hwi (next_one);
3913 mask = val ^ tmp;
3915 /* Check the bit position difference is a power of 2, and that the first
3916 sequence of one bits fits within 'bits' bits. */
3917 if ((mask >> bits) != 0 || bits != (bits & -bits))
3918 return false;
3920 /* Check the sequence of one bits is repeated 64/bits times. */
3921 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
3924 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
3925 Assumed precondition: VAL_IN is not zero. */
3927 unsigned HOST_WIDE_INT
3928 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
3930 int lowest_bit_set = ctz_hwi (val_in);
3931 int highest_bit_set = floor_log2 (val_in);
3932 gcc_assert (val_in != 0);
3934 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
3935 (HOST_WIDE_INT_1U << lowest_bit_set));
3938 /* Create a constant equal to VAL_IN, but with every bit outside the range
3939 from its lowest set bit to its highest set bit also set to 1. */
3941 unsigned HOST_WIDE_INT
3942 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
3944 return val_in | ~aarch64_and_split_imm1 (val_in);
3947 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
3949 bool
3950 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
3952 if (aarch64_bitmask_imm (val_in, mode))
3953 return false;
3955 if (aarch64_move_imm (val_in, mode))
3956 return false;
3958 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
3960 return aarch64_bitmask_imm (imm2, mode);
3963 /* Return true if val is an immediate that can be loaded into a
3964 register in a single instruction. */
3965 bool
3966 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3968 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3969 return 1;
3970 return aarch64_bitmask_imm (val, mode);
3973 static bool
3974 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3976 rtx base, offset;
3978 if (GET_CODE (x) == HIGH)
3979 return true;
3981 split_const (x, &base, &offset);
3982 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3984 if (aarch64_classify_symbol (base, offset)
3985 != SYMBOL_FORCE_TO_MEM)
3986 return true;
3987 else
3988 /* Avoid generating a 64-bit relocation in ILP32; leave
3989 it to aarch64_expand_mov_immediate to handle it properly. */
3990 return mode != ptr_mode;
3993 return aarch64_tls_referenced_p (x);
3996 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3997 The expansion for a table switch is quite expensive due to the number
3998 of instructions, the table lookup and the hard-to-predict indirect jump.
3999 When optimizing for speed with -O3 enabled, use the per-core tuning if
4000 set, otherwise use tables for > 16 cases as a tradeoff between size and
4001 performance. When optimizing for size, use the default setting. */
4003 static unsigned int
4004 aarch64_case_values_threshold (void)
4006 /* Use the specified limit for the number of cases before using jump
4007 tables at higher optimization levels. */
4008 if (optimize > 2
4009 && selected_cpu->tune->max_case_values != 0)
4010 return selected_cpu->tune->max_case_values;
4011 else
4012 return optimize_size ? default_case_values_threshold () : 17;
4015 /* Return true if register REGNO is a valid index register.
4016 STRICT_P is true if REG_OK_STRICT is in effect. */
4018 bool
4019 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4021 if (!HARD_REGISTER_NUM_P (regno))
4023 if (!strict_p)
4024 return true;
4026 if (!reg_renumber)
4027 return false;
4029 regno = reg_renumber[regno];
4031 return GP_REGNUM_P (regno);
4034 /* Return true if register REGNO is a valid base register.
4035 STRICT_P is true if REG_OK_STRICT is in effect. */
4037 bool
4038 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4040 if (!HARD_REGISTER_NUM_P (regno))
4042 if (!strict_p)
4043 return true;
4045 if (!reg_renumber)
4046 return false;
4048 regno = reg_renumber[regno];
4051 /* The fake registers will be eliminated to either the stack or
4052 hard frame pointer, both of which are usually valid base registers.
4053 Reload deals with the cases where the eliminated form isn't valid. */
4054 return (GP_REGNUM_P (regno)
4055 || regno == SP_REGNUM
4056 || regno == FRAME_POINTER_REGNUM
4057 || regno == ARG_POINTER_REGNUM);
4060 /* Return true if X is a valid base register.
4061 STRICT_P is true if REG_OK_STRICT is in effect. */
4063 static bool
4064 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4066 if (!strict_p && GET_CODE (x) == SUBREG)
4067 x = SUBREG_REG (x);
4069 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4072 /* Return true if address offset is a valid index. If it is, fill in INFO
4073 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4075 static bool
4076 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4077 machine_mode mode, bool strict_p)
4079 enum aarch64_address_type type;
4080 rtx index;
4081 int shift;
4083 /* (reg:P) */
4084 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4085 && GET_MODE (x) == Pmode)
4087 type = ADDRESS_REG_REG;
4088 index = x;
4089 shift = 0;
4091 /* (sign_extend:DI (reg:SI)) */
4092 else if ((GET_CODE (x) == SIGN_EXTEND
4093 || GET_CODE (x) == ZERO_EXTEND)
4094 && GET_MODE (x) == DImode
4095 && GET_MODE (XEXP (x, 0)) == SImode)
4097 type = (GET_CODE (x) == SIGN_EXTEND)
4098 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4099 index = XEXP (x, 0);
4100 shift = 0;
4102 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4103 else if (GET_CODE (x) == MULT
4104 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4105 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4106 && GET_MODE (XEXP (x, 0)) == DImode
4107 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4108 && CONST_INT_P (XEXP (x, 1)))
4110 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4111 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4112 index = XEXP (XEXP (x, 0), 0);
4113 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4115 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4116 else if (GET_CODE (x) == ASHIFT
4117 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4118 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4119 && GET_MODE (XEXP (x, 0)) == DImode
4120 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4121 && CONST_INT_P (XEXP (x, 1)))
4123 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4124 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4125 index = XEXP (XEXP (x, 0), 0);
4126 shift = INTVAL (XEXP (x, 1));
4128 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4129 else if ((GET_CODE (x) == SIGN_EXTRACT
4130 || GET_CODE (x) == ZERO_EXTRACT)
4131 && GET_MODE (x) == DImode
4132 && GET_CODE (XEXP (x, 0)) == MULT
4133 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4134 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4136 type = (GET_CODE (x) == SIGN_EXTRACT)
4137 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4138 index = XEXP (XEXP (x, 0), 0);
4139 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4140 if (INTVAL (XEXP (x, 1)) != 32 + shift
4141 || INTVAL (XEXP (x, 2)) != 0)
4142 shift = -1;
4144 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4145 (const_int 0xffffffff<<shift)) */
4146 else if (GET_CODE (x) == AND
4147 && GET_MODE (x) == DImode
4148 && GET_CODE (XEXP (x, 0)) == MULT
4149 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4150 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4151 && CONST_INT_P (XEXP (x, 1)))
4153 type = ADDRESS_REG_UXTW;
4154 index = XEXP (XEXP (x, 0), 0);
4155 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4156 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4157 shift = -1;
4159 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4160 else if ((GET_CODE (x) == SIGN_EXTRACT
4161 || GET_CODE (x) == ZERO_EXTRACT)
4162 && GET_MODE (x) == DImode
4163 && GET_CODE (XEXP (x, 0)) == ASHIFT
4164 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4165 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4167 type = (GET_CODE (x) == SIGN_EXTRACT)
4168 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4169 index = XEXP (XEXP (x, 0), 0);
4170 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4171 if (INTVAL (XEXP (x, 1)) != 32 + shift
4172 || INTVAL (XEXP (x, 2)) != 0)
4173 shift = -1;
4175 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4176 (const_int 0xffffffff<<shift)) */
4177 else if (GET_CODE (x) == AND
4178 && GET_MODE (x) == DImode
4179 && GET_CODE (XEXP (x, 0)) == ASHIFT
4180 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4181 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4182 && CONST_INT_P (XEXP (x, 1)))
4184 type = ADDRESS_REG_UXTW;
4185 index = XEXP (XEXP (x, 0), 0);
4186 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4187 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4188 shift = -1;
4190 /* (mult:P (reg:P) (const_int scale)) */
4191 else if (GET_CODE (x) == MULT
4192 && GET_MODE (x) == Pmode
4193 && GET_MODE (XEXP (x, 0)) == Pmode
4194 && CONST_INT_P (XEXP (x, 1)))
4196 type = ADDRESS_REG_REG;
4197 index = XEXP (x, 0);
4198 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4200 /* (ashift:P (reg:P) (const_int shift)) */
4201 else if (GET_CODE (x) == ASHIFT
4202 && GET_MODE (x) == Pmode
4203 && GET_MODE (XEXP (x, 0)) == Pmode
4204 && CONST_INT_P (XEXP (x, 1)))
4206 type = ADDRESS_REG_REG;
4207 index = XEXP (x, 0);
4208 shift = INTVAL (XEXP (x, 1));
4210 else
4211 return false;
4213 if (GET_CODE (index) == SUBREG)
4214 index = SUBREG_REG (index);
4216 if ((shift == 0 ||
4217 (shift > 0 && shift <= 3
4218 && (1 << shift) == GET_MODE_SIZE (mode)))
4219 && REG_P (index)
4220 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4222 info->type = type;
4223 info->offset = index;
4224 info->shift = shift;
4225 return true;
4228 return false;
4231 /* Return true if MODE is one of the modes for which we
4232 support LDP/STP operations. */
4234 static bool
4235 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4237 return mode == SImode || mode == DImode
4238 || mode == SFmode || mode == DFmode
4239 || (aarch64_vector_mode_supported_p (mode)
4240 && GET_MODE_SIZE (mode) == 8);
4243 /* Return true if REGNO is a virtual pointer register, or an eliminable
4244 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4245 include stack_pointer or hard_frame_pointer. */
4246 static bool
4247 virt_or_elim_regno_p (unsigned regno)
4249 return ((regno >= FIRST_VIRTUAL_REGISTER
4250 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4251 || regno == FRAME_POINTER_REGNUM
4252 || regno == ARG_POINTER_REGNUM);
4255 /* Return true if X is a valid address for machine mode MODE. If it is,
4256 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4257 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4259 static bool
4260 aarch64_classify_address (struct aarch64_address_info *info,
4261 rtx x, machine_mode mode,
4262 RTX_CODE outer_code, bool strict_p)
4264 enum rtx_code code = GET_CODE (x);
4265 rtx op0, op1;
4267 /* On BE, we use load/store pair for all large int mode load/stores. */
4268 bool load_store_pair_p = (outer_code == PARALLEL
4269 || (BYTES_BIG_ENDIAN
4270 && aarch64_vect_struct_mode_p (mode)));
4272 bool allow_reg_index_p =
4273 !load_store_pair_p
4274 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4275 && !aarch64_vect_struct_mode_p (mode);
4277 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4278 REG addressing. */
4279 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4280 && (code != POST_INC && code != REG))
4281 return false;
4283 switch (code)
4285 case REG:
4286 case SUBREG:
4287 info->type = ADDRESS_REG_IMM;
4288 info->base = x;
4289 info->offset = const0_rtx;
4290 return aarch64_base_register_rtx_p (x, strict_p);
4292 case PLUS:
4293 op0 = XEXP (x, 0);
4294 op1 = XEXP (x, 1);
4296 if (! strict_p
4297 && REG_P (op0)
4298 && virt_or_elim_regno_p (REGNO (op0))
4299 && CONST_INT_P (op1))
4301 info->type = ADDRESS_REG_IMM;
4302 info->base = op0;
4303 info->offset = op1;
4305 return true;
4308 if (GET_MODE_SIZE (mode) != 0
4309 && CONST_INT_P (op1)
4310 && aarch64_base_register_rtx_p (op0, strict_p))
4312 HOST_WIDE_INT offset = INTVAL (op1);
4314 info->type = ADDRESS_REG_IMM;
4315 info->base = op0;
4316 info->offset = op1;
4318 /* TImode and TFmode values are allowed in both pairs of X
4319 registers and individual Q registers. The available
4320 address modes are:
4321 X,X: 7-bit signed scaled offset
4322 Q: 9-bit signed offset
4323 We conservatively require an offset representable in either mode.
4324 When performing the check for pairs of X registers i.e. LDP/STP
4325 pass down DImode since that is the natural size of the LDP/STP
4326 instruction memory accesses. */
4327 if (mode == TImode || mode == TFmode)
4328 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4329 && offset_9bit_signed_unscaled_p (mode, offset));
4331 /* A 7-bit offset check because OImode will emit an ldp/stp
4332 instruction (only big endian will get here).
4333 For ldp/stp instructions, the offset is scaled for the size of a
4334 single element of the pair. */
4335 if (mode == OImode)
4336 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4338 /* Three 9/12-bit offset checks because CImode will emit three
4339 ldr/str instructions (only big endian will get here). */
4340 if (mode == CImode)
4341 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4342 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4343 || offset_12bit_unsigned_scaled_p (V16QImode,
4344 offset + 32)));
4346 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4347 instructions (only big endian will get here). */
4348 if (mode == XImode)
4349 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4350 && aarch64_offset_7bit_signed_scaled_p (TImode,
4351 offset + 32));
4353 if (load_store_pair_p)
4354 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4355 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4356 else
4357 return (offset_9bit_signed_unscaled_p (mode, offset)
4358 || offset_12bit_unsigned_scaled_p (mode, offset));
4361 if (allow_reg_index_p)
4363 /* Look for base + (scaled/extended) index register. */
4364 if (aarch64_base_register_rtx_p (op0, strict_p)
4365 && aarch64_classify_index (info, op1, mode, strict_p))
4367 info->base = op0;
4368 return true;
4370 if (aarch64_base_register_rtx_p (op1, strict_p)
4371 && aarch64_classify_index (info, op0, mode, strict_p))
4373 info->base = op1;
4374 return true;
4378 return false;
4380 case POST_INC:
4381 case POST_DEC:
4382 case PRE_INC:
4383 case PRE_DEC:
4384 info->type = ADDRESS_REG_WB;
4385 info->base = XEXP (x, 0);
4386 info->offset = NULL_RTX;
4387 return aarch64_base_register_rtx_p (info->base, strict_p);
4389 case POST_MODIFY:
4390 case PRE_MODIFY:
4391 info->type = ADDRESS_REG_WB;
4392 info->base = XEXP (x, 0);
4393 if (GET_CODE (XEXP (x, 1)) == PLUS
4394 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4395 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4396 && aarch64_base_register_rtx_p (info->base, strict_p))
4398 HOST_WIDE_INT offset;
4399 info->offset = XEXP (XEXP (x, 1), 1);
4400 offset = INTVAL (info->offset);
4402 /* TImode and TFmode values are allowed in both pairs of X
4403 registers and individual Q registers. The available
4404 address modes are:
4405 X,X: 7-bit signed scaled offset
4406 Q: 9-bit signed offset
4407 We conservatively require an offset representable in either mode. */
4409 if (mode == TImode || mode == TFmode)
4410 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4411 && offset_9bit_signed_unscaled_p (mode, offset));
4413 if (load_store_pair_p)
4414 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4415 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4416 else
4417 return offset_9bit_signed_unscaled_p (mode, offset);
4419 return false;
4421 case CONST:
4422 case SYMBOL_REF:
4423 case LABEL_REF:
4424 /* load literal: pc-relative constant pool entry. Only supported
4425 for SI mode or larger. */
4426 info->type = ADDRESS_SYMBOLIC;
4428 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4430 rtx sym, addend;
4432 split_const (x, &sym, &addend);
4433 return ((GET_CODE (sym) == LABEL_REF
4434 || (GET_CODE (sym) == SYMBOL_REF
4435 && CONSTANT_POOL_ADDRESS_P (sym)
4436 && aarch64_pcrelative_literal_loads)));
4438 return false;
4440 case LO_SUM:
4441 info->type = ADDRESS_LO_SUM;
4442 info->base = XEXP (x, 0);
4443 info->offset = XEXP (x, 1);
4444 if (allow_reg_index_p
4445 && aarch64_base_register_rtx_p (info->base, strict_p))
4447 rtx sym, offs;
4448 split_const (info->offset, &sym, &offs);
4449 if (GET_CODE (sym) == SYMBOL_REF
4450 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4452 /* The symbol and offset must be aligned to the access size. */
4453 unsigned int align;
4454 unsigned int ref_size;
4456 if (CONSTANT_POOL_ADDRESS_P (sym))
4457 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4458 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4460 tree exp = SYMBOL_REF_DECL (sym);
4461 align = TYPE_ALIGN (TREE_TYPE (exp));
4462 align = CONSTANT_ALIGNMENT (exp, align);
4464 else if (SYMBOL_REF_DECL (sym))
4465 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4466 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4467 && SYMBOL_REF_BLOCK (sym) != NULL)
4468 align = SYMBOL_REF_BLOCK (sym)->alignment;
4469 else
4470 align = BITS_PER_UNIT;
4472 ref_size = GET_MODE_SIZE (mode);
4473 if (ref_size == 0)
4474 ref_size = GET_MODE_SIZE (DImode);
4476 return ((INTVAL (offs) & (ref_size - 1)) == 0
4477 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4480 return false;
4482 default:
4483 return false;
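/* Return true if X is a symbolic or label address, possibly plus a
   constant offset. */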
4487 bool
4488 aarch64_symbolic_address_p (rtx x)
4490 rtx offset;
4492 split_const (x, &x, &offset);
4493 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4496 /* Classify the base of symbolic expression X. */
4498 enum aarch64_symbol_type
4499 aarch64_classify_symbolic_expression (rtx x)
4501 rtx offset;
4503 split_const (x, &x, &offset);
4504 return aarch64_classify_symbol (x, offset);
4508 /* Return TRUE if X is a legitimate address for accessing memory in
4509 mode MODE. */
4510 static bool
4511 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4513 struct aarch64_address_info addr;
4515 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4518 /* Return TRUE if X is a legitimate address for accessing memory in
4519 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4520 pair operation. */
4521 bool
4522 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4523 RTX_CODE outer_code, bool strict_p)
4525 struct aarch64_address_info addr;
4527 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4530 /* Split an out-of-range address displacement into a base and offset.
4531 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4532 to increase opportunities for sharing the base address between accesses of different sizes.
4533 For TI/TFmode and unaligned accesses use a 256-byte range. */
4534 static bool
4535 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4537 HOST_WIDE_INT mask = GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3fff;
4539 if (mode == TImode || mode == TFmode ||
4540 (INTVAL (*disp) & (GET_MODE_SIZE (mode) - 1)) != 0)
4541 mask = 0xff;
4543 *off = GEN_INT (INTVAL (*disp) & ~mask);
4544 *disp = GEN_INT (INTVAL (*disp) & mask);
4545 return true;
4548 /* Return TRUE if rtx X is immediate constant 0.0 */
4549 bool
4550 aarch64_float_const_zero_rtx_p (rtx x)
4552 if (GET_MODE (x) == VOIDmode)
4553 return false;
4555 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4556 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4557 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4560 /* Return the fixed registers used for condition codes. */
4562 static bool
4563 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4565 *p1 = CC_REGNUM;
4566 *p2 = INVALID_REGNUM;
4567 return true;
4570 /* Emit call insn with PAT and do aarch64-specific handling. */
4572 void
4573 aarch64_emit_call_insn (rtx pat)
4575 rtx insn = emit_call_insn (pat);
4577 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4578 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4579 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
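/* Choose the condition code mode used for the comparison of X and Y
   with rtx code CODE. */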
4582 machine_mode
4583 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4585 /* Floating point compares return CCFPE for the ordered LT, LE, GT and GE
4586 comparisons, and CCFP for everything else. */
4587 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4589 switch (code)
4591 case EQ:
4592 case NE:
4593 case UNORDERED:
4594 case ORDERED:
4595 case UNLT:
4596 case UNLE:
4597 case UNGT:
4598 case UNGE:
4599 case UNEQ:
4600 case LTGT:
4601 return CCFPmode;
4603 case LT:
4604 case LE:
4605 case GT:
4606 case GE:
4607 return CCFPEmode;
4609 default:
4610 gcc_unreachable ();
4614 /* Equality comparisons of short modes against zero can be performed
4615 using the TST instruction with the appropriate bitmask. */
4616 if (y == const0_rtx && REG_P (x)
4617 && (code == EQ || code == NE)
4618 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4619 return CC_NZmode;
4621 /* Similarly, comparisons of zero_extends from shorter modes can
4622 be performed using an ANDS with an immediate mask. */
4623 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4624 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4625 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4626 && (code == EQ || code == NE))
4627 return CC_NZmode;
4629 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4630 && y == const0_rtx
4631 && (code == EQ || code == NE || code == LT || code == GE)
4632 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4633 || GET_CODE (x) == NEG
4634 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4635 && CONST_INT_P (XEXP (x, 2)))))
4636 return CC_NZmode;
4638 /* A compare with a shifted operand. Because of canonicalization,
4639 the comparison will have to be swapped when we emit the assembly
4640 code. */
4641 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4642 && (REG_P (y) || GET_CODE (y) == SUBREG)
4643 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4644 || GET_CODE (x) == LSHIFTRT
4645 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4646 return CC_SWPmode;
4648 /* Similarly for a negated operand, but we can only do this for
4649 equalities. */
4650 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4651 && (REG_P (y) || GET_CODE (y) == SUBREG)
4652 && (code == EQ || code == NE)
4653 && GET_CODE (x) == NEG)
4654 return CC_Zmode;
4656 /* A test for unsigned overflow. */
4657 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4658 && code == NE
4659 && GET_CODE (x) == PLUS
4660 && GET_CODE (y) == ZERO_EXTEND)
4661 return CC_Cmode;
4663 /* For everything else, return CCmode. */
4664 return CCmode;
4667 static int
4668 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
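/* Return the AARCH64_* condition code for comparison rtx X, or -1 if
   there is no direct mapping. */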
4671 aarch64_get_condition_code (rtx x)
4673 machine_mode mode = GET_MODE (XEXP (x, 0));
4674 enum rtx_code comp_code = GET_CODE (x);
4676 if (GET_MODE_CLASS (mode) != MODE_CC)
4677 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4678 return aarch64_get_condition_code_1 (mode, comp_code);
4681 static int
4682 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4684 switch (mode)
4686 case CCFPmode:
4687 case CCFPEmode:
4688 switch (comp_code)
4690 case GE: return AARCH64_GE;
4691 case GT: return AARCH64_GT;
4692 case LE: return AARCH64_LS;
4693 case LT: return AARCH64_MI;
4694 case NE: return AARCH64_NE;
4695 case EQ: return AARCH64_EQ;
4696 case ORDERED: return AARCH64_VC;
4697 case UNORDERED: return AARCH64_VS;
4698 case UNLT: return AARCH64_LT;
4699 case UNLE: return AARCH64_LE;
4700 case UNGT: return AARCH64_HI;
4701 case UNGE: return AARCH64_PL;
4702 default: return -1;
4704 break;
4706 case CCmode:
4707 switch (comp_code)
4709 case NE: return AARCH64_NE;
4710 case EQ: return AARCH64_EQ;
4711 case GE: return AARCH64_GE;
4712 case GT: return AARCH64_GT;
4713 case LE: return AARCH64_LE;
4714 case LT: return AARCH64_LT;
4715 case GEU: return AARCH64_CS;
4716 case GTU: return AARCH64_HI;
4717 case LEU: return AARCH64_LS;
4718 case LTU: return AARCH64_CC;
4719 default: return -1;
4721 break;
4723 case CC_SWPmode:
4724 switch (comp_code)
4726 case NE: return AARCH64_NE;
4727 case EQ: return AARCH64_EQ;
4728 case GE: return AARCH64_LE;
4729 case GT: return AARCH64_LT;
4730 case LE: return AARCH64_GE;
4731 case LT: return AARCH64_GT;
4732 case GEU: return AARCH64_LS;
4733 case GTU: return AARCH64_CC;
4734 case LEU: return AARCH64_CS;
4735 case LTU: return AARCH64_HI;
4736 default: return -1;
4738 break;
4740 case CC_NZmode:
4741 switch (comp_code)
4743 case NE: return AARCH64_NE;
4744 case EQ: return AARCH64_EQ;
4745 case GE: return AARCH64_PL;
4746 case LT: return AARCH64_MI;
4747 default: return -1;
4749 break;
4751 case CC_Zmode:
4752 switch (comp_code)
4754 case NE: return AARCH64_NE;
4755 case EQ: return AARCH64_EQ;
4756 default: return -1;
4758 break;
4760 case CC_Cmode:
4761 switch (comp_code)
4763 case NE: return AARCH64_CS;
4764 case EQ: return AARCH64_CC;
4765 default: return -1;
4767 break;
4769 default:
4770 return -1;
4773 return -1;
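/* Return true if X is a CONST_VECTOR of integers whose elements are all
   equal and lie within [MINVAL, MAXVAL]. */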
4776 bool
4777 aarch64_const_vec_all_same_in_range_p (rtx x,
4778 HOST_WIDE_INT minval,
4779 HOST_WIDE_INT maxval)
4781 HOST_WIDE_INT firstval;
4782 int count, i;
4784 if (GET_CODE (x) != CONST_VECTOR
4785 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4786 return false;
4788 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4789 if (firstval < minval || firstval > maxval)
4790 return false;
4792 count = CONST_VECTOR_NUNITS (x);
4793 for (i = 1; i < count; i++)
4794 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4795 return false;
4797 return true;
4800 bool
4801 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4803 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4807 /* N Z C V. */
4808 #define AARCH64_CC_V 1
4809 #define AARCH64_CC_C (1 << 1)
4810 #define AARCH64_CC_Z (1 << 2)
4811 #define AARCH64_CC_N (1 << 3)
4813 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4814 static const int aarch64_nzcv_codes[] =
4816 0, /* EQ, Z == 1. */
4817 AARCH64_CC_Z, /* NE, Z == 0. */
4818 0, /* CS, C == 1. */
4819 AARCH64_CC_C, /* CC, C == 0. */
4820 0, /* MI, N == 1. */
4821 AARCH64_CC_N, /* PL, N == 0. */
4822 0, /* VS, V == 1. */
4823 AARCH64_CC_V, /* VC, V == 0. */
4824 0, /* HI, C == 1 && Z == 0. */
4825 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4826 AARCH64_CC_V, /* GE, N == V. */
4827 0, /* LT, N != V. */
4828 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4829 0, /* LE, !(Z == 0 && N == V). */
4830 0, /* AL, Any. */
4831 0 /* NV, Any. */
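/* Print operand X to file F, applying the output modifier character CODE
   (zero for a plain operand). */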
4834 static void
4835 aarch64_print_operand (FILE *f, rtx x, int code)
4837 switch (code)
4839 /* An integer or symbol address without a preceding # sign. */
4840 case 'c':
4841 switch (GET_CODE (x))
4843 case CONST_INT:
4844 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4845 break;
4847 case SYMBOL_REF:
4848 output_addr_const (f, x);
4849 break;
4851 case CONST:
4852 if (GET_CODE (XEXP (x, 0)) == PLUS
4853 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4855 output_addr_const (f, x);
4856 break;
4858 /* Fall through. */
4860 default:
4861 output_operand_lossage ("Unsupported operand for code '%c'", code);
4863 break;
4865 case 'e':
4866 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4868 int n;
4870 if (!CONST_INT_P (x)
4871 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4873 output_operand_lossage ("invalid operand for '%%%c'", code);
4874 return;
4877 switch (n)
4879 case 3:
4880 fputc ('b', f);
4881 break;
4882 case 4:
4883 fputc ('h', f);
4884 break;
4885 case 5:
4886 fputc ('w', f);
4887 break;
4888 default:
4889 output_operand_lossage ("invalid operand for '%%%c'", code);
4890 return;
4893 break;
4895 case 'p':
4897 int n;
4899 /* Print N such that 2^N == X. */
4900 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4902 output_operand_lossage ("invalid operand for '%%%c'", code);
4903 return;
4906 asm_fprintf (f, "%d", n);
4908 break;
4910 case 'P':
4911 /* Print the number of non-zero bits in X (a const_int). */
4912 if (!CONST_INT_P (x))
4914 output_operand_lossage ("invalid operand for '%%%c'", code);
4915 return;
4918 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4919 break;
4921 case 'H':
4922 /* Print the higher numbered register of a pair (TImode) of regs. */
4923 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4925 output_operand_lossage ("invalid operand for '%%%c'", code);
4926 return;
4929 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4930 break;
4932 case 'M':
4933 case 'm':
4935 int cond_code;
4936 /* Print a condition (eq, ne, etc) or its inverse. */
4938 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4939 if (x == const_true_rtx)
4941 if (code == 'M')
4942 fputs ("nv", f);
4943 return;
4946 if (!COMPARISON_P (x))
4948 output_operand_lossage ("invalid operand for '%%%c'", code);
4949 return;
4952 cond_code = aarch64_get_condition_code (x);
4953 gcc_assert (cond_code >= 0);
4954 if (code == 'M')
4955 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4956 fputs (aarch64_condition_codes[cond_code], f);
4958 break;
4960 case 'b':
4961 case 'h':
4962 case 's':
4963 case 'd':
4964 case 'q':
4965 /* Print a scalar FP/SIMD register name. */
4966 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4968 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4969 return;
4971 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4972 break;
4974 case 'S':
4975 case 'T':
4976 case 'U':
4977 case 'V':
4978 /* Print the first FP/SIMD register name in a list. */
4979 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4981 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4982 return;
4984 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4985 break;
4987 case 'R':
4988 /* Print a scalar FP/SIMD register name + 1. */
4989 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4991 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4992 return;
4994 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4995 break;
4997 case 'X':
4998 /* Print bottom 16 bits of integer constant in hex. */
4999 if (!CONST_INT_P (x))
5001 output_operand_lossage ("invalid operand for '%%%c'", code);
5002 return;
5004 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5005 break;
5007 case 'w':
5008 case 'x':
5009 /* Print a general register name or the zero register (32-bit or
5010 64-bit). */
5011 if (x == const0_rtx
5012 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5014 asm_fprintf (f, "%czr", code);
5015 break;
5018 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5020 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5021 break;
5024 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5026 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5027 break;
5030 /* Fall through */
5032 case 0:
5033 /* Print a normal operand; if it's a general register, then we
5034 assume DImode. */
5035 if (x == NULL)
5037 output_operand_lossage ("missing operand");
5038 return;
5041 switch (GET_CODE (x))
5043 case REG:
5044 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5045 break;
5047 case MEM:
5048 output_address (GET_MODE (x), XEXP (x, 0));
5049 break;
5051 case CONST:
5052 case LABEL_REF:
5053 case SYMBOL_REF:
5054 output_addr_const (asm_out_file, x);
5055 break;
5057 case CONST_INT:
5058 asm_fprintf (f, "%wd", INTVAL (x));
5059 break;
5061 case CONST_VECTOR:
5062 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5064 gcc_assert (
5065 aarch64_const_vec_all_same_in_range_p (x,
5066 HOST_WIDE_INT_MIN,
5067 HOST_WIDE_INT_MAX));
5068 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5070 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5072 fputc ('0', f);
5074 else
5075 gcc_unreachable ();
5076 break;
5078 case CONST_DOUBLE:
5079 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5080 be getting CONST_DOUBLEs holding integers. */
5081 gcc_assert (GET_MODE (x) != VOIDmode);
5082 if (aarch64_float_const_zero_rtx_p (x))
5084 fputc ('0', f);
5085 break;
5087 else if (aarch64_float_const_representable_p (x))
5089 #define buf_size 20
5090 char float_buf[buf_size] = {'\0'};
5091 real_to_decimal_for_mode (float_buf,
5092 CONST_DOUBLE_REAL_VALUE (x),
5093 buf_size, buf_size,
5094 1, GET_MODE (x));
5095 asm_fprintf (asm_out_file, "%s", float_buf);
5096 break;
5097 #undef buf_size
5099 output_operand_lossage ("invalid constant");
5100 return;
5101 default:
5102 output_operand_lossage ("invalid operand");
5103 return;
5105 break;
5107 case 'A':
5108 if (GET_CODE (x) == HIGH)
5109 x = XEXP (x, 0);
5111 switch (aarch64_classify_symbolic_expression (x))
5113 case SYMBOL_SMALL_GOT_4G:
5114 asm_fprintf (asm_out_file, ":got:");
5115 break;
5117 case SYMBOL_SMALL_TLSGD:
5118 asm_fprintf (asm_out_file, ":tlsgd:");
5119 break;
5121 case SYMBOL_SMALL_TLSDESC:
5122 asm_fprintf (asm_out_file, ":tlsdesc:");
5123 break;
5125 case SYMBOL_SMALL_TLSIE:
5126 asm_fprintf (asm_out_file, ":gottprel:");
5127 break;
5129 case SYMBOL_TLSLE24:
5130 asm_fprintf (asm_out_file, ":tprel:");
5131 break;
5133 case SYMBOL_TINY_GOT:
5134 gcc_unreachable ();
5135 break;
5137 default:
5138 break;
5140 output_addr_const (asm_out_file, x);
5141 break;
5143 case 'L':
5144 switch (aarch64_classify_symbolic_expression (x))
5146 case SYMBOL_SMALL_GOT_4G:
5147 asm_fprintf (asm_out_file, ":lo12:");
5148 break;
5150 case SYMBOL_SMALL_TLSGD:
5151 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5152 break;
5154 case SYMBOL_SMALL_TLSDESC:
5155 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5156 break;
5158 case SYMBOL_SMALL_TLSIE:
5159 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5160 break;
5162 case SYMBOL_TLSLE12:
5163 asm_fprintf (asm_out_file, ":tprel_lo12:");
5164 break;
5166 case SYMBOL_TLSLE24:
5167 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5168 break;
5170 case SYMBOL_TINY_GOT:
5171 asm_fprintf (asm_out_file, ":got:");
5172 break;
5174 case SYMBOL_TINY_TLSIE:
5175 asm_fprintf (asm_out_file, ":gottprel:");
5176 break;
5178 default:
5179 break;
5181 output_addr_const (asm_out_file, x);
5182 break;
5184 case 'G':
5186 switch (aarch64_classify_symbolic_expression (x))
5188 case SYMBOL_TLSLE24:
5189 asm_fprintf (asm_out_file, ":tprel_hi12:");
5190 break;
5191 default:
5192 break;
5194 output_addr_const (asm_out_file, x);
5195 break;
5197 case 'k':
5199 HOST_WIDE_INT cond_code;
5200 /* Print nzcv. */
5202 if (!CONST_INT_P (x))
5204 output_operand_lossage ("invalid operand for '%%%c'", code);
5205 return;
5208 cond_code = INTVAL (x);
5209 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5210 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5212 break;
5214 default:
5215 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5216 return;
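/* Print memory address X of mode MODE to file F using AArch64 assembly
   syntax. */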
5220 static void
5221 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5223 struct aarch64_address_info addr;
5225 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5226 switch (addr.type)
5228 case ADDRESS_REG_IMM:
5229 if (addr.offset == const0_rtx)
5230 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5231 else
5232 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5233 INTVAL (addr.offset));
5234 return;
5236 case ADDRESS_REG_REG:
5237 if (addr.shift == 0)
5238 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5239 reg_names [REGNO (addr.offset)]);
5240 else
5241 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5242 reg_names [REGNO (addr.offset)], addr.shift);
5243 return;
5245 case ADDRESS_REG_UXTW:
5246 if (addr.shift == 0)
5247 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5248 REGNO (addr.offset) - R0_REGNUM);
5249 else
5250 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5251 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5252 return;
5254 case ADDRESS_REG_SXTW:
5255 if (addr.shift == 0)
5256 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5257 REGNO (addr.offset) - R0_REGNUM);
5258 else
5259 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5260 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5261 return;
5263 case ADDRESS_REG_WB:
5264 switch (GET_CODE (x))
5266 case PRE_INC:
5267 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5268 GET_MODE_SIZE (mode));
5269 return;
5270 case POST_INC:
5271 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5272 GET_MODE_SIZE (mode));
5273 return;
5274 case PRE_DEC:
5275 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5276 GET_MODE_SIZE (mode));
5277 return;
5278 case POST_DEC:
5279 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5280 GET_MODE_SIZE (mode));
5281 return;
5282 case PRE_MODIFY:
5283 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5284 INTVAL (addr.offset));
5285 return;
5286 case POST_MODIFY:
5287 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5288 INTVAL (addr.offset));
5289 return;
5290 default:
5291 break;
5293 break;
5295 case ADDRESS_LO_SUM:
5296 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5297 output_addr_const (f, addr.offset);
5298 asm_fprintf (f, "]");
5299 return;
5301 case ADDRESS_SYMBOLIC:
5302 break;
5305 output_addr_const (f, x);
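/* Return true if X mentions a label, ignoring the label references that
   appear inside UNSPEC_TLS. */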
5308 bool
5309 aarch64_label_mentioned_p (rtx x)
5311 const char *fmt;
5312 int i;
5314 if (GET_CODE (x) == LABEL_REF)
5315 return true;
5317 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5318 referencing instruction, but they are constant offsets, not
5319 symbols. */
5320 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5321 return false;
5323 fmt = GET_RTX_FORMAT (GET_CODE (x));
5324 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5326 if (fmt[i] == 'E')
5328 int j;
5330 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5331 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5332 return 1;
5334 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5335 return 1;
5338 return 0;
5341 /* Implement REGNO_REG_CLASS. */
5343 enum reg_class
5344 aarch64_regno_regclass (unsigned regno)
5346 if (GP_REGNUM_P (regno))
5347 return GENERAL_REGS;
5349 if (regno == SP_REGNUM)
5350 return STACK_REG;
5352 if (regno == FRAME_POINTER_REGNUM
5353 || regno == ARG_POINTER_REGNUM)
5354 return POINTER_REGS;
5356 if (FP_REGNUM_P (regno))
5357 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5359 return NO_REGS;
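/* Attempt to rewrite the non-legitimate address X for mode MODE into a
   legitimate form; return X itself if no improvement is found. */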
5362 static rtx
5363 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5365 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5366 where mask is selected by alignment and size of the offset.
5367 We try to pick as large a range for the offset as possible to
5368 maximize the chance of a CSE. However, for aligned addresses
5369 we limit the range to 4k so that structures with different sized
5370 elements are likely to use the same base. We need to be careful
5371 not to split a CONST for some forms of address expression, otherwise
5372 it will generate sub-optimal code. */
5374 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5376 rtx base = XEXP (x, 0);
5377 rtx offset_rtx = XEXP (x, 1);
5378 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5380 if (GET_CODE (base) == PLUS)
5382 rtx op0 = XEXP (base, 0);
5383 rtx op1 = XEXP (base, 1);
5385 /* Force any scaling into a temp for CSE. */
5386 op0 = force_reg (Pmode, op0);
5387 op1 = force_reg (Pmode, op1);
5389 /* Let the pointer register be in op0. */
5390 if (REG_POINTER (op1))
5391 std::swap (op0, op1);
5393 /* If the pointer is virtual or frame related, then we know that
5394 virtual register instantiation or register elimination is going
5395 to apply a second constant. We want the two constants folded
5396 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5397 if (virt_or_elim_regno_p (REGNO (op0)))
5399 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5400 NULL_RTX, true, OPTAB_DIRECT);
5401 return gen_rtx_PLUS (Pmode, base, op1);
5404 /* Otherwise, in order to encourage CSE (and thence loop strength
5405 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
5406 base = expand_binop (Pmode, add_optab, op0, op1,
5407 NULL_RTX, true, OPTAB_DIRECT);
5408 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5411 /* Does it look like we'll need a load/store-pair operation? */
5412 HOST_WIDE_INT base_offset;
5413 if (GET_MODE_SIZE (mode) > 16
5414 || mode == TImode)
5415 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5416 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5417 /* For offsets that aren't a multiple of the access size, the limit is
5418 -256...255. */
5419 else if (offset & (GET_MODE_SIZE (mode) - 1))
5421 base_offset = (offset + 0x100) & ~0x1ff;
5423 /* BLKmode typically uses LDP of X-registers. */
5424 if (mode == BLKmode)
5425 base_offset = (offset + 512) & ~0x3ff;
5427 /* Small negative offsets are supported. */
5428 else if (IN_RANGE (offset, -256, 0))
5429 base_offset = 0;
5430 /* Use a 12-bit offset scaled by the access size. */
5431 else
5432 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5434 if (base_offset != 0)
5436 base = plus_constant (Pmode, base, base_offset);
5437 base = force_operand (base, NULL_RTX);
5438 return plus_constant (Pmode, base, offset - base_offset);
5442 return x;
5445 /* Return the reload icode required for a constant pool in mode. */
5446 static enum insn_code
5447 aarch64_constant_pool_reload_icode (machine_mode mode)
5449 switch (mode)
5451 case SFmode:
5452 return CODE_FOR_aarch64_reload_movcpsfdi;
5454 case DFmode:
5455 return CODE_FOR_aarch64_reload_movcpdfdi;
5457 case TFmode:
5458 return CODE_FOR_aarch64_reload_movcptfdi;
5460 case V8QImode:
5461 return CODE_FOR_aarch64_reload_movcpv8qidi;
5463 case V16QImode:
5464 return CODE_FOR_aarch64_reload_movcpv16qidi;
5466 case V4HImode:
5467 return CODE_FOR_aarch64_reload_movcpv4hidi;
5469 case V8HImode:
5470 return CODE_FOR_aarch64_reload_movcpv8hidi;
5472 case V2SImode:
5473 return CODE_FOR_aarch64_reload_movcpv2sidi;
5475 case V4SImode:
5476 return CODE_FOR_aarch64_reload_movcpv4sidi;
5478 case V2DImode:
5479 return CODE_FOR_aarch64_reload_movcpv2didi;
5481 case V2DFmode:
5482 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5484 default:
5485 gcc_unreachable ();
5488 gcc_unreachable ();
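/* Decide whether moving X of mode MODE into a register of class RCLASS
   requires a secondary reload; if so, record the reload pattern in SRI.
   Return the class of any intermediate register needed, or NO_REGS. */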
5490 static reg_class_t
5491 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5492 reg_class_t rclass,
5493 machine_mode mode,
5494 secondary_reload_info *sri)
5497 /* If we have to disable direct literal pool loads and stores because the
5498 function is too big, then we need a scratch register. */
5499 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5500 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5501 || targetm.vector_mode_supported_p (GET_MODE (x)))
5502 && !aarch64_pcrelative_literal_loads)
5504 sri->icode = aarch64_constant_pool_reload_icode (mode);
5505 return NO_REGS;
5508 /* Without the TARGET_SIMD instructions we cannot move a Q register
5509 to a Q register directly. We need a scratch. */
5510 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5511 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5512 && reg_class_subset_p (rclass, FP_REGS))
5514 if (mode == TFmode)
5515 sri->icode = CODE_FOR_aarch64_reload_movtf;
5516 else if (mode == TImode)
5517 sri->icode = CODE_FOR_aarch64_reload_movti;
5518 return NO_REGS;
5521 /* A TFmode or TImode memory access should be handled via an FP register
5522 because AArch64 has richer addressing modes for LDR/STR instructions
5523 than LDP/STP instructions. */
5524 if (TARGET_FLOAT && rclass == GENERAL_REGS
5525 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5526 return FP_REGS;
5528 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5529 return GENERAL_REGS;
5531 return NO_REGS;
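/* Return true if eliminating register FROM in favour of register TO is
   currently allowed. */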
5534 static bool
5535 aarch64_can_eliminate (const int from, const int to)
5537 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5538 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5540 if (frame_pointer_needed)
5542 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5543 return true;
5544 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5545 return false;
5546 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5547 && !cfun->calls_alloca)
5548 return true;
5549 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5550 return true;
5552 return false;
5554 else
5556 /* If we decided that we didn't need a leaf frame pointer but then used
5557 LR in the function, then we'll want a frame pointer after all, so
5558 prevent this elimination to ensure a frame pointer is used. */
5559 if (to == STACK_POINTER_REGNUM
5560 && flag_omit_leaf_frame_pointer
5561 && df_regs_ever_live_p (LR_REGNUM))
5562 return false;
5565 return true;
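/* Return the byte offset between eliminable registers FROM and TO,
   laying out the frame first if necessary. */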
5568 HOST_WIDE_INT
5569 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5571 aarch64_layout_frame ();
5573 if (to == HARD_FRAME_POINTER_REGNUM)
5575 if (from == ARG_POINTER_REGNUM)
5576 return cfun->machine->frame.hard_fp_offset;
5578 if (from == FRAME_POINTER_REGNUM)
5579 return cfun->machine->frame.hard_fp_offset
5580 - cfun->machine->frame.locals_offset;
5583 if (to == STACK_POINTER_REGNUM)
5585 if (from == FRAME_POINTER_REGNUM)
5586 return cfun->machine->frame.frame_size
5587 - cfun->machine->frame.locals_offset;
5590 return cfun->machine->frame.frame_size;
5593 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5594 previous frame. */
5597 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5599 if (count != 0)
5600 return const0_rtx;
5601 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5605 static void
5606 aarch64_asm_trampoline_template (FILE *f)
5608 if (TARGET_ILP32)
5610 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5611 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5613 else
5615 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5616 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5618 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5619 assemble_aligned_integer (4, const0_rtx);
5620 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5621 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
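/* Initialise the trampoline M_TRAMP so that it transfers control to FNDECL
   with the static chain set to CHAIN_VALUE. */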
5624 static void
5625 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5627 rtx fnaddr, mem, a_tramp;
5628 const int tramp_code_sz = 16;
5630 /* Don't need to copy the trailing D-words, we fill those in below. */
5631 emit_block_move (m_tramp, assemble_trampoline_template (),
5632 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5633 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5634 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5635 if (GET_MODE (fnaddr) != ptr_mode)
5636 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5637 emit_move_insn (mem, fnaddr);
5639 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5640 emit_move_insn (mem, chain_value);
5642 /* XXX We should really define a "clear_cache" pattern and use
5643 gen_clear_cache(). */
5644 a_tramp = XEXP (m_tramp, 0);
5645 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5646 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5647 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5648 ptr_mode);
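/* Return the maximum number of hard registers of class REGCLASS needed to
   hold a value of mode MODE. */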
5651 static unsigned char
5652 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5654 switch (regclass)
5656 case CALLER_SAVE_REGS:
5657 case POINTER_REGS:
5658 case GENERAL_REGS:
5659 case ALL_REGS:
5660 case FP_REGS:
5661 case FP_LO_REGS:
5662 return
5663 aarch64_vector_mode_p (mode)
5664 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5665 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5666 case STACK_REG:
5667 return 1;
5669 case NO_REGS:
5670 return 0;
5672 default:
5673 break;
5675 gcc_unreachable ();
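/* Return the preferred class for reloading X when the requested class is
   REGCLASS; NO_REGS means REGCLASS cannot be used. */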
5678 static reg_class_t
5679 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5681 if (regclass == POINTER_REGS)
5682 return GENERAL_REGS;
5684 if (regclass == STACK_REG)
5686 if (REG_P(x)
5687 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5688 return regclass;
5690 return NO_REGS;
5693 /* If it's an integer immediate that MOVI can't handle, then
5694 FP_REGS is not an option, so we return NO_REGS instead. */
5695 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5696 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5697 return NO_REGS;
5699 /* Register elimination can result in a request for
5700 SP+constant->FP_REGS. We cannot support such operations, which
5701 use SP as source and an FP_REG as destination, so reject them
5702 outright. */
5703 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5705 rtx lhs = XEXP (x, 0);
5707 /* Look through a possible SUBREG introduced by ILP32. */
5708 if (GET_CODE (lhs) == SUBREG)
5709 lhs = SUBREG_REG (lhs);
5711 gcc_assert (REG_P (lhs));
5712 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5713 POINTER_REGS));
5714 return NO_REGS;
5717 return regclass;
5720 void
5721 aarch64_asm_output_labelref (FILE* f, const char *name)
5723 asm_fprintf (f, "%U%s", name);
5726 static void
5727 aarch64_elf_asm_constructor (rtx symbol, int priority)
5729 if (priority == DEFAULT_INIT_PRIORITY)
5730 default_ctor_section_asm_out_constructor (symbol, priority);
5731 else
5733 section *s;
5734 char buf[18];
5735 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5736 s = get_section (buf, SECTION_WRITE, NULL);
5737 switch_to_section (s);
5738 assemble_align (POINTER_SIZE);
5739 assemble_aligned_integer (POINTER_BYTES, symbol);
5743 static void
5744 aarch64_elf_asm_destructor (rtx symbol, int priority)
5746 if (priority == DEFAULT_INIT_PRIORITY)
5747 default_dtor_section_asm_out_destructor (symbol, priority);
5748 else
5750 section *s;
5751 char buf[18];
5752 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5753 s = get_section (buf, SECTION_WRITE, NULL);
5754 switch_to_section (s);
5755 assemble_align (POINTER_SIZE);
5756 assemble_aligned_integer (POINTER_BYTES, symbol);
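/* Output the assembly for the casesi dispatch sequence described by
   OPERANDS; the returned template is empty because the code is emitted
   directly. */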
5760 const char*
5761 aarch64_output_casesi (rtx *operands)
5763 char buf[100];
5764 char label[100];
5765 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5766 int index;
5767 static const char *const patterns[4][2] =
5770 "ldrb\t%w3, [%0,%w1,uxtw]",
5771 "add\t%3, %4, %w3, sxtb #2"
5774 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5775 "add\t%3, %4, %w3, sxth #2"
5778 "ldr\t%w3, [%0,%w1,uxtw #2]",
5779 "add\t%3, %4, %w3, sxtw #2"
5781 /* We assume that DImode is only generated when not optimizing and
5782 that we don't really need 64-bit address offsets. That would
5783 imply an object file with 8GB of code in a single function! */
5785 "ldr\t%w3, [%0,%w1,uxtw #2]",
5786 "add\t%3, %4, %w3, sxtw #2"
5790 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5792 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5794 gcc_assert (index >= 0 && index <= 3);
5796 /* Need to implement table size reduction, by changing the code below. */
5797 output_asm_insn (patterns[index][0], operands);
5798 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5799 snprintf (buf, sizeof (buf),
5800 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5801 output_asm_insn (buf, operands);
5802 output_asm_insn (patterns[index][1], operands);
5803 output_asm_insn ("br\t%3", operands);
5804 assemble_label (asm_out_file, label);
5805 return "";
5809 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5810 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5811 operator. */
5814 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5816 if (shift >= 0 && shift <= 3)
5818 int size;
5819 for (size = 8; size <= 32; size *= 2)
5821 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5822 if (mask == bits << shift)
5823 return size;
5826 return 0;
5829 /* Constant pools are per-function only when PC-relative
5830 literal loads are enabled or we are in the large memory
5831 model. */
5833 static inline bool
5834 aarch64_can_use_per_function_literal_pools_p (void)
5836 return (aarch64_pcrelative_literal_loads
5837 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5840 static bool
5841 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5843 /* FIXME: In an ideal world this would work similarly
5844 to the logic in aarch64_select_rtx_section, but this
5845 breaks bootstrap in gccgo. For now we work around
5846 this by returning false here. */
5847 return false;
5850 /* Select appropriate section for constants depending
5851 on where we place literal pools. */
5853 static section *
5854 aarch64_select_rtx_section (machine_mode mode,
5855 rtx x,
5856 unsigned HOST_WIDE_INT align)
5858 if (aarch64_can_use_per_function_literal_pools_p ())
5859 return function_section (current_function_decl);
5861 return default_elf_select_rtx_section (mode, x, align);
5864 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5865 void
5866 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5867 HOST_WIDE_INT offset)
5869 /* When using per-function literal pools, we must ensure that any code
5870 section is aligned to the minimal instruction length, lest we get
5871 errors from the assembler about "unaligned instructions". */
5872 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5873 ASM_OUTPUT_ALIGN (f, 2);
5876 /* Costs. */
5878 /* Helper function for rtx cost calculation. Strip a shift expression
5879 from X. Returns the inner operand if successful, or the original
5880 expression on failure. */
5881 static rtx
5882 aarch64_strip_shift (rtx x)
5884 rtx op = x;
5886 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5887 we can convert both to ROR during final output. */
5888 if ((GET_CODE (op) == ASHIFT
5889 || GET_CODE (op) == ASHIFTRT
5890 || GET_CODE (op) == LSHIFTRT
5891 || GET_CODE (op) == ROTATERT
5892 || GET_CODE (op) == ROTATE)
5893 && CONST_INT_P (XEXP (op, 1)))
5894 return XEXP (op, 0);
5896 if (GET_CODE (op) == MULT
5897 && CONST_INT_P (XEXP (op, 1))
5898 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5899 return XEXP (op, 0);
5901 return x;
5904 /* Helper function for rtx cost calculation. Strip an extend
5905 expression from X. Returns the inner operand if successful, or the
5906 original expression on failure. We deal with a number of possible
5907 canonicalization variations here. */
5908 static rtx
5909 aarch64_strip_extend (rtx x)
5911 rtx op = x;
5913 /* Zero and sign extraction of a widened value. */
5914 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5915 && XEXP (op, 2) == const0_rtx
5916 && GET_CODE (XEXP (op, 0)) == MULT
5917 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5918 XEXP (op, 1)))
5919 return XEXP (XEXP (op, 0), 0);
5921 /* It can also be represented (for zero-extend) as an AND with an
5922 immediate. */
5923 if (GET_CODE (op) == AND
5924 && GET_CODE (XEXP (op, 0)) == MULT
5925 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5926 && CONST_INT_P (XEXP (op, 1))
5927 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5928 INTVAL (XEXP (op, 1))) != 0)
5929 return XEXP (XEXP (op, 0), 0);
5931 /* Now handle extended register, as this may also have an optional
5932 left shift by 1..4. */
5933 if (GET_CODE (op) == ASHIFT
5934 && CONST_INT_P (XEXP (op, 1))
5935 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5936 op = XEXP (op, 0);
5938 if (GET_CODE (op) == ZERO_EXTEND
5939 || GET_CODE (op) == SIGN_EXTEND)
5940 op = XEXP (op, 0);
5942 if (op != x)
5943 return op;
5945 return x;
5948 /* Return true iff CODE is a shift supported in combination
5949 with arithmetic instructions. */
5951 static bool
5952 aarch64_shift_p (enum rtx_code code)
5954 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5957 /* Helper function for rtx cost calculation. Calculate the cost of
5958 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5959 Return the calculated cost of the expression, recursing manually in to
5960 operands where needed. */
5962 static int
5963 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5965 rtx op0, op1;
5966 const struct cpu_cost_table *extra_cost
5967 = aarch64_tune_params.insn_extra_cost;
5968 int cost = 0;
5969 bool compound_p = (outer == PLUS || outer == MINUS);
5970 machine_mode mode = GET_MODE (x);
5972 gcc_checking_assert (code == MULT);
5974 op0 = XEXP (x, 0);
5975 op1 = XEXP (x, 1);
5977 if (VECTOR_MODE_P (mode))
5978 mode = GET_MODE_INNER (mode);
5980 /* Integer multiply/fma. */
5981 if (GET_MODE_CLASS (mode) == MODE_INT)
5983 /* The multiply will be canonicalized as a shift, so cost it as such. */
5984 if (aarch64_shift_p (GET_CODE (x))
5985 || (CONST_INT_P (op1)
5986 && exact_log2 (INTVAL (op1)) > 0))
5988 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5989 || GET_CODE (op0) == SIGN_EXTEND;
5990 if (speed)
5992 if (compound_p)
5994 if (REG_P (op1))
5995 /* ARITH + shift-by-register. */
5996 cost += extra_cost->alu.arith_shift_reg;
5997 else if (is_extend)
5998 /* ARITH + extended register. We don't have a cost field
5999 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6000 cost += extra_cost->alu.extend_arith;
6001 else
6002 /* ARITH + shift-by-immediate. */
6003 cost += extra_cost->alu.arith_shift;
6005 else
6006 /* LSL (immediate). */
6007 cost += extra_cost->alu.shift;
6010 /* Strip extends as we will have costed them in the case above. */
6011 if (is_extend)
6012 op0 = aarch64_strip_extend (op0);
6014 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6016 return cost;
6019 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6020 compound and let the below cases handle it. After all, MNEG is a
6021 special-case alias of MSUB. */
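/* For example, MNEG Xd, Xn, Xm is the architectural alias of
   MSUB Xd, Xn, Xm, XZR, so stripping the NEG and treating the
   expression as compound costs it like the MSUB it will become.  */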
6022 if (GET_CODE (op0) == NEG)
6024 op0 = XEXP (op0, 0);
6025 compound_p = true;
6028 /* Integer multiplies or FMAs have zero/sign extending variants. */
6029 if ((GET_CODE (op0) == ZERO_EXTEND
6030 && GET_CODE (op1) == ZERO_EXTEND)
6031 || (GET_CODE (op0) == SIGN_EXTEND
6032 && GET_CODE (op1) == SIGN_EXTEND))
6034 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6035 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6037 if (speed)
6039 if (compound_p)
6040 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6041 cost += extra_cost->mult[0].extend_add;
6042 else
6043 /* MUL/SMULL/UMULL. */
6044 cost += extra_cost->mult[0].extend;
6047 return cost;
6050 /* This is either an integer multiply or a MADD. In both cases
6051 we want to recurse and cost the operands. */
6052 cost += rtx_cost (op0, mode, MULT, 0, speed);
6053 cost += rtx_cost (op1, mode, MULT, 1, speed);
6055 if (speed)
6057 if (compound_p)
6058 /* MADD/MSUB. */
6059 cost += extra_cost->mult[mode == DImode].add;
6060 else
6061 /* MUL. */
6062 cost += extra_cost->mult[mode == DImode].simple;
6065 return cost;
6067 else
6069 if (speed)
6071 /* Floating-point FMA/FMUL can also support negations of the
6072 operands, unless the rounding mode is upward or downward in
6073 which case FNMUL is different from FMUL with operand negation. */
6074 bool neg0 = GET_CODE (op0) == NEG;
6075 bool neg1 = GET_CODE (op1) == NEG;
6076 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6078 if (neg0)
6079 op0 = XEXP (op0, 0);
6080 if (neg1)
6081 op1 = XEXP (op1, 0);
6084 if (compound_p)
6085 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6086 cost += extra_cost->fp[mode == DFmode].fma;
6087 else
6088 /* FMUL/FNMUL. */
6089 cost += extra_cost->fp[mode == DFmode].mult;
6092 cost += rtx_cost (op0, mode, MULT, 0, speed);
6093 cost += rtx_cost (op1, mode, MULT, 1, speed);
6094 return cost;
6098 static int
6099 aarch64_address_cost (rtx x,
6100 machine_mode mode,
6101 addr_space_t as ATTRIBUTE_UNUSED,
6102 bool speed)
6104 enum rtx_code c = GET_CODE (x);
6105 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6106 struct aarch64_address_info info;
6107 int cost = 0;
6108 info.shift = 0;
6110 if (!aarch64_classify_address (&info, x, mode, c, false))
6112 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6114 /* This is a CONST or SYMBOL ref which will be split
6115 in a different way depending on the code model in use.
6116 Cost it through the generic infrastructure. */
6117 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6118 /* Divide through by the cost of one instruction to
6119 bring it to the same units as the address costs. */
6120 cost_symbol_ref /= COSTS_N_INSNS (1);
6121 /* The cost is then the cost of preparing the address,
6122 followed by an immediate (possibly 0) offset. */
6123 return cost_symbol_ref + addr_cost->imm_offset;
6125 else
6127 /* This is most likely a jump table from a case
6128 statement. */
6129 return addr_cost->register_offset;
6133 switch (info.type)
6135 case ADDRESS_LO_SUM:
6136 case ADDRESS_SYMBOLIC:
6137 case ADDRESS_REG_IMM:
6138 cost += addr_cost->imm_offset;
6139 break;
6141 case ADDRESS_REG_WB:
6142 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6143 cost += addr_cost->pre_modify;
6144 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6145 cost += addr_cost->post_modify;
6146 else
6147 gcc_unreachable ();
6149 break;
6151 case ADDRESS_REG_REG:
6152 cost += addr_cost->register_offset;
6153 break;
6155 case ADDRESS_REG_SXTW:
6156 cost += addr_cost->register_sextend;
6157 break;
6159 case ADDRESS_REG_UXTW:
6160 cost += addr_cost->register_zextend;
6161 break;
6163 default:
6164 gcc_unreachable ();
6168 if (info.shift > 0)
6170 /* For the sake of calculating the cost of the shifted register
6171 component, we can treat same-sized modes in the same way. */
6172 switch (GET_MODE_BITSIZE (mode))
6174 case 16:
6175 cost += addr_cost->addr_scale_costs.hi;
6176 break;
6178 case 32:
6179 cost += addr_cost->addr_scale_costs.si;
6180 break;
6182 case 64:
6183 cost += addr_cost->addr_scale_costs.di;
6184 break;
6186 /* We can't tell, or this is a 128-bit vector. */
6187 default:
6188 cost += addr_cost->addr_scale_costs.ti;
6189 break;
6193 return cost;
6196 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6197 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6198 to be taken. */
6201 aarch64_branch_cost (bool speed_p, bool predictable_p)
6203 /* When optimizing for speed, use the cost of unpredictable branches. */
6204 const struct cpu_branch_cost *branch_costs =
6205 aarch64_tune_params.branch_costs;
6207 if (!speed_p || predictable_p)
6208 return branch_costs->predictable;
6209 else
6210 return branch_costs->unpredictable;
6213 /* Return true if the RTX X in mode MODE is a zero or sign extract
6214 usable in an ADD or SUB (extended register) instruction. */
6215 static bool
6216 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6218 /* Catch add with a sign extract.
6219 This is add_<optab><mode>_multp2. */
6220 if (GET_CODE (x) == SIGN_EXTRACT
6221 || GET_CODE (x) == ZERO_EXTRACT)
6223 rtx op0 = XEXP (x, 0);
6224 rtx op1 = XEXP (x, 1);
6225 rtx op2 = XEXP (x, 2);
6227 if (GET_CODE (op0) == MULT
6228 && CONST_INT_P (op1)
6229 && op2 == const0_rtx
6230 && CONST_INT_P (XEXP (op0, 1))
6231 && aarch64_is_extend_from_extract (mode,
6232 XEXP (op0, 1),
6233 op1))
6235 return true;
6238 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6239 No shift. */
6240 else if (GET_CODE (x) == SIGN_EXTEND
6241 || GET_CODE (x) == ZERO_EXTEND)
6242 return REG_P (XEXP (x, 0));
6244 return false;
6247 static bool
6248 aarch64_frint_unspec_p (unsigned int u)
6250 switch (u)
6252 case UNSPEC_FRINTZ:
6253 case UNSPEC_FRINTP:
6254 case UNSPEC_FRINTM:
6255 case UNSPEC_FRINTA:
6256 case UNSPEC_FRINTN:
6257 case UNSPEC_FRINTX:
6258 case UNSPEC_FRINTI:
6259 return true;
6261 default:
6262 return false;
6266 /* Return true iff X is an rtx that will match an extr instruction
6267 i.e. as described in the *extr<mode>5_insn family of patterns.
6268 OP0 and OP1 will be set to the operands of the shifts involved
6269 on success and will be NULL_RTX otherwise. */
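/* Illustrative example: in DImode,
   (ior (ashift (reg x) (const_int 10)) (lshiftrt (reg y) (const_int 54)))
   matches because 10 + 54 == 64, and corresponds roughly to a single
   "extr xd, x, y, #54" as emitted by the *extr<mode>5_insn pattern.  */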
6271 static bool
6272 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6274 rtx op0, op1;
6275 machine_mode mode = GET_MODE (x);
6277 *res_op0 = NULL_RTX;
6278 *res_op1 = NULL_RTX;
6280 if (GET_CODE (x) != IOR)
6281 return false;
6283 op0 = XEXP (x, 0);
6284 op1 = XEXP (x, 1);
6286 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6287 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6289 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6290 if (GET_CODE (op1) == ASHIFT)
6291 std::swap (op0, op1);
6293 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6294 return false;
6296 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6297 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6299 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6300 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6302 *res_op0 = XEXP (op0, 0);
6303 *res_op1 = XEXP (op1, 0);
6304 return true;
6308 return false;
6311 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6312 storing it in *COST. Result is true if the total cost of the operation
6313 has now been calculated. */
6314 static bool
6315 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6317 rtx inner;
6318 rtx comparator;
6319 enum rtx_code cmpcode;
6321 if (COMPARISON_P (op0))
6323 inner = XEXP (op0, 0);
6324 comparator = XEXP (op0, 1);
6325 cmpcode = GET_CODE (op0);
6327 else
6329 inner = op0;
6330 comparator = const0_rtx;
6331 cmpcode = NE;
6334 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6336 /* Conditional branch. */
6337 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6338 return true;
6339 else
6341 if (cmpcode == NE || cmpcode == EQ)
6343 if (comparator == const0_rtx)
6345 /* TBZ/TBNZ/CBZ/CBNZ. */
6346 if (GET_CODE (inner) == ZERO_EXTRACT)
6347 /* TBZ/TBNZ. */
6348 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6349 ZERO_EXTRACT, 0, speed);
6350 else
6351 /* CBZ/CBNZ. */
6352 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6354 return true;
6357 else if (cmpcode == LT || cmpcode == GE)
6359 /* TBZ/TBNZ. */
6360 if (comparator == const0_rtx)
6361 return true;
6365 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6367 /* CCMP. */
6368 if (GET_CODE (op1) == COMPARE)
6370 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6371 if (XEXP (op1, 1) == const0_rtx)
6372 *cost += 1;
6373 if (speed)
6375 machine_mode mode = GET_MODE (XEXP (op1, 0));
6376 const struct cpu_cost_table *extra_cost
6377 = aarch64_tune_params.insn_extra_cost;
6379 if (GET_MODE_CLASS (mode) == MODE_INT)
6380 *cost += extra_cost->alu.arith;
6381 else
6382 *cost += extra_cost->fp[mode == DFmode].compare;
6384 return true;
6387 /* It's a conditional operation based on the status flags,
6388 so it must be some flavor of CSEL. */
6390 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6391 if (GET_CODE (op1) == NEG
6392 || GET_CODE (op1) == NOT
6393 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6394 op1 = XEXP (op1, 0);
6395 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6397 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6398 op1 = XEXP (op1, 0);
6399 op2 = XEXP (op2, 0);
6402 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6403 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6404 return true;
6407 /* We don't know what this is, so cost all operands. */
6408 return false;
6411 /* Check whether X is a bitfield operation of the form shift + extend that
6412 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6413 operand to which the bitfield operation is applied. Otherwise return
6414 NULL_RTX. */
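/* Rough examples, for illustration:
   (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3))) corresponds to a
   UBFX, while (sign_extend:SI (ashift:QI (reg:QI x) (const_int 2)))
   corresponds to an SBFIZ.  */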
6416 static rtx
6417 aarch64_extend_bitfield_pattern_p (rtx x)
6419 rtx_code outer_code = GET_CODE (x);
6420 machine_mode outer_mode = GET_MODE (x);
6422 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6423 && outer_mode != SImode && outer_mode != DImode)
6424 return NULL_RTX;
6426 rtx inner = XEXP (x, 0);
6427 rtx_code inner_code = GET_CODE (inner);
6428 machine_mode inner_mode = GET_MODE (inner);
6429 rtx op = NULL_RTX;
6431 switch (inner_code)
6433 case ASHIFT:
6434 if (CONST_INT_P (XEXP (inner, 1))
6435 && (inner_mode == QImode || inner_mode == HImode))
6436 op = XEXP (inner, 0);
6437 break;
6438 case LSHIFTRT:
6439 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6440 && (inner_mode == QImode || inner_mode == HImode))
6441 op = XEXP (inner, 0);
6442 break;
6443 case ASHIFTRT:
6444 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6445 && (inner_mode == QImode || inner_mode == HImode))
6446 op = XEXP (inner, 0);
6447 break;
6448 default:
6449 break;
6452 return op;
6455 /* Return true if the mask and a shift amount from an RTX of the form
6456 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6457 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
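/* Worked example, for illustration: in SImode, MASK == 0xf0 and
   SHFT_AMNT == 4 are accepted, since 0xf0 >> 4 == 0xf, 0xf + 1 is a power
   of two, and the low four bits of the mask are clear; (x << 4) & 0xf0 can
   therefore be implemented as a single UBFIZ.  */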
6459 bool
6460 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6462 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6463 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6464 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6465 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6468 /* Calculate the cost of calculating X, storing it in *COST. Result
6469 is true if the total cost of the operation has now been calculated. */
6470 static bool
6471 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6472 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6474 rtx op0, op1, op2;
6475 const struct cpu_cost_table *extra_cost
6476 = aarch64_tune_params.insn_extra_cost;
6477 int code = GET_CODE (x);
6479 /* By default, assume that everything has equivalent cost to the
6480 cheapest instruction. Any additional costs are applied as a delta
6481 above this default. */
6482 *cost = COSTS_N_INSNS (1);
6484 switch (code)
6486 case SET:
6487 /* The cost depends entirely on the operands to SET. */
6488 *cost = 0;
6489 op0 = SET_DEST (x);
6490 op1 = SET_SRC (x);
6492 switch (GET_CODE (op0))
6494 case MEM:
6495 if (speed)
6497 rtx address = XEXP (op0, 0);
6498 if (VECTOR_MODE_P (mode))
6499 *cost += extra_cost->ldst.storev;
6500 else if (GET_MODE_CLASS (mode) == MODE_INT)
6501 *cost += extra_cost->ldst.store;
6502 else if (mode == SFmode)
6503 *cost += extra_cost->ldst.storef;
6504 else if (mode == DFmode)
6505 *cost += extra_cost->ldst.stored;
6507 *cost +=
6508 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6509 0, speed));
6512 *cost += rtx_cost (op1, mode, SET, 1, speed);
6513 return true;
6515 case SUBREG:
6516 if (! REG_P (SUBREG_REG (op0)))
6517 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6519 /* Fall through. */
6520 case REG:
6521 /* The cost is one per vector-register copied. */
6522 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6524 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6525 / GET_MODE_SIZE (V4SImode);
6526 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6528 /* const0_rtx is in general free, but we will use an
6529 instruction to set a register to 0. */
6530 else if (REG_P (op1) || op1 == const0_rtx)
6532 /* The cost is 1 per register copied. */
6533 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6534 / UNITS_PER_WORD;
6535 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6537 else
6538 /* Cost is just the cost of the RHS of the set. */
6539 *cost += rtx_cost (op1, mode, SET, 1, speed);
6540 return true;
6542 case ZERO_EXTRACT:
6543 case SIGN_EXTRACT:
6544 /* Bit-field insertion. Strip any redundant widening of
6545 the RHS to meet the width of the target. */
6546 if (GET_CODE (op1) == SUBREG)
6547 op1 = SUBREG_REG (op1);
6548 if ((GET_CODE (op1) == ZERO_EXTEND
6549 || GET_CODE (op1) == SIGN_EXTEND)
6550 && CONST_INT_P (XEXP (op0, 1))
6551 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6552 >= INTVAL (XEXP (op0, 1))))
6553 op1 = XEXP (op1, 0);
6555 if (CONST_INT_P (op1))
6557 /* MOV immediate is assumed to always be cheap. */
6558 *cost = COSTS_N_INSNS (1);
6560 else
6562 /* BFM. */
6563 if (speed)
6564 *cost += extra_cost->alu.bfi;
6565 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6568 return true;
6570 default:
6571 /* We can't make sense of this; assume default cost. */
6572 *cost = COSTS_N_INSNS (1);
6573 return false;
6575 return false;
6577 case CONST_INT:
6578 /* If an instruction can incorporate a constant within the
6579 instruction, the instruction's expression avoids calling
6580 rtx_cost() on the constant. If rtx_cost() is called on a
6581 constant, then it is usually because the constant must be
6582 moved into a register by one or more instructions.
6584 The exception is constant 0, which can be expressed
6585 as XZR/WZR and is therefore free. The exception to this is
6586 if we have (set (reg) (const0_rtx)) in which case we must cost
6587 the move. However, we can catch that when we cost the SET, so
6588 we don't need to consider that here. */
6589 if (x == const0_rtx)
6590 *cost = 0;
6591 else
6593 /* To an approximation, building any other constant is
6594 proportionally expensive to the number of instructions
6595 required to build that constant. This is true whether we
6596 are compiling for SPEED or otherwise. */
6597 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6598 (NULL_RTX, x, false, mode));
6600 return true;
6602 case CONST_DOUBLE:
6603 if (speed)
6605 /* mov[df,sf]_aarch64. */
6606 if (aarch64_float_const_representable_p (x))
6607 /* FMOV (scalar immediate). */
6608 *cost += extra_cost->fp[mode == DFmode].fpconst;
6609 else if (!aarch64_float_const_zero_rtx_p (x))
6611 /* This will be a load from memory. */
6612 if (mode == DFmode)
6613 *cost += extra_cost->ldst.loadd;
6614 else
6615 *cost += extra_cost->ldst.loadf;
6617 else
6618 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6619 or MOV v0.s[0], wzr - neither of which is modeled by the
6620 cost tables. Just use the default cost. */
6625 return true;
6627 case MEM:
6628 if (speed)
6630 /* For loads we want the base cost of a load, plus an
6631 approximation for the additional cost of the addressing
6632 mode. */
6633 rtx address = XEXP (x, 0);
6634 if (VECTOR_MODE_P (mode))
6635 *cost += extra_cost->ldst.loadv;
6636 else if (GET_MODE_CLASS (mode) == MODE_INT)
6637 *cost += extra_cost->ldst.load;
6638 else if (mode == SFmode)
6639 *cost += extra_cost->ldst.loadf;
6640 else if (mode == DFmode)
6641 *cost += extra_cost->ldst.loadd;
6643 *cost +=
6644 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6645 0, speed));
6648 return true;
6650 case NEG:
6651 op0 = XEXP (x, 0);
6653 if (VECTOR_MODE_P (mode))
6655 if (speed)
6657 /* FNEG. */
6658 *cost += extra_cost->vect.alu;
6660 return false;
6663 if (GET_MODE_CLASS (mode) == MODE_INT)
6665 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6666 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6668 /* CSETM. */
6669 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6670 return true;
6673 /* Cost this as SUB wzr, X. */
6674 op0 = CONST0_RTX (mode);
6675 op1 = XEXP (x, 0);
6676 goto cost_minus;
6679 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6681 /* Support (neg(fma...)) as a single instruction only if
6682 sign of zeros is unimportant. This matches the decision
6683 making in aarch64.md. */
6684 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6686 /* FNMADD. */
6687 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6688 return true;
6690 if (GET_CODE (op0) == MULT)
6692 /* FNMUL. */
6693 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6694 return true;
6696 if (speed)
6697 /* FNEG. */
6698 *cost += extra_cost->fp[mode == DFmode].neg;
6699 return false;
6702 return false;
6704 case CLRSB:
6705 case CLZ:
6706 if (speed)
6708 if (VECTOR_MODE_P (mode))
6709 *cost += extra_cost->vect.alu;
6710 else
6711 *cost += extra_cost->alu.clz;
6714 return false;
6716 case COMPARE:
6717 op0 = XEXP (x, 0);
6718 op1 = XEXP (x, 1);
6720 if (op1 == const0_rtx
6721 && GET_CODE (op0) == AND)
6723 x = op0;
6724 mode = GET_MODE (op0);
6725 goto cost_logic;
6728 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6730 /* TODO: A write to the CC flags possibly costs extra; this
6731 needs encoding in the cost tables. */
6733 mode = GET_MODE (op0);
6734 /* ANDS. */
6735 if (GET_CODE (op0) == AND)
6737 x = op0;
6738 goto cost_logic;
6741 if (GET_CODE (op0) == PLUS)
6743 /* ADDS (and CMN alias). */
6744 x = op0;
6745 goto cost_plus;
6748 if (GET_CODE (op0) == MINUS)
6750 /* SUBS. */
6751 x = op0;
6752 goto cost_minus;
6755 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6756 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6757 && CONST_INT_P (XEXP (op0, 2)))
6759 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6760 Handle it here directly rather than going to cost_logic
6761 since we know the immediate generated for the TST is valid
6762 so we can avoid creating an intermediate rtx for it only
6763 for costing purposes. */
6764 if (speed)
6765 *cost += extra_cost->alu.logical;
6767 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6768 ZERO_EXTRACT, 0, speed);
6769 return true;
6772 if (GET_CODE (op1) == NEG)
6774 /* CMN. */
6775 if (speed)
6776 *cost += extra_cost->alu.arith;
6778 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6779 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6780 return true;
6783 /* CMP.
6785 Compare can freely swap the order of operands, and
6786 canonicalization puts the more complex operation first.
6787 But the integer MINUS logic expects the shift/extend
6788 operation in op1. */
6789 if (! (REG_P (op0)
6790 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6792 op0 = XEXP (x, 1);
6793 op1 = XEXP (x, 0);
6795 goto cost_minus;
6798 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6800 /* FCMP. */
6801 if (speed)
6802 *cost += extra_cost->fp[mode == DFmode].compare;
6804 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6806 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6807 /* FCMP supports constant 0.0 for no extra cost. */
6808 return true;
6810 return false;
6813 if (VECTOR_MODE_P (mode))
6815 /* Vector compare. */
6816 if (speed)
6817 *cost += extra_cost->vect.alu;
6819 if (aarch64_float_const_zero_rtx_p (op1))
6821 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6822 cost. */
6823 return true;
6825 return false;
6827 return false;
6829 case MINUS:
6831 op0 = XEXP (x, 0);
6832 op1 = XEXP (x, 1);
6834 cost_minus:
6835 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6837 /* Detect valid immediates. */
6838 if ((GET_MODE_CLASS (mode) == MODE_INT
6839 || (GET_MODE_CLASS (mode) == MODE_CC
6840 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6841 && CONST_INT_P (op1)
6842 && aarch64_uimm12_shift (INTVAL (op1)))
6844 if (speed)
6845 /* SUB(S) (immediate). */
6846 *cost += extra_cost->alu.arith;
6847 return true;
6850 /* Look for SUB (extended register). */
6851 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6853 if (speed)
6854 *cost += extra_cost->alu.extend_arith;
6856 op1 = aarch64_strip_extend (op1);
6857 *cost += rtx_cost (op1, VOIDmode,
6858 (enum rtx_code) GET_CODE (op1), 0, speed);
6859 return true;
6862 rtx new_op1 = aarch64_strip_extend (op1);
6864 /* Cost this as an FMA-alike operation. */
6865 if ((GET_CODE (new_op1) == MULT
6866 || aarch64_shift_p (GET_CODE (new_op1)))
6867 && code != COMPARE)
6869 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6870 (enum rtx_code) code,
6871 speed);
6872 return true;
6875 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6877 if (speed)
6879 if (VECTOR_MODE_P (mode))
6881 /* Vector SUB. */
6882 *cost += extra_cost->vect.alu;
6884 else if (GET_MODE_CLASS (mode) == MODE_INT)
6886 /* SUB(S). */
6887 *cost += extra_cost->alu.arith;
6889 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6891 /* FSUB. */
6892 *cost += extra_cost->fp[mode == DFmode].addsub;
6895 return true;
6898 case PLUS:
6900 rtx new_op0;
6902 op0 = XEXP (x, 0);
6903 op1 = XEXP (x, 1);
6905 cost_plus:
6906 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6907 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6909 /* CSINC. */
6910 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6911 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6912 return true;
6915 if (GET_MODE_CLASS (mode) == MODE_INT
6916 && CONST_INT_P (op1)
6917 && aarch64_uimm12_shift (INTVAL (op1)))
6919 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6921 if (speed)
6922 /* ADD (immediate). */
6923 *cost += extra_cost->alu.arith;
6924 return true;
6927 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6929 /* Look for ADD (extended register). */
6930 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6932 if (speed)
6933 *cost += extra_cost->alu.extend_arith;
6935 op0 = aarch64_strip_extend (op0);
6936 *cost += rtx_cost (op0, VOIDmode,
6937 (enum rtx_code) GET_CODE (op0), 0, speed);
6938 return true;
6941 /* Strip any extend, leave shifts behind as we will
6942 cost them through mult_cost. */
6943 new_op0 = aarch64_strip_extend (op0);
6945 if (GET_CODE (new_op0) == MULT
6946 || aarch64_shift_p (GET_CODE (new_op0)))
6948 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6949 speed);
6950 return true;
6953 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6955 if (speed)
6957 if (VECTOR_MODE_P (mode))
6959 /* Vector ADD. */
6960 *cost += extra_cost->vect.alu;
6962 else if (GET_MODE_CLASS (mode) == MODE_INT)
6964 /* ADD. */
6965 *cost += extra_cost->alu.arith;
6967 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6969 /* FADD. */
6970 *cost += extra_cost->fp[mode == DFmode].addsub;
6973 return true;
6976 case BSWAP:
6977 *cost = COSTS_N_INSNS (1);
6979 if (speed)
6981 if (VECTOR_MODE_P (mode))
6982 *cost += extra_cost->vect.alu;
6983 else
6984 *cost += extra_cost->alu.rev;
6986 return false;
6988 case IOR:
6989 if (aarch_rev16_p (x))
6991 *cost = COSTS_N_INSNS (1);
6993 if (speed)
6995 if (VECTOR_MODE_P (mode))
6996 *cost += extra_cost->vect.alu;
6997 else
6998 *cost += extra_cost->alu.rev;
7000 return true;
7003 if (aarch64_extr_rtx_p (x, &op0, &op1))
7005 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7006 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7007 if (speed)
7008 *cost += extra_cost->alu.shift;
7010 return true;
7012 /* Fall through. */
7013 case XOR:
7014 case AND:
7015 cost_logic:
7016 op0 = XEXP (x, 0);
7017 op1 = XEXP (x, 1);
7019 if (VECTOR_MODE_P (mode))
7021 if (speed)
7022 *cost += extra_cost->vect.alu;
7023 return true;
7026 if (code == AND
7027 && GET_CODE (op0) == MULT
7028 && CONST_INT_P (XEXP (op0, 1))
7029 && CONST_INT_P (op1)
7030 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7031 INTVAL (op1)) != 0)
7033 /* This is a UBFM/SBFM. */
7034 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7035 if (speed)
7036 *cost += extra_cost->alu.bfx;
7037 return true;
7040 if (GET_MODE_CLASS (mode) == MODE_INT)
7042 if (CONST_INT_P (op1))
7044 /* We have a mask + shift version of a UBFIZ
7045 i.e. the *andim_ashift<mode>_bfiz pattern. */
7046 if (GET_CODE (op0) == ASHIFT
7047 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7048 XEXP (op0, 1)))
7050 *cost += rtx_cost (XEXP (op0, 0), mode,
7051 (enum rtx_code) code, 0, speed);
7052 if (speed)
7053 *cost += extra_cost->alu.bfx;
7055 return true;
7057 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7059 /* We possibly get the immediate for free, this is not
7060 modelled. */
7061 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7062 if (speed)
7063 *cost += extra_cost->alu.logical;
7065 return true;
7068 else
7070 rtx new_op0 = op0;
7072 /* Handle ORN, EON, or BIC. */
7073 if (GET_CODE (op0) == NOT)
7074 op0 = XEXP (op0, 0);
7076 new_op0 = aarch64_strip_shift (op0);
7078 /* If we had a shift on op0 then this is a logical-shift-
7079 by-register/immediate operation. Otherwise, this is just
7080 a logical operation. */
7081 if (speed)
7083 if (new_op0 != op0)
7085 /* Shift by immediate. */
7086 if (CONST_INT_P (XEXP (op0, 1)))
7087 *cost += extra_cost->alu.log_shift;
7088 else
7089 *cost += extra_cost->alu.log_shift_reg;
7091 else
7092 *cost += extra_cost->alu.logical;
7095 /* In both cases we want to cost both operands. */
7096 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7097 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7099 return true;
7102 return false;
7104 case NOT:
7105 x = XEXP (x, 0);
7106 op0 = aarch64_strip_shift (x);
7108 if (VECTOR_MODE_P (mode))
7110 /* Vector NOT. */
7111 *cost += extra_cost->vect.alu;
7112 return false;
7115 /* MVN-shifted-reg. */
7116 if (op0 != x)
7118 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7120 if (speed)
7121 *cost += extra_cost->alu.log_shift;
7123 return true;
7125 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7126 Handle the second form here taking care that 'a' in the above can
7127 be a shift. */
7128 else if (GET_CODE (op0) == XOR)
7130 rtx newop0 = XEXP (op0, 0);
7131 rtx newop1 = XEXP (op0, 1);
7132 rtx op0_stripped = aarch64_strip_shift (newop0);
7134 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7135 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7137 if (speed)
7139 if (op0_stripped != newop0)
7140 *cost += extra_cost->alu.log_shift;
7141 else
7142 *cost += extra_cost->alu.logical;
7145 return true;
7147 /* MVN. */
7148 if (speed)
7149 *cost += extra_cost->alu.logical;
7151 return false;
7153 case ZERO_EXTEND:
7155 op0 = XEXP (x, 0);
7156 /* If a value is written in SI mode, then zero extended to DI
7157 mode, the operation will in general be free as a write to
7158 a 'w' register implicitly zeroes the upper bits of an 'x'
7159 register. However, if this is
7161 (set (reg) (zero_extend (reg)))
7163 we must cost the explicit register move. */
7164 if (mode == DImode
7165 && GET_MODE (op0) == SImode
7166 && outer == SET)
7168 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7170 /* If OP_COST is non-zero, then the cost of the zero extend
7171 is effectively the cost of the inner operation. Otherwise
7172 we have a MOV instruction and we take the cost from the MOV
7173 itself. This is true independently of whether we are
7174 optimizing for space or time. */
7175 if (op_cost)
7176 *cost = op_cost;
7178 return true;
7180 else if (MEM_P (op0))
7182 /* All loads can zero extend to any size for free. */
7183 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7184 return true;
7187 op0 = aarch64_extend_bitfield_pattern_p (x);
7188 if (op0)
7190 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7191 if (speed)
7192 *cost += extra_cost->alu.bfx;
7193 return true;
7196 if (speed)
7198 if (VECTOR_MODE_P (mode))
7200 /* UMOV. */
7201 *cost += extra_cost->vect.alu;
7203 else
7205 /* We generate an AND instead of UXTB/UXTH. */
7206 *cost += extra_cost->alu.logical;
7209 return false;
7211 case SIGN_EXTEND:
7212 if (MEM_P (XEXP (x, 0)))
7214 /* LDRSH. */
7215 if (speed)
7217 rtx address = XEXP (XEXP (x, 0), 0);
7218 *cost += extra_cost->ldst.load_sign_extend;
7220 *cost +=
7221 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7222 0, speed));
7224 return true;
7227 op0 = aarch64_extend_bitfield_pattern_p (x);
7228 if (op0)
7230 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7231 if (speed)
7232 *cost += extra_cost->alu.bfx;
7233 return true;
7236 if (speed)
7238 if (VECTOR_MODE_P (mode))
7239 *cost += extra_cost->vect.alu;
7240 else
7241 *cost += extra_cost->alu.extend;
7243 return false;
7245 case ASHIFT:
7246 op0 = XEXP (x, 0);
7247 op1 = XEXP (x, 1);
7249 if (CONST_INT_P (op1))
7251 if (speed)
7253 if (VECTOR_MODE_P (mode))
7255 /* Vector shift (immediate). */
7256 *cost += extra_cost->vect.alu;
7258 else
7260 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7261 aliases. */
7262 *cost += extra_cost->alu.shift;
7266 /* We can incorporate zero/sign extend for free. */
7267 if (GET_CODE (op0) == ZERO_EXTEND
7268 || GET_CODE (op0) == SIGN_EXTEND)
7269 op0 = XEXP (op0, 0);
7271 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7272 return true;
7274 else
7276 if (speed)
7278 if (VECTOR_MODE_P (mode))
7280 /* Vector shift (register). */
7281 *cost += extra_cost->vect.alu;
7283 else
7285 /* LSLV. */
7286 *cost += extra_cost->alu.shift_reg;
7289 return false; /* All arguments need to be in registers. */
7292 case ROTATE:
7293 case ROTATERT:
7294 case LSHIFTRT:
7295 case ASHIFTRT:
7296 op0 = XEXP (x, 0);
7297 op1 = XEXP (x, 1);
7299 if (CONST_INT_P (op1))
7301 /* ASR (immediate) and friends. */
7302 if (speed)
7304 if (VECTOR_MODE_P (mode))
7305 *cost += extra_cost->vect.alu;
7306 else
7307 *cost += extra_cost->alu.shift;
7310 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7311 return true;
7313 else
7316 /* ASR (register) and friends. */
7317 if (speed)
7319 if (VECTOR_MODE_P (mode))
7320 *cost += extra_cost->vect.alu;
7321 else
7322 *cost += extra_cost->alu.shift_reg;
7324 return false; /* All arguments need to be in registers. */
7327 case SYMBOL_REF:
7329 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7330 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7332 /* LDR. */
7333 if (speed)
7334 *cost += extra_cost->ldst.load;
7336 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7337 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7339 /* ADRP, followed by ADD. */
7340 *cost += COSTS_N_INSNS (1);
7341 if (speed)
7342 *cost += 2 * extra_cost->alu.arith;
7344 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7345 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7347 /* ADR. */
7348 if (speed)
7349 *cost += extra_cost->alu.arith;
7352 if (flag_pic)
7354 /* One extra load instruction, after accessing the GOT. */
7355 *cost += COSTS_N_INSNS (1);
7356 if (speed)
7357 *cost += extra_cost->ldst.load;
7359 return true;
7361 case HIGH:
7362 case LO_SUM:
7363 /* ADRP/ADD (immediate). */
7364 if (speed)
7365 *cost += extra_cost->alu.arith;
7366 return true;
7368 case ZERO_EXTRACT:
7369 case SIGN_EXTRACT:
7370 /* UBFX/SBFX. */
7371 if (speed)
7373 if (VECTOR_MODE_P (mode))
7374 *cost += extra_cost->vect.alu;
7375 else
7376 *cost += extra_cost->alu.bfx;
7379 /* We can trust that the immediates used will be correct (there
7380 are no by-register forms), so we need only cost op0. */
7381 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7382 return true;
7384 case MULT:
7385 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7386 /* aarch64_rtx_mult_cost always handles recursion to its
7387 operands. */
7388 return true;
7390 case MOD:
7391 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7392 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7393 an unconditional negate. This case should only ever be reached through
7394 the set_smod_pow2_cheap check in expmed.c. */
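/* For illustration (register choices hypothetical), x % 8 in SImode
   expands along the lines of:
       negs  w1, w0
       and   w0, w0, 7
       and   w1, w1, 7
       csneg w0, w0, w1, mi
   hence the four-instruction baseline below.  */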
7395 if (CONST_INT_P (XEXP (x, 1))
7396 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7397 && (mode == SImode || mode == DImode))
7399 /* We expand to 4 instructions. Reset the baseline. */
7400 *cost = COSTS_N_INSNS (4);
7402 if (speed)
7403 *cost += 2 * extra_cost->alu.logical
7404 + 2 * extra_cost->alu.arith;
7406 return true;
7409 /* Fall-through. */
7410 case UMOD:
7411 if (speed)
7413 if (VECTOR_MODE_P (mode))
7414 *cost += extra_cost->vect.alu;
7415 else if (GET_MODE_CLASS (mode) == MODE_INT)
7416 *cost += (extra_cost->mult[mode == DImode].add
7417 + extra_cost->mult[mode == DImode].idiv);
7418 else if (mode == DFmode)
7419 *cost += (extra_cost->fp[1].mult
7420 + extra_cost->fp[1].div);
7421 else if (mode == SFmode)
7422 *cost += (extra_cost->fp[0].mult
7423 + extra_cost->fp[0].div);
7425 return false; /* All arguments need to be in registers. */
7427 case DIV:
7428 case UDIV:
7429 case SQRT:
7430 if (speed)
7432 if (VECTOR_MODE_P (mode))
7433 *cost += extra_cost->vect.alu;
7434 else if (GET_MODE_CLASS (mode) == MODE_INT)
7435 /* There is no integer SQRT, so only DIV and UDIV can get
7436 here. */
7437 *cost += extra_cost->mult[mode == DImode].idiv;
7438 else
7439 *cost += extra_cost->fp[mode == DFmode].div;
7441 return false; /* All arguments need to be in registers. */
7443 case IF_THEN_ELSE:
7444 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7445 XEXP (x, 2), cost, speed);
7447 case EQ:
7448 case NE:
7449 case GT:
7450 case GTU:
7451 case LT:
7452 case LTU:
7453 case GE:
7454 case GEU:
7455 case LE:
7456 case LEU:
7458 return false; /* All arguments must be in registers. */
7460 case FMA:
7461 op0 = XEXP (x, 0);
7462 op1 = XEXP (x, 1);
7463 op2 = XEXP (x, 2);
7465 if (speed)
7467 if (VECTOR_MODE_P (mode))
7468 *cost += extra_cost->vect.alu;
7469 else
7470 *cost += extra_cost->fp[mode == DFmode].fma;
7473 /* FMSUB, FNMADD, and FNMSUB are free. */
7474 if (GET_CODE (op0) == NEG)
7475 op0 = XEXP (op0, 0);
7477 if (GET_CODE (op2) == NEG)
7478 op2 = XEXP (op2, 0);
7480 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7481 and the by-element operand as operand 0. */
7482 if (GET_CODE (op1) == NEG)
7483 op1 = XEXP (op1, 0);
7485 /* Catch vector-by-element operations. The by-element operand can
7486 either be (vec_duplicate (vec_select (x))) or just
7487 (vec_select (x)), depending on whether we are multiplying by
7488 a vector or a scalar.
7490 Canonicalization is not very good in these cases: FMA4 will put the
7491 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7492 if (GET_CODE (op0) == VEC_DUPLICATE)
7493 op0 = XEXP (op0, 0);
7494 else if (GET_CODE (op1) == VEC_DUPLICATE)
7495 op1 = XEXP (op1, 0);
7497 if (GET_CODE (op0) == VEC_SELECT)
7498 op0 = XEXP (op0, 0);
7499 else if (GET_CODE (op1) == VEC_SELECT)
7500 op1 = XEXP (op1, 0);
7502 /* If the remaining parameters are not registers,
7503 get the cost to put them into registers. */
7504 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7505 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7506 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7507 return true;
7509 case FLOAT:
7510 case UNSIGNED_FLOAT:
7511 if (speed)
7512 *cost += extra_cost->fp[mode == DFmode].fromint;
7513 return false;
7515 case FLOAT_EXTEND:
7516 if (speed)
7518 if (VECTOR_MODE_P (mode))
7520 /* Vector widening conversion. */
7521 *cost += extra_cost->vect.alu;
7523 else
7524 *cost += extra_cost->fp[mode == DFmode].widen;
7526 return false;
7528 case FLOAT_TRUNCATE:
7529 if (speed)
7531 if (VECTOR_MODE_P (mode))
7533 /* Vector narrowing conversion. */
7534 *cost += extra_cost->vect.alu;
7536 else
7537 *cost += extra_cost->fp[mode == DFmode].narrow;
7539 return false;
7541 case FIX:
7542 case UNSIGNED_FIX:
7543 x = XEXP (x, 0);
7544 /* Strip the rounding part. They will all be implemented
7545 by the fcvt* family of instructions anyway. */
7546 if (GET_CODE (x) == UNSPEC)
7548 unsigned int uns_code = XINT (x, 1);
7550 if (uns_code == UNSPEC_FRINTA
7551 || uns_code == UNSPEC_FRINTM
7552 || uns_code == UNSPEC_FRINTN
7553 || uns_code == UNSPEC_FRINTP
7554 || uns_code == UNSPEC_FRINTZ)
7555 x = XVECEXP (x, 0, 0);
7558 if (speed)
7560 if (VECTOR_MODE_P (mode))
7561 *cost += extra_cost->vect.alu;
7562 else
7563 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7566 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7567 fixed-point fcvt. */
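/* For a rough example, (fix:SI (mult:SF (reg:SF x) (const_double 65536.0)))
   can be emitted as a single fixed-point conversion such as
   "fcvtzs w0, s0, #16" instead of an FMUL followed by an FCVTZS.  */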
7568 if (GET_CODE (x) == MULT
7569 && ((VECTOR_MODE_P (mode)
7570 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7571 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7573 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7574 0, speed);
7575 return true;
7578 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7579 return true;
7581 case ABS:
7582 if (VECTOR_MODE_P (mode))
7584 /* ABS (vector). */
7585 if (speed)
7586 *cost += extra_cost->vect.alu;
7588 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7590 op0 = XEXP (x, 0);
7592 /* FABD, which is analogous to FADD. */
7593 if (GET_CODE (op0) == MINUS)
7595 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7596 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7597 if (speed)
7598 *cost += extra_cost->fp[mode == DFmode].addsub;
7600 return true;
7602 /* Simple FABS is analogous to FNEG. */
7603 if (speed)
7604 *cost += extra_cost->fp[mode == DFmode].neg;
7606 else
7608 /* Integer ABS will either be split to
7609 two arithmetic instructions, or will be an ABS
7610 (scalar), which we don't model. */
7611 *cost = COSTS_N_INSNS (2);
7612 if (speed)
7613 *cost += 2 * extra_cost->alu.arith;
7615 return false;
7617 case SMAX:
7618 case SMIN:
7619 if (speed)
7621 if (VECTOR_MODE_P (mode))
7622 *cost += extra_cost->vect.alu;
7623 else
7625 /* FMAXNM/FMINNM/FMAX/FMIN.
7626 TODO: This may not be accurate for all implementations, but
7627 we do not model this in the cost tables. */
7628 *cost += extra_cost->fp[mode == DFmode].addsub;
7631 return false;
7633 case UNSPEC:
7634 /* The floating point round to integer frint* instructions. */
7635 if (aarch64_frint_unspec_p (XINT (x, 1)))
7637 if (speed)
7638 *cost += extra_cost->fp[mode == DFmode].roundint;
7640 return false;
7643 if (XINT (x, 1) == UNSPEC_RBIT)
7645 if (speed)
7646 *cost += extra_cost->alu.rev;
7648 return false;
7650 break;
7652 case TRUNCATE:
7654 /* Decompose <su>muldi3_highpart. */
7655 if (/* (truncate:DI */
7656 mode == DImode
7657 /* (lshiftrt:TI */
7658 && GET_MODE (XEXP (x, 0)) == TImode
7659 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7660 /* (mult:TI */
7661 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7662 /* (ANY_EXTEND:TI (reg:DI))
7663 (ANY_EXTEND:TI (reg:DI))) */
7664 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7665 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7666 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7667 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7668 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7669 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7670 /* (const_int 64) */
7671 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7672 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7674 /* UMULH/SMULH. */
7675 if (speed)
7676 *cost += extra_cost->mult[mode == DImode].extend;
7677 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7678 mode, MULT, 0, speed);
7679 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7680 mode, MULT, 1, speed);
7681 return true;
7684 /* Fall through. */
7685 default:
7686 break;
7689 if (dump_file
7690 && flag_aarch64_verbose_cost)
7691 fprintf (dump_file,
7692 "\nFailed to cost RTX. Assuming default cost.\n");
7694 return true;
7697 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7698 calculated for X. This cost is stored in *COST. Returns true
7699 if the total cost of X was calculated. */
7700 static bool
7701 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7702 int param, int *cost, bool speed)
7704 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7706 if (dump_file
7707 && flag_aarch64_verbose_cost)
7709 print_rtl_single (dump_file, x);
7710 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7711 speed ? "Hot" : "Cold",
7712 *cost, result ? "final" : "partial");
7715 return result;
7718 static int
7719 aarch64_register_move_cost (machine_mode mode,
7720 reg_class_t from_i, reg_class_t to_i)
7722 enum reg_class from = (enum reg_class) from_i;
7723 enum reg_class to = (enum reg_class) to_i;
7724 const struct cpu_regmove_cost *regmove_cost
7725 = aarch64_tune_params.regmove_cost;
7727 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7728 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7729 to = GENERAL_REGS;
7731 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7732 from = GENERAL_REGS;
7734 /* Moving between a GPR and the stack costs the same as GP2GP. */
7735 if ((from == GENERAL_REGS && to == STACK_REG)
7736 || (to == GENERAL_REGS && from == STACK_REG))
7737 return regmove_cost->GP2GP;
7739 /* To/From the stack register, we move via the gprs. */
7740 if (to == STACK_REG || from == STACK_REG)
7741 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7742 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7744 if (GET_MODE_SIZE (mode) == 16)
7746 /* 128-bit operations on general registers require 2 instructions. */
7747 if (from == GENERAL_REGS && to == GENERAL_REGS)
7748 return regmove_cost->GP2GP * 2;
7749 else if (from == GENERAL_REGS)
7750 return regmove_cost->GP2FP * 2;
7751 else if (to == GENERAL_REGS)
7752 return regmove_cost->FP2GP * 2;
7754 /* When AdvSIMD instructions are disabled it is not possible to move
7755 a 128-bit value directly between Q registers. This is handled in
7756 secondary reload. A general register is used as a scratch to move
7757 the upper DI value and the lower DI value is moved directly,
7758 hence the cost is the sum of three moves. */
7759 if (! TARGET_SIMD)
7760 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7762 return regmove_cost->FP2FP;
7765 if (from == GENERAL_REGS && to == GENERAL_REGS)
7766 return regmove_cost->GP2GP;
7767 else if (from == GENERAL_REGS)
7768 return regmove_cost->GP2FP;
7769 else if (to == GENERAL_REGS)
7770 return regmove_cost->FP2GP;
7772 return regmove_cost->FP2FP;
7775 static int
7776 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7777 reg_class_t rclass ATTRIBUTE_UNUSED,
7778 bool in ATTRIBUTE_UNUSED)
7780 return aarch64_tune_params.memmov_cost;
7783 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7784 to optimize 1.0/sqrt. */
7786 static bool
7787 use_rsqrt_p (machine_mode mode)
7789 return (!flag_trapping_math
7790 && flag_unsafe_math_optimizations
7791 && ((aarch64_tune_params.approx_modes->recip_sqrt
7792 & AARCH64_APPROX_MODE (mode))
7793 || flag_mrecip_low_precision_sqrt));
7796 /* Function to decide when to use the approximate reciprocal square root
7797 builtin. */
7799 static tree
7800 aarch64_builtin_reciprocal (tree fndecl)
7802 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7804 if (!use_rsqrt_p (mode))
7805 return NULL_TREE;
7806 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7809 typedef rtx (*rsqrte_type) (rtx, rtx);
7811 /* Select reciprocal square root initial estimate insn depending on machine
7812 mode. */
7814 static rsqrte_type
7815 get_rsqrte_type (machine_mode mode)
7817 switch (mode)
7819 case DFmode: return gen_aarch64_rsqrtedf;
7820 case SFmode: return gen_aarch64_rsqrtesf;
7821 case V2DFmode: return gen_aarch64_rsqrtev2df;
7822 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7823 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7824 default: gcc_unreachable ();
7828 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7830 /* Select reciprocal square root series step insn depending on machine mode. */
7832 static rsqrts_type
7833 get_rsqrts_type (machine_mode mode)
7835 switch (mode)
7837 case DFmode: return gen_aarch64_rsqrtsdf;
7838 case SFmode: return gen_aarch64_rsqrtssf;
7839 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7840 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7841 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7842 default: gcc_unreachable ();
7846 /* Emit instruction sequence to compute either the approximate square root
7847 or its approximate reciprocal, depending on the flag RECP, and return
7848 whether the sequence was emitted or not. */
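/* A sketch of the math behind the sequence below: FRSQRTE provides an
   initial estimate x of 1/sqrt(d), and each FRSQRTS step computes
   (3 - a*b) / 2, so the loop refines x as x := x * (3 - d*x*x) / 2,
   the Newton-Raphson iteration for the reciprocal square root; the
   SF/DF iteration counts reflect how quickly this converges.  */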
7850 bool
7851 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7853 machine_mode mode = GET_MODE (dst);
7855 if (GET_MODE_INNER (mode) == HFmode)
7856 return false;
7858 machine_mode mmsk = mode_for_vector
7859 (int_mode_for_mode (GET_MODE_INNER (mode)),
7860 GET_MODE_NUNITS (mode));
7861 bool use_approx_sqrt_p = (!recp
7862 && (flag_mlow_precision_sqrt
7863 || (aarch64_tune_params.approx_modes->sqrt
7864 & AARCH64_APPROX_MODE (mode))));
7865 bool use_approx_rsqrt_p = (recp
7866 && (flag_mrecip_low_precision_sqrt
7867 || (aarch64_tune_params.approx_modes->recip_sqrt
7868 & AARCH64_APPROX_MODE (mode))));
7870 if (!flag_finite_math_only
7871 || flag_trapping_math
7872 || !flag_unsafe_math_optimizations
7873 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7874 || optimize_function_for_size_p (cfun))
7875 return false;
7877 rtx xmsk = gen_reg_rtx (mmsk);
7878 if (!recp)
7879 /* When calculating the approximate square root, compare the argument with
7880 0.0 and create a mask. */
7881 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7882 CONST0_RTX (mode)))));
7884 /* Estimate the approximate reciprocal square root. */
7885 rtx xdst = gen_reg_rtx (mode);
7886 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7888 /* Iterate over the series twice for SF and thrice for DF. */
7889 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7891 /* Optionally iterate over the series once less for faster performance
7892 while sacrificing some accuracy. */
7893 if ((recp && flag_mrecip_low_precision_sqrt)
7894 || (!recp && flag_mlow_precision_sqrt))
7895 iterations--;
7897 /* Iterate over the series to calculate the approximate reciprocal square
7898 root. */
7899 rtx x1 = gen_reg_rtx (mode);
7900 while (iterations--)
7902 rtx x2 = gen_reg_rtx (mode);
7903 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7905 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7907 if (iterations > 0)
7908 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7911 if (!recp)
7913 /* Qualify the approximate reciprocal square root when the argument is
7914 0.0 by squashing the intermediate result to 0.0. */
7915 rtx xtmp = gen_reg_rtx (mmsk);
7916 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7917 gen_rtx_SUBREG (mmsk, xdst, 0)));
7918 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7920 /* Calculate the approximate square root. */
7921 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7924 /* Finalize the approximation. */
7925 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7927 return true;
7930 typedef rtx (*recpe_type) (rtx, rtx);
7932 /* Select reciprocal initial estimate insn depending on machine mode. */
7934 static recpe_type
7935 get_recpe_type (machine_mode mode)
7937 switch (mode)
7939 case SFmode: return (gen_aarch64_frecpesf);
7940 case V2SFmode: return (gen_aarch64_frecpev2sf);
7941 case V4SFmode: return (gen_aarch64_frecpev4sf);
7942 case DFmode: return (gen_aarch64_frecpedf);
7943 case V2DFmode: return (gen_aarch64_frecpev2df);
7944 default: gcc_unreachable ();
7948 typedef rtx (*recps_type) (rtx, rtx, rtx);
7950 /* Select reciprocal series step insn depending on machine mode. */
7952 static recps_type
7953 get_recps_type (machine_mode mode)
7955 switch (mode)
7957 case SFmode: return (gen_aarch64_frecpssf);
7958 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7959 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7960 case DFmode: return (gen_aarch64_frecpsdf);
7961 case V2DFmode: return (gen_aarch64_frecpsv2df);
7962 default: gcc_unreachable ();
7966 /* Emit the instruction sequence to compute the approximation for the division
7967 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
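/* A sketch of the math behind the sequence below: FRECPE provides an
   initial estimate x of 1/DEN, and each FRECPS step computes 2 - a*b,
   so the loop refines x as x := x * (2 - DEN*x), the Newton-Raphson
   iteration for the reciprocal; the quotient is then NUM times that
   reciprocal.  */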
7969 bool
7970 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7972 machine_mode mode = GET_MODE (quo);
7974 if (GET_MODE_INNER (mode) == HFmode)
7975 return false;
7977 bool use_approx_division_p = (flag_mlow_precision_div
7978 || (aarch64_tune_params.approx_modes->division
7979 & AARCH64_APPROX_MODE (mode)));
7981 if (!flag_finite_math_only
7982 || flag_trapping_math
7983 || !flag_unsafe_math_optimizations
7984 || optimize_function_for_size_p (cfun)
7985 || !use_approx_division_p)
7986 return false;
7988 /* Estimate the approximate reciprocal. */
7989 rtx xrcp = gen_reg_rtx (mode);
7990 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
7992 /* Iterate over the series twice for SF and thrice for DF. */
7993 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7995 /* Optionally iterate over the series once less for faster performance,
7996 while sacrificing some accuracy. */
7997 if (flag_mlow_precision_div)
7998 iterations--;
8000 /* Iterate over the series to calculate the approximate reciprocal. */
8001 rtx xtmp = gen_reg_rtx (mode);
8002 while (iterations--)
8004 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8006 if (iterations > 0)
8007 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8010 if (num != CONST1_RTX (mode))
8012 /* As the approximate reciprocal of DEN is already calculated, only
8013 calculate the approximate division when NUM is not 1.0. */
8014 rtx xnum = force_reg (mode, num);
8015 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8018 /* Finalize the approximation. */
8019 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8020 return true;
8023 /* Return the number of instructions that can be issued per cycle. */
8024 static int
8025 aarch64_sched_issue_rate (void)
8027 return aarch64_tune_params.issue_rate;
8030 static int
8031 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8033 int issue_rate = aarch64_sched_issue_rate ();
8035 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8039 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8040 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8041 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8043 static int
8044 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8045 int ready_index)
8047 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8051 /* Vectorizer cost model target hooks. */
8053 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8054 static int
8055 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8056 tree vectype,
8057 int misalign ATTRIBUTE_UNUSED)
8059 unsigned elements;
8061 switch (type_of_cost)
8063 case scalar_stmt:
8064 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
8066 case scalar_load:
8067 return aarch64_tune_params.vec_costs->scalar_load_cost;
8069 case scalar_store:
8070 return aarch64_tune_params.vec_costs->scalar_store_cost;
8072 case vector_stmt:
8073 return aarch64_tune_params.vec_costs->vec_stmt_cost;
8075 case vector_load:
8076 return aarch64_tune_params.vec_costs->vec_align_load_cost;
8078 case vector_store:
8079 return aarch64_tune_params.vec_costs->vec_store_cost;
8081 case vec_to_scalar:
8082 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
8084 case scalar_to_vec:
8085 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
8087 case unaligned_load:
8088 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
8090 case unaligned_store:
8091 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
8093 case cond_branch_taken:
8094 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
8096 case cond_branch_not_taken:
8097 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
8099 case vec_perm:
8100 return aarch64_tune_params.vec_costs->vec_permute_cost;
8102 case vec_promote_demote:
8103 return aarch64_tune_params.vec_costs->vec_stmt_cost;
8105 case vec_construct:
8106 elements = TYPE_VECTOR_SUBPARTS (vectype);
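/* For example, constructing a V4SF vector from scalars is costed as
   4 / 2 + 1 == 3 under the heuristic below.  */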
8107 return elements / 2 + 1;
8109 default:
8110 gcc_unreachable ();
8114 /* Implement targetm.vectorize.add_stmt_cost. */
8115 static unsigned
8116 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8117 struct _stmt_vec_info *stmt_info, int misalign,
8118 enum vect_cost_model_location where)
8120 unsigned *cost = (unsigned *) data;
8121 unsigned retval = 0;
8123 if (flag_vect_cost_model)
8125 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8126 int stmt_cost =
8127 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8129 /* Statements in an inner loop relative to the loop being
8130 vectorized are weighted more heavily. The value here is
8131 arbitrary and could potentially be improved with analysis. */
8132 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8133 count *= 50; /* FIXME */
8135 retval = (unsigned) (count * stmt_cost);
8136 cost[where] += retval;
8139 return retval;
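/* For illustration, assuming a tuning structure whose vec_stmt_cost is 1:
   a vector statement counted twice inside a nested inner loop contributes
   2 * 50 * 1 = 100 to the vect_body bucket, whereas the same statement
   outside an inner loop contributes only 2 * 1 = 2.  */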
8142 static void initialize_aarch64_code_model (struct gcc_options *);
8144 /* Parse the TO_PARSE string and put the architecture struct that it
8145 selects into RES and the architectural features into ISA_FLAGS.
8146 Return an aarch64_parse_opt_result describing the parse result.
8147 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8149 static enum aarch64_parse_opt_result
8150 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8151 unsigned long *isa_flags)
8153 char *ext;
8154 const struct processor *arch;
8155 char *str = (char *) alloca (strlen (to_parse) + 1);
8156 size_t len;
8158 strcpy (str, to_parse);
8160 ext = strchr (str, '+');
8162 if (ext != NULL)
8163 len = ext - str;
8164 else
8165 len = strlen (str);
8167 if (len == 0)
8168 return AARCH64_PARSE_MISSING_ARG;
8171 /* Loop through the list of supported ARCHes to find a match. */
8172 for (arch = all_architectures; arch->name != NULL; arch++)
8174 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8176 unsigned long isa_temp = arch->flags;
8178 if (ext != NULL)
8180 /* TO_PARSE string contains at least one extension. */
8181 enum aarch64_parse_opt_result ext_res
8182 = aarch64_parse_extension (ext, &isa_temp);
8184 if (ext_res != AARCH64_PARSE_OK)
8185 return ext_res;
8187 /* Extension parsing was successful. Confirm the result
8188 arch and ISA flags. */
8189 *res = arch;
8190 *isa_flags = isa_temp;
8191 return AARCH64_PARSE_OK;
8195 /* ARCH name not found in list. */
8196 return AARCH64_PARSE_INVALID_ARG;
8199 /* Parse the TO_PARSE string and put the CPU that it selects into RES
8200 and the architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8201 describing the parse result. If there is an error parsing, RES and
8202 ISA_FLAGS are left unchanged. */
8204 static enum aarch64_parse_opt_result
8205 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8206 unsigned long *isa_flags)
8208 char *ext;
8209 const struct processor *cpu;
8210 char *str = (char *) alloca (strlen (to_parse) + 1);
8211 size_t len;
8213 strcpy (str, to_parse);
8215 ext = strchr (str, '+');
8217 if (ext != NULL)
8218 len = ext - str;
8219 else
8220 len = strlen (str);
8222 if (len == 0)
8223 return AARCH64_PARSE_MISSING_ARG;
8226 /* Loop through the list of supported CPUs to find a match. */
8227 for (cpu = all_cores; cpu->name != NULL; cpu++)
8229 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8231 unsigned long isa_temp = cpu->flags;
8234 if (ext != NULL)
8236 /* TO_PARSE string contains at least one extension. */
8237 enum aarch64_parse_opt_result ext_res
8238 = aarch64_parse_extension (ext, &isa_temp);
8240 if (ext_res != AARCH64_PARSE_OK)
8241 return ext_res;
8243 /* Extension parsing was successful. Confirm the result
8244 cpu and ISA flags. */
8245 *res = cpu;
8246 *isa_flags = isa_temp;
8247 return AARCH64_PARSE_OK;
8251 /* CPU name not found in list. */
8252 return AARCH64_PARSE_INVALID_ARG;
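/* For illustration (the names below are examples, not an exhaustive
   list): for -mcpu=cortex-a57+crypto, TO_PARSE is "cortex-a57+crypto",
   EXT points at "+crypto" and LEN covers "cortex-a57", so the core is
   looked up by name in all_cores and "+crypto" is handed to
   aarch64_parse_extension to adjust ISA_TEMP before both results are
   stored through RES and ISA_FLAGS.  */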
8255 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8256 Return an aarch64_parse_opt_result describing the parse result.
8257 If the parsing fails, RES does not change. */
8259 static enum aarch64_parse_opt_result
8260 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8262 const struct processor *cpu;
8263 char *str = (char *) alloca (strlen (to_parse) + 1);
8265 strcpy (str, to_parse);
8267 /* Loop through the list of supported CPUs to find a match. */
8268 for (cpu = all_cores; cpu->name != NULL; cpu++)
8270 if (strcmp (cpu->name, str) == 0)
8272 *res = cpu;
8273 return AARCH64_PARSE_OK;
8277 /* CPU name not found in list. */
8278 return AARCH64_PARSE_INVALID_ARG;
8281 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8282 described in FLAG. If it is, return the index bit for that fusion type.
8283 If not, error (printing OPTION_NAME) and return zero. */
8285 static unsigned int
8286 aarch64_parse_one_option_token (const char *token,
8287 size_t length,
8288 const struct aarch64_flag_desc *flag,
8289 const char *option_name)
8291 for (; flag->name != NULL; flag++)
8293 if (length == strlen (flag->name)
8294 && !strncmp (flag->name, token, length))
8295 return flag->flag;
8298 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8299 return 0;
8302 /* Parse OPTION which is a comma-separated list of flags to enable.
8303 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8304 default state we inherit from the CPU tuning structures. OPTION_NAME
8305 gives the top-level option we are parsing in the -moverride string,
8306 for use in error messages. */
8308 static unsigned int
8309 aarch64_parse_boolean_options (const char *option,
8310 const struct aarch64_flag_desc *flags,
8311 unsigned int initial_state,
8312 const char *option_name)
8314 const char separator = '.';
8315 const char* specs = option;
8316 const char* ntoken = option;
8317 unsigned int found_flags = initial_state;
8319 while ((ntoken = strchr (specs, separator)))
8321 size_t token_length = ntoken - specs;
8322 unsigned token_ops = aarch64_parse_one_option_token (specs,
8323 token_length,
8324 flags,
8325 option_name);
8326 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8327 in the token stream, reset the supported operations. So:
8329 adrp+add.cmp+branch.none.adrp+add
8331 would have the result of turning on only adrp+add fusion. */
8332 if (!token_ops)
8333 found_flags = 0;
8335 found_flags |= token_ops;
8336 specs = ++ntoken;
8339 /* The string ended with a trailing separator; complain. */
8340 if (!(*specs))
8342 error ("%s string ill-formed\n", option_name);
8343 return 0;
8346 /* We still have one more token to parse. */
8347 size_t token_length = strlen (specs);
8348 unsigned token_ops = aarch64_parse_one_option_token (specs,
8349 token_length,
8350 flags,
8351 option_name);
8352 if (!token_ops)
8353 found_flags = 0;
8355 found_flags |= token_ops;
8356 return found_flags;
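/* Worked example of the parse above: for the string
   "adrp+add.cmp+branch.none.adrp+add" the tokens are processed in order,
   first accumulating the adrp+add and cmp+branch bits; "none" maps to no
   bits, so FOUND_FLAGS is reset; and the final token leaves only the
   adrp+add fusion bit set, as the comment inside the loop describes.  */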
8359 /* Support for overriding instruction fusion. */
8361 static void
8362 aarch64_parse_fuse_string (const char *fuse_string,
8363 struct tune_params *tune)
8365 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8366 aarch64_fusible_pairs,
8367 tune->fusible_ops,
8368 "fuse=");
8371 /* Support for overriding other tuning flags. */
8373 static void
8374 aarch64_parse_tune_string (const char *tune_string,
8375 struct tune_params *tune)
8377 tune->extra_tuning_flags
8378 = aarch64_parse_boolean_options (tune_string,
8379 aarch64_tuning_flags,
8380 tune->extra_tuning_flags,
8381 "tune=");
8384 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8385 we understand. If it is, extract the option string and hand it off to
8386 the appropriate function. */
8388 void
8389 aarch64_parse_one_override_token (const char* token,
8390 size_t length,
8391 struct tune_params *tune)
8393 const struct aarch64_tuning_override_function *fn
8394 = aarch64_tuning_override_functions;
8396 const char *option_part = strchr (token, '=');
8397 if (!option_part)
8399 error ("tuning string missing in option (%s)", token);
8400 return;
8403 /* Get the length of the option name. */
8404 length = option_part - token;
8405 /* Skip the '=' to get to the option string. */
8406 option_part++;
8408 for (; fn->name != NULL; fn++)
8410 if (!strncmp (fn->name, token, length))
8412 fn->parse_override (option_part, tune);
8413 return;
8417 error ("unknown tuning option (%s)",token);
8418 return;
8421 /* A checking mechanism for the implementation of the TLS size. */
8423 static void
8424 initialize_aarch64_tls_size (struct gcc_options *opts)
8426 if (aarch64_tls_size == 0)
8427 aarch64_tls_size = 24;
8429 switch (opts->x_aarch64_cmodel_var)
8431 case AARCH64_CMODEL_TINY:
8432 /* Both the default and maximum TLS size allowed under tiny are 1M,
8433 which needs two instructions to address, so we clamp the size to 24. */
8434 if (aarch64_tls_size > 24)
8435 aarch64_tls_size = 24;
8436 break;
8437 case AARCH64_CMODEL_SMALL:
8438 /* The maximum TLS size allowed under small is 4G. */
8439 if (aarch64_tls_size > 32)
8440 aarch64_tls_size = 32;
8441 break;
8442 case AARCH64_CMODEL_LARGE:
8443 /* The maximum TLS size allowed under large is 16E.
8444 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset for now. */
8445 if (aarch64_tls_size > 48)
8446 aarch64_tls_size = 48;
8447 break;
8448 default:
8449 gcc_unreachable ();
8452 return;
8455 /* Parse STRING looking for options in the format:
8456 string :: option:string
8457 option :: name=substring
8458 name :: {a-z}
8459 substring :: defined by option. */
8461 static void
8462 aarch64_parse_override_string (const char* input_string,
8463 struct tune_params* tune)
8465 const char separator = ':';
8466 size_t string_length = strlen (input_string) + 1;
8467 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8468 char *string = string_root;
8469 strncpy (string, input_string, string_length);
8470 string[string_length - 1] = '\0';
8472 char* ntoken = string;
8474 while ((ntoken = strchr (string, separator)))
8476 size_t token_length = ntoken - string;
8477 /* Make this substring look like a string. */
8478 *ntoken = '\0';
8479 aarch64_parse_one_override_token (string, token_length, tune);
8480 string = ++ntoken;
8483 /* One last option to parse. */
8484 aarch64_parse_one_override_token (string, strlen (string), tune);
8485 free (string_root);
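/* For illustration (the tuning flag name below is hypothetical): an
   option such as -moverride=fuse=adrp+add.cmp+branch:tune=some_flag is
   first split on ':' into "fuse=adrp+add.cmp+branch" and
   "tune=some_flag", and each token is then dispatched by
   aarch64_parse_one_override_token to the matching entry in
   aarch64_tuning_override_functions.  */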
8489 static void
8490 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8492 /* The logic here is that if we are disabling all frame pointer generation
8493 then we do not need to disable leaf frame pointer generation as a
8494 separate operation. But if we are *only* disabling leaf frame pointer
8495 generation then we set flag_omit_frame_pointer to true, but in
8496 aarch64_frame_pointer_required we return false only for leaf functions.
8498 PR 70044: We have to be careful about being called multiple times for the
8499 same function. Once we have decided to set flag_omit_frame_pointer just
8500 so that we can omit leaf frame pointers, we must then not interpret a
8501 second call as meaning that all frame pointer generation should be
8502 omitted. We do this by setting flag_omit_frame_pointer to a special,
8503 non-zero value. */
8504 if (opts->x_flag_omit_frame_pointer == 2)
8505 opts->x_flag_omit_frame_pointer = 0;
8507 if (opts->x_flag_omit_frame_pointer)
8508 opts->x_flag_omit_leaf_frame_pointer = false;
8509 else if (opts->x_flag_omit_leaf_frame_pointer)
8510 opts->x_flag_omit_frame_pointer = 2;
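/* Summary of the encoding just above: x_flag_omit_frame_pointer == 1
   means the user asked for all frame pointers to be omitted (so leaf
   frame pointers go too), while the special value 2 records that the
   flag was only set internally to honour -momit-leaf-frame-pointer, so
   a later call resets it to 0 rather than treating it as a request to
   omit every frame pointer.  */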
8512 /* If not optimizing for size, set the default
8513 alignment to what the target wants. */
8514 if (!opts->x_optimize_size)
8516 if (opts->x_align_loops <= 0)
8517 opts->x_align_loops = aarch64_tune_params.loop_align;
8518 if (opts->x_align_jumps <= 0)
8519 opts->x_align_jumps = aarch64_tune_params.jump_align;
8520 if (opts->x_align_functions <= 0)
8521 opts->x_align_functions = aarch64_tune_params.function_align;
8524 /* We default to no pc-relative literal loads. */
8526 aarch64_pcrelative_literal_loads = false;
8528 /* If -mpc-relative-literal-loads is set on the command line, this
8529 implies that the user asked for PC relative literal loads. */
8530 if (opts->x_pcrelative_literal_loads == 1)
8531 aarch64_pcrelative_literal_loads = true;
8533 /* This is PR70113. When building the Linux kernel with
8534 CONFIG_ARM64_ERRATUM_843419, support for relocations
8535 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8536 removed from the kernel to avoid loading objects with possibly
8537 offending sequences. Without -mpc-relative-literal-loads we would
8538 generate such relocations, preventing the kernel build from
8539 succeeding. */
8540 if (opts->x_pcrelative_literal_loads == 2
8541 && TARGET_FIX_ERR_A53_843419)
8542 aarch64_pcrelative_literal_loads = true;
8544 /* In the tiny memory model it makes no sense to disallow PC relative
8545 literal pool loads. */
8546 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8547 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8548 aarch64_pcrelative_literal_loads = true;
8550 /* When enabling the lower precision Newton series for the square root, also
8551 enable it for the reciprocal square root, since the latter is an
8552 intermediary step for the former. */
8553 if (flag_mlow_precision_sqrt)
8554 flag_mrecip_low_precision_sqrt = true;
8557 /* 'Unpack' the internal tuning structs and update the options
8558 in OPTS. The caller must have set up selected_tune and selected_arch
8559 as all the other target-specific codegen decisions are
8560 derived from them. */
8562 void
8563 aarch64_override_options_internal (struct gcc_options *opts)
8565 aarch64_tune_flags = selected_tune->flags;
8566 aarch64_tune = selected_tune->sched_core;
8567 /* Make a copy of the tuning parameters attached to the core, which
8568 we may later overwrite. */
8569 aarch64_tune_params = *(selected_tune->tune);
8570 aarch64_architecture_version = selected_arch->architecture_version;
8572 if (opts->x_aarch64_override_tune_string)
8573 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8574 &aarch64_tune_params);
8576 /* This target defaults to strict volatile bitfields. */
8577 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8578 opts->x_flag_strict_volatile_bitfields = 1;
8580 initialize_aarch64_code_model (opts);
8581 initialize_aarch64_tls_size (opts);
8583 int queue_depth = 0;
8584 switch (aarch64_tune_params.autoprefetcher_model)
8586 case tune_params::AUTOPREFETCHER_OFF:
8587 queue_depth = -1;
8588 break;
8589 case tune_params::AUTOPREFETCHER_WEAK:
8590 queue_depth = 0;
8591 break;
8592 case tune_params::AUTOPREFETCHER_STRONG:
8593 queue_depth = max_insn_queue_index + 1;
8594 break;
8595 default:
8596 gcc_unreachable ();
8599 /* We don't mind passing in global_options_set here as we don't use
8600 the *options_set structs anyway. */
8601 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8602 queue_depth,
8603 opts->x_param_values,
8604 global_options_set.x_param_values);
8606 /* Set the L1 cache line size. */
8607 if (selected_cpu->tune->cache_line_size != 0)
8608 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8609 selected_cpu->tune->cache_line_size,
8610 opts->x_param_values,
8611 global_options_set.x_param_values);
8613 aarch64_override_options_after_change_1 (opts);
8616 /* Print a hint with a suggestion for a core or architecture name that
8617 most closely resembles what the user passed in STR. ARCH is true if
8618 the user is asking for an architecture name. ARCH is false if the user
8619 is asking for a core name. */
8621 static void
8622 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8624 auto_vec<const char *> candidates;
8625 const struct processor *entry = arch ? all_architectures : all_cores;
8626 for (; entry->name != NULL; entry++)
8627 candidates.safe_push (entry->name);
8628 char *s;
8629 const char *hint = candidates_list_and_hint (str, s, candidates);
8630 if (hint)
8631 inform (input_location, "valid arguments are: %s;"
8632 " did you mean %qs?", s, hint);
8633 XDELETEVEC (s);
8636 /* Print a hint with a suggestion for a core name that most closely resembles
8637 what the user passed in STR. */
8639 inline static void
8640 aarch64_print_hint_for_core (const char *str)
8642 aarch64_print_hint_for_core_or_arch (str, false);
8645 /* Print a hint with a suggestion for an architecture name that most closely
8646 resembles what the user passed in STR. */
8648 inline static void
8649 aarch64_print_hint_for_arch (const char *str)
8651 aarch64_print_hint_for_core_or_arch (str, true);
8654 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8655 specified in STR and throw errors if appropriate. Put the results,
8656 if they are valid, in RES and ISA_FLAGS. Return whether the option is
8657 valid. */
8659 static bool
8660 aarch64_validate_mcpu (const char *str, const struct processor **res,
8661 unsigned long *isa_flags)
8663 enum aarch64_parse_opt_result parse_res
8664 = aarch64_parse_cpu (str, res, isa_flags);
8666 if (parse_res == AARCH64_PARSE_OK)
8667 return true;
8669 switch (parse_res)
8671 case AARCH64_PARSE_MISSING_ARG:
8672 error ("missing cpu name in -mcpu=%qs", str);
8673 break;
8674 case AARCH64_PARSE_INVALID_ARG:
8675 error ("unknown value %qs for -mcpu", str);
8676 aarch64_print_hint_for_core (str);
8677 break;
8678 case AARCH64_PARSE_INVALID_FEATURE:
8679 error ("invalid feature modifier in -mcpu=%qs", str);
8680 break;
8681 default:
8682 gcc_unreachable ();
8685 return false;
8688 /* Validate a command-line -march option. Parse the arch and extensions
8689 (if any) specified in STR and throw errors if appropriate. Put the
8690 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8691 option is valid. */
8693 static bool
8694 aarch64_validate_march (const char *str, const struct processor **res,
8695 unsigned long *isa_flags)
8697 enum aarch64_parse_opt_result parse_res
8698 = aarch64_parse_arch (str, res, isa_flags);
8700 if (parse_res == AARCH64_PARSE_OK)
8701 return true;
8703 switch (parse_res)
8705 case AARCH64_PARSE_MISSING_ARG:
8706 error ("missing arch name in -march=%qs", str);
8707 break;
8708 case AARCH64_PARSE_INVALID_ARG:
8709 error ("unknown value %qs for -march", str);
8710 aarch64_print_hint_for_arch (str);
8711 break;
8712 case AARCH64_PARSE_INVALID_FEATURE:
8713 error ("invalid feature modifier in -march=%qs", str);
8714 break;
8715 default:
8716 gcc_unreachable ();
8719 return false;
8722 /* Validate a command-line -mtune option. Parse the cpu
8723 specified in STR and throw errors if appropriate. Put the
8724 result, if it is valid, in RES. Return whether the option is
8725 valid. */
8727 static bool
8728 aarch64_validate_mtune (const char *str, const struct processor **res)
8730 enum aarch64_parse_opt_result parse_res
8731 = aarch64_parse_tune (str, res);
8733 if (parse_res == AARCH64_PARSE_OK)
8734 return true;
8736 switch (parse_res)
8738 case AARCH64_PARSE_MISSING_ARG:
8739 error ("missing cpu name in -mtune=%qs", str);
8740 break;
8741 case AARCH64_PARSE_INVALID_ARG:
8742 error ("unknown value %qs for -mtune", str);
8743 aarch64_print_hint_for_core (str);
8744 break;
8745 default:
8746 gcc_unreachable ();
8748 return false;
8751 /* Return the CPU corresponding to the enum CPU.
8752 If it doesn't specify a cpu, return the default. */
8754 static const struct processor *
8755 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8757 if (cpu != aarch64_none)
8758 return &all_cores[cpu];
8760 /* The & 0x3f is to extract the bottom 6 bits that encode the
8761 default cpu as selected by the --with-cpu GCC configure option
8762 in config.gcc.
8763 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8764 flags mechanism should be reworked to make it more sane. */
8765 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
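/* For illustration of the encoding: TARGET_CPU_DEFAULT packs the
   configure-time default into one word, with the core identifier in the
   low 6 bits (extracted here with & 0x3f) and the corresponding default
   ISA flags in the bits above (extracted elsewhere in this file with
   TARGET_CPU_DEFAULT >> 6).  */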
8768 /* Return the architecture corresponding to the enum ARCH.
8769 If it doesn't specify a valid architecture, return the default. */
8771 static const struct processor *
8772 aarch64_get_arch (enum aarch64_arch arch)
8774 if (arch != aarch64_no_arch)
8775 return &all_architectures[arch];
8777 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8779 return &all_architectures[cpu->arch];
8782 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8783 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8784 tuning structs. In particular it must set selected_tune and
8785 aarch64_isa_flags that define the available ISA features and tuning
8786 decisions. It must also set selected_arch as this will be used to
8787 output the .arch asm tags for each function. */
8789 static void
8790 aarch64_override_options (void)
8792 unsigned long cpu_isa = 0;
8793 unsigned long arch_isa = 0;
8794 aarch64_isa_flags = 0;
8796 bool valid_cpu = true;
8797 bool valid_tune = true;
8798 bool valid_arch = true;
8800 selected_cpu = NULL;
8801 selected_arch = NULL;
8802 selected_tune = NULL;
8804 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8805 If either of -march or -mtune is given, they override their
8806 respective component of -mcpu. */
8807 if (aarch64_cpu_string)
8808 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8809 &cpu_isa);
8811 if (aarch64_arch_string)
8812 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8813 &arch_isa);
8815 if (aarch64_tune_string)
8816 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8818 /* If the user did not specify a processor, choose the default
8819 one for them. This will be the CPU set during configuration using
8820 --with-cpu, otherwise it is "generic". */
8821 if (!selected_cpu)
8823 if (selected_arch)
8825 selected_cpu = &all_cores[selected_arch->ident];
8826 aarch64_isa_flags = arch_isa;
8827 explicit_arch = selected_arch->arch;
8829 else
8831 /* Get default configure-time CPU. */
8832 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8833 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8836 if (selected_tune)
8837 explicit_tune_core = selected_tune->ident;
8839 /* If both -mcpu and -march are specified check that they are architecturally
8840 compatible, warn if they're not and prefer the -march ISA flags. */
8841 else if (selected_arch)
8843 if (selected_arch->arch != selected_cpu->arch)
8845 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8846 all_architectures[selected_cpu->arch].name,
8847 selected_arch->name);
8849 aarch64_isa_flags = arch_isa;
8850 explicit_arch = selected_arch->arch;
8851 explicit_tune_core = selected_tune ? selected_tune->ident
8852 : selected_cpu->ident;
8854 else
8856 /* -mcpu but no -march. */
8857 aarch64_isa_flags = cpu_isa;
8858 explicit_tune_core = selected_tune ? selected_tune->ident
8859 : selected_cpu->ident;
8860 gcc_assert (selected_cpu);
8861 selected_arch = &all_architectures[selected_cpu->arch];
8862 explicit_arch = selected_arch->arch;
8865 /* Set the arch as well, as we will need it when outputting
8866 the .arch directive in assembly. */
8867 if (!selected_arch)
8869 gcc_assert (selected_cpu);
8870 selected_arch = &all_architectures[selected_cpu->arch];
8873 if (!selected_tune)
8874 selected_tune = selected_cpu;
8876 #ifndef HAVE_AS_MABI_OPTION
8877 /* The compiler may have been configured with 2.23.* binutils, which does
8878 not have support for ILP32. */
8879 if (TARGET_ILP32)
8880 error ("Assembler does not support -mabi=ilp32");
8881 #endif
8883 /* Make sure we properly set up the explicit options. */
8884 if ((aarch64_cpu_string && valid_cpu)
8885 || (aarch64_tune_string && valid_tune))
8886 gcc_assert (explicit_tune_core != aarch64_none);
8888 if ((aarch64_cpu_string && valid_cpu)
8889 || (aarch64_arch_string && valid_arch))
8890 gcc_assert (explicit_arch != aarch64_no_arch);
8892 aarch64_override_options_internal (&global_options);
8894 /* Save these options as the default ones in case we push and pop them later
8895 while processing functions with potential target attributes. */
8896 target_option_default_node = target_option_current_node
8897 = build_target_option_node (&global_options);
8900 /* Implement targetm.override_options_after_change. */
8902 static void
8903 aarch64_override_options_after_change (void)
8905 aarch64_override_options_after_change_1 (&global_options);
8908 static struct machine_function *
8909 aarch64_init_machine_status (void)
8911 struct machine_function *machine;
8912 machine = ggc_cleared_alloc<machine_function> ();
8913 return machine;
8916 void
8917 aarch64_init_expanders (void)
8919 init_machine_status = aarch64_init_machine_status;
8922 /* A checking mechanism for the implementation of the various code models. */
8923 static void
8924 initialize_aarch64_code_model (struct gcc_options *opts)
8926 if (opts->x_flag_pic)
8928 switch (opts->x_aarch64_cmodel_var)
8930 case AARCH64_CMODEL_TINY:
8931 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8932 break;
8933 case AARCH64_CMODEL_SMALL:
8934 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8935 aarch64_cmodel = (flag_pic == 2
8936 ? AARCH64_CMODEL_SMALL_PIC
8937 : AARCH64_CMODEL_SMALL_SPIC);
8938 #else
8939 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8940 #endif
8941 break;
8942 case AARCH64_CMODEL_LARGE:
8943 sorry ("code model %qs with -f%s", "large",
8944 opts->x_flag_pic > 1 ? "PIC" : "pic");
8945 break;
8946 default:
8947 gcc_unreachable ();
8950 else
8951 aarch64_cmodel = opts->x_aarch64_cmodel_var;
8954 /* Implement TARGET_OPTION_SAVE. */
8956 static void
8957 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8959 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8962 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8963 using the information saved in PTR. */
8965 static void
8966 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8968 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8969 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8970 opts->x_explicit_arch = ptr->x_explicit_arch;
8971 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8972 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8974 aarch64_override_options_internal (opts);
8977 /* Implement TARGET_OPTION_PRINT. */
8979 static void
8980 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8982 const struct processor *cpu
8983 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8984 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8985 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8986 std::string extension
8987 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8989 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8990 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8991 arch->name, extension.c_str ());
8994 static GTY(()) tree aarch64_previous_fndecl;
8996 void
8997 aarch64_reset_previous_fndecl (void)
8999 aarch64_previous_fndecl = NULL;
9002 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9003 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9004 make sure optab availability predicates are recomputed when necessary. */
9006 void
9007 aarch64_save_restore_target_globals (tree new_tree)
9009 if (TREE_TARGET_GLOBALS (new_tree))
9010 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9011 else if (new_tree == target_option_default_node)
9012 restore_target_globals (&default_target_globals);
9013 else
9014 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9017 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9018 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9019 of the function, if such exists. This function may be called multiple
9020 times on a single function so use aarch64_previous_fndecl to avoid
9021 setting up identical state. */
9023 static void
9024 aarch64_set_current_function (tree fndecl)
9026 if (!fndecl || fndecl == aarch64_previous_fndecl)
9027 return;
9029 tree old_tree = (aarch64_previous_fndecl
9030 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9031 : NULL_TREE);
9033 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9035 /* If current function has no attributes but the previous one did,
9036 use the default node. */
9037 if (!new_tree && old_tree)
9038 new_tree = target_option_default_node;
9040 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9041 the default have been handled by aarch64_save_restore_target_globals from
9042 aarch64_pragma_target_parse. */
9043 if (old_tree == new_tree)
9044 return;
9046 aarch64_previous_fndecl = fndecl;
9048 /* First set the target options. */
9049 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9051 aarch64_save_restore_target_globals (new_tree);
9054 /* Enum describing the various ways we can handle attributes.
9055 In many cases we can reuse the generic option handling machinery. */
9057 enum aarch64_attr_opt_type
9059 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9060 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9061 aarch64_attr_enum, /* Attribute sets an enum variable. */
9062 aarch64_attr_custom /* Attribute requires a custom handling function. */
9065 /* All the information needed to handle a target attribute.
9066 NAME is the name of the attribute.
9067 ATTR_TYPE specifies the type of behavior of the attribute as described
9068 in the definition of enum aarch64_attr_opt_type.
9069 ALLOW_NEG is true if the attribute supports a "no-" form.
9070 HANDLER is the function that takes the attribute string and whether
9071 it is a pragma or attribute and handles the option. It is needed only
9072 when the ATTR_TYPE is aarch64_attr_custom.
9073 OPT_NUM is the enum specifying the option that the attribute modifies.
9074 This is needed for attributes that mirror the behavior of a command-line
9075 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9076 aarch64_attr_enum. */
9078 struct aarch64_attribute_info
9080 const char *name;
9081 enum aarch64_attr_opt_type attr_type;
9082 bool allow_neg;
9083 bool (*handler) (const char *, const char *);
9084 enum opt_code opt_num;
9087 /* Handle the ARCH_STR argument to the arch= target attribute.
9088 PRAGMA_OR_ATTR is used in potential error messages. */
9090 static bool
9091 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9093 const struct processor *tmp_arch = NULL;
9094 enum aarch64_parse_opt_result parse_res
9095 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9097 if (parse_res == AARCH64_PARSE_OK)
9099 gcc_assert (tmp_arch);
9100 selected_arch = tmp_arch;
9101 explicit_arch = selected_arch->arch;
9102 return true;
9105 switch (parse_res)
9107 case AARCH64_PARSE_MISSING_ARG:
9108 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9109 break;
9110 case AARCH64_PARSE_INVALID_ARG:
9111 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9112 aarch64_print_hint_for_arch (str);
9113 break;
9114 case AARCH64_PARSE_INVALID_FEATURE:
9115 error ("invalid feature modifier %qs for 'arch' target %s",
9116 str, pragma_or_attr);
9117 break;
9118 default:
9119 gcc_unreachable ();
9122 return false;
9125 /* Handle the argument CPU_STR to the cpu= target attribute.
9126 PRAGMA_OR_ATTR is used in potential error messages. */
9128 static bool
9129 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9131 const struct processor *tmp_cpu = NULL;
9132 enum aarch64_parse_opt_result parse_res
9133 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9135 if (parse_res == AARCH64_PARSE_OK)
9137 gcc_assert (tmp_cpu);
9138 selected_tune = tmp_cpu;
9139 explicit_tune_core = selected_tune->ident;
9141 selected_arch = &all_architectures[tmp_cpu->arch];
9142 explicit_arch = selected_arch->arch;
9143 return true;
9146 switch (parse_res)
9148 case AARCH64_PARSE_MISSING_ARG:
9149 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9150 break;
9151 case AARCH64_PARSE_INVALID_ARG:
9152 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9153 aarch64_print_hint_for_core (str);
9154 break;
9155 case AARCH64_PARSE_INVALID_FEATURE:
9156 error ("invalid feature modifier %qs for 'cpu' target %s",
9157 str, pragma_or_attr);
9158 break;
9159 default:
9160 gcc_unreachable ();
9163 return false;
9166 /* Handle the argument STR to the tune= target attribute.
9167 PRAGMA_OR_ATTR is used in potential error messages. */
9169 static bool
9170 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9172 const struct processor *tmp_tune = NULL;
9173 enum aarch64_parse_opt_result parse_res
9174 = aarch64_parse_tune (str, &tmp_tune);
9176 if (parse_res == AARCH64_PARSE_OK)
9178 gcc_assert (tmp_tune);
9179 selected_tune = tmp_tune;
9180 explicit_tune_core = selected_tune->ident;
9181 return true;
9184 switch (parse_res)
9186 case AARCH64_PARSE_INVALID_ARG:
9187 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9188 aarch64_print_hint_for_core (str);
9189 break;
9190 default:
9191 gcc_unreachable ();
9194 return false;
9197 /* Parse an architecture extensions target attribute string specified in STR.
9198 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9199 if successful. Update aarch64_isa_flags to reflect the ISA features
9200 modified.
9201 PRAGMA_OR_ATTR is used in potential error messages. */
9203 static bool
9204 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9206 enum aarch64_parse_opt_result parse_res;
9207 unsigned long isa_flags = aarch64_isa_flags;
9209 /* We allow "+nothing" in the beginning to clear out all architectural
9210 features if the user wants to handpick specific features. */
9211 if (strncmp ("+nothing", str, 8) == 0)
9213 isa_flags = 0;
9214 str += 8;
9217 parse_res = aarch64_parse_extension (str, &isa_flags);
9219 if (parse_res == AARCH64_PARSE_OK)
9221 aarch64_isa_flags = isa_flags;
9222 return true;
9225 switch (parse_res)
9227 case AARCH64_PARSE_MISSING_ARG:
9228 error ("missing feature modifier in target %s %qs",
9229 pragma_or_attr, str);
9230 break;
9232 case AARCH64_PARSE_INVALID_FEATURE:
9233 error ("invalid feature modifier in target %s %qs",
9234 pragma_or_attr, str);
9235 break;
9237 default:
9238 gcc_unreachable ();
9241 return false;
9244 /* The target attributes that we support. On top of these we also support just
9245 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9246 handled explicitly in aarch64_process_one_target_attr. */
9248 static const struct aarch64_attribute_info aarch64_attributes[] =
9250 { "general-regs-only", aarch64_attr_mask, false, NULL,
9251 OPT_mgeneral_regs_only },
9252 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9253 OPT_mfix_cortex_a53_835769 },
9254 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9255 OPT_mfix_cortex_a53_843419 },
9256 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9257 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9258 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9259 OPT_momit_leaf_frame_pointer },
9260 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9261 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9262 OPT_march_ },
9263 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9264 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9265 OPT_mtune_ },
9266 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
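/* For illustration, typical uses of these attributes in user code (the
   architecture, extension and core names are only examples):

     __attribute__ ((target ("arch=armv8-a+crc"))) void f1 (void);
     __attribute__ ((target ("cpu=cortex-a57+nocrypto"))) void f2 (void);
     __attribute__ ((target ("no-omit-leaf-frame-pointer"))) void f3 (void);
     __attribute__ ((target ("+nothing+simd"))) void f4 (void);

   The first three are matched against the table above; the last form,
   beginning with '+', is the bare ISA-extension case that
   aarch64_process_one_target_attr handles directly.  */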
9269 /* Parse ARG_STR which contains the definition of one target attribute.
9270 Show appropriate errors if any or return true if the attribute is valid.
9271 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9272 we're processing a target attribute or pragma. */
9274 static bool
9275 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9277 bool invert = false;
9279 size_t len = strlen (arg_str);
9281 if (len == 0)
9283 error ("malformed target %s", pragma_or_attr);
9284 return false;
9287 char *str_to_check = (char *) alloca (len + 1);
9288 strcpy (str_to_check, arg_str);
9290 /* Skip leading whitespace. */
9291 while (*str_to_check == ' ' || *str_to_check == '\t')
9292 str_to_check++;
9294 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9295 It is easier to detect and handle it explicitly here rather than going
9296 through the machinery for the rest of the target attributes in this
9297 function. */
9298 if (*str_to_check == '+')
9299 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9301 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9303 invert = true;
9304 str_to_check += 3;
9306 char *arg = strchr (str_to_check, '=');
9308 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9309 and point ARG to "foo". */
9310 if (arg)
9312 *arg = '\0';
9313 arg++;
9315 const struct aarch64_attribute_info *p_attr;
9316 bool found = false;
9317 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9319 /* If the names don't match up, or the user has given an argument
9320 to an attribute that doesn't accept one, or didn't give an argument
9321 to an attribute that expects one, fail to match. */
9322 if (strcmp (str_to_check, p_attr->name) != 0)
9323 continue;
9325 found = true;
9326 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9327 || p_attr->attr_type == aarch64_attr_enum;
9329 if (attr_need_arg_p ^ (arg != NULL))
9331 error ("target %s %qs does not accept an argument",
9332 pragma_or_attr, str_to_check);
9333 return false;
9336 /* If the name matches but the attribute does not allow "no-" versions
9337 then we can't match. */
9338 if (invert && !p_attr->allow_neg)
9340 error ("target %s %qs does not allow a negated form",
9341 pragma_or_attr, str_to_check);
9342 return false;
9345 switch (p_attr->attr_type)
9347 /* Has a custom handler registered.
9348 For example, cpu=, arch=, tune=. */
9349 case aarch64_attr_custom:
9350 gcc_assert (p_attr->handler);
9351 if (!p_attr->handler (arg, pragma_or_attr))
9352 return false;
9353 break;
9355 /* Either set or unset a boolean option. */
9356 case aarch64_attr_bool:
9358 struct cl_decoded_option decoded;
9360 generate_option (p_attr->opt_num, NULL, !invert,
9361 CL_TARGET, &decoded);
9362 aarch64_handle_option (&global_options, &global_options_set,
9363 &decoded, input_location);
9364 break;
9366 /* Set or unset a bit in the target_flags. aarch64_handle_option
9367 should know what mask to apply given the option number. */
9368 case aarch64_attr_mask:
9370 struct cl_decoded_option decoded;
9371 /* We only need to specify the option number.
9372 aarch64_handle_option will know which mask to apply. */
9373 decoded.opt_index = p_attr->opt_num;
9374 decoded.value = !invert;
9375 aarch64_handle_option (&global_options, &global_options_set,
9376 &decoded, input_location);
9377 break;
9379 /* Use the option setting machinery to set an option to an enum. */
9380 case aarch64_attr_enum:
9382 gcc_assert (arg);
9383 bool valid;
9384 int value;
9385 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9386 &value, CL_TARGET);
9387 if (valid)
9389 set_option (&global_options, NULL, p_attr->opt_num, value,
9390 NULL, DK_UNSPECIFIED, input_location,
9391 global_dc);
9393 else
9395 error ("target %s %s=%s is not valid",
9396 pragma_or_attr, str_to_check, arg);
9398 break;
9400 default:
9401 gcc_unreachable ();
9405 /* If we reached here we either have found an attribute and validated
9406 it or didn't match any. If we matched an attribute but its arguments
9407 were malformed we will have returned false already. */
9408 return found;
9411 /* Count how many times the character C appears in
9412 NULL-terminated string STR. */
9414 static unsigned int
9415 num_occurences_in_str (char c, char *str)
9417 unsigned int res = 0;
9418 while (*str != '\0')
9420 if (*str == c)
9421 res++;
9423 str++;
9426 return res;
9429 /* Parse the tree in ARGS that contains the target attribute information
9430 and update the global target options space. PRAGMA_OR_ATTR is a string
9431 to be used in error messages, specifying whether this is processing
9432 a target attribute or a target pragma. */
9434 bool
9435 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9437 if (TREE_CODE (args) == TREE_LIST)
9441 tree head = TREE_VALUE (args);
9442 if (head)
9444 if (!aarch64_process_target_attr (head, pragma_or_attr))
9445 return false;
9447 args = TREE_CHAIN (args);
9448 } while (args);
9450 return true;
9452 /* We expect to find a string to parse. */
9453 gcc_assert (TREE_CODE (args) == STRING_CST);
9455 size_t len = strlen (TREE_STRING_POINTER (args));
9456 char *str_to_check = (char *) alloca (len + 1);
9457 strcpy (str_to_check, TREE_STRING_POINTER (args));
9459 if (len == 0)
9461 error ("malformed target %s value", pragma_or_attr);
9462 return false;
9465 /* Used to catch empty spaces between commas, i.e.
9466 attribute ((target ("attr1,,attr2"))). */
9467 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9469 /* Handle multiple target attributes separated by ','. */
9470 char *token = strtok (str_to_check, ",");
9472 unsigned int num_attrs = 0;
9473 while (token)
9475 num_attrs++;
9476 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9478 error ("target %s %qs is invalid", pragma_or_attr, token);
9479 return false;
9482 token = strtok (NULL, ",");
9485 if (num_attrs != num_commas + 1)
9487 error ("malformed target %s list %qs",
9488 pragma_or_attr, TREE_STRING_POINTER (args));
9489 return false;
9492 return true;
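/* Worked example of the comma check above: for target ("attr1,,attr2")
   the string contains two commas but strtok returns only the two
   non-empty tokens, so num_attrs (2) != num_commas + 1 (3) and the list
   is reported as malformed instead of silently accepting the empty
   entry.  */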
9495 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9496 process attribute ((target ("..."))). */
9498 static bool
9499 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9501 struct cl_target_option cur_target;
9502 bool ret;
9503 tree old_optimize;
9504 tree new_target, new_optimize;
9505 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9507 /* If what we're processing is the current pragma string then the
9508 target option node is already stored in target_option_current_node
9509 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9510 having to re-parse the string. This is especially useful to keep
9511 arm_neon.h compile times down since that header contains a lot
9512 of intrinsics enclosed in pragmas. */
9513 if (!existing_target && args == current_target_pragma)
9515 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9516 return true;
9518 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9520 old_optimize = build_optimization_node (&global_options);
9521 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9523 /* If the function changed the optimization levels as well as setting
9524 target options, start with the optimizations specified. */
9525 if (func_optimize && func_optimize != old_optimize)
9526 cl_optimization_restore (&global_options,
9527 TREE_OPTIMIZATION (func_optimize));
9529 /* Save the current target options to restore at the end. */
9530 cl_target_option_save (&cur_target, &global_options);
9532 /* If fndecl already has some target attributes applied to it, unpack
9533 them so that we add this attribute on top of them, rather than
9534 overwriting them. */
9535 if (existing_target)
9537 struct cl_target_option *existing_options
9538 = TREE_TARGET_OPTION (existing_target);
9540 if (existing_options)
9541 cl_target_option_restore (&global_options, existing_options);
9543 else
9544 cl_target_option_restore (&global_options,
9545 TREE_TARGET_OPTION (target_option_current_node));
9548 ret = aarch64_process_target_attr (args, "attribute");
9550 /* Set up any additional state. */
9551 if (ret)
9553 aarch64_override_options_internal (&global_options);
9554 /* Initialize SIMD builtins if we haven't already.
9555 Set current_target_pragma to NULL for the duration so that
9556 the builtin initialization code doesn't try to tag the functions
9557 being built with the attributes specified by any current pragma, thus
9558 going into an infinite recursion. */
9559 if (TARGET_SIMD)
9561 tree saved_current_target_pragma = current_target_pragma;
9562 current_target_pragma = NULL;
9563 aarch64_init_simd_builtins ();
9564 current_target_pragma = saved_current_target_pragma;
9566 new_target = build_target_option_node (&global_options);
9568 else
9569 new_target = NULL;
9571 new_optimize = build_optimization_node (&global_options);
9573 if (fndecl && ret)
9575 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9577 if (old_optimize != new_optimize)
9578 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9581 cl_target_option_restore (&global_options, &cur_target);
9583 if (old_optimize != new_optimize)
9584 cl_optimization_restore (&global_options,
9585 TREE_OPTIMIZATION (old_optimize));
9586 return ret;
9589 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9590 tri-bool options (yes, no, don't care) and the default value is
9591 DEF, determine whether to reject inlining. */
9593 static bool
9594 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9595 int dont_care, int def)
9597 /* If the callee doesn't care, always allow inlining. */
9598 if (callee == dont_care)
9599 return true;
9601 /* If the caller doesn't care, always allow inlining. */
9602 if (caller == dont_care)
9603 return true;
9605 /* Otherwise, allow inlining if either the callee and caller values
9606 agree, or if the callee is using the default value. */
9607 return (callee == caller || callee == def);
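/* For illustration (the callers below pass 2 for DONT_CARE), taking
   DEF == 0: a callee that explicitly enabled the option (1) cannot be
   inlined into a caller that explicitly disabled it (0), since 1 != 0
   and 1 != DEF; a callee that never mentioned the option (2) can be
   inlined into any caller.  */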
9610 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9611 to inline CALLEE into CALLER based on target-specific info.
9612 Make sure that the caller and callee have compatible architectural
9613 features. Then go through the other possible target attributes
9614 and see if they can block inlining. Try not to reject always_inline
9615 callees unless they are incompatible architecturally. */
9617 static bool
9618 aarch64_can_inline_p (tree caller, tree callee)
9620 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9621 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9623 /* If callee has no option attributes, then it is ok to inline. */
9624 if (!callee_tree)
9625 return true;
9627 struct cl_target_option *caller_opts
9628 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9629 : target_option_default_node);
9631 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9634 /* Callee's ISA flags should be a subset of the caller's. */
9635 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9636 != callee_opts->x_aarch64_isa_flags)
9637 return false;
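/* For illustration of the subset check just above: a callee compiled for
   "+fp+simd" may be inlined into a caller compiled for "+fp+simd+crypto",
   since the callee's flags are a subset of the caller's, but not the
   other way round: the inlined body could then rely on crypto
   instructions that the caller's target does not guarantee.  */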
9639 /* Allow non-strict aligned functions inlining into strict
9640 aligned ones. */
9641 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9642 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9643 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9644 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9645 return false;
9647 bool always_inline = lookup_attribute ("always_inline",
9648 DECL_ATTRIBUTES (callee));
9650 /* If the architectural features match up and the callee is always_inline
9651 then the other attributes don't matter. */
9652 if (always_inline)
9653 return true;
9655 if (caller_opts->x_aarch64_cmodel_var
9656 != callee_opts->x_aarch64_cmodel_var)
9657 return false;
9659 if (caller_opts->x_aarch64_tls_dialect
9660 != callee_opts->x_aarch64_tls_dialect)
9661 return false;
9663 /* Honour explicit requests to workaround errata. */
9664 if (!aarch64_tribools_ok_for_inlining_p (
9665 caller_opts->x_aarch64_fix_a53_err835769,
9666 callee_opts->x_aarch64_fix_a53_err835769,
9667 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9668 return false;
9670 if (!aarch64_tribools_ok_for_inlining_p (
9671 caller_opts->x_aarch64_fix_a53_err843419,
9672 callee_opts->x_aarch64_fix_a53_err843419,
9673 2, TARGET_FIX_ERR_A53_843419))
9674 return false;
9676 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9677 caller and callee and they don't match up, reject inlining. */
9678 if (!aarch64_tribools_ok_for_inlining_p (
9679 caller_opts->x_flag_omit_leaf_frame_pointer,
9680 callee_opts->x_flag_omit_leaf_frame_pointer,
9681 2, 1))
9682 return false;
9684 /* If the callee has specific tuning overrides, respect them. */
9685 if (callee_opts->x_aarch64_override_tune_string != NULL
9686 && caller_opts->x_aarch64_override_tune_string == NULL)
9687 return false;
9689 /* If the user specified tuning override strings for the
9690 caller and callee and they don't match up, reject inlining.
9691 We just do a string compare here; we don't analyze the meaning
9692 of the string, as it would be too costly for little gain. */
9693 if (callee_opts->x_aarch64_override_tune_string
9694 && caller_opts->x_aarch64_override_tune_string
9695 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9696 caller_opts->x_aarch64_override_tune_string) != 0))
9697 return false;
9699 return true;
9702 /* Return true if SYMBOL_REF X binds locally. */
9704 static bool
9705 aarch64_symbol_binds_local_p (const_rtx x)
9707 return (SYMBOL_REF_DECL (x)
9708 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9709 : SYMBOL_REF_LOCAL_P (x));
9712 /* Return true if SYMBOL_REF X is thread local */
9713 static bool
9714 aarch64_tls_symbol_p (rtx x)
9716 if (! TARGET_HAVE_TLS)
9717 return false;
9719 if (GET_CODE (x) != SYMBOL_REF)
9720 return false;
9722 return SYMBOL_REF_TLS_MODEL (x) != 0;
9725 /* Classify a TLS symbol into one of the TLS kinds. */
9726 enum aarch64_symbol_type
9727 aarch64_classify_tls_symbol (rtx x)
9729 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9731 switch (tls_kind)
9733 case TLS_MODEL_GLOBAL_DYNAMIC:
9734 case TLS_MODEL_LOCAL_DYNAMIC:
9735 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9737 case TLS_MODEL_INITIAL_EXEC:
9738 switch (aarch64_cmodel)
9740 case AARCH64_CMODEL_TINY:
9741 case AARCH64_CMODEL_TINY_PIC:
9742 return SYMBOL_TINY_TLSIE;
9743 default:
9744 return SYMBOL_SMALL_TLSIE;
9747 case TLS_MODEL_LOCAL_EXEC:
9748 if (aarch64_tls_size == 12)
9749 return SYMBOL_TLSLE12;
9750 else if (aarch64_tls_size == 24)
9751 return SYMBOL_TLSLE24;
9752 else if (aarch64_tls_size == 32)
9753 return SYMBOL_TLSLE32;
9754 else if (aarch64_tls_size == 48)
9755 return SYMBOL_TLSLE48;
9756 else
9757 gcc_unreachable ();
9759 case TLS_MODEL_EMULATED:
9760 case TLS_MODEL_NONE:
9761 return SYMBOL_FORCE_TO_MEM;
9763 default:
9764 gcc_unreachable ();
9768 /* Return the method that should be used to access SYMBOL_REF or
9769 LABEL_REF X. */
9771 enum aarch64_symbol_type
9772 aarch64_classify_symbol (rtx x, rtx offset)
9774 if (GET_CODE (x) == LABEL_REF)
9776 switch (aarch64_cmodel)
9778 case AARCH64_CMODEL_LARGE:
9779 return SYMBOL_FORCE_TO_MEM;
9781 case AARCH64_CMODEL_TINY_PIC:
9782 case AARCH64_CMODEL_TINY:
9783 return SYMBOL_TINY_ABSOLUTE;
9785 case AARCH64_CMODEL_SMALL_SPIC:
9786 case AARCH64_CMODEL_SMALL_PIC:
9787 case AARCH64_CMODEL_SMALL:
9788 return SYMBOL_SMALL_ABSOLUTE;
9790 default:
9791 gcc_unreachable ();
9795 if (GET_CODE (x) == SYMBOL_REF)
9797 if (aarch64_tls_symbol_p (x))
9798 return aarch64_classify_tls_symbol (x);
9800 switch (aarch64_cmodel)
9802 case AARCH64_CMODEL_TINY:
9803 /* When we retrieve symbol + offset address, we have to make sure
9804 the offset does not cause overflow of the final address. But
9805 we have no way of knowing the address of symbol at compile time
9806 so we can't accurately say if the distance between the PC and
9807 symbol + offset is outside the addressable range of +/-1M in the
9808 TINY code model. So we rely on images not being greater than
9809 1M, cap the offset at 1M, and anything beyond that will have to
9810 be loaded using an alternative mechanism. Furthermore, if the
9811 symbol is a weak reference to something that isn't known to
9812 resolve to a symbol in this module, then force to memory. */
9813 if ((SYMBOL_REF_WEAK (x)
9814 && !aarch64_symbol_binds_local_p (x))
9815 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9816 return SYMBOL_FORCE_TO_MEM;
9817 return SYMBOL_TINY_ABSOLUTE;
9819 case AARCH64_CMODEL_SMALL:
9820 /* Same reasoning as the tiny code model, but the offset cap here is
9821 4G. */
9822 if ((SYMBOL_REF_WEAK (x)
9823 && !aarch64_symbol_binds_local_p (x))
9824 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9825 HOST_WIDE_INT_C (4294967264)))
9826 return SYMBOL_FORCE_TO_MEM;
9827 return SYMBOL_SMALL_ABSOLUTE;
9829 case AARCH64_CMODEL_TINY_PIC:
9830 if (!aarch64_symbol_binds_local_p (x))
9831 return SYMBOL_TINY_GOT;
9832 return SYMBOL_TINY_ABSOLUTE;
9834 case AARCH64_CMODEL_SMALL_SPIC:
9835 case AARCH64_CMODEL_SMALL_PIC:
9836 if (!aarch64_symbol_binds_local_p (x))
9837 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9838 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9839 return SYMBOL_SMALL_ABSOLUTE;
9841 case AARCH64_CMODEL_LARGE:
9842 /* This is alright even in PIC code as the constant
9843 pool reference is always PC relative and within
9844 the same translation unit. */
9845 if (CONSTANT_POOL_ADDRESS_P (x))
9846 return SYMBOL_SMALL_ABSOLUTE;
9847 else
9848 return SYMBOL_FORCE_TO_MEM;
9850 default:
9851 gcc_unreachable ();
9855 /* By default push everything into the constant pool. */
9856 return SYMBOL_FORCE_TO_MEM;
9859 bool
9860 aarch64_constant_address_p (rtx x)
9862 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9865 bool
9866 aarch64_legitimate_pic_operand_p (rtx x)
9868 if (GET_CODE (x) == SYMBOL_REF
9869 || (GET_CODE (x) == CONST
9870 && GET_CODE (XEXP (x, 0)) == PLUS
9871 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9872 return false;
9874 return true;
9877 /* Return true if X holds either a quarter-precision or
9878 floating-point +0.0 constant. */
9879 static bool
9880 aarch64_valid_floating_const (machine_mode mode, rtx x)
9882 if (!CONST_DOUBLE_P (x))
9883 return false;
9885 if (aarch64_float_const_zero_rtx_p (x))
9886 return true;
9888 /* We only handle moving 0.0 to a TFmode register. */
9889 if (!(mode == SFmode || mode == DFmode))
9890 return false;
9892 return aarch64_float_const_representable_p (x);
9895 static bool
9896 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9898 /* Do not allow vector struct mode constants. We could support
9899 0 and -1 easily, but they need support in aarch64-simd.md. */
9900 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9901 return false;
9903 /* This could probably go away because
9904 we now decompose CONST_INTs according to expand_mov_immediate. */
9905 if ((GET_CODE (x) == CONST_VECTOR
9906 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9907 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9908 return !targetm.cannot_force_const_mem (mode, x);
9910 if (GET_CODE (x) == HIGH
9911 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9912 return true;
9914 return aarch64_constant_address_p (x);
9918 aarch64_load_tp (rtx target)
9920 if (!target
9921 || GET_MODE (target) != Pmode
9922 || !register_operand (target, Pmode))
9923 target = gen_reg_rtx (Pmode);
9925 /* Can return in any reg. */
9926 emit_insn (gen_aarch64_load_tp_hard (target));
9927 return target;
9930 /* On AAPCS systems, this is the "struct __va_list". */
9931 static GTY(()) tree va_list_type;
9933 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9934 Return the type to use as __builtin_va_list.
9936 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9938 struct __va_list
9940 void *__stack;
9941 void *__gr_top;
9942 void *__vr_top;
9943 int __gr_offs;
9944 int __vr_offs;
9945 }; */
9947 static tree
9948 aarch64_build_builtin_va_list (void)
9950 tree va_list_name;
9951 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9953 /* Create the type. */
9954 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9955 /* Give it the required name. */
9956 va_list_name = build_decl (BUILTINS_LOCATION,
9957 TYPE_DECL,
9958 get_identifier ("__va_list"),
9959 va_list_type);
9960 DECL_ARTIFICIAL (va_list_name) = 1;
9961 TYPE_NAME (va_list_type) = va_list_name;
9962 TYPE_STUB_DECL (va_list_type) = va_list_name;
9964 /* Create the fields. */
9965 f_stack = build_decl (BUILTINS_LOCATION,
9966 FIELD_DECL, get_identifier ("__stack"),
9967 ptr_type_node);
9968 f_grtop = build_decl (BUILTINS_LOCATION,
9969 FIELD_DECL, get_identifier ("__gr_top"),
9970 ptr_type_node);
9971 f_vrtop = build_decl (BUILTINS_LOCATION,
9972 FIELD_DECL, get_identifier ("__vr_top"),
9973 ptr_type_node);
9974 f_groff = build_decl (BUILTINS_LOCATION,
9975 FIELD_DECL, get_identifier ("__gr_offs"),
9976 integer_type_node);
9977 f_vroff = build_decl (BUILTINS_LOCATION,
9978 FIELD_DECL, get_identifier ("__vr_offs"),
9979 integer_type_node);
9981 /* Tell tree-stdarg pass about our internal offset fields.
9982 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9983 purposes, to identify whether the code is updating the va_list internal
9984 offset fields in an irregular way. */
9985 va_list_gpr_counter_field = f_groff;
9986 va_list_fpr_counter_field = f_vroff;
9988 DECL_ARTIFICIAL (f_stack) = 1;
9989 DECL_ARTIFICIAL (f_grtop) = 1;
9990 DECL_ARTIFICIAL (f_vrtop) = 1;
9991 DECL_ARTIFICIAL (f_groff) = 1;
9992 DECL_ARTIFICIAL (f_vroff) = 1;
9994 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9995 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9996 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9997 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9998 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10000 TYPE_FIELDS (va_list_type) = f_stack;
10001 DECL_CHAIN (f_stack) = f_grtop;
10002 DECL_CHAIN (f_grtop) = f_vrtop;
10003 DECL_CHAIN (f_vrtop) = f_groff;
10004 DECL_CHAIN (f_groff) = f_vroff;
10006 /* Compute its layout. */
10007 layout_type (va_list_type);
10009 return va_list_type;
10012 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10013 static void
10014 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10016 const CUMULATIVE_ARGS *cum;
10017 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10018 tree stack, grtop, vrtop, groff, vroff;
10019 tree t;
10020 int gr_save_area_size = cfun->va_list_gpr_size;
10021 int vr_save_area_size = cfun->va_list_fpr_size;
10022 int vr_offset;
10024 cum = &crtl->args.info;
10025 if (cfun->va_list_gpr_size)
10026 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10027 cfun->va_list_gpr_size);
10028 if (cfun->va_list_fpr_size)
10029 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10030 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10032 if (!TARGET_FLOAT)
10034 gcc_assert (cum->aapcs_nvrn == 0);
10035 vr_save_area_size = 0;
10038 f_stack = TYPE_FIELDS (va_list_type_node);
10039 f_grtop = DECL_CHAIN (f_stack);
10040 f_vrtop = DECL_CHAIN (f_grtop);
10041 f_groff = DECL_CHAIN (f_vrtop);
10042 f_vroff = DECL_CHAIN (f_groff);
10044 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10045 NULL_TREE);
10046 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10047 NULL_TREE);
10048 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10049 NULL_TREE);
10050 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10051 NULL_TREE);
10052 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10053 NULL_TREE);
10055 /* Emit code to initialize STACK, which points to the next varargs stack
10056 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10057 by named arguments. STACK is 8-byte aligned. */
10058 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10059 if (cum->aapcs_stack_size > 0)
10060 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10061 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10062 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10064 /* Emit code to initialize GRTOP, the top of the GR save area.
10065 virtual_incoming_args_rtx should have been 16 byte aligned. */
10066 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10067 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10068 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10070 /* Emit code to initialize VRTOP, the top of the VR save area.
10071 This address is gr_save_area_bytes below GRTOP, rounded
10072 down to the next 16-byte boundary. */
10073 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10074 vr_offset = ROUND_UP (gr_save_area_size,
10075 STACK_BOUNDARY / BITS_PER_UNIT);
10077 if (vr_offset)
10078 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10079 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10080 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10082 /* Emit code to initialize GROFF, the offset from GRTOP of the
10083 next GPR argument. */
10084 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10085 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10086 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10088 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10089 of the next VR argument. */
10090 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10091 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10092 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
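/* Worked example (an illustrative sketch, not generated code): for

       void f (int a, double b, ...);

   the named arguments consume one core register (w0) and one FP/SIMD
   register (d0), so aapcs_ncrn == 1 and aapcs_nvrn == 1.  Assuming the
   whole va_list is needed, the code above effectively records

       gr_save_area_size = (8 - 1) * 8  = 56
       vr_save_area_size = (8 - 1) * 16 = 112

       ap.__stack   = <incoming arg pointer>   (no named stack arguments)
       ap.__gr_top  = <incoming arg pointer>
       ap.__vr_top  = ap.__gr_top - 64         (56 rounded up to 16 bytes)
       ap.__gr_offs = -56
       ap.__vr_offs = -112  */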
10095 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10097 static tree
10098 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10099 gimple_seq *post_p ATTRIBUTE_UNUSED)
10101 tree addr;
10102 bool indirect_p;
10103 bool is_ha; /* is HFA or HVA. */
10104 bool dw_align; /* double-word align. */
10105 machine_mode ag_mode = VOIDmode;
10106 int nregs;
10107 machine_mode mode;
10109 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10110 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10111 HOST_WIDE_INT size, rsize, adjust, align;
10112 tree t, u, cond1, cond2;
10114 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10115 if (indirect_p)
10116 type = build_pointer_type (type);
10118 mode = TYPE_MODE (type);
10120 f_stack = TYPE_FIELDS (va_list_type_node);
10121 f_grtop = DECL_CHAIN (f_stack);
10122 f_vrtop = DECL_CHAIN (f_grtop);
10123 f_groff = DECL_CHAIN (f_vrtop);
10124 f_vroff = DECL_CHAIN (f_groff);
10126 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10127 f_stack, NULL_TREE);
10128 size = int_size_in_bytes (type);
10129 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10131 dw_align = false;
10132 adjust = 0;
10133 if (aarch64_vfp_is_call_or_return_candidate (mode,
10134 type,
10135 &ag_mode,
10136 &nregs,
10137 &is_ha))
10139 /* TYPE passed in fp/simd registers. */
10140 if (!TARGET_FLOAT)
10141 aarch64_err_no_fpadvsimd (mode, "varargs");
10143 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10144 unshare_expr (valist), f_vrtop, NULL_TREE);
10145 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10146 unshare_expr (valist), f_vroff, NULL_TREE);
10148 rsize = nregs * UNITS_PER_VREG;
10150 if (is_ha)
10152 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10153 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10155 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10156 && size < UNITS_PER_VREG)
10158 adjust = UNITS_PER_VREG - size;
10161 else
10163 /* TYPE passed in general registers. */
10164 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10165 unshare_expr (valist), f_grtop, NULL_TREE);
10166 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10167 unshare_expr (valist), f_groff, NULL_TREE);
10168 rsize = ROUND_UP (size, UNITS_PER_WORD);
10169 nregs = rsize / UNITS_PER_WORD;
10171 if (align > 8)
10172 dw_align = true;
10174 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10175 && size < UNITS_PER_WORD)
10177 adjust = UNITS_PER_WORD - size;
10181 /* Get a local temporary for the field value. */
10182 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10184 /* Emit code to branch if off >= 0. */
10185 t = build2 (GE_EXPR, boolean_type_node, off,
10186 build_int_cst (TREE_TYPE (off), 0));
10187 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10189 if (dw_align)
10191 /* Emit: offs = (offs + 15) & -16. */
10192 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10193 build_int_cst (TREE_TYPE (off), 15));
10194 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10195 build_int_cst (TREE_TYPE (off), -16));
10196 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10198 else
10199 roundup = NULL;
10201 /* Update ap.__[g|v]r_offs */
10202 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10203 build_int_cst (TREE_TYPE (off), rsize));
10204 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10206 /* String up. */
10207 if (roundup)
10208 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10210 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10211 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10212 build_int_cst (TREE_TYPE (f_off), 0));
10213 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10215 /* String up: make sure the assignment happens before the use. */
10216 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10217 COND_EXPR_ELSE (cond1) = t;
10219 /* Prepare the trees handling the argument that is passed on the stack;
10220 the top level node will store in ON_STACK. */
10221 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10222 if (align > 8)
10224 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10225 t = fold_convert (intDI_type_node, arg);
10226 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10227 build_int_cst (TREE_TYPE (t), 15));
10228 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10229 build_int_cst (TREE_TYPE (t), -16));
10230 t = fold_convert (TREE_TYPE (arg), t);
10231 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10233 else
10234 roundup = NULL;
10235 /* Advance ap.__stack */
10236 t = fold_convert (intDI_type_node, arg);
10237 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10238 build_int_cst (TREE_TYPE (t), size + 7));
10239 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10240 build_int_cst (TREE_TYPE (t), -8));
10241 t = fold_convert (TREE_TYPE (arg), t);
10242 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10243 /* String up roundup and advance. */
10244 if (roundup)
10245 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10246 /* String up with arg */
10247 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10248 /* Big-endianness related address adjustment. */
10249 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10250 && size < UNITS_PER_WORD)
10252 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10253 size_int (UNITS_PER_WORD - size));
10254 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10257 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10258 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10260 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10261 t = off;
10262 if (adjust)
10263 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10264 build_int_cst (TREE_TYPE (off), adjust));
10266 t = fold_convert (sizetype, t);
10267 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10269 if (is_ha)
10271 /* type ha; // treat as "struct {ftype field[n];}"
10272 ... [computing offs]
10273 for (i = 0; i <nregs; ++i, offs += 16)
10274 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10275 return ha; */
10276 int i;
10277 tree tmp_ha, field_t, field_ptr_t;
10279 /* Declare a local variable. */
10280 tmp_ha = create_tmp_var_raw (type, "ha");
10281 gimple_add_tmp_var (tmp_ha);
10283 /* Establish the base type. */
10284 switch (ag_mode)
10286 case SFmode:
10287 field_t = float_type_node;
10288 field_ptr_t = float_ptr_type_node;
10289 break;
10290 case DFmode:
10291 field_t = double_type_node;
10292 field_ptr_t = double_ptr_type_node;
10293 break;
10294 case TFmode:
10295 field_t = long_double_type_node;
10296 field_ptr_t = long_double_ptr_type_node;
10297 break;
10298 case HFmode:
10299 field_t = aarch64_fp16_type_node;
10300 field_ptr_t = aarch64_fp16_ptr_type_node;
10301 break;
10302 case V2SImode:
10303 case V4SImode:
10305 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10306 field_t = build_vector_type_for_mode (innertype, ag_mode);
10307 field_ptr_t = build_pointer_type (field_t);
10309 break;
10310 default:
10311 gcc_assert (0);
10314 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
10315 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10316 addr = t;
10317 t = fold_convert (field_ptr_t, addr);
10318 t = build2 (MODIFY_EXPR, field_t,
10319 build1 (INDIRECT_REF, field_t, tmp_ha),
10320 build1 (INDIRECT_REF, field_t, t));
10322 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10323 for (i = 1; i < nregs; ++i)
10325 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10326 u = fold_convert (field_ptr_t, addr);
10327 u = build2 (MODIFY_EXPR, field_t,
10328 build2 (MEM_REF, field_t, tmp_ha,
10329 build_int_cst (field_ptr_t,
10330 (i *
10331 int_size_in_bytes (field_t)))),
10332 build1 (INDIRECT_REF, field_t, u));
10333 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10336 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10337 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10340 COND_EXPR_ELSE (cond2) = t;
10341 addr = fold_convert (build_pointer_type (type), cond1);
10342 addr = build_va_arg_indirect_ref (addr);
10344 if (indirect_p)
10345 addr = build_va_arg_indirect_ref (addr);
10347 return addr;
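/* Illustrative sketch of the tree built above for an argument passed in
   general registers, ignoring the alignment round-up and big-endian
   adjustments (C-like pseudocode, not literal GIMPLE):

       off = ap.__gr_offs;
       if (off >= 0)
         goto on_stack;
       ap.__gr_offs = off + rsize;
       if (ap.__gr_offs > 0)
         goto on_stack;
       addr = ap.__gr_top + off;
       goto done;
     on_stack:
       addr = ap.__stack;
       ap.__stack = (void *) (((uintptr_t) addr + size + 7) & -8);
     done:
       result = *(type *) addr;

   with one further indirection when the argument is passed by reference.
   The FP/SIMD path is identical but uses __vr_top/__vr_offs and copies
   homogeneous aggregates field by field, as in the is_ha block above.  */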
10350 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10352 static void
10353 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10354 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10355 int no_rtl)
10357 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10358 CUMULATIVE_ARGS local_cum;
10359 int gr_saved = cfun->va_list_gpr_size;
10360 int vr_saved = cfun->va_list_fpr_size;
10362 /* The caller has advanced CUM up to, but not beyond, the last named
10363 argument. Advance a local copy of CUM past the last "real" named
10364 argument, to find out how many registers are left over. */
10365 local_cum = *cum;
10366 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
10368 /* Find out how many registers we need to save.
10369 Honor the tree-stdarg analysis results. */
10370 if (cfun->va_list_gpr_size)
10371 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10372 cfun->va_list_gpr_size / UNITS_PER_WORD);
10373 if (cfun->va_list_fpr_size)
10374 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10375 cfun->va_list_fpr_size / UNITS_PER_VREG);
10377 if (!TARGET_FLOAT)
10379 gcc_assert (local_cum.aapcs_nvrn == 0);
10380 vr_saved = 0;
10383 if (!no_rtl)
10385 if (gr_saved > 0)
10387 rtx ptr, mem;
10389 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10390 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10391 - gr_saved * UNITS_PER_WORD);
10392 mem = gen_frame_mem (BLKmode, ptr);
10393 set_mem_alias_set (mem, get_varargs_alias_set ());
10395 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10396 mem, gr_saved);
10398 if (vr_saved > 0)
10400 /* We can't use move_block_from_reg, because it will use
10401 the wrong mode, storing D regs only. */
10402 machine_mode mode = TImode;
10403 int off, i, vr_start;
10405 /* Set OFF to the offset from virtual_incoming_args_rtx of
10406 the first vector register. The VR save area lies below
10407 the GR one, and is aligned to 16 bytes. */
10408 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10409 STACK_BOUNDARY / BITS_PER_UNIT);
10410 off -= vr_saved * UNITS_PER_VREG;
10412 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10413 for (i = 0; i < vr_saved; ++i)
10415 rtx ptr, mem;
10417 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10418 mem = gen_frame_mem (mode, ptr);
10419 set_mem_alias_set (mem, get_varargs_alias_set ());
10420 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10421 off += UNITS_PER_VREG;
10426 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10427 any complication of having crtl->args.pretend_args_size changed. */
10428 cfun->machine->frame.saved_varargs_size
10429 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10430 STACK_BOUNDARY / BITS_PER_UNIT)
10431 + vr_saved * UNITS_PER_VREG);
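/* For a function such as f (int, double, ...) (illustrative), gr_saved == 7
   and vr_saved == 7, so the saved varargs area computed above occupies
   ROUND_UP (7 * 8, 16) + 7 * 16 == 64 + 112 == 176 bytes below the incoming
   argument pointer.  */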
10434 static void
10435 aarch64_conditional_register_usage (void)
10437 int i;
10438 if (!TARGET_FLOAT)
10440 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10442 fixed_regs[i] = 1;
10443 call_used_regs[i] = 1;
10448 /* Walk down the type tree of TYPE counting consecutive base elements.
10449 If *MODEP is VOIDmode, then set it to the first valid floating point
10450 type. If a non-floating point type is found, or if a floating point
10451 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10452 otherwise return the count in the sub-tree. */
10453 static int
10454 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10456 machine_mode mode;
10457 HOST_WIDE_INT size;
10459 switch (TREE_CODE (type))
10461 case REAL_TYPE:
10462 mode = TYPE_MODE (type);
10463 if (mode != DFmode && mode != SFmode
10464 && mode != TFmode && mode != HFmode)
10465 return -1;
10467 if (*modep == VOIDmode)
10468 *modep = mode;
10470 if (*modep == mode)
10471 return 1;
10473 break;
10475 case COMPLEX_TYPE:
10476 mode = TYPE_MODE (TREE_TYPE (type));
10477 if (mode != DFmode && mode != SFmode
10478 && mode != TFmode && mode != HFmode)
10479 return -1;
10481 if (*modep == VOIDmode)
10482 *modep = mode;
10484 if (*modep == mode)
10485 return 2;
10487 break;
10489 case VECTOR_TYPE:
10490 /* Use V2SImode and V4SImode as representatives of all 64-bit
10491 and 128-bit vector types. */
10492 size = int_size_in_bytes (type);
10493 switch (size)
10495 case 8:
10496 mode = V2SImode;
10497 break;
10498 case 16:
10499 mode = V4SImode;
10500 break;
10501 default:
10502 return -1;
10505 if (*modep == VOIDmode)
10506 *modep = mode;
10508 /* Vector modes are considered to be opaque: two vectors are
10509 equivalent for the purposes of being homogeneous aggregates
10510 if they are the same size. */
10511 if (*modep == mode)
10512 return 1;
10514 break;
10516 case ARRAY_TYPE:
10518 int count;
10519 tree index = TYPE_DOMAIN (type);
10521 /* Can't handle incomplete types nor sizes that are not
10522 fixed. */
10523 if (!COMPLETE_TYPE_P (type)
10524 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10525 return -1;
10527 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10528 if (count == -1
10529 || !index
10530 || !TYPE_MAX_VALUE (index)
10531 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10532 || !TYPE_MIN_VALUE (index)
10533 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10534 || count < 0)
10535 return -1;
10537 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10538 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10540 /* There must be no padding. */
10541 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10542 return -1;
10544 return count;
10547 case RECORD_TYPE:
10549 int count = 0;
10550 int sub_count;
10551 tree field;
10553 /* Can't handle incomplete types nor sizes that are not
10554 fixed. */
10555 if (!COMPLETE_TYPE_P (type)
10556 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10557 return -1;
10559 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10561 if (TREE_CODE (field) != FIELD_DECL)
10562 continue;
10564 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10565 if (sub_count < 0)
10566 return -1;
10567 count += sub_count;
10570 /* There must be no padding. */
10571 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10572 return -1;
10574 return count;
10577 case UNION_TYPE:
10578 case QUAL_UNION_TYPE:
10580 /* These aren't very interesting except in a degenerate case. */
10581 int count = 0;
10582 int sub_count;
10583 tree field;
10585 /* Can't handle incomplete types nor sizes that are not
10586 fixed. */
10587 if (!COMPLETE_TYPE_P (type)
10588 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10589 return -1;
10591 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10593 if (TREE_CODE (field) != FIELD_DECL)
10594 continue;
10596 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10597 if (sub_count < 0)
10598 return -1;
10599 count = count > sub_count ? count : sub_count;
10602 /* There must be no padding. */
10603 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10604 return -1;
10606 return count;
10609 default:
10610 break;
10613 return -1;
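/* Examples of the classification above (illustrative only; int32x4_t is
   the arm_neon.h vector type):

       struct { float x, y, z; }       -> count 3, *modep == SFmode
       struct { double re, im; }       -> count 2, *modep == DFmode
       struct { int32x4_t a, b; }      -> count 2, *modep == V4SImode
       struct { float f; double d; }   -> -1 (mixed base types)
       _Complex double                 -> count 2, *modep == DFmode  */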
10616 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10617 type as described in AAPCS64 \S 4.1.2.
10619 See the comment above aarch64_composite_type_p for the notes on MODE. */
10621 static bool
10622 aarch64_short_vector_p (const_tree type,
10623 machine_mode mode)
10625 HOST_WIDE_INT size = -1;
10627 if (type && TREE_CODE (type) == VECTOR_TYPE)
10628 size = int_size_in_bytes (type);
10629 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10630 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10631 size = GET_MODE_SIZE (mode);
10633 return (size == 8 || size == 16);
10636 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10637 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10638 array types. The C99 floating-point complex types are also considered
10639 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10640 types, which are GCC extensions and out of the scope of AAPCS64, are
10641 treated as composite types here as well.
10643 Note that MODE itself is not sufficient in determining whether a type
10644 is such a composite type or not. This is because
10645 stor-layout.c:compute_record_mode may have already changed the MODE
10646 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10647 structure with only one field may have its MODE set to the mode of the
10648 field. Also an integer mode whose size matches the size of the
10649 RECORD_TYPE type may be used to substitute the original mode
10650 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10651 solely relied on. */
10653 static bool
10654 aarch64_composite_type_p (const_tree type,
10655 machine_mode mode)
10657 if (aarch64_short_vector_p (type, mode))
10658 return false;
10660 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10661 return true;
10663 if (mode == BLKmode
10664 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10665 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10666 return true;
10668 return false;
10671 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10672 shall be passed or returned in simd/fp register(s) (providing these
10673 parameter passing registers are available).
10675 Upon successful return, *COUNT returns the number of needed registers,
10676 *BASE_MODE returns the mode of the individual register and when IS_HA
10677 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10678 floating-point aggregate or a homogeneous short-vector aggregate. */
10680 static bool
10681 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10682 const_tree type,
10683 machine_mode *base_mode,
10684 int *count,
10685 bool *is_ha)
10687 machine_mode new_mode = VOIDmode;
10688 bool composite_p = aarch64_composite_type_p (type, mode);
10690 if (is_ha != NULL) *is_ha = false;
10692 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10693 || aarch64_short_vector_p (type, mode))
10695 *count = 1;
10696 new_mode = mode;
10698 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10700 if (is_ha != NULL) *is_ha = true;
10701 *count = 2;
10702 new_mode = GET_MODE_INNER (mode);
10704 else if (type && composite_p)
10706 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10708 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10710 if (is_ha != NULL) *is_ha = true;
10711 *count = ag_count;
10713 else
10714 return false;
10716 else
10717 return false;
10719 *base_mode = new_mode;
10720 return true;
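/* For example (illustrative): a plain 'double' argument gives *count == 1
   and *base_mode == DFmode; '_Complex float' gives *is_ha == true,
   *count == 2 and *base_mode == SFmode; aggregates are delegated to
   aapcs_vfp_sub_candidate above and accepted only when they form an HFA
   or HVA of at most HA_MAX_NUM_FLDS elements.  */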
10723 /* Implement TARGET_STRUCT_VALUE_RTX. */
10725 static rtx
10726 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10727 int incoming ATTRIBUTE_UNUSED)
10729 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10732 /* Implements target hook vector_mode_supported_p. */
10733 static bool
10734 aarch64_vector_mode_supported_p (machine_mode mode)
10736 if (TARGET_SIMD
10737 && (mode == V4SImode || mode == V8HImode
10738 || mode == V16QImode || mode == V2DImode
10739 || mode == V2SImode || mode == V4HImode
10740 || mode == V8QImode || mode == V2SFmode
10741 || mode == V4SFmode || mode == V2DFmode
10742 || mode == V4HFmode || mode == V8HFmode
10743 || mode == V1DFmode))
10744 return true;
10746 return false;
10749 /* Return appropriate SIMD container
10750 for MODE within a vector of WIDTH bits. */
10751 static machine_mode
10752 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10754 gcc_assert (width == 64 || width == 128);
10755 if (TARGET_SIMD)
10757 if (width == 128)
10758 switch (mode)
10760 case DFmode:
10761 return V2DFmode;
10762 case SFmode:
10763 return V4SFmode;
10764 case SImode:
10765 return V4SImode;
10766 case HImode:
10767 return V8HImode;
10768 case QImode:
10769 return V16QImode;
10770 case DImode:
10771 return V2DImode;
10772 default:
10773 break;
10775 else
10776 switch (mode)
10778 case SFmode:
10779 return V2SFmode;
10780 case SImode:
10781 return V2SImode;
10782 case HImode:
10783 return V4HImode;
10784 case QImode:
10785 return V8QImode;
10786 default:
10787 break;
10790 return word_mode;
10793 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10794 static machine_mode
10795 aarch64_preferred_simd_mode (machine_mode mode)
10797 return aarch64_simd_container_mode (mode, 128);
10800 /* Return the bitmask of possible vector sizes for the vectorizer
10801 to iterate over. */
10802 static unsigned int
10803 aarch64_autovectorize_vector_sizes (void)
10805 return (16 | 8);
10808 /* Implement TARGET_MANGLE_TYPE. */
10810 static const char *
10811 aarch64_mangle_type (const_tree type)
10813 /* The AArch64 ABI documents say that "__va_list" has to be
10814 mangled as if it is in the "std" namespace. */
10815 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10816 return "St9__va_list";
10818 /* Half-precision float. */
10819 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10820 return "Dh";
10822 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10823 builtin types. */
10824 if (TYPE_NAME (type) != NULL)
10825 return aarch64_mangle_builtin_type (type);
10827 /* Use the default mangling. */
10828 return NULL;
10832 /* Return true if the rtx_insn contains a MEM RTX somewhere
10833 in it. */
10835 static bool
10836 has_memory_op (rtx_insn *mem_insn)
10838 subrtx_iterator::array_type array;
10839 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10840 if (MEM_P (*iter))
10841 return true;
10843 return false;
10846 /* Find the first rtx_insn before insn that will generate an assembly
10847 instruction. */
10849 static rtx_insn *
10850 aarch64_prev_real_insn (rtx_insn *insn)
10852 if (!insn)
10853 return NULL;
10857 insn = prev_real_insn (insn);
10859 while (insn && recog_memoized (insn) < 0);
10861 return insn;
10864 static bool
10865 is_madd_op (enum attr_type t1)
10867 unsigned int i;
10868 /* A number of these may be AArch32 only. */
10869 enum attr_type mlatypes[] = {
10870 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10871 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10872 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10875 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10877 if (t1 == mlatypes[i])
10878 return true;
10881 return false;
10884 /* Check if there is a register dependency between a load and the insn
10885 for which we hold recog_data. */
10887 static bool
10888 dep_between_memop_and_curr (rtx memop)
10890 rtx load_reg;
10891 int opno;
10893 gcc_assert (GET_CODE (memop) == SET);
10895 if (!REG_P (SET_DEST (memop)))
10896 return false;
10898 load_reg = SET_DEST (memop);
10899 for (opno = 1; opno < recog_data.n_operands; opno++)
10901 rtx operand = recog_data.operand[opno];
10902 if (REG_P (operand)
10903 && reg_overlap_mentioned_p (load_reg, operand))
10904 return true;
10907 return false;
10911 /* When working around the Cortex-A53 erratum 835769,
10912 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10913 instruction and has a preceding memory instruction such that a NOP
10914 should be inserted between them. */
10916 bool
10917 aarch64_madd_needs_nop (rtx_insn* insn)
10919 enum attr_type attr_type;
10920 rtx_insn *prev;
10921 rtx body;
10923 if (!TARGET_FIX_ERR_A53_835769)
10924 return false;
10926 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10927 return false;
10929 attr_type = get_attr_type (insn);
10930 if (!is_madd_op (attr_type))
10931 return false;
10933 prev = aarch64_prev_real_insn (insn);
10934 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10935 Restore recog state to INSN to avoid state corruption. */
10936 extract_constrain_insn_cached (insn);
10938 if (!prev || !has_memory_op (prev))
10939 return false;
10941 body = single_set (prev);
10943 /* If the previous insn is a memory op and there is no dependency between
10944 it and the DImode madd, emit a NOP between them. If body is NULL then we
10945 have a complex memory operation, probably a load/store pair.
10946 Be conservative for now and emit a NOP. */
10947 if (GET_MODE (recog_data.operand[0]) == DImode
10948 && (!body || !dep_between_memop_and_curr (body)))
10949 return true;
10951 return false;
10956 /* Implement FINAL_PRESCAN_INSN. */
10958 void
10959 aarch64_final_prescan_insn (rtx_insn *insn)
10961 if (aarch64_madd_needs_nop (insn))
10962 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10966 /* Return the equivalent letter for size. */
10967 static char
10968 sizetochar (int size)
10970 switch (size)
10972 case 64: return 'd';
10973 case 32: return 's';
10974 case 16: return 'h';
10975 case 8 : return 'b';
10976 default: gcc_unreachable ();
10980 /* Return true iff X is a uniform vector of floating-point constants
10981 that can be represented in quarter-precision form. Note that, since
10982 aarch64_float_const_representable_p rejects both +0.0 and -0.0,
10983 those values are rejected here as well. */
10984 static bool
10985 aarch64_vect_float_const_representable_p (rtx x)
10987 rtx elt;
10988 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10989 && const_vec_duplicate_p (x, &elt)
10990 && aarch64_float_const_representable_p (elt));
10993 /* Return true for valid and false for invalid. */
10994 bool
10995 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10996 struct simd_immediate_info *info)
10998 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10999 matches = 1; \
11000 for (i = 0; i < idx; i += (STRIDE)) \
11001 if (!(TEST)) \
11002 matches = 0; \
11003 if (matches) \
11005 immtype = (CLASS); \
11006 elsize = (ELSIZE); \
11007 eshift = (SHIFT); \
11008 emvn = (NEG); \
11009 break; \
11012 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11013 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11014 unsigned char bytes[16];
11015 int immtype = -1, matches;
11016 unsigned int invmask = inverse ? 0xff : 0;
11017 int eshift, emvn;
11019 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11021 if (! (aarch64_simd_imm_zero_p (op, mode)
11022 || aarch64_vect_float_const_representable_p (op)))
11023 return false;
11025 if (info)
11027 info->value = CONST_VECTOR_ELT (op, 0);
11028 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11029 info->mvn = false;
11030 info->shift = 0;
11033 return true;
11036 /* Splat vector constant out into a byte vector. */
11037 for (i = 0; i < n_elts; i++)
11039 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11040 it must be laid out in the vector register in reverse order. */
11041 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11042 unsigned HOST_WIDE_INT elpart;
11044 gcc_assert (CONST_INT_P (el));
11045 elpart = INTVAL (el);
11047 for (unsigned int byte = 0; byte < innersize; byte++)
11049 bytes[idx++] = (elpart & 0xff) ^ invmask;
11050 elpart >>= BITS_PER_UNIT;
11055 /* Sanity check. */
11056 gcc_assert (idx == GET_MODE_SIZE (mode));
11060 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11061 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11063 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11064 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11066 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11067 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11069 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11070 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11072 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11074 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11076 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11077 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11079 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11080 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11082 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11083 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11085 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11086 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11088 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11090 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11092 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11093 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11095 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11096 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11098 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11099 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11101 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11102 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11104 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11106 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11107 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11109 while (0);
11111 if (immtype == -1)
11112 return false;
11114 if (info)
11116 info->element_width = elsize;
11117 info->mvn = emvn != 0;
11118 info->shift = eshift;
11120 unsigned HOST_WIDE_INT imm = 0;
11122 if (immtype >= 12 && immtype <= 15)
11123 info->msl = true;
11125 /* Un-invert bytes of recognized vector, if necessary. */
11126 if (invmask != 0)
11127 for (i = 0; i < idx; i++)
11128 bytes[i] ^= invmask;
11130 if (immtype == 17)
11132 /* FIXME: Broken on 32-bit H_W_I hosts. */
11133 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11135 for (i = 0; i < 8; i++)
11136 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11137 << (i * BITS_PER_UNIT);
11140 info->value = GEN_INT (imm);
11142 else
11144 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11145 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11147 /* Construct 'abcdefgh' because the assembler cannot handle
11148 generic constants. */
11149 if (info->mvn)
11150 imm = ~imm;
11151 imm = (imm >> info->shift) & 0xff;
11152 info->value = GEN_INT (imm);
11156 return true;
11157 #undef CHECK
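/* Example (illustrative): for the V4SImode constant with all elements
   equal to 0x0000ab00 the byte image is { 0x00, 0xab, 0x00, 0x00, ... },
   which matches the second CHECK above (immtype 1), so *INFO is filled
   in with element_width == 32, shift == 8, mvn == false and
   value == 0xab, i.e. the operand of "movi vN.4s, #0xab, lsl #8".  */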
11160 /* Check if immediate shift constants are within range. */
11161 bool
11162 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11164 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11165 if (left)
11166 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11167 else
11168 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11171 /* Return true if X is a uniform vector where all elements
11172 are either the floating-point constant 0.0 or the
11173 integer constant 0. */
11174 bool
11175 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11177 return x == CONST0_RTX (mode);
11181 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11182 operation of width WIDTH at bit position POS. */
11185 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11187 gcc_assert (CONST_INT_P (width));
11188 gcc_assert (CONST_INT_P (pos));
11190 unsigned HOST_WIDE_INT mask
11191 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11192 return GEN_INT (mask << UINTVAL (pos));
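/* For instance (illustrative), WIDTH == 8 and POS == 16 give the mask
   ((HOST_WIDE_INT) 0xff) << 16 == 0xff0000.  */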
11195 bool
11196 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11198 HOST_WIDE_INT imm = INTVAL (x);
11199 int i;
11201 for (i = 0; i < 8; i++)
11203 unsigned int byte = imm & 0xff;
11204 if (byte != 0xff && byte != 0)
11205 return false;
11206 imm >>= 8;
11209 return true;
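/* E.g. (illustrative) 0xff00ffff00ff0000 is accepted (every byte is 0x00
   or 0xff, so it is a valid 64-bit MOVI immediate), while 0x1234 is
   rejected because of the 0x34 and 0x12 bytes.  */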
11212 bool
11213 aarch64_mov_operand_p (rtx x, machine_mode mode)
11215 if (GET_CODE (x) == HIGH
11216 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11217 return true;
11219 if (CONST_INT_P (x))
11220 return true;
11222 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11223 return true;
11225 return aarch64_classify_symbolic_expression (x)
11226 == SYMBOL_TINY_ABSOLUTE;
11229 /* Return a CONST_VECTOR with every element set to the CONST_INT VAL. */
11231 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
11233 int nunits = GET_MODE_NUNITS (mode);
11234 rtvec v = rtvec_alloc (nunits);
11235 int i;
11237 for (i=0; i < nunits; i++)
11238 RTVEC_ELT (v, i) = GEN_INT (val);
11240 return gen_rtx_CONST_VECTOR (mode, v);
11243 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11245 bool
11246 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11248 machine_mode vmode;
11250 gcc_assert (!VECTOR_MODE_P (mode));
11251 vmode = aarch64_preferred_simd_mode (mode);
11252 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11253 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11256 /* Construct and return a PARALLEL RTX vector with elements numbering the
11257 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11258 the vector - from the perspective of the architecture. This does not
11259 line up with GCC's perspective on lane numbers, so we end up with
11260 different masks depending on our target endian-ness. The diagram
11261 below may help. We must draw the distinction when building masks
11262 which select one half of the vector. An instruction selecting
11263 architectural low-lanes for a big-endian target, must be described using
11264 a mask selecting GCC high-lanes.
11266 Big-Endian Little-Endian
11268 GCC 0 1 2 3 3 2 1 0
11269 | x | x | x | x | | x | x | x | x |
11270 Architecture 3 2 1 0 3 2 1 0
11272 Low Mask: { 2, 3 } { 0, 1 }
11273 High Mask: { 0, 1 } { 2, 3 }
11277 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11279 int nunits = GET_MODE_NUNITS (mode);
11280 rtvec v = rtvec_alloc (nunits / 2);
11281 int high_base = nunits / 2;
11282 int low_base = 0;
11283 int base;
11284 rtx t1;
11285 int i;
11287 if (BYTES_BIG_ENDIAN)
11288 base = high ? low_base : high_base;
11289 else
11290 base = high ? high_base : low_base;
11292 for (i = 0; i < nunits / 2; i++)
11293 RTVEC_ELT (v, i) = GEN_INT (base + i);
11295 t1 = gen_rtx_PARALLEL (mode, v);
11296 return t1;
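/* For example (illustrative): for V4SImode and HIGH == true this returns
   the PARALLEL (2 3) on a little-endian target but (0 1) on a big-endian
   target, as in the diagram above.  */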
11299 /* Check OP for validity as a PARALLEL RTX vector with elements
11300 numbering the lanes of either the high (HIGH == TRUE) or low
11301 (HIGH == FALSE) half of the vector, from the perspective of the
11302 architecture. See the diagram above aarch64_simd_vect_par_cnst_half. */
11304 bool
11305 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11306 bool high)
11308 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11309 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11310 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11311 int i = 0;
11313 if (!VECTOR_MODE_P (mode))
11314 return false;
11316 if (count_op != count_ideal)
11317 return false;
11319 for (i = 0; i < count_ideal; i++)
11321 rtx elt_op = XVECEXP (op, 0, i);
11322 rtx elt_ideal = XVECEXP (ideal, 0, i);
11324 if (!CONST_INT_P (elt_op)
11325 || INTVAL (elt_ideal) != INTVAL (elt_op))
11326 return false;
11328 return true;
11331 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11332 HIGH (exclusive). */
11333 void
11334 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11335 const_tree exp)
11337 HOST_WIDE_INT lane;
11338 gcc_assert (CONST_INT_P (operand));
11339 lane = INTVAL (operand);
11341 if (lane < low || lane >= high)
11343 if (exp)
11344 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11345 else
11346 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11350 /* Return TRUE if OP is a valid vector addressing mode. */
11351 bool
11352 aarch64_simd_mem_operand_p (rtx op)
11354 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11355 || REG_P (XEXP (op, 0)));
11358 /* Emit a register copy from operand to operand, taking care not to
11359 early-clobber source registers in the process.
11361 COUNT is the number of components into which the copy needs to be
11362 decomposed. */
11363 void
11364 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11365 unsigned int count)
11367 unsigned int i;
11368 int rdest = REGNO (operands[0]);
11369 int rsrc = REGNO (operands[1]);
11371 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11372 || rdest < rsrc)
11373 for (i = 0; i < count; i++)
11374 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11375 gen_rtx_REG (mode, rsrc + i));
11376 else
11377 for (i = 0; i < count; i++)
11378 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11379 gen_rtx_REG (mode, rsrc + count - i - 1));
11382 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11383 one of VSTRUCT modes: OI, CI, or XI. */
11385 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11387 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11390 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11391 alignment of a vector to 128 bits. */
11392 static HOST_WIDE_INT
11393 aarch64_simd_vector_alignment (const_tree type)
11395 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11396 return MIN (align, 128);
11399 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11400 static bool
11401 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11403 if (is_packed)
11404 return false;
11406 /* We guarantee alignment for vectors up to 128-bits. */
11407 if (tree_int_cst_compare (TYPE_SIZE (type),
11408 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11409 return false;
11411 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11412 return true;
11415 /* If VALS is a vector constant that can be loaded into a register
11416 using DUP, generate instructions to do so and return an RTX to
11417 assign to the register. Otherwise return NULL_RTX. */
11418 static rtx
11419 aarch64_simd_dup_constant (rtx vals)
11421 machine_mode mode = GET_MODE (vals);
11422 machine_mode inner_mode = GET_MODE_INNER (mode);
11423 rtx x;
11425 if (!const_vec_duplicate_p (vals, &x))
11426 return NULL_RTX;
11428 /* We can load this constant by using DUP and a constant in a
11429 single ARM register. This will be cheaper than a vector
11430 load. */
11431 x = copy_to_mode_reg (inner_mode, x);
11432 return gen_rtx_VEC_DUPLICATE (mode, x);
11436 /* Generate code to load VALS, which is a PARALLEL containing only
11437 constants (for vec_init) or CONST_VECTOR, efficiently into a
11438 register. Returns an RTX to copy into the register, or NULL_RTX
11439 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11440 static rtx
11441 aarch64_simd_make_constant (rtx vals)
11443 machine_mode mode = GET_MODE (vals);
11444 rtx const_dup;
11445 rtx const_vec = NULL_RTX;
11446 int n_elts = GET_MODE_NUNITS (mode);
11447 int n_const = 0;
11448 int i;
11450 if (GET_CODE (vals) == CONST_VECTOR)
11451 const_vec = vals;
11452 else if (GET_CODE (vals) == PARALLEL)
11454 /* A CONST_VECTOR must contain only CONST_INTs and
11455 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11456 Only store valid constants in a CONST_VECTOR. */
11457 for (i = 0; i < n_elts; ++i)
11459 rtx x = XVECEXP (vals, 0, i);
11460 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11461 n_const++;
11463 if (n_const == n_elts)
11464 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11466 else
11467 gcc_unreachable ();
11469 if (const_vec != NULL_RTX
11470 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11471 /* Load using MOVI/MVNI. */
11472 return const_vec;
11473 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11474 /* Loaded using DUP. */
11475 return const_dup;
11476 else if (const_vec != NULL_RTX)
11477 /* Load from constant pool. We can not take advantage of single-cycle
11478 LD1 because we need a PC-relative addressing mode. */
11479 return const_vec;
11480 else
11481 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11482 We can not construct an initializer. */
11483 return NULL_RTX;
11486 /* Expand a vector initialisation sequence, such that TARGET is
11487 initialised to contain VALS. */
11489 void
11490 aarch64_expand_vector_init (rtx target, rtx vals)
11492 machine_mode mode = GET_MODE (target);
11493 machine_mode inner_mode = GET_MODE_INNER (mode);
11494 /* The number of vector elements. */
11495 int n_elts = GET_MODE_NUNITS (mode);
11496 /* The number of vector elements which are not constant. */
11497 int n_var = 0;
11498 rtx any_const = NULL_RTX;
11499 /* The first element of vals. */
11500 rtx v0 = XVECEXP (vals, 0, 0);
11501 bool all_same = true;
11503 /* Count the number of variable elements to initialise. */
11504 for (int i = 0; i < n_elts; ++i)
11506 rtx x = XVECEXP (vals, 0, i);
11507 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11508 ++n_var;
11509 else
11510 any_const = x;
11512 all_same &= rtx_equal_p (x, v0);
11515 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11516 how best to handle this. */
11517 if (n_var == 0)
11519 rtx constant = aarch64_simd_make_constant (vals);
11520 if (constant != NULL_RTX)
11522 emit_move_insn (target, constant);
11523 return;
11527 /* Splat a single non-constant element if we can. */
11528 if (all_same)
11530 rtx x = copy_to_mode_reg (inner_mode, v0);
11531 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11532 return;
11535 /* Initialise a vector which is part-variable. We want to first try
11536 to build those lanes which are constant in the most efficient way we
11537 can. */
11538 if (n_var != n_elts)
11540 rtx copy = copy_rtx (vals);
11542 /* Load constant part of vector. We really don't care what goes into the
11543 parts we will overwrite, but we're more likely to be able to load the
11544 constant efficiently if it has fewer, larger, repeating parts
11545 (see aarch64_simd_valid_immediate). */
11546 for (int i = 0; i < n_elts; i++)
11548 rtx x = XVECEXP (vals, 0, i);
11549 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11550 continue;
11551 rtx subst = any_const;
11552 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11554 /* Look in the copied vector, as more elements are const. */
11555 rtx test = XVECEXP (copy, 0, i ^ bit);
11556 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11558 subst = test;
11559 break;
11562 XVECEXP (copy, 0, i) = subst;
11564 aarch64_expand_vector_init (target, copy);
11567 /* Insert the variable lanes directly. */
11569 enum insn_code icode = optab_handler (vec_set_optab, mode);
11570 gcc_assert (icode != CODE_FOR_nothing);
11572 for (int i = 0; i < n_elts; i++)
11574 rtx x = XVECEXP (vals, 0, i);
11575 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11576 continue;
11577 x = copy_to_mode_reg (inner_mode, x);
11578 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
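/* Worked example (illustrative): initialising a V4SImode vector with
   { x, 1, 2, 3 } where only x is in a register.  The copy of VALS gets
   lane 0 replaced by the nearby constant 2, giving { 2, 1, 2, 3 }; that
   constant vector is loaded by the recursive call, and lane 0 is then
   overwritten with x via the vec_set pattern.  */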
11582 static unsigned HOST_WIDE_INT
11583 aarch64_shift_truncation_mask (machine_mode mode)
11585 return
11586 (!SHIFT_COUNT_TRUNCATED
11587 || aarch64_vector_mode_supported_p (mode)
11588 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11591 /* Select a format to encode pointers in exception handling data. */
11593 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11595 int type;
11596 switch (aarch64_cmodel)
11598 case AARCH64_CMODEL_TINY:
11599 case AARCH64_CMODEL_TINY_PIC:
11600 case AARCH64_CMODEL_SMALL:
11601 case AARCH64_CMODEL_SMALL_PIC:
11602 case AARCH64_CMODEL_SMALL_SPIC:
11603 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11604 for everything. */
11605 type = DW_EH_PE_sdata4;
11606 break;
11607 default:
11608 /* No assumptions here. 8-byte relocs required. */
11609 type = DW_EH_PE_sdata8;
11610 break;
11612 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11615 /* The last .arch and .tune assembly strings that we printed. */
11616 static std::string aarch64_last_printed_arch_string;
11617 static std::string aarch64_last_printed_tune_string;
11619 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11620 by the function fndecl. */
11622 void
11623 aarch64_declare_function_name (FILE *stream, const char* name,
11624 tree fndecl)
11626 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11628 struct cl_target_option *targ_options;
11629 if (target_parts)
11630 targ_options = TREE_TARGET_OPTION (target_parts);
11631 else
11632 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11633 gcc_assert (targ_options);
11635 const struct processor *this_arch
11636 = aarch64_get_arch (targ_options->x_explicit_arch);
11638 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11639 std::string extension
11640 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11641 this_arch->flags);
11642 /* Only update the assembler .arch string if it is distinct from the last
11643 such string we printed. */
11644 std::string to_print = this_arch->name + extension;
11645 if (to_print != aarch64_last_printed_arch_string)
11647 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11648 aarch64_last_printed_arch_string = to_print;
11651 /* Print the cpu name we're tuning for in the comments; it might be
11652 useful to readers of the generated asm. Do it only when it changes
11653 from function to function and verbose assembly is requested. */
11654 const struct processor *this_tune
11655 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11657 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11659 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11660 this_tune->name);
11661 aarch64_last_printed_tune_string = this_tune->name;
11664 /* Don't forget the type directive for ELF. */
11665 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11666 ASM_OUTPUT_LABEL (stream, name);
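/* For example (an illustrative sketch of the output): a function carrying
   __attribute__ ((target ("arch=armv8-a+crc"))) in a translation unit
   otherwise built for plain armv8-a would be preceded by

       .arch armv8-a+crc

   and, when -dA is in effect, by a "// .tune <cpu>" comment whenever the
   tuning CPU changes between functions.  */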
11669 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11671 static void
11672 aarch64_start_file (void)
11674 struct cl_target_option *default_options
11675 = TREE_TARGET_OPTION (target_option_default_node);
11677 const struct processor *default_arch
11678 = aarch64_get_arch (default_options->x_explicit_arch);
11679 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11680 std::string extension
11681 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11682 default_arch->flags);
11684 aarch64_last_printed_arch_string = default_arch->name + extension;
11685 aarch64_last_printed_tune_string = "";
11686 asm_fprintf (asm_out_file, "\t.arch %s\n",
11687 aarch64_last_printed_arch_string.c_str ());
11689 default_file_start ();
11692 /* Emit load exclusive. */
11694 static void
11695 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11696 rtx mem, rtx model_rtx)
11698 rtx (*gen) (rtx, rtx, rtx);
11700 switch (mode)
11702 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11703 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11704 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11705 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11706 default:
11707 gcc_unreachable ();
11710 emit_insn (gen (rval, mem, model_rtx));
11713 /* Emit store exclusive. */
11715 static void
11716 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11717 rtx rval, rtx mem, rtx model_rtx)
11719 rtx (*gen) (rtx, rtx, rtx, rtx);
11721 switch (mode)
11723 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11724 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11725 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11726 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11727 default:
11728 gcc_unreachable ();
11731 emit_insn (gen (bval, rval, mem, model_rtx));
11734 /* Emit the jump instruction INSN and mark it as very unlikely to be taken. */
11736 static void
11737 aarch64_emit_unlikely_jump (rtx insn)
11739 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11741 rtx_insn *jump = emit_jump_insn (insn);
11742 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
11745 /* Expand a compare and swap pattern. */
11747 void
11748 aarch64_expand_compare_and_swap (rtx operands[])
11750 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11751 machine_mode mode, cmp_mode;
11752 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11753 int idx;
11754 gen_cas_fn gen;
11755 const gen_cas_fn split_cas[] =
11757 gen_aarch64_compare_and_swapqi,
11758 gen_aarch64_compare_and_swaphi,
11759 gen_aarch64_compare_and_swapsi,
11760 gen_aarch64_compare_and_swapdi
11762 const gen_cas_fn atomic_cas[] =
11764 gen_aarch64_compare_and_swapqi_lse,
11765 gen_aarch64_compare_and_swaphi_lse,
11766 gen_aarch64_compare_and_swapsi_lse,
11767 gen_aarch64_compare_and_swapdi_lse
11770 bval = operands[0];
11771 rval = operands[1];
11772 mem = operands[2];
11773 oldval = operands[3];
11774 newval = operands[4];
11775 is_weak = operands[5];
11776 mod_s = operands[6];
11777 mod_f = operands[7];
11778 mode = GET_MODE (mem);
11779 cmp_mode = mode;
11781 /* Normally the succ memory model must be stronger than fail, but in the
11782 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11783 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11785 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11786 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11787 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11789 switch (mode)
11791 case QImode:
11792 case HImode:
11793 /* For short modes, we're going to perform the comparison in SImode,
11794 so do the zero-extension now. */
11795 cmp_mode = SImode;
11796 rval = gen_reg_rtx (SImode);
11797 oldval = convert_modes (SImode, mode, oldval, true);
11798 /* Fall through. */
11800 case SImode:
11801 case DImode:
11802 /* Force the value into a register if needed. */
11803 if (!aarch64_plus_operand (oldval, mode))
11804 oldval = force_reg (cmp_mode, oldval);
11805 break;
11807 default:
11808 gcc_unreachable ();
11811 switch (mode)
11813 case QImode: idx = 0; break;
11814 case HImode: idx = 1; break;
11815 case SImode: idx = 2; break;
11816 case DImode: idx = 3; break;
11817 default:
11818 gcc_unreachable ();
11820 if (TARGET_LSE)
11821 gen = atomic_cas[idx];
11822 else
11823 gen = split_cas[idx];
11825 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11827 if (mode == QImode || mode == HImode)
11828 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11830 x = gen_rtx_REG (CCmode, CC_REGNUM);
11831 x = gen_rtx_EQ (SImode, x, const0_rtx);
11832 emit_insn (gen_rtx_SET (bval, x));
11835 /* Test whether the target supports using an atomic load-operate
11836 instruction for operation CODE. Returns FALSE if the operation
11837 isn't supported by the architecture. */
11841 bool
11842 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11844 if (!TARGET_LSE)
11845 return false;
11847 switch (code)
11849 case SET:
11850 case AND:
11851 case IOR:
11852 case XOR:
11853 case MINUS:
11854 case PLUS:
11855 return true;
11856 default:
11857 return false;
11861 /* Emit a barrier appropriate for memory model MODEL at the end of a
11862 sequence implementing an atomic operation. */
11864 static void
11865 aarch64_emit_post_barrier (enum memmodel model)
11867 const enum memmodel base_model = memmodel_base (model);
11869 if (is_mm_sync (model)
11870 && (base_model == MEMMODEL_ACQUIRE
11871 || base_model == MEMMODEL_ACQ_REL
11872 || base_model == MEMMODEL_SEQ_CST))
11874 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11878 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11879 for the data in memory. EXPECTED is the value expected to be in memory.
11880 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11881 is the memory ordering to use. */
11883 void
11884 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11885 rtx expected, rtx desired,
11886 rtx model)
11888 rtx (*gen) (rtx, rtx, rtx, rtx);
11889 machine_mode mode;
11891 mode = GET_MODE (mem);
11893 switch (mode)
11895 case QImode: gen = gen_aarch64_atomic_casqi; break;
11896 case HImode: gen = gen_aarch64_atomic_cashi; break;
11897 case SImode: gen = gen_aarch64_atomic_cassi; break;
11898 case DImode: gen = gen_aarch64_atomic_casdi; break;
11899 default:
11900 gcc_unreachable ();
11903 /* Move the expected value into the CAS destination register. */
11904 emit_insn (gen_rtx_SET (rval, expected));
11906 /* Emit the CAS. */
11907 emit_insn (gen (rval, mem, desired, model));
11909 /* Compare the expected value with the value loaded by the CAS, to establish
11910 whether the swap was made. */
11911 aarch64_gen_compare_reg (EQ, rval, expected);
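/* Illustrative sequence (added commentary, registers are arbitrary): for an
   SImode CAS with TARGET_LSE this emits roughly

       mov    w0, w_expected            // seed the CAS destination
       casal  w0, w_desired, [x_mem]    // variant chosen by MODEL
       cmp    w0, w_expected            // set CC for the caller

   so RVAL ends up holding the value observed in memory and the condition
   flags record whether the swap happened.  */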
11914 /* Split a compare and swap pattern. */
11916 void
11917 aarch64_split_compare_and_swap (rtx operands[])
11919 rtx rval, mem, oldval, newval, scratch;
11920 machine_mode mode;
11921 bool is_weak;
11922 rtx_code_label *label1, *label2;
11923 rtx x, cond;
11924 enum memmodel model;
11925 rtx model_rtx;
11927 rval = operands[0];
11928 mem = operands[1];
11929 oldval = operands[2];
11930 newval = operands[3];
11931 is_weak = (operands[4] != const0_rtx);
11932 model_rtx = operands[5];
11933 scratch = operands[7];
11934 mode = GET_MODE (mem);
11935 model = memmodel_from_int (INTVAL (model_rtx));
11937 label1 = NULL;
11938 if (!is_weak)
11940 label1 = gen_label_rtx ();
11941 emit_label (label1);
11943 label2 = gen_label_rtx ();
11945 /* The initial load can be relaxed for a __sync operation since a final
11946 barrier will be emitted to stop code hoisting. */
11947 if (is_mm_sync (model))
11948 aarch64_emit_load_exclusive (mode, rval, mem,
11949 GEN_INT (MEMMODEL_RELAXED));
11950 else
11951 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11953 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11954 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11955 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11956 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11957 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11959 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11961 if (!is_weak)
11963 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11964 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11965 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11966 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11968 else
11970 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11971 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11972 emit_insn (gen_rtx_SET (cond, x));
11975 emit_label (label2);
11977 /* Emit any final barrier needed for a __sync operation. */
11978 if (is_mm_sync (model))
11979 aarch64_emit_post_barrier (model);
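/* Rough shape of the split sequence (added commentary, not literal output),
   for the strong (!is_weak) SImode case:

     1: ldaxr  w_rval, [x_mem]              // relaxed for __sync variants
        cmp    w_rval, w_oldval
        b.ne   2f
        stlxr  w_scratch, w_newval, [x_mem]
        cbnz   w_scratch, 1b
     2:

   The weak form drops the retry branch and instead leaves the
   store-exclusive status in the condition flags.  */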
11982 /* Emit a BIC instruction. */
11984 static void
11985 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11987 rtx shift_rtx = GEN_INT (shift);
11988 rtx (*gen) (rtx, rtx, rtx, rtx);
11990 switch (mode)
11992 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11993 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11994 default:
11995 gcc_unreachable ();
11998 emit_insn (gen (dst, s2, shift_rtx, s1));
12001 /* Emit an atomic swap. */
12003 static void
12004 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12005 rtx mem, rtx model)
12007 rtx (*gen) (rtx, rtx, rtx, rtx);
12009 switch (mode)
12011 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12012 case HImode: gen = gen_aarch64_atomic_swphi; break;
12013 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12014 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12015 default:
12016 gcc_unreachable ();
12019 emit_insn (gen (dst, mem, value, model));
12022 /* Operations supported by aarch64_emit_atomic_load_op. */
12024 enum aarch64_atomic_load_op_code
12026 AARCH64_LDOP_PLUS, /* A + B */
12027 AARCH64_LDOP_XOR, /* A ^ B */
12028 AARCH64_LDOP_OR, /* A | B */
12029 AARCH64_LDOP_BIC /* A & ~B */
12032 /* Emit an atomic load-operate. */
12034 static void
12035 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12036 machine_mode mode, rtx dst, rtx src,
12037 rtx mem, rtx model)
12039 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12040 const aarch64_atomic_load_op_fn plus[] =
12042 gen_aarch64_atomic_loadaddqi,
12043 gen_aarch64_atomic_loadaddhi,
12044 gen_aarch64_atomic_loadaddsi,
12045 gen_aarch64_atomic_loadadddi
12047 const aarch64_atomic_load_op_fn eor[] =
12049 gen_aarch64_atomic_loadeorqi,
12050 gen_aarch64_atomic_loadeorhi,
12051 gen_aarch64_atomic_loadeorsi,
12052 gen_aarch64_atomic_loadeordi
12054 const aarch64_atomic_load_op_fn ior[] =
12056 gen_aarch64_atomic_loadsetqi,
12057 gen_aarch64_atomic_loadsethi,
12058 gen_aarch64_atomic_loadsetsi,
12059 gen_aarch64_atomic_loadsetdi
12061 const aarch64_atomic_load_op_fn bic[] =
12063 gen_aarch64_atomic_loadclrqi,
12064 gen_aarch64_atomic_loadclrhi,
12065 gen_aarch64_atomic_loadclrsi,
12066 gen_aarch64_atomic_loadclrdi
12068 aarch64_atomic_load_op_fn gen;
12069 int idx = 0;
12071 switch (mode)
12073 case QImode: idx = 0; break;
12074 case HImode: idx = 1; break;
12075 case SImode: idx = 2; break;
12076 case DImode: idx = 3; break;
12077 default:
12078 gcc_unreachable ();
12081 switch (code)
12083 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12084 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12085 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12086 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12087 default:
12088 gcc_unreachable ();
12091 emit_insn (gen (dst, mem, src, model));
12094 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12095 location to store the data read from memory. OUT_RESULT is the location to
12096 store the result of the operation. MEM is the memory location to read and
12097 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12098 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12099 be NULL. */
12101 void
12102 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12103 rtx mem, rtx value, rtx model_rtx)
12105 machine_mode mode = GET_MODE (mem);
12106 machine_mode wmode = (mode == DImode ? DImode : SImode);
12107 const bool short_mode = (mode < SImode);
12108 aarch64_atomic_load_op_code ldop_code;
12109 rtx src;
12110 rtx x;
12112 if (out_data)
12113 out_data = gen_lowpart (mode, out_data);
12115 if (out_result)
12116 out_result = gen_lowpart (mode, out_result);
12118 /* Make sure the value is in a register, putting it into a destination
12119 register if it needs to be manipulated. */
12120 if (!register_operand (value, mode)
12121 || code == AND || code == MINUS)
12123 src = out_result ? out_result : out_data;
12124 emit_move_insn (src, gen_lowpart (mode, value));
12126 else
12127 src = value;
12128 gcc_assert (register_operand (src, mode));
12130 /* Preprocess the data for the operation as necessary. If the operation is
12131 a SET then emit a swap instruction and finish. */
12132 switch (code)
12134 case SET:
12135 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12136 return;
12138 case MINUS:
12139 /* Negate the value and treat it as a PLUS. */
12141 rtx neg_src;
12143 /* Resize the value if necessary. */
12144 if (short_mode)
12145 src = gen_lowpart (wmode, src);
12147 neg_src = gen_rtx_NEG (wmode, src);
12148 emit_insn (gen_rtx_SET (src, neg_src));
12150 if (short_mode)
12151 src = gen_lowpart (mode, src);
12153 /* Fall-through. */
12154 case PLUS:
12155 ldop_code = AARCH64_LDOP_PLUS;
12156 break;
12158 case IOR:
12159 ldop_code = AARCH64_LDOP_OR;
12160 break;
12162 case XOR:
12163 ldop_code = AARCH64_LDOP_XOR;
12164 break;
12166 case AND:
12168 rtx not_src;
12170 /* Resize the value if necessary. */
12171 if (short_mode)
12172 src = gen_lowpart (wmode, src);
12174 not_src = gen_rtx_NOT (wmode, src);
12175 emit_insn (gen_rtx_SET (src, not_src));
12177 if (short_mode)
12178 src = gen_lowpart (mode, src);
12180 ldop_code = AARCH64_LDOP_BIC;
12181 break;
12183 default:
12184 /* The operation can't be done with atomic instructions. */
12185 gcc_unreachable ();
12188 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12190 /* If necessary, calculate the data in memory after the update by redoing the
12191 operation from values in registers. */
12192 if (!out_result)
12193 return;
12195 if (short_mode)
12197 src = gen_lowpart (wmode, src);
12198 out_data = gen_lowpart (wmode, out_data);
12199 out_result = gen_lowpart (wmode, out_result);
12202 x = NULL_RTX;
12204 switch (code)
12206 case MINUS:
12207 case PLUS:
12208 x = gen_rtx_PLUS (wmode, out_data, src);
12209 break;
12210 case IOR:
12211 x = gen_rtx_IOR (wmode, out_data, src);
12212 break;
12213 case XOR:
12214 x = gen_rtx_XOR (wmode, out_data, src);
12215 break;
12216 case AND:
12217 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12218 return;
12219 default:
12220 gcc_unreachable ();
12223 emit_set_insn (out_result, x);
12225 return;
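/* Worked example (added commentary): __atomic_fetch_and on an int with LSE
   is handled here by first inverting the operand, because LDCLR computes
   A & ~B:

       mvn      w_tmp, w_value
       ldclral  w_tmp, w_old, [x_mem]    // w_old <- *mem; *mem &= ~w_tmp

   and, when the post-operation value is also wanted, it is recomputed from
   registers with a BIC (see aarch64_emit_bic above).  */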
12228 /* Split an atomic operation. */
12230 void
12231 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12232 rtx value, rtx model_rtx, rtx cond)
12234 machine_mode mode = GET_MODE (mem);
12235 machine_mode wmode = (mode == DImode ? DImode : SImode);
12236 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12237 const bool is_sync = is_mm_sync (model);
12238 rtx_code_label *label;
12239 rtx x;
12241 /* Split the atomic operation into a sequence. */
12242 label = gen_label_rtx ();
12243 emit_label (label);
12245 if (new_out)
12246 new_out = gen_lowpart (wmode, new_out);
12247 if (old_out)
12248 old_out = gen_lowpart (wmode, old_out);
12249 else
12250 old_out = new_out;
12251 value = simplify_gen_subreg (wmode, value, mode, 0);
12253 /* The initial load can be relaxed for a __sync operation since a final
12254 barrier will be emitted to stop code hoisting. */
12255 if (is_sync)
12256 aarch64_emit_load_exclusive (mode, old_out, mem,
12257 GEN_INT (MEMMODEL_RELAXED));
12258 else
12259 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12261 switch (code)
12263 case SET:
12264 new_out = value;
12265 break;
12267 case NOT:
12268 x = gen_rtx_AND (wmode, old_out, value);
12269 emit_insn (gen_rtx_SET (new_out, x));
12270 x = gen_rtx_NOT (wmode, new_out);
12271 emit_insn (gen_rtx_SET (new_out, x));
12272 break;
12274 case MINUS:
12275 if (CONST_INT_P (value))
12277 value = GEN_INT (-INTVAL (value));
12278 code = PLUS;
12280 /* Fall through. */
12282 default:
12283 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12284 emit_insn (gen_rtx_SET (new_out, x));
12285 break;
12288 aarch64_emit_store_exclusive (mode, cond, mem,
12289 gen_lowpart (mode, new_out), model_rtx);
12291 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12292 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12293 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12294 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12296 /* Emit any final barrier needed for a __sync operation. */
12297 if (is_sync)
12298 aarch64_emit_post_barrier (model);
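/* Rough shape of the split sequence (added commentary): an SImode
   __atomic_fetch_add without LSE becomes a retry loop along the lines of

     1: ldxr  w_old, [x_mem]             // acquire variant per MODEL_RTX
        add   w_new, w_old, w_value
        stxr  w_cond, w_new, [x_mem]     // release variant per MODEL_RTX
        cbnz  w_cond, 1b

   with a trailing barrier appended only for the __sync forms.  */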
12301 static void
12302 aarch64_init_libfuncs (void)
12304 /* Half-precision float operations. The compiler handles all operations
12305 with NULL libfuncs by converting to SFmode. */
12307 /* Conversions. */
12308 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12309 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12311 /* Arithmetic. */
12312 set_optab_libfunc (add_optab, HFmode, NULL);
12313 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12314 set_optab_libfunc (smul_optab, HFmode, NULL);
12315 set_optab_libfunc (neg_optab, HFmode, NULL);
12316 set_optab_libfunc (sub_optab, HFmode, NULL);
12318 /* Comparisons. */
12319 set_optab_libfunc (eq_optab, HFmode, NULL);
12320 set_optab_libfunc (ne_optab, HFmode, NULL);
12321 set_optab_libfunc (lt_optab, HFmode, NULL);
12322 set_optab_libfunc (le_optab, HFmode, NULL);
12323 set_optab_libfunc (ge_optab, HFmode, NULL);
12324 set_optab_libfunc (gt_optab, HFmode, NULL);
12325 set_optab_libfunc (unord_optab, HFmode, NULL);
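/* Note (added commentary): with the HFmode optabs cleared above, a __fp16
   addition is performed by widening both operands to SFmode, adding, and
   truncating the result back to HFmode.  The registered conversion helpers
   are only used when the conversion cannot be done inline (normally FCVT
   instructions handle it).  */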
12328 /* Target hook for c_mode_for_suffix. */
12329 static machine_mode
12330 aarch64_c_mode_for_suffix (char suffix)
12332 if (suffix == 'q')
12333 return TFmode;
12335 return VOIDmode;
12338 /* We can only represent floating point constants which will fit in
12339 "quarter-precision" values. These values are characterised by
12340 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by the formula
12343 (-1)^s * (n/16) * 2^r
12345 Where:
12346 's' is the sign bit.
12347 'n' is an integer in the range 16 <= n <= 31.
12348 'r' is an integer in the range -3 <= r <= 4. */
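/* Worked example (added commentary): 0.75 = (-1)^0 * (24/16) * 2^(-1), with
   n = 24 and r = -1 both in range, so it is a valid FMOV immediate; 0.0 has
   no encoding of this form and is rejected below.  */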
12350 /* Return true iff X can be represented by a quarter-precision
12351 floating point immediate operand.  Note, we cannot represent 0.0. */
12352 bool
12353 aarch64_float_const_representable_p (rtx x)
12355 /* This represents our current view of how many bits
12356 make up the mantissa. */
12357 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12358 int exponent;
12359 unsigned HOST_WIDE_INT mantissa, mask;
12360 REAL_VALUE_TYPE r, m;
12361 bool fail;
12363 if (!CONST_DOUBLE_P (x))
12364 return false;
12366 /* We don't support HFmode constants yet. */
12367 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12368 return false;
12370 r = *CONST_DOUBLE_REAL_VALUE (x);
12372 /* We cannot represent infinities, NaNs or +/-zero. We won't
12373 know if we have +zero until we analyse the mantissa, but we
12374 can reject the other invalid values. */
12375 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12376 || REAL_VALUE_MINUS_ZERO (r))
12377 return false;
12379 /* Extract exponent. */
12380 r = real_value_abs (&r);
12381 exponent = REAL_EXP (&r);
12383 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12384 highest (sign) bit, with a fixed binary point at bit point_pos.
12385 m1 holds the low part of the mantissa, m2 the high part.
12386 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12387 bits for the mantissa, this can fail (low bits will be lost). */
12388 real_ldexp (&m, &r, point_pos - exponent);
12389 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12391 /* If the low part of the mantissa has bits set we cannot represent
12392 the value. */
12393 if (w.elt (0) != 0)
12394 return false;
12395 /* We have rejected the lower HOST_WIDE_INT, so update our
12396 understanding of how many bits lie in the mantissa and
12397 look only at the high HOST_WIDE_INT. */
12398 mantissa = w.elt (1);
12399 point_pos -= HOST_BITS_PER_WIDE_INT;
12401 /* We can only represent values with a mantissa of the form 1.xxxx. */
12402 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12403 if ((mantissa & mask) != 0)
12404 return false;
12406 /* Having filtered unrepresentable values, we may now remove all
12407 but the highest 5 bits. */
12408 mantissa >>= point_pos - 5;
12410 /* We cannot represent the value 0.0, so reject it. This is handled
12411 elsewhere. */
12412 if (mantissa == 0)
12413 return false;
12415 /* Then, as bit 4 is always set, we can mask it off, leaving
12416 the mantissa in the range [0, 15]. */
12417 mantissa &= ~(1 << 4);
12418 gcc_assert (mantissa <= 15);
12420 /* GCC internally does not use IEEE754-like encoding (where normalized
12421 significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
12422 Our mantissa values are shifted 4 places to the left relative to
12423 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12424 by 5 places to correct for GCC's representation. */
12425 exponent = 5 - exponent;
12427 return (exponent >= 0 && exponent <= 7);
12430 char*
12431 aarch64_output_simd_mov_immediate (rtx const_vector,
12432 machine_mode mode,
12433 unsigned width)
12435 bool is_valid;
12436 static char templ[40];
12437 const char *mnemonic;
12438 const char *shift_op;
12439 unsigned int lane_count = 0;
12440 char element_char;
12442 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12444 /* This will return true to show const_vector is legal for use as either
12445 an AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate.  It will
12446 also update INFO to show how the immediate should be generated. */
12447 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12448 gcc_assert (is_valid);
12450 element_char = sizetochar (info.element_width);
12451 lane_count = width / info.element_width;
12453 mode = GET_MODE_INNER (mode);
12454 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12456 gcc_assert (info.shift == 0 && ! info.mvn);
12457 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12458 move immediate path. */
12459 if (aarch64_float_const_zero_rtx_p (info.value))
12460 info.value = GEN_INT (0);
12461 else
12463 const unsigned int buf_size = 20;
12464 char float_buf[buf_size] = {'\0'};
12465 real_to_decimal_for_mode (float_buf,
12466 CONST_DOUBLE_REAL_VALUE (info.value),
12467 buf_size, buf_size, 1, mode);
12469 if (lane_count == 1)
12470 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12471 else
12472 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12473 lane_count, element_char, float_buf);
12474 return templ;
12478 mnemonic = info.mvn ? "mvni" : "movi";
12479 shift_op = info.msl ? "msl" : "lsl";
12481 gcc_assert (CONST_INT_P (info.value));
12482 if (lane_count == 1)
12483 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12484 mnemonic, UINTVAL (info.value));
12485 else if (info.shift)
12486 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12487 ", %s %d", mnemonic, lane_count, element_char,
12488 UINTVAL (info.value), shift_op, info.shift);
12489 else
12490 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12491 mnemonic, lane_count, element_char, UINTVAL (info.value));
12492 return templ;
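/* Example outputs (added commentary, following the templates above but not
   exact): a V4SI splat of 0x55 prints roughly as "movi v0.4s, 0x55", a
   shifted immediate as "movi v0.4s, 0xab, lsl 8", and a floating-point
   splat of 1.0 in V2DF as something like "fmov v0.2d, 1.0e+0".  The %0
   register is filled in later by final.  */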
12495 char*
12496 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12497 machine_mode mode)
12499 machine_mode vmode;
12501 gcc_assert (!VECTOR_MODE_P (mode));
12502 vmode = aarch64_simd_container_mode (mode, 64);
12503 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12504 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12507 /* Split operands into moves from op[1] + op[2] into op[0]. */
12509 void
12510 aarch64_split_combinev16qi (rtx operands[3])
12512 unsigned int dest = REGNO (operands[0]);
12513 unsigned int src1 = REGNO (operands[1]);
12514 unsigned int src2 = REGNO (operands[2]);
12515 machine_mode halfmode = GET_MODE (operands[1]);
12516 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12517 rtx destlo, desthi;
12519 gcc_assert (halfmode == V16QImode);
12521 if (src1 == dest && src2 == dest + halfregs)
12523 /* No-op move. Can't split to nothing; emit something. */
12524 emit_note (NOTE_INSN_DELETED);
12525 return;
12528 /* Preserve register attributes for variable tracking. */
12529 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12530 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12531 GET_MODE_SIZE (halfmode));
12533 /* Special case of reversed high/low parts. */
12534 if (reg_overlap_mentioned_p (operands[2], destlo)
12535 && reg_overlap_mentioned_p (operands[1], desthi))
12537 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12538 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12539 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12541 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12543 /* Try to avoid unnecessary moves if part of the result
12544 is in the right place already. */
12545 if (src1 != dest)
12546 emit_move_insn (destlo, operands[1]);
12547 if (src2 != dest + halfregs)
12548 emit_move_insn (desthi, operands[2]);
12550 else
12552 if (src2 != dest + halfregs)
12553 emit_move_insn (desthi, operands[2]);
12554 if (src1 != dest)
12555 emit_move_insn (destlo, operands[1]);
12559 /* vec_perm support. */
12561 #define MAX_VECT_LEN 16
12563 struct expand_vec_perm_d
12565 rtx target, op0, op1;
12566 unsigned char perm[MAX_VECT_LEN];
12567 machine_mode vmode;
12568 unsigned char nelt;
12569 bool one_vector_p;
12570 bool testing_p;
12573 /* Generate a variable permutation. */
12575 static void
12576 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12578 machine_mode vmode = GET_MODE (target);
12579 bool one_vector_p = rtx_equal_p (op0, op1);
12581 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12582 gcc_checking_assert (GET_MODE (op0) == vmode);
12583 gcc_checking_assert (GET_MODE (op1) == vmode);
12584 gcc_checking_assert (GET_MODE (sel) == vmode);
12585 gcc_checking_assert (TARGET_SIMD);
12587 if (one_vector_p)
12589 if (vmode == V8QImode)
12591 /* Expand the argument to a V16QI mode by duplicating it. */
12592 rtx pair = gen_reg_rtx (V16QImode);
12593 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12594 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12596 else
12598 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12601 else
12603 rtx pair;
12605 if (vmode == V8QImode)
12607 pair = gen_reg_rtx (V16QImode);
12608 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12609 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12611 else
12613 pair = gen_reg_rtx (OImode);
12614 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12615 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12620 void
12621 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12623 machine_mode vmode = GET_MODE (target);
12624 unsigned int nelt = GET_MODE_NUNITS (vmode);
12625 bool one_vector_p = rtx_equal_p (op0, op1);
12626 rtx mask;
12628 /* The TBL instruction does not use a modulo index, so we must take care
12629 of that ourselves. */
12630 mask = aarch64_simd_gen_const_vector_dup (vmode,
12631 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12632 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12634 /* For big-endian, we also need to reverse the index within the vector
12635 (but not which vector). */
12636 if (BYTES_BIG_ENDIAN)
12638 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12639 if (!one_vector_p)
12640 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12641 sel = expand_simple_binop (vmode, XOR, sel, mask,
12642 NULL, 0, OPTAB_LIB_WIDEN);
12644 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12647 /* Recognize patterns suitable for the TRN instructions. */
12648 static bool
12649 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12651 unsigned int i, odd, mask, nelt = d->nelt;
12652 rtx out, in0, in1, x;
12653 rtx (*gen) (rtx, rtx, rtx);
12654 machine_mode vmode = d->vmode;
12656 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12657 return false;
12659 /* Note that these are little-endian tests.
12660 We correct for big-endian later. */
12661 if (d->perm[0] == 0)
12662 odd = 0;
12663 else if (d->perm[0] == 1)
12664 odd = 1;
12665 else
12666 return false;
12667 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12669 for (i = 0; i < nelt; i += 2)
12671 if (d->perm[i] != i + odd)
12672 return false;
12673 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12674 return false;
12677 /* Success! */
12678 if (d->testing_p)
12679 return true;
12681 in0 = d->op0;
12682 in1 = d->op1;
12683 if (BYTES_BIG_ENDIAN)
12685 x = in0, in0 = in1, in1 = x;
12686 odd = !odd;
12688 out = d->target;
12690 if (odd)
12692 switch (vmode)
12694 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12695 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12696 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12697 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12698 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12699 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12700 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12701 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12702 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12703 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12704 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12705 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12706 default:
12707 return false;
12710 else
12712 switch (vmode)
12714 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12715 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12716 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12717 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12718 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12719 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12720 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12721 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12722 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12723 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12724 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12725 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12726 default:
12727 return false;
12731 emit_insn (gen (out, in0, in1));
12732 return true;
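/* Example (added commentary): on V4SI, the little-endian selector
   {0, 4, 2, 6} matches with odd == 0 and emits TRN1, while {1, 5, 3, 7}
   matches with odd == 1 and emits TRN2.  */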
12735 /* Recognize patterns suitable for the UZP instructions. */
12736 static bool
12737 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12739 unsigned int i, odd, mask, nelt = d->nelt;
12740 rtx out, in0, in1, x;
12741 rtx (*gen) (rtx, rtx, rtx);
12742 machine_mode vmode = d->vmode;
12744 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12745 return false;
12747 /* Note that these are little-endian tests.
12748 We correct for big-endian later. */
12749 if (d->perm[0] == 0)
12750 odd = 0;
12751 else if (d->perm[0] == 1)
12752 odd = 1;
12753 else
12754 return false;
12755 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12757 for (i = 0; i < nelt; i++)
12759 unsigned elt = (i * 2 + odd) & mask;
12760 if (d->perm[i] != elt)
12761 return false;
12764 /* Success! */
12765 if (d->testing_p)
12766 return true;
12768 in0 = d->op0;
12769 in1 = d->op1;
12770 if (BYTES_BIG_ENDIAN)
12772 x = in0, in0 = in1, in1 = x;
12773 odd = !odd;
12775 out = d->target;
12777 if (odd)
12779 switch (vmode)
12781 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12782 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12783 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12784 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12785 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12786 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12787 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12788 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12789 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12790 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12791 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12792 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12793 default:
12794 return false;
12797 else
12799 switch (vmode)
12801 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12802 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12803 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12804 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12805 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12806 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12807 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12808 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12809 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12810 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12811 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12812 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12813 default:
12814 return false;
12818 emit_insn (gen (out, in0, in1));
12819 return true;
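/* Example (added commentary): on V4SI, the selector {0, 2, 4, 6} (the
   even-indexed elements of the concatenated inputs) emits UZP1, and
   {1, 3, 5, 7} emits UZP2.  */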
12822 /* Recognize patterns suitable for the ZIP instructions. */
12823 static bool
12824 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12826 unsigned int i, high, mask, nelt = d->nelt;
12827 rtx out, in0, in1, x;
12828 rtx (*gen) (rtx, rtx, rtx);
12829 machine_mode vmode = d->vmode;
12831 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12832 return false;
12834 /* Note that these are little-endian tests.
12835 We correct for big-endian later. */
12836 high = nelt / 2;
12837 if (d->perm[0] == high)
12838 /* Do Nothing. */
12840 else if (d->perm[0] == 0)
12841 high = 0;
12842 else
12843 return false;
12844 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12846 for (i = 0; i < nelt / 2; i++)
12848 unsigned elt = (i + high) & mask;
12849 if (d->perm[i * 2] != elt)
12850 return false;
12851 elt = (elt + nelt) & mask;
12852 if (d->perm[i * 2 + 1] != elt)
12853 return false;
12856 /* Success! */
12857 if (d->testing_p)
12858 return true;
12860 in0 = d->op0;
12861 in1 = d->op1;
12862 if (BYTES_BIG_ENDIAN)
12864 x = in0, in0 = in1, in1 = x;
12865 high = !high;
12867 out = d->target;
12869 if (high)
12871 switch (vmode)
12873 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12874 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12875 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12876 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12877 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12878 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12879 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12880 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12881 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12882 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12883 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12884 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12885 default:
12886 return false;
12889 else
12891 switch (vmode)
12893 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12894 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12895 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12896 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12897 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12898 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12899 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12900 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12901 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12902 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12903 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12904 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12905 default:
12906 return false;
12910 emit_insn (gen (out, in0, in1));
12911 return true;
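/* Example (added commentary): on V4SI, the selector {0, 4, 1, 5} interleaves
   the low halves of the two inputs and emits ZIP1; {2, 6, 3, 7} interleaves
   the high halves and emits ZIP2.  */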
12914 /* Recognize patterns for the EXT insn. */
12916 static bool
12917 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12919 unsigned int i, nelt = d->nelt;
12920 rtx (*gen) (rtx, rtx, rtx, rtx);
12921 rtx offset;
12923 unsigned int location = d->perm[0]; /* Always < nelt. */
12925 /* Check if the extracted indices are increasing by one. */
12926 for (i = 1; i < nelt; i++)
12928 unsigned int required = location + i;
12929 if (d->one_vector_p)
12931 /* We'll pass the same vector in twice, so allow indices to wrap. */
12932 required &= (nelt - 1);
12934 if (d->perm[i] != required)
12935 return false;
12938 switch (d->vmode)
12940 case V16QImode: gen = gen_aarch64_extv16qi; break;
12941 case V8QImode: gen = gen_aarch64_extv8qi; break;
12942 case V4HImode: gen = gen_aarch64_extv4hi; break;
12943 case V8HImode: gen = gen_aarch64_extv8hi; break;
12944 case V2SImode: gen = gen_aarch64_extv2si; break;
12945 case V4SImode: gen = gen_aarch64_extv4si; break;
12946 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12947 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12948 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12949 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12950 case V2DImode: gen = gen_aarch64_extv2di; break;
12951 case V2DFmode: gen = gen_aarch64_extv2df; break;
12952 default:
12953 return false;
12956 /* Success! */
12957 if (d->testing_p)
12958 return true;
12960 /* The case where (location == 0) is a no-op for both big- and little-endian,
12961 and is removed by the mid-end at optimization levels -O1 and higher. */
12963 if (BYTES_BIG_ENDIAN && (location != 0))
12965 /* After setup, we want the high elements of the first vector (stored
12966 at the LSB end of the register), and the low elements of the second
12967 vector (stored at the MSB end of the register). So swap. */
12968 std::swap (d->op0, d->op1);
12969 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12970 location = nelt - location;
12973 offset = GEN_INT (location);
12974 emit_insn (gen (d->target, d->op0, d->op1, offset));
12975 return true;
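/* Example (added commentary): on V4SI, the selector {1, 2, 3, 4} takes a
   contiguous window from the concatenation of the two inputs and emits an
   EXT of the two source registers; with a single input, {3, 0, 1, 2} is a
   rotation and also matches, because the indices are allowed to wrap.  */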
12978 /* Recognize patterns for the REV insns. */
12980 static bool
12981 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12983 unsigned int i, j, diff, nelt = d->nelt;
12984 rtx (*gen) (rtx, rtx);
12986 if (!d->one_vector_p)
12987 return false;
12989 diff = d->perm[0];
12990 switch (diff)
12992 case 7:
12993 switch (d->vmode)
12995 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12996 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12997 default:
12998 return false;
13000 break;
13001 case 3:
13002 switch (d->vmode)
13004 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13005 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13006 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13007 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13008 default:
13009 return false;
13011 break;
13012 case 1:
13013 switch (d->vmode)
13015 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13016 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13017 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13018 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13019 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13020 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13021 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13022 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13023 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13024 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13025 default:
13026 return false;
13028 break;
13029 default:
13030 return false;
13033 for (i = 0; i < nelt ; i += diff + 1)
13034 for (j = 0; j <= diff; j += 1)
13036 /* This is guaranteed to be true as the value of diff
13037 is 7, 3 or 1 and we should have enough elements in the
13038 queue to generate this.  Getting a vector mask with a
13039 diff value other than these implies that something has
13040 gone wrong by the time we get here. */
13041 gcc_assert (i + j < nelt);
13042 if (d->perm[i + j] != i + diff - j)
13043 return false;
13046 /* Success! */
13047 if (d->testing_p)
13048 return true;
13050 emit_insn (gen (d->target, d->op0));
13051 return true;
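/* Example (added commentary): on V8HI the selector {3, 2, 1, 0, 7, 6, 5, 4}
   gives diff == 3 and emits REV64, i.e. the halfwords are reversed within
   each 64-bit chunk of the vector.  */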
13054 static bool
13055 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13057 rtx (*gen) (rtx, rtx, rtx);
13058 rtx out = d->target;
13059 rtx in0;
13060 machine_mode vmode = d->vmode;
13061 unsigned int i, elt, nelt = d->nelt;
13062 rtx lane;
13064 elt = d->perm[0];
13065 for (i = 1; i < nelt; i++)
13067 if (elt != d->perm[i])
13068 return false;
13071 /* The generic preparation in aarch64_expand_vec_perm_const_1
13072 swaps the operand order and the permute indices if it finds
13073 d->perm[0] to be in the second operand. Thus, we can always
13074 use d->op0 and need not do any extra arithmetic to get the
13075 correct lane number. */
13076 in0 = d->op0;
13077 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13079 switch (vmode)
13081 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13082 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13083 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13084 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13085 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13086 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13087 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13088 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13089 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13090 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13091 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13092 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13093 default:
13094 return false;
13097 emit_insn (gen (out, in0, lane));
13098 return true;
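/* Example (added commentary): on V4SI the selector {2, 2, 2, 2} broadcasts
   element 2 of the (possibly swapped) first operand and emits a DUP such as
   "dup v0.4s, v1.s[2]".  */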
13101 static bool
13102 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13104 rtx rperm[MAX_VECT_LEN], sel;
13105 machine_mode vmode = d->vmode;
13106 unsigned int i, nelt = d->nelt;
13108 if (d->testing_p)
13109 return true;
13111 /* Generic code will try constant permutation twice: once with the
13112 original mode and again with the elements lowered to QImode.
13113 So wait and don't do the selector expansion ourselves. */
13114 if (vmode != V8QImode && vmode != V16QImode)
13115 return false;
13117 for (i = 0; i < nelt; ++i)
13119 int nunits = GET_MODE_NUNITS (vmode);
13121 /* If big-endian and two vectors we end up with a weird mixed-endian
13122 mode on NEON. Reverse the index within each word but not the word
13123 itself. */
13124 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13125 : d->perm[i]);
13127 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13128 sel = force_reg (vmode, sel);
13130 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13131 return true;
13134 static bool
13135 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13137 /* The pattern matching functions above are written to look for a small
13138 number to begin the sequence (0, 1, N/2). If we begin with an index
13139 from the second operand, we can swap the operands. */
13140 if (d->perm[0] >= d->nelt)
13142 unsigned i, nelt = d->nelt;
13144 gcc_assert (nelt == (nelt & -nelt));
13145 for (i = 0; i < nelt; ++i)
13146 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13148 std::swap (d->op0, d->op1);
13151 if (TARGET_SIMD)
13153 if (aarch64_evpc_rev (d))
13154 return true;
13155 else if (aarch64_evpc_ext (d))
13156 return true;
13157 else if (aarch64_evpc_dup (d))
13158 return true;
13159 else if (aarch64_evpc_zip (d))
13160 return true;
13161 else if (aarch64_evpc_uzp (d))
13162 return true;
13163 else if (aarch64_evpc_trn (d))
13164 return true;
13165 return aarch64_evpc_tbl (d);
13167 return false;
13170 /* Expand a vec_perm_const pattern. */
13172 bool
13173 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13175 struct expand_vec_perm_d d;
13176 int i, nelt, which;
13178 d.target = target;
13179 d.op0 = op0;
13180 d.op1 = op1;
13182 d.vmode = GET_MODE (target);
13183 gcc_assert (VECTOR_MODE_P (d.vmode));
13184 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13185 d.testing_p = false;
13187 for (i = which = 0; i < nelt; ++i)
13189 rtx e = XVECEXP (sel, 0, i);
13190 int ei = INTVAL (e) & (2 * nelt - 1);
13191 which |= (ei < nelt ? 1 : 2);
13192 d.perm[i] = ei;
13195 switch (which)
13197 default:
13198 gcc_unreachable ();
13200 case 3:
13201 d.one_vector_p = false;
13202 if (!rtx_equal_p (op0, op1))
13203 break;
13205 /* The elements of PERM do not suggest that only the first operand
13206 is used, but both operands are identical. Allow easier matching
13207 of the permutation by folding the permutation into the single
13208 input vector. */
13209 /* Fall Through. */
13210 case 2:
13211 for (i = 0; i < nelt; ++i)
13212 d.perm[i] &= nelt - 1;
13213 d.op0 = op1;
13214 d.one_vector_p = true;
13215 break;
13217 case 1:
13218 d.op1 = op0;
13219 d.one_vector_p = true;
13220 break;
13223 return aarch64_expand_vec_perm_const_1 (&d);
13226 static bool
13227 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13228 const unsigned char *sel)
13230 struct expand_vec_perm_d d;
13231 unsigned int i, nelt, which;
13232 bool ret;
13234 d.vmode = vmode;
13235 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13236 d.testing_p = true;
13237 memcpy (d.perm, sel, nelt);
13239 /* Calculate whether all elements are in one vector. */
13240 for (i = which = 0; i < nelt; ++i)
13242 unsigned char e = d.perm[i];
13243 gcc_assert (e < 2 * nelt);
13244 which |= (e < nelt ? 1 : 2);
13247 /* If all elements are from the second vector, reindex as if from the
13248 first vector. */
13249 if (which == 2)
13250 for (i = 0; i < nelt; ++i)
13251 d.perm[i] -= nelt;
13253 /* Check whether the mask can be applied to a single vector. */
13254 d.one_vector_p = (which != 3);
13256 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13257 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13258 if (!d.one_vector_p)
13259 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13261 start_sequence ();
13262 ret = aarch64_expand_vec_perm_const_1 (&d);
13263 end_sequence ();
13265 return ret;
13269 aarch64_reverse_mask (enum machine_mode mode)
13271 /* We have to reverse each vector because we don't have
13272 a permuted load that can reverse-load according to ABI rules. */
13273 rtx mask;
13274 rtvec v = rtvec_alloc (16);
13275 int i, j;
13276 int nunits = GET_MODE_NUNITS (mode);
13277 int usize = GET_MODE_UNIT_SIZE (mode);
13279 gcc_assert (BYTES_BIG_ENDIAN);
13280 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13282 for (i = 0; i < nunits; i++)
13283 for (j = 0; j < usize; j++)
13284 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13285 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13286 return force_reg (V16QImode, mask);
13289 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13290 However due to issues with register allocation it is preferable to avoid
13291 tying integer scalar and FP scalar modes.  Executing integer operations
13292 in general registers is better than treating them as scalar vector
13293 operations. This reduces latency and avoids redundant int<->FP moves.
13294 So tie modes if they are either the same class, or vector modes with
13295 other vector modes, vector structs or any scalar mode.
13298 bool
13299 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13301 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13302 return true;
13304 /* We specifically want to allow elements of "structure" modes to
13305 be tieable to the structure. This more general condition allows
13306 other rarer situations too. */
13307 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13308 return true;
13310 /* Also allow any scalar modes with vectors. */
13311 if (aarch64_vector_mode_supported_p (mode1)
13312 || aarch64_vector_mode_supported_p (mode2))
13313 return true;
13315 return false;
13318 /* Return a new RTX holding the result of moving POINTER forward by
13319 AMOUNT bytes. */
13321 static rtx
13322 aarch64_move_pointer (rtx pointer, int amount)
13324 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13326 return adjust_automodify_address (pointer, GET_MODE (pointer),
13327 next, amount);
13330 /* Return a new RTX holding the result of moving POINTER forward by the
13331 size of the mode it points to. */
13333 static rtx
13334 aarch64_progress_pointer (rtx pointer)
13336 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13338 return aarch64_move_pointer (pointer, amount);
13341 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13342 MODE bytes. */
13344 static void
13345 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13346 machine_mode mode)
13348 rtx reg = gen_reg_rtx (mode);
13350 /* "Cast" the pointers to the correct mode. */
13351 *src = adjust_address (*src, mode, 0);
13352 *dst = adjust_address (*dst, mode, 0);
13353 /* Emit the memcpy. */
13354 emit_move_insn (reg, *src);
13355 emit_move_insn (*dst, reg);
13356 /* Move the pointers forward. */
13357 *src = aarch64_progress_pointer (*src);
13358 *dst = aarch64_progress_pointer (*dst);
13361 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13362 we succeed, otherwise return false. */
13364 bool
13365 aarch64_expand_movmem (rtx *operands)
13367 unsigned int n;
13368 rtx dst = operands[0];
13369 rtx src = operands[1];
13370 rtx base;
13371 bool speed_p = !optimize_function_for_size_p (cfun);
13373 /* When optimizing for size, give a better estimate of the length of a
13374 memcpy call, but use the default otherwise. */
13375 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13377 /* We can't do anything smart if the amount to copy is not constant. */
13378 if (!CONST_INT_P (operands[2]))
13379 return false;
13381 n = UINTVAL (operands[2]);
13383 /* Try to keep the number of instructions low. For cases below 16 bytes we
13384 need to make at most two moves. For cases above 16 bytes it will be one
13385 move for each 16 byte chunk, then at most two additional moves. */
13386 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13387 return false;
13389 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13390 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13392 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13393 src = adjust_automodify_address (src, VOIDmode, base, 0);
13395 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13396 1-byte chunk. */
13397 if (n < 4)
13399 if (n >= 2)
13401 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13402 n -= 2;
13405 if (n == 1)
13406 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13408 return true;
13411 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13412 4-byte chunk, partially overlapping with the previously copied chunk. */
13413 if (n < 8)
13415 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13416 n -= 4;
13417 if (n > 0)
13419 int move = n - 4;
13421 src = aarch64_move_pointer (src, move);
13422 dst = aarch64_move_pointer (dst, move);
13423 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13425 return true;
13428 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13429 them, then (if applicable) an 8-byte chunk. */
13430 while (n >= 8)
13432 if (n / 16)
13434 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13435 n -= 16;
13437 else
13439 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13440 n -= 8;
13444 /* Finish the final bytes of the copy. We can always do this in one
13445 instruction. We either copy the exact amount we need, or partially
13446 overlap with the previous chunk we copied and copy 8 bytes.
13447 if (n == 0)
13448 return true;
13449 else if (n == 1)
13450 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13451 else if (n == 2)
13452 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13453 else if (n == 4)
13454 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13455 else
13457 if (n == 3)
13459 src = aarch64_move_pointer (src, -1);
13460 dst = aarch64_move_pointer (dst, -1);
13461 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13463 else
13465 int move = n - 8;
13467 src = aarch64_move_pointer (src, move);
13468 dst = aarch64_move_pointer (dst, move);
13469 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13473 return true;
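/* Worked example (added commentary): a constant 15-byte memcpy is expanded
   by the code above as one 8-byte load/store at offset 0 followed by a
   second 8-byte load/store at offset 7, so the two copies overlap by one
   byte and no 1/2/4-byte tail moves are needed.  */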
13476 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13477 SImode stores. Handle the case when the constant has identical
13478 bottom and top halves. This is beneficial when the two stores can be
13479 merged into an STP and we avoid synthesising potentially expensive
13480 immediates twice. Return true if such a split is possible. */
13482 bool
13483 aarch64_split_dimode_const_store (rtx dst, rtx src)
13485 rtx lo = gen_lowpart (SImode, src);
13486 rtx hi = gen_highpart_mode (SImode, DImode, src);
13488 bool size_p = optimize_function_for_size_p (cfun);
13490 if (!rtx_equal_p (lo, hi))
13491 return false;
13493 unsigned int orig_cost
13494 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13495 unsigned int lo_cost
13496 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13498 /* We want to transform:
13499 MOV x1, 49370
13500 MOVK x1, 0x140, lsl 16
13501 MOVK x1, 0xc0da, lsl 32
13502 MOVK x1, 0x140, lsl 48
13503 STR x1, [x0]
13504 into:
13505 MOV w1, 49370
13506 MOVK w1, 0x140, lsl 16
13507 STP w1, w1, [x0]
13508 So we want to perform this only when we save two instructions
13509 or more. When optimizing for size, however, accept any code size
13510 savings we can. */
13511 if (size_p && orig_cost <= lo_cost)
13512 return false;
13514 if (!size_p
13515 && (orig_cost <= lo_cost + 1))
13516 return false;
13518 rtx mem_lo = adjust_address (dst, SImode, 0);
13519 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13520 return false;
13522 rtx tmp_reg = gen_reg_rtx (SImode);
13523 aarch64_expand_mov_immediate (tmp_reg, lo);
13524 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13525 /* Don't emit an explicit store pair as this may not always be profitable.
13526 Let the sched-fusion logic decide whether to merge them. */
13527 emit_move_insn (mem_lo, tmp_reg);
13528 emit_move_insn (mem_hi, tmp_reg);
13530 return true;
13533 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13535 static unsigned HOST_WIDE_INT
13536 aarch64_asan_shadow_offset (void)
13538 return (HOST_WIDE_INT_1 << 36);
13541 static bool
13542 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13543 unsigned int align,
13544 enum by_pieces_operation op,
13545 bool speed_p)
13547 /* STORE_BY_PIECES can be used when copying a constant string, but
13548 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13549 For now we always fail this and let the move_by_pieces code copy
13550 the string from read-only memory. */
13551 if (op == STORE_BY_PIECES)
13552 return false;
13554 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13557 static rtx
13558 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13559 int code, tree treeop0, tree treeop1)
13561 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13562 rtx op0, op1;
13563 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13564 insn_code icode;
13565 struct expand_operand ops[4];
13567 start_sequence ();
13568 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13570 op_mode = GET_MODE (op0);
13571 if (op_mode == VOIDmode)
13572 op_mode = GET_MODE (op1);
13574 switch (op_mode)
13576 case QImode:
13577 case HImode:
13578 case SImode:
13579 cmp_mode = SImode;
13580 icode = CODE_FOR_cmpsi;
13581 break;
13583 case DImode:
13584 cmp_mode = DImode;
13585 icode = CODE_FOR_cmpdi;
13586 break;
13588 case SFmode:
13589 cmp_mode = SFmode;
13590 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13591 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13592 break;
13594 case DFmode:
13595 cmp_mode = DFmode;
13596 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13597 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13598 break;
13600 default:
13601 end_sequence ();
13602 return NULL_RTX;
13605 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13606 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13607 if (!op0 || !op1)
13609 end_sequence ();
13610 return NULL_RTX;
13612 *prep_seq = get_insns ();
13613 end_sequence ();
13615 create_fixed_operand (&ops[0], op0);
13616 create_fixed_operand (&ops[1], op1);
13618 start_sequence ();
13619 if (!maybe_expand_insn (icode, 2, ops))
13621 end_sequence ();
13622 return NULL_RTX;
13624 *gen_seq = get_insns ();
13625 end_sequence ();
13627 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13628 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13631 static rtx
13632 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13633 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13635 rtx op0, op1, target;
13636 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13637 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13638 insn_code icode;
13639 struct expand_operand ops[6];
13640 int aarch64_cond;
13642 push_to_sequence (*prep_seq);
13643 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13645 op_mode = GET_MODE (op0);
13646 if (op_mode == VOIDmode)
13647 op_mode = GET_MODE (op1);
13649 switch (op_mode)
13651 case QImode:
13652 case HImode:
13653 case SImode:
13654 cmp_mode = SImode;
13655 icode = CODE_FOR_ccmpsi;
13656 break;
13658 case DImode:
13659 cmp_mode = DImode;
13660 icode = CODE_FOR_ccmpdi;
13661 break;
13663 case SFmode:
13664 cmp_mode = SFmode;
13665 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13666 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13667 break;
13669 case DFmode:
13670 cmp_mode = DFmode;
13671 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13672 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13673 break;
13675 default:
13676 end_sequence ();
13677 return NULL_RTX;
13680 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13681 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13682 if (!op0 || !op1)
13684 end_sequence ();
13685 return NULL_RTX;
13687 *prep_seq = get_insns ();
13688 end_sequence ();
13690 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13691 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13693 if (bit_code != AND)
13695 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13696 GET_MODE (XEXP (prev, 0))),
13697 VOIDmode, XEXP (prev, 0), const0_rtx);
13698 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13701 create_fixed_operand (&ops[0], XEXP (prev, 0));
13702 create_fixed_operand (&ops[1], target);
13703 create_fixed_operand (&ops[2], op0);
13704 create_fixed_operand (&ops[3], op1);
13705 create_fixed_operand (&ops[4], prev);
13706 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13708 push_to_sequence (*gen_seq);
13709 if (!maybe_expand_insn (icode, 6, ops))
13711 end_sequence ();
13712 return NULL_RTX;
13715 *gen_seq = get_insns ();
13716 end_sequence ();
13718 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
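/* Illustrative use (added commentary): for a condition such as
   "a == 0 && b > 5" the middle end calls aarch64_gen_ccmp_first for the
   first comparison and aarch64_gen_ccmp_next for the chained one, giving
   roughly

       cmp   w0, 0
       ccmp  w1, 5, <nzcv>, eq     // flags substituted when a != 0
       b.gt  ...                   // or a cset, depending on the user

   where the NZCV immediate is derived from aarch64_cond above so that the
   chained test fails whenever the first comparison short-circuits.  */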
13721 #undef TARGET_GEN_CCMP_FIRST
13722 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13724 #undef TARGET_GEN_CCMP_NEXT
13725 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13727 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13728 instruction fusion of some sort. */
13730 static bool
13731 aarch64_macro_fusion_p (void)
13733 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13737 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13738 should be kept together during scheduling. */
13740 static bool
13741 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13743 rtx set_dest;
13744 rtx prev_set = single_set (prev);
13745 rtx curr_set = single_set (curr);
13746 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13747 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13749 if (!aarch64_macro_fusion_p ())
13750 return false;
13752 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13754 /* We are trying to match:
13755 prev (mov) == (set (reg r0) (const_int imm16))
13756 curr (movk) == (set (zero_extract (reg r0)
13757 (const_int 16)
13758 (const_int 16))
13759 (const_int imm16_1)) */
13761 set_dest = SET_DEST (curr_set);
13763 if (GET_CODE (set_dest) == ZERO_EXTRACT
13764 && CONST_INT_P (SET_SRC (curr_set))
13765 && CONST_INT_P (SET_SRC (prev_set))
13766 && CONST_INT_P (XEXP (set_dest, 2))
13767 && INTVAL (XEXP (set_dest, 2)) == 16
13768 && REG_P (XEXP (set_dest, 0))
13769 && REG_P (SET_DEST (prev_set))
13770 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13772 return true;
13776 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13779 /* We're trying to match:
13780 prev (adrp) == (set (reg r1)
13781 (high (symbol_ref ("SYM"))))
13782 curr (add) == (set (reg r0)
13783 (lo_sum (reg r1)
13784 (symbol_ref ("SYM"))))
13785 Note that r0 need not necessarily be the same as r1, especially
13786 during pre-regalloc scheduling. */
13788 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13789 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13791 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13792 && REG_P (XEXP (SET_SRC (curr_set), 0))
13793 && REGNO (XEXP (SET_SRC (curr_set), 0))
13794 == REGNO (SET_DEST (prev_set))
13795 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13796 XEXP (SET_SRC (curr_set), 1)))
13797 return true;
13801 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13804 /* We're trying to match:
13805 prev (movk) == (set (zero_extract (reg r0)
13806 (const_int 16)
13807 (const_int 32))
13808 (const_int imm16_1))
13809 curr (movk) == (set (zero_extract (reg r0)
13810 (const_int 16)
13811 (const_int 48))
13812 (const_int imm16_2)) */
13814 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13815 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13816 && REG_P (XEXP (SET_DEST (prev_set), 0))
13817 && REG_P (XEXP (SET_DEST (curr_set), 0))
13818 && REGNO (XEXP (SET_DEST (prev_set), 0))
13819 == REGNO (XEXP (SET_DEST (curr_set), 0))
13820 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13821 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13822 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13823 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13824 && CONST_INT_P (SET_SRC (prev_set))
13825 && CONST_INT_P (SET_SRC (curr_set)))
13826 return true;
13829 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13831 /* We're trying to match:
13832 prev (adrp) == (set (reg r0)
13833 (high (symbol_ref ("SYM"))))
13834 curr (ldr) == (set (reg r1)
13835 (mem (lo_sum (reg r0)
13836 (symbol_ref ("SYM")))))
13838 curr (ldr) == (set (reg r1)
13839 (zero_extend (mem
13840 (lo_sum (reg r0)
13841 (symbol_ref ("SYM")))))) */
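/* Illustrative assembly form of the pattern above (hypothetical symbol and
   registers):
     adrp	x0, SYM
     ldr	x1, [x0, #:lo12:SYM]  */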
13842 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13843 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13845 rtx curr_src = SET_SRC (curr_set);
13847 if (GET_CODE (curr_src) == ZERO_EXTEND)
13848 curr_src = XEXP (curr_src, 0);
13850 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13851 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13852 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13853 == REGNO (SET_DEST (prev_set))
13854 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13855 XEXP (SET_SRC (prev_set), 0)))
13856 return true;
13860 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13861 && aarch_crypto_can_dual_issue (prev, curr))
13862 return true;
13864 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13865 && any_condjump_p (curr))
13867 enum attr_type prev_type = get_attr_type (prev);
13869 /* FIXME: this misses some instructions that are considered simple
13870 arithmetic for ThunderX.  Simple shifts are missed here. */
13871 if (prev_type == TYPE_ALUS_SREG
13872 || prev_type == TYPE_ALUS_IMM
13873 || prev_type == TYPE_LOGICS_REG
13874 || prev_type == TYPE_LOGICS_IMM)
13875 return true;
13878 return false;
13881 /* Return true iff the instruction fusion described by OP is enabled. */
13883 bool
13884 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13886 return (aarch64_tune_params.fusible_ops & op) != 0;
13889 /* If MEM is in the form of [base+offset], extract the two parts of the
13890 address into BASE and OFFSET and return true; otherwise clear BASE and
13891 OFFSET and return false. */
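/* For example (illustrative only): for a MEM whose address is
   (plus (reg x1) (const_int 8)) this sets *BASE to (reg x1) and *OFFSET to
   (const_int 8); for a plain (reg x1) address *OFFSET becomes const0_rtx;
   for any other address form both are cleared and false is returned.  */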
13893 bool
13894 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13896 rtx addr;
13898 gcc_assert (MEM_P (mem));
13900 addr = XEXP (mem, 0);
13902 if (REG_P (addr))
13904 *base = addr;
13905 *offset = const0_rtx;
13906 return true;
13909 if (GET_CODE (addr) == PLUS
13910 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13912 *base = XEXP (addr, 0);
13913 *offset = XEXP (addr, 1);
13914 return true;
13917 *base = NULL_RTX;
13918 *offset = NULL_RTX;
13920 return false;
13923 /* Types for scheduling fusion. */
13924 enum sched_fusion_type
13926 SCHED_FUSION_NONE = 0,
13927 SCHED_FUSION_LD_SIGN_EXTEND,
13928 SCHED_FUSION_LD_ZERO_EXTEND,
13929 SCHED_FUSION_LD,
13930 SCHED_FUSION_ST,
13931 SCHED_FUSION_NUM
13934 /* If INSN is a load or store with an address in the form of [base+offset],
13935 extract the two parts into BASE and OFFSET.  Return the scheduling
13936 fusion type of INSN. */
13938 static enum sched_fusion_type
13939 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13941 rtx x, dest, src;
13942 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13944 gcc_assert (INSN_P (insn));
13945 x = PATTERN (insn);
13946 if (GET_CODE (x) != SET)
13947 return SCHED_FUSION_NONE;
13949 src = SET_SRC (x);
13950 dest = SET_DEST (x);
13952 machine_mode dest_mode = GET_MODE (dest);
13954 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13955 return SCHED_FUSION_NONE;
13957 if (GET_CODE (src) == SIGN_EXTEND)
13959 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13960 src = XEXP (src, 0);
13961 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13962 return SCHED_FUSION_NONE;
13964 else if (GET_CODE (src) == ZERO_EXTEND)
13966 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13967 src = XEXP (src, 0);
13968 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13969 return SCHED_FUSION_NONE;
13972 if (GET_CODE (src) == MEM && REG_P (dest))
13973 extract_base_offset_in_addr (src, base, offset);
13974 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13976 fusion = SCHED_FUSION_ST;
13977 extract_base_offset_in_addr (dest, base, offset);
13979 else
13980 return SCHED_FUSION_NONE;
13982 if (*base == NULL_RTX || *offset == NULL_RTX)
13983 fusion = SCHED_FUSION_NONE;
13985 return fusion;
13988 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13990 Currently we only support fusing ldr and str instructions, so FUSION_PRI
13991 and PRI are only calculated for these instructions.  For other instructions,
13992 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13993 types of instruction fusion can be added by returning different priorities.
13995 It's important that irrelevant instructions get the largest FUSION_PRI. */
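/* A hedged, illustrative example: for two SImode loads from [x1, #8] and
   [x1, #12] (hypothetical), fusion_load_store returns the same fusion type
   and base register, so both get the same FUSION_PRI; the load with the
   smaller offset (#8) gets the larger PRI below and is therefore preferred
   first, keeping the pair adjacent for later ldp formation.  */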
13997 static void
13998 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13999 int *fusion_pri, int *pri)
14001 int tmp, off_val;
14002 rtx base, offset;
14003 enum sched_fusion_type fusion;
14005 gcc_assert (INSN_P (insn));
14007 tmp = max_pri - 1;
14008 fusion = fusion_load_store (insn, &base, &offset);
14009 if (fusion == SCHED_FUSION_NONE)
14011 *pri = tmp;
14012 *fusion_pri = tmp;
14013 return;
14016 /* Set FUSION_PRI according to fusion type and base register. */
14017 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14019 /* Calculate PRI. */
14020 tmp /= 2;
14022 /* INSN with smaller offset goes first. */
14023 off_val = (int)(INTVAL (offset));
14024 if (off_val >= 0)
14025 tmp -= (off_val & 0xfffff);
14026 else
14027 tmp += ((- off_val) & 0xfffff);
14029 *pri = tmp;
14030 return;
14033 /* Given OPERANDS of consecutive load/store, check if we can merge
14034 them into ldp/stp. LOAD is true if they are load instructions.
14035 MODE is the mode of memory operands. */
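/* For instance (illustrative only), two SImode loads
     ldr	w0, [x2, 4]
     ldr	w1, [x2, 8]
   use the same base, consecutive offsets and the same register class, so
   this check allows them to be merged into "ldp w0, w1, [x2, 4]".  */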
14037 bool
14038 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14039 enum machine_mode mode)
14041 HOST_WIDE_INT offval_1, offval_2, msize;
14042 enum reg_class rclass_1, rclass_2;
14043 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14045 if (load)
14047 mem_1 = operands[1];
14048 mem_2 = operands[3];
14049 reg_1 = operands[0];
14050 reg_2 = operands[2];
14051 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14052 if (REGNO (reg_1) == REGNO (reg_2))
14053 return false;
14055 else
14057 mem_1 = operands[0];
14058 mem_2 = operands[2];
14059 reg_1 = operands[1];
14060 reg_2 = operands[3];
14063 /* The mems cannot be volatile. */
14064 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14065 return false;
14067 /* If we have SImode and slow unaligned ldp,
14068 check that the alignment is at least 8 bytes. */
14069 if (mode == SImode
14070 && (aarch64_tune_params.extra_tuning_flags
14071 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14072 && !optimize_size
14073 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14074 return false;
14076 /* Check if the addresses are in the form of [base+offset]. */
14077 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14078 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14079 return false;
14080 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14081 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14082 return false;
14084 /* Check if the bases are the same. */
14085 if (!rtx_equal_p (base_1, base_2))
14086 return false;
14088 offval_1 = INTVAL (offset_1);
14089 offval_2 = INTVAL (offset_2);
14090 msize = GET_MODE_SIZE (mode);
14091 /* Check if the offsets are consecutive. */
14092 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14093 return false;
14095 /* Check if the addresses are clobbered by load. */
14096 if (load)
14098 if (reg_mentioned_p (reg_1, mem_1))
14099 return false;
14101 /* In increasing order, the last load can clobber the address. */
14102 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14103 return false;
14106 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14107 rclass_1 = FP_REGS;
14108 else
14109 rclass_1 = GENERAL_REGS;
14111 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14112 rclass_2 = FP_REGS;
14113 else
14114 rclass_2 = GENERAL_REGS;
14116 /* Check if the registers are of the same class. */
14117 if (rclass_1 != rclass_2)
14118 return false;
14120 return true;
14123 /* Given OPERANDS of consecutive load/store, check if we can merge
14124 them into ldp/stp by adjusting the offset. LOAD is true if they
14125 are load instructions. MODE is the mode of memory operands.
14127 Given the consecutive stores below:
14129 str w1, [xb, 0x100]
14130 str w1, [xb, 0x104]
14131 str w1, [xb, 0x108]
14132 str w1, [xb, 0x10c]
14134 Though the offsets are out of the range supported by stp, we can
14135 still pair them after adjusting the offset, like:
14137 add scratch, xb, 0x100
14138 stp w1, w1, [scratch]
14139 stp w1, w1, [scratch, 0x8]
14141 The peephole patterns detecting this opportunity should guarantee
14142 the scratch register is available. */
14144 bool
14145 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14146 enum machine_mode mode)
14148 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14149 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14150 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14151 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14153 if (load)
14155 reg_1 = operands[0];
14156 mem_1 = operands[1];
14157 reg_2 = operands[2];
14158 mem_2 = operands[3];
14159 reg_3 = operands[4];
14160 mem_3 = operands[5];
14161 reg_4 = operands[6];
14162 mem_4 = operands[7];
14163 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14164 && REG_P (reg_3) && REG_P (reg_4));
14165 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14166 return false;
14168 else
14170 mem_1 = operands[0];
14171 reg_1 = operands[1];
14172 mem_2 = operands[2];
14173 reg_2 = operands[3];
14174 mem_3 = operands[4];
14175 reg_3 = operands[5];
14176 mem_4 = operands[6];
14177 reg_4 = operands[7];
14179 /* Skip if the memory operand is by itself valid for ldp/stp. */
14180 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14181 return false;
14183 /* The mems cannot be volatile. */
14184 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14185 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14186 return false;
14188 /* Check if the addresses are in the form of [base+offset]. */
14189 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14190 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14191 return false;
14192 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14193 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14194 return false;
14195 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14196 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14197 return false;
14198 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14199 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14200 return false;
14202 /* Check if the bases are the same. */
14203 if (!rtx_equal_p (base_1, base_2)
14204 || !rtx_equal_p (base_2, base_3)
14205 || !rtx_equal_p (base_3, base_4))
14206 return false;
14208 offval_1 = INTVAL (offset_1);
14209 offval_2 = INTVAL (offset_2);
14210 offval_3 = INTVAL (offset_3);
14211 offval_4 = INTVAL (offset_4);
14212 msize = GET_MODE_SIZE (mode);
14213 /* Check if the offsets are consecutive. */
14214 if ((offval_1 != (offval_2 + msize)
14215 || offval_1 != (offval_3 + msize * 2)
14216 || offval_1 != (offval_4 + msize * 3))
14217 && (offval_4 != (offval_3 + msize)
14218 || offval_4 != (offval_2 + msize * 2)
14219 || offval_4 != (offval_1 + msize * 3)))
14220 return false;
14222 /* Check if the addresses are clobbered by load. */
14223 if (load)
14225 if (reg_mentioned_p (reg_1, mem_1)
14226 || reg_mentioned_p (reg_2, mem_2)
14227 || reg_mentioned_p (reg_3, mem_3))
14228 return false;
14230 /* In increasing order, the last load can clobber the address. */
14231 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14232 return false;
14235 /* If we have SImode and slow unaligned ldp,
14236 check that the alignment is at least 8 bytes. */
14237 if (mode == SImode
14238 && (aarch64_tune_params.extra_tuning_flags
14239 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14240 && !optimize_size
14241 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14242 return false;
14244 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14245 rclass_1 = FP_REGS;
14246 else
14247 rclass_1 = GENERAL_REGS;
14249 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14250 rclass_2 = FP_REGS;
14251 else
14252 rclass_2 = GENERAL_REGS;
14254 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14255 rclass_3 = FP_REGS;
14256 else
14257 rclass_3 = GENERAL_REGS;
14259 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14260 rclass_4 = FP_REGS;
14261 else
14262 rclass_4 = GENERAL_REGS;
14264 /* Check if the registers are of the same class. */
14265 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14266 return false;
14268 return true;
14271 /* Given OPERANDS of consecutive load/store, this function pairs them
14272 into ldp/stp after adjusting the offset. It depends on the fact
14273 that addresses of load/store instructions are in increasing order.
14274 MODE is the mode of memory operands. CODE is the rtl operator
14275 which should be applied to all memory operands; it is SIGN_EXTEND,
14276 ZERO_EXTEND or UNKNOWN. */
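/* A hedged walk-through of the example shown above
   aarch64_operands_adjust_ok_for_ldpstp (SImode stores at xb+0x100..0x10c):
   msize is 4, so the pairing limit used below is 4 * 0x40 = 0x100.
   off_val is 0x100, hence new_off = 0x100 % 0x100 = 0 and adj_off = 0x100,
   which fits in an ADD immediate (< 0x1000).  The code therefore emits
   "add scratch, xb, 0x100" followed by two stp instructions at offsets
   0 and 8 from the scratch register.  */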
14278 bool
14279 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14280 enum machine_mode mode, RTX_CODE code)
14282 rtx base, offset, t1, t2;
14283 rtx mem_1, mem_2, mem_3, mem_4;
14284 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14286 if (load)
14288 mem_1 = operands[1];
14289 mem_2 = operands[3];
14290 mem_3 = operands[5];
14291 mem_4 = operands[7];
14293 else
14295 mem_1 = operands[0];
14296 mem_2 = operands[2];
14297 mem_3 = operands[4];
14298 mem_4 = operands[6];
14299 gcc_assert (code == UNKNOWN);
14302 extract_base_offset_in_addr (mem_1, &base, &offset);
14303 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14305 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14306 msize = GET_MODE_SIZE (mode);
14307 stp_off_limit = msize * 0x40;
14308 off_val = INTVAL (offset);
14309 abs_off = (off_val < 0) ? -off_val : off_val;
14310 new_off = abs_off % stp_off_limit;
14311 adj_off = abs_off - new_off;
14313 /* Further adjust to make sure all offsets are OK. */
14314 if ((new_off + msize * 2) >= stp_off_limit)
14316 adj_off += stp_off_limit;
14317 new_off -= stp_off_limit;
14320 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14321 if (adj_off >= 0x1000)
14322 return false;
14324 if (off_val < 0)
14326 adj_off = -adj_off;
14327 new_off = -new_off;
14330 /* Create new memory references. */
14331 mem_1 = change_address (mem_1, VOIDmode,
14332 plus_constant (DImode, operands[8], new_off));
14334 /* Check if the adjusted address is OK for ldp/stp. */
14335 if (!aarch64_mem_pair_operand (mem_1, mode))
14336 return false;
14338 msize = GET_MODE_SIZE (mode);
14339 mem_2 = change_address (mem_2, VOIDmode,
14340 plus_constant (DImode,
14341 operands[8],
14342 new_off + msize));
14343 mem_3 = change_address (mem_3, VOIDmode,
14344 plus_constant (DImode,
14345 operands[8],
14346 new_off + msize * 2));
14347 mem_4 = change_address (mem_4, VOIDmode,
14348 plus_constant (DImode,
14349 operands[8],
14350 new_off + msize * 3));
14352 if (code == ZERO_EXTEND)
14354 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14355 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14356 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14357 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14359 else if (code == SIGN_EXTEND)
14361 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14362 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14363 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14364 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14367 if (load)
14369 operands[1] = mem_1;
14370 operands[3] = mem_2;
14371 operands[5] = mem_3;
14372 operands[7] = mem_4;
14374 else
14376 operands[0] = mem_1;
14377 operands[2] = mem_2;
14378 operands[4] = mem_3;
14379 operands[6] = mem_4;
14382 /* Emit adjusting instruction. */
14383 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14384 /* Emit ldp/stp instructions. */
14385 t1 = gen_rtx_SET (operands[0], operands[1]);
14386 t2 = gen_rtx_SET (operands[2], operands[3]);
14387 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14388 t1 = gen_rtx_SET (operands[4], operands[5]);
14389 t2 = gen_rtx_SET (operands[6], operands[7]);
14390 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14391 return true;
14394 /* Return true if a pseudo register should be created and used to hold
14395 the GOT address for PIC code. */
14397 bool
14398 aarch64_use_pseudo_pic_reg (void)
14400 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14403 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14405 static int
14406 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14408 switch (XINT (x, 1))
14410 case UNSPEC_GOTSMALLPIC:
14411 case UNSPEC_GOTSMALLPIC28K:
14412 case UNSPEC_GOTTINYPIC:
14413 return 0;
14414 default:
14415 break;
14418 return default_unspec_may_trap_p (x, flags);
14422 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14423 return the log2 of that value. Otherwise return -1. */
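/* Illustrative values: 4.0 yields 2, 1.0 yields 0, while -2.0, 0.5 and 3.0
   all yield -1 (negative, non-integral and non-power-of-2 respectively).  */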
14426 aarch64_fpconst_pow_of_2 (rtx x)
14428 const REAL_VALUE_TYPE *r;
14430 if (!CONST_DOUBLE_P (x))
14431 return -1;
14433 r = CONST_DOUBLE_REAL_VALUE (x);
14435 if (REAL_VALUE_NEGATIVE (*r)
14436 || REAL_VALUE_ISNAN (*r)
14437 || REAL_VALUE_ISINF (*r)
14438 || !real_isinteger (r, DFmode))
14439 return -1;
14441 return exact_log2 (real_to_integer (r));
14444 /* If X is a vector of equal CONST_DOUBLE values and that value is
14445 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14448 aarch64_vec_fpconst_pow_of_2 (rtx x)
14450 if (GET_CODE (x) != CONST_VECTOR)
14451 return -1;
14453 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14454 return -1;
14456 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14457 if (firstval <= 0)
14458 return -1;
14460 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14461 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14462 return -1;
14464 return firstval;
14467 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14468 to float.
14470 __fp16 always promotes through this hook.
14471 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14472 through the generic excess precision logic rather than here. */
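/* For example (illustrative): given "__fp16 a, b;", the expression "a + b"
   is evaluated in float and only converted back to __fp16 when the result
   is stored, in line with the storage-only ACLE semantics of __fp16.  */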
14474 static tree
14475 aarch64_promoted_type (const_tree t)
14477 if (SCALAR_FLOAT_TYPE_P (t)
14478 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14479 return float_type_node;
14481 return NULL_TREE;
14484 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14486 static bool
14487 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14488 optimization_type opt_type)
14490 switch (op)
14492 case rsqrt_optab:
14493 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14495 default:
14496 return true;
14500 /* Implement TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P - return TRUE
14501 if MODE is HFmode, and punt to the generic implementation otherwise. */
14503 static bool
14504 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14506 return (mode == HFmode
14507 ? true
14508 : default_libgcc_floating_mode_supported_p (mode));
14511 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14512 if MODE is HFmode, and punt to the generic implementation otherwise. */
14514 static bool
14515 aarch64_scalar_mode_supported_p (machine_mode mode)
14517 return (mode == HFmode
14518 ? true
14519 : default_scalar_mode_supported_p (mode));
14522 /* Set the value of FLT_EVAL_METHOD.
14523 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14525 0: evaluate all operations and constants, whose semantic type has at
14526 most the range and precision of type float, to the range and
14527 precision of float; evaluate all other operations and constants to
14528 the range and precision of the semantic type;
14530 N, where _FloatN is a supported interchange floating type:
14531 evaluate all operations and constants, whose semantic type has at
14532 most the range and precision of _FloatN type, to the range and
14533 precision of the _FloatN type; evaluate all other operations and
14534 constants to the range and precision of the semantic type;
14536 If we have the ARMv8.2-A extensions then we support _Float16 in native
14537 precision, so we should set this to 16. Otherwise, we support the type,
14538 but want to evaluate expressions in float precision, so set this to
14539 0. */
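/* As an illustration (hypothetical code): for "_Float16 x, y;", "x + y" is
   evaluated directly in half precision when TARGET_FP_F16INST is available
   (FLT_EVAL_METHOD of 16), and in float precision otherwise
   (FLT_EVAL_METHOD of 0), with the result converted back as needed.  */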
14541 static enum flt_eval_method
14542 aarch64_excess_precision (enum excess_precision_type type)
14544 switch (type)
14546 case EXCESS_PRECISION_TYPE_FAST:
14547 case EXCESS_PRECISION_TYPE_STANDARD:
14548 /* We can calculate either in 16-bit range and precision or
14549 32-bit range and precision. Make that decision based on whether
14550 we have native support for the ARMv8.2-A 16-bit floating-point
14551 instructions or not. */
14552 return (TARGET_FP_F16INST
14553 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14554 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14555 case EXCESS_PRECISION_TYPE_IMPLICIT:
14556 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14557 default:
14558 gcc_unreachable ();
14560 return FLT_EVAL_METHOD_UNPREDICTABLE;
14563 #undef TARGET_ADDRESS_COST
14564 #define TARGET_ADDRESS_COST aarch64_address_cost
14566 /* This hook determines whether unnamed bitfields affect the alignment
14567 of the containing structure. The hook returns true if the structure
14568 should inherit the alignment requirements of an unnamed bitfield's
14569 type. */
14570 #undef TARGET_ALIGN_ANON_BITFIELD
14571 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14573 #undef TARGET_ASM_ALIGNED_DI_OP
14574 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14576 #undef TARGET_ASM_ALIGNED_HI_OP
14577 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14579 #undef TARGET_ASM_ALIGNED_SI_OP
14580 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14582 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14583 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14584 hook_bool_const_tree_hwi_hwi_const_tree_true
14586 #undef TARGET_ASM_FILE_START
14587 #define TARGET_ASM_FILE_START aarch64_start_file
14589 #undef TARGET_ASM_OUTPUT_MI_THUNK
14590 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14592 #undef TARGET_ASM_SELECT_RTX_SECTION
14593 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14595 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14596 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14598 #undef TARGET_BUILD_BUILTIN_VA_LIST
14599 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14601 #undef TARGET_CALLEE_COPIES
14602 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14604 #undef TARGET_CAN_ELIMINATE
14605 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14607 #undef TARGET_CAN_INLINE_P
14608 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14610 #undef TARGET_CANNOT_FORCE_CONST_MEM
14611 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14613 #undef TARGET_CASE_VALUES_THRESHOLD
14614 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14616 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14617 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14619 /* Only the least significant bit is used for initialization guard
14620 variables. */
14621 #undef TARGET_CXX_GUARD_MASK_BIT
14622 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14624 #undef TARGET_C_MODE_FOR_SUFFIX
14625 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14627 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14628 #undef TARGET_DEFAULT_TARGET_FLAGS
14629 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14630 #endif
14632 #undef TARGET_CLASS_MAX_NREGS
14633 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14635 #undef TARGET_BUILTIN_DECL
14636 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14638 #undef TARGET_BUILTIN_RECIPROCAL
14639 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14641 #undef TARGET_C_EXCESS_PRECISION
14642 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14644 #undef TARGET_EXPAND_BUILTIN
14645 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14647 #undef TARGET_EXPAND_BUILTIN_VA_START
14648 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14650 #undef TARGET_FOLD_BUILTIN
14651 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14653 #undef TARGET_FUNCTION_ARG
14654 #define TARGET_FUNCTION_ARG aarch64_function_arg
14656 #undef TARGET_FUNCTION_ARG_ADVANCE
14657 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14659 #undef TARGET_FUNCTION_ARG_BOUNDARY
14660 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14662 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14663 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14665 #undef TARGET_FUNCTION_VALUE
14666 #define TARGET_FUNCTION_VALUE aarch64_function_value
14668 #undef TARGET_FUNCTION_VALUE_REGNO_P
14669 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14671 #undef TARGET_FRAME_POINTER_REQUIRED
14672 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14674 #undef TARGET_GIMPLE_FOLD_BUILTIN
14675 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14677 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14678 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14680 #undef TARGET_INIT_BUILTINS
14681 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14683 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14684 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14685 aarch64_ira_change_pseudo_allocno_class
14687 #undef TARGET_LEGITIMATE_ADDRESS_P
14688 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14690 #undef TARGET_LEGITIMATE_CONSTANT_P
14691 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14693 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14694 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14695 aarch64_legitimize_address_displacement
14697 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14698 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14700 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14701 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14702 aarch64_libgcc_floating_mode_supported_p
14704 #undef TARGET_MANGLE_TYPE
14705 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14707 #undef TARGET_MEMORY_MOVE_COST
14708 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14710 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14711 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14713 #undef TARGET_MUST_PASS_IN_STACK
14714 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14716 /* This target hook should return true if accesses to volatile bitfields
14717 should use the narrowest mode possible. It should return false if these
14718 accesses should use the bitfield container type. */
14719 #undef TARGET_NARROW_VOLATILE_BITFIELD
14720 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14722 #undef TARGET_OPTION_OVERRIDE
14723 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14725 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14726 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14727 aarch64_override_options_after_change
14729 #undef TARGET_OPTION_SAVE
14730 #define TARGET_OPTION_SAVE aarch64_option_save
14732 #undef TARGET_OPTION_RESTORE
14733 #define TARGET_OPTION_RESTORE aarch64_option_restore
14735 #undef TARGET_OPTION_PRINT
14736 #define TARGET_OPTION_PRINT aarch64_option_print
14738 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14739 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14741 #undef TARGET_SET_CURRENT_FUNCTION
14742 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14744 #undef TARGET_PASS_BY_REFERENCE
14745 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14747 #undef TARGET_PREFERRED_RELOAD_CLASS
14748 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14750 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14751 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14753 #undef TARGET_PROMOTED_TYPE
14754 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14756 #undef TARGET_SECONDARY_RELOAD
14757 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14759 #undef TARGET_SHIFT_TRUNCATION_MASK
14760 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14762 #undef TARGET_SETUP_INCOMING_VARARGS
14763 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14765 #undef TARGET_STRUCT_VALUE_RTX
14766 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14768 #undef TARGET_REGISTER_MOVE_COST
14769 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14771 #undef TARGET_RETURN_IN_MEMORY
14772 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14774 #undef TARGET_RETURN_IN_MSB
14775 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14777 #undef TARGET_RTX_COSTS
14778 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14780 #undef TARGET_SCALAR_MODE_SUPPORTED_P
14781 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
14783 #undef TARGET_SCHED_ISSUE_RATE
14784 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14786 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14787 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14788 aarch64_sched_first_cycle_multipass_dfa_lookahead
14790 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14791 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14792 aarch64_first_cycle_multipass_dfa_lookahead_guard
14794 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
14795 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
14796 aarch64_get_separate_components
14798 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
14799 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
14800 aarch64_components_for_bb
14802 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
14803 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
14804 aarch64_disqualify_components
14806 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
14807 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
14808 aarch64_emit_prologue_components
14810 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
14811 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
14812 aarch64_emit_epilogue_components
14814 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
14815 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
14816 aarch64_set_handled_components
14818 #undef TARGET_TRAMPOLINE_INIT
14819 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
14821 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
14822 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
14824 #undef TARGET_VECTOR_MODE_SUPPORTED_P
14825 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
14827 #undef TARGET_ARRAY_MODE_SUPPORTED_P
14828 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
14830 #undef TARGET_VECTORIZE_ADD_STMT_COST
14831 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
14833 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
14834 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
14835 aarch64_builtin_vectorization_cost
14837 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
14838 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
14840 #undef TARGET_VECTORIZE_BUILTINS
14841 #define TARGET_VECTORIZE_BUILTINS
14843 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
14844 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
14845 aarch64_builtin_vectorized_function
14847 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
14848 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
14849 aarch64_autovectorize_vector_sizes
14851 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
14852 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
14853 aarch64_atomic_assign_expand_fenv
14855 /* Section anchor support. */
14857 #undef TARGET_MIN_ANCHOR_OFFSET
14858 #define TARGET_MIN_ANCHOR_OFFSET -256
14860 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
14861 byte offset; we can do much more for larger data types, but have no way
14862 to determine the size of the access. We assume accesses are aligned. */
14863 #undef TARGET_MAX_ANCHOR_OFFSET
14864 #define TARGET_MAX_ANCHOR_OFFSET 4095
14866 #undef TARGET_VECTOR_ALIGNMENT
14867 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
14869 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
14870 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
14871 aarch64_simd_vector_alignment_reachable
14873 /* vec_perm support. */
14875 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
14876 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
14877 aarch64_vectorize_vec_perm_const_ok
14879 #undef TARGET_INIT_LIBFUNCS
14880 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
14882 #undef TARGET_FIXED_CONDITION_CODE_REGS
14883 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
14885 #undef TARGET_FLAGS_REGNUM
14886 #define TARGET_FLAGS_REGNUM CC_REGNUM
14888 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
14889 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
14891 #undef TARGET_ASAN_SHADOW_OFFSET
14892 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
14894 #undef TARGET_LEGITIMIZE_ADDRESS
14895 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
14897 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
14898 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
14899 aarch64_use_by_pieces_infrastructure_p
14901 #undef TARGET_CAN_USE_DOLOOP_P
14902 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
14904 #undef TARGET_SCHED_MACRO_FUSION_P
14905 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
14907 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
14908 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
14910 #undef TARGET_SCHED_FUSION_PRIORITY
14911 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
14913 #undef TARGET_UNSPEC_MAY_TRAP_P
14914 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
14916 #undef TARGET_USE_PSEUDO_PIC_REG
14917 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
14919 #undef TARGET_PRINT_OPERAND
14920 #define TARGET_PRINT_OPERAND aarch64_print_operand
14922 #undef TARGET_PRINT_OPERAND_ADDRESS
14923 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
14925 #undef TARGET_OPTAB_SUPPORTED_P
14926 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
14928 #undef TARGET_OMIT_STRUCT_RETURN_REG
14929 #define TARGET_OMIT_STRUCT_RETURN_REG true
14931 struct gcc_target targetm = TARGET_INITIALIZER;
14933 #include "gt-aarch64.h"