[AArch64] Add ANDS pattern for CMP+ZERO_EXTEND
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob e813d66b40a6a9abce0a913f3d643153917bfd05
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2016 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "cfgloop.h"
32 #include "df.h"
33 #include "tm_p.h"
34 #include "stringpool.h"
35 #include "optabs.h"
36 #include "regs.h"
37 #include "emit-rtl.h"
38 #include "recog.h"
39 #include "diagnostic.h"
40 #include "insn-attr.h"
41 #include "alias.h"
42 #include "fold-const.h"
43 #include "stor-layout.h"
44 #include "calls.h"
45 #include "varasm.h"
46 #include "output.h"
47 #include "flags.h"
48 #include "explow.h"
49 #include "expr.h"
50 #include "reload.h"
51 #include "langhooks.h"
52 #include "opts.h"
53 #include "params.h"
54 #include "gimplify.h"
55 #include "dwarf2.h"
56 #include "gimple-iterator.h"
57 #include "tree-vectorizer.h"
58 #include "aarch64-cost-tables.h"
59 #include "dumpfile.h"
60 #include "builtins.h"
61 #include "rtl-iter.h"
62 #include "tm-constrs.h"
63 #include "sched-int.h"
64 #include "cortex-a57-fma-steering.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
68 /* This file should be included last. */
69 #include "target-def.h"
71 /* Defined for convenience. */
72 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
74 /* Classifies an address.
76 ADDRESS_REG_IMM
77 A simple base register plus immediate offset.
79 ADDRESS_REG_WB
80 A base register indexed by immediate offset with writeback.
82 ADDRESS_REG_REG
83 A base register indexed by (optionally scaled) register.
85 ADDRESS_REG_UXTW
86 A base register indexed by (optionally scaled) zero-extended register.
88 ADDRESS_REG_SXTW
89 A base register indexed by (optionally scaled) sign-extended register.
91 ADDRESS_LO_SUM
92 A LO_SUM rtx with a base register and "LO12" symbol relocation.
94 ADDRESS_SYMBOLIC:
95 A constant symbolic address, in pc-relative literal pool. */
97 enum aarch64_address_type {
98 ADDRESS_REG_IMM,
99 ADDRESS_REG_WB,
100 ADDRESS_REG_REG,
101 ADDRESS_REG_UXTW,
102 ADDRESS_REG_SXTW,
103 ADDRESS_LO_SUM,
104 ADDRESS_SYMBOLIC
107 struct aarch64_address_info {
108 enum aarch64_address_type type;
109 rtx base;
110 rtx offset;
111 int shift;
112 enum aarch64_symbol_type symbol_type;
115 struct simd_immediate_info
117 rtx value;
118 int shift;
119 int element_width;
120 bool mvn;
121 bool msl;
124 /* The current code model. */
125 enum aarch64_code_model aarch64_cmodel;
127 #ifdef HAVE_AS_TLS
128 #undef TARGET_HAVE_TLS
129 #define TARGET_HAVE_TLS 1
130 #endif
132 static bool aarch64_composite_type_p (const_tree, machine_mode);
133 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
134 const_tree,
135 machine_mode *, int *,
136 bool *);
137 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
138 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
139 static void aarch64_override_options_after_change (void);
140 static bool aarch64_vector_mode_supported_p (machine_mode);
141 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
142 const unsigned char *sel);
143 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
145 /* Major revision number of the ARM Architecture implemented by the target. */
146 unsigned aarch64_architecture_version;
148 /* The processor for which instructions should be scheduled. */
149 enum aarch64_processor aarch64_tune = cortexa53;
151 /* Mask to specify which instruction scheduling options should be used. */
152 unsigned long aarch64_tune_flags = 0;
154 /* Global flag for PC relative loads. */
155 bool aarch64_pcrelative_literal_loads;
157 /* Support for command line parsing of boolean flags in the tuning
158 structures. */
159 struct aarch64_flag_desc
161 const char* name;
162 unsigned int flag;
165 #define AARCH64_FUSION_PAIR(name, internal_name) \
166 { name, AARCH64_FUSE_##internal_name },
167 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
169 { "none", AARCH64_FUSE_NOTHING },
170 #include "aarch64-fusion-pairs.def"
171 { "all", AARCH64_FUSE_ALL },
172 { NULL, AARCH64_FUSE_NOTHING }
 174 #undef AARCH64_FUSION_PAIR
176 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
177 { name, AARCH64_EXTRA_TUNE_##internal_name },
178 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
180 { "none", AARCH64_EXTRA_TUNE_NONE },
181 #include "aarch64-tuning-flags.def"
182 { "all", AARCH64_EXTRA_TUNE_ALL },
183 { NULL, AARCH64_EXTRA_TUNE_NONE }
185 #undef AARCH64_EXTRA_TUNING_OPTION
187 /* Tuning parameters. */
189 static const struct cpu_addrcost_table generic_addrcost_table =
192 0, /* hi */
193 0, /* si */
194 0, /* di */
195 0, /* ti */
197 0, /* pre_modify */
198 0, /* post_modify */
199 0, /* register_offset */
200 0, /* register_sextend */
201 0, /* register_zextend */
202 0 /* imm_offset */
205 static const struct cpu_addrcost_table cortexa57_addrcost_table =
208 1, /* hi */
209 0, /* si */
210 0, /* di */
211 1, /* ti */
213 0, /* pre_modify */
214 0, /* post_modify */
215 0, /* register_offset */
216 0, /* register_sextend */
217 0, /* register_zextend */
218 0, /* imm_offset */
221 static const struct cpu_addrcost_table exynosm1_addrcost_table =
224 0, /* hi */
225 0, /* si */
226 0, /* di */
227 2, /* ti */
229 0, /* pre_modify */
230 0, /* post_modify */
231 1, /* register_offset */
232 1, /* register_sextend */
233 2, /* register_zextend */
234 0, /* imm_offset */
237 static const struct cpu_addrcost_table xgene1_addrcost_table =
240 1, /* hi */
241 0, /* si */
242 0, /* di */
243 1, /* ti */
245 1, /* pre_modify */
246 0, /* post_modify */
247 0, /* register_offset */
248 1, /* register_sextend */
249 1, /* register_zextend */
250 0, /* imm_offset */
253 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
256 1, /* hi */
257 0, /* si */
258 0, /* di */
259 1, /* ti */
261 0, /* pre_modify */
262 0, /* post_modify */
263 0, /* register_offset */
264 0, /* register_sextend */
265 0, /* register_zextend */
266 0 /* imm_offset */
269 static const struct cpu_addrcost_table vulcan_addrcost_table =
272 0, /* hi */
273 0, /* si */
274 0, /* di */
275 2, /* ti */
277 0, /* pre_modify */
278 0, /* post_modify */
279 2, /* register_offset */
280 3, /* register_sextend */
281 3, /* register_zextend */
282 0, /* imm_offset */
285 static const struct cpu_regmove_cost generic_regmove_cost =
287 1, /* GP2GP */
288 /* Avoid the use of slow int<->fp moves for spilling by setting
289 their cost higher than memmov_cost. */
290 5, /* GP2FP */
291 5, /* FP2GP */
292 2 /* FP2FP */
295 static const struct cpu_regmove_cost cortexa57_regmove_cost =
297 1, /* GP2GP */
298 /* Avoid the use of slow int<->fp moves for spilling by setting
299 their cost higher than memmov_cost. */
300 5, /* GP2FP */
301 5, /* FP2GP */
302 2 /* FP2FP */
305 static const struct cpu_regmove_cost cortexa53_regmove_cost =
307 1, /* GP2GP */
308 /* Avoid the use of slow int<->fp moves for spilling by setting
309 their cost higher than memmov_cost. */
310 5, /* GP2FP */
311 5, /* FP2GP */
312 2 /* FP2FP */
315 static const struct cpu_regmove_cost exynosm1_regmove_cost =
317 1, /* GP2GP */
318 /* Avoid the use of slow int<->fp moves for spilling by setting
 319 their cost higher than memmov_cost (actually 4 and 9). */
320 9, /* GP2FP */
321 9, /* FP2GP */
322 1 /* FP2FP */
325 static const struct cpu_regmove_cost thunderx_regmove_cost =
327 2, /* GP2GP */
328 2, /* GP2FP */
329 6, /* FP2GP */
330 4 /* FP2FP */
333 static const struct cpu_regmove_cost xgene1_regmove_cost =
335 1, /* GP2GP */
336 /* Avoid the use of slow int<->fp moves for spilling by setting
337 their cost higher than memmov_cost. */
338 8, /* GP2FP */
339 8, /* FP2GP */
340 2 /* FP2FP */
343 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
345 2, /* GP2GP */
346 /* Avoid the use of int<->fp moves for spilling. */
347 6, /* GP2FP */
348 6, /* FP2GP */
349 4 /* FP2FP */
352 static const struct cpu_regmove_cost vulcan_regmove_cost =
354 1, /* GP2GP */
355 /* Avoid the use of int<->fp moves for spilling. */
356 8, /* GP2FP */
357 8, /* FP2GP */
358 4 /* FP2FP */
361 /* Generic costs for vector insn classes. */
362 static const struct cpu_vector_cost generic_vector_cost =
364 1, /* scalar_stmt_cost */
365 1, /* scalar_load_cost */
366 1, /* scalar_store_cost */
367 1, /* vec_stmt_cost */
368 2, /* vec_permute_cost */
369 1, /* vec_to_scalar_cost */
370 1, /* scalar_to_vec_cost */
371 1, /* vec_align_load_cost */
372 1, /* vec_unalign_load_cost */
373 1, /* vec_unalign_store_cost */
374 1, /* vec_store_cost */
375 3, /* cond_taken_branch_cost */
376 1 /* cond_not_taken_branch_cost */
379 /* ThunderX costs for vector insn classes. */
380 static const struct cpu_vector_cost thunderx_vector_cost =
382 1, /* scalar_stmt_cost */
383 3, /* scalar_load_cost */
384 1, /* scalar_store_cost */
385 4, /* vec_stmt_cost */
386 4, /* vec_permute_cost */
387 2, /* vec_to_scalar_cost */
388 2, /* scalar_to_vec_cost */
389 3, /* vec_align_load_cost */
390 10, /* vec_unalign_load_cost */
391 10, /* vec_unalign_store_cost */
392 1, /* vec_store_cost */
393 3, /* cond_taken_branch_cost */
394 3 /* cond_not_taken_branch_cost */
 397 /* Costs for vector insn classes for Cortex-A57. */
398 static const struct cpu_vector_cost cortexa57_vector_cost =
400 1, /* scalar_stmt_cost */
401 4, /* scalar_load_cost */
402 1, /* scalar_store_cost */
403 3, /* vec_stmt_cost */
404 3, /* vec_permute_cost */
405 8, /* vec_to_scalar_cost */
406 8, /* scalar_to_vec_cost */
407 5, /* vec_align_load_cost */
408 5, /* vec_unalign_load_cost */
409 1, /* vec_unalign_store_cost */
410 1, /* vec_store_cost */
411 1, /* cond_taken_branch_cost */
412 1 /* cond_not_taken_branch_cost */
415 static const struct cpu_vector_cost exynosm1_vector_cost =
417 1, /* scalar_stmt_cost */
418 5, /* scalar_load_cost */
419 1, /* scalar_store_cost */
420 3, /* vec_stmt_cost */
421 3, /* vec_permute_cost */
422 3, /* vec_to_scalar_cost */
423 3, /* scalar_to_vec_cost */
424 5, /* vec_align_load_cost */
425 5, /* vec_unalign_load_cost */
426 1, /* vec_unalign_store_cost */
427 1, /* vec_store_cost */
428 1, /* cond_taken_branch_cost */
429 1 /* cond_not_taken_branch_cost */
 432 /* Costs for vector insn classes for X-Gene 1. */
433 static const struct cpu_vector_cost xgene1_vector_cost =
435 1, /* scalar_stmt_cost */
436 5, /* scalar_load_cost */
437 1, /* scalar_store_cost */
438 2, /* vec_stmt_cost */
439 2, /* vec_permute_cost */
440 4, /* vec_to_scalar_cost */
441 4, /* scalar_to_vec_cost */
442 10, /* vec_align_load_cost */
443 10, /* vec_unalign_load_cost */
444 2, /* vec_unalign_store_cost */
445 2, /* vec_store_cost */
446 2, /* cond_taken_branch_cost */
447 1 /* cond_not_taken_branch_cost */
450 /* Costs for vector insn classes for Vulcan. */
451 static const struct cpu_vector_cost vulcan_vector_cost =
453 6, /* scalar_stmt_cost */
454 4, /* scalar_load_cost */
455 1, /* scalar_store_cost */
456 6, /* vec_stmt_cost */
457 3, /* vec_permute_cost */
458 6, /* vec_to_scalar_cost */
459 5, /* scalar_to_vec_cost */
460 8, /* vec_align_load_cost */
461 8, /* vec_unalign_load_cost */
462 4, /* vec_unalign_store_cost */
463 4, /* vec_store_cost */
464 2, /* cond_taken_branch_cost */
465 1 /* cond_not_taken_branch_cost */
468 /* Generic costs for branch instructions. */
469 static const struct cpu_branch_cost generic_branch_cost =
471 2, /* Predictable. */
472 2 /* Unpredictable. */
475 /* Branch costs for Cortex-A57. */
476 static const struct cpu_branch_cost cortexa57_branch_cost =
478 1, /* Predictable. */
479 3 /* Unpredictable. */
482 /* Branch costs for Vulcan. */
483 static const struct cpu_branch_cost vulcan_branch_cost =
485 1, /* Predictable. */
486 3 /* Unpredictable. */
489 /* Generic approximation modes. */
490 static const cpu_approx_modes generic_approx_modes =
492 AARCH64_APPROX_NONE, /* division */
493 AARCH64_APPROX_NONE, /* sqrt */
494 AARCH64_APPROX_NONE /* recip_sqrt */
497 /* Approximation modes for Exynos M1. */
498 static const cpu_approx_modes exynosm1_approx_modes =
500 AARCH64_APPROX_NONE, /* division */
501 AARCH64_APPROX_ALL, /* sqrt */
502 AARCH64_APPROX_ALL /* recip_sqrt */
505 /* Approximation modes for X-Gene 1. */
506 static const cpu_approx_modes xgene1_approx_modes =
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_ALL /* recip_sqrt */
513 static const struct tune_params generic_tunings =
515 &cortexa57_extra_costs,
516 &generic_addrcost_table,
517 &generic_regmove_cost,
518 &generic_vector_cost,
519 &generic_branch_cost,
520 &generic_approx_modes,
521 4, /* memmov_cost */
522 2, /* issue_rate */
523 AARCH64_FUSE_NOTHING, /* fusible_ops */
524 8, /* function_align. */
525 8, /* jump_align. */
526 4, /* loop_align. */
527 2, /* int_reassoc_width. */
528 4, /* fp_reassoc_width. */
529 1, /* vec_reassoc_width. */
530 2, /* min_div_recip_mul_sf. */
531 2, /* min_div_recip_mul_df. */
532 0, /* max_case_values. */
533 0, /* cache_line_size. */
534 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
535 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
538 static const struct tune_params cortexa35_tunings =
540 &cortexa53_extra_costs,
541 &generic_addrcost_table,
542 &cortexa53_regmove_cost,
543 &generic_vector_cost,
544 &cortexa57_branch_cost,
545 &generic_approx_modes,
546 4, /* memmov_cost */
547 1, /* issue_rate */
548 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
549 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
550 16, /* function_align. */
551 8, /* jump_align. */
552 8, /* loop_align. */
553 2, /* int_reassoc_width. */
554 4, /* fp_reassoc_width. */
555 1, /* vec_reassoc_width. */
556 2, /* min_div_recip_mul_sf. */
557 2, /* min_div_recip_mul_df. */
558 0, /* max_case_values. */
559 0, /* cache_line_size. */
560 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
561 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
564 static const struct tune_params cortexa53_tunings =
566 &cortexa53_extra_costs,
567 &generic_addrcost_table,
568 &cortexa53_regmove_cost,
569 &generic_vector_cost,
570 &cortexa57_branch_cost,
571 &generic_approx_modes,
572 4, /* memmov_cost */
573 2, /* issue_rate */
574 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
575 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
576 16, /* function_align. */
577 8, /* jump_align. */
578 8, /* loop_align. */
579 2, /* int_reassoc_width. */
580 4, /* fp_reassoc_width. */
581 1, /* vec_reassoc_width. */
582 2, /* min_div_recip_mul_sf. */
583 2, /* min_div_recip_mul_df. */
584 0, /* max_case_values. */
585 0, /* cache_line_size. */
586 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
587 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
590 static const struct tune_params cortexa57_tunings =
592 &cortexa57_extra_costs,
593 &cortexa57_addrcost_table,
594 &cortexa57_regmove_cost,
595 &cortexa57_vector_cost,
596 &cortexa57_branch_cost,
597 &generic_approx_modes,
598 4, /* memmov_cost */
599 3, /* issue_rate */
600 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
601 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
602 16, /* function_align. */
603 8, /* jump_align. */
604 8, /* loop_align. */
605 2, /* int_reassoc_width. */
606 4, /* fp_reassoc_width. */
607 1, /* vec_reassoc_width. */
608 2, /* min_div_recip_mul_sf. */
609 2, /* min_div_recip_mul_df. */
610 0, /* max_case_values. */
611 0, /* cache_line_size. */
612 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
613 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
616 static const struct tune_params cortexa72_tunings =
618 &cortexa57_extra_costs,
619 &cortexa57_addrcost_table,
620 &cortexa57_regmove_cost,
621 &cortexa57_vector_cost,
622 &cortexa57_branch_cost,
623 &generic_approx_modes,
624 4, /* memmov_cost */
625 3, /* issue_rate */
626 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
627 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
628 16, /* function_align. */
629 8, /* jump_align. */
630 8, /* loop_align. */
631 2, /* int_reassoc_width. */
632 4, /* fp_reassoc_width. */
633 1, /* vec_reassoc_width. */
634 2, /* min_div_recip_mul_sf. */
635 2, /* min_div_recip_mul_df. */
636 0, /* max_case_values. */
637 0, /* cache_line_size. */
638 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
639 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
642 static const struct tune_params cortexa73_tunings =
644 &cortexa57_extra_costs,
645 &cortexa57_addrcost_table,
646 &cortexa57_regmove_cost,
647 &cortexa57_vector_cost,
648 &cortexa57_branch_cost,
649 &generic_approx_modes,
650 4, /* memmov_cost. */
651 2, /* issue_rate. */
652 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
653 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
654 16, /* function_align. */
655 8, /* jump_align. */
656 8, /* loop_align. */
657 2, /* int_reassoc_width. */
658 4, /* fp_reassoc_width. */
659 1, /* vec_reassoc_width. */
660 2, /* min_div_recip_mul_sf. */
661 2, /* min_div_recip_mul_df. */
662 0, /* max_case_values. */
663 0, /* cache_line_size. */
664 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
665 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
668 static const struct tune_params exynosm1_tunings =
670 &exynosm1_extra_costs,
671 &exynosm1_addrcost_table,
672 &exynosm1_regmove_cost,
673 &exynosm1_vector_cost,
674 &generic_branch_cost,
675 &exynosm1_approx_modes,
676 4, /* memmov_cost */
677 3, /* issue_rate */
678 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
679 4, /* function_align. */
680 4, /* jump_align. */
681 4, /* loop_align. */
682 2, /* int_reassoc_width. */
683 4, /* fp_reassoc_width. */
684 1, /* vec_reassoc_width. */
685 2, /* min_div_recip_mul_sf. */
686 2, /* min_div_recip_mul_df. */
687 48, /* max_case_values. */
688 64, /* cache_line_size. */
689 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
690 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
693 static const struct tune_params thunderx_tunings =
695 &thunderx_extra_costs,
696 &generic_addrcost_table,
697 &thunderx_regmove_cost,
698 &thunderx_vector_cost,
699 &generic_branch_cost,
700 &generic_approx_modes,
701 6, /* memmov_cost */
702 2, /* issue_rate */
703 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
704 8, /* function_align. */
705 8, /* jump_align. */
706 8, /* loop_align. */
707 2, /* int_reassoc_width. */
708 4, /* fp_reassoc_width. */
709 1, /* vec_reassoc_width. */
710 2, /* min_div_recip_mul_sf. */
711 2, /* min_div_recip_mul_df. */
712 0, /* max_case_values. */
713 0, /* cache_line_size. */
714 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
715 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
718 static const struct tune_params xgene1_tunings =
720 &xgene1_extra_costs,
721 &xgene1_addrcost_table,
722 &xgene1_regmove_cost,
723 &xgene1_vector_cost,
724 &generic_branch_cost,
725 &xgene1_approx_modes,
726 6, /* memmov_cost */
727 4, /* issue_rate */
728 AARCH64_FUSE_NOTHING, /* fusible_ops */
729 16, /* function_align. */
730 8, /* jump_align. */
731 16, /* loop_align. */
732 2, /* int_reassoc_width. */
733 4, /* fp_reassoc_width. */
734 1, /* vec_reassoc_width. */
735 2, /* min_div_recip_mul_sf. */
736 2, /* min_div_recip_mul_df. */
737 0, /* max_case_values. */
738 0, /* cache_line_size. */
739 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
740 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
743 static const struct tune_params qdf24xx_tunings =
745 &qdf24xx_extra_costs,
746 &qdf24xx_addrcost_table,
747 &qdf24xx_regmove_cost,
748 &generic_vector_cost,
749 &generic_branch_cost,
750 &generic_approx_modes,
751 4, /* memmov_cost */
752 4, /* issue_rate */
753 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 754 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
755 16, /* function_align. */
756 8, /* jump_align. */
757 16, /* loop_align. */
758 2, /* int_reassoc_width. */
759 4, /* fp_reassoc_width. */
760 1, /* vec_reassoc_width. */
761 2, /* min_div_recip_mul_sf. */
762 2, /* min_div_recip_mul_df. */
763 0, /* max_case_values. */
764 64, /* cache_line_size. */
765 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
766 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
769 static const struct tune_params vulcan_tunings =
771 &vulcan_extra_costs,
772 &vulcan_addrcost_table,
773 &vulcan_regmove_cost,
774 &vulcan_vector_cost,
775 &vulcan_branch_cost,
776 &generic_approx_modes,
777 4, /* memmov_cost. */
778 4, /* issue_rate. */
 779 AARCH64_FUSE_NOTHING, /* fusible_ops. */
780 16, /* function_align. */
781 8, /* jump_align. */
782 16, /* loop_align. */
783 3, /* int_reassoc_width. */
784 2, /* fp_reassoc_width. */
785 2, /* vec_reassoc_width. */
786 2, /* min_div_recip_mul_sf. */
787 2, /* min_div_recip_mul_df. */
788 0, /* max_case_values. */
789 64, /* cache_line_size. */
790 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
791 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
794 /* Support for fine-grained override of the tuning structures. */
795 struct aarch64_tuning_override_function
797 const char* name;
798 void (*parse_override)(const char*, struct tune_params*);
801 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
802 static void aarch64_parse_tune_string (const char*, struct tune_params*);
804 static const struct aarch64_tuning_override_function
805 aarch64_tuning_override_functions[] =
807 { "fuse", aarch64_parse_fuse_string },
808 { "tune", aarch64_parse_tune_string },
809 { NULL, NULL }
812 /* A processor implementing AArch64. */
813 struct processor
815 const char *const name;
816 enum aarch64_processor ident;
817 enum aarch64_processor sched_core;
818 enum aarch64_arch arch;
819 unsigned architecture_version;
820 const unsigned long flags;
821 const struct tune_params *const tune;
824 /* Architectures implementing AArch64. */
825 static const struct processor all_architectures[] =
827 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
828 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
829 #include "aarch64-arches.def"
830 #undef AARCH64_ARCH
831 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
834 /* Processor cores implementing AArch64. */
835 static const struct processor all_cores[] =
837 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART) \
838 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
839 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
840 FLAGS, &COSTS##_tunings},
841 #include "aarch64-cores.def"
842 #undef AARCH64_CORE
843 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
844 AARCH64_FL_FOR_ARCH8, &generic_tunings},
845 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
849 /* Target specification. These are populated by the -march, -mtune, -mcpu
850 handling code or by target attributes. */
851 static const struct processor *selected_arch;
852 static const struct processor *selected_cpu;
853 static const struct processor *selected_tune;
855 /* The current tuning set. */
856 struct tune_params aarch64_tune_params = generic_tunings;
858 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
860 /* An ISA extension in the co-processor and main instruction set space. */
861 struct aarch64_option_extension
863 const char *const name;
864 const unsigned long flags_on;
865 const unsigned long flags_off;
868 typedef enum aarch64_cond_code
870 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
871 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
872 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
874 aarch64_cc;
876 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
878 /* The condition codes of the processor, and the inverse function. */
879 static const char * const aarch64_condition_codes[] =
881 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
882 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
885 /* Generate code to enable conditional branches in functions over 1 MiB. */
886 const char *
887 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
888 const char * branch_format)
890 rtx_code_label * tmp_label = gen_label_rtx ();
891 char label_buf[256];
892 char buffer[128];
893 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
894 CODE_LABEL_NUMBER (tmp_label));
895 const char *label_ptr = targetm.strip_name_encoding (label_buf);
896 rtx dest_label = operands[pos_label];
897 operands[pos_label] = tmp_label;
899 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
900 output_asm_insn (buffer, operands);
902 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
903 operands[pos_label] = dest_label;
904 output_asm_insn (buffer, operands);
905 return "";
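/* A worked example, assuming the .md patterns pass the inverted condition in
   BRANCH_FORMAT: an out-of-range "cbz w0, .Ltarget" is emitted as
     cbnz  w0, .Lcb0
     b     .Ltarget
   .Lcb0:
   i.e. a short branch of the opposite sense skips over an unconditional
   branch that can reach the distant label.  */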
908 void
909 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
911 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
912 if (TARGET_GENERAL_REGS_ONLY)
913 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
914 else
915 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
918 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
919 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
920 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
921 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
922 cost (in this case the best class is the lowest cost one). Using ALL_REGS
 923 irrespective of its cost results in bad allocations with many redundant
924 int<->FP moves which are expensive on various cores.
925 To avoid this we don't allow ALL_REGS as the allocno class, but force a
926 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
927 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
928 Otherwise set the allocno class depending on the mode.
929 The result of this is that it is no longer inefficient to have a higher
930 memory move cost than the register move cost.
933 static reg_class_t
934 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
935 reg_class_t best_class)
937 enum machine_mode mode;
939 if (allocno_class != ALL_REGS)
940 return allocno_class;
942 if (best_class != ALL_REGS)
943 return best_class;
945 mode = PSEUDO_REGNO_MODE (regno);
946 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
949 static unsigned int
950 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
952 if (GET_MODE_UNIT_SIZE (mode) == 4)
953 return aarch64_tune_params.min_div_recip_mul_sf;
954 return aarch64_tune_params.min_div_recip_mul_df;
957 static int
958 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
959 enum machine_mode mode)
961 if (VECTOR_MODE_P (mode))
962 return aarch64_tune_params.vec_reassoc_width;
963 if (INTEGRAL_MODE_P (mode))
964 return aarch64_tune_params.int_reassoc_width;
965 if (FLOAT_MODE_P (mode))
966 return aarch64_tune_params.fp_reassoc_width;
967 return 1;
970 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
971 unsigned
972 aarch64_dbx_register_number (unsigned regno)
974 if (GP_REGNUM_P (regno))
975 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
976 else if (regno == SP_REGNUM)
977 return AARCH64_DWARF_SP;
978 else if (FP_REGNUM_P (regno))
979 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
981 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
982 equivalent DWARF register. */
983 return DWARF_FRAME_REGISTERS;
986 /* Return TRUE if MODE is any of the large INT modes. */
987 static bool
988 aarch64_vect_struct_mode_p (machine_mode mode)
990 return mode == OImode || mode == CImode || mode == XImode;
993 /* Return TRUE if MODE is any of the vector modes. */
994 static bool
995 aarch64_vector_mode_p (machine_mode mode)
997 return aarch64_vector_mode_supported_p (mode)
998 || aarch64_vect_struct_mode_p (mode);
1001 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1002 static bool
1003 aarch64_array_mode_supported_p (machine_mode mode,
1004 unsigned HOST_WIDE_INT nelems)
1006 if (TARGET_SIMD
1007 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1008 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1009 && (nelems >= 2 && nelems <= 4))
1010 return true;
1012 return false;
1015 /* Implement HARD_REGNO_NREGS. */
1018 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1020 switch (aarch64_regno_regclass (regno))
1022 case FP_REGS:
1023 case FP_LO_REGS:
1024 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1025 default:
1026 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1028 gcc_unreachable ();
1031 /* Implement HARD_REGNO_MODE_OK. */
1034 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1036 if (GET_MODE_CLASS (mode) == MODE_CC)
1037 return regno == CC_REGNUM;
1039 if (regno == SP_REGNUM)
1040 /* The purpose of comparing with ptr_mode is to support the
1041 global register variable associated with the stack pointer
1042 register via the syntax of asm ("wsp") in ILP32. */
1043 return mode == Pmode || mode == ptr_mode;
1045 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1046 return mode == Pmode;
1048 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1049 return 1;
1051 if (FP_REGNUM_P (regno))
1053 if (aarch64_vect_struct_mode_p (mode))
1054 return
1055 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1056 else
1057 return 1;
1060 return 0;
1063 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1064 machine_mode
1065 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1066 machine_mode mode)
1068 /* Handle modes that fit within single registers. */
1069 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1071 if (GET_MODE_SIZE (mode) >= 4)
1072 return mode;
1073 else
1074 return SImode;
1076 /* Fall back to generic for multi-reg and very large modes. */
1077 else
1078 return choose_hard_reg_mode (regno, nregs, false);
1081 /* Return true if calls to DECL should be treated as
1082 long-calls (ie called via a register). */
1083 static bool
1084 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1086 return false;
1089 /* Return true if calls to symbol-ref SYM should be treated as
1090 long-calls (ie called via a register). */
1091 bool
1092 aarch64_is_long_call_p (rtx sym)
1094 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1097 /* Return true if calls to symbol-ref SYM should not go through
1098 plt stubs. */
1100 bool
1101 aarch64_is_noplt_call_p (rtx sym)
1103 const_tree decl = SYMBOL_REF_DECL (sym);
1105 if (flag_pic
1106 && decl
1107 && (!flag_plt
1108 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1109 && !targetm.binds_local_p (decl))
1110 return true;
1112 return false;
1115 /* Return true if the offsets to a zero/sign-extract operation
1116 represent an expression that matches an extend operation. The
 1117 operands represent the parameters from
1119 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1120 bool
1121 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1122 rtx extract_imm)
1124 HOST_WIDE_INT mult_val, extract_val;
1126 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1127 return false;
1129 mult_val = INTVAL (mult_imm);
1130 extract_val = INTVAL (extract_imm);
1132 if (extract_val > 8
1133 && extract_val < GET_MODE_BITSIZE (mode)
1134 && exact_log2 (extract_val & ~7) > 0
1135 && (extract_val & 7) <= 4
1136 && mult_val == (1 << (extract_val & 7)))
1137 return true;
1139 return false;
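/* For example, in DImode a MULT_IMM of 4 with an EXTRACT_IMM of 34 passes the
   checks above: the low 34 bits of (reg * 4) equal (reg & 0xffffffff) << 2,
   i.e. a zero-extend of the low 32 bits followed by a left shift by 2, which
   is the extend form that a "uxtw #2" style operand provides.  */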
1142 /* Emit an insn that's a simple single-set. Both the operands must be
1143 known to be valid. */
1144 inline static rtx
1145 emit_set_insn (rtx x, rtx y)
1147 return emit_insn (gen_rtx_SET (x, y));
1150 /* X and Y are two things to compare using CODE. Emit the compare insn and
1151 return the rtx for register 0 in the proper mode. */
1153 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1155 machine_mode mode = SELECT_CC_MODE (code, x, y);
1156 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1158 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1159 return cc_reg;
1162 /* Build the SYMBOL_REF for __tls_get_addr. */
1164 static GTY(()) rtx tls_get_addr_libfunc;
1167 aarch64_tls_get_addr (void)
1169 if (!tls_get_addr_libfunc)
1170 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1171 return tls_get_addr_libfunc;
1174 /* Return the TLS model to use for ADDR. */
1176 static enum tls_model
1177 tls_symbolic_operand_type (rtx addr)
1179 enum tls_model tls_kind = TLS_MODEL_NONE;
1180 rtx sym, addend;
1182 if (GET_CODE (addr) == CONST)
1184 split_const (addr, &sym, &addend);
1185 if (GET_CODE (sym) == SYMBOL_REF)
1186 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1188 else if (GET_CODE (addr) == SYMBOL_REF)
1189 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1191 return tls_kind;
1194 /* We'll allow lo_sum's in addresses in our legitimate addresses
1195 so that combine would take care of combining addresses where
1196 necessary, but for generation purposes, we'll generate the address
1197 as :
1198 RTL Absolute
1199 tmp = hi (symbol_ref); adrp x1, foo
1200 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1203 PIC TLS
1204 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1205 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1206 bl __tls_get_addr
1209 Load TLS symbol, depending on TLS mechanism and TLS access model.
1211 Global Dynamic - Traditional TLS:
1212 adrp tmp, :tlsgd:imm
1213 add dest, tmp, #:tlsgd_lo12:imm
1214 bl __tls_get_addr
1216 Global Dynamic - TLS Descriptors:
1217 adrp dest, :tlsdesc:imm
1218 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1219 add dest, dest, #:tlsdesc_lo12:imm
1220 blr tmp
1221 mrs tp, tpidr_el0
1222 add dest, dest, tp
1224 Initial Exec:
1225 mrs tp, tpidr_el0
1226 adrp tmp, :gottprel:imm
1227 ldr dest, [tmp, #:gottprel_lo12:imm]
1228 add dest, dest, tp
1230 Local Exec:
1231 mrs tp, tpidr_el0
1232 add t0, tp, #:tprel_hi12:imm, lsl #12
1233 add t0, t0, #:tprel_lo12_nc:imm
1236 static void
1237 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1238 enum aarch64_symbol_type type)
1240 switch (type)
1242 case SYMBOL_SMALL_ABSOLUTE:
1244 /* In ILP32, the mode of dest can be either SImode or DImode. */
1245 rtx tmp_reg = dest;
1246 machine_mode mode = GET_MODE (dest);
1248 gcc_assert (mode == Pmode || mode == ptr_mode);
1250 if (can_create_pseudo_p ())
1251 tmp_reg = gen_reg_rtx (mode);
1253 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1254 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1255 return;
1258 case SYMBOL_TINY_ABSOLUTE:
1259 emit_insn (gen_rtx_SET (dest, imm));
1260 return;
1262 case SYMBOL_SMALL_GOT_28K:
1264 machine_mode mode = GET_MODE (dest);
1265 rtx gp_rtx = pic_offset_table_rtx;
1266 rtx insn;
1267 rtx mem;
1269 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1270 here before rtl expand. Tree IVOPT will generate rtl pattern to
1271 decide rtx costs, in which case pic_offset_table_rtx is not
1272 initialized. For that case no need to generate the first adrp
1273 instruction as the final cost for global variable access is
1274 one instruction. */
1275 if (gp_rtx != NULL)
 1277 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
 1278 use the page base as the GOT base, the first page may be wasted;
 1279 in the worst scenario there is only 28K of space for the GOT).
 1281 The generated instruction sequence for accessing a global variable is:
 1284 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
 1286 Only one instruction is needed. But we must initialize
 1287 pic_offset_table_rtx properly. We generate an initialization insn for
 1288 every global access, and allow CSE to remove all redundant ones.
 1290 The final instruction sequence will look like the following
 1291 for multiple global variable accesses.
1293 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1295 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1296 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1297 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1298 ... */
1300 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1301 crtl->uses_pic_offset_table = 1;
1302 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1304 if (mode != GET_MODE (gp_rtx))
1305 gp_rtx = simplify_gen_subreg (mode, gp_rtx, GET_MODE (gp_rtx), 0);
1308 if (mode == ptr_mode)
1310 if (mode == DImode)
1311 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1312 else
1313 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1315 mem = XVECEXP (SET_SRC (insn), 0, 0);
1317 else
1319 gcc_assert (mode == Pmode);
1321 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1322 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1325 /* The operand is expected to be MEM. Whenever the related insn
1326 pattern changed, above code which calculate mem should be
1327 updated. */
1328 gcc_assert (GET_CODE (mem) == MEM);
1329 MEM_READONLY_P (mem) = 1;
1330 MEM_NOTRAP_P (mem) = 1;
1331 emit_insn (insn);
1332 return;
1335 case SYMBOL_SMALL_GOT_4G:
1337 /* In ILP32, the mode of dest can be either SImode or DImode,
1338 while the got entry is always of SImode size. The mode of
1339 dest depends on how dest is used: if dest is assigned to a
1340 pointer (e.g. in the memory), it has SImode; it may have
 1341 DImode if dest is dereferenced to access the memory.
1342 This is why we have to handle three different ldr_got_small
1343 patterns here (two patterns for ILP32). */
1345 rtx insn;
1346 rtx mem;
1347 rtx tmp_reg = dest;
1348 machine_mode mode = GET_MODE (dest);
1350 if (can_create_pseudo_p ())
1351 tmp_reg = gen_reg_rtx (mode);
1353 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1354 if (mode == ptr_mode)
1356 if (mode == DImode)
1357 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1358 else
1359 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1361 mem = XVECEXP (SET_SRC (insn), 0, 0);
1363 else
1365 gcc_assert (mode == Pmode);
1367 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1368 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1371 gcc_assert (GET_CODE (mem) == MEM);
1372 MEM_READONLY_P (mem) = 1;
1373 MEM_NOTRAP_P (mem) = 1;
1374 emit_insn (insn);
1375 return;
1378 case SYMBOL_SMALL_TLSGD:
1380 rtx_insn *insns;
1381 rtx result = gen_rtx_REG (Pmode, R0_REGNUM);
1383 start_sequence ();
1384 aarch64_emit_call_insn (gen_tlsgd_small (result, imm));
1385 insns = get_insns ();
1386 end_sequence ();
1388 RTL_CONST_CALL_P (insns) = 1;
1389 emit_libcall_block (insns, dest, result, imm);
1390 return;
1393 case SYMBOL_SMALL_TLSDESC:
1395 machine_mode mode = GET_MODE (dest);
1396 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1397 rtx tp;
1399 gcc_assert (mode == Pmode || mode == ptr_mode);
1401 /* In ILP32, the got entry is always of SImode size. Unlike
1402 small GOT, the dest is fixed at reg 0. */
1403 if (TARGET_ILP32)
1404 emit_insn (gen_tlsdesc_small_si (imm));
1405 else
1406 emit_insn (gen_tlsdesc_small_di (imm));
1407 tp = aarch64_load_tp (NULL);
1409 if (mode != Pmode)
1410 tp = gen_lowpart (mode, tp);
1412 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1413 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1414 return;
1417 case SYMBOL_SMALL_TLSIE:
1419 /* In ILP32, the mode of dest can be either SImode or DImode,
1420 while the got entry is always of SImode size. The mode of
1421 dest depends on how dest is used: if dest is assigned to a
1422 pointer (e.g. in the memory), it has SImode; it may have
 1423 DImode if dest is dereferenced to access the memory.
1424 This is why we have to handle three different tlsie_small
1425 patterns here (two patterns for ILP32). */
1426 machine_mode mode = GET_MODE (dest);
1427 rtx tmp_reg = gen_reg_rtx (mode);
1428 rtx tp = aarch64_load_tp (NULL);
1430 if (mode == ptr_mode)
1432 if (mode == DImode)
1433 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1434 else
1436 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1437 tp = gen_lowpart (mode, tp);
1440 else
1442 gcc_assert (mode == Pmode);
1443 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1446 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1447 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1448 return;
1451 case SYMBOL_TLSLE12:
1452 case SYMBOL_TLSLE24:
1453 case SYMBOL_TLSLE32:
1454 case SYMBOL_TLSLE48:
1456 machine_mode mode = GET_MODE (dest);
1457 rtx tp = aarch64_load_tp (NULL);
1459 if (mode != Pmode)
1460 tp = gen_lowpart (mode, tp);
1462 switch (type)
1464 case SYMBOL_TLSLE12:
1465 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1466 (dest, tp, imm));
1467 break;
1468 case SYMBOL_TLSLE24:
1469 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1470 (dest, tp, imm));
1471 break;
1472 case SYMBOL_TLSLE32:
1473 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1474 (dest, imm));
1475 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1476 (dest, dest, tp));
1477 break;
1478 case SYMBOL_TLSLE48:
1479 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1480 (dest, imm));
1481 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1482 (dest, dest, tp));
1483 break;
1484 default:
1485 gcc_unreachable ();
1488 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1489 return;
1492 case SYMBOL_TINY_GOT:
1493 emit_insn (gen_ldr_got_tiny (dest, imm));
1494 return;
1496 case SYMBOL_TINY_TLSIE:
1498 machine_mode mode = GET_MODE (dest);
1499 rtx tp = aarch64_load_tp (NULL);
1501 if (mode == ptr_mode)
1503 if (mode == DImode)
1504 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1505 else
1507 tp = gen_lowpart (mode, tp);
1508 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1511 else
1513 gcc_assert (mode == Pmode);
1514 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1517 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1518 return;
1521 default:
1522 gcc_unreachable ();
1526 /* Emit a move from SRC to DEST. Assume that the move expanders can
1527 handle all moves if !can_create_pseudo_p (). The distinction is
1528 important because, unlike emit_move_insn, the move expanders know
1529 how to force Pmode objects into the constant pool even when the
1530 constant pool address is not itself legitimate. */
1531 static rtx
1532 aarch64_emit_move (rtx dest, rtx src)
1534 return (can_create_pseudo_p ()
1535 ? emit_move_insn (dest, src)
1536 : emit_move_insn_1 (dest, src));
1539 /* Split a 128-bit move operation into two 64-bit move operations,
1540 taking care to handle partial overlap of register to register
1541 copies. Special cases are needed when moving between GP regs and
1542 FP regs. SRC can be a register, constant or memory; DST a register
1543 or memory. If either operand is memory it must not have any side
1544 effects. */
1545 void
1546 aarch64_split_128bit_move (rtx dst, rtx src)
1548 rtx dst_lo, dst_hi;
1549 rtx src_lo, src_hi;
1551 machine_mode mode = GET_MODE (dst);
1553 gcc_assert (mode == TImode || mode == TFmode);
1554 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1555 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1557 if (REG_P (dst) && REG_P (src))
1559 int src_regno = REGNO (src);
1560 int dst_regno = REGNO (dst);
1562 /* Handle FP <-> GP regs. */
1563 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1565 src_lo = gen_lowpart (word_mode, src);
1566 src_hi = gen_highpart (word_mode, src);
1568 if (mode == TImode)
1570 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1571 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1573 else
1575 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1576 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1578 return;
1580 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1582 dst_lo = gen_lowpart (word_mode, dst);
1583 dst_hi = gen_highpart (word_mode, dst);
1585 if (mode == TImode)
1587 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1588 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1590 else
1592 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1593 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1595 return;
1599 dst_lo = gen_lowpart (word_mode, dst);
1600 dst_hi = gen_highpart (word_mode, dst);
1601 src_lo = gen_lowpart (word_mode, src);
1602 src_hi = gen_highpart_mode (word_mode, mode, src);
1604 /* At most one pairing may overlap. */
1605 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1607 aarch64_emit_move (dst_hi, src_hi);
1608 aarch64_emit_move (dst_lo, src_lo);
1610 else
1612 aarch64_emit_move (dst_lo, src_lo);
1613 aarch64_emit_move (dst_hi, src_hi);
1617 bool
1618 aarch64_split_128bit_move_p (rtx dst, rtx src)
1620 return (! REG_P (src)
1621 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1624 /* Split a complex SIMD combine. */
1626 void
1627 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1629 machine_mode src_mode = GET_MODE (src1);
1630 machine_mode dst_mode = GET_MODE (dst);
1632 gcc_assert (VECTOR_MODE_P (dst_mode));
1634 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1636 rtx (*gen) (rtx, rtx, rtx);
1638 switch (src_mode)
1640 case V8QImode:
1641 gen = gen_aarch64_simd_combinev8qi;
1642 break;
1643 case V4HImode:
1644 gen = gen_aarch64_simd_combinev4hi;
1645 break;
1646 case V2SImode:
1647 gen = gen_aarch64_simd_combinev2si;
1648 break;
1649 case V4HFmode:
1650 gen = gen_aarch64_simd_combinev4hf;
1651 break;
1652 case V2SFmode:
1653 gen = gen_aarch64_simd_combinev2sf;
1654 break;
1655 case DImode:
1656 gen = gen_aarch64_simd_combinedi;
1657 break;
1658 case DFmode:
1659 gen = gen_aarch64_simd_combinedf;
1660 break;
1661 default:
1662 gcc_unreachable ();
1665 emit_insn (gen (dst, src1, src2));
1666 return;
1670 /* Split a complex SIMD move. */
1672 void
1673 aarch64_split_simd_move (rtx dst, rtx src)
1675 machine_mode src_mode = GET_MODE (src);
1676 machine_mode dst_mode = GET_MODE (dst);
1678 gcc_assert (VECTOR_MODE_P (dst_mode));
1680 if (REG_P (dst) && REG_P (src))
1682 rtx (*gen) (rtx, rtx);
1684 gcc_assert (VECTOR_MODE_P (src_mode));
1686 switch (src_mode)
1688 case V16QImode:
1689 gen = gen_aarch64_split_simd_movv16qi;
1690 break;
1691 case V8HImode:
1692 gen = gen_aarch64_split_simd_movv8hi;
1693 break;
1694 case V4SImode:
1695 gen = gen_aarch64_split_simd_movv4si;
1696 break;
1697 case V2DImode:
1698 gen = gen_aarch64_split_simd_movv2di;
1699 break;
1700 case V8HFmode:
1701 gen = gen_aarch64_split_simd_movv8hf;
1702 break;
1703 case V4SFmode:
1704 gen = gen_aarch64_split_simd_movv4sf;
1705 break;
1706 case V2DFmode:
1707 gen = gen_aarch64_split_simd_movv2df;
1708 break;
1709 default:
1710 gcc_unreachable ();
1713 emit_insn (gen (dst, src));
1714 return;
1718 bool
1719 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1720 machine_mode ymode, rtx y)
1722 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1723 gcc_assert (r != NULL);
1724 return rtx_equal_p (x, r);
1728 static rtx
1729 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1731 if (can_create_pseudo_p ())
1732 return force_reg (mode, value);
1733 else
1735 x = aarch64_emit_move (x, value);
1736 return x;
1741 static rtx
1742 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1744 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1746 rtx high;
1747 /* Load the full offset into a register. This
1748 might be improvable in the future. */
1749 high = GEN_INT (offset);
1750 offset = 0;
1751 high = aarch64_force_temporary (mode, temp, high);
1752 reg = aarch64_force_temporary (mode, temp,
1753 gen_rtx_PLUS (mode, high, reg));
1755 return plus_constant (mode, reg, offset);
1758 static int
1759 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1760 machine_mode mode)
1762 int i;
1763 unsigned HOST_WIDE_INT val, val2, mask;
1764 int one_match, zero_match;
1765 int num_insns;
1767 val = INTVAL (imm);
1769 if (aarch64_move_imm (val, mode))
1771 if (generate)
1772 emit_insn (gen_rtx_SET (dest, imm));
1773 return 1;
1776 if ((val >> 32) == 0 || mode == SImode)
1778 if (generate)
1780 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1781 if (mode == SImode)
1782 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1783 GEN_INT ((val >> 16) & 0xffff)));
1784 else
1785 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1786 GEN_INT ((val >> 16) & 0xffff)));
1788 return 2;
1791 /* Remaining cases are all for DImode. */
1793 mask = 0xffff;
1794 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1795 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1796 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1797 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1799 if (zero_match != 2 && one_match != 2)
1801 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1802 For a 64-bit bitmask try whether changing 16 bits to all ones or
1803 zeroes creates a valid bitmask. To check any repeated bitmask,
1804 try using 16 bits from the other 32-bit half of val. */
1806 for (i = 0; i < 64; i += 16, mask <<= 16)
1808 val2 = val & ~mask;
1809 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1810 break;
1811 val2 = val | mask;
1812 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1813 break;
1814 val2 = val2 & ~mask;
1815 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1816 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1817 break;
1819 if (i != 64)
1821 if (generate)
1823 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1824 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1825 GEN_INT ((val >> i) & 0xffff)));
1827 return 2;
1831 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1832 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1833 otherwise skip zero bits. */
1835 num_insns = 1;
1836 mask = 0xffff;
1837 val2 = one_match > zero_match ? ~val : val;
1838 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1840 if (generate)
1841 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1842 ? (val | ~(mask << i))
1843 : (val & (mask << i)))));
1844 for (i += 16; i < 64; i += 16)
1846 if ((val2 & (mask << i)) == 0)
1847 continue;
1848 if (generate)
1849 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1850 GEN_INT ((val >> i) & 0xffff)));
1851 num_insns ++;
1854 return num_insns;
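/* As a concrete example of the code above: 0x1234000056780000 has two all-zero
   16-bit chunks, so zero_match == 2 and the bitmask/movk special case is
   skipped; the value is built as a MOVZ of 0x5678 shifted left by 16 followed
   by a MOVK of 0x1234 into the top 16 bits, 2 instructions in total.  */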
1858 void
1859 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1861 machine_mode mode = GET_MODE (dest);
1863 gcc_assert (mode == SImode || mode == DImode);
1865 /* Check on what type of symbol it is. */
1866 if (GET_CODE (imm) == SYMBOL_REF
1867 || GET_CODE (imm) == LABEL_REF
1868 || GET_CODE (imm) == CONST)
1870 rtx mem, base, offset;
1871 enum aarch64_symbol_type sty;
1873 /* If we have (const (plus symbol offset)), separate out the offset
1874 before we start classifying the symbol. */
1875 split_const (imm, &base, &offset);
1877 sty = aarch64_classify_symbol (base, offset);
1878 switch (sty)
1880 case SYMBOL_FORCE_TO_MEM:
1881 if (offset != const0_rtx
1882 && targetm.cannot_force_const_mem (mode, imm))
1884 gcc_assert (can_create_pseudo_p ());
1885 base = aarch64_force_temporary (mode, dest, base);
1886 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1887 aarch64_emit_move (dest, base);
1888 return;
1891 mem = force_const_mem (ptr_mode, imm);
1892 gcc_assert (mem);
1894 /* If we aren't generating PC relative literals, then
1895 we need to expand the literal pool access carefully.
1896 This is something that needs to be done in a number
1897 of places, so could well live as a separate function. */
1898 if (!aarch64_pcrelative_literal_loads)
1900 gcc_assert (can_create_pseudo_p ());
1901 base = gen_reg_rtx (ptr_mode);
1902 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1903 mem = gen_rtx_MEM (ptr_mode, base);
1906 if (mode != ptr_mode)
1907 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1909 emit_insn (gen_rtx_SET (dest, mem));
1911 return;
1913 case SYMBOL_SMALL_TLSGD:
1914 case SYMBOL_SMALL_TLSDESC:
1915 case SYMBOL_SMALL_TLSIE:
1916 case SYMBOL_SMALL_GOT_28K:
1917 case SYMBOL_SMALL_GOT_4G:
1918 case SYMBOL_TINY_GOT:
1919 case SYMBOL_TINY_TLSIE:
1920 if (offset != const0_rtx)
1922 gcc_assert(can_create_pseudo_p ());
1923 base = aarch64_force_temporary (mode, dest, base);
1924 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1925 aarch64_emit_move (dest, base);
1926 return;
1928 /* FALLTHRU */
1930 case SYMBOL_SMALL_ABSOLUTE:
1931 case SYMBOL_TINY_ABSOLUTE:
1932 case SYMBOL_TLSLE12:
1933 case SYMBOL_TLSLE24:
1934 case SYMBOL_TLSLE32:
1935 case SYMBOL_TLSLE48:
1936 aarch64_load_symref_appropriately (dest, imm, sty);
1937 return;
1939 default:
1940 gcc_unreachable ();
1944 if (!CONST_INT_P (imm))
1946 if (GET_CODE (imm) == HIGH)
1947 emit_insn (gen_rtx_SET (dest, imm));
1948 else
1950 rtx mem = force_const_mem (mode, imm);
1951 gcc_assert (mem);
1952 emit_insn (gen_rtx_SET (dest, mem));
1955 return;
1958 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
 1961 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold an
 1962 intermediate value if necessary.
1964 This function is sometimes used to adjust the stack pointer, so we must
1965 ensure that it can never cause transient stack deallocation by writing an
1966 invalid value into REGNUM. */
1968 static void
1969 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
1970 HOST_WIDE_INT delta, bool frame_related_p)
1972 HOST_WIDE_INT mdelta = abs_hwi (delta);
1973 rtx this_rtx = gen_rtx_REG (mode, regnum);
1974 rtx_insn *insn;
1976 /* Do nothing if mdelta is zero. */
1977 if (!mdelta)
1978 return;
 1980 /* We only need a single instruction if the offset fits into an add/sub. */
1981 if (aarch64_uimm12_shift (mdelta))
1983 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
1984 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1985 return;
 1988 /* We need two add/sub instructions, each one performing part of the
 1989 calculation. Don't do this if the addend can be loaded into a register with
 1990 a single instruction; in that case we prefer a move to a scratch register
 1991 followed by an addition. */
1992 if (mdelta < 0x1000000 && !aarch64_move_imm (delta, mode))
1994 HOST_WIDE_INT low_off = mdelta & 0xfff;
1996 low_off = delta < 0 ? -low_off : low_off;
1997 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
1998 RTX_FRAME_RELATED_P (insn) = frame_related_p;
1999 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2000 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2001 return;
2004 /* Otherwise use generic function to handle all other situations. */
2005 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2006 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (delta), true, mode);
2007 insn = emit_insn (gen_add2_insn (this_rtx, scratch_rtx));
2008 if (frame_related_p)
2010 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2011 rtx adj = plus_constant (mode, this_rtx, delta);
2012 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
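/* For instance, a delta of 0x100800 is neither a valid add/sub immediate nor
   a single-instruction move immediate, but it is below 0x1000000, so the code
   above splits it into two additions to the destination register: #0x800
   first, then #0x100000 (0x100 shifted left by 12).  */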
2016 static bool
2017 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2018 tree exp ATTRIBUTE_UNUSED)
2020 /* Currently, always true. */
2021 return true;
2024 /* Implement TARGET_PASS_BY_REFERENCE. */
2026 static bool
2027 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2028 machine_mode mode,
2029 const_tree type,
2030 bool named ATTRIBUTE_UNUSED)
2032 HOST_WIDE_INT size;
2033 machine_mode dummymode;
2034 int nregs;
2036 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2037 size = (mode == BLKmode && type)
2038 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2040 /* Aggregates are passed by reference based on their size. */
2041 if (type && AGGREGATE_TYPE_P (type))
2043 size = int_size_in_bytes (type);
 2046 /* Variable sized arguments are always passed by reference. */
2047 if (size < 0)
2048 return true;
2050 /* Can this be a candidate to be passed in fp/simd register(s)? */
2051 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2052 &dummymode, &nregs,
2053 NULL))
2054 return false;
2056 /* Arguments which are variable sized or larger than 2 registers are
2057 passed by reference unless they are a homogeneous floating-point
2058 aggregate. */
2059 return size > 2 * UNITS_PER_WORD;
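/* Illustrative outcomes of the rules above (LP64, so UNITS_PER_WORD is 8):

     int64_t                        ->  passed by value (one register)
     struct { int64_t a, b; }       ->  passed by value (16 bytes)
     struct { int64_t a, b, c; }    ->  passed by reference (24 bytes)
     struct { double a, b, c, d; }  ->  passed by value in fp/simd registers,
                                        despite being 32 bytes, because it
                                        is an HFA.  */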
2062 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2063 static bool
2064 aarch64_return_in_msb (const_tree valtype)
2066 machine_mode dummy_mode;
2067 int dummy_int;
2069 /* Never happens in little-endian mode. */
2070 if (!BYTES_BIG_ENDIAN)
2071 return false;
2073 /* Only composite types smaller than or equal to 16 bytes can
2074 be potentially returned in registers. */
2075 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2076 || int_size_in_bytes (valtype) <= 0
2077 || int_size_in_bytes (valtype) > 16)
2078 return false;
2080 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2081 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2082 is always passed/returned in the least significant bits of fp/simd
2083 register(s). */
2084 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2085 &dummy_mode, &dummy_int, NULL))
2086 return false;
2088 return true;
2091 /* Implement TARGET_FUNCTION_VALUE.
2092 Define how to find the value returned by a function. */
2094 static rtx
2095 aarch64_function_value (const_tree type, const_tree func,
2096 bool outgoing ATTRIBUTE_UNUSED)
2098 machine_mode mode;
2099 int unsignedp;
2100 int count;
2101 machine_mode ag_mode;
2103 mode = TYPE_MODE (type);
2104 if (INTEGRAL_TYPE_P (type))
2105 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2107 if (aarch64_return_in_msb (type))
2109 HOST_WIDE_INT size = int_size_in_bytes (type);
2111 if (size % UNITS_PER_WORD != 0)
2113 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2114 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2118 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2119 &ag_mode, &count, NULL))
2121 if (!aarch64_composite_type_p (type, mode))
2123 gcc_assert (count == 1 && mode == ag_mode);
2124 return gen_rtx_REG (mode, V0_REGNUM);
2126 else
2128 int i;
2129 rtx par;
2131 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2132 for (i = 0; i < count; i++)
2134 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2135 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2136 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2137 XVECEXP (par, 0, i) = tmp;
2139 return par;
2142 else
2143 return gen_rtx_REG (mode, R0_REGNUM);
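/* For example (illustrative), a return type of struct { float x, y; } is
   an HFA with two members, so the code above builds

     (parallel [(expr_list (reg:SF v0) (const_int 0))
                (expr_list (reg:SF v1) (const_int 4))])

   whereas a plain int is returned in w0, i.e. gen_rtx_REG (SImode, R0_REGNUM). */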
2146 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2147 Return true if REGNO is the number of a hard register in which the values
2148 of called function may come back. */
2150 static bool
2151 aarch64_function_value_regno_p (const unsigned int regno)
2153 /* A maximum of 16 bytes can be returned in the general registers. Examples
2154 of 16-byte return values are: 128-bit integers and 16-byte small
2155 structures (excluding homogeneous floating-point aggregates). */
2156 if (regno == R0_REGNUM || regno == R1_REGNUM)
2157 return true;
2159 /* Up to four fp/simd registers can return a function value, e.g. a
2160 homogeneous floating-point aggregate having four members. */
2161 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2162 return TARGET_FLOAT;
2164 return false;
2167 /* Implement TARGET_RETURN_IN_MEMORY.
2169 If the type T of the result of a function is such that
2170 void func (T arg)
2171 would require that arg be passed as a value in a register (or set of
2172 registers) according to the parameter passing rules, then the result
2173 is returned in the same registers as would be used for such an
2174 argument. */
2176 static bool
2177 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2179 HOST_WIDE_INT size;
2180 machine_mode ag_mode;
2181 int count;
2183 if (!AGGREGATE_TYPE_P (type)
2184 && TREE_CODE (type) != COMPLEX_TYPE
2185 && TREE_CODE (type) != VECTOR_TYPE)
2186 /* Simple scalar types are always returned in registers. */
2187 return false;
2189 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2190 type,
2191 &ag_mode,
2192 &count,
2193 NULL))
2194 return false;
2196 /* Types larger than 2 registers are returned in memory. */
2197 size = int_size_in_bytes (type);
2198 return (size < 0 || size > 2 * UNITS_PER_WORD);
2201 static bool
2202 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2203 const_tree type, int *nregs)
2205 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2206 return aarch64_vfp_is_call_or_return_candidate (mode,
2207 type,
2208 &pcum->aapcs_vfp_rmode,
2209 nregs,
2210 NULL);
2213 /* Given MODE and TYPE of a function argument, return the alignment in
2214 bits. The idea is to suppress any stronger alignment requested by
2215 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2216 This is a helper function for local use only. */
2218 static unsigned int
2219 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2221 if (!type)
2222 return GET_MODE_ALIGNMENT (mode);
2223 if (integer_zerop (TYPE_SIZE (type)))
2224 return 0;
2226 gcc_assert (TYPE_MODE (type) == mode);
2228 if (!AGGREGATE_TYPE_P (type))
2229 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2231 if (TREE_CODE (type) == ARRAY_TYPE)
2232 return TYPE_ALIGN (TREE_TYPE (type));
2234 unsigned int alignment = 0;
2236 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2237 alignment = std::max (alignment, DECL_ALIGN (field));
2239 return alignment;
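/* Illustrative natural alignments computed above:

     int                             ->  32 bits
     struct { long long x; int y; }  ->  64 bits  (maximum of the fields)
     __int128                        -> 128 bits  (triggers rule C.8 below,
                                                   which rounds the NGRN up
                                                   to an even number)  */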
2242 /* Layout a function argument according to the AAPCS64 rules. The rule
2243 numbers refer to the rule numbers in the AAPCS64. */
2245 static void
2246 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2247 const_tree type,
2248 bool named ATTRIBUTE_UNUSED)
2250 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2251 int ncrn, nvrn, nregs;
2252 bool allocate_ncrn, allocate_nvrn;
2253 HOST_WIDE_INT size;
2255 /* We need to do this once per argument. */
2256 if (pcum->aapcs_arg_processed)
2257 return;
2259 pcum->aapcs_arg_processed = true;
2261 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2262 size
2263 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2264 UNITS_PER_WORD);
2266 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2267 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2268 mode,
2269 type,
2270 &nregs);
2272 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2273 The following code thus handles passing by SIMD/FP registers first. */
2275 nvrn = pcum->aapcs_nvrn;
2277 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2278 and homogeneous short-vector aggregates (HVA). */
2279 if (allocate_nvrn)
2281 if (!TARGET_FLOAT)
2282 aarch64_err_no_fpadvsimd (mode, "argument");
2284 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2286 pcum->aapcs_nextnvrn = nvrn + nregs;
2287 if (!aarch64_composite_type_p (type, mode))
2289 gcc_assert (nregs == 1);
2290 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2292 else
2294 rtx par;
2295 int i;
2296 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2297 for (i = 0; i < nregs; i++)
2299 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2300 V0_REGNUM + nvrn + i);
2301 tmp = gen_rtx_EXPR_LIST
2302 (VOIDmode, tmp,
2303 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2304 XVECEXP (par, 0, i) = tmp;
2306 pcum->aapcs_reg = par;
2308 return;
2310 else
2312 /* C.3 NSRN is set to 8. */
2313 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2314 goto on_stack;
2318 ncrn = pcum->aapcs_ncrn;
2319 nregs = size / UNITS_PER_WORD;
2321 /* C6 - C9, though the sign and zero extension semantics are
2322 handled elsewhere. This is the case where the argument fits
2323 entirely in general registers. */
2324 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2326 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2328 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2330 /* C.8 if the argument has an alignment of 16 then the NGRN is
2331 rounded up to the next even number. */
2332 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2334 ++ncrn;
2335 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2337 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2338 A reg is still generated for it, but the caller should be smart
2339 enough not to use it. */
2340 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2342 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2344 else
2346 rtx par;
2347 int i;
2349 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2350 for (i = 0; i < nregs; i++)
2352 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2353 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2354 GEN_INT (i * UNITS_PER_WORD));
2355 XVECEXP (par, 0, i) = tmp;
2357 pcum->aapcs_reg = par;
2360 pcum->aapcs_nextncrn = ncrn + nregs;
2361 return;
2364 /* C.11 */
2365 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2367 /* The argument is passed on the stack; record the needed number of words for
2368 this argument and align the total size if necessary. */
2369 on_stack:
2370 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2371 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2372 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2373 16 / UNITS_PER_WORD);
2374 return;
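/* A worked example of the allocation above (illustrative; LP64, nothing
   allocated yet) for f (int a, __int128 b, double c, struct { float x, y,
   z, w; } d):

     a -> w0                      NGRN 0 -> 1
     b -> x2/x3                   16-byte alignment, so C.8 rounds NGRN up
                                  from 1 to 2 before allocating
     c -> v0                      fp/simd candidate, NSRN 0 -> 1
     d -> v1..v4 (as a PARALLEL)  HFA with four float members.  */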
2377 /* Implement TARGET_FUNCTION_ARG. */
2379 static rtx
2380 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2381 const_tree type, bool named)
2383 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2384 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2386 if (mode == VOIDmode)
2387 return NULL_RTX;
2389 aarch64_layout_arg (pcum_v, mode, type, named);
2390 return pcum->aapcs_reg;
2393 void
2394 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2395 const_tree fntype ATTRIBUTE_UNUSED,
2396 rtx libname ATTRIBUTE_UNUSED,
2397 const_tree fndecl ATTRIBUTE_UNUSED,
2398 unsigned n_named ATTRIBUTE_UNUSED)
2400 pcum->aapcs_ncrn = 0;
2401 pcum->aapcs_nvrn = 0;
2402 pcum->aapcs_nextncrn = 0;
2403 pcum->aapcs_nextnvrn = 0;
2404 pcum->pcs_variant = ARM_PCS_AAPCS64;
2405 pcum->aapcs_reg = NULL_RTX;
2406 pcum->aapcs_arg_processed = false;
2407 pcum->aapcs_stack_words = 0;
2408 pcum->aapcs_stack_size = 0;
2410 if (!TARGET_FLOAT
2411 && fndecl && TREE_PUBLIC (fndecl)
2412 && fntype && fntype != error_mark_node)
2414 const_tree type = TREE_TYPE (fntype);
2415 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2416 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2417 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2418 &mode, &nregs, NULL))
2419 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2421 return;
2424 static void
2425 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2426 machine_mode mode,
2427 const_tree type,
2428 bool named)
2430 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2431 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2433 aarch64_layout_arg (pcum_v, mode, type, named);
2434 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2435 != (pcum->aapcs_stack_words != 0));
2436 pcum->aapcs_arg_processed = false;
2437 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2438 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2439 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2440 pcum->aapcs_stack_words = 0;
2441 pcum->aapcs_reg = NULL_RTX;
2445 bool
2446 aarch64_function_arg_regno_p (unsigned regno)
2448 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2449 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2452 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2453 PARM_BOUNDARY bits of alignment, but will be given anything up
2454 to STACK_BOUNDARY bits if the type requires it. This makes sure
2455 that both before and after the layout of each argument, the Next
2456 Stacked Argument Address (NSAA) will have a minimum alignment of
2457 8 bytes. */
2459 static unsigned int
2460 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2462 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2464 if (alignment < PARM_BOUNDARY)
2465 alignment = PARM_BOUNDARY;
2466 if (alignment > STACK_BOUNDARY)
2467 alignment = STACK_BOUNDARY;
2468 return alignment;
2471 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2473 Return true if an argument passed on the stack should be padded upwards,
2474 i.e. if the least-significant byte of the stack slot has useful data.
2476 Small aggregate types are placed at the lowest memory address.
2478 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2480 bool
2481 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2483 /* On little-endian targets, the least significant byte of every stack
2484 argument is passed at the lowest byte address of the stack slot. */
2485 if (!BYTES_BIG_ENDIAN)
2486 return true;
2488 /* Otherwise, integral, floating-point and pointer types are padded downward:
2489 the least significant byte of a stack argument is passed at the highest
2490 byte address of the stack slot. */
2491 if (type
2492 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2493 || POINTER_TYPE_P (type))
2494 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2495 return false;
2497 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2498 return true;
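/* For example (illustrative), on a big-endian target a 3-byte struct
   passed on the stack is padded upward (its bytes occupy the lowest
   addresses of the 8-byte slot), while a short is padded downward and
   occupies the highest addresses of its slot.  */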
2501 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2503 It specifies padding for the last (and possibly the only)
2504 element of a block move between registers and memory. Assuming
2505 the block is in memory, padding upward means that the last
2506 element is padded after its most significant byte, while in
2507 downward padding the last element is padded on its least
2508 significant byte side.
2510 Small aggregates and small complex types are always padded
2511 upwards.
2513 We don't need to worry about homogeneous floating-point or
2514 short-vector aggregates; their move is not affected by the
2515 padding direction determined here. Regardless of endianness,
2516 each element of such an aggregate is put in the least
2517 significant bits of a fp/simd register.
2519 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2520 register has useful data, and return the opposite if the most
2521 significant byte does. */
2523 bool
2524 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2525 bool first ATTRIBUTE_UNUSED)
2528 /* Small composite types are always padded upward. */
2529 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2531 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2532 : GET_MODE_SIZE (mode));
2533 if (size < 2 * UNITS_PER_WORD)
2534 return true;
2537 /* Otherwise, use the default padding. */
2538 return !BYTES_BIG_ENDIAN;
2541 static machine_mode
2542 aarch64_libgcc_cmp_return_mode (void)
2544 return SImode;
2547 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2549 /* We use the 12-bit shifted immediate arithmetic instructions so values
2550 must be a multiple of (1 << 12), i.e. 4096. */
2551 #define ARITH_FACTOR 4096
2553 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2554 #error Cannot use simple address calculation for stack probing
2555 #endif
2557 /* The pair of scratch registers used for stack probing. */
2558 #define PROBE_STACK_FIRST_REG 9
2559 #define PROBE_STACK_SECOND_REG 10
2561 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2562 inclusive. These are offsets from the current stack pointer. */
2564 static void
2565 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2567 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2569 /* See the same assertion on PROBE_INTERVAL above. */
2570 gcc_assert ((first % ARITH_FACTOR) == 0);
2572 /* See if we have a constant small number of probes to generate. If so,
2573 that's the easy case. */
2574 if (size <= PROBE_INTERVAL)
2576 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2578 emit_set_insn (reg1,
2579 plus_constant (ptr_mode,
2580 stack_pointer_rtx, -(first + base)));
2581 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2584 /* The run-time loop is made up of 8 insns in the generic case while the
2585 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2586 else if (size <= 4 * PROBE_INTERVAL)
2588 HOST_WIDE_INT i, rem;
2590 emit_set_insn (reg1,
2591 plus_constant (ptr_mode,
2592 stack_pointer_rtx,
2593 -(first + PROBE_INTERVAL)));
2594 emit_stack_probe (reg1);
2596 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2597 it exceeds SIZE. If only two probes are needed, this will not
2598 generate any code. Then probe at FIRST + SIZE. */
2599 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2601 emit_set_insn (reg1,
2602 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2603 emit_stack_probe (reg1);
2606 rem = size - (i - PROBE_INTERVAL);
2607 if (rem > 256)
2609 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2611 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2612 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2614 else
2615 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2618 /* Otherwise, do the same as above, but in a loop. Note that we must be
2619 extra careful with variables wrapping around because we might be at
2620 the very top (or the very bottom) of the address space and we have
2621 to be able to handle this case properly; in particular, we use an
2622 equality test for the loop condition. */
2623 else
2625 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2627 /* Step 1: round SIZE to the previous multiple of the interval. */
2629 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2632 /* Step 2: compute initial and final value of the loop counter. */
2634 /* TEST_ADDR = SP + FIRST. */
2635 emit_set_insn (reg1,
2636 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2638 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2639 emit_set_insn (reg2,
2640 plus_constant (ptr_mode, stack_pointer_rtx,
2641 -(first + rounded_size)));
2644 /* Step 3: the loop
2648 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2649 probe at TEST_ADDR
2651 while (TEST_ADDR != LAST_ADDR)
2653 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2654 until it is equal to ROUNDED_SIZE. */
2656 if (ptr_mode == DImode)
2657 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2658 else
2659 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2662 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2663 that SIZE is equal to ROUNDED_SIZE. */
2665 if (size != rounded_size)
2667 HOST_WIDE_INT rem = size - rounded_size;
2669 if (rem > 256)
2671 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2673 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2674 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2676 else
2677 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2681 /* Make sure nothing is scheduled before we are done. */
2682 emit_insn (gen_blockage ());
2685 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2686 absolute addresses. */
2688 const char *
2689 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2691 static int labelno = 0;
2692 char loop_lab[32];
2693 rtx xops[2];
2695 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2697 /* Loop. */
2698 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2700 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2701 xops[0] = reg1;
2702 xops[1] = GEN_INT (PROBE_INTERVAL);
2703 output_asm_insn ("sub\t%0, %0, %1", xops);
2705 /* Probe at TEST_ADDR. */
2706 output_asm_insn ("str\txzr, [%0]", xops);
2708 /* Test if TEST_ADDR == LAST_ADDR. */
2709 xops[1] = reg2;
2710 output_asm_insn ("cmp\t%0, %1", xops);
2712 /* Branch. */
2713 fputs ("\tb.ne\t", asm_out_file);
2714 assemble_name_raw (asm_out_file, loop_lab);
2715 fputc ('\n', asm_out_file);
2717 return "";
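/* The loop above produces assembly along these lines (illustrative only;
   PROBE_INTERVAL == 4096 and hard registers 9/10 are the probing scratch
   registers chosen above):

	.LPSRL0:
	sub	x9, x9, 4096
	str	xzr, [x9]
	cmp	x9, x10
	b.ne	.LPSRL0  */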
2720 static bool
2721 aarch64_frame_pointer_required (void)
2723 /* In aarch64_override_options_after_change
2724 flag_omit_leaf_frame_pointer turns off the frame pointer by
2725 default. Turn it back on now if we do not have a leaf
2726 function. */
2727 if (flag_omit_leaf_frame_pointer
2728 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2729 return true;
2731 return false;
2734 /* Mark the registers that need to be saved by the callee and calculate
2735 the size of the callee-saved registers area and frame record (both FP
2736 and LR may be omitted). */
2737 static void
2738 aarch64_layout_frame (void)
2740 HOST_WIDE_INT offset = 0;
2741 int regno;
2743 if (reload_completed && cfun->machine->frame.laid_out)
2744 return;
2746 #define SLOT_NOT_REQUIRED (-2)
2747 #define SLOT_REQUIRED (-1)
2749 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2750 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2752 /* First mark all the registers that really need to be saved... */
2753 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2754 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2756 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2757 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2759 /* ... that includes the eh data registers (if needed)... */
2760 if (crtl->calls_eh_return)
2761 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2762 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2763 = SLOT_REQUIRED;
2765 /* ... and any callee saved register that dataflow says is live. */
2766 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2767 if (df_regs_ever_live_p (regno)
2768 && (regno == R30_REGNUM
2769 || !call_used_regs[regno]))
2770 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2772 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2773 if (df_regs_ever_live_p (regno)
2774 && !call_used_regs[regno])
2775 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2777 if (frame_pointer_needed)
2779 /* FP and LR are placed in the linkage record. */
2780 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2781 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2782 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2783 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2784 offset += 2 * UNITS_PER_WORD;
2787 /* Now assign stack slots for them. */
2788 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2789 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2791 cfun->machine->frame.reg_offset[regno] = offset;
2792 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2793 cfun->machine->frame.wb_candidate1 = regno;
2794 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2795 cfun->machine->frame.wb_candidate2 = regno;
2796 offset += UNITS_PER_WORD;
2799 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2800 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2802 cfun->machine->frame.reg_offset[regno] = offset;
2803 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2804 cfun->machine->frame.wb_candidate1 = regno;
2805 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2806 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2807 cfun->machine->frame.wb_candidate2 = regno;
2808 offset += UNITS_PER_WORD;
2811 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2813 cfun->machine->frame.saved_regs_size = offset;
2815 HOST_WIDE_INT varargs_and_saved_regs_size
2816 = offset + cfun->machine->frame.saved_varargs_size;
2818 cfun->machine->frame.hard_fp_offset
2819 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2820 STACK_BOUNDARY / BITS_PER_UNIT);
2822 cfun->machine->frame.frame_size
2823 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2824 + crtl->outgoing_args_size,
2825 STACK_BOUNDARY / BITS_PER_UNIT);
2827 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2829 cfun->machine->frame.initial_adjust = 0;
2830 cfun->machine->frame.final_adjust = 0;
2831 cfun->machine->frame.callee_adjust = 0;
2832 cfun->machine->frame.callee_offset = 0;
2834 HOST_WIDE_INT max_push_offset = 0;
2835 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2836 max_push_offset = 512;
2837 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2838 max_push_offset = 256;
2840 if (cfun->machine->frame.frame_size < max_push_offset
2841 && crtl->outgoing_args_size == 0)
2843 /* Simple, small frame with no outgoing arguments:
2844 stp reg1, reg2, [sp, -frame_size]!
2845 stp reg3, reg4, [sp, 16] */
2846 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2848 else if ((crtl->outgoing_args_size
2849 + cfun->machine->frame.saved_regs_size < 512)
2850 && !(cfun->calls_alloca
2851 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2853 /* Frame with small outgoing arguments:
2854 sub sp, sp, frame_size
2855 stp reg1, reg2, [sp, outgoing_args_size]
2856 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2857 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2858 cfun->machine->frame.callee_offset
2859 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2861 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2863 /* Frame with large outgoing arguments but a small local area:
2864 stp reg1, reg2, [sp, -hard_fp_offset]!
2865 stp reg3, reg4, [sp, 16]
2866 sub sp, sp, outgoing_args_size */
2867 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2868 cfun->machine->frame.final_adjust
2869 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2871 else if (!frame_pointer_needed
2872 && varargs_and_saved_regs_size < max_push_offset)
2874 /* Frame with large local area and outgoing arguments (this pushes the
2875 callee-saves first, followed by the locals and outgoing area):
2876 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2877 stp reg3, reg4, [sp, 16]
2878 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2879 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2880 cfun->machine->frame.final_adjust
2881 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2882 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2883 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2885 else
2887 /* Frame with large local area and outgoing arguments using frame pointer:
2888 sub sp, sp, hard_fp_offset
2889 stp x29, x30, [sp, 0]
2890 add x29, sp, 0
2891 stp reg3, reg4, [sp, 16]
2892 sub sp, sp, outgoing_args_size */
2893 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2894 cfun->machine->frame.final_adjust
2895 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2898 cfun->machine->frame.laid_out = true;
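/* A worked example of the layout above (illustrative): a function that
   needs a frame pointer, saves only x29/x30, has 16 bytes of locals and
   no outgoing arguments gets

     saved_regs_size = 16, hard_fp_offset = 32, frame_size = 32

   and, since frame_size < max_push_offset, falls into the first case:

     callee_adjust = 32     ("stp x29, x30, [sp, -32]!")
     initial_adjust = final_adjust = callee_offset = 0.  */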
2901 static bool
2902 aarch64_register_saved_on_entry (int regno)
2904 return cfun->machine->frame.reg_offset[regno] >= 0;
2907 static unsigned
2908 aarch64_next_callee_save (unsigned regno, unsigned limit)
2910 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2911 regno ++;
2912 return regno;
2915 static void
2916 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2917 HOST_WIDE_INT adjustment)
2919 rtx base_rtx = stack_pointer_rtx;
2920 rtx insn, reg, mem;
2922 reg = gen_rtx_REG (mode, regno);
2923 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2924 plus_constant (Pmode, base_rtx, -adjustment));
2925 mem = gen_rtx_MEM (mode, mem);
2927 insn = emit_move_insn (mem, reg);
2928 RTX_FRAME_RELATED_P (insn) = 1;
2931 static rtx
2932 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2933 HOST_WIDE_INT adjustment)
2935 switch (mode)
2937 case DImode:
2938 return gen_storewb_pairdi_di (base, base, reg, reg2,
2939 GEN_INT (-adjustment),
2940 GEN_INT (UNITS_PER_WORD - adjustment));
2941 case DFmode:
2942 return gen_storewb_pairdf_di (base, base, reg, reg2,
2943 GEN_INT (-adjustment),
2944 GEN_INT (UNITS_PER_WORD - adjustment));
2945 default:
2946 gcc_unreachable ();
2950 static void
2951 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
2953 rtx_insn *insn;
2954 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
2956 if (regno2 == INVALID_REGNUM)
2957 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
2959 rtx reg1 = gen_rtx_REG (mode, regno1);
2960 rtx reg2 = gen_rtx_REG (mode, regno2);
2962 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
2963 reg2, adjustment));
2964 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
2965 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
2966 RTX_FRAME_RELATED_P (insn) = 1;
2969 static rtx
2970 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
2971 HOST_WIDE_INT adjustment)
2973 switch (mode)
2975 case DImode:
2976 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
2977 GEN_INT (UNITS_PER_WORD));
2978 case DFmode:
2979 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
2980 GEN_INT (UNITS_PER_WORD));
2981 default:
2982 gcc_unreachable ();
2986 static void
2987 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
2988 rtx *cfi_ops)
2990 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
2991 rtx reg1 = gen_rtx_REG (mode, regno1);
2993 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
2995 if (regno2 == INVALID_REGNUM)
2997 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
2998 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
2999 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3001 else
3003 rtx reg2 = gen_rtx_REG (mode, regno2);
3004 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3005 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3006 reg2, adjustment));
3010 static rtx
3011 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3012 rtx reg2)
3014 switch (mode)
3016 case DImode:
3017 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3019 case DFmode:
3020 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3022 default:
3023 gcc_unreachable ();
3027 static rtx
3028 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3029 rtx mem2)
3031 switch (mode)
3033 case DImode:
3034 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3036 case DFmode:
3037 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3039 default:
3040 gcc_unreachable ();
3045 static void
3046 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3047 unsigned start, unsigned limit, bool skip_wb)
3049 rtx_insn *insn;
3050 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3051 ? gen_frame_mem : gen_rtx_MEM);
3052 unsigned regno;
3053 unsigned regno2;
3055 for (regno = aarch64_next_callee_save (start, limit);
3056 regno <= limit;
3057 regno = aarch64_next_callee_save (regno + 1, limit))
3059 rtx reg, mem;
3060 HOST_WIDE_INT offset;
3062 if (skip_wb
3063 && (regno == cfun->machine->frame.wb_candidate1
3064 || regno == cfun->machine->frame.wb_candidate2))
3065 continue;
3067 reg = gen_rtx_REG (mode, regno);
3068 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3069 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3070 offset));
3072 regno2 = aarch64_next_callee_save (regno + 1, limit);
3074 if (regno2 <= limit
3075 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3076 == cfun->machine->frame.reg_offset[regno2]))
3079 rtx reg2 = gen_rtx_REG (mode, regno2);
3080 rtx mem2;
3082 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3083 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3084 offset));
3085 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3086 reg2));
3088 /* The first part of a frame-related parallel insn is
3089 always assumed to be relevant to the frame
3090 calculations; subsequent parts are only
3091 frame-related if explicitly marked. */
3092 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3093 regno = regno2;
3095 else
3096 insn = emit_move_insn (mem, reg);
3098 RTX_FRAME_RELATED_P (insn) = 1;
3102 static void
3103 aarch64_restore_callee_saves (machine_mode mode,
3104 HOST_WIDE_INT start_offset, unsigned start,
3105 unsigned limit, bool skip_wb, rtx *cfi_ops)
3107 rtx base_rtx = stack_pointer_rtx;
3108 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3109 ? gen_frame_mem : gen_rtx_MEM);
3110 unsigned regno;
3111 unsigned regno2;
3112 HOST_WIDE_INT offset;
3114 for (regno = aarch64_next_callee_save (start, limit);
3115 regno <= limit;
3116 regno = aarch64_next_callee_save (regno + 1, limit))
3118 rtx reg, mem;
3120 if (skip_wb
3121 && (regno == cfun->machine->frame.wb_candidate1
3122 || regno == cfun->machine->frame.wb_candidate2))
3123 continue;
3125 reg = gen_rtx_REG (mode, regno);
3126 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3127 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3129 regno2 = aarch64_next_callee_save (regno + 1, limit);
3131 if (regno2 <= limit
3132 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3133 == cfun->machine->frame.reg_offset[regno2]))
3135 rtx reg2 = gen_rtx_REG (mode, regno2);
3136 rtx mem2;
3138 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3139 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3140 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3142 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3143 regno = regno2;
3145 else
3146 emit_move_insn (reg, mem);
3147 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3151 /* AArch64 stack frames generated by this compiler look like:
3153 +-------------------------------+
3155 | incoming stack arguments |
3157 +-------------------------------+
3158 | | <-- incoming stack pointer (aligned)
3159 | callee-allocated save area |
3160 | for register varargs |
3162 +-------------------------------+
3163 | local variables | <-- frame_pointer_rtx
3165 +-------------------------------+
3166 | padding0 | \
3167 +-------------------------------+ |
3168 | callee-saved registers | | frame.saved_regs_size
3169 +-------------------------------+ |
3170 | LR' | |
3171 +-------------------------------+ |
3172 | FP' | / <- hard_frame_pointer_rtx (aligned)
3173 +-------------------------------+
3174 | dynamic allocation |
3175 +-------------------------------+
3176 | padding |
3177 +-------------------------------+
3178 | outgoing stack arguments | <-- arg_pointer
3180 +-------------------------------+
3181 | | <-- stack_pointer_rtx (aligned)
3183 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3184 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3185 unchanged. */
3187 /* Generate the prologue instructions for entry into a function.
3188 Establish the stack frame by decreasing the stack pointer with a
3189 properly calculated size and, if necessary, create a frame record
3190 filled with the values of LR and previous frame pointer. The
3191 current FP is also set up if it is in use. */
3193 void
3194 aarch64_expand_prologue (void)
3196 aarch64_layout_frame ();
3198 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3199 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3200 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3201 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3202 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3203 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3204 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3205 rtx_insn *insn;
3207 if (flag_stack_usage_info)
3208 current_function_static_stack_size = frame_size;
3210 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3212 if (crtl->is_leaf && !cfun->calls_alloca)
3214 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3215 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3216 frame_size - STACK_CHECK_PROTECT);
3218 else if (frame_size > 0)
3219 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3222 aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, -initial_adjust, true);
3224 if (callee_adjust != 0)
3225 aarch64_push_regs (reg1, reg2, callee_adjust);
3227 if (frame_pointer_needed)
3229 if (callee_adjust == 0)
3230 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3231 R30_REGNUM, false);
3232 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3233 stack_pointer_rtx,
3234 GEN_INT (callee_offset)));
3235 RTX_FRAME_RELATED_P (insn) = 1;
3236 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3239 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3240 callee_adjust != 0 || frame_pointer_needed);
3241 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3242 callee_adjust != 0 || frame_pointer_needed);
3243 aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
3244 !frame_pointer_needed);
3247 /* Return TRUE if we can use a simple_return insn.
3249 This function checks whether the callee saved stack is empty, which
3250 means no restore actions are needed. The pro_and_epilogue pass will use
3251 this to check whether the shrink-wrapping optimization is feasible. */
3253 bool
3254 aarch64_use_return_insn_p (void)
3256 if (!reload_completed)
3257 return false;
3259 if (crtl->profile)
3260 return false;
3262 aarch64_layout_frame ();
3264 return cfun->machine->frame.frame_size == 0;
3267 /* Generate the epilogue instructions for returning from a function.
3268 This is almost exactly the reverse of the prologue sequence, except
3269 that we need to insert barriers to avoid scheduling loads that read
3270 from a deallocated stack, and we optimize the unwind records by
3271 emitting them all together if possible. */
3272 void
3273 aarch64_expand_epilogue (bool for_sibcall)
3275 aarch64_layout_frame ();
3277 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3278 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3279 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3280 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3281 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3282 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3283 rtx cfi_ops = NULL;
3284 rtx_insn *insn;
3286 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3287 bool need_barrier_p = (get_frame_size ()
3288 + cfun->machine->frame.saved_varargs_size) != 0;
3290 /* Emit a barrier to prevent loads from a deallocated stack. */
3291 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca)
3293 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3294 need_barrier_p = false;
3297 /* Restore the stack pointer from the frame pointer if it may not
3298 be the same as the stack pointer. */
3299 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3301 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3302 hard_frame_pointer_rtx,
3303 GEN_INT (-callee_offset)));
3304 /* If writeback is used when restoring callee-saves, the CFA
3305 is restored on the instruction doing the writeback. */
3306 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3308 else
3309 aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, final_adjust, true);
3311 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3312 callee_adjust != 0, &cfi_ops);
3313 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3314 callee_adjust != 0, &cfi_ops);
3316 if (need_barrier_p)
3317 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3319 if (callee_adjust != 0)
3320 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3322 if (callee_adjust != 0 || initial_adjust > 65536)
3324 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3325 insn = get_last_insn ();
3326 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3327 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3328 RTX_FRAME_RELATED_P (insn) = 1;
3329 cfi_ops = NULL;
3332 aarch64_add_constant (Pmode, SP_REGNUM, IP0_REGNUM, initial_adjust, true);
3334 if (cfi_ops)
3336 /* Emit delayed restores and reset the CFA to be SP. */
3337 insn = get_last_insn ();
3338 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3339 REG_NOTES (insn) = cfi_ops;
3340 RTX_FRAME_RELATED_P (insn) = 1;
3343 /* Stack adjustment for exception handler. */
3344 if (crtl->calls_eh_return)
3346 /* We need to unwind the stack by the offset computed by
3347 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3348 to be SP; letting the CFA move during this adjustment
3349 is just as correct as retaining the CFA from the body
3350 of the function. Therefore, do nothing special. */
3351 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3354 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3355 if (!for_sibcall)
3356 emit_jump_insn (ret_rtx);
3359 /* Return the place to copy the exception unwinding return address to.
3360 This will probably be a stack slot, but could (in theory) be the
3361 return register. */
3363 aarch64_final_eh_return_addr (void)
3365 HOST_WIDE_INT fp_offset;
3367 aarch64_layout_frame ();
3369 fp_offset = cfun->machine->frame.frame_size
3370 - cfun->machine->frame.hard_fp_offset;
3372 if (cfun->machine->frame.reg_offset[LR_REGNUM] < 0)
3373 return gen_rtx_REG (DImode, LR_REGNUM);
3375 /* DSE and CSELIB do not detect an alias between sp+k1 and fp+k2. This can
3376 result in a store to save LR introduced by builtin_eh_return () being
3377 incorrectly deleted because the alias is not detected.
3378 So in the calculation of the address to copy the exception unwinding
3379 return address to, we note 2 cases.
3380 If FP is needed and the fp_offset is 0, it means that SP = FP and hence
3381 we return a SP-relative location since all the addresses are SP-relative
3382 in this case. This prevents the store from being optimized away.
3383 If the fp_offset is not 0, then the addresses will be FP-relative and
3384 therefore we return a FP-relative location. */
3386 if (frame_pointer_needed)
3388 if (fp_offset)
3389 return gen_frame_mem (DImode,
3390 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3391 else
3392 return gen_frame_mem (DImode,
3393 plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD));
3396 /* If FP is not needed, we calculate the location of LR, which would be
3397 at the top of the saved registers block. */
3399 return gen_frame_mem (DImode,
3400 plus_constant (Pmode,
3401 stack_pointer_rtx,
3402 fp_offset
3403 + cfun->machine->frame.saved_regs_size
3404 - 2 * UNITS_PER_WORD));
3407 /* Output code to add DELTA to the first argument, and then jump
3408 to FUNCTION. Used for C++ multiple inheritance. */
3409 static void
3410 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3411 HOST_WIDE_INT delta,
3412 HOST_WIDE_INT vcall_offset,
3413 tree function)
3415 /* The this pointer is always in x0. Note that this differs from
3416 Arm where the this pointer may be bumped to r1 if r0 is required
3417 to return a pointer to an aggregate. On AArch64 a result value
3418 pointer will be in x8. */
3419 int this_regno = R0_REGNUM;
3420 rtx this_rtx, temp0, temp1, addr, funexp;
3421 rtx_insn *insn;
3423 reload_completed = 1;
3424 emit_note (NOTE_INSN_PROLOGUE_END);
3426 if (vcall_offset == 0)
3427 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
3428 else
3430 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3432 this_rtx = gen_rtx_REG (Pmode, this_regno);
3433 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3434 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3436 addr = this_rtx;
3437 if (delta != 0)
3439 if (delta >= -256 && delta < 256)
3440 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3441 plus_constant (Pmode, this_rtx, delta));
3442 else
3443 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta, false);
3446 if (Pmode == ptr_mode)
3447 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3448 else
3449 aarch64_emit_move (temp0,
3450 gen_rtx_ZERO_EXTEND (Pmode,
3451 gen_rtx_MEM (ptr_mode, addr)));
3453 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3454 addr = plus_constant (Pmode, temp0, vcall_offset);
3455 else
3457 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3458 Pmode);
3459 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3462 if (Pmode == ptr_mode)
3463 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3464 else
3465 aarch64_emit_move (temp1,
3466 gen_rtx_SIGN_EXTEND (Pmode,
3467 gen_rtx_MEM (ptr_mode, addr)));
3469 emit_insn (gen_add2_insn (this_rtx, temp1));
3472 /* Generate a tail call to the target function. */
3473 if (!TREE_USED (function))
3475 assemble_external (function);
3476 TREE_USED (function) = 1;
3478 funexp = XEXP (DECL_RTL (function), 0);
3479 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3480 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3481 SIBLING_CALL_P (insn) = 1;
3483 insn = get_insns ();
3484 shorten_branches (insn);
3485 final_start_function (insn, file, 1);
3486 final (insn, file, 1);
3487 final_end_function ();
3489 /* Stop pretending to be a post-reload pass. */
3490 reload_completed = 0;
3493 static bool
3494 aarch64_tls_referenced_p (rtx x)
3496 if (!TARGET_HAVE_TLS)
3497 return false;
3498 subrtx_iterator::array_type array;
3499 FOR_EACH_SUBRTX (iter, array, x, ALL)
3501 const_rtx x = *iter;
3502 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3503 return true;
3504 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3505 TLS offsets, not real symbol references. */
3506 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3507 iter.skip_subrtxes ();
3509 return false;
3513 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3514 a left shift of 0 or 12 bits. */
3515 bool
3516 aarch64_uimm12_shift (HOST_WIDE_INT val)
3518 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3519 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
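/* For example (illustrative), 0xabc and 0xfff000 satisfy this test
   (shift 0 and shift 12 respectively), whereas 0x1001 does not, since
   its set bits straddle the two 12-bit fields.  */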
3524 /* Return true if val is an immediate that can be loaded into a
3525 register by a MOVZ instruction. */
3526 static bool
3527 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3529 if (GET_MODE_SIZE (mode) > 4)
3531 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3532 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3533 return 1;
3535 else
3537 /* Ignore sign extension. */
3538 val &= (HOST_WIDE_INT) 0xffffffff;
3540 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3541 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
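/* For example (illustrative), 0x1234, 0x12340000 and, for 64-bit modes,
   0x0123000000000000 are all MOVZ immediates (a single 16-bit group of
   set bits at an aligned position), whereas 0x12345 is not.  */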
3544 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3546 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3548 0x0000000100000001ull,
3549 0x0001000100010001ull,
3550 0x0101010101010101ull,
3551 0x1111111111111111ull,
3552 0x5555555555555555ull,
3556 /* Return true if val is a valid bitmask immediate. */
3558 bool
3559 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3561 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3562 int bits;
3564 /* Check for a single sequence of one bits and return quickly if so.
3565 The special cases of all ones and all zeroes return false. */
3566 val = (unsigned HOST_WIDE_INT) val_in;
3567 tmp = val + (val & -val);
3569 if (tmp == (tmp & -tmp))
3570 return (val + 1) > 1;
3572 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3573 if (mode == SImode)
3574 val = (val << 32) | (val & 0xffffffff);
3576 /* Invert if the immediate doesn't start with a zero bit - this means we
3577 only need to search for sequences of one bits. */
3578 if (val & 1)
3579 val = ~val;
3581 /* Find the first set bit and set tmp to val with the first sequence of one
3582 bits removed. Return success if there is a single sequence of ones. */
3583 first_one = val & -val;
3584 tmp = val & (val + first_one);
3586 if (tmp == 0)
3587 return true;
3589 /* Find the next set bit and compute the difference in bit position. */
3590 next_one = tmp & -tmp;
3591 bits = clz_hwi (first_one) - clz_hwi (next_one);
3592 mask = val ^ tmp;
3594 /* Check the bit position difference is a power of 2, and that the first
3595 sequence of one bits fits within 'bits' bits. */
3596 if ((mask >> bits) != 0 || bits != (bits & -bits))
3597 return false;
3599 /* Check the sequence of one bits is repeated 64/bits times. */
3600 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
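/* For example (illustrative), 0x00ff00ff00ff00ff (an 8-bit run of ones
   repeated every 16 bits) and 0x0003fffc (a single contiguous run of
   ones) are valid bitmask immediates, whereas 0, ~0 and 0x12345 are
   not.  */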
3604 /* Return true if val is an immediate that can be loaded into a
3605 register in a single instruction. */
3606 bool
3607 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
3609 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
3610 return 1;
3611 return aarch64_bitmask_imm (val, mode);
3614 static bool
3615 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
3617 rtx base, offset;
3619 if (GET_CODE (x) == HIGH)
3620 return true;
3622 split_const (x, &base, &offset);
3623 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
3625 if (aarch64_classify_symbol (base, offset)
3626 != SYMBOL_FORCE_TO_MEM)
3627 return true;
3628 else
3629 /* Avoid generating a 64-bit relocation in ILP32; leave it
3630 to aarch64_expand_mov_immediate to handle properly. */
3631 return mode != ptr_mode;
3634 return aarch64_tls_referenced_p (x);
3637 /* Implement TARGET_CASE_VALUES_THRESHOLD.
3638 The expansion for a table switch is quite expensive due to the number
3639 of instructions, the table lookup and the hard-to-predict indirect jump.
3640 When optimizing for speed with -O3 enabled, use the per-core tuning if
3641 set; otherwise use tables for more than 16 cases as a tradeoff between size and
3642 performance. When optimizing for size, use the default setting. */
3644 static unsigned int
3645 aarch64_case_values_threshold (void)
3647 /* Use the specified limit for the number of cases before using jump
3648 tables at higher optimization levels. */
3649 if (optimize > 2
3650 && selected_cpu->tune->max_case_values != 0)
3651 return selected_cpu->tune->max_case_values;
3652 else
3653 return optimize_size ? default_case_values_threshold () : 17;
3656 /* Return true if register REGNO is a valid index register.
3657 STRICT_P is true if REG_OK_STRICT is in effect. */
3659 bool
3660 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
3662 if (!HARD_REGISTER_NUM_P (regno))
3664 if (!strict_p)
3665 return true;
3667 if (!reg_renumber)
3668 return false;
3670 regno = reg_renumber[regno];
3672 return GP_REGNUM_P (regno);
3675 /* Return true if register REGNO is a valid base register.
3676 STRICT_P is true if REG_OK_STRICT is in effect. */
3678 bool
3679 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
3681 if (!HARD_REGISTER_NUM_P (regno))
3683 if (!strict_p)
3684 return true;
3686 if (!reg_renumber)
3687 return false;
3689 regno = reg_renumber[regno];
3692 /* The fake registers will be eliminated to either the stack or
3693 hard frame pointer, both of which are usually valid base registers.
3694 Reload deals with the cases where the eliminated form isn't valid. */
3695 return (GP_REGNUM_P (regno)
3696 || regno == SP_REGNUM
3697 || regno == FRAME_POINTER_REGNUM
3698 || regno == ARG_POINTER_REGNUM);
3701 /* Return true if X is a valid base register.
3702 STRICT_P is true if REG_OK_STRICT is in effect. */
3704 static bool
3705 aarch64_base_register_rtx_p (rtx x, bool strict_p)
3707 if (!strict_p && GET_CODE (x) == SUBREG)
3708 x = SUBREG_REG (x);
3710 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
3713 /* Return true if address offset is a valid index. If it is, fill in INFO
3714 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
3716 static bool
3717 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
3718 machine_mode mode, bool strict_p)
3720 enum aarch64_address_type type;
3721 rtx index;
3722 int shift;
3724 /* (reg:P) */
3725 if ((REG_P (x) || GET_CODE (x) == SUBREG)
3726 && GET_MODE (x) == Pmode)
3728 type = ADDRESS_REG_REG;
3729 index = x;
3730 shift = 0;
3732 /* (sign_extend:DI (reg:SI)) */
3733 else if ((GET_CODE (x) == SIGN_EXTEND
3734 || GET_CODE (x) == ZERO_EXTEND)
3735 && GET_MODE (x) == DImode
3736 && GET_MODE (XEXP (x, 0)) == SImode)
3738 type = (GET_CODE (x) == SIGN_EXTEND)
3739 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3740 index = XEXP (x, 0);
3741 shift = 0;
3743 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
3744 else if (GET_CODE (x) == MULT
3745 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3746 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3747 && GET_MODE (XEXP (x, 0)) == DImode
3748 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3749 && CONST_INT_P (XEXP (x, 1)))
3751 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3752 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3753 index = XEXP (XEXP (x, 0), 0);
3754 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3756 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
3757 else if (GET_CODE (x) == ASHIFT
3758 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
3759 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
3760 && GET_MODE (XEXP (x, 0)) == DImode
3761 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
3762 && CONST_INT_P (XEXP (x, 1)))
3764 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
3765 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3766 index = XEXP (XEXP (x, 0), 0);
3767 shift = INTVAL (XEXP (x, 1));
3769 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
3770 else if ((GET_CODE (x) == SIGN_EXTRACT
3771 || GET_CODE (x) == ZERO_EXTRACT)
3772 && GET_MODE (x) == DImode
3773 && GET_CODE (XEXP (x, 0)) == MULT
3774 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3775 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3777 type = (GET_CODE (x) == SIGN_EXTRACT)
3778 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3779 index = XEXP (XEXP (x, 0), 0);
3780 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3781 if (INTVAL (XEXP (x, 1)) != 32 + shift
3782 || INTVAL (XEXP (x, 2)) != 0)
3783 shift = -1;
3785 /* (and:DI (mult:DI (reg:DI) (const_int scale))
3786 (const_int 0xffffffff<<shift)) */
3787 else if (GET_CODE (x) == AND
3788 && GET_MODE (x) == DImode
3789 && GET_CODE (XEXP (x, 0)) == MULT
3790 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3791 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3792 && CONST_INT_P (XEXP (x, 1)))
3794 type = ADDRESS_REG_UXTW;
3795 index = XEXP (XEXP (x, 0), 0);
3796 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
3797 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3798 shift = -1;
3800 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
3801 else if ((GET_CODE (x) == SIGN_EXTRACT
3802 || GET_CODE (x) == ZERO_EXTRACT)
3803 && GET_MODE (x) == DImode
3804 && GET_CODE (XEXP (x, 0)) == ASHIFT
3805 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3806 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
3808 type = (GET_CODE (x) == SIGN_EXTRACT)
3809 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
3810 index = XEXP (XEXP (x, 0), 0);
3811 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3812 if (INTVAL (XEXP (x, 1)) != 32 + shift
3813 || INTVAL (XEXP (x, 2)) != 0)
3814 shift = -1;
3816 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
3817 (const_int 0xffffffff<<shift)) */
3818 else if (GET_CODE (x) == AND
3819 && GET_MODE (x) == DImode
3820 && GET_CODE (XEXP (x, 0)) == ASHIFT
3821 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
3822 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
3823 && CONST_INT_P (XEXP (x, 1)))
3825 type = ADDRESS_REG_UXTW;
3826 index = XEXP (XEXP (x, 0), 0);
3827 shift = INTVAL (XEXP (XEXP (x, 0), 1));
3828 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
3829 shift = -1;
3831 /* (mult:P (reg:P) (const_int scale)) */
3832 else if (GET_CODE (x) == MULT
3833 && GET_MODE (x) == Pmode
3834 && GET_MODE (XEXP (x, 0)) == Pmode
3835 && CONST_INT_P (XEXP (x, 1)))
3837 type = ADDRESS_REG_REG;
3838 index = XEXP (x, 0);
3839 shift = exact_log2 (INTVAL (XEXP (x, 1)));
3841 /* (ashift:P (reg:P) (const_int shift)) */
3842 else if (GET_CODE (x) == ASHIFT
3843 && GET_MODE (x) == Pmode
3844 && GET_MODE (XEXP (x, 0)) == Pmode
3845 && CONST_INT_P (XEXP (x, 1)))
3847 type = ADDRESS_REG_REG;
3848 index = XEXP (x, 0);
3849 shift = INTVAL (XEXP (x, 1));
3851 else
3852 return false;
3854 if (GET_CODE (index) == SUBREG)
3855 index = SUBREG_REG (index);
3857 if ((shift == 0 ||
3858 (shift > 0 && shift <= 3
3859 && (1 << shift) == GET_MODE_SIZE (mode)))
3860 && REG_P (index)
3861 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
3863 info->type = type;
3864 info->offset = index;
3865 info->shift = shift;
3866 return true;
3869 return false;
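/* For example (illustrative), the index of the SImode address
   [x0, w1, sxtw #2], i.e.

     (mult:DI (sign_extend:DI (reg:SI w1)) (const_int 4))

   is recognised above as ADDRESS_REG_SXTW with shift 2, which matches
   GET_MODE_SIZE (SImode).  */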
3872 bool
3873 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3875 return (offset >= -64 * GET_MODE_SIZE (mode)
3876 && offset < 64 * GET_MODE_SIZE (mode)
3877 && offset % GET_MODE_SIZE (mode) == 0);
3880 static inline bool
3881 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3882 HOST_WIDE_INT offset)
3884 return offset >= -256 && offset < 256;
3887 static inline bool
3888 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3890 return (offset >= 0
3891 && offset < 4096 * GET_MODE_SIZE (mode)
3892 && offset % GET_MODE_SIZE (mode) == 0);
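/* Worked examples of the three range checks above for a DImode (8-byte)
   access, purely illustrative:
     aarch64_offset_7bit_signed_scaled_p:  -512 .. 504, multiples of 8
     offset_9bit_signed_unscaled_p:        -256 .. 255, any alignment
     offset_12bit_unsigned_scaled_p:          0 .. 32760, multiples of 8  */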
3895 /* Return true if MODE is one of the modes for which we
3896 support LDP/STP operations. */
3898 static bool
3899 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
3901 return mode == SImode || mode == DImode
3902 || mode == SFmode || mode == DFmode
3903 || (aarch64_vector_mode_supported_p (mode)
3904 && GET_MODE_SIZE (mode) == 8);
3907 /* Return true if REGNO is a virtual pointer register, or an eliminable
3908 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
3909 include stack_pointer or hard_frame_pointer. */
3910 static bool
3911 virt_or_elim_regno_p (unsigned regno)
3913 return ((regno >= FIRST_VIRTUAL_REGISTER
3914 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
3915 || regno == FRAME_POINTER_REGNUM
3916 || regno == ARG_POINTER_REGNUM);
3919 /* Return true if X is a valid address for machine mode MODE. If it is,
3920 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
3921 effect. OUTER_CODE is PARALLEL for a load/store pair. */
3923 static bool
3924 aarch64_classify_address (struct aarch64_address_info *info,
3925 rtx x, machine_mode mode,
3926 RTX_CODE outer_code, bool strict_p)
3928 enum rtx_code code = GET_CODE (x);
3929 rtx op0, op1;
3931 /* On BE, we use load/store pair for all large int mode load/stores. */
3932 bool load_store_pair_p = (outer_code == PARALLEL
3933 || (BYTES_BIG_ENDIAN
3934 && aarch64_vect_struct_mode_p (mode)));
3936 bool allow_reg_index_p =
3937 !load_store_pair_p
3938 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
3939 && !aarch64_vect_struct_mode_p (mode);
3941 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
3942 REG addressing. */
3943 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
3944 && (code != POST_INC && code != REG))
3945 return false;
3947 switch (code)
3949 case REG:
3950 case SUBREG:
3951 info->type = ADDRESS_REG_IMM;
3952 info->base = x;
3953 info->offset = const0_rtx;
3954 return aarch64_base_register_rtx_p (x, strict_p);
3956 case PLUS:
3957 op0 = XEXP (x, 0);
3958 op1 = XEXP (x, 1);
3960 if (! strict_p
3961 && REG_P (op0)
3962 && virt_or_elim_regno_p (REGNO (op0))
3963 && CONST_INT_P (op1))
3965 info->type = ADDRESS_REG_IMM;
3966 info->base = op0;
3967 info->offset = op1;
3969 return true;
3972 if (GET_MODE_SIZE (mode) != 0
3973 && CONST_INT_P (op1)
3974 && aarch64_base_register_rtx_p (op0, strict_p))
3976 HOST_WIDE_INT offset = INTVAL (op1);
3978 info->type = ADDRESS_REG_IMM;
3979 info->base = op0;
3980 info->offset = op1;
3982 /* TImode and TFmode values are allowed in both pairs of X
3983 registers and individual Q registers. The available
3984 address modes are:
3985 X,X: 7-bit signed scaled offset
3986 Q: 9-bit signed offset
3987 We conservatively require an offset representable in either mode.
3988 When performing the check for pairs of X registers i.e. LDP/STP
3989 pass down DImode since that is the natural size of the LDP/STP
3990 instruction memory accesses. */
3991 if (mode == TImode || mode == TFmode)
3992 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
3993 && offset_9bit_signed_unscaled_p (mode, offset));
3995 /* A 7-bit offset check because OImode will emit an ldp/stp
3996 instruction (only big endian will get here).
3997 For ldp/stp instructions, the offset is scaled by the size of a
3998 single element of the pair. */
3999 if (mode == OImode)
4000 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4002 /* Three 9/12-bit offset checks because CImode will emit three
4003 ldr/str instructions (only big endian will get here). */
4004 if (mode == CImode)
4005 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4006 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4007 || offset_12bit_unsigned_scaled_p (V16QImode,
4008 offset + 32)));
4010 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4011 instructions (only big endian will get here). */
4012 if (mode == XImode)
4013 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4014 && aarch64_offset_7bit_signed_scaled_p (TImode,
4015 offset + 32));
4017 if (load_store_pair_p)
4018 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4019 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4020 else
4021 return (offset_9bit_signed_unscaled_p (mode, offset)
4022 || offset_12bit_unsigned_scaled_p (mode, offset));
4025 if (allow_reg_index_p)
4027 /* Look for base + (scaled/extended) index register. */
4028 if (aarch64_base_register_rtx_p (op0, strict_p)
4029 && aarch64_classify_index (info, op1, mode, strict_p))
4031 info->base = op0;
4032 return true;
4034 if (aarch64_base_register_rtx_p (op1, strict_p)
4035 && aarch64_classify_index (info, op0, mode, strict_p))
4037 info->base = op1;
4038 return true;
4042 return false;
4044 case POST_INC:
4045 case POST_DEC:
4046 case PRE_INC:
4047 case PRE_DEC:
4048 info->type = ADDRESS_REG_WB;
4049 info->base = XEXP (x, 0);
4050 info->offset = NULL_RTX;
4051 return aarch64_base_register_rtx_p (info->base, strict_p);
4053 case POST_MODIFY:
4054 case PRE_MODIFY:
4055 info->type = ADDRESS_REG_WB;
4056 info->base = XEXP (x, 0);
4057 if (GET_CODE (XEXP (x, 1)) == PLUS
4058 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4059 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4060 && aarch64_base_register_rtx_p (info->base, strict_p))
4062 HOST_WIDE_INT offset;
4063 info->offset = XEXP (XEXP (x, 1), 1);
4064 offset = INTVAL (info->offset);
4066 /* TImode and TFmode values are allowed in both pairs of X
4067 registers and individual Q registers. The available
4068 address modes are:
4069 X,X: 7-bit signed scaled offset
4070 Q: 9-bit signed offset
4071 We conservatively require an offset representable in either mode.  */
4073 if (mode == TImode || mode == TFmode)
4074 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4075 && offset_9bit_signed_unscaled_p (mode, offset));
4077 if (load_store_pair_p)
4078 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4079 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4080 else
4081 return offset_9bit_signed_unscaled_p (mode, offset);
4083 return false;
4085 case CONST:
4086 case SYMBOL_REF:
4087 case LABEL_REF:
4088 /* load literal: pc-relative constant pool entry. Only supported
4089 for SI mode or larger. */
4090 info->type = ADDRESS_SYMBOLIC;
4092 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4094 rtx sym, addend;
4096 split_const (x, &sym, &addend);
4097 return ((GET_CODE (sym) == LABEL_REF
4098 || (GET_CODE (sym) == SYMBOL_REF
4099 && CONSTANT_POOL_ADDRESS_P (sym)
4100 && aarch64_pcrelative_literal_loads)));
4102 return false;
4104 case LO_SUM:
4105 info->type = ADDRESS_LO_SUM;
4106 info->base = XEXP (x, 0);
4107 info->offset = XEXP (x, 1);
4108 if (allow_reg_index_p
4109 && aarch64_base_register_rtx_p (info->base, strict_p))
4111 rtx sym, offs;
4112 split_const (info->offset, &sym, &offs);
4113 if (GET_CODE (sym) == SYMBOL_REF
4114 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4116 /* The symbol and offset must be aligned to the access size. */
4117 unsigned int align;
4118 unsigned int ref_size;
4120 if (CONSTANT_POOL_ADDRESS_P (sym))
4121 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4122 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4124 tree exp = SYMBOL_REF_DECL (sym);
4125 align = TYPE_ALIGN (TREE_TYPE (exp));
4126 align = CONSTANT_ALIGNMENT (exp, align);
4128 else if (SYMBOL_REF_DECL (sym))
4129 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4130 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4131 && SYMBOL_REF_BLOCK (sym) != NULL)
4132 align = SYMBOL_REF_BLOCK (sym)->alignment;
4133 else
4134 align = BITS_PER_UNIT;
4136 ref_size = GET_MODE_SIZE (mode);
4137 if (ref_size == 0)
4138 ref_size = GET_MODE_SIZE (DImode);
4140 return ((INTVAL (offs) & (ref_size - 1)) == 0
4141 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4144 return false;
4146 default:
4147 return false;
4151 bool
4152 aarch64_symbolic_address_p (rtx x)
4154 rtx offset;
4156 split_const (x, &x, &offset);
4157 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4160 /* Classify the base of symbolic expression X. */
4162 enum aarch64_symbol_type
4163 aarch64_classify_symbolic_expression (rtx x)
4165 rtx offset;
4167 split_const (x, &x, &offset);
4168 return aarch64_classify_symbol (x, offset);
4172 /* Return TRUE if X is a legitimate address for accessing memory in
4173 mode MODE. */
4174 static bool
4175 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4177 struct aarch64_address_info addr;
4179 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4182 /* Return TRUE if X is a legitimate address for accessing memory in
4183 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4184 pair operation. */
4185 bool
4186 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4187 RTX_CODE outer_code, bool strict_p)
4189 struct aarch64_address_info addr;
4191 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4194 /* Return TRUE if rtx X is immediate constant 0.0 */
4195 bool
4196 aarch64_float_const_zero_rtx_p (rtx x)
4198 if (GET_MODE (x) == VOIDmode)
4199 return false;
4201 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4202 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4203 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4206 /* Return the fixed registers used for condition codes. */
4208 static bool
4209 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4211 *p1 = CC_REGNUM;
4212 *p2 = INVALID_REGNUM;
4213 return true;
4216 /* Emit call insn with PAT and do aarch64-specific handling. */
4218 void
4219 aarch64_emit_call_insn (rtx pat)
4221 rtx insn = emit_call_insn (pat);
4223 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4224 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4225 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4228 machine_mode
4229 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4231 /* All floating point compares return CCFP if it is an equality
4232 comparison, and CCFPE otherwise. */
4233 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4235 switch (code)
4237 case EQ:
4238 case NE:
4239 case UNORDERED:
4240 case ORDERED:
4241 case UNLT:
4242 case UNLE:
4243 case UNGT:
4244 case UNGE:
4245 case UNEQ:
4246 case LTGT:
4247 return CCFPmode;
4249 case LT:
4250 case LE:
4251 case GT:
4252 case GE:
4253 return CCFPEmode;
4255 default:
4256 gcc_unreachable ();
4260 /* Equality comparisons of short modes against zero can be performed
4261 using the TST instruction with the appropriate bitmask. */
4262 if (y == const0_rtx && REG_P (x)
4263 && (code == EQ || code == NE)
4264 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4265 return CC_NZmode;
4267 /* Similarly, comparisons of zero_extends from shorter modes can
4268 be performed using an ANDS with an immediate mask. */
4269 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4270 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4271 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4272 && (code == EQ || code == NE))
4273 return CC_NZmode;
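  /* The case above lets, for example, a comparison of
     (zero_extend:SI (reg:QI)) against zero be implemented as
     TST Wn, #255 (an ANDS that writes only the flags), so just the
     N and Z flags are meaningful and CC_NZmode is the right choice.  */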
4275 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4276 && y == const0_rtx
4277 && (code == EQ || code == NE || code == LT || code == GE)
4278 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4279 || GET_CODE (x) == NEG
4280 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4281 && CONST_INT_P (XEXP (x, 2)))))
4282 return CC_NZmode;
4284 /* A compare with a shifted operand. Because of canonicalization,
4285 the comparison will have to be swapped when we emit the assembly
4286 code. */
4287 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4288 && (REG_P (y) || GET_CODE (y) == SUBREG)
4289 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4290 || GET_CODE (x) == LSHIFTRT
4291 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4292 return CC_SWPmode;
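  /* For example (illustrative): (lt (ashift:DI (reg:DI x1) (const_int 3))
     (reg:DI x2)) gets CC_SWPmode; the compare is emitted with the operands
     swapped, as CMP x2, x1, lsl 3, and the LT condition is then printed
     as GT.  */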
4294 /* Similarly for a negated operand, but we can only do this for
4295 equalities. */
4296 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4297 && (REG_P (y) || GET_CODE (y) == SUBREG)
4298 && (code == EQ || code == NE)
4299 && GET_CODE (x) == NEG)
4300 return CC_Zmode;
4302 /* A test for unsigned overflow. */
4303 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4304 && code == NE
4305 && GET_CODE (x) == PLUS
4306 && GET_CODE (y) == ZERO_EXTEND)
4307 return CC_Cmode;
4309 /* For everything else, return CCmode. */
4310 return CCmode;
4313 static int
4314 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4317 aarch64_get_condition_code (rtx x)
4319 machine_mode mode = GET_MODE (XEXP (x, 0));
4320 enum rtx_code comp_code = GET_CODE (x);
4322 if (GET_MODE_CLASS (mode) != MODE_CC)
4323 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4324 return aarch64_get_condition_code_1 (mode, comp_code);
4327 static int
4328 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4330 switch (mode)
4332 case CCFPmode:
4333 case CCFPEmode:
4334 switch (comp_code)
4336 case GE: return AARCH64_GE;
4337 case GT: return AARCH64_GT;
4338 case LE: return AARCH64_LS;
4339 case LT: return AARCH64_MI;
4340 case NE: return AARCH64_NE;
4341 case EQ: return AARCH64_EQ;
4342 case ORDERED: return AARCH64_VC;
4343 case UNORDERED: return AARCH64_VS;
4344 case UNLT: return AARCH64_LT;
4345 case UNLE: return AARCH64_LE;
4346 case UNGT: return AARCH64_HI;
4347 case UNGE: return AARCH64_PL;
4348 default: return -1;
4350 break;
4352 case CCmode:
4353 switch (comp_code)
4355 case NE: return AARCH64_NE;
4356 case EQ: return AARCH64_EQ;
4357 case GE: return AARCH64_GE;
4358 case GT: return AARCH64_GT;
4359 case LE: return AARCH64_LE;
4360 case LT: return AARCH64_LT;
4361 case GEU: return AARCH64_CS;
4362 case GTU: return AARCH64_HI;
4363 case LEU: return AARCH64_LS;
4364 case LTU: return AARCH64_CC;
4365 default: return -1;
4367 break;
4369 case CC_SWPmode:
4370 switch (comp_code)
4372 case NE: return AARCH64_NE;
4373 case EQ: return AARCH64_EQ;
4374 case GE: return AARCH64_LE;
4375 case GT: return AARCH64_LT;
4376 case LE: return AARCH64_GE;
4377 case LT: return AARCH64_GT;
4378 case GEU: return AARCH64_LS;
4379 case GTU: return AARCH64_CC;
4380 case LEU: return AARCH64_CS;
4381 case LTU: return AARCH64_HI;
4382 default: return -1;
4384 break;
4386 case CC_NZmode:
4387 switch (comp_code)
4389 case NE: return AARCH64_NE;
4390 case EQ: return AARCH64_EQ;
4391 case GE: return AARCH64_PL;
4392 case LT: return AARCH64_MI;
4393 default: return -1;
4395 break;
4397 case CC_Zmode:
4398 switch (comp_code)
4400 case NE: return AARCH64_NE;
4401 case EQ: return AARCH64_EQ;
4402 default: return -1;
4404 break;
4406 case CC_Cmode:
4407 switch (comp_code)
4409 case NE: return AARCH64_CS;
4410 case EQ: return AARCH64_CC;
4411 default: return -1;
4413 break;
4415 default:
4416 return -1;
4417 break;
4420 return -1;
4423 bool
4424 aarch64_const_vec_all_same_in_range_p (rtx x,
4425 HOST_WIDE_INT minval,
4426 HOST_WIDE_INT maxval)
4428 HOST_WIDE_INT firstval;
4429 int count, i;
4431 if (GET_CODE (x) != CONST_VECTOR
4432 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4433 return false;
4435 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4436 if (firstval < minval || firstval > maxval)
4437 return false;
4439 count = CONST_VECTOR_NUNITS (x);
4440 for (i = 1; i < count; i++)
4441 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4442 return false;
4444 return true;
4447 bool
4448 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4450 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4454 /* N Z C V. */
4455 #define AARCH64_CC_V 1
4456 #define AARCH64_CC_C (1 << 1)
4457 #define AARCH64_CC_Z (1 << 2)
4458 #define AARCH64_CC_N (1 << 3)
4460 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4461 static const int aarch64_nzcv_codes[] =
4463 0, /* EQ, Z == 1. */
4464 AARCH64_CC_Z, /* NE, Z == 0. */
4465 0, /* CS, C == 1. */
4466 AARCH64_CC_C, /* CC, C == 0. */
4467 0, /* MI, N == 1. */
4468 AARCH64_CC_N, /* PL, N == 0. */
4469 0, /* VS, V == 1. */
4470 AARCH64_CC_V, /* VC, V == 0. */
4471 0, /* HI, C == 1 && Z == 0. */
4472 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4473 AARCH64_CC_V, /* GE, N == V. */
4474 0, /* LT, N != V. */
4475 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4476 0, /* LE, !(Z == 0 && N == V). */
4477 0, /* AL, Any. */
4478 0 /* NV, Any. */
4481 static void
4482 aarch64_print_operand (FILE *f, rtx x, int code)
4484 switch (code)
4486 /* An integer or symbol address without a preceding # sign. */
4487 case 'c':
4488 switch (GET_CODE (x))
4490 case CONST_INT:
4491 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4492 break;
4494 case SYMBOL_REF:
4495 output_addr_const (f, x);
4496 break;
4498 case CONST:
4499 if (GET_CODE (XEXP (x, 0)) == PLUS
4500 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4502 output_addr_const (f, x);
4503 break;
4505 /* Fall through. */
4507 default:
4508 output_operand_lossage ("Unsupported operand for code '%c'", code);
4510 break;
4512 case 'e':
4513 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4515 int n;
4517 if (!CONST_INT_P (x)
4518 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4520 output_operand_lossage ("invalid operand for '%%%c'", code);
4521 return;
4524 switch (n)
4526 case 3:
4527 fputc ('b', f);
4528 break;
4529 case 4:
4530 fputc ('h', f);
4531 break;
4532 case 5:
4533 fputc ('w', f);
4534 break;
4535 default:
4536 output_operand_lossage ("invalid operand for '%%%c'", code);
4537 return;
4540 break;
4542 case 'p':
4544 int n;
4546 /* Print N such that 2^N == X. */
4547 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4549 output_operand_lossage ("invalid operand for '%%%c'", code);
4550 return;
4553 asm_fprintf (f, "%d", n);
4555 break;
4557 case 'P':
4558 /* Print the number of non-zero bits in X (a const_int). */
4559 if (!CONST_INT_P (x))
4561 output_operand_lossage ("invalid operand for '%%%c'", code);
4562 return;
4565 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4566 break;
4568 case 'H':
4569 /* Print the higher numbered register of a pair (TImode) of regs. */
4570 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4572 output_operand_lossage ("invalid operand for '%%%c'", code);
4573 return;
4576 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4577 break;
4579 case 'M':
4580 case 'm':
4582 int cond_code;
4583 /* Print a condition (eq, ne, etc) or its inverse. */
4585 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
4586 if (x == const_true_rtx)
4588 if (code == 'M')
4589 fputs ("nv", f);
4590 return;
4593 if (!COMPARISON_P (x))
4595 output_operand_lossage ("invalid operand for '%%%c'", code);
4596 return;
4599 cond_code = aarch64_get_condition_code (x);
4600 gcc_assert (cond_code >= 0);
4601 if (code == 'M')
4602 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
4603 fputs (aarch64_condition_codes[cond_code], f);
4605 break;
4607 case 'b':
4608 case 'h':
4609 case 's':
4610 case 'd':
4611 case 'q':
4612 /* Print a scalar FP/SIMD register name. */
4613 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4615 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4616 return;
4618 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
4619 break;
4621 case 'S':
4622 case 'T':
4623 case 'U':
4624 case 'V':
4625 /* Print the first FP/SIMD register name in a list. */
4626 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4628 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4629 return;
4631 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
4632 break;
4634 case 'R':
4635 /* Print a scalar FP/SIMD register name + 1. */
4636 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
4638 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
4639 return;
4641 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
4642 break;
4644 case 'X':
4645 /* Print bottom 16 bits of integer constant in hex. */
4646 if (!CONST_INT_P (x))
4648 output_operand_lossage ("invalid operand for '%%%c'", code);
4649 return;
4651 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
4652 break;
4654 case 'w':
4655 case 'x':
4656 /* Print a general register name or the zero register (32-bit or
4657 64-bit). */
4658 if (x == const0_rtx
4659 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
4661 asm_fprintf (f, "%czr", code);
4662 break;
4665 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
4667 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
4668 break;
4671 if (REG_P (x) && REGNO (x) == SP_REGNUM)
4673 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
4674 break;
4677 /* Fall through */
4679 case 0:
4680 /* Print a normal operand, if it's a general register, then we
4681 assume DImode. */
4682 if (x == NULL)
4684 output_operand_lossage ("missing operand");
4685 return;
4688 switch (GET_CODE (x))
4690 case REG:
4691 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
4692 break;
4694 case MEM:
4695 output_address (GET_MODE (x), XEXP (x, 0));
4696 break;
4698 case CONST:
4699 case LABEL_REF:
4700 case SYMBOL_REF:
4701 output_addr_const (asm_out_file, x);
4702 break;
4704 case CONST_INT:
4705 asm_fprintf (f, "%wd", INTVAL (x));
4706 break;
4708 case CONST_VECTOR:
4709 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
4711 gcc_assert (
4712 aarch64_const_vec_all_same_in_range_p (x,
4713 HOST_WIDE_INT_MIN,
4714 HOST_WIDE_INT_MAX));
4715 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
4717 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
4719 fputc ('0', f);
4721 else
4722 gcc_unreachable ();
4723 break;
4725 case CONST_DOUBLE:
4726 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
4727 be getting CONST_DOUBLEs holding integers. */
4728 gcc_assert (GET_MODE (x) != VOIDmode);
4729 if (aarch64_float_const_zero_rtx_p (x))
4731 fputc ('0', f);
4732 break;
4734 else if (aarch64_float_const_representable_p (x))
4736 #define buf_size 20
4737 char float_buf[buf_size] = {'\0'};
4738 real_to_decimal_for_mode (float_buf,
4739 CONST_DOUBLE_REAL_VALUE (x),
4740 buf_size, buf_size,
4741 1, GET_MODE (x));
4742 asm_fprintf (asm_out_file, "%s", float_buf);
4743 break;
4744 #undef buf_size
4746 output_operand_lossage ("invalid constant");
4747 return;
4748 default:
4749 output_operand_lossage ("invalid operand");
4750 return;
4752 break;
4754 case 'A':
4755 if (GET_CODE (x) == HIGH)
4756 x = XEXP (x, 0);
4758 switch (aarch64_classify_symbolic_expression (x))
4760 case SYMBOL_SMALL_GOT_4G:
4761 asm_fprintf (asm_out_file, ":got:");
4762 break;
4764 case SYMBOL_SMALL_TLSGD:
4765 asm_fprintf (asm_out_file, ":tlsgd:");
4766 break;
4768 case SYMBOL_SMALL_TLSDESC:
4769 asm_fprintf (asm_out_file, ":tlsdesc:");
4770 break;
4772 case SYMBOL_SMALL_TLSIE:
4773 asm_fprintf (asm_out_file, ":gottprel:");
4774 break;
4776 case SYMBOL_TLSLE24:
4777 asm_fprintf (asm_out_file, ":tprel:");
4778 break;
4780 case SYMBOL_TINY_GOT:
4781 gcc_unreachable ();
4782 break;
4784 default:
4785 break;
4787 output_addr_const (asm_out_file, x);
4788 break;
4790 case 'L':
4791 switch (aarch64_classify_symbolic_expression (x))
4793 case SYMBOL_SMALL_GOT_4G:
4794 asm_fprintf (asm_out_file, ":lo12:");
4795 break;
4797 case SYMBOL_SMALL_TLSGD:
4798 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
4799 break;
4801 case SYMBOL_SMALL_TLSDESC:
4802 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
4803 break;
4805 case SYMBOL_SMALL_TLSIE:
4806 asm_fprintf (asm_out_file, ":gottprel_lo12:");
4807 break;
4809 case SYMBOL_TLSLE12:
4810 asm_fprintf (asm_out_file, ":tprel_lo12:");
4811 break;
4813 case SYMBOL_TLSLE24:
4814 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
4815 break;
4817 case SYMBOL_TINY_GOT:
4818 asm_fprintf (asm_out_file, ":got:");
4819 break;
4821 case SYMBOL_TINY_TLSIE:
4822 asm_fprintf (asm_out_file, ":gottprel:");
4823 break;
4825 default:
4826 break;
4828 output_addr_const (asm_out_file, x);
4829 break;
4831 case 'G':
4833 switch (aarch64_classify_symbolic_expression (x))
4835 case SYMBOL_TLSLE24:
4836 asm_fprintf (asm_out_file, ":tprel_hi12:");
4837 break;
4838 default:
4839 break;
4841 output_addr_const (asm_out_file, x);
4842 break;
4844 case 'k':
4846 HOST_WIDE_INT cond_code;
4847 /* Print nzcv. */
4849 if (!CONST_INT_P (x))
4851 output_operand_lossage ("invalid operand for '%%%c'", code);
4852 return;
4855 cond_code = INTVAL (x);
4856 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
4857 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
4859 break;
4861 default:
4862 output_operand_lossage ("invalid operand prefix '%%%c'", code);
4863 return;
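/* A few illustrative uses of the modifiers above (operand numbers and
   registers are arbitrary): with operand 0 being (reg:DI x3), "%x0" prints
   "x3" and "%w0" prints "w3"; with (const_int 16), "%p0" prints "4"; with
   (const_int 8), "%e0" prints "b"; for an EQ comparison, "%m0" prints "eq"
   while "%M0" prints the inverse, "ne".  */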
4867 static void
4868 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
4870 struct aarch64_address_info addr;
4872 if (aarch64_classify_address (&addr, x, mode, MEM, true))
4873 switch (addr.type)
4875 case ADDRESS_REG_IMM:
4876 if (addr.offset == const0_rtx)
4877 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
4878 else
4879 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
4880 INTVAL (addr.offset));
4881 return;
4883 case ADDRESS_REG_REG:
4884 if (addr.shift == 0)
4885 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
4886 reg_names [REGNO (addr.offset)]);
4887 else
4888 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
4889 reg_names [REGNO (addr.offset)], addr.shift);
4890 return;
4892 case ADDRESS_REG_UXTW:
4893 if (addr.shift == 0)
4894 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
4895 REGNO (addr.offset) - R0_REGNUM);
4896 else
4897 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
4898 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4899 return;
4901 case ADDRESS_REG_SXTW:
4902 if (addr.shift == 0)
4903 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
4904 REGNO (addr.offset) - R0_REGNUM);
4905 else
4906 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
4907 REGNO (addr.offset) - R0_REGNUM, addr.shift);
4908 return;
4910 case ADDRESS_REG_WB:
4911 switch (GET_CODE (x))
4913 case PRE_INC:
4914 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
4915 GET_MODE_SIZE (mode));
4916 return;
4917 case POST_INC:
4918 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
4919 GET_MODE_SIZE (mode));
4920 return;
4921 case PRE_DEC:
4922 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
4923 GET_MODE_SIZE (mode));
4924 return;
4925 case POST_DEC:
4926 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
4927 GET_MODE_SIZE (mode));
4928 return;
4929 case PRE_MODIFY:
4930 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
4931 INTVAL (addr.offset));
4932 return;
4933 case POST_MODIFY:
4934 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
4935 INTVAL (addr.offset));
4936 return;
4937 default:
4938 break;
4940 break;
4942 case ADDRESS_LO_SUM:
4943 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
4944 output_addr_const (f, addr.offset);
4945 asm_fprintf (f, "]");
4946 return;
4948 case ADDRESS_SYMBOLIC:
4949 break;
4952 output_addr_const (f, x);
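/* Illustrative outputs from the classification above (registers arbitrary):
   a reg+immediate DImode address prints as "[x0, 16]", a zero-extended and
   scaled index as "[x0, w1, uxtw 2]", a LO_SUM as "[x0, #:lo12:sym]", and a
   post-increment of an 8-byte access as "[x0], 8".  */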
4955 bool
4956 aarch64_label_mentioned_p (rtx x)
4958 const char *fmt;
4959 int i;
4961 if (GET_CODE (x) == LABEL_REF)
4962 return true;
4964 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
4965 referencing instruction, but they are constant offsets, not
4966 symbols. */
4967 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
4968 return false;
4970 fmt = GET_RTX_FORMAT (GET_CODE (x));
4971 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
4973 if (fmt[i] == 'E')
4975 int j;
4977 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
4978 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
4979 return 1;
4981 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
4982 return 1;
4985 return 0;
4988 /* Implement REGNO_REG_CLASS. */
4990 enum reg_class
4991 aarch64_regno_regclass (unsigned regno)
4993 if (GP_REGNUM_P (regno))
4994 return GENERAL_REGS;
4996 if (regno == SP_REGNUM)
4997 return STACK_REG;
4999 if (regno == FRAME_POINTER_REGNUM
5000 || regno == ARG_POINTER_REGNUM)
5001 return POINTER_REGS;
5003 if (FP_REGNUM_P (regno))
5004 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5006 return NO_REGS;
5009 static rtx
5010 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5012 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5013 where mask is selected by alignment and size of the offset.
5014 We try to pick as large a range for the offset as possible to
5015 maximize the chance of a CSE. However, for aligned addresses
5016 we limit the range to 4k so that structures with different sized
5017 elements are likely to use the same base. We need to be careful
5018 not to split a CONST for some forms of address expression, otherwise
5019 it will generate sub-optimal code. */
5021 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5023 rtx base = XEXP (x, 0);
5024 rtx offset_rtx = XEXP (x, 1);
5025 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5027 if (GET_CODE (base) == PLUS)
5029 rtx op0 = XEXP (base, 0);
5030 rtx op1 = XEXP (base, 1);
5032 /* Force any scaling into a temp for CSE. */
5033 op0 = force_reg (Pmode, op0);
5034 op1 = force_reg (Pmode, op1);
5036 /* Let the pointer register be in op0. */
5037 if (REG_POINTER (op1))
5038 std::swap (op0, op1);
5040 /* If the pointer is virtual or frame related, then we know that
5041 virtual register instantiation or register elimination is going
5042 to apply a second constant. We want the two constants folded
5043 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5044 if (virt_or_elim_regno_p (REGNO (op0)))
5046 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5047 NULL_RTX, true, OPTAB_DIRECT);
5048 return gen_rtx_PLUS (Pmode, base, op1);
5051 /* Otherwise, in order to encourage CSE (and thence loop strength
5052 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5053 base = expand_binop (Pmode, add_optab, op0, op1,
5054 NULL_RTX, true, OPTAB_DIRECT);
5055 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5058 /* Does it look like we'll need a load/store-pair operation? */
5059 HOST_WIDE_INT base_offset;
5060 if (GET_MODE_SIZE (mode) > 16
5061 || mode == TImode)
5062 base_offset = ((offset + 64 * GET_MODE_SIZE (mode))
5063 & ~((128 * GET_MODE_SIZE (mode)) - 1));
5064 /* For offsets that aren't a multiple of the access size, the limit is
5065 -256...255. */
5066 else if (offset & (GET_MODE_SIZE (mode) - 1))
5067 base_offset = (offset + 0x100) & ~0x1ff;
5068 else
5069 base_offset = offset & ~0xfff;
5071 if (base_offset != 0)
5073 base = plus_constant (Pmode, base, base_offset);
5074 base = force_operand (base, NULL_RTX);
5075 return plus_constant (Pmode, base, offset - base_offset);
5079 return x;
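/* Worked example (illustrative): legitimizing (plus (reg) (const_int 0x4008))
   for a DImode access takes the aligned path, so base_offset is
   0x4008 & ~0xfff == 0x4000; the code forces reg+0x4000 into a temporary and
   returns (plus tmp (const_int 8)), letting nearby accesses share and CSE
   the rebased pointer.  */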
5082 /* Return the reload icode required for a constant pool in mode. */
5083 static enum insn_code
5084 aarch64_constant_pool_reload_icode (machine_mode mode)
5086 switch (mode)
5088 case SFmode:
5089 return CODE_FOR_aarch64_reload_movcpsfdi;
5091 case DFmode:
5092 return CODE_FOR_aarch64_reload_movcpdfdi;
5094 case TFmode:
5095 return CODE_FOR_aarch64_reload_movcptfdi;
5097 case V8QImode:
5098 return CODE_FOR_aarch64_reload_movcpv8qidi;
5100 case V16QImode:
5101 return CODE_FOR_aarch64_reload_movcpv16qidi;
5103 case V4HImode:
5104 return CODE_FOR_aarch64_reload_movcpv4hidi;
5106 case V8HImode:
5107 return CODE_FOR_aarch64_reload_movcpv8hidi;
5109 case V2SImode:
5110 return CODE_FOR_aarch64_reload_movcpv2sidi;
5112 case V4SImode:
5113 return CODE_FOR_aarch64_reload_movcpv4sidi;
5115 case V2DImode:
5116 return CODE_FOR_aarch64_reload_movcpv2didi;
5118 case V2DFmode:
5119 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5121 default:
5122 gcc_unreachable ();
5125 gcc_unreachable ();
5127 static reg_class_t
5128 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5129 reg_class_t rclass,
5130 machine_mode mode,
5131 secondary_reload_info *sri)
5134 /* If we have to disable direct literal pool loads and stores because the
5135 function is too big, then we need a scratch register. */
5136 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5137 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5138 || targetm.vector_mode_supported_p (GET_MODE (x)))
5139 && !aarch64_pcrelative_literal_loads)
5141 sri->icode = aarch64_constant_pool_reload_icode (mode);
5142 return NO_REGS;
5145 /* Without the TARGET_SIMD instructions we cannot move a Q register
5146 to a Q register directly. We need a scratch. */
5147 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5148 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5149 && reg_class_subset_p (rclass, FP_REGS))
5151 if (mode == TFmode)
5152 sri->icode = CODE_FOR_aarch64_reload_movtf;
5153 else if (mode == TImode)
5154 sri->icode = CODE_FOR_aarch64_reload_movti;
5155 return NO_REGS;
5158 /* A TFmode or TImode memory access should be handled via an FP_REG
5159 because AArch64 has richer addressing modes for LDR/STR instructions
5160 than LDP/STP instructions. */
5161 if (TARGET_FLOAT && rclass == GENERAL_REGS
5162 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5163 return FP_REGS;
5165 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5166 return GENERAL_REGS;
5168 return NO_REGS;
5171 static bool
5172 aarch64_can_eliminate (const int from, const int to)
5174 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5175 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5177 if (frame_pointer_needed)
5179 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5180 return true;
5181 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5182 return false;
5183 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5184 && !cfun->calls_alloca)
5185 return true;
5186 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5187 return true;
5189 return false;
5191 else
5193 /* If we decided that we didn't need a leaf frame pointer but then used
5194 LR in the function, then we'll want a frame pointer after all, so
5195 prevent this elimination to ensure a frame pointer is used. */
5196 if (to == STACK_POINTER_REGNUM
5197 && flag_omit_leaf_frame_pointer
5198 && df_regs_ever_live_p (LR_REGNUM))
5199 return false;
5202 return true;
5205 HOST_WIDE_INT
5206 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5208 aarch64_layout_frame ();
5210 if (to == HARD_FRAME_POINTER_REGNUM)
5212 if (from == ARG_POINTER_REGNUM)
5213 return cfun->machine->frame.hard_fp_offset;
5215 if (from == FRAME_POINTER_REGNUM)
5216 return cfun->machine->frame.hard_fp_offset
5217 - cfun->machine->frame.locals_offset;
5220 if (to == STACK_POINTER_REGNUM)
5222 if (from == FRAME_POINTER_REGNUM)
5223 return cfun->machine->frame.frame_size
5224 - cfun->machine->frame.locals_offset;
5227 return cfun->machine->frame.frame_size;
5230 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5231 previous frame. */
5234 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5236 if (count != 0)
5237 return const0_rtx;
5238 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5242 static void
5243 aarch64_asm_trampoline_template (FILE *f)
5245 if (TARGET_ILP32)
5247 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5248 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5250 else
5252 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5253 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5255 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5256 assemble_aligned_integer (4, const0_rtx);
5257 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5258 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5261 static void
5262 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5264 rtx fnaddr, mem, a_tramp;
5265 const int tramp_code_sz = 16;
5267 /* Don't need to copy the trailing D-words, we fill those in below. */
5268 emit_block_move (m_tramp, assemble_trampoline_template (),
5269 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5270 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5271 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5272 if (GET_MODE (fnaddr) != ptr_mode)
5273 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5274 emit_move_insn (mem, fnaddr);
5276 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5277 emit_move_insn (mem, chain_value);
5279 /* XXX We should really define a "clear_cache" pattern and use
5280 gen_clear_cache(). */
5281 a_tramp = XEXP (m_tramp, 0);
5282 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5283 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5284 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5285 ptr_mode);
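/* The initialized trampoline is laid out roughly as: 16 bytes of code copied
   from the template above, then the target function address, then the static
   chain value, each POINTER_BYTES wide.  The template's PC-relative loads
   pick up those two words and the final BR jumps to the target.  */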
5288 static unsigned char
5289 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5291 switch (regclass)
5293 case CALLER_SAVE_REGS:
5294 case POINTER_REGS:
5295 case GENERAL_REGS:
5296 case ALL_REGS:
5297 case FP_REGS:
5298 case FP_LO_REGS:
5299 return
5300 aarch64_vector_mode_p (mode)
5301 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5302 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5303 case STACK_REG:
5304 return 1;
5306 case NO_REGS:
5307 return 0;
5309 default:
5310 break;
5312 gcc_unreachable ();
5315 static reg_class_t
5316 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5318 if (regclass == POINTER_REGS)
5319 return GENERAL_REGS;
5321 if (regclass == STACK_REG)
5323 if (REG_P(x)
5324 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5325 return regclass;
5327 return NO_REGS;
5330 /* If it's an integer immediate that MOVI can't handle, then
5331 FP_REGS is not an option, so we return NO_REGS instead. */
5332 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5333 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5334 return NO_REGS;
5336 /* Register elimination can result in a request for
5337 SP+constant->FP_REGS. We cannot support such operations, which
5338 use SP as a source and an FP_REG as a destination, so reject them
5339 right now. */
5340 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5342 rtx lhs = XEXP (x, 0);
5344 /* Look through a possible SUBREG introduced by ILP32. */
5345 if (GET_CODE (lhs) == SUBREG)
5346 lhs = SUBREG_REG (lhs);
5348 gcc_assert (REG_P (lhs));
5349 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5350 POINTER_REGS));
5351 return NO_REGS;
5354 return regclass;
5357 void
5358 aarch64_asm_output_labelref (FILE* f, const char *name)
5360 asm_fprintf (f, "%U%s", name);
5363 static void
5364 aarch64_elf_asm_constructor (rtx symbol, int priority)
5366 if (priority == DEFAULT_INIT_PRIORITY)
5367 default_ctor_section_asm_out_constructor (symbol, priority);
5368 else
5370 section *s;
5371 char buf[18];
5372 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5373 s = get_section (buf, SECTION_WRITE, NULL);
5374 switch_to_section (s);
5375 assemble_align (POINTER_SIZE);
5376 assemble_aligned_integer (POINTER_BYTES, symbol);
5380 static void
5381 aarch64_elf_asm_destructor (rtx symbol, int priority)
5383 if (priority == DEFAULT_INIT_PRIORITY)
5384 default_dtor_section_asm_out_destructor (symbol, priority);
5385 else
5387 section *s;
5388 char buf[18];
5389 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5390 s = get_section (buf, SECTION_WRITE, NULL);
5391 switch_to_section (s);
5392 assemble_align (POINTER_SIZE);
5393 assemble_aligned_integer (POINTER_BYTES, symbol);
5397 const char*
5398 aarch64_output_casesi (rtx *operands)
5400 char buf[100];
5401 char label[100];
5402 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5403 int index;
5404 static const char *const patterns[4][2] =
5407 "ldrb\t%w3, [%0,%w1,uxtw]",
5408 "add\t%3, %4, %w3, sxtb #2"
5411 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5412 "add\t%3, %4, %w3, sxth #2"
5415 "ldr\t%w3, [%0,%w1,uxtw #2]",
5416 "add\t%3, %4, %w3, sxtw #2"
5418 /* We assume that DImode is only generated when not optimizing and
5419 that we don't really need 64-bit address offsets. That would
5420 imply an object file with 8GB of code in a single function! */
5422 "ldr\t%w3, [%0,%w1,uxtw #2]",
5423 "add\t%3, %4, %w3, sxtw #2"
5427 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5429 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5431 gcc_assert (index >= 0 && index <= 3);
5433 /* Need to implement table size reduction, by changing the code below. */
5434 output_asm_insn (patterns[index][0], operands);
5435 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5436 snprintf (buf, sizeof (buf),
5437 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5438 output_asm_insn (buf, operands);
5439 output_asm_insn (patterns[index][1], operands);
5440 output_asm_insn ("br\t%3", operands);
5441 assemble_label (asm_out_file, label);
5442 return "";
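/* Illustrative output for a byte-sized dispatch table (index 0 above), with
   operands in x0/w1 and scratch registers x3/x4:
       ldrb    w3, [x0,w1,uxtw]
       adr     x4, .Lrtx<N>
       add     x3, x4, w3, sxtb #2
       br      x3
     .Lrtx<N>:
   The table entries are emitted relative to .Lrtx<N>, and the ADD scales the
   loaded entry by 4 (sxtb #2) before the indirect branch.  */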
5446 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5447 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5448 operator. */
5451 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5453 if (shift >= 0 && shift <= 3)
5455 int size;
5456 for (size = 8; size <= 32; size *= 2)
5458 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5459 if (mask == bits << shift)
5460 return size;
5463 return 0;
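/* Worked examples (illustrative): aarch64_uxt_size (2, 0x3fc) returns 8,
   since 0xff << 2 == 0x3fc (a UXTB operand shifted left by 2);
   aarch64_uxt_size (0, 0xffffffff) returns 32 (a plain UXTW); and
   aarch64_uxt_size (1, 0xff) returns 0 because 0xff is not 0xff << 1.  */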
5466 /* Constant pools are per-function only when PC-relative
5467 literal loads are enabled or we are using the large memory
5468 model. */
5470 static inline bool
5471 aarch64_can_use_per_function_literal_pools_p (void)
5473 return (aarch64_pcrelative_literal_loads
5474 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5477 static bool
5478 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5480 /* FIXME: In an ideal world this would work similarly
5481 to the logic in aarch64_select_rtx_section, but that
5482 breaks bootstrap in gccgo. For now we work around
5483 this by returning false here. */
5484 return false;
5487 /* Select appropriate section for constants depending
5488 on where we place literal pools. */
5490 static section *
5491 aarch64_select_rtx_section (machine_mode mode,
5492 rtx x,
5493 unsigned HOST_WIDE_INT align)
5495 if (aarch64_can_use_per_function_literal_pools_p ())
5496 return function_section (current_function_decl);
5498 return default_elf_select_rtx_section (mode, x, align);
5501 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5502 void
5503 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5504 HOST_WIDE_INT offset)
5506 /* When using per-function literal pools, we must ensure that any code
5507 section is aligned to the minimal instruction length, lest we get
5508 errors from the assembler about "unaligned instructions". */
5509 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5510 ASM_OUTPUT_ALIGN (f, 2);
5513 /* Costs. */
5515 /* Helper function for rtx cost calculation. Strip a shift expression
5516 from X. Returns the inner operand if successful, or the original
5517 expression on failure. */
5518 static rtx
5519 aarch64_strip_shift (rtx x)
5521 rtx op = x;
5523 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5524 we can convert both to ROR during final output. */
5525 if ((GET_CODE (op) == ASHIFT
5526 || GET_CODE (op) == ASHIFTRT
5527 || GET_CODE (op) == LSHIFTRT
5528 || GET_CODE (op) == ROTATERT
5529 || GET_CODE (op) == ROTATE)
5530 && CONST_INT_P (XEXP (op, 1)))
5531 return XEXP (op, 0);
5533 if (GET_CODE (op) == MULT
5534 && CONST_INT_P (XEXP (op, 1))
5535 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5536 return XEXP (op, 0);
5538 return x;
5541 /* Helper function for rtx cost calculation. Strip an extend
5542 expression from X. Returns the inner operand if successful, or the
5543 original expression on failure. We deal with a number of possible
5544 canonicalization variations here. */
5545 static rtx
5546 aarch64_strip_extend (rtx x)
5548 rtx op = x;
5550 /* Zero and sign extraction of a widened value. */
5551 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5552 && XEXP (op, 2) == const0_rtx
5553 && GET_CODE (XEXP (op, 0)) == MULT
5554 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5555 XEXP (op, 1)))
5556 return XEXP (XEXP (op, 0), 0);
5558 /* It can also be represented (for zero-extend) as an AND with an
5559 immediate. */
5560 if (GET_CODE (op) == AND
5561 && GET_CODE (XEXP (op, 0)) == MULT
5562 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
5563 && CONST_INT_P (XEXP (op, 1))
5564 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
5565 INTVAL (XEXP (op, 1))) != 0)
5566 return XEXP (XEXP (op, 0), 0);
5568 /* Now handle extended register, as this may also have an optional
5569 left shift by 1..4. */
5570 if (GET_CODE (op) == ASHIFT
5571 && CONST_INT_P (XEXP (op, 1))
5572 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
5573 op = XEXP (op, 0);
5575 if (GET_CODE (op) == ZERO_EXTEND
5576 || GET_CODE (op) == SIGN_EXTEND)
5577 op = XEXP (op, 0);
5579 if (op != x)
5580 return op;
5582 return x;
5585 /* Return true iff CODE is a shift supported in combination
5586 with arithmetic instructions. */
5588 static bool
5589 aarch64_shift_p (enum rtx_code code)
5591 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
5594 /* Helper function for rtx cost calculation. Calculate the cost of
5595 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
5596 Return the calculated cost of the expression, recursing manually in to
5597 operands where needed. */
5599 static int
5600 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
5602 rtx op0, op1;
5603 const struct cpu_cost_table *extra_cost
5604 = aarch64_tune_params.insn_extra_cost;
5605 int cost = 0;
5606 bool compound_p = (outer == PLUS || outer == MINUS);
5607 machine_mode mode = GET_MODE (x);
5609 gcc_checking_assert (code == MULT);
5611 op0 = XEXP (x, 0);
5612 op1 = XEXP (x, 1);
5614 if (VECTOR_MODE_P (mode))
5615 mode = GET_MODE_INNER (mode);
5617 /* Integer multiply/fma. */
5618 if (GET_MODE_CLASS (mode) == MODE_INT)
5620 /* The multiply will be canonicalized as a shift, cost it as such. */
5621 if (aarch64_shift_p (GET_CODE (x))
5622 || (CONST_INT_P (op1)
5623 && exact_log2 (INTVAL (op1)) > 0))
5625 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
5626 || GET_CODE (op0) == SIGN_EXTEND;
5627 if (speed)
5629 if (compound_p)
5631 if (REG_P (op1))
5632 /* ARITH + shift-by-register. */
5633 cost += extra_cost->alu.arith_shift_reg;
5634 else if (is_extend)
5635 /* ARITH + extended register. We don't have a cost field
5636 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
5637 cost += extra_cost->alu.extend_arith;
5638 else
5639 /* ARITH + shift-by-immediate. */
5640 cost += extra_cost->alu.arith_shift;
5642 else
5643 /* LSL (immediate). */
5644 cost += extra_cost->alu.shift;
5647 /* Strip extends as we will have costed them in the case above. */
5648 if (is_extend)
5649 op0 = aarch64_strip_extend (op0);
5651 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
5653 return cost;
5656 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
5657 compound and let the below cases handle it. After all, MNEG is a
5658 special-case alias of MSUB. */
5659 if (GET_CODE (op0) == NEG)
5661 op0 = XEXP (op0, 0);
5662 compound_p = true;
5665 /* Integer multiplies or FMAs have zero/sign extending variants. */
5666 if ((GET_CODE (op0) == ZERO_EXTEND
5667 && GET_CODE (op1) == ZERO_EXTEND)
5668 || (GET_CODE (op0) == SIGN_EXTEND
5669 && GET_CODE (op1) == SIGN_EXTEND))
5671 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
5672 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
5674 if (speed)
5676 if (compound_p)
5677 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
5678 cost += extra_cost->mult[0].extend_add;
5679 else
5680 /* MUL/SMULL/UMULL. */
5681 cost += extra_cost->mult[0].extend;
5684 return cost;
5687 /* This is either an integer multiply or a MADD. In both cases
5688 we want to recurse and cost the operands. */
5689 cost += rtx_cost (op0, mode, MULT, 0, speed);
5690 cost += rtx_cost (op1, mode, MULT, 1, speed);
5692 if (speed)
5694 if (compound_p)
5695 /* MADD/MSUB. */
5696 cost += extra_cost->mult[mode == DImode].add;
5697 else
5698 /* MUL. */
5699 cost += extra_cost->mult[mode == DImode].simple;
5702 return cost;
5704 else
5706 if (speed)
5708 /* Floating-point FMA/FMUL can also support negations of the
5709 operands, unless the rounding mode is upward or downward in
5710 which case FNMUL is different than FMUL with operand negation. */
5711 bool neg0 = GET_CODE (op0) == NEG;
5712 bool neg1 = GET_CODE (op1) == NEG;
5713 if (compound_p || !flag_rounding_math || (neg0 && neg1))
5715 if (neg0)
5716 op0 = XEXP (op0, 0);
5717 if (neg1)
5718 op1 = XEXP (op1, 0);
5721 if (compound_p)
5722 /* FMADD/FNMADD/FNMSUB/FMSUB. */
5723 cost += extra_cost->fp[mode == DFmode].fma;
5724 else
5725 /* FMUL/FNMUL. */
5726 cost += extra_cost->fp[mode == DFmode].mult;
5729 cost += rtx_cost (op0, mode, MULT, 0, speed);
5730 cost += rtx_cost (op1, mode, MULT, 1, speed);
5731 return cost;
5735 static int
5736 aarch64_address_cost (rtx x,
5737 machine_mode mode,
5738 addr_space_t as ATTRIBUTE_UNUSED,
5739 bool speed)
5741 enum rtx_code c = GET_CODE (x);
5742 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
5743 struct aarch64_address_info info;
5744 int cost = 0;
5745 info.shift = 0;
5747 if (!aarch64_classify_address (&info, x, mode, c, false))
5749 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
5751 /* This is a CONST or SYMBOL ref which will be split
5752 in a different way depending on the code model in use.
5753 Cost it through the generic infrastructure. */
5754 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
5755 /* Divide through by the cost of one instruction to
5756 bring it to the same units as the address costs. */
5757 cost_symbol_ref /= COSTS_N_INSNS (1);
5758 /* The cost is then the cost of preparing the address,
5759 followed by an immediate (possibly 0) offset. */
5760 return cost_symbol_ref + addr_cost->imm_offset;
5762 else
5764 /* This is most likely a jump table from a case
5765 statement. */
5766 return addr_cost->register_offset;
5770 switch (info.type)
5772 case ADDRESS_LO_SUM:
5773 case ADDRESS_SYMBOLIC:
5774 case ADDRESS_REG_IMM:
5775 cost += addr_cost->imm_offset;
5776 break;
5778 case ADDRESS_REG_WB:
5779 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
5780 cost += addr_cost->pre_modify;
5781 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
5782 cost += addr_cost->post_modify;
5783 else
5784 gcc_unreachable ();
5786 break;
5788 case ADDRESS_REG_REG:
5789 cost += addr_cost->register_offset;
5790 break;
5792 case ADDRESS_REG_SXTW:
5793 cost += addr_cost->register_sextend;
5794 break;
5796 case ADDRESS_REG_UXTW:
5797 cost += addr_cost->register_zextend;
5798 break;
5800 default:
5801 gcc_unreachable ();
5805 if (info.shift > 0)
5807 /* For the sake of calculating the cost of the shifted register
5808 component, we can treat same sized modes in the same way. */
5809 switch (GET_MODE_BITSIZE (mode))
5811 case 16:
5812 cost += addr_cost->addr_scale_costs.hi;
5813 break;
5815 case 32:
5816 cost += addr_cost->addr_scale_costs.si;
5817 break;
5819 case 64:
5820 cost += addr_cost->addr_scale_costs.di;
5821 break;
5823 /* We can't tell, or this is a 128-bit vector. */
5824 default:
5825 cost += addr_cost->addr_scale_costs.ti;
5826 break;
5830 return cost;
5833 /* Return the cost of a branch. If SPEED_P is true then the compiler is
5834 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
5835 to be taken. */
5838 aarch64_branch_cost (bool speed_p, bool predictable_p)
5840 /* When optimizing for speed, use the cost of unpredictable branches. */
5841 const struct cpu_branch_cost *branch_costs =
5842 aarch64_tune_params.branch_costs;
5844 if (!speed_p || predictable_p)
5845 return branch_costs->predictable;
5846 else
5847 return branch_costs->unpredictable;
5850 /* Return true if the RTX X in mode MODE is a zero or sign extract
5851 usable in an ADD or SUB (extended register) instruction. */
5852 static bool
5853 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
5855 /* Catch add with a sign extract.
5856 This is add_<optab><mode>_multp2. */
5857 if (GET_CODE (x) == SIGN_EXTRACT
5858 || GET_CODE (x) == ZERO_EXTRACT)
5860 rtx op0 = XEXP (x, 0);
5861 rtx op1 = XEXP (x, 1);
5862 rtx op2 = XEXP (x, 2);
5864 if (GET_CODE (op0) == MULT
5865 && CONST_INT_P (op1)
5866 && op2 == const0_rtx
5867 && CONST_INT_P (XEXP (op0, 1))
5868 && aarch64_is_extend_from_extract (mode,
5869 XEXP (op0, 1),
5870 op1))
5872 return true;
5875 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
5876 No shift. */
5877 else if (GET_CODE (x) == SIGN_EXTEND
5878 || GET_CODE (x) == ZERO_EXTEND)
5879 return REG_P (XEXP (x, 0));
5881 return false;
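/* Illustrative matches: (sign_extract:DI (mult:DI (reg:DI) (const_int 4))
   (const_int 34) (const_int 0)) is the canonical form of a sign-extended
   word shifted left by 2, as used by ADD Xd, Xn, Wm, SXTW #2, while
   (zero_extend:DI (reg:SI)) corresponds to the unshifted
   ADD Xd, Xn, Wm, UXTW form.  */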
5884 static bool
5885 aarch64_frint_unspec_p (unsigned int u)
5887 switch (u)
5889 case UNSPEC_FRINTZ:
5890 case UNSPEC_FRINTP:
5891 case UNSPEC_FRINTM:
5892 case UNSPEC_FRINTA:
5893 case UNSPEC_FRINTN:
5894 case UNSPEC_FRINTX:
5895 case UNSPEC_FRINTI:
5896 return true;
5898 default:
5899 return false;
5903 /* Return true iff X is an rtx that will match an extr instruction
5904 i.e. as described in the *extr<mode>5_insn family of patterns.
5905 OP0 and OP1 will be set to the operands of the shifts involved
5906 on success and will be NULL_RTX otherwise. */
5908 static bool
5909 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
5911 rtx op0, op1;
5912 machine_mode mode = GET_MODE (x);
5914 *res_op0 = NULL_RTX;
5915 *res_op1 = NULL_RTX;
5917 if (GET_CODE (x) != IOR)
5918 return false;
5920 op0 = XEXP (x, 0);
5921 op1 = XEXP (x, 1);
5923 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
5924 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
5926 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
5927 if (GET_CODE (op1) == ASHIFT)
5928 std::swap (op0, op1);
5930 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
5931 return false;
5933 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
5934 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
5936 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
5937 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
5939 *res_op0 = XEXP (op0, 0);
5940 *res_op1 = XEXP (op1, 0);
5941 return true;
5945 return false;
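/* Illustrative match: in DImode, (ior:DI (ashift:DI (reg x1) (const_int 16))
   (lshiftrt:DI (reg x2) (const_int 48))) satisfies 16 + 48 == 64, so the two
   inner registers are returned; the operation can be carried out as
   EXTR Xd, X1, X2, #48, i.e. an extract starting at bit 48 of the X1:X2
   concatenation.  */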
5948 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
5949 storing it in *COST. Result is true if the total cost of the operation
5950 has now been calculated. */
5951 static bool
5952 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
5954 rtx inner;
5955 rtx comparator;
5956 enum rtx_code cmpcode;
5958 if (COMPARISON_P (op0))
5960 inner = XEXP (op0, 0);
5961 comparator = XEXP (op0, 1);
5962 cmpcode = GET_CODE (op0);
5964 else
5966 inner = op0;
5967 comparator = const0_rtx;
5968 cmpcode = NE;
5971 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
5973 /* Conditional branch. */
5974 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
5975 return true;
5976 else
5978 if (cmpcode == NE || cmpcode == EQ)
5980 if (comparator == const0_rtx)
5982 /* TBZ/TBNZ/CBZ/CBNZ. */
5983 if (GET_CODE (inner) == ZERO_EXTRACT)
5984 /* TBZ/TBNZ. */
5985 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
5986 ZERO_EXTRACT, 0, speed);
5987 else
5988 /* CBZ/CBNZ. */
5989 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
5991 return true;
5994 else if (cmpcode == LT || cmpcode == GE)
5996 /* TBZ/TBNZ. */
5997 if (comparator == const0_rtx)
5998 return true;
6002 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6004 /* CCMP. */
6005 if (GET_CODE (op1) == COMPARE)
6007 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6008 if (XEXP (op1, 1) == const0_rtx)
6009 *cost += 1;
6010 if (speed)
6012 machine_mode mode = GET_MODE (XEXP (op1, 0));
6013 const struct cpu_cost_table *extra_cost
6014 = aarch64_tune_params.insn_extra_cost;
6016 if (GET_MODE_CLASS (mode) == MODE_INT)
6017 *cost += extra_cost->alu.arith;
6018 else
6019 *cost += extra_cost->fp[mode == DFmode].compare;
6021 return true;
6024 /* It's a conditional operation based on the status flags,
6025 so it must be some flavor of CSEL. */
6027 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6028 if (GET_CODE (op1) == NEG
6029 || GET_CODE (op1) == NOT
6030 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6031 op1 = XEXP (op1, 0);
6032 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6034 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6035 op1 = XEXP (op1, 0);
6036 op2 = XEXP (op2, 0);
6039 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6040 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6041 return true;
6044 /* We don't know what this is, cost all operands. */
6045 return false;
6048 /* Check whether X is a bitfield operation of the form shift + extend that
6049 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6050 operand to which the bitfield operation is applied. Otherwise return
6051 NULL_RTX. */
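/* For example (roughly, with illustrative operands),
   (zero_extend:SI (lshiftrt:HI (reg:HI x) (const_int 3))) maps to a UBFX
   and (sign_extend:SI (ashift:QI (reg:QI x) (const_int 2))) maps to an
   SBFIZ; in both cases the inner register x is returned.  */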
6053 static rtx
6054 aarch64_extend_bitfield_pattern_p (rtx x)
6056 rtx_code outer_code = GET_CODE (x);
6057 machine_mode outer_mode = GET_MODE (x);
6059 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6060 && outer_mode != SImode && outer_mode != DImode)
6061 return NULL_RTX;
6063 rtx inner = XEXP (x, 0);
6064 rtx_code inner_code = GET_CODE (inner);
6065 machine_mode inner_mode = GET_MODE (inner);
6066 rtx op = NULL_RTX;
6068 switch (inner_code)
6070 case ASHIFT:
6071 if (CONST_INT_P (XEXP (inner, 1))
6072 && (inner_mode == QImode || inner_mode == HImode))
6073 op = XEXP (inner, 0);
6074 break;
6075 case LSHIFTRT:
6076 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6077 && (inner_mode == QImode || inner_mode == HImode))
6078 op = XEXP (inner, 0);
6079 break;
6080 case ASHIFTRT:
6081 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6082 && (inner_mode == QImode || inner_mode == HImode))
6083 op = XEXP (inner, 0);
6084 break;
6085 default:
6086 break;
6089 return op;
6092 /* Return true if the mask and a shift amount from an RTX of the form
6093 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6094 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
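/* For example, in SImode MASK = 0x00ffff00 with SHFT_AMNT = 8 is accepted:
   (MASK >> 8) + 1 == 0x10000 is a power of two and no mask bits lie below
   the shift amount, so (x << 8) & 0x00ffff00 can become
   "ubfiz w0, w1, #8, #16".  A mask like 0x00ffff01 is rejected because it
   has bits set below the shift.  */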
6096 bool
6097 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6099 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6100 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6101 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6102 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
6105 /* Calculate the cost of calculating X, storing it in *COST. Result
6106 is true if the total cost of the operation has now been calculated. */
6107 static bool
6108 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6109 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6111 rtx op0, op1, op2;
6112 const struct cpu_cost_table *extra_cost
6113 = aarch64_tune_params.insn_extra_cost;
6114 int code = GET_CODE (x);
6116 /* By default, assume that everything has equivalent cost to the
6117 cheapest instruction. Any additional costs are applied as a delta
6118 above this default. */
6119 *cost = COSTS_N_INSNS (1);
6121 switch (code)
6123 case SET:
6124 /* The cost depends entirely on the operands to SET. */
6125 *cost = 0;
6126 op0 = SET_DEST (x);
6127 op1 = SET_SRC (x);
6129 switch (GET_CODE (op0))
6131 case MEM:
6132 if (speed)
6134 rtx address = XEXP (op0, 0);
6135 if (VECTOR_MODE_P (mode))
6136 *cost += extra_cost->ldst.storev;
6137 else if (GET_MODE_CLASS (mode) == MODE_INT)
6138 *cost += extra_cost->ldst.store;
6139 else if (mode == SFmode)
6140 *cost += extra_cost->ldst.storef;
6141 else if (mode == DFmode)
6142 *cost += extra_cost->ldst.stored;
6144 *cost +=
6145 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6146 0, speed));
6149 *cost += rtx_cost (op1, mode, SET, 1, speed);
6150 return true;
6152 case SUBREG:
6153 if (! REG_P (SUBREG_REG (op0)))
6154 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6156 /* Fall through. */
6157 case REG:
6158 /* The cost is one per vector-register copied. */
6159 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6161 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6162 / GET_MODE_SIZE (V4SImode);
6163 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6165 /* const0_rtx is in general free, but we will use an
6166 instruction to set a register to 0. */
6167 else if (REG_P (op1) || op1 == const0_rtx)
6169 /* The cost is 1 per register copied. */
6170 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6171 / UNITS_PER_WORD;
6172 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6174 else
6175 /* Cost is just the cost of the RHS of the set. */
6176 *cost += rtx_cost (op1, mode, SET, 1, speed);
6177 return true;
6179 case ZERO_EXTRACT:
6180 case SIGN_EXTRACT:
6181 /* Bit-field insertion. Strip any redundant widening of
6182 the RHS to meet the width of the target. */
6183 if (GET_CODE (op1) == SUBREG)
6184 op1 = SUBREG_REG (op1);
6185 if ((GET_CODE (op1) == ZERO_EXTEND
6186 || GET_CODE (op1) == SIGN_EXTEND)
6187 && CONST_INT_P (XEXP (op0, 1))
6188 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6189 >= INTVAL (XEXP (op0, 1))))
6190 op1 = XEXP (op1, 0);
6192 if (CONST_INT_P (op1))
6194 /* MOV immediate is assumed to always be cheap. */
6195 *cost = COSTS_N_INSNS (1);
6197 else
6199 /* BFM. */
6200 if (speed)
6201 *cost += extra_cost->alu.bfi;
6202 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6205 return true;
6207 default:
6208 /* We can't make sense of this, assume default cost. */
6209 *cost = COSTS_N_INSNS (1);
6210 return false;
6212 return false;
6214 case CONST_INT:
6215 /* If an instruction can incorporate a constant within the
6216 instruction, the instruction's expression avoids calling
6217 rtx_cost() on the constant. If rtx_cost() is called on a
6218 constant, then it is usually because the constant must be
6219 moved into a register by one or more instructions.
6221 The exception is constant 0, which can be expressed
6222 as XZR/WZR and is therefore free. The exception to this is
6223 if we have (set (reg) (const0_rtx)) in which case we must cost
6224 the move. However, we can catch that when we cost the SET, so
6225 we don't need to consider that here. */
6226 if (x == const0_rtx)
6227 *cost = 0;
6228 else
6230 /* To a first approximation, the cost of building any other
6231 constant is proportional to the number of instructions
6232 required to build that constant. This is true whether we
6233 are compiling for SPEED or otherwise. */
6234 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6235 (NULL_RTX, x, false, mode));
6237 return true;
6239 case CONST_DOUBLE:
6240 if (speed)
6242 /* mov[df,sf]_aarch64. */
6243 if (aarch64_float_const_representable_p (x))
6244 /* FMOV (scalar immediate). */
6245 *cost += extra_cost->fp[mode == DFmode].fpconst;
6246 else if (!aarch64_float_const_zero_rtx_p (x))
6248 /* This will be a load from memory. */
6249 if (mode == DFmode)
6250 *cost += extra_cost->ldst.loadd;
6251 else
6252 *cost += extra_cost->ldst.loadf;
6254 else
6255 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6256 or MOV v0.s[0], wzr - neither of which is modeled by the
6257 cost tables. Just use the default cost. */
6262 return true;
6264 case MEM:
6265 if (speed)
6267 /* For loads we want the base cost of a load, plus an
6268 approximation for the additional cost of the addressing
6269 mode. */
6270 rtx address = XEXP (x, 0);
6271 if (VECTOR_MODE_P (mode))
6272 *cost += extra_cost->ldst.loadv;
6273 else if (GET_MODE_CLASS (mode) == MODE_INT)
6274 *cost += extra_cost->ldst.load;
6275 else if (mode == SFmode)
6276 *cost += extra_cost->ldst.loadf;
6277 else if (mode == DFmode)
6278 *cost += extra_cost->ldst.loadd;
6280 *cost +=
6281 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6282 0, speed));
6285 return true;
6287 case NEG:
6288 op0 = XEXP (x, 0);
6290 if (VECTOR_MODE_P (mode))
6292 if (speed)
6294 /* FNEG. */
6295 *cost += extra_cost->vect.alu;
6297 return false;
6300 if (GET_MODE_CLASS (mode) == MODE_INT)
6302 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6303 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6305 /* CSETM. */
6306 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6307 return true;
6310 /* Cost this as SUB wzr, X. */
6311 op0 = CONST0_RTX (mode);
6312 op1 = XEXP (x, 0);
6313 goto cost_minus;
6316 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6318 /* Support (neg(fma...)) as a single instruction only if
6319 sign of zeros is unimportant. This matches the decision
6320 making in aarch64.md. */
6321 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6323 /* FNMADD. */
6324 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6325 return true;
6327 if (GET_CODE (op0) == MULT)
6329 /* FNMUL. */
6330 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6331 return true;
6333 if (speed)
6334 /* FNEG. */
6335 *cost += extra_cost->fp[mode == DFmode].neg;
6336 return false;
6339 return false;
6341 case CLRSB:
6342 case CLZ:
6343 if (speed)
6345 if (VECTOR_MODE_P (mode))
6346 *cost += extra_cost->vect.alu;
6347 else
6348 *cost += extra_cost->alu.clz;
6351 return false;
6353 case COMPARE:
6354 op0 = XEXP (x, 0);
6355 op1 = XEXP (x, 1);
6357 if (op1 == const0_rtx
6358 && GET_CODE (op0) == AND)
6360 x = op0;
6361 mode = GET_MODE (op0);
6362 goto cost_logic;
6365 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6367 /* TODO: A write to the CC flags possibly costs extra, this
6368 needs encoding in the cost tables. */
6370 mode = GET_MODE (op0);
6371 /* ANDS. */
6372 if (GET_CODE (op0) == AND)
6374 x = op0;
6375 goto cost_logic;
6378 if (GET_CODE (op0) == PLUS)
6380 /* ADDS (and CMN alias). */
6381 x = op0;
6382 goto cost_plus;
6385 if (GET_CODE (op0) == MINUS)
6387 /* SUBS. */
6388 x = op0;
6389 goto cost_minus;
6392 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6393 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6394 && CONST_INT_P (XEXP (op0, 2)))
6396 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6397 Handle it here directly rather than going to cost_logic
6398 since we know the immediate generated for the TST is valid
6399 so we can avoid creating an intermediate rtx for it only
6400 for costing purposes. */
6401 if (speed)
6402 *cost += extra_cost->alu.logical;
6404 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6405 ZERO_EXTRACT, 0, speed);
6406 return true;
6409 if (GET_CODE (op1) == NEG)
6411 /* CMN. */
6412 if (speed)
6413 *cost += extra_cost->alu.arith;
6415 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6416 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6417 return true;
6420 /* CMP.
6422 Compare can freely swap the order of operands, and
6423 canonicalization puts the more complex operation first.
6424 But the integer MINUS logic expects the shift/extend
6425 operation in op1. */
6426 if (! (REG_P (op0)
6427 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6429 op0 = XEXP (x, 1);
6430 op1 = XEXP (x, 0);
6432 goto cost_minus;
6435 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6437 /* FCMP. */
6438 if (speed)
6439 *cost += extra_cost->fp[mode == DFmode].compare;
6441 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6443 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6444 /* FCMP supports constant 0.0 for no extra cost. */
6445 return true;
6447 return false;
6450 if (VECTOR_MODE_P (mode))
6452 /* Vector compare. */
6453 if (speed)
6454 *cost += extra_cost->vect.alu;
6456 if (aarch64_float_const_zero_rtx_p (op1))
6458 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6459 cost. */
6460 return true;
6462 return false;
6464 return false;
6466 case MINUS:
6468 op0 = XEXP (x, 0);
6469 op1 = XEXP (x, 1);
6471 cost_minus:
6472 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6474 /* Detect valid immediates. */
6475 if ((GET_MODE_CLASS (mode) == MODE_INT
6476 || (GET_MODE_CLASS (mode) == MODE_CC
6477 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6478 && CONST_INT_P (op1)
6479 && aarch64_uimm12_shift (INTVAL (op1)))
6481 if (speed)
6482 /* SUB(S) (immediate). */
6483 *cost += extra_cost->alu.arith;
6484 return true;
6487 /* Look for SUB (extended register). */
6488 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6490 if (speed)
6491 *cost += extra_cost->alu.extend_arith;
6493 op1 = aarch64_strip_extend (op1);
6494 *cost += rtx_cost (op1, VOIDmode,
6495 (enum rtx_code) GET_CODE (op1), 0, speed);
6496 return true;
6499 rtx new_op1 = aarch64_strip_extend (op1);
6501 /* Cost this as an FMA-alike operation. */
6502 if ((GET_CODE (new_op1) == MULT
6503 || aarch64_shift_p (GET_CODE (new_op1)))
6504 && code != COMPARE)
6506 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6507 (enum rtx_code) code,
6508 speed);
6509 return true;
6512 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6514 if (speed)
6516 if (VECTOR_MODE_P (mode))
6518 /* Vector SUB. */
6519 *cost += extra_cost->vect.alu;
6521 else if (GET_MODE_CLASS (mode) == MODE_INT)
6523 /* SUB(S). */
6524 *cost += extra_cost->alu.arith;
6526 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6528 /* FSUB. */
6529 *cost += extra_cost->fp[mode == DFmode].addsub;
6532 return true;
6535 case PLUS:
6537 rtx new_op0;
6539 op0 = XEXP (x, 0);
6540 op1 = XEXP (x, 1);
6542 cost_plus:
6543 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6544 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6546 /* CSINC. */
6547 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6548 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6549 return true;
6552 if (GET_MODE_CLASS (mode) == MODE_INT
6553 && CONST_INT_P (op1)
6554 && aarch64_uimm12_shift (INTVAL (op1)))
6556 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6558 if (speed)
6559 /* ADD (immediate). */
6560 *cost += extra_cost->alu.arith;
6561 return true;
6564 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6566 /* Look for ADD (extended register). */
6567 if (aarch64_rtx_arith_op_extract_p (op0, mode))
6569 if (speed)
6570 *cost += extra_cost->alu.extend_arith;
6572 op0 = aarch64_strip_extend (op0);
6573 *cost += rtx_cost (op0, VOIDmode,
6574 (enum rtx_code) GET_CODE (op0), 0, speed);
6575 return true;
6578 /* Strip any extend, leave shifts behind as we will
6579 cost them through mult_cost. */
6580 new_op0 = aarch64_strip_extend (op0);
6582 if (GET_CODE (new_op0) == MULT
6583 || aarch64_shift_p (GET_CODE (new_op0)))
6585 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
6586 speed);
6587 return true;
6590 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
6592 if (speed)
6594 if (VECTOR_MODE_P (mode))
6596 /* Vector ADD. */
6597 *cost += extra_cost->vect.alu;
6599 else if (GET_MODE_CLASS (mode) == MODE_INT)
6601 /* ADD. */
6602 *cost += extra_cost->alu.arith;
6604 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6606 /* FADD. */
6607 *cost += extra_cost->fp[mode == DFmode].addsub;
6610 return true;
6613 case BSWAP:
6614 *cost = COSTS_N_INSNS (1);
6616 if (speed)
6618 if (VECTOR_MODE_P (mode))
6619 *cost += extra_cost->vect.alu;
6620 else
6621 *cost += extra_cost->alu.rev;
6623 return false;
6625 case IOR:
6626 if (aarch_rev16_p (x))
6628 *cost = COSTS_N_INSNS (1);
6630 if (speed)
6632 if (VECTOR_MODE_P (mode))
6633 *cost += extra_cost->vect.alu;
6634 else
6635 *cost += extra_cost->alu.rev;
6637 return true;
6640 if (aarch64_extr_rtx_p (x, &op0, &op1))
6642 *cost += rtx_cost (op0, mode, IOR, 0, speed);
6643 *cost += rtx_cost (op1, mode, IOR, 1, speed);
6644 if (speed)
6645 *cost += extra_cost->alu.shift;
6647 return true;
6649 /* Fall through. */
6650 case XOR:
6651 case AND:
6652 cost_logic:
6653 op0 = XEXP (x, 0);
6654 op1 = XEXP (x, 1);
6656 if (VECTOR_MODE_P (mode))
6658 if (speed)
6659 *cost += extra_cost->vect.alu;
6660 return true;
6663 if (code == AND
6664 && GET_CODE (op0) == MULT
6665 && CONST_INT_P (XEXP (op0, 1))
6666 && CONST_INT_P (op1)
6667 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
6668 INTVAL (op1)) != 0)
6670 /* This is a UBFM/SBFM. */
6671 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
6672 if (speed)
6673 *cost += extra_cost->alu.bfx;
6674 return true;
6677 if (GET_MODE_CLASS (mode) == MODE_INT)
6679 if (CONST_INT_P (op1))
6681 /* We have a mask + shift version of a UBFIZ
6682 i.e. the *andim_ashift<mode>_bfiz pattern. */
6683 if (GET_CODE (op0) == ASHIFT
6684 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
6685 XEXP (op0, 1)))
6687 *cost += rtx_cost (XEXP (op0, 0), mode,
6688 (enum rtx_code) code, 0, speed);
6689 if (speed)
6690 *cost += extra_cost->alu.bfx;
6692 return true;
6694 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
6696 /* We possibly get the immediate for free, this is not
6697 modelled. */
6698 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6699 if (speed)
6700 *cost += extra_cost->alu.logical;
6702 return true;
6705 else
6707 rtx new_op0 = op0;
6709 /* Handle ORN, EON, or BIC. */
6710 if (GET_CODE (op0) == NOT)
6711 op0 = XEXP (op0, 0);
6713 new_op0 = aarch64_strip_shift (op0);
6715 /* If we had a shift on op0 then this is a logical-shift-
6716 by-register/immediate operation. Otherwise, this is just
6717 a logical operation. */
6718 if (speed)
6720 if (new_op0 != op0)
6722 /* Shift by immediate. */
6723 if (CONST_INT_P (XEXP (op0, 1)))
6724 *cost += extra_cost->alu.log_shift;
6725 else
6726 *cost += extra_cost->alu.log_shift_reg;
6728 else
6729 *cost += extra_cost->alu.logical;
6732 /* In both cases we want to cost both operands. */
6733 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
6734 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
6736 return true;
6739 return false;
6741 case NOT:
6742 x = XEXP (x, 0);
6743 op0 = aarch64_strip_shift (x);
6745 if (VECTOR_MODE_P (mode))
6747 /* Vector NOT. */
6748 *cost += extra_cost->vect.alu;
6749 return false;
6752 /* MVN-shifted-reg. */
6753 if (op0 != x)
6755 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6757 if (speed)
6758 *cost += extra_cost->alu.log_shift;
6760 return true;
6762 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
6763 Handle the second form here taking care that 'a' in the above can
6764 be a shift. */
6765 else if (GET_CODE (op0) == XOR)
6767 rtx newop0 = XEXP (op0, 0);
6768 rtx newop1 = XEXP (op0, 1);
6769 rtx op0_stripped = aarch64_strip_shift (newop0);
6771 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
6772 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
6774 if (speed)
6776 if (op0_stripped != newop0)
6777 *cost += extra_cost->alu.log_shift;
6778 else
6779 *cost += extra_cost->alu.logical;
6782 return true;
6784 /* MVN. */
6785 if (speed)
6786 *cost += extra_cost->alu.logical;
6788 return false;
6790 case ZERO_EXTEND:
6792 op0 = XEXP (x, 0);
6793 /* If a value is written in SI mode, then zero extended to DI
6794 mode, the operation will in general be free as a write to
6795 a 'w' register implicitly zeroes the upper bits of an 'x'
6796 register. However, if this is
6798 (set (reg) (zero_extend (reg)))
6800 we must cost the explicit register move. */
6801 if (mode == DImode
6802 && GET_MODE (op0) == SImode
6803 && outer == SET)
6805 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
6807 /* If OP_COST is non-zero, then the cost of the zero extend
6808 is effectively the cost of the inner operation. Otherwise
6809 we have a MOV instruction and we take the cost from the MOV
6810 itself. This is true independently of whether we are
6811 optimizing for space or time. */
6812 if (op_cost)
6813 *cost = op_cost;
6815 return true;
6817 else if (MEM_P (op0))
6819 /* All loads can zero extend to any size for free. */
6820 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
6821 return true;
6824 op0 = aarch64_extend_bitfield_pattern_p (x);
6825 if (op0)
6827 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
6828 if (speed)
6829 *cost += extra_cost->alu.bfx;
6830 return true;
6833 if (speed)
6835 if (VECTOR_MODE_P (mode))
6837 /* UMOV. */
6838 *cost += extra_cost->vect.alu;
6840 else
6842 /* We generate an AND instead of UXTB/UXTH. */
6843 *cost += extra_cost->alu.logical;
6846 return false;
6848 case SIGN_EXTEND:
6849 if (MEM_P (XEXP (x, 0)))
6851 /* LDRSH. */
6852 if (speed)
6854 rtx address = XEXP (XEXP (x, 0), 0);
6855 *cost += extra_cost->ldst.load_sign_extend;
6857 *cost +=
6858 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6859 0, speed));
6861 return true;
6864 op0 = aarch64_extend_bitfield_pattern_p (x);
6865 if (op0)
6867 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
6868 if (speed)
6869 *cost += extra_cost->alu.bfx;
6870 return true;
6873 if (speed)
6875 if (VECTOR_MODE_P (mode))
6876 *cost += extra_cost->vect.alu;
6877 else
6878 *cost += extra_cost->alu.extend;
6880 return false;
6882 case ASHIFT:
6883 op0 = XEXP (x, 0);
6884 op1 = XEXP (x, 1);
6886 if (CONST_INT_P (op1))
6888 if (speed)
6890 if (VECTOR_MODE_P (mode))
6892 /* Vector shift (immediate). */
6893 *cost += extra_cost->vect.alu;
6895 else
6897 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
6898 aliases. */
6899 *cost += extra_cost->alu.shift;
6903 /* We can incorporate zero/sign extend for free. */
6904 if (GET_CODE (op0) == ZERO_EXTEND
6905 || GET_CODE (op0) == SIGN_EXTEND)
6906 op0 = XEXP (op0, 0);
6908 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
6909 return true;
6911 else
6913 if (speed)
6915 if (VECTOR_MODE_P (mode))
6917 /* Vector shift (register). */
6918 *cost += extra_cost->vect.alu;
6920 else
6922 /* LSLV. */
6923 *cost += extra_cost->alu.shift_reg;
6926 return false; /* All arguments need to be in registers. */
6929 case ROTATE:
6930 case ROTATERT:
6931 case LSHIFTRT:
6932 case ASHIFTRT:
6933 op0 = XEXP (x, 0);
6934 op1 = XEXP (x, 1);
6936 if (CONST_INT_P (op1))
6938 /* ASR (immediate) and friends. */
6939 if (speed)
6941 if (VECTOR_MODE_P (mode))
6942 *cost += extra_cost->vect.alu;
6943 else
6944 *cost += extra_cost->alu.shift;
6947 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
6948 return true;
6950 else
6953 /* ASR (register) and friends. */
6954 if (speed)
6956 if (VECTOR_MODE_P (mode))
6957 *cost += extra_cost->vect.alu;
6958 else
6959 *cost += extra_cost->alu.shift_reg;
6961 return false; /* All arguments need to be in registers. */
6964 case SYMBOL_REF:
6966 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
6967 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
6969 /* LDR. */
6970 if (speed)
6971 *cost += extra_cost->ldst.load;
6973 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
6974 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
6976 /* ADRP, followed by ADD. */
6977 *cost += COSTS_N_INSNS (1);
6978 if (speed)
6979 *cost += 2 * extra_cost->alu.arith;
6981 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
6982 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
6984 /* ADR. */
6985 if (speed)
6986 *cost += extra_cost->alu.arith;
6989 if (flag_pic)
6991 /* One extra load instruction, after accessing the GOT. */
6992 *cost += COSTS_N_INSNS (1);
6993 if (speed)
6994 *cost += extra_cost->ldst.load;
6996 return true;
6998 case HIGH:
6999 case LO_SUM:
7000 /* ADRP/ADD (immediate). */
7001 if (speed)
7002 *cost += extra_cost->alu.arith;
7003 return true;
7005 case ZERO_EXTRACT:
7006 case SIGN_EXTRACT:
7007 /* UBFX/SBFX. */
7008 if (speed)
7010 if (VECTOR_MODE_P (mode))
7011 *cost += extra_cost->vect.alu;
7012 else
7013 *cost += extra_cost->alu.bfx;
7016 /* We can trust that the immediates used will be correct (there
7017 are no by-register forms), so we need only cost op0. */
7018 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7019 return true;
7021 case MULT:
7022 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7023 /* aarch64_rtx_mult_cost always handles recursion to its
7024 operands. */
7025 return true;
7027 case MOD:
7028 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7029 ANDs and a CSNEG. Assume here that CSNEG is the same as the cost of
7030 an unconditional negate. This case should only ever be reached through
7031 the set_smod_pow2_cheap check in expmed.c. */
7032 if (CONST_INT_P (XEXP (x, 1))
7033 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7034 && (mode == SImode || mode == DImode))
7036 /* We expand to 4 instructions. Reset the baseline. */
7037 *cost = COSTS_N_INSNS (4);
7039 if (speed)
7040 *cost += 2 * extra_cost->alu.logical
7041 + 2 * extra_cost->alu.arith;
7043 return true;
7046 /* Fall-through. */
7047 case UMOD:
7048 if (speed)
7050 if (VECTOR_MODE_P (mode))
7051 *cost += extra_cost->vect.alu;
7052 else if (GET_MODE_CLASS (mode) == MODE_INT)
7053 *cost += (extra_cost->mult[mode == DImode].add
7054 + extra_cost->mult[mode == DImode].idiv);
7055 else if (mode == DFmode)
7056 *cost += (extra_cost->fp[1].mult
7057 + extra_cost->fp[1].div);
7058 else if (mode == SFmode)
7059 *cost += (extra_cost->fp[0].mult
7060 + extra_cost->fp[0].div);
7062 return false; /* All arguments need to be in registers. */
7064 case DIV:
7065 case UDIV:
7066 case SQRT:
7067 if (speed)
7069 if (VECTOR_MODE_P (mode))
7070 *cost += extra_cost->vect.alu;
7071 else if (GET_MODE_CLASS (mode) == MODE_INT)
7072 /* There is no integer SQRT, so only DIV and UDIV can get
7073 here. */
7074 *cost += extra_cost->mult[mode == DImode].idiv;
7075 else
7076 *cost += extra_cost->fp[mode == DFmode].div;
7078 return false; /* All arguments need to be in registers. */
7080 case IF_THEN_ELSE:
7081 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7082 XEXP (x, 2), cost, speed);
7084 case EQ:
7085 case NE:
7086 case GT:
7087 case GTU:
7088 case LT:
7089 case LTU:
7090 case GE:
7091 case GEU:
7092 case LE:
7093 case LEU:
7095 return false; /* All arguments must be in registers. */
7097 case FMA:
7098 op0 = XEXP (x, 0);
7099 op1 = XEXP (x, 1);
7100 op2 = XEXP (x, 2);
7102 if (speed)
7104 if (VECTOR_MODE_P (mode))
7105 *cost += extra_cost->vect.alu;
7106 else
7107 *cost += extra_cost->fp[mode == DFmode].fma;
7110 /* FMSUB, FNMADD, and FNMSUB are free. */
7111 if (GET_CODE (op0) == NEG)
7112 op0 = XEXP (op0, 0);
7114 if (GET_CODE (op2) == NEG)
7115 op2 = XEXP (op2, 0);
7117 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7118 and the by-element operand as operand 0. */
7119 if (GET_CODE (op1) == NEG)
7120 op1 = XEXP (op1, 0);
7122 /* Catch vector-by-element operations. The by-element operand can
7123 either be (vec_duplicate (vec_select (x))) or just
7124 (vec_select (x)), depending on whether we are multiplying by
7125 a vector or a scalar.
7127 Canonicalization is not very good in these cases: FMA4 will put the
7128 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7129 if (GET_CODE (op0) == VEC_DUPLICATE)
7130 op0 = XEXP (op0, 0);
7131 else if (GET_CODE (op1) == VEC_DUPLICATE)
7132 op1 = XEXP (op1, 0);
7134 if (GET_CODE (op0) == VEC_SELECT)
7135 op0 = XEXP (op0, 0);
7136 else if (GET_CODE (op1) == VEC_SELECT)
7137 op1 = XEXP (op1, 0);
7139 /* If the remaining parameters are not registers,
7140 get the cost to put them into registers. */
7141 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7142 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7143 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7144 return true;
7146 case FLOAT:
7147 case UNSIGNED_FLOAT:
7148 if (speed)
7149 *cost += extra_cost->fp[mode == DFmode].fromint;
7150 return false;
7152 case FLOAT_EXTEND:
7153 if (speed)
7155 if (VECTOR_MODE_P (mode))
7157 /* Vector extend. */
7158 *cost += extra_cost->vect.alu;
7160 else
7161 *cost += extra_cost->fp[mode == DFmode].widen;
7163 return false;
7165 case FLOAT_TRUNCATE:
7166 if (speed)
7168 if (VECTOR_MODE_P (mode))
7170 /* Vector truncate. */
7171 *cost += extra_cost->vect.alu;
7173 else
7174 *cost += extra_cost->fp[mode == DFmode].narrow;
7176 return false;
7178 case FIX:
7179 case UNSIGNED_FIX:
7180 x = XEXP (x, 0);
7181 /* Strip the rounding part. They will all be implemented
7182 by the fcvt* family of instructions anyway. */
7183 if (GET_CODE (x) == UNSPEC)
7185 unsigned int uns_code = XINT (x, 1);
7187 if (uns_code == UNSPEC_FRINTA
7188 || uns_code == UNSPEC_FRINTM
7189 || uns_code == UNSPEC_FRINTN
7190 || uns_code == UNSPEC_FRINTP
7191 || uns_code == UNSPEC_FRINTZ)
7192 x = XVECEXP (x, 0, 0);
7195 if (speed)
7197 if (VECTOR_MODE_P (mode))
7198 *cost += extra_cost->vect.alu;
7199 else
7200 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7203 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7204 fixed-point fcvt. */
7205 if (GET_CODE (x) == MULT
7206 && ((VECTOR_MODE_P (mode)
7207 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7208 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7210 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7211 0, speed);
7212 return true;
7215 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7216 return true;
7218 case ABS:
7219 if (VECTOR_MODE_P (mode))
7221 /* ABS (vector). */
7222 if (speed)
7223 *cost += extra_cost->vect.alu;
7225 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7227 op0 = XEXP (x, 0);
7229 /* FABD, which is analogous to FADD. */
7230 if (GET_CODE (op0) == MINUS)
7232 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7233 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7234 if (speed)
7235 *cost += extra_cost->fp[mode == DFmode].addsub;
7237 return true;
7239 /* Simple FABS is analogous to FNEG. */
7240 if (speed)
7241 *cost += extra_cost->fp[mode == DFmode].neg;
7243 else
7245 /* Integer ABS will either be split to
7246 two arithmetic instructions, or will be an ABS
7247 (scalar), which we don't model. */
7248 *cost = COSTS_N_INSNS (2);
7249 if (speed)
7250 *cost += 2 * extra_cost->alu.arith;
7252 return false;
7254 case SMAX:
7255 case SMIN:
7256 if (speed)
7258 if (VECTOR_MODE_P (mode))
7259 *cost += extra_cost->vect.alu;
7260 else
7262 /* FMAXNM/FMINNM/FMAX/FMIN.
7263 TODO: This may not be accurate for all implementations, but
7264 we do not model this in the cost tables. */
7265 *cost += extra_cost->fp[mode == DFmode].addsub;
7268 return false;
7270 case UNSPEC:
7271 /* The floating point round to integer frint* instructions. */
7272 if (aarch64_frint_unspec_p (XINT (x, 1)))
7274 if (speed)
7275 *cost += extra_cost->fp[mode == DFmode].roundint;
7277 return false;
7280 if (XINT (x, 1) == UNSPEC_RBIT)
7282 if (speed)
7283 *cost += extra_cost->alu.rev;
7285 return false;
7287 break;
7289 case TRUNCATE:
7291 /* Decompose <su>muldi3_highpart. */
7292 if (/* (truncate:DI */
7293 mode == DImode
7294 /* (lshiftrt:TI */
7295 && GET_MODE (XEXP (x, 0)) == TImode
7296 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7297 /* (mult:TI */
7298 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7299 /* (ANY_EXTEND:TI (reg:DI))
7300 (ANY_EXTEND:TI (reg:DI))) */
7301 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7302 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7303 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7304 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7305 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7306 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7307 /* (const_int 64) */
7308 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7309 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7311 /* UMULH/SMULH. */
7312 if (speed)
7313 *cost += extra_cost->mult[mode == DImode].extend;
7314 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7315 mode, MULT, 0, speed);
7316 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7317 mode, MULT, 1, speed);
7318 return true;
7321 /* Fall through. */
7322 default:
7323 break;
7326 if (dump_file && (dump_flags & TDF_DETAILS))
7327 fprintf (dump_file,
7328 "\nFailed to cost RTX. Assuming default cost.\n");
7330 return true;
7333 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
7334 calculated for X. This cost is stored in *COST. Returns true
7335 if the total cost of X was calculated. */
7336 static bool
7337 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7338 int param, int *cost, bool speed)
7340 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7342 if (dump_file && (dump_flags & TDF_DETAILS))
7344 print_rtl_single (dump_file, x);
7345 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7346 speed ? "Hot" : "Cold",
7347 *cost, result ? "final" : "partial");
7350 return result;
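/* A worked example of the register-move costs computed below: moving a
   128-bit value between two GENERAL_REGS is costed as 2 * GP2GP, since it
   needs two X-register moves, while without TARGET_SIMD a 128-bit
   FP-to-FP move is costed as GP2FP + FP2GP + FP2FP to account for the
   secondary-reload sequence through a general register.  */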
7353 static int
7354 aarch64_register_move_cost (machine_mode mode,
7355 reg_class_t from_i, reg_class_t to_i)
7357 enum reg_class from = (enum reg_class) from_i;
7358 enum reg_class to = (enum reg_class) to_i;
7359 const struct cpu_regmove_cost *regmove_cost
7360 = aarch64_tune_params.regmove_cost;
7362 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7363 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7364 to = GENERAL_REGS;
7366 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7367 from = GENERAL_REGS;
7369 /* Moving between GPR and stack cost is the same as GP2GP. */
7370 if ((from == GENERAL_REGS && to == STACK_REG)
7371 || (to == GENERAL_REGS && from == STACK_REG))
7372 return regmove_cost->GP2GP;
7374 /* To/From the stack register, we move via the gprs. */
7375 if (to == STACK_REG || from == STACK_REG)
7376 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7377 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7379 if (GET_MODE_SIZE (mode) == 16)
7381 /* 128-bit operations on general registers require 2 instructions. */
7382 if (from == GENERAL_REGS && to == GENERAL_REGS)
7383 return regmove_cost->GP2GP * 2;
7384 else if (from == GENERAL_REGS)
7385 return regmove_cost->GP2FP * 2;
7386 else if (to == GENERAL_REGS)
7387 return regmove_cost->FP2GP * 2;
7389 /* When AdvSIMD instructions are disabled it is not possible to move
7390 a 128-bit value directly between Q registers. This is handled in
7391 secondary reload. A general register is used as a scratch to move
7392 the upper DI value and the lower DI value is moved directly,
7393 hence the cost is the sum of three moves. */
7394 if (! TARGET_SIMD)
7395 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7397 return regmove_cost->FP2FP;
7400 if (from == GENERAL_REGS && to == GENERAL_REGS)
7401 return regmove_cost->GP2GP;
7402 else if (from == GENERAL_REGS)
7403 return regmove_cost->GP2FP;
7404 else if (to == GENERAL_REGS)
7405 return regmove_cost->FP2GP;
7407 return regmove_cost->FP2FP;
7410 static int
7411 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7412 reg_class_t rclass ATTRIBUTE_UNUSED,
7413 bool in ATTRIBUTE_UNUSED)
7415 return aarch64_tune_params.memmov_cost;
7418 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7419 to optimize 1.0/sqrt. */
7421 static bool
7422 use_rsqrt_p (machine_mode mode)
7424 return (!flag_trapping_math
7425 && flag_unsafe_math_optimizations
7426 && ((aarch64_tune_params.approx_modes->recip_sqrt
7427 & AARCH64_APPROX_MODE (mode))
7428 || flag_mrecip_low_precision_sqrt));
7431 /* Function to decide when to use the approximate reciprocal square root
7432 builtin. */
7434 static tree
7435 aarch64_builtin_reciprocal (tree fndecl)
7437 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7439 if (!use_rsqrt_p (mode))
7440 return NULL_TREE;
7441 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7444 typedef rtx (*rsqrte_type) (rtx, rtx);
7446 /* Select reciprocal square root initial estimate insn depending on machine
7447 mode. */
7449 static rsqrte_type
7450 get_rsqrte_type (machine_mode mode)
7452 switch (mode)
7454 case DFmode: return gen_aarch64_rsqrtedf;
7455 case SFmode: return gen_aarch64_rsqrtesf;
7456 case V2DFmode: return gen_aarch64_rsqrtev2df;
7457 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7458 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7459 default: gcc_unreachable ();
7463 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7465 /* Select reciprocal square root series step insn depending on machine mode. */
7467 static rsqrts_type
7468 get_rsqrts_type (machine_mode mode)
7470 switch (mode)
7472 case DFmode: return gen_aarch64_rsqrtsdf;
7473 case SFmode: return gen_aarch64_rsqrtssf;
7474 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7475 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7476 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7477 default: gcc_unreachable ();
7481 /* Emit instruction sequence to compute either the approximate square root
7482 or its approximate reciprocal, depending on the flag RECP, and return
7483 whether the sequence was emitted or not. */
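/* A sketch of the scheme used below, assuming the usual Newton-Raphson
   formulation: FRSQRTE gives an initial estimate x0 of 1/sqrt(S) and each
   FRSQRTS step refines it as

     x_{n+1} = x_n * (3 - S * x_n * x_n) / 2

   with two steps for SFmode and three for DFmode.  For the non-reciprocal
   case the result is finally multiplied by S to give sqrt(S), and a mask
   squashes the answer to 0.0 when S is 0.0.  */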
7485 bool
7486 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7488 machine_mode mode = GET_MODE (dst);
7490 if (GET_MODE_INNER (mode) == HFmode)
7491 return false;
7493 machine_mode mmsk = mode_for_vector
7494 (int_mode_for_mode (GET_MODE_INNER (mode)),
7495 GET_MODE_NUNITS (mode));
7496 bool use_approx_sqrt_p = (!recp
7497 && (flag_mlow_precision_sqrt
7498 || (aarch64_tune_params.approx_modes->sqrt
7499 & AARCH64_APPROX_MODE (mode))));
7500 bool use_approx_rsqrt_p = (recp
7501 && (flag_mrecip_low_precision_sqrt
7502 || (aarch64_tune_params.approx_modes->recip_sqrt
7503 & AARCH64_APPROX_MODE (mode))));
7505 if (!flag_finite_math_only
7506 || flag_trapping_math
7507 || !flag_unsafe_math_optimizations
7508 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7509 || optimize_function_for_size_p (cfun))
7510 return false;
7512 rtx xmsk = gen_reg_rtx (mmsk);
7513 if (!recp)
7514 /* When calculating the approximate square root, compare the argument with
7515 0.0 and create a mask. */
7516 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7517 CONST0_RTX (mode)))));
7519 /* Estimate the approximate reciprocal square root. */
7520 rtx xdst = gen_reg_rtx (mode);
7521 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7523 /* Iterate over the series twice for SF and thrice for DF. */
7524 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7526 /* Optionally iterate over the series once less for faster performance
7527 while sacrificing some accuracy.
7528 if ((recp && flag_mrecip_low_precision_sqrt)
7529 || (!recp && flag_mlow_precision_sqrt))
7530 iterations--;
7532 /* Iterate over the series to calculate the approximate reciprocal square
7533 root. */
7534 rtx x1 = gen_reg_rtx (mode);
7535 while (iterations--)
7537 rtx x2 = gen_reg_rtx (mode);
7538 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7540 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7542 if (iterations > 0)
7543 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7546 if (!recp)
7548 /* Qualify the approximate reciprocal square root when the argument is
7549 0.0 by squashing the intermediary result to 0.0. */
7550 rtx xtmp = gen_reg_rtx (mmsk);
7551 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7552 gen_rtx_SUBREG (mmsk, xdst, 0)));
7553 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7555 /* Calculate the approximate square root. */
7556 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7559 /* Finalize the approximation. */
7560 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
7562 return true;
7565 typedef rtx (*recpe_type) (rtx, rtx);
7567 /* Select reciprocal initial estimate insn depending on machine mode. */
7569 static recpe_type
7570 get_recpe_type (machine_mode mode)
7572 switch (mode)
7574 case SFmode: return (gen_aarch64_frecpesf);
7575 case V2SFmode: return (gen_aarch64_frecpev2sf);
7576 case V4SFmode: return (gen_aarch64_frecpev4sf);
7577 case DFmode: return (gen_aarch64_frecpedf);
7578 case V2DFmode: return (gen_aarch64_frecpev2df);
7579 default: gcc_unreachable ();
7583 typedef rtx (*recps_type) (rtx, rtx, rtx);
7585 /* Select reciprocal series step insn depending on machine mode. */
7587 static recps_type
7588 get_recps_type (machine_mode mode)
7590 switch (mode)
7592 case SFmode: return (gen_aarch64_frecpssf);
7593 case V2SFmode: return (gen_aarch64_frecpsv2sf);
7594 case V4SFmode: return (gen_aarch64_frecpsv4sf);
7595 case DFmode: return (gen_aarch64_frecpsdf);
7596 case V2DFmode: return (gen_aarch64_frecpsv2df);
7597 default: gcc_unreachable ();
7601 /* Emit the instruction sequence to compute the approximation for the division
7602 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
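/* A sketch of the scheme used below: FRECPE gives an initial estimate x0
   of 1/DEN and each FRECPS step refines it as

     x_{n+1} = x_n * (2 - DEN * x_n)

   with two steps for SFmode and three for DFmode; the result is then
   multiplied by NUM (unless NUM is 1.0) to form the quotient.  */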
7604 bool
7605 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
7607 machine_mode mode = GET_MODE (quo);
7609 if (GET_MODE_INNER (mode) == HFmode)
7610 return false;
7612 bool use_approx_division_p = (flag_mlow_precision_div
7613 || (aarch64_tune_params.approx_modes->division
7614 & AARCH64_APPROX_MODE (mode)));
7616 if (!flag_finite_math_only
7617 || flag_trapping_math
7618 || !flag_unsafe_math_optimizations
7619 || optimize_function_for_size_p (cfun)
7620 || !use_approx_division_p)
7621 return false;
7623 /* Estimate the approximate reciprocal. */
7624 rtx xrcp = gen_reg_rtx (mode);
7625 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
7627 /* Iterate over the series twice for SF and thrice for DF. */
7628 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7630 /* Optionally iterate over the series once less for faster performance,
7631 while sacrificing some accuracy. */
7632 if (flag_mlow_precision_div)
7633 iterations--;
7635 /* Iterate over the series to calculate the approximate reciprocal. */
7636 rtx xtmp = gen_reg_rtx (mode);
7637 while (iterations--)
7639 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
7641 if (iterations > 0)
7642 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
7645 if (num != CONST1_RTX (mode))
7647 /* As the approximate reciprocal of DEN is already calculated, only
7648 calculate the approximate division when NUM is not 1.0. */
7649 rtx xnum = force_reg (mode, num);
7650 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
7653 /* Finalize the approximation. */
7654 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
7655 return true;
7658 /* Return the number of instructions that can be issued per cycle. */
7659 static int
7660 aarch64_sched_issue_rate (void)
7662 return aarch64_tune_params.issue_rate;
7665 static int
7666 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
7668 int issue_rate = aarch64_sched_issue_rate ();
7670 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
7674 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
7675 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
7676 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
7678 static int
7679 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
7680 int ready_index)
7682 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
7686 /* Vectorizer cost model target hooks. */
7688 /* Implement targetm.vectorize.builtin_vectorization_cost. */
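/* For instance, a vec_construct of a four-element vector is costed below
   as 4 / 2 + 1 = 3 units, while most other cost kinds simply return the
   corresponding field of the tuning structure's vec_costs table.  */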
7689 static int
7690 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
7691 tree vectype,
7692 int misalign ATTRIBUTE_UNUSED)
7694 unsigned elements;
7696 switch (type_of_cost)
7698 case scalar_stmt:
7699 return aarch64_tune_params.vec_costs->scalar_stmt_cost;
7701 case scalar_load:
7702 return aarch64_tune_params.vec_costs->scalar_load_cost;
7704 case scalar_store:
7705 return aarch64_tune_params.vec_costs->scalar_store_cost;
7707 case vector_stmt:
7708 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7710 case vector_load:
7711 return aarch64_tune_params.vec_costs->vec_align_load_cost;
7713 case vector_store:
7714 return aarch64_tune_params.vec_costs->vec_store_cost;
7716 case vec_to_scalar:
7717 return aarch64_tune_params.vec_costs->vec_to_scalar_cost;
7719 case scalar_to_vec:
7720 return aarch64_tune_params.vec_costs->scalar_to_vec_cost;
7722 case unaligned_load:
7723 return aarch64_tune_params.vec_costs->vec_unalign_load_cost;
7725 case unaligned_store:
7726 return aarch64_tune_params.vec_costs->vec_unalign_store_cost;
7728 case cond_branch_taken:
7729 return aarch64_tune_params.vec_costs->cond_taken_branch_cost;
7731 case cond_branch_not_taken:
7732 return aarch64_tune_params.vec_costs->cond_not_taken_branch_cost;
7734 case vec_perm:
7735 return aarch64_tune_params.vec_costs->vec_permute_cost;
7737 case vec_promote_demote:
7738 return aarch64_tune_params.vec_costs->vec_stmt_cost;
7740 case vec_construct:
7741 elements = TYPE_VECTOR_SUBPARTS (vectype);
7742 return elements / 2 + 1;
7744 default:
7745 gcc_unreachable ();
7749 /* Implement targetm.vectorize.add_stmt_cost. */
7750 static unsigned
7751 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
7752 struct _stmt_vec_info *stmt_info, int misalign,
7753 enum vect_cost_model_location where)
7755 unsigned *cost = (unsigned *) data;
7756 unsigned retval = 0;
7758 if (flag_vect_cost_model)
7760 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
7761 int stmt_cost =
7762 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
7764 /* Statements in an inner loop relative to the loop being
7765 vectorized are weighted more heavily. The value here is
7766 arbitrary and could potentially be improved with analysis. */
7767 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
7768 count *= 50; /* FIXME */
7770 retval = (unsigned) (count * stmt_cost);
7771 cost[where] += retval;
7774 return retval;
7777 static void initialize_aarch64_code_model (struct gcc_options *);
7779 /* Parse the TO_PARSE string and put the architecture struct that it
7780 selects into RES and the architectural features into ISA_FLAGS.
7781 Return an aarch64_parse_opt_result describing the parse result.
7782 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
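/* For example, given -march=armv8-a+crc the string is split at the first
   '+': "armv8-a" is looked up in all_architectures and the remainder
   "+crc" is handed to aarch64_parse_extension to adjust ISA_FLAGS.  */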
7784 static enum aarch64_parse_opt_result
7785 aarch64_parse_arch (const char *to_parse, const struct processor **res,
7786 unsigned long *isa_flags)
7788 char *ext;
7789 const struct processor *arch;
7790 char *str = (char *) alloca (strlen (to_parse) + 1);
7791 size_t len;
7793 strcpy (str, to_parse);
7795 ext = strchr (str, '+');
7797 if (ext != NULL)
7798 len = ext - str;
7799 else
7800 len = strlen (str);
7802 if (len == 0)
7803 return AARCH64_PARSE_MISSING_ARG;
7806 /* Loop through the list of supported ARCHes to find a match. */
7807 for (arch = all_architectures; arch->name != NULL; arch++)
7809 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
7811 unsigned long isa_temp = arch->flags;
7813 if (ext != NULL)
7815 /* TO_PARSE string contains at least one extension. */
7816 enum aarch64_parse_opt_result ext_res
7817 = aarch64_parse_extension (ext, &isa_temp);
7819 if (ext_res != AARCH64_PARSE_OK)
7820 return ext_res;
7822 /* Extension parsing was successful. Confirm the result
7823 arch and ISA flags. */
7824 *res = arch;
7825 *isa_flags = isa_temp;
7826 return AARCH64_PARSE_OK;
7830 /* ARCH name not found in list. */
7831 return AARCH64_PARSE_INVALID_ARG;
7834 /* Parse the TO_PARSE string and put the result tuning in RES and the
7835 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
7836 describing the parse result. If there is an error parsing, RES and
7837 ISA_FLAGS are left unchanged. */
7839 static enum aarch64_parse_opt_result
7840 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
7841 unsigned long *isa_flags)
7843 char *ext;
7844 const struct processor *cpu;
7845 char *str = (char *) alloca (strlen (to_parse) + 1);
7846 size_t len;
7848 strcpy (str, to_parse);
7850 ext = strchr (str, '+');
7852 if (ext != NULL)
7853 len = ext - str;
7854 else
7855 len = strlen (str);
7857 if (len == 0)
7858 return AARCH64_PARSE_MISSING_ARG;
7861 /* Loop through the list of supported CPUs to find a match. */
7862 for (cpu = all_cores; cpu->name != NULL; cpu++)
7864 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
7866 unsigned long isa_temp = cpu->flags;
7869 if (ext != NULL)
7871 /* TO_PARSE string contains at least one extension. */
7872 enum aarch64_parse_opt_result ext_res
7873 = aarch64_parse_extension (ext, &isa_temp);
7875 if (ext_res != AARCH64_PARSE_OK)
7876 return ext_res;
7878 /* Extension parsing was successful. Confirm the result
7879 cpu and ISA flags. */
7880 *res = cpu;
7881 *isa_flags = isa_temp;
7882 return AARCH64_PARSE_OK;
7886 /* CPU name not found in list. */
7887 return AARCH64_PARSE_INVALID_ARG;
7890 /* Parse the TO_PARSE string and put the cpu it selects into RES.
7891 Return an aarch64_parse_opt_result describing the parse result.
7892 If the parsing fails, RES does not change. */
7894 static enum aarch64_parse_opt_result
7895 aarch64_parse_tune (const char *to_parse, const struct processor **res)
7897 const struct processor *cpu;
7898 char *str = (char *) alloca (strlen (to_parse) + 1);
7900 strcpy (str, to_parse);
7902 /* Loop through the list of supported CPUs to find a match. */
7903 for (cpu = all_cores; cpu->name != NULL; cpu++)
7905 if (strcmp (cpu->name, str) == 0)
7907 *res = cpu;
7908 return AARCH64_PARSE_OK;
7912 /* CPU name not found in list. */
7913 return AARCH64_PARSE_INVALID_ARG;
7916 /* Parse TOKEN, which has length LENGTH, to see if it is an option
7917 described in FLAG. If it is, return the index bit for that fusion type.
7918 If not, error (printing OPTION_NAME) and return zero. */
7920 static unsigned int
7921 aarch64_parse_one_option_token (const char *token,
7922 size_t length,
7923 const struct aarch64_flag_desc *flag,
7924 const char *option_name)
7926 for (; flag->name != NULL; flag++)
7928 if (length == strlen (flag->name)
7929 && !strncmp (flag->name, token, length))
7930 return flag->flag;
7933 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
7934 return 0;
7937 /* Parse OPTION which is a comma-separated list of flags to enable.
7938 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
7939 default state we inherit from the CPU tuning structures. OPTION_NAME
7940 gives the top-level option we are parsing in the -moverride string,
7941 for use in error messages. */
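/* For example, with OPTION_NAME "fuse=" an OPTION string of
   "adrp+add.cmp+branch" enables both fusion types on top of
   INITIAL_STATE, while a "none" token anywhere in the string clears
   everything accumulated up to that point (see the comment in the loop
   below).  */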
7943 static unsigned int
7944 aarch64_parse_boolean_options (const char *option,
7945 const struct aarch64_flag_desc *flags,
7946 unsigned int initial_state,
7947 const char *option_name)
7949 const char separator = '.';
7950 const char* specs = option;
7951 const char* ntoken = option;
7952 unsigned int found_flags = initial_state;
7954 while ((ntoken = strchr (specs, separator)))
7956 size_t token_length = ntoken - specs;
7957 unsigned token_ops = aarch64_parse_one_option_token (specs,
7958 token_length,
7959 flags,
7960 option_name);
7961 /* If we find "none" (or, for simplicity's sake, an error) anywhere
7962 in the token stream, reset the supported operations. So:
7964 adrp+add.cmp+branch.none.adrp+add
7966 would have the result of turning on only adrp+add fusion. */
7967 if (!token_ops)
7968 found_flags = 0;
7970 found_flags |= token_ops;
7971 specs = ++ntoken;
7974 /* We ended with a separator; report the ill-formed string. */
7975 if (!(*specs))
7977 error ("%s string ill-formed\n", option_name);
7978 return 0;
7981 /* We still have one more token to parse. */
7982 size_t token_length = strlen (specs);
7983 unsigned token_ops = aarch64_parse_one_option_token (specs,
7984 token_length,
7985 flags,
7986 option_name);
7987 if (!token_ops)
7988 found_flags = 0;
7990 found_flags |= token_ops;
7991 return found_flags;
7994 /* Support for overriding instruction fusion. */
7996 static void
7997 aarch64_parse_fuse_string (const char *fuse_string,
7998 struct tune_params *tune)
8000 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8001 aarch64_fusible_pairs,
8002 tune->fusible_ops,
8003 "fuse=");
8006 /* Support for overriding other tuning flags. */
8008 static void
8009 aarch64_parse_tune_string (const char *tune_string,
8010 struct tune_params *tune)
8012 tune->extra_tuning_flags
8013 = aarch64_parse_boolean_options (tune_string,
8014 aarch64_tuning_flags,
8015 tune->extra_tuning_flags,
8016 "tune=");
8019 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8020 we understand. If it is, extract the option string and hand it off to
8021 the appropriate function. */
8023 void
8024 aarch64_parse_one_override_token (const char* token,
8025 size_t length,
8026 struct tune_params *tune)
8028 const struct aarch64_tuning_override_function *fn
8029 = aarch64_tuning_override_functions;
8031 const char *option_part = strchr (token, '=');
8032 if (!option_part)
8034 error ("tuning string missing in option (%s)", token);
8035 return;
8038 /* Get the length of the option name. */
8039 length = option_part - token;
8040 /* Skip the '=' to get to the option string. */
8041 option_part++;
8043 for (; fn->name != NULL; fn++)
8045 if (!strncmp (fn->name, token, length))
8047 fn->parse_override (option_part, tune);
8048 return;
8052 error ("unknown tuning option (%s)",token);
8053 return;
8056 /* Check and clamp the requested TLS size against the limits of the code model. */
8058 static void
8059 initialize_aarch64_tls_size (struct gcc_options *opts)
8061 if (aarch64_tls_size == 0)
8062 aarch64_tls_size = 24;
8064 switch (opts->x_aarch64_cmodel_var)
8066 case AARCH64_CMODEL_TINY:
8067 /* Both the default and maximum TLS size allowed under tiny are 1M, which
8068 needs two instructions to address, so we clamp the size to 24. */
8069 if (aarch64_tls_size > 24)
8070 aarch64_tls_size = 24;
8071 break;
8072 case AARCH64_CMODEL_SMALL:
8073 /* The maximum TLS size allowed under small is 4G. */
8074 if (aarch64_tls_size > 32)
8075 aarch64_tls_size = 32;
8076 break;
8077 case AARCH64_CMODEL_LARGE:
8078 /* The maximum TLS size allowed under large is 16E.
8079 FIXME: 16E should be 64bit, we only support 48bit offset now. */
8080 if (aarch64_tls_size > 48)
8081 aarch64_tls_size = 48;
8082 break;
8083 default:
8084 gcc_unreachable ();
8087 return;
8090 /* Parse STRING looking for options in the format:
8091 string :: option:string
8092 option :: name=substring
8093 name :: {a-z}
8094 substring :: defined by option. */
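/* For example, -moverride=fuse=adrp+add:tune=<flag-list> (where
   <flag-list> is a placeholder) is split at each ':' and each resulting
   "name=substring" piece is handed to
   aarch64_parse_one_override_token.  */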
8096 static void
8097 aarch64_parse_override_string (const char* input_string,
8098 struct tune_params* tune)
8100 const char separator = ':';
8101 size_t string_length = strlen (input_string) + 1;
8102 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8103 char *string = string_root;
8104 strncpy (string, input_string, string_length);
8105 string[string_length - 1] = '\0';
8107 char* ntoken = string;
8109 while ((ntoken = strchr (string, separator)))
8111 size_t token_length = ntoken - string;
8112 /* Make this substring look like a string. */
8113 *ntoken = '\0';
8114 aarch64_parse_one_override_token (string, token_length, tune);
8115 string = ++ntoken;
8118 /* One last option to parse. */
8119 aarch64_parse_one_override_token (string, strlen (string), tune);
8120 free (string_root);
8124 static void
8125 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8127 /* The logic here is that if we are disabling all frame pointer generation
8128 then we do not need to disable leaf frame pointer generation as a
8129 separate operation. But if we are *only* disabling leaf frame pointer
8130 generation then we set flag_omit_frame_pointer to true, but in
8131 aarch64_frame_pointer_required we return false only for leaf functions.
8133 PR 70044: We have to be careful about being called multiple times for the
8134 same function. Once we have decided to set flag_omit_frame_pointer just
8135 so that we can omit leaf frame pointers, we must then not interpret a
8136 second call as meaning that all frame pointer generation should be
8137 omitted. We do this by setting flag_omit_frame_pointer to a special,
8138 non-zero value. */
8139 if (opts->x_flag_omit_frame_pointer == 2)
8140 opts->x_flag_omit_frame_pointer = 0;
8142 if (opts->x_flag_omit_frame_pointer)
8143 opts->x_flag_omit_leaf_frame_pointer = false;
8144 else if (opts->x_flag_omit_leaf_frame_pointer)
8145 opts->x_flag_omit_frame_pointer = 2;
8147 /* If not optimizing for size, set the default
8148 alignment to what the target wants. */
8149 if (!opts->x_optimize_size)
8151 if (opts->x_align_loops <= 0)
8152 opts->x_align_loops = aarch64_tune_params.loop_align;
8153 if (opts->x_align_jumps <= 0)
8154 opts->x_align_jumps = aarch64_tune_params.jump_align;
8155 if (opts->x_align_functions <= 0)
8156 opts->x_align_functions = aarch64_tune_params.function_align;
8159 /* We default to no pc-relative literal loads. */
8161 aarch64_pcrelative_literal_loads = false;
8163 /* If -mpc-relative-literal-loads is set on the command line, this
8164 implies that the user asked for PC relative literal loads. */
8165 if (opts->x_pcrelative_literal_loads == 1)
8166 aarch64_pcrelative_literal_loads = true;
8168 /* This is PR70113. When building the Linux kernel with
8169 CONFIG_ARM64_ERRATUM_843419, support for relocations
8170 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8171 removed from the kernel to avoid loading objects with possibly
8172 offending sequences. Without -mpc-relative-literal-loads we would
8173 generate such relocations, preventing the kernel build from
8174 succeeding. */
8175 if (opts->x_pcrelative_literal_loads == 2
8176 && TARGET_FIX_ERR_A53_843419)
8177 aarch64_pcrelative_literal_loads = true;
8179 /* In the tiny memory model it makes no sense to disallow PC relative
8180 literal pool loads. */
8181 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8182 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8183 aarch64_pcrelative_literal_loads = true;
8185 /* When enabling the lower precision Newton series for the square root, also
8186 enable it for the reciprocal square root, since the latter is an
8187 intermediary step for the former. */
8188 if (flag_mlow_precision_sqrt)
8189 flag_mrecip_low_precision_sqrt = true;
8192 /* 'Unpack' the internal tuning structs and update the options
8193 in OPTS. The caller must have set up selected_tune and selected_arch
8194 as all the other target-specific codegen decisions are
8195 derived from them. */
8197 void
8198 aarch64_override_options_internal (struct gcc_options *opts)
8200 aarch64_tune_flags = selected_tune->flags;
8201 aarch64_tune = selected_tune->sched_core;
8202 /* Make a copy of the tuning parameters attached to the core, which
8203 we may later overwrite. */
8204 aarch64_tune_params = *(selected_tune->tune);
8205 aarch64_architecture_version = selected_arch->architecture_version;
8207 if (opts->x_aarch64_override_tune_string)
8208 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8209 &aarch64_tune_params);
8211 /* This target defaults to strict volatile bitfields. */
8212 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8213 opts->x_flag_strict_volatile_bitfields = 1;
8215 initialize_aarch64_code_model (opts);
8216 initialize_aarch64_tls_size (opts);
8218 int queue_depth = 0;
8219 switch (aarch64_tune_params.autoprefetcher_model)
8221 case tune_params::AUTOPREFETCHER_OFF:
8222 queue_depth = -1;
8223 break;
8224 case tune_params::AUTOPREFETCHER_WEAK:
8225 queue_depth = 0;
8226 break;
8227 case tune_params::AUTOPREFETCHER_STRONG:
8228 queue_depth = max_insn_queue_index + 1;
8229 break;
8230 default:
8231 gcc_unreachable ();
8234 /* We don't mind passing in global_options_set here as we don't use
8235 the *options_set structs anyway. */
8236 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8237 queue_depth,
8238 opts->x_param_values,
8239 global_options_set.x_param_values);
8241 /* Set the L1 cache line size. */
8242 if (selected_cpu->tune->cache_line_size != 0)
8243 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8244 selected_cpu->tune->cache_line_size,
8245 opts->x_param_values,
8246 global_options_set.x_param_values);
8248 aarch64_override_options_after_change_1 (opts);
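/* The two maybe_set_param_value calls above are roughly equivalent to the
   user passing (param spellings assumed, not taken from this file)
     --param sched-autopref-queue-depth=<queue_depth>
     --param l1-cache-line-size=<cache line size>
   except that an explicit --param on the command line always wins, since
   maybe_set_param_value only overrides values the user did not set.  */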
8251 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8252 specified in STR and throw errors if appropriate. Put the results, if
8253 they are valid, in RES and ISA_FLAGS. Return whether the option is
8254 valid. */
8256 static bool
8257 aarch64_validate_mcpu (const char *str, const struct processor **res,
8258 unsigned long *isa_flags)
8260 enum aarch64_parse_opt_result parse_res
8261 = aarch64_parse_cpu (str, res, isa_flags);
8263 if (parse_res == AARCH64_PARSE_OK)
8264 return true;
8266 switch (parse_res)
8268 case AARCH64_PARSE_MISSING_ARG:
8269 error ("missing cpu name in -mcpu=%qs", str);
8270 break;
8271 case AARCH64_PARSE_INVALID_ARG:
8272 error ("unknown value %qs for -mcpu", str);
8273 break;
8274 case AARCH64_PARSE_INVALID_FEATURE:
8275 error ("invalid feature modifier in -mcpu=%qs", str);
8276 break;
8277 default:
8278 gcc_unreachable ();
8281 return false;
8284 /* Validate a command-line -march option. Parse the arch and extensions
8285 (if any) specified in STR and throw errors if appropriate. Put the
8286 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8287 option is valid. */
8289 static bool
8290 aarch64_validate_march (const char *str, const struct processor **res,
8291 unsigned long *isa_flags)
8293 enum aarch64_parse_opt_result parse_res
8294 = aarch64_parse_arch (str, res, isa_flags);
8296 if (parse_res == AARCH64_PARSE_OK)
8297 return true;
8299 switch (parse_res)
8301 case AARCH64_PARSE_MISSING_ARG:
8302 error ("missing arch name in -march=%qs", str);
8303 break;
8304 case AARCH64_PARSE_INVALID_ARG:
8305 error ("unknown value %qs for -march", str);
8306 break;
8307 case AARCH64_PARSE_INVALID_FEATURE:
8308 error ("invalid feature modifier in -march=%qs", str);
8309 break;
8310 default:
8311 gcc_unreachable ();
8314 return false;
8317 /* Validate a command-line -mtune option. Parse the cpu
8318 specified in STR and throw errors if appropriate. Put the
8319 result, if it is valid, in RES. Return whether the option is
8320 valid. */
8322 static bool
8323 aarch64_validate_mtune (const char *str, const struct processor **res)
8325 enum aarch64_parse_opt_result parse_res
8326 = aarch64_parse_tune (str, res);
8328 if (parse_res == AARCH64_PARSE_OK)
8329 return true;
8331 switch (parse_res)
8333 case AARCH64_PARSE_MISSING_ARG:
8334 error ("missing cpu name in -mtune=%qs", str);
8335 break;
8336 case AARCH64_PARSE_INVALID_ARG:
8337 error ("unknown value %qs for -mtune", str);
8338 break;
8339 default:
8340 gcc_unreachable ();
8342 return false;
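/* Illustrative invocations of the three validators above (the cpu and
   architecture names are assumed, not taken from this file):
     -mcpu=cortex-a57+crypto   -> aarch64_validate_mcpu
     -march=armv8-a+crc        -> aarch64_validate_march
     -mtune=cortex-a72         -> aarch64_validate_mtune (no "+feature"
                                  modifiers are accepted here)
   Each reports a missing name, an unknown name or a bad feature modifier
   using the errors above.  */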
8345 /* Return the CPU corresponding to the enum CPU.
8346 If it doesn't specify a cpu, return the default. */
8348 static const struct processor *
8349 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8351 if (cpu != aarch64_none)
8352 return &all_cores[cpu];
8354 /* The & 0x3f is to extract the bottom 6 bits that encode the
8355 default cpu as selected by the --with-cpu GCC configure option
8356 in config.gcc.
8357 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8358 flags mechanism should be reworked to make it more sane. */
8359 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8362 /* Return the architecture corresponding to the enum ARCH.
8363 If it doesn't specify a valid architecture, return the default. */
8365 static const struct processor *
8366 aarch64_get_arch (enum aarch64_arch arch)
8368 if (arch != aarch64_no_arch)
8369 return &all_architectures[arch];
8371 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8373 return &all_architectures[cpu->arch];
8376 /* Implement TARGET_OPTION_OVERRIDE. This is called once, early in option
8377 processing, and is used to parse the -m{cpu,tune,arch} strings and set up the initial
8378 tuning structs. In particular it must set selected_tune and
8379 aarch64_isa_flags that define the available ISA features and tuning
8380 decisions. It must also set selected_arch as this will be used to
8381 output the .arch asm tags for each function. */
8383 static void
8384 aarch64_override_options (void)
8386 unsigned long cpu_isa = 0;
8387 unsigned long arch_isa = 0;
8388 aarch64_isa_flags = 0;
8390 bool valid_cpu = true;
8391 bool valid_tune = true;
8392 bool valid_arch = true;
8394 selected_cpu = NULL;
8395 selected_arch = NULL;
8396 selected_tune = NULL;
8398 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8399 If either of -march or -mtune is given, they override their
8400 respective component of -mcpu. */
8401 if (aarch64_cpu_string)
8402 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8403 &cpu_isa);
8405 if (aarch64_arch_string)
8406 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8407 &arch_isa);
8409 if (aarch64_tune_string)
8410 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8412 /* If the user did not specify a processor, choose the default
8413 one for them. This will be the CPU set during configuration using
8414 --with-cpu, otherwise it is "generic". */
8415 if (!selected_cpu)
8417 if (selected_arch)
8419 selected_cpu = &all_cores[selected_arch->ident];
8420 aarch64_isa_flags = arch_isa;
8421 explicit_arch = selected_arch->arch;
8423 else
8425 /* Get default configure-time CPU. */
8426 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8427 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8430 if (selected_tune)
8431 explicit_tune_core = selected_tune->ident;
8433 /* If both -mcpu and -march are specified check that they are architecturally
8434 compatible, warn if they're not and prefer the -march ISA flags. */
8435 else if (selected_arch)
8437 if (selected_arch->arch != selected_cpu->arch)
8439 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8440 all_architectures[selected_cpu->arch].name,
8441 selected_arch->name);
8443 aarch64_isa_flags = arch_isa;
8444 explicit_arch = selected_arch->arch;
8445 explicit_tune_core = selected_tune ? selected_tune->ident
8446 : selected_cpu->ident;
8448 else
8450 /* -mcpu but no -march. */
8451 aarch64_isa_flags = cpu_isa;
8452 explicit_tune_core = selected_tune ? selected_tune->ident
8453 : selected_cpu->ident;
8454 gcc_assert (selected_cpu);
8455 selected_arch = &all_architectures[selected_cpu->arch];
8456 explicit_arch = selected_arch->arch;
8459 /* Set the arch as well, as we will need it when outputting
8460 the .arch directive in assembly. */
8461 if (!selected_arch)
8463 gcc_assert (selected_cpu);
8464 selected_arch = &all_architectures[selected_cpu->arch];
8467 if (!selected_tune)
8468 selected_tune = selected_cpu;
8470 #ifndef HAVE_AS_MABI_OPTION
8471 /* The compiler may have been configured with 2.23.* binutils, which does
8472 not have support for ILP32. */
8473 if (TARGET_ILP32)
8474 error ("Assembler does not support -mabi=ilp32");
8475 #endif
8477 /* Make sure we properly set up the explicit options. */
8478 if ((aarch64_cpu_string && valid_cpu)
8479 || (aarch64_tune_string && valid_tune))
8480 gcc_assert (explicit_tune_core != aarch64_none);
8482 if ((aarch64_cpu_string && valid_cpu)
8483 || (aarch64_arch_string && valid_arch))
8484 gcc_assert (explicit_arch != aarch64_no_arch);
8486 aarch64_override_options_internal (&global_options);
8488 /* Save these options as the default ones in case we push and pop them later
8489 while processing functions with potential target attributes. */
8490 target_option_default_node = target_option_current_node
8491 = build_target_option_node (&global_options);
8493 aarch64_register_fma_steering ();
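/* A worked example of the precedence rules implemented above (cpu and
   architecture names assumed):
     -mcpu=cortex-a57 -march=armv8-a+crc -mtune=cortex-a53
   takes the ISA flags from -march (warning if the -mcpu core does not
   match that architecture) and the tuning tables from -mtune, whereas a
   lone -mcpu=cortex-a57 supplies both the ISA flags and the tuning.  */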
8497 /* Implement targetm.override_options_after_change. */
8499 static void
8500 aarch64_override_options_after_change (void)
8502 aarch64_override_options_after_change_1 (&global_options);
8505 static struct machine_function *
8506 aarch64_init_machine_status (void)
8508 struct machine_function *machine;
8509 machine = ggc_cleared_alloc<machine_function> ();
8510 return machine;
8513 void
8514 aarch64_init_expanders (void)
8516 init_machine_status = aarch64_init_machine_status;
8519 /* A checking mechanism for the implementation of the various code models. */
8520 static void
8521 initialize_aarch64_code_model (struct gcc_options *opts)
8523 if (opts->x_flag_pic)
8525 switch (opts->x_aarch64_cmodel_var)
8527 case AARCH64_CMODEL_TINY:
8528 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
8529 break;
8530 case AARCH64_CMODEL_SMALL:
8531 #ifdef HAVE_AS_SMALL_PIC_RELOCS
8532 aarch64_cmodel = (flag_pic == 2
8533 ? AARCH64_CMODEL_SMALL_PIC
8534 : AARCH64_CMODEL_SMALL_SPIC);
8535 #else
8536 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
8537 #endif
8538 break;
8539 case AARCH64_CMODEL_LARGE:
8540 sorry ("code model %qs with -f%s", "large",
8541 opts->x_flag_pic > 1 ? "PIC" : "pic");
8542 break;
8543 default:
8544 gcc_unreachable ();
8547 else
8548 aarch64_cmodel = opts->x_aarch64_cmodel_var;
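/* A sketch of the mapping implemented above:
     -mcmodel=small -fpic  -> AARCH64_CMODEL_SMALL_SPIC when the assembler
                              has the small PIC relocs, else SMALL_PIC
     -mcmodel=small -fPIC  -> AARCH64_CMODEL_SMALL_PIC
     -mcmodel=tiny  -fpic  -> AARCH64_CMODEL_TINY_PIC
     -mcmodel=large -fpic  -> rejected with sorry () above
   and without -fpic/-fPIC the user's -mcmodel value is used directly.  */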
8551 /* Implement TARGET_OPTION_SAVE. */
8553 static void
8554 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
8556 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
8559 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
8560 using the information saved in PTR. */
8562 static void
8563 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
8565 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
8566 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8567 opts->x_explicit_arch = ptr->x_explicit_arch;
8568 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
8569 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
8571 aarch64_override_options_internal (opts);
8574 /* Implement TARGET_OPTION_PRINT. */
8576 static void
8577 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
8579 const struct processor *cpu
8580 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
8581 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
8582 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
8583 std::string extension
8584 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
8586 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
8587 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
8588 arch->name, extension.c_str ());
8591 static GTY(()) tree aarch64_previous_fndecl;
8593 void
8594 aarch64_reset_previous_fndecl (void)
8596 aarch64_previous_fndecl = NULL;
8599 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
8600 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
8601 make sure optab availability predicates are recomputed when necessary. */
8603 void
8604 aarch64_save_restore_target_globals (tree new_tree)
8606 if (TREE_TARGET_GLOBALS (new_tree))
8607 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
8608 else if (new_tree == target_option_default_node)
8609 restore_target_globals (&default_target_globals);
8610 else
8611 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
8614 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
8615 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
8616 of the function, if such exists. This function may be called multiple
8617 times on a single function so use aarch64_previous_fndecl to avoid
8618 setting up identical state. */
8620 static void
8621 aarch64_set_current_function (tree fndecl)
8623 if (!fndecl || fndecl == aarch64_previous_fndecl)
8624 return;
8626 tree old_tree = (aarch64_previous_fndecl
8627 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
8628 : NULL_TREE);
8630 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
8632 /* If the current function has no attributes but the previous one did,
8633 use the default node. */
8634 if (!new_tree && old_tree)
8635 new_tree = target_option_default_node;
8637 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
8638 the default have been handled by aarch64_save_restore_target_globals from
8639 aarch64_pragma_target_parse. */
8640 if (old_tree == new_tree)
8641 return;
8643 aarch64_previous_fndecl = fndecl;
8645 /* First set the target options. */
8646 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
8648 aarch64_save_restore_target_globals (new_tree);
8651 /* Enum describing the various ways we can handle attributes.
8652 In many cases we can reuse the generic option handling machinery. */
8654 enum aarch64_attr_opt_type
8656 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
8657 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
8658 aarch64_attr_enum, /* Attribute sets an enum variable. */
8659 aarch64_attr_custom /* Attribute requires a custom handling function. */
8662 /* All the information needed to handle a target attribute.
8663 NAME is the name of the attribute.
8664 ATTR_TYPE specifies the type of behavior of the attribute as described
8665 in the definition of enum aarch64_attr_opt_type.
8666 ALLOW_NEG is true if the attribute supports a "no-" form.
8667 HANDLER is the function that takes the attribute string and whether
8668 it is a pragma or attribute and handles the option. It is needed only
8669 when the ATTR_TYPE is aarch64_attr_custom.
8670 OPT_NUM is the enum specifying the option that the attribute modifies.
8671 This is needed for attributes that mirror the behavior of a command-line
8672 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
8673 aarch64_attr_enum. */
8675 struct aarch64_attribute_info
8677 const char *name;
8678 enum aarch64_attr_opt_type attr_type;
8679 bool allow_neg;
8680 bool (*handler) (const char *, const char *);
8681 enum opt_code opt_num;
8684 /* Handle the ARCH_STR argument to the arch= target attribute.
8685 PRAGMA_OR_ATTR is used in potential error messages. */
8687 static bool
8688 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
8690 const struct processor *tmp_arch = NULL;
8691 enum aarch64_parse_opt_result parse_res
8692 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
8694 if (parse_res == AARCH64_PARSE_OK)
8696 gcc_assert (tmp_arch);
8697 selected_arch = tmp_arch;
8698 explicit_arch = selected_arch->arch;
8699 return true;
8702 switch (parse_res)
8704 case AARCH64_PARSE_MISSING_ARG:
8705 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
8706 break;
8707 case AARCH64_PARSE_INVALID_ARG:
8708 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
8709 break;
8710 case AARCH64_PARSE_INVALID_FEATURE:
8711 error ("invalid feature modifier %qs for 'arch' target %s",
8712 str, pragma_or_attr);
8713 break;
8714 default:
8715 gcc_unreachable ();
8718 return false;
8721 /* Handle the argument CPU_STR to the cpu= target attribute.
8722 PRAGMA_OR_ATTR is used in potential error messages. */
8724 static bool
8725 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
8727 const struct processor *tmp_cpu = NULL;
8728 enum aarch64_parse_opt_result parse_res
8729 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
8731 if (parse_res == AARCH64_PARSE_OK)
8733 gcc_assert (tmp_cpu);
8734 selected_tune = tmp_cpu;
8735 explicit_tune_core = selected_tune->ident;
8737 selected_arch = &all_architectures[tmp_cpu->arch];
8738 explicit_arch = selected_arch->arch;
8739 return true;
8742 switch (parse_res)
8744 case AARCH64_PARSE_MISSING_ARG:
8745 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
8746 break;
8747 case AARCH64_PARSE_INVALID_ARG:
8748 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
8749 break;
8750 case AARCH64_PARSE_INVALID_FEATURE:
8751 error ("invalid feature modifier %qs for 'cpu' target %s",
8752 str, pragma_or_attr);
8753 break;
8754 default:
8755 gcc_unreachable ();
8758 return false;
8761 /* Handle the argument STR to the tune= target attribute.
8762 PRAGMA_OR_ATTR is used in potential error messages. */
8764 static bool
8765 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
8767 const struct processor *tmp_tune = NULL;
8768 enum aarch64_parse_opt_result parse_res
8769 = aarch64_parse_tune (str, &tmp_tune);
8771 if (parse_res == AARCH64_PARSE_OK)
8773 gcc_assert (tmp_tune);
8774 selected_tune = tmp_tune;
8775 explicit_tune_core = selected_tune->ident;
8776 return true;
8779 switch (parse_res)
8781 case AARCH64_PARSE_INVALID_ARG:
8782 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
8783 break;
8784 default:
8785 gcc_unreachable ();
8788 return false;
8791 /* Parse an architecture extensions target attribute string specified in STR.
8792 For example "+fp+nosimd". Show any errors if needed. Return TRUE
8793 if successful. Update aarch64_isa_flags to reflect the ISA features
8794 modified.
8795 PRAGMA_OR_ATTR is used in potential error messages. */
8797 static bool
8798 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
8800 enum aarch64_parse_opt_result parse_res;
8801 unsigned long isa_flags = aarch64_isa_flags;
8803 /* We allow "+nothing" at the beginning to clear out all architectural
8804 features if the user wants to handpick specific features. */
8805 if (strncmp ("+nothing", str, 8) == 0)
8807 isa_flags = 0;
8808 str += 8;
8811 parse_res = aarch64_parse_extension (str, &isa_flags);
8813 if (parse_res == AARCH64_PARSE_OK)
8815 aarch64_isa_flags = isa_flags;
8816 return true;
8819 switch (parse_res)
8821 case AARCH64_PARSE_MISSING_ARG:
8822 error ("missing feature modifier in target %s %qs",
8823 pragma_or_attr, str);
8824 break;
8826 case AARCH64_PARSE_INVALID_FEATURE:
8827 error ("invalid feature modifier in target %s %qs",
8828 pragma_or_attr, str);
8829 break;
8831 default:
8832 gcc_unreachable ();
8835 return false;
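/* For instance (feature names assumed), "+nothing+fp" first clears every
   ISA flag and then enables only the FP extension, while "+nofp" merely
   subtracts FP (and anything that depends on it) from the currently
   selected set.  */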
8838 /* The target attributes that we support. On top of these we also support just
8839 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
8840 handled explicitly in aarch64_process_one_target_attr. */
8842 static const struct aarch64_attribute_info aarch64_attributes[] =
8844 { "general-regs-only", aarch64_attr_mask, false, NULL,
8845 OPT_mgeneral_regs_only },
8846 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
8847 OPT_mfix_cortex_a53_835769 },
8848 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
8849 OPT_mfix_cortex_a53_843419 },
8850 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
8851 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
8852 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
8853 OPT_momit_leaf_frame_pointer },
8854 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
8855 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
8856 OPT_march_ },
8857 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
8858 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
8859 OPT_mtune_ },
8860 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
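/* Example uses of the table above (the architecture name is assumed, not
   taken from this file):
     __attribute__ ((target ("arch=armv8-a+crc")))           custom handler
     __attribute__ ((target ("no-omit-leaf-frame-pointer"))) negatable bool
     __attribute__ ((target ("cmodel=tiny")))                enum option
     __attribute__ ((target ("+crc")))                       bare ISA flags
   the last of which is handled separately in
   aarch64_process_one_target_attr below.  */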
8863 /* Parse ARG_STR which contains the definition of one target attribute.
8864 Show appropriate errors if any or return true if the attribute is valid.
8865 PRAGMA_OR_ATTR holds the string to use in error messages about whether
8866 we're processing a target attribute or pragma. */
8868 static bool
8869 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
8871 bool invert = false;
8873 size_t len = strlen (arg_str);
8875 if (len == 0)
8877 error ("malformed target %s", pragma_or_attr);
8878 return false;
8881 char *str_to_check = (char *) alloca (len + 1);
8882 strcpy (str_to_check, arg_str);
8884 /* Skip leading whitespace. */
8885 while (*str_to_check == ' ' || *str_to_check == '\t')
8886 str_to_check++;
8888 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
8889 It is easier to detect and handle it explicitly here rather than going
8890 through the machinery for the rest of the target attributes in this
8891 function. */
8892 if (*str_to_check == '+')
8893 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
8895 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
8897 invert = true;
8898 str_to_check += 3;
8900 char *arg = strchr (str_to_check, '=');
8902 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
8903 and point ARG to "foo". */
8904 if (arg)
8906 *arg = '\0';
8907 arg++;
8909 const struct aarch64_attribute_info *p_attr;
8910 bool found = false;
8911 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
8913 /* If the names don't match up, or the user has given an argument
8914 to an attribute that doesn't accept one, or didn't give an argument
8915 to an attribute that expects one, fail to match. */
8916 if (strcmp (str_to_check, p_attr->name) != 0)
8917 continue;
8919 found = true;
8920 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
8921 || p_attr->attr_type == aarch64_attr_enum;
8923 if (attr_need_arg_p ^ (arg != NULL))
8925 error ("target %s %qs does not accept an argument",
8926 pragma_or_attr, str_to_check);
8927 return false;
8930 /* If the name matches but the attribute does not allow "no-" versions
8931 then we can't match. */
8932 if (invert && !p_attr->allow_neg)
8934 error ("target %s %qs does not allow a negated form",
8935 pragma_or_attr, str_to_check);
8936 return false;
8939 switch (p_attr->attr_type)
8941 /* Has a custom handler registered.
8942 For example, cpu=, arch=, tune=. */
8943 case aarch64_attr_custom:
8944 gcc_assert (p_attr->handler);
8945 if (!p_attr->handler (arg, pragma_or_attr))
8946 return false;
8947 break;
8949 /* Either set or unset a boolean option. */
8950 case aarch64_attr_bool:
8952 struct cl_decoded_option decoded;
8954 generate_option (p_attr->opt_num, NULL, !invert,
8955 CL_TARGET, &decoded);
8956 aarch64_handle_option (&global_options, &global_options_set,
8957 &decoded, input_location);
8958 break;
8960 /* Set or unset a bit in the target_flags. aarch64_handle_option
8961 should know what mask to apply given the option number. */
8962 case aarch64_attr_mask:
8964 struct cl_decoded_option decoded;
8965 /* We only need to specify the option number.
8966 aarch64_handle_option will know which mask to apply. */
8967 decoded.opt_index = p_attr->opt_num;
8968 decoded.value = !invert;
8969 aarch64_handle_option (&global_options, &global_options_set,
8970 &decoded, input_location);
8971 break;
8973 /* Use the option setting machinery to set an option to an enum. */
8974 case aarch64_attr_enum:
8976 gcc_assert (arg);
8977 bool valid;
8978 int value;
8979 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
8980 &value, CL_TARGET);
8981 if (valid)
8983 set_option (&global_options, NULL, p_attr->opt_num, value,
8984 NULL, DK_UNSPECIFIED, input_location,
8985 global_dc);
8987 else
8989 error ("target %s %s=%s is not valid",
8990 pragma_or_attr, str_to_check, arg);
8992 break;
8994 default:
8995 gcc_unreachable ();
8999 /* If we reached here we either have found an attribute and validated
9000 it or didn't match any. If we matched an attribute but its arguments
9001 were malformed we will have returned false already. */
9002 return found;
9005 /* Count how many times the character C appears in
9006 NULL-terminated string STR. */
9008 static unsigned int
9009 num_occurences_in_str (char c, char *str)
9011 unsigned int res = 0;
9012 while (*str != '\0')
9014 if (*str == c)
9015 res++;
9017 str++;
9020 return res;
9023 /* Parse the tree in ARGS that contains the target attribute information
9024 and update the global target options space. PRAGMA_OR_ATTR is a string
9025 to be used in error messages, specifying whether this is processing
9026 a target attribute or a target pragma. */
9028 bool
9029 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9031 if (TREE_CODE (args) == TREE_LIST)
9035 tree head = TREE_VALUE (args);
9036 if (head)
9038 if (!aarch64_process_target_attr (head, pragma_or_attr))
9039 return false;
9041 args = TREE_CHAIN (args);
9042 } while (args);
9044 return true;
9046 /* We expect to find a string to parse. */
9047 gcc_assert (TREE_CODE (args) == STRING_CST);
9049 size_t len = strlen (TREE_STRING_POINTER (args));
9050 char *str_to_check = (char *) alloca (len + 1);
9051 strcpy (str_to_check, TREE_STRING_POINTER (args));
9053 if (len == 0)
9055 error ("malformed target %s value", pragma_or_attr);
9056 return false;
9059 /* Used to catch empty entries between commas, i.e.
9060 attribute ((target ("attr1,,attr2"))). */
9061 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9063 /* Handle multiple target attributes separated by ','. */
9064 char *token = strtok (str_to_check, ",");
9066 unsigned int num_attrs = 0;
9067 while (token)
9069 num_attrs++;
9070 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9072 error ("target %s %qs is invalid", pragma_or_attr, token);
9073 return false;
9076 token = strtok (NULL, ",");
9079 if (num_attrs != num_commas + 1)
9081 error ("malformed target %s list %qs",
9082 pragma_or_attr, TREE_STRING_POINTER (args));
9083 return false;
9086 return true;
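/* As an example (attribute values assumed), the string
   "arch=armv8-a+crc,no-fix-cortex-a53-835769" is split on ',' into two
   tokens that are handed to aarch64_process_one_target_attr in turn,
   whereas "arch=armv8-a,,tune=cortex-a53" produces two tokens but three
   expected attributes (num_commas + 1), so it is rejected as a malformed
   list.  */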
9089 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9090 process attribute ((target ("..."))). */
9092 static bool
9093 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9095 struct cl_target_option cur_target;
9096 bool ret;
9097 tree old_optimize;
9098 tree new_target, new_optimize;
9099 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9101 /* If what we're processing is the current pragma string then the
9102 target option node is already stored in target_option_current_node
9103 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9104 having to re-parse the string. This is especially useful to keep
9105 arm_neon.h compile times down since that header contains a lot
9106 of intrinsics enclosed in pragmas. */
9107 if (!existing_target && args == current_target_pragma)
9109 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9110 return true;
9112 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9114 old_optimize = build_optimization_node (&global_options);
9115 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9117 /* If the function changed the optimization levels as well as setting
9118 target options, start with the optimizations specified. */
9119 if (func_optimize && func_optimize != old_optimize)
9120 cl_optimization_restore (&global_options,
9121 TREE_OPTIMIZATION (func_optimize));
9123 /* Save the current target options to restore at the end. */
9124 cl_target_option_save (&cur_target, &global_options);
9126 /* If fndecl already has some target attributes applied to it, unpack
9127 them so that we add this attribute on top of them, rather than
9128 overwriting them. */
9129 if (existing_target)
9131 struct cl_target_option *existing_options
9132 = TREE_TARGET_OPTION (existing_target);
9134 if (existing_options)
9135 cl_target_option_restore (&global_options, existing_options);
9137 else
9138 cl_target_option_restore (&global_options,
9139 TREE_TARGET_OPTION (target_option_current_node));
9142 ret = aarch64_process_target_attr (args, "attribute");
9144 /* Set up any additional state. */
9145 if (ret)
9147 aarch64_override_options_internal (&global_options);
9148 /* Initialize SIMD builtins if we haven't already.
9149 Set current_target_pragma to NULL for the duration so that
9150 the builtin initialization code doesn't try to tag the functions
9151 being built with the attributes specified by any current pragma, thus
9152 going into an infinite recursion. */
9153 if (TARGET_SIMD)
9155 tree saved_current_target_pragma = current_target_pragma;
9156 current_target_pragma = NULL;
9157 aarch64_init_simd_builtins ();
9158 current_target_pragma = saved_current_target_pragma;
9160 new_target = build_target_option_node (&global_options);
9162 else
9163 new_target = NULL;
9165 new_optimize = build_optimization_node (&global_options);
9167 if (fndecl && ret)
9169 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9171 if (old_optimize != new_optimize)
9172 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9175 cl_target_option_restore (&global_options, &cur_target);
9177 if (old_optimize != new_optimize)
9178 cl_optimization_restore (&global_options,
9179 TREE_OPTIMIZATION (old_optimize));
9180 return ret;
9183 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9184 tri-bool options (yes, no, don't care) and the default value is
9185 DEF, determine whether to reject inlining. */
9187 static bool
9188 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9189 int dont_care, int def)
9191 /* If the callee doesn't care, always allow inlining. */
9192 if (callee == dont_care)
9193 return true;
9195 /* If the caller doesn't care, always allow inlining. */
9196 if (caller == dont_care)
9197 return true;
9199 /* Otherwise, allow inlining if either the callee and caller values
9200 agree, or if the callee is using the default value. */
9201 return (callee == caller || callee == def);
9204 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9205 to inline CALLEE into CALLER based on target-specific info.
9206 Make sure that the caller and callee have compatible architectural
9207 features. Then go through the other possible target attributes
9208 and see if they can block inlining. Try not to reject always_inline
9209 callees unless they are incompatible architecturally. */
9211 static bool
9212 aarch64_can_inline_p (tree caller, tree callee)
9214 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9215 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9217 /* If callee has no option attributes, then it is ok to inline. */
9218 if (!callee_tree)
9219 return true;
9221 struct cl_target_option *caller_opts
9222 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9223 : target_option_default_node);
9225 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9228 /* Callee's ISA flags should be a subset of the caller's. */
9229 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9230 != callee_opts->x_aarch64_isa_flags)
9231 return false;
9233 /* Allow non-strict-aligned functions to be inlined into
9234 strict-aligned ones. */
9235 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9236 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9237 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9238 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9239 return false;
9241 bool always_inline = lookup_attribute ("always_inline",
9242 DECL_ATTRIBUTES (callee));
9244 /* If the architectural features match up and the callee is always_inline
9245 then the other attributes don't matter. */
9246 if (always_inline)
9247 return true;
9249 if (caller_opts->x_aarch64_cmodel_var
9250 != callee_opts->x_aarch64_cmodel_var)
9251 return false;
9253 if (caller_opts->x_aarch64_tls_dialect
9254 != callee_opts->x_aarch64_tls_dialect)
9255 return false;
9257 /* Honour explicit requests to work around errata. */
9258 if (!aarch64_tribools_ok_for_inlining_p (
9259 caller_opts->x_aarch64_fix_a53_err835769,
9260 callee_opts->x_aarch64_fix_a53_err835769,
9261 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9262 return false;
9264 if (!aarch64_tribools_ok_for_inlining_p (
9265 caller_opts->x_aarch64_fix_a53_err843419,
9266 callee_opts->x_aarch64_fix_a53_err843419,
9267 2, TARGET_FIX_ERR_A53_843419))
9268 return false;
9270 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9271 caller and callee and they don't match up, reject inlining. */
9272 if (!aarch64_tribools_ok_for_inlining_p (
9273 caller_opts->x_flag_omit_leaf_frame_pointer,
9274 callee_opts->x_flag_omit_leaf_frame_pointer,
9275 2, 1))
9276 return false;
9278 /* If the callee has specific tuning overrides, respect them. */
9279 if (callee_opts->x_aarch64_override_tune_string != NULL
9280 && caller_opts->x_aarch64_override_tune_string == NULL)
9281 return false;
9283 /* If the user specified tuning override strings for the
9284 caller and callee and they don't match up, reject inlining.
9285 We just do a string compare here, we don't analyze the meaning
9286 of the string, as it would be too costly for little gain. */
9287 if (callee_opts->x_aarch64_override_tune_string
9288 && caller_opts->x_aarch64_override_tune_string
9289 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9290 caller_opts->x_aarch64_override_tune_string) != 0))
9291 return false;
9293 return true;
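/* For example (feature name assumed), a callee declared with
   __attribute__ ((target ("+crc"))) is not inlined into a caller built
   without +crc, since the callee's ISA flags would not be a subset of the
   caller's; this holds even for always_inline callees, which only bypass
   the later, non-architectural checks.  */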
9296 /* Return true if SYMBOL_REF X binds locally. */
9298 static bool
9299 aarch64_symbol_binds_local_p (const_rtx x)
9301 return (SYMBOL_REF_DECL (x)
9302 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9303 : SYMBOL_REF_LOCAL_P (x));
9306 /* Return true if SYMBOL_REF X is thread local */
9307 static bool
9308 aarch64_tls_symbol_p (rtx x)
9310 if (! TARGET_HAVE_TLS)
9311 return false;
9313 if (GET_CODE (x) != SYMBOL_REF)
9314 return false;
9316 return SYMBOL_REF_TLS_MODEL (x) != 0;
9319 /* Classify a TLS symbol into one of the TLS kinds. */
9320 enum aarch64_symbol_type
9321 aarch64_classify_tls_symbol (rtx x)
9323 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9325 switch (tls_kind)
9327 case TLS_MODEL_GLOBAL_DYNAMIC:
9328 case TLS_MODEL_LOCAL_DYNAMIC:
9329 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9331 case TLS_MODEL_INITIAL_EXEC:
9332 switch (aarch64_cmodel)
9334 case AARCH64_CMODEL_TINY:
9335 case AARCH64_CMODEL_TINY_PIC:
9336 return SYMBOL_TINY_TLSIE;
9337 default:
9338 return SYMBOL_SMALL_TLSIE;
9341 case TLS_MODEL_LOCAL_EXEC:
9342 if (aarch64_tls_size == 12)
9343 return SYMBOL_TLSLE12;
9344 else if (aarch64_tls_size == 24)
9345 return SYMBOL_TLSLE24;
9346 else if (aarch64_tls_size == 32)
9347 return SYMBOL_TLSLE32;
9348 else if (aarch64_tls_size == 48)
9349 return SYMBOL_TLSLE48;
9350 else
9351 gcc_unreachable ();
9353 case TLS_MODEL_EMULATED:
9354 case TLS_MODEL_NONE:
9355 return SYMBOL_FORCE_TO_MEM;
9357 default:
9358 gcc_unreachable ();
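/* For example, with -mtls-size=24 a local-exec symbol is classified as
   SYMBOL_TLSLE24, while under the tiny code model an initial-exec symbol
   becomes SYMBOL_TINY_TLSIE.  The clamping of aarch64_tls_size per code
   model is done in initialize_aarch64_tls_size above.  */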
9362 /* Return the method that should be used to access SYMBOL_REF or
9363 LABEL_REF X. */
9365 enum aarch64_symbol_type
9366 aarch64_classify_symbol (rtx x, rtx offset)
9368 if (GET_CODE (x) == LABEL_REF)
9370 switch (aarch64_cmodel)
9372 case AARCH64_CMODEL_LARGE:
9373 return SYMBOL_FORCE_TO_MEM;
9375 case AARCH64_CMODEL_TINY_PIC:
9376 case AARCH64_CMODEL_TINY:
9377 return SYMBOL_TINY_ABSOLUTE;
9379 case AARCH64_CMODEL_SMALL_SPIC:
9380 case AARCH64_CMODEL_SMALL_PIC:
9381 case AARCH64_CMODEL_SMALL:
9382 return SYMBOL_SMALL_ABSOLUTE;
9384 default:
9385 gcc_unreachable ();
9389 if (GET_CODE (x) == SYMBOL_REF)
9391 if (aarch64_tls_symbol_p (x))
9392 return aarch64_classify_tls_symbol (x);
9394 switch (aarch64_cmodel)
9396 case AARCH64_CMODEL_TINY:
9397 /* When we retrieve symbol + offset address, we have to make sure
9398 the offset does not cause overflow of the final address. But
9399 we have no way of knowing the address of symbol at compile time
9400 so we can't accurately say if the distance between the PC and
9401 symbol + offset is outside the addressable range of +/-1M in the
9402 TINY code model. So we rely on images not being greater than
9403 1M, cap the offset at 1M, and anything beyond 1M will have to
9404 be loaded using an alternative mechanism. Furthermore, if the
9405 symbol is a weak reference to something that isn't known to
9406 resolve to a symbol in this module, then force to memory. */
9407 if ((SYMBOL_REF_WEAK (x)
9408 && !aarch64_symbol_binds_local_p (x))
9409 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9410 return SYMBOL_FORCE_TO_MEM;
9411 return SYMBOL_TINY_ABSOLUTE;
9413 case AARCH64_CMODEL_SMALL:
9414 /* Same reasoning as the tiny code model, but the offset cap here is
9415 4G. */
9416 if ((SYMBOL_REF_WEAK (x)
9417 && !aarch64_symbol_binds_local_p (x))
9418 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9419 HOST_WIDE_INT_C (4294967264)))
9420 return SYMBOL_FORCE_TO_MEM;
9421 return SYMBOL_SMALL_ABSOLUTE;
9423 case AARCH64_CMODEL_TINY_PIC:
9424 if (!aarch64_symbol_binds_local_p (x))
9425 return SYMBOL_TINY_GOT;
9426 return SYMBOL_TINY_ABSOLUTE;
9428 case AARCH64_CMODEL_SMALL_SPIC:
9429 case AARCH64_CMODEL_SMALL_PIC:
9430 if (!aarch64_symbol_binds_local_p (x))
9431 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9432 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9433 return SYMBOL_SMALL_ABSOLUTE;
9435 case AARCH64_CMODEL_LARGE:
9436 /* This is alright even in PIC code as the constant
9437 pool reference is always PC relative and within
9438 the same translation unit. */
9439 if (CONSTANT_POOL_ADDRESS_P (x))
9440 return SYMBOL_SMALL_ABSOLUTE;
9441 else
9442 return SYMBOL_FORCE_TO_MEM;
9444 default:
9445 gcc_unreachable ();
9449 /* By default push everything into the constant pool. */
9450 return SYMBOL_FORCE_TO_MEM;
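/* For example, under the small PIC code models a symbol that does not
   bind locally is accessed through the GOT (SYMBOL_SMALL_GOT_4G for -fPIC,
   SYMBOL_SMALL_GOT_28K for -fpic), while under the non-PIC small model a
   weak reference that may resolve outside this module, or an offset
   outside roughly +/-4G, is spilled to the literal pool
   (SYMBOL_FORCE_TO_MEM).  */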
9453 bool
9454 aarch64_constant_address_p (rtx x)
9456 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9459 bool
9460 aarch64_legitimate_pic_operand_p (rtx x)
9462 if (GET_CODE (x) == SYMBOL_REF
9463 || (GET_CODE (x) == CONST
9464 && GET_CODE (XEXP (x, 0)) == PLUS
9465 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9466 return false;
9468 return true;
9471 /* Return true if X holds either a valid 'quarter-precision' floating-point
9472 immediate or the floating-point constant +0.0. */
9473 static bool
9474 aarch64_valid_floating_const (machine_mode mode, rtx x)
9476 if (!CONST_DOUBLE_P (x))
9477 return false;
9479 if (aarch64_float_const_zero_rtx_p (x))
9480 return true;
9482 /* Only 0.0 (accepted above) is handled for TFmode; other constants must be SFmode or DFmode. */
9483 if (!(mode == SFmode || mode == DFmode))
9484 return false;
9486 return aarch64_float_const_representable_p (x);
9489 static bool
9490 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9492 /* Do not allow vector struct mode constants. We could support
9493 0 and -1 easily, but they need support in aarch64-simd.md. */
9494 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9495 return false;
9497 /* This could probably go away because
9498 we now decompose CONST_INTs according to expand_mov_immediate. */
9499 if ((GET_CODE (x) == CONST_VECTOR
9500 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9501 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9502 return !targetm.cannot_force_const_mem (mode, x);
9504 if (GET_CODE (x) == HIGH
9505 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9506 return true;
9508 return aarch64_constant_address_p (x);
9511 rtx
9512 aarch64_load_tp (rtx target)
9514 if (!target
9515 || GET_MODE (target) != Pmode
9516 || !register_operand (target, Pmode))
9517 target = gen_reg_rtx (Pmode);
9519 /* Can return in any reg. */
9520 emit_insn (gen_aarch64_load_tp_hard (target));
9521 return target;
9524 /* On AAPCS systems, this is the "struct __va_list". */
9525 static GTY(()) tree va_list_type;
9527 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
9528 Return the type to use as __builtin_va_list.
9530 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
9532 struct __va_list
9534 void *__stack;
9535 void *__gr_top;
9536 void *__vr_top;
9537 int __gr_offs;
9538 int __vr_offs;
9539 }; */
9541 static tree
9542 aarch64_build_builtin_va_list (void)
9544 tree va_list_name;
9545 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9547 /* Create the type. */
9548 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
9549 /* Give it the required name. */
9550 va_list_name = build_decl (BUILTINS_LOCATION,
9551 TYPE_DECL,
9552 get_identifier ("__va_list"),
9553 va_list_type);
9554 DECL_ARTIFICIAL (va_list_name) = 1;
9555 TYPE_NAME (va_list_type) = va_list_name;
9556 TYPE_STUB_DECL (va_list_type) = va_list_name;
9558 /* Create the fields. */
9559 f_stack = build_decl (BUILTINS_LOCATION,
9560 FIELD_DECL, get_identifier ("__stack"),
9561 ptr_type_node);
9562 f_grtop = build_decl (BUILTINS_LOCATION,
9563 FIELD_DECL, get_identifier ("__gr_top"),
9564 ptr_type_node);
9565 f_vrtop = build_decl (BUILTINS_LOCATION,
9566 FIELD_DECL, get_identifier ("__vr_top"),
9567 ptr_type_node);
9568 f_groff = build_decl (BUILTINS_LOCATION,
9569 FIELD_DECL, get_identifier ("__gr_offs"),
9570 integer_type_node);
9571 f_vroff = build_decl (BUILTINS_LOCATION,
9572 FIELD_DECL, get_identifier ("__vr_offs"),
9573 integer_type_node);
9575 /* Tell tree-stdarg pass about our internal offset fields.
9576 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
9577 purposes, to identify whether the code is updating va_list internal
9578 offset fields in an irregular way. */
9579 va_list_gpr_counter_field = f_groff;
9580 va_list_fpr_counter_field = f_vroff;
9582 DECL_ARTIFICIAL (f_stack) = 1;
9583 DECL_ARTIFICIAL (f_grtop) = 1;
9584 DECL_ARTIFICIAL (f_vrtop) = 1;
9585 DECL_ARTIFICIAL (f_groff) = 1;
9586 DECL_ARTIFICIAL (f_vroff) = 1;
9588 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
9589 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
9590 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
9591 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
9592 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
9594 TYPE_FIELDS (va_list_type) = f_stack;
9595 DECL_CHAIN (f_stack) = f_grtop;
9596 DECL_CHAIN (f_grtop) = f_vrtop;
9597 DECL_CHAIN (f_vrtop) = f_groff;
9598 DECL_CHAIN (f_groff) = f_vroff;
9600 /* Compute its layout. */
9601 layout_type (va_list_type);
9603 return va_list_type;
9606 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
9607 static void
9608 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
9610 const CUMULATIVE_ARGS *cum;
9611 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9612 tree stack, grtop, vrtop, groff, vroff;
9613 tree t;
9614 int gr_save_area_size = cfun->va_list_gpr_size;
9615 int vr_save_area_size = cfun->va_list_fpr_size;
9616 int vr_offset;
9618 cum = &crtl->args.info;
9619 if (cfun->va_list_gpr_size)
9620 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
9621 cfun->va_list_gpr_size);
9622 if (cfun->va_list_fpr_size)
9623 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
9624 * UNITS_PER_VREG, cfun->va_list_fpr_size);
9626 if (!TARGET_FLOAT)
9628 gcc_assert (cum->aapcs_nvrn == 0);
9629 vr_save_area_size = 0;
9632 f_stack = TYPE_FIELDS (va_list_type_node);
9633 f_grtop = DECL_CHAIN (f_stack);
9634 f_vrtop = DECL_CHAIN (f_grtop);
9635 f_groff = DECL_CHAIN (f_vrtop);
9636 f_vroff = DECL_CHAIN (f_groff);
9638 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
9639 NULL_TREE);
9640 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
9641 NULL_TREE);
9642 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
9643 NULL_TREE);
9644 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
9645 NULL_TREE);
9646 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
9647 NULL_TREE);
9649 /* Emit code to initialize STACK, which points to the next varargs stack
9650 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
9651 by named arguments. STACK is 8-byte aligned. */
9652 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
9653 if (cum->aapcs_stack_size > 0)
9654 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
9655 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
9656 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9658 /* Emit code to initialize GRTOP, the top of the GR save area.
9659 virtual_incoming_args_rtx should have been 16 byte aligned. */
9660 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
9661 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
9662 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9664 /* Emit code to initialize VRTOP, the top of the VR save area.
9665 This address is gr_save_area_bytes below GRTOP, rounded
9666 down to the next 16-byte boundary. */
9667 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
9668 vr_offset = ROUND_UP (gr_save_area_size,
9669 STACK_BOUNDARY / BITS_PER_UNIT);
9671 if (vr_offset)
9672 t = fold_build_pointer_plus_hwi (t, -vr_offset);
9673 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
9674 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9676 /* Emit code to initialize GROFF, the offset from GRTOP of the
9677 next GPR argument. */
9678 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
9679 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
9680 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
9682 /* Likewise emit code to initialize VROFF, the offset from VRTOP
9683 of the next VR argument. */
9684 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
9685 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
9686 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
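/* A worked example of the values set up above (register counts and sizes
   assumed to be the usual 8 x 8-byte X registers and 8 x 16-byte V
   registers): if the named arguments used one general register and no
   vector registers, then gr_save_area_size = 7 * 8 = 56 and
   vr_save_area_size = 8 * 16 = 128, giving
     __stack   = address of the first anonymous stack argument
     __gr_top  = top of the x1..x7 save area
     __vr_top  = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128
   va_arg then walks each offset towards zero and falls back to __stack
   once the relevant offset becomes non-negative (see
   aarch64_gimplify_va_arg_expr below).  */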
9689 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
9691 static tree
9692 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
9693 gimple_seq *post_p ATTRIBUTE_UNUSED)
9695 tree addr;
9696 bool indirect_p;
9697 bool is_ha; /* is HFA or HVA. */
9698 bool dw_align; /* double-word align. */
9699 machine_mode ag_mode = VOIDmode;
9700 int nregs;
9701 machine_mode mode;
9703 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
9704 tree stack, f_top, f_off, off, arg, roundup, on_stack;
9705 HOST_WIDE_INT size, rsize, adjust, align;
9706 tree t, u, cond1, cond2;
9708 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
9709 if (indirect_p)
9710 type = build_pointer_type (type);
9712 mode = TYPE_MODE (type);
9714 f_stack = TYPE_FIELDS (va_list_type_node);
9715 f_grtop = DECL_CHAIN (f_stack);
9716 f_vrtop = DECL_CHAIN (f_grtop);
9717 f_groff = DECL_CHAIN (f_vrtop);
9718 f_vroff = DECL_CHAIN (f_groff);
9720 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
9721 f_stack, NULL_TREE);
9722 size = int_size_in_bytes (type);
9723 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
9725 dw_align = false;
9726 adjust = 0;
9727 if (aarch64_vfp_is_call_or_return_candidate (mode,
9728 type,
9729 &ag_mode,
9730 &nregs,
9731 &is_ha))
9733 /* TYPE passed in fp/simd registers. */
9734 if (!TARGET_FLOAT)
9735 aarch64_err_no_fpadvsimd (mode, "varargs");
9737 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
9738 unshare_expr (valist), f_vrtop, NULL_TREE);
9739 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
9740 unshare_expr (valist), f_vroff, NULL_TREE);
9742 rsize = nregs * UNITS_PER_VREG;
9744 if (is_ha)
9746 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
9747 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
9749 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
9750 && size < UNITS_PER_VREG)
9752 adjust = UNITS_PER_VREG - size;
9755 else
9757 /* TYPE passed in general registers. */
9758 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
9759 unshare_expr (valist), f_grtop, NULL_TREE);
9760 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
9761 unshare_expr (valist), f_groff, NULL_TREE);
9762 rsize = ROUND_UP (size, UNITS_PER_WORD);
9763 nregs = rsize / UNITS_PER_WORD;
9765 if (align > 8)
9766 dw_align = true;
9768 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9769 && size < UNITS_PER_WORD)
9771 adjust = UNITS_PER_WORD - size;
9775 /* Get a local temporary for the field value. */
9776 off = get_initialized_tmp_var (f_off, pre_p, NULL);
9778 /* Emit code to branch if off >= 0. */
9779 t = build2 (GE_EXPR, boolean_type_node, off,
9780 build_int_cst (TREE_TYPE (off), 0));
9781 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
9783 if (dw_align)
9785 /* Emit: offs = (offs + 15) & -16. */
9786 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9787 build_int_cst (TREE_TYPE (off), 15));
9788 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
9789 build_int_cst (TREE_TYPE (off), -16));
9790 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
9792 else
9793 roundup = NULL;
9795 /* Update ap.__[g|v]r_offs */
9796 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
9797 build_int_cst (TREE_TYPE (off), rsize));
9798 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
9800 /* String up. */
9801 if (roundup)
9802 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9804 /* [cond2] if (ap.__[g|v]r_offs > 0) */
9805 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
9806 build_int_cst (TREE_TYPE (f_off), 0));
9807 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
9809 /* String up: make sure the assignment happens before the use. */
9810 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
9811 COND_EXPR_ELSE (cond1) = t;
9813 /* Prepare the trees handling the argument that is passed on the stack;
9814 the top-level node will be stored in ON_STACK. */
9815 arg = get_initialized_tmp_var (stack, pre_p, NULL);
9816 if (align > 8)
9818 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
9819 t = fold_convert (intDI_type_node, arg);
9820 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9821 build_int_cst (TREE_TYPE (t), 15));
9822 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9823 build_int_cst (TREE_TYPE (t), -16));
9824 t = fold_convert (TREE_TYPE (arg), t);
9825 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
9827 else
9828 roundup = NULL;
9829 /* Advance ap.__stack */
9830 t = fold_convert (intDI_type_node, arg);
9831 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
9832 build_int_cst (TREE_TYPE (t), size + 7));
9833 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
9834 build_int_cst (TREE_TYPE (t), -8));
9835 t = fold_convert (TREE_TYPE (arg), t);
9836 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
9837 /* String up roundup and advance. */
9838 if (roundup)
9839 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
9840 /* String up with arg */
9841 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
9842 /* Big-endianness related address adjustment. */
9843 if (BLOCK_REG_PADDING (mode, type, 1) == downward
9844 && size < UNITS_PER_WORD)
9846 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
9847 size_int (UNITS_PER_WORD - size));
9848 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
9851 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
9852 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
9854 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
9855 t = off;
9856 if (adjust)
9857 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
9858 build_int_cst (TREE_TYPE (off), adjust));
9860 t = fold_convert (sizetype, t);
9861 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
9863 if (is_ha)
9865 /* type ha; // treat as "struct {ftype field[n];}"
9866 ... [computing offs]
9867 for (i = 0; i <nregs; ++i, offs += 16)
9868 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
9869 return ha; */
9870 int i;
9871 tree tmp_ha, field_t, field_ptr_t;
9873 /* Declare a local variable. */
9874 tmp_ha = create_tmp_var_raw (type, "ha");
9875 gimple_add_tmp_var (tmp_ha);
9877 /* Establish the base type. */
9878 switch (ag_mode)
9880 case SFmode:
9881 field_t = float_type_node;
9882 field_ptr_t = float_ptr_type_node;
9883 break;
9884 case DFmode:
9885 field_t = double_type_node;
9886 field_ptr_t = double_ptr_type_node;
9887 break;
9888 case TFmode:
9889 field_t = long_double_type_node;
9890 field_ptr_t = long_double_ptr_type_node;
9891 break;
9892 case HFmode:
9893 field_t = aarch64_fp16_type_node;
9894 field_ptr_t = aarch64_fp16_ptr_type_node;
9895 break;
9896 case V2SImode:
9897 case V4SImode:
9899 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
9900 field_t = build_vector_type_for_mode (innertype, ag_mode);
9901 field_ptr_t = build_pointer_type (field_t);
9903 break;
9904 default:
9905 gcc_assert (0);
9908 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area) */
9909 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
9910 addr = t;
9911 t = fold_convert (field_ptr_t, addr);
9912 t = build2 (MODIFY_EXPR, field_t,
9913 build1 (INDIRECT_REF, field_t, tmp_ha),
9914 build1 (INDIRECT_REF, field_t, t));
9916 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
9917 for (i = 1; i < nregs; ++i)
9919 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
9920 u = fold_convert (field_ptr_t, addr);
9921 u = build2 (MODIFY_EXPR, field_t,
9922 build2 (MEM_REF, field_t, tmp_ha,
9923 build_int_cst (field_ptr_t,
9924 (i *
9925 int_size_in_bytes (field_t)))),
9926 build1 (INDIRECT_REF, field_t, u));
9927 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
9930 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
9931 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
9934 COND_EXPR_ELSE (cond2) = t;
9935 addr = fold_convert (build_pointer_type (type), cond1);
9936 addr = build_va_arg_indirect_ref (addr);
9938 if (indirect_p)
9939 addr = build_va_arg_indirect_ref (addr);
9941 return addr;
9944 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
9946 static void
9947 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
9948 tree type, int *pretend_size ATTRIBUTE_UNUSED,
9949 int no_rtl)
9951 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
9952 CUMULATIVE_ARGS local_cum;
9953 int gr_saved = cfun->va_list_gpr_size;
9954 int vr_saved = cfun->va_list_fpr_size;
9956 /* The caller has advanced CUM up to, but not beyond, the last named
9957 argument. Advance a local copy of CUM past the last "real" named
9958 argument, to find out how many registers are left over. */
9959 local_cum = *cum;
9960 aarch64_function_arg_advance (pack_cumulative_args(&local_cum), mode, type, true);
9962 /* Find out how many registers we need to save.
9963 Honor the tree-stdarg analysis results. */
9964 if (cfun->va_list_gpr_size)
9965 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
9966 cfun->va_list_gpr_size / UNITS_PER_WORD);
9967 if (cfun->va_list_fpr_size)
9968 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
9969 cfun->va_list_fpr_size / UNITS_PER_VREG);
9971 if (!TARGET_FLOAT)
9973 gcc_assert (local_cum.aapcs_nvrn == 0);
9974 vr_saved = 0;
9977 if (!no_rtl)
9979 if (gr_saved > 0)
9981 rtx ptr, mem;
9983 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
9984 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
9985 - gr_saved * UNITS_PER_WORD);
9986 mem = gen_frame_mem (BLKmode, ptr);
9987 set_mem_alias_set (mem, get_varargs_alias_set ());
9989 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
9990 mem, gr_saved);
9992 if (vr_saved > 0)
9994 /* We can't use move_block_from_reg, because it will use
9995 the wrong mode, storing D regs only. */
9996 machine_mode mode = TImode;
9997 int off, i, vr_start;
9999 /* Set OFF to the offset from virtual_incoming_args_rtx of
10000 the first vector register. The VR save area lies below
10001 the GR one, and is aligned to 16 bytes. */
10002 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10003 STACK_BOUNDARY / BITS_PER_UNIT);
10004 off -= vr_saved * UNITS_PER_VREG;
10006 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10007 for (i = 0; i < vr_saved; ++i)
10009 rtx ptr, mem;
10011 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10012 mem = gen_frame_mem (mode, ptr);
10013 set_mem_alias_set (mem, get_varargs_alias_set ());
10014 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10015 off += UNITS_PER_VREG;
10020 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10021 any complication of having crtl->args.pretend_args_size changed. */
10022 cfun->machine->frame.saved_varargs_size
10023 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10024 STACK_BOUNDARY / BITS_PER_UNIT)
10025 + vr_saved * UNITS_PER_VREG);
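/* Worked example: with 3 general registers and 2 vector registers still to
   be dumped (UNITS_PER_WORD == 8, UNITS_PER_VREG == 16,
   STACK_BOUNDARY / BITS_PER_UNIT == 16),

     saved_varargs_size = ROUND_UP (3 * 8, 16) + 2 * 16 = 32 + 32 = 64

   and the first vector register is stored at offset
   -ROUND_UP (3 * 8, 16) - 2 * 16 = -64 from virtual_incoming_args_rtx,
   with the GR words themselves occupying the 24 bytes just below that
   pointer.  */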
10028 static void
10029 aarch64_conditional_register_usage (void)
10031 int i;
10032 if (!TARGET_FLOAT)
10034 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10036 fixed_regs[i] = 1;
10037 call_used_regs[i] = 1;
10042 /* Walk down the type tree of TYPE counting consecutive base elements.
10043 If *MODEP is VOIDmode, then set it to the first valid floating point
10044 type. If a non-floating point type is found, or if a floating point
10045 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10046 otherwise return the count in the sub-tree. */
10047 static int
10048 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10050 machine_mode mode;
10051 HOST_WIDE_INT size;
10053 switch (TREE_CODE (type))
10055 case REAL_TYPE:
10056 mode = TYPE_MODE (type);
10057 if (mode != DFmode && mode != SFmode
10058 && mode != TFmode && mode != HFmode)
10059 return -1;
10061 if (*modep == VOIDmode)
10062 *modep = mode;
10064 if (*modep == mode)
10065 return 1;
10067 break;
10069 case COMPLEX_TYPE:
10070 mode = TYPE_MODE (TREE_TYPE (type));
10071 if (mode != DFmode && mode != SFmode
10072 && mode != TFmode && mode != HFmode)
10073 return -1;
10075 if (*modep == VOIDmode)
10076 *modep = mode;
10078 if (*modep == mode)
10079 return 2;
10081 break;
10083 case VECTOR_TYPE:
10084 /* Use V2SImode and V4SImode as representatives of all 64-bit
10085 and 128-bit vector types. */
10086 size = int_size_in_bytes (type);
10087 switch (size)
10089 case 8:
10090 mode = V2SImode;
10091 break;
10092 case 16:
10093 mode = V4SImode;
10094 break;
10095 default:
10096 return -1;
10099 if (*modep == VOIDmode)
10100 *modep = mode;
10102 /* Vector modes are considered to be opaque: two vectors are
10103 equivalent for the purposes of being homogeneous aggregates
10104 if they are the same size. */
10105 if (*modep == mode)
10106 return 1;
10108 break;
10110 case ARRAY_TYPE:
10112 int count;
10113 tree index = TYPE_DOMAIN (type);
10115 /* Can't handle incomplete types nor sizes that are not
10116 fixed. */
10117 if (!COMPLETE_TYPE_P (type)
10118 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10119 return -1;
10121 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10122 if (count == -1
10123 || !index
10124 || !TYPE_MAX_VALUE (index)
10125 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10126 || !TYPE_MIN_VALUE (index)
10127 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10128 || count < 0)
10129 return -1;
10131 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10132 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10134 /* There must be no padding. */
10135 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10136 return -1;
10138 return count;
10141 case RECORD_TYPE:
10143 int count = 0;
10144 int sub_count;
10145 tree field;
10147 /* Can't handle incomplete types nor sizes that are not
10148 fixed. */
10149 if (!COMPLETE_TYPE_P (type)
10150 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10151 return -1;
10153 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10155 if (TREE_CODE (field) != FIELD_DECL)
10156 continue;
10158 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10159 if (sub_count < 0)
10160 return -1;
10161 count += sub_count;
10164 /* There must be no padding. */
10165 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10166 return -1;
10168 return count;
10171 case UNION_TYPE:
10172 case QUAL_UNION_TYPE:
10174 /* These aren't very interesting except in a degenerate case. */
10175 int count = 0;
10176 int sub_count;
10177 tree field;
10179 /* Can't handle incomplete types nor sizes that are not
10180 fixed. */
10181 if (!COMPLETE_TYPE_P (type)
10182 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10183 return -1;
10185 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10187 if (TREE_CODE (field) != FIELD_DECL)
10188 continue;
10190 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10191 if (sub_count < 0)
10192 return -1;
10193 count = count > sub_count ? count : sub_count;
10196 /* There must be no padding. */
10197 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10198 return -1;
10200 return count;
10203 default:
10204 break;
10207 return -1;
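/* Illustrative examples; the type names are hypothetical.  They show what
   aapcs_vfp_sub_candidate returns for a few C types; the caller only
   accepts counts between 1 and HA_MAX_NUM_FLDS.  */
#if 0
struct hfa3   { float x, y, z; };      /* 3 SFmode elements -> returns 3.  */
struct hda4   { double d[4]; };        /* 4 DFmode elements -> returns 4.  */
struct mixed  { float x; double y; };  /* SFmode vs DFmode mismatch -> -1.  */
struct padded { float x; int pad; };   /* Non-FP member -> -1.  */
struct big    { float f[5]; };         /* Returns 5; rejected by the caller
                                          because 5 > HA_MAX_NUM_FLDS.  */
#endif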
10210 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10211 type as described in AAPCS64 \S 4.1.2.
10213 See the comment above aarch64_composite_type_p for the notes on MODE. */
10215 static bool
10216 aarch64_short_vector_p (const_tree type,
10217 machine_mode mode)
10219 HOST_WIDE_INT size = -1;
10221 if (type && TREE_CODE (type) == VECTOR_TYPE)
10222 size = int_size_in_bytes (type);
10223 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10224 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10225 size = GET_MODE_SIZE (mode);
10227 return (size == 8 || size == 16);
10230 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10231 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10232 array types. The C99 floating-point complex types are also considered
10233 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10234 types, which are GCC extensions and out of the scope of AAPCS64, are
10235 treated as composite types here as well.
10237 Note that MODE itself is not sufficient in determining whether a type
10238 is such a composite type or not. This is because
10239 stor-layout.c:compute_record_mode may have already changed the MODE
10240 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10241 structure with only one field may have its MODE set to the mode of the
10242 field. Also an integer mode whose size matches the size of the
10243 RECORD_TYPE type may be used to substitute the original mode
10244 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10245 solely relied on. */
10247 static bool
10248 aarch64_composite_type_p (const_tree type,
10249 machine_mode mode)
10251 if (aarch64_short_vector_p (type, mode))
10252 return false;
10254 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10255 return true;
10257 if (mode == BLKmode
10258 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10259 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10260 return true;
10262 return false;
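/* Illustrative example; the type name is hypothetical.  It shows why the
   TYPE check above cannot be replaced by a check on MODE alone:
   stor-layout gives this single-field record the mode of its field
   (SFmode), yet it is still a composite type for AAPCS64 purposes and is
   classified here through AGGREGATE_TYPE_P.  */
#if 0
struct wrapped_float { float f; };     /* TYPE_MODE == SFmode, composite.  */
#endif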
10265 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10266 shall be passed or returned in simd/fp register(s) (providing these
10267 parameter passing registers are available).
10269 Upon successful return, *COUNT returns the number of needed registers,
10270 *BASE_MODE returns the mode of the individual register and when IS_HA
10271 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10272 floating-point aggregate or a homogeneous short-vector aggregate. */
10274 static bool
10275 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10276 const_tree type,
10277 machine_mode *base_mode,
10278 int *count,
10279 bool *is_ha)
10281 machine_mode new_mode = VOIDmode;
10282 bool composite_p = aarch64_composite_type_p (type, mode);
10284 if (is_ha != NULL) *is_ha = false;
10286 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10287 || aarch64_short_vector_p (type, mode))
10289 *count = 1;
10290 new_mode = mode;
10292 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10294 if (is_ha != NULL) *is_ha = true;
10295 *count = 2;
10296 new_mode = GET_MODE_INNER (mode);
10298 else if (type && composite_p)
10300 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10302 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10304 if (is_ha != NULL) *is_ha = true;
10305 *count = ag_count;
10307 else
10308 return false;
10310 else
10311 return false;
10313 *base_mode = new_mode;
10314 return true;
10317 /* Implement TARGET_STRUCT_VALUE_RTX. */
10319 static rtx
10320 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10321 int incoming ATTRIBUTE_UNUSED)
10323 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10326 /* Implements target hook vector_mode_supported_p. */
10327 static bool
10328 aarch64_vector_mode_supported_p (machine_mode mode)
10330 if (TARGET_SIMD
10331 && (mode == V4SImode || mode == V8HImode
10332 || mode == V16QImode || mode == V2DImode
10333 || mode == V2SImode || mode == V4HImode
10334 || mode == V8QImode || mode == V2SFmode
10335 || mode == V4SFmode || mode == V2DFmode
10336 || mode == V4HFmode || mode == V8HFmode
10337 || mode == V1DFmode))
10338 return true;
10340 return false;
10343 /* Return appropriate SIMD container
10344 for MODE within a vector of WIDTH bits. */
10345 static machine_mode
10346 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10348 gcc_assert (width == 64 || width == 128);
10349 if (TARGET_SIMD)
10351 if (width == 128)
10352 switch (mode)
10354 case DFmode:
10355 return V2DFmode;
10356 case SFmode:
10357 return V4SFmode;
10358 case SImode:
10359 return V4SImode;
10360 case HImode:
10361 return V8HImode;
10362 case QImode:
10363 return V16QImode;
10364 case DImode:
10365 return V2DImode;
10366 default:
10367 break;
10369 else
10370 switch (mode)
10372 case SFmode:
10373 return V2SFmode;
10374 case SImode:
10375 return V2SImode;
10376 case HImode:
10377 return V4HImode;
10378 case QImode:
10379 return V8QImode;
10380 default:
10381 break;
10384 return word_mode;
10387 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10388 static machine_mode
10389 aarch64_preferred_simd_mode (machine_mode mode)
10391 return aarch64_simd_container_mode (mode, 128);
10394 /* Return the bitmask of possible vector sizes for the vectorizer
10395 to iterate over. */
10396 static unsigned int
10397 aarch64_autovectorize_vector_sizes (void)
10399 return (16 | 8);
10402 /* Implement TARGET_MANGLE_TYPE. */
10404 static const char *
10405 aarch64_mangle_type (const_tree type)
10407 /* The AArch64 ABI documents say that "__va_list" has to be
10408 mangled as if it is in the "std" namespace. */
10409 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10410 return "St9__va_list";
10412 /* Half-precision float. */
10413 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10414 return "Dh";
10416 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10417 builtin types. */
10418 if (TYPE_NAME (type) != NULL)
10419 return aarch64_mangle_builtin_type (type);
10421 /* Use the default mangling. */
10422 return NULL;
10426 /* Return true if the rtx_insn contains a MEM RTX somewhere
10427 in it. */
10429 static bool
10430 has_memory_op (rtx_insn *mem_insn)
10432 subrtx_iterator::array_type array;
10433 FOR_EACH_SUBRTX (iter, array, PATTERN (mem_insn), ALL)
10434 if (MEM_P (*iter))
10435 return true;
10437 return false;
10440 /* Find the first rtx_insn before insn that will generate an assembly
10441 instruction. */
10443 static rtx_insn *
10444 aarch64_prev_real_insn (rtx_insn *insn)
10446 if (!insn)
10447 return NULL;
10451 insn = prev_real_insn (insn);
10453 while (insn && recog_memoized (insn) < 0);
10455 return insn;
10458 static bool
10459 is_madd_op (enum attr_type t1)
10461 unsigned int i;
10462 /* A number of these may be AArch32 only. */
10463 enum attr_type mlatypes[] = {
10464 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10465 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10466 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10469 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10471 if (t1 == mlatypes[i])
10472 return true;
10475 return false;
10478 /* Check if there is a register dependency between a load and the insn
10479 for which we hold recog_data. */
10481 static bool
10482 dep_between_memop_and_curr (rtx memop)
10484 rtx load_reg;
10485 int opno;
10487 gcc_assert (GET_CODE (memop) == SET);
10489 if (!REG_P (SET_DEST (memop)))
10490 return false;
10492 load_reg = SET_DEST (memop);
10493 for (opno = 1; opno < recog_data.n_operands; opno++)
10495 rtx operand = recog_data.operand[opno];
10496 if (REG_P (operand)
10497 && reg_overlap_mentioned_p (load_reg, operand))
10498 return true;
10501 return false;
10505 /* When working around the Cortex-A53 erratum 835769,
10506 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10507 instruction and has a preceding memory instruction such that a NOP
10508 should be inserted between them. */
10510 bool
10511 aarch64_madd_needs_nop (rtx_insn* insn)
10513 enum attr_type attr_type;
10514 rtx_insn *prev;
10515 rtx body;
10517 if (!TARGET_FIX_ERR_A53_835769)
10518 return false;
10520 if (!INSN_P (insn) || recog_memoized (insn) < 0)
10521 return false;
10523 attr_type = get_attr_type (insn);
10524 if (!is_madd_op (attr_type))
10525 return false;
10527 prev = aarch64_prev_real_insn (insn);
10528 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
10529 Restore recog state to INSN to avoid state corruption. */
10530 extract_constrain_insn_cached (insn);
10532 if (!prev || !has_memory_op (prev))
10533 return false;
10535 body = single_set (prev);
10537 /* If the previous insn is a memory op and there is no dependency between
10538 it and the DImode madd, emit a NOP between them. If body is NULL then we
10539 have a complex memory operation, probably a load/store pair.
10540 Be conservative for now and emit a NOP. */
10541 if (GET_MODE (recog_data.operand[0]) == DImode
10542 && (!body || !dep_between_memop_and_curr (body)))
10543 return true;
10545 return false;
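/* Worked example; register numbers are arbitrary.  With
   -mfix-cortex-a53-835769, a 64-bit multiply-accumulate that follows a
   memory access with no register dependency between them triggers the
   workaround:

        ldr     x1, [x2]
        madd    x3, x4, x5, x6

   becomes, via FINAL_PRESCAN_INSN below,

        ldr     x1, [x2]
        nop     // between mem op and mult-accumulate
        madd    x3, x4, x5, x6

   whereas a dependent pair (say the madd reading x1) is left alone, since
   dep_between_memop_and_curr detects the overlap.  */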
10550 /* Implement FINAL_PRESCAN_INSN. */
10552 void
10553 aarch64_final_prescan_insn (rtx_insn *insn)
10555 if (aarch64_madd_needs_nop (insn))
10556 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
10560 /* Return the equivalent letter for size. */
10561 static char
10562 sizetochar (int size)
10564 switch (size)
10566 case 64: return 'd';
10567 case 32: return 's';
10568 case 16: return 'h';
10569 case 8 : return 'b';
10570 default: gcc_unreachable ();
10574 /* Return true iff x is a uniform vector of floating-point
10575 constants, and the constant can be represented in
10576 quarter-precision form.  Note that, as aarch64_float_const_representable_p
10577 rejects both +0.0 and -0.0, this function rejects them too.  */
10578 static bool
10579 aarch64_vect_float_const_representable_p (rtx x)
10581 rtx elt;
10582 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
10583 && const_vec_duplicate_p (x, &elt)
10584 && aarch64_float_const_representable_p (elt));
10587 /* Return true for valid and false for invalid. */
10588 bool
10589 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
10590 struct simd_immediate_info *info)
10592 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
10593 matches = 1; \
10594 for (i = 0; i < idx; i += (STRIDE)) \
10595 if (!(TEST)) \
10596 matches = 0; \
10597 if (matches) \
10599 immtype = (CLASS); \
10600 elsize = (ELSIZE); \
10601 eshift = (SHIFT); \
10602 emvn = (NEG); \
10603 break; \
10606 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
10607 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
10608 unsigned char bytes[16];
10609 int immtype = -1, matches;
10610 unsigned int invmask = inverse ? 0xff : 0;
10611 int eshift, emvn;
10613 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10615 if (! (aarch64_simd_imm_zero_p (op, mode)
10616 || aarch64_vect_float_const_representable_p (op)))
10617 return false;
10619 if (info)
10621 info->value = CONST_VECTOR_ELT (op, 0);
10622 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
10623 info->mvn = false;
10624 info->shift = 0;
10627 return true;
10630 /* Splat vector constant out into a byte vector. */
10631 for (i = 0; i < n_elts; i++)
10633 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
10634 it must be laid out in the vector register in reverse order. */
10635 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
10636 unsigned HOST_WIDE_INT elpart;
10638 gcc_assert (CONST_INT_P (el));
10639 elpart = INTVAL (el);
10641 for (unsigned int byte = 0; byte < innersize; byte++)
10643 bytes[idx++] = (elpart & 0xff) ^ invmask;
10644 elpart >>= BITS_PER_UNIT;
10649 /* Sanity check. */
10650 gcc_assert (idx == GET_MODE_SIZE (mode));
10654 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
10655 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
10657 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10658 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10660 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
10661 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10663 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
10664 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
10666 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
10668 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
10670 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
10671 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
10673 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10674 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10676 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
10677 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10679 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
10680 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
10682 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
10684 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
10686 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
10687 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
10689 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
10690 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
10692 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
10693 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
10695 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
10696 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
10698 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
10700 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
10701 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
10703 while (0);
10705 if (immtype == -1)
10706 return false;
10708 if (info)
10710 info->element_width = elsize;
10711 info->mvn = emvn != 0;
10712 info->shift = eshift;
10714 unsigned HOST_WIDE_INT imm = 0;
10716 if (immtype >= 12 && immtype <= 15)
10717 info->msl = true;
10719 /* Un-invert bytes of recognized vector, if necessary. */
10720 if (invmask != 0)
10721 for (i = 0; i < idx; i++)
10722 bytes[i] ^= invmask;
10724 if (immtype == 17)
10726 /* FIXME: Broken on 32-bit H_W_I hosts. */
10727 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
10729 for (i = 0; i < 8; i++)
10730 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
10731 << (i * BITS_PER_UNIT);
10734 info->value = GEN_INT (imm);
10736 else
10738 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
10739 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
10741 /* Construct 'abcdefgh' because the assembler cannot handle
10742 generic constants. */
10743 if (info->mvn)
10744 imm = ~imm;
10745 imm = (imm >> info->shift) & 0xff;
10746 info->value = GEN_INT (imm);
10750 return true;
10751 #undef CHECK
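/* Worked example.  A V4SImode constant with every element equal to
   0x00ff0000 splats to the byte pattern { 00 00 ff 00, 00 00 ff 00, ... },
   which matches the third CHECK above (immtype 2, elsize 32, shift 16,
   no MVN).  The returned info therefore carries the 'abcdefgh' byte 0xff
   with a shift of 16, which aarch64_output_simd_mov_immediate later
   renders as "movi\t%0.4s, 0xff, lsl 16".  A replicated 0x12345678, by
   contrast, matches none of the patterns and the function returns
   false.  */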
10754 /* Check if immediate shift constants are within range. */
10755 bool
10756 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
10758 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
10759 if (left)
10760 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
10761 else
10762 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
10765 /* Return true if X is a uniform vector where all elements
10766 are either the floating-point constant 0.0 or the
10767 integer constant 0. */
10768 bool
10769 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
10771 return x == CONST0_RTX (mode);
10775 /* Return the bitmask CONST_INT to select the bits required by a zero extract
10776 operation of width WIDTH at bit position POS. */
10779 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
10781 gcc_assert (CONST_INT_P (width));
10782 gcc_assert (CONST_INT_P (pos));
10784 unsigned HOST_WIDE_INT mask
10785 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
10786 return GEN_INT (mask << UINTVAL (pos));
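/* Worked example: a zero_extract of WIDTH 8 at POS 16 yields
   ((1 << 8) - 1) << 16 == 0x00ff0000, i.e. the CONST_INT selecting
   bits 16..23.  */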
10789 bool
10790 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
10792 HOST_WIDE_INT imm = INTVAL (x);
10793 int i;
10795 for (i = 0; i < 8; i++)
10797 unsigned int byte = imm & 0xff;
10798 if (byte != 0xff && byte != 0)
10799 return false;
10800 imm >>= 8;
10803 return true;
10806 bool
10807 aarch64_mov_operand_p (rtx x, machine_mode mode)
10809 if (GET_CODE (x) == HIGH
10810 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10811 return true;
10813 if (CONST_INT_P (x))
10814 return true;
10816 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
10817 return true;
10819 return aarch64_classify_symbolic_expression (x)
10820 == SYMBOL_TINY_ABSOLUTE;
10823 /* Return a const_int vector of VAL. */
10825 aarch64_simd_gen_const_vector_dup (machine_mode mode, int val)
10827 int nunits = GET_MODE_NUNITS (mode);
10828 rtvec v = rtvec_alloc (nunits);
10829 int i;
10831 for (i=0; i < nunits; i++)
10832 RTVEC_ELT (v, i) = GEN_INT (val);
10834 return gen_rtx_CONST_VECTOR (mode, v);
10837 /* Check OP is a legal scalar immediate for the MOVI instruction. */
10839 bool
10840 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
10842 machine_mode vmode;
10844 gcc_assert (!VECTOR_MODE_P (mode));
10845 vmode = aarch64_preferred_simd_mode (mode);
10846 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
10847 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
10850 /* Construct and return a PARALLEL RTX vector with elements numbering the
10851 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
10852 the vector - from the perspective of the architecture. This does not
10853 line up with GCC's perspective on lane numbers, so we end up with
10854 different masks depending on our target endian-ness. The diagram
10855 below may help. We must draw the distinction when building masks
10856 which select one half of the vector. An instruction selecting
10857 architectural low-lanes for a big-endian target must be described using
10858 a mask selecting GCC high-lanes.
10860 Big-Endian Little-Endian
10862 GCC 0 1 2 3 3 2 1 0
10863 | x | x | x | x | | x | x | x | x |
10864 Architecture 3 2 1 0 3 2 1 0
10866 Low Mask: { 2, 3 } { 0, 1 }
10867 High Mask: { 0, 1 } { 2, 3 }
10871 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
10873 int nunits = GET_MODE_NUNITS (mode);
10874 rtvec v = rtvec_alloc (nunits / 2);
10875 int high_base = nunits / 2;
10876 int low_base = 0;
10877 int base;
10878 rtx t1;
10879 int i;
10881 if (BYTES_BIG_ENDIAN)
10882 base = high ? low_base : high_base;
10883 else
10884 base = high ? high_base : low_base;
10886 for (i = 0; i < nunits / 2; i++)
10887 RTVEC_ELT (v, i) = GEN_INT (base + i);
10889 t1 = gen_rtx_PARALLEL (mode, v);
10890 return t1;
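/* Worked example.  For V4SImode (nunits == 4): on little-endian, HIGH
   selects { 2, 3 } and !HIGH selects { 0, 1 }; on big-endian the bases are
   swapped, so HIGH selects { 0, 1 } and !HIGH selects { 2, 3 }, matching
   the Low/High mask table above.  */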
10893 /* Check OP for validity as a PARALLEL RTX vector with elements
10894 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
10895 from the perspective of the architecture. See the diagram above
10896 aarch64_simd_vect_par_cnst_half for more details. */
10898 bool
10899 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
10900 bool high)
10902 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
10903 HOST_WIDE_INT count_op = XVECLEN (op, 0);
10904 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
10905 int i = 0;
10907 if (!VECTOR_MODE_P (mode))
10908 return false;
10910 if (count_op != count_ideal)
10911 return false;
10913 for (i = 0; i < count_ideal; i++)
10915 rtx elt_op = XVECEXP (op, 0, i);
10916 rtx elt_ideal = XVECEXP (ideal, 0, i);
10918 if (!CONST_INT_P (elt_op)
10919 || INTVAL (elt_ideal) != INTVAL (elt_op))
10920 return false;
10922 return true;
10925 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
10926 HIGH (exclusive). */
10927 void
10928 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
10929 const_tree exp)
10931 HOST_WIDE_INT lane;
10932 gcc_assert (CONST_INT_P (operand));
10933 lane = INTVAL (operand);
10935 if (lane < low || lane >= high)
10937 if (exp)
10938 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
10939 else
10940 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
10944 /* Return TRUE if OP is a valid vector addressing mode. */
10945 bool
10946 aarch64_simd_mem_operand_p (rtx op)
10948 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
10949 || REG_P (XEXP (op, 0)));
10952 /* Emit a register copy from operand to operand, taking care not to
10953 early-clobber source registers in the process.
10955 COUNT is the number of components into which the copy needs to be
10956 decomposed. */
10957 void
10958 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
10959 unsigned int count)
10961 unsigned int i;
10962 int rdest = REGNO (operands[0]);
10963 int rsrc = REGNO (operands[1]);
10965 if (!reg_overlap_mentioned_p (operands[0], operands[1])
10966 || rdest < rsrc)
10967 for (i = 0; i < count; i++)
10968 emit_move_insn (gen_rtx_REG (mode, rdest + i),
10969 gen_rtx_REG (mode, rsrc + i));
10970 else
10971 for (i = 0; i < count; i++)
10972 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
10973 gen_rtx_REG (mode, rsrc + count - i - 1));
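/* Worked example: copying an OImode value (COUNT == 2) from the pair
   starting at q1 to the pair starting at q2 overlaps and has
   rdest > rsrc, so the loop above copies backwards, q3 <- q2 then
   q2 <- q1; copying forwards would clobber q2 before it is read.  */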
10976 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
10977 one of VSTRUCT modes: OI, CI, or XI. */
10979 aarch64_simd_attr_length_rglist (enum machine_mode mode)
10981 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
10984 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
10985 alignment of a vector to 128 bits. */
10986 static HOST_WIDE_INT
10987 aarch64_simd_vector_alignment (const_tree type)
10989 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
10990 return MIN (align, 128);
10993 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
10994 static bool
10995 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
10997 if (is_packed)
10998 return false;
11000 /* We guarantee alignment for vectors up to 128-bits. */
11001 if (tree_int_cst_compare (TYPE_SIZE (type),
11002 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11003 return false;
11005 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11006 return true;
11009 /* If VALS is a vector constant that can be loaded into a register
11010 using DUP, generate instructions to do so and return an RTX to
11011 assign to the register. Otherwise return NULL_RTX. */
11012 static rtx
11013 aarch64_simd_dup_constant (rtx vals)
11015 machine_mode mode = GET_MODE (vals);
11016 machine_mode inner_mode = GET_MODE_INNER (mode);
11017 rtx x;
11019 if (!const_vec_duplicate_p (vals, &x))
11020 return NULL_RTX;
11022 /* We can load this constant by using DUP and a constant in a
11023 single ARM register. This will be cheaper than a vector
11024 load. */
11025 x = copy_to_mode_reg (inner_mode, x);
11026 return gen_rtx_VEC_DUPLICATE (mode, x);
11030 /* Generate code to load VALS, which is a PARALLEL containing only
11031 constants (for vec_init) or CONST_VECTOR, efficiently into a
11032 register. Returns an RTX to copy into the register, or NULL_RTX
11033 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11034 static rtx
11035 aarch64_simd_make_constant (rtx vals)
11037 machine_mode mode = GET_MODE (vals);
11038 rtx const_dup;
11039 rtx const_vec = NULL_RTX;
11040 int n_elts = GET_MODE_NUNITS (mode);
11041 int n_const = 0;
11042 int i;
11044 if (GET_CODE (vals) == CONST_VECTOR)
11045 const_vec = vals;
11046 else if (GET_CODE (vals) == PARALLEL)
11048 /* A CONST_VECTOR must contain only CONST_INTs and
11049 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11050 Only store valid constants in a CONST_VECTOR. */
11051 for (i = 0; i < n_elts; ++i)
11053 rtx x = XVECEXP (vals, 0, i);
11054 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11055 n_const++;
11057 if (n_const == n_elts)
11058 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11060 else
11061 gcc_unreachable ();
11063 if (const_vec != NULL_RTX
11064 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11065 /* Load using MOVI/MVNI. */
11066 return const_vec;
11067 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11068 /* Loaded using DUP. */
11069 return const_dup;
11070 else if (const_vec != NULL_RTX)
11071 /* Load from constant pool. We can not take advantage of single-cycle
11072 LD1 because we need a PC-relative addressing mode. */
11073 return const_vec;
11074 else
11075 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11076 We can not construct an initializer. */
11077 return NULL_RTX;
11080 /* Expand a vector initialisation sequence, such that TARGET is
11081 initialised to contain VALS. */
11083 void
11084 aarch64_expand_vector_init (rtx target, rtx vals)
11086 machine_mode mode = GET_MODE (target);
11087 machine_mode inner_mode = GET_MODE_INNER (mode);
11088 /* The number of vector elements. */
11089 int n_elts = GET_MODE_NUNITS (mode);
11090 /* The number of vector elements which are not constant. */
11091 int n_var = 0;
11092 rtx any_const = NULL_RTX;
11093 /* The first element of vals. */
11094 rtx v0 = XVECEXP (vals, 0, 0);
11095 bool all_same = true;
11097 /* Count the number of variable elements to initialise. */
11098 for (int i = 0; i < n_elts; ++i)
11100 rtx x = XVECEXP (vals, 0, i);
11101 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11102 ++n_var;
11103 else
11104 any_const = x;
11106 all_same &= rtx_equal_p (x, v0);
11109 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11110 how best to handle this. */
11111 if (n_var == 0)
11113 rtx constant = aarch64_simd_make_constant (vals);
11114 if (constant != NULL_RTX)
11116 emit_move_insn (target, constant);
11117 return;
11121 /* Splat a single non-constant element if we can. */
11122 if (all_same)
11124 rtx x = copy_to_mode_reg (inner_mode, v0);
11125 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11126 return;
11129 /* Initialise a vector which is part-variable. We want to first try
11130 to build those lanes which are constant in the most efficient way we
11131 can. */
11132 if (n_var != n_elts)
11134 rtx copy = copy_rtx (vals);
11136 /* Load constant part of vector. We really don't care what goes into the
11137 parts we will overwrite, but we're more likely to be able to load the
11138 constant efficiently if it has fewer, larger, repeating parts
11139 (see aarch64_simd_valid_immediate). */
11140 for (int i = 0; i < n_elts; i++)
11142 rtx x = XVECEXP (vals, 0, i);
11143 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11144 continue;
11145 rtx subst = any_const;
11146 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11148 /* Look in the copied vector, as more elements are const. */
11149 rtx test = XVECEXP (copy, 0, i ^ bit);
11150 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11152 subst = test;
11153 break;
11156 XVECEXP (copy, 0, i) = subst;
11158 aarch64_expand_vector_init (target, copy);
11161 /* Insert the variable lanes directly. */
11163 enum insn_code icode = optab_handler (vec_set_optab, mode);
11164 gcc_assert (icode != CODE_FOR_nothing);
11166 for (int i = 0; i < n_elts; i++)
11168 rtx x = XVECEXP (vals, 0, i);
11169 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11170 continue;
11171 x = copy_to_mode_reg (inner_mode, x);
11172 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
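/* Illustrative example; the typedef and function name are hypothetical.
   For the initializer below, the constant lanes are built first: variable
   lane 0 borrows the constant from lane 0 ^ 2 == 2, so the recursive call
   materialises { 2, 1, 2, 3 }, and X is then inserted into lane 0 through
   the vec_set pattern.  */
#if 0
typedef int v4si __attribute__ ((vector_size (16)));

static v4si
make_v4si (int x)
{
  return (v4si) { x, 1, 2, 3 };
}
#endif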
11176 static unsigned HOST_WIDE_INT
11177 aarch64_shift_truncation_mask (machine_mode mode)
11179 return
11180 (!SHIFT_COUNT_TRUNCATED
11181 || aarch64_vector_mode_supported_p (mode)
11182 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11185 /* Select a format to encode pointers in exception handling data. */
11187 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11189 int type;
11190 switch (aarch64_cmodel)
11192 case AARCH64_CMODEL_TINY:
11193 case AARCH64_CMODEL_TINY_PIC:
11194 case AARCH64_CMODEL_SMALL:
11195 case AARCH64_CMODEL_SMALL_PIC:
11196 case AARCH64_CMODEL_SMALL_SPIC:
11197 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11198 for everything. */
11199 type = DW_EH_PE_sdata4;
11200 break;
11201 default:
11202 /* No assumptions here. 8-byte relocs required. */
11203 type = DW_EH_PE_sdata8;
11204 break;
11206 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11209 /* The last .arch and .tune assembly strings that we printed. */
11210 static std::string aarch64_last_printed_arch_string;
11211 static std::string aarch64_last_printed_tune_string;
11213 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11214 by the function fndecl. */
11216 void
11217 aarch64_declare_function_name (FILE *stream, const char* name,
11218 tree fndecl)
11220 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11222 struct cl_target_option *targ_options;
11223 if (target_parts)
11224 targ_options = TREE_TARGET_OPTION (target_parts);
11225 else
11226 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11227 gcc_assert (targ_options);
11229 const struct processor *this_arch
11230 = aarch64_get_arch (targ_options->x_explicit_arch);
11232 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11233 std::string extension
11234 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11235 this_arch->flags);
11236 /* Only update the assembler .arch string if it is distinct from the last
11237 such string we printed. */
11238 std::string to_print = this_arch->name + extension;
11239 if (to_print != aarch64_last_printed_arch_string)
11241 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11242 aarch64_last_printed_arch_string = to_print;
11245 /* Print the cpu name we're tuning for in the comments; it might be
11246 useful to readers of the generated asm.  Do it only when it changes
11247 from function to function and verbose assembly is requested. */
11248 const struct processor *this_tune
11249 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11251 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11253 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11254 this_tune->name);
11255 aarch64_last_printed_tune_string = this_tune->name;
11258 /* Don't forget the type directive for ELF. */
11259 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11260 ASM_OUTPUT_LABEL (stream, name);
11263 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11265 static void
11266 aarch64_start_file (void)
11268 struct cl_target_option *default_options
11269 = TREE_TARGET_OPTION (target_option_default_node);
11271 const struct processor *default_arch
11272 = aarch64_get_arch (default_options->x_explicit_arch);
11273 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11274 std::string extension
11275 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11276 default_arch->flags);
11278 aarch64_last_printed_arch_string = default_arch->name + extension;
11279 aarch64_last_printed_tune_string = "";
11280 asm_fprintf (asm_out_file, "\t.arch %s\n",
11281 aarch64_last_printed_arch_string.c_str ());
11283 default_file_start ();
11286 /* Emit load exclusive. */
11288 static void
11289 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11290 rtx mem, rtx model_rtx)
11292 rtx (*gen) (rtx, rtx, rtx);
11294 switch (mode)
11296 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11297 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11298 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11299 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11300 default:
11301 gcc_unreachable ();
11304 emit_insn (gen (rval, mem, model_rtx));
11307 /* Emit store exclusive. */
11309 static void
11310 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11311 rtx rval, rtx mem, rtx model_rtx)
11313 rtx (*gen) (rtx, rtx, rtx, rtx);
11315 switch (mode)
11317 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11318 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11319 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11320 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11321 default:
11322 gcc_unreachable ();
11325 emit_insn (gen (bval, rval, mem, model_rtx));
11328 /* Emit INSN as a jump instruction and mark it as unlikely to be taken. */
11330 static void
11331 aarch64_emit_unlikely_jump (rtx insn)
11333 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11335 insn = emit_jump_insn (insn);
11336 add_int_reg_note (insn, REG_BR_PROB, very_unlikely);
11339 /* Expand a compare and swap pattern. */
11341 void
11342 aarch64_expand_compare_and_swap (rtx operands[])
11344 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11345 machine_mode mode, cmp_mode;
11346 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11347 int idx;
11348 gen_cas_fn gen;
11349 const gen_cas_fn split_cas[] =
11351 gen_aarch64_compare_and_swapqi,
11352 gen_aarch64_compare_and_swaphi,
11353 gen_aarch64_compare_and_swapsi,
11354 gen_aarch64_compare_and_swapdi
11356 const gen_cas_fn atomic_cas[] =
11358 gen_aarch64_compare_and_swapqi_lse,
11359 gen_aarch64_compare_and_swaphi_lse,
11360 gen_aarch64_compare_and_swapsi_lse,
11361 gen_aarch64_compare_and_swapdi_lse
11364 bval = operands[0];
11365 rval = operands[1];
11366 mem = operands[2];
11367 oldval = operands[3];
11368 newval = operands[4];
11369 is_weak = operands[5];
11370 mod_s = operands[6];
11371 mod_f = operands[7];
11372 mode = GET_MODE (mem);
11373 cmp_mode = mode;
11375 /* Normally the succ memory model must be stronger than fail, but in the
11376 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11377 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11379 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11380 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11381 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11383 switch (mode)
11385 case QImode:
11386 case HImode:
11387 /* For short modes, we're going to perform the comparison in SImode,
11388 so do the zero-extension now. */
11389 cmp_mode = SImode;
11390 rval = gen_reg_rtx (SImode);
11391 oldval = convert_modes (SImode, mode, oldval, true);
11392 /* Fall through. */
11394 case SImode:
11395 case DImode:
11396 /* Force the value into a register if needed. */
11397 if (!aarch64_plus_operand (oldval, mode))
11398 oldval = force_reg (cmp_mode, oldval);
11399 break;
11401 default:
11402 gcc_unreachable ();
11405 switch (mode)
11407 case QImode: idx = 0; break;
11408 case HImode: idx = 1; break;
11409 case SImode: idx = 2; break;
11410 case DImode: idx = 3; break;
11411 default:
11412 gcc_unreachable ();
11414 if (TARGET_LSE)
11415 gen = atomic_cas[idx];
11416 else
11417 gen = split_cas[idx];
11419 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11421 if (mode == QImode || mode == HImode)
11422 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11424 x = gen_rtx_REG (CCmode, CC_REGNUM);
11425 x = gen_rtx_EQ (SImode, x, const0_rtx);
11426 emit_insn (gen_rtx_SET (bval, x));
11429 /* Test whether the target supports using an atomic load-operate instruction.
11430 CODE is the operation and AFTER is TRUE if the data in memory after the
11431 operation should be returned and FALSE if the data before the operation
11432 should be returned. Returns FALSE if the operation isn't supported by the
11433 architecture. */
11435 bool
11436 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11438 if (!TARGET_LSE)
11439 return false;
11441 switch (code)
11443 case SET:
11444 case AND:
11445 case IOR:
11446 case XOR:
11447 case MINUS:
11448 case PLUS:
11449 return true;
11450 default:
11451 return false;
11455 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
11456 sequence implementing an atomic operation. */
11458 static void
11459 aarch64_emit_post_barrier (enum memmodel model)
11461 const enum memmodel base_model = memmodel_base (model);
11463 if (is_mm_sync (model)
11464 && (base_model == MEMMODEL_ACQUIRE
11465 || base_model == MEMMODEL_ACQ_REL
11466 || base_model == MEMMODEL_SEQ_CST))
11468 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11472 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11473 for the data in memory. EXPECTED is the value expected to be in memory.
11474 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11475 is the memory ordering to use. */
11477 void
11478 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11479 rtx expected, rtx desired,
11480 rtx model)
11482 rtx (*gen) (rtx, rtx, rtx, rtx);
11483 machine_mode mode;
11485 mode = GET_MODE (mem);
11487 switch (mode)
11489 case QImode: gen = gen_aarch64_atomic_casqi; break;
11490 case HImode: gen = gen_aarch64_atomic_cashi; break;
11491 case SImode: gen = gen_aarch64_atomic_cassi; break;
11492 case DImode: gen = gen_aarch64_atomic_casdi; break;
11493 default:
11494 gcc_unreachable ();
11497 /* Move the expected value into the CAS destination register. */
11498 emit_insn (gen_rtx_SET (rval, expected));
11500 /* Emit the CAS. */
11501 emit_insn (gen (rval, mem, desired, model));
11503 /* Compare the expected value with the value loaded by the CAS, to establish
11504 whether the swap was made. */
11505 aarch64_gen_compare_reg (EQ, rval, expected);
11508 /* Split a compare and swap pattern. */
11510 void
11511 aarch64_split_compare_and_swap (rtx operands[])
11513 rtx rval, mem, oldval, newval, scratch;
11514 machine_mode mode;
11515 bool is_weak;
11516 rtx_code_label *label1, *label2;
11517 rtx x, cond;
11518 enum memmodel model;
11519 rtx model_rtx;
11521 rval = operands[0];
11522 mem = operands[1];
11523 oldval = operands[2];
11524 newval = operands[3];
11525 is_weak = (operands[4] != const0_rtx);
11526 model_rtx = operands[5];
11527 scratch = operands[7];
11528 mode = GET_MODE (mem);
11529 model = memmodel_from_int (INTVAL (model_rtx));
11531 label1 = NULL;
11532 if (!is_weak)
11534 label1 = gen_label_rtx ();
11535 emit_label (label1);
11537 label2 = gen_label_rtx ();
11539 /* The initial load can be relaxed for a __sync operation since a final
11540 barrier will be emitted to stop code hoisting. */
11541 if (is_mm_sync (model))
11542 aarch64_emit_load_exclusive (mode, rval, mem,
11543 GEN_INT (MEMMODEL_RELAXED));
11544 else
11545 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
11547 cond = aarch64_gen_compare_reg (NE, rval, oldval);
11548 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11549 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11550 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
11551 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11553 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
11555 if (!is_weak)
11557 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
11558 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11559 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
11560 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11562 else
11564 cond = gen_rtx_REG (CCmode, CC_REGNUM);
11565 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
11566 emit_insn (gen_rtx_SET (cond, x));
11569 emit_label (label2);
11571 /* Emit any final barrier needed for a __sync operation. */
11572 if (is_mm_sync (model))
11573 aarch64_emit_post_barrier (model);
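/* Illustrative sketch; register numbers and label names are arbitrary, and
   the exact load/store mnemonics depend on the memory model.  For a strong
   SImode compare-and-swap the split above produces a load/store-exclusive
   retry loop of roughly this shape:

   .Lretry:
        ldaxr   w0, [x1]        // load exclusive into rval
        cmp     w0, w2          // compare with oldval
        bne     .Ldone          // unlikely branch out on mismatch
        stlxr   w3, w4, [x1]    // store exclusive of newval, scratch = w3
        cbnz    w3, .Lretry     // retry if exclusivity was lost
   .Ldone:
                                // CC still reflects the comparison result
   */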
11576 /* Emit a BIC instruction. */
11578 static void
11579 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
11581 rtx shift_rtx = GEN_INT (shift);
11582 rtx (*gen) (rtx, rtx, rtx, rtx);
11584 switch (mode)
11586 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
11587 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
11588 default:
11589 gcc_unreachable ();
11592 emit_insn (gen (dst, s2, shift_rtx, s1));
11595 /* Emit an atomic swap. */
11597 static void
11598 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
11599 rtx mem, rtx model)
11601 rtx (*gen) (rtx, rtx, rtx, rtx);
11603 switch (mode)
11605 case QImode: gen = gen_aarch64_atomic_swpqi; break;
11606 case HImode: gen = gen_aarch64_atomic_swphi; break;
11607 case SImode: gen = gen_aarch64_atomic_swpsi; break;
11608 case DImode: gen = gen_aarch64_atomic_swpdi; break;
11609 default:
11610 gcc_unreachable ();
11613 emit_insn (gen (dst, mem, value, model));
11616 /* Operations supported by aarch64_emit_atomic_load_op. */
11618 enum aarch64_atomic_load_op_code
11620 AARCH64_LDOP_PLUS, /* A + B */
11621 AARCH64_LDOP_XOR, /* A ^ B */
11622 AARCH64_LDOP_OR, /* A | B */
11623 AARCH64_LDOP_BIC /* A & ~B */
11626 /* Emit an atomic load-operate. */
11628 static void
11629 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
11630 machine_mode mode, rtx dst, rtx src,
11631 rtx mem, rtx model)
11633 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
11634 const aarch64_atomic_load_op_fn plus[] =
11636 gen_aarch64_atomic_loadaddqi,
11637 gen_aarch64_atomic_loadaddhi,
11638 gen_aarch64_atomic_loadaddsi,
11639 gen_aarch64_atomic_loadadddi
11641 const aarch64_atomic_load_op_fn eor[] =
11643 gen_aarch64_atomic_loadeorqi,
11644 gen_aarch64_atomic_loadeorhi,
11645 gen_aarch64_atomic_loadeorsi,
11646 gen_aarch64_atomic_loadeordi
11648 const aarch64_atomic_load_op_fn ior[] =
11650 gen_aarch64_atomic_loadsetqi,
11651 gen_aarch64_atomic_loadsethi,
11652 gen_aarch64_atomic_loadsetsi,
11653 gen_aarch64_atomic_loadsetdi
11655 const aarch64_atomic_load_op_fn bic[] =
11657 gen_aarch64_atomic_loadclrqi,
11658 gen_aarch64_atomic_loadclrhi,
11659 gen_aarch64_atomic_loadclrsi,
11660 gen_aarch64_atomic_loadclrdi
11662 aarch64_atomic_load_op_fn gen;
11663 int idx = 0;
11665 switch (mode)
11667 case QImode: idx = 0; break;
11668 case HImode: idx = 1; break;
11669 case SImode: idx = 2; break;
11670 case DImode: idx = 3; break;
11671 default:
11672 gcc_unreachable ();
11675 switch (code)
11677 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
11678 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
11679 case AARCH64_LDOP_OR: gen = ior[idx]; break;
11680 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
11681 default:
11682 gcc_unreachable ();
11685 emit_insn (gen (dst, mem, src, model));
11688 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
11689 location to store the data read from memory. OUT_RESULT is the location to
11690 store the result of the operation. MEM is the memory location to read and
11691 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
11692 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
11693 be NULL. */
11695 void
11696 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
11697 rtx mem, rtx value, rtx model_rtx)
11699 machine_mode mode = GET_MODE (mem);
11700 machine_mode wmode = (mode == DImode ? DImode : SImode);
11701 const bool short_mode = (mode < SImode);
11702 aarch64_atomic_load_op_code ldop_code;
11703 rtx src;
11704 rtx x;
11706 if (out_data)
11707 out_data = gen_lowpart (mode, out_data);
11709 if (out_result)
11710 out_result = gen_lowpart (mode, out_result);
11712 /* Make sure the value is in a register, putting it into a destination
11713 register if it needs to be manipulated. */
11714 if (!register_operand (value, mode)
11715 || code == AND || code == MINUS)
11717 src = out_result ? out_result : out_data;
11718 emit_move_insn (src, gen_lowpart (mode, value));
11720 else
11721 src = value;
11722 gcc_assert (register_operand (src, mode));
11724 /* Preprocess the data for the operation as necessary. If the operation is
11725 a SET then emit a swap instruction and finish. */
11726 switch (code)
11728 case SET:
11729 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
11730 return;
11732 case MINUS:
11733 /* Negate the value and treat it as a PLUS. */
11735 rtx neg_src;
11737 /* Resize the value if necessary. */
11738 if (short_mode)
11739 src = gen_lowpart (wmode, src);
11741 neg_src = gen_rtx_NEG (wmode, src);
11742 emit_insn (gen_rtx_SET (src, neg_src));
11744 if (short_mode)
11745 src = gen_lowpart (mode, src);
11747 /* Fall-through. */
11748 case PLUS:
11749 ldop_code = AARCH64_LDOP_PLUS;
11750 break;
11752 case IOR:
11753 ldop_code = AARCH64_LDOP_OR;
11754 break;
11756 case XOR:
11757 ldop_code = AARCH64_LDOP_XOR;
11758 break;
11760 case AND:
11762 rtx not_src;
11764 /* Resize the value if necessary. */
11765 if (short_mode)
11766 src = gen_lowpart (wmode, src);
11768 not_src = gen_rtx_NOT (wmode, src);
11769 emit_insn (gen_rtx_SET (src, not_src));
11771 if (short_mode)
11772 src = gen_lowpart (mode, src);
11774 ldop_code = AARCH64_LDOP_BIC;
11775 break;
11777 default:
11778 /* The operation can't be done with atomic instructions. */
11779 gcc_unreachable ();
11782 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
11784 /* If necessary, calculate the data in memory after the update by redoing the
11785 operation from values in registers. */
11786 if (!out_result)
11787 return;
11789 if (short_mode)
11791 src = gen_lowpart (wmode, src);
11792 out_data = gen_lowpart (wmode, out_data);
11793 out_result = gen_lowpart (wmode, out_result);
11796 x = NULL_RTX;
11798 switch (code)
11800 case MINUS:
11801 case PLUS:
11802 x = gen_rtx_PLUS (wmode, out_data, src);
11803 break;
11804 case IOR:
11805 x = gen_rtx_IOR (wmode, out_data, src);
11806 break;
11807 case XOR:
11808 x = gen_rtx_XOR (wmode, out_data, src);
11809 break;
11810 case AND:
11811 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
11812 return;
11813 default:
11814 gcc_unreachable ();
11817 emit_set_insn (out_result, x);
11819 return;
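/* Worked examples.  A fetch-and-subtract of M is handled by negating M
   into the destination and issuing an atomic load-add, so memory becomes
   old + (-M) == old - M; a fetch-and-and of M complements M and issues an
   atomic load-clear (BIC), giving old & ~(~M) == old & M.  When the
   updated value is also wanted, it is recomputed from the loaded data and
   the (already negated or complemented) source with the PLUS/BIC
   arithmetic just above.  */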
11822 /* Split an atomic operation. */
11824 void
11825 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
11826 rtx value, rtx model_rtx, rtx cond)
11828 machine_mode mode = GET_MODE (mem);
11829 machine_mode wmode = (mode == DImode ? DImode : SImode);
11830 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
11831 const bool is_sync = is_mm_sync (model);
11832 rtx_code_label *label;
11833 rtx x;
11835 /* Split the atomic operation into a sequence. */
11836 label = gen_label_rtx ();
11837 emit_label (label);
11839 if (new_out)
11840 new_out = gen_lowpart (wmode, new_out);
11841 if (old_out)
11842 old_out = gen_lowpart (wmode, old_out);
11843 else
11844 old_out = new_out;
11845 value = simplify_gen_subreg (wmode, value, mode, 0);
11847 /* The initial load can be relaxed for a __sync operation since a final
11848 barrier will be emitted to stop code hoisting. */
11849 if (is_sync)
11850 aarch64_emit_load_exclusive (mode, old_out, mem,
11851 GEN_INT (MEMMODEL_RELAXED));
11852 else
11853 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
11855 switch (code)
11857 case SET:
11858 new_out = value;
11859 break;
11861 case NOT:
11862 x = gen_rtx_AND (wmode, old_out, value);
11863 emit_insn (gen_rtx_SET (new_out, x));
11864 x = gen_rtx_NOT (wmode, new_out);
11865 emit_insn (gen_rtx_SET (new_out, x));
11866 break;
11868 case MINUS:
11869 if (CONST_INT_P (value))
11871 value = GEN_INT (-INTVAL (value));
11872 code = PLUS;
11874 /* Fall through. */
11876 default:
11877 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
11878 emit_insn (gen_rtx_SET (new_out, x));
11879 break;
11882 aarch64_emit_store_exclusive (mode, cond, mem,
11883 gen_lowpart (mode, new_out), model_rtx);
11885 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
11886 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
11887 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
11888 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
11890 /* Emit any final barrier needed for a __sync operation. */
11891 if (is_sync)
11892 aarch64_emit_post_barrier (model);
11895 static void
11896 aarch64_init_libfuncs (void)
11898 /* Half-precision float operations. The compiler handles all operations
11899 with NULL libfuncs by converting to SFmode. */
11901 /* Conversions. */
11902 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
11903 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
11905 /* Arithmetic. */
11906 set_optab_libfunc (add_optab, HFmode, NULL);
11907 set_optab_libfunc (sdiv_optab, HFmode, NULL);
11908 set_optab_libfunc (smul_optab, HFmode, NULL);
11909 set_optab_libfunc (neg_optab, HFmode, NULL);
11910 set_optab_libfunc (sub_optab, HFmode, NULL);
11912 /* Comparisons. */
11913 set_optab_libfunc (eq_optab, HFmode, NULL);
11914 set_optab_libfunc (ne_optab, HFmode, NULL);
11915 set_optab_libfunc (lt_optab, HFmode, NULL);
11916 set_optab_libfunc (le_optab, HFmode, NULL);
11917 set_optab_libfunc (ge_optab, HFmode, NULL);
11918 set_optab_libfunc (gt_optab, HFmode, NULL);
11919 set_optab_libfunc (unord_optab, HFmode, NULL);
11922 /* Target hook for c_mode_for_suffix. */
11923 static machine_mode
11924 aarch64_c_mode_for_suffix (char suffix)
11926 if (suffix == 'q')
11927 return TFmode;
11929 return VOIDmode;
11932 /* We can only represent floating point constants which will fit in
11933 "quarter-precision" values. These values are characterised by
11934 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
11937 (-1)^s * (n/16) * 2^r
11939 Where:
11940 's' is the sign bit.
11941 'n' is an integer in the range 16 <= n <= 31.
11942 'r' is an integer in the range -3 <= r <= 4. */
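/* Worked examples: 1.0 == (16/16) * 2^0, 0.125 == (16/16) * 2^-3 and
   31.0 == (31/16) * 2^4 are all representable, so the encodable range of
   magnitudes is [0.125, 31.0]; values such as 0.1 or 3.3 have no (n, r)
   of this form, and 0.0 is rejected explicitly below.  */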
11944 /* Return true iff X can be represented by a quarter-precision
11945 floating point immediate operand.  Note that we cannot represent 0.0. */
11946 bool
11947 aarch64_float_const_representable_p (rtx x)
11949 /* This represents our current view of how many bits
11950 make up the mantissa. */
11951 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
11952 int exponent;
11953 unsigned HOST_WIDE_INT mantissa, mask;
11954 REAL_VALUE_TYPE r, m;
11955 bool fail;
11957 if (!CONST_DOUBLE_P (x))
11958 return false;
11960 /* We don't support HFmode constants yet. */
11961 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
11962 return false;
11964 r = *CONST_DOUBLE_REAL_VALUE (x);
11966 /* We cannot represent infinities, NaNs or +/-zero. We won't
11967 know if we have +zero until we analyse the mantissa, but we
11968 can reject the other invalid values. */
11969 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
11970 || REAL_VALUE_MINUS_ZERO (r))
11971 return false;
11973 /* Extract exponent. */
11974 r = real_value_abs (&r);
11975 exponent = REAL_EXP (&r);
11977 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
11978 highest (sign) bit, with a fixed binary point at bit point_pos.
11979 m1 holds the low part of the mantissa, m2 the high part.
11980 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
11981 bits for the mantissa, this can fail (low bits will be lost). */
11982 real_ldexp (&m, &r, point_pos - exponent);
11983 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
11985 /* If the low part of the mantissa has bits set we cannot represent
11986 the value. */
11987 if (w.elt (0) != 0)
11988 return false;
11989 /* We have rejected the lower HOST_WIDE_INT, so update our
11990 understanding of how many bits lie in the mantissa and
11991 look only at the high HOST_WIDE_INT. */
11992 mantissa = w.elt (1);
11993 point_pos -= HOST_BITS_PER_WIDE_INT;
11995 /* We can only represent values with a mantissa of the form 1.xxxx. */
11996 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
11997 if ((mantissa & mask) != 0)
11998 return false;
12000 /* Having filtered unrepresentable values, we may now remove all
12001 but the highest 5 bits. */
12002 mantissa >>= point_pos - 5;
12004 /* We cannot represent the value 0.0, so reject it. This is handled
12005 elsewhere. */
12006 if (mantissa == 0)
12007 return false;
12009 /* Then, as bit 4 is always set, we can mask it off, leaving
12010 the mantissa in the range [0, 15]. */
12011 mantissa &= ~(1 << 4);
12012 gcc_assert (mantissa <= 15);
12014 /* GCC internally does not use IEEE754-like encoding (where normalized
12015      significands are in the range [1, 2)).  GCC uses [0.5, 1) (see real.c).
12016 Our mantissa values are shifted 4 places to the left relative to
12017 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12018 by 5 places to correct for GCC's representation. */
12019 exponent = 5 - exponent;
12021 return (exponent >= 0 && exponent <= 7);
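/* Illustrative, standalone sketch (hypothetical helper, not part of this
   file, kept under #if 0 so it takes no part in any build): the same
   "quarter-precision" test written against the formula above using only
   the C library, i.e. whether D equals (-1)^s * (n/16) * 2^r with
   16 <= n <= 31 and -3 <= r <= 4.  */
#if 0
#include <math.h>
#include <stdbool.h>

static bool
quarter_precision_representable (double d)
{
  if (d == 0.0 || isnan (d) || isinf (d))
    return false;

  int e;
  double f = frexp (fabs (d), &e);  /* fabs (d) == f * 2^e, 0.5 <= f < 1.  */
  double n = f * 32.0;              /* n/16 == 2*f, hence n == 32*f.  */
  int r = e - 1;

  /* n must be an integer in [16, 31]; r must lie in [-3, 4].  */
  return n == floor (n) && n >= 16.0 && n <= 31.0 && r >= -3 && r <= 4;
}

/* Examples: 1.0 -> n = 16, r = 0 (accepted); 31.0 -> n = 31, r = 4
   (accepted); 0.1 is rejected because it is not of that form.  */
#endif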
12024 char*
12025 aarch64_output_simd_mov_immediate (rtx const_vector,
12026 machine_mode mode,
12027 unsigned width)
12029 bool is_valid;
12030 static char templ[40];
12031 const char *mnemonic;
12032 const char *shift_op;
12033 unsigned int lane_count = 0;
12034 char element_char;
12036 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12038   /* This will return true to show that const_vector is legal for use as
12039      an AdvSIMD MOVI (or, implicitly, MVNI) immediate.  It will
12040 also update INFO to show how the immediate should be generated. */
12041 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12042 gcc_assert (is_valid);
12044 element_char = sizetochar (info.element_width);
12045 lane_count = width / info.element_width;
12047 mode = GET_MODE_INNER (mode);
12048 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12050 gcc_assert (info.shift == 0 && ! info.mvn);
12051 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12052 move immediate path. */
12053 if (aarch64_float_const_zero_rtx_p (info.value))
12054 info.value = GEN_INT (0);
12055 else
12057 const unsigned int buf_size = 20;
12058 char float_buf[buf_size] = {'\0'};
12059 real_to_decimal_for_mode (float_buf,
12060 CONST_DOUBLE_REAL_VALUE (info.value),
12061 buf_size, buf_size, 1, mode);
12063 if (lane_count == 1)
12064 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12065 else
12066 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12067 lane_count, element_char, float_buf);
12068 return templ;
12072 mnemonic = info.mvn ? "mvni" : "movi";
12073 shift_op = info.msl ? "msl" : "lsl";
12075 gcc_assert (CONST_INT_P (info.value));
12076 if (lane_count == 1)
12077 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12078 mnemonic, UINTVAL (info.value));
12079 else if (info.shift)
12080 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12081 ", %s %d", mnemonic, lane_count, element_char,
12082 UINTVAL (info.value), shift_op, info.shift);
12083 else
12084 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12085 mnemonic, lane_count, element_char, UINTVAL (info.value));
12086 return templ;
12089 char*
12090 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12091 machine_mode mode)
12093 machine_mode vmode;
12095 gcc_assert (!VECTOR_MODE_P (mode));
12096 vmode = aarch64_simd_container_mode (mode, 64);
12097 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12098 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12101 /* Split operands into moves from op[1] + op[2] into op[0]. */
12103 void
12104 aarch64_split_combinev16qi (rtx operands[3])
12106 unsigned int dest = REGNO (operands[0]);
12107 unsigned int src1 = REGNO (operands[1]);
12108 unsigned int src2 = REGNO (operands[2]);
12109 machine_mode halfmode = GET_MODE (operands[1]);
12110 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12111 rtx destlo, desthi;
12113 gcc_assert (halfmode == V16QImode);
12115 if (src1 == dest && src2 == dest + halfregs)
12117 /* No-op move. Can't split to nothing; emit something. */
12118 emit_note (NOTE_INSN_DELETED);
12119 return;
12122 /* Preserve register attributes for variable tracking. */
12123 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12124 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12125 GET_MODE_SIZE (halfmode));
12127 /* Special case of reversed high/low parts. */
12128 if (reg_overlap_mentioned_p (operands[2], destlo)
12129 && reg_overlap_mentioned_p (operands[1], desthi))
12131 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12132 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12133 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12135 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12137 /* Try to avoid unnecessary moves if part of the result
12138 is in the right place already. */
12139 if (src1 != dest)
12140 emit_move_insn (destlo, operands[1]);
12141 if (src2 != dest + halfregs)
12142 emit_move_insn (desthi, operands[2]);
12144 else
12146 if (src2 != dest + halfregs)
12147 emit_move_insn (desthi, operands[2]);
12148 if (src1 != dest)
12149 emit_move_insn (destlo, operands[1]);
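/* A minimal scalar sketch of the no-scratch swap used above for the
   reversed-overlap case: three XORs exchange two values in place, which
   is exactly what the three gen_xorv16qi3 insns do for the two register
   halves.  (Hypothetical helper, not part of this file.)  */
#if 0
static void
xor_swap (unsigned long long *a, unsigned long long *b)
{
  *a ^= *b;   /* a = a ^ b             */
  *b ^= *a;   /* b = b ^ (a ^ b) == a  */
  *a ^= *b;   /* a = (a ^ b) ^ a == b  */
}
#endif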
12153 /* vec_perm support. */
12155 #define MAX_VECT_LEN 16
12157 struct expand_vec_perm_d
12159 rtx target, op0, op1;
12160 unsigned char perm[MAX_VECT_LEN];
12161 machine_mode vmode;
12162 unsigned char nelt;
12163 bool one_vector_p;
12164 bool testing_p;
12167 /* Generate a variable permutation. */
12169 static void
12170 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12172 machine_mode vmode = GET_MODE (target);
12173 bool one_vector_p = rtx_equal_p (op0, op1);
12175 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12176 gcc_checking_assert (GET_MODE (op0) == vmode);
12177 gcc_checking_assert (GET_MODE (op1) == vmode);
12178 gcc_checking_assert (GET_MODE (sel) == vmode);
12179 gcc_checking_assert (TARGET_SIMD);
12181 if (one_vector_p)
12183 if (vmode == V8QImode)
12185 /* Expand the argument to a V16QI mode by duplicating it. */
12186 rtx pair = gen_reg_rtx (V16QImode);
12187 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12188 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12190 else
12192 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12195 else
12197 rtx pair;
12199 if (vmode == V8QImode)
12201 pair = gen_reg_rtx (V16QImode);
12202 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12203 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12205 else
12207 pair = gen_reg_rtx (OImode);
12208 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12209 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12214 void
12215 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12217 machine_mode vmode = GET_MODE (target);
12218 unsigned int nelt = GET_MODE_NUNITS (vmode);
12219 bool one_vector_p = rtx_equal_p (op0, op1);
12220 rtx mask;
12222 /* The TBL instruction does not use a modulo index, so we must take care
12223 of that ourselves. */
12224 mask = aarch64_simd_gen_const_vector_dup (vmode,
12225 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12226 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12228 /* For big-endian, we also need to reverse the index within the vector
12229 (but not which vector). */
12230 if (BYTES_BIG_ENDIAN)
12232 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12233 if (!one_vector_p)
12234 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12235 sel = expand_simple_binop (vmode, XOR, sel, mask,
12236 NULL, 0, OPTAB_LIB_WIDEN);
12238 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
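/* A scalar model (hypothetical, host-only sketch) of the selector
   normalisation performed above: out-of-range TBL indices are wrapped
   with an AND, and on big-endian targets the index is reversed within
   its source vector without changing which vector it selects.  */
#if 0
#include <stdbool.h>

static unsigned int
tbl_effective_index (unsigned int idx, unsigned int nelt,
		     bool one_vector_p, bool big_endian)
{
  idx &= one_vector_p ? nelt - 1 : 2 * nelt - 1;  /* Modulo wrap.  */
  if (big_endian)
    idx ^= nelt - 1;  /* Flip the low bits only; the "which vector" bit
			 is left untouched.  */
  return idx;
}
#endif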
12241 /* Recognize patterns suitable for the TRN instructions. */
12242 static bool
12243 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12245 unsigned int i, odd, mask, nelt = d->nelt;
12246 rtx out, in0, in1, x;
12247 rtx (*gen) (rtx, rtx, rtx);
12248 machine_mode vmode = d->vmode;
12250 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12251 return false;
12253 /* Note that these are little-endian tests.
12254 We correct for big-endian later. */
12255 if (d->perm[0] == 0)
12256 odd = 0;
12257 else if (d->perm[0] == 1)
12258 odd = 1;
12259 else
12260 return false;
12261 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12263 for (i = 0; i < nelt; i += 2)
12265 if (d->perm[i] != i + odd)
12266 return false;
12267 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12268 return false;
12271 /* Success! */
12272 if (d->testing_p)
12273 return true;
12275 in0 = d->op0;
12276 in1 = d->op1;
12277 if (BYTES_BIG_ENDIAN)
12279 x = in0, in0 = in1, in1 = x;
12280 odd = !odd;
12282 out = d->target;
12284 if (odd)
12286 switch (vmode)
12288 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12289 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12290 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12291 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12292 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12293 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12294 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12295 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12296 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12297 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12298 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12299 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12300 default:
12301 return false;
12304 else
12306 switch (vmode)
12308 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12309 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12310 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12311 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12312 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12313 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12314 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12315 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12316 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12317 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12318 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12319 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12320 default:
12321 return false;
12325 emit_insn (gen (out, in0, in1));
12326 return true;
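/* Worked example for the TRN recognizer, little-endian V4SImode, with
   inputs A = {a0, a1, a2, a3} and B = {b0, b1, b2, b3}:
     trn1:  perm = {0, 4, 2, 6}  ->  {a0, b0, a2, b2}
     trn2:  perm = {1, 5, 3, 7}  ->  {a1, b1, a3, b3}  */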
12329 /* Recognize patterns suitable for the UZP instructions. */
12330 static bool
12331 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12333 unsigned int i, odd, mask, nelt = d->nelt;
12334 rtx out, in0, in1, x;
12335 rtx (*gen) (rtx, rtx, rtx);
12336 machine_mode vmode = d->vmode;
12338 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12339 return false;
12341 /* Note that these are little-endian tests.
12342 We correct for big-endian later. */
12343 if (d->perm[0] == 0)
12344 odd = 0;
12345 else if (d->perm[0] == 1)
12346 odd = 1;
12347 else
12348 return false;
12349 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12351 for (i = 0; i < nelt; i++)
12353 unsigned elt = (i * 2 + odd) & mask;
12354 if (d->perm[i] != elt)
12355 return false;
12358 /* Success! */
12359 if (d->testing_p)
12360 return true;
12362 in0 = d->op0;
12363 in1 = d->op1;
12364 if (BYTES_BIG_ENDIAN)
12366 x = in0, in0 = in1, in1 = x;
12367 odd = !odd;
12369 out = d->target;
12371 if (odd)
12373 switch (vmode)
12375 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12376 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12377 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12378 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12379 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12380 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12381 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12382 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12383 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12384 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12385 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12386 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12387 default:
12388 return false;
12391 else
12393 switch (vmode)
12395 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12396 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12397 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12398 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12399 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12400 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12401 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12402 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12403 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12404 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12405 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12406 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12407 default:
12408 return false;
12412 emit_insn (gen (out, in0, in1));
12413 return true;
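/* Worked example for the UZP recognizer, same V4SImode inputs as above:
     uzp1:  perm = {0, 2, 4, 6}  ->  {a0, a2, b0, b2}
     uzp2:  perm = {1, 3, 5, 7}  ->  {a1, a3, b1, b3}  */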
12416 /* Recognize patterns suitable for the ZIP instructions. */
12417 static bool
12418 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12420 unsigned int i, high, mask, nelt = d->nelt;
12421 rtx out, in0, in1, x;
12422 rtx (*gen) (rtx, rtx, rtx);
12423 machine_mode vmode = d->vmode;
12425 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12426 return false;
12428 /* Note that these are little-endian tests.
12429 We correct for big-endian later. */
12430 high = nelt / 2;
12431 if (d->perm[0] == high)
12432 /* Do Nothing. */
12434 else if (d->perm[0] == 0)
12435 high = 0;
12436 else
12437 return false;
12438 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12440 for (i = 0; i < nelt / 2; i++)
12442 unsigned elt = (i + high) & mask;
12443 if (d->perm[i * 2] != elt)
12444 return false;
12445 elt = (elt + nelt) & mask;
12446 if (d->perm[i * 2 + 1] != elt)
12447 return false;
12450 /* Success! */
12451 if (d->testing_p)
12452 return true;
12454 in0 = d->op0;
12455 in1 = d->op1;
12456 if (BYTES_BIG_ENDIAN)
12458 x = in0, in0 = in1, in1 = x;
12459 high = !high;
12461 out = d->target;
12463 if (high)
12465 switch (vmode)
12467 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12468 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12469 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12470 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12471 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12472 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12473 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12474 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12475 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12476 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12477 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12478 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12479 default:
12480 return false;
12483 else
12485 switch (vmode)
12487 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
12488 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
12489 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
12490 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
12491 case V4SImode: gen = gen_aarch64_zip1v4si; break;
12492 case V2SImode: gen = gen_aarch64_zip1v2si; break;
12493 case V2DImode: gen = gen_aarch64_zip1v2di; break;
12494 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
12495 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
12496 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
12497 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
12498 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
12499 default:
12500 return false;
12504 emit_insn (gen (out, in0, in1));
12505 return true;
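/* Worked example for the ZIP recognizer, same V4SImode inputs:
     zip1:  perm = {0, 4, 1, 5}  ->  {a0, b0, a1, b1}
     zip2:  perm = {2, 6, 3, 7}  ->  {a2, b2, a3, b3}  */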
12508 /* Recognize patterns for the EXT insn. */
12510 static bool
12511 aarch64_evpc_ext (struct expand_vec_perm_d *d)
12513 unsigned int i, nelt = d->nelt;
12514 rtx (*gen) (rtx, rtx, rtx, rtx);
12515 rtx offset;
12517 unsigned int location = d->perm[0]; /* Always < nelt. */
12519 /* Check if the extracted indices are increasing by one. */
12520 for (i = 1; i < nelt; i++)
12522 unsigned int required = location + i;
12523 if (d->one_vector_p)
12525 /* We'll pass the same vector in twice, so allow indices to wrap. */
12526 required &= (nelt - 1);
12528 if (d->perm[i] != required)
12529 return false;
12532 switch (d->vmode)
12534 case V16QImode: gen = gen_aarch64_extv16qi; break;
12535 case V8QImode: gen = gen_aarch64_extv8qi; break;
12536 case V4HImode: gen = gen_aarch64_extv4hi; break;
12537 case V8HImode: gen = gen_aarch64_extv8hi; break;
12538 case V2SImode: gen = gen_aarch64_extv2si; break;
12539 case V4SImode: gen = gen_aarch64_extv4si; break;
12540 case V4HFmode: gen = gen_aarch64_extv4hf; break;
12541 case V8HFmode: gen = gen_aarch64_extv8hf; break;
12542 case V2SFmode: gen = gen_aarch64_extv2sf; break;
12543 case V4SFmode: gen = gen_aarch64_extv4sf; break;
12544 case V2DImode: gen = gen_aarch64_extv2di; break;
12545 case V2DFmode: gen = gen_aarch64_extv2df; break;
12546 default:
12547 return false;
12550 /* Success! */
12551 if (d->testing_p)
12552 return true;
12554 /* The case where (location == 0) is a no-op for both big- and little-endian,
12555 and is removed by the mid-end at optimization levels -O1 and higher. */
12557 if (BYTES_BIG_ENDIAN && (location != 0))
12559 /* After setup, we want the high elements of the first vector (stored
12560 at the LSB end of the register), and the low elements of the second
12561 vector (stored at the MSB end of the register). So swap. */
12562 std::swap (d->op0, d->op1);
12563 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
12564 location = nelt - location;
12567 offset = GEN_INT (location);
12568 emit_insn (gen (d->target, d->op0, d->op1, offset));
12569 return true;
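/* Worked example for the EXT recognizer, little-endian V4SImode:
   perm = {1, 2, 3, 4} extracts {a1, a2, a3, b0}, i.e. an EXT with an
   element offset of 1 (the instruction itself counts in bytes).  With a
   single input vector, perm = {3, 0, 1, 2} wraps around and becomes a
   rotation of A.  */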
12572 /* Recognize patterns for the REV insns. */
12574 static bool
12575 aarch64_evpc_rev (struct expand_vec_perm_d *d)
12577 unsigned int i, j, diff, nelt = d->nelt;
12578 rtx (*gen) (rtx, rtx);
12580 if (!d->one_vector_p)
12581 return false;
12583 diff = d->perm[0];
12584 switch (diff)
12586 case 7:
12587 switch (d->vmode)
12589 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
12590 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
12591 default:
12592 return false;
12594 break;
12595 case 3:
12596 switch (d->vmode)
12598 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
12599 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
12600 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
12601 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
12602 default:
12603 return false;
12605 break;
12606 case 1:
12607 switch (d->vmode)
12609 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
12610 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
12611 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
12612 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
12613 case V4SImode: gen = gen_aarch64_rev64v4si; break;
12614 case V2SImode: gen = gen_aarch64_rev64v2si; break;
12615 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
12616 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
12617 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
12618 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
12619 default:
12620 return false;
12622 break;
12623 default:
12624 return false;
12627 for (i = 0; i < nelt ; i += diff + 1)
12628 for (j = 0; j <= diff; j += 1)
12630 /* This is guaranteed to be true as the value of diff
12631 	 is 7, 3 or 1 and we should have enough elements in the
12632 queue to generate this. Getting a vector mask with a
12633 value of diff other than these values implies that
12634 something is wrong by the time we get here. */
12635 gcc_assert (i + j < nelt);
12636 if (d->perm[i + j] != i + diff - j)
12637 return false;
12640 /* Success! */
12641 if (d->testing_p)
12642 return true;
12644 emit_insn (gen (d->target, d->op0));
12645 return true;
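/* Worked example for the REV recognizer: diff == 1 on V4SImode means
   perm = {1, 0, 3, 2}, i.e. swap the two elements inside each 64-bit
   chunk (REV64.4S); diff == 7 on V8QImode means
   perm = {7, 6, 5, 4, 3, 2, 1, 0} (REV64.8B).  */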
12648 static bool
12649 aarch64_evpc_dup (struct expand_vec_perm_d *d)
12651 rtx (*gen) (rtx, rtx, rtx);
12652 rtx out = d->target;
12653 rtx in0;
12654 machine_mode vmode = d->vmode;
12655 unsigned int i, elt, nelt = d->nelt;
12656 rtx lane;
12658 elt = d->perm[0];
12659 for (i = 1; i < nelt; i++)
12661 if (elt != d->perm[i])
12662 return false;
12665 /* The generic preparation in aarch64_expand_vec_perm_const_1
12666 swaps the operand order and the permute indices if it finds
12667 d->perm[0] to be in the second operand. Thus, we can always
12668 use d->op0 and need not do any extra arithmetic to get the
12669 correct lane number. */
12670 in0 = d->op0;
12671 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
12673 switch (vmode)
12675 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
12676 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
12677 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
12678 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
12679 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
12680 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
12681 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
12682 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
12683 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
12684 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
12685 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
12686 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
12687 default:
12688 return false;
12691 emit_insn (gen (out, in0, lane));
12692 return true;
12695 static bool
12696 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
12698 rtx rperm[MAX_VECT_LEN], sel;
12699 machine_mode vmode = d->vmode;
12700 unsigned int i, nelt = d->nelt;
12702 if (d->testing_p)
12703 return true;
12705 /* Generic code will try constant permutation twice. Once with the
12706 original mode and again with the elements lowered to QImode.
12707 So wait and don't do the selector expansion ourselves. */
12708 if (vmode != V8QImode && vmode != V16QImode)
12709 return false;
12711 for (i = 0; i < nelt; ++i)
12713 int nunits = GET_MODE_NUNITS (vmode);
12715       /* With big-endian and two input vectors we end up with a weird
12716 	 mixed-endian mode on NEON.  Reverse the index within each vector
12717 	 but not which vector is selected.  */
12718 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
12719 : d->perm[i]);
12721 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
12722 sel = force_reg (vmode, sel);
12724 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
12725 return true;
12728 static bool
12729 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
12731 /* The pattern matching functions above are written to look for a small
12732 number to begin the sequence (0, 1, N/2). If we begin with an index
12733 from the second operand, we can swap the operands. */
12734 if (d->perm[0] >= d->nelt)
12736 unsigned i, nelt = d->nelt;
12738 gcc_assert (nelt == (nelt & -nelt));
12739 for (i = 0; i < nelt; ++i)
12740 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
12742 std::swap (d->op0, d->op1);
12745 if (TARGET_SIMD)
12747 if (aarch64_evpc_rev (d))
12748 return true;
12749 else if (aarch64_evpc_ext (d))
12750 return true;
12751 else if (aarch64_evpc_dup (d))
12752 return true;
12753 else if (aarch64_evpc_zip (d))
12754 return true;
12755 else if (aarch64_evpc_uzp (d))
12756 return true;
12757 else if (aarch64_evpc_trn (d))
12758 return true;
12759 return aarch64_evpc_tbl (d);
12761 return false;
12764 /* Expand a vec_perm_const pattern. */
12766 bool
12767 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
12769 struct expand_vec_perm_d d;
12770 int i, nelt, which;
12772 d.target = target;
12773 d.op0 = op0;
12774 d.op1 = op1;
12776 d.vmode = GET_MODE (target);
12777 gcc_assert (VECTOR_MODE_P (d.vmode));
12778 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12779 d.testing_p = false;
12781 for (i = which = 0; i < nelt; ++i)
12783 rtx e = XVECEXP (sel, 0, i);
12784 int ei = INTVAL (e) & (2 * nelt - 1);
12785 which |= (ei < nelt ? 1 : 2);
12786 d.perm[i] = ei;
12789 switch (which)
12791 default:
12792 gcc_unreachable ();
12794 case 3:
12795 d.one_vector_p = false;
12796 if (!rtx_equal_p (op0, op1))
12797 break;
12799 /* The elements of PERM do not suggest that only the first operand
12800 is used, but both operands are identical. Allow easier matching
12801 of the permutation by folding the permutation into the single
12802 input vector. */
12803 /* Fall Through. */
12804 case 2:
12805 for (i = 0; i < nelt; ++i)
12806 d.perm[i] &= nelt - 1;
12807 d.op0 = op1;
12808 d.one_vector_p = true;
12809 break;
12811 case 1:
12812 d.op1 = op0;
12813 d.one_vector_p = true;
12814 break;
12817 return aarch64_expand_vec_perm_const_1 (&d);
12820 static bool
12821 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
12822 const unsigned char *sel)
12824 struct expand_vec_perm_d d;
12825 unsigned int i, nelt, which;
12826 bool ret;
12828 d.vmode = vmode;
12829 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
12830 d.testing_p = true;
12831 memcpy (d.perm, sel, nelt);
12833 /* Calculate whether all elements are in one vector. */
12834 for (i = which = 0; i < nelt; ++i)
12836 unsigned char e = d.perm[i];
12837 gcc_assert (e < 2 * nelt);
12838 which |= (e < nelt ? 1 : 2);
12841 /* If all elements are from the second vector, reindex as if from the
12842 first vector. */
12843 if (which == 2)
12844 for (i = 0; i < nelt; ++i)
12845 d.perm[i] -= nelt;
12847 /* Check whether the mask can be applied to a single vector. */
12848 d.one_vector_p = (which != 3);
12850 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
12851 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
12852 if (!d.one_vector_p)
12853 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
12855 start_sequence ();
12856 ret = aarch64_expand_vec_perm_const_1 (&d);
12857 end_sequence ();
12859 return ret;
12863 aarch64_reverse_mask (enum machine_mode mode)
12865   /* We have to reverse each vector because we don't have
12866 a permuted load that can reverse-load according to ABI rules. */
12867 rtx mask;
12868 rtvec v = rtvec_alloc (16);
12869 int i, j;
12870 int nunits = GET_MODE_NUNITS (mode);
12871 int usize = GET_MODE_UNIT_SIZE (mode);
12873 gcc_assert (BYTES_BIG_ENDIAN);
12874 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
12876 for (i = 0; i < nunits; i++)
12877 for (j = 0; j < usize; j++)
12878 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
12879 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
12880 return force_reg (V16QImode, mask);
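/* Worked example: for V4SImode the loop above builds the byte selector
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 },
   i.e. a TBL mask that byte-reverses each 4-byte element in place.  */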
12883 /* Implement MODES_TIEABLE_P. In principle we should always return true.
12884 However due to issues with register allocation it is preferable to avoid
12885    tying integer scalar and FP scalar modes.  Executing integer operations
12886 in general registers is better than treating them as scalar vector
12887 operations. This reduces latency and avoids redundant int<->FP moves.
12888 So tie modes if they are either the same class, or vector modes with
12889 other vector modes, vector structs or any scalar mode.
12892 bool
12893 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
12895 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
12896 return true;
12898 /* We specifically want to allow elements of "structure" modes to
12899 be tieable to the structure. This more general condition allows
12900 other rarer situations too. */
12901 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
12902 return true;
12904 /* Also allow any scalar modes with vectors. */
12905 if (aarch64_vector_mode_supported_p (mode1)
12906 || aarch64_vector_mode_supported_p (mode2))
12907 return true;
12909 return false;
12912 /* Return a new RTX holding the result of moving POINTER forward by
12913 AMOUNT bytes. */
12915 static rtx
12916 aarch64_move_pointer (rtx pointer, int amount)
12918 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
12920 return adjust_automodify_address (pointer, GET_MODE (pointer),
12921 next, amount);
12924 /* Return a new RTX holding the result of moving POINTER forward by the
12925 size of the mode it points to. */
12927 static rtx
12928 aarch64_progress_pointer (rtx pointer)
12930 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
12932 return aarch64_move_pointer (pointer, amount);
12935 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
12936 MODE bytes. */
12938 static void
12939 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
12940 machine_mode mode)
12942 rtx reg = gen_reg_rtx (mode);
12944 /* "Cast" the pointers to the correct mode. */
12945 *src = adjust_address (*src, mode, 0);
12946 *dst = adjust_address (*dst, mode, 0);
12947 /* Emit the memcpy. */
12948 emit_move_insn (reg, *src);
12949 emit_move_insn (*dst, reg);
12950 /* Move the pointers forward. */
12951 *src = aarch64_progress_pointer (*src);
12952 *dst = aarch64_progress_pointer (*dst);
12955 /* Expand movmem, as if from a __builtin_memcpy. Return true if
12956 we succeed, otherwise return false. */
12958 bool
12959 aarch64_expand_movmem (rtx *operands)
12961 unsigned int n;
12962 rtx dst = operands[0];
12963 rtx src = operands[1];
12964 rtx base;
12965 bool speed_p = !optimize_function_for_size_p (cfun);
12967 /* When optimizing for size, give a better estimate of the length of a
12968 memcpy call, but use the default otherwise. */
12969 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
12971 /* We can't do anything smart if the amount to copy is not constant. */
12972 if (!CONST_INT_P (operands[2]))
12973 return false;
12975 n = UINTVAL (operands[2]);
12977 /* Try to keep the number of instructions low. For cases below 16 bytes we
12978 need to make at most two moves. For cases above 16 bytes it will be one
12979 move for each 16 byte chunk, then at most two additional moves. */
12980 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
12981 return false;
12983 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
12984 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
12986 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
12987 src = adjust_automodify_address (src, VOIDmode, base, 0);
12989   /* Simple cases.  Copy 0-3 bytes: first (if applicable) a 2-byte chunk,
12990      then a 1-byte chunk.  */
12991 if (n < 4)
12993 if (n >= 2)
12995 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
12996 n -= 2;
12999 if (n == 1)
13000 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13002 return true;
13005 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13006 4-byte chunk, partially overlapping with the previously copied chunk. */
13007 if (n < 8)
13009 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13010 n -= 4;
13011 if (n > 0)
13013 int move = n - 4;
13015 src = aarch64_move_pointer (src, move);
13016 dst = aarch64_move_pointer (dst, move);
13017 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13019 return true;
13022 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13023 them, then (if applicable) an 8-byte chunk. */
13024 while (n >= 8)
13026 if (n / 16)
13028 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13029 n -= 16;
13031 else
13033 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13034 n -= 8;
13038 /* Finish the final bytes of the copy. We can always do this in one
13039 instruction. We either copy the exact amount we need, or partially
13040      overlap with the previous chunk we copied and copy 8 bytes.  */
13041 if (n == 0)
13042 return true;
13043 else if (n == 1)
13044 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13045 else if (n == 2)
13046 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13047 else if (n == 4)
13048 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13049 else
13051 if (n == 3)
13053 src = aarch64_move_pointer (src, -1);
13054 dst = aarch64_move_pointer (dst, -1);
13055 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13057 else
13059 int move = n - 8;
13061 src = aarch64_move_pointer (src, move);
13062 dst = aarch64_move_pointer (dst, move);
13063 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13067 return true;
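/* Illustrative, host-only sketch (hypothetical helper, not part of this
   file): print the chunk schedule the expander above would choose for a
   constant length N, mirroring its control flow but emitting printf
   calls instead of RTL moves.  */
#if 0
#include <stdio.h>

static void
print_copy_schedule (unsigned int n)
{
  unsigned int off = 0;

  if (n < 4)
    {
      if (n >= 2) { printf ("2-byte copy at %u\n", off); off += 2; n -= 2; }
      if (n == 1) printf ("1-byte copy at %u\n", off);
      return;
    }
  if (n < 8)
    {
      printf ("4-byte copy at 0\n");
      if (n > 4)  /* An overlapping 4-byte copy finishes the tail.  */
	printf ("4-byte copy at %u (overlaps)\n", n - 4);
      return;
    }
  while (n >= 8)
    {
      unsigned int size = n >= 16 ? 16 : 8;
      printf ("%u-byte copy at %u\n", size, off);
      off += size;
      n -= size;
    }
  if (n == 1 || n == 2 || n == 4)
    printf ("%u-byte copy at %u\n", n, off);
  else if (n == 3)
    printf ("4-byte copy at %u (overlaps)\n", off - 1);
  else if (n != 0)  /* 5, 6 or 7 bytes left.  */
    printf ("8-byte copy at %u (overlaps)\n", off + n - 8);
}

/* E.g. print_copy_schedule (23) shows a 16-byte copy at offset 0
   followed by an overlapping 8-byte copy at offset 15.  */
#endif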
13070 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13072 static unsigned HOST_WIDE_INT
13073 aarch64_asan_shadow_offset (void)
13075 return (HOST_WIDE_INT_1 << 36);
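/* For reference: with AddressSanitizer's default shadow scale of 3 on
   AArch64 (eight application bytes per shadow byte), an address is
   mapped as  shadow = (addr >> 3) + (1 << 36),  which is where the
   offset returned above ends up.  */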
13078 static bool
13079 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13080 unsigned int align,
13081 enum by_pieces_operation op,
13082 bool speed_p)
13084 /* STORE_BY_PIECES can be used when copying a constant string, but
13085 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13086 For now we always fail this and let the move_by_pieces code copy
13087 the string from read-only memory. */
13088 if (op == STORE_BY_PIECES)
13089 return false;
13091 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13094 static rtx
13095 aarch64_gen_ccmp_first (rtx *prep_seq, rtx *gen_seq,
13096 int code, tree treeop0, tree treeop1)
13098 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13099 rtx op0, op1;
13100 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13101 insn_code icode;
13102 struct expand_operand ops[4];
13104 start_sequence ();
13105 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13107 op_mode = GET_MODE (op0);
13108 if (op_mode == VOIDmode)
13109 op_mode = GET_MODE (op1);
13111 switch (op_mode)
13113 case QImode:
13114 case HImode:
13115 case SImode:
13116 cmp_mode = SImode;
13117 icode = CODE_FOR_cmpsi;
13118 break;
13120 case DImode:
13121 cmp_mode = DImode;
13122 icode = CODE_FOR_cmpdi;
13123 break;
13125 case SFmode:
13126 cmp_mode = SFmode;
13127 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13128 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13129 break;
13131 case DFmode:
13132 cmp_mode = DFmode;
13133 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13134 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13135 break;
13137 default:
13138 end_sequence ();
13139 return NULL_RTX;
13142 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13143 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13144 if (!op0 || !op1)
13146 end_sequence ();
13147 return NULL_RTX;
13149 *prep_seq = get_insns ();
13150 end_sequence ();
13152 create_fixed_operand (&ops[0], op0);
13153 create_fixed_operand (&ops[1], op1);
13155 start_sequence ();
13156 if (!maybe_expand_insn (icode, 2, ops))
13158 end_sequence ();
13159 return NULL_RTX;
13161 *gen_seq = get_insns ();
13162 end_sequence ();
13164 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13165 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13168 static rtx
13169 aarch64_gen_ccmp_next (rtx *prep_seq, rtx *gen_seq, rtx prev, int cmp_code,
13170 tree treeop0, tree treeop1, int bit_code)
13172 rtx op0, op1, target;
13173 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13174 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13175 insn_code icode;
13176 struct expand_operand ops[6];
13177 int aarch64_cond;
13179 push_to_sequence ((rtx_insn*) *prep_seq);
13180 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13182 op_mode = GET_MODE (op0);
13183 if (op_mode == VOIDmode)
13184 op_mode = GET_MODE (op1);
13186 switch (op_mode)
13188 case QImode:
13189 case HImode:
13190 case SImode:
13191 cmp_mode = SImode;
13192 icode = CODE_FOR_ccmpsi;
13193 break;
13195 case DImode:
13196 cmp_mode = DImode;
13197 icode = CODE_FOR_ccmpdi;
13198 break;
13200 case SFmode:
13201 cmp_mode = SFmode;
13202 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13203 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13204 break;
13206 case DFmode:
13207 cmp_mode = DFmode;
13208 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13209 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13210 break;
13212 default:
13213 end_sequence ();
13214 return NULL_RTX;
13217 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13218 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13219 if (!op0 || !op1)
13221 end_sequence ();
13222 return NULL_RTX;
13224 *prep_seq = get_insns ();
13225 end_sequence ();
13227 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13228 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13230 if (bit_code != AND)
13232 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13233 GET_MODE (XEXP (prev, 0))),
13234 VOIDmode, XEXP (prev, 0), const0_rtx);
13235 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13238 create_fixed_operand (&ops[0], XEXP (prev, 0));
13239 create_fixed_operand (&ops[1], target);
13240 create_fixed_operand (&ops[2], op0);
13241 create_fixed_operand (&ops[3], op1);
13242 create_fixed_operand (&ops[4], prev);
13243 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13245 push_to_sequence ((rtx_insn*) *gen_seq);
13246 if (!maybe_expand_insn (icode, 6, ops))
13248 end_sequence ();
13249 return NULL_RTX;
13252 *gen_seq = get_insns ();
13253 end_sequence ();
13255 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
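/* Illustrative only (exact registers and NZCV immediates depend on the
   surrounding code): for a condition such as  a == 0 && b == 42  the two
   hooks above cooperate to build a conditional-compare chain roughly of
   the form

       cmp   w0, 0
       ccmp  w1, 42, 0, eq    // compares only if the first test passed;
                              // otherwise NZCV is forced to 0 so EQ fails
       cset  w0, eq                                                       */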
13258 #undef TARGET_GEN_CCMP_FIRST
13259 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13261 #undef TARGET_GEN_CCMP_NEXT
13262 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13264 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13265 instruction fusion of some sort. */
13267 static bool
13268 aarch64_macro_fusion_p (void)
13270 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13274 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13275 should be kept together during scheduling. */
13277 static bool
13278 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13280 rtx set_dest;
13281 rtx prev_set = single_set (prev);
13282 rtx curr_set = single_set (curr);
13283 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13284 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13286 if (!aarch64_macro_fusion_p ())
13287 return false;
13289 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13291 /* We are trying to match:
13292 prev (mov) == (set (reg r0) (const_int imm16))
13293 curr (movk) == (set (zero_extract (reg r0)
13294 (const_int 16)
13295 (const_int 16))
13296 (const_int imm16_1)) */
13298 set_dest = SET_DEST (curr_set);
13300 if (GET_CODE (set_dest) == ZERO_EXTRACT
13301 && CONST_INT_P (SET_SRC (curr_set))
13302 && CONST_INT_P (SET_SRC (prev_set))
13303 && CONST_INT_P (XEXP (set_dest, 2))
13304 && INTVAL (XEXP (set_dest, 2)) == 16
13305 && REG_P (XEXP (set_dest, 0))
13306 && REG_P (SET_DEST (prev_set))
13307 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13309 return true;
13313 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13316 /* We're trying to match:
13317 prev (adrp) == (set (reg r1)
13318 (high (symbol_ref ("SYM"))))
13319 curr (add) == (set (reg r0)
13320 (lo_sum (reg r1)
13321 (symbol_ref ("SYM"))))
13322 Note that r0 need not necessarily be the same as r1, especially
13323 during pre-regalloc scheduling. */
13325 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13326 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13328 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13329 && REG_P (XEXP (SET_SRC (curr_set), 0))
13330 && REGNO (XEXP (SET_SRC (curr_set), 0))
13331 == REGNO (SET_DEST (prev_set))
13332 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13333 XEXP (SET_SRC (curr_set), 1)))
13334 return true;
13338 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13341 /* We're trying to match:
13342 prev (movk) == (set (zero_extract (reg r0)
13343 (const_int 16)
13344 (const_int 32))
13345 (const_int imm16_1))
13346 curr (movk) == (set (zero_extract (reg r0)
13347 (const_int 16)
13348 (const_int 48))
13349 (const_int imm16_2)) */
13351 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13352 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13353 && REG_P (XEXP (SET_DEST (prev_set), 0))
13354 && REG_P (XEXP (SET_DEST (curr_set), 0))
13355 && REGNO (XEXP (SET_DEST (prev_set), 0))
13356 == REGNO (XEXP (SET_DEST (curr_set), 0))
13357 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13358 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13359 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13360 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13361 && CONST_INT_P (SET_SRC (prev_set))
13362 && CONST_INT_P (SET_SRC (curr_set)))
13363 return true;
13366 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13368 /* We're trying to match:
13369 prev (adrp) == (set (reg r0)
13370 (high (symbol_ref ("SYM"))))
13371 curr (ldr) == (set (reg r1)
13372 (mem (lo_sum (reg r0)
13373 (symbol_ref ("SYM")))))
13375 curr (ldr) == (set (reg r1)
13376 (zero_extend (mem
13377 (lo_sum (reg r0)
13378 (symbol_ref ("SYM")))))) */
13379 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13380 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13382 rtx curr_src = SET_SRC (curr_set);
13384 if (GET_CODE (curr_src) == ZERO_EXTEND)
13385 curr_src = XEXP (curr_src, 0);
13387 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13388 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13389 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13390 == REGNO (SET_DEST (prev_set))
13391 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13392 XEXP (SET_SRC (prev_set), 0)))
13393 return true;
13397 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13398 && aarch_crypto_can_dual_issue (prev, curr))
13399 return true;
13401 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13402 && any_condjump_p (curr))
13404 enum attr_type prev_type = get_attr_type (prev);
13406       /* FIXME: this misses some instructions which ThunderX considers to be
13407 	 simple arithmetic; simple shifts, for example, are missed here.  */
13408 if (prev_type == TYPE_ALUS_SREG
13409 || prev_type == TYPE_ALUS_IMM
13410 || prev_type == TYPE_LOGICS_REG
13411 || prev_type == TYPE_LOGICS_IMM)
13412 return true;
13415 return false;
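/* Illustrative instances of the pairs matched above (registers and
   symbols are arbitrary):

     AARCH64_FUSE_MOV_MOVK:    mov  x0, 0x1234
                               movk x0, 0x5678, lsl 16

     AARCH64_FUSE_ADRP_ADD:    adrp x1, sym
                               add  x1, x1, :lo12:sym

     AARCH64_FUSE_ADRP_LDR:    adrp x2, sym
                               ldr  x3, [x2, :lo12:sym]

     AARCH64_FUSE_CMP_BRANCH:  subs w0, w0, 1
                               b.ne .L1                                    */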
13418 /* Return true iff the instruction fusion described by OP is enabled. */
13420 bool
13421 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13423 return (aarch64_tune_params.fusible_ops & op) != 0;
13426 /* If MEM is in the form of [base+offset], extract the two parts of the
13427    address and store them in BASE and OFFSET; otherwise return false
13428    after clearing BASE and OFFSET.  */
13430 bool
13431 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
13433 rtx addr;
13435 gcc_assert (MEM_P (mem));
13437 addr = XEXP (mem, 0);
13439 if (REG_P (addr))
13441 *base = addr;
13442 *offset = const0_rtx;
13443 return true;
13446 if (GET_CODE (addr) == PLUS
13447 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
13449 *base = XEXP (addr, 0);
13450 *offset = XEXP (addr, 1);
13451 return true;
13454 *base = NULL_RTX;
13455 *offset = NULL_RTX;
13457 return false;
13460 /* Types for scheduling fusion. */
13461 enum sched_fusion_type
13463 SCHED_FUSION_NONE = 0,
13464 SCHED_FUSION_LD_SIGN_EXTEND,
13465 SCHED_FUSION_LD_ZERO_EXTEND,
13466 SCHED_FUSION_LD,
13467 SCHED_FUSION_ST,
13468 SCHED_FUSION_NUM
13471 /* If INSN is a load or store whose address is in the form [base+offset],
13472    extract the two parts and store them in BASE and OFFSET.  Return the
13473    scheduling fusion type of INSN.  */
13475 static enum sched_fusion_type
13476 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
13478 rtx x, dest, src;
13479 enum sched_fusion_type fusion = SCHED_FUSION_LD;
13481 gcc_assert (INSN_P (insn));
13482 x = PATTERN (insn);
13483 if (GET_CODE (x) != SET)
13484 return SCHED_FUSION_NONE;
13486 src = SET_SRC (x);
13487 dest = SET_DEST (x);
13489 machine_mode dest_mode = GET_MODE (dest);
13491 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
13492 return SCHED_FUSION_NONE;
13494 if (GET_CODE (src) == SIGN_EXTEND)
13496 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
13497 src = XEXP (src, 0);
13498 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13499 return SCHED_FUSION_NONE;
13501 else if (GET_CODE (src) == ZERO_EXTEND)
13503 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
13504 src = XEXP (src, 0);
13505 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
13506 return SCHED_FUSION_NONE;
13509 if (GET_CODE (src) == MEM && REG_P (dest))
13510 extract_base_offset_in_addr (src, base, offset);
13511 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
13513 fusion = SCHED_FUSION_ST;
13514 extract_base_offset_in_addr (dest, base, offset);
13516 else
13517 return SCHED_FUSION_NONE;
13519 if (*base == NULL_RTX || *offset == NULL_RTX)
13520 fusion = SCHED_FUSION_NONE;
13522 return fusion;
13525 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
13527    Currently we only support fusing ldr and str instructions, so FUSION_PRI
13528    and PRI are only calculated for these instructions.  For other instructions,
13529    FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, other
13530    types of instruction fusion can be added by returning different priorities.
13532 It's important that irrelevant instructions get the largest FUSION_PRI. */
13534 static void
13535 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
13536 int *fusion_pri, int *pri)
13538 int tmp, off_val;
13539 rtx base, offset;
13540 enum sched_fusion_type fusion;
13542 gcc_assert (INSN_P (insn));
13544 tmp = max_pri - 1;
13545 fusion = fusion_load_store (insn, &base, &offset);
13546 if (fusion == SCHED_FUSION_NONE)
13548 *pri = tmp;
13549 *fusion_pri = tmp;
13550 return;
13553 /* Set FUSION_PRI according to fusion type and base register. */
13554 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
13556 /* Calculate PRI. */
13557 tmp /= 2;
13559 /* INSN with smaller offset goes first. */
13560 off_val = (int)(INTVAL (offset));
13561 if (off_val >= 0)
13562 tmp -= (off_val & 0xfffff);
13563 else
13564 tmp += ((- off_val) & 0xfffff);
13566 *pri = tmp;
13567 return;
13570 /* Given OPERANDS of consecutive load/store, check if we can merge
13571 them into ldp/stp. LOAD is true if they are load instructions.
13572 MODE is the mode of memory operands. */
13574 bool
13575 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
13576 enum machine_mode mode)
13578 HOST_WIDE_INT offval_1, offval_2, msize;
13579 enum reg_class rclass_1, rclass_2;
13580 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
13582 if (load)
13584 mem_1 = operands[1];
13585 mem_2 = operands[3];
13586 reg_1 = operands[0];
13587 reg_2 = operands[2];
13588 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
13589 if (REGNO (reg_1) == REGNO (reg_2))
13590 return false;
13592 else
13594 mem_1 = operands[0];
13595 mem_2 = operands[2];
13596 reg_1 = operands[1];
13597 reg_2 = operands[3];
13600 /* The mems cannot be volatile. */
13601 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
13602 return false;
13604 /* Check if the addresses are in the form of [base+offset]. */
13605 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13606 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13607 return false;
13608 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13609 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13610 return false;
13612 /* Check if the bases are same. */
13613 if (!rtx_equal_p (base_1, base_2))
13614 return false;
13616 offval_1 = INTVAL (offset_1);
13617 offval_2 = INTVAL (offset_2);
13618 msize = GET_MODE_SIZE (mode);
13619 /* Check if the offsets are consecutive. */
13620 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
13621 return false;
13623 /* Check if the addresses are clobbered by load. */
13624 if (load)
13626 if (reg_mentioned_p (reg_1, mem_1))
13627 return false;
13629 /* In increasing order, the last load can clobber the address. */
13630 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
13631 return false;
13634 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13635 rclass_1 = FP_REGS;
13636 else
13637 rclass_1 = GENERAL_REGS;
13639 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13640 rclass_2 = FP_REGS;
13641 else
13642 rclass_2 = GENERAL_REGS;
13644 /* Check if the registers are of same class. */
13645 if (rclass_1 != rclass_2)
13646 return false;
13648 return true;
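/* Worked example: the pair

     ldr  x0, [x2]
     ldr  x1, [x2, 8]

   passes the checks above (same base, consecutive DImode offsets, both
   destinations in GENERAL_REGS, no clobber of x2) and can therefore be
   merged into

     ldp  x0, x1, [x2]                                                     */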
13651 /* Given OPERANDS of consecutive load/store, check if we can merge
13652 them into ldp/stp by adjusting the offset. LOAD is true if they
13653 are load instructions. MODE is the mode of memory operands.
13655    Given the consecutive stores below:
13657 str w1, [xb, 0x100]
13658 str w1, [xb, 0x104]
13659 str w1, [xb, 0x108]
13660 str w1, [xb, 0x10c]
13662 Though the offsets are out of the range supported by stp, we can
13663 still pair them after adjusting the offset, like:
13665 add scratch, xb, 0x100
13666 stp w1, w1, [scratch]
13667 stp w1, w1, [scratch, 0x8]
13669 The peephole patterns detecting this opportunity should guarantee
13670    the scratch register is available.  */
13672 bool
13673 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
13674 enum machine_mode mode)
13676 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
13677 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
13678 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
13679 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
13681 if (load)
13683 reg_1 = operands[0];
13684 mem_1 = operands[1];
13685 reg_2 = operands[2];
13686 mem_2 = operands[3];
13687 reg_3 = operands[4];
13688 mem_3 = operands[5];
13689 reg_4 = operands[6];
13690 mem_4 = operands[7];
13691 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
13692 && REG_P (reg_3) && REG_P (reg_4));
13693 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
13694 return false;
13696 else
13698 mem_1 = operands[0];
13699 reg_1 = operands[1];
13700 mem_2 = operands[2];
13701 reg_2 = operands[3];
13702 mem_3 = operands[4];
13703 reg_3 = operands[5];
13704 mem_4 = operands[6];
13705 reg_4 = operands[7];
13707   /* Skip if the memory operand is by itself valid for ldp/stp.  */
13708 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
13709 return false;
13711 /* The mems cannot be volatile. */
13712 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
13713       || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
13714 return false;
13716 /* Check if the addresses are in the form of [base+offset]. */
13717 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
13718 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
13719 return false;
13720 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
13721 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
13722 return false;
13723 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
13724 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
13725 return false;
13726 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
13727 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
13728 return false;
13730 /* Check if the bases are same. */
13731 if (!rtx_equal_p (base_1, base_2)
13732 || !rtx_equal_p (base_2, base_3)
13733 || !rtx_equal_p (base_3, base_4))
13734 return false;
13736 offval_1 = INTVAL (offset_1);
13737 offval_2 = INTVAL (offset_2);
13738 offval_3 = INTVAL (offset_3);
13739 offval_4 = INTVAL (offset_4);
13740 msize = GET_MODE_SIZE (mode);
13741 /* Check if the offsets are consecutive. */
13742 if ((offval_1 != (offval_2 + msize)
13743 || offval_1 != (offval_3 + msize * 2)
13744 || offval_1 != (offval_4 + msize * 3))
13745 && (offval_4 != (offval_3 + msize)
13746 || offval_4 != (offval_2 + msize * 2)
13747 || offval_4 != (offval_1 + msize * 3)))
13748 return false;
13750 /* Check if the addresses are clobbered by load. */
13751 if (load)
13753 if (reg_mentioned_p (reg_1, mem_1)
13754 || reg_mentioned_p (reg_2, mem_2)
13755 || reg_mentioned_p (reg_3, mem_3))
13756 return false;
13758 /* In increasing order, the last load can clobber the address. */
13759 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
13760 return false;
13763 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
13764 rclass_1 = FP_REGS;
13765 else
13766 rclass_1 = GENERAL_REGS;
13768 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
13769 rclass_2 = FP_REGS;
13770 else
13771 rclass_2 = GENERAL_REGS;
13773 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
13774 rclass_3 = FP_REGS;
13775 else
13776 rclass_3 = GENERAL_REGS;
13778 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
13779 rclass_4 = FP_REGS;
13780 else
13781 rclass_4 = GENERAL_REGS;
13783 /* Check if the registers are of same class. */
13784 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
13785 return false;
13787 return true;
13790 /* Given OPERANDS of consecutive load/store, this function pairs them
13791 into ldp/stp after adjusting the offset. It depends on the fact
13792 that addresses of load/store instructions are in increasing order.
13793 MODE is the mode of memory operands. CODE is the rtl operator
13794 which should be applied to all memory operands, it's SIGN_EXTEND,
13795 ZERO_EXTEND or UNKNOWN. */
13797 bool
13798 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
13799 enum machine_mode mode, RTX_CODE code)
13801 rtx base, offset, t1, t2;
13802 rtx mem_1, mem_2, mem_3, mem_4;
13803 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
13805 if (load)
13807 mem_1 = operands[1];
13808 mem_2 = operands[3];
13809 mem_3 = operands[5];
13810 mem_4 = operands[7];
13812 else
13814 mem_1 = operands[0];
13815 mem_2 = operands[2];
13816 mem_3 = operands[4];
13817 mem_4 = operands[6];
13818 gcc_assert (code == UNKNOWN);
13821 extract_base_offset_in_addr (mem_1, &base, &offset);
13822 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
13824   /* Adjust the offset so that it can fit in an ldp/stp instruction.  */
13825 msize = GET_MODE_SIZE (mode);
13826 stp_off_limit = msize * 0x40;
13827 off_val = INTVAL (offset);
13828 abs_off = (off_val < 0) ? -off_val : off_val;
13829 new_off = abs_off % stp_off_limit;
13830 adj_off = abs_off - new_off;
13832 /* Further adjust to make sure all offsets are OK. */
13833 if ((new_off + msize * 2) >= stp_off_limit)
13835 adj_off += stp_off_limit;
13836 new_off -= stp_off_limit;
13839 /* Make sure the adjustment can be done with ADD/SUB instructions. */
13840 if (adj_off >= 0x1000)
13841 return false;
13843 if (off_val < 0)
13845 adj_off = -adj_off;
13846 new_off = -new_off;
13849 /* Create new memory references. */
13850 mem_1 = change_address (mem_1, VOIDmode,
13851 plus_constant (DImode, operands[8], new_off));
13853 /* Check if the adjusted address is OK for ldp/stp. */
13854 if (!aarch64_mem_pair_operand (mem_1, mode))
13855 return false;
13857 msize = GET_MODE_SIZE (mode);
13858 mem_2 = change_address (mem_2, VOIDmode,
13859 plus_constant (DImode,
13860 operands[8],
13861 new_off + msize));
13862 mem_3 = change_address (mem_3, VOIDmode,
13863 plus_constant (DImode,
13864 operands[8],
13865 new_off + msize * 2));
13866 mem_4 = change_address (mem_4, VOIDmode,
13867 plus_constant (DImode,
13868 operands[8],
13869 new_off + msize * 3));
13871 if (code == ZERO_EXTEND)
13873 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
13874 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
13875 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
13876 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
13878 else if (code == SIGN_EXTEND)
13880 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
13881 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
13882 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
13883 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
13886 if (load)
13888 operands[1] = mem_1;
13889 operands[3] = mem_2;
13890 operands[5] = mem_3;
13891 operands[7] = mem_4;
13893 else
13895 operands[0] = mem_1;
13896 operands[2] = mem_2;
13897 operands[4] = mem_3;
13898 operands[6] = mem_4;
13901 /* Emit adjusting instruction. */
13902 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
13903 /* Emit ldp/stp instructions. */
13904 t1 = gen_rtx_SET (operands[0], operands[1]);
13905 t2 = gen_rtx_SET (operands[2], operands[3]);
13906 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13907 t1 = gen_rtx_SET (operands[4], operands[5]);
13908 t2 = gen_rtx_SET (operands[6], operands[7]);
13909 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
13910 return true;
13913 /* Return true if a pseudo register should be created and used to hold
13914    the GOT address for PIC code.  */
13916 bool
13917 aarch64_use_pseudo_pic_reg (void)
13919 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;

/* Implement TARGET_UNSPEC_MAY_TRAP_P.  */

static int
aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
{
  switch (XINT (x, 1))
    {
    case UNSPEC_GOTSMALLPIC:
    case UNSPEC_GOTSMALLPIC28K:
    case UNSPEC_GOTTINYPIC:
      return 0;
    default:
      break;
    }

  return default_unspec_may_trap_p (x, flags);
}

/* If X is a positive CONST_DOUBLE with a value that is a power of 2
   return the log2 of that value.  Otherwise return -1.  */
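/* For instance (illustrative): 4.0 yields 2 and 1.0 yields 0, while
   0.5, -2.0 and 3.0 all yield -1, since they are not positive integral
   powers of 2.  */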

int
aarch64_fpconst_pow_of_2 (rtx x)
{
  const REAL_VALUE_TYPE *r;

  if (!CONST_DOUBLE_P (x))
    return -1;

  r = CONST_DOUBLE_REAL_VALUE (x);

  if (REAL_VALUE_NEGATIVE (*r)
      || REAL_VALUE_ISNAN (*r)
      || REAL_VALUE_ISINF (*r)
      || !real_isinteger (r, DFmode))
    return -1;

  return exact_log2 (real_to_integer (r));
}

/* If X is a vector of equal CONST_DOUBLE values and that value is
   Y, return the aarch64_fpconst_pow_of_2 of Y.  Otherwise return -1.  */

int
aarch64_vec_fpconst_pow_of_2 (rtx x)
{
  if (GET_CODE (x) != CONST_VECTOR)
    return -1;

  if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
    return -1;

  int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
  if (firstval <= 0)
    return -1;

  for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
    if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
      return -1;

  return firstval;
}

/* Implement TARGET_PROMOTED_TYPE to promote __fp16 to float.  */
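/* Illustrative consequence: for "__fp16 a, b, c; c = a + b;" the
   addition is evaluated in float and only the final assignment
   truncates back to __fp16, consistent with the ACLE rules for
   __fp16 arithmetic.  */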
static tree
aarch64_promoted_type (const_tree t)
{
  if (SCALAR_FLOAT_TYPE_P (t) && TYPE_PRECISION (t) == 16)
    return float_type_node;
  return NULL_TREE;
}

/* Implement the TARGET_OPTAB_SUPPORTED_P hook.  */

static bool
aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
			   optimization_type opt_type)
{
  switch (op)
    {
    case rsqrt_optab:
      return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);

    default:
      return true;
    }
}

#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost

/* This hook determines whether unnamed bitfields affect the alignment
   of the containing structure.  The hook returns true if the structure
   should inherit the alignment requirements of an unnamed bitfield's
   type.  */
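/* For example (illustrative; the exact layout is governed by the
   AAPCS64): with this hook returning true, a type such as

     struct s { char c; int : 4; };

   inherits the 4-byte alignment of the unnamed bitfield's declared
   type rather than the 1-byte alignment of its named members.  */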
#undef TARGET_ALIGN_ANON_BITFIELD
#define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true

#undef TARGET_ASM_ALIGNED_DI_OP
#define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"

#undef TARGET_ASM_ALIGNED_HI_OP
#define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"

#undef TARGET_ASM_ALIGNED_SI_OP
#define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"

#undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
#define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
  hook_bool_const_tree_hwi_hwi_const_tree_true

#undef TARGET_ASM_FILE_START
#define TARGET_ASM_FILE_START aarch64_start_file

#undef TARGET_ASM_OUTPUT_MI_THUNK
#define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk

#undef TARGET_ASM_SELECT_RTX_SECTION
#define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section

#undef TARGET_ASM_TRAMPOLINE_TEMPLATE
#define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template

#undef TARGET_BUILD_BUILTIN_VA_LIST
#define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list

#undef TARGET_CALLEE_COPIES
#define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false

#undef TARGET_CAN_ELIMINATE
#define TARGET_CAN_ELIMINATE aarch64_can_eliminate

#undef TARGET_CAN_INLINE_P
#define TARGET_CAN_INLINE_P aarch64_can_inline_p

#undef TARGET_CANNOT_FORCE_CONST_MEM
#define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem

#undef TARGET_CASE_VALUES_THRESHOLD
#define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold

#undef TARGET_CONDITIONAL_REGISTER_USAGE
#define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage

/* Only the least significant bit is used for initialization guard
   variables.  */
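/* Illustratively (an assumption about the front end's expansion, not
   code from this file): the C++ front end then emits an inline check
   equivalent to "if ((guard & 1) == 0) ..." before running a static
   initializer, rather than testing the whole first byte of the guard
   variable.  */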
#undef TARGET_CXX_GUARD_MASK_BIT
#define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true

#undef TARGET_C_MODE_FOR_SUFFIX
#define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix

#ifdef TARGET_BIG_ENDIAN_DEFAULT
#undef TARGET_DEFAULT_TARGET_FLAGS
#define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
#endif

#undef TARGET_CLASS_MAX_NREGS
#define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs

#undef TARGET_BUILTIN_DECL
#define TARGET_BUILTIN_DECL aarch64_builtin_decl

#undef TARGET_BUILTIN_RECIPROCAL
#define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal

#undef TARGET_EXPAND_BUILTIN
#define TARGET_EXPAND_BUILTIN aarch64_expand_builtin

#undef TARGET_EXPAND_BUILTIN_VA_START
#define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start

#undef TARGET_FOLD_BUILTIN
#define TARGET_FOLD_BUILTIN aarch64_fold_builtin

#undef TARGET_FUNCTION_ARG
#define TARGET_FUNCTION_ARG aarch64_function_arg

#undef TARGET_FUNCTION_ARG_ADVANCE
#define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance

#undef TARGET_FUNCTION_ARG_BOUNDARY
#define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary

#undef TARGET_FUNCTION_OK_FOR_SIBCALL
#define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall

#undef TARGET_FUNCTION_VALUE
#define TARGET_FUNCTION_VALUE aarch64_function_value

#undef TARGET_FUNCTION_VALUE_REGNO_P
#define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p

#undef TARGET_FRAME_POINTER_REQUIRED
#define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required

#undef TARGET_GIMPLE_FOLD_BUILTIN
#define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin

#undef TARGET_GIMPLIFY_VA_ARG_EXPR
#define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr

#undef TARGET_INIT_BUILTINS
#define TARGET_INIT_BUILTINS aarch64_init_builtins

#undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
#define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
  aarch64_ira_change_pseudo_allocno_class

#undef TARGET_LEGITIMATE_ADDRESS_P
#define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p

#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p

#undef TARGET_LIBGCC_CMP_RETURN_MODE
#define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode

#undef TARGET_LRA_P
#define TARGET_LRA_P hook_bool_void_true

#undef TARGET_MANGLE_TYPE
#define TARGET_MANGLE_TYPE aarch64_mangle_type

#undef TARGET_MEMORY_MOVE_COST
#define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost

#undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
#define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul

#undef TARGET_MUST_PASS_IN_STACK
#define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size

/* This target hook should return true if accesses to volatile bitfields
   should use the narrowest mode possible.  It should return false if these
   accesses should use the bitfield container type.  */
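/* For instance (illustrative): given

     struct s { volatile int flag : 1; };

   returning false here means a read of "flag" uses a 32-bit access to
   the int container rather than the narrowest possible (byte) access,
   as the AAPCS64 requires for volatile bitfields.  */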
#undef TARGET_NARROW_VOLATILE_BITFIELD
#define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false

#undef TARGET_OPTION_OVERRIDE
#define TARGET_OPTION_OVERRIDE aarch64_override_options

#undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
#define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
  aarch64_override_options_after_change

#undef TARGET_OPTION_SAVE
#define TARGET_OPTION_SAVE aarch64_option_save

#undef TARGET_OPTION_RESTORE
#define TARGET_OPTION_RESTORE aarch64_option_restore

#undef TARGET_OPTION_PRINT
#define TARGET_OPTION_PRINT aarch64_option_print

#undef TARGET_OPTION_VALID_ATTRIBUTE_P
#define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p

#undef TARGET_SET_CURRENT_FUNCTION
#define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function

#undef TARGET_PASS_BY_REFERENCE
#define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference

#undef TARGET_PREFERRED_RELOAD_CLASS
#define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class

#undef TARGET_SCHED_REASSOCIATION_WIDTH
#define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width

#undef TARGET_PROMOTED_TYPE
#define TARGET_PROMOTED_TYPE aarch64_promoted_type

#undef TARGET_SECONDARY_RELOAD
#define TARGET_SECONDARY_RELOAD aarch64_secondary_reload

#undef TARGET_SHIFT_TRUNCATION_MASK
#define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask

#undef TARGET_SETUP_INCOMING_VARARGS
#define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs

#undef TARGET_STRUCT_VALUE_RTX
#define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx

#undef TARGET_REGISTER_MOVE_COST
#define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost

#undef TARGET_RETURN_IN_MEMORY
#define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory

#undef TARGET_RETURN_IN_MSB
#define TARGET_RETURN_IN_MSB aarch64_return_in_msb

#undef TARGET_RTX_COSTS
#define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper

#undef TARGET_SCHED_ISSUE_RATE
#define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
  aarch64_sched_first_cycle_multipass_dfa_lookahead

#undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
#define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
  aarch64_first_cycle_multipass_dfa_lookahead_guard

#undef TARGET_TRAMPOLINE_INIT
#define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init

#undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
#define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p

#undef TARGET_VECTOR_MODE_SUPPORTED_P
#define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p

#undef TARGET_ARRAY_MODE_SUPPORTED_P
#define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

#undef TARGET_VECTORIZE_ADD_STMT_COST
#define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
#define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
  aarch64_builtin_vectorization_cost

#undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
#define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode

#undef TARGET_VECTORIZE_BUILTINS
#define TARGET_VECTORIZE_BUILTINS

#undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
#define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
  aarch64_builtin_vectorized_function

#undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
#define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
  aarch64_autovectorize_vector_sizes

#undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
#define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
  aarch64_atomic_assign_expand_fenv

/* Section anchor support.  */

#undef TARGET_MIN_ANCHOR_OFFSET
#define TARGET_MIN_ANCHOR_OFFSET -256

/* Limit the maximum anchor offset to 4k-1, since that's the limit for a
   byte offset; we can do much more for larger data types, but have no way
   to determine the size of the access.  We assume accesses are aligned.  */
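/* Illustrative encoding limits (not from the original sources):
   "ldrb w0, [x1, #4095]" is the largest unsigned-immediate form for a
   byte access, whereas an 8-byte "ldr x0, [x1, #32760]" reaches much
   further; since the access size is unknown here, the conservative
   4095 bound is used.  */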
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
#define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
  aarch64_simd_vector_alignment_reachable

/* vec_perm support.  */

#undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
#define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
  aarch64_vectorize_vec_perm_const_ok

#undef TARGET_INIT_LIBFUNCS
#define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs

#undef TARGET_FIXED_CONDITION_CODE_REGS
#define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs

#undef TARGET_FLAGS_REGNUM
#define TARGET_FLAGS_REGNUM CC_REGNUM

#undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
#define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true

#undef TARGET_ASAN_SHADOW_OFFSET
#define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset

#undef TARGET_LEGITIMIZE_ADDRESS
#define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address

#undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
#define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
  aarch64_use_by_pieces_infrastructure_p

#undef TARGET_CAN_USE_DOLOOP_P
#define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost

#undef TARGET_SCHED_MACRO_FUSION_P
#define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p

#undef TARGET_SCHED_MACRO_FUSION_PAIR_P
#define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p

#undef TARGET_SCHED_FUSION_PRIORITY
#define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority

#undef TARGET_UNSPEC_MAY_TRAP_P
#define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p

#undef TARGET_USE_PSEUDO_PIC_REG
#define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg

#undef TARGET_PRINT_OPERAND
#define TARGET_PRINT_OPERAND aarch64_print_operand

#undef TARGET_PRINT_OPERAND_ADDRESS
#define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address

#undef TARGET_OPTAB_SUPPORTED_P
#define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p

#undef TARGET_OMIT_STRUCT_RETURN_REG
#define TARGET_OMIT_STRUCT_RETURN_REG true

struct gcc_target targetm = TARGET_INITIALIZER;

#include "gt-aarch64.h"