[official-gcc.git] / gcc / config / aarch64 / aarch64.c
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67 #include "selftest.h"
68 #include "selftest-rtl.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
76 /* Classifies an address.
78 ADDRESS_REG_IMM
79 A simple base register plus immediate offset.
81 ADDRESS_REG_WB
82 A base register indexed by immediate offset with writeback.
84 ADDRESS_REG_REG
85 A base register indexed by (optionally scaled) register.
87 ADDRESS_REG_UXTW
88 A base register indexed by (optionally scaled) zero-extended register.
90 ADDRESS_REG_SXTW
91 A base register indexed by (optionally scaled) sign-extended register.
93 ADDRESS_LO_SUM
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
96 ADDRESS_SYMBOLIC
97 A constant symbolic address, in the pc-relative literal pool. */
99 enum aarch64_address_type {
100 ADDRESS_REG_IMM,
101 ADDRESS_REG_WB,
102 ADDRESS_REG_REG,
103 ADDRESS_REG_UXTW,
104 ADDRESS_REG_SXTW,
105 ADDRESS_LO_SUM,
106 ADDRESS_SYMBOLIC
109 struct aarch64_address_info {
110 enum aarch64_address_type type;
111 rtx base;
112 rtx offset;
113 int shift;
114 enum aarch64_symbol_type symbol_type;
117 struct simd_immediate_info
119 rtx value;
120 int shift;
121 int element_width;
122 bool mvn;
123 bool msl;
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel;
129 #ifdef HAVE_AS_TLS
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
132 #endif
134 static bool aarch64_composite_type_p (const_tree, machine_mode);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
136 const_tree,
137 machine_mode *, int *,
138 bool *);
139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
147 const_tree type,
148 int misalignment,
149 bool is_packed);
151 /* Major revision number of the ARM Architecture implemented by the target. */
152 unsigned aarch64_architecture_version;
154 /* The processor for which instructions should be scheduled. */
155 enum aarch64_processor aarch64_tune = cortexa53;
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
160 /* Global flag for PC relative loads. */
161 bool aarch64_pcrelative_literal_loads;
163 /* Support for command line parsing of boolean flags in the tuning
164 structures. */
165 struct aarch64_flag_desc
167 const char* name;
168 unsigned int flag;
171 #define AARCH64_FUSION_PAIR(name, internal_name) \
172 { name, AARCH64_FUSE_##internal_name },
173 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
175 { "none", AARCH64_FUSE_NOTHING },
176 #include "aarch64-fusion-pairs.def"
177 { "all", AARCH64_FUSE_ALL },
178 { NULL, AARCH64_FUSE_NOTHING }
181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
182 { name, AARCH64_EXTRA_TUNE_##internal_name },
183 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
185 { "none", AARCH64_EXTRA_TUNE_NONE },
186 #include "aarch64-tuning-flags.def"
187 { "all", AARCH64_EXTRA_TUNE_ALL },
188 { NULL, AARCH64_EXTRA_TUNE_NONE }
191 /* Tuning parameters. */
193 static const struct cpu_addrcost_table generic_addrcost_table =
196 0, /* hi */
197 0, /* si */
198 0, /* di */
199 0, /* ti */
201 0, /* pre_modify */
202 0, /* post_modify */
203 0, /* register_offset */
204 0, /* register_sextend */
205 0, /* register_zextend */
206 0 /* imm_offset */
209 static const struct cpu_addrcost_table cortexa57_addrcost_table =
212 1, /* hi */
213 0, /* si */
214 0, /* di */
215 1, /* ti */
217 0, /* pre_modify */
218 0, /* post_modify */
219 0, /* register_offset */
220 0, /* register_sextend */
221 0, /* register_zextend */
222 0, /* imm_offset */
225 static const struct cpu_addrcost_table exynosm1_addrcost_table =
228 0, /* hi */
229 0, /* si */
230 0, /* di */
231 2, /* ti */
233 0, /* pre_modify */
234 0, /* post_modify */
235 1, /* register_offset */
236 1, /* register_sextend */
237 2, /* register_zextend */
238 0, /* imm_offset */
241 static const struct cpu_addrcost_table xgene1_addrcost_table =
244 1, /* hi */
245 0, /* si */
246 0, /* di */
247 1, /* ti */
249 1, /* pre_modify */
250 0, /* post_modify */
251 0, /* register_offset */
252 1, /* register_sextend */
253 1, /* register_zextend */
254 0, /* imm_offset */
257 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
276 1, /* hi */
277 1, /* si */
278 1, /* di */
279 2, /* ti */
281 0, /* pre_modify */
282 0, /* post_modify */
283 2, /* register_offset */
284 3, /* register_sextend */
285 3, /* register_zextend */
286 0, /* imm_offset */
289 static const struct cpu_regmove_cost generic_regmove_cost =
291 1, /* GP2GP */
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost. */
294 5, /* GP2FP */
295 5, /* FP2GP */
296 2 /* FP2FP */
299 static const struct cpu_regmove_cost cortexa57_regmove_cost =
301 1, /* GP2GP */
302 /* Avoid the use of slow int<->fp moves for spilling by setting
303 their cost higher than memmov_cost. */
304 5, /* GP2FP */
305 5, /* FP2GP */
306 2 /* FP2FP */
309 static const struct cpu_regmove_cost cortexa53_regmove_cost =
311 1, /* GP2GP */
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
314 5, /* GP2FP */
315 5, /* FP2GP */
316 2 /* FP2FP */
319 static const struct cpu_regmove_cost exynosm1_regmove_cost =
321 1, /* GP2GP */
322 /* Avoid the use of slow int<->fp moves for spilling by setting
323 their cost higher than memmov_cost (actually 4 and 9). */
324 9, /* GP2FP */
325 9, /* FP2GP */
326 1 /* FP2FP */
329 static const struct cpu_regmove_cost thunderx_regmove_cost =
331 2, /* GP2GP */
332 2, /* GP2FP */
333 6, /* FP2GP */
334 4 /* FP2FP */
337 static const struct cpu_regmove_cost xgene1_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 8, /* GP2FP */
343 8, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
349 2, /* GP2GP */
350 /* Avoid the use of int<->fp moves for spilling. */
351 6, /* GP2FP */
352 6, /* FP2GP */
353 4 /* FP2FP */
356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
358 1, /* GP2GP */
359 /* Avoid the use of int<->fp moves for spilling. */
360 8, /* GP2FP */
361 8, /* FP2GP */
362 4 /* FP2FP */
365 /* Generic costs for vector insn classes. */
366 static const struct cpu_vector_cost generic_vector_cost =
368 1, /* scalar_int_stmt_cost */
369 1, /* scalar_fp_stmt_cost */
370 1, /* scalar_load_cost */
371 1, /* scalar_store_cost */
372 1, /* vec_int_stmt_cost */
373 1, /* vec_fp_stmt_cost */
374 2, /* vec_permute_cost */
375 1, /* vec_to_scalar_cost */
376 1, /* scalar_to_vec_cost */
377 1, /* vec_align_load_cost */
378 1, /* vec_unalign_load_cost */
379 1, /* vec_unalign_store_cost */
380 1, /* vec_store_cost */
381 3, /* cond_taken_branch_cost */
382 1 /* cond_not_taken_branch_cost */
385 /* ThunderX costs for vector insn classes. */
386 static const struct cpu_vector_cost thunderx_vector_cost =
388 1, /* scalar_int_stmt_cost */
389 1, /* scalar_fp_stmt_cost */
390 3, /* scalar_load_cost */
391 1, /* scalar_store_cost */
392 4, /* vec_int_stmt_cost */
393 4, /* vec_fp_stmt_cost */
394 4, /* vec_permute_cost */
395 2, /* vec_to_scalar_cost */
396 2, /* scalar_to_vec_cost */
397 3, /* vec_align_load_cost */
398 10, /* vec_unalign_load_cost */
399 10, /* vec_unalign_store_cost */
400 1, /* vec_store_cost */
401 3, /* cond_taken_branch_cost */
402 3 /* cond_not_taken_branch_cost */
405 /* Cortex-A57 costs for vector insn classes. */
406 static const struct cpu_vector_cost cortexa57_vector_cost =
408 1, /* scalar_int_stmt_cost */
409 1, /* scalar_fp_stmt_cost */
410 4, /* scalar_load_cost */
411 1, /* scalar_store_cost */
412 2, /* vec_int_stmt_cost */
413 2, /* vec_fp_stmt_cost */
414 3, /* vec_permute_cost */
415 8, /* vec_to_scalar_cost */
416 8, /* scalar_to_vec_cost */
417 4, /* vec_align_load_cost */
418 4, /* vec_unalign_load_cost */
419 1, /* vec_unalign_store_cost */
420 1, /* vec_store_cost */
421 1, /* cond_taken_branch_cost */
422 1 /* cond_not_taken_branch_cost */
425 static const struct cpu_vector_cost exynosm1_vector_cost =
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 5, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 3, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 3, /* vec_permute_cost */
434 3, /* vec_to_scalar_cost */
435 3, /* scalar_to_vec_cost */
436 5, /* vec_align_load_cost */
437 5, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 1, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
444 /* X-Gene 1 costs for vector insn classes. */
445 static const struct cpu_vector_cost xgene1_vector_cost =
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 5, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 2, /* vec_int_stmt_cost */
452 2, /* vec_fp_stmt_cost */
453 2, /* vec_permute_cost */
454 4, /* vec_to_scalar_cost */
455 4, /* scalar_to_vec_cost */
456 10, /* vec_align_load_cost */
457 10, /* vec_unalign_load_cost */
458 2, /* vec_unalign_store_cost */
459 2, /* vec_store_cost */
460 2, /* cond_taken_branch_cost */
461 1 /* cond_not_taken_branch_cost */
464 /* Costs for vector insn classes for Vulcan. */
465 static const struct cpu_vector_cost thunderx2t99_vector_cost =
467 1, /* scalar_int_stmt_cost */
468 6, /* scalar_fp_stmt_cost */
469 4, /* scalar_load_cost */
470 1, /* scalar_store_cost */
471 5, /* vec_int_stmt_cost */
472 6, /* vec_fp_stmt_cost */
473 3, /* vec_permute_cost */
474 6, /* vec_to_scalar_cost */
475 5, /* scalar_to_vec_cost */
476 8, /* vec_align_load_cost */
477 8, /* vec_unalign_load_cost */
478 4, /* vec_unalign_store_cost */
479 4, /* vec_store_cost */
480 2, /* cond_taken_branch_cost */
481 1 /* cond_not_taken_branch_cost */
484 /* Generic costs for branch instructions. */
485 static const struct cpu_branch_cost generic_branch_cost =
487 1, /* Predictable. */
488 3 /* Unpredictable. */
491 /* Branch costs for Cortex-A57. */
492 static const struct cpu_branch_cost cortexa57_branch_cost =
494 1, /* Predictable. */
495 3 /* Unpredictable. */
498 /* Branch costs for Vulcan. */
499 static const struct cpu_branch_cost thunderx2t99_branch_cost =
501 1, /* Predictable. */
502 3 /* Unpredictable. */
505 /* Generic approximation modes. */
506 static const cpu_approx_modes generic_approx_modes =
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_NONE /* recip_sqrt */
513 /* Approximation modes for Exynos M1. */
514 static const cpu_approx_modes exynosm1_approx_modes =
516 AARCH64_APPROX_NONE, /* division */
517 AARCH64_APPROX_ALL, /* sqrt */
518 AARCH64_APPROX_ALL /* recip_sqrt */
521 /* Approximation modes for X-Gene 1. */
522 static const cpu_approx_modes xgene1_approx_modes =
524 AARCH64_APPROX_NONE, /* division */
525 AARCH64_APPROX_NONE, /* sqrt */
526 AARCH64_APPROX_ALL /* recip_sqrt */
529 static const struct tune_params generic_tunings =
531 &cortexa57_extra_costs,
532 &generic_addrcost_table,
533 &generic_regmove_cost,
534 &generic_vector_cost,
535 &generic_branch_cost,
536 &generic_approx_modes,
537 4, /* memmov_cost */
538 2, /* issue_rate */
539 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
540 8, /* function_align. */
541 8, /* jump_align. */
542 4, /* loop_align. */
543 2, /* int_reassoc_width. */
544 4, /* fp_reassoc_width. */
545 1, /* vec_reassoc_width. */
546 2, /* min_div_recip_mul_sf. */
547 2, /* min_div_recip_mul_df. */
548 0, /* max_case_values. */
549 0, /* cache_line_size. */
550 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
551 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
554 static const struct tune_params cortexa35_tunings =
556 &cortexa53_extra_costs,
557 &generic_addrcost_table,
558 &cortexa53_regmove_cost,
559 &generic_vector_cost,
560 &cortexa57_branch_cost,
561 &generic_approx_modes,
562 4, /* memmov_cost */
563 1, /* issue_rate */
564 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
565 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
566 16, /* function_align. */
567 8, /* jump_align. */
568 8, /* loop_align. */
569 2, /* int_reassoc_width. */
570 4, /* fp_reassoc_width. */
571 1, /* vec_reassoc_width. */
572 2, /* min_div_recip_mul_sf. */
573 2, /* min_div_recip_mul_df. */
574 0, /* max_case_values. */
575 0, /* cache_line_size. */
576 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
577 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
580 static const struct tune_params cortexa53_tunings =
582 &cortexa53_extra_costs,
583 &generic_addrcost_table,
584 &cortexa53_regmove_cost,
585 &generic_vector_cost,
586 &cortexa57_branch_cost,
587 &generic_approx_modes,
588 4, /* memmov_cost */
589 2, /* issue_rate */
590 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
591 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
592 16, /* function_align. */
593 8, /* jump_align. */
594 8, /* loop_align. */
595 2, /* int_reassoc_width. */
596 4, /* fp_reassoc_width. */
597 1, /* vec_reassoc_width. */
598 2, /* min_div_recip_mul_sf. */
599 2, /* min_div_recip_mul_df. */
600 0, /* max_case_values. */
601 0, /* cache_line_size. */
602 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
603 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
606 static const struct tune_params cortexa57_tunings =
608 &cortexa57_extra_costs,
609 &cortexa57_addrcost_table,
610 &cortexa57_regmove_cost,
611 &cortexa57_vector_cost,
612 &cortexa57_branch_cost,
613 &generic_approx_modes,
614 4, /* memmov_cost */
615 3, /* issue_rate */
616 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
617 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
618 16, /* function_align. */
619 8, /* jump_align. */
620 8, /* loop_align. */
621 2, /* int_reassoc_width. */
622 4, /* fp_reassoc_width. */
623 1, /* vec_reassoc_width. */
624 2, /* min_div_recip_mul_sf. */
625 2, /* min_div_recip_mul_df. */
626 0, /* max_case_values. */
627 0, /* cache_line_size. */
628 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
629 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
632 static const struct tune_params cortexa72_tunings =
634 &cortexa57_extra_costs,
635 &cortexa57_addrcost_table,
636 &cortexa57_regmove_cost,
637 &cortexa57_vector_cost,
638 &cortexa57_branch_cost,
639 &generic_approx_modes,
640 4, /* memmov_cost */
641 3, /* issue_rate */
642 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
643 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
644 16, /* function_align. */
645 8, /* jump_align. */
646 8, /* loop_align. */
647 2, /* int_reassoc_width. */
648 4, /* fp_reassoc_width. */
649 1, /* vec_reassoc_width. */
650 2, /* min_div_recip_mul_sf. */
651 2, /* min_div_recip_mul_df. */
652 0, /* max_case_values. */
653 0, /* cache_line_size. */
654 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
655 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
658 static const struct tune_params cortexa73_tunings =
660 &cortexa57_extra_costs,
661 &cortexa57_addrcost_table,
662 &cortexa57_regmove_cost,
663 &cortexa57_vector_cost,
664 &cortexa57_branch_cost,
665 &generic_approx_modes,
666 4, /* memmov_cost. */
667 2, /* issue_rate. */
668 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
669 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
670 16, /* function_align. */
671 8, /* jump_align. */
672 8, /* loop_align. */
673 2, /* int_reassoc_width. */
674 4, /* fp_reassoc_width. */
675 1, /* vec_reassoc_width. */
676 2, /* min_div_recip_mul_sf. */
677 2, /* min_div_recip_mul_df. */
678 0, /* max_case_values. */
679 0, /* cache_line_size. */
680 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
681 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
684 static const struct tune_params exynosm1_tunings =
686 &exynosm1_extra_costs,
687 &exynosm1_addrcost_table,
688 &exynosm1_regmove_cost,
689 &exynosm1_vector_cost,
690 &generic_branch_cost,
691 &exynosm1_approx_modes,
692 4, /* memmov_cost */
693 3, /* issue_rate */
694 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
695 4, /* function_align. */
696 4, /* jump_align. */
697 4, /* loop_align. */
698 2, /* int_reassoc_width. */
699 4, /* fp_reassoc_width. */
700 1, /* vec_reassoc_width. */
701 2, /* min_div_recip_mul_sf. */
702 2, /* min_div_recip_mul_df. */
703 48, /* max_case_values. */
704 64, /* cache_line_size. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
709 static const struct tune_params thunderx_tunings =
711 &thunderx_extra_costs,
712 &generic_addrcost_table,
713 &thunderx_regmove_cost,
714 &thunderx_vector_cost,
715 &generic_branch_cost,
716 &generic_approx_modes,
717 6, /* memmov_cost */
718 2, /* issue_rate */
719 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
720 8, /* function_align. */
721 8, /* jump_align. */
722 8, /* loop_align. */
723 2, /* int_reassoc_width. */
724 4, /* fp_reassoc_width. */
725 1, /* vec_reassoc_width. */
726 2, /* min_div_recip_mul_sf. */
727 2, /* min_div_recip_mul_df. */
728 0, /* max_case_values. */
729 0, /* cache_line_size. */
730 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
731 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
734 static const struct tune_params xgene1_tunings =
736 &xgene1_extra_costs,
737 &xgene1_addrcost_table,
738 &xgene1_regmove_cost,
739 &xgene1_vector_cost,
740 &generic_branch_cost,
741 &xgene1_approx_modes,
742 6, /* memmov_cost */
743 4, /* issue_rate */
744 AARCH64_FUSE_NOTHING, /* fusible_ops */
745 16, /* function_align. */
746 8, /* jump_align. */
747 16, /* loop_align. */
748 2, /* int_reassoc_width. */
749 4, /* fp_reassoc_width. */
750 1, /* vec_reassoc_width. */
751 2, /* min_div_recip_mul_sf. */
752 2, /* min_div_recip_mul_df. */
753 0, /* max_case_values. */
754 0, /* cache_line_size. */
755 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
756 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
759 static const struct tune_params qdf24xx_tunings =
761 &qdf24xx_extra_costs,
762 &qdf24xx_addrcost_table,
763 &qdf24xx_regmove_cost,
764 &generic_vector_cost,
765 &generic_branch_cost,
766 &generic_approx_modes,
767 4, /* memmov_cost */
768 4, /* issue_rate */
769 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
770 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
771 16, /* function_align. */
772 8, /* jump_align. */
773 16, /* loop_align. */
774 2, /* int_reassoc_width. */
775 4, /* fp_reassoc_width. */
776 1, /* vec_reassoc_width. */
777 2, /* min_div_recip_mul_sf. */
778 2, /* min_div_recip_mul_df. */
779 0, /* max_case_values. */
780 64, /* cache_line_size. */
781 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
782 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
785 static const struct tune_params thunderx2t99_tunings =
787 &thunderx2t99_extra_costs,
788 &thunderx2t99_addrcost_table,
789 &thunderx2t99_regmove_cost,
790 &thunderx2t99_vector_cost,
791 &thunderx2t99_branch_cost,
792 &generic_approx_modes,
793 4, /* memmov_cost. */
794 4, /* issue_rate. */
795 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops */
796 16, /* function_align. */
797 8, /* jump_align. */
798 16, /* loop_align. */
799 3, /* int_reassoc_width. */
800 2, /* fp_reassoc_width. */
801 2, /* vec_reassoc_width. */
802 2, /* min_div_recip_mul_sf. */
803 2, /* min_div_recip_mul_df. */
804 0, /* max_case_values. */
805 64, /* cache_line_size. */
806 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
810 /* Support for fine-grained override of the tuning structures. */
811 struct aarch64_tuning_override_function
813 const char* name;
814 void (*parse_override)(const char*, struct tune_params*);
817 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
818 static void aarch64_parse_tune_string (const char*, struct tune_params*);
820 static const struct aarch64_tuning_override_function
821 aarch64_tuning_override_functions[] =
823 { "fuse", aarch64_parse_fuse_string },
824 { "tune", aarch64_parse_tune_string },
825 { NULL, NULL }
828 /* A processor implementing AArch64. */
829 struct processor
831 const char *const name;
832 enum aarch64_processor ident;
833 enum aarch64_processor sched_core;
834 enum aarch64_arch arch;
835 unsigned architecture_version;
836 const unsigned long flags;
837 const struct tune_params *const tune;
840 /* Architectures implementing AArch64. */
841 static const struct processor all_architectures[] =
843 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
844 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
845 #include "aarch64-arches.def"
846 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
849 /* Processor cores implementing AArch64. */
850 static const struct processor all_cores[] =
852 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
853 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
854 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
855 FLAGS, &COSTS##_tunings},
856 #include "aarch64-cores.def"
857 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
858 AARCH64_FL_FOR_ARCH8, &generic_tunings},
859 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
863 /* Target specification. These are populated by the -march, -mtune, -mcpu
864 handling code or by target attributes. */
865 static const struct processor *selected_arch;
866 static const struct processor *selected_cpu;
867 static const struct processor *selected_tune;
869 /* The current tuning set. */
870 struct tune_params aarch64_tune_params = generic_tunings;
872 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
874 /* An ISA extension in the co-processor and main instruction set space. */
875 struct aarch64_option_extension
877 const char *const name;
878 const unsigned long flags_on;
879 const unsigned long flags_off;
882 typedef enum aarch64_cond_code
884 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
885 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
886 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
888 aarch64_cc;
890 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
892 /* The condition codes of the processor, and the inverse function. */
893 static const char * const aarch64_condition_codes[] =
895 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
896 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
899 /* Generate code to enable conditional branches in functions over 1 MiB. */
900 const char *
901 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
902 const char * branch_format)
904 rtx_code_label * tmp_label = gen_label_rtx ();
905 char label_buf[256];
906 char buffer[128];
907 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
908 CODE_LABEL_NUMBER (tmp_label));
909 const char *label_ptr = targetm.strip_name_encoding (label_buf);
910 rtx dest_label = operands[pos_label];
911 operands[pos_label] = tmp_label;
913 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
914 output_asm_insn (buffer, operands);
916 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
917 operands[pos_label] = dest_label;
918 output_asm_insn (buffer, operands);
919 return "";
922 void
923 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
925 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
926 if (TARGET_GENERAL_REGS_ONLY)
927 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
928 else
929 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
932 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
933 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
934 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
935 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
936 cost (in this case the best class is the lowest cost one). Using ALL_REGS
937 irrespective of its cost results in bad allocations with many redundant
938 int<->FP moves which are expensive on various cores.
939 To avoid this we don't allow ALL_REGS as the allocno class, but force a
940 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
941 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
942 Otherwise set the allocno class depending on the mode.
943 The result of this is that it is no longer inefficient to have a higher
944 memory move cost than the register move cost.
947 static reg_class_t
948 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
949 reg_class_t best_class)
951 enum machine_mode mode;
953 if (allocno_class != ALL_REGS)
954 return allocno_class;
956 if (best_class != ALL_REGS)
957 return best_class;
959 mode = PSEUDO_REGNO_MODE (regno);
960 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
963 static unsigned int
964 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
966 if (GET_MODE_UNIT_SIZE (mode) == 4)
967 return aarch64_tune_params.min_div_recip_mul_sf;
968 return aarch64_tune_params.min_div_recip_mul_df;
971 static int
972 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
973 enum machine_mode mode)
975 if (VECTOR_MODE_P (mode))
976 return aarch64_tune_params.vec_reassoc_width;
977 if (INTEGRAL_MODE_P (mode))
978 return aarch64_tune_params.int_reassoc_width;
979 if (FLOAT_MODE_P (mode))
980 return aarch64_tune_params.fp_reassoc_width;
981 return 1;
984 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
985 unsigned
986 aarch64_dbx_register_number (unsigned regno)
988 if (GP_REGNUM_P (regno))
989 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
990 else if (regno == SP_REGNUM)
991 return AARCH64_DWARF_SP;
992 else if (FP_REGNUM_P (regno))
993 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
995 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
996 equivalent DWARF register. */
997 return DWARF_FRAME_REGISTERS;
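/* Editorial note, assuming the usual AAPCS64 DWARF numbering used by
   AARCH64_DWARF_R0 (0), AARCH64_DWARF_SP (31) and AARCH64_DWARF_V0 (64):
   x0..x30 map to 0..30, sp to 31 and v0..v31 to 64..95, while any other
   register (e.g. the condition flags) yields DWARF_FRAME_REGISTERS,
   i.e. "no DWARF equivalent".  */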
1000 /* Return TRUE if MODE is any of the large INT modes. */
1001 static bool
1002 aarch64_vect_struct_mode_p (machine_mode mode)
1004 return mode == OImode || mode == CImode || mode == XImode;
1007 /* Return TRUE if MODE is any of the vector modes. */
1008 static bool
1009 aarch64_vector_mode_p (machine_mode mode)
1011 return aarch64_vector_mode_supported_p (mode)
1012 || aarch64_vect_struct_mode_p (mode);
1015 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1016 static bool
1017 aarch64_array_mode_supported_p (machine_mode mode,
1018 unsigned HOST_WIDE_INT nelems)
1020 if (TARGET_SIMD
1021 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1022 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1023 && (nelems >= 2 && nelems <= 4))
1024 return true;
1026 return false;
1029 /* Implement HARD_REGNO_NREGS. */
1032 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1034 switch (aarch64_regno_regclass (regno))
1036 case FP_REGS:
1037 case FP_LO_REGS:
1038 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1039 default:
1040 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1042 gcc_unreachable ();
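/* Editorial worked example, assuming UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16 as on AArch64: a TImode (16-byte) value in the
   general registers needs (16 + 8 - 1) / 8 == 2 registers, while an
   OImode (32-byte) value in the FP/SIMD registers needs
   (32 + 16 - 1) / 16 == 2 vector registers.  */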
1045 /* Implement HARD_REGNO_MODE_OK. */
1048 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1050 if (GET_MODE_CLASS (mode) == MODE_CC)
1051 return regno == CC_REGNUM;
1053 if (regno == SP_REGNUM)
1054 /* The purpose of comparing with ptr_mode is to support the
1055 global register variable associated with the stack pointer
1056 register via the syntax of asm ("wsp") in ILP32. */
1057 return mode == Pmode || mode == ptr_mode;
1059 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1060 return mode == Pmode;
1062 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1063 return 1;
1065 if (FP_REGNUM_P (regno))
1067 if (aarch64_vect_struct_mode_p (mode))
1068 return
1069 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1070 else
1071 return 1;
1074 return 0;
1077 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1078 machine_mode
1079 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1080 machine_mode mode)
1082 /* Handle modes that fit within single registers. */
1083 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1085 if (GET_MODE_SIZE (mode) >= 4)
1086 return mode;
1087 else
1088 return SImode;
1090 /* Fall back to generic for multi-reg and very large modes. */
1091 else
1092 return choose_hard_reg_mode (regno, nregs, false);
1095 /* Return true if calls to DECL should be treated as
1096 long-calls (i.e. called via a register). */
1097 static bool
1098 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1100 return false;
1103 /* Return true if calls to symbol-ref SYM should be treated as
1104 long-calls (i.e. called via a register). */
1105 bool
1106 aarch64_is_long_call_p (rtx sym)
1108 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1111 /* Return true if calls to symbol-ref SYM should not go through
1112 plt stubs. */
1114 bool
1115 aarch64_is_noplt_call_p (rtx sym)
1117 const_tree decl = SYMBOL_REF_DECL (sym);
1119 if (flag_pic
1120 && decl
1121 && (!flag_plt
1122 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1123 && !targetm.binds_local_p (decl))
1124 return true;
1126 return false;
1129 /* Return true if the offsets to a zero/sign-extract operation
1130 represent an expression that matches an extend operation. The
1131 operands represent the parameters from
1133 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1134 bool
1135 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1136 rtx extract_imm)
1138 HOST_WIDE_INT mult_val, extract_val;
1140 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1141 return false;
1143 mult_val = INTVAL (mult_imm);
1144 extract_val = INTVAL (extract_imm);
1146 if (extract_val > 8
1147 && extract_val < GET_MODE_BITSIZE (mode)
1148 && exact_log2 (extract_val & ~7) > 0
1149 && (extract_val & 7) <= 4
1150 && mult_val == (1 << (extract_val & 7)))
1151 return true;
1153 return false;
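/* Editorial worked example: with MODE == DImode, EXTRACT_IMM == 34 and
   MULT_IMM == 4, the low three bits of 34 give a shift of 2 and the
   remaining bits give an extend width of 32, and MULT_IMM == 1 << 2, so
   this matches a 32-bit extend scaled by 4 (the extended-register form
   "sxtw #2" or "uxtw #2", depending on the extract's sign).  */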
1156 /* Emit an insn that's a simple single-set. Both the operands must be
1157 known to be valid. */
1158 inline static rtx_insn *
1159 emit_set_insn (rtx x, rtx y)
1161 return emit_insn (gen_rtx_SET (x, y));
1164 /* X and Y are two things to compare using CODE. Emit the compare insn and
1165 return the rtx for register 0 in the proper mode. */
1167 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1169 machine_mode mode = SELECT_CC_MODE (code, x, y);
1170 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1172 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1173 return cc_reg;
1176 /* Build the SYMBOL_REF for __tls_get_addr. */
1178 static GTY(()) rtx tls_get_addr_libfunc;
1181 aarch64_tls_get_addr (void)
1183 if (!tls_get_addr_libfunc)
1184 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1185 return tls_get_addr_libfunc;
1188 /* Return the TLS model to use for ADDR. */
1190 static enum tls_model
1191 tls_symbolic_operand_type (rtx addr)
1193 enum tls_model tls_kind = TLS_MODEL_NONE;
1194 rtx sym, addend;
1196 if (GET_CODE (addr) == CONST)
1198 split_const (addr, &sym, &addend);
1199 if (GET_CODE (sym) == SYMBOL_REF)
1200 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1202 else if (GET_CODE (addr) == SYMBOL_REF)
1203 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1205 return tls_kind;
1208 /* We'll allow lo_sum's in our legitimate addresses
1209 so that combine can take care of combining addresses where
1210 necessary, but for generation purposes, we'll generate the address
1211 as:
1212 RTL Absolute
1213 tmp = hi (symbol_ref); adrp x1, foo
1214 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1217 PIC TLS
1218 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1219 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1220 bl __tls_get_addr
1223 Load TLS symbol, depending on TLS mechanism and TLS access model.
1225 Global Dynamic - Traditional TLS:
1226 adrp tmp, :tlsgd:imm
1227 add dest, tmp, #:tlsgd_lo12:imm
1228 bl __tls_get_addr
1230 Global Dynamic - TLS Descriptors:
1231 adrp dest, :tlsdesc:imm
1232 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1233 add dest, dest, #:tlsdesc_lo12:imm
1234 blr tmp
1235 mrs tp, tpidr_el0
1236 add dest, dest, tp
1238 Initial Exec:
1239 mrs tp, tpidr_el0
1240 adrp tmp, :gottprel:imm
1241 ldr dest, [tmp, #:gottprel_lo12:imm]
1242 add dest, dest, tp
1244 Local Exec:
1245 mrs tp, tpidr_el0
1246 add t0, tp, #:tprel_hi12:imm, lsl #12
1247 add t0, t0, #:tprel_lo12_nc:imm
1250 static void
1251 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1252 enum aarch64_symbol_type type)
1254 switch (type)
1256 case SYMBOL_SMALL_ABSOLUTE:
1258 /* In ILP32, the mode of dest can be either SImode or DImode. */
1259 rtx tmp_reg = dest;
1260 machine_mode mode = GET_MODE (dest);
1262 gcc_assert (mode == Pmode || mode == ptr_mode);
1264 if (can_create_pseudo_p ())
1265 tmp_reg = gen_reg_rtx (mode);
1267 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1268 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1269 return;
1272 case SYMBOL_TINY_ABSOLUTE:
1273 emit_insn (gen_rtx_SET (dest, imm));
1274 return;
1276 case SYMBOL_SMALL_GOT_28K:
1278 machine_mode mode = GET_MODE (dest);
1279 rtx gp_rtx = pic_offset_table_rtx;
1280 rtx insn;
1281 rtx mem;
1283 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1284 here before rtl expansion. Tree IVOPTs will generate an rtl pattern to
1285 decide rtx costs, in which case pic_offset_table_rtx is not
1286 initialized. In that case there is no need to generate the first adrp
1287 instruction, as the final cost for global variable access is
1288 one instruction. */
1289 if (gp_rtx != NULL)
1291 /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1292 are using the page base as the GOT base, the first page may be wasted;
1293 in the worst case there is only 28K of space for the GOT).
1295 The generated instruction sequence for accessing a global variable
1298 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1300 Only one instruction is needed. But we must initialize
1301 pic_offset_table_rtx properly. We generate an initialization insn for
1302 every global access, and allow CSE to remove all redundant ones.
1304 The final instruction sequence will look like the following
1305 for multiple global variable accesses.
1307 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1309 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1310 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1311 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1312 ... */
1314 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1315 crtl->uses_pic_offset_table = 1;
1316 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1318 if (mode != GET_MODE (gp_rtx))
1319 gp_rtx = gen_lowpart (mode, gp_rtx);
1323 if (mode == ptr_mode)
1325 if (mode == DImode)
1326 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1327 else
1328 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1330 mem = XVECEXP (SET_SRC (insn), 0, 0);
1332 else
1334 gcc_assert (mode == Pmode);
1336 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1337 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1340 /* The operand is expected to be a MEM. Whenever the related insn
1341 pattern changes, the above code which calculates MEM should be
1342 updated. */
1343 gcc_assert (GET_CODE (mem) == MEM);
1344 MEM_READONLY_P (mem) = 1;
1345 MEM_NOTRAP_P (mem) = 1;
1346 emit_insn (insn);
1347 return;
1350 case SYMBOL_SMALL_GOT_4G:
1352 /* In ILP32, the mode of dest can be either SImode or DImode,
1353 while the got entry is always of SImode size. The mode of
1354 dest depends on how dest is used: if dest is assigned to a
1355 pointer (e.g. in memory), it has SImode; it may have
1356 DImode if dest is dereferenced to access the memory.
1357 This is why we have to handle three different ldr_got_small
1358 patterns here (two patterns for ILP32). */
1360 rtx insn;
1361 rtx mem;
1362 rtx tmp_reg = dest;
1363 machine_mode mode = GET_MODE (dest);
1365 if (can_create_pseudo_p ())
1366 tmp_reg = gen_reg_rtx (mode);
1368 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1369 if (mode == ptr_mode)
1371 if (mode == DImode)
1372 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1373 else
1374 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1376 mem = XVECEXP (SET_SRC (insn), 0, 0);
1378 else
1380 gcc_assert (mode == Pmode);
1382 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1383 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1386 gcc_assert (GET_CODE (mem) == MEM);
1387 MEM_READONLY_P (mem) = 1;
1388 MEM_NOTRAP_P (mem) = 1;
1389 emit_insn (insn);
1390 return;
1393 case SYMBOL_SMALL_TLSGD:
1395 rtx_insn *insns;
1396 machine_mode mode = GET_MODE (dest);
1397 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1399 start_sequence ();
1400 if (TARGET_ILP32)
1401 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1402 else
1403 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1404 insns = get_insns ();
1405 end_sequence ();
1407 RTL_CONST_CALL_P (insns) = 1;
1408 emit_libcall_block (insns, dest, result, imm);
1409 return;
1412 case SYMBOL_SMALL_TLSDESC:
1414 machine_mode mode = GET_MODE (dest);
1415 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1416 rtx tp;
1418 gcc_assert (mode == Pmode || mode == ptr_mode);
1420 /* In ILP32, the got entry is always of SImode size. Unlike
1421 small GOT, the dest is fixed at reg 0. */
1422 if (TARGET_ILP32)
1423 emit_insn (gen_tlsdesc_small_si (imm));
1424 else
1425 emit_insn (gen_tlsdesc_small_di (imm));
1426 tp = aarch64_load_tp (NULL);
1428 if (mode != Pmode)
1429 tp = gen_lowpart (mode, tp);
1431 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1432 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1433 return;
1436 case SYMBOL_SMALL_TLSIE:
1438 /* In ILP32, the mode of dest can be either SImode or DImode,
1439 while the got entry is always of SImode size. The mode of
1440 dest depends on how dest is used: if dest is assigned to a
1441 pointer (e.g. in memory), it has SImode; it may have
1442 DImode if dest is dereferenced to access the memory.
1443 This is why we have to handle three different tlsie_small
1444 patterns here (two patterns for ILP32). */
1445 machine_mode mode = GET_MODE (dest);
1446 rtx tmp_reg = gen_reg_rtx (mode);
1447 rtx tp = aarch64_load_tp (NULL);
1449 if (mode == ptr_mode)
1451 if (mode == DImode)
1452 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1453 else
1455 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1456 tp = gen_lowpart (mode, tp);
1459 else
1461 gcc_assert (mode == Pmode);
1462 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1465 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1466 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1467 return;
1470 case SYMBOL_TLSLE12:
1471 case SYMBOL_TLSLE24:
1472 case SYMBOL_TLSLE32:
1473 case SYMBOL_TLSLE48:
1475 machine_mode mode = GET_MODE (dest);
1476 rtx tp = aarch64_load_tp (NULL);
1478 if (mode != Pmode)
1479 tp = gen_lowpart (mode, tp);
1481 switch (type)
1483 case SYMBOL_TLSLE12:
1484 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1485 (dest, tp, imm));
1486 break;
1487 case SYMBOL_TLSLE24:
1488 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1489 (dest, tp, imm));
1490 break;
1491 case SYMBOL_TLSLE32:
1492 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1493 (dest, imm));
1494 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1495 (dest, dest, tp));
1496 break;
1497 case SYMBOL_TLSLE48:
1498 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1499 (dest, imm));
1500 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1501 (dest, dest, tp));
1502 break;
1503 default:
1504 gcc_unreachable ();
1507 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1508 return;
1511 case SYMBOL_TINY_GOT:
1512 emit_insn (gen_ldr_got_tiny (dest, imm));
1513 return;
1515 case SYMBOL_TINY_TLSIE:
1517 machine_mode mode = GET_MODE (dest);
1518 rtx tp = aarch64_load_tp (NULL);
1520 if (mode == ptr_mode)
1522 if (mode == DImode)
1523 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1524 else
1526 tp = gen_lowpart (mode, tp);
1527 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1530 else
1532 gcc_assert (mode == Pmode);
1533 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1536 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1537 return;
1540 default:
1541 gcc_unreachable ();
1545 /* Emit a move from SRC to DEST. Assume that the move expanders can
1546 handle all moves if !can_create_pseudo_p (). The distinction is
1547 important because, unlike emit_move_insn, the move expanders know
1548 how to force Pmode objects into the constant pool even when the
1549 constant pool address is not itself legitimate. */
1550 static rtx
1551 aarch64_emit_move (rtx dest, rtx src)
1553 return (can_create_pseudo_p ()
1554 ? emit_move_insn (dest, src)
1555 : emit_move_insn_1 (dest, src));
1558 /* Split a 128-bit move operation into two 64-bit move operations,
1559 taking care to handle partial overlap of register to register
1560 copies. Special cases are needed when moving between GP regs and
1561 FP regs. SRC can be a register, constant or memory; DST a register
1562 or memory. If either operand is memory it must not have any side
1563 effects. */
1564 void
1565 aarch64_split_128bit_move (rtx dst, rtx src)
1567 rtx dst_lo, dst_hi;
1568 rtx src_lo, src_hi;
1570 machine_mode mode = GET_MODE (dst);
1572 gcc_assert (mode == TImode || mode == TFmode);
1573 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1574 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1576 if (REG_P (dst) && REG_P (src))
1578 int src_regno = REGNO (src);
1579 int dst_regno = REGNO (dst);
1581 /* Handle FP <-> GP regs. */
1582 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1584 src_lo = gen_lowpart (word_mode, src);
1585 src_hi = gen_highpart (word_mode, src);
1587 if (mode == TImode)
1589 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1590 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1592 else
1594 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1595 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1597 return;
1599 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1601 dst_lo = gen_lowpart (word_mode, dst);
1602 dst_hi = gen_highpart (word_mode, dst);
1604 if (mode == TImode)
1606 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1607 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1609 else
1611 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1612 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1614 return;
1618 dst_lo = gen_lowpart (word_mode, dst);
1619 dst_hi = gen_highpart (word_mode, dst);
1620 src_lo = gen_lowpart (word_mode, src);
1621 src_hi = gen_highpart_mode (word_mode, mode, src);
1623 /* At most one pairing may overlap. */
1624 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1626 aarch64_emit_move (dst_hi, src_hi);
1627 aarch64_emit_move (dst_lo, src_lo);
1629 else
1631 aarch64_emit_move (dst_lo, src_lo);
1632 aarch64_emit_move (dst_hi, src_hi);
1636 bool
1637 aarch64_split_128bit_move_p (rtx dst, rtx src)
1639 return (! REG_P (src)
1640 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1643 /* Split a complex SIMD combine. */
1645 void
1646 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1648 machine_mode src_mode = GET_MODE (src1);
1649 machine_mode dst_mode = GET_MODE (dst);
1651 gcc_assert (VECTOR_MODE_P (dst_mode));
1653 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1655 rtx (*gen) (rtx, rtx, rtx);
1657 switch (src_mode)
1659 case V8QImode:
1660 gen = gen_aarch64_simd_combinev8qi;
1661 break;
1662 case V4HImode:
1663 gen = gen_aarch64_simd_combinev4hi;
1664 break;
1665 case V2SImode:
1666 gen = gen_aarch64_simd_combinev2si;
1667 break;
1668 case V4HFmode:
1669 gen = gen_aarch64_simd_combinev4hf;
1670 break;
1671 case V2SFmode:
1672 gen = gen_aarch64_simd_combinev2sf;
1673 break;
1674 case DImode:
1675 gen = gen_aarch64_simd_combinedi;
1676 break;
1677 case DFmode:
1678 gen = gen_aarch64_simd_combinedf;
1679 break;
1680 default:
1681 gcc_unreachable ();
1684 emit_insn (gen (dst, src1, src2));
1685 return;
1689 /* Split a complex SIMD move. */
1691 void
1692 aarch64_split_simd_move (rtx dst, rtx src)
1694 machine_mode src_mode = GET_MODE (src);
1695 machine_mode dst_mode = GET_MODE (dst);
1697 gcc_assert (VECTOR_MODE_P (dst_mode));
1699 if (REG_P (dst) && REG_P (src))
1701 rtx (*gen) (rtx, rtx);
1703 gcc_assert (VECTOR_MODE_P (src_mode));
1705 switch (src_mode)
1707 case V16QImode:
1708 gen = gen_aarch64_split_simd_movv16qi;
1709 break;
1710 case V8HImode:
1711 gen = gen_aarch64_split_simd_movv8hi;
1712 break;
1713 case V4SImode:
1714 gen = gen_aarch64_split_simd_movv4si;
1715 break;
1716 case V2DImode:
1717 gen = gen_aarch64_split_simd_movv2di;
1718 break;
1719 case V8HFmode:
1720 gen = gen_aarch64_split_simd_movv8hf;
1721 break;
1722 case V4SFmode:
1723 gen = gen_aarch64_split_simd_movv4sf;
1724 break;
1725 case V2DFmode:
1726 gen = gen_aarch64_split_simd_movv2df;
1727 break;
1728 default:
1729 gcc_unreachable ();
1732 emit_insn (gen (dst, src));
1733 return;
1737 bool
1738 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1739 machine_mode ymode, rtx y)
1741 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1742 gcc_assert (r != NULL);
1743 return rtx_equal_p (x, r);
1747 static rtx
1748 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1750 if (can_create_pseudo_p ())
1751 return force_reg (mode, value);
1752 else
1754 x = aarch64_emit_move (x, value);
1755 return x;
1760 static rtx
1761 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1763 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1765 rtx high;
1766 /* Load the full offset into a register. This
1767 might be improvable in the future. */
1768 high = GEN_INT (offset);
1769 offset = 0;
1770 high = aarch64_force_temporary (mode, temp, high);
1771 reg = aarch64_force_temporary (mode, temp,
1772 gen_rtx_PLUS (mode, high, reg));
1774 return plus_constant (mode, reg, offset);
1777 static int
1778 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1779 machine_mode mode)
1781 int i;
1782 unsigned HOST_WIDE_INT val, val2, mask;
1783 int one_match, zero_match;
1784 int num_insns;
1786 val = INTVAL (imm);
1788 if (aarch64_move_imm (val, mode))
1790 if (generate)
1791 emit_insn (gen_rtx_SET (dest, imm));
1792 return 1;
1795 if ((val >> 32) == 0 || mode == SImode)
1797 if (generate)
1799 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1800 if (mode == SImode)
1801 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1802 GEN_INT ((val >> 16) & 0xffff)));
1803 else
1804 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1805 GEN_INT ((val >> 16) & 0xffff)));
1807 return 2;
1810 /* Remaining cases are all for DImode. */
1812 mask = 0xffff;
1813 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1814 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1815 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1816 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1818 if (zero_match != 2 && one_match != 2)
1820 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1821 For a 64-bit bitmask try whether changing 16 bits to all ones or
1822 zeroes creates a valid bitmask. To check any repeated bitmask,
1823 try using 16 bits from the other 32-bit half of val. */
1825 for (i = 0; i < 64; i += 16, mask <<= 16)
1827 val2 = val & ~mask;
1828 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1829 break;
1830 val2 = val | mask;
1831 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1832 break;
1833 val2 = val2 & ~mask;
1834 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1835 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1836 break;
1838 if (i != 64)
1840 if (generate)
1842 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1843 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1844 GEN_INT ((val >> i) & 0xffff)));
1846 return 2;
1850 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1851 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1852 otherwise skip zero bits. */
1854 num_insns = 1;
1855 mask = 0xffff;
1856 val2 = one_match > zero_match ? ~val : val;
1857 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1859 if (generate)
1860 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1861 ? (val | ~(mask << i))
1862 : (val & (mask << i)))));
1863 for (i += 16; i < 64; i += 16)
1865 if ((val2 & (mask << i)) == 0)
1866 continue;
1867 if (generate)
1868 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1869 GEN_INT ((val >> i) & 0xffff)));
1870 num_insns ++;
1873 return num_insns;
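/* Editorial worked example (destination register hypothetical):
   VAL == 0x1234000000005678 has two all-zero 16-bit halves, so
   zero_match == 2 and the code falls through to the final loop above,
   emitting roughly

	mov	x0, #0x5678
	movk	x0, #0x1234, lsl #48

   for a total of num_insns == 2.  */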
1877 void
1878 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1880 machine_mode mode = GET_MODE (dest);
1882 gcc_assert (mode == SImode || mode == DImode);
1884 /* Check on what type of symbol it is. */
1885 if (GET_CODE (imm) == SYMBOL_REF
1886 || GET_CODE (imm) == LABEL_REF
1887 || GET_CODE (imm) == CONST)
1889 rtx mem, base, offset;
1890 enum aarch64_symbol_type sty;
1892 /* If we have (const (plus symbol offset)), separate out the offset
1893 before we start classifying the symbol. */
1894 split_const (imm, &base, &offset);
1896 sty = aarch64_classify_symbol (base, offset);
1897 switch (sty)
1899 case SYMBOL_FORCE_TO_MEM:
1900 if (offset != const0_rtx
1901 && targetm.cannot_force_const_mem (mode, imm))
1903 gcc_assert (can_create_pseudo_p ());
1904 base = aarch64_force_temporary (mode, dest, base);
1905 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1906 aarch64_emit_move (dest, base);
1907 return;
1910 mem = force_const_mem (ptr_mode, imm);
1911 gcc_assert (mem);
1913 /* If we aren't generating PC relative literals, then
1914 we need to expand the literal pool access carefully.
1915 This is something that needs to be done in a number
1916 of places, so it could well live as a separate function. */
1917 if (!aarch64_pcrelative_literal_loads)
1919 gcc_assert (can_create_pseudo_p ());
1920 base = gen_reg_rtx (ptr_mode);
1921 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1922 mem = gen_rtx_MEM (ptr_mode, base);
1925 if (mode != ptr_mode)
1926 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1928 emit_insn (gen_rtx_SET (dest, mem));
1930 return;
1932 case SYMBOL_SMALL_TLSGD:
1933 case SYMBOL_SMALL_TLSDESC:
1934 case SYMBOL_SMALL_TLSIE:
1935 case SYMBOL_SMALL_GOT_28K:
1936 case SYMBOL_SMALL_GOT_4G:
1937 case SYMBOL_TINY_GOT:
1938 case SYMBOL_TINY_TLSIE:
1939 if (offset != const0_rtx)
1941 gcc_assert(can_create_pseudo_p ());
1942 base = aarch64_force_temporary (mode, dest, base);
1943 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1944 aarch64_emit_move (dest, base);
1945 return;
1947 /* FALLTHRU */
1949 case SYMBOL_SMALL_ABSOLUTE:
1950 case SYMBOL_TINY_ABSOLUTE:
1951 case SYMBOL_TLSLE12:
1952 case SYMBOL_TLSLE24:
1953 case SYMBOL_TLSLE32:
1954 case SYMBOL_TLSLE48:
1955 aarch64_load_symref_appropriately (dest, imm, sty);
1956 return;
1958 default:
1959 gcc_unreachable ();
1963 if (!CONST_INT_P (imm))
1965 if (GET_CODE (imm) == HIGH)
1966 emit_insn (gen_rtx_SET (dest, imm));
1967 else
1969 rtx mem = force_const_mem (mode, imm);
1970 gcc_assert (mem);
1971 emit_insn (gen_rtx_SET (dest, mem));
1974 return;
1977 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1980 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1981 temporary value if necessary. FRAME_RELATED_P should be true if
1982 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1983 to the generated instructions. If SCRATCHREG is known to hold
1984 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1985 immediate again.
1987 Since this function may be used to adjust the stack pointer, we must
1988 ensure that it cannot cause transient stack deallocation (for example
1989 by first incrementing SP and then decrementing when adjusting by a
1990 large immediate). */
1992 static void
1993 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1994 HOST_WIDE_INT delta, bool frame_related_p,
1995 bool emit_move_imm)
1997 HOST_WIDE_INT mdelta = abs_hwi (delta);
1998 rtx this_rtx = gen_rtx_REG (mode, regnum);
1999 rtx_insn *insn;
2001 if (!mdelta)
2002 return;
2004 /* Single instruction adjustment. */
2005 if (aarch64_uimm12_shift (mdelta))
2007 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2008 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2009 return;
2012 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2013 Only do this if mdelta is not a 16-bit move immediate, as adjusting
2014 using a move is better. */
2015 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2017 HOST_WIDE_INT low_off = mdelta & 0xfff;
2019 low_off = delta < 0 ? -low_off : low_off;
2020 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2021 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2022 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2023 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2024 return;
2027 /* Emit a move immediate if required and an addition/subtraction. */
2028 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2029 if (emit_move_imm)
2030 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2031 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2032 : gen_add2_insn (this_rtx, scratch_rtx));
2033 if (frame_related_p)
2035 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2036 rtx adj = plus_constant (mode, this_rtx, delta);
2037 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
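/* Editorial worked example: a stack adjustment of -0x12345 is neither a
   (possibly shifted) 12-bit immediate nor a 16-bit move immediate, but it
   is below 24 bits, so the code above splits it into two subtractions,
   roughly

	sub	sp, sp, #0x345
	sub	sp, sp, #0x12000	// 0x12 shifted left by 12, still encodable

   Larger adjustments instead load abs (delta) into SCRATCHREG and emit a
   single register add/sub.  */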
2041 static inline void
2042 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2043 HOST_WIDE_INT delta)
2045 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2048 static inline void
2049 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2051 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2052 true, emit_move_imm);
2055 static inline void
2056 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2058 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2059 frame_related_p, true);
2062 static bool
2063 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2064 tree exp ATTRIBUTE_UNUSED)
2066 /* Currently, always true. */
2067 return true;
2070 /* Implement TARGET_PASS_BY_REFERENCE. */
2072 static bool
2073 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2074 machine_mode mode,
2075 const_tree type,
2076 bool named ATTRIBUTE_UNUSED)
2078 HOST_WIDE_INT size;
2079 machine_mode dummymode;
2080 int nregs;
2082 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2083 size = (mode == BLKmode && type)
2084 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2086 /* Aggregates are passed by reference based on their size. */
2087 if (type && AGGREGATE_TYPE_P (type))
2089 size = int_size_in_bytes (type);
2092 /* Variable sized arguments are always passed by reference. */
2093 if (size < 0)
2094 return true;
2096 /* Can this be a candidate to be passed in fp/simd register(s)? */
2097 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2098 &dummymode, &nregs,
2099 NULL))
2100 return false;
2102 /* Arguments which are variable sized or larger than 2 registers are
2103 passed by reference unless they are a homogeneous floating-point
2104 aggregate. */
2105 return size > 2 * UNITS_PER_WORD;
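/* A worked example (illustrative only, assuming LP64): "struct { long a, b, c; }"
   is 24 bytes and not an HFA, so it is passed by reference, whereas
   "struct { double d[4]; }" is a 32-byte HFA of four doubles and is still
   passed by value in SIMD/FP registers.  */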
2108 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2109 static bool
2110 aarch64_return_in_msb (const_tree valtype)
2112 machine_mode dummy_mode;
2113 int dummy_int;
2115 /* Never happens in little-endian mode. */
2116 if (!BYTES_BIG_ENDIAN)
2117 return false;
2119 /* Only composite types smaller than or equal to 16 bytes can
2120 be potentially returned in registers. */
2121 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2122 || int_size_in_bytes (valtype) <= 0
2123 || int_size_in_bytes (valtype) > 16)
2124 return false;
2126 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2127 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2128 is always passed/returned in the least significant bits of fp/simd
2129 register(s). */
2130 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2131 &dummy_mode, &dummy_int, NULL))
2132 return false;
2134 return true;
2137 /* Implement TARGET_FUNCTION_VALUE.
2138 Define how to find the value returned by a function. */
2140 static rtx
2141 aarch64_function_value (const_tree type, const_tree func,
2142 bool outgoing ATTRIBUTE_UNUSED)
2144 machine_mode mode;
2145 int unsignedp;
2146 int count;
2147 machine_mode ag_mode;
2149 mode = TYPE_MODE (type);
2150 if (INTEGRAL_TYPE_P (type))
2151 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2153 if (aarch64_return_in_msb (type))
2155 HOST_WIDE_INT size = int_size_in_bytes (type);
2157 if (size % UNITS_PER_WORD != 0)
2159 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2160 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2164 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2165 &ag_mode, &count, NULL))
2167 if (!aarch64_composite_type_p (type, mode))
2169 gcc_assert (count == 1 && mode == ag_mode);
2170 return gen_rtx_REG (mode, V0_REGNUM);
2172 else
2174 int i;
2175 rtx par;
2177 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2178 for (i = 0; i < count; i++)
2180 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2181 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2182 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2183 XVECEXP (par, 0, i) = tmp;
2185 return par;
2188 else
2189 return gen_rtx_REG (mode, R0_REGNUM);
2192 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2193 Return true if REGNO is the number of a hard register in which the values
2194 of called function may come back. */
2196 static bool
2197 aarch64_function_value_regno_p (const unsigned int regno)
2199 /* A maximum of 16 bytes can be returned in the general registers. Examples
2200 of 16-byte return values are: 128-bit integers and 16-byte small
2201 structures (excluding homogeneous floating-point aggregates). */
2202 if (regno == R0_REGNUM || regno == R1_REGNUM)
2203 return true;
2205 /* Up to four fp/simd registers can return a function value, e.g. a
2206 homogeneous floating-point aggregate having four members. */
2207 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2208 return TARGET_FLOAT;
2210 return false;
2213 /* Implement TARGET_RETURN_IN_MEMORY.
2215 If the type T of the result of a function is such that
2216 void func (T arg)
2217 would require that arg be passed as a value in a register (or set of
2218 registers) according to the parameter passing rules, then the result
2219 is returned in the same registers as would be used for such an
2220 argument. */
2222 static bool
2223 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2225 HOST_WIDE_INT size;
2226 machine_mode ag_mode;
2227 int count;
2229 if (!AGGREGATE_TYPE_P (type)
2230 && TREE_CODE (type) != COMPLEX_TYPE
2231 && TREE_CODE (type) != VECTOR_TYPE)
2232 /* Simple scalar types are always returned in registers. */
2233 return false;
2235 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2236 type,
2237 &ag_mode,
2238 &count,
2239 NULL))
2240 return false;
2242 /* Types larger than 2 registers are returned in memory. */
2243 size = int_size_in_bytes (type);
2244 return (size < 0 || size > 2 * UNITS_PER_WORD);
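/* For example (illustrative): "struct { long a, b, c; }" is 24 bytes and not
   an HFA, so it is returned in memory via the x8 result pointer, while
   "struct { double x, y, z; }" is an HFA of three doubles and is returned in
   d0-d2 despite also being 24 bytes.  */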
2247 static bool
2248 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2249 const_tree type, int *nregs)
2251 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2252 return aarch64_vfp_is_call_or_return_candidate (mode,
2253 type,
2254 &pcum->aapcs_vfp_rmode,
2255 nregs,
2256 NULL);
2259 struct aarch64_fn_arg_alignment
2261 /* Alignment for FIELD_DECLs in function arguments. */
2262 unsigned int alignment;
2263 /* Alignment for decls other than FIELD_DECLs in function arguments. */
2264 unsigned int warn_alignment;
2267 /* Given MODE and TYPE of a function argument, return a pair of alignments in
2268 bits. The idea is to suppress any stronger alignment requested by
2269 the user and opt for the natural alignment (specified in AAPCS64 section 4.1).
2270 This is a helper function for local use only. */
2272 static struct aarch64_fn_arg_alignment
2273 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2275 struct aarch64_fn_arg_alignment aa;
2276 aa.alignment = 0;
2277 aa.warn_alignment = 0;
2279 if (!type)
2281 aa.alignment = GET_MODE_ALIGNMENT (mode);
2282 return aa;
2285 if (integer_zerop (TYPE_SIZE (type)))
2286 return aa;
2288 gcc_assert (TYPE_MODE (type) == mode);
2290 if (!AGGREGATE_TYPE_P (type))
2292 aa.alignment = TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2293 return aa;
2296 if (TREE_CODE (type) == ARRAY_TYPE)
2298 aa.alignment = TYPE_ALIGN (TREE_TYPE (type));
2299 return aa;
2302 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2304 if (TREE_CODE (field) == FIELD_DECL)
2305 aa.alignment = std::max (aa.alignment, DECL_ALIGN (field));
2306 else
2307 aa.warn_alignment = std::max (aa.warn_alignment, DECL_ALIGN (field));
2310 return aa;
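/* Illustrative examples: for "struct { long long x; }" aa.alignment is
   64 bits and aa.warn_alignment stays 0.  Declaring the field as
   "__attribute__ ((aligned (16))) long long x" raises aa.alignment to
   128 bits, which is what later triggers the even-NGRN and 16-byte stack
   alignment handling.  Alignment that comes only from non-FIELD_DECL entries
   in TYPE_FIELDS ends up in aa.warn_alignment and is only used to drive the
   -Wpsabi note about the GCC 7.1 ABI change.  */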
2313 /* Layout a function argument according to the AAPCS64 rules. The rule
2314 numbers refer to the rule numbers in the AAPCS64. */
2316 static void
2317 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2318 const_tree type,
2319 bool named ATTRIBUTE_UNUSED)
2321 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2322 int ncrn, nvrn, nregs;
2323 bool allocate_ncrn, allocate_nvrn;
2324 HOST_WIDE_INT size;
2326 /* We need to do this once per argument. */
2327 if (pcum->aapcs_arg_processed)
2328 return;
2330 pcum->aapcs_arg_processed = true;
2332 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2333 size
2334 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2335 UNITS_PER_WORD);
2337 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2338 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2339 mode,
2340 type,
2341 &nregs);
2343 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2344 The following code thus handles passing by SIMD/FP registers first. */
2346 nvrn = pcum->aapcs_nvrn;
2348 /* C.1 - C.5 for floating point, homogeneous floating-point aggregates (HFA)
2349 and homogeneous short-vector aggregates (HVA). */
2350 if (allocate_nvrn)
2352 if (!TARGET_FLOAT)
2353 aarch64_err_no_fpadvsimd (mode, "argument");
2355 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2357 pcum->aapcs_nextnvrn = nvrn + nregs;
2358 if (!aarch64_composite_type_p (type, mode))
2360 gcc_assert (nregs == 1);
2361 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2363 else
2365 rtx par;
2366 int i;
2367 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2368 for (i = 0; i < nregs; i++)
2370 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2371 V0_REGNUM + nvrn + i);
2372 tmp = gen_rtx_EXPR_LIST
2373 (VOIDmode, tmp,
2374 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2375 XVECEXP (par, 0, i) = tmp;
2377 pcum->aapcs_reg = par;
2379 return;
2381 else
2383 /* C.3 NSRN is set to 8. */
2384 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2385 goto on_stack;
2389 ncrn = pcum->aapcs_ncrn;
2390 nregs = size / UNITS_PER_WORD;
2392 /* C.6 - C.9, though the sign and zero extension semantics are
2393 handled elsewhere. This is the case where the argument fits
2394 entirely in general registers. */
2395 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2398 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2400 /* C.8: if the argument has an alignment of 16 bytes, then the NGRN is
2401 rounded up to the next even number. */
2402 if (nregs == 2 && ncrn % 2)
2404 struct aarch64_fn_arg_alignment aa
2405 = aarch64_function_arg_alignment (mode, type);
2407 /* The == 16 * BITS_PER_UNIT comparisons (rather than >= 16 * BITS_PER_UNIT)
2408 are used because for alignments greater than 16 * BITS_PER_UNIT
2409 nregs should be > 2, and so the argument should already have been
2410 passed by reference rather than by value. */
2411 if (aa.warn_alignment == 16 * BITS_PER_UNIT
2412 && aa.alignment < aa.warn_alignment
2413 && warn_psabi
2414 && currently_expanding_gimple_stmt)
2415 inform (input_location,
2416 "parameter passing for argument of type %qT "
2417 "changed in GCC 7.1", type);
2418 else if (aa.alignment == 16 * BITS_PER_UNIT)
2420 ++ncrn;
2421 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2425 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2426 A reg is still generated for it, but the caller should be smart
2427 enough not to use it. */
2428 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2429 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2430 else
2432 rtx par;
2433 int i;
2435 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2436 for (i = 0; i < nregs; i++)
2438 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2439 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2440 GEN_INT (i * UNITS_PER_WORD));
2441 XVECEXP (par, 0, i) = tmp;
2443 pcum->aapcs_reg = par;
2446 pcum->aapcs_nextncrn = ncrn + nregs;
2447 return;
2450 /* C.11 */
2451 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2453 /* The argument is passed on stack; record the needed number of words for
2454 this argument and align the total size if necessary. */
2455 on_stack:
2456 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2457 struct aarch64_fn_arg_alignment aa
2458 = aarch64_function_arg_alignment (mode, type);
2460 if (aa.alignment == 16 * BITS_PER_UNIT)
2461 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2462 16 / UNITS_PER_WORD);
2463 return;
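/* Worked example (an illustrative sketch): for
     void f (int a, double b, struct { long x, y; } c);
   a is allocated to w0, b to d0 and c, a 16-byte non-HFA aggregate with
   8-byte alignment, occupies the pair x1/x2.  Once the general registers
   are exhausted, further such arguments take the on_stack path above, with
   their size rounded up to a multiple of 8 bytes.  */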
2466 /* Implement TARGET_FUNCTION_ARG. */
2468 static rtx
2469 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2470 const_tree type, bool named)
2472 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2473 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2475 if (mode == VOIDmode)
2476 return NULL_RTX;
2478 aarch64_layout_arg (pcum_v, mode, type, named);
2479 return pcum->aapcs_reg;
2482 void
2483 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2484 const_tree fntype ATTRIBUTE_UNUSED,
2485 rtx libname ATTRIBUTE_UNUSED,
2486 const_tree fndecl ATTRIBUTE_UNUSED,
2487 unsigned n_named ATTRIBUTE_UNUSED)
2489 pcum->aapcs_ncrn = 0;
2490 pcum->aapcs_nvrn = 0;
2491 pcum->aapcs_nextncrn = 0;
2492 pcum->aapcs_nextnvrn = 0;
2493 pcum->pcs_variant = ARM_PCS_AAPCS64;
2494 pcum->aapcs_reg = NULL_RTX;
2495 pcum->aapcs_arg_processed = false;
2496 pcum->aapcs_stack_words = 0;
2497 pcum->aapcs_stack_size = 0;
2499 if (!TARGET_FLOAT
2500 && fndecl && TREE_PUBLIC (fndecl)
2501 && fntype && fntype != error_mark_node)
2503 const_tree type = TREE_TYPE (fntype);
2504 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2505 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2506 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2507 &mode, &nregs, NULL))
2508 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2510 return;
2513 static void
2514 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2515 machine_mode mode,
2516 const_tree type,
2517 bool named)
2519 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2520 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2522 aarch64_layout_arg (pcum_v, mode, type, named);
2523 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2524 != (pcum->aapcs_stack_words != 0));
2525 pcum->aapcs_arg_processed = false;
2526 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2527 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2528 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2529 pcum->aapcs_stack_words = 0;
2530 pcum->aapcs_reg = NULL_RTX;
2534 bool
2535 aarch64_function_arg_regno_p (unsigned regno)
2537 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2538 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2541 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2542 PARM_BOUNDARY bits of alignment, but will be given anything up
2543 to STACK_BOUNDARY bits if the type requires it. This makes sure
2544 that both before and after the layout of each argument, the Next
2545 Stacked Argument Address (NSAA) will have a minimum alignment of
2546 8 bytes. */
2548 static unsigned int
2549 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2551 struct aarch64_fn_arg_alignment aa
2552 = aarch64_function_arg_alignment (mode, type);
2553 aa.alignment = MIN (MAX (aa.alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2554 aa.warn_alignment
2555 = MIN (MAX (aa.warn_alignment, PARM_BOUNDARY), STACK_BOUNDARY);
2557 if (warn_psabi && aa.warn_alignment > aa.alignment)
2558 inform (input_location, "parameter passing for argument of type %qT "
2559 "changed in GCC 7.1", type);
2561 return aa.alignment;
2564 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2566 Return true if an argument passed on the stack should be padded upwards,
2567 i.e. if the least-significant byte of the stack slot has useful data.
2569 Small aggregate types are placed in the lowest memory address.
2571 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2573 bool
2574 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2576 /* On little-endian targets, the least significant byte of every stack
2577 argument is passed at the lowest byte address of the stack slot. */
2578 if (!BYTES_BIG_ENDIAN)
2579 return true;
2581 /* Otherwise, integral, floating-point and pointer types are padded downward:
2582 the least significant byte of a stack argument is passed at the highest
2583 byte address of the stack slot. */
2584 if (type
2585 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2586 || POINTER_TYPE_P (type))
2587 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2588 return false;
2590 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2591 return true;
2594 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2596 It specifies the padding for the last (and possibly the only)
2597 element of a block move between registers and memory. Assuming
2598 the block is in memory, padding upward means that the last
2599 element is padded after its most significant byte, while with
2600 downward padding the last element is padded on its least
2601 significant byte side.
2603 Small aggregates and small complex types are always padded
2604 upwards.
2606 We don't need to worry about homogeneous floating-point or
2607 short-vector aggregates; their move is not affected by the
2608 padding direction determined here. Regardless of endianness,
2609 each element of such an aggregate is put in the least
2610 significant bits of a fp/simd register.
2612 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2613 register has useful data, and return the opposite if the most
2614 significant byte does. */
2616 bool
2617 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2618 bool first ATTRIBUTE_UNUSED)
2621 /* Small composite types are always padded upward. */
2622 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2624 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2625 : GET_MODE_SIZE (mode));
2626 if (size < 2 * UNITS_PER_WORD)
2627 return true;
2630 /* Otherwise, use the default padding. */
2631 return !BYTES_BIG_ENDIAN;
2634 static machine_mode
2635 aarch64_libgcc_cmp_return_mode (void)
2637 return SImode;
2640 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2642 /* We use the 12-bit shifted immediate arithmetic instructions so values
2643 must be a multiple of (1 << 12), i.e. 4096. */
2644 #define ARITH_FACTOR 4096
2646 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2647 #error Cannot use simple address calculation for stack probing
2648 #endif
2650 /* The pair of scratch registers used for stack probing. */
2651 #define PROBE_STACK_FIRST_REG 9
2652 #define PROBE_STACK_SECOND_REG 10
2654 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2655 inclusive. These are offsets from the current stack pointer. */
2657 static void
2658 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2660 rtx reg1 = gen_rtx_REG (Pmode, PROBE_STACK_FIRST_REG);
2662 /* See the same assertion on PROBE_INTERVAL above. */
2663 gcc_assert ((first % ARITH_FACTOR) == 0);
2665 /* See if we have a constant small number of probes to generate. If so,
2666 that's the easy case. */
2667 if (size <= PROBE_INTERVAL)
2669 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2671 emit_set_insn (reg1,
2672 plus_constant (Pmode,
2673 stack_pointer_rtx, -(first + base)));
2674 emit_stack_probe (plus_constant (Pmode, reg1, base - size));
2677 /* The run-time loop is made up of 8 insns in the generic case while the
2678 compile-time loop is made up of 4+2*(n-2) insns for n intervals. */
2679 else if (size <= 4 * PROBE_INTERVAL)
2681 HOST_WIDE_INT i, rem;
2683 emit_set_insn (reg1,
2684 plus_constant (Pmode,
2685 stack_pointer_rtx,
2686 -(first + PROBE_INTERVAL)));
2687 emit_stack_probe (reg1);
2689 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2690 it exceeds SIZE. If only two probes are needed, this will not
2691 generate any code. Then probe at FIRST + SIZE. */
2692 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2694 emit_set_insn (reg1,
2695 plus_constant (Pmode, reg1, -PROBE_INTERVAL));
2696 emit_stack_probe (reg1);
2699 rem = size - (i - PROBE_INTERVAL);
2700 if (rem > 256)
2702 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2704 emit_set_insn (reg1, plus_constant (Pmode, reg1, -base));
2705 emit_stack_probe (plus_constant (Pmode, reg1, base - rem));
2707 else
2708 emit_stack_probe (plus_constant (Pmode, reg1, -rem));
2711 /* Otherwise, do the same as above, but in a loop. Note that we must be
2712 extra careful with variables wrapping around because we might be at
2713 the very top (or the very bottom) of the address space and we have
2714 to be able to handle this case properly; in particular, we use an
2715 equality test for the loop condition. */
2716 else
2718 rtx reg2 = gen_rtx_REG (Pmode, PROBE_STACK_SECOND_REG);
2720 /* Step 1: round SIZE to the previous multiple of the interval. */
2722 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2725 /* Step 2: compute initial and final value of the loop counter. */
2727 /* TEST_ADDR = SP + FIRST. */
2728 emit_set_insn (reg1,
2729 plus_constant (Pmode, stack_pointer_rtx, -first));
2731 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2732 emit_set_insn (reg2,
2733 plus_constant (Pmode, stack_pointer_rtx,
2734 -(first + rounded_size)));
2737 /* Step 3: the loop
2741 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2742 probe at TEST_ADDR
2744 while (TEST_ADDR != LAST_ADDR)
2746 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2747 until it is equal to ROUNDED_SIZE. */
2749 emit_insn (gen_probe_stack_range (reg1, reg1, reg2));
2752 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2753 that SIZE is equal to ROUNDED_SIZE. */
2755 if (size != rounded_size)
2757 HOST_WIDE_INT rem = size - rounded_size;
2759 if (rem > 256)
2761 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2763 emit_set_insn (reg2, plus_constant (Pmode, reg2, -base));
2764 emit_stack_probe (plus_constant (Pmode, reg2, base - rem));
2766 else
2767 emit_stack_probe (plus_constant (Pmode, reg2, -rem));
2771 /* Make sure nothing is scheduled before we are done. */
2772 emit_insn (gen_blockage ());
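/* A rough illustration (assuming the default 4 KiB probe interval and x9 as
   the scratch register): probing FIRST .. FIRST + 8192 emits approximately
       sub  x9, sp, #(FIRST + 4096)
       str  xzr, [x9]
       sub  x9, x9, #4096
       str  xzr, [x9]
   while sizes above four intervals fall back to the probe_stack_range loop
   output below.  */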
2775 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2776 absolute addresses. */
2778 const char *
2779 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2781 static int labelno = 0;
2782 char loop_lab[32];
2783 rtx xops[2];
2785 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2787 /* Loop. */
2788 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2790 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2791 xops[0] = reg1;
2792 xops[1] = GEN_INT (PROBE_INTERVAL);
2793 output_asm_insn ("sub\t%0, %0, %1", xops);
2795 /* Probe at TEST_ADDR. */
2796 output_asm_insn ("str\txzr, [%0]", xops);
2798 /* Test if TEST_ADDR == LAST_ADDR. */
2799 xops[1] = reg2;
2800 output_asm_insn ("cmp\t%0, %1", xops);
2802 /* Branch. */
2803 fputs ("\tb.ne\t", asm_out_file);
2804 assemble_name_raw (asm_out_file, loop_lab);
2805 fputc ('\n', asm_out_file);
2807 return "";
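/* Assuming reg1 is x9, reg2 is x10 and a 4 KiB probe interval, the loop
   printed above assembles to roughly:
     .LPSRL0:
         sub  x9, x9, #4096
         str  xzr, [x9]
         cmp  x9, x10
         b.ne .LPSRL0  */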
2810 static bool
2811 aarch64_frame_pointer_required (void)
2813 /* In aarch64_override_options_after_change
2814 flag_omit_leaf_frame_pointer turns off the frame pointer by
2815 default. Turn it back on now if we've not got a leaf
2816 function. */
2817 if (flag_omit_leaf_frame_pointer
2818 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2819 return true;
2821 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2822 if (crtl->calls_eh_return)
2823 return true;
2825 return false;
2828 /* Mark the registers that need to be saved by the callee and calculate
2829 the size of the callee-saved registers area and frame record (both FP
2830 and LR may be omitted). */
2831 static void
2832 aarch64_layout_frame (void)
2834 HOST_WIDE_INT offset = 0;
2835 int regno, last_fp_reg = INVALID_REGNUM;
2837 if (reload_completed && cfun->machine->frame.laid_out)
2838 return;
2840 #define SLOT_NOT_REQUIRED (-2)
2841 #define SLOT_REQUIRED (-1)
2843 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2844 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2846 /* First mark all the registers that really need to be saved... */
2847 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2848 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2850 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2851 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2853 /* ... that includes the eh data registers (if needed)... */
2854 if (crtl->calls_eh_return)
2855 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2856 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2857 = SLOT_REQUIRED;
2859 /* ... and any callee saved register that dataflow says is live. */
2860 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2861 if (df_regs_ever_live_p (regno)
2862 && (regno == R30_REGNUM
2863 || !call_used_regs[regno]))
2864 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2866 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2867 if (df_regs_ever_live_p (regno)
2868 && !call_used_regs[regno])
2870 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2871 last_fp_reg = regno;
2874 if (frame_pointer_needed)
2876 /* FP and LR are placed in the linkage record. */
2877 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2878 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2879 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2880 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2881 offset += 2 * UNITS_PER_WORD;
2884 /* Now assign stack slots for them. */
2885 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2886 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2888 cfun->machine->frame.reg_offset[regno] = offset;
2889 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2890 cfun->machine->frame.wb_candidate1 = regno;
2891 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2892 cfun->machine->frame.wb_candidate2 = regno;
2893 offset += UNITS_PER_WORD;
2896 HOST_WIDE_INT max_int_offset = offset;
2897 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2898 bool has_align_gap = offset != max_int_offset;
2900 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2901 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2903 /* If there is an alignment gap between integer and fp callee-saves,
2904 allocate the last fp register to it if possible. */
2905 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2907 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2908 break;
2911 cfun->machine->frame.reg_offset[regno] = offset;
2912 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2913 cfun->machine->frame.wb_candidate1 = regno;
2914 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2915 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2916 cfun->machine->frame.wb_candidate2 = regno;
2917 offset += UNITS_PER_WORD;
2920 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2922 cfun->machine->frame.saved_regs_size = offset;
2924 HOST_WIDE_INT varargs_and_saved_regs_size
2925 = offset + cfun->machine->frame.saved_varargs_size;
2927 cfun->machine->frame.hard_fp_offset
2928 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2929 STACK_BOUNDARY / BITS_PER_UNIT);
2931 cfun->machine->frame.frame_size
2932 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2933 + crtl->outgoing_args_size,
2934 STACK_BOUNDARY / BITS_PER_UNIT);
2936 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2938 cfun->machine->frame.initial_adjust = 0;
2939 cfun->machine->frame.final_adjust = 0;
2940 cfun->machine->frame.callee_adjust = 0;
2941 cfun->machine->frame.callee_offset = 0;
2943 HOST_WIDE_INT max_push_offset = 0;
2944 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2945 max_push_offset = 512;
2946 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2947 max_push_offset = 256;
2949 if (cfun->machine->frame.frame_size < max_push_offset
2950 && crtl->outgoing_args_size == 0)
2952 /* Simple, small frame with no outgoing arguments:
2953 stp reg1, reg2, [sp, -frame_size]!
2954 stp reg3, reg4, [sp, 16] */
2955 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2957 else if ((crtl->outgoing_args_size
2958 + cfun->machine->frame.saved_regs_size < 512)
2959 && !(cfun->calls_alloca
2960 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2962 /* Frame with small outgoing arguments:
2963 sub sp, sp, frame_size
2964 stp reg1, reg2, [sp, outgoing_args_size]
2965 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2966 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2967 cfun->machine->frame.callee_offset
2968 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2970 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2972 /* Frame with large outgoing arguments but a small local area:
2973 stp reg1, reg2, [sp, -hard_fp_offset]!
2974 stp reg3, reg4, [sp, 16]
2975 sub sp, sp, outgoing_args_size */
2976 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2977 cfun->machine->frame.final_adjust
2978 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2980 else if (!frame_pointer_needed
2981 && varargs_and_saved_regs_size < max_push_offset)
2983 /* Frame with large local area and outgoing arguments (this pushes the
2984 callee-saves first, followed by the locals and outgoing area):
2985 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2986 stp reg3, reg4, [sp, 16]
2987 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2988 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2989 cfun->machine->frame.final_adjust
2990 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2991 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2992 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2994 else
2996 /* Frame with large local area and outgoing arguments using frame pointer:
2997 sub sp, sp, hard_fp_offset
2998 stp x29, x30, [sp, 0]
2999 add x29, sp, 0
3000 stp reg3, reg4, [sp, 16]
3001 sub sp, sp, outgoing_args_size */
3002 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
3003 cfun->machine->frame.final_adjust
3004 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
3007 cfun->machine->frame.laid_out = true;
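/* Worked example (an illustrative sketch): a function that needs a frame
   pointer, saves x29/x30/x19/x20, has 32 bytes of locals and no outgoing
   arguments ends up with saved_regs_size = 32 and
   hard_fp_offset = frame_size = 64.  Since 64 < 512 and there are no
   outgoing arguments, the first shape above is chosen and the prologue is
   roughly:
       stp x29, x30, [sp, -64]!
       add x29, sp, 0
       stp x19, x20, [sp, 16]  */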
3010 /* Return true if the register REGNO is saved on entry to
3011 the current function. */
3013 static bool
3014 aarch64_register_saved_on_entry (int regno)
3016 return cfun->machine->frame.reg_offset[regno] >= 0;
3019 /* Return the next register, from REGNO up to LIMIT, that the callee
3020 needs to save. */
3022 static unsigned
3023 aarch64_next_callee_save (unsigned regno, unsigned limit)
3025 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
3026 regno ++;
3027 return regno;
3030 /* Push the register number REGNO of mode MODE to the stack with write-back
3031 adjusting the stack by ADJUSTMENT. */
3033 static void
3034 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
3035 HOST_WIDE_INT adjustment)
3037 rtx base_rtx = stack_pointer_rtx;
3038 rtx insn, reg, mem;
3040 reg = gen_rtx_REG (mode, regno);
3041 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
3042 plus_constant (Pmode, base_rtx, -adjustment));
3043 mem = gen_rtx_MEM (mode, mem);
3045 insn = emit_move_insn (mem, reg);
3046 RTX_FRAME_RELATED_P (insn) = 1;
3049 /* Generate and return an instruction to store the pair of registers
3050 REG and REG2 of mode MODE to location BASE with write-back adjusting
3051 the stack location BASE by ADJUSTMENT. */
3053 static rtx
3054 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3055 HOST_WIDE_INT adjustment)
3057 switch (mode)
3059 case DImode:
3060 return gen_storewb_pairdi_di (base, base, reg, reg2,
3061 GEN_INT (-adjustment),
3062 GEN_INT (UNITS_PER_WORD - adjustment));
3063 case DFmode:
3064 return gen_storewb_pairdf_di (base, base, reg, reg2,
3065 GEN_INT (-adjustment),
3066 GEN_INT (UNITS_PER_WORD - adjustment));
3067 default:
3068 gcc_unreachable ();
3072 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3073 stack pointer by ADJUSTMENT. */
3075 static void
3076 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3078 rtx_insn *insn;
3079 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3081 if (regno2 == INVALID_REGNUM)
3082 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3084 rtx reg1 = gen_rtx_REG (mode, regno1);
3085 rtx reg2 = gen_rtx_REG (mode, regno2);
3087 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3088 reg2, adjustment));
3089 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3090 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3091 RTX_FRAME_RELATED_P (insn) = 1;
3094 /* Load the pair of registers REG, REG2 of mode MODE from stack location BASE,
3095 adjusting it by ADJUSTMENT afterwards. */
3097 static rtx
3098 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3099 HOST_WIDE_INT adjustment)
3101 switch (mode)
3103 case DImode:
3104 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3105 GEN_INT (UNITS_PER_WORD));
3106 case DFmode:
3107 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3108 GEN_INT (UNITS_PER_WORD));
3109 default:
3110 gcc_unreachable ();
3114 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3115 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3116 into CFI_OPS. */
3118 static void
3119 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3120 rtx *cfi_ops)
3122 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3123 rtx reg1 = gen_rtx_REG (mode, regno1);
3125 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3127 if (regno2 == INVALID_REGNUM)
3129 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3130 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3131 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3133 else
3135 rtx reg2 = gen_rtx_REG (mode, regno2);
3136 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3137 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3138 reg2, adjustment));
3142 /* Generate and return a store pair instruction of mode MODE to store
3143 register REG1 to MEM1 and register REG2 to MEM2. */
3145 static rtx
3146 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3147 rtx reg2)
3149 switch (mode)
3151 case DImode:
3152 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3154 case DFmode:
3155 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3157 default:
3158 gcc_unreachable ();
3162 /* Generate and return a load pair instruction of mode MODE to load register
3163 REG1 from MEM1 and register REG2 from MEM2. */
3165 static rtx
3166 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3167 rtx mem2)
3169 switch (mode)
3171 case DImode:
3172 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3174 case DFmode:
3175 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3177 default:
3178 gcc_unreachable ();
3182 /* Return TRUE if return address signing should be enabled for the current
3183 function, otherwise return FALSE. */
3185 bool
3186 aarch64_return_address_signing_enabled (void)
3188 /* This function should only be called after the frame is laid out. */
3189 gcc_assert (cfun->machine->frame.laid_out);
3191 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3192 if its LR is pushed onto the stack. */
3193 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3194 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3195 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
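/* For example, with -msign-return-address=non-leaf a leaf function that
   never saves LR is left unsigned, whereas -msign-return-address=all signs
   every function regardless of whether LR is spilled.  */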
3198 /* Emit code to save the callee-saved registers from register number START
3199 to LIMIT to the stack at the location starting at offset START_OFFSET,
3200 skipping any write-back candidates if SKIP_WB is true. */
3202 static void
3203 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3204 unsigned start, unsigned limit, bool skip_wb)
3206 rtx_insn *insn;
3207 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3208 ? gen_frame_mem : gen_rtx_MEM);
3209 unsigned regno;
3210 unsigned regno2;
3212 for (regno = aarch64_next_callee_save (start, limit);
3213 regno <= limit;
3214 regno = aarch64_next_callee_save (regno + 1, limit))
3216 rtx reg, mem;
3217 HOST_WIDE_INT offset;
3219 if (skip_wb
3220 && (regno == cfun->machine->frame.wb_candidate1
3221 || regno == cfun->machine->frame.wb_candidate2))
3222 continue;
3224 if (cfun->machine->reg_is_wrapped_separately[regno])
3225 continue;
3227 reg = gen_rtx_REG (mode, regno);
3228 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3229 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3230 offset));
3232 regno2 = aarch64_next_callee_save (regno + 1, limit);
3234 if (regno2 <= limit
3235 && !cfun->machine->reg_is_wrapped_separately[regno2]
3236 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3237 == cfun->machine->frame.reg_offset[regno2]))
3240 rtx reg2 = gen_rtx_REG (mode, regno2);
3241 rtx mem2;
3243 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3244 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3245 offset));
3246 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3247 reg2));
3249 /* The first part of a frame-related parallel insn is
3250 always assumed to be relevant to the frame
3251 calculations; subsequent parts are only
3252 frame-related if explicitly marked. */
3253 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3254 regno = regno2;
3256 else
3257 insn = emit_move_insn (mem, reg);
3259 RTX_FRAME_RELATED_P (insn) = 1;
3263 /* Emit code to restore the callee registers of mode MODE from register
3264 number START up to and including LIMIT. Restore from the stack offset
3265 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3266 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3268 static void
3269 aarch64_restore_callee_saves (machine_mode mode,
3270 HOST_WIDE_INT start_offset, unsigned start,
3271 unsigned limit, bool skip_wb, rtx *cfi_ops)
3273 rtx base_rtx = stack_pointer_rtx;
3274 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3275 ? gen_frame_mem : gen_rtx_MEM);
3276 unsigned regno;
3277 unsigned regno2;
3278 HOST_WIDE_INT offset;
3280 for (regno = aarch64_next_callee_save (start, limit);
3281 regno <= limit;
3282 regno = aarch64_next_callee_save (regno + 1, limit))
3284 if (cfun->machine->reg_is_wrapped_separately[regno])
3285 continue;
3287 rtx reg, mem;
3289 if (skip_wb
3290 && (regno == cfun->machine->frame.wb_candidate1
3291 || regno == cfun->machine->frame.wb_candidate2))
3292 continue;
3294 reg = gen_rtx_REG (mode, regno);
3295 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3296 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3298 regno2 = aarch64_next_callee_save (regno + 1, limit);
3300 if (regno2 <= limit
3301 && !cfun->machine->reg_is_wrapped_separately[regno2]
3302 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3303 == cfun->machine->frame.reg_offset[regno2]))
3305 rtx reg2 = gen_rtx_REG (mode, regno2);
3306 rtx mem2;
3308 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3309 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3310 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3312 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3313 regno = regno2;
3315 else
3316 emit_move_insn (reg, mem);
3317 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3321 static inline bool
3322 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3323 HOST_WIDE_INT offset)
3325 return offset >= -256 && offset < 256;
3328 static inline bool
3329 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3331 return (offset >= 0
3332 && offset < 4096 * GET_MODE_SIZE (mode)
3333 && offset % GET_MODE_SIZE (mode) == 0);
3336 bool
3337 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3339 return (offset >= -64 * GET_MODE_SIZE (mode)
3340 && offset < 64 * GET_MODE_SIZE (mode)
3341 && offset % GET_MODE_SIZE (mode) == 0);
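/* For DImode (8-byte) accesses these ranges work out to:
     7-bit signed, scaled:     -512 .. 504 in steps of 8  (LDP/STP offsets)
     9-bit signed, unscaled:   -256 .. 255                (LDUR/STUR offsets)
     12-bit unsigned, scaled:  0 .. 32760 in steps of 8   (LDR/STR offsets)  */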
3344 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3346 static sbitmap
3347 aarch64_get_separate_components (void)
3349 aarch64_layout_frame ();
3351 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3352 bitmap_clear (components);
3354 /* The registers we need saved to the frame. */
3355 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3356 if (aarch64_register_saved_on_entry (regno))
3358 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3359 if (!frame_pointer_needed)
3360 offset += cfun->machine->frame.frame_size
3361 - cfun->machine->frame.hard_fp_offset;
3362 /* Check that we can access the stack slot of the register with one
3363 direct load with no adjustments needed. */
3364 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3365 bitmap_set_bit (components, regno);
3368 /* Don't mess with the hard frame pointer. */
3369 if (frame_pointer_needed)
3370 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3372 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3373 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3374 /* If aarch64_layout_frame has chosen registers to store/restore with
3375 writeback don't interfere with them to avoid having to output explicit
3376 stack adjustment instructions. */
3377 if (reg2 != INVALID_REGNUM)
3378 bitmap_clear_bit (components, reg2);
3379 if (reg1 != INVALID_REGNUM)
3380 bitmap_clear_bit (components, reg1);
3382 bitmap_clear_bit (components, LR_REGNUM);
3383 bitmap_clear_bit (components, SP_REGNUM);
3385 return components;
3388 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3390 static sbitmap
3391 aarch64_components_for_bb (basic_block bb)
3393 bitmap in = DF_LIVE_IN (bb);
3394 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3395 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3397 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3398 bitmap_clear (components);
3400 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3401 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3402 if ((!call_used_regs[regno])
3403 && (bitmap_bit_p (in, regno)
3404 || bitmap_bit_p (gen, regno)
3405 || bitmap_bit_p (kill, regno)))
3406 bitmap_set_bit (components, regno);
3408 return components;
3411 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3412 Nothing to do for aarch64. */
3414 static void
3415 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3419 /* Return the next set bit in BMP from START onwards. Return the total number
3420 of bits in BMP if no set bit is found at or after START. */
3422 static unsigned int
3423 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3425 unsigned int nbits = SBITMAP_SIZE (bmp);
3426 if (start == nbits)
3427 return start;
3429 gcc_assert (start < nbits);
3430 for (unsigned int i = start; i < nbits; i++)
3431 if (bitmap_bit_p (bmp, i))
3432 return i;
3434 return nbits;
3437 /* Do the work for aarch64_emit_prologue_components and
3438 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3439 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3440 for these components or the epilogue sequence. That is, it determines
3441 whether we should emit stores or loads and what kind of CFA notes to attach
3442 to the insns. Otherwise the logic for the two sequences is very
3443 similar. */
3445 static void
3446 aarch64_process_components (sbitmap components, bool prologue_p)
3448 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3449 ? HARD_FRAME_POINTER_REGNUM
3450 : STACK_POINTER_REGNUM);
3452 unsigned last_regno = SBITMAP_SIZE (components);
3453 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3454 rtx_insn *insn = NULL;
3456 while (regno != last_regno)
3458 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3459 so DFmode for the vector registers is enough. */
3460 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3461 rtx reg = gen_rtx_REG (mode, regno);
3462 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3463 if (!frame_pointer_needed)
3464 offset += cfun->machine->frame.frame_size
3465 - cfun->machine->frame.hard_fp_offset;
3466 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3467 rtx mem = gen_frame_mem (mode, addr);
3469 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3470 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3471 /* No more registers to handle after REGNO.
3472 Emit a single save/restore and exit. */
3473 if (regno2 == last_regno)
3475 insn = emit_insn (set);
3476 RTX_FRAME_RELATED_P (insn) = 1;
3477 if (prologue_p)
3478 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3479 else
3480 add_reg_note (insn, REG_CFA_RESTORE, reg);
3481 break;
3484 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3485 /* The next register is not of the same class or its offset is not
3486 mergeable with the current one into a pair. */
3487 if (!satisfies_constraint_Ump (mem)
3488 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3489 || (offset2 - cfun->machine->frame.reg_offset[regno])
3490 != GET_MODE_SIZE (mode))
3492 insn = emit_insn (set);
3493 RTX_FRAME_RELATED_P (insn) = 1;
3494 if (prologue_p)
3495 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3496 else
3497 add_reg_note (insn, REG_CFA_RESTORE, reg);
3499 regno = regno2;
3500 continue;
3503 /* REGNO2 can be saved/restored in a pair with REGNO. */
3504 rtx reg2 = gen_rtx_REG (mode, regno2);
3505 if (!frame_pointer_needed)
3506 offset2 += cfun->machine->frame.frame_size
3507 - cfun->machine->frame.hard_fp_offset;
3508 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3509 rtx mem2 = gen_frame_mem (mode, addr2);
3510 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3511 : gen_rtx_SET (reg2, mem2);
3513 if (prologue_p)
3514 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3515 else
3516 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3518 RTX_FRAME_RELATED_P (insn) = 1;
3519 if (prologue_p)
3521 add_reg_note (insn, REG_CFA_OFFSET, set);
3522 add_reg_note (insn, REG_CFA_OFFSET, set2);
3524 else
3526 add_reg_note (insn, REG_CFA_RESTORE, reg);
3527 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3530 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3534 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3536 static void
3537 aarch64_emit_prologue_components (sbitmap components)
3539 aarch64_process_components (components, true);
3542 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3544 static void
3545 aarch64_emit_epilogue_components (sbitmap components)
3547 aarch64_process_components (components, false);
3550 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3552 static void
3553 aarch64_set_handled_components (sbitmap components)
3555 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3556 if (bitmap_bit_p (components, regno))
3557 cfun->machine->reg_is_wrapped_separately[regno] = true;
3560 /* AArch64 stack frames generated by this compiler look like:
3562 +-------------------------------+
3564 | incoming stack arguments |
3566 +-------------------------------+
3567 | | <-- incoming stack pointer (aligned)
3568 | callee-allocated save area |
3569 | for register varargs |
3571 +-------------------------------+
3572 | local variables | <-- frame_pointer_rtx
3574 +-------------------------------+
3575 | padding0 | \
3576 +-------------------------------+ |
3577 | callee-saved registers | | frame.saved_regs_size
3578 +-------------------------------+ |
3579 | LR' | |
3580 +-------------------------------+ |
3581 | FP' | / <- hard_frame_pointer_rtx (aligned)
3582 +-------------------------------+
3583 | dynamic allocation |
3584 +-------------------------------+
3585 | padding |
3586 +-------------------------------+
3587 | outgoing stack arguments | <-- arg_pointer
3589 +-------------------------------+
3590 | | <-- stack_pointer_rtx (aligned)
3592 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3593 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3594 unchanged. */
3596 /* Generate the prologue instructions for entry into a function.
3597 Establish the stack frame by decreasing the stack pointer with a
3598 properly calculated size and, if necessary, create a frame record
3599 filled with the values of LR and previous frame pointer. The
3600 current FP is also set up if it is in use. */
3602 void
3603 aarch64_expand_prologue (void)
3605 aarch64_layout_frame ();
3607 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3608 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3609 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3610 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3611 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3612 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3613 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3614 rtx_insn *insn;
3616 /* Sign return address for functions. */
3617 if (aarch64_return_address_signing_enabled ())
3619 insn = emit_insn (gen_pacisp ());
3620 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3621 RTX_FRAME_RELATED_P (insn) = 1;
3624 if (flag_stack_usage_info)
3625 current_function_static_stack_size = frame_size;
3627 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3629 if (crtl->is_leaf && !cfun->calls_alloca)
3631 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3632 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3633 frame_size - STACK_CHECK_PROTECT);
3635 else if (frame_size > 0)
3636 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3639 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3641 if (callee_adjust != 0)
3642 aarch64_push_regs (reg1, reg2, callee_adjust);
3644 if (frame_pointer_needed)
3646 if (callee_adjust == 0)
3647 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3648 R30_REGNUM, false);
3649 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3650 stack_pointer_rtx,
3651 GEN_INT (callee_offset)));
3652 RTX_FRAME_RELATED_P (insn) = 1;
3653 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3656 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3657 callee_adjust != 0 || frame_pointer_needed);
3658 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3659 callee_adjust != 0 || frame_pointer_needed);
3660 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
3663 /* Return TRUE if we can use a simple_return insn.
3665 This function checks whether the callee-saved stack area is empty, which
3666 means no restore actions are needed. The pro_and_epilogue pass will use
3667 this to check whether the shrink-wrapping optimization is feasible. */
3669 bool
3670 aarch64_use_return_insn_p (void)
3672 if (!reload_completed)
3673 return false;
3675 if (crtl->profile)
3676 return false;
3678 aarch64_layout_frame ();
3680 return cfun->machine->frame.frame_size == 0;
3683 /* Generate the epilogue instructions for returning from a function.
3684 This is almost exactly the reverse of the prologue sequence, except
3685 that we need to insert barriers to avoid scheduling loads that read
3686 from a deallocated stack, and we optimize the unwind records by
3687 emitting them all together if possible. */
3688 void
3689 aarch64_expand_epilogue (bool for_sibcall)
3691 aarch64_layout_frame ();
3693 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3694 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3695 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3696 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3697 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3698 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3699 rtx cfi_ops = NULL;
3700 rtx_insn *insn;
3702 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3703 bool need_barrier_p = (get_frame_size ()
3704 + cfun->machine->frame.saved_varargs_size) != 0;
3706 /* Emit a barrier to prevent loads from a deallocated stack. */
3707 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3708 || crtl->calls_eh_return)
3710 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3711 need_barrier_p = false;
3714 /* Restore the stack pointer from the frame pointer if it may not
3715 be the same as the stack pointer. */
3716 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3718 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3719 hard_frame_pointer_rtx,
3720 GEN_INT (-callee_offset)));
3721 /* If writeback is used when restoring callee-saves, the CFA
3722 is restored on the instruction doing the writeback. */
3723 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3725 else
3726 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3728 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3729 callee_adjust != 0, &cfi_ops);
3730 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3731 callee_adjust != 0, &cfi_ops);
3733 if (need_barrier_p)
3734 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3736 if (callee_adjust != 0)
3737 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3739 if (callee_adjust != 0 || initial_adjust > 65536)
3741 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3742 insn = get_last_insn ();
3743 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3744 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3745 RTX_FRAME_RELATED_P (insn) = 1;
3746 cfi_ops = NULL;
3749 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3751 if (cfi_ops)
3753 /* Emit delayed restores and reset the CFA to be SP. */
3754 insn = get_last_insn ();
3755 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3756 REG_NOTES (insn) = cfi_ops;
3757 RTX_FRAME_RELATED_P (insn) = 1;
3760 /* We prefer to emit the combined return/authenticate instruction RETAA,
3761 however there are three cases in which we must instead emit an explicit
3762 authentication instruction.
3764 1) Sibcalls don't return in a normal way, so if we're about to call one
3765 we must authenticate.
3767 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3768 generating code for !TARGET_ARMV8_3 we can't use it and must
3769 explicitly authenticate.
3771 3) On an eh_return path we make extra stack adjustments to update the
3772 canonical frame address to be the exception handler's CFA. We want
3773 to authenticate using the CFA of the function which calls eh_return. */
3775 if (aarch64_return_address_signing_enabled ()
3776 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3778 insn = emit_insn (gen_autisp ());
3779 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3780 RTX_FRAME_RELATED_P (insn) = 1;
3783 /* Stack adjustment for exception handler. */
3784 if (crtl->calls_eh_return)
3786 /* We need to unwind the stack by the offset computed by
3787 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3788 to be SP; letting the CFA move during this adjustment
3789 is just as correct as retaining the CFA from the body
3790 of the function. Therefore, do nothing special. */
3791 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3794 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3795 if (!for_sibcall)
3796 emit_jump_insn (ret_rtx);
3799 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3800 normally or return to a previous frame after unwinding.
3802 An EH return uses a single shared return sequence. The epilogue is
3803 exactly like a normal epilogue except that it has an extra input
3804 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3805 that must be applied after the frame has been destroyed. An extra label
3806 is inserted before the epilogue which initializes this register to zero,
3807 and this is the entry point for a normal return.
3809 An actual EH return updates the return address, initializes the stack
3810 adjustment and jumps directly into the epilogue (bypassing the zeroing
3811 of the adjustment). Since the return address is typically saved on the
3812 stack when a function makes a call, the saved LR must be updated outside
3813 the epilogue.
3815 This poses problems as the store is generated well before the epilogue,
3816 so the offset of LR is not known yet. Also optimizations will remove the
3817 store as it appears dead, even after the epilogue is generated (as the
3818 base or offset for loading LR is different in many cases).
3820 To avoid these problems this implementation forces the frame pointer
3821 in eh_return functions so that the location of LR is fixed and known early.
3822 It also marks the store volatile, so no optimization is permitted to
3823 remove the store. */
3825 aarch64_eh_return_handler_rtx (void)
3827 rtx tmp = gen_frame_mem (Pmode,
3828 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3830 /* Mark the store volatile, so no optimization is permitted to remove it. */
3831 MEM_VOLATILE_P (tmp) = true;
3832 return tmp;
3835 /* Output code to add DELTA to the first argument, and then jump
3836 to FUNCTION. Used for C++ multiple inheritance. */
3837 static void
3838 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3839 HOST_WIDE_INT delta,
3840 HOST_WIDE_INT vcall_offset,
3841 tree function)
3843 /* The this pointer is always in x0. Note that this differs from
3844 Arm where the this pointer may be bumped to r1 if r0 is required
3845 to return a pointer to an aggregate. On AArch64 a result value
3846 pointer will be in x8. */
3847 int this_regno = R0_REGNUM;
3848 rtx this_rtx, temp0, temp1, addr, funexp;
3849 rtx_insn *insn;
3851 reload_completed = 1;
3852 emit_note (NOTE_INSN_PROLOGUE_END);
3854 if (vcall_offset == 0)
3855 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3856 else
3858 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3860 this_rtx = gen_rtx_REG (Pmode, this_regno);
3861 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3862 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3864 addr = this_rtx;
3865 if (delta != 0)
3867 if (delta >= -256 && delta < 256)
3868 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3869 plus_constant (Pmode, this_rtx, delta));
3870 else
3871 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3874 if (Pmode == ptr_mode)
3875 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3876 else
3877 aarch64_emit_move (temp0,
3878 gen_rtx_ZERO_EXTEND (Pmode,
3879 gen_rtx_MEM (ptr_mode, addr)));
3881 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3882 addr = plus_constant (Pmode, temp0, vcall_offset);
3883 else
3885 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3886 Pmode);
3887 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3890 if (Pmode == ptr_mode)
3891 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode,addr));
3892 else
3893 aarch64_emit_move (temp1,
3894 gen_rtx_SIGN_EXTEND (Pmode,
3895 gen_rtx_MEM (ptr_mode, addr)));
3897 emit_insn (gen_add2_insn (this_rtx, temp1));
3900 /* Generate a tail call to the target function. */
3901 if (!TREE_USED (function))
3903 assemble_external (function);
3904 TREE_USED (function) = 1;
3906 funexp = XEXP (DECL_RTL (function), 0);
3907 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3908 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3909 SIBLING_CALL_P (insn) = 1;
3911 insn = get_insns ();
3912 shorten_branches (insn);
3913 final_start_function (insn, file, 1);
3914 final (insn, file, 1);
3915 final_end_function ();
3917 /* Stop pretending to be a post-reload pass. */
3918 reload_completed = 0;
3921 static bool
3922 aarch64_tls_referenced_p (rtx x)
3924 if (!TARGET_HAVE_TLS)
3925 return false;
3926 subrtx_iterator::array_type array;
3927 FOR_EACH_SUBRTX (iter, array, x, ALL)
3929 const_rtx x = *iter;
3930 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3931 return true;
3932 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3933 TLS offsets, not real symbol references. */
3934 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3935 iter.skip_subrtxes ();
3937 return false;
3941 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3942 a left shift of 0 or 12 bits. */
3943 bool
3944 aarch64_uimm12_shift (HOST_WIDE_INT val)
3946 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3947 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val
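As a worked illustration (not from the source): the two accepted forms above are the ADD/SUB immediate encodings, a 12-bit value optionally shifted left by 12 bits. So 0xabc (low field only) and 0xabc000 (high field only) both pass, while 0x1001 fails because its set bits straddle the two fields and would, for example, need two separate ADDs.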
3952 /* Return true if val is an immediate that can be loaded into a
3953 register by a MOVZ instruction. */
3954 static bool
3955 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3957 if (GET_MODE_SIZE (mode) > 4)
3959 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3960 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3961 return 1;
3963 else
3965 /* Ignore sign extension. */
3966 val &= (HOST_WIDE_INT) 0xffffffff;
3968 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3969 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
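A brief illustration (not from the source): aarch64_movw_imm accepts any value whose set bits all fall inside a single 16-bit field at bit 0, 16, 32 or 48, which is what one MOVZ can materialise. For instance 0x12340000 is accepted (the field at bits 16-31), while 0x12345 is rejected because its bits span two fields. For sub-DImode values the sign-extended copy is first masked to 32 bits, so only the two low field positions are considered.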
3972 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3974 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3976 0x0000000100000001ull,
3977 0x0001000100010001ull,
3978 0x0101010101010101ull,
3979 0x1111111111111111ull,
3980 0x5555555555555555ull,
3984 /* Return true if val is a valid bitmask immediate. */
3986 bool
3987 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3989 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3990 int bits;
3992 /* Check for a single sequence of one bits and return quickly if so.
3993 The special cases of all ones and all zeroes return false. */
3994 val = (unsigned HOST_WIDE_INT) val_in;
3995 tmp = val + (val & -val);
3997 if (tmp == (tmp & -tmp))
3998 return (val + 1) > 1;
4000 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
4001 if (mode == SImode)
4002 val = (val << 32) | (val & 0xffffffff);
4004 /* Invert if the immediate doesn't start with a zero bit - this means we
4005 only need to search for sequences of one bits. */
4006 if (val & 1)
4007 val = ~val;
4009 /* Find the first set bit and set tmp to val with the first sequence of one
4010 bits removed. Return success if there is a single sequence of ones. */
4011 first_one = val & -val;
4012 tmp = val & (val + first_one);
4014 if (tmp == 0)
4015 return true;
4017 /* Find the next set bit and compute the difference in bit position. */
4018 next_one = tmp & -tmp;
4019 bits = clz_hwi (first_one) - clz_hwi (next_one);
4020 mask = val ^ tmp;
4022 /* Check the bit position difference is a power of 2, and that the first
4023 sequence of one bits fits within 'bits' bits. */
4024 if ((mask >> bits) != 0 || bits != (bits & -bits))
4025 return false;
4027 /* Check the sequence of one bits is repeated 64/bits times. */
4028 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
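The check above is an O(1) test; as a cross-check, here is a minimal standalone brute-force sketch (illustrative only, not part of GCC) of the same set of DImode values, built directly from the architectural definition of a logical immediate: a run of 1 <= S < E ones, rotated within an element of E in {2, 4, 8, 16, 32, 64} bits and replicated across the register.

#include <stdbool.h>
#include <stdint.h>

/* Return true if VAL is an AArch64 logical (bitmask) immediate, by brute
   force over element size, run length and rotation.  */
static bool
is_logical_imm64 (uint64_t val)
{
  for (unsigned esize = 2; esize <= 64; esize *= 2)
    {
      uint64_t emask = esize == 64 ? ~UINT64_C (0)
                                   : (UINT64_C (1) << esize) - 1;
      for (unsigned ones = 1; ones < esize; ones++)
        for (unsigned rot = 0; rot < esize; rot++)
          {
            /* A run of ONES ones, rotated right by ROT within the element.  */
            uint64_t elt = (UINT64_C (1) << ones) - 1;
            elt = ((elt >> rot) | (elt << ((esize - rot) & 63))) & emask;
            /* Replicate the element across all 64 bits and compare.  */
            uint64_t rep = 0;
            for (unsigned i = 0; i < 64; i += esize)
              rep |= elt << i;
            if (rep == val)
              return true;
          }
    }
  return false;
}

For example, is_logical_imm64 (0x0f0f0f0f0f0f0f0f) is true (a width-8 element of 0x0f), while 0, ~0 and 0x12345678 are all false, matching what aarch64_bitmask_imm above accepts for DImode.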
4031 /* Create mask of ones, covering the lowest to highest bits set in VAL_IN.
4032 Assumed precondition: VAL_IN is not zero. */
4034 unsigned HOST_WIDE_INT
4035 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
4037 int lowest_bit_set = ctz_hwi (val_in);
4038 int highest_bit_set = floor_log2 (val_in);
4039 gcc_assert (val_in != 0);
4041 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
4042 (HOST_WIDE_INT_1U << lowest_bit_set));
4045 /* Create a constant in which every bit outside the lowest-set-bit to
4046 highest-set-bit range of VAL_IN is set to 1. */
4048 unsigned HOST_WIDE_INT
4049 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4051 return val_in | ~aarch64_and_split_imm1 (val_in);
4054 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4056 bool
4057 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4059 if (aarch64_bitmask_imm (val_in, mode))
4060 return false;
4062 if (aarch64_move_imm (val_in, mode))
4063 return false;
4065 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4067 return aarch64_bitmask_imm (imm2, mode);
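A worked example (illustrative): VAL_IN = 0x00ff00f0 is neither a bitmask immediate (it has two separate runs of ones) nor a MOV immediate. aarch64_and_split_imm1 gives imm1 = 0x0000000000fffff0, the single run covering bits 4 through 23, and aarch64_and_split_imm2 gives imm2 = val | ~imm1 = 0xffffffffffff00ff. Both are valid bitmask immediates, and imm1 & imm2 == 0x00ff00f0, so x & 0x00ff00f0 can be performed as two immediate ANDs, (x & imm1) & imm2, which is what this predicate enables.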
4070 /* Return true if val is an immediate that can be loaded into a
4071 register in a single instruction. */
4072 bool
4073 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4075 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4076 return 1;
4077 return aarch64_bitmask_imm (val, mode);
4080 static bool
4081 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4083 rtx base, offset;
4085 if (GET_CODE (x) == HIGH)
4086 return true;
4088 split_const (x, &base, &offset);
4089 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4091 if (aarch64_classify_symbol (base, offset)
4092 != SYMBOL_FORCE_TO_MEM)
4093 return true;
4094 else
4095 /* Avoid generating a 64-bit relocation in ILP32; leave it
4096 to aarch64_expand_mov_immediate to handle properly. */
4097 return mode != ptr_mode;
4100 return aarch64_tls_referenced_p (x);
4103 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4104 The expansion for a table switch is quite expensive due to the number
4105 of instructions, the table lookup and the hard-to-predict indirect jump.
4106 When optimizing for speed with -O3 enabled, use the per-core tuning if
4107 it is set; otherwise use tables for more than 16 cases as a tradeoff
4108 between size and performance. When optimizing for size, use the default setting. */
4110 static unsigned int
4111 aarch64_case_values_threshold (void)
4113 /* Use the specified limit for the number of cases before using jump
4114 tables at higher optimization levels. */
4115 if (optimize > 2
4116 && selected_cpu->tune->max_case_values != 0)
4117 return selected_cpu->tune->max_case_values;
4118 else
4119 return optimize_size ? default_case_values_threshold () : 17;
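In practical terms (an aside, not from the source): the value returned here is the smallest number of case values for which a jump table is preferred over a chain of conditional branches, so at -O3 on a core without a tuned max_case_values, and when not optimizing for size, a switch needs 17 or more distinct cases before it is expanded through a table.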
4122 /* Return true if register REGNO is a valid index register.
4123 STRICT_P is true if REG_OK_STRICT is in effect. */
4125 bool
4126 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4128 if (!HARD_REGISTER_NUM_P (regno))
4130 if (!strict_p)
4131 return true;
4133 if (!reg_renumber)
4134 return false;
4136 regno = reg_renumber[regno];
4138 return GP_REGNUM_P (regno);
4141 /* Return true if register REGNO is a valid base register for mode MODE.
4142 STRICT_P is true if REG_OK_STRICT is in effect. */
4144 bool
4145 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4147 if (!HARD_REGISTER_NUM_P (regno))
4149 if (!strict_p)
4150 return true;
4152 if (!reg_renumber)
4153 return false;
4155 regno = reg_renumber[regno];
4158 /* The fake registers will be eliminated to either the stack or
4159 hard frame pointer, both of which are usually valid base registers.
4160 Reload deals with the cases where the eliminated form isn't valid. */
4161 return (GP_REGNUM_P (regno)
4162 || regno == SP_REGNUM
4163 || regno == FRAME_POINTER_REGNUM
4164 || regno == ARG_POINTER_REGNUM);
4167 /* Return true if X is a valid base register for mode MODE.
4168 STRICT_P is true if REG_OK_STRICT is in effect. */
4170 static bool
4171 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4173 if (!strict_p && GET_CODE (x) == SUBREG)
4174 x = SUBREG_REG (x);
4176 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4179 /* Return true if address offset is a valid index. If it is, fill in INFO
4180 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4182 static bool
4183 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4184 machine_mode mode, bool strict_p)
4186 enum aarch64_address_type type;
4187 rtx index;
4188 int shift;
4190 /* (reg:P) */
4191 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4192 && GET_MODE (x) == Pmode)
4194 type = ADDRESS_REG_REG;
4195 index = x;
4196 shift = 0;
4198 /* (sign_extend:DI (reg:SI)) */
4199 else if ((GET_CODE (x) == SIGN_EXTEND
4200 || GET_CODE (x) == ZERO_EXTEND)
4201 && GET_MODE (x) == DImode
4202 && GET_MODE (XEXP (x, 0)) == SImode)
4204 type = (GET_CODE (x) == SIGN_EXTEND)
4205 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4206 index = XEXP (x, 0);
4207 shift = 0;
4209 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4210 else if (GET_CODE (x) == MULT
4211 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4212 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4213 && GET_MODE (XEXP (x, 0)) == DImode
4214 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4215 && CONST_INT_P (XEXP (x, 1)))
4217 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4218 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4219 index = XEXP (XEXP (x, 0), 0);
4220 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4222 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4223 else if (GET_CODE (x) == ASHIFT
4224 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4225 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4226 && GET_MODE (XEXP (x, 0)) == DImode
4227 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4228 && CONST_INT_P (XEXP (x, 1)))
4230 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4231 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4232 index = XEXP (XEXP (x, 0), 0);
4233 shift = INTVAL (XEXP (x, 1));
4235 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4236 else if ((GET_CODE (x) == SIGN_EXTRACT
4237 || GET_CODE (x) == ZERO_EXTRACT)
4238 && GET_MODE (x) == DImode
4239 && GET_CODE (XEXP (x, 0)) == MULT
4240 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4241 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4243 type = (GET_CODE (x) == SIGN_EXTRACT)
4244 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4245 index = XEXP (XEXP (x, 0), 0);
4246 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4247 if (INTVAL (XEXP (x, 1)) != 32 + shift
4248 || INTVAL (XEXP (x, 2)) != 0)
4249 shift = -1;
4251 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4252 (const_int 0xffffffff<<shift)) */
4253 else if (GET_CODE (x) == AND
4254 && GET_MODE (x) == DImode
4255 && GET_CODE (XEXP (x, 0)) == MULT
4256 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4257 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4258 && CONST_INT_P (XEXP (x, 1)))
4260 type = ADDRESS_REG_UXTW;
4261 index = XEXP (XEXP (x, 0), 0);
4262 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4263 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4264 shift = -1;
4266 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4267 else if ((GET_CODE (x) == SIGN_EXTRACT
4268 || GET_CODE (x) == ZERO_EXTRACT)
4269 && GET_MODE (x) == DImode
4270 && GET_CODE (XEXP (x, 0)) == ASHIFT
4271 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4272 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4274 type = (GET_CODE (x) == SIGN_EXTRACT)
4275 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4276 index = XEXP (XEXP (x, 0), 0);
4277 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4278 if (INTVAL (XEXP (x, 1)) != 32 + shift
4279 || INTVAL (XEXP (x, 2)) != 0)
4280 shift = -1;
4282 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4283 (const_int 0xffffffff<<shift)) */
4284 else if (GET_CODE (x) == AND
4285 && GET_MODE (x) == DImode
4286 && GET_CODE (XEXP (x, 0)) == ASHIFT
4287 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4288 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4289 && CONST_INT_P (XEXP (x, 1)))
4291 type = ADDRESS_REG_UXTW;
4292 index = XEXP (XEXP (x, 0), 0);
4293 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4294 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4295 shift = -1;
4297 /* (mult:P (reg:P) (const_int scale)) */
4298 else if (GET_CODE (x) == MULT
4299 && GET_MODE (x) == Pmode
4300 && GET_MODE (XEXP (x, 0)) == Pmode
4301 && CONST_INT_P (XEXP (x, 1)))
4303 type = ADDRESS_REG_REG;
4304 index = XEXP (x, 0);
4305 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4307 /* (ashift:P (reg:P) (const_int shift)) */
4308 else if (GET_CODE (x) == ASHIFT
4309 && GET_MODE (x) == Pmode
4310 && GET_MODE (XEXP (x, 0)) == Pmode
4311 && CONST_INT_P (XEXP (x, 1)))
4313 type = ADDRESS_REG_REG;
4314 index = XEXP (x, 0);
4315 shift = INTVAL (XEXP (x, 1));
4317 else
4318 return false;
4320 if (GET_CODE (index) == SUBREG)
4321 index = SUBREG_REG (index);
4323 if ((shift == 0 ||
4324 (shift > 0 && shift <= 3
4325 && (1 << shift) == GET_MODE_SIZE (mode)))
4326 && REG_P (index)
4327 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4329 info->type = type;
4330 info->offset = index;
4331 info->shift = shift;
4332 return true;
4335 return false;
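A note on the final check (illustrative): the scale is only accepted when (1 << shift) equals the access size, so e.g. a DImode access allows shift 0 or 3, matching the [base, Xm] and [base, Xm, lsl #3] (or sxtw/uxtw #3) register-offset forms, whereas a shift of 2 for a DImode access is rejected and must be synthesised separately.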
4338 /* Return true if MODE is one of the modes for which we
4339 support LDP/STP operations. */
4341 static bool
4342 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4344 return mode == SImode || mode == DImode
4345 || mode == SFmode || mode == DFmode
4346 || (aarch64_vector_mode_supported_p (mode)
4347 && GET_MODE_SIZE (mode) == 8);
4350 /* Return true if REGNO is a virtual pointer register, or an eliminable
4351 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4352 include stack_pointer or hard_frame_pointer. */
4353 static bool
4354 virt_or_elim_regno_p (unsigned regno)
4356 return ((regno >= FIRST_VIRTUAL_REGISTER
4357 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4358 || regno == FRAME_POINTER_REGNUM
4359 || regno == ARG_POINTER_REGNUM);
4362 /* Return true if X is a valid address for machine mode MODE. If it is,
4363 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4364 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4366 static bool
4367 aarch64_classify_address (struct aarch64_address_info *info,
4368 rtx x, machine_mode mode,
4369 RTX_CODE outer_code, bool strict_p)
4371 enum rtx_code code = GET_CODE (x);
4372 rtx op0, op1;
4374 /* On BE, we use load/store pair for all large int mode load/stores.
4375 TI/TFmode may also use a load/store pair. */
4376 bool load_store_pair_p = (outer_code == PARALLEL
4377 || mode == TImode
4378 || mode == TFmode
4379 || (BYTES_BIG_ENDIAN
4380 && aarch64_vect_struct_mode_p (mode)));
4382 bool allow_reg_index_p =
4383 !load_store_pair_p
4384 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4385 && !aarch64_vect_struct_mode_p (mode);
4387 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4388 REG addressing. */
4389 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4390 && (code != POST_INC && code != REG))
4391 return false;
4393 switch (code)
4395 case REG:
4396 case SUBREG:
4397 info->type = ADDRESS_REG_IMM;
4398 info->base = x;
4399 info->offset = const0_rtx;
4400 return aarch64_base_register_rtx_p (x, strict_p);
4402 case PLUS:
4403 op0 = XEXP (x, 0);
4404 op1 = XEXP (x, 1);
4406 if (! strict_p
4407 && REG_P (op0)
4408 && virt_or_elim_regno_p (REGNO (op0))
4409 && CONST_INT_P (op1))
4411 info->type = ADDRESS_REG_IMM;
4412 info->base = op0;
4413 info->offset = op1;
4415 return true;
4418 if (GET_MODE_SIZE (mode) != 0
4419 && CONST_INT_P (op1)
4420 && aarch64_base_register_rtx_p (op0, strict_p))
4422 HOST_WIDE_INT offset = INTVAL (op1);
4424 info->type = ADDRESS_REG_IMM;
4425 info->base = op0;
4426 info->offset = op1;
4428 /* TImode and TFmode values are allowed in both pairs of X
4429 registers and individual Q registers. The available
4430 address modes are:
4431 X,X: 7-bit signed scaled offset
4432 Q: 9-bit signed offset
4433 We conservatively require an offset representable in either mode.
4434 When performing the check for pairs of X registers i.e. LDP/STP
4435 pass down DImode since that is the natural size of the LDP/STP
4436 instruction memory accesses. */
4437 if (mode == TImode || mode == TFmode)
4438 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4439 && (offset_9bit_signed_unscaled_p (mode, offset)
4440 || offset_12bit_unsigned_scaled_p (mode, offset)));
4442 /* A 7-bit offset check because OImode will emit an ldp/stp
4443 instruction (only big endian will get here).
4444 For ldp/stp instructions, the offset is scaled for the size of a
4445 single element of the pair. */
4446 if (mode == OImode)
4447 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4449 /* Three 9/12-bit offset checks because CImode will emit three
4450 ldr/str instructions (only big endian will get here). */
4451 if (mode == CImode)
4452 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4453 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4454 || offset_12bit_unsigned_scaled_p (V16QImode,
4455 offset + 32)));
4457 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4458 instructions (only big endian will get here). */
4459 if (mode == XImode)
4460 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4461 && aarch64_offset_7bit_signed_scaled_p (TImode,
4462 offset + 32));
4464 if (load_store_pair_p)
4465 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4466 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4467 else
4468 return (offset_9bit_signed_unscaled_p (mode, offset)
4469 || offset_12bit_unsigned_scaled_p (mode, offset));
4472 if (allow_reg_index_p)
4474 /* Look for base + (scaled/extended) index register. */
4475 if (aarch64_base_register_rtx_p (op0, strict_p)
4476 && aarch64_classify_index (info, op1, mode, strict_p))
4478 info->base = op0;
4479 return true;
4481 if (aarch64_base_register_rtx_p (op1, strict_p)
4482 && aarch64_classify_index (info, op0, mode, strict_p))
4484 info->base = op1;
4485 return true;
4489 return false;
4491 case POST_INC:
4492 case POST_DEC:
4493 case PRE_INC:
4494 case PRE_DEC:
4495 info->type = ADDRESS_REG_WB;
4496 info->base = XEXP (x, 0);
4497 info->offset = NULL_RTX;
4498 return aarch64_base_register_rtx_p (info->base, strict_p);
4500 case POST_MODIFY:
4501 case PRE_MODIFY:
4502 info->type = ADDRESS_REG_WB;
4503 info->base = XEXP (x, 0);
4504 if (GET_CODE (XEXP (x, 1)) == PLUS
4505 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4506 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4507 && aarch64_base_register_rtx_p (info->base, strict_p))
4509 HOST_WIDE_INT offset;
4510 info->offset = XEXP (XEXP (x, 1), 1);
4511 offset = INTVAL (info->offset);
4513 /* TImode and TFmode values are allowed in both pairs of X
4514 registers and individual Q registers. The available
4515 address modes are:
4516 X,X: 7-bit signed scaled offset
4517 Q: 9-bit signed offset
4518 We conservatively require an offset representable in either mode.
4520 if (mode == TImode || mode == TFmode)
4521 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4522 && offset_9bit_signed_unscaled_p (mode, offset));
4524 if (load_store_pair_p)
4525 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4526 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4527 else
4528 return offset_9bit_signed_unscaled_p (mode, offset);
4530 return false;
4532 case CONST:
4533 case SYMBOL_REF:
4534 case LABEL_REF:
4535 /* load literal: pc-relative constant pool entry. Only supported
4536 for SI mode or larger. */
4537 info->type = ADDRESS_SYMBOLIC;
4539 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4541 rtx sym, addend;
4543 split_const (x, &sym, &addend);
4544 return ((GET_CODE (sym) == LABEL_REF
4545 || (GET_CODE (sym) == SYMBOL_REF
4546 && CONSTANT_POOL_ADDRESS_P (sym)
4547 && aarch64_pcrelative_literal_loads)));
4549 return false;
4551 case LO_SUM:
4552 info->type = ADDRESS_LO_SUM;
4553 info->base = XEXP (x, 0);
4554 info->offset = XEXP (x, 1);
4555 if (allow_reg_index_p
4556 && aarch64_base_register_rtx_p (info->base, strict_p))
4558 rtx sym, offs;
4559 split_const (info->offset, &sym, &offs);
4560 if (GET_CODE (sym) == SYMBOL_REF
4561 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4563 /* The symbol and offset must be aligned to the access size. */
4564 unsigned int align;
4565 unsigned int ref_size;
4567 if (CONSTANT_POOL_ADDRESS_P (sym))
4568 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4569 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4571 tree exp = SYMBOL_REF_DECL (sym);
4572 align = TYPE_ALIGN (TREE_TYPE (exp));
4573 align = CONSTANT_ALIGNMENT (exp, align);
4575 else if (SYMBOL_REF_DECL (sym))
4576 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4577 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4578 && SYMBOL_REF_BLOCK (sym) != NULL)
4579 align = SYMBOL_REF_BLOCK (sym)->alignment;
4580 else
4581 align = BITS_PER_UNIT;
4583 ref_size = GET_MODE_SIZE (mode);
4584 if (ref_size == 0)
4585 ref_size = GET_MODE_SIZE (DImode);
4587 return ((INTVAL (offs) & (ref_size - 1)) == 0
4588 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4591 return false;
4593 default:
4594 return false;
4598 bool
4599 aarch64_symbolic_address_p (rtx x)
4601 rtx offset;
4603 split_const (x, &x, &offset);
4604 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4607 /* Classify the base of symbolic expression X. */
4609 enum aarch64_symbol_type
4610 aarch64_classify_symbolic_expression (rtx x)
4612 rtx offset;
4614 split_const (x, &x, &offset);
4615 return aarch64_classify_symbol (x, offset);
4619 /* Return TRUE if X is a legitimate address for accessing memory in
4620 mode MODE. */
4621 static bool
4622 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4624 struct aarch64_address_info addr;
4626 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4629 /* Return TRUE if X is a legitimate address for accessing memory in
4630 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4631 pair operation. */
4632 bool
4633 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4634 RTX_CODE outer_code, bool strict_p)
4636 struct aarch64_address_info addr;
4638 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4641 /* Split an out-of-range address displacement into a base and offset.
4642 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4643 to increase opportunities for sharing the base address of different sizes.
4644 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4645 static bool
4646 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4648 HOST_WIDE_INT offset = INTVAL (*disp);
4649 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4651 if (mode == TImode || mode == TFmode
4652 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4653 base = (offset + 0x100) & ~0x1ff;
4655 *off = GEN_INT (base);
4656 *disp = GEN_INT (offset - base);
4657 return true;
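A worked example (illustrative): for a 4-byte access at displacement 0x12344 the anchor is offset & ~0x3ffc, so *OFF becomes 0x10000 and *DISP becomes 0x2344, which is aligned and fits the scaled 12-bit form; for a misaligned displacement such as 0x12345 the 9-bit branch is used instead, giving *OFF = 0x12400 and *DISP = -0xbb, inside the signed 9-bit range.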
4660 /* Return TRUE if rtx X is immediate constant 0.0 */
4661 bool
4662 aarch64_float_const_zero_rtx_p (rtx x)
4664 if (GET_MODE (x) == VOIDmode)
4665 return false;
4667 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4668 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4669 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4672 /* Return the fixed registers used for condition codes. */
4674 static bool
4675 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4677 *p1 = CC_REGNUM;
4678 *p2 = INVALID_REGNUM;
4679 return true;
4682 /* Emit call insn with PAT and do aarch64-specific handling. */
4684 void
4685 aarch64_emit_call_insn (rtx pat)
4687 rtx insn = emit_call_insn (pat);
4689 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4690 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4691 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4694 machine_mode
4695 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4697 /* All floating point compares return CCFP if it is an equality
4698 comparison, and CCFPE otherwise. */
4699 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4701 switch (code)
4703 case EQ:
4704 case NE:
4705 case UNORDERED:
4706 case ORDERED:
4707 case UNLT:
4708 case UNLE:
4709 case UNGT:
4710 case UNGE:
4711 case UNEQ:
4712 case LTGT:
4713 return CCFPmode;
4715 case LT:
4716 case LE:
4717 case GT:
4718 case GE:
4719 return CCFPEmode;
4721 default:
4722 gcc_unreachable ();
4726 /* Equality comparisons of short modes against zero can be performed
4727 using the TST instruction with the appropriate bitmask. */
4728 if (y == const0_rtx && REG_P (x)
4729 && (code == EQ || code == NE)
4730 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4731 return CC_NZmode;
4733 /* Similarly, comparisons of zero_extends from shorter modes can
4734 be performed using an ANDS with an immediate mask. */
4735 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4736 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4737 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4738 && (code == EQ || code == NE))
4739 return CC_NZmode;
4741 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4742 && y == const0_rtx
4743 && (code == EQ || code == NE || code == LT || code == GE)
4744 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4745 || GET_CODE (x) == NEG
4746 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4747 && CONST_INT_P (XEXP (x, 2)))))
4748 return CC_NZmode;
4750 /* A compare with a shifted operand. Because of canonicalization,
4751 the comparison will have to be swapped when we emit the assembly
4752 code. */
4753 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4754 && (REG_P (y) || GET_CODE (y) == SUBREG)
4755 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4756 || GET_CODE (x) == LSHIFTRT
4757 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4758 return CC_SWPmode;
4760 /* Similarly for a negated operand, but we can only do this for
4761 equalities. */
4762 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4763 && (REG_P (y) || GET_CODE (y) == SUBREG)
4764 && (code == EQ || code == NE)
4765 && GET_CODE (x) == NEG)
4766 return CC_Zmode;
4768 /* A test for unsigned overflow. */
4769 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4770 && code == NE
4771 && GET_CODE (x) == PLUS
4772 && GET_CODE (y) == ZERO_EXTEND)
4773 return CC_Cmode;
4775 /* For everything else, return CCmode. */
4776 return CCmode;
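For example (an aside): comparing a QImode register against zero for equality selects CC_NZmode via the first special case above, and the comparison can then be implemented as a TST with the 0xff bitmask rather than a full-width compare, as the earlier comment notes.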
4779 static int
4780 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4783 aarch64_get_condition_code (rtx x)
4785 machine_mode mode = GET_MODE (XEXP (x, 0));
4786 enum rtx_code comp_code = GET_CODE (x);
4788 if (GET_MODE_CLASS (mode) != MODE_CC)
4789 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4790 return aarch64_get_condition_code_1 (mode, comp_code);
4793 static int
4794 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4796 switch (mode)
4798 case CCFPmode:
4799 case CCFPEmode:
4800 switch (comp_code)
4802 case GE: return AARCH64_GE;
4803 case GT: return AARCH64_GT;
4804 case LE: return AARCH64_LS;
4805 case LT: return AARCH64_MI;
4806 case NE: return AARCH64_NE;
4807 case EQ: return AARCH64_EQ;
4808 case ORDERED: return AARCH64_VC;
4809 case UNORDERED: return AARCH64_VS;
4810 case UNLT: return AARCH64_LT;
4811 case UNLE: return AARCH64_LE;
4812 case UNGT: return AARCH64_HI;
4813 case UNGE: return AARCH64_PL;
4814 default: return -1;
4816 break;
4818 case CCmode:
4819 switch (comp_code)
4821 case NE: return AARCH64_NE;
4822 case EQ: return AARCH64_EQ;
4823 case GE: return AARCH64_GE;
4824 case GT: return AARCH64_GT;
4825 case LE: return AARCH64_LE;
4826 case LT: return AARCH64_LT;
4827 case GEU: return AARCH64_CS;
4828 case GTU: return AARCH64_HI;
4829 case LEU: return AARCH64_LS;
4830 case LTU: return AARCH64_CC;
4831 default: return -1;
4833 break;
4835 case CC_SWPmode:
4836 switch (comp_code)
4838 case NE: return AARCH64_NE;
4839 case EQ: return AARCH64_EQ;
4840 case GE: return AARCH64_LE;
4841 case GT: return AARCH64_LT;
4842 case LE: return AARCH64_GE;
4843 case LT: return AARCH64_GT;
4844 case GEU: return AARCH64_LS;
4845 case GTU: return AARCH64_CC;
4846 case LEU: return AARCH64_CS;
4847 case LTU: return AARCH64_HI;
4848 default: return -1;
4850 break;
4852 case CC_NZmode:
4853 switch (comp_code)
4855 case NE: return AARCH64_NE;
4856 case EQ: return AARCH64_EQ;
4857 case GE: return AARCH64_PL;
4858 case LT: return AARCH64_MI;
4859 default: return -1;
4861 break;
4863 case CC_Zmode:
4864 switch (comp_code)
4866 case NE: return AARCH64_NE;
4867 case EQ: return AARCH64_EQ;
4868 default: return -1;
4870 break;
4872 case CC_Cmode:
4873 switch (comp_code)
4875 case NE: return AARCH64_CS;
4876 case EQ: return AARCH64_CC;
4877 default: return -1;
4879 break;
4881 default:
4882 return -1;
4885 return -1;
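A note on CC_SWPmode (illustrative): its rows are the CCmode rows with the operand order reversed. When the first RTL operand is a shifted register, aarch64_select_cc_mode returns CC_SWPmode because the emitted compare places the plain register first and the shifted operand second, so an RTL GE must be tested as LE on the flags, which is exactly the mapping in the table above.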
4888 bool
4889 aarch64_const_vec_all_same_in_range_p (rtx x,
4890 HOST_WIDE_INT minval,
4891 HOST_WIDE_INT maxval)
4893 HOST_WIDE_INT firstval;
4894 int count, i;
4896 if (GET_CODE (x) != CONST_VECTOR
4897 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4898 return false;
4900 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4901 if (firstval < minval || firstval > maxval)
4902 return false;
4904 count = CONST_VECTOR_NUNITS (x);
4905 for (i = 1; i < count; i++)
4906 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4907 return false;
4909 return true;
4912 bool
4913 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4915 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4919 /* N Z C V. */
4920 #define AARCH64_CC_V 1
4921 #define AARCH64_CC_C (1 << 1)
4922 #define AARCH64_CC_Z (1 << 2)
4923 #define AARCH64_CC_N (1 << 3)
4925 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4926 static const int aarch64_nzcv_codes[] =
4928 0, /* EQ, Z == 1. */
4929 AARCH64_CC_Z, /* NE, Z == 0. */
4930 0, /* CS, C == 1. */
4931 AARCH64_CC_C, /* CC, C == 0. */
4932 0, /* MI, N == 1. */
4933 AARCH64_CC_N, /* PL, N == 0. */
4934 0, /* VS, V == 1. */
4935 AARCH64_CC_V, /* VC, V == 0. */
4936 0, /* HI, C == 1 && Z == 0. */
4937 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4938 AARCH64_CC_V, /* GE, N == V. */
4939 0, /* LT, N != V. */
4940 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4941 0, /* LE, !(Z == 0 && N == V). */
4942 0, /* AL, Any. */
4943 0 /* NV, Any. */
4946 static void
4947 aarch64_print_operand (FILE *f, rtx x, int code)
4949 switch (code)
4951 /* An integer or symbol address without a preceding # sign. */
4952 case 'c':
4953 switch (GET_CODE (x))
4955 case CONST_INT:
4956 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4957 break;
4959 case SYMBOL_REF:
4960 output_addr_const (f, x);
4961 break;
4963 case CONST:
4964 if (GET_CODE (XEXP (x, 0)) == PLUS
4965 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4967 output_addr_const (f, x);
4968 break;
4970 /* Fall through. */
4972 default:
4973 output_operand_lossage ("Unsupported operand for code '%c'", code);
4975 break;
4977 case 'e':
4978 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4980 int n;
4982 if (!CONST_INT_P (x)
4983 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4985 output_operand_lossage ("invalid operand for '%%%c'", code);
4986 return;
4989 switch (n)
4991 case 3:
4992 fputc ('b', f);
4993 break;
4994 case 4:
4995 fputc ('h', f);
4996 break;
4997 case 5:
4998 fputc ('w', f);
4999 break;
5000 default:
5001 output_operand_lossage ("invalid operand for '%%%c'", code);
5002 return;
5005 break;
5007 case 'p':
5009 int n;
5011 /* Print N such that 2^N == X. */
5012 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
5014 output_operand_lossage ("invalid operand for '%%%c'", code);
5015 return;
5018 asm_fprintf (f, "%d", n);
5020 break;
5022 case 'P':
5023 /* Print the number of non-zero bits in X (a const_int). */
5024 if (!CONST_INT_P (x))
5026 output_operand_lossage ("invalid operand for '%%%c'", code);
5027 return;
5030 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
5031 break;
5033 case 'H':
5034 /* Print the higher numbered register of a pair (TImode) of regs. */
5035 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
5037 output_operand_lossage ("invalid operand for '%%%c'", code);
5038 return;
5041 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
5042 break;
5044 case 'M':
5045 case 'm':
5047 int cond_code;
5048 /* Print a condition (eq, ne, etc) or its inverse. */
5050 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5051 if (x == const_true_rtx)
5053 if (code == 'M')
5054 fputs ("nv", f);
5055 return;
5058 if (!COMPARISON_P (x))
5060 output_operand_lossage ("invalid operand for '%%%c'", code);
5061 return;
5064 cond_code = aarch64_get_condition_code (x);
5065 gcc_assert (cond_code >= 0);
5066 if (code == 'M')
5067 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5068 fputs (aarch64_condition_codes[cond_code], f);
5070 break;
5072 case 'b':
5073 case 'h':
5074 case 's':
5075 case 'd':
5076 case 'q':
5077 /* Print a scalar FP/SIMD register name. */
5078 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5080 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5081 return;
5083 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5084 break;
5086 case 'S':
5087 case 'T':
5088 case 'U':
5089 case 'V':
5090 /* Print the first FP/SIMD register name in a list. */
5091 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5093 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5094 return;
5096 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5097 break;
5099 case 'R':
5100 /* Print a scalar FP/SIMD register name + 1. */
5101 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5103 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5104 return;
5106 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5107 break;
5109 case 'X':
5110 /* Print bottom 16 bits of integer constant in hex. */
5111 if (!CONST_INT_P (x))
5113 output_operand_lossage ("invalid operand for '%%%c'", code);
5114 return;
5116 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5117 break;
5119 case 'w':
5120 case 'x':
5121 /* Print a general register name or the zero register (32-bit or
5122 64-bit). */
5123 if (x == const0_rtx
5124 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5126 asm_fprintf (f, "%czr", code);
5127 break;
5130 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5132 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5133 break;
5136 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5138 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5139 break;
5142 /* Fall through */
5144 case 0:
5145 /* Print a normal operand, if it's a general register, then we
5146 assume DImode. */
5147 if (x == NULL)
5149 output_operand_lossage ("missing operand");
5150 return;
5153 switch (GET_CODE (x))
5155 case REG:
5156 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5157 break;
5159 case MEM:
5160 output_address (GET_MODE (x), XEXP (x, 0));
5161 break;
5163 case CONST:
5164 case LABEL_REF:
5165 case SYMBOL_REF:
5166 output_addr_const (asm_out_file, x);
5167 break;
5169 case CONST_INT:
5170 asm_fprintf (f, "%wd", INTVAL (x));
5171 break;
5173 case CONST_VECTOR:
5174 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5176 gcc_assert (
5177 aarch64_const_vec_all_same_in_range_p (x,
5178 HOST_WIDE_INT_MIN,
5179 HOST_WIDE_INT_MAX));
5180 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5182 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5184 fputc ('0', f);
5186 else
5187 gcc_unreachable ();
5188 break;
5190 case CONST_DOUBLE:
5191 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5192 be getting CONST_DOUBLEs holding integers. */
5193 gcc_assert (GET_MODE (x) != VOIDmode);
5194 if (aarch64_float_const_zero_rtx_p (x))
5196 fputc ('0', f);
5197 break;
5199 else if (aarch64_float_const_representable_p (x))
5201 #define buf_size 20
5202 char float_buf[buf_size] = {'\0'};
5203 real_to_decimal_for_mode (float_buf,
5204 CONST_DOUBLE_REAL_VALUE (x),
5205 buf_size, buf_size,
5206 1, GET_MODE (x));
5207 asm_fprintf (asm_out_file, "%s", float_buf);
5208 break;
5209 #undef buf_size
5211 output_operand_lossage ("invalid constant");
5212 return;
5213 default:
5214 output_operand_lossage ("invalid operand");
5215 return;
5217 break;
5219 case 'A':
5220 if (GET_CODE (x) == HIGH)
5221 x = XEXP (x, 0);
5223 switch (aarch64_classify_symbolic_expression (x))
5225 case SYMBOL_SMALL_GOT_4G:
5226 asm_fprintf (asm_out_file, ":got:");
5227 break;
5229 case SYMBOL_SMALL_TLSGD:
5230 asm_fprintf (asm_out_file, ":tlsgd:");
5231 break;
5233 case SYMBOL_SMALL_TLSDESC:
5234 asm_fprintf (asm_out_file, ":tlsdesc:");
5235 break;
5237 case SYMBOL_SMALL_TLSIE:
5238 asm_fprintf (asm_out_file, ":gottprel:");
5239 break;
5241 case SYMBOL_TLSLE24:
5242 asm_fprintf (asm_out_file, ":tprel:");
5243 break;
5245 case SYMBOL_TINY_GOT:
5246 gcc_unreachable ();
5247 break;
5249 default:
5250 break;
5252 output_addr_const (asm_out_file, x);
5253 break;
5255 case 'L':
5256 switch (aarch64_classify_symbolic_expression (x))
5258 case SYMBOL_SMALL_GOT_4G:
5259 asm_fprintf (asm_out_file, ":lo12:");
5260 break;
5262 case SYMBOL_SMALL_TLSGD:
5263 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5264 break;
5266 case SYMBOL_SMALL_TLSDESC:
5267 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5268 break;
5270 case SYMBOL_SMALL_TLSIE:
5271 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5272 break;
5274 case SYMBOL_TLSLE12:
5275 asm_fprintf (asm_out_file, ":tprel_lo12:");
5276 break;
5278 case SYMBOL_TLSLE24:
5279 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5280 break;
5282 case SYMBOL_TINY_GOT:
5283 asm_fprintf (asm_out_file, ":got:");
5284 break;
5286 case SYMBOL_TINY_TLSIE:
5287 asm_fprintf (asm_out_file, ":gottprel:");
5288 break;
5290 default:
5291 break;
5293 output_addr_const (asm_out_file, x);
5294 break;
5296 case 'G':
5298 switch (aarch64_classify_symbolic_expression (x))
5300 case SYMBOL_TLSLE24:
5301 asm_fprintf (asm_out_file, ":tprel_hi12:");
5302 break;
5303 default:
5304 break;
5306 output_addr_const (asm_out_file, x);
5307 break;
5309 case 'k':
5311 HOST_WIDE_INT cond_code;
5312 /* Print nzcv. */
5314 if (!CONST_INT_P (x))
5316 output_operand_lossage ("invalid operand for '%%%c'", code);
5317 return;
5320 cond_code = INTVAL (x);
5321 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5322 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5324 break;
5326 default:
5327 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5328 return;
5332 static void
5333 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5335 struct aarch64_address_info addr;
5337 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5338 switch (addr.type)
5340 case ADDRESS_REG_IMM:
5341 if (addr.offset == const0_rtx)
5342 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5343 else
5344 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5345 INTVAL (addr.offset));
5346 return;
5348 case ADDRESS_REG_REG:
5349 if (addr.shift == 0)
5350 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5351 reg_names [REGNO (addr.offset)]);
5352 else
5353 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5354 reg_names [REGNO (addr.offset)], addr.shift);
5355 return;
5357 case ADDRESS_REG_UXTW:
5358 if (addr.shift == 0)
5359 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5360 REGNO (addr.offset) - R0_REGNUM);
5361 else
5362 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5363 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5364 return;
5366 case ADDRESS_REG_SXTW:
5367 if (addr.shift == 0)
5368 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5369 REGNO (addr.offset) - R0_REGNUM);
5370 else
5371 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5372 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5373 return;
5375 case ADDRESS_REG_WB:
5376 switch (GET_CODE (x))
5378 case PRE_INC:
5379 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5380 GET_MODE_SIZE (mode));
5381 return;
5382 case POST_INC:
5383 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5384 GET_MODE_SIZE (mode));
5385 return;
5386 case PRE_DEC:
5387 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5388 GET_MODE_SIZE (mode));
5389 return;
5390 case POST_DEC:
5391 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5392 GET_MODE_SIZE (mode));
5393 return;
5394 case PRE_MODIFY:
5395 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5396 INTVAL (addr.offset));
5397 return;
5398 case POST_MODIFY:
5399 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5400 INTVAL (addr.offset));
5401 return;
5402 default:
5403 break;
5405 break;
5407 case ADDRESS_LO_SUM:
5408 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5409 output_addr_const (f, addr.offset);
5410 asm_fprintf (f, "]");
5411 return;
5413 case ADDRESS_SYMBOLIC:
5414 break;
5417 output_addr_const (f, x);
5420 bool
5421 aarch64_label_mentioned_p (rtx x)
5423 const char *fmt;
5424 int i;
5426 if (GET_CODE (x) == LABEL_REF)
5427 return true;
5429 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5430 referencing instruction, but they are constant offsets, not
5431 symbols. */
5432 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5433 return false;
5435 fmt = GET_RTX_FORMAT (GET_CODE (x));
5436 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5438 if (fmt[i] == 'E')
5440 int j;
5442 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5443 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5444 return 1;
5446 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5447 return 1;
5450 return 0;
5453 /* Implement REGNO_REG_CLASS. */
5455 enum reg_class
5456 aarch64_regno_regclass (unsigned regno)
5458 if (GP_REGNUM_P (regno))
5459 return GENERAL_REGS;
5461 if (regno == SP_REGNUM)
5462 return STACK_REG;
5464 if (regno == FRAME_POINTER_REGNUM
5465 || regno == ARG_POINTER_REGNUM)
5466 return POINTER_REGS;
5468 if (FP_REGNUM_P (regno))
5469 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5471 return NO_REGS;
5474 static rtx
5475 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5477 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5478 where mask is selected by alignment and size of the offset.
5479 We try to pick as large a range for the offset as possible to
5480 maximize the chance of a CSE. However, for aligned addresses
5481 we limit the range to 4k so that structures with different sized
5482 elements are likely to use the same base. We need to be careful
5483 not to split a CONST for some forms of address expression, otherwise
5484 it will generate sub-optimal code. */
5486 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5488 rtx base = XEXP (x, 0);
5489 rtx offset_rtx = XEXP (x, 1);
5490 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5492 if (GET_CODE (base) == PLUS)
5494 rtx op0 = XEXP (base, 0);
5495 rtx op1 = XEXP (base, 1);
5497 /* Force any scaling into a temp for CSE. */
5498 op0 = force_reg (Pmode, op0);
5499 op1 = force_reg (Pmode, op1);
5501 /* Let the pointer register be in op0. */
5502 if (REG_POINTER (op1))
5503 std::swap (op0, op1);
5505 /* If the pointer is virtual or frame related, then we know that
5506 virtual register instantiation or register elimination is going
5507 to apply a second constant. We want the two constants folded
5508 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5509 if (virt_or_elim_regno_p (REGNO (op0)))
5511 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5512 NULL_RTX, true, OPTAB_DIRECT);
5513 return gen_rtx_PLUS (Pmode, base, op1);
5516 /* Otherwise, in order to encourage CSE (and thence loop strength
5517 reduction) of scaled addresses, emit as (OP0 + OP1) + CONST. */
5518 base = expand_binop (Pmode, add_optab, op0, op1,
5519 NULL_RTX, true, OPTAB_DIRECT);
5520 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5523 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5524 HOST_WIDE_INT base_offset;
5525 if (GET_MODE_SIZE (mode) > 16)
5526 base_offset = (offset + 0x400) & ~0x7f0;
5527 /* For offsets that aren't a multiple of the access size, the limit is
5528 -256...255. */
5529 else if (offset & (GET_MODE_SIZE (mode) - 1))
5531 base_offset = (offset + 0x100) & ~0x1ff;
5533 /* BLKmode typically uses LDP of X-registers. */
5534 if (mode == BLKmode)
5535 base_offset = (offset + 512) & ~0x3ff;
5537 /* Small negative offsets are supported. */
5538 else if (IN_RANGE (offset, -256, 0))
5539 base_offset = 0;
5540 else if (mode == TImode || mode == TFmode)
5541 base_offset = (offset + 0x100) & ~0x1ff;
5542 /* Use a 12-bit offset, scaled by the access size. */
5543 else
5544 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5546 if (base_offset != 0)
5548 base = plus_constant (Pmode, base, base_offset);
5549 base = force_operand (base, NULL_RTX);
5550 return plus_constant (Pmode, base, offset - base_offset);
5554 return x;
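A worked example (illustrative): for an 8-byte access at X + 0x10008 the final branch masks the offset with ~0xfff * 8, i.e. rounds it down to a multiple of 0x8000, so the address is rewritten as (X + 0x10000) + 8; the anchor X + 0x10000 can then be CSEd with neighbouring accesses while the residual 8 fits the scaled 12-bit immediate form.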
5557 /* Return the reload icode required for a constant pool in mode. */
5558 static enum insn_code
5559 aarch64_constant_pool_reload_icode (machine_mode mode)
5561 switch (mode)
5563 case SFmode:
5564 return CODE_FOR_aarch64_reload_movcpsfdi;
5566 case DFmode:
5567 return CODE_FOR_aarch64_reload_movcpdfdi;
5569 case TFmode:
5570 return CODE_FOR_aarch64_reload_movcptfdi;
5572 case V8QImode:
5573 return CODE_FOR_aarch64_reload_movcpv8qidi;
5575 case V16QImode:
5576 return CODE_FOR_aarch64_reload_movcpv16qidi;
5578 case V4HImode:
5579 return CODE_FOR_aarch64_reload_movcpv4hidi;
5581 case V8HImode:
5582 return CODE_FOR_aarch64_reload_movcpv8hidi;
5584 case V2SImode:
5585 return CODE_FOR_aarch64_reload_movcpv2sidi;
5587 case V4SImode:
5588 return CODE_FOR_aarch64_reload_movcpv4sidi;
5590 case V2DImode:
5591 return CODE_FOR_aarch64_reload_movcpv2didi;
5593 case V2DFmode:
5594 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5596 default:
5597 gcc_unreachable ();
5600 gcc_unreachable ();
5602 static reg_class_t
5603 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5604 reg_class_t rclass,
5605 machine_mode mode,
5606 secondary_reload_info *sri)
5609 /* If we have to disable direct literal pool loads and stores because the
5610 function is too big, then we need a scratch register. */
5611 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5612 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5613 || targetm.vector_mode_supported_p (GET_MODE (x)))
5614 && !aarch64_pcrelative_literal_loads)
5616 sri->icode = aarch64_constant_pool_reload_icode (mode);
5617 return NO_REGS;
5620 /* Without the TARGET_SIMD instructions we cannot move a Q register
5621 to a Q register directly. We need a scratch. */
5622 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5623 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5624 && reg_class_subset_p (rclass, FP_REGS))
5626 if (mode == TFmode)
5627 sri->icode = CODE_FOR_aarch64_reload_movtf;
5628 else if (mode == TImode)
5629 sri->icode = CODE_FOR_aarch64_reload_movti;
5630 return NO_REGS;
5633 /* A TFmode or TImode memory access should be handled via an FP_REG
5634 because AArch64 has richer addressing modes for LDR/STR instructions
5635 than LDP/STP instructions. */
5636 if (TARGET_FLOAT && rclass == GENERAL_REGS
5637 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5638 return FP_REGS;
5640 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5641 return GENERAL_REGS;
5643 return NO_REGS;
5646 static bool
5647 aarch64_can_eliminate (const int from, const int to)
5649 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5650 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5652 if (frame_pointer_needed)
5654 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5655 return true;
5656 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5657 return false;
5658 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5659 && !cfun->calls_alloca)
5660 return true;
5661 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5662 return true;
5664 return false;
5666 else
5668 /* If we decided that we didn't need a leaf frame pointer but then used
5669 LR in the function, then we'll want a frame pointer after all, so
5670 prevent this elimination to ensure a frame pointer is used. */
5671 if (to == STACK_POINTER_REGNUM
5672 && flag_omit_leaf_frame_pointer
5673 && df_regs_ever_live_p (LR_REGNUM))
5674 return false;
5677 return true;
5680 HOST_WIDE_INT
5681 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5683 aarch64_layout_frame ();
5685 if (to == HARD_FRAME_POINTER_REGNUM)
5687 if (from == ARG_POINTER_REGNUM)
5688 return cfun->machine->frame.hard_fp_offset;
5690 if (from == FRAME_POINTER_REGNUM)
5691 return cfun->machine->frame.hard_fp_offset
5692 - cfun->machine->frame.locals_offset;
5695 if (to == STACK_POINTER_REGNUM)
5697 if (from == FRAME_POINTER_REGNUM)
5698 return cfun->machine->frame.frame_size
5699 - cfun->machine->frame.locals_offset;
5702 return cfun->machine->frame.frame_size;
5705 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5706 previous frame. */
5709 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5711 if (count != 0)
5712 return const0_rtx;
5713 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5717 static void
5718 aarch64_asm_trampoline_template (FILE *f)
5720 if (TARGET_ILP32)
5722 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5723 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5725 else
5727 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5728 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5730 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5731 assemble_aligned_integer (4, const0_rtx);
5732 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5733 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5736 static void
5737 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5739 rtx fnaddr, mem, a_tramp;
5740 const int tramp_code_sz = 16;
5742 /* Don't need to copy the trailing D-words, we fill those in below. */
5743 emit_block_move (m_tramp, assemble_trampoline_template (),
5744 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5745 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5746 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5747 if (GET_MODE (fnaddr) != ptr_mode)
5748 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5749 emit_move_insn (mem, fnaddr);
5751 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5752 emit_move_insn (mem, chain_value);
5754 /* XXX We should really define a "clear_cache" pattern and use
5755 gen_clear_cache(). */
5756 a_tramp = XEXP (m_tramp, 0);
5757 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5758 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5759 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5760 ptr_mode);
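The classic source-level trigger for this machinery (a hedged illustration, not from the source) is taking the address of a GNU C nested function that uses locals of its parent; GCC then typically materialises the code stub described by the template above on the stack, and this hook fills in the target address and static chain slots.

static int apply (int (*fn) (int), int x) { return fn (x); }

int
outer (int bias)
{
  /* Nested function: needs the static chain to reach BIAS.  */
  int add_bias (int v) { return v + bias; }
  /* Passing &add_bias forces a trampoline to be built at run time.  */
  return apply (add_bias, 42);
}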
5763 static unsigned char
5764 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5766 switch (regclass)
5768 case CALLER_SAVE_REGS:
5769 case POINTER_REGS:
5770 case GENERAL_REGS:
5771 case ALL_REGS:
5772 case FP_REGS:
5773 case FP_LO_REGS:
5774 return
5775 aarch64_vector_mode_p (mode)
5776 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5777 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5778 case STACK_REG:
5779 return 1;
5781 case NO_REGS:
5782 return 0;
5784 default:
5785 break;
5787 gcc_unreachable ();
5790 static reg_class_t
5791 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5793 if (regclass == POINTER_REGS)
5794 return GENERAL_REGS;
5796 if (regclass == STACK_REG)
5798 if (REG_P(x)
5799 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5800 return regclass;
5802 return NO_REGS;
5805 /* If it's an integer immediate that MOVI can't handle, then
5806 FP_REGS is not an option, so we return NO_REGS instead. */
5807 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5808 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5809 return NO_REGS;
5811 /* Register elimination can result in a request for
5812 SP+constant->FP_REGS. We cannot support such operations, which
5813 use SP as the source and an FP_REG as the destination, so reject
5814 them outright for now. */
5815 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5817 rtx lhs = XEXP (x, 0);
5819 /* Look through a possible SUBREG introduced by ILP32. */
5820 if (GET_CODE (lhs) == SUBREG)
5821 lhs = SUBREG_REG (lhs);
5823 gcc_assert (REG_P (lhs));
5824 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5825 POINTER_REGS));
5826 return NO_REGS;
5829 return regclass;
5832 void
5833 aarch64_asm_output_labelref (FILE* f, const char *name)
5835 asm_fprintf (f, "%U%s", name);
5838 static void
5839 aarch64_elf_asm_constructor (rtx symbol, int priority)
5841 if (priority == DEFAULT_INIT_PRIORITY)
5842 default_ctor_section_asm_out_constructor (symbol, priority);
5843 else
5845 section *s;
5846 /* While priority is known to be in the range [0, 65535], and so 18 bytes
5847 would be enough, the compiler might not know that. To avoid a
5848 -Wformat-truncation false positive, use a larger size. */
5849 char buf[23];
5850 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5851 s = get_section (buf, SECTION_WRITE, NULL);
5852 switch_to_section (s);
5853 assemble_align (POINTER_SIZE);
5854 assemble_aligned_integer (POINTER_BYTES, symbol);
5858 static void
5859 aarch64_elf_asm_destructor (rtx symbol, int priority)
5861 if (priority == DEFAULT_INIT_PRIORITY)
5862 default_dtor_section_asm_out_destructor (symbol, priority);
5863 else
5865 section *s;
5866 /* While priority is known to be in the range [0, 65535], and so 18 bytes
5867 would be enough, the compiler might not know that. To avoid a
5868 -Wformat-truncation false positive, use a larger size. */
5869 char buf[23];
5870 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5871 s = get_section (buf, SECTION_WRITE, NULL);
5872 switch_to_section (s);
5873 assemble_align (POINTER_SIZE);
5874 assemble_aligned_integer (POINTER_BYTES, symbol);
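/* For illustration only (not part of the compiler): a standalone sketch of
   the prioritised section names the two functions above emit.  The five-digit,
   zero-padded suffix makes lexicographic section-name order coincide with
   numeric priority order.  Compile and run it separately.  */

#include <stdio.h>

int
main (void)
{
  char buf[23];

  /* A constructor with priority 101 is placed in ".init_array.00101".  */
  snprintf (buf, sizeof (buf), ".init_array.%.5u", 101);
  printf ("%s\n", buf);

  /* A destructor with priority 65535 is placed in ".fini_array.65535".  */
  snprintf (buf, sizeof (buf), ".fini_array.%.5u", 65535);
  printf ("%s\n", buf);
  return 0;
}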
5878 const char*
5879 aarch64_output_casesi (rtx *operands)
5881 char buf[100];
5882 char label[100];
5883 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5884 int index;
5885 static const char *const patterns[4][2] =
5888 "ldrb\t%w3, [%0,%w1,uxtw]",
5889 "add\t%3, %4, %w3, sxtb #2"
5892 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5893 "add\t%3, %4, %w3, sxth #2"
5896 "ldr\t%w3, [%0,%w1,uxtw #2]",
5897 "add\t%3, %4, %w3, sxtw #2"
5899 /* We assume that DImode is only generated when not optimizing and
5900 that we don't really need 64-bit address offsets. That would
5901 imply an object file with 8GB of code in a single function! */
5903 "ldr\t%w3, [%0,%w1,uxtw #2]",
5904 "add\t%3, %4, %w3, sxtw #2"
5908 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5910 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5912 gcc_assert (index >= 0 && index <= 3);
5915 /* Need to implement table size reduction by changing the code below. */
5915 output_asm_insn (patterns[index][0], operands);
5916 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5917 snprintf (buf, sizeof (buf),
5918 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5919 output_asm_insn (buf, operands);
5920 output_asm_insn (patterns[index][1], operands);
5921 output_asm_insn ("br\t%3", operands);
5922 assemble_label (asm_out_file, label);
5923 return "";
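/* For illustration only: with a word-sized (SImode) dispatch table, the
   templates above combine into a sequence along the lines of

	ldr	w3, [x0, w1, uxtw #2]	// load the table entry
	adr	x4, .Lrtx42		// address of the table
	add	x3, x4, w3, sxtw #2	// entry is a scaled offset from the table base
	br	x3			// jump to the case label
   .Lrtx42:
	...				// ADDR_DIFF_VEC entries follow

   where the register numbers and the .Lrtx42 label are arbitrary examples
   substituted for operands 0, 1, 3 and 4.  */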
5927 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5928 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5929 operator. */
5932 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5934 if (shift >= 0 && shift <= 3)
5936 int size;
5937 for (size = 8; size <= 32; size *= 2)
5939 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5940 if (mask == bits << shift)
5941 return size;
5944 return 0;
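/* For illustration only (not used by the compiler): a standalone restatement
   of the check above with plain 64-bit integers.  A mask is accepted when it
   is a contiguous 8-, 16- or 32-bit field placed at a shift of 0..3; anything
   else yields 0.  Compile and run it separately.  */

#include <stdio.h>

static int
ex_uxt_size (int shift, unsigned long long mask)
{
  if (shift >= 0 && shift <= 3)
    for (int size = 8; size <= 32; size *= 2)
      {
	unsigned long long bits = (1ULL << size) - 1;
	if (mask == (bits << shift))
	  return size;
      }
  return 0;
}

int
main (void)
{
  printf ("%d\n", ex_uxt_size (2, 0x3fcULL));		/* 8  -> UXTB  */
  printf ("%d\n", ex_uxt_size (0, 0xffffULL));		/* 16 -> UXTH  */
  printf ("%d\n", ex_uxt_size (1, 0x1fffffffeULL));	/* 32 -> UXTW  */
  printf ("%d\n", ex_uxt_size (2, 0xff0ULL));		/* 0  -> no match  */
  return 0;
}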
5947 /* Constant pools are per-function only when PC-relative
5948 literal loads are enabled or we are in the large memory
5949 model. */
5951 static inline bool
5952 aarch64_can_use_per_function_literal_pools_p (void)
5954 return (aarch64_pcrelative_literal_loads
5955 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5958 static bool
5959 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5961 /* FIXME: In an ideal world this would work similarly
5962 to the logic in aarch64_select_rtx_section, but that
5963 breaks bootstrap in gccgo. For now we work around
5964 this by returning false here. */
5965 return false;
5968 /* Select appropriate section for constants depending
5969 on where we place literal pools. */
5971 static section *
5972 aarch64_select_rtx_section (machine_mode mode,
5973 rtx x,
5974 unsigned HOST_WIDE_INT align)
5976 if (aarch64_can_use_per_function_literal_pools_p ())
5977 return function_section (current_function_decl);
5979 return default_elf_select_rtx_section (mode, x, align);
5982 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5983 void
5984 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5985 HOST_WIDE_INT offset)
5987 /* When using per-function literal pools, we must ensure that any code
5988 section is aligned to the minimal instruction length; otherwise the
5989 assembler reports "unaligned instructions" errors. */
5990 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5991 ASM_OUTPUT_ALIGN (f, 2);
5994 /* Costs. */
5996 /* Helper function for rtx cost calculation. Strip a shift expression
5997 from X. Returns the inner operand if successful, or the original
5998 expression on failure. */
5999 static rtx
6000 aarch64_strip_shift (rtx x)
6002 rtx op = x;
6004 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
6005 we can convert both to ROR during final output. */
6006 if ((GET_CODE (op) == ASHIFT
6007 || GET_CODE (op) == ASHIFTRT
6008 || GET_CODE (op) == LSHIFTRT
6009 || GET_CODE (op) == ROTATERT
6010 || GET_CODE (op) == ROTATE)
6011 && CONST_INT_P (XEXP (op, 1)))
6012 return XEXP (op, 0);
6014 if (GET_CODE (op) == MULT
6015 && CONST_INT_P (XEXP (op, 1))
6016 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
6017 return XEXP (op, 0);
6019 return x;
6022 /* Helper function for rtx cost calculation. Strip an extend
6023 expression from X. Returns the inner operand if successful, or the
6024 original expression on failure. We deal with a number of possible
6025 canonicalization variations here. */
6026 static rtx
6027 aarch64_strip_extend (rtx x)
6029 rtx op = x;
6031 /* Zero and sign extraction of a widened value. */
6032 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
6033 && XEXP (op, 2) == const0_rtx
6034 && GET_CODE (XEXP (op, 0)) == MULT
6035 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
6036 XEXP (op, 1)))
6037 return XEXP (XEXP (op, 0), 0);
6039 /* It can also be represented (for zero-extend) as an AND with an
6040 immediate. */
6041 if (GET_CODE (op) == AND
6042 && GET_CODE (XEXP (op, 0)) == MULT
6043 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6044 && CONST_INT_P (XEXP (op, 1))
6045 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6046 INTVAL (XEXP (op, 1))) != 0)
6047 return XEXP (XEXP (op, 0), 0);
6049 /* Now handle extended register, as this may also have an optional
6050 left shift by 1..4. */
6051 if (GET_CODE (op) == ASHIFT
6052 && CONST_INT_P (XEXP (op, 1))
6053 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6054 op = XEXP (op, 0);
6056 if (GET_CODE (op) == ZERO_EXTEND
6057 || GET_CODE (op) == SIGN_EXTEND)
6058 op = XEXP (op, 0);
6060 if (op != x)
6061 return op;
6063 return x;
6066 /* Return true iff CODE is a shift supported in combination
6067 with arithmetic instructions. */
6069 static bool
6070 aarch64_shift_p (enum rtx_code code)
6072 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6075 /* Helper function for rtx cost calculation. Calculate the cost of
6076 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6077 Return the calculated cost of the expression, recursing manually into
6078 operands where needed. */
6080 static int
6081 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6083 rtx op0, op1;
6084 const struct cpu_cost_table *extra_cost
6085 = aarch64_tune_params.insn_extra_cost;
6086 int cost = 0;
6087 bool compound_p = (outer == PLUS || outer == MINUS);
6088 machine_mode mode = GET_MODE (x);
6090 gcc_checking_assert (code == MULT);
6092 op0 = XEXP (x, 0);
6093 op1 = XEXP (x, 1);
6095 if (VECTOR_MODE_P (mode))
6096 mode = GET_MODE_INNER (mode);
6098 /* Integer multiply/fma. */
6099 if (GET_MODE_CLASS (mode) == MODE_INT)
6101 /* The multiply will be canonicalized as a shift, so cost it as such. */
6102 if (aarch64_shift_p (GET_CODE (x))
6103 || (CONST_INT_P (op1)
6104 && exact_log2 (INTVAL (op1)) > 0))
6106 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6107 || GET_CODE (op0) == SIGN_EXTEND;
6108 if (speed)
6110 if (compound_p)
6112 if (REG_P (op1))
6113 /* ARITH + shift-by-register. */
6114 cost += extra_cost->alu.arith_shift_reg;
6115 else if (is_extend)
6116 /* ARITH + extended register. We don't have a cost field
6117 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6118 cost += extra_cost->alu.extend_arith;
6119 else
6120 /* ARITH + shift-by-immediate. */
6121 cost += extra_cost->alu.arith_shift;
6123 else
6124 /* LSL (immediate). */
6125 cost += extra_cost->alu.shift;
6128 /* Strip extends as we will have costed them in the case above. */
6129 if (is_extend)
6130 op0 = aarch64_strip_extend (op0);
6132 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6134 return cost;
6137 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6138 compound and let the below cases handle it. After all, MNEG is a
6139 special-case alias of MSUB. */
6140 if (GET_CODE (op0) == NEG)
6142 op0 = XEXP (op0, 0);
6143 compound_p = true;
6146 /* Integer multiplies or FMAs have zero/sign extending variants. */
6147 if ((GET_CODE (op0) == ZERO_EXTEND
6148 && GET_CODE (op1) == ZERO_EXTEND)
6149 || (GET_CODE (op0) == SIGN_EXTEND
6150 && GET_CODE (op1) == SIGN_EXTEND))
6152 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6153 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6155 if (speed)
6157 if (compound_p)
6158 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6159 cost += extra_cost->mult[0].extend_add;
6160 else
6161 /* MUL/SMULL/UMULL. */
6162 cost += extra_cost->mult[0].extend;
6165 return cost;
6168 /* This is either an integer multiply or a MADD. In both cases
6169 we want to recurse and cost the operands. */
6170 cost += rtx_cost (op0, mode, MULT, 0, speed);
6171 cost += rtx_cost (op1, mode, MULT, 1, speed);
6173 if (speed)
6175 if (compound_p)
6176 /* MADD/MSUB. */
6177 cost += extra_cost->mult[mode == DImode].add;
6178 else
6179 /* MUL. */
6180 cost += extra_cost->mult[mode == DImode].simple;
6183 return cost;
6185 else
6187 if (speed)
6189 /* Floating-point FMA/FMUL can also support negations of the
6190 operands, unless the rounding mode is upward or downward, in
6191 which case FNMUL differs from FMUL with operand negation. */
6192 bool neg0 = GET_CODE (op0) == NEG;
6193 bool neg1 = GET_CODE (op1) == NEG;
6194 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6196 if (neg0)
6197 op0 = XEXP (op0, 0);
6198 if (neg1)
6199 op1 = XEXP (op1, 0);
6202 if (compound_p)
6203 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6204 cost += extra_cost->fp[mode == DFmode].fma;
6205 else
6206 /* FMUL/FNMUL. */
6207 cost += extra_cost->fp[mode == DFmode].mult;
6210 cost += rtx_cost (op0, mode, MULT, 0, speed);
6211 cost += rtx_cost (op1, mode, MULT, 1, speed);
6212 return cost;
6216 static int
6217 aarch64_address_cost (rtx x,
6218 machine_mode mode,
6219 addr_space_t as ATTRIBUTE_UNUSED,
6220 bool speed)
6222 enum rtx_code c = GET_CODE (x);
6223 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6224 struct aarch64_address_info info;
6225 int cost = 0;
6226 info.shift = 0;
6228 if (!aarch64_classify_address (&info, x, mode, c, false))
6230 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6232 /* This is a CONST or SYMBOL ref which will be split
6233 in a different way depending on the code model in use.
6234 Cost it through the generic infrastructure. */
6235 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6236 /* Divide through by the cost of one instruction to
6237 bring it to the same units as the address costs. */
6238 cost_symbol_ref /= COSTS_N_INSNS (1);
6239 /* The cost is then the cost of preparing the address,
6240 followed by an immediate (possibly 0) offset. */
6241 return cost_symbol_ref + addr_cost->imm_offset;
6243 else
6245 /* This is most likely a jump table from a case
6246 statement. */
6247 return addr_cost->register_offset;
6251 switch (info.type)
6253 case ADDRESS_LO_SUM:
6254 case ADDRESS_SYMBOLIC:
6255 case ADDRESS_REG_IMM:
6256 cost += addr_cost->imm_offset;
6257 break;
6259 case ADDRESS_REG_WB:
6260 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6261 cost += addr_cost->pre_modify;
6262 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6263 cost += addr_cost->post_modify;
6264 else
6265 gcc_unreachable ();
6267 break;
6269 case ADDRESS_REG_REG:
6270 cost += addr_cost->register_offset;
6271 break;
6273 case ADDRESS_REG_SXTW:
6274 cost += addr_cost->register_sextend;
6275 break;
6277 case ADDRESS_REG_UXTW:
6278 cost += addr_cost->register_zextend;
6279 break;
6281 default:
6282 gcc_unreachable ();
6286 if (info.shift > 0)
6288 /* For the sake of calculating the cost of the shifted register
6289 component, we can treat same sized modes in the same way. */
6290 switch (GET_MODE_BITSIZE (mode))
6292 case 16:
6293 cost += addr_cost->addr_scale_costs.hi;
6294 break;
6296 case 32:
6297 cost += addr_cost->addr_scale_costs.si;
6298 break;
6300 case 64:
6301 cost += addr_cost->addr_scale_costs.di;
6302 break;
6304 /* We can't tell, or this is a 128-bit vector. */
6305 default:
6306 cost += addr_cost->addr_scale_costs.ti;
6307 break;
6311 return cost;
6314 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6315 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6316 to be taken. */
6319 aarch64_branch_cost (bool speed_p, bool predictable_p)
6321 /* When optimizing for speed, use the cost of unpredictable branches. */
6322 const struct cpu_branch_cost *branch_costs =
6323 aarch64_tune_params.branch_costs;
6325 if (!speed_p || predictable_p)
6326 return branch_costs->predictable;
6327 else
6328 return branch_costs->unpredictable;
6331 /* Return true if the RTX X in mode MODE is a zero or sign extract
6332 usable in an ADD or SUB (extended register) instruction. */
6333 static bool
6334 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6336 /* Catch add with a sign extract.
6337 This is add_<optab><mode>_multp2. */
6338 if (GET_CODE (x) == SIGN_EXTRACT
6339 || GET_CODE (x) == ZERO_EXTRACT)
6341 rtx op0 = XEXP (x, 0);
6342 rtx op1 = XEXP (x, 1);
6343 rtx op2 = XEXP (x, 2);
6345 if (GET_CODE (op0) == MULT
6346 && CONST_INT_P (op1)
6347 && op2 == const0_rtx
6348 && CONST_INT_P (XEXP (op0, 1))
6349 && aarch64_is_extend_from_extract (mode,
6350 XEXP (op0, 1),
6351 op1))
6353 return true;
6356 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6357 No shift. */
6358 else if (GET_CODE (x) == SIGN_EXTEND
6359 || GET_CODE (x) == ZERO_EXTEND)
6360 return REG_P (XEXP (x, 0));
6362 return false;
6365 static bool
6366 aarch64_frint_unspec_p (unsigned int u)
6368 switch (u)
6370 case UNSPEC_FRINTZ:
6371 case UNSPEC_FRINTP:
6372 case UNSPEC_FRINTM:
6373 case UNSPEC_FRINTA:
6374 case UNSPEC_FRINTN:
6375 case UNSPEC_FRINTX:
6376 case UNSPEC_FRINTI:
6377 return true;
6379 default:
6380 return false;
6384 /* Return true iff X is an rtx that will match an extr instruction
6385 i.e. as described in the *extr<mode>5_insn family of patterns.
6386 OP0 and OP1 will be set to the operands of the shifts involved
6387 on success and will be NULL_RTX otherwise. */
6389 static bool
6390 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6392 rtx op0, op1;
6393 machine_mode mode = GET_MODE (x);
6395 *res_op0 = NULL_RTX;
6396 *res_op1 = NULL_RTX;
6398 if (GET_CODE (x) != IOR)
6399 return false;
6401 op0 = XEXP (x, 0);
6402 op1 = XEXP (x, 1);
6404 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6405 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6407 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6408 if (GET_CODE (op1) == ASHIFT)
6409 std::swap (op0, op1);
6411 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6412 return false;
6414 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6415 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6417 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6418 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6420 *res_op0 = XEXP (op0, 0);
6421 *res_op1 = XEXP (op1, 0);
6422 return true;
6426 return false;
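/* For illustration only (not used by the compiler): the source-level shape
   that aarch64_extr_rtx_p recognises is an IOR of two opposite shifts whose
   constant amounts sum to the register width.  A recent GCC at -O2 is
   expected to turn each function below into a single EXTR (or ROR when both
   inputs are the same register); the exact output depends on the compiler
   version.  Compile it separately.  */

#include <stdint.h>

/* Take a 64-bit window from the concatenation HI:LO, starting 24 bits into
   LO: 40 + 24 == 64, so this matches the EXTR shape.  */
uint64_t
ex_extr (uint64_t hi, uint64_t lo)
{
  return (hi << 40) | (lo >> 24);
}

/* The same shape with a single input is a rotate right by 24.  */
uint64_t
ex_ror (uint64_t x)
{
  return (x << 40) | (x >> 24);
}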
6429 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6430 storing it in *COST. Result is true if the total cost of the operation
6431 has now been calculated. */
6432 static bool
6433 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6435 rtx inner;
6436 rtx comparator;
6437 enum rtx_code cmpcode;
6439 if (COMPARISON_P (op0))
6441 inner = XEXP (op0, 0);
6442 comparator = XEXP (op0, 1);
6443 cmpcode = GET_CODE (op0);
6445 else
6447 inner = op0;
6448 comparator = const0_rtx;
6449 cmpcode = NE;
6452 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6454 /* Conditional branch. */
6455 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6456 return true;
6457 else
6459 if (cmpcode == NE || cmpcode == EQ)
6461 if (comparator == const0_rtx)
6463 /* TBZ/TBNZ/CBZ/CBNZ. */
6464 if (GET_CODE (inner) == ZERO_EXTRACT)
6465 /* TBZ/TBNZ. */
6466 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6467 ZERO_EXTRACT, 0, speed);
6468 else
6469 /* CBZ/CBNZ. */
6470 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6472 return true;
6475 else if (cmpcode == LT || cmpcode == GE)
6477 /* TBZ/TBNZ. */
6478 if (comparator == const0_rtx)
6479 return true;
6483 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6485 /* CCMP. */
6486 if (GET_CODE (op1) == COMPARE)
6488 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6489 if (XEXP (op1, 1) == const0_rtx)
6490 *cost += 1;
6491 if (speed)
6493 machine_mode mode = GET_MODE (XEXP (op1, 0));
6494 const struct cpu_cost_table *extra_cost
6495 = aarch64_tune_params.insn_extra_cost;
6497 if (GET_MODE_CLASS (mode) == MODE_INT)
6498 *cost += extra_cost->alu.arith;
6499 else
6500 *cost += extra_cost->fp[mode == DFmode].compare;
6502 return true;
6505 /* It's a conditional operation based on the status flags,
6506 so it must be some flavor of CSEL. */
6508 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6509 if (GET_CODE (op1) == NEG
6510 || GET_CODE (op1) == NOT
6511 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6512 op1 = XEXP (op1, 0);
6513 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6515 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6516 op1 = XEXP (op1, 0);
6517 op2 = XEXP (op2, 0);
6520 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6521 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6522 return true;
6525 /* We don't know what this is, so cost all operands. */
6526 return false;
6529 /* Check whether X is a bitfield operation of the form shift + extend that
6530 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6531 operand to which the bitfield operation is applied. Otherwise return
6532 NULL_RTX. */
6534 static rtx
6535 aarch64_extend_bitfield_pattern_p (rtx x)
6537 rtx_code outer_code = GET_CODE (x);
6538 machine_mode outer_mode = GET_MODE (x);
6540 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6541 && outer_mode != SImode && outer_mode != DImode)
6542 return NULL_RTX;
6544 rtx inner = XEXP (x, 0);
6545 rtx_code inner_code = GET_CODE (inner);
6546 machine_mode inner_mode = GET_MODE (inner);
6547 rtx op = NULL_RTX;
6549 switch (inner_code)
6551 case ASHIFT:
6552 if (CONST_INT_P (XEXP (inner, 1))
6553 && (inner_mode == QImode || inner_mode == HImode))
6554 op = XEXP (inner, 0);
6555 break;
6556 case LSHIFTRT:
6557 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6558 && (inner_mode == QImode || inner_mode == HImode))
6559 op = XEXP (inner, 0);
6560 break;
6561 case ASHIFTRT:
6562 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6563 && (inner_mode == QImode || inner_mode == HImode))
6564 op = XEXP (inner, 0);
6565 break;
6566 default:
6567 break;
6570 return op;
6573 /* Return true if the mask and a shift amount from an RTX of the form
6574 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6575 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6577 bool
6578 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6580 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6581 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6582 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6583 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
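/* For illustration only (not used by the compiler): a standalone restatement
   of the test above with plain 64-bit arithmetic.  MASK must have no bits
   below SHIFT, and the remaining bits must form one contiguous field starting
   at bit SHIFT, i.e. (mask >> shift) + 1 must be a power of two.  Compile and
   run it separately.  */

#include <stdio.h>
#include <stdbool.h>

static bool
ex_ubfiz_mask_and_shift_ok (unsigned long long mask, unsigned shift,
			    unsigned bitsize)
{
  if (shift >= bitsize)
    return false;
  if (mask & ((1ULL << shift) - 1))	/* Bits below SHIFT must be clear.  */
    return false;
  unsigned long long field = mask >> shift;
  return (field & (field + 1)) == 0;	/* FIELD is 2^n - 1.  */
}

int
main (void)
{
  /* (x << 3) & 0x7f8 keeps an 8-bit field at bit 3: a UBFIZ candidate.  */
  printf ("%d\n", ex_ubfiz_mask_and_shift_ok (0x7f8, 3, 32));	/* 1 */
  /* 0x7f0 shifted down by 3 is 0xfe, which is not contiguous from bit 0.  */
  printf ("%d\n", ex_ubfiz_mask_and_shift_ok (0x7f0, 3, 32));	/* 0 */
  return 0;
}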
6586 /* Calculate the cost of calculating X, storing it in *COST. Result
6587 is true if the total cost of the operation has now been calculated. */
6588 static bool
6589 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6590 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6592 rtx op0, op1, op2;
6593 const struct cpu_cost_table *extra_cost
6594 = aarch64_tune_params.insn_extra_cost;
6595 int code = GET_CODE (x);
6597 /* By default, assume that everything has equivalent cost to the
6598 cheapest instruction. Any additional costs are applied as a delta
6599 above this default. */
6600 *cost = COSTS_N_INSNS (1);
6602 switch (code)
6604 case SET:
6605 /* The cost depends entirely on the operands to SET. */
6606 *cost = 0;
6607 op0 = SET_DEST (x);
6608 op1 = SET_SRC (x);
6610 switch (GET_CODE (op0))
6612 case MEM:
6613 if (speed)
6615 rtx address = XEXP (op0, 0);
6616 if (VECTOR_MODE_P (mode))
6617 *cost += extra_cost->ldst.storev;
6618 else if (GET_MODE_CLASS (mode) == MODE_INT)
6619 *cost += extra_cost->ldst.store;
6620 else if (mode == SFmode)
6621 *cost += extra_cost->ldst.storef;
6622 else if (mode == DFmode)
6623 *cost += extra_cost->ldst.stored;
6625 *cost +=
6626 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6627 0, speed));
6630 *cost += rtx_cost (op1, mode, SET, 1, speed);
6631 return true;
6633 case SUBREG:
6634 if (! REG_P (SUBREG_REG (op0)))
6635 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6637 /* Fall through. */
6638 case REG:
6639 /* The cost is one per vector-register copied. */
6640 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6642 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6643 / GET_MODE_SIZE (V4SImode);
6644 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6646 /* const0_rtx is in general free, but we will use an
6647 instruction to set a register to 0. */
6648 else if (REG_P (op1) || op1 == const0_rtx)
6650 /* The cost is 1 per register copied. */
6651 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6652 / UNITS_PER_WORD;
6653 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6655 else
6656 /* Cost is just the cost of the RHS of the set. */
6657 *cost += rtx_cost (op1, mode, SET, 1, speed);
6658 return true;
6660 case ZERO_EXTRACT:
6661 case SIGN_EXTRACT:
6662 /* Bit-field insertion. Strip any redundant widening of
6663 the RHS to meet the width of the target. */
6664 if (GET_CODE (op1) == SUBREG)
6665 op1 = SUBREG_REG (op1);
6666 if ((GET_CODE (op1) == ZERO_EXTEND
6667 || GET_CODE (op1) == SIGN_EXTEND)
6668 && CONST_INT_P (XEXP (op0, 1))
6669 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6670 >= INTVAL (XEXP (op0, 1))))
6671 op1 = XEXP (op1, 0);
6673 if (CONST_INT_P (op1))
6675 /* MOV immediate is assumed to always be cheap. */
6676 *cost = COSTS_N_INSNS (1);
6678 else
6680 /* BFM. */
6681 if (speed)
6682 *cost += extra_cost->alu.bfi;
6683 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6686 return true;
6688 default:
6689 /* We can't make sense of this, assume default cost. */
6690 *cost = COSTS_N_INSNS (1);
6691 return false;
6693 return false;
6695 case CONST_INT:
6696 /* If an instruction can incorporate a constant within the
6697 instruction, the instruction's expression avoids calling
6698 rtx_cost() on the constant. If rtx_cost() is called on a
6699 constant, then it is usually because the constant must be
6700 moved into a register by one or more instructions.
6702 The exception is constant 0, which can be expressed
6703 as XZR/WZR and is therefore free. The caveat is that if
6704 we have (set (reg) (const0_rtx)) we must cost the move.
6705 However, we can catch that when we cost the SET, so
6706 we don't need to consider it here. */
6707 if (x == const0_rtx)
6708 *cost = 0;
6709 else
6711 /* To an approximation, building any other constant is
6712 proportionally expensive to the number of instructions
6713 required to build that constant. This is true whether we
6714 are compiling for SPEED or otherwise. */
6715 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6716 (NULL_RTX, x, false, mode));
6718 return true;
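/* For illustration only: the cost computed above scales with the number of
   MOV/MOVN/MOVK instructions needed to build the constant, for example
   (register choice arbitrary):

     0                    free     xzr / wzr
     0x1234               1 insn   mov  w0, #0x1234
     0x12345678           2 insns  mov  w0, #0x5678
                                   movk w0, #0x1234, lsl #16
     0xffffffffffff1234   1 insn   movn x0, #0xedcb

   Repeating patterns such as 0x5555555555555555 are also one instruction,
   since they are valid logical (bitmask) immediates.  */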
6720 case CONST_DOUBLE:
6721 if (speed)
6723 /* mov[df,sf]_aarch64. */
6724 if (aarch64_float_const_representable_p (x))
6725 /* FMOV (scalar immediate). */
6726 *cost += extra_cost->fp[mode == DFmode].fpconst;
6727 else if (!aarch64_float_const_zero_rtx_p (x))
6729 /* This will be a load from memory. */
6730 if (mode == DFmode)
6731 *cost += extra_cost->ldst.loadd;
6732 else
6733 *cost += extra_cost->ldst.loadf;
6735 else
6736 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6737 or MOV v0.s[0], wzr - neither of which is modeled by the
6738 cost tables. Just use the default cost. */
6743 return true;
6745 case MEM:
6746 if (speed)
6748 /* For loads we want the base cost of a load, plus an
6749 approximation for the additional cost of the addressing
6750 mode. */
6751 rtx address = XEXP (x, 0);
6752 if (VECTOR_MODE_P (mode))
6753 *cost += extra_cost->ldst.loadv;
6754 else if (GET_MODE_CLASS (mode) == MODE_INT)
6755 *cost += extra_cost->ldst.load;
6756 else if (mode == SFmode)
6757 *cost += extra_cost->ldst.loadf;
6758 else if (mode == DFmode)
6759 *cost += extra_cost->ldst.loadd;
6761 *cost +=
6762 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6763 0, speed));
6766 return true;
6768 case NEG:
6769 op0 = XEXP (x, 0);
6771 if (VECTOR_MODE_P (mode))
6773 if (speed)
6775 /* FNEG. */
6776 *cost += extra_cost->vect.alu;
6778 return false;
6781 if (GET_MODE_CLASS (mode) == MODE_INT)
6783 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6784 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6786 /* CSETM. */
6787 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6788 return true;
6791 /* Cost this as SUB wzr, X. */
6792 op0 = CONST0_RTX (mode);
6793 op1 = XEXP (x, 0);
6794 goto cost_minus;
6797 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6799 /* Support (neg(fma...)) as a single instruction only if
6800 sign of zeros is unimportant. This matches the decision
6801 making in aarch64.md. */
6802 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6804 /* FNMADD. */
6805 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6806 return true;
6808 if (GET_CODE (op0) == MULT)
6810 /* FNMUL. */
6811 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6812 return true;
6814 if (speed)
6815 /* FNEG. */
6816 *cost += extra_cost->fp[mode == DFmode].neg;
6817 return false;
6820 return false;
6822 case CLRSB:
6823 case CLZ:
6824 if (speed)
6826 if (VECTOR_MODE_P (mode))
6827 *cost += extra_cost->vect.alu;
6828 else
6829 *cost += extra_cost->alu.clz;
6832 return false;
6834 case COMPARE:
6835 op0 = XEXP (x, 0);
6836 op1 = XEXP (x, 1);
6838 if (op1 == const0_rtx
6839 && GET_CODE (op0) == AND)
6841 x = op0;
6842 mode = GET_MODE (op0);
6843 goto cost_logic;
6846 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6848 /* TODO: A write to the CC flags possibly costs extra, this
6849 needs encoding in the cost tables. */
6851 mode = GET_MODE (op0);
6852 /* ANDS. */
6853 if (GET_CODE (op0) == AND)
6855 x = op0;
6856 goto cost_logic;
6859 if (GET_CODE (op0) == PLUS)
6861 /* ADDS (and CMN alias). */
6862 x = op0;
6863 goto cost_plus;
6866 if (GET_CODE (op0) == MINUS)
6868 /* SUBS. */
6869 x = op0;
6870 goto cost_minus;
6873 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6874 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6875 && CONST_INT_P (XEXP (op0, 2)))
6877 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6878 Handle it here directly rather than going to cost_logic
6879 since we know the immediate generated for the TST is valid,
6880 so we can avoid creating an intermediate rtx for it just
6881 for costing purposes. */
6882 if (speed)
6883 *cost += extra_cost->alu.logical;
6885 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6886 ZERO_EXTRACT, 0, speed);
6887 return true;
6890 if (GET_CODE (op1) == NEG)
6892 /* CMN. */
6893 if (speed)
6894 *cost += extra_cost->alu.arith;
6896 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6897 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6898 return true;
6901 /* CMP.
6903 Compare can freely swap the order of operands, and
6904 canonicalization puts the more complex operation first.
6905 But the integer MINUS logic expects the shift/extend
6906 operation in op1. */
6907 if (! (REG_P (op0)
6908 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6910 op0 = XEXP (x, 1);
6911 op1 = XEXP (x, 0);
6913 goto cost_minus;
6916 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6918 /* FCMP. */
6919 if (speed)
6920 *cost += extra_cost->fp[mode == DFmode].compare;
6922 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6924 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6925 /* FCMP supports constant 0.0 for no extra cost. */
6926 return true;
6928 return false;
6931 if (VECTOR_MODE_P (mode))
6933 /* Vector compare. */
6934 if (speed)
6935 *cost += extra_cost->vect.alu;
6937 if (aarch64_float_const_zero_rtx_p (op1))
6939 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6940 cost. */
6941 return true;
6943 return false;
6945 return false;
6947 case MINUS:
6949 op0 = XEXP (x, 0);
6950 op1 = XEXP (x, 1);
6952 cost_minus:
6953 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6955 /* Detect valid immediates. */
6956 if ((GET_MODE_CLASS (mode) == MODE_INT
6957 || (GET_MODE_CLASS (mode) == MODE_CC
6958 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6959 && CONST_INT_P (op1)
6960 && aarch64_uimm12_shift (INTVAL (op1)))
6962 if (speed)
6963 /* SUB(S) (immediate). */
6964 *cost += extra_cost->alu.arith;
6965 return true;
6968 /* Look for SUB (extended register). */
6969 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6971 if (speed)
6972 *cost += extra_cost->alu.extend_arith;
6974 op1 = aarch64_strip_extend (op1);
6975 *cost += rtx_cost (op1, VOIDmode,
6976 (enum rtx_code) GET_CODE (op1), 0, speed);
6977 return true;
6980 rtx new_op1 = aarch64_strip_extend (op1);
6982 /* Cost this as an FMA-alike operation. */
6983 if ((GET_CODE (new_op1) == MULT
6984 || aarch64_shift_p (GET_CODE (new_op1)))
6985 && code != COMPARE)
6987 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6988 (enum rtx_code) code,
6989 speed);
6990 return true;
6993 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6995 if (speed)
6997 if (VECTOR_MODE_P (mode))
6999 /* Vector SUB. */
7000 *cost += extra_cost->vect.alu;
7002 else if (GET_MODE_CLASS (mode) == MODE_INT)
7004 /* SUB(S). */
7005 *cost += extra_cost->alu.arith;
7007 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7009 /* FSUB. */
7010 *cost += extra_cost->fp[mode == DFmode].addsub;
7013 return true;
7016 case PLUS:
7018 rtx new_op0;
7020 op0 = XEXP (x, 0);
7021 op1 = XEXP (x, 1);
7023 cost_plus:
7024 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
7025 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
7027 /* CSINC. */
7028 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
7029 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7030 return true;
7033 if (GET_MODE_CLASS (mode) == MODE_INT
7034 && CONST_INT_P (op1)
7035 && aarch64_uimm12_shift (INTVAL (op1)))
7037 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
7039 if (speed)
7040 /* ADD (immediate). */
7041 *cost += extra_cost->alu.arith;
7042 return true;
7045 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7047 /* Look for ADD (extended register). */
7048 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7050 if (speed)
7051 *cost += extra_cost->alu.extend_arith;
7053 op0 = aarch64_strip_extend (op0);
7054 *cost += rtx_cost (op0, VOIDmode,
7055 (enum rtx_code) GET_CODE (op0), 0, speed);
7056 return true;
7059 /* Strip any extend, leave shifts behind as we will
7060 cost them through mult_cost. */
7061 new_op0 = aarch64_strip_extend (op0);
7063 if (GET_CODE (new_op0) == MULT
7064 || aarch64_shift_p (GET_CODE (new_op0)))
7066 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7067 speed);
7068 return true;
7071 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7073 if (speed)
7075 if (VECTOR_MODE_P (mode))
7077 /* Vector ADD. */
7078 *cost += extra_cost->vect.alu;
7080 else if (GET_MODE_CLASS (mode) == MODE_INT)
7082 /* ADD. */
7083 *cost += extra_cost->alu.arith;
7085 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7087 /* FADD. */
7088 *cost += extra_cost->fp[mode == DFmode].addsub;
7091 return true;
7094 case BSWAP:
7095 *cost = COSTS_N_INSNS (1);
7097 if (speed)
7099 if (VECTOR_MODE_P (mode))
7100 *cost += extra_cost->vect.alu;
7101 else
7102 *cost += extra_cost->alu.rev;
7104 return false;
7106 case IOR:
7107 if (aarch_rev16_p (x))
7109 *cost = COSTS_N_INSNS (1);
7111 if (speed)
7113 if (VECTOR_MODE_P (mode))
7114 *cost += extra_cost->vect.alu;
7115 else
7116 *cost += extra_cost->alu.rev;
7118 return true;
7121 if (aarch64_extr_rtx_p (x, &op0, &op1))
7123 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7124 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7125 if (speed)
7126 *cost += extra_cost->alu.shift;
7128 return true;
7130 /* Fall through. */
7131 case XOR:
7132 case AND:
7133 cost_logic:
7134 op0 = XEXP (x, 0);
7135 op1 = XEXP (x, 1);
7137 if (VECTOR_MODE_P (mode))
7139 if (speed)
7140 *cost += extra_cost->vect.alu;
7141 return true;
7144 if (code == AND
7145 && GET_CODE (op0) == MULT
7146 && CONST_INT_P (XEXP (op0, 1))
7147 && CONST_INT_P (op1)
7148 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7149 INTVAL (op1)) != 0)
7151 /* This is a UBFM/SBFM. */
7152 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7153 if (speed)
7154 *cost += extra_cost->alu.bfx;
7155 return true;
7158 if (GET_MODE_CLASS (mode) == MODE_INT)
7160 if (CONST_INT_P (op1))
7162 /* We have a mask + shift version of a UBFIZ
7163 i.e. the *andim_ashift<mode>_bfiz pattern. */
7164 if (GET_CODE (op0) == ASHIFT
7165 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7166 XEXP (op0, 1)))
7168 *cost += rtx_cost (XEXP (op0, 0), mode,
7169 (enum rtx_code) code, 0, speed);
7170 if (speed)
7171 *cost += extra_cost->alu.bfx;
7173 return true;
7175 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7177 /* We possibly get the immediate for free; this is not
7178 modelled. */
7179 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7180 if (speed)
7181 *cost += extra_cost->alu.logical;
7183 return true;
7186 else
7188 rtx new_op0 = op0;
7190 /* Handle ORN, EON, or BIC. */
7191 if (GET_CODE (op0) == NOT)
7192 op0 = XEXP (op0, 0);
7194 new_op0 = aarch64_strip_shift (op0);
7196 /* If we had a shift on op0 then this is a logical-shift-
7197 by-register/immediate operation. Otherwise, this is just
7198 a logical operation. */
7199 if (speed)
7201 if (new_op0 != op0)
7203 /* Shift by immediate. */
7204 if (CONST_INT_P (XEXP (op0, 1)))
7205 *cost += extra_cost->alu.log_shift;
7206 else
7207 *cost += extra_cost->alu.log_shift_reg;
7209 else
7210 *cost += extra_cost->alu.logical;
7213 /* In both cases we want to cost both operands. */
7214 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7215 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7217 return true;
7220 return false;
7222 case NOT:
7223 x = XEXP (x, 0);
7224 op0 = aarch64_strip_shift (x);
7226 if (VECTOR_MODE_P (mode))
7228 /* Vector NOT. */
7229 *cost += extra_cost->vect.alu;
7230 return false;
7233 /* MVN-shifted-reg. */
7234 if (op0 != x)
7236 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7238 if (speed)
7239 *cost += extra_cost->alu.log_shift;
7241 return true;
7243 /* EON can have two forms: (xor (not a) b) and (not (xor a b)).
7244 Handle the second form here, taking care that 'a' in the above can
7245 be a shift. */
7246 else if (GET_CODE (op0) == XOR)
7248 rtx newop0 = XEXP (op0, 0);
7249 rtx newop1 = XEXP (op0, 1);
7250 rtx op0_stripped = aarch64_strip_shift (newop0);
7252 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7253 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7255 if (speed)
7257 if (op0_stripped != newop0)
7258 *cost += extra_cost->alu.log_shift;
7259 else
7260 *cost += extra_cost->alu.logical;
7263 return true;
7265 /* MVN. */
7266 if (speed)
7267 *cost += extra_cost->alu.logical;
7269 return false;
7271 case ZERO_EXTEND:
7273 op0 = XEXP (x, 0);
7274 /* If a value is written in SI mode, then zero extended to DI
7275 mode, the operation will in general be free as a write to
7276 a 'w' register implicitly zeroes the upper bits of an 'x'
7277 register. However, if this is
7279 (set (reg) (zero_extend (reg)))
7281 we must cost the explicit register move. */
7282 if (mode == DImode
7283 && GET_MODE (op0) == SImode
7284 && outer == SET)
7286 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7288 /* If OP_COST is non-zero, then the cost of the zero extend
7289 is effectively the cost of the inner operation. Otherwise
7290 we have a MOV instruction and we take the cost from the MOV
7291 itself. This is true independently of whether we are
7292 optimizing for space or time. */
7293 if (op_cost)
7294 *cost = op_cost;
7296 return true;
7298 else if (MEM_P (op0))
7300 /* All loads can zero extend to any size for free. */
7301 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7302 return true;
7305 op0 = aarch64_extend_bitfield_pattern_p (x);
7306 if (op0)
7308 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7309 if (speed)
7310 *cost += extra_cost->alu.bfx;
7311 return true;
7314 if (speed)
7316 if (VECTOR_MODE_P (mode))
7318 /* UMOV. */
7319 *cost += extra_cost->vect.alu;
7321 else
7323 /* We generate an AND instead of UXTB/UXTH. */
7324 *cost += extra_cost->alu.logical;
7327 return false;
7329 case SIGN_EXTEND:
7330 if (MEM_P (XEXP (x, 0)))
7332 /* LDRSH. */
7333 if (speed)
7335 rtx address = XEXP (XEXP (x, 0), 0);
7336 *cost += extra_cost->ldst.load_sign_extend;
7338 *cost +=
7339 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7340 0, speed));
7342 return true;
7345 op0 = aarch64_extend_bitfield_pattern_p (x);
7346 if (op0)
7348 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7349 if (speed)
7350 *cost += extra_cost->alu.bfx;
7351 return true;
7354 if (speed)
7356 if (VECTOR_MODE_P (mode))
7357 *cost += extra_cost->vect.alu;
7358 else
7359 *cost += extra_cost->alu.extend;
7361 return false;
7363 case ASHIFT:
7364 op0 = XEXP (x, 0);
7365 op1 = XEXP (x, 1);
7367 if (CONST_INT_P (op1))
7369 if (speed)
7371 if (VECTOR_MODE_P (mode))
7373 /* Vector shift (immediate). */
7374 *cost += extra_cost->vect.alu;
7376 else
7378 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7379 aliases. */
7380 *cost += extra_cost->alu.shift;
7384 /* We can incorporate zero/sign extend for free. */
7385 if (GET_CODE (op0) == ZERO_EXTEND
7386 || GET_CODE (op0) == SIGN_EXTEND)
7387 op0 = XEXP (op0, 0);
7389 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7390 return true;
7392 else
7394 if (speed)
7396 if (VECTOR_MODE_P (mode))
7398 /* Vector shift (register). */
7399 *cost += extra_cost->vect.alu;
7401 else
7403 /* LSLV. */
7404 *cost += extra_cost->alu.shift_reg;
7407 return false; /* All arguments need to be in registers. */
7410 case ROTATE:
7411 case ROTATERT:
7412 case LSHIFTRT:
7413 case ASHIFTRT:
7414 op0 = XEXP (x, 0);
7415 op1 = XEXP (x, 1);
7417 if (CONST_INT_P (op1))
7419 /* ASR (immediate) and friends. */
7420 if (speed)
7422 if (VECTOR_MODE_P (mode))
7423 *cost += extra_cost->vect.alu;
7424 else
7425 *cost += extra_cost->alu.shift;
7428 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7429 return true;
7431 else
7434 /* ASR (register) and friends. */
7435 if (speed)
7437 if (VECTOR_MODE_P (mode))
7438 *cost += extra_cost->vect.alu;
7439 else
7440 *cost += extra_cost->alu.shift_reg;
7442 return false; /* All arguments need to be in registers. */
7445 case SYMBOL_REF:
7447 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7448 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7450 /* LDR. */
7451 if (speed)
7452 *cost += extra_cost->ldst.load;
7454 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7455 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7457 /* ADRP, followed by ADD. */
7458 *cost += COSTS_N_INSNS (1);
7459 if (speed)
7460 *cost += 2 * extra_cost->alu.arith;
7462 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7463 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7465 /* ADR. */
7466 if (speed)
7467 *cost += extra_cost->alu.arith;
7470 if (flag_pic)
7472 /* One extra load instruction, after accessing the GOT. */
7473 *cost += COSTS_N_INSNS (1);
7474 if (speed)
7475 *cost += extra_cost->ldst.load;
7477 return true;
7479 case HIGH:
7480 case LO_SUM:
7481 /* ADRP/ADD (immediate). */
7482 if (speed)
7483 *cost += extra_cost->alu.arith;
7484 return true;
7486 case ZERO_EXTRACT:
7487 case SIGN_EXTRACT:
7488 /* UBFX/SBFX. */
7489 if (speed)
7491 if (VECTOR_MODE_P (mode))
7492 *cost += extra_cost->vect.alu;
7493 else
7494 *cost += extra_cost->alu.bfx;
7497 /* We can trust that the immediates used will be correct (there
7498 are no by-register forms), so we need only cost op0. */
7499 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7500 return true;
7502 case MULT:
7503 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7504 /* aarch64_rtx_mult_cost always handles recursion to its
7505 operands. */
7506 return true;
7508 case MOD:
7509 /* We can expand signed mod by a power of 2 using a NEGS, two parallel
7510 ANDs and a CSNEG. Assume here that the cost of a CSNEG is the same as
7511 that of an unconditional negate. This case should only ever be reached
7512 through the set_smod_pow2_cheap check in expmed.c. */
7513 if (CONST_INT_P (XEXP (x, 1))
7514 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7515 && (mode == SImode || mode == DImode))
7517 /* We expand to 4 instructions. Reset the baseline. */
7518 *cost = COSTS_N_INSNS (4);
7520 if (speed)
7521 *cost += 2 * extra_cost->alu.logical
7522 + 2 * extra_cost->alu.arith;
7524 return true;
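/* For illustration only: in C terms the branchless expansion referred to
   above computes, for a power-of-two divisor d == 1 << k,

     x >= 0 ?  (x & (d - 1))
	    : -((-x) & (d - 1));

   i.e. a NEGS, two ANDs with the mask and a CSNEG, which is where the
   four-instruction baseline comes from (the negation wraps at the machine
   level, so INT_MIN is handled correctly).  */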
7527 /* Fall-through. */
7528 case UMOD:
7529 if (speed)
7531 if (VECTOR_MODE_P (mode))
7532 *cost += extra_cost->vect.alu;
7533 else if (GET_MODE_CLASS (mode) == MODE_INT)
7534 *cost += (extra_cost->mult[mode == DImode].add
7535 + extra_cost->mult[mode == DImode].idiv);
7536 else if (mode == DFmode)
7537 *cost += (extra_cost->fp[1].mult
7538 + extra_cost->fp[1].div);
7539 else if (mode == SFmode)
7540 *cost += (extra_cost->fp[0].mult
7541 + extra_cost->fp[0].div);
7543 return false; /* All arguments need to be in registers. */
7545 case DIV:
7546 case UDIV:
7547 case SQRT:
7548 if (speed)
7550 if (VECTOR_MODE_P (mode))
7551 *cost += extra_cost->vect.alu;
7552 else if (GET_MODE_CLASS (mode) == MODE_INT)
7553 /* There is no integer SQRT, so only DIV and UDIV can get
7554 here. */
7555 *cost += extra_cost->mult[mode == DImode].idiv;
7556 else
7557 *cost += extra_cost->fp[mode == DFmode].div;
7559 return false; /* All arguments need to be in registers. */
7561 case IF_THEN_ELSE:
7562 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7563 XEXP (x, 2), cost, speed);
7565 case EQ:
7566 case NE:
7567 case GT:
7568 case GTU:
7569 case LT:
7570 case LTU:
7571 case GE:
7572 case GEU:
7573 case LE:
7574 case LEU:
7576 return false; /* All arguments must be in registers. */
7578 case FMA:
7579 op0 = XEXP (x, 0);
7580 op1 = XEXP (x, 1);
7581 op2 = XEXP (x, 2);
7583 if (speed)
7585 if (VECTOR_MODE_P (mode))
7586 *cost += extra_cost->vect.alu;
7587 else
7588 *cost += extra_cost->fp[mode == DFmode].fma;
7591 /* FMSUB, FNMADD, and FNMSUB are free. */
7592 if (GET_CODE (op0) == NEG)
7593 op0 = XEXP (op0, 0);
7595 if (GET_CODE (op2) == NEG)
7596 op2 = XEXP (op2, 0);
7598 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7599 and the by-element operand as operand 0. */
7600 if (GET_CODE (op1) == NEG)
7601 op1 = XEXP (op1, 0);
7603 /* Catch vector-by-element operations. The by-element operand can
7604 either be (vec_duplicate (vec_select (x))) or just
7605 (vec_select (x)), depending on whether we are multiplying by
7606 a vector or a scalar.
7608 Canonicalization is not very good in these cases: FMA4 will put the
7609 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7610 if (GET_CODE (op0) == VEC_DUPLICATE)
7611 op0 = XEXP (op0, 0);
7612 else if (GET_CODE (op1) == VEC_DUPLICATE)
7613 op1 = XEXP (op1, 0);
7615 if (GET_CODE (op0) == VEC_SELECT)
7616 op0 = XEXP (op0, 0);
7617 else if (GET_CODE (op1) == VEC_SELECT)
7618 op1 = XEXP (op1, 0);
7620 /* If the remaining parameters are not registers,
7621 get the cost to put them into registers. */
7622 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7623 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7624 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7625 return true;
7627 case FLOAT:
7628 case UNSIGNED_FLOAT:
7629 if (speed)
7630 *cost += extra_cost->fp[mode == DFmode].fromint;
7631 return false;
7633 case FLOAT_EXTEND:
7634 if (speed)
7636 if (VECTOR_MODE_P (mode))
7638 /* Vector widening conversion. */
7639 *cost += extra_cost->vect.alu;
7641 else
7642 *cost += extra_cost->fp[mode == DFmode].widen;
7644 return false;
7646 case FLOAT_TRUNCATE:
7647 if (speed)
7649 if (VECTOR_MODE_P (mode))
7651 /* Vector narrowing conversion. */
7652 *cost += extra_cost->vect.alu;
7654 else
7655 *cost += extra_cost->fp[mode == DFmode].narrow;
7657 return false;
7659 case FIX:
7660 case UNSIGNED_FIX:
7661 x = XEXP (x, 0);
7662 /* Strip the rounding part; the rounding variants will all be
7663 implemented by the fcvt* family of instructions anyway. */
7664 if (GET_CODE (x) == UNSPEC)
7666 unsigned int uns_code = XINT (x, 1);
7668 if (uns_code == UNSPEC_FRINTA
7669 || uns_code == UNSPEC_FRINTM
7670 || uns_code == UNSPEC_FRINTN
7671 || uns_code == UNSPEC_FRINTP
7672 || uns_code == UNSPEC_FRINTZ)
7673 x = XVECEXP (x, 0, 0);
7676 if (speed)
7678 if (VECTOR_MODE_P (mode))
7679 *cost += extra_cost->vect.alu;
7680 else
7681 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7684 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7685 fixed-point fcvt. */
7686 if (GET_CODE (x) == MULT
7687 && ((VECTOR_MODE_P (mode)
7688 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7689 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7691 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7692 0, speed);
7693 return true;
7696 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7697 return true;
7699 case ABS:
7700 if (VECTOR_MODE_P (mode))
7702 /* ABS (vector). */
7703 if (speed)
7704 *cost += extra_cost->vect.alu;
7706 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7708 op0 = XEXP (x, 0);
7710 /* FABD, which is analogous to FADD. */
7711 if (GET_CODE (op0) == MINUS)
7713 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7714 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7715 if (speed)
7716 *cost += extra_cost->fp[mode == DFmode].addsub;
7718 return true;
7720 /* Simple FABS is analogous to FNEG. */
7721 if (speed)
7722 *cost += extra_cost->fp[mode == DFmode].neg;
7724 else
7726 /* Integer ABS will either be split into
7727 two arithmetic instructions, or will be an ABS
7728 (scalar), which we don't model. */
7729 *cost = COSTS_N_INSNS (2);
7730 if (speed)
7731 *cost += 2 * extra_cost->alu.arith;
7733 return false;
7735 case SMAX:
7736 case SMIN:
7737 if (speed)
7739 if (VECTOR_MODE_P (mode))
7740 *cost += extra_cost->vect.alu;
7741 else
7743 /* FMAXNM/FMINNM/FMAX/FMIN.
7744 TODO: This may not be accurate for all implementations, but
7745 we do not model this in the cost tables. */
7746 *cost += extra_cost->fp[mode == DFmode].addsub;
7749 return false;
7751 case UNSPEC:
7752 /* The floating point round to integer frint* instructions. */
7753 if (aarch64_frint_unspec_p (XINT (x, 1)))
7755 if (speed)
7756 *cost += extra_cost->fp[mode == DFmode].roundint;
7758 return false;
7761 if (XINT (x, 1) == UNSPEC_RBIT)
7763 if (speed)
7764 *cost += extra_cost->alu.rev;
7766 return false;
7768 break;
7770 case TRUNCATE:
7772 /* Decompose <su>muldi3_highpart. */
7773 if (/* (truncate:DI */
7774 mode == DImode
7775 /* (lshiftrt:TI */
7776 && GET_MODE (XEXP (x, 0)) == TImode
7777 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7778 /* (mult:TI */
7779 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7780 /* (ANY_EXTEND:TI (reg:DI))
7781 (ANY_EXTEND:TI (reg:DI))) */
7782 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7783 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7784 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7785 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7786 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7787 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7788 /* (const_int 64) */
7789 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7790 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7792 /* UMULH/SMULH. */
7793 if (speed)
7794 *cost += extra_cost->mult[mode == DImode].extend;
7795 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7796 mode, MULT, 0, speed);
7797 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7798 mode, MULT, 1, speed);
7799 return true;
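/* For illustration only: the RTL shape matched above is what taking the high
   half of a full 64x64->128 multiply produces, e.g.

     unsigned long long
     ex_umulh (unsigned long long a, unsigned long long b)
     {
       return (unsigned long long) (((unsigned __int128) a * b) >> 64);
     }

   which a recent GCC is expected to compile to a single UMULH (SMULH for the
   signed variant).  */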
7802 /* Fall through. */
7803 default:
7804 break;
7807 if (dump_file
7808 && flag_aarch64_verbose_cost)
7809 fprintf (dump_file,
7810 "\nFailed to cost RTX. Assuming default cost.\n");
7812 return true;
7815 /* Wrapper around aarch64_rtx_costs that dumps the partial or total cost
7816 calculated for X. This cost is stored in *COST. Returns true
7817 if the total cost of X was calculated. */
7818 static bool
7819 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7820 int param, int *cost, bool speed)
7822 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7824 if (dump_file
7825 && flag_aarch64_verbose_cost)
7827 print_rtl_single (dump_file, x);
7828 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7829 speed ? "Hot" : "Cold",
7830 *cost, result ? "final" : "partial");
7833 return result;
7836 static int
7837 aarch64_register_move_cost (machine_mode mode,
7838 reg_class_t from_i, reg_class_t to_i)
7840 enum reg_class from = (enum reg_class) from_i;
7841 enum reg_class to = (enum reg_class) to_i;
7842 const struct cpu_regmove_cost *regmove_cost
7843 = aarch64_tune_params.regmove_cost;
7845 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7846 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7847 to = GENERAL_REGS;
7849 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7850 from = GENERAL_REGS;
7852 /* Moving between GPR and stack cost is the same as GP2GP. */
7853 if ((from == GENERAL_REGS && to == STACK_REG)
7854 || (to == GENERAL_REGS && from == STACK_REG))
7855 return regmove_cost->GP2GP;
7857 /* To/From the stack register, we move via the gprs. */
7858 if (to == STACK_REG || from == STACK_REG)
7859 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7860 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7862 if (GET_MODE_SIZE (mode) == 16)
7864 /* 128-bit operations on general registers require 2 instructions. */
7865 if (from == GENERAL_REGS && to == GENERAL_REGS)
7866 return regmove_cost->GP2GP * 2;
7867 else if (from == GENERAL_REGS)
7868 return regmove_cost->GP2FP * 2;
7869 else if (to == GENERAL_REGS)
7870 return regmove_cost->FP2GP * 2;
7872 /* When AdvSIMD instructions are disabled it is not possible to move
7873 a 128-bit value directly between Q registers. This is handled in
7874 secondary reload. A general register is used as a scratch to move
7875 the upper DI value and the lower DI value is moved directly,
7876 hence the cost is the sum of three moves. */
7877 if (! TARGET_SIMD)
7878 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7880 return regmove_cost->FP2FP;
7883 if (from == GENERAL_REGS && to == GENERAL_REGS)
7884 return regmove_cost->GP2GP;
7885 else if (from == GENERAL_REGS)
7886 return regmove_cost->GP2FP;
7887 else if (to == GENERAL_REGS)
7888 return regmove_cost->FP2GP;
7890 return regmove_cost->FP2FP;
7893 static int
7894 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7895 reg_class_t rclass ATTRIBUTE_UNUSED,
7896 bool in ATTRIBUTE_UNUSED)
7898 return aarch64_tune_params.memmov_cost;
7901 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7902 to optimize 1.0/sqrt. */
7904 static bool
7905 use_rsqrt_p (machine_mode mode)
7907 return (!flag_trapping_math
7908 && flag_unsafe_math_optimizations
7909 && ((aarch64_tune_params.approx_modes->recip_sqrt
7910 & AARCH64_APPROX_MODE (mode))
7911 || flag_mrecip_low_precision_sqrt));
7914 /* Function to decide when to use the approximate reciprocal square root
7915 builtin. */
7917 static tree
7918 aarch64_builtin_reciprocal (tree fndecl)
7920 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7922 if (!use_rsqrt_p (mode))
7923 return NULL_TREE;
7924 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7927 typedef rtx (*rsqrte_type) (rtx, rtx);
7929 /* Select reciprocal square root initial estimate insn depending on machine
7930 mode. */
7932 static rsqrte_type
7933 get_rsqrte_type (machine_mode mode)
7935 switch (mode)
7937 case DFmode: return gen_aarch64_rsqrtedf;
7938 case SFmode: return gen_aarch64_rsqrtesf;
7939 case V2DFmode: return gen_aarch64_rsqrtev2df;
7940 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7941 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7942 default: gcc_unreachable ();
7946 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7948 /* Select reciprocal square root series step insn depending on machine mode. */
7950 static rsqrts_type
7951 get_rsqrts_type (machine_mode mode)
7953 switch (mode)
7955 case DFmode: return gen_aarch64_rsqrtsdf;
7956 case SFmode: return gen_aarch64_rsqrtssf;
7957 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7958 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7959 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7960 default: gcc_unreachable ();
7964 /* Emit instruction sequence to compute either the approximate square root
7965 or its approximate reciprocal, depending on the flag RECP, and return
7966 whether the sequence was emitted or not. */
7968 bool
7969 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7971 machine_mode mode = GET_MODE (dst);
7973 if (GET_MODE_INNER (mode) == HFmode)
7974 return false;
7976 machine_mode mmsk = mode_for_vector
7977 (int_mode_for_mode (GET_MODE_INNER (mode)),
7978 GET_MODE_NUNITS (mode));
7979 bool use_approx_sqrt_p = (!recp
7980 && (flag_mlow_precision_sqrt
7981 || (aarch64_tune_params.approx_modes->sqrt
7982 & AARCH64_APPROX_MODE (mode))));
7983 bool use_approx_rsqrt_p = (recp
7984 && (flag_mrecip_low_precision_sqrt
7985 || (aarch64_tune_params.approx_modes->recip_sqrt
7986 & AARCH64_APPROX_MODE (mode))));
7988 if (!flag_finite_math_only
7989 || flag_trapping_math
7990 || !flag_unsafe_math_optimizations
7991 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7992 || optimize_function_for_size_p (cfun))
7993 return false;
7995 rtx xmsk = gen_reg_rtx (mmsk);
7996 if (!recp)
7997 /* When calculating the approximate square root, compare the argument with
7998 0.0 and create a mask. */
7999 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
8000 CONST0_RTX (mode)))));
8002 /* Estimate the approximate reciprocal square root. */
8003 rtx xdst = gen_reg_rtx (mode);
8004 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
8006 /* Iterate over the series twice for SF and thrice for DF. */
8007 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8009 /* Optionally iterate over the series once less for faster performance
8010 at the cost of some accuracy. */
8011 if ((recp && flag_mrecip_low_precision_sqrt)
8012 || (!recp && flag_mlow_precision_sqrt))
8013 iterations--;
8015 /* Iterate over the series to calculate the approximate reciprocal square
8016 root. */
8017 rtx x1 = gen_reg_rtx (mode);
8018 while (iterations--)
8020 rtx x2 = gen_reg_rtx (mode);
8021 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
8023 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
8025 if (iterations > 0)
8026 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
8029 if (!recp)
8031 /* Qualify the approximate reciprocal square root when the argument is
8032 0.0 by squashing the intermediary result to 0.0. */
8033 rtx xtmp = gen_reg_rtx (mmsk);
8034 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
8035 gen_rtx_SUBREG (mmsk, xdst, 0)));
8036 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
8038 /* Calculate the approximate square root. */
8039 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
8042 /* Finalize the approximation. */
8043 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8045 return true;
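/* For illustration only (not used by the compiler): a standalone sketch of
   the Newton-Raphson step driven above.  FRSQRTS computes (3 - a*b) / 2, so
   each iteration refines an estimate E of 1/sqrt(D) as E * (3 - D*E*E) / 2.
   A deliberately crude starting guess stands in for the FRSQRTE hardware
   estimate; the real sequence runs 2 (SF) or 3 (DF) steps.  Compile it
   separately, linking with -lm for the reference value.  */

#include <stdio.h>
#include <math.h>

static double
ex_rsqrt_step (double d, double e)
{
  return e * (3.0 - d * e * e) / 2.0;
}

int
main (void)
{
  double d = 2.0;
  double e = 0.5;	/* Crude initial estimate of 1/sqrt(2).  */

  for (int i = 0; i < 4; i++)
    {
      e = ex_rsqrt_step (d, e);
      printf ("step %d: %.12f\n", i, e);
    }
  printf ("reference:  %.12f\n", 1.0 / sqrt (d));
  return 0;
}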
8048 typedef rtx (*recpe_type) (rtx, rtx);
8050 /* Select reciprocal initial estimate insn depending on machine mode. */
8052 static recpe_type
8053 get_recpe_type (machine_mode mode)
8055 switch (mode)
8057 case SFmode: return (gen_aarch64_frecpesf);
8058 case V2SFmode: return (gen_aarch64_frecpev2sf);
8059 case V4SFmode: return (gen_aarch64_frecpev4sf);
8060 case DFmode: return (gen_aarch64_frecpedf);
8061 case V2DFmode: return (gen_aarch64_frecpev2df);
8062 default: gcc_unreachable ();
8066 typedef rtx (*recps_type) (rtx, rtx, rtx);
8068 /* Select reciprocal series step insn depending on machine mode. */
8070 static recps_type
8071 get_recps_type (machine_mode mode)
8073 switch (mode)
8075 case SFmode: return (gen_aarch64_frecpssf);
8076 case V2SFmode: return (gen_aarch64_frecpsv2sf);
8077 case V4SFmode: return (gen_aarch64_frecpsv4sf);
8078 case DFmode: return (gen_aarch64_frecpsdf);
8079 case V2DFmode: return (gen_aarch64_frecpsv2df);
8080 default: gcc_unreachable ();
8084 /* Emit the instruction sequence to compute the approximation for the division
8085 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8087 bool
8088 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8090 machine_mode mode = GET_MODE (quo);
8092 if (GET_MODE_INNER (mode) == HFmode)
8093 return false;
8095 bool use_approx_division_p = (flag_mlow_precision_div
8096 || (aarch64_tune_params.approx_modes->division
8097 & AARCH64_APPROX_MODE (mode)));
8099 if (!flag_finite_math_only
8100 || flag_trapping_math
8101 || !flag_unsafe_math_optimizations
8102 || optimize_function_for_size_p (cfun)
8103 || !use_approx_division_p)
8104 return false;
8106 /* Estimate the approximate reciprocal. */
8107 rtx xrcp = gen_reg_rtx (mode);
8108 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8110 /* Iterate over the series twice for SF and thrice for DF. */
8111 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8113 /* Optionally run one fewer iteration of the series for faster performance,
8114 at the expense of some accuracy. */
8115 if (flag_mlow_precision_div)
8116 iterations--;
8118 /* Iterate over the series to calculate the approximate reciprocal. */
8119 rtx xtmp = gen_reg_rtx (mode);
8120 while (iterations--)
8122 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8124 if (iterations > 0)
8125 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8128 if (num != CONST1_RTX (mode))
8130 /* As the approximate reciprocal of DEN is already calculated, only
8131 calculate the approximate division when NUM is not 1.0. */
8132 rtx xnum = force_reg (mode, num);
8133 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8136 /* Finalize the approximation. */
8137 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8138 return true;
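/* Illustration only, not part of GCC: a standalone scalar sketch of the
   reciprocal refinement emitted above via the FRECPE/FRECPS patterns.
   Each step improves an estimate X of 1/DEN using x' = x * (2 - den * x),
   and the quotient is then obtained as NUM * (1/DEN).  */
static double
approx_div_sketch (double num, double den, double x, int iterations)
{
  while (iterations-- > 0)
    x = x * (2.0 - den * x);  /* One FRECPS-style refinement.  */
  return num * x;             /* Approximates num / den.  */
}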
8141 /* Return the number of instructions that can be issued per cycle. */
8142 static int
8143 aarch64_sched_issue_rate (void)
8145 return aarch64_tune_params.issue_rate;
8148 static int
8149 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8151 int issue_rate = aarch64_sched_issue_rate ();
8153 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8157 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8158 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8159 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8161 static int
8162 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8163 int ready_index)
8165 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8169 /* Vectorizer cost model target hooks. */
8171 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8172 static int
8173 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8174 tree vectype,
8175 int misalign ATTRIBUTE_UNUSED)
8177 unsigned elements;
8178 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8179 bool fp = false;
8181 if (vectype != NULL)
8182 fp = FLOAT_TYPE_P (vectype);
8184 switch (type_of_cost)
8186 case scalar_stmt:
8187 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8189 case scalar_load:
8190 return costs->scalar_load_cost;
8192 case scalar_store:
8193 return costs->scalar_store_cost;
8195 case vector_stmt:
8196 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8198 case vector_load:
8199 return costs->vec_align_load_cost;
8201 case vector_store:
8202 return costs->vec_store_cost;
8204 case vec_to_scalar:
8205 return costs->vec_to_scalar_cost;
8207 case scalar_to_vec:
8208 return costs->scalar_to_vec_cost;
8210 case unaligned_load:
8211 return costs->vec_unalign_load_cost;
8213 case unaligned_store:
8214 return costs->vec_unalign_store_cost;
8216 case cond_branch_taken:
8217 return costs->cond_taken_branch_cost;
8219 case cond_branch_not_taken:
8220 return costs->cond_not_taken_branch_cost;
8222 case vec_perm:
8223 return costs->vec_permute_cost;
8225 case vec_promote_demote:
8226 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8228 case vec_construct:
8229 elements = TYPE_VECTOR_SUBPARTS (vectype);
8230 return elements / 2 + 1;
8232 default:
8233 gcc_unreachable ();
8237 /* Implement targetm.vectorize.add_stmt_cost. */
8238 static unsigned
8239 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8240 struct _stmt_vec_info *stmt_info, int misalign,
8241 enum vect_cost_model_location where)
8243 unsigned *cost = (unsigned *) data;
8244 unsigned retval = 0;
8246 if (flag_vect_cost_model)
8248 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8249 int stmt_cost =
8250 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8252 /* Statements in an inner loop relative to the loop being
8253 vectorized are weighted more heavily. The value here is
8254 arbitrary and could potentially be improved with analysis. */
8255 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8256 count *= 50; /* FIXME */
8258 retval = (unsigned) (count * stmt_cost);
8259 cost[where] += retval;
8262 return retval;
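/* For example (with the vectorizer cost model enabled): a single vector
   statement that lies in an inner loop relative to the loop being vectorized
   has its COUNT scaled by 50, so it contributes 50 times its per-statement
   cost to the vect_body bucket.  */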
8265 static void initialize_aarch64_code_model (struct gcc_options *);
8267 /* Parse the TO_PARSE string and put the architecture struct that it
8268 selects into RES and the architectural features into ISA_FLAGS.
8269 Return an aarch64_parse_opt_result describing the parse result.
8270 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8272 static enum aarch64_parse_opt_result
8273 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8274 unsigned long *isa_flags)
8276 char *ext;
8277 const struct processor *arch;
8278 char *str = (char *) alloca (strlen (to_parse) + 1);
8279 size_t len;
8281 strcpy (str, to_parse);
8283 ext = strchr (str, '+');
8285 if (ext != NULL)
8286 len = ext - str;
8287 else
8288 len = strlen (str);
8290 if (len == 0)
8291 return AARCH64_PARSE_MISSING_ARG;
8294 /* Loop through the list of supported ARCHes to find a match. */
8295 for (arch = all_architectures; arch->name != NULL; arch++)
8297 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8299 unsigned long isa_temp = arch->flags;
8301 if (ext != NULL)
8303 /* TO_PARSE string contains at least one extension. */
8304 enum aarch64_parse_opt_result ext_res
8305 = aarch64_parse_extension (ext, &isa_temp);
8307 if (ext_res != AARCH64_PARSE_OK)
8308 return ext_res;
8310 /* Extension parsing was successful. Confirm the result
8311 arch and ISA flags. */
8312 *res = arch;
8313 *isa_flags = isa_temp;
8314 return AARCH64_PARSE_OK;
8318 /* ARCH name not found in list. */
8319 return AARCH64_PARSE_INVALID_ARG;
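/* For example, given TO_PARSE == "armv8-a+crc", the name "armv8-a" is looked
   up in all_architectures and the "+crc" tail is handed to
   aarch64_parse_extension; a string such as "+crc" with no architecture name
   fails with AARCH64_PARSE_MISSING_ARG.  */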
8322 /* Parse the TO_PARSE string and put the result tuning in RES and the
8323 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8324 describing the parse result. If there is an error parsing, RES and
8325 ISA_FLAGS are left unchanged. */
8327 static enum aarch64_parse_opt_result
8328 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8329 unsigned long *isa_flags)
8331 char *ext;
8332 const struct processor *cpu;
8333 char *str = (char *) alloca (strlen (to_parse) + 1);
8334 size_t len;
8336 strcpy (str, to_parse);
8338 ext = strchr (str, '+');
8340 if (ext != NULL)
8341 len = ext - str;
8342 else
8343 len = strlen (str);
8345 if (len == 0)
8346 return AARCH64_PARSE_MISSING_ARG;
8349 /* Loop through the list of supported CPUs to find a match. */
8350 for (cpu = all_cores; cpu->name != NULL; cpu++)
8352 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8354 unsigned long isa_temp = cpu->flags;
8357 if (ext != NULL)
8359 /* TO_PARSE string contains at least one extension. */
8360 enum aarch64_parse_opt_result ext_res
8361 = aarch64_parse_extension (ext, &isa_temp);
8363 if (ext_res != AARCH64_PARSE_OK)
8364 return ext_res;
8366 /* Extension parsing was successful. Confirm the result
8367 cpu and ISA flags. */
8368 *res = cpu;
8369 *isa_flags = isa_temp;
8370 return AARCH64_PARSE_OK;
8374 /* CPU name not found in list. */
8375 return AARCH64_PARSE_INVALID_ARG;
8378 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8379 Return an aarch64_parse_opt_result describing the parse result.
8380 If the parsing fails the RES does not change. */
8382 static enum aarch64_parse_opt_result
8383 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8385 const struct processor *cpu;
8386 char *str = (char *) alloca (strlen (to_parse) + 1);
8388 strcpy (str, to_parse);
8390 /* Loop through the list of supported CPUs to find a match. */
8391 for (cpu = all_cores; cpu->name != NULL; cpu++)
8393 if (strcmp (cpu->name, str) == 0)
8395 *res = cpu;
8396 return AARCH64_PARSE_OK;
8400 /* CPU name not found in list. */
8401 return AARCH64_PARSE_INVALID_ARG;
8404 /* Parse TOKEN, which has length LENGTH to see if it is an option
8405 described in FLAG. If it is, return the index bit for that fusion type.
8406 If not, error (printing OPTION_NAME) and return zero. */
8408 static unsigned int
8409 aarch64_parse_one_option_token (const char *token,
8410 size_t length,
8411 const struct aarch64_flag_desc *flag,
8412 const char *option_name)
8414 for (; flag->name != NULL; flag++)
8416 if (length == strlen (flag->name)
8417 && !strncmp (flag->name, token, length))
8418 return flag->flag;
8421 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8422 return 0;
8425 /* Parse OPTION which is a comma-separated list of flags to enable.
8426 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8427 default state we inherit from the CPU tuning structures. OPTION_NAME
8428 gives the top-level option we are parsing in the -moverride string,
8429 for use in error messages. */
8431 static unsigned int
8432 aarch64_parse_boolean_options (const char *option,
8433 const struct aarch64_flag_desc *flags,
8434 unsigned int initial_state,
8435 const char *option_name)
8437 const char separator = '.';
8438 const char* specs = option;
8439 const char* ntoken = option;
8440 unsigned int found_flags = initial_state;
8442 while ((ntoken = strchr (specs, separator)))
8444 size_t token_length = ntoken - specs;
8445 unsigned token_ops = aarch64_parse_one_option_token (specs,
8446 token_length,
8447 flags,
8448 option_name);
8449 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8450 in the token stream, reset the supported operations. So:
8452 adrp+add.cmp+branch.none.adrp+add
8454 would have the result of turning on only adrp+add fusion. */
8455 if (!token_ops)
8456 found_flags = 0;
8458 found_flags |= token_ops;
8459 specs = ++ntoken;
8462 /* The string ended with the separator, so it is ill-formed; report it. */
8463 if (!(*specs))
8465 error ("%s string ill-formed\n", option_name);
8466 return 0;
8469 /* We still have one more token to parse. */
8470 size_t token_length = strlen (specs);
8471 unsigned token_ops = aarch64_parse_one_option_token (specs,
8472 token_length,
8473 flags,
8474 option_name);
8475 if (!token_ops)
8476 found_flags = 0;
8478 found_flags |= token_ops;
8479 return found_flags;
8482 /* Support for overriding instruction fusion. */
8484 static void
8485 aarch64_parse_fuse_string (const char *fuse_string,
8486 struct tune_params *tune)
8488 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8489 aarch64_fusible_pairs,
8490 tune->fusible_ops,
8491 "fuse=");
8494 /* Support for overriding other tuning flags. */
8496 static void
8497 aarch64_parse_tune_string (const char *tune_string,
8498 struct tune_params *tune)
8500 tune->extra_tuning_flags
8501 = aarch64_parse_boolean_options (tune_string,
8502 aarch64_tuning_flags,
8503 tune->extra_tuning_flags,
8504 "tune=");
8507 /* Parse TOKEN, which has length LENGTH to see if it is a tuning option
8508 we understand. If it is, extract the option string and hand it off to
8509 the appropriate function. */
8511 void
8512 aarch64_parse_one_override_token (const char* token,
8513 size_t length,
8514 struct tune_params *tune)
8516 const struct aarch64_tuning_override_function *fn
8517 = aarch64_tuning_override_functions;
8519 const char *option_part = strchr (token, '=');
8520 if (!option_part)
8522 error ("tuning string missing in option (%s)", token);
8523 return;
8526 /* Get the length of the option name. */
8527 length = option_part - token;
8528 /* Skip the '=' to get to the option string. */
8529 option_part++;
8531 for (; fn->name != NULL; fn++)
8533 if (!strncmp (fn->name, token, length))
8535 fn->parse_override (option_part, tune);
8536 return;
8540 error ("unknown tuning option (%s)",token);
8541 return;
8544 /* Set the default TLS size and clamp it to what the code model allows. */
8546 static void
8547 initialize_aarch64_tls_size (struct gcc_options *opts)
8549 if (aarch64_tls_size == 0)
8550 aarch64_tls_size = 24;
8552 switch (opts->x_aarch64_cmodel_var)
8554 case AARCH64_CMODEL_TINY:
8555 /* Both the default and the maximum TLS size allowed under tiny are 1M, which
8556 needs two instructions to address, so we clamp the size to 24 bits. */
8557 if (aarch64_tls_size > 24)
8558 aarch64_tls_size = 24;
8559 break;
8560 case AARCH64_CMODEL_SMALL:
8561 /* The maximum TLS size allowed under small is 4G. */
8562 if (aarch64_tls_size > 32)
8563 aarch64_tls_size = 32;
8564 break;
8565 case AARCH64_CMODEL_LARGE:
8566 /* The maximum TLS size allowed under large is 16E.
8567 FIXME: 16E would need a 64-bit offset; we only support a 48-bit offset now. */
8568 if (aarch64_tls_size > 48)
8569 aarch64_tls_size = 48;
8570 break;
8571 default:
8572 gcc_unreachable ();
8575 return;
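/* In short: the TLS size defaults to 24 bits and is clamped to 24 bits (1M)
   under the tiny model, 32 bits (4G) under the small model and 48 bits under
   the large model.  */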
8578 /* Parse STRING looking for options in the format:
8579 string :: option:string
8580 option :: name=substring
8581 name :: {a-z}
8582 substring :: defined by option. */
8584 static void
8585 aarch64_parse_override_string (const char* input_string,
8586 struct tune_params* tune)
8588 const char separator = ':';
8589 size_t string_length = strlen (input_string) + 1;
8590 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8591 char *string = string_root;
8592 strncpy (string, input_string, string_length);
8593 string[string_length - 1] = '\0';
8595 char* ntoken = string;
8597 while ((ntoken = strchr (string, separator)))
8599 size_t token_length = ntoken - string;
8600 /* NUL-terminate this substring so it can be treated as a string. */
8601 *ntoken = '\0';
8602 aarch64_parse_one_override_token (string, token_length, tune);
8603 string = ++ntoken;
8606 /* One last option to parse. */
8607 aarch64_parse_one_override_token (string, strlen (string), tune);
8608 free (string_root);
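/* For example (illustrative), -moverride=fuse=adrp+add.cmp+branch:tune=<flag>
   is split at ':' into "fuse=adrp+add.cmp+branch" and "tune=<flag>", each of
   which is dispatched by aarch64_parse_one_override_token; <flag> stands for
   any name listed in aarch64_tuning_flags.  */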
8612 static void
8613 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8615 /* The logic here is that if we are disabling all frame pointer generation
8616 then we do not need to disable leaf frame pointer generation as a
8617 separate operation. But if we are *only* disabling leaf frame pointer
8618 generation then we set flag_omit_frame_pointer to true, but in
8619 aarch64_frame_pointer_required we return false only for leaf functions.
8621 PR 70044: We have to be careful about being called multiple times for the
8622 same function. Once we have decided to set flag_omit_frame_pointer just
8623 so that we can omit leaf frame pointers, we must then not interpret a
8624 second call as meaning that all frame pointer generation should be
8625 omitted. We do this by setting flag_omit_frame_pointer to a special,
8626 non-zero value. */
8627 if (opts->x_flag_omit_frame_pointer == 2)
8628 opts->x_flag_omit_frame_pointer = 0;
8630 if (opts->x_flag_omit_frame_pointer)
8631 opts->x_flag_omit_leaf_frame_pointer = false;
8632 else if (opts->x_flag_omit_leaf_frame_pointer)
8633 opts->x_flag_omit_frame_pointer = 2;
8635 /* If not optimizing for size, set the default
8636 alignment to what the target wants. */
8637 if (!opts->x_optimize_size)
8639 if (opts->x_align_loops <= 0)
8640 opts->x_align_loops = aarch64_tune_params.loop_align;
8641 if (opts->x_align_jumps <= 0)
8642 opts->x_align_jumps = aarch64_tune_params.jump_align;
8643 if (opts->x_align_functions <= 0)
8644 opts->x_align_functions = aarch64_tune_params.function_align;
8647 /* We default to no pc-relative literal loads. */
8649 aarch64_pcrelative_literal_loads = false;
8651 /* If -mpc-relative-literal-loads is set on the command line, this
8652 implies that the user asked for PC relative literal loads. */
8653 if (opts->x_pcrelative_literal_loads == 1)
8654 aarch64_pcrelative_literal_loads = true;
8656 /* This is PR70113. When building the Linux kernel with
8657 CONFIG_ARM64_ERRATUM_843419, support for relocations
8658 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8659 removed from the kernel to avoid loading objects with possibly
8660 offending sequences. Without -mpc-relative-literal-loads we would
8661 generate such relocations, preventing the kernel build from
8662 succeeding. */
8663 if (opts->x_pcrelative_literal_loads == 2
8664 && TARGET_FIX_ERR_A53_843419)
8665 aarch64_pcrelative_literal_loads = true;
8667 /* In the tiny memory model it makes no sense to disallow PC relative
8668 literal pool loads. */
8669 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8670 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8671 aarch64_pcrelative_literal_loads = true;
8673 /* When enabling the lower precision Newton series for the square root, also
8674 enable it for the reciprocal square root, since the latter is an
8675 intermediary step for the former. */
8676 if (flag_mlow_precision_sqrt)
8677 flag_mrecip_low_precision_sqrt = true;
8680 /* 'Unpack' the internal tuning structs and update the options
8681 in OPTS. The caller must have set up selected_tune and selected_arch
8682 as all the other target-specific codegen decisions are
8683 derived from them. */
8685 void
8686 aarch64_override_options_internal (struct gcc_options *opts)
8688 aarch64_tune_flags = selected_tune->flags;
8689 aarch64_tune = selected_tune->sched_core;
8690 /* Make a copy of the tuning parameters attached to the core, which
8691 we may later overwrite. */
8692 aarch64_tune_params = *(selected_tune->tune);
8693 aarch64_architecture_version = selected_arch->architecture_version;
8695 if (opts->x_aarch64_override_tune_string)
8696 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8697 &aarch64_tune_params);
8699 /* This target defaults to strict volatile bitfields. */
8700 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8701 opts->x_flag_strict_volatile_bitfields = 1;
8703 initialize_aarch64_code_model (opts);
8704 initialize_aarch64_tls_size (opts);
8706 int queue_depth = 0;
8707 switch (aarch64_tune_params.autoprefetcher_model)
8709 case tune_params::AUTOPREFETCHER_OFF:
8710 queue_depth = -1;
8711 break;
8712 case tune_params::AUTOPREFETCHER_WEAK:
8713 queue_depth = 0;
8714 break;
8715 case tune_params::AUTOPREFETCHER_STRONG:
8716 queue_depth = max_insn_queue_index + 1;
8717 break;
8718 default:
8719 gcc_unreachable ();
8722 /* We don't mind passing in global_options_set here as we don't use
8723 the *options_set structs anyway. */
8724 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8725 queue_depth,
8726 opts->x_param_values,
8727 global_options_set.x_param_values);
8729 /* Set the L1 cache line size. */
8730 if (selected_cpu->tune->cache_line_size != 0)
8731 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8732 selected_cpu->tune->cache_line_size,
8733 opts->x_param_values,
8734 global_options_set.x_param_values);
8736 aarch64_override_options_after_change_1 (opts);
8739 /* Print a hint with a suggestion for a core or architecture name that
8740 most closely resembles what the user passed in STR. ARCH is true if
8741 the user is asking for an architecture name. ARCH is false if the user
8742 is asking for a core name. */
8744 static void
8745 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8747 auto_vec<const char *> candidates;
8748 const struct processor *entry = arch ? all_architectures : all_cores;
8749 for (; entry->name != NULL; entry++)
8750 candidates.safe_push (entry->name);
8751 char *s;
8752 const char *hint = candidates_list_and_hint (str, s, candidates);
8753 if (hint)
8754 inform (input_location, "valid arguments are: %s;"
8755 " did you mean %qs?", s, hint);
8756 XDELETEVEC (s);
8759 /* Print a hint with a suggestion for a core name that most closely resembles
8760 what the user passed in STR. */
8762 inline static void
8763 aarch64_print_hint_for_core (const char *str)
8765 aarch64_print_hint_for_core_or_arch (str, false);
8768 /* Print a hint with a suggestion for an architecture name that most closely
8769 resembles what the user passed in STR. */
8771 inline static void
8772 aarch64_print_hint_for_arch (const char *str)
8774 aarch64_print_hint_for_core_or_arch (str, true);
8777 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8778 specified in STR and throw errors if appropriate. Put the results if
8779 they are valid in RES and ISA_FLAGS. Return whether the option is
8780 valid. */
8782 static bool
8783 aarch64_validate_mcpu (const char *str, const struct processor **res,
8784 unsigned long *isa_flags)
8786 enum aarch64_parse_opt_result parse_res
8787 = aarch64_parse_cpu (str, res, isa_flags);
8789 if (parse_res == AARCH64_PARSE_OK)
8790 return true;
8792 switch (parse_res)
8794 case AARCH64_PARSE_MISSING_ARG:
8795 error ("missing cpu name in %<-mcpu=%s%>", str);
8796 break;
8797 case AARCH64_PARSE_INVALID_ARG:
8798 error ("unknown value %qs for -mcpu", str);
8799 aarch64_print_hint_for_core (str);
8800 break;
8801 case AARCH64_PARSE_INVALID_FEATURE:
8802 error ("invalid feature modifier in %<-mcpu=%s%>", str);
8803 break;
8804 default:
8805 gcc_unreachable ();
8808 return false;
8811 /* Validate a command-line -march option. Parse the arch and extensions
8812 (if any) specified in STR and throw errors if appropriate. Put the
8813 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8814 option is valid. */
8816 static bool
8817 aarch64_validate_march (const char *str, const struct processor **res,
8818 unsigned long *isa_flags)
8820 enum aarch64_parse_opt_result parse_res
8821 = aarch64_parse_arch (str, res, isa_flags);
8823 if (parse_res == AARCH64_PARSE_OK)
8824 return true;
8826 switch (parse_res)
8828 case AARCH64_PARSE_MISSING_ARG:
8829 error ("missing arch name in %<-march=%s%>", str);
8830 break;
8831 case AARCH64_PARSE_INVALID_ARG:
8832 error ("unknown value %qs for -march", str);
8833 aarch64_print_hint_for_arch (str);
8834 break;
8835 case AARCH64_PARSE_INVALID_FEATURE:
8836 error ("invalid feature modifier in %<-march=%s%>", str);
8837 break;
8838 default:
8839 gcc_unreachable ();
8842 return false;
8845 /* Validate a command-line -mtune option. Parse the cpu
8846 specified in STR and throw errors if appropriate. Put the
8847 result, if it is valid, in RES. Return whether the option is
8848 valid. */
8850 static bool
8851 aarch64_validate_mtune (const char *str, const struct processor **res)
8853 enum aarch64_parse_opt_result parse_res
8854 = aarch64_parse_tune (str, res);
8856 if (parse_res == AARCH64_PARSE_OK)
8857 return true;
8859 switch (parse_res)
8861 case AARCH64_PARSE_MISSING_ARG:
8862 error ("missing cpu name in %<-mtune=%s%>", str);
8863 break;
8864 case AARCH64_PARSE_INVALID_ARG:
8865 error ("unknown value %qs for -mtune", str);
8866 aarch64_print_hint_for_core (str);
8867 break;
8868 default:
8869 gcc_unreachable ();
8871 return false;
8874 /* Return the CPU corresponding to the enum CPU.
8875 If it doesn't specify a cpu, return the default. */
8877 static const struct processor *
8878 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8880 if (cpu != aarch64_none)
8881 return &all_cores[cpu];
8883 /* The & 0x3f is to extract the bottom 6 bits that encode the
8884 default cpu as selected by the --with-cpu GCC configure option
8885 in config.gcc.
8886 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8887 flags mechanism should be reworked to make it more sane. */
8888 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8891 /* Return the architecture corresponding to the enum ARCH.
8892 If it doesn't specify a valid architecture, return the default. */
8894 static const struct processor *
8895 aarch64_get_arch (enum aarch64_arch arch)
8897 if (arch != aarch64_no_arch)
8898 return &all_architectures[arch];
8900 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8902 return &all_architectures[cpu->arch];
8905 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8906 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8907 tuning structs. In particular it must set selected_tune and
8908 aarch64_isa_flags that define the available ISA features and tuning
8909 decisions. It must also set selected_arch as this will be used to
8910 output the .arch asm tags for each function. */
8912 static void
8913 aarch64_override_options (void)
8915 unsigned long cpu_isa = 0;
8916 unsigned long arch_isa = 0;
8917 aarch64_isa_flags = 0;
8919 bool valid_cpu = true;
8920 bool valid_tune = true;
8921 bool valid_arch = true;
8923 selected_cpu = NULL;
8924 selected_arch = NULL;
8925 selected_tune = NULL;
8927 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8928 If either of -march or -mtune is given, they override their
8929 respective component of -mcpu. */
8930 if (aarch64_cpu_string)
8931 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8932 &cpu_isa);
8934 if (aarch64_arch_string)
8935 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8936 &arch_isa);
8938 if (aarch64_tune_string)
8939 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8941 /* If the user did not specify a processor, choose the default
8942 one for them. This will be the CPU set during configuration using
8943 --with-cpu, otherwise it is "generic". */
8944 if (!selected_cpu)
8946 if (selected_arch)
8948 selected_cpu = &all_cores[selected_arch->ident];
8949 aarch64_isa_flags = arch_isa;
8950 explicit_arch = selected_arch->arch;
8952 else
8954 /* Get default configure-time CPU. */
8955 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8956 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8959 if (selected_tune)
8960 explicit_tune_core = selected_tune->ident;
8962 /* If both -mcpu and -march are specified check that they are architecturally
8963 compatible, warn if they're not and prefer the -march ISA flags. */
8964 else if (selected_arch)
8966 if (selected_arch->arch != selected_cpu->arch)
8968 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8969 all_architectures[selected_cpu->arch].name,
8970 selected_arch->name);
8972 aarch64_isa_flags = arch_isa;
8973 explicit_arch = selected_arch->arch;
8974 explicit_tune_core = selected_tune ? selected_tune->ident
8975 : selected_cpu->ident;
8977 else
8979 /* -mcpu but no -march. */
8980 aarch64_isa_flags = cpu_isa;
8981 explicit_tune_core = selected_tune ? selected_tune->ident
8982 : selected_cpu->ident;
8983 gcc_assert (selected_cpu);
8984 selected_arch = &all_architectures[selected_cpu->arch];
8985 explicit_arch = selected_arch->arch;
8988 /* Set the arch as well, as we will need it when outputting
8989 the .arch directive in assembly. */
8990 if (!selected_arch)
8992 gcc_assert (selected_cpu);
8993 selected_arch = &all_architectures[selected_cpu->arch];
8996 if (!selected_tune)
8997 selected_tune = selected_cpu;
8999 #ifndef HAVE_AS_MABI_OPTION
9000 /* The compiler may have been configured with 2.23.* binutils, which does
9001 not have support for ILP32. */
9002 if (TARGET_ILP32)
9003 error ("Assembler does not support -mabi=ilp32");
9004 #endif
9006 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
9007 sorry ("Return address signing is only supported for -mabi=lp64");
9009 /* Make sure we properly set up the explicit options. */
9010 if ((aarch64_cpu_string && valid_cpu)
9011 || (aarch64_tune_string && valid_tune))
9012 gcc_assert (explicit_tune_core != aarch64_none);
9014 if ((aarch64_cpu_string && valid_cpu)
9015 || (aarch64_arch_string && valid_arch))
9016 gcc_assert (explicit_arch != aarch64_no_arch);
9018 aarch64_override_options_internal (&global_options);
9020 /* Save these options as the default ones in case we push and pop them later
9021 while processing functions with potential target attributes. */
9022 target_option_default_node = target_option_current_node
9023 = build_target_option_node (&global_options);
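/* For example, -mcpu=cortex-a57 -march=armv8.1-a takes its ISA flags from
   armv8.1-a (after warning that the two switches conflict, since cortex-a57
   implements armv8-a) while continuing to tune for cortex-a57, as no -mtune
   was given.  */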
9026 /* Implement targetm.override_options_after_change. */
9028 static void
9029 aarch64_override_options_after_change (void)
9031 aarch64_override_options_after_change_1 (&global_options);
9034 static struct machine_function *
9035 aarch64_init_machine_status (void)
9037 struct machine_function *machine;
9038 machine = ggc_cleared_alloc<machine_function> ();
9039 return machine;
9042 void
9043 aarch64_init_expanders (void)
9045 init_machine_status = aarch64_init_machine_status;
9048 /* Set aarch64_cmodel from the -mcmodel and -fpic/-fPIC options. */
9049 static void
9050 initialize_aarch64_code_model (struct gcc_options *opts)
9052 if (opts->x_flag_pic)
9054 switch (opts->x_aarch64_cmodel_var)
9056 case AARCH64_CMODEL_TINY:
9057 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9058 break;
9059 case AARCH64_CMODEL_SMALL:
9060 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9061 aarch64_cmodel = (flag_pic == 2
9062 ? AARCH64_CMODEL_SMALL_PIC
9063 : AARCH64_CMODEL_SMALL_SPIC);
9064 #else
9065 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9066 #endif
9067 break;
9068 case AARCH64_CMODEL_LARGE:
9069 sorry ("code model %qs with -f%s", "large",
9070 opts->x_flag_pic > 1 ? "PIC" : "pic");
9071 break;
9072 default:
9073 gcc_unreachable ();
9076 else
9077 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9080 /* Implement TARGET_OPTION_SAVE. */
9082 static void
9083 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9085 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9088 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9089 using the information saved in PTR. */
9091 static void
9092 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9094 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9095 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9096 opts->x_explicit_arch = ptr->x_explicit_arch;
9097 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9098 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9100 aarch64_override_options_internal (opts);
9103 /* Implement TARGET_OPTION_PRINT. */
9105 static void
9106 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9108 const struct processor *cpu
9109 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9110 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9111 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9112 std::string extension
9113 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9115 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9116 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9117 arch->name, extension.c_str ());
9120 static GTY(()) tree aarch64_previous_fndecl;
9122 void
9123 aarch64_reset_previous_fndecl (void)
9125 aarch64_previous_fndecl = NULL;
9128 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9129 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9130 make sure optab availability predicates are recomputed when necessary. */
9132 void
9133 aarch64_save_restore_target_globals (tree new_tree)
9135 if (TREE_TARGET_GLOBALS (new_tree))
9136 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9137 else if (new_tree == target_option_default_node)
9138 restore_target_globals (&default_target_globals);
9139 else
9140 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9143 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9144 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9145 of the function, if such exists. This function may be called multiple
9146 times on a single function so use aarch64_previous_fndecl to avoid
9147 setting up identical state. */
9149 static void
9150 aarch64_set_current_function (tree fndecl)
9152 if (!fndecl || fndecl == aarch64_previous_fndecl)
9153 return;
9155 tree old_tree = (aarch64_previous_fndecl
9156 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9157 : NULL_TREE);
9159 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9161 /* If current function has no attributes but the previous one did,
9162 use the default node. */
9163 if (!new_tree && old_tree)
9164 new_tree = target_option_default_node;
9166 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9167 the default have been handled by aarch64_save_restore_target_globals from
9168 aarch64_pragma_target_parse. */
9169 if (old_tree == new_tree)
9170 return;
9172 aarch64_previous_fndecl = fndecl;
9174 /* First set the target options. */
9175 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9177 aarch64_save_restore_target_globals (new_tree);
9180 /* Enum describing the various ways we can handle attributes.
9181 In many cases we can reuse the generic option handling machinery. */
9183 enum aarch64_attr_opt_type
9185 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9186 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9187 aarch64_attr_enum, /* Attribute sets an enum variable. */
9188 aarch64_attr_custom /* Attribute requires a custom handling function. */
9191 /* All the information needed to handle a target attribute.
9192 NAME is the name of the attribute.
9193 ATTR_TYPE specifies the type of behavior of the attribute as described
9194 in the definition of enum aarch64_attr_opt_type.
9195 ALLOW_NEG is true if the attribute supports a "no-" form.
9196 HANDLER is the function that takes the attribute string and whether
9197 it is a pragma or attribute and handles the option. It is needed only
9198 when the ATTR_TYPE is aarch64_attr_custom.
9199 OPT_NUM is the enum specifying the option that the attribute modifies.
9200 This is needed for attributes that mirror the behavior of a command-line
9201 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9202 aarch64_attr_enum. */
9204 struct aarch64_attribute_info
9206 const char *name;
9207 enum aarch64_attr_opt_type attr_type;
9208 bool allow_neg;
9209 bool (*handler) (const char *, const char *);
9210 enum opt_code opt_num;
9213 /* Handle the ARCH_STR argument to the arch= target attribute.
9214 PRAGMA_OR_ATTR is used in potential error messages. */
9216 static bool
9217 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9219 const struct processor *tmp_arch = NULL;
9220 enum aarch64_parse_opt_result parse_res
9221 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9223 if (parse_res == AARCH64_PARSE_OK)
9225 gcc_assert (tmp_arch);
9226 selected_arch = tmp_arch;
9227 explicit_arch = selected_arch->arch;
9228 return true;
9231 switch (parse_res)
9233 case AARCH64_PARSE_MISSING_ARG:
9234 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9235 break;
9236 case AARCH64_PARSE_INVALID_ARG:
9237 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9238 aarch64_print_hint_for_arch (str);
9239 break;
9240 case AARCH64_PARSE_INVALID_FEATURE:
9241 error ("invalid feature modifier %qs for 'arch' target %s",
9242 str, pragma_or_attr);
9243 break;
9244 default:
9245 gcc_unreachable ();
9248 return false;
9251 /* Handle the argument CPU_STR to the cpu= target attribute.
9252 PRAGMA_OR_ATTR is used in potential error messages. */
9254 static bool
9255 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9257 const struct processor *tmp_cpu = NULL;
9258 enum aarch64_parse_opt_result parse_res
9259 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9261 if (parse_res == AARCH64_PARSE_OK)
9263 gcc_assert (tmp_cpu);
9264 selected_tune = tmp_cpu;
9265 explicit_tune_core = selected_tune->ident;
9267 selected_arch = &all_architectures[tmp_cpu->arch];
9268 explicit_arch = selected_arch->arch;
9269 return true;
9272 switch (parse_res)
9274 case AARCH64_PARSE_MISSING_ARG:
9275 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9276 break;
9277 case AARCH64_PARSE_INVALID_ARG:
9278 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9279 aarch64_print_hint_for_core (str);
9280 break;
9281 case AARCH64_PARSE_INVALID_FEATURE:
9282 error ("invalid feature modifier %qs for 'cpu' target %s",
9283 str, pragma_or_attr);
9284 break;
9285 default:
9286 gcc_unreachable ();
9289 return false;
9292 /* Handle the argument STR to the tune= target attribute.
9293 PRAGMA_OR_ATTR is used in potential error messages. */
9295 static bool
9296 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9298 const struct processor *tmp_tune = NULL;
9299 enum aarch64_parse_opt_result parse_res
9300 = aarch64_parse_tune (str, &tmp_tune);
9302 if (parse_res == AARCH64_PARSE_OK)
9304 gcc_assert (tmp_tune);
9305 selected_tune = tmp_tune;
9306 explicit_tune_core = selected_tune->ident;
9307 return true;
9310 switch (parse_res)
9312 case AARCH64_PARSE_INVALID_ARG:
9313 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9314 aarch64_print_hint_for_core (str);
9315 break;
9316 default:
9317 gcc_unreachable ();
9320 return false;
9323 /* Parse an architecture extensions target attribute string specified in STR.
9324 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9325 if successful. Update aarch64_isa_flags to reflect the ISA features
9326 modified.
9327 PRAGMA_OR_ATTR is used in potential error messages. */
9329 static bool
9330 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9332 enum aarch64_parse_opt_result parse_res;
9333 unsigned long isa_flags = aarch64_isa_flags;
9335 /* We allow "+nothing" in the beginning to clear out all architectural
9336 features if the user wants to handpick specific features. */
9337 if (strncmp ("+nothing", str, 8) == 0)
9339 isa_flags = 0;
9340 str += 8;
9343 parse_res = aarch64_parse_extension (str, &isa_flags);
9345 if (parse_res == AARCH64_PARSE_OK)
9347 aarch64_isa_flags = isa_flags;
9348 return true;
9351 switch (parse_res)
9353 case AARCH64_PARSE_MISSING_ARG:
9354 error ("missing feature modifier in target %s %qs",
9355 pragma_or_attr, str);
9356 break;
9358 case AARCH64_PARSE_INVALID_FEATURE:
9359 error ("invalid feature modifier in target %s %qs",
9360 pragma_or_attr, str);
9361 break;
9363 default:
9364 gcc_unreachable ();
9367 return false;
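/* For example, __attribute__ ((target ("+nothing+fp"))) first clears every
   architectural feature bit via the "+nothing" prefix and then enables only
   what "+fp" implies; without "+nothing" the modifiers are applied on top of
   the current aarch64_isa_flags.  */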
9370 /* The target attributes that we support. On top of these we also support just
9371 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9372 handled explicitly in aarch64_process_one_target_attr. */
9374 static const struct aarch64_attribute_info aarch64_attributes[] =
9376 { "general-regs-only", aarch64_attr_mask, false, NULL,
9377 OPT_mgeneral_regs_only },
9378 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9379 OPT_mfix_cortex_a53_835769 },
9380 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9381 OPT_mfix_cortex_a53_843419 },
9382 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9383 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9384 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9385 OPT_momit_leaf_frame_pointer },
9386 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9387 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9388 OPT_march_ },
9389 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9390 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9391 OPT_mtune_ },
9392 { "sign-return-address", aarch64_attr_enum, false, NULL,
9393 OPT_msign_return_address_ },
9394 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
9397 /* Parse ARG_STR which contains the definition of one target attribute.
9398 Show appropriate errors if any or return true if the attribute is valid.
9399 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9400 we're processing a target attribute or pragma. */
9402 static bool
9403 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9405 bool invert = false;
9407 size_t len = strlen (arg_str);
9409 if (len == 0)
9411 error ("malformed target %s", pragma_or_attr);
9412 return false;
9415 char *str_to_check = (char *) alloca (len + 1);
9416 strcpy (str_to_check, arg_str);
9418 /* Skip leading whitespace. */
9419 while (*str_to_check == ' ' || *str_to_check == '\t')
9420 str_to_check++;
9422 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9423 It is easier to detect and handle it explicitly here rather than going
9424 through the machinery for the rest of the target attributes in this
9425 function. */
9426 if (*str_to_check == '+')
9427 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9429 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9431 invert = true;
9432 str_to_check += 3;
9434 char *arg = strchr (str_to_check, '=');
9436 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9437 and point ARG to "foo". */
9438 if (arg)
9440 *arg = '\0';
9441 arg++;
9443 const struct aarch64_attribute_info *p_attr;
9444 bool found = false;
9445 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9447 /* If the names don't match up, or the user has given an argument
9448 to an attribute that doesn't accept one, or didn't give an argument
9449 to an attribute that expects one, fail to match. */
9450 if (strcmp (str_to_check, p_attr->name) != 0)
9451 continue;
9453 found = true;
9454 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9455 || p_attr->attr_type == aarch64_attr_enum;
9457 if (attr_need_arg_p ^ (arg != NULL))
9459 error ("target %s %qs does not accept an argument",
9460 pragma_or_attr, str_to_check);
9461 return false;
9464 /* If the name matches but the attribute does not allow "no-" versions
9465 then we can't match. */
9466 if (invert && !p_attr->allow_neg)
9468 error ("target %s %qs does not allow a negated form",
9469 pragma_or_attr, str_to_check);
9470 return false;
9473 switch (p_attr->attr_type)
9475 /* Has a custom handler registered.
9476 For example, cpu=, arch=, tune=. */
9477 case aarch64_attr_custom:
9478 gcc_assert (p_attr->handler);
9479 if (!p_attr->handler (arg, pragma_or_attr))
9480 return false;
9481 break;
9483 /* Either set or unset a boolean option. */
9484 case aarch64_attr_bool:
9486 struct cl_decoded_option decoded;
9488 generate_option (p_attr->opt_num, NULL, !invert,
9489 CL_TARGET, &decoded);
9490 aarch64_handle_option (&global_options, &global_options_set,
9491 &decoded, input_location);
9492 break;
9494 /* Set or unset a bit in the target_flags. aarch64_handle_option
9495 should know what mask to apply given the option number. */
9496 case aarch64_attr_mask:
9498 struct cl_decoded_option decoded;
9499 /* We only need to specify the option number.
9500 aarch64_handle_option will know which mask to apply. */
9501 decoded.opt_index = p_attr->opt_num;
9502 decoded.value = !invert;
9503 aarch64_handle_option (&global_options, &global_options_set,
9504 &decoded, input_location);
9505 break;
9507 /* Use the option setting machinery to set an option to an enum. */
9508 case aarch64_attr_enum:
9510 gcc_assert (arg);
9511 bool valid;
9512 int value;
9513 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9514 &value, CL_TARGET);
9515 if (valid)
9517 set_option (&global_options, NULL, p_attr->opt_num, value,
9518 NULL, DK_UNSPECIFIED, input_location,
9519 global_dc);
9521 else
9523 error ("target %s %s=%s is not valid",
9524 pragma_or_attr, str_to_check, arg);
9526 break;
9528 default:
9529 gcc_unreachable ();
9533 /* If we reached here we either have found an attribute and validated
9534 it or didn't match any. If we matched an attribute but its arguments
9535 were malformed we will have returned false already. */
9536 return found;
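/* Illustrative examples of the forms accepted here: "strict-align" sets a
   bit in target_flags, "no-omit-leaf-frame-pointer" is the negated form of a
   boolean option, "cmodel=small" goes through the enum machinery, and
   "arch=armv8-a" is dispatched to its custom handler.  */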
9539 /* Count how many times the character C appears in
9540 NULL-terminated string STR. */
9542 static unsigned int
9543 num_occurences_in_str (char c, char *str)
9545 unsigned int res = 0;
9546 while (*str != '\0')
9548 if (*str == c)
9549 res++;
9551 str++;
9554 return res;
9557 /* Parse the tree in ARGS that contains the target attribute information
9558 and update the global target options space. PRAGMA_OR_ATTR is a string
9559 to be used in error messages, specifying whether this is processing
9560 a target attribute or a target pragma. */
9562 bool
9563 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9565 if (TREE_CODE (args) == TREE_LIST)
9569 tree head = TREE_VALUE (args);
9570 if (head)
9572 if (!aarch64_process_target_attr (head, pragma_or_attr))
9573 return false;
9575 args = TREE_CHAIN (args);
9576 } while (args);
9578 return true;
9581 if (TREE_CODE (args) != STRING_CST)
9583 error ("attribute %<target%> argument not a string");
9584 return false;
9587 size_t len = strlen (TREE_STRING_POINTER (args));
9588 char *str_to_check = (char *) alloca (len + 1);
9589 strcpy (str_to_check, TREE_STRING_POINTER (args));
9591 if (len == 0)
9593 error ("malformed target %s value", pragma_or_attr);
9594 return false;
9597 /* Used to catch empty strings between commas, i.e.
9598 attribute ((target ("attr1,,attr2"))). */
9599 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9601 /* Handle multiple target attributes separated by ','. */
9602 char *token = strtok (str_to_check, ",");
9604 unsigned int num_attrs = 0;
9605 while (token)
9607 num_attrs++;
9608 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9610 error ("target %s %qs is invalid", pragma_or_attr, token);
9611 return false;
9614 token = strtok (NULL, ",");
9617 if (num_attrs != num_commas + 1)
9619 error ("malformed target %s list %qs",
9620 pragma_or_attr, TREE_STRING_POINTER (args));
9621 return false;
9624 return true;
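/* For example, __attribute__ ((target ("arch=armv8-a,strict-align"))) is
   split at the commas and each piece is passed to
   aarch64_process_one_target_attr; an empty piece, as in "attr1,,attr2", is
   caught by the comma-count check above.  */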
9627 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9628 process attribute ((target ("..."))). */
9630 static bool
9631 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9633 struct cl_target_option cur_target;
9634 bool ret;
9635 tree old_optimize;
9636 tree new_target, new_optimize;
9637 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9639 /* If what we're processing is the current pragma string then the
9640 target option node is already stored in target_option_current_node
9641 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9642 having to re-parse the string. This is especially useful to keep
9643 arm_neon.h compile times down since that header contains a lot
9644 of intrinsics enclosed in pragmas. */
9645 if (!existing_target && args == current_target_pragma)
9647 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9648 return true;
9650 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9652 old_optimize = build_optimization_node (&global_options);
9653 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9655 /* If the function changed the optimization levels as well as setting
9656 target options, start with the optimizations specified. */
9657 if (func_optimize && func_optimize != old_optimize)
9658 cl_optimization_restore (&global_options,
9659 TREE_OPTIMIZATION (func_optimize));
9661 /* Save the current target options to restore at the end. */
9662 cl_target_option_save (&cur_target, &global_options);
9664 /* If fndecl already has some target attributes applied to it, unpack
9665 them so that we add this attribute on top of them, rather than
9666 overwriting them. */
9667 if (existing_target)
9669 struct cl_target_option *existing_options
9670 = TREE_TARGET_OPTION (existing_target);
9672 if (existing_options)
9673 cl_target_option_restore (&global_options, existing_options);
9675 else
9676 cl_target_option_restore (&global_options,
9677 TREE_TARGET_OPTION (target_option_current_node));
9680 ret = aarch64_process_target_attr (args, "attribute");
9682 /* Set up any additional state. */
9683 if (ret)
9685 aarch64_override_options_internal (&global_options);
9686 /* Initialize SIMD builtins if we haven't already.
9687 Set current_target_pragma to NULL for the duration so that
9688 the builtin initialization code doesn't try to tag the functions
9689 being built with the attributes specified by any current pragma, thus
9690 going into an infinite recursion. */
9691 if (TARGET_SIMD)
9693 tree saved_current_target_pragma = current_target_pragma;
9694 current_target_pragma = NULL;
9695 aarch64_init_simd_builtins ();
9696 current_target_pragma = saved_current_target_pragma;
9698 new_target = build_target_option_node (&global_options);
9700 else
9701 new_target = NULL;
9703 new_optimize = build_optimization_node (&global_options);
9705 if (fndecl && ret)
9707 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9709 if (old_optimize != new_optimize)
9710 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9713 cl_target_option_restore (&global_options, &cur_target);
9715 if (old_optimize != new_optimize)
9716 cl_optimization_restore (&global_options,
9717 TREE_OPTIMIZATION (old_optimize));
9718 return ret;
9721 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9722 tri-bool options (yes, no, don't care) and the default value is
9723 DEF, determine whether to reject inlining. */
9725 static bool
9726 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9727 int dont_care, int def)
9729 /* If the callee doesn't care, always allow inlining. */
9730 if (callee == dont_care)
9731 return true;
9733 /* If the caller doesn't care, always allow inlining. */
9734 if (caller == dont_care)
9735 return true;
9737 /* Otherwise, allow inlining if either the callee and caller values
9738 agree, or if the callee is using the default value. */
9739 return (callee == caller || callee == def);
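/* For example, with DONT_CARE == 2 and DEF == 1 (the values used for the
   leaf-frame-pointer check below): a callee value of 2 always permits
   inlining, a callee value of 1 matches the default and is also accepted,
   while caller == 1 with callee == 0 disagrees on a non-default value and
   blocks inlining.  */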
9742 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9743 to inline CALLEE into CALLER based on target-specific info.
9744 Make sure that the caller and callee have compatible architectural
9745 features. Then go through the other possible target attributes
9746 and see if they can block inlining. Try not to reject always_inline
9747 callees unless they are incompatible architecturally. */
9749 static bool
9750 aarch64_can_inline_p (tree caller, tree callee)
9752 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9753 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9755 /* If callee has no option attributes, then it is ok to inline. */
9756 if (!callee_tree)
9757 return true;
9759 struct cl_target_option *caller_opts
9760 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9761 : target_option_default_node);
9763 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9766 /* Callee's ISA flags should be a subset of the caller's. */
9767 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9768 != callee_opts->x_aarch64_isa_flags)
9769 return false;
9771 /* Allow non-strict aligned functions to be inlined into strict
9772 aligned ones. */
9773 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9774 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9775 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9776 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9777 return false;
9779 bool always_inline = lookup_attribute ("always_inline",
9780 DECL_ATTRIBUTES (callee));
9782 /* If the architectural features match up and the callee is always_inline
9783 then the other attributes don't matter. */
9784 if (always_inline)
9785 return true;
9787 if (caller_opts->x_aarch64_cmodel_var
9788 != callee_opts->x_aarch64_cmodel_var)
9789 return false;
9791 if (caller_opts->x_aarch64_tls_dialect
9792 != callee_opts->x_aarch64_tls_dialect)
9793 return false;
9795 /* Honour explicit requests to workaround errata. */
9796 if (!aarch64_tribools_ok_for_inlining_p (
9797 caller_opts->x_aarch64_fix_a53_err835769,
9798 callee_opts->x_aarch64_fix_a53_err835769,
9799 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9800 return false;
9802 if (!aarch64_tribools_ok_for_inlining_p (
9803 caller_opts->x_aarch64_fix_a53_err843419,
9804 callee_opts->x_aarch64_fix_a53_err843419,
9805 2, TARGET_FIX_ERR_A53_843419))
9806 return false;
9808 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9809 caller and callee and they don't match up, reject inlining. */
9810 if (!aarch64_tribools_ok_for_inlining_p (
9811 caller_opts->x_flag_omit_leaf_frame_pointer,
9812 callee_opts->x_flag_omit_leaf_frame_pointer,
9813 2, 1))
9814 return false;
9816 /* If the callee has specific tuning overrides, respect them. */
9817 if (callee_opts->x_aarch64_override_tune_string != NULL
9818 && caller_opts->x_aarch64_override_tune_string == NULL)
9819 return false;
9821 /* If the user specified tuning override strings for the
9822 caller and callee and they don't match up, reject inlining.
9823 We just do a string compare here, we don't analyze the meaning
9824 of the string, as it would be too costly for little gain. */
9825 if (callee_opts->x_aarch64_override_tune_string
9826 && caller_opts->x_aarch64_override_tune_string
9827 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9828 caller_opts->x_aarch64_override_tune_string) != 0))
9829 return false;
9831 return true;
9834 /* Return true if SYMBOL_REF X binds locally. */
9836 static bool
9837 aarch64_symbol_binds_local_p (const_rtx x)
9839 return (SYMBOL_REF_DECL (x)
9840 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9841 : SYMBOL_REF_LOCAL_P (x));
9844 /* Return true if SYMBOL_REF X is thread-local.  */
9845 static bool
9846 aarch64_tls_symbol_p (rtx x)
9848 if (! TARGET_HAVE_TLS)
9849 return false;
9851 if (GET_CODE (x) != SYMBOL_REF)
9852 return false;
9854 return SYMBOL_REF_TLS_MODEL (x) != 0;
9857 /* Classify a TLS symbol into one of the TLS kinds. */
9858 enum aarch64_symbol_type
9859 aarch64_classify_tls_symbol (rtx x)
9861 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9863 switch (tls_kind)
9865 case TLS_MODEL_GLOBAL_DYNAMIC:
9866 case TLS_MODEL_LOCAL_DYNAMIC:
9867 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9869 case TLS_MODEL_INITIAL_EXEC:
9870 switch (aarch64_cmodel)
9872 case AARCH64_CMODEL_TINY:
9873 case AARCH64_CMODEL_TINY_PIC:
9874 return SYMBOL_TINY_TLSIE;
9875 default:
9876 return SYMBOL_SMALL_TLSIE;
9879 case TLS_MODEL_LOCAL_EXEC:
9880 if (aarch64_tls_size == 12)
9881 return SYMBOL_TLSLE12;
9882 else if (aarch64_tls_size == 24)
9883 return SYMBOL_TLSLE24;
9884 else if (aarch64_tls_size == 32)
9885 return SYMBOL_TLSLE32;
9886 else if (aarch64_tls_size == 48)
9887 return SYMBOL_TLSLE48;
9888 else
9889 gcc_unreachable ();
9891 case TLS_MODEL_EMULATED:
9892 case TLS_MODEL_NONE:
9893 return SYMBOL_FORCE_TO_MEM;
9895 default:
9896 gcc_unreachable ();
9900 /* Return the method that should be used to access SYMBOL_REF or
9901 LABEL_REF X. */
9903 enum aarch64_symbol_type
9904 aarch64_classify_symbol (rtx x, rtx offset)
9906 if (GET_CODE (x) == LABEL_REF)
9908 switch (aarch64_cmodel)
9910 case AARCH64_CMODEL_LARGE:
9911 return SYMBOL_FORCE_TO_MEM;
9913 case AARCH64_CMODEL_TINY_PIC:
9914 case AARCH64_CMODEL_TINY:
9915 return SYMBOL_TINY_ABSOLUTE;
9917 case AARCH64_CMODEL_SMALL_SPIC:
9918 case AARCH64_CMODEL_SMALL_PIC:
9919 case AARCH64_CMODEL_SMALL:
9920 return SYMBOL_SMALL_ABSOLUTE;
9922 default:
9923 gcc_unreachable ();
9927 if (GET_CODE (x) == SYMBOL_REF)
9929 if (aarch64_tls_symbol_p (x))
9930 return aarch64_classify_tls_symbol (x);
9932 switch (aarch64_cmodel)
9934 case AARCH64_CMODEL_TINY:
9935 /* When we retrieve symbol + offset address, we have to make sure
9936 the offset does not cause overflow of the final address. But
9937 we have no way of knowing the address of symbol at compile time
9938 so we can't accurately say if the distance between the PC and
9939 symbol + offset is outside the addressable range of +/-1M in the
9940 TINY code model. So we rely on images not being greater than
9941 1M and cap the offset at 1M and anything beyond 1M will have to
9942 be loaded using an alternative mechanism. Furthermore if the
9943 symbol is a weak reference to something that isn't known to
9944 resolve to a symbol in this module, then force to memory. */
9945 if ((SYMBOL_REF_WEAK (x)
9946 && !aarch64_symbol_binds_local_p (x))
9947 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9948 return SYMBOL_FORCE_TO_MEM;
9949 return SYMBOL_TINY_ABSOLUTE;
9951 case AARCH64_CMODEL_SMALL:
9952 /* Same reasoning as the tiny code model, but the offset cap here is
9953 4G. */
9954 if ((SYMBOL_REF_WEAK (x)
9955 && !aarch64_symbol_binds_local_p (x))
9956 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9957 HOST_WIDE_INT_C (4294967264)))
9958 return SYMBOL_FORCE_TO_MEM;
9959 return SYMBOL_SMALL_ABSOLUTE;
9961 case AARCH64_CMODEL_TINY_PIC:
9962 if (!aarch64_symbol_binds_local_p (x))
9963 return SYMBOL_TINY_GOT;
9964 return SYMBOL_TINY_ABSOLUTE;
9966 case AARCH64_CMODEL_SMALL_SPIC:
9967 case AARCH64_CMODEL_SMALL_PIC:
9968 if (!aarch64_symbol_binds_local_p (x))
9969 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9970 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9971 return SYMBOL_SMALL_ABSOLUTE;
9973 case AARCH64_CMODEL_LARGE:
9974 /* This is alright even in PIC code as the constant
9975 pool reference is always PC relative and within
9976 the same translation unit. */
9977 if (CONSTANT_POOL_ADDRESS_P (x))
9978 return SYMBOL_SMALL_ABSOLUTE;
9979 else
9980 return SYMBOL_FORCE_TO_MEM;
9982 default:
9983 gcc_unreachable ();
9987 /* By default push everything into the constant pool. */
9988 return SYMBOL_FORCE_TO_MEM;
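/* As a rough illustration of the offset caps above (a sketch only; the
   declarations and numbers are illustrative, not taken from this file),
   consider compiling with -mcmodel=tiny:

     extern int table[];
     int *p = &table[100];     // offset 400 bytes, within +/-1M
                               //   -> SYMBOL_TINY_ABSOLUTE (single ADR)
     int *q = &table[600000];  // offset 2400000 bytes, beyond the 1M cap
                               //   -> SYMBOL_FORCE_TO_MEM (address loaded
                               //      from a literal pool entry)

   Under -mcmodel=small the same reasoning applies, with the ~4G cap
   checked by the IN_RANGE test above.  */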
9991 bool
9992 aarch64_constant_address_p (rtx x)
9994 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9997 bool
9998 aarch64_legitimate_pic_operand_p (rtx x)
10000 if (GET_CODE (x) == SYMBOL_REF
10001 || (GET_CODE (x) == CONST
10002 && GET_CODE (XEXP (x, 0)) == PLUS
10003 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
10004 return false;
10006 return true;
10009 /* Return true if X is floating-point +0.0, or an SFmode/DFmode
10010 constant representable as a quarter-precision (FMOV) immediate.  */
10011 static bool
10012 aarch64_valid_floating_const (machine_mode mode, rtx x)
10014 if (!CONST_DOUBLE_P (x))
10015 return false;
10017 if (aarch64_float_const_zero_rtx_p (x))
10018 return true;
10020 /* Apart from 0.0, handled above, only SFmode and DFmode constants are supported.  */
10021 if (!(mode == SFmode || mode == DFmode))
10022 return false;
10024 return aarch64_float_const_representable_p (x);
10027 static bool
10028 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
10030 /* Do not allow vector struct mode constants. We could support
10031 0 and -1 easily, but they need support in aarch64-simd.md. */
10032 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
10033 return false;
10035 /* This could probably go away because
10036 we now decompose CONST_INTs according to expand_mov_immediate. */
10037 if ((GET_CODE (x) == CONST_VECTOR
10038 && aarch64_simd_valid_immediate (x, mode, false, NULL))
10039 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
10040 return !targetm.cannot_force_const_mem (mode, x);
10042 if (GET_CODE (x) == HIGH
10043 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
10044 return true;
10046 return aarch64_constant_address_p (x);
10050 aarch64_load_tp (rtx target)
10052 if (!target
10053 || GET_MODE (target) != Pmode
10054 || !register_operand (target, Pmode))
10055 target = gen_reg_rtx (Pmode);
10057 /* Can return in any reg. */
10058 emit_insn (gen_aarch64_load_tp_hard (target));
10059 return target;
10062 /* On AAPCS systems, this is the "struct __va_list". */
10063 static GTY(()) tree va_list_type;
10065 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10066 Return the type to use as __builtin_va_list.
10068 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10070 struct __va_list
10072 void *__stack;
10073 void *__gr_top;
10074 void *__vr_top;
10075 int __gr_offs;
10076 int __vr_offs;
10077 }; */
10079 static tree
10080 aarch64_build_builtin_va_list (void)
10082 tree va_list_name;
10083 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10085 /* Create the type. */
10086 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10087 /* Give it the required name. */
10088 va_list_name = build_decl (BUILTINS_LOCATION,
10089 TYPE_DECL,
10090 get_identifier ("__va_list"),
10091 va_list_type);
10092 DECL_ARTIFICIAL (va_list_name) = 1;
10093 TYPE_NAME (va_list_type) = va_list_name;
10094 TYPE_STUB_DECL (va_list_type) = va_list_name;
10096 /* Create the fields. */
10097 f_stack = build_decl (BUILTINS_LOCATION,
10098 FIELD_DECL, get_identifier ("__stack"),
10099 ptr_type_node);
10100 f_grtop = build_decl (BUILTINS_LOCATION,
10101 FIELD_DECL, get_identifier ("__gr_top"),
10102 ptr_type_node);
10103 f_vrtop = build_decl (BUILTINS_LOCATION,
10104 FIELD_DECL, get_identifier ("__vr_top"),
10105 ptr_type_node);
10106 f_groff = build_decl (BUILTINS_LOCATION,
10107 FIELD_DECL, get_identifier ("__gr_offs"),
10108 integer_type_node);
10109 f_vroff = build_decl (BUILTINS_LOCATION,
10110 FIELD_DECL, get_identifier ("__vr_offs"),
10111 integer_type_node);
10113 /* Tell tree-stdarg pass about our internal offset fields.
10114 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10115 purposes, to identify whether the code is updating the va_list internal
10116 offset fields in an irregular way.  */
10117 va_list_gpr_counter_field = f_groff;
10118 va_list_fpr_counter_field = f_vroff;
10120 DECL_ARTIFICIAL (f_stack) = 1;
10121 DECL_ARTIFICIAL (f_grtop) = 1;
10122 DECL_ARTIFICIAL (f_vrtop) = 1;
10123 DECL_ARTIFICIAL (f_groff) = 1;
10124 DECL_ARTIFICIAL (f_vroff) = 1;
10126 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10127 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10128 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10129 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10130 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10132 TYPE_FIELDS (va_list_type) = f_stack;
10133 DECL_CHAIN (f_stack) = f_grtop;
10134 DECL_CHAIN (f_grtop) = f_vrtop;
10135 DECL_CHAIN (f_vrtop) = f_groff;
10136 DECL_CHAIN (f_groff) = f_vroff;
10138 /* Compute its layout. */
10139 layout_type (va_list_type);
10141 return va_list_type;
10144 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10145 static void
10146 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10148 const CUMULATIVE_ARGS *cum;
10149 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10150 tree stack, grtop, vrtop, groff, vroff;
10151 tree t;
10152 int gr_save_area_size = cfun->va_list_gpr_size;
10153 int vr_save_area_size = cfun->va_list_fpr_size;
10154 int vr_offset;
10156 cum = &crtl->args.info;
10157 if (cfun->va_list_gpr_size)
10158 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10159 cfun->va_list_gpr_size);
10160 if (cfun->va_list_fpr_size)
10161 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10162 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10164 if (!TARGET_FLOAT)
10166 gcc_assert (cum->aapcs_nvrn == 0);
10167 vr_save_area_size = 0;
10170 f_stack = TYPE_FIELDS (va_list_type_node);
10171 f_grtop = DECL_CHAIN (f_stack);
10172 f_vrtop = DECL_CHAIN (f_grtop);
10173 f_groff = DECL_CHAIN (f_vrtop);
10174 f_vroff = DECL_CHAIN (f_groff);
10176 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10177 NULL_TREE);
10178 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10179 NULL_TREE);
10180 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10181 NULL_TREE);
10182 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10183 NULL_TREE);
10184 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10185 NULL_TREE);
10187 /* Emit code to initialize STACK, which points to the next varargs stack
10188 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10189 by named arguments. STACK is 8-byte aligned. */
10190 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10191 if (cum->aapcs_stack_size > 0)
10192 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10193 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10194 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10196 /* Emit code to initialize GRTOP, the top of the GR save area.
10197 virtual_incoming_args_rtx should have been 16 byte aligned. */
10198 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10199 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10200 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10202 /* Emit code to initialize VRTOP, the top of the VR save area.
10203 This address is gr_save_area_bytes below GRTOP, rounded
10204 down to the next 16-byte boundary. */
10205 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10206 vr_offset = ROUND_UP (gr_save_area_size,
10207 STACK_BOUNDARY / BITS_PER_UNIT);
10209 if (vr_offset)
10210 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10211 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10212 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10214 /* Emit code to initialize GROFF, the offset from GRTOP of the
10215 next GPR argument. */
10216 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10217 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10218 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10220 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10221 of the next VR argument.  */
10222 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10223 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10224 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
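/* Worked example (a sketch only, assuming the usual AAPCS64 parameters of
   8 GP argument registers x0-x7 and 8 FP/SIMD argument registers v0-v7,
   UNITS_PER_WORD == 8, UNITS_PER_VREG == 16, and ignoring any trimming of
   the save areas by the tree-stdarg pass):

     void f (int a, ...);   // one named GP argument, no named FP arguments

   aarch64_setup_incoming_varargs (below) saves x1-x7 (56 bytes) and
   q0-q7 (128 bytes) just below the incoming argument pointer, so the
   MODIFY_EXPRs built above initialize

     __stack   = incoming argument pointer (first stack-passed vararg)
     __gr_top  = incoming argument pointer
     __vr_top  = __gr_top - ROUND_UP (56, 16) = __gr_top - 64
     __gr_offs = -56
     __vr_offs = -128.  */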
10227 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10229 static tree
10230 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10231 gimple_seq *post_p ATTRIBUTE_UNUSED)
10233 tree addr;
10234 bool indirect_p;
10235 bool is_ha; /* is HFA or HVA. */
10236 bool dw_align; /* double-word align. */
10237 machine_mode ag_mode = VOIDmode;
10238 int nregs;
10239 machine_mode mode;
10241 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10242 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10243 HOST_WIDE_INT size, rsize, adjust, align;
10244 tree t, u, cond1, cond2;
10246 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10247 if (indirect_p)
10248 type = build_pointer_type (type);
10250 mode = TYPE_MODE (type);
10252 f_stack = TYPE_FIELDS (va_list_type_node);
10253 f_grtop = DECL_CHAIN (f_stack);
10254 f_vrtop = DECL_CHAIN (f_grtop);
10255 f_groff = DECL_CHAIN (f_vrtop);
10256 f_vroff = DECL_CHAIN (f_groff);
10258 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10259 f_stack, NULL_TREE);
10260 size = int_size_in_bytes (type);
10261 struct aarch64_fn_arg_alignment aa
10262 = aarch64_function_arg_alignment (mode, type);
10263 align = aa.alignment / BITS_PER_UNIT;
10265 dw_align = false;
10266 adjust = 0;
10267 if (aarch64_vfp_is_call_or_return_candidate (mode,
10268 type,
10269 &ag_mode,
10270 &nregs,
10271 &is_ha))
10273 /* TYPE passed in fp/simd registers. */
10274 if (!TARGET_FLOAT)
10275 aarch64_err_no_fpadvsimd (mode, "varargs");
10277 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10278 unshare_expr (valist), f_vrtop, NULL_TREE);
10279 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10280 unshare_expr (valist), f_vroff, NULL_TREE);
10282 rsize = nregs * UNITS_PER_VREG;
10284 if (is_ha)
10286 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10287 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10289 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10290 && size < UNITS_PER_VREG)
10292 adjust = UNITS_PER_VREG - size;
10295 else
10297 /* TYPE passed in general registers. */
10298 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10299 unshare_expr (valist), f_grtop, NULL_TREE);
10300 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10301 unshare_expr (valist), f_groff, NULL_TREE);
10302 rsize = ROUND_UP (size, UNITS_PER_WORD);
10303 nregs = rsize / UNITS_PER_WORD;
10305 if (align > 8)
10306 dw_align = true;
10308 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10309 && size < UNITS_PER_WORD)
10311 adjust = UNITS_PER_WORD - size;
10315 /* Get a local temporary for the field value. */
10316 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10318 /* Emit code to branch if off >= 0. */
10319 t = build2 (GE_EXPR, boolean_type_node, off,
10320 build_int_cst (TREE_TYPE (off), 0));
10321 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10323 if (dw_align)
10325 /* Emit: offs = (offs + 15) & -16. */
10326 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10327 build_int_cst (TREE_TYPE (off), 15));
10328 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10329 build_int_cst (TREE_TYPE (off), -16));
10330 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10332 else
10333 roundup = NULL;
10335 /* Update ap.__[g|v]r_offs */
10336 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10337 build_int_cst (TREE_TYPE (off), rsize));
10338 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10340 /* String up. */
10341 if (roundup)
10342 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10344 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10345 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10346 build_int_cst (TREE_TYPE (f_off), 0));
10347 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10349 /* String up: make sure the assignment happens before the use. */
10350 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10351 COND_EXPR_ELSE (cond1) = t;
10353 /* Prepare the trees handling the argument that is passed on the stack;
10354 the top level node will store in ON_STACK. */
10355 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10356 if (align > 8)
10358 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10359 t = fold_convert (intDI_type_node, arg);
10360 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10361 build_int_cst (TREE_TYPE (t), 15));
10362 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10363 build_int_cst (TREE_TYPE (t), -16));
10364 t = fold_convert (TREE_TYPE (arg), t);
10365 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10367 else
10368 roundup = NULL;
10369 /* Advance ap.__stack */
10370 t = fold_convert (intDI_type_node, arg);
10371 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10372 build_int_cst (TREE_TYPE (t), size + 7));
10373 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10374 build_int_cst (TREE_TYPE (t), -8));
10375 t = fold_convert (TREE_TYPE (arg), t);
10376 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10377 /* String up roundup and advance. */
10378 if (roundup)
10379 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10380 /* String up with arg */
10381 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10382 /* Big-endianness related address adjustment. */
10383 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10384 && size < UNITS_PER_WORD)
10386 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10387 size_int (UNITS_PER_WORD - size));
10388 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10391 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10392 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10394 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10395 t = off;
10396 if (adjust)
10397 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10398 build_int_cst (TREE_TYPE (off), adjust));
10400 t = fold_convert (sizetype, t);
10401 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10403 if (is_ha)
10405 /* type ha; // treat as "struct {ftype field[n];}"
10406 ... [computing offs]
10407 for (i = 0; i <nregs; ++i, offs += 16)
10408 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10409 return ha; */
10410 int i;
10411 tree tmp_ha, field_t, field_ptr_t;
10413 /* Declare a local variable. */
10414 tmp_ha = create_tmp_var_raw (type, "ha");
10415 gimple_add_tmp_var (tmp_ha);
10417 /* Establish the base type. */
10418 switch (ag_mode)
10420 case SFmode:
10421 field_t = float_type_node;
10422 field_ptr_t = float_ptr_type_node;
10423 break;
10424 case DFmode:
10425 field_t = double_type_node;
10426 field_ptr_t = double_ptr_type_node;
10427 break;
10428 case TFmode:
10429 field_t = long_double_type_node;
10430 field_ptr_t = long_double_ptr_type_node;
10431 break;
10432 case HFmode:
10433 field_t = aarch64_fp16_type_node;
10434 field_ptr_t = aarch64_fp16_ptr_type_node;
10435 break;
10436 case V2SImode:
10437 case V4SImode:
10439 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10440 field_t = build_vector_type_for_mode (innertype, ag_mode);
10441 field_ptr_t = build_pointer_type (field_t);
10443 break;
10444 default:
10445 gcc_assert (0);
10448 /* *(field_ptr_t)&ha = *((field_ptr_t)vr_saved_area).  */
10449 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10450 addr = t;
10451 t = fold_convert (field_ptr_t, addr);
10452 t = build2 (MODIFY_EXPR, field_t,
10453 build1 (INDIRECT_REF, field_t, tmp_ha),
10454 build1 (INDIRECT_REF, field_t, t));
10456 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10457 for (i = 1; i < nregs; ++i)
10459 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10460 u = fold_convert (field_ptr_t, addr);
10461 u = build2 (MODIFY_EXPR, field_t,
10462 build2 (MEM_REF, field_t, tmp_ha,
10463 build_int_cst (field_ptr_t,
10464 (i *
10465 int_size_in_bytes (field_t)))),
10466 build1 (INDIRECT_REF, field_t, u));
10467 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10470 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10471 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10474 COND_EXPR_ELSE (cond2) = t;
10475 addr = fold_convert (build_pointer_type (type), cond1);
10476 addr = build_va_arg_indirect_ref (addr);
10478 if (indirect_p)
10479 addr = build_va_arg_indirect_ref (addr);
10481 return addr;
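/* A minimal C-level sketch of the gimplified sequence above for a
   general-register argument with no special alignment (illustrative only:
   the big-endian adjustment, the HFA path and the pass-by-reference case
   are omitted, and the field names follow the __va_list layout documented
   earlier in this file):

     void *
     va_arg_gr (struct __va_list *ap, int rsize)   // rsize = size rounded up to 8
     {
       int off = ap->__gr_offs;
       if (off >= 0)                    // GP save area already exhausted
         goto on_stack;
       ap->__gr_offs = off + rsize;
       if (ap->__gr_offs > 0)           // would run past the save area
         goto on_stack;
       return (char *) ap->__gr_top + off;

     on_stack:
       {
         void *addr = ap->__stack;
         ap->__stack = (char *) ap->__stack + rsize;
         return addr;
       }
     }
*/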
10484 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10486 static void
10487 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10488 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10489 int no_rtl)
10491 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10492 CUMULATIVE_ARGS local_cum;
10493 int gr_saved = cfun->va_list_gpr_size;
10494 int vr_saved = cfun->va_list_fpr_size;
10496 /* The caller has advanced CUM up to, but not beyond, the last named
10497 argument. Advance a local copy of CUM past the last "real" named
10498 argument, to find out how many registers are left over. */
10499 local_cum = *cum;
10500 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
10502 /* Find out how many registers we need to save.
10503 Honor the tree-stdarg analysis results.  */
10504 if (cfun->va_list_gpr_size)
10505 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10506 cfun->va_list_gpr_size / UNITS_PER_WORD);
10507 if (cfun->va_list_fpr_size)
10508 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10509 cfun->va_list_fpr_size / UNITS_PER_VREG);
10511 if (!TARGET_FLOAT)
10513 gcc_assert (local_cum.aapcs_nvrn == 0);
10514 vr_saved = 0;
10517 if (!no_rtl)
10519 if (gr_saved > 0)
10521 rtx ptr, mem;
10523 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10524 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10525 - gr_saved * UNITS_PER_WORD);
10526 mem = gen_frame_mem (BLKmode, ptr);
10527 set_mem_alias_set (mem, get_varargs_alias_set ());
10529 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10530 mem, gr_saved);
10532 if (vr_saved > 0)
10534 /* We can't use move_block_from_reg, because it will use
10535 the wrong mode, storing D regs only. */
10536 machine_mode mode = TImode;
10537 int off, i, vr_start;
10539 /* Set OFF to the offset from virtual_incoming_args_rtx of
10540 the first vector register. The VR save area lies below
10541 the GR one, and is aligned to 16 bytes. */
10542 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10543 STACK_BOUNDARY / BITS_PER_UNIT);
10544 off -= vr_saved * UNITS_PER_VREG;
10546 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10547 for (i = 0; i < vr_saved; ++i)
10549 rtx ptr, mem;
10551 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10552 mem = gen_frame_mem (mode, ptr);
10553 set_mem_alias_set (mem, get_varargs_alias_set ());
10554 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10555 off += UNITS_PER_VREG;
10560 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10561 any complication of having crtl->args.pretend_args_size changed. */
10562 cfun->machine->frame.saved_varargs_size
10563 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10564 STACK_BOUNDARY / BITS_PER_UNIT)
10565 + vr_saved * UNITS_PER_VREG);
10568 static void
10569 aarch64_conditional_register_usage (void)
10571 int i;
10572 if (!TARGET_FLOAT)
10574 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10576 fixed_regs[i] = 1;
10577 call_used_regs[i] = 1;
10582 /* Walk down the type tree of TYPE counting consecutive base elements.
10583 If *MODEP is VOIDmode, then set it to the first valid floating point
10584 type. If a non-floating point type is found, or if a floating point
10585 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10586 otherwise return the count in the sub-tree. */
10587 static int
10588 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10590 machine_mode mode;
10591 HOST_WIDE_INT size;
10593 switch (TREE_CODE (type))
10595 case REAL_TYPE:
10596 mode = TYPE_MODE (type);
10597 if (mode != DFmode && mode != SFmode
10598 && mode != TFmode && mode != HFmode)
10599 return -1;
10601 if (*modep == VOIDmode)
10602 *modep = mode;
10604 if (*modep == mode)
10605 return 1;
10607 break;
10609 case COMPLEX_TYPE:
10610 mode = TYPE_MODE (TREE_TYPE (type));
10611 if (mode != DFmode && mode != SFmode
10612 && mode != TFmode && mode != HFmode)
10613 return -1;
10615 if (*modep == VOIDmode)
10616 *modep = mode;
10618 if (*modep == mode)
10619 return 2;
10621 break;
10623 case VECTOR_TYPE:
10624 /* Use V2SImode and V4SImode as representatives of all 64-bit
10625 and 128-bit vector types. */
10626 size = int_size_in_bytes (type);
10627 switch (size)
10629 case 8:
10630 mode = V2SImode;
10631 break;
10632 case 16:
10633 mode = V4SImode;
10634 break;
10635 default:
10636 return -1;
10639 if (*modep == VOIDmode)
10640 *modep = mode;
10642 /* Vector modes are considered to be opaque: two vectors are
10643 equivalent for the purposes of being homogeneous aggregates
10644 if they are the same size. */
10645 if (*modep == mode)
10646 return 1;
10648 break;
10650 case ARRAY_TYPE:
10652 int count;
10653 tree index = TYPE_DOMAIN (type);
10655 /* Can't handle incomplete types nor sizes that are not
10656 fixed. */
10657 if (!COMPLETE_TYPE_P (type)
10658 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10659 return -1;
10661 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10662 if (count == -1
10663 || !index
10664 || !TYPE_MAX_VALUE (index)
10665 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10666 || !TYPE_MIN_VALUE (index)
10667 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10668 || count < 0)
10669 return -1;
10671 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10672 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10674 /* There must be no padding. */
10675 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10676 return -1;
10678 return count;
10681 case RECORD_TYPE:
10683 int count = 0;
10684 int sub_count;
10685 tree field;
10687 /* Can't handle incomplete types nor sizes that are not
10688 fixed. */
10689 if (!COMPLETE_TYPE_P (type)
10690 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10691 return -1;
10693 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10695 if (TREE_CODE (field) != FIELD_DECL)
10696 continue;
10698 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10699 if (sub_count < 0)
10700 return -1;
10701 count += sub_count;
10704 /* There must be no padding. */
10705 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10706 return -1;
10708 return count;
10711 case UNION_TYPE:
10712 case QUAL_UNION_TYPE:
10714 /* These aren't very interesting except in a degenerate case. */
10715 int count = 0;
10716 int sub_count;
10717 tree field;
10719 /* Can't handle incomplete types nor sizes that are not
10720 fixed. */
10721 if (!COMPLETE_TYPE_P (type)
10722 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10723 return -1;
10725 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10727 if (TREE_CODE (field) != FIELD_DECL)
10728 continue;
10730 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10731 if (sub_count < 0)
10732 return -1;
10733 count = count > sub_count ? count : sub_count;
10736 /* There must be no padding. */
10737 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10738 return -1;
10740 return count;
10743 default:
10744 break;
10747 return -1;
10750 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10751 type as described in AAPCS64 \S 4.1.2.
10753 See the comment above aarch64_composite_type_p for the notes on MODE. */
10755 static bool
10756 aarch64_short_vector_p (const_tree type,
10757 machine_mode mode)
10759 HOST_WIDE_INT size = -1;
10761 if (type && TREE_CODE (type) == VECTOR_TYPE)
10762 size = int_size_in_bytes (type);
10763 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10764 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10765 size = GET_MODE_SIZE (mode);
10767 return (size == 8 || size == 16);
10770 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10771 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10772 array types. The C99 floating-point complex types are also considered
10773 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10774 types, which are GCC extensions and out of the scope of AAPCS64, are
10775 treated as composite types here as well.
10777 Note that MODE itself is not sufficient in determining whether a type
10778 is such a composite type or not. This is because
10779 stor-layout.c:compute_record_mode may have already changed the MODE
10780 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10781 structure with only one field may have its MODE set to the mode of the
10782 field. Also an integer mode whose size matches the size of the
10783 RECORD_TYPE type may be used to substitute the original mode
10784 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10785 solely relied on. */
10787 static bool
10788 aarch64_composite_type_p (const_tree type,
10789 machine_mode mode)
10791 if (aarch64_short_vector_p (type, mode))
10792 return false;
10794 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10795 return true;
10797 if (mode == BLKmode
10798 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10799 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10800 return true;
10802 return false;
10805 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10806 shall be passed or returned in simd/fp register(s) (providing these
10807 parameter passing registers are available).
10809 Upon successful return, *COUNT returns the number of needed registers,
10810 *BASE_MODE returns the mode of the individual register and, when IS_HA
10811 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10812 floating-point aggregate or a homogeneous short-vector aggregate. */
10814 static bool
10815 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10816 const_tree type,
10817 machine_mode *base_mode,
10818 int *count,
10819 bool *is_ha)
10821 machine_mode new_mode = VOIDmode;
10822 bool composite_p = aarch64_composite_type_p (type, mode);
10824 if (is_ha != NULL) *is_ha = false;
10826 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10827 || aarch64_short_vector_p (type, mode))
10829 *count = 1;
10830 new_mode = mode;
10832 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10834 if (is_ha != NULL) *is_ha = true;
10835 *count = 2;
10836 new_mode = GET_MODE_INNER (mode);
10838 else if (type && composite_p)
10840 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10842 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10844 if (is_ha != NULL) *is_ha = true;
10845 *count = ag_count;
10847 else
10848 return false;
10850 else
10851 return false;
10853 *base_mode = new_mode;
10854 return true;
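/* Examples (illustrative only, assuming HA_MAX_NUM_FLDS == 4, the AAPCS64
   limit on the number of members of a homogeneous aggregate):

     float                          -> *count = 1, *base_mode = SFmode
     _Complex double                -> *count = 2, *base_mode = DFmode, *is_ha = true
     struct { float x, y, z; }      -> *count = 3, *base_mode = SFmode, *is_ha = true
     struct { double d[4]; }        -> *count = 4, *base_mode = DFmode, *is_ha = true
     struct { double d[5]; }        -> more than 4 members, returns false
     struct { float f; double d; }  -> mixed base types, returns false.  */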
10857 /* Implement TARGET_STRUCT_VALUE_RTX. */
10859 static rtx
10860 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10861 int incoming ATTRIBUTE_UNUSED)
10863 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10866 /* Implements target hook vector_mode_supported_p. */
10867 static bool
10868 aarch64_vector_mode_supported_p (machine_mode mode)
10870 if (TARGET_SIMD
10871 && (mode == V4SImode || mode == V8HImode
10872 || mode == V16QImode || mode == V2DImode
10873 || mode == V2SImode || mode == V4HImode
10874 || mode == V8QImode || mode == V2SFmode
10875 || mode == V4SFmode || mode == V2DFmode
10876 || mode == V4HFmode || mode == V8HFmode
10877 || mode == V1DFmode))
10878 return true;
10880 return false;
10883 /* Return appropriate SIMD container
10884 for MODE within a vector of WIDTH bits. */
10885 static machine_mode
10886 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10888 gcc_assert (width == 64 || width == 128);
10889 if (TARGET_SIMD)
10891 if (width == 128)
10892 switch (mode)
10894 case DFmode:
10895 return V2DFmode;
10896 case SFmode:
10897 return V4SFmode;
10898 case HFmode:
10899 return V8HFmode;
10900 case SImode:
10901 return V4SImode;
10902 case HImode:
10903 return V8HImode;
10904 case QImode:
10905 return V16QImode;
10906 case DImode:
10907 return V2DImode;
10908 default:
10909 break;
10911 else
10912 switch (mode)
10914 case SFmode:
10915 return V2SFmode;
10916 case HFmode:
10917 return V4HFmode;
10918 case SImode:
10919 return V2SImode;
10920 case HImode:
10921 return V4HImode;
10922 case QImode:
10923 return V8QImode;
10924 default:
10925 break;
10928 return word_mode;
10931 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10932 static machine_mode
10933 aarch64_preferred_simd_mode (machine_mode mode)
10935 return aarch64_simd_container_mode (mode, 128);
10938 /* Return the bitmask of possible vector sizes for the vectorizer
10939 to iterate over. */
10940 static unsigned int
10941 aarch64_autovectorize_vector_sizes (void)
10943 return (16 | 8);
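/* A few illustrative mappings produced by aarch64_simd_container_mode
   (not an exhaustive list):

     element mode   128-bit container   64-bit container
     QImode         V16QImode           V8QImode
     HImode         V8HImode            V4HImode
     SImode         V4SImode            V2SImode
     DImode         V2DImode            word_mode (no 64-bit container)
     SFmode         V4SFmode            V2SFmode

   aarch64_preferred_simd_mode always picks the 128-bit container, and the
   (16 | 8) bitmask above tells the vectorizer to try 128-bit vectors first
   and then fall back to 64-bit ones.  */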
10946 /* Implement TARGET_MANGLE_TYPE. */
10948 static const char *
10949 aarch64_mangle_type (const_tree type)
10951 /* The AArch64 ABI documents say that "__va_list" has to be
10952 mangled as if it were in the "std" namespace.  */
10953 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10954 return "St9__va_list";
10956 /* Half-precision float. */
10957 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10958 return "Dh";
10960 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10961 builtin types. */
10962 if (TYPE_NAME (type) != NULL)
10963 return aarch64_mangle_builtin_type (type);
10965 /* Use the default mangling. */
10966 return NULL;
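/* For illustration (assuming the standard Itanium C++ ABI mangling of the
   surrounding signature; the function names are hypothetical):

     void f (__fp16);             // mangled as _Z1fDh
     void g (__builtin_va_list);  // mangled as _Z1gSt9__va_list
     void h (int8x8_t);           // builtin vector type, handled by
                                  // aarch64_mangle_builtin_type

   Ordinary types fall through to the default mangling (NULL return).  */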
10969 /* Find the first rtx_insn before insn that will generate an assembly
10970 instruction. */
10972 static rtx_insn *
10973 aarch64_prev_real_insn (rtx_insn *insn)
10975 if (!insn)
10976 return NULL;
10980 insn = prev_real_insn (insn);
10982 while (insn && recog_memoized (insn) < 0);
10984 return insn;
10987 static bool
10988 is_madd_op (enum attr_type t1)
10990 unsigned int i;
10991 /* A number of these may be AArch32 only. */
10992 enum attr_type mlatypes[] = {
10993 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10994 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10995 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10998 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
11000 if (t1 == mlatypes[i])
11001 return true;
11004 return false;
11007 /* Check if there is a register dependency between a load and the insn
11008 for which we hold recog_data. */
11010 static bool
11011 dep_between_memop_and_curr (rtx memop)
11013 rtx load_reg;
11014 int opno;
11016 gcc_assert (GET_CODE (memop) == SET);
11018 if (!REG_P (SET_DEST (memop)))
11019 return false;
11021 load_reg = SET_DEST (memop);
11022 for (opno = 1; opno < recog_data.n_operands; opno++)
11024 rtx operand = recog_data.operand[opno];
11025 if (REG_P (operand)
11026 && reg_overlap_mentioned_p (load_reg, operand))
11027 return true;
11030 return false;
11034 /* When working around the Cortex-A53 erratum 835769,
11035 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
11036 instruction and has a preceding memory instruction such that a NOP
11037 should be inserted between them. */
11039 bool
11040 aarch64_madd_needs_nop (rtx_insn* insn)
11042 enum attr_type attr_type;
11043 rtx_insn *prev;
11044 rtx body;
11046 if (!TARGET_FIX_ERR_A53_835769)
11047 return false;
11049 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11050 return false;
11052 attr_type = get_attr_type (insn);
11053 if (!is_madd_op (attr_type))
11054 return false;
11056 prev = aarch64_prev_real_insn (insn);
11057 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11058 Restore recog state to INSN to avoid state corruption. */
11059 extract_constrain_insn_cached (insn);
11061 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11062 return false;
11064 body = single_set (prev);
11066 /* If the previous insn is a memory op and there is no dependency between
11067 it and the DImode madd, emit a NOP between them. If body is NULL then we
11068 have a complex memory operation, probably a load/store pair.
11069 Be conservative for now and emit a NOP. */
11070 if (GET_MODE (recog_data.operand[0]) == DImode
11071 && (!body || !dep_between_memop_and_curr (body)))
11072 return true;
11074 return false;
11079 /* Implement FINAL_PRESCAN_INSN. */
11081 void
11082 aarch64_final_prescan_insn (rtx_insn *insn)
11084 if (aarch64_madd_needs_nop (insn))
11085 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
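/* For example, when compiling with -mfix-cortex-a53-835769, a sequence
   such as (register choices are illustrative only):

       ldr   x1, [x2]
       madd  x0, x3, x4, x5

   is emitted as

       ldr   x1, [x2]
       nop   // between mem op and mult-accumulate
       madd  x0, x3, x4, x5

   because the 64-bit multiply-accumulate directly follows a memory
   operation that it does not depend on.  */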
11089 /* Return the equivalent letter for size. */
11090 static char
11091 sizetochar (int size)
11093 switch (size)
11095 case 64: return 'd';
11096 case 32: return 's';
11097 case 16: return 'h';
11098 case 8 : return 'b';
11099 default: gcc_unreachable ();
11103 /* Return true iff x is a uniform vector of floating-point
11104 constants, and the constant can be represented in
11105 quarter-precision form.  Note that, as aarch64_float_const_representable_p
11106 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0.  */
11107 static bool
11108 aarch64_vect_float_const_representable_p (rtx x)
11110 rtx elt;
11111 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11112 && const_vec_duplicate_p (x, &elt)
11113 && aarch64_float_const_representable_p (elt));
11116 /* Return true if OP is a valid SIMD immediate; if INFO is nonnull, fill it in with the encoding details.  */
11117 bool
11118 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11119 struct simd_immediate_info *info)
11121 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11122 matches = 1; \
11123 for (i = 0; i < idx; i += (STRIDE)) \
11124 if (!(TEST)) \
11125 matches = 0; \
11126 if (matches) \
11128 immtype = (CLASS); \
11129 elsize = (ELSIZE); \
11130 eshift = (SHIFT); \
11131 emvn = (NEG); \
11132 break; \
11135 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11136 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11137 unsigned char bytes[16];
11138 int immtype = -1, matches;
11139 unsigned int invmask = inverse ? 0xff : 0;
11140 int eshift, emvn;
11142 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11144 if (! (aarch64_simd_imm_zero_p (op, mode)
11145 || aarch64_vect_float_const_representable_p (op)))
11146 return false;
11148 if (info)
11150 info->value = CONST_VECTOR_ELT (op, 0);
11151 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11152 info->mvn = false;
11153 info->shift = 0;
11156 return true;
11159 /* Splat vector constant out into a byte vector. */
11160 for (i = 0; i < n_elts; i++)
11162 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11163 it must be laid out in the vector register in reverse order. */
11164 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11165 unsigned HOST_WIDE_INT elpart;
11167 gcc_assert (CONST_INT_P (el));
11168 elpart = INTVAL (el);
11170 for (unsigned int byte = 0; byte < innersize; byte++)
11172 bytes[idx++] = (elpart & 0xff) ^ invmask;
11173 elpart >>= BITS_PER_UNIT;
11178 /* Sanity check. */
11179 gcc_assert (idx == GET_MODE_SIZE (mode));
11183 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11184 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11186 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11187 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11189 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11190 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11192 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11193 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11195 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11197 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11199 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11200 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11202 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11203 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11205 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11206 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11208 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11209 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11211 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11213 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11215 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11216 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11218 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11219 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11221 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11222 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11224 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11225 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11227 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11229 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11230 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11232 while (0);
11234 if (immtype == -1)
11235 return false;
11237 if (info)
11239 info->element_width = elsize;
11240 info->mvn = emvn != 0;
11241 info->shift = eshift;
11243 unsigned HOST_WIDE_INT imm = 0;
11245 if (immtype >= 12 && immtype <= 15)
11246 info->msl = true;
11248 /* Un-invert bytes of recognized vector, if necessary. */
11249 if (invmask != 0)
11250 for (i = 0; i < idx; i++)
11251 bytes[i] ^= invmask;
11253 if (immtype == 17)
11255 /* FIXME: Broken on 32-bit H_W_I hosts. */
11256 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11258 for (i = 0; i < 8; i++)
11259 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11260 << (i * BITS_PER_UNIT);
11263 info->value = GEN_INT (imm);
11265 else
11267 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11268 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11270 /* Construct 'abcdefgh' because the assembler cannot handle
11271 generic constants. */
11272 if (info->mvn)
11273 imm = ~imm;
11274 imm = (imm >> info->shift) & 0xff;
11275 info->value = GEN_INT (imm);
11279 return true;
11280 #undef CHECK
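/* A few examples of the encodings recognised above (a sketch only; the
   assembly shown is the natural MOVI/MVNI rendering of the returned
   value/shift/mvn fields, not output produced by this function):

     V16QImode, all elements 0x45        -> element_width 8,  shift 0,
                                            value 0x45   (movi v0.16b, #0x45)
     V4SImode,  all elements 0x00ab0000  -> element_width 32, shift 16,
                                            value 0xab   (movi v0.4s, #0xab, lsl #16)
     V4SImode,  all elements 0xfffffffe  -> element_width 32, shift 0, mvn,
                                            value 0x01   (mvni v0.4s, #1)
     V4SImode,  all elements 0x00ab00cd  -> no pattern matches, returns false.  */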
11283 /* Check if immediate shift constants are within range.  */
11284 bool
11285 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11287 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11288 if (left)
11289 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11290 else
11291 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11294 /* Return true if X is a uniform vector where all elements
11295 are either the floating-point constant 0.0 or the
11296 integer constant 0. */
11297 bool
11298 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11300 return x == CONST0_RTX (mode);
11304 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11305 operation of width WIDTH at bit position POS. */
11308 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11310 gcc_assert (CONST_INT_P (width));
11311 gcc_assert (CONST_INT_P (pos));
11313 unsigned HOST_WIDE_INT mask
11314 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11315 return GEN_INT (mask << UINTVAL (pos));
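/* For example, WIDTH == 8 and POS == 16 give

     mask = ((((unsigned HOST_WIDE_INT) 1 << 8) - 1) << 16) == 0x00ff0000

   i.e. the CONST_INT selecting bits 16..23 of the source.  */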
11318 bool
11319 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11321 HOST_WIDE_INT imm = INTVAL (x);
11322 int i;
11324 for (i = 0; i < 8; i++)
11326 unsigned int byte = imm & 0xff;
11327 if (byte != 0xff && byte != 0)
11328 return false;
11329 imm >>= 8;
11332 return true;
11335 bool
11336 aarch64_mov_operand_p (rtx x, machine_mode mode)
11338 if (GET_CODE (x) == HIGH
11339 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11340 return true;
11342 if (CONST_INT_P (x))
11343 return true;
11345 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11346 return true;
11348 return aarch64_classify_symbolic_expression (x)
11349 == SYMBOL_TINY_ABSOLUTE;
11352 /* Return a CONST_VECTOR in which every element is the CONST_INT VAL.  */
11354 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11356 int nunits = GET_MODE_NUNITS (mode);
11357 rtvec v = rtvec_alloc (nunits);
11358 int i;
11360 rtx cache = GEN_INT (val);
11362 for (i=0; i < nunits; i++)
11363 RTVEC_ELT (v, i) = cache;
11365 return gen_rtx_CONST_VECTOR (mode, v);
11368 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11370 bool
11371 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11373 machine_mode vmode;
11375 gcc_assert (!VECTOR_MODE_P (mode));
11376 vmode = aarch64_preferred_simd_mode (mode);
11377 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11378 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11381 /* Construct and return a PARALLEL RTX vector with elements numbering the
11382 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11383 the vector - from the perspective of the architecture. This does not
11384 line up with GCC's perspective on lane numbers, so we end up with
11385 different masks depending on our target endian-ness. The diagram
11386 below may help. We must draw the distinction when building masks
11387 which select one half of the vector. An instruction selecting
11388 architectural low-lanes for a big-endian target, must be described using
11389 a mask selecting GCC high-lanes.
11391 Big-Endian Little-Endian
11393 GCC 0 1 2 3 3 2 1 0
11394 | x | x | x | x | | x | x | x | x |
11395 Architecture 3 2 1 0 3 2 1 0
11397 Low Mask: { 2, 3 } { 0, 1 }
11398 High Mask: { 0, 1 } { 2, 3 }
11402 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11404 int nunits = GET_MODE_NUNITS (mode);
11405 rtvec v = rtvec_alloc (nunits / 2);
11406 int high_base = nunits / 2;
11407 int low_base = 0;
11408 int base;
11409 rtx t1;
11410 int i;
11412 if (BYTES_BIG_ENDIAN)
11413 base = high ? low_base : high_base;
11414 else
11415 base = high ? high_base : low_base;
11417 for (i = 0; i < nunits / 2; i++)
11418 RTVEC_ELT (v, i) = GEN_INT (base + i);
11420 t1 = gen_rtx_PARALLEL (mode, v);
11421 return t1;
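/* Worked example for V4SImode (nunits == 4, so high_base == 2):

                       HIGH == false        HIGH == true
     little-endian     (parallel [0 1])     (parallel [2 3])
     big-endian        (parallel [2 3])     (parallel [0 1])

   matching the Low/High mask diagram above.  */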
11424 /* Check OP for validity as a PARALLEL RTX vector with elements
11425 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11426 from the perspective of the architecture. See the diagram above
11427 aarch64_simd_vect_par_cnst_half for more details. */
11429 bool
11430 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11431 bool high)
11433 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11434 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11435 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11436 int i = 0;
11438 if (!VECTOR_MODE_P (mode))
11439 return false;
11441 if (count_op != count_ideal)
11442 return false;
11444 for (i = 0; i < count_ideal; i++)
11446 rtx elt_op = XVECEXP (op, 0, i);
11447 rtx elt_ideal = XVECEXP (ideal, 0, i);
11449 if (!CONST_INT_P (elt_op)
11450 || INTVAL (elt_ideal) != INTVAL (elt_op))
11451 return false;
11453 return true;
11456 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11457 HIGH (exclusive). */
11458 void
11459 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11460 const_tree exp)
11462 HOST_WIDE_INT lane;
11463 gcc_assert (CONST_INT_P (operand));
11464 lane = INTVAL (operand);
11466 if (lane < low || lane >= high)
11468 if (exp)
11469 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11470 else
11471 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11475 /* Return TRUE if OP is a valid vector addressing mode. */
11476 bool
11477 aarch64_simd_mem_operand_p (rtx op)
11479 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11480 || REG_P (XEXP (op, 0)));
11483 /* Emit a register copy from operand to operand, taking care not to
11484 early-clobber source registers in the process.
11486 COUNT is the number of components into which the copy needs to be
11487 decomposed. */
11488 void
11489 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11490 unsigned int count)
11492 unsigned int i;
11493 int rdest = REGNO (operands[0]);
11494 int rsrc = REGNO (operands[1]);
11496 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11497 || rdest < rsrc)
11498 for (i = 0; i < count; i++)
11499 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11500 gen_rtx_REG (mode, rsrc + i));
11501 else
11502 for (i = 0; i < count; i++)
11503 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11504 gen_rtx_REG (mode, rsrc + count - i - 1));
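/* For example, moving an OImode value (COUNT == 2) from q1:q2 to q2:q3
   overlaps with RDEST > RSRC, so the copy is emitted high to low
   (q2 -> q3 first, then q1 -> q2); copying q2:q3 to q1:q2 instead is
   emitted low to high.  The register numbers here are illustrative only.  */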
11507 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11508 one of VSTRUCT modes: OI, CI, or XI. */
11510 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11512 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11515 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11516 alignment of a vector to 128 bits. */
11517 static HOST_WIDE_INT
11518 aarch64_simd_vector_alignment (const_tree type)
11520 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11521 return MIN (align, 128);
11524 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11525 static bool
11526 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11528 if (is_packed)
11529 return false;
11531 /* We guarantee alignment for vectors up to 128 bits.  */
11532 if (tree_int_cst_compare (TYPE_SIZE (type),
11533 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11534 return false;
11536 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11537 return true;
11540 /* Return true if the vector misalignment factor is supported by the
11541 target. */
11542 static bool
11543 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11544 const_tree type, int misalignment,
11545 bool is_packed)
11547 if (TARGET_SIMD && STRICT_ALIGNMENT)
11549 /* Return false if the movmisalign pattern is not supported for this mode.  */
11550 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11551 return false;
11553 if (misalignment == -1)
11555 /* Misalignment factor is unknown at compile time but we know
11556 it's word aligned. */
11557 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11559 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11561 if (element_size != 64)
11562 return true;
11564 return false;
11567 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11568 is_packed);
11571 /* If VALS is a vector constant that can be loaded into a register
11572 using DUP, generate instructions to do so and return an RTX to
11573 assign to the register. Otherwise return NULL_RTX. */
11574 static rtx
11575 aarch64_simd_dup_constant (rtx vals)
11577 machine_mode mode = GET_MODE (vals);
11578 machine_mode inner_mode = GET_MODE_INNER (mode);
11579 rtx x;
11581 if (!const_vec_duplicate_p (vals, &x))
11582 return NULL_RTX;
11584 /* We can load this constant by using DUP and a constant in a
11585 single ARM register. This will be cheaper than a vector
11586 load. */
11587 x = copy_to_mode_reg (inner_mode, x);
11588 return gen_rtx_VEC_DUPLICATE (mode, x);
11592 /* Generate code to load VALS, which is a PARALLEL containing only
11593 constants (for vec_init) or CONST_VECTOR, efficiently into a
11594 register. Returns an RTX to copy into the register, or NULL_RTX
11595 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11596 static rtx
11597 aarch64_simd_make_constant (rtx vals)
11599 machine_mode mode = GET_MODE (vals);
11600 rtx const_dup;
11601 rtx const_vec = NULL_RTX;
11602 int n_elts = GET_MODE_NUNITS (mode);
11603 int n_const = 0;
11604 int i;
11606 if (GET_CODE (vals) == CONST_VECTOR)
11607 const_vec = vals;
11608 else if (GET_CODE (vals) == PARALLEL)
11610 /* A CONST_VECTOR must contain only CONST_INTs and
11611 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11612 Only store valid constants in a CONST_VECTOR. */
11613 for (i = 0; i < n_elts; ++i)
11615 rtx x = XVECEXP (vals, 0, i);
11616 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11617 n_const++;
11619 if (n_const == n_elts)
11620 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11622 else
11623 gcc_unreachable ();
11625 if (const_vec != NULL_RTX
11626 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11627 /* Load using MOVI/MVNI. */
11628 return const_vec;
11629 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11630 /* Loaded using DUP. */
11631 return const_dup;
11632 else if (const_vec != NULL_RTX)
11633 /* Load from constant pool. We can not take advantage of single-cycle
11634 LD1 because we need a PC-relative addressing mode. */
11635 return const_vec;
11636 else
11637 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11638 We can not construct an initializer. */
11639 return NULL_RTX;
11642 /* Expand a vector initialisation sequence, such that TARGET is
11643 initialised to contain VALS. */
11645 void
11646 aarch64_expand_vector_init (rtx target, rtx vals)
11648 machine_mode mode = GET_MODE (target);
11649 machine_mode inner_mode = GET_MODE_INNER (mode);
11650 /* The number of vector elements. */
11651 int n_elts = GET_MODE_NUNITS (mode);
11652 /* The number of vector elements which are not constant. */
11653 int n_var = 0;
11654 rtx any_const = NULL_RTX;
11655 /* The first element of vals. */
11656 rtx v0 = XVECEXP (vals, 0, 0);
11657 bool all_same = true;
11659 /* Count the number of variable elements to initialise. */
11660 for (int i = 0; i < n_elts; ++i)
11662 rtx x = XVECEXP (vals, 0, i);
11663 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11664 ++n_var;
11665 else
11666 any_const = x;
11668 all_same &= rtx_equal_p (x, v0);
11671 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11672 how best to handle this. */
11673 if (n_var == 0)
11675 rtx constant = aarch64_simd_make_constant (vals);
11676 if (constant != NULL_RTX)
11678 emit_move_insn (target, constant);
11679 return;
11683 /* Splat a single non-constant element if we can. */
11684 if (all_same)
11686 rtx x = copy_to_mode_reg (inner_mode, v0);
11687 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11688 return;
11691 /* Initialise a vector which is part-variable. We want to first try
11692 to build those lanes which are constant in the most efficient way we
11693 can. */
11694 if (n_var != n_elts)
11696 rtx copy = copy_rtx (vals);
11698 /* Load constant part of vector. We really don't care what goes into the
11699 parts we will overwrite, but we're more likely to be able to load the
11700 constant efficiently if it has fewer, larger, repeating parts
11701 (see aarch64_simd_valid_immediate). */
11702 for (int i = 0; i < n_elts; i++)
11704 rtx x = XVECEXP (vals, 0, i);
11705 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11706 continue;
11707 rtx subst = any_const;
11708 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11710 /* Look in the copied vector, as more elements are const. */
11711 rtx test = XVECEXP (copy, 0, i ^ bit);
11712 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11714 subst = test;
11715 break;
11718 XVECEXP (copy, 0, i) = subst;
11720 aarch64_expand_vector_init (target, copy);
11723 /* Insert the variable lanes directly. */
11725 enum insn_code icode = optab_handler (vec_set_optab, mode);
11726 gcc_assert (icode != CODE_FOR_nothing);
11728 for (int i = 0; i < n_elts; i++)
11730 rtx x = XVECEXP (vals, 0, i);
11731 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11732 continue;
11733 x = copy_to_mode_reg (inner_mode, x);
11734 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
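/* For example, initialising a V4SImode vector with { x, 1, 2, 3 } where
   only X is non-constant: the constant-building loop above rewrites lane 0
   with the constant found at lane 0 ^ 2, giving { 2, 1, 2, 3 }, which is
   loaded first (recursively, as an all-constant init); lane 0 is then
   overwritten with X through the vec_set pattern (an INS instruction).
   The lane values here are illustrative only.  */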
11738 static unsigned HOST_WIDE_INT
11739 aarch64_shift_truncation_mask (machine_mode mode)
11741 return
11742 (!SHIFT_COUNT_TRUNCATED
11743 || aarch64_vector_mode_supported_p (mode)
11744 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11747 /* Select a format to encode pointers in exception handling data. */
11749 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11751 int type;
11752 switch (aarch64_cmodel)
11754 case AARCH64_CMODEL_TINY:
11755 case AARCH64_CMODEL_TINY_PIC:
11756 case AARCH64_CMODEL_SMALL:
11757 case AARCH64_CMODEL_SMALL_PIC:
11758 case AARCH64_CMODEL_SMALL_SPIC:
11759 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11760 for everything. */
11761 type = DW_EH_PE_sdata4;
11762 break;
11763 default:
11764 /* No assumptions here. 8-byte relocs required. */
11765 type = DW_EH_PE_sdata8;
11766 break;
11768 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11771 /* The last .arch and .tune assembly strings that we printed. */
11772 static std::string aarch64_last_printed_arch_string;
11773 static std::string aarch64_last_printed_tune_string;
11775 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11776 by the function fndecl. */
11778 void
11779 aarch64_declare_function_name (FILE *stream, const char* name,
11780 tree fndecl)
11782 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11784 struct cl_target_option *targ_options;
11785 if (target_parts)
11786 targ_options = TREE_TARGET_OPTION (target_parts);
11787 else
11788 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11789 gcc_assert (targ_options);
11791 const struct processor *this_arch
11792 = aarch64_get_arch (targ_options->x_explicit_arch);
11794 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11795 std::string extension
11796 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11797 this_arch->flags);
11798 /* Only update the assembler .arch string if it is distinct from the last
11799 such string we printed. */
11800 std::string to_print = this_arch->name + extension;
11801 if (to_print != aarch64_last_printed_arch_string)
11803 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11804 aarch64_last_printed_arch_string = to_print;
11807 /* Print the cpu name we're tuning for in the comments; it might be
11808 useful to readers of the generated asm.  Do it only when it changes
11809 from function to function and verbose assembly is requested.  */
11810 const struct processor *this_tune
11811 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11813 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11815 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11816 this_tune->name);
11817 aarch64_last_printed_tune_string = this_tune->name;
11820 /* Don't forget the type directive for ELF. */
11821 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11822 ASM_OUTPUT_LABEL (stream, name);
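/* For example, compiling with -march=armv8-a+crc -mtune=cortex-a57 and
   verbose assembly (-dA) might produce a header along the lines of
   (a sketch only; the exact extension spelling depends on
   aarch64_get_extension_string_for_isa_flags, and "foo" is a placeholder):

         .arch armv8-a+crc
         // .tune cortex-a57
         .type   foo, %function
     foo:

   The .arch line is re-emitted later only if a target attribute or pragma
   changes the ISA for some function.  */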
11825 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11827 static void
11828 aarch64_start_file (void)
11830 struct cl_target_option *default_options
11831 = TREE_TARGET_OPTION (target_option_default_node);
11833 const struct processor *default_arch
11834 = aarch64_get_arch (default_options->x_explicit_arch);
11835 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11836 std::string extension
11837 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11838 default_arch->flags);
11840 aarch64_last_printed_arch_string = default_arch->name + extension;
11841 aarch64_last_printed_tune_string = "";
11842 asm_fprintf (asm_out_file, "\t.arch %s\n",
11843 aarch64_last_printed_arch_string.c_str ());
11845 default_file_start ();
11848 /* Emit load exclusive. */
11850 static void
11851 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11852 rtx mem, rtx model_rtx)
11854 rtx (*gen) (rtx, rtx, rtx);
11856 switch (mode)
11858 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11859 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11860 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11861 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11862 default:
11863 gcc_unreachable ();
11866 emit_insn (gen (rval, mem, model_rtx));
11869 /* Emit store exclusive. */
11871 static void
11872 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11873 rtx rval, rtx mem, rtx model_rtx)
11875 rtx (*gen) (rtx, rtx, rtx, rtx);
11877 switch (mode)
11879 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11880 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11881 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11882 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11883 default:
11884 gcc_unreachable ();
11887 emit_insn (gen (bval, rval, mem, model_rtx));
11890 /* Emit jump instruction INSN and mark it as unlikely to be taken. */
11892 static void
11893 aarch64_emit_unlikely_jump (rtx insn)
11895 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11897 rtx_insn *jump = emit_jump_insn (insn);
11898 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
11901 /* Expand a compare and swap pattern. */
11903 void
11904 aarch64_expand_compare_and_swap (rtx operands[])
11906 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11907 machine_mode mode, cmp_mode;
11908 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11909 int idx;
11910 gen_cas_fn gen;
11911 const gen_cas_fn split_cas[] =
11913 gen_aarch64_compare_and_swapqi,
11914 gen_aarch64_compare_and_swaphi,
11915 gen_aarch64_compare_and_swapsi,
11916 gen_aarch64_compare_and_swapdi
11918 const gen_cas_fn atomic_cas[] =
11920 gen_aarch64_compare_and_swapqi_lse,
11921 gen_aarch64_compare_and_swaphi_lse,
11922 gen_aarch64_compare_and_swapsi_lse,
11923 gen_aarch64_compare_and_swapdi_lse
11926 bval = operands[0];
11927 rval = operands[1];
11928 mem = operands[2];
11929 oldval = operands[3];
11930 newval = operands[4];
11931 is_weak = operands[5];
11932 mod_s = operands[6];
11933 mod_f = operands[7];
11934 mode = GET_MODE (mem);
11935 cmp_mode = mode;
11937 /* Normally the succ memory model must be stronger than fail, but in the
11938 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11939 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11941 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11942 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11943 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11945 switch (mode)
11947 case QImode:
11948 case HImode:
11949 /* For short modes, we're going to perform the comparison in SImode,
11950 so do the zero-extension now. */
11951 cmp_mode = SImode;
11952 rval = gen_reg_rtx (SImode);
11953 oldval = convert_modes (SImode, mode, oldval, true);
11954 /* Fall through. */
11956 case SImode:
11957 case DImode:
11958 /* Force the value into a register if needed. */
11959 if (!aarch64_plus_operand (oldval, mode))
11960 oldval = force_reg (cmp_mode, oldval);
11961 break;
11963 default:
11964 gcc_unreachable ();
11967 switch (mode)
11969 case QImode: idx = 0; break;
11970 case HImode: idx = 1; break;
11971 case SImode: idx = 2; break;
11972 case DImode: idx = 3; break;
11973 default:
11974 gcc_unreachable ();
11976 if (TARGET_LSE)
11977 gen = atomic_cas[idx];
11978 else
11979 gen = split_cas[idx];
11981 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11983 if (mode == QImode || mode == HImode)
11984 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11986 x = gen_rtx_REG (CCmode, CC_REGNUM);
11987 x = gen_rtx_EQ (SImode, x, const0_rtx);
11988 emit_insn (gen_rtx_SET (bval, x));
11991 /* Test whether the target supports using an atomic load-operate instruction
11992 for operation CODE. Returns FALSE if the operation isn't supported by the
11993 architecture. */
11997 bool
11998 aarch64_atomic_ldop_supported_p (enum rtx_code code)
12000 if (!TARGET_LSE)
12001 return false;
12003 switch (code)
12005 case SET:
12006 case AND:
12007 case IOR:
12008 case XOR:
12009 case MINUS:
12010 case PLUS:
12011 return true;
12012 default:
12013 return false;
12017 /* Emit a barrier appropriate for memory model MODEL at the end of a
12018 sequence implementing an atomic operation. */
12020 static void
12021 aarch64_emit_post_barrier (enum memmodel model)
12023 const enum memmodel base_model = memmodel_base (model);
12025 if (is_mm_sync (model)
12026 && (base_model == MEMMODEL_ACQUIRE
12027 || base_model == MEMMODEL_ACQ_REL
12028 || base_model == MEMMODEL_SEQ_CST))
12030 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
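/* For example, a __sync_fetch_and_add (whose model is flagged as a __sync
   one with SEQ_CST semantics) gets a trailing full barrier from here,
   whereas the equivalent __atomic_fetch_add with __ATOMIC_SEQ_CST does not
   take this path at all.  */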
12034 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
12035 for the data in memory. EXPECTED is the value expected to be in memory.
12036 DESIRED is the value to store to memory. MEM is the memory location. MODEL
12037 is the memory ordering to use. */
12039 void
12040 aarch64_gen_atomic_cas (rtx rval, rtx mem,
12041 rtx expected, rtx desired,
12042 rtx model)
12044 rtx (*gen) (rtx, rtx, rtx, rtx);
12045 machine_mode mode;
12047 mode = GET_MODE (mem);
12049 switch (mode)
12051 case QImode: gen = gen_aarch64_atomic_casqi; break;
12052 case HImode: gen = gen_aarch64_atomic_cashi; break;
12053 case SImode: gen = gen_aarch64_atomic_cassi; break;
12054 case DImode: gen = gen_aarch64_atomic_casdi; break;
12055 default:
12056 gcc_unreachable ();
12059 /* Move the expected value into the CAS destination register. */
12060 emit_insn (gen_rtx_SET (rval, expected));
12062 /* Emit the CAS. */
12063 emit_insn (gen (rval, mem, desired, model));
12065 /* Compare the expected value with the value loaded by the CAS, to establish
12066 whether the swap was made. */
12067 aarch64_gen_compare_reg (EQ, rval, expected);
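/* A sketch of what this amounts to on an LSE target (register names and
   the exact CAS variant depend on the operands and memory model):
        mov   w0, w_expected
        casal w0, w_desired, [x_mem]
        cmp   w0, w_expected
   leaving success/failure in the condition flags for the caller.  */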
12070 /* Split a compare and swap pattern. */
12072 void
12073 aarch64_split_compare_and_swap (rtx operands[])
12075 rtx rval, mem, oldval, newval, scratch;
12076 machine_mode mode;
12077 bool is_weak;
12078 rtx_code_label *label1, *label2;
12079 rtx x, cond;
12080 enum memmodel model;
12081 rtx model_rtx;
12083 rval = operands[0];
12084 mem = operands[1];
12085 oldval = operands[2];
12086 newval = operands[3];
12087 is_weak = (operands[4] != const0_rtx);
12088 model_rtx = operands[5];
12089 scratch = operands[7];
12090 mode = GET_MODE (mem);
12091 model = memmodel_from_int (INTVAL (model_rtx));
12093 label1 = NULL;
12094 if (!is_weak)
12096 label1 = gen_label_rtx ();
12097 emit_label (label1);
12099 label2 = gen_label_rtx ();
12101 /* The initial load can be relaxed for a __sync operation since a final
12102 barrier will be emitted to stop code hoisting. */
12103 if (is_mm_sync (model))
12104 aarch64_emit_load_exclusive (mode, rval, mem,
12105 GEN_INT (MEMMODEL_RELAXED));
12106 else
12107 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12109 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12110 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12111 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12112 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12113 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12115 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12117 if (!is_weak)
12119 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12120 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12121 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12122 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12124 else
12126 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12127 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12128 emit_insn (gen_rtx_SET (cond, x));
12131 emit_label (label2);
12133 /* Emit any final barrier needed for a __sync operation. */
12134 if (is_mm_sync (model))
12135 aarch64_emit_post_barrier (model);
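/* Rough shape of the sequence produced for a strong word-sized CAS
   (illustrative register names; the acquire/release suffixes on the
   exclusives follow the memory model):
       .Lretry:
            ldaxr  w_old, [x_mem]
            cmp    w_old, w_expected
            b.ne   .Ldone
            stlxr  w_tmp, w_new, [x_mem]
            cbnz   w_tmp, .Lretry
       .Ldone:
   The weak form drops the backwards branch and instead compares the
   store-exclusive status with zero to set the flags.  */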
12138 /* Emit a BIC instruction. */
12140 static void
12141 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12143 rtx shift_rtx = GEN_INT (shift);
12144 rtx (*gen) (rtx, rtx, rtx, rtx);
12146 switch (mode)
12148 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12149 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12150 default:
12151 gcc_unreachable ();
12154 emit_insn (gen (dst, s2, shift_rtx, s1));
12157 /* Emit an atomic swap. */
12159 static void
12160 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12161 rtx mem, rtx model)
12163 rtx (*gen) (rtx, rtx, rtx, rtx);
12165 switch (mode)
12167 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12168 case HImode: gen = gen_aarch64_atomic_swphi; break;
12169 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12170 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12171 default:
12172 gcc_unreachable ();
12175 emit_insn (gen (dst, mem, value, model));
12178 /* Operations supported by aarch64_emit_atomic_load_op. */
12180 enum aarch64_atomic_load_op_code
12182 AARCH64_LDOP_PLUS, /* A + B */
12183 AARCH64_LDOP_XOR, /* A ^ B */
12184 AARCH64_LDOP_OR, /* A | B */
12185 AARCH64_LDOP_BIC /* A & ~B */
12188 /* Emit an atomic load-operate. */
12190 static void
12191 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12192 machine_mode mode, rtx dst, rtx src,
12193 rtx mem, rtx model)
12195 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12196 const aarch64_atomic_load_op_fn plus[] =
12198 gen_aarch64_atomic_loadaddqi,
12199 gen_aarch64_atomic_loadaddhi,
12200 gen_aarch64_atomic_loadaddsi,
12201 gen_aarch64_atomic_loadadddi
12203 const aarch64_atomic_load_op_fn eor[] =
12205 gen_aarch64_atomic_loadeorqi,
12206 gen_aarch64_atomic_loadeorhi,
12207 gen_aarch64_atomic_loadeorsi,
12208 gen_aarch64_atomic_loadeordi
12210 const aarch64_atomic_load_op_fn ior[] =
12212 gen_aarch64_atomic_loadsetqi,
12213 gen_aarch64_atomic_loadsethi,
12214 gen_aarch64_atomic_loadsetsi,
12215 gen_aarch64_atomic_loadsetdi
12217 const aarch64_atomic_load_op_fn bic[] =
12219 gen_aarch64_atomic_loadclrqi,
12220 gen_aarch64_atomic_loadclrhi,
12221 gen_aarch64_atomic_loadclrsi,
12222 gen_aarch64_atomic_loadclrdi
12224 aarch64_atomic_load_op_fn gen;
12225 int idx = 0;
12227 switch (mode)
12229 case QImode: idx = 0; break;
12230 case HImode: idx = 1; break;
12231 case SImode: idx = 2; break;
12232 case DImode: idx = 3; break;
12233 default:
12234 gcc_unreachable ();
12237 switch (code)
12239 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12240 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12241 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12242 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12243 default:
12244 gcc_unreachable ();
12247 emit_insn (gen (dst, mem, src, model));
12250 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12251 location to store the data read from memory. OUT_RESULT is the location to
12252 store the result of the operation. MEM is the memory location to read and
12253 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12254 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12255 be NULL. */
12257 void
12258 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12259 rtx mem, rtx value, rtx model_rtx)
12261 machine_mode mode = GET_MODE (mem);
12262 machine_mode wmode = (mode == DImode ? DImode : SImode);
12263 const bool short_mode = (mode < SImode);
12264 aarch64_atomic_load_op_code ldop_code;
12265 rtx src;
12266 rtx x;
12268 if (out_data)
12269 out_data = gen_lowpart (mode, out_data);
12271 if (out_result)
12272 out_result = gen_lowpart (mode, out_result);
12274 /* Make sure the value is in a register, putting it into a destination
12275 register if it needs to be manipulated. */
12276 if (!register_operand (value, mode)
12277 || code == AND || code == MINUS)
12279 src = out_result ? out_result : out_data;
12280 emit_move_insn (src, gen_lowpart (mode, value));
12282 else
12283 src = value;
12284 gcc_assert (register_operand (src, mode));
12286 /* Preprocess the data for the operation as necessary. If the operation is
12287 a SET then emit a swap instruction and finish. */
12288 switch (code)
12290 case SET:
12291 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12292 return;
12294 case MINUS:
12295 /* Negate the value and treat it as a PLUS. */
12297 rtx neg_src;
12299 /* Resize the value if necessary. */
12300 if (short_mode)
12301 src = gen_lowpart (wmode, src);
12303 neg_src = gen_rtx_NEG (wmode, src);
12304 emit_insn (gen_rtx_SET (src, neg_src));
12306 if (short_mode)
12307 src = gen_lowpart (mode, src);
12309 /* Fall-through. */
12310 case PLUS:
12311 ldop_code = AARCH64_LDOP_PLUS;
12312 break;
12314 case IOR:
12315 ldop_code = AARCH64_LDOP_OR;
12316 break;
12318 case XOR:
12319 ldop_code = AARCH64_LDOP_XOR;
12320 break;
12322 case AND:
12324 rtx not_src;
12326 /* Resize the value if necessary. */
12327 if (short_mode)
12328 src = gen_lowpart (wmode, src);
12330 not_src = gen_rtx_NOT (wmode, src);
12331 emit_insn (gen_rtx_SET (src, not_src));
12333 if (short_mode)
12334 src = gen_lowpart (mode, src);
12336 ldop_code = AARCH64_LDOP_BIC;
12337 break;
12339 default:
12340 /* The operation can't be done with atomic instructions. */
12341 gcc_unreachable ();
12344 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12346 /* If necessary, calculate the data in memory after the update by redoing the
12347 operation from values in registers. */
12348 if (!out_result)
12349 return;
12351 if (short_mode)
12353 src = gen_lowpart (wmode, src);
12354 out_data = gen_lowpart (wmode, out_data);
12355 out_result = gen_lowpart (wmode, out_result);
12358 x = NULL_RTX;
12360 switch (code)
12362 case MINUS:
12363 case PLUS:
12364 x = gen_rtx_PLUS (wmode, out_data, src);
12365 break;
12366 case IOR:
12367 x = gen_rtx_IOR (wmode, out_data, src);
12368 break;
12369 case XOR:
12370 x = gen_rtx_XOR (wmode, out_data, src);
12371 break;
12372 case AND:
12373 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12374 return;
12375 default:
12376 gcc_unreachable ();
12379 emit_set_insn (out_result, x);
12381 return;
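/* Worked example of the AND handling above: there is no atomic load-AND
   instruction, so "fetch-and-AND with V" is emitted as an inversion of V
   followed by LDCLR (atomic bit clear), using A & V == A & ~(~V); when the
   updated value is also required it is rebuilt afterwards with a BIC on
   the loaded data.  */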
12384 /* Split an atomic operation. */
12386 void
12387 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12388 rtx value, rtx model_rtx, rtx cond)
12390 machine_mode mode = GET_MODE (mem);
12391 machine_mode wmode = (mode == DImode ? DImode : SImode);
12392 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12393 const bool is_sync = is_mm_sync (model);
12394 rtx_code_label *label;
12395 rtx x;
12397 /* Split the atomic operation into a sequence. */
12398 label = gen_label_rtx ();
12399 emit_label (label);
12401 if (new_out)
12402 new_out = gen_lowpart (wmode, new_out);
12403 if (old_out)
12404 old_out = gen_lowpart (wmode, old_out);
12405 else
12406 old_out = new_out;
12407 value = simplify_gen_subreg (wmode, value, mode, 0);
12409 /* The initial load can be relaxed for a __sync operation since a final
12410 barrier will be emitted to stop code hoisting. */
12411 if (is_sync)
12412 aarch64_emit_load_exclusive (mode, old_out, mem,
12413 GEN_INT (MEMMODEL_RELAXED));
12414 else
12415 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12417 switch (code)
12419 case SET:
12420 new_out = value;
12421 break;
12423 case NOT:
12424 x = gen_rtx_AND (wmode, old_out, value);
12425 emit_insn (gen_rtx_SET (new_out, x));
12426 x = gen_rtx_NOT (wmode, new_out);
12427 emit_insn (gen_rtx_SET (new_out, x));
12428 break;
12430 case MINUS:
12431 if (CONST_INT_P (value))
12433 value = GEN_INT (-INTVAL (value));
12434 code = PLUS;
12436 /* Fall through. */
12438 default:
12439 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12440 emit_insn (gen_rtx_SET (new_out, x));
12441 break;
12444 aarch64_emit_store_exclusive (mode, cond, mem,
12445 gen_lowpart (mode, new_out), model_rtx);
12447 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12448 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12449 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12450 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12452 /* Emit any final barrier needed for a __sync operation. */
12453 if (is_sync)
12454 aarch64_emit_post_barrier (model);
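/* Sketch of the loop this produces for, say, an atomic add (placeholder
   register names; exclusive-access suffixes depend on the memory model):
       .L:  ldxr  w_old, [x_mem]
            add   w_new, w_old, w_val
            stxr  w_ok, w_new, [x_mem]
            cbnz  w_ok, .L
   plus a trailing full barrier for the __sync variants.  */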
12457 static void
12458 aarch64_init_libfuncs (void)
12460 /* Half-precision float operations. The compiler handles all operations
12461 with NULL libfuncs by converting to SFmode. */
12463 /* Conversions. */
12464 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12465 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12467 /* Arithmetic. */
12468 set_optab_libfunc (add_optab, HFmode, NULL);
12469 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12470 set_optab_libfunc (smul_optab, HFmode, NULL);
12471 set_optab_libfunc (neg_optab, HFmode, NULL);
12472 set_optab_libfunc (sub_optab, HFmode, NULL);
12474 /* Comparisons. */
12475 set_optab_libfunc (eq_optab, HFmode, NULL);
12476 set_optab_libfunc (ne_optab, HFmode, NULL);
12477 set_optab_libfunc (lt_optab, HFmode, NULL);
12478 set_optab_libfunc (le_optab, HFmode, NULL);
12479 set_optab_libfunc (ge_optab, HFmode, NULL);
12480 set_optab_libfunc (gt_optab, HFmode, NULL);
12481 set_optab_libfunc (unord_optab, HFmode, NULL);
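/* Net effect, as a sketch: because the HFmode arithmetic and comparison
   optabs are left NULL above, an __fp16 addition is performed by widening
   both operands to SFmode, operating there, and narrowing the result; the
   two conversion libcalls are only used when no hardware half-precision
   conversion is available.  */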
12484 /* Target hook for c_mode_for_suffix. */
12485 static machine_mode
12486 aarch64_c_mode_for_suffix (char suffix)
12488 if (suffix == 'q')
12489 return TFmode;
12491 return VOIDmode;
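/* So a literal such as 1.0q gets TFmode, the same 128-bit IEEE format used
   for long double on this target; every other suffix falls back to the
   generic handling.  */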
12494 /* We can only represent floating point constants which will fit in
12495 "quarter-precision" values. These values are characterised by
12496 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
12499 (-1)^s * (n/16) * 2^r
12501 Where:
12502 's' is the sign bit.
12503 'n' is an integer in the range 16 <= n <= 31.
12504 'r' is an integer in the range -3 <= r <= 4. */
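/* For example, 1.5 is (-1)^0 * (24/16) * 2^0 and -31.0 is
   (-1)^1 * (31/16) * 2^4, both representable; 0.1 has no exact n/16 * 2^r
   form, and 0.0 is excluded because n is at least 16.  */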
12506 /* Return true iff X can be represented by a quarter-precision
12507 floating point immediate operand X. Note, we cannot represent 0.0. */
12508 bool
12509 aarch64_float_const_representable_p (rtx x)
12511 /* This represents our current view of how many bits
12512 make up the mantissa. */
12513 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12514 int exponent;
12515 unsigned HOST_WIDE_INT mantissa, mask;
12516 REAL_VALUE_TYPE r, m;
12517 bool fail;
12519 if (!CONST_DOUBLE_P (x))
12520 return false;
12522 /* We don't support HFmode constants yet. */
12523 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12524 return false;
12526 r = *CONST_DOUBLE_REAL_VALUE (x);
12528 /* We cannot represent infinities, NaNs or +/-zero. We won't
12529 know if we have +zero until we analyse the mantissa, but we
12530 can reject the other invalid values. */
12531 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12532 || REAL_VALUE_MINUS_ZERO (r))
12533 return false;
12535 /* Extract exponent. */
12536 r = real_value_abs (&r);
12537 exponent = REAL_EXP (&r);
12539 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12540 highest (sign) bit, with a fixed binary point at bit point_pos.
12541 The low half of the result holds the low part of the mantissa, the high
12542 half the high part. WARNING: If we ever have a representation using more
12543 than 2 * H_W_I - 1 bits for the mantissa, this can fail (low bits will be lost). */
12544 real_ldexp (&m, &r, point_pos - exponent);
12545 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12547 /* If the low part of the mantissa has bits set we cannot represent
12548 the value. */
12549 if (w.ulow () != 0)
12550 return false;
12551 /* We have rejected the lower HOST_WIDE_INT, so update our
12552 understanding of how many bits lie in the mantissa and
12553 look only at the high HOST_WIDE_INT. */
12554 mantissa = w.elt (1);
12555 point_pos -= HOST_BITS_PER_WIDE_INT;
12557 /* We can only represent values with a mantissa of the form 1.xxxx. */
12558 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12559 if ((mantissa & mask) != 0)
12560 return false;
12562 /* Having filtered unrepresentable values, we may now remove all
12563 but the highest 5 bits. */
12564 mantissa >>= point_pos - 5;
12566 /* We cannot represent the value 0.0, so reject it. This is handled
12567 elsewhere. */
12568 if (mantissa == 0)
12569 return false;
12571 /* Then, as bit 4 is always set, we can mask it off, leaving
12572 the mantissa in the range [0, 15]. */
12573 mantissa &= ~(1 << 4);
12574 gcc_assert (mantissa <= 15);
12576 /* GCC internally does not use IEEE754-like encoding (where normalized
12577 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
12578 Our mantissa values are shifted 4 places to the left relative to
12579 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12580 by 5 places to correct for GCC's representation. */
12581 exponent = 5 - exponent;
12583 return (exponent >= 0 && exponent <= 7);
12586 char*
12587 aarch64_output_simd_mov_immediate (rtx const_vector,
12588 machine_mode mode,
12589 unsigned width)
12591 bool is_valid;
12592 static char templ[40];
12593 const char *mnemonic;
12594 const char *shift_op;
12595 unsigned int lane_count = 0;
12596 char element_char;
12598 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12600 /* This will return true to show const_vector is legal for use as an
12601 AdvSIMD MOVI instruction (or, implicitly, MVNI) immediate. It will
12602 also update INFO to show how the immediate should be generated. */
12603 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12604 gcc_assert (is_valid);
12606 element_char = sizetochar (info.element_width);
12607 lane_count = width / info.element_width;
12609 mode = GET_MODE_INNER (mode);
12610 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12612 gcc_assert (info.shift == 0 && ! info.mvn);
12613 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12614 move immediate path. */
12615 if (aarch64_float_const_zero_rtx_p (info.value))
12616 info.value = GEN_INT (0);
12617 else
12619 const unsigned int buf_size = 20;
12620 char float_buf[buf_size] = {'\0'};
12621 real_to_decimal_for_mode (float_buf,
12622 CONST_DOUBLE_REAL_VALUE (info.value),
12623 buf_size, buf_size, 1, mode);
12625 if (lane_count == 1)
12626 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12627 else
12628 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12629 lane_count, element_char, float_buf);
12630 return templ;
12634 mnemonic = info.mvn ? "mvni" : "movi";
12635 shift_op = info.msl ? "msl" : "lsl";
12637 gcc_assert (CONST_INT_P (info.value));
12638 if (lane_count == 1)
12639 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12640 mnemonic, UINTVAL (info.value));
12641 else if (info.shift)
12642 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12643 ", %s %d", mnemonic, lane_count, element_char,
12644 UINTVAL (info.value), shift_op, info.shift);
12645 else
12646 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12647 mnemonic, lane_count, element_char, UINTVAL (info.value));
12648 return templ;
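/* By way of example (the exact split is whatever aarch64_simd_valid_immediate
   chose), a V4SI vector with 0x100 in every lane would come back from here
   as something like "movi\t%0.4s, 0x1, lsl 8".  */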
12651 char*
12652 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12653 machine_mode mode)
12655 machine_mode vmode;
12657 gcc_assert (!VECTOR_MODE_P (mode));
12658 vmode = aarch64_simd_container_mode (mode, 64);
12659 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12660 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12663 /* Split operands into moves from op[1] + op[2] into op[0]. */
12665 void
12666 aarch64_split_combinev16qi (rtx operands[3])
12668 unsigned int dest = REGNO (operands[0]);
12669 unsigned int src1 = REGNO (operands[1]);
12670 unsigned int src2 = REGNO (operands[2]);
12671 machine_mode halfmode = GET_MODE (operands[1]);
12672 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12673 rtx destlo, desthi;
12675 gcc_assert (halfmode == V16QImode);
12677 if (src1 == dest && src2 == dest + halfregs)
12679 /* No-op move. Can't split to nothing; emit something. */
12680 emit_note (NOTE_INSN_DELETED);
12681 return;
12684 /* Preserve register attributes for variable tracking. */
12685 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12686 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12687 GET_MODE_SIZE (halfmode));
12689 /* Special case of reversed high/low parts. */
12690 if (reg_overlap_mentioned_p (operands[2], destlo)
12691 && reg_overlap_mentioned_p (operands[1], desthi))
12693 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12694 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12695 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12697 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12699 /* Try to avoid unnecessary moves if part of the result
12700 is in the right place already. */
12701 if (src1 != dest)
12702 emit_move_insn (destlo, operands[1]);
12703 if (src2 != dest + halfregs)
12704 emit_move_insn (desthi, operands[2]);
12706 else
12708 if (src2 != dest + halfregs)
12709 emit_move_insn (desthi, operands[2]);
12710 if (src1 != dest)
12711 emit_move_insn (destlo, operands[1]);
12715 /* vec_perm support. */
12717 #define MAX_VECT_LEN 16
12719 struct expand_vec_perm_d
12721 rtx target, op0, op1;
12722 unsigned char perm[MAX_VECT_LEN];
12723 machine_mode vmode;
12724 unsigned char nelt;
12725 bool one_vector_p;
12726 bool testing_p;
12729 /* Generate a variable permutation. */
12731 static void
12732 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12734 machine_mode vmode = GET_MODE (target);
12735 bool one_vector_p = rtx_equal_p (op0, op1);
12737 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12738 gcc_checking_assert (GET_MODE (op0) == vmode);
12739 gcc_checking_assert (GET_MODE (op1) == vmode);
12740 gcc_checking_assert (GET_MODE (sel) == vmode);
12741 gcc_checking_assert (TARGET_SIMD);
12743 if (one_vector_p)
12745 if (vmode == V8QImode)
12747 /* Expand the argument to a V16QI mode by duplicating it. */
12748 rtx pair = gen_reg_rtx (V16QImode);
12749 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12750 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12752 else
12754 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12757 else
12759 rtx pair;
12761 if (vmode == V8QImode)
12763 pair = gen_reg_rtx (V16QImode);
12764 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12765 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12767 else
12769 pair = gen_reg_rtx (OImode);
12770 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12771 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12776 void
12777 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12779 machine_mode vmode = GET_MODE (target);
12780 unsigned int nelt = GET_MODE_NUNITS (vmode);
12781 bool one_vector_p = rtx_equal_p (op0, op1);
12782 rtx mask;
12784 /* The TBL instruction does not use a modulo index, so we must take care
12785 of that ourselves. */
12786 mask = aarch64_simd_gen_const_vector_dup (vmode,
12787 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12788 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12790 /* For big-endian, we also need to reverse the index within the vector
12791 (but not which vector). */
12792 if (BYTES_BIG_ENDIAN)
12794 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12795 if (!one_vector_p)
12796 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12797 sel = expand_simple_binop (vmode, XOR, sel, mask,
12798 NULL, 0, OPTAB_LIB_WIDEN);
12800 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
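/* Example of the masking above: with two V16QI inputs a selector element
   of 37 is reduced to 37 & 31 == 5, i.e. byte 5 of the first vector, and
   with a single input the mask is 15; TBL itself would instead return 0
   for an out-of-range index, which is not the modulo behaviour the
   vec_perm interface requires.  */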
12803 /* Recognize patterns suitable for the TRN instructions. */
12804 static bool
12805 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12807 unsigned int i, odd, mask, nelt = d->nelt;
12808 rtx out, in0, in1, x;
12809 rtx (*gen) (rtx, rtx, rtx);
12810 machine_mode vmode = d->vmode;
12812 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12813 return false;
12815 /* Note that these are little-endian tests.
12816 We correct for big-endian later. */
12817 if (d->perm[0] == 0)
12818 odd = 0;
12819 else if (d->perm[0] == 1)
12820 odd = 1;
12821 else
12822 return false;
12823 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12825 for (i = 0; i < nelt; i += 2)
12827 if (d->perm[i] != i + odd)
12828 return false;
12829 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12830 return false;
12833 /* Success! */
12834 if (d->testing_p)
12835 return true;
12837 in0 = d->op0;
12838 in1 = d->op1;
12839 if (BYTES_BIG_ENDIAN)
12841 x = in0, in0 = in1, in1 = x;
12842 odd = !odd;
12844 out = d->target;
12846 if (odd)
12848 switch (vmode)
12850 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12851 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12852 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12853 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12854 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12855 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12856 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12857 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12858 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12859 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12860 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12861 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12862 default:
12863 return false;
12866 else
12868 switch (vmode)
12870 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12871 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12872 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12873 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12874 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12875 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12876 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12877 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12878 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12879 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12880 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12881 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12882 default:
12883 return false;
12887 emit_insn (gen (out, in0, in1));
12888 return true;
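/* Example selectors this accepts for V4SI with two inputs (before the
   big-endian swap): { 0, 4, 2, 6 } maps to TRN1 and { 1, 5, 3, 7 } to
   TRN2.  */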
12891 /* Recognize patterns suitable for the UZP instructions. */
12892 static bool
12893 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12895 unsigned int i, odd, mask, nelt = d->nelt;
12896 rtx out, in0, in1, x;
12897 rtx (*gen) (rtx, rtx, rtx);
12898 machine_mode vmode = d->vmode;
12900 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12901 return false;
12903 /* Note that these are little-endian tests.
12904 We correct for big-endian later. */
12905 if (d->perm[0] == 0)
12906 odd = 0;
12907 else if (d->perm[0] == 1)
12908 odd = 1;
12909 else
12910 return false;
12911 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12913 for (i = 0; i < nelt; i++)
12915 unsigned elt = (i * 2 + odd) & mask;
12916 if (d->perm[i] != elt)
12917 return false;
12920 /* Success! */
12921 if (d->testing_p)
12922 return true;
12924 in0 = d->op0;
12925 in1 = d->op1;
12926 if (BYTES_BIG_ENDIAN)
12928 x = in0, in0 = in1, in1 = x;
12929 odd = !odd;
12931 out = d->target;
12933 if (odd)
12935 switch (vmode)
12937 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12938 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12939 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12940 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12941 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12942 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12943 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12944 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12945 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12946 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12947 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12948 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12949 default:
12950 return false;
12953 else
12955 switch (vmode)
12957 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12958 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12959 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12960 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12961 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12962 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12963 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12964 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12965 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12966 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12967 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12968 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12969 default:
12970 return false;
12974 emit_insn (gen (out, in0, in1));
12975 return true;
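/* Example for V4SI with two inputs: { 0, 2, 4, 6 } (the even elements)
   maps to UZP1 and { 1, 3, 5, 7 } (the odd elements) to UZP2.  */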
12978 /* Recognize patterns suitable for the ZIP instructions. */
12979 static bool
12980 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12982 unsigned int i, high, mask, nelt = d->nelt;
12983 rtx out, in0, in1, x;
12984 rtx (*gen) (rtx, rtx, rtx);
12985 machine_mode vmode = d->vmode;
12987 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12988 return false;
12990 /* Note that these are little-endian tests.
12991 We correct for big-endian later. */
12992 high = nelt / 2;
12993 if (d->perm[0] == high)
12994 /* Do Nothing. */
12996 else if (d->perm[0] == 0)
12997 high = 0;
12998 else
12999 return false;
13000 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
13002 for (i = 0; i < nelt / 2; i++)
13004 unsigned elt = (i + high) & mask;
13005 if (d->perm[i * 2] != elt)
13006 return false;
13007 elt = (elt + nelt) & mask;
13008 if (d->perm[i * 2 + 1] != elt)
13009 return false;
13012 /* Success! */
13013 if (d->testing_p)
13014 return true;
13016 in0 = d->op0;
13017 in1 = d->op1;
13018 if (BYTES_BIG_ENDIAN)
13020 x = in0, in0 = in1, in1 = x;
13021 high = !high;
13023 out = d->target;
13025 if (high)
13027 switch (vmode)
13029 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
13030 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
13031 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
13032 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
13033 case V4SImode: gen = gen_aarch64_zip2v4si; break;
13034 case V2SImode: gen = gen_aarch64_zip2v2si; break;
13035 case V2DImode: gen = gen_aarch64_zip2v2di; break;
13036 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
13037 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
13038 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
13039 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
13040 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
13041 default:
13042 return false;
13045 else
13047 switch (vmode)
13049 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
13050 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
13051 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
13052 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
13053 case V4SImode: gen = gen_aarch64_zip1v4si; break;
13054 case V2SImode: gen = gen_aarch64_zip1v2si; break;
13055 case V2DImode: gen = gen_aarch64_zip1v2di; break;
13056 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13057 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13058 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13059 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13060 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13061 default:
13062 return false;
13066 emit_insn (gen (out, in0, in1));
13067 return true;
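/* Example for V4SI with two inputs: { 0, 4, 1, 5 } interleaves the low
   halves (ZIP1) and { 2, 6, 3, 7 } the high halves (ZIP2).  */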
13070 /* Recognize patterns for the EXT insn. */
13072 static bool
13073 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13075 unsigned int i, nelt = d->nelt;
13076 rtx (*gen) (rtx, rtx, rtx, rtx);
13077 rtx offset;
13079 unsigned int location = d->perm[0]; /* Always < nelt. */
13081 /* Check if the extracted indices are increasing by one. */
13082 for (i = 1; i < nelt; i++)
13084 unsigned int required = location + i;
13085 if (d->one_vector_p)
13087 /* We'll pass the same vector in twice, so allow indices to wrap. */
13088 required &= (nelt - 1);
13090 if (d->perm[i] != required)
13091 return false;
13094 switch (d->vmode)
13096 case V16QImode: gen = gen_aarch64_extv16qi; break;
13097 case V8QImode: gen = gen_aarch64_extv8qi; break;
13098 case V4HImode: gen = gen_aarch64_extv4hi; break;
13099 case V8HImode: gen = gen_aarch64_extv8hi; break;
13100 case V2SImode: gen = gen_aarch64_extv2si; break;
13101 case V4SImode: gen = gen_aarch64_extv4si; break;
13102 case V4HFmode: gen = gen_aarch64_extv4hf; break;
13103 case V8HFmode: gen = gen_aarch64_extv8hf; break;
13104 case V2SFmode: gen = gen_aarch64_extv2sf; break;
13105 case V4SFmode: gen = gen_aarch64_extv4sf; break;
13106 case V2DImode: gen = gen_aarch64_extv2di; break;
13107 case V2DFmode: gen = gen_aarch64_extv2df; break;
13108 default:
13109 return false;
13112 /* Success! */
13113 if (d->testing_p)
13114 return true;
13116 /* The case where (location == 0) is a no-op for both big- and little-endian,
13117 and is removed by the mid-end at optimization levels -O1 and higher. */
13119 if (BYTES_BIG_ENDIAN && (location != 0))
13121 /* After setup, we want the high elements of the first vector (stored
13122 at the LSB end of the register), and the low elements of the second
13123 vector (stored at the MSB end of the register). So swap. */
13124 std::swap (d->op0, d->op1);
13125 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13126 location = nelt - location;
13129 offset = GEN_INT (location);
13130 emit_insn (gen (d->target, d->op0, d->op1, offset));
13131 return true;
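/* Example: on V16QI a selector of { 3, 4, ..., 18 } becomes EXT with an
   offset of 3; with a single input the indices simply wrap around 15.  */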
13134 /* Recognize patterns for the REV insns. */
13136 static bool
13137 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13139 unsigned int i, j, diff, nelt = d->nelt;
13140 rtx (*gen) (rtx, rtx);
13142 if (!d->one_vector_p)
13143 return false;
13145 diff = d->perm[0];
13146 switch (diff)
13148 case 7:
13149 switch (d->vmode)
13151 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13152 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13153 default:
13154 return false;
13156 break;
13157 case 3:
13158 switch (d->vmode)
13160 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13161 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13162 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13163 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13164 default:
13165 return false;
13167 break;
13168 case 1:
13169 switch (d->vmode)
13171 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13172 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13173 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13174 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13175 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13176 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13177 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13178 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13179 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13180 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13181 default:
13182 return false;
13184 break;
13185 default:
13186 return false;
13189 for (i = 0; i < nelt ; i += diff + 1)
13190 for (j = 0; j <= diff; j += 1)
13192 /* This is guaranteed to be true as the value of diff
13193 is 7, 3 or 1 and we should have enough elements in the
13194 queue to generate this. Getting a vector mask with a
13195 value of diff other than these values implies that
13196 something is wrong by the time we get here. */
13197 gcc_assert (i + j < nelt);
13198 if (d->perm[i + j] != i + diff - j)
13199 return false;
13202 /* Success! */
13203 if (d->testing_p)
13204 return true;
13206 emit_insn (gen (d->target, d->op0));
13207 return true;
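/* Examples: { 1, 0, 3, 2 } on V4SI is REV64 (swap within each 64-bit
   chunk), and { 3, 2, 1, 0, 7, 6, 5, 4, ... } on V16QI is REV32.  */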
13210 static bool
13211 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13213 rtx (*gen) (rtx, rtx, rtx);
13214 rtx out = d->target;
13215 rtx in0;
13216 machine_mode vmode = d->vmode;
13217 unsigned int i, elt, nelt = d->nelt;
13218 rtx lane;
13220 elt = d->perm[0];
13221 for (i = 1; i < nelt; i++)
13223 if (elt != d->perm[i])
13224 return false;
13227 /* The generic preparation in aarch64_expand_vec_perm_const_1
13228 swaps the operand order and the permute indices if it finds
13229 d->perm[0] to be in the second operand. Thus, we can always
13230 use d->op0 and need not do any extra arithmetic to get the
13231 correct lane number. */
13232 in0 = d->op0;
13233 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13235 switch (vmode)
13237 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13238 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13239 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13240 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13241 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13242 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13243 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13244 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13245 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13246 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13247 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13248 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13249 default:
13250 return false;
13253 emit_insn (gen (out, in0, lane));
13254 return true;
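/* Example: a selector of { 2, 2, 2, 2 } on V4SI is handled as a single
   DUP from lane 2 of the (possibly swapped) first operand.  */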
13257 static bool
13258 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13260 rtx rperm[MAX_VECT_LEN], sel;
13261 machine_mode vmode = d->vmode;
13262 unsigned int i, nelt = d->nelt;
13264 if (d->testing_p)
13265 return true;
13267 /* Generic code will try constant permutation twice: once with the
13268 original mode and again with the elements lowered to QImode.
13269 So wait and don't do the selector expansion ourselves. */
13270 if (vmode != V8QImode && vmode != V16QImode)
13271 return false;
13273 for (i = 0; i < nelt; ++i)
13275 int nunits = GET_MODE_NUNITS (vmode);
13277 /* If big-endian and two vectors we end up with a weird mixed-endian
13278 mode on NEON. Reverse the index within each word but not the word
13279 itself. */
13280 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13281 : d->perm[i]);
13283 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13284 sel = force_reg (vmode, sel);
13286 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13287 return true;
13290 static bool
13291 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13293 /* The pattern matching functions above are written to look for a small
13294 number to begin the sequence (0, 1, N/2). If we begin with an index
13295 from the second operand, we can swap the operands. */
13296 if (d->perm[0] >= d->nelt)
13298 unsigned i, nelt = d->nelt;
13300 gcc_assert (nelt == (nelt & -nelt));
13301 for (i = 0; i < nelt; ++i)
13302 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13304 std::swap (d->op0, d->op1);
13307 if (TARGET_SIMD)
13309 if (aarch64_evpc_rev (d))
13310 return true;
13311 else if (aarch64_evpc_ext (d))
13312 return true;
13313 else if (aarch64_evpc_dup (d))
13314 return true;
13315 else if (aarch64_evpc_zip (d))
13316 return true;
13317 else if (aarch64_evpc_uzp (d))
13318 return true;
13319 else if (aarch64_evpc_trn (d))
13320 return true;
13321 return aarch64_evpc_tbl (d);
13323 return false;
13326 /* Expand a vec_perm_const pattern. */
13328 bool
13329 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13331 struct expand_vec_perm_d d;
13332 int i, nelt, which;
13334 d.target = target;
13335 d.op0 = op0;
13336 d.op1 = op1;
13338 d.vmode = GET_MODE (target);
13339 gcc_assert (VECTOR_MODE_P (d.vmode));
13340 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13341 d.testing_p = false;
13343 for (i = which = 0; i < nelt; ++i)
13345 rtx e = XVECEXP (sel, 0, i);
13346 int ei = INTVAL (e) & (2 * nelt - 1);
13347 which |= (ei < nelt ? 1 : 2);
13348 d.perm[i] = ei;
13351 switch (which)
13353 default:
13354 gcc_unreachable ();
13356 case 3:
13357 d.one_vector_p = false;
13358 if (!rtx_equal_p (op0, op1))
13359 break;
13361 /* The elements of PERM do not suggest that only the first operand
13362 is used, but both operands are identical. Allow easier matching
13363 of the permutation by folding the permutation into the single
13364 input vector. */
13365 /* Fall Through. */
13366 case 2:
13367 for (i = 0; i < nelt; ++i)
13368 d.perm[i] &= nelt - 1;
13369 d.op0 = op1;
13370 d.one_vector_p = true;
13371 break;
13373 case 1:
13374 d.op1 = op0;
13375 d.one_vector_p = true;
13376 break;
13379 return aarch64_expand_vec_perm_const_1 (&d);
13382 static bool
13383 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13384 const unsigned char *sel)
13386 struct expand_vec_perm_d d;
13387 unsigned int i, nelt, which;
13388 bool ret;
13390 d.vmode = vmode;
13391 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13392 d.testing_p = true;
13393 memcpy (d.perm, sel, nelt);
13395 /* Calculate whether all elements are in one vector. */
13396 for (i = which = 0; i < nelt; ++i)
13398 unsigned char e = d.perm[i];
13399 gcc_assert (e < 2 * nelt);
13400 which |= (e < nelt ? 1 : 2);
13403 /* If all elements are from the second vector, reindex as if from the
13404 first vector. */
13405 if (which == 2)
13406 for (i = 0; i < nelt; ++i)
13407 d.perm[i] -= nelt;
13409 /* Check whether the mask can be applied to a single vector. */
13410 d.one_vector_p = (which != 3);
13412 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13413 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13414 if (!d.one_vector_p)
13415 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13417 start_sequence ();
13418 ret = aarch64_expand_vec_perm_const_1 (&d);
13419 end_sequence ();
13421 return ret;
13424 rtx
13425 aarch64_reverse_mask (enum machine_mode mode)
13427 /* We have to reverse each vector because we don't have
13428 a permuted load that can reverse-load according to ABI rules. */
13429 rtx mask;
13430 rtvec v = rtvec_alloc (16);
13431 int i, j;
13432 int nunits = GET_MODE_NUNITS (mode);
13433 int usize = GET_MODE_UNIT_SIZE (mode);
13435 gcc_assert (BYTES_BIG_ENDIAN);
13436 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13438 for (i = 0; i < nunits; i++)
13439 for (j = 0; j < usize; j++)
13440 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13441 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13442 return force_reg (V16QImode, mask);
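/* Example: for V4SI the byte selector built here is
   { 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 }, i.e. each
   32-bit element is byte-reversed in place.  */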
13445 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13446 However, due to issues with register allocation it is preferable to avoid
13447 tying integer scalar and FP scalar modes. Executing integer operations
13448 in general registers is better than treating them as scalar vector
13449 operations. This reduces latency and avoids redundant int<->FP moves.
13450 So tie modes if they are either the same class, or vector modes with
13451 other vector modes, vector structs or any scalar mode.
13454 bool
13455 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13457 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13458 return true;
13460 /* We specifically want to allow elements of "structure" modes to
13461 be tieable to the structure. This more general condition allows
13462 other rarer situations too. */
13463 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13464 return true;
13466 /* Also allow any scalar modes with vectors. */
13467 if (aarch64_vector_mode_supported_p (mode1)
13468 || aarch64_vector_mode_supported_p (mode2))
13469 return true;
13471 return false;
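/* For example, DImode ties with SImode (same class) and V4SImode ties with
   DFmode (vector with scalar), but DImode does not tie with DFmode, which
   is exactly the int<->FP pairing the comment above wants to avoid.  */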
13474 /* Return a new RTX holding the result of moving POINTER forward by
13475 AMOUNT bytes. */
13477 static rtx
13478 aarch64_move_pointer (rtx pointer, int amount)
13480 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13482 return adjust_automodify_address (pointer, GET_MODE (pointer),
13483 next, amount);
13486 /* Return a new RTX holding the result of moving POINTER forward by the
13487 size of the mode it points to. */
13489 static rtx
13490 aarch64_progress_pointer (rtx pointer)
13492 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13494 return aarch64_move_pointer (pointer, amount);
13497 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13498 MODE bytes. */
13500 static void
13501 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13502 machine_mode mode)
13504 rtx reg = gen_reg_rtx (mode);
13506 /* "Cast" the pointers to the correct mode. */
13507 *src = adjust_address (*src, mode, 0);
13508 *dst = adjust_address (*dst, mode, 0);
13509 /* Emit the memcpy. */
13510 emit_move_insn (reg, *src);
13511 emit_move_insn (*dst, reg);
13512 /* Move the pointers forward. */
13513 *src = aarch64_progress_pointer (*src);
13514 *dst = aarch64_progress_pointer (*dst);
13517 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13518 we succeed, otherwise return false. */
13520 bool
13521 aarch64_expand_movmem (rtx *operands)
13523 unsigned int n;
13524 rtx dst = operands[0];
13525 rtx src = operands[1];
13526 rtx base;
13527 bool speed_p = !optimize_function_for_size_p (cfun);
13529 /* When optimizing for size, give a better estimate of the length of a
13530 memcpy call, but use the default otherwise. */
13531 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13533 /* We can't do anything smart if the amount to copy is not constant. */
13534 if (!CONST_INT_P (operands[2]))
13535 return false;
13537 n = UINTVAL (operands[2]);
13539 /* Try to keep the number of instructions low. For cases below 16 bytes we
13540 need to make at most two moves. For cases above 16 bytes it will be one
13541 move for each 16 byte chunk, then at most two additional moves. */
13542 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13543 return false;
13545 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13546 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13548 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13549 src = adjust_automodify_address (src, VOIDmode, base, 0);
13551 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13552 1-byte chunk. */
13553 if (n < 4)
13555 if (n >= 2)
13557 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13558 n -= 2;
13561 if (n == 1)
13562 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13564 return true;
13567 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13568 4-byte chunk, partially overlapping with the previously copied chunk. */
13569 if (n < 8)
13571 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13572 n -= 4;
13573 if (n > 0)
13575 int move = n - 4;
13577 src = aarch64_move_pointer (src, move);
13578 dst = aarch64_move_pointer (dst, move);
13579 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13581 return true;
13584 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13585 them, then (if applicable) an 8-byte chunk. */
13586 while (n >= 8)
13588 if (n / 16)
13590 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13591 n -= 16;
13593 else
13595 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13596 n -= 8;
13600 /* Finish the final bytes of the copy. We can always do this in one
13601 instruction. We either copy the exact amount we need, or partially
13602 overlap with the previous chunk we copied and copy 8 bytes. */
13603 if (n == 0)
13604 return true;
13605 else if (n == 1)
13606 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13607 else if (n == 2)
13608 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13609 else if (n == 4)
13610 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13611 else
13613 if (n == 3)
13615 src = aarch64_move_pointer (src, -1);
13616 dst = aarch64_move_pointer (dst, -1);
13617 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13619 else
13621 int move = n - 8;
13623 src = aarch64_move_pointer (src, move);
13624 dst = aarch64_move_pointer (dst, move);
13625 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13629 return true;
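/* Worked example: a 13-byte copy becomes one 8-byte load/store for bytes
   0-7 followed by a second 8-byte load/store for bytes 5-12, overlapping
   by three bytes rather than issuing separate 4-, 2- and 1-byte tail
   copies.  */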
13632 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13633 SImode stores. Handle the case when the constant has identical
13634 bottom and top halves. This is beneficial when the two stores can be
13635 merged into an STP and we avoid synthesising potentially expensive
13636 immediates twice. Return true if such a split is possible. */
13638 bool
13639 aarch64_split_dimode_const_store (rtx dst, rtx src)
13641 rtx lo = gen_lowpart (SImode, src);
13642 rtx hi = gen_highpart_mode (SImode, DImode, src);
13644 bool size_p = optimize_function_for_size_p (cfun);
13646 if (!rtx_equal_p (lo, hi))
13647 return false;
13649 unsigned int orig_cost
13650 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13651 unsigned int lo_cost
13652 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13654 /* We want to transform:
13655 MOV x1, 49370
13656 MOVK x1, 0x140, lsl 16
13657 MOVK x1, 0xc0da, lsl 32
13658 MOVK x1, 0x140, lsl 48
13659 STR x1, [x0]
13660 into:
13661 MOV w1, 49370
13662 MOVK w1, 0x140, lsl 16
13663 STP w1, w1, [x0]
13664 So we want to perform this only when we save two instructions
13665 or more. When optimizing for size, however, accept any code size
13666 savings we can. */
13667 if (size_p && orig_cost <= lo_cost)
13668 return false;
13670 if (!size_p
13671 && (orig_cost <= lo_cost + 1))
13672 return false;
13674 rtx mem_lo = adjust_address (dst, SImode, 0);
13675 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13676 return false;
13678 rtx tmp_reg = gen_reg_rtx (SImode);
13679 aarch64_expand_mov_immediate (tmp_reg, lo);
13680 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13681 /* Don't emit an explicit store pair as this may not always be profitable.
13682 Let the sched-fusion logic decide whether to merge them. */
13683 emit_move_insn (mem_lo, tmp_reg);
13684 emit_move_insn (mem_hi, tmp_reg);
13686 return true;
13689 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
13691 static unsigned HOST_WIDE_INT
13692 aarch64_asan_shadow_offset (void)
13694 return (HOST_WIDE_INT_1 << 36);
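/* With ASan's usual 1/8th shadow mapping this places the shadow byte for
   an address A at (A >> 3) + (1 << 36); the generic asan.c code combines
   the offset with that scaling.  */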
13697 static bool
13698 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13699 unsigned int align,
13700 enum by_pieces_operation op,
13701 bool speed_p)
13703 /* STORE_BY_PIECES can be used when copying a constant string, but
13704 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13705 For now we always fail this and let the move_by_pieces code copy
13706 the string from read-only memory. */
13707 if (op == STORE_BY_PIECES)
13708 return false;
13710 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13713 static rtx
13714 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13715 int code, tree treeop0, tree treeop1)
13717 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13718 rtx op0, op1;
13719 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13720 insn_code icode;
13721 struct expand_operand ops[4];
13723 start_sequence ();
13724 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13726 op_mode = GET_MODE (op0);
13727 if (op_mode == VOIDmode)
13728 op_mode = GET_MODE (op1);
13730 switch (op_mode)
13732 case QImode:
13733 case HImode:
13734 case SImode:
13735 cmp_mode = SImode;
13736 icode = CODE_FOR_cmpsi;
13737 break;
13739 case DImode:
13740 cmp_mode = DImode;
13741 icode = CODE_FOR_cmpdi;
13742 break;
13744 case SFmode:
13745 cmp_mode = SFmode;
13746 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13747 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13748 break;
13750 case DFmode:
13751 cmp_mode = DFmode;
13752 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13753 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13754 break;
13756 default:
13757 end_sequence ();
13758 return NULL_RTX;
13761 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13762 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13763 if (!op0 || !op1)
13765 end_sequence ();
13766 return NULL_RTX;
13768 *prep_seq = get_insns ();
13769 end_sequence ();
13771 create_fixed_operand (&ops[0], op0);
13772 create_fixed_operand (&ops[1], op1);
13774 start_sequence ();
13775 if (!maybe_expand_insn (icode, 2, ops))
13777 end_sequence ();
13778 return NULL_RTX;
13780 *gen_seq = get_insns ();
13781 end_sequence ();
13783 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13784 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13787 static rtx
13788 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13789 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13791 rtx op0, op1, target;
13792 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13793 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13794 insn_code icode;
13795 struct expand_operand ops[6];
13796 int aarch64_cond;
13798 push_to_sequence (*prep_seq);
13799 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13801 op_mode = GET_MODE (op0);
13802 if (op_mode == VOIDmode)
13803 op_mode = GET_MODE (op1);
13805 switch (op_mode)
13807 case QImode:
13808 case HImode:
13809 case SImode:
13810 cmp_mode = SImode;
13811 icode = CODE_FOR_ccmpsi;
13812 break;
13814 case DImode:
13815 cmp_mode = DImode;
13816 icode = CODE_FOR_ccmpdi;
13817 break;
13819 case SFmode:
13820 cmp_mode = SFmode;
13821 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13822 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13823 break;
13825 case DFmode:
13826 cmp_mode = DFmode;
13827 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13828 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13829 break;
13831 default:
13832 end_sequence ();
13833 return NULL_RTX;
13836 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13837 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13838 if (!op0 || !op1)
13840 end_sequence ();
13841 return NULL_RTX;
13843 *prep_seq = get_insns ();
13844 end_sequence ();
13846 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13847 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13849 if (bit_code != AND)
13851 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13852 GET_MODE (XEXP (prev, 0))),
13853 VOIDmode, XEXP (prev, 0), const0_rtx);
13854 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13857 create_fixed_operand (&ops[0], XEXP (prev, 0));
13858 create_fixed_operand (&ops[1], target);
13859 create_fixed_operand (&ops[2], op0);
13860 create_fixed_operand (&ops[3], op1);
13861 create_fixed_operand (&ops[4], prev);
13862 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13864 push_to_sequence (*gen_seq);
13865 if (!maybe_expand_insn (icode, 6, ops))
13867 end_sequence ();
13868 return NULL_RTX;
13871 *gen_seq = get_insns ();
13872 end_sequence ();
13874 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13877 #undef TARGET_GEN_CCMP_FIRST
13878 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13880 #undef TARGET_GEN_CCMP_NEXT
13881 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
13883 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13884 instruction fusion of some sort. */
13886 static bool
13887 aarch64_macro_fusion_p (void)
13889 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13893 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13894 should be kept together during scheduling. */
13896 static bool
13897 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13899 rtx set_dest;
13900 rtx prev_set = single_set (prev);
13901 rtx curr_set = single_set (curr);
13902 /* prev and curr are simple SET insns i.e. no flag setting or branching. */
13903 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13905 if (!aarch64_macro_fusion_p ())
13906 return false;
13908 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13910 /* We are trying to match:
13911 prev (mov) == (set (reg r0) (const_int imm16))
13912 curr (movk) == (set (zero_extract (reg r0)
13913 (const_int 16)
13914 (const_int 16))
13915 (const_int imm16_1)) */
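/* In assembly terms this is, illustratively (register and immediates
   chosen arbitrarily):
     mov  w0, #0x1234
     movk w0, #0x5678, lsl 16  */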
13917 set_dest = SET_DEST (curr_set);
13919 if (GET_CODE (set_dest) == ZERO_EXTRACT
13920 && CONST_INT_P (SET_SRC (curr_set))
13921 && CONST_INT_P (SET_SRC (prev_set))
13922 && CONST_INT_P (XEXP (set_dest, 2))
13923 && INTVAL (XEXP (set_dest, 2)) == 16
13924 && REG_P (XEXP (set_dest, 0))
13925 && REG_P (SET_DEST (prev_set))
13926 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13928 return true;
13932 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13935 /* We're trying to match:
13936 prev (adrp) == (set (reg r1)
13937 (high (symbol_ref ("SYM"))))
13938 curr (add) == (set (reg r0)
13939 (lo_sum (reg r1)
13940 (symbol_ref ("SYM"))))
13941 Note that r0 need not necessarily be the same as r1, especially
13942 during pre-regalloc scheduling. */
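/* In assembly terms this is the usual address-materialization pair,
   illustratively (registers chosen arbitrarily):
     adrp x1, sym
     add  x0, x1, :lo12:sym  */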
13944 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13945 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13947 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13948 && REG_P (XEXP (SET_SRC (curr_set), 0))
13949 && REGNO (XEXP (SET_SRC (curr_set), 0))
13950 == REGNO (SET_DEST (prev_set))
13951 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13952 XEXP (SET_SRC (curr_set), 1)))
13953 return true;
13957 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13960 /* We're trying to match:
13961 prev (movk) == (set (zero_extract (reg r0)
13962 (const_int 16)
13963 (const_int 32))
13964 (const_int imm16_1))
13965 curr (movk) == (set (zero_extract (reg r0)
13966 (const_int 16)
13967 (const_int 48))
13968 (const_int imm16_2)) */
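/* In assembly terms, illustratively (register and immediates chosen
   arbitrarily):
     movk x0, #0x1234, lsl 32
     movk x0, #0x5678, lsl 48  */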
13970 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13971 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13972 && REG_P (XEXP (SET_DEST (prev_set), 0))
13973 && REG_P (XEXP (SET_DEST (curr_set), 0))
13974 && REGNO (XEXP (SET_DEST (prev_set), 0))
13975 == REGNO (XEXP (SET_DEST (curr_set), 0))
13976 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13977 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13978 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13979 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13980 && CONST_INT_P (SET_SRC (prev_set))
13981 && CONST_INT_P (SET_SRC (curr_set)))
13982 return true;
13985 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13987 /* We're trying to match:
13988 prev (adrp) == (set (reg r0)
13989 (high (symbol_ref ("SYM"))))
13990 curr (ldr) == (set (reg r1)
13991 (mem (lo_sum (reg r0)
13992 (symbol_ref ("SYM")))))
13994 curr (ldr) == (set (reg r1)
13995 (zero_extend (mem
13996 (lo_sum (reg r0)
13997 (symbol_ref ("SYM")))))) */
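/* In assembly terms, illustratively (registers chosen arbitrarily):
     adrp x0, sym
     ldr  x1, [x0, #:lo12:sym]  */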
13998 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13999 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
14001 rtx curr_src = SET_SRC (curr_set);
14003 if (GET_CODE (curr_src) == ZERO_EXTEND)
14004 curr_src = XEXP (curr_src, 0);
14006 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
14007 && REG_P (XEXP (XEXP (curr_src, 0), 0))
14008 && REGNO (XEXP (XEXP (curr_src, 0), 0))
14009 == REGNO (SET_DEST (prev_set))
14010 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
14011 XEXP (SET_SRC (prev_set), 0)))
14012 return true;
14016 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
14017 && aarch_crypto_can_dual_issue (prev, curr))
14018 return true;
14020 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
14021 && any_condjump_p (curr))
14023 enum attr_type prev_type = get_attr_type (prev);
14025 /* FIXME: this misses some instructions that are considered simple
14026 arithmetic instructions for ThunderX.  Simple shifts are missed here. */
14027 if (prev_type == TYPE_ALUS_SREG
14028 || prev_type == TYPE_ALUS_IMM
14029 || prev_type == TYPE_LOGICS_REG
14030 || prev_type == TYPE_LOGICS_IMM)
14031 return true;
14034 return false;
14037 /* Return true iff the instruction fusion described by OP is enabled. */
14039 bool
14040 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
14042 return (aarch64_tune_params.fusible_ops & op) != 0;
14045 /* If MEM is in the form of [base+offset], extract the two parts of the
14046 address and store them in BASE and OFFSET; otherwise return false
14047 after clearing BASE and OFFSET. */
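/* For example (illustrative): given (mem (plus (reg x1) (const_int 16)))
   this sets *BASE to the x1 register rtx and *OFFSET to (const_int 16);
   for a bare (mem (reg x1)) it sets *OFFSET to const0_rtx.  */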
14049 bool
14050 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14052 rtx addr;
14054 gcc_assert (MEM_P (mem));
14056 addr = XEXP (mem, 0);
14058 if (REG_P (addr))
14060 *base = addr;
14061 *offset = const0_rtx;
14062 return true;
14065 if (GET_CODE (addr) == PLUS
14066 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14068 *base = XEXP (addr, 0);
14069 *offset = XEXP (addr, 1);
14070 return true;
14073 *base = NULL_RTX;
14074 *offset = NULL_RTX;
14076 return false;
14079 /* Types for scheduling fusion. */
14080 enum sched_fusion_type
14082 SCHED_FUSION_NONE = 0,
14083 SCHED_FUSION_LD_SIGN_EXTEND,
14084 SCHED_FUSION_LD_ZERO_EXTEND,
14085 SCHED_FUSION_LD,
14086 SCHED_FUSION_ST,
14087 SCHED_FUSION_NUM
14090 /* If INSN is a load or store with an address in the form of [base+offset],
14091 extract the two parts and store them in BASE and OFFSET.  Return the
14092 scheduling fusion type of this INSN. */
14094 static enum sched_fusion_type
14095 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14097 rtx x, dest, src;
14098 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14100 gcc_assert (INSN_P (insn));
14101 x = PATTERN (insn);
14102 if (GET_CODE (x) != SET)
14103 return SCHED_FUSION_NONE;
14105 src = SET_SRC (x);
14106 dest = SET_DEST (x);
14108 machine_mode dest_mode = GET_MODE (dest);
14110 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14111 return SCHED_FUSION_NONE;
14113 if (GET_CODE (src) == SIGN_EXTEND)
14115 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14116 src = XEXP (src, 0);
14117 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14118 return SCHED_FUSION_NONE;
14120 else if (GET_CODE (src) == ZERO_EXTEND)
14122 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14123 src = XEXP (src, 0);
14124 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14125 return SCHED_FUSION_NONE;
14128 if (GET_CODE (src) == MEM && REG_P (dest))
14129 extract_base_offset_in_addr (src, base, offset);
14130 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14132 fusion = SCHED_FUSION_ST;
14133 extract_base_offset_in_addr (dest, base, offset);
14135 else
14136 return SCHED_FUSION_NONE;
14138 if (*base == NULL_RTX || *offset == NULL_RTX)
14139 fusion = SCHED_FUSION_NONE;
14141 return fusion;
14144 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14146 Currently we only support fusing ldr or str instructions, so FUSION_PRI
14147 and PRI are only calculated for these instructions.  For other instructions,
14148 FUSION_PRI and PRI are simply set to MAX_PRI - 1.  In the future, fusion of
14149 other instruction types can be added by returning different priorities.
14151 It's important that irrelevant instructions get the largest FUSION_PRI. */
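/* For example (illustrative): two ldr insns using the same base register
   receive the same FUSION_PRI, and the one with the smaller offset gets
   the larger PRI, so it is scheduled first.  */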
14153 static void
14154 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14155 int *fusion_pri, int *pri)
14157 int tmp, off_val;
14158 rtx base, offset;
14159 enum sched_fusion_type fusion;
14161 gcc_assert (INSN_P (insn));
14163 tmp = max_pri - 1;
14164 fusion = fusion_load_store (insn, &base, &offset);
14165 if (fusion == SCHED_FUSION_NONE)
14167 *pri = tmp;
14168 *fusion_pri = tmp;
14169 return;
14172 /* Set FUSION_PRI according to fusion type and base register. */
14173 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14175 /* Calculate PRI. */
14176 tmp /= 2;
14178 /* INSN with smaller offset goes first. */
14179 off_val = (int)(INTVAL (offset));
14180 if (off_val >= 0)
14181 tmp -= (off_val & 0xfffff);
14182 else
14183 tmp += ((- off_val) & 0xfffff);
14185 *pri = tmp;
14186 return;
14189 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14190 Adjust priority of sha1h instructions so they are scheduled before
14191 other SHA1 instructions. */
14193 static int
14194 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14196 rtx x = PATTERN (insn);
14198 if (GET_CODE (x) == SET)
14200 x = SET_SRC (x);
14202 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14203 return priority + 10;
14206 return priority;
14209 /* Given OPERANDS of consecutive load/store, check if we can merge
14210 them into ldp/stp. LOAD is true if they are load instructions.
14211 MODE is the mode of memory operands. */
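/* For example (illustrative), a pair such as
     ldr w0, [x2, 8]
     ldr w1, [x2, 12]
   can be merged into
     ldp w0, w1, [x2, 8]  */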
14213 bool
14214 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14215 enum machine_mode mode)
14217 HOST_WIDE_INT offval_1, offval_2, msize;
14218 enum reg_class rclass_1, rclass_2;
14219 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14221 if (load)
14223 mem_1 = operands[1];
14224 mem_2 = operands[3];
14225 reg_1 = operands[0];
14226 reg_2 = operands[2];
14227 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14228 if (REGNO (reg_1) == REGNO (reg_2))
14229 return false;
14231 else
14233 mem_1 = operands[0];
14234 mem_2 = operands[2];
14235 reg_1 = operands[1];
14236 reg_2 = operands[3];
14239 /* The mems cannot be volatile. */
14240 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14241 return false;
14243 /* If we have SImode and slow unaligned ldp,
14244 check that the alignment is at least 8 bytes. */
14245 if (mode == SImode
14246 && (aarch64_tune_params.extra_tuning_flags
14247 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14248 && !optimize_size
14249 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14250 return false;
14252 /* Check if the addresses are in the form of [base+offset]. */
14253 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14254 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14255 return false;
14256 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14257 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14258 return false;
14260 /* Check if the bases are the same. */
14261 if (!rtx_equal_p (base_1, base_2))
14262 return false;
14264 offval_1 = INTVAL (offset_1);
14265 offval_2 = INTVAL (offset_2);
14266 msize = GET_MODE_SIZE (mode);
14267 /* Check if the offsets are consecutive. */
14268 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14269 return false;
14271 /* Check if the addresses are clobbered by load. */
14272 if (load)
14274 if (reg_mentioned_p (reg_1, mem_1))
14275 return false;
14277 /* In increasing order, the last load can clobber the address. */
14278 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14279 return false;
14282 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14283 rclass_1 = FP_REGS;
14284 else
14285 rclass_1 = GENERAL_REGS;
14287 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14288 rclass_2 = FP_REGS;
14289 else
14290 rclass_2 = GENERAL_REGS;
14292 /* Check if the registers are of the same class. */
14293 if (rclass_1 != rclass_2)
14294 return false;
14296 return true;
14299 /* Given OPERANDS of consecutive load/store, check if we can merge
14300 them into ldp/stp by adjusting the offset. LOAD is true if they
14301 are load instructions. MODE is the mode of memory operands.
14303 Given the following consecutive stores:
14305 str w1, [xb, 0x100]
14306 str w1, [xb, 0x104]
14307 str w1, [xb, 0x108]
14308 str w1, [xb, 0x10c]
14310 Though the offsets are out of the range supported by stp, we can
14311 still pair them after adjusting the offset, like:
14313 add scratch, xb, 0x100
14314 stp w1, w1, [scratch]
14315 stp w1, w1, [scratch, 0x8]
14317 The peephole patterns detecting this opportunity should guarantee
14318 the scratch register is available. */
14320 bool
14321 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14322 enum machine_mode mode)
14324 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14325 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14326 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14327 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14329 if (load)
14331 reg_1 = operands[0];
14332 mem_1 = operands[1];
14333 reg_2 = operands[2];
14334 mem_2 = operands[3];
14335 reg_3 = operands[4];
14336 mem_3 = operands[5];
14337 reg_4 = operands[6];
14338 mem_4 = operands[7];
14339 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14340 && REG_P (reg_3) && REG_P (reg_4));
14341 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14342 return false;
14344 else
14346 mem_1 = operands[0];
14347 reg_1 = operands[1];
14348 mem_2 = operands[2];
14349 reg_2 = operands[3];
14350 mem_3 = operands[4];
14351 reg_3 = operands[5];
14352 mem_4 = operands[6];
14353 reg_4 = operands[7];
14355 /* Skip if the memory operand is by itself valid for ldp/stp. */
14356 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14357 return false;
14359 /* The mems cannot be volatile. */
14360 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14361 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14362 return false;
14364 /* Check if the addresses are in the form of [base+offset]. */
14365 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14366 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14367 return false;
14368 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14369 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14370 return false;
14371 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14372 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14373 return false;
14374 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14375 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14376 return false;
14378 /* Check if the bases are the same. */
14379 if (!rtx_equal_p (base_1, base_2)
14380 || !rtx_equal_p (base_2, base_3)
14381 || !rtx_equal_p (base_3, base_4))
14382 return false;
14384 offval_1 = INTVAL (offset_1);
14385 offval_2 = INTVAL (offset_2);
14386 offval_3 = INTVAL (offset_3);
14387 offval_4 = INTVAL (offset_4);
14388 msize = GET_MODE_SIZE (mode);
14389 /* Check if the offsets are consecutive. */
14390 if ((offval_1 != (offval_2 + msize)
14391 || offval_1 != (offval_3 + msize * 2)
14392 || offval_1 != (offval_4 + msize * 3))
14393 && (offval_4 != (offval_3 + msize)
14394 || offval_4 != (offval_2 + msize * 2)
14395 || offval_4 != (offval_1 + msize * 3)))
14396 return false;
14398 /* Check if the addresses are clobbered by load. */
14399 if (load)
14401 if (reg_mentioned_p (reg_1, mem_1)
14402 || reg_mentioned_p (reg_2, mem_2)
14403 || reg_mentioned_p (reg_3, mem_3))
14404 return false;
14406 /* In increasing order, the last load can clobber the address. */
14407 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14408 return false;
14411 /* If we have SImode and slow unaligned ldp,
14412 check that the alignment is at least 8 bytes. */
14413 if (mode == SImode
14414 && (aarch64_tune_params.extra_tuning_flags
14415 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14416 && !optimize_size
14417 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14418 return false;
14420 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14421 rclass_1 = FP_REGS;
14422 else
14423 rclass_1 = GENERAL_REGS;
14425 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14426 rclass_2 = FP_REGS;
14427 else
14428 rclass_2 = GENERAL_REGS;
14430 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14431 rclass_3 = FP_REGS;
14432 else
14433 rclass_3 = GENERAL_REGS;
14435 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14436 rclass_4 = FP_REGS;
14437 else
14438 rclass_4 = GENERAL_REGS;
14440 /* Check if the registers are of the same class. */
14441 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14442 return false;
14444 return true;
14447 /* Given OPERANDS of consecutive load/store, this function pairs them
14448 into ldp/stp after adjusting the offset. It depends on the fact
14449 that addresses of load/store instructions are in increasing order.
14450 MODE is the mode of memory operands. CODE is the rtl operator
14451 which should be applied to all memory operands, it's SIGN_EXTEND,
14452 ZERO_EXTEND or UNKNOWN. */
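/* Illustrative walk-through of the offset arithmetic below for the SImode
   stores shown above: msize is 4, so stp_off_limit is 0x100; an offset of
   0x100 therefore splits into adj_off 0x100 and new_off 0, producing the
   add/stp/stp sequence in that example.  */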
14454 bool
14455 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14456 enum machine_mode mode, RTX_CODE code)
14458 rtx base, offset, t1, t2;
14459 rtx mem_1, mem_2, mem_3, mem_4;
14460 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14462 if (load)
14464 mem_1 = operands[1];
14465 mem_2 = operands[3];
14466 mem_3 = operands[5];
14467 mem_4 = operands[7];
14469 else
14471 mem_1 = operands[0];
14472 mem_2 = operands[2];
14473 mem_3 = operands[4];
14474 mem_4 = operands[6];
14475 gcc_assert (code == UNKNOWN);
14478 extract_base_offset_in_addr (mem_1, &base, &offset);
14479 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14481 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14482 msize = GET_MODE_SIZE (mode);
14483 stp_off_limit = msize * 0x40;
14484 off_val = INTVAL (offset);
14485 abs_off = (off_val < 0) ? -off_val : off_val;
14486 new_off = abs_off % stp_off_limit;
14487 adj_off = abs_off - new_off;
14489 /* Further adjust to make sure all offsets are OK. */
14490 if ((new_off + msize * 2) >= stp_off_limit)
14492 adj_off += stp_off_limit;
14493 new_off -= stp_off_limit;
14496 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14497 if (adj_off >= 0x1000)
14498 return false;
14500 if (off_val < 0)
14502 adj_off = -adj_off;
14503 new_off = -new_off;
14506 /* Create new memory references. */
14507 mem_1 = change_address (mem_1, VOIDmode,
14508 plus_constant (DImode, operands[8], new_off));
14510 /* Check if the adjusted address is OK for ldp/stp. */
14511 if (!aarch64_mem_pair_operand (mem_1, mode))
14512 return false;
14514 msize = GET_MODE_SIZE (mode);
14515 mem_2 = change_address (mem_2, VOIDmode,
14516 plus_constant (DImode,
14517 operands[8],
14518 new_off + msize));
14519 mem_3 = change_address (mem_3, VOIDmode,
14520 plus_constant (DImode,
14521 operands[8],
14522 new_off + msize * 2));
14523 mem_4 = change_address (mem_4, VOIDmode,
14524 plus_constant (DImode,
14525 operands[8],
14526 new_off + msize * 3));
14528 if (code == ZERO_EXTEND)
14530 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14531 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14532 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14533 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14535 else if (code == SIGN_EXTEND)
14537 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14538 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14539 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14540 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14543 if (load)
14545 operands[1] = mem_1;
14546 operands[3] = mem_2;
14547 operands[5] = mem_3;
14548 operands[7] = mem_4;
14550 else
14552 operands[0] = mem_1;
14553 operands[2] = mem_2;
14554 operands[4] = mem_3;
14555 operands[6] = mem_4;
14558 /* Emit adjusting instruction. */
14559 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14560 /* Emit ldp/stp instructions. */
14561 t1 = gen_rtx_SET (operands[0], operands[1]);
14562 t2 = gen_rtx_SET (operands[2], operands[3]);
14563 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14564 t1 = gen_rtx_SET (operands[4], operands[5]);
14565 t2 = gen_rtx_SET (operands[6], operands[7]);
14566 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14567 return true;
14570 /* Return true if a pseudo register should be created and used to hold
14571 the GOT address for PIC code. */
14573 bool
14574 aarch64_use_pseudo_pic_reg (void)
14576 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14579 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14581 static int
14582 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14584 switch (XINT (x, 1))
14586 case UNSPEC_GOTSMALLPIC:
14587 case UNSPEC_GOTSMALLPIC28K:
14588 case UNSPEC_GOTTINYPIC:
14589 return 0;
14590 default:
14591 break;
14594 return default_unspec_may_trap_p (x, flags);
14598 /* If X is a positive CONST_DOUBLE with a value that is a power of 2,
14599 return the log2 of that value.  Otherwise return -1. */
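/* For example (illustrative): 8.0 yields 3 and 1.0 yields 0, while 0.75,
   -2.0, infinities and NaNs all yield -1.  */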
14602 aarch64_fpconst_pow_of_2 (rtx x)
14604 const REAL_VALUE_TYPE *r;
14606 if (!CONST_DOUBLE_P (x))
14607 return -1;
14609 r = CONST_DOUBLE_REAL_VALUE (x);
14611 if (REAL_VALUE_NEGATIVE (*r)
14612 || REAL_VALUE_ISNAN (*r)
14613 || REAL_VALUE_ISINF (*r)
14614 || !real_isinteger (r, DFmode))
14615 return -1;
14617 return exact_log2 (real_to_integer (r));
14620 /* If X is a vector of equal CONST_DOUBLE values and that value is
14621 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14624 aarch64_vec_fpconst_pow_of_2 (rtx x)
14626 if (GET_CODE (x) != CONST_VECTOR)
14627 return -1;
14629 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14630 return -1;
14632 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14633 if (firstval <= 0)
14634 return -1;
14636 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14637 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14638 return -1;
14640 return firstval;
14643 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14644 to float.
14646 __fp16 always promotes through this hook.
14647 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14648 through the generic excess precision logic rather than here. */
14650 static tree
14651 aarch64_promoted_type (const_tree t)
14653 if (SCALAR_FLOAT_TYPE_P (t)
14654 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14655 return float_type_node;
14657 return NULL_TREE;
14660 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14662 static bool
14663 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14664 optimization_type opt_type)
14666 switch (op)
14668 case rsqrt_optab:
14669 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14671 default:
14672 return true;
14676 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
14677 if MODE is HFmode, and punt to the generic implementation otherwise. */
14679 static bool
14680 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14682 return (mode == HFmode
14683 ? true
14684 : default_libgcc_floating_mode_supported_p (mode));
14687 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14688 if MODE is HFmode, and punt to the generic implementation otherwise. */
14690 static bool
14691 aarch64_scalar_mode_supported_p (machine_mode mode)
14693 return (mode == HFmode
14694 ? true
14695 : default_scalar_mode_supported_p (mode));
14698 /* Set the value of FLT_EVAL_METHOD.
14699 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14701 0: evaluate all operations and constants, whose semantic type has at
14702 most the range and precision of type float, to the range and
14703 precision of float; evaluate all other operations and constants to
14704 the range and precision of the semantic type;
14706 N, where _FloatN is a supported interchange floating type
14707 evaluate all operations and constants, whose semantic type has at
14708 most the range and precision of _FloatN type, to the range and
14709 precision of the _FloatN type; evaluate all other operations and
14710 constants to the range and precision of the semantic type;
14712 If we have the ARMv8.2-A extensions then we support _Float16 in native
14713 precision, so we should set this to 16. Otherwise, we support the type,
14714 but want to evaluate expressions in float precision, so set this to
14715 0. */
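/* Illustrative example of the difference: given
     _Float16 a, b, c;  c = a + b;
   with TARGET_FP_F16INST the addition can be evaluated directly in
   _Float16, whereas with FLT_EVAL_METHOD 0 the operands are promoted to
   float, added, and the result converted back to _Float16.  */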
14717 static enum flt_eval_method
14718 aarch64_excess_precision (enum excess_precision_type type)
14720 switch (type)
14722 case EXCESS_PRECISION_TYPE_FAST:
14723 case EXCESS_PRECISION_TYPE_STANDARD:
14724 /* We can calculate either in 16-bit range and precision or
14725 32-bit range and precision. Make that decision based on whether
14726 we have native support for the ARMv8.2-A 16-bit floating-point
14727 instructions or not. */
14728 return (TARGET_FP_F16INST
14729 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14730 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14731 case EXCESS_PRECISION_TYPE_IMPLICIT:
14732 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14733 default:
14734 gcc_unreachable ();
14736 return FLT_EVAL_METHOD_UNPREDICTABLE;
14739 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
14740 scheduled for speculative execution. Reject the long-running division
14741 and square-root instructions. */
14743 static bool
14744 aarch64_sched_can_speculate_insn (rtx_insn *insn)
14746 switch (get_attr_type (insn))
14748 case TYPE_SDIV:
14749 case TYPE_UDIV:
14750 case TYPE_FDIVS:
14751 case TYPE_FDIVD:
14752 case TYPE_FSQRTS:
14753 case TYPE_FSQRTD:
14754 case TYPE_NEON_FP_SQRT_S:
14755 case TYPE_NEON_FP_SQRT_D:
14756 case TYPE_NEON_FP_SQRT_S_Q:
14757 case TYPE_NEON_FP_SQRT_D_Q:
14758 case TYPE_NEON_FP_DIV_S:
14759 case TYPE_NEON_FP_DIV_D:
14760 case TYPE_NEON_FP_DIV_S_Q:
14761 case TYPE_NEON_FP_DIV_D_Q:
14762 return false;
14763 default:
14764 return true;
14768 /* Target-specific selftests. */
14770 #if CHECKING_P
14772 namespace selftest {
14774 /* Selftest for the RTL loader.
14775 Verify that the RTL loader copes with a dump from
14776 print_rtx_function. This is essentially just a test that class
14777 function_reader can handle a real dump, but it also verifies
14778 that lookup_reg_by_dump_name correctly handles hard regs.
14779 The presence of hard reg names in the dump means that the test is
14780 target-specific, hence it is in this file. */
14782 static void
14783 aarch64_test_loading_full_dump ()
14785 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14787 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14789 rtx_insn *insn_1 = get_insn_by_uid (1);
14790 ASSERT_EQ (NOTE, GET_CODE (insn_1));
14792 rtx_insn *insn_15 = get_insn_by_uid (15);
14793 ASSERT_EQ (INSN, GET_CODE (insn_15));
14794 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14796 /* Verify crtl->return_rtx. */
14797 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14798 ASSERT_EQ (0, REGNO (crtl->return_rtx));
14799 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14802 /* Run all target-specific selftests. */
14804 static void
14805 aarch64_run_selftests (void)
14807 aarch64_test_loading_full_dump ();
14810 } // namespace selftest
14812 #endif /* #if CHECKING_P */
14814 #undef TARGET_ADDRESS_COST
14815 #define TARGET_ADDRESS_COST aarch64_address_cost
14817 /* This hook determines whether unnamed bitfields affect the alignment
14818 of the containing structure. The hook returns true if the structure
14819 should inherit the alignment requirements of an unnamed bitfield's
14820 type. */
14821 #undef TARGET_ALIGN_ANON_BITFIELD
14822 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14824 #undef TARGET_ASM_ALIGNED_DI_OP
14825 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14827 #undef TARGET_ASM_ALIGNED_HI_OP
14828 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14830 #undef TARGET_ASM_ALIGNED_SI_OP
14831 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14833 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14834 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14835 hook_bool_const_tree_hwi_hwi_const_tree_true
14837 #undef TARGET_ASM_FILE_START
14838 #define TARGET_ASM_FILE_START aarch64_start_file
14840 #undef TARGET_ASM_OUTPUT_MI_THUNK
14841 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14843 #undef TARGET_ASM_SELECT_RTX_SECTION
14844 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14846 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14847 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14849 #undef TARGET_BUILD_BUILTIN_VA_LIST
14850 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14852 #undef TARGET_CALLEE_COPIES
14853 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14855 #undef TARGET_CAN_ELIMINATE
14856 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14858 #undef TARGET_CAN_INLINE_P
14859 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14861 #undef TARGET_CANNOT_FORCE_CONST_MEM
14862 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14864 #undef TARGET_CASE_VALUES_THRESHOLD
14865 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14867 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14868 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14870 /* Only the least significant bit is used for initialization guard
14871 variables. */
14872 #undef TARGET_CXX_GUARD_MASK_BIT
14873 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14875 #undef TARGET_C_MODE_FOR_SUFFIX
14876 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14878 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14879 #undef TARGET_DEFAULT_TARGET_FLAGS
14880 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14881 #endif
14883 #undef TARGET_CLASS_MAX_NREGS
14884 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14886 #undef TARGET_BUILTIN_DECL
14887 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14889 #undef TARGET_BUILTIN_RECIPROCAL
14890 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14892 #undef TARGET_C_EXCESS_PRECISION
14893 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14895 #undef TARGET_EXPAND_BUILTIN
14896 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14898 #undef TARGET_EXPAND_BUILTIN_VA_START
14899 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14901 #undef TARGET_FOLD_BUILTIN
14902 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14904 #undef TARGET_FUNCTION_ARG
14905 #define TARGET_FUNCTION_ARG aarch64_function_arg
14907 #undef TARGET_FUNCTION_ARG_ADVANCE
14908 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14910 #undef TARGET_FUNCTION_ARG_BOUNDARY
14911 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14913 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14914 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14916 #undef TARGET_FUNCTION_VALUE
14917 #define TARGET_FUNCTION_VALUE aarch64_function_value
14919 #undef TARGET_FUNCTION_VALUE_REGNO_P
14920 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14922 #undef TARGET_FRAME_POINTER_REQUIRED
14923 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14925 #undef TARGET_GIMPLE_FOLD_BUILTIN
14926 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14928 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14929 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14931 #undef TARGET_INIT_BUILTINS
14932 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14934 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14935 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14936 aarch64_ira_change_pseudo_allocno_class
14938 #undef TARGET_LEGITIMATE_ADDRESS_P
14939 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14941 #undef TARGET_LEGITIMATE_CONSTANT_P
14942 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14944 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14945 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14946 aarch64_legitimize_address_displacement
14948 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14949 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14951 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14952 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14953 aarch64_libgcc_floating_mode_supported_p
14955 #undef TARGET_MANGLE_TYPE
14956 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14958 #undef TARGET_MEMORY_MOVE_COST
14959 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14961 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14962 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14964 #undef TARGET_MUST_PASS_IN_STACK
14965 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14967 /* This target hook should return true if accesses to volatile bitfields
14968 should use the narrowest mode possible. It should return false if these
14969 accesses should use the bitfield container type. */
14970 #undef TARGET_NARROW_VOLATILE_BITFIELD
14971 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14973 #undef TARGET_OPTION_OVERRIDE
14974 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14976 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14977 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14978 aarch64_override_options_after_change
14980 #undef TARGET_OPTION_SAVE
14981 #define TARGET_OPTION_SAVE aarch64_option_save
14983 #undef TARGET_OPTION_RESTORE
14984 #define TARGET_OPTION_RESTORE aarch64_option_restore
14986 #undef TARGET_OPTION_PRINT
14987 #define TARGET_OPTION_PRINT aarch64_option_print
14989 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14990 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14992 #undef TARGET_SET_CURRENT_FUNCTION
14993 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14995 #undef TARGET_PASS_BY_REFERENCE
14996 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14998 #undef TARGET_PREFERRED_RELOAD_CLASS
14999 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
15001 #undef TARGET_SCHED_REASSOCIATION_WIDTH
15002 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
15004 #undef TARGET_PROMOTED_TYPE
15005 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
15007 #undef TARGET_SECONDARY_RELOAD
15008 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
15010 #undef TARGET_SHIFT_TRUNCATION_MASK
15011 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
15013 #undef TARGET_SETUP_INCOMING_VARARGS
15014 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
15016 #undef TARGET_STRUCT_VALUE_RTX
15017 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
15019 #undef TARGET_REGISTER_MOVE_COST
15020 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
15022 #undef TARGET_RETURN_IN_MEMORY
15023 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
15025 #undef TARGET_RETURN_IN_MSB
15026 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
15028 #undef TARGET_RTX_COSTS
15029 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
15031 #undef TARGET_SCALAR_MODE_SUPPORTED_P
15032 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
15034 #undef TARGET_SCHED_ISSUE_RATE
15035 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
15037 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
15038 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
15039 aarch64_sched_first_cycle_multipass_dfa_lookahead
15041 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
15042 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
15043 aarch64_first_cycle_multipass_dfa_lookahead_guard
15045 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
15046 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
15047 aarch64_get_separate_components
15049 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15050 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15051 aarch64_components_for_bb
15053 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15054 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15055 aarch64_disqualify_components
15057 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15058 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15059 aarch64_emit_prologue_components
15061 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15062 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15063 aarch64_emit_epilogue_components
15065 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15066 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15067 aarch64_set_handled_components
15069 #undef TARGET_TRAMPOLINE_INIT
15070 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15072 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15073 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15075 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15076 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15078 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15079 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15080 aarch64_builtin_support_vector_misalignment
15082 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15083 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15085 #undef TARGET_VECTORIZE_ADD_STMT_COST
15086 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15088 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15089 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15090 aarch64_builtin_vectorization_cost
15092 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15093 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15095 #undef TARGET_VECTORIZE_BUILTINS
15096 #define TARGET_VECTORIZE_BUILTINS
15098 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15099 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15100 aarch64_builtin_vectorized_function
15102 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15103 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15104 aarch64_autovectorize_vector_sizes
15106 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15107 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15108 aarch64_atomic_assign_expand_fenv
15110 /* Section anchor support. */
15112 #undef TARGET_MIN_ANCHOR_OFFSET
15113 #define TARGET_MIN_ANCHOR_OFFSET -256
15115 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15116 byte offset; we can do much more for larger data types, but have no way
15117 to determine the size of the access. We assume accesses are aligned. */
15118 #undef TARGET_MAX_ANCHOR_OFFSET
15119 #define TARGET_MAX_ANCHOR_OFFSET 4095
15121 #undef TARGET_VECTOR_ALIGNMENT
15122 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15124 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15125 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15126 aarch64_simd_vector_alignment_reachable
15128 /* vec_perm support. */
15130 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15131 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15132 aarch64_vectorize_vec_perm_const_ok
15134 #undef TARGET_INIT_LIBFUNCS
15135 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15137 #undef TARGET_FIXED_CONDITION_CODE_REGS
15138 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15140 #undef TARGET_FLAGS_REGNUM
15141 #define TARGET_FLAGS_REGNUM CC_REGNUM
15143 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15144 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15146 #undef TARGET_ASAN_SHADOW_OFFSET
15147 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15149 #undef TARGET_LEGITIMIZE_ADDRESS
15150 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15152 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15153 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15154 aarch64_use_by_pieces_infrastructure_p
15156 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15157 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15159 #undef TARGET_CAN_USE_DOLOOP_P
15160 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15162 #undef TARGET_SCHED_ADJUST_PRIORITY
15163 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15165 #undef TARGET_SCHED_MACRO_FUSION_P
15166 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15168 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15169 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15171 #undef TARGET_SCHED_FUSION_PRIORITY
15172 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15174 #undef TARGET_UNSPEC_MAY_TRAP_P
15175 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15177 #undef TARGET_USE_PSEUDO_PIC_REG
15178 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15180 #undef TARGET_PRINT_OPERAND
15181 #define TARGET_PRINT_OPERAND aarch64_print_operand
15183 #undef TARGET_PRINT_OPERAND_ADDRESS
15184 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15186 #undef TARGET_OPTAB_SUPPORTED_P
15187 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15189 #undef TARGET_OMIT_STRUCT_RETURN_REG
15190 #define TARGET_OMIT_STRUCT_RETURN_REG true
15192 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15193 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15194 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15196 #if CHECKING_P
15197 #undef TARGET_RUN_TARGET_SELFTESTS
15198 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15199 #endif /* #if CHECKING_P */
15201 struct gcc_target targetm = TARGET_INITIALIZER;
15203 #include "gt-aarch64.h"