[AArch64] Fix bootstrap due to wide_int .elt (0) uninit warning
[official-gcc.git] / gcc / config / aarch64 / aarch64.c
blob 714bb79f52016a4e9ae15539412f94945e4911e7
1 /* Machine description for AArch64 architecture.
2 Copyright (C) 2009-2017 Free Software Foundation, Inc.
3 Contributed by ARM Ltd.
5 This file is part of GCC.
7 GCC is free software; you can redistribute it and/or modify it
8 under the terms of the GNU General Public License as published by
9 the Free Software Foundation; either version 3, or (at your option)
10 any later version.
12 GCC is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 General Public License for more details.
17 You should have received a copy of the GNU General Public License
18 along with GCC; see the file COPYING3. If not see
19 <http://www.gnu.org/licenses/>. */
21 #include "config.h"
22 #define INCLUDE_STRING
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "memmodel.h"
30 #include "gimple.h"
31 #include "cfghooks.h"
32 #include "cfgloop.h"
33 #include "df.h"
34 #include "tm_p.h"
35 #include "stringpool.h"
36 #include "optabs.h"
37 #include "regs.h"
38 #include "emit-rtl.h"
39 #include "recog.h"
40 #include "diagnostic.h"
41 #include "insn-attr.h"
42 #include "alias.h"
43 #include "fold-const.h"
44 #include "stor-layout.h"
45 #include "calls.h"
46 #include "varasm.h"
47 #include "output.h"
48 #include "flags.h"
49 #include "explow.h"
50 #include "expr.h"
51 #include "reload.h"
52 #include "langhooks.h"
53 #include "opts.h"
54 #include "params.h"
55 #include "gimplify.h"
56 #include "dwarf2.h"
57 #include "gimple-iterator.h"
58 #include "tree-vectorizer.h"
59 #include "aarch64-cost-tables.h"
60 #include "dumpfile.h"
61 #include "builtins.h"
62 #include "rtl-iter.h"
63 #include "tm-constrs.h"
64 #include "sched-int.h"
65 #include "target-globals.h"
66 #include "common/common-target.h"
67 #include "selftest.h"
68 #include "selftest-rtl.h"
70 /* This file should be included last. */
71 #include "target-def.h"
73 /* Defined for convenience. */
74 #define POINTER_BYTES (POINTER_SIZE / BITS_PER_UNIT)
76 /* Classifies an address.
78 ADDRESS_REG_IMM
79 A simple base register plus immediate offset.
81 ADDRESS_REG_WB
82 A base register indexed by immediate offset with writeback.
84 ADDRESS_REG_REG
85 A base register indexed by (optionally scaled) register.
87 ADDRESS_REG_UXTW
88 A base register indexed by (optionally scaled) zero-extended register.
90 ADDRESS_REG_SXTW
91 A base register indexed by (optionally scaled) sign-extended register.
93 ADDRESS_LO_SUM
94 A LO_SUM rtx with a base register and "LO12" symbol relocation.
96 ADDRESS_SYMBOLIC:
97 A constant symbolic address, in pc-relative literal pool. */
99 enum aarch64_address_type {
100 ADDRESS_REG_IMM,
101 ADDRESS_REG_WB,
102 ADDRESS_REG_REG,
103 ADDRESS_REG_UXTW,
104 ADDRESS_REG_SXTW,
105 ADDRESS_LO_SUM,
106 ADDRESS_SYMBOLIC
109 struct aarch64_address_info {
110 enum aarch64_address_type type;
111 rtx base;
112 rtx offset;
113 int shift;
114 enum aarch64_symbol_type symbol_type;
117 struct simd_immediate_info
119 rtx value;
120 int shift;
121 int element_width;
122 bool mvn;
123 bool msl;
126 /* The current code model. */
127 enum aarch64_code_model aarch64_cmodel;
129 #ifdef HAVE_AS_TLS
130 #undef TARGET_HAVE_TLS
131 #define TARGET_HAVE_TLS 1
132 #endif
134 static bool aarch64_composite_type_p (const_tree, machine_mode);
135 static bool aarch64_vfp_is_call_or_return_candidate (machine_mode,
136 const_tree,
137 machine_mode *, int *,
138 bool *);
139 static void aarch64_elf_asm_constructor (rtx, int) ATTRIBUTE_UNUSED;
140 static void aarch64_elf_asm_destructor (rtx, int) ATTRIBUTE_UNUSED;
141 static void aarch64_override_options_after_change (void);
142 static bool aarch64_vector_mode_supported_p (machine_mode);
143 static bool aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
144 const unsigned char *sel);
145 static int aarch64_address_cost (rtx, machine_mode, addr_space_t, bool);
146 static bool aarch64_builtin_support_vector_misalignment (machine_mode mode,
147 const_tree type,
148 int misalignment,
149 bool is_packed);
151 /* Major revision number of the ARM Architecture implemented by the target. */
152 unsigned aarch64_architecture_version;
154 /* The processor for which instructions should be scheduled. */
155 enum aarch64_processor aarch64_tune = cortexa53;
157 /* Mask to specify which instruction scheduling options should be used. */
158 unsigned long aarch64_tune_flags = 0;
160 /* Global flag for PC relative loads. */
161 bool aarch64_pcrelative_literal_loads;
163 /* Support for command line parsing of boolean flags in the tuning
164 structures. */
165 struct aarch64_flag_desc
167 const char* name;
168 unsigned int flag;
171 #define AARCH64_FUSION_PAIR(name, internal_name) \
172 { name, AARCH64_FUSE_##internal_name },
173 static const struct aarch64_flag_desc aarch64_fusible_pairs[] =
175 { "none", AARCH64_FUSE_NOTHING },
176 #include "aarch64-fusion-pairs.def"
177 { "all", AARCH64_FUSE_ALL },
178 { NULL, AARCH64_FUSE_NOTHING }
181 #define AARCH64_EXTRA_TUNING_OPTION(name, internal_name) \
182 { name, AARCH64_EXTRA_TUNE_##internal_name },
183 static const struct aarch64_flag_desc aarch64_tuning_flags[] =
185 { "none", AARCH64_EXTRA_TUNE_NONE },
186 #include "aarch64-tuning-flags.def"
187 { "all", AARCH64_EXTRA_TUNE_ALL },
188 { NULL, AARCH64_EXTRA_TUNE_NONE }
191 /* Tuning parameters. */
193 static const struct cpu_addrcost_table generic_addrcost_table =
196 0, /* hi */
197 0, /* si */
198 0, /* di */
199 0, /* ti */
201 0, /* pre_modify */
202 0, /* post_modify */
203 0, /* register_offset */
204 0, /* register_sextend */
205 0, /* register_zextend */
206 0 /* imm_offset */
209 static const struct cpu_addrcost_table cortexa57_addrcost_table =
212 1, /* hi */
213 0, /* si */
214 0, /* di */
215 1, /* ti */
217 0, /* pre_modify */
218 0, /* post_modify */
219 0, /* register_offset */
220 0, /* register_sextend */
221 0, /* register_zextend */
222 0, /* imm_offset */
225 static const struct cpu_addrcost_table exynosm1_addrcost_table =
228 0, /* hi */
229 0, /* si */
230 0, /* di */
231 2, /* ti */
233 0, /* pre_modify */
234 0, /* post_modify */
235 1, /* register_offset */
236 1, /* register_sextend */
237 2, /* register_zextend */
238 0, /* imm_offset */
241 static const struct cpu_addrcost_table xgene1_addrcost_table =
244 1, /* hi */
245 0, /* si */
246 0, /* di */
247 1, /* ti */
249 1, /* pre_modify */
250 0, /* post_modify */
251 0, /* register_offset */
252 1, /* register_sextend */
253 1, /* register_zextend */
254 0, /* imm_offset */
257 static const struct cpu_addrcost_table qdf24xx_addrcost_table =
260 1, /* hi */
261 0, /* si */
262 0, /* di */
263 1, /* ti */
265 0, /* pre_modify */
266 0, /* post_modify */
267 0, /* register_offset */
268 0, /* register_sextend */
269 0, /* register_zextend */
270 0 /* imm_offset */
273 static const struct cpu_addrcost_table thunderx2t99_addrcost_table =
276 1, /* hi */
277 1, /* si */
278 1, /* di */
279 2, /* ti */
281 0, /* pre_modify */
282 0, /* post_modify */
283 2, /* register_offset */
284 3, /* register_sextend */
285 3, /* register_zextend */
286 0, /* imm_offset */
289 static const struct cpu_regmove_cost generic_regmove_cost =
291 1, /* GP2GP */
292 /* Avoid the use of slow int<->fp moves for spilling by setting
293 their cost higher than memmov_cost. */
294 5, /* GP2FP */
295 5, /* FP2GP */
296 2 /* FP2FP */
299 static const struct cpu_regmove_cost cortexa57_regmove_cost =
301 1, /* GP2GP */
302 /* Avoid the use of slow int<->fp moves for spilling by setting
303 their cost higher than memmov_cost. */
304 5, /* GP2FP */
305 5, /* FP2GP */
306 2 /* FP2FP */
309 static const struct cpu_regmove_cost cortexa53_regmove_cost =
311 1, /* GP2GP */
312 /* Avoid the use of slow int<->fp moves for spilling by setting
313 their cost higher than memmov_cost. */
314 5, /* GP2FP */
315 5, /* FP2GP */
316 2 /* FP2FP */
319 static const struct cpu_regmove_cost exynosm1_regmove_cost =
321 1, /* GP2GP */
322 /* Avoid the use of slow int<->fp moves for spilling by setting
323 their cost higher than memmov_cost (actual, 4 and 9). */
324 9, /* GP2FP */
325 9, /* FP2GP */
326 1 /* FP2FP */
329 static const struct cpu_regmove_cost thunderx_regmove_cost =
331 2, /* GP2GP */
332 2, /* GP2FP */
333 6, /* FP2GP */
334 4 /* FP2FP */
337 static const struct cpu_regmove_cost xgene1_regmove_cost =
339 1, /* GP2GP */
340 /* Avoid the use of slow int<->fp moves for spilling by setting
341 their cost higher than memmov_cost. */
342 8, /* GP2FP */
343 8, /* FP2GP */
344 2 /* FP2FP */
347 static const struct cpu_regmove_cost qdf24xx_regmove_cost =
349 2, /* GP2GP */
350 /* Avoid the use of int<->fp moves for spilling. */
351 6, /* GP2FP */
352 6, /* FP2GP */
353 4 /* FP2FP */
356 static const struct cpu_regmove_cost thunderx2t99_regmove_cost =
358 1, /* GP2GP */
359 /* Avoid the use of int<->fp moves for spilling. */
360 8, /* GP2FP */
361 8, /* FP2GP */
362 4 /* FP2FP */
365 /* Generic costs for vector insn classes. */
366 static const struct cpu_vector_cost generic_vector_cost =
368 1, /* scalar_int_stmt_cost */
369 1, /* scalar_fp_stmt_cost */
370 1, /* scalar_load_cost */
371 1, /* scalar_store_cost */
372 1, /* vec_int_stmt_cost */
373 1, /* vec_fp_stmt_cost */
374 2, /* vec_permute_cost */
375 1, /* vec_to_scalar_cost */
376 1, /* scalar_to_vec_cost */
377 1, /* vec_align_load_cost */
378 1, /* vec_unalign_load_cost */
379 1, /* vec_unalign_store_cost */
380 1, /* vec_store_cost */
381 3, /* cond_taken_branch_cost */
382 1 /* cond_not_taken_branch_cost */
385 /* ThunderX costs for vector insn classes. */
386 static const struct cpu_vector_cost thunderx_vector_cost =
388 1, /* scalar_int_stmt_cost */
389 1, /* scalar_fp_stmt_cost */
390 3, /* scalar_load_cost */
391 1, /* scalar_store_cost */
392 4, /* vec_int_stmt_cost */
393 4, /* vec_fp_stmt_cost */
394 4, /* vec_permute_cost */
395 2, /* vec_to_scalar_cost */
396 2, /* scalar_to_vec_cost */
397 3, /* vec_align_load_cost */
398 10, /* vec_unalign_load_cost */
399 10, /* vec_unalign_store_cost */
400 1, /* vec_store_cost */
401 3, /* cond_taken_branch_cost */
402 3 /* cond_not_taken_branch_cost */
405 /* Generic costs for vector insn classes. */
406 static const struct cpu_vector_cost cortexa57_vector_cost =
408 1, /* scalar_int_stmt_cost */
409 1, /* scalar_fp_stmt_cost */
410 4, /* scalar_load_cost */
411 1, /* scalar_store_cost */
412 2, /* vec_int_stmt_cost */
413 2, /* vec_fp_stmt_cost */
414 3, /* vec_permute_cost */
415 8, /* vec_to_scalar_cost */
416 8, /* scalar_to_vec_cost */
417 4, /* vec_align_load_cost */
418 4, /* vec_unalign_load_cost */
419 1, /* vec_unalign_store_cost */
420 1, /* vec_store_cost */
421 1, /* cond_taken_branch_cost */
422 1 /* cond_not_taken_branch_cost */
425 static const struct cpu_vector_cost exynosm1_vector_cost =
427 1, /* scalar_int_stmt_cost */
428 1, /* scalar_fp_stmt_cost */
429 5, /* scalar_load_cost */
430 1, /* scalar_store_cost */
431 3, /* vec_int_stmt_cost */
432 3, /* vec_fp_stmt_cost */
433 3, /* vec_permute_cost */
434 3, /* vec_to_scalar_cost */
435 3, /* scalar_to_vec_cost */
436 5, /* vec_align_load_cost */
437 5, /* vec_unalign_load_cost */
438 1, /* vec_unalign_store_cost */
439 1, /* vec_store_cost */
440 1, /* cond_taken_branch_cost */
441 1 /* cond_not_taken_branch_cost */
444 /* Generic costs for vector insn classes. */
445 static const struct cpu_vector_cost xgene1_vector_cost =
447 1, /* scalar_int_stmt_cost */
448 1, /* scalar_fp_stmt_cost */
449 5, /* scalar_load_cost */
450 1, /* scalar_store_cost */
451 2, /* vec_int_stmt_cost */
452 2, /* vec_fp_stmt_cost */
453 2, /* vec_permute_cost */
454 4, /* vec_to_scalar_cost */
455 4, /* scalar_to_vec_cost */
456 10, /* vec_align_load_cost */
457 10, /* vec_unalign_load_cost */
458 2, /* vec_unalign_store_cost */
459 2, /* vec_store_cost */
460 2, /* cond_taken_branch_cost */
461 1 /* cond_not_taken_branch_cost */
464 /* Costs for vector insn classes for Vulcan. */
465 static const struct cpu_vector_cost thunderx2t99_vector_cost =
467 1, /* scalar_int_stmt_cost */
468 6, /* scalar_fp_stmt_cost */
469 4, /* scalar_load_cost */
470 1, /* scalar_store_cost */
471 5, /* vec_int_stmt_cost */
472 6, /* vec_fp_stmt_cost */
473 3, /* vec_permute_cost */
474 6, /* vec_to_scalar_cost */
475 5, /* scalar_to_vec_cost */
476 8, /* vec_align_load_cost */
477 8, /* vec_unalign_load_cost */
478 4, /* vec_unalign_store_cost */
479 4, /* vec_store_cost */
480 2, /* cond_taken_branch_cost */
481 1 /* cond_not_taken_branch_cost */
484 /* Generic costs for branch instructions. */
485 static const struct cpu_branch_cost generic_branch_cost =
487 2, /* Predictable. */
488 2 /* Unpredictable. */
491 /* Branch costs for Cortex-A57. */
492 static const struct cpu_branch_cost cortexa57_branch_cost =
494 1, /* Predictable. */
495 3 /* Unpredictable. */
498 /* Branch costs for Vulcan. */
499 static const struct cpu_branch_cost thunderx2t99_branch_cost =
501 1, /* Predictable. */
502 3 /* Unpredictable. */
505 /* Generic approximation modes. */
506 static const cpu_approx_modes generic_approx_modes =
508 AARCH64_APPROX_NONE, /* division */
509 AARCH64_APPROX_NONE, /* sqrt */
510 AARCH64_APPROX_NONE /* recip_sqrt */
513 /* Approximation modes for Exynos M1. */
514 static const cpu_approx_modes exynosm1_approx_modes =
516 AARCH64_APPROX_NONE, /* division */
517 AARCH64_APPROX_ALL, /* sqrt */
518 AARCH64_APPROX_ALL /* recip_sqrt */
521 /* Approximation modes for X-Gene 1. */
522 static const cpu_approx_modes xgene1_approx_modes =
524 AARCH64_APPROX_NONE, /* division */
525 AARCH64_APPROX_NONE, /* sqrt */
526 AARCH64_APPROX_ALL /* recip_sqrt */
529 static const struct tune_params generic_tunings =
531 &cortexa57_extra_costs,
532 &generic_addrcost_table,
533 &generic_regmove_cost,
534 &generic_vector_cost,
535 &generic_branch_cost,
536 &generic_approx_modes,
537 4, /* memmov_cost */
538 2, /* issue_rate */
539 AARCH64_FUSE_NOTHING, /* fusible_ops */
540 8, /* function_align. */
541 8, /* jump_align. */
542 4, /* loop_align. */
543 2, /* int_reassoc_width. */
544 4, /* fp_reassoc_width. */
545 1, /* vec_reassoc_width. */
546 2, /* min_div_recip_mul_sf. */
547 2, /* min_div_recip_mul_df. */
548 0, /* max_case_values. */
549 0, /* cache_line_size. */
550 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
551 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
554 static const struct tune_params cortexa35_tunings =
556 &cortexa53_extra_costs,
557 &generic_addrcost_table,
558 &cortexa53_regmove_cost,
559 &generic_vector_cost,
560 &cortexa57_branch_cost,
561 &generic_approx_modes,
562 4, /* memmov_cost */
563 1, /* issue_rate */
564 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
565 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
566 16, /* function_align. */
567 8, /* jump_align. */
568 8, /* loop_align. */
569 2, /* int_reassoc_width. */
570 4, /* fp_reassoc_width. */
571 1, /* vec_reassoc_width. */
572 2, /* min_div_recip_mul_sf. */
573 2, /* min_div_recip_mul_df. */
574 0, /* max_case_values. */
575 0, /* cache_line_size. */
576 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
577 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
580 static const struct tune_params cortexa53_tunings =
582 &cortexa53_extra_costs,
583 &generic_addrcost_table,
584 &cortexa53_regmove_cost,
585 &generic_vector_cost,
586 &cortexa57_branch_cost,
587 &generic_approx_modes,
588 4, /* memmov_cost */
589 2, /* issue_rate */
590 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
591 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
592 16, /* function_align. */
593 8, /* jump_align. */
594 8, /* loop_align. */
595 2, /* int_reassoc_width. */
596 4, /* fp_reassoc_width. */
597 1, /* vec_reassoc_width. */
598 2, /* min_div_recip_mul_sf. */
599 2, /* min_div_recip_mul_df. */
600 0, /* max_case_values. */
601 0, /* cache_line_size. */
602 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
603 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
606 static const struct tune_params cortexa57_tunings =
608 &cortexa57_extra_costs,
609 &cortexa57_addrcost_table,
610 &cortexa57_regmove_cost,
611 &cortexa57_vector_cost,
612 &cortexa57_branch_cost,
613 &generic_approx_modes,
614 4, /* memmov_cost */
615 3, /* issue_rate */
616 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
617 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
618 16, /* function_align. */
619 8, /* jump_align. */
620 8, /* loop_align. */
621 2, /* int_reassoc_width. */
622 4, /* fp_reassoc_width. */
623 1, /* vec_reassoc_width. */
624 2, /* min_div_recip_mul_sf. */
625 2, /* min_div_recip_mul_df. */
626 0, /* max_case_values. */
627 0, /* cache_line_size. */
628 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
629 (AARCH64_EXTRA_TUNE_RENAME_FMA_REGS) /* tune_flags. */
632 static const struct tune_params cortexa72_tunings =
634 &cortexa57_extra_costs,
635 &cortexa57_addrcost_table,
636 &cortexa57_regmove_cost,
637 &cortexa57_vector_cost,
638 &cortexa57_branch_cost,
639 &generic_approx_modes,
640 4, /* memmov_cost */
641 3, /* issue_rate */
642 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
643 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops */
644 16, /* function_align. */
645 8, /* jump_align. */
646 8, /* loop_align. */
647 2, /* int_reassoc_width. */
648 4, /* fp_reassoc_width. */
649 1, /* vec_reassoc_width. */
650 2, /* min_div_recip_mul_sf. */
651 2, /* min_div_recip_mul_df. */
652 0, /* max_case_values. */
653 0, /* cache_line_size. */
654 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
655 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
658 static const struct tune_params cortexa73_tunings =
660 &cortexa57_extra_costs,
661 &cortexa57_addrcost_table,
662 &cortexa57_regmove_cost,
663 &cortexa57_vector_cost,
664 &cortexa57_branch_cost,
665 &generic_approx_modes,
666 4, /* memmov_cost. */
667 2, /* issue_rate. */
668 (AARCH64_FUSE_AES_AESMC | AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
669 | AARCH64_FUSE_MOVK_MOVK | AARCH64_FUSE_ADRP_LDR), /* fusible_ops */
670 16, /* function_align. */
671 8, /* jump_align. */
672 8, /* loop_align. */
673 2, /* int_reassoc_width. */
674 4, /* fp_reassoc_width. */
675 1, /* vec_reassoc_width. */
676 2, /* min_div_recip_mul_sf. */
677 2, /* min_div_recip_mul_df. */
678 0, /* max_case_values. */
679 0, /* cache_line_size. */
680 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
681 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
684 static const struct tune_params exynosm1_tunings =
686 &exynosm1_extra_costs,
687 &exynosm1_addrcost_table,
688 &exynosm1_regmove_cost,
689 &exynosm1_vector_cost,
690 &generic_branch_cost,
691 &exynosm1_approx_modes,
692 4, /* memmov_cost */
693 3, /* issue_rate */
694 (AARCH64_FUSE_AES_AESMC), /* fusible_ops */
695 4, /* function_align. */
696 4, /* jump_align. */
697 4, /* loop_align. */
698 2, /* int_reassoc_width. */
699 4, /* fp_reassoc_width. */
700 1, /* vec_reassoc_width. */
701 2, /* min_div_recip_mul_sf. */
702 2, /* min_div_recip_mul_df. */
703 48, /* max_case_values. */
704 64, /* cache_line_size. */
705 tune_params::AUTOPREFETCHER_WEAK, /* autoprefetcher_model. */
706 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
709 static const struct tune_params thunderx_tunings =
711 &thunderx_extra_costs,
712 &generic_addrcost_table,
713 &thunderx_regmove_cost,
714 &thunderx_vector_cost,
715 &generic_branch_cost,
716 &generic_approx_modes,
717 6, /* memmov_cost */
718 2, /* issue_rate */
719 AARCH64_FUSE_CMP_BRANCH, /* fusible_ops */
720 8, /* function_align. */
721 8, /* jump_align. */
722 8, /* loop_align. */
723 2, /* int_reassoc_width. */
724 4, /* fp_reassoc_width. */
725 1, /* vec_reassoc_width. */
726 2, /* min_div_recip_mul_sf. */
727 2, /* min_div_recip_mul_df. */
728 0, /* max_case_values. */
729 0, /* cache_line_size. */
730 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
731 (AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW) /* tune_flags. */
734 static const struct tune_params xgene1_tunings =
736 &xgene1_extra_costs,
737 &xgene1_addrcost_table,
738 &xgene1_regmove_cost,
739 &xgene1_vector_cost,
740 &generic_branch_cost,
741 &xgene1_approx_modes,
742 6, /* memmov_cost */
743 4, /* issue_rate */
744 AARCH64_FUSE_NOTHING, /* fusible_ops */
745 16, /* function_align. */
746 8, /* jump_align. */
747 16, /* loop_align. */
748 2, /* int_reassoc_width. */
749 4, /* fp_reassoc_width. */
750 1, /* vec_reassoc_width. */
751 2, /* min_div_recip_mul_sf. */
752 2, /* min_div_recip_mul_df. */
753 0, /* max_case_values. */
754 0, /* cache_line_size. */
755 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
756 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
759 static const struct tune_params qdf24xx_tunings =
761 &qdf24xx_extra_costs,
762 &qdf24xx_addrcost_table,
763 &qdf24xx_regmove_cost,
764 &generic_vector_cost,
765 &generic_branch_cost,
766 &generic_approx_modes,
767 4, /* memmov_cost */
768 4, /* issue_rate */
769 (AARCH64_FUSE_MOV_MOVK | AARCH64_FUSE_ADRP_ADD
 770	 | AARCH64_FUSE_MOVK_MOVK), /* fusible_ops  */
771 16, /* function_align. */
772 8, /* jump_align. */
773 16, /* loop_align. */
774 2, /* int_reassoc_width. */
775 4, /* fp_reassoc_width. */
776 1, /* vec_reassoc_width. */
777 2, /* min_div_recip_mul_sf. */
778 2, /* min_div_recip_mul_df. */
779 0, /* max_case_values. */
780 64, /* cache_line_size. */
781 tune_params::AUTOPREFETCHER_STRONG, /* autoprefetcher_model. */
782 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
785 static const struct tune_params thunderx2t99_tunings =
787 &thunderx2t99_extra_costs,
788 &thunderx2t99_addrcost_table,
789 &thunderx2t99_regmove_cost,
790 &thunderx2t99_vector_cost,
791 &thunderx2t99_branch_cost,
792 &generic_approx_modes,
793 4, /* memmov_cost. */
794 4, /* issue_rate. */
795 (AARCH64_FUSE_CMP_BRANCH | AARCH64_FUSE_AES_AESMC), /* fusible_ops */
796 16, /* function_align. */
797 8, /* jump_align. */
798 16, /* loop_align. */
799 3, /* int_reassoc_width. */
800 2, /* fp_reassoc_width. */
801 2, /* vec_reassoc_width. */
802 2, /* min_div_recip_mul_sf. */
803 2, /* min_div_recip_mul_df. */
804 0, /* max_case_values. */
805 64, /* cache_line_size. */
806 tune_params::AUTOPREFETCHER_OFF, /* autoprefetcher_model. */
807 (AARCH64_EXTRA_TUNE_NONE) /* tune_flags. */
810 /* Support for fine-grained override of the tuning structures. */
811 struct aarch64_tuning_override_function
813 const char* name;
814 void (*parse_override)(const char*, struct tune_params*);
817 static void aarch64_parse_fuse_string (const char*, struct tune_params*);
818 static void aarch64_parse_tune_string (const char*, struct tune_params*);
820 static const struct aarch64_tuning_override_function
821 aarch64_tuning_override_functions[] =
823 { "fuse", aarch64_parse_fuse_string },
824 { "tune", aarch64_parse_tune_string },
825 { NULL, NULL }
828 /* A processor implementing AArch64. */
829 struct processor
831 const char *const name;
832 enum aarch64_processor ident;
833 enum aarch64_processor sched_core;
834 enum aarch64_arch arch;
835 unsigned architecture_version;
836 const unsigned long flags;
837 const struct tune_params *const tune;
840 /* Architectures implementing AArch64. */
841 static const struct processor all_architectures[] =
843 #define AARCH64_ARCH(NAME, CORE, ARCH_IDENT, ARCH_REV, FLAGS) \
844 {NAME, CORE, CORE, AARCH64_ARCH_##ARCH_IDENT, ARCH_REV, FLAGS, NULL},
845 #include "aarch64-arches.def"
846 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
849 /* Processor cores implementing AArch64. */
850 static const struct processor all_cores[] =
852 #define AARCH64_CORE(NAME, IDENT, SCHED, ARCH, FLAGS, COSTS, IMP, PART, VARIANT) \
853 {NAME, IDENT, SCHED, AARCH64_ARCH_##ARCH, \
854 all_architectures[AARCH64_ARCH_##ARCH].architecture_version, \
855 FLAGS, &COSTS##_tunings},
856 #include "aarch64-cores.def"
857 {"generic", generic, cortexa53, AARCH64_ARCH_8A, 8,
858 AARCH64_FL_FOR_ARCH8, &generic_tunings},
859 {NULL, aarch64_none, aarch64_none, aarch64_no_arch, 0, 0, NULL}
863 /* Target specification. These are populated by the -march, -mtune, -mcpu
864 handling code or by target attributes. */
865 static const struct processor *selected_arch;
866 static const struct processor *selected_cpu;
867 static const struct processor *selected_tune;
869 /* The current tuning set. */
870 struct tune_params aarch64_tune_params = generic_tunings;
872 #define AARCH64_CPU_DEFAULT_FLAGS ((selected_cpu) ? selected_cpu->flags : 0)
874 /* An ISA extension in the co-processor and main instruction set space. */
875 struct aarch64_option_extension
877 const char *const name;
878 const unsigned long flags_on;
879 const unsigned long flags_off;
882 typedef enum aarch64_cond_code
884 AARCH64_EQ = 0, AARCH64_NE, AARCH64_CS, AARCH64_CC, AARCH64_MI, AARCH64_PL,
885 AARCH64_VS, AARCH64_VC, AARCH64_HI, AARCH64_LS, AARCH64_GE, AARCH64_LT,
886 AARCH64_GT, AARCH64_LE, AARCH64_AL, AARCH64_NV
888 aarch64_cc;
890 #define AARCH64_INVERSE_CONDITION_CODE(X) ((aarch64_cc) (((int) X) ^ 1))
892 /* The condition codes of the processor, and the inverse function. */
893 static const char * const aarch64_condition_codes[] =
895 "eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
896 "hi", "ls", "ge", "lt", "gt", "le", "al", "nv"
899 /* Generate code to enable conditional branches in functions over 1 MiB. */
900 const char *
901 aarch64_gen_far_branch (rtx * operands, int pos_label, const char * dest,
902 const char * branch_format)
904 rtx_code_label * tmp_label = gen_label_rtx ();
905 char label_buf[256];
906 char buffer[128];
907 ASM_GENERATE_INTERNAL_LABEL (label_buf, dest,
908 CODE_LABEL_NUMBER (tmp_label));
909 const char *label_ptr = targetm.strip_name_encoding (label_buf);
910 rtx dest_label = operands[pos_label];
911 operands[pos_label] = tmp_label;
913 snprintf (buffer, sizeof (buffer), "%s%s", branch_format, label_ptr);
914 output_asm_insn (buffer, operands);
916 snprintf (buffer, sizeof (buffer), "b\t%%l%d\n%s:", pos_label, label_ptr);
917 operands[pos_label] = dest_label;
918 output_asm_insn (buffer, operands);
919 return "";
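/* Illustrative example (added for exposition, not part of the original
   source): when a conditional branch's target lies beyond the +/-1 MiB
   range of B.cond/CBZ, the caller supplies the already-inverted condition
   in BRANCH_FORMAT and the routine above emits roughly

	b.ge	.Lbcond0	// inverted condition, short range
	b	.Lfar_target	// unconditional B reaches +/-128 MiB
   .Lbcond0:

   so only the unconditional branch needs the long reach.  The label names
   here are illustrative.  */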
922 void
923 aarch64_err_no_fpadvsimd (machine_mode mode, const char *msg)
925 const char *mc = FLOAT_MODE_P (mode) ? "floating-point" : "vector";
926 if (TARGET_GENERAL_REGS_ONLY)
927 error ("%qs is incompatible with %s %s", "-mgeneral-regs-only", mc, msg);
928 else
929 error ("%qs feature modifier is incompatible with %s %s", "+nofp", mc, msg);
932 /* Implement TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS.
933 The register allocator chooses ALL_REGS if FP_REGS and GENERAL_REGS have
934 the same cost even if ALL_REGS has a much larger cost. ALL_REGS is also
935 used if the cost of both FP_REGS and GENERAL_REGS is lower than the memory
936 cost (in this case the best class is the lowest cost one). Using ALL_REGS
 937	   irrespective of its cost results in bad allocations with many redundant
938 int<->FP moves which are expensive on various cores.
939 To avoid this we don't allow ALL_REGS as the allocno class, but force a
940 decision between FP_REGS and GENERAL_REGS. We use the allocno class if it
941 isn't ALL_REGS. Similarly, use the best class if it isn't ALL_REGS.
942 Otherwise set the allocno class depending on the mode.
943 The result of this is that it is no longer inefficient to have a higher
944 memory move cost than the register move cost.
947 static reg_class_t
948 aarch64_ira_change_pseudo_allocno_class (int regno, reg_class_t allocno_class,
949 reg_class_t best_class)
951 enum machine_mode mode;
953 if (allocno_class != ALL_REGS)
954 return allocno_class;
956 if (best_class != ALL_REGS)
957 return best_class;
959 mode = PSEUDO_REGNO_MODE (regno);
960 return FLOAT_MODE_P (mode) || VECTOR_MODE_P (mode) ? FP_REGS : GENERAL_REGS;
963 static unsigned int
964 aarch64_min_divisions_for_recip_mul (enum machine_mode mode)
966 if (GET_MODE_UNIT_SIZE (mode) == 4)
967 return aarch64_tune_params.min_div_recip_mul_sf;
968 return aarch64_tune_params.min_div_recip_mul_df;
971 static int
972 aarch64_reassociation_width (unsigned opc ATTRIBUTE_UNUSED,
973 enum machine_mode mode)
975 if (VECTOR_MODE_P (mode))
976 return aarch64_tune_params.vec_reassoc_width;
977 if (INTEGRAL_MODE_P (mode))
978 return aarch64_tune_params.int_reassoc_width;
979 if (FLOAT_MODE_P (mode))
980 return aarch64_tune_params.fp_reassoc_width;
981 return 1;
984 /* Provide a mapping from gcc register numbers to dwarf register numbers. */
985 unsigned
986 aarch64_dbx_register_number (unsigned regno)
988 if (GP_REGNUM_P (regno))
989 return AARCH64_DWARF_R0 + regno - R0_REGNUM;
990 else if (regno == SP_REGNUM)
991 return AARCH64_DWARF_SP;
992 else if (FP_REGNUM_P (regno))
993 return AARCH64_DWARF_V0 + regno - V0_REGNUM;
995 /* Return values >= DWARF_FRAME_REGISTERS indicate that there is no
996 equivalent DWARF register. */
997 return DWARF_FRAME_REGISTERS;
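/* Example (editor's illustration of the mapping above): x0-x30 map to
   DWARF registers 0-30, the stack pointer maps to 31 and v0-v31 map to
   64-95, following the AArch64 DWARF register numbering used by
   AARCH64_DWARF_R0, AARCH64_DWARF_SP and AARCH64_DWARF_V0.  */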
1000 /* Return TRUE if MODE is any of the large INT modes. */
1001 static bool
1002 aarch64_vect_struct_mode_p (machine_mode mode)
1004 return mode == OImode || mode == CImode || mode == XImode;
1007 /* Return TRUE if MODE is any of the vector modes. */
1008 static bool
1009 aarch64_vector_mode_p (machine_mode mode)
1011 return aarch64_vector_mode_supported_p (mode)
1012 || aarch64_vect_struct_mode_p (mode);
1015 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P. */
1016 static bool
1017 aarch64_array_mode_supported_p (machine_mode mode,
1018 unsigned HOST_WIDE_INT nelems)
1020 if (TARGET_SIMD
1021 && (AARCH64_VALID_SIMD_QREG_MODE (mode)
1022 || AARCH64_VALID_SIMD_DREG_MODE (mode))
1023 && (nelems >= 2 && nelems <= 4))
1024 return true;
1026 return false;
1029 /* Implement HARD_REGNO_NREGS. */
1032 aarch64_hard_regno_nregs (unsigned regno, machine_mode mode)
1034 switch (aarch64_regno_regclass (regno))
1036 case FP_REGS:
1037 case FP_LO_REGS:
1038 return (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG;
1039 default:
1040 return (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
1042 gcc_unreachable ();
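/* Worked example (editor's illustration, assuming UNITS_PER_WORD == 8 and
   UNITS_PER_VREG == 16): a TImode value (16 bytes) occupies one FP/SIMD
   register but two general registers, and an OImode value (32 bytes)
   occupies two FP/SIMD registers.  */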
1045 /* Implement HARD_REGNO_MODE_OK. */
1048 aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
1050 if (GET_MODE_CLASS (mode) == MODE_CC)
1051 return regno == CC_REGNUM;
1053 if (regno == SP_REGNUM)
1054 /* The purpose of comparing with ptr_mode is to support the
1055 global register variable associated with the stack pointer
1056 register via the syntax of asm ("wsp") in ILP32. */
1057 return mode == Pmode || mode == ptr_mode;
1059 if (regno == FRAME_POINTER_REGNUM || regno == ARG_POINTER_REGNUM)
1060 return mode == Pmode;
1062 if (GP_REGNUM_P (regno) && ! aarch64_vect_struct_mode_p (mode))
1063 return 1;
1065 if (FP_REGNUM_P (regno))
1067 if (aarch64_vect_struct_mode_p (mode))
1068 return
1069 (regno + aarch64_hard_regno_nregs (regno, mode) - 1) <= V31_REGNUM;
1070 else
1071 return 1;
1074 return 0;
1077 /* Implement HARD_REGNO_CALLER_SAVE_MODE. */
1078 machine_mode
1079 aarch64_hard_regno_caller_save_mode (unsigned regno, unsigned nregs,
1080 machine_mode mode)
1082 /* Handle modes that fit within single registers. */
1083 if (nregs == 1 && GET_MODE_SIZE (mode) <= 16)
1085 if (GET_MODE_SIZE (mode) >= 4)
1086 return mode;
1087 else
1088 return SImode;
1090 /* Fall back to generic for multi-reg and very large modes. */
1091 else
1092 return choose_hard_reg_mode (regno, nregs, false);
1095 /* Return true if calls to DECL should be treated as
1096	   long-calls (i.e. called via a register).  */
1097 static bool
1098 aarch64_decl_is_long_call_p (const_tree decl ATTRIBUTE_UNUSED)
1100 return false;
1103 /* Return true if calls to symbol-ref SYM should be treated as
1104	   long-calls (i.e. called via a register).  */
1105 bool
1106 aarch64_is_long_call_p (rtx sym)
1108 return aarch64_decl_is_long_call_p (SYMBOL_REF_DECL (sym));
1111 /* Return true if calls to symbol-ref SYM should not go through
1112 plt stubs. */
1114 bool
1115 aarch64_is_noplt_call_p (rtx sym)
1117 const_tree decl = SYMBOL_REF_DECL (sym);
1119 if (flag_pic
1120 && decl
1121 && (!flag_plt
1122 || lookup_attribute ("noplt", DECL_ATTRIBUTES (decl)))
1123 && !targetm.binds_local_p (decl))
1124 return true;
1126 return false;
1129 /* Return true if the offsets to a zero/sign-extract operation
1130 represent an expression that matches an extend operation. The
1131	   operands represent the parameters from
1133 (extract:MODE (mult (reg) (MULT_IMM)) (EXTRACT_IMM) (const_int 0)). */
1134 bool
1135 aarch64_is_extend_from_extract (machine_mode mode, rtx mult_imm,
1136 rtx extract_imm)
1138 HOST_WIDE_INT mult_val, extract_val;
1140 if (! CONST_INT_P (mult_imm) || ! CONST_INT_P (extract_imm))
1141 return false;
1143 mult_val = INTVAL (mult_imm);
1144 extract_val = INTVAL (extract_imm);
1146 if (extract_val > 8
1147 && extract_val < GET_MODE_BITSIZE (mode)
1148 && exact_log2 (extract_val & ~7) > 0
1149 && (extract_val & 7) <= 4
1150 && mult_val == (1 << (extract_val & 7)))
1151 return true;
1153 return false;
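/* Example (added for illustration): for DImode, MULT_IMM == 4 and
   EXTRACT_IMM == 34 satisfy the test above: extracting the low 34 bits of
   (reg * 4) is equivalent to zero-extending the low 32 bits of REG and
   shifting left by 2, i.e. the UXTW #2 form used in extended register
   operands.  */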
1156 /* Emit an insn that's a simple single-set. Both the operands must be
1157 known to be valid. */
1158 inline static rtx_insn *
1159 emit_set_insn (rtx x, rtx y)
1161 return emit_insn (gen_rtx_SET (x, y));
1164 /* X and Y are two things to compare using CODE. Emit the compare insn and
1165 return the rtx for register 0 in the proper mode. */
1167 aarch64_gen_compare_reg (RTX_CODE code, rtx x, rtx y)
1169 machine_mode mode = SELECT_CC_MODE (code, x, y);
1170 rtx cc_reg = gen_rtx_REG (mode, CC_REGNUM);
1172 emit_set_insn (cc_reg, gen_rtx_COMPARE (mode, x, y));
1173 return cc_reg;
1176 /* Build the SYMBOL_REF for __tls_get_addr. */
1178 static GTY(()) rtx tls_get_addr_libfunc;
1181 aarch64_tls_get_addr (void)
1183 if (!tls_get_addr_libfunc)
1184 tls_get_addr_libfunc = init_one_libfunc ("__tls_get_addr");
1185 return tls_get_addr_libfunc;
1188 /* Return the TLS model to use for ADDR. */
1190 static enum tls_model
1191 tls_symbolic_operand_type (rtx addr)
1193 enum tls_model tls_kind = TLS_MODEL_NONE;
1194 rtx sym, addend;
1196 if (GET_CODE (addr) == CONST)
1198 split_const (addr, &sym, &addend);
1199 if (GET_CODE (sym) == SYMBOL_REF)
1200 tls_kind = SYMBOL_REF_TLS_MODEL (sym);
1202 else if (GET_CODE (addr) == SYMBOL_REF)
1203 tls_kind = SYMBOL_REF_TLS_MODEL (addr);
1205 return tls_kind;
1208	/* We allow lo_sum's in addresses recognized as legitimate so
1209	   that combine can take care of combining addresses where
1210	   necessary, but for generation purposes we generate the address
1211	   as:
1212 RTL Absolute
1213 tmp = hi (symbol_ref); adrp x1, foo
1214 dest = lo_sum (tmp, symbol_ref); add dest, x1, :lo_12:foo
1217 PIC TLS
1218 adrp x1, :got:foo adrp tmp, :tlsgd:foo
1219 ldr x1, [:got_lo12:foo] add dest, tmp, :tlsgd_lo12:foo
1220 bl __tls_get_addr
1223 Load TLS symbol, depending on TLS mechanism and TLS access model.
1225 Global Dynamic - Traditional TLS:
1226 adrp tmp, :tlsgd:imm
1227 add dest, tmp, #:tlsgd_lo12:imm
1228 bl __tls_get_addr
1230 Global Dynamic - TLS Descriptors:
1231 adrp dest, :tlsdesc:imm
1232 ldr tmp, [dest, #:tlsdesc_lo12:imm]
1233 add dest, dest, #:tlsdesc_lo12:imm
1234 blr tmp
1235 mrs tp, tpidr_el0
1236 add dest, dest, tp
1238 Initial Exec:
1239 mrs tp, tpidr_el0
1240 adrp tmp, :gottprel:imm
1241 ldr dest, [tmp, #:gottprel_lo12:imm]
1242 add dest, dest, tp
1244 Local Exec:
1245 mrs tp, tpidr_el0
1246 add t0, tp, #:tprel_hi12:imm, lsl #12
1247 add t0, t0, #:tprel_lo12_nc:imm
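/* Editor's note (illustrative, with assumed defaults): compiling

     __thread int counter;
     int get (void) { return counter; }

   with -fpic normally uses the TLS-descriptor sequence above, since
   -mtls-dialect=desc is the default, while -mtls-dialect=trad selects the
   traditional __tls_get_addr call; a non-PIC reference to a variable
   defined in the executable itself can use the Local Exec sequence.  */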
1250 static void
1251 aarch64_load_symref_appropriately (rtx dest, rtx imm,
1252 enum aarch64_symbol_type type)
1254 switch (type)
1256 case SYMBOL_SMALL_ABSOLUTE:
1258 /* In ILP32, the mode of dest can be either SImode or DImode. */
1259 rtx tmp_reg = dest;
1260 machine_mode mode = GET_MODE (dest);
1262 gcc_assert (mode == Pmode || mode == ptr_mode);
1264 if (can_create_pseudo_p ())
1265 tmp_reg = gen_reg_rtx (mode);
1267 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1268 emit_insn (gen_add_losym (dest, tmp_reg, imm));
1269 return;
1272 case SYMBOL_TINY_ABSOLUTE:
1273 emit_insn (gen_rtx_SET (dest, imm));
1274 return;
1276 case SYMBOL_SMALL_GOT_28K:
1278 machine_mode mode = GET_MODE (dest);
1279 rtx gp_rtx = pic_offset_table_rtx;
1280 rtx insn;
1281 rtx mem;
1283 /* NOTE: pic_offset_table_rtx can be NULL_RTX, because we can reach
1284	   here before rtl expansion.  Tree IVOPTs will generate rtl patterns to
1285	   decide rtx costs, in which case pic_offset_table_rtx is not
1286	   initialized.  In that case there is no need to generate the first adrp
1287	   instruction, as the final cost for global variable access is
1288	   one instruction.  */
1289 if (gp_rtx != NULL)
1291	  /* -fpic for -mcmodel=small allows a 32K GOT table size (but since we
1292	     use the page base as the GOT base, the first page may be wasted;
1293	     in the worst case only 28K of space remains for the GOT).
1295	     The instruction sequence generated for accessing a global variable is:
1298	      ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym]
1300	     Only one instruction is needed.  But we must initialize
1301	     pic_offset_table_rtx properly.  We generate an initialization insn for
1302	     every global access, and allow CSE to remove all redundant copies.
1304	     The final instruction sequence will look like the following
1305	     for multiple global variable accesses.
1307 adrp pic_offset_table_rtx, _GLOBAL_OFFSET_TABLE_
1309 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym1]
1310 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym2]
1311 ldr reg, [pic_offset_table_rtx, #:gotpage_lo15:sym3]
1312 ... */
1314 rtx s = gen_rtx_SYMBOL_REF (Pmode, "_GLOBAL_OFFSET_TABLE_");
1315 crtl->uses_pic_offset_table = 1;
1316 emit_move_insn (gp_rtx, gen_rtx_HIGH (Pmode, s));
1318 if (mode != GET_MODE (gp_rtx))
1319 gp_rtx = gen_lowpart (mode, gp_rtx);
1323 if (mode == ptr_mode)
1325 if (mode == DImode)
1326 insn = gen_ldr_got_small_28k_di (dest, gp_rtx, imm);
1327 else
1328 insn = gen_ldr_got_small_28k_si (dest, gp_rtx, imm);
1330 mem = XVECEXP (SET_SRC (insn), 0, 0);
1332 else
1334 gcc_assert (mode == Pmode);
1336 insn = gen_ldr_got_small_28k_sidi (dest, gp_rtx, imm);
1337 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1340	  /* The operand is expected to be a MEM.  Whenever the related insn
1341	     pattern is changed, the above code which calculates MEM should be
1342	     updated.  */
1343 gcc_assert (GET_CODE (mem) == MEM);
1344 MEM_READONLY_P (mem) = 1;
1345 MEM_NOTRAP_P (mem) = 1;
1346 emit_insn (insn);
1347 return;
1350 case SYMBOL_SMALL_GOT_4G:
1352 /* In ILP32, the mode of dest can be either SImode or DImode,
1353 while the got entry is always of SImode size. The mode of
1354 dest depends on how dest is used: if dest is assigned to a
1355 pointer (e.g. in the memory), it has SImode; it may have
1356	   DImode if dest is dereferenced to access the memory.
1357 This is why we have to handle three different ldr_got_small
1358 patterns here (two patterns for ILP32). */
1360 rtx insn;
1361 rtx mem;
1362 rtx tmp_reg = dest;
1363 machine_mode mode = GET_MODE (dest);
1365 if (can_create_pseudo_p ())
1366 tmp_reg = gen_reg_rtx (mode);
1368 emit_move_insn (tmp_reg, gen_rtx_HIGH (mode, imm));
1369 if (mode == ptr_mode)
1371 if (mode == DImode)
1372 insn = gen_ldr_got_small_di (dest, tmp_reg, imm);
1373 else
1374 insn = gen_ldr_got_small_si (dest, tmp_reg, imm);
1376 mem = XVECEXP (SET_SRC (insn), 0, 0);
1378 else
1380 gcc_assert (mode == Pmode);
1382 insn = gen_ldr_got_small_sidi (dest, tmp_reg, imm);
1383 mem = XVECEXP (XEXP (SET_SRC (insn), 0), 0, 0);
1386 gcc_assert (GET_CODE (mem) == MEM);
1387 MEM_READONLY_P (mem) = 1;
1388 MEM_NOTRAP_P (mem) = 1;
1389 emit_insn (insn);
1390 return;
1393 case SYMBOL_SMALL_TLSGD:
1395 rtx_insn *insns;
1396 machine_mode mode = GET_MODE (dest);
1397 rtx result = gen_rtx_REG (mode, R0_REGNUM);
1399 start_sequence ();
1400 if (TARGET_ILP32)
1401 aarch64_emit_call_insn (gen_tlsgd_small_si (result, imm));
1402 else
1403 aarch64_emit_call_insn (gen_tlsgd_small_di (result, imm));
1404 insns = get_insns ();
1405 end_sequence ();
1407 RTL_CONST_CALL_P (insns) = 1;
1408 emit_libcall_block (insns, dest, result, imm);
1409 return;
1412 case SYMBOL_SMALL_TLSDESC:
1414 machine_mode mode = GET_MODE (dest);
1415 rtx x0 = gen_rtx_REG (mode, R0_REGNUM);
1416 rtx tp;
1418 gcc_assert (mode == Pmode || mode == ptr_mode);
1420 /* In ILP32, the got entry is always of SImode size. Unlike
1421 small GOT, the dest is fixed at reg 0. */
1422 if (TARGET_ILP32)
1423 emit_insn (gen_tlsdesc_small_si (imm));
1424 else
1425 emit_insn (gen_tlsdesc_small_di (imm));
1426 tp = aarch64_load_tp (NULL);
1428 if (mode != Pmode)
1429 tp = gen_lowpart (mode, tp);
1431 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, x0)));
1432 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1433 return;
1436 case SYMBOL_SMALL_TLSIE:
1438 /* In ILP32, the mode of dest can be either SImode or DImode,
1439 while the got entry is always of SImode size. The mode of
1440 dest depends on how dest is used: if dest is assigned to a
1441 pointer (e.g. in the memory), it has SImode; it may have
1442	   DImode if dest is dereferenced to access the memory.
1443 This is why we have to handle three different tlsie_small
1444 patterns here (two patterns for ILP32). */
1445 machine_mode mode = GET_MODE (dest);
1446 rtx tmp_reg = gen_reg_rtx (mode);
1447 rtx tp = aarch64_load_tp (NULL);
1449 if (mode == ptr_mode)
1451 if (mode == DImode)
1452 emit_insn (gen_tlsie_small_di (tmp_reg, imm));
1453 else
1455 emit_insn (gen_tlsie_small_si (tmp_reg, imm));
1456 tp = gen_lowpart (mode, tp);
1459 else
1461 gcc_assert (mode == Pmode);
1462 emit_insn (gen_tlsie_small_sidi (tmp_reg, imm));
1465 emit_insn (gen_rtx_SET (dest, gen_rtx_PLUS (mode, tp, tmp_reg)));
1466 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1467 return;
1470 case SYMBOL_TLSLE12:
1471 case SYMBOL_TLSLE24:
1472 case SYMBOL_TLSLE32:
1473 case SYMBOL_TLSLE48:
1475 machine_mode mode = GET_MODE (dest);
1476 rtx tp = aarch64_load_tp (NULL);
1478 if (mode != Pmode)
1479 tp = gen_lowpart (mode, tp);
1481 switch (type)
1483 case SYMBOL_TLSLE12:
1484 emit_insn ((mode == DImode ? gen_tlsle12_di : gen_tlsle12_si)
1485 (dest, tp, imm));
1486 break;
1487 case SYMBOL_TLSLE24:
1488 emit_insn ((mode == DImode ? gen_tlsle24_di : gen_tlsle24_si)
1489 (dest, tp, imm));
1490 break;
1491 case SYMBOL_TLSLE32:
1492 emit_insn ((mode == DImode ? gen_tlsle32_di : gen_tlsle32_si)
1493 (dest, imm));
1494 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1495 (dest, dest, tp));
1496 break;
1497 case SYMBOL_TLSLE48:
1498 emit_insn ((mode == DImode ? gen_tlsle48_di : gen_tlsle48_si)
1499 (dest, imm));
1500 emit_insn ((mode == DImode ? gen_adddi3 : gen_addsi3)
1501 (dest, dest, tp));
1502 break;
1503 default:
1504 gcc_unreachable ();
1507 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1508 return;
1511 case SYMBOL_TINY_GOT:
1512 emit_insn (gen_ldr_got_tiny (dest, imm));
1513 return;
1515 case SYMBOL_TINY_TLSIE:
1517 machine_mode mode = GET_MODE (dest);
1518 rtx tp = aarch64_load_tp (NULL);
1520 if (mode == ptr_mode)
1522 if (mode == DImode)
1523 emit_insn (gen_tlsie_tiny_di (dest, imm, tp));
1524 else
1526 tp = gen_lowpart (mode, tp);
1527 emit_insn (gen_tlsie_tiny_si (dest, imm, tp));
1530 else
1532 gcc_assert (mode == Pmode);
1533 emit_insn (gen_tlsie_tiny_sidi (dest, imm, tp));
1536 set_unique_reg_note (get_last_insn (), REG_EQUIV, imm);
1537 return;
1540 default:
1541 gcc_unreachable ();
1545 /* Emit a move from SRC to DEST. Assume that the move expanders can
1546 handle all moves if !can_create_pseudo_p (). The distinction is
1547 important because, unlike emit_move_insn, the move expanders know
1548 how to force Pmode objects into the constant pool even when the
1549 constant pool address is not itself legitimate. */
1550 static rtx
1551 aarch64_emit_move (rtx dest, rtx src)
1553 return (can_create_pseudo_p ()
1554 ? emit_move_insn (dest, src)
1555 : emit_move_insn_1 (dest, src));
1558 /* Split a 128-bit move operation into two 64-bit move operations,
1559 taking care to handle partial overlap of register to register
1560 copies. Special cases are needed when moving between GP regs and
1561 FP regs. SRC can be a register, constant or memory; DST a register
1562 or memory. If either operand is memory it must not have any side
1563 effects. */
1564 void
1565 aarch64_split_128bit_move (rtx dst, rtx src)
1567 rtx dst_lo, dst_hi;
1568 rtx src_lo, src_hi;
1570 machine_mode mode = GET_MODE (dst);
1572 gcc_assert (mode == TImode || mode == TFmode);
1573 gcc_assert (!(side_effects_p (src) || side_effects_p (dst)));
1574 gcc_assert (mode == GET_MODE (src) || GET_MODE (src) == VOIDmode);
1576 if (REG_P (dst) && REG_P (src))
1578 int src_regno = REGNO (src);
1579 int dst_regno = REGNO (dst);
1581 /* Handle FP <-> GP regs. */
1582 if (FP_REGNUM_P (dst_regno) && GP_REGNUM_P (src_regno))
1584 src_lo = gen_lowpart (word_mode, src);
1585 src_hi = gen_highpart (word_mode, src);
1587 if (mode == TImode)
1589 emit_insn (gen_aarch64_movtilow_di (dst, src_lo));
1590 emit_insn (gen_aarch64_movtihigh_di (dst, src_hi));
1592 else
1594 emit_insn (gen_aarch64_movtflow_di (dst, src_lo));
1595 emit_insn (gen_aarch64_movtfhigh_di (dst, src_hi));
1597 return;
1599 else if (GP_REGNUM_P (dst_regno) && FP_REGNUM_P (src_regno))
1601 dst_lo = gen_lowpart (word_mode, dst);
1602 dst_hi = gen_highpart (word_mode, dst);
1604 if (mode == TImode)
1606 emit_insn (gen_aarch64_movdi_tilow (dst_lo, src));
1607 emit_insn (gen_aarch64_movdi_tihigh (dst_hi, src));
1609 else
1611 emit_insn (gen_aarch64_movdi_tflow (dst_lo, src));
1612 emit_insn (gen_aarch64_movdi_tfhigh (dst_hi, src));
1614 return;
1618 dst_lo = gen_lowpart (word_mode, dst);
1619 dst_hi = gen_highpart (word_mode, dst);
1620 src_lo = gen_lowpart (word_mode, src);
1621 src_hi = gen_highpart_mode (word_mode, mode, src);
1623 /* At most one pairing may overlap. */
1624 if (reg_overlap_mentioned_p (dst_lo, src_hi))
1626 aarch64_emit_move (dst_hi, src_hi);
1627 aarch64_emit_move (dst_lo, src_lo);
1629 else
1631 aarch64_emit_move (dst_lo, src_lo);
1632 aarch64_emit_move (dst_hi, src_hi);
1636 bool
1637 aarch64_split_128bit_move_p (rtx dst, rtx src)
1639 return (! REG_P (src)
1640 || ! (FP_REGNUM_P (REGNO (dst)) && FP_REGNUM_P (REGNO (src))));
1643 /* Split a complex SIMD combine. */
1645 void
1646 aarch64_split_simd_combine (rtx dst, rtx src1, rtx src2)
1648 machine_mode src_mode = GET_MODE (src1);
1649 machine_mode dst_mode = GET_MODE (dst);
1651 gcc_assert (VECTOR_MODE_P (dst_mode));
1653 if (REG_P (dst) && REG_P (src1) && REG_P (src2))
1655 rtx (*gen) (rtx, rtx, rtx);
1657 switch (src_mode)
1659 case V8QImode:
1660 gen = gen_aarch64_simd_combinev8qi;
1661 break;
1662 case V4HImode:
1663 gen = gen_aarch64_simd_combinev4hi;
1664 break;
1665 case V2SImode:
1666 gen = gen_aarch64_simd_combinev2si;
1667 break;
1668 case V4HFmode:
1669 gen = gen_aarch64_simd_combinev4hf;
1670 break;
1671 case V2SFmode:
1672 gen = gen_aarch64_simd_combinev2sf;
1673 break;
1674 case DImode:
1675 gen = gen_aarch64_simd_combinedi;
1676 break;
1677 case DFmode:
1678 gen = gen_aarch64_simd_combinedf;
1679 break;
1680 default:
1681 gcc_unreachable ();
1684 emit_insn (gen (dst, src1, src2));
1685 return;
1689 /* Split a complex SIMD move. */
1691 void
1692 aarch64_split_simd_move (rtx dst, rtx src)
1694 machine_mode src_mode = GET_MODE (src);
1695 machine_mode dst_mode = GET_MODE (dst);
1697 gcc_assert (VECTOR_MODE_P (dst_mode));
1699 if (REG_P (dst) && REG_P (src))
1701 rtx (*gen) (rtx, rtx);
1703 gcc_assert (VECTOR_MODE_P (src_mode));
1705 switch (src_mode)
1707 case V16QImode:
1708 gen = gen_aarch64_split_simd_movv16qi;
1709 break;
1710 case V8HImode:
1711 gen = gen_aarch64_split_simd_movv8hi;
1712 break;
1713 case V4SImode:
1714 gen = gen_aarch64_split_simd_movv4si;
1715 break;
1716 case V2DImode:
1717 gen = gen_aarch64_split_simd_movv2di;
1718 break;
1719 case V8HFmode:
1720 gen = gen_aarch64_split_simd_movv8hf;
1721 break;
1722 case V4SFmode:
1723 gen = gen_aarch64_split_simd_movv4sf;
1724 break;
1725 case V2DFmode:
1726 gen = gen_aarch64_split_simd_movv2df;
1727 break;
1728 default:
1729 gcc_unreachable ();
1732 emit_insn (gen (dst, src));
1733 return;
1737 bool
1738 aarch64_zero_extend_const_eq (machine_mode xmode, rtx x,
1739 machine_mode ymode, rtx y)
1741 rtx r = simplify_const_unary_operation (ZERO_EXTEND, xmode, y, ymode);
1742 gcc_assert (r != NULL);
1743 return rtx_equal_p (x, r);
1747 static rtx
1748 aarch64_force_temporary (machine_mode mode, rtx x, rtx value)
1750 if (can_create_pseudo_p ())
1751 return force_reg (mode, value);
1752 else
1754 x = aarch64_emit_move (x, value);
1755 return x;
1760 static rtx
1761 aarch64_add_offset (machine_mode mode, rtx temp, rtx reg, HOST_WIDE_INT offset)
1763 if (!aarch64_plus_immediate (GEN_INT (offset), mode))
1765 rtx high;
1766 /* Load the full offset into a register. This
1767 might be improvable in the future. */
1768 high = GEN_INT (offset);
1769 offset = 0;
1770 high = aarch64_force_temporary (mode, temp, high);
1771 reg = aarch64_force_temporary (mode, temp,
1772 gen_rtx_PLUS (mode, high, reg));
1774 return plus_constant (mode, reg, offset);
1777 static int
1778 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
1779 machine_mode mode)
1781 int i;
1782 unsigned HOST_WIDE_INT val, val2, mask;
1783 int one_match, zero_match;
1784 int num_insns;
1786 val = INTVAL (imm);
1788 if (aarch64_move_imm (val, mode))
1790 if (generate)
1791 emit_insn (gen_rtx_SET (dest, imm));
1792 return 1;
1795 if ((val >> 32) == 0 || mode == SImode)
1797 if (generate)
1799 emit_insn (gen_rtx_SET (dest, GEN_INT (val & 0xffff)));
1800 if (mode == SImode)
1801 emit_insn (gen_insv_immsi (dest, GEN_INT (16),
1802 GEN_INT ((val >> 16) & 0xffff)));
1803 else
1804 emit_insn (gen_insv_immdi (dest, GEN_INT (16),
1805 GEN_INT ((val >> 16) & 0xffff)));
1807 return 2;
1810 /* Remaining cases are all for DImode. */
1812 mask = 0xffff;
1813 zero_match = ((val & mask) == 0) + ((val & (mask << 16)) == 0) +
1814 ((val & (mask << 32)) == 0) + ((val & (mask << 48)) == 0);
1815 one_match = ((~val & mask) == 0) + ((~val & (mask << 16)) == 0) +
1816 ((~val & (mask << 32)) == 0) + ((~val & (mask << 48)) == 0);
1818 if (zero_match != 2 && one_match != 2)
1820 /* Try emitting a bitmask immediate with a movk replacing 16 bits.
1821 For a 64-bit bitmask try whether changing 16 bits to all ones or
1822 zeroes creates a valid bitmask. To check any repeated bitmask,
1823 try using 16 bits from the other 32-bit half of val. */
1825 for (i = 0; i < 64; i += 16, mask <<= 16)
1827 val2 = val & ~mask;
1828 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1829 break;
1830 val2 = val | mask;
1831 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1832 break;
1833 val2 = val2 & ~mask;
1834 val2 = val2 | (((val2 >> 32) | (val2 << 32)) & mask);
1835 if (val2 != val && aarch64_bitmask_imm (val2, mode))
1836 break;
1838 if (i != 64)
1840 if (generate)
1842 emit_insn (gen_rtx_SET (dest, GEN_INT (val2)));
1843 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1844 GEN_INT ((val >> i) & 0xffff)));
1846 return 2;
1850 /* Generate 2-4 instructions, skipping 16 bits of all zeroes or ones which
1851 are emitted by the initial mov. If one_match > zero_match, skip set bits,
1852 otherwise skip zero bits. */
1854 num_insns = 1;
1855 mask = 0xffff;
1856 val2 = one_match > zero_match ? ~val : val;
1857 i = (val2 & mask) != 0 ? 0 : (val2 & (mask << 16)) != 0 ? 16 : 32;
1859 if (generate)
1860 emit_insn (gen_rtx_SET (dest, GEN_INT (one_match > zero_match
1861 ? (val | ~(mask << i))
1862 : (val & (mask << i)))));
1863 for (i += 16; i < 64; i += 16)
1865 if ((val2 & (mask << i)) == 0)
1866 continue;
1867 if (generate)
1868 emit_insn (gen_insv_immdi (dest, GEN_INT (i),
1869 GEN_INT ((val >> i) & 0xffff)));
1870 num_insns ++;
1873 return num_insns;
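/* Worked example (editor's illustration): for the DImode constant
   0x1234000000005678 neither a single MOV nor a bitmask immediate matches,
   and two of the four 16-bit chunks are zero, so the code above emits
   roughly

	mov	x0, 0x5678
	movk	x0, 0x1234, lsl 48

   and returns 2, skipping the all-zero chunks.  The destination register
   x0 is assumed here for illustration.  */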
1877 void
1878 aarch64_expand_mov_immediate (rtx dest, rtx imm)
1880 machine_mode mode = GET_MODE (dest);
1882 gcc_assert (mode == SImode || mode == DImode);
1884 /* Check on what type of symbol it is. */
1885 if (GET_CODE (imm) == SYMBOL_REF
1886 || GET_CODE (imm) == LABEL_REF
1887 || GET_CODE (imm) == CONST)
1889 rtx mem, base, offset;
1890 enum aarch64_symbol_type sty;
1892 /* If we have (const (plus symbol offset)), separate out the offset
1893 before we start classifying the symbol. */
1894 split_const (imm, &base, &offset);
1896 sty = aarch64_classify_symbol (base, offset);
1897 switch (sty)
1899 case SYMBOL_FORCE_TO_MEM:
1900 if (offset != const0_rtx
1901 && targetm.cannot_force_const_mem (mode, imm))
1903 gcc_assert (can_create_pseudo_p ());
1904 base = aarch64_force_temporary (mode, dest, base);
1905 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1906 aarch64_emit_move (dest, base);
1907 return;
1910 mem = force_const_mem (ptr_mode, imm);
1911 gcc_assert (mem);
1913 /* If we aren't generating PC relative literals, then
1914 we need to expand the literal pool access carefully.
1915 This is something that needs to be done in a number
1916 of places, so could well live as a separate function. */
1917 if (!aarch64_pcrelative_literal_loads)
1919 gcc_assert (can_create_pseudo_p ());
1920 base = gen_reg_rtx (ptr_mode);
1921 aarch64_expand_mov_immediate (base, XEXP (mem, 0));
1922 mem = gen_rtx_MEM (ptr_mode, base);
1925 if (mode != ptr_mode)
1926 mem = gen_rtx_ZERO_EXTEND (mode, mem);
1928 emit_insn (gen_rtx_SET (dest, mem));
1930 return;
1932 case SYMBOL_SMALL_TLSGD:
1933 case SYMBOL_SMALL_TLSDESC:
1934 case SYMBOL_SMALL_TLSIE:
1935 case SYMBOL_SMALL_GOT_28K:
1936 case SYMBOL_SMALL_GOT_4G:
1937 case SYMBOL_TINY_GOT:
1938 case SYMBOL_TINY_TLSIE:
1939 if (offset != const0_rtx)
1941 gcc_assert(can_create_pseudo_p ());
1942 base = aarch64_force_temporary (mode, dest, base);
1943 base = aarch64_add_offset (mode, NULL, base, INTVAL (offset));
1944 aarch64_emit_move (dest, base);
1945 return;
1947 /* FALLTHRU */
1949 case SYMBOL_SMALL_ABSOLUTE:
1950 case SYMBOL_TINY_ABSOLUTE:
1951 case SYMBOL_TLSLE12:
1952 case SYMBOL_TLSLE24:
1953 case SYMBOL_TLSLE32:
1954 case SYMBOL_TLSLE48:
1955 aarch64_load_symref_appropriately (dest, imm, sty);
1956 return;
1958 default:
1959 gcc_unreachable ();
1963 if (!CONST_INT_P (imm))
1965 if (GET_CODE (imm) == HIGH)
1966 emit_insn (gen_rtx_SET (dest, imm));
1967 else
1969 rtx mem = force_const_mem (mode, imm);
1970 gcc_assert (mem);
1971 emit_insn (gen_rtx_SET (dest, mem));
1974 return;
1977 aarch64_internal_mov_immediate (dest, imm, true, GET_MODE (dest));
1980 /* Add DELTA to REGNUM in mode MODE. SCRATCHREG can be used to hold a
1981 temporary value if necessary. FRAME_RELATED_P should be true if
1982 the RTX_FRAME_RELATED flag should be set and CFA adjustments added
1983 to the generated instructions. If SCRATCHREG is known to hold
1984 abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
1985 immediate again.
1987 Since this function may be used to adjust the stack pointer, we must
1988 ensure that it cannot cause transient stack deallocation (for example
1989 by first incrementing SP and then decrementing when adjusting by a
1990 large immediate). */
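/* Example (added for illustration): a downward stack adjustment of 0x12345
   bytes is neither a valid 12-bit (optionally shifted) ADD/SUB immediate
   nor a 16-bit MOV immediate, so the code below splits it into two
   subtractions, roughly

	sub	sp, sp, #0x345
	sub	sp, sp, #0x12000

   each of which is a legal immediate; since both steps move SP in the same
   direction, no live stack is transiently deallocated.  */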
1992 static void
1993 aarch64_add_constant_internal (machine_mode mode, int regnum, int scratchreg,
1994 HOST_WIDE_INT delta, bool frame_related_p,
1995 bool emit_move_imm)
1997 HOST_WIDE_INT mdelta = abs_hwi (delta);
1998 rtx this_rtx = gen_rtx_REG (mode, regnum);
1999 rtx_insn *insn;
2001 if (!mdelta)
2002 return;
2004 /* Single instruction adjustment. */
2005 if (aarch64_uimm12_shift (mdelta))
2007 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
2008 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2009 return;
2012 /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
2013	     Only do this if mdelta is not a 16-bit move immediate, as adjusting
2014	     with a move is better in that case.  */
2015 if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
2017 HOST_WIDE_INT low_off = mdelta & 0xfff;
2019 low_off = delta < 0 ? -low_off : low_off;
2020 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
2021 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2022 insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
2023 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2024 return;
2027 /* Emit a move immediate if required and an addition/subtraction. */
2028 rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
2029 if (emit_move_imm)
2030 aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
2031 insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
2032 : gen_add2_insn (this_rtx, scratch_rtx));
2033 if (frame_related_p)
2035 RTX_FRAME_RELATED_P (insn) = frame_related_p;
2036 rtx adj = plus_constant (mode, this_rtx, delta);
2037 add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
2041 static inline void
2042 aarch64_add_constant (machine_mode mode, int regnum, int scratchreg,
2043 HOST_WIDE_INT delta)
2045 aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
2048 static inline void
2049 aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
2051 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
2052 true, emit_move_imm);
2055 static inline void
2056 aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
2058 aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
2059 frame_related_p, true);
2062 static bool
2063 aarch64_function_ok_for_sibcall (tree decl ATTRIBUTE_UNUSED,
2064 tree exp ATTRIBUTE_UNUSED)
2066 /* Currently, always true. */
2067 return true;
2070 /* Implement TARGET_PASS_BY_REFERENCE. */
2072 static bool
2073 aarch64_pass_by_reference (cumulative_args_t pcum ATTRIBUTE_UNUSED,
2074 machine_mode mode,
2075 const_tree type,
2076 bool named ATTRIBUTE_UNUSED)
2078 HOST_WIDE_INT size;
2079 machine_mode dummymode;
2080 int nregs;
2082 /* GET_MODE_SIZE (BLKmode) is useless since it is 0. */
2083 size = (mode == BLKmode && type)
2084 ? int_size_in_bytes (type) : (int) GET_MODE_SIZE (mode);
2086 /* Aggregates are passed by reference based on their size. */
2087 if (type && AGGREGATE_TYPE_P (type))
2089 size = int_size_in_bytes (type);
2092 /* Variable sized arguments are always passed by reference. */
2093 if (size < 0)
2094 return true;
2096 /* Can this be a candidate to be passed in fp/simd register(s)? */
2097 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2098 &dummymode, &nregs,
2099 NULL))
2100 return false;
2102 /* Arguments which are variable sized or larger than 2 registers are
2103 passed by reference unless they are a homogeneous floating-point
2104 aggregate. */
2105 return size > 2 * UNITS_PER_WORD;
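/* For example (illustrative only): a struct of two 64-bit integers (16 bytes)
   is passed in a register pair, a struct of three 64-bit integers (24 bytes)
   is passed by reference, while a struct of four doubles (32 bytes) is an HFA
   and is therefore still passed in SIMD/FP registers rather than by
   reference. */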
2108 /* Return TRUE if VALTYPE is padded to its least significant bits. */
2109 static bool
2110 aarch64_return_in_msb (const_tree valtype)
2112 machine_mode dummy_mode;
2113 int dummy_int;
2115 /* Never happens in little-endian mode. */
2116 if (!BYTES_BIG_ENDIAN)
2117 return false;
2119 /* Only composite types smaller than or equal to 16 bytes can
2120 be potentially returned in registers. */
2121 if (!aarch64_composite_type_p (valtype, TYPE_MODE (valtype))
2122 || int_size_in_bytes (valtype) <= 0
2123 || int_size_in_bytes (valtype) > 16)
2124 return false;
2126 /* But not a composite that is an HFA (Homogeneous Floating-point Aggregate)
2127 or an HVA (Homogeneous Short-Vector Aggregate); such a special composite
2128 is always passed/returned in the least significant bits of fp/simd
2129 register(s). */
2130 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (valtype), valtype,
2131 &dummy_mode, &dummy_int, NULL))
2132 return false;
2134 return true;
2137 /* Implement TARGET_FUNCTION_VALUE.
2138 Define how to find the value returned by a function. */
2140 static rtx
2141 aarch64_function_value (const_tree type, const_tree func,
2142 bool outgoing ATTRIBUTE_UNUSED)
2144 machine_mode mode;
2145 int unsignedp;
2146 int count;
2147 machine_mode ag_mode;
2149 mode = TYPE_MODE (type);
2150 if (INTEGRAL_TYPE_P (type))
2151 mode = promote_function_mode (type, mode, &unsignedp, func, 1);
2153 if (aarch64_return_in_msb (type))
2155 HOST_WIDE_INT size = int_size_in_bytes (type);
2157 if (size % UNITS_PER_WORD != 0)
2159 size += UNITS_PER_WORD - size % UNITS_PER_WORD;
2160 mode = mode_for_size (size * BITS_PER_UNIT, MODE_INT, 0);
2164 if (aarch64_vfp_is_call_or_return_candidate (mode, type,
2165 &ag_mode, &count, NULL))
2167 if (!aarch64_composite_type_p (type, mode))
2169 gcc_assert (count == 1 && mode == ag_mode);
2170 return gen_rtx_REG (mode, V0_REGNUM);
2172 else
2174 int i;
2175 rtx par;
2177 par = gen_rtx_PARALLEL (mode, rtvec_alloc (count));
2178 for (i = 0; i < count; i++)
2180 rtx tmp = gen_rtx_REG (ag_mode, V0_REGNUM + i);
2181 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2182 GEN_INT (i * GET_MODE_SIZE (ag_mode)));
2183 XVECEXP (par, 0, i) = tmp;
2185 return par;
2188 else
2189 return gen_rtx_REG (mode, R0_REGNUM);
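/* For example (illustrative only): a function returning
   struct { double x, y; }, an HFA of two doubles, yields a PARALLEL of V0
   (at offset 0) and V1 (at offset 8), whereas a plain int is returned in
   W0/X0 via R0_REGNUM. */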
2192 /* Implements TARGET_FUNCTION_VALUE_REGNO_P.
2193 Return true if REGNO is the number of a hard register in which the values
2194 of called function may come back. */
2196 static bool
2197 aarch64_function_value_regno_p (const unsigned int regno)
2199 /* Maximum of 16 bytes can be returned in the general registers. Examples
2200 of 16-byte return values are: 128-bit integers and 16-byte small
2201 structures (excluding homogeneous floating-point aggregates). */
2202 if (regno == R0_REGNUM || regno == R1_REGNUM)
2203 return true;
2205 /* Up to four fp/simd registers can return a function value, e.g. a
2206 homogeneous floating-point aggregate having four members. */
2207 if (regno >= V0_REGNUM && regno < V0_REGNUM + HA_MAX_NUM_FLDS)
2208 return TARGET_FLOAT;
2210 return false;
2213 /* Implement TARGET_RETURN_IN_MEMORY.
2215 If the type T of the result of a function is such that
2216 void func (T arg)
2217 would require that arg be passed as a value in a register (or set of
2218 registers) according to the parameter passing rules, then the result
2219 is returned in the same registers as would be used for such an
2220 argument. */
2222 static bool
2223 aarch64_return_in_memory (const_tree type, const_tree fndecl ATTRIBUTE_UNUSED)
2225 HOST_WIDE_INT size;
2226 machine_mode ag_mode;
2227 int count;
2229 if (!AGGREGATE_TYPE_P (type)
2230 && TREE_CODE (type) != COMPLEX_TYPE
2231 && TREE_CODE (type) != VECTOR_TYPE)
2232 /* Simple scalar types are always returned in registers. */
2233 return false;
2235 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type),
2236 type,
2237 &ag_mode,
2238 &count,
2239 NULL))
2240 return false;
2242 /* Types larger than 2 registers are returned in memory. */
2243 size = int_size_in_bytes (type);
2244 return (size < 0 || size > 2 * UNITS_PER_WORD);
2247 static bool
2248 aarch64_vfp_is_call_candidate (cumulative_args_t pcum_v, machine_mode mode,
2249 const_tree type, int *nregs)
2251 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2252 return aarch64_vfp_is_call_or_return_candidate (mode,
2253 type,
2254 &pcum->aapcs_vfp_rmode,
2255 nregs,
2256 NULL);
2259 /* Given MODE and TYPE of a function argument, return the alignment in
2260 bits. The idea is to suppress any stronger alignment requested by
2261 the user and opt for the natural alignment (specified in AAPCS64 \S 4.1).
2262 This is a helper function for local use only. */
2264 static unsigned int
2265 aarch64_function_arg_alignment (machine_mode mode, const_tree type)
2267 if (!type)
2268 return GET_MODE_ALIGNMENT (mode);
2269 if (integer_zerop (TYPE_SIZE (type)))
2270 return 0;
2272 gcc_assert (TYPE_MODE (type) == mode);
2274 if (!AGGREGATE_TYPE_P (type))
2275 return TYPE_ALIGN (TYPE_MAIN_VARIANT (type));
2277 if (TREE_CODE (type) == ARRAY_TYPE)
2278 return TYPE_ALIGN (TREE_TYPE (type));
2280 unsigned int alignment = 0;
2282 for (tree field = TYPE_FIELDS (type); field; field = DECL_CHAIN (field))
2283 alignment = std::max (alignment, DECL_ALIGN (field));
2285 return alignment;
2288 /* Layout a function argument according to the AAPCS64 rules. The rule
2289 numbers refer to the rule numbers in the AAPCS64. */
2291 static void
2292 aarch64_layout_arg (cumulative_args_t pcum_v, machine_mode mode,
2293 const_tree type,
2294 bool named ATTRIBUTE_UNUSED)
2296 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2297 int ncrn, nvrn, nregs;
2298 bool allocate_ncrn, allocate_nvrn;
2299 HOST_WIDE_INT size;
2301 /* We need to do this once per argument. */
2302 if (pcum->aapcs_arg_processed)
2303 return;
2305 pcum->aapcs_arg_processed = true;
2307 /* Size in bytes, rounded up to the nearest multiple of 8 bytes. */
2308 size
2309 = ROUND_UP (type ? int_size_in_bytes (type) : GET_MODE_SIZE (mode),
2310 UNITS_PER_WORD);
2312 allocate_ncrn = (type) ? !(FLOAT_TYPE_P (type)) : !FLOAT_MODE_P (mode);
2313 allocate_nvrn = aarch64_vfp_is_call_candidate (pcum_v,
2314 mode,
2315 type,
2316 &nregs);
2318 /* allocate_ncrn may be a false positive, but allocate_nvrn is quite reliable.
2319 The following code thus handles passing by SIMD/FP registers first. */
2321 nvrn = pcum->aapcs_nvrn;
2323 /* C1 - C5 for floating point, homogeneous floating-point aggregates (HFA)
2324 and homogeneous short-vector aggregates (HVA). */
2325 if (allocate_nvrn)
2327 if (!TARGET_FLOAT)
2328 aarch64_err_no_fpadvsimd (mode, "argument");
2330 if (nvrn + nregs <= NUM_FP_ARG_REGS)
2332 pcum->aapcs_nextnvrn = nvrn + nregs;
2333 if (!aarch64_composite_type_p (type, mode))
2335 gcc_assert (nregs == 1);
2336 pcum->aapcs_reg = gen_rtx_REG (mode, V0_REGNUM + nvrn);
2338 else
2340 rtx par;
2341 int i;
2342 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2343 for (i = 0; i < nregs; i++)
2345 rtx tmp = gen_rtx_REG (pcum->aapcs_vfp_rmode,
2346 V0_REGNUM + nvrn + i);
2347 tmp = gen_rtx_EXPR_LIST
2348 (VOIDmode, tmp,
2349 GEN_INT (i * GET_MODE_SIZE (pcum->aapcs_vfp_rmode)));
2350 XVECEXP (par, 0, i) = tmp;
2352 pcum->aapcs_reg = par;
2354 return;
2356 else
2358 /* C.3 NSRN is set to 8. */
2359 pcum->aapcs_nextnvrn = NUM_FP_ARG_REGS;
2360 goto on_stack;
2364 ncrn = pcum->aapcs_ncrn;
2365 nregs = size / UNITS_PER_WORD;
2367 /* C6 - C9, though the sign and zero extension semantics are
2368 handled elsewhere. This is the case where the argument fits
2369 entirely in general registers. */
2370 if (allocate_ncrn && (ncrn + nregs <= NUM_ARG_REGS))
2372 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2374 gcc_assert (nregs == 0 || nregs == 1 || nregs == 2);
2376 /* C.8: if the argument has an alignment of 16 bytes then the NGRN is
2377 rounded up to the next even number. */
2378 if (nregs == 2 && alignment == 16 * BITS_PER_UNIT && ncrn % 2)
2380 ++ncrn;
2381 gcc_assert (ncrn + nregs <= NUM_ARG_REGS);
2383 /* NREGS can be 0 when e.g. an empty structure is to be passed.
2384 A reg is still generated for it, but the caller should be smart
2385 enough not to use it. */
2386 if (nregs == 0 || nregs == 1 || GET_MODE_CLASS (mode) == MODE_INT)
2388 pcum->aapcs_reg = gen_rtx_REG (mode, R0_REGNUM + ncrn);
2390 else
2392 rtx par;
2393 int i;
2395 par = gen_rtx_PARALLEL (mode, rtvec_alloc (nregs));
2396 for (i = 0; i < nregs; i++)
2398 rtx tmp = gen_rtx_REG (word_mode, R0_REGNUM + ncrn + i);
2399 tmp = gen_rtx_EXPR_LIST (VOIDmode, tmp,
2400 GEN_INT (i * UNITS_PER_WORD));
2401 XVECEXP (par, 0, i) = tmp;
2403 pcum->aapcs_reg = par;
2406 pcum->aapcs_nextncrn = ncrn + nregs;
2407 return;
2410 /* C.11 */
2411 pcum->aapcs_nextncrn = NUM_ARG_REGS;
2413 /* The argument is passed on the stack; record the needed number of words for
2414 this argument and align the total size if necessary. */
2415 on_stack:
2416 pcum->aapcs_stack_words = size / UNITS_PER_WORD;
2417 if (aarch64_function_arg_alignment (mode, type) == 16 * BITS_PER_UNIT)
2418 pcum->aapcs_stack_size = ROUND_UP (pcum->aapcs_stack_size,
2419 16 / UNITS_PER_WORD);
2420 return;
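/* A worked example of rule C.8 (illustrative only): for a call such as
   f (int a, __int128 b), A is allocated to W0, leaving NGRN = 1; B needs two
   registers and has 16-byte alignment, so NGRN is rounded up to 2 and B is
   passed in the even/odd pair X2:X3, leaving X1 unused. */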
2423 /* Implement TARGET_FUNCTION_ARG. */
2425 static rtx
2426 aarch64_function_arg (cumulative_args_t pcum_v, machine_mode mode,
2427 const_tree type, bool named)
2429 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2430 gcc_assert (pcum->pcs_variant == ARM_PCS_AAPCS64);
2432 if (mode == VOIDmode)
2433 return NULL_RTX;
2435 aarch64_layout_arg (pcum_v, mode, type, named);
2436 return pcum->aapcs_reg;
2439 void
2440 aarch64_init_cumulative_args (CUMULATIVE_ARGS *pcum,
2441 const_tree fntype ATTRIBUTE_UNUSED,
2442 rtx libname ATTRIBUTE_UNUSED,
2443 const_tree fndecl ATTRIBUTE_UNUSED,
2444 unsigned n_named ATTRIBUTE_UNUSED)
2446 pcum->aapcs_ncrn = 0;
2447 pcum->aapcs_nvrn = 0;
2448 pcum->aapcs_nextncrn = 0;
2449 pcum->aapcs_nextnvrn = 0;
2450 pcum->pcs_variant = ARM_PCS_AAPCS64;
2451 pcum->aapcs_reg = NULL_RTX;
2452 pcum->aapcs_arg_processed = false;
2453 pcum->aapcs_stack_words = 0;
2454 pcum->aapcs_stack_size = 0;
2456 if (!TARGET_FLOAT
2457 && fndecl && TREE_PUBLIC (fndecl)
2458 && fntype && fntype != error_mark_node)
2460 const_tree type = TREE_TYPE (fntype);
2461 machine_mode mode ATTRIBUTE_UNUSED; /* To pass pointer as argument. */
2462 int nregs ATTRIBUTE_UNUSED; /* Likewise. */
2463 if (aarch64_vfp_is_call_or_return_candidate (TYPE_MODE (type), type,
2464 &mode, &nregs, NULL))
2465 aarch64_err_no_fpadvsimd (TYPE_MODE (type), "return type");
2467 return;
2470 static void
2471 aarch64_function_arg_advance (cumulative_args_t pcum_v,
2472 machine_mode mode,
2473 const_tree type,
2474 bool named)
2476 CUMULATIVE_ARGS *pcum = get_cumulative_args (pcum_v);
2477 if (pcum->pcs_variant == ARM_PCS_AAPCS64)
2479 aarch64_layout_arg (pcum_v, mode, type, named);
2480 gcc_assert ((pcum->aapcs_reg != NULL_RTX)
2481 != (pcum->aapcs_stack_words != 0));
2482 pcum->aapcs_arg_processed = false;
2483 pcum->aapcs_ncrn = pcum->aapcs_nextncrn;
2484 pcum->aapcs_nvrn = pcum->aapcs_nextnvrn;
2485 pcum->aapcs_stack_size += pcum->aapcs_stack_words;
2486 pcum->aapcs_stack_words = 0;
2487 pcum->aapcs_reg = NULL_RTX;
2491 bool
2492 aarch64_function_arg_regno_p (unsigned regno)
2494 return ((GP_REGNUM_P (regno) && regno < R0_REGNUM + NUM_ARG_REGS)
2495 || (FP_REGNUM_P (regno) && regno < V0_REGNUM + NUM_FP_ARG_REGS));
2498 /* Implement FUNCTION_ARG_BOUNDARY. Every parameter gets at least
2499 PARM_BOUNDARY bits of alignment, but will be given anything up
2500 to STACK_BOUNDARY bits if the type requires it. This makes sure
2501 that both before and after the layout of each argument, the Next
2502 Stacked Argument Address (NSAA) will have a minimum alignment of
2503 8 bytes. */
2505 static unsigned int
2506 aarch64_function_arg_boundary (machine_mode mode, const_tree type)
2508 unsigned int alignment = aarch64_function_arg_alignment (mode, type);
2510 if (alignment < PARM_BOUNDARY)
2511 alignment = PARM_BOUNDARY;
2512 if (alignment > STACK_BOUNDARY)
2513 alignment = STACK_BOUNDARY;
2514 return alignment;
2517 /* For use by FUNCTION_ARG_PADDING (MODE, TYPE).
2519 Return true if an argument passed on the stack should be padded upwards,
2520 i.e. if the least-significant byte of the stack slot has useful data.
2522 Small aggregate types are placed in the lowest memory address.
2524 The related parameter passing rules are B.4, C.3, C.5 and C.14. */
2526 bool
2527 aarch64_pad_arg_upward (machine_mode mode, const_tree type)
2529 /* On little-endian targets, the least significant byte of every stack
2530 argument is passed at the lowest byte address of the stack slot. */
2531 if (!BYTES_BIG_ENDIAN)
2532 return true;
2534 /* Otherwise, integral, floating-point and pointer types are padded downward:
2535 the least significant byte of a stack argument is passed at the highest
2536 byte address of the stack slot. */
2537 if (type
2538 ? (INTEGRAL_TYPE_P (type) || SCALAR_FLOAT_TYPE_P (type)
2539 || POINTER_TYPE_P (type))
2540 : (SCALAR_INT_MODE_P (mode) || SCALAR_FLOAT_MODE_P (mode)))
2541 return false;
2543 /* Everything else padded upward, i.e. data in first byte of stack slot. */
2544 return true;
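/* For example, on a big-endian target (illustrative only): a short passed on
   the stack is padded downward, so its two data bytes occupy the highest
   addresses of the 8-byte slot, whereas a 3-byte struct is padded upward and
   its data occupies the three lowest addresses of the slot. */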
2547 /* Similarly, for use by BLOCK_REG_PADDING (MODE, TYPE, FIRST).
2549 It specifies padding for the last (which may also be the only)
2550 element of a block move between registers and memory. Assuming
2551 the block is in memory, padding upward means that the last
2552 element is padded after its most significant byte, while with
2553 downward padding the last element is padded on its least
2554 significant byte side.
2556 Small aggregates and small complex types are always padded
2557 upwards.
2559 We don't need to worry about homogeneous floating-point or
2560 short-vector aggregates; their move is not affected by the
2561 padding direction determined here. Regardless of endianness,
2562 each element of such an aggregate is put in the least
2563 significant bits of a fp/simd register.
2565 Return !BYTES_BIG_ENDIAN if the least significant byte of the
2566 register has useful data, and return the opposite if the most
2567 significant byte does. */
2569 bool
2570 aarch64_pad_reg_upward (machine_mode mode, const_tree type,
2571 bool first ATTRIBUTE_UNUSED)
2574 /* Small composite types are always padded upward. */
2575 if (BYTES_BIG_ENDIAN && aarch64_composite_type_p (type, mode))
2577 HOST_WIDE_INT size = (type ? int_size_in_bytes (type)
2578 : GET_MODE_SIZE (mode));
2579 if (size < 2 * UNITS_PER_WORD)
2580 return true;
2583 /* Otherwise, use the default padding. */
2584 return !BYTES_BIG_ENDIAN;
2587 static machine_mode
2588 aarch64_libgcc_cmp_return_mode (void)
2590 return SImode;
2593 #define PROBE_INTERVAL (1 << STACK_CHECK_PROBE_INTERVAL_EXP)
2595 /* We use the 12-bit shifted immediate arithmetic instructions so values
2596 must be a multiple of (1 << 12), i.e. 4096. */
2597 #define ARITH_FACTOR 4096
2599 #if (PROBE_INTERVAL % ARITH_FACTOR) != 0
2600 #error Cannot use simple address calculation for stack probing
2601 #endif
2603 /* The pair of scratch registers used for stack probing. */
2604 #define PROBE_STACK_FIRST_REG 9
2605 #define PROBE_STACK_SECOND_REG 10
2607 /* Emit code to probe a range of stack addresses from FIRST to FIRST+SIZE,
2608 inclusive. These are offsets from the current stack pointer. */
2610 static void
2611 aarch64_emit_probe_stack_range (HOST_WIDE_INT first, HOST_WIDE_INT size)
2613 rtx reg1 = gen_rtx_REG (ptr_mode, PROBE_STACK_FIRST_REG);
2615 /* See the same assertion on PROBE_INTERVAL above. */
2616 gcc_assert ((first % ARITH_FACTOR) == 0);
2618 /* See if we have a constant small number of probes to generate. If so,
2619 that's the easy case. */
2620 if (size <= PROBE_INTERVAL)
2622 const HOST_WIDE_INT base = ROUND_UP (size, ARITH_FACTOR);
2624 emit_set_insn (reg1,
2625 plus_constant (ptr_mode,
2626 stack_pointer_rtx, -(first + base)));
2627 emit_stack_probe (plus_constant (ptr_mode, reg1, base - size));
2630 /* The run-time loop is made up of 8 insns in the generic case while the
2631 compile-time loop is made up of 4+2*(n-2) insns for n # of intervals. */
2632 else if (size <= 4 * PROBE_INTERVAL)
2634 HOST_WIDE_INT i, rem;
2636 emit_set_insn (reg1,
2637 plus_constant (ptr_mode,
2638 stack_pointer_rtx,
2639 -(first + PROBE_INTERVAL)));
2640 emit_stack_probe (reg1);
2642 /* Probe at FIRST + N * PROBE_INTERVAL for values of N from 2 until
2643 it exceeds SIZE. If only two probes are needed, this will not
2644 generate any code. Then probe at FIRST + SIZE. */
2645 for (i = 2 * PROBE_INTERVAL; i < size; i += PROBE_INTERVAL)
2647 emit_set_insn (reg1,
2648 plus_constant (ptr_mode, reg1, -PROBE_INTERVAL));
2649 emit_stack_probe (reg1);
2652 rem = size - (i - PROBE_INTERVAL);
2653 if (rem > 256)
2655 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2657 emit_set_insn (reg1, plus_constant (ptr_mode, reg1, -base));
2658 emit_stack_probe (plus_constant (ptr_mode, reg1, base - rem));
2660 else
2661 emit_stack_probe (plus_constant (ptr_mode, reg1, -rem));
2664 /* Otherwise, do the same as above, but in a loop. Note that we must be
2665 extra careful with variables wrapping around because we might be at
2666 the very top (or the very bottom) of the address space and we have
2667 to be able to handle this case properly; in particular, we use an
2668 equality test for the loop condition. */
2669 else
2671 rtx reg2 = gen_rtx_REG (ptr_mode, PROBE_STACK_SECOND_REG);
2673 /* Step 1: round SIZE to the previous multiple of the interval. */
2675 HOST_WIDE_INT rounded_size = size & -PROBE_INTERVAL;
2678 /* Step 2: compute initial and final value of the loop counter. */
2680 /* TEST_ADDR = SP + FIRST. */
2681 emit_set_insn (reg1,
2682 plus_constant (ptr_mode, stack_pointer_rtx, -first));
2684 /* LAST_ADDR = SP + FIRST + ROUNDED_SIZE. */
2685 emit_set_insn (reg2,
2686 plus_constant (ptr_mode, stack_pointer_rtx,
2687 -(first + rounded_size)));
2690 /* Step 3: the loop
2694 TEST_ADDR = TEST_ADDR + PROBE_INTERVAL
2695 probe at TEST_ADDR
2697 while (TEST_ADDR != LAST_ADDR)
2699 probes at FIRST + N * PROBE_INTERVAL for values of N from 1
2700 until it is equal to ROUNDED_SIZE. */
2702 if (ptr_mode == DImode)
2703 emit_insn (gen_probe_stack_range_di (reg1, reg1, reg2));
2704 else
2705 emit_insn (gen_probe_stack_range_si (reg1, reg1, reg2));
2708 /* Step 4: probe at FIRST + SIZE if we cannot assert at compile-time
2709 that SIZE is equal to ROUNDED_SIZE. */
2711 if (size != rounded_size)
2713 HOST_WIDE_INT rem = size - rounded_size;
2715 if (rem > 256)
2717 const HOST_WIDE_INT base = ROUND_UP (rem, ARITH_FACTOR);
2719 emit_set_insn (reg2, plus_constant (ptr_mode, reg2, -base));
2720 emit_stack_probe (plus_constant (ptr_mode, reg2, base - rem));
2722 else
2723 emit_stack_probe (plus_constant (ptr_mode, reg2, -rem));
2727 /* Make sure nothing is scheduled before we are done. */
2728 emit_insn (gen_blockage ());
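/* A worked example for the single-probe case above (illustrative only): with
   FIRST = F and SIZE = 2000, BASE is rounded up to 4096, so reg1 is set to
   SP - (F + 4096) and one probe is emitted at reg1 + 2096, i.e. exactly
   F + 2000 bytes below the incoming stack pointer. */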
2731 /* Probe a range of stack addresses from REG1 to REG2 inclusive. These are
2732 absolute addresses. */
2734 const char *
2735 aarch64_output_probe_stack_range (rtx reg1, rtx reg2)
2737 static int labelno = 0;
2738 char loop_lab[32];
2739 rtx xops[2];
2741 ASM_GENERATE_INTERNAL_LABEL (loop_lab, "LPSRL", labelno++);
2743 /* Loop. */
2744 ASM_OUTPUT_INTERNAL_LABEL (asm_out_file, loop_lab);
2746 /* TEST_ADDR = TEST_ADDR + PROBE_INTERVAL. */
2747 xops[0] = reg1;
2748 xops[1] = GEN_INT (PROBE_INTERVAL);
2749 output_asm_insn ("sub\t%0, %0, %1", xops);
2751 /* Probe at TEST_ADDR. */
2752 output_asm_insn ("str\txzr, [%0]", xops);
2754 /* Test if TEST_ADDR == LAST_ADDR. */
2755 xops[1] = reg2;
2756 output_asm_insn ("cmp\t%0, %1", xops);
2758 /* Branch. */
2759 fputs ("\tb.ne\t", asm_out_file);
2760 assemble_name_raw (asm_out_file, loop_lab);
2761 fputc ('\n', asm_out_file);
2763 return "";
2766 static bool
2767 aarch64_frame_pointer_required (void)
2769 /* In aarch64_override_options_after_change
2770 flag_omit_leaf_frame_pointer turns off the frame pointer by
2771 default. Turn it back on now if we've not got a leaf
2772 function. */
2773 if (flag_omit_leaf_frame_pointer
2774 && (!crtl->is_leaf || df_regs_ever_live_p (LR_REGNUM)))
2775 return true;
2777 /* Force a frame pointer for EH returns so the return address is at FP+8. */
2778 if (crtl->calls_eh_return)
2779 return true;
2781 return false;
2784 /* Mark the registers that need to be saved by the callee and calculate
2785 the size of the callee-saved registers area and frame record (both FP
2786 and LR may be omitted). */
2787 static void
2788 aarch64_layout_frame (void)
2790 HOST_WIDE_INT offset = 0;
2791 int regno, last_fp_reg = INVALID_REGNUM;
2793 if (reload_completed && cfun->machine->frame.laid_out)
2794 return;
2796 #define SLOT_NOT_REQUIRED (-2)
2797 #define SLOT_REQUIRED (-1)
2799 cfun->machine->frame.wb_candidate1 = INVALID_REGNUM;
2800 cfun->machine->frame.wb_candidate2 = INVALID_REGNUM;
2802 /* First mark all the registers that really need to be saved... */
2803 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2804 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2806 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2807 cfun->machine->frame.reg_offset[regno] = SLOT_NOT_REQUIRED;
2809 /* ... that includes the eh data registers (if needed)... */
2810 if (crtl->calls_eh_return)
2811 for (regno = 0; EH_RETURN_DATA_REGNO (regno) != INVALID_REGNUM; regno++)
2812 cfun->machine->frame.reg_offset[EH_RETURN_DATA_REGNO (regno)]
2813 = SLOT_REQUIRED;
2815 /* ... and any callee saved register that dataflow says is live. */
2816 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2817 if (df_regs_ever_live_p (regno)
2818 && (regno == R30_REGNUM
2819 || !call_used_regs[regno]))
2820 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2822 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2823 if (df_regs_ever_live_p (regno)
2824 && !call_used_regs[regno])
2826 cfun->machine->frame.reg_offset[regno] = SLOT_REQUIRED;
2827 last_fp_reg = regno;
2830 if (frame_pointer_needed)
2832 /* FP and LR are placed in the linkage record. */
2833 cfun->machine->frame.reg_offset[R29_REGNUM] = 0;
2834 cfun->machine->frame.wb_candidate1 = R29_REGNUM;
2835 cfun->machine->frame.reg_offset[R30_REGNUM] = UNITS_PER_WORD;
2836 cfun->machine->frame.wb_candidate2 = R30_REGNUM;
2837 offset += 2 * UNITS_PER_WORD;
2840 /* Now assign stack slots for them. */
2841 for (regno = R0_REGNUM; regno <= R30_REGNUM; regno++)
2842 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2844 cfun->machine->frame.reg_offset[regno] = offset;
2845 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2846 cfun->machine->frame.wb_candidate1 = regno;
2847 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM)
2848 cfun->machine->frame.wb_candidate2 = regno;
2849 offset += UNITS_PER_WORD;
2852 HOST_WIDE_INT max_int_offset = offset;
2853 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2854 bool has_align_gap = offset != max_int_offset;
2856 for (regno = V0_REGNUM; regno <= V31_REGNUM; regno++)
2857 if (cfun->machine->frame.reg_offset[regno] == SLOT_REQUIRED)
2859 /* If there is an alignment gap between integer and fp callee-saves,
2860 allocate the last fp register to it if possible. */
2861 if (regno == last_fp_reg && has_align_gap && (offset & 8) == 0)
2863 cfun->machine->frame.reg_offset[regno] = max_int_offset;
2864 break;
2867 cfun->machine->frame.reg_offset[regno] = offset;
2868 if (cfun->machine->frame.wb_candidate1 == INVALID_REGNUM)
2869 cfun->machine->frame.wb_candidate1 = regno;
2870 else if (cfun->machine->frame.wb_candidate2 == INVALID_REGNUM
2871 && cfun->machine->frame.wb_candidate1 >= V0_REGNUM)
2872 cfun->machine->frame.wb_candidate2 = regno;
2873 offset += UNITS_PER_WORD;
2876 offset = ROUND_UP (offset, STACK_BOUNDARY / BITS_PER_UNIT);
2878 cfun->machine->frame.saved_regs_size = offset;
2880 HOST_WIDE_INT varargs_and_saved_regs_size
2881 = offset + cfun->machine->frame.saved_varargs_size;
2883 cfun->machine->frame.hard_fp_offset
2884 = ROUND_UP (varargs_and_saved_regs_size + get_frame_size (),
2885 STACK_BOUNDARY / BITS_PER_UNIT);
2887 cfun->machine->frame.frame_size
2888 = ROUND_UP (cfun->machine->frame.hard_fp_offset
2889 + crtl->outgoing_args_size,
2890 STACK_BOUNDARY / BITS_PER_UNIT);
2892 cfun->machine->frame.locals_offset = cfun->machine->frame.saved_varargs_size;
2894 cfun->machine->frame.initial_adjust = 0;
2895 cfun->machine->frame.final_adjust = 0;
2896 cfun->machine->frame.callee_adjust = 0;
2897 cfun->machine->frame.callee_offset = 0;
2899 HOST_WIDE_INT max_push_offset = 0;
2900 if (cfun->machine->frame.wb_candidate2 != INVALID_REGNUM)
2901 max_push_offset = 512;
2902 else if (cfun->machine->frame.wb_candidate1 != INVALID_REGNUM)
2903 max_push_offset = 256;
2905 if (cfun->machine->frame.frame_size < max_push_offset
2906 && crtl->outgoing_args_size == 0)
2908 /* Simple, small frame with no outgoing arguments:
2909 stp reg1, reg2, [sp, -frame_size]!
2910 stp reg3, reg4, [sp, 16] */
2911 cfun->machine->frame.callee_adjust = cfun->machine->frame.frame_size;
2913 else if ((crtl->outgoing_args_size
2914 + cfun->machine->frame.saved_regs_size < 512)
2915 && !(cfun->calls_alloca
2916 && cfun->machine->frame.hard_fp_offset < max_push_offset))
2918 /* Frame with small outgoing arguments:
2919 sub sp, sp, frame_size
2920 stp reg1, reg2, [sp, outgoing_args_size]
2921 stp reg3, reg4, [sp, outgoing_args_size + 16] */
2922 cfun->machine->frame.initial_adjust = cfun->machine->frame.frame_size;
2923 cfun->machine->frame.callee_offset
2924 = cfun->machine->frame.frame_size - cfun->machine->frame.hard_fp_offset;
2926 else if (cfun->machine->frame.hard_fp_offset < max_push_offset)
2928 /* Frame with large outgoing arguments but a small local area:
2929 stp reg1, reg2, [sp, -hard_fp_offset]!
2930 stp reg3, reg4, [sp, 16]
2931 sub sp, sp, outgoing_args_size */
2932 cfun->machine->frame.callee_adjust = cfun->machine->frame.hard_fp_offset;
2933 cfun->machine->frame.final_adjust
2934 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2936 else if (!frame_pointer_needed
2937 && varargs_and_saved_regs_size < max_push_offset)
2939 /* Frame with large local area and outgoing arguments (this pushes the
2940 callee-saves first, followed by the locals and outgoing area):
2941 stp reg1, reg2, [sp, -varargs_and_saved_regs_size]!
2942 stp reg3, reg4, [sp, 16]
2943 sub sp, sp, frame_size - varargs_and_saved_regs_size */
2944 cfun->machine->frame.callee_adjust = varargs_and_saved_regs_size;
2945 cfun->machine->frame.final_adjust
2946 = cfun->machine->frame.frame_size - cfun->machine->frame.callee_adjust;
2947 cfun->machine->frame.hard_fp_offset = cfun->machine->frame.callee_adjust;
2948 cfun->machine->frame.locals_offset = cfun->machine->frame.hard_fp_offset;
2950 else
2952 /* Frame with large local area and outgoing arguments using frame pointer:
2953 sub sp, sp, hard_fp_offset
2954 stp x29, x30, [sp, 0]
2955 add x29, sp, 0
2956 stp reg3, reg4, [sp, 16]
2957 sub sp, sp, outgoing_args_size */
2958 cfun->machine->frame.initial_adjust = cfun->machine->frame.hard_fp_offset;
2959 cfun->machine->frame.final_adjust
2960 = cfun->machine->frame.frame_size - cfun->machine->frame.initial_adjust;
2963 cfun->machine->frame.laid_out = true;
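/* A purely illustrative layout: a function that only needs to save x29/x30
   and has 32 bytes of locals with no outgoing arguments gets
   saved_regs_size = 16, hard_fp_offset = 48 and frame_size = 48, so the
   first case above applies and the whole frame is allocated by a single
   "stp x29, x30, [sp, -48]!". */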
2966 /* Return true if the register REGNO is saved on entry to
2967 the current function. */
2969 static bool
2970 aarch64_register_saved_on_entry (int regno)
2972 return cfun->machine->frame.reg_offset[regno] >= 0;
2975 /* Return the next register, at or above REGNO and up to LIMIT, that the
2976 callee needs to save. */
2978 static unsigned
2979 aarch64_next_callee_save (unsigned regno, unsigned limit)
2981 while (regno <= limit && !aarch64_register_saved_on_entry (regno))
2982 regno ++;
2983 return regno;
2986 /* Push the register number REGNO of mode MODE to the stack with write-back
2987 adjusting the stack by ADJUSTMENT. */
2989 static void
2990 aarch64_pushwb_single_reg (machine_mode mode, unsigned regno,
2991 HOST_WIDE_INT adjustment)
2993 rtx base_rtx = stack_pointer_rtx;
2994 rtx insn, reg, mem;
2996 reg = gen_rtx_REG (mode, regno);
2997 mem = gen_rtx_PRE_MODIFY (Pmode, base_rtx,
2998 plus_constant (Pmode, base_rtx, -adjustment));
2999 mem = gen_rtx_MEM (mode, mem);
3001 insn = emit_move_insn (mem, reg);
3002 RTX_FRAME_RELATED_P (insn) = 1;
3005 /* Generate and return an instruction to store the pair of registers
3006 REG and REG2 of mode MODE to location BASE with write-back adjusting
3007 the stack location BASE by ADJUSTMENT. */
3009 static rtx
3010 aarch64_gen_storewb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3011 HOST_WIDE_INT adjustment)
3013 switch (mode)
3015 case DImode:
3016 return gen_storewb_pairdi_di (base, base, reg, reg2,
3017 GEN_INT (-adjustment),
3018 GEN_INT (UNITS_PER_WORD - adjustment));
3019 case DFmode:
3020 return gen_storewb_pairdf_di (base, base, reg, reg2,
3021 GEN_INT (-adjustment),
3022 GEN_INT (UNITS_PER_WORD - adjustment));
3023 default:
3024 gcc_unreachable ();
3028 /* Push registers numbered REGNO1 and REGNO2 to the stack, adjusting the
3029 stack pointer by ADJUSTMENT. */
3031 static void
3032 aarch64_push_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment)
3034 rtx_insn *insn;
3035 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3037 if (regno2 == INVALID_REGNUM)
3038 return aarch64_pushwb_single_reg (mode, regno1, adjustment);
3040 rtx reg1 = gen_rtx_REG (mode, regno1);
3041 rtx reg2 = gen_rtx_REG (mode, regno2);
3043 insn = emit_insn (aarch64_gen_storewb_pair (mode, stack_pointer_rtx, reg1,
3044 reg2, adjustment));
3045 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 2)) = 1;
3046 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3047 RTX_FRAME_RELATED_P (insn) = 1;
3050 /* Load the pair of registers REG and REG2 of mode MODE from stack location BASE,
3051 adjusting it by ADJUSTMENT afterwards. */
3053 static rtx
3054 aarch64_gen_loadwb_pair (machine_mode mode, rtx base, rtx reg, rtx reg2,
3055 HOST_WIDE_INT adjustment)
3057 switch (mode)
3059 case DImode:
3060 return gen_loadwb_pairdi_di (base, base, reg, reg2, GEN_INT (adjustment),
3061 GEN_INT (UNITS_PER_WORD));
3062 case DFmode:
3063 return gen_loadwb_pairdf_di (base, base, reg, reg2, GEN_INT (adjustment),
3064 GEN_INT (UNITS_PER_WORD));
3065 default:
3066 gcc_unreachable ();
3070 /* Pop the two registers numbered REGNO1, REGNO2 from the stack, adjusting it
3071 afterwards by ADJUSTMENT and writing the appropriate REG_CFA_RESTORE notes
3072 into CFI_OPS. */
3074 static void
3075 aarch64_pop_regs (unsigned regno1, unsigned regno2, HOST_WIDE_INT adjustment,
3076 rtx *cfi_ops)
3078 machine_mode mode = (regno1 <= R30_REGNUM) ? DImode : DFmode;
3079 rtx reg1 = gen_rtx_REG (mode, regno1);
3081 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg1, *cfi_ops);
3083 if (regno2 == INVALID_REGNUM)
3085 rtx mem = plus_constant (Pmode, stack_pointer_rtx, adjustment);
3086 mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
3087 emit_move_insn (reg1, gen_rtx_MEM (mode, mem));
3089 else
3091 rtx reg2 = gen_rtx_REG (mode, regno2);
3092 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3093 emit_insn (aarch64_gen_loadwb_pair (mode, stack_pointer_rtx, reg1,
3094 reg2, adjustment));
3098 /* Generate and return a store pair instruction of mode MODE to store
3099 register REG1 to MEM1 and register REG2 to MEM2. */
3101 static rtx
3102 aarch64_gen_store_pair (machine_mode mode, rtx mem1, rtx reg1, rtx mem2,
3103 rtx reg2)
3105 switch (mode)
3107 case DImode:
3108 return gen_store_pairdi (mem1, reg1, mem2, reg2);
3110 case DFmode:
3111 return gen_store_pairdf (mem1, reg1, mem2, reg2);
3113 default:
3114 gcc_unreachable ();
3118 /* Generate and return a load pair instruction of mode MODE to load register
3119 REG1 from MEM1 and register REG2 from MEM2. */
3121 static rtx
3122 aarch64_gen_load_pair (machine_mode mode, rtx reg1, rtx mem1, rtx reg2,
3123 rtx mem2)
3125 switch (mode)
3127 case DImode:
3128 return gen_load_pairdi (reg1, mem1, reg2, mem2);
3130 case DFmode:
3131 return gen_load_pairdf (reg1, mem1, reg2, mem2);
3133 default:
3134 gcc_unreachable ();
3138 /* Return TRUE if return address signing should be enabled for the current
3139 function, otherwise return FALSE. */
3141 bool
3142 aarch64_return_address_signing_enabled (void)
3144 /* This function should only be called after the frame is laid out. */
3145 gcc_assert (cfun->machine->frame.laid_out);
3147 /* If signing scope is AARCH64_FUNCTION_NON_LEAF, we only sign a leaf function
3148 if its LR is pushed onto the stack. */
3149 return (aarch64_ra_sign_scope == AARCH64_FUNCTION_ALL
3150 || (aarch64_ra_sign_scope == AARCH64_FUNCTION_NON_LEAF
3151 && cfun->machine->frame.reg_offset[LR_REGNUM] >= 0));
3154 /* Emit code to save the callee-saved registers from register number START
3155 to LIMIT to the stack at the location starting at offset START_OFFSET,
3156 skipping any write-back candidates if SKIP_WB is true. */
3158 static void
3159 aarch64_save_callee_saves (machine_mode mode, HOST_WIDE_INT start_offset,
3160 unsigned start, unsigned limit, bool skip_wb)
3162 rtx_insn *insn;
3163 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3164 ? gen_frame_mem : gen_rtx_MEM);
3165 unsigned regno;
3166 unsigned regno2;
3168 for (regno = aarch64_next_callee_save (start, limit);
3169 regno <= limit;
3170 regno = aarch64_next_callee_save (regno + 1, limit))
3172 rtx reg, mem;
3173 HOST_WIDE_INT offset;
3175 if (skip_wb
3176 && (regno == cfun->machine->frame.wb_candidate1
3177 || regno == cfun->machine->frame.wb_candidate2))
3178 continue;
3180 if (cfun->machine->reg_is_wrapped_separately[regno])
3181 continue;
3183 reg = gen_rtx_REG (mode, regno);
3184 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3185 mem = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3186 offset));
3188 regno2 = aarch64_next_callee_save (regno + 1, limit);
3190 if (regno2 <= limit
3191 && !cfun->machine->reg_is_wrapped_separately[regno2]
3192 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3193 == cfun->machine->frame.reg_offset[regno2]))
3196 rtx reg2 = gen_rtx_REG (mode, regno2);
3197 rtx mem2;
3199 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3200 mem2 = gen_mem_ref (mode, plus_constant (Pmode, stack_pointer_rtx,
3201 offset));
3202 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2,
3203 reg2));
3205 /* The first part of a frame-related parallel insn is
3206 always assumed to be relevant to the frame
3207 calculations; subsequent parts are only
3208 frame-related if explicitly marked. */
3209 RTX_FRAME_RELATED_P (XVECEXP (PATTERN (insn), 0, 1)) = 1;
3210 regno = regno2;
3212 else
3213 insn = emit_move_insn (mem, reg);
3215 RTX_FRAME_RELATED_P (insn) = 1;
3219 /* Emit code to restore the callee registers of mode MODE from register
3220 number START up to and including LIMIT. Restore from the stack offset
3221 START_OFFSET, skipping any write-back candidates if SKIP_WB is true.
3222 Write the appropriate REG_CFA_RESTORE notes into CFI_OPS. */
3224 static void
3225 aarch64_restore_callee_saves (machine_mode mode,
3226 HOST_WIDE_INT start_offset, unsigned start,
3227 unsigned limit, bool skip_wb, rtx *cfi_ops)
3229 rtx base_rtx = stack_pointer_rtx;
3230 rtx (*gen_mem_ref) (machine_mode, rtx) = (frame_pointer_needed
3231 ? gen_frame_mem : gen_rtx_MEM);
3232 unsigned regno;
3233 unsigned regno2;
3234 HOST_WIDE_INT offset;
3236 for (regno = aarch64_next_callee_save (start, limit);
3237 regno <= limit;
3238 regno = aarch64_next_callee_save (regno + 1, limit))
3240 if (cfun->machine->reg_is_wrapped_separately[regno])
3241 continue;
3243 rtx reg, mem;
3245 if (skip_wb
3246 && (regno == cfun->machine->frame.wb_candidate1
3247 || regno == cfun->machine->frame.wb_candidate2))
3248 continue;
3250 reg = gen_rtx_REG (mode, regno);
3251 offset = start_offset + cfun->machine->frame.reg_offset[regno];
3252 mem = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3254 regno2 = aarch64_next_callee_save (regno + 1, limit);
3256 if (regno2 <= limit
3257 && !cfun->machine->reg_is_wrapped_separately[regno2]
3258 && ((cfun->machine->frame.reg_offset[regno] + UNITS_PER_WORD)
3259 == cfun->machine->frame.reg_offset[regno2]))
3261 rtx reg2 = gen_rtx_REG (mode, regno2);
3262 rtx mem2;
3264 offset = start_offset + cfun->machine->frame.reg_offset[regno2];
3265 mem2 = gen_mem_ref (mode, plus_constant (Pmode, base_rtx, offset));
3266 emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3268 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg2, *cfi_ops);
3269 regno = regno2;
3271 else
3272 emit_move_insn (reg, mem);
3273 *cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg, *cfi_ops);
3277 static inline bool
3278 offset_9bit_signed_unscaled_p (machine_mode mode ATTRIBUTE_UNUSED,
3279 HOST_WIDE_INT offset)
3281 return offset >= -256 && offset < 256;
3284 static inline bool
3285 offset_12bit_unsigned_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3287 return (offset >= 0
3288 && offset < 4096 * GET_MODE_SIZE (mode)
3289 && offset % GET_MODE_SIZE (mode) == 0);
3292 bool
3293 aarch64_offset_7bit_signed_scaled_p (machine_mode mode, HOST_WIDE_INT offset)
3295 return (offset >= -64 * GET_MODE_SIZE (mode)
3296 && offset < 64 * GET_MODE_SIZE (mode)
3297 && offset % GET_MODE_SIZE (mode) == 0);
3300 /* Implement TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS. */
3302 static sbitmap
3303 aarch64_get_separate_components (void)
3305 aarch64_layout_frame ();
3307 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3308 bitmap_clear (components);
3310 /* The registers we need saved to the frame. */
3311 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3312 if (aarch64_register_saved_on_entry (regno))
3314 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3315 if (!frame_pointer_needed)
3316 offset += cfun->machine->frame.frame_size
3317 - cfun->machine->frame.hard_fp_offset;
3318 /* Check that we can access the stack slot of the register with one
3319 direct load with no adjustments needed. */
3320 if (offset_12bit_unsigned_scaled_p (DImode, offset))
3321 bitmap_set_bit (components, regno);
3324 /* Don't mess with the hard frame pointer. */
3325 if (frame_pointer_needed)
3326 bitmap_clear_bit (components, HARD_FRAME_POINTER_REGNUM);
3328 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3329 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3330 /* If aarch64_layout_frame has chosen registers to store/restore with
3331 writeback don't interfere with them to avoid having to output explicit
3332 stack adjustment instructions. */
3333 if (reg2 != INVALID_REGNUM)
3334 bitmap_clear_bit (components, reg2);
3335 if (reg1 != INVALID_REGNUM)
3336 bitmap_clear_bit (components, reg1);
3338 bitmap_clear_bit (components, LR_REGNUM);
3339 bitmap_clear_bit (components, SP_REGNUM);
3341 return components;
3344 /* Implement TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB. */
3346 static sbitmap
3347 aarch64_components_for_bb (basic_block bb)
3349 bitmap in = DF_LIVE_IN (bb);
3350 bitmap gen = &DF_LIVE_BB_INFO (bb)->gen;
3351 bitmap kill = &DF_LIVE_BB_INFO (bb)->kill;
3353 sbitmap components = sbitmap_alloc (LAST_SAVED_REGNUM + 1);
3354 bitmap_clear (components);
3356 /* GPRs are used in a bb if they are in the IN, GEN, or KILL sets. */
3357 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3358 if ((!call_used_regs[regno])
3359 && (bitmap_bit_p (in, regno)
3360 || bitmap_bit_p (gen, regno)
3361 || bitmap_bit_p (kill, regno)))
3362 bitmap_set_bit (components, regno);
3364 return components;
3367 /* Implement TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS.
3368 Nothing to do for aarch64. */
3370 static void
3371 aarch64_disqualify_components (sbitmap, edge, sbitmap, bool)
3375 /* Return the next set bit in BMP from START onwards. Return the total number
3376 of bits in BMP if no set bit is found at or after START. */
3378 static unsigned int
3379 aarch64_get_next_set_bit (sbitmap bmp, unsigned int start)
3381 unsigned int nbits = SBITMAP_SIZE (bmp);
3382 if (start == nbits)
3383 return start;
3385 gcc_assert (start < nbits);
3386 for (unsigned int i = start; i < nbits; i++)
3387 if (bitmap_bit_p (bmp, i))
3388 return i;
3390 return nbits;
3393 /* Do the work for aarch64_emit_prologue_components and
3394 aarch64_emit_epilogue_components. COMPONENTS is the bitmap of registers
3395 to save/restore, PROLOGUE_P indicates whether to emit the prologue sequence
3396 for these components or the epilogue sequence. That is, it determines
3397 whether we should emit stores or loads and what kind of CFA notes to attach
3398 to the insns. Otherwise the logic for the two sequences is very
3399 similar. */
3401 static void
3402 aarch64_process_components (sbitmap components, bool prologue_p)
3404 rtx ptr_reg = gen_rtx_REG (Pmode, frame_pointer_needed
3405 ? HARD_FRAME_POINTER_REGNUM
3406 : STACK_POINTER_REGNUM);
3408 unsigned last_regno = SBITMAP_SIZE (components);
3409 unsigned regno = aarch64_get_next_set_bit (components, R0_REGNUM);
3410 rtx_insn *insn = NULL;
3412 while (regno != last_regno)
3414 /* AAPCS64 section 5.1.2 requires only the bottom 64 bits to be saved
3415 so DFmode for the vector registers is enough. */
3416 machine_mode mode = GP_REGNUM_P (regno) ? DImode : DFmode;
3417 rtx reg = gen_rtx_REG (mode, regno);
3418 HOST_WIDE_INT offset = cfun->machine->frame.reg_offset[regno];
3419 if (!frame_pointer_needed)
3420 offset += cfun->machine->frame.frame_size
3421 - cfun->machine->frame.hard_fp_offset;
3422 rtx addr = plus_constant (Pmode, ptr_reg, offset);
3423 rtx mem = gen_frame_mem (mode, addr);
3425 rtx set = prologue_p ? gen_rtx_SET (mem, reg) : gen_rtx_SET (reg, mem);
3426 unsigned regno2 = aarch64_get_next_set_bit (components, regno + 1);
3427 /* No more registers to handle after REGNO.
3428 Emit a single save/restore and exit. */
3429 if (regno2 == last_regno)
3431 insn = emit_insn (set);
3432 RTX_FRAME_RELATED_P (insn) = 1;
3433 if (prologue_p)
3434 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3435 else
3436 add_reg_note (insn, REG_CFA_RESTORE, reg);
3437 break;
3440 HOST_WIDE_INT offset2 = cfun->machine->frame.reg_offset[regno2];
3441 /* The next register is not of the same class or its offset is not
3442 mergeable with the current one into a pair. */
3443 if (!satisfies_constraint_Ump (mem)
3444 || GP_REGNUM_P (regno) != GP_REGNUM_P (regno2)
3445 || (offset2 - cfun->machine->frame.reg_offset[regno])
3446 != GET_MODE_SIZE (mode))
3448 insn = emit_insn (set);
3449 RTX_FRAME_RELATED_P (insn) = 1;
3450 if (prologue_p)
3451 add_reg_note (insn, REG_CFA_OFFSET, copy_rtx (set));
3452 else
3453 add_reg_note (insn, REG_CFA_RESTORE, reg);
3455 regno = regno2;
3456 continue;
3459 /* REGNO2 can be saved/restored in a pair with REGNO. */
3460 rtx reg2 = gen_rtx_REG (mode, regno2);
3461 if (!frame_pointer_needed)
3462 offset2 += cfun->machine->frame.frame_size
3463 - cfun->machine->frame.hard_fp_offset;
3464 rtx addr2 = plus_constant (Pmode, ptr_reg, offset2);
3465 rtx mem2 = gen_frame_mem (mode, addr2);
3466 rtx set2 = prologue_p ? gen_rtx_SET (mem2, reg2)
3467 : gen_rtx_SET (reg2, mem2);
3469 if (prologue_p)
3470 insn = emit_insn (aarch64_gen_store_pair (mode, mem, reg, mem2, reg2));
3471 else
3472 insn = emit_insn (aarch64_gen_load_pair (mode, reg, mem, reg2, mem2));
3474 RTX_FRAME_RELATED_P (insn) = 1;
3475 if (prologue_p)
3477 add_reg_note (insn, REG_CFA_OFFSET, set);
3478 add_reg_note (insn, REG_CFA_OFFSET, set2);
3480 else
3482 add_reg_note (insn, REG_CFA_RESTORE, reg);
3483 add_reg_note (insn, REG_CFA_RESTORE, reg2);
3486 regno = aarch64_get_next_set_bit (components, regno2 + 1);
3490 /* Implement TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS. */
3492 static void
3493 aarch64_emit_prologue_components (sbitmap components)
3495 aarch64_process_components (components, true);
3498 /* Implement TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS. */
3500 static void
3501 aarch64_emit_epilogue_components (sbitmap components)
3503 aarch64_process_components (components, false);
3506 /* Implement TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS. */
3508 static void
3509 aarch64_set_handled_components (sbitmap components)
3511 for (unsigned regno = 0; regno <= LAST_SAVED_REGNUM; regno++)
3512 if (bitmap_bit_p (components, regno))
3513 cfun->machine->reg_is_wrapped_separately[regno] = true;
3516 /* AArch64 stack frames generated by this compiler look like:
3518 +-------------------------------+
3520 | incoming stack arguments |
3522 +-------------------------------+
3523 | | <-- incoming stack pointer (aligned)
3524 | callee-allocated save area |
3525 | for register varargs |
3527 +-------------------------------+
3528 | local variables | <-- frame_pointer_rtx
3530 +-------------------------------+
3531 | padding0 | \
3532 +-------------------------------+ |
3533 | callee-saved registers | | frame.saved_regs_size
3534 +-------------------------------+ |
3535 | LR' | |
3536 +-------------------------------+ |
3537 | FP' | / <- hard_frame_pointer_rtx (aligned)
3538 +-------------------------------+
3539 | dynamic allocation |
3540 +-------------------------------+
3541 | padding |
3542 +-------------------------------+
3543 | outgoing stack arguments | <-- arg_pointer
3545 +-------------------------------+
3546 | | <-- stack_pointer_rtx (aligned)
3548 Dynamic stack allocations via alloca() decrease stack_pointer_rtx
3549 but leave frame_pointer_rtx and hard_frame_pointer_rtx
3550 unchanged. */
3552 /* Generate the prologue instructions for entry into a function.
3553 Establish the stack frame by decreasing the stack pointer with a
3554 properly calculated size and, if necessary, create a frame record
3555 filled with the values of LR and previous frame pointer. The
3556 current FP is also set up if it is in use. */
3558 void
3559 aarch64_expand_prologue (void)
3561 aarch64_layout_frame ();
3563 HOST_WIDE_INT frame_size = cfun->machine->frame.frame_size;
3564 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3565 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3566 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3567 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3568 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3569 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3570 rtx_insn *insn;
3572 /* Sign return address for functions. */
3573 if (aarch64_return_address_signing_enabled ())
3575 insn = emit_insn (gen_pacisp ());
3576 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3577 RTX_FRAME_RELATED_P (insn) = 1;
3580 if (flag_stack_usage_info)
3581 current_function_static_stack_size = frame_size;
3583 if (flag_stack_check == STATIC_BUILTIN_STACK_CHECK)
3585 if (crtl->is_leaf && !cfun->calls_alloca)
3587 if (frame_size > PROBE_INTERVAL && frame_size > STACK_CHECK_PROTECT)
3588 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT,
3589 frame_size - STACK_CHECK_PROTECT);
3591 else if (frame_size > 0)
3592 aarch64_emit_probe_stack_range (STACK_CHECK_PROTECT, frame_size);
3595 aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
3597 if (callee_adjust != 0)
3598 aarch64_push_regs (reg1, reg2, callee_adjust);
3600 if (frame_pointer_needed)
3602 if (callee_adjust == 0)
3603 aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
3604 R30_REGNUM, false);
3605 insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
3606 stack_pointer_rtx,
3607 GEN_INT (callee_offset)));
3608 RTX_FRAME_RELATED_P (insn) = 1;
3609 emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
3612 aarch64_save_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3613 callee_adjust != 0 || frame_pointer_needed);
3614 aarch64_save_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3615 callee_adjust != 0 || frame_pointer_needed);
3616 aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
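/* As an illustration only: for a small frame that saves x29, x30 and x19
   (frame_size = 32, callee_adjust = 32), the expansion above produces
   roughly

	stp	x29, x30, [sp, -32]!
	add	x29, sp, 0
	str	x19, [sp, 16]  */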
3619 /* Return TRUE if we can use a simple_return insn.
3621 This function checks whether the callee-saved stack is empty, which
3622 means no restore actions are needed. The pro_and_epilogue pass uses
3623 this to check whether shrink-wrapping is feasible. */
3625 bool
3626 aarch64_use_return_insn_p (void)
3628 if (!reload_completed)
3629 return false;
3631 if (crtl->profile)
3632 return false;
3634 aarch64_layout_frame ();
3636 return cfun->machine->frame.frame_size == 0;
3639 /* Generate the epilogue instructions for returning from a function.
3640 This is almost exactly the reverse of the prologue sequence, except
3641 that we need to insert barriers to avoid scheduling loads that read
3642 from a deallocated stack, and we optimize the unwind records by
3643 emitting them all together if possible. */
3644 void
3645 aarch64_expand_epilogue (bool for_sibcall)
3647 aarch64_layout_frame ();
3649 HOST_WIDE_INT initial_adjust = cfun->machine->frame.initial_adjust;
3650 HOST_WIDE_INT callee_adjust = cfun->machine->frame.callee_adjust;
3651 HOST_WIDE_INT final_adjust = cfun->machine->frame.final_adjust;
3652 HOST_WIDE_INT callee_offset = cfun->machine->frame.callee_offset;
3653 unsigned reg1 = cfun->machine->frame.wb_candidate1;
3654 unsigned reg2 = cfun->machine->frame.wb_candidate2;
3655 rtx cfi_ops = NULL;
3656 rtx_insn *insn;
3658 /* We need to add a memory barrier to prevent reads from the deallocated stack. */
3659 bool need_barrier_p = (get_frame_size ()
3660 + cfun->machine->frame.saved_varargs_size) != 0;
3662 /* Emit a barrier to prevent loads from a deallocated stack. */
3663 if (final_adjust > crtl->outgoing_args_size || cfun->calls_alloca
3664 || crtl->calls_eh_return)
3666 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3667 need_barrier_p = false;
3670 /* Restore the stack pointer from the frame pointer if it may not
3671 be the same as the stack pointer. */
3672 if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
3674 insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
3675 hard_frame_pointer_rtx,
3676 GEN_INT (-callee_offset)));
3677 /* If writeback is used when restoring callee-saves, the CFA
3678 is restored on the instruction doing the writeback. */
3679 RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
3681 else
3682 aarch64_add_sp (IP1_REGNUM, final_adjust, df_regs_ever_live_p (IP1_REGNUM));
3684 aarch64_restore_callee_saves (DImode, callee_offset, R0_REGNUM, R30_REGNUM,
3685 callee_adjust != 0, &cfi_ops);
3686 aarch64_restore_callee_saves (DFmode, callee_offset, V0_REGNUM, V31_REGNUM,
3687 callee_adjust != 0, &cfi_ops);
3689 if (need_barrier_p)
3690 emit_insn (gen_stack_tie (stack_pointer_rtx, stack_pointer_rtx));
3692 if (callee_adjust != 0)
3693 aarch64_pop_regs (reg1, reg2, callee_adjust, &cfi_ops);
3695 if (callee_adjust != 0 || initial_adjust > 65536)
3697 /* Emit delayed restores and set the CFA to be SP + initial_adjust. */
3698 insn = get_last_insn ();
3699 rtx new_cfa = plus_constant (Pmode, stack_pointer_rtx, initial_adjust);
3700 REG_NOTES (insn) = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
3701 RTX_FRAME_RELATED_P (insn) = 1;
3702 cfi_ops = NULL;
3705 aarch64_add_sp (IP0_REGNUM, initial_adjust, df_regs_ever_live_p (IP0_REGNUM));
3707 if (cfi_ops)
3709 /* Emit delayed restores and reset the CFA to be SP. */
3710 insn = get_last_insn ();
3711 cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
3712 REG_NOTES (insn) = cfi_ops;
3713 RTX_FRAME_RELATED_P (insn) = 1;
3716 /* We prefer to emit the combined return/authenticate instruction RETAA;
3717 however, there are three cases in which we must instead emit an explicit
3718 authentication instruction.
3720 1) Sibcalls don't return in a normal way, so if we're about to call one
3721 we must authenticate.
3723 2) The RETAA instruction is not available before ARMv8.3-A, so if we are
3724 generating code for !TARGET_ARMV8_3 we can't use it and must
3725 explicitly authenticate.
3727 3) On an eh_return path we make extra stack adjustments to update the
3728 canonical frame address to be the exception handler's CFA. We want
3729 to authenticate using the CFA of the function which calls eh_return. */
3731 if (aarch64_return_address_signing_enabled ()
3732 && (for_sibcall || !TARGET_ARMV8_3 || crtl->calls_eh_return))
3734 insn = emit_insn (gen_autisp ());
3735 add_reg_note (insn, REG_CFA_TOGGLE_RA_MANGLE, const0_rtx);
3736 RTX_FRAME_RELATED_P (insn) = 1;
3739 /* Stack adjustment for exception handler. */
3740 if (crtl->calls_eh_return)
3742 /* We need to unwind the stack by the offset computed by
3743 EH_RETURN_STACKADJ_RTX. We have already reset the CFA
3744 to be SP; letting the CFA move during this adjustment
3745 is just as correct as retaining the CFA from the body
3746 of the function. Therefore, do nothing special. */
3747 emit_insn (gen_add2_insn (stack_pointer_rtx, EH_RETURN_STACKADJ_RTX));
3750 emit_use (gen_rtx_REG (DImode, LR_REGNUM));
3751 if (!for_sibcall)
3752 emit_jump_insn (ret_rtx);
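/* For the same illustrative frame as in the prologue sketch above, the
   matching epilogue is roughly

	ldr	x19, [sp, 16]
	ldp	x29, x30, [sp], 32
	ret  */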
3755 /* Implement EH_RETURN_HANDLER_RTX. EH returns need to either return
3756 normally or return to a previous frame after unwinding.
3758 An EH return uses a single shared return sequence. The epilogue is
3759 exactly like a normal epilogue except that it has an extra input
3760 register (EH_RETURN_STACKADJ_RTX) which contains the stack adjustment
3761 that must be applied after the frame has been destroyed. An extra label
3762 is inserted before the epilogue which initializes this register to zero,
3763 and this is the entry point for a normal return.
3765 An actual EH return updates the return address, initializes the stack
3766 adjustment and jumps directly into the epilogue (bypassing the zeroing
3767 of the adjustment). Since the return address is typically saved on the
3768 stack when a function makes a call, the saved LR must be updated outside
3769 the epilogue.
3771 This poses problems as the store is generated well before the epilogue,
3772 so the offset of LR is not known yet. Also optimizations will remove the
3773 store as it appears dead, even after the epilogue is generated (as the
3774 base or offset for loading LR is different in many cases).
3776 To avoid these problems this implementation forces the frame pointer
3777 in eh_return functions so that the location of LR is fixed and known early.
3778 It also marks the store volatile, so no optimization is permitted to
3779 remove the store. */
3780 rtx
3781 aarch64_eh_return_handler_rtx (void)
3783 rtx tmp = gen_frame_mem (Pmode,
3784 plus_constant (Pmode, hard_frame_pointer_rtx, UNITS_PER_WORD));
3786 /* Mark the store volatile, so no optimization is permitted to remove it. */
3787 MEM_VOLATILE_P (tmp) = true;
3788 return tmp;
3791 /* Output code to add DELTA to the first argument, and then jump
3792 to FUNCTION. Used for C++ multiple inheritance. */
3793 static void
3794 aarch64_output_mi_thunk (FILE *file, tree thunk ATTRIBUTE_UNUSED,
3795 HOST_WIDE_INT delta,
3796 HOST_WIDE_INT vcall_offset,
3797 tree function)
3799 /* The this pointer is always in x0. Note that this differs from
3800 Arm where the this pointer may be bumped to r1 if r0 is required
3801 to return a pointer to an aggregate. On AArch64 a result value
3802 pointer will be in x8. */
3803 int this_regno = R0_REGNUM;
3804 rtx this_rtx, temp0, temp1, addr, funexp;
3805 rtx_insn *insn;
3807 reload_completed = 1;
3808 emit_note (NOTE_INSN_PROLOGUE_END);
3810 if (vcall_offset == 0)
3811 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3812 else
3814 gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
3816 this_rtx = gen_rtx_REG (Pmode, this_regno);
3817 temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
3818 temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
3820 addr = this_rtx;
3821 if (delta != 0)
3823 if (delta >= -256 && delta < 256)
3824 addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
3825 plus_constant (Pmode, this_rtx, delta));
3826 else
3827 aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
3830 if (Pmode == ptr_mode)
3831 aarch64_emit_move (temp0, gen_rtx_MEM (ptr_mode, addr));
3832 else
3833 aarch64_emit_move (temp0,
3834 gen_rtx_ZERO_EXTEND (Pmode,
3835 gen_rtx_MEM (ptr_mode, addr)));
3837 if (vcall_offset >= -256 && vcall_offset < 4096 * POINTER_BYTES)
3838 addr = plus_constant (Pmode, temp0, vcall_offset);
3839 else
3841 aarch64_internal_mov_immediate (temp1, GEN_INT (vcall_offset), true,
3842 Pmode);
3843 addr = gen_rtx_PLUS (Pmode, temp0, temp1);
3846 if (Pmode == ptr_mode)
3847 aarch64_emit_move (temp1, gen_rtx_MEM (ptr_mode, addr));
3848 else
3849 aarch64_emit_move (temp1,
3850 gen_rtx_SIGN_EXTEND (Pmode,
3851 gen_rtx_MEM (ptr_mode, addr)));
3853 emit_insn (gen_add2_insn (this_rtx, temp1));
3856 /* Generate a tail call to the target function. */
3857 if (!TREE_USED (function))
3859 assemble_external (function);
3860 TREE_USED (function) = 1;
3862 funexp = XEXP (DECL_RTL (function), 0);
3863 funexp = gen_rtx_MEM (FUNCTION_MODE, funexp);
3864 insn = emit_call_insn (gen_sibcall (funexp, const0_rtx, NULL_RTX));
3865 SIBLING_CALL_P (insn) = 1;
3867 insn = get_insns ();
3868 shorten_branches (insn);
3869 final_start_function (insn, file, 1);
3870 final (insn, file, 1);
3871 final_end_function ();
3873 /* Stop pretending to be a post-reload pass. */
3874 reload_completed = 0;
3877 static bool
3878 aarch64_tls_referenced_p (rtx x)
3880 if (!TARGET_HAVE_TLS)
3881 return false;
3882 subrtx_iterator::array_type array;
3883 FOR_EACH_SUBRTX (iter, array, x, ALL)
3885 const_rtx x = *iter;
3886 if (GET_CODE (x) == SYMBOL_REF && SYMBOL_REF_TLS_MODEL (x) != 0)
3887 return true;
3888 /* Don't recurse into UNSPEC_TLS looking for TLS symbols; these are
3889 TLS offsets, not real symbol references. */
3890 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
3891 iter.skip_subrtxes ();
3893 return false;
3897 /* Return true if val can be encoded as a 12-bit unsigned immediate with
3898 a left shift of 0 or 12 bits. */
3899 bool
3900 aarch64_uimm12_shift (HOST_WIDE_INT val)
3902 return ((val & (((HOST_WIDE_INT) 0xfff) << 0)) == val
3903 || (val & (((HOST_WIDE_INT) 0xfff) << 12)) == val);
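/* A minimal standalone sketch of the check above, for illustration only
   (plain stdint types instead of HOST_WIDE_INT; the helper name is made
   up):

     #include <stdbool.h>
     #include <stdint.h>

     static bool
     uimm12_shift_example (int64_t val)
     {
       return (val & (int64_t) 0xfff) == val
              || (val & ((int64_t) 0xfff << 12)) == val;
     }

   For example 0xabc and 0xabc000 pass, while 0x1001 fails because its
   set bits straddle the two 12-bit windows.  */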
3908 /* Return true if val is an immediate that can be loaded into a
3909 register by a MOVZ instruction. */
3910 static bool
3911 aarch64_movw_imm (HOST_WIDE_INT val, machine_mode mode)
3913 if (GET_MODE_SIZE (mode) > 4)
3915 if ((val & (((HOST_WIDE_INT) 0xffff) << 32)) == val
3916 || (val & (((HOST_WIDE_INT) 0xffff) << 48)) == val)
3917 return 1;
3919 else
3921 /* Ignore sign extension. */
3922 val &= (HOST_WIDE_INT) 0xffffffff;
3924 return ((val & (((HOST_WIDE_INT) 0xffff) << 0)) == val
3925 || (val & (((HOST_WIDE_INT) 0xffff) << 16)) == val);
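/* Illustrative examples: in DImode, 0x12340000 is accepted (a 16-bit
   value in the field at bit 16, i.e. something like
   "movz x0, #0x1234, lsl #16"), whereas 0x12340001 is rejected because
   its set bits span more than one 16-bit field.  The fields at bit
   positions 32 and 48 are only considered for modes wider than 4
   bytes.  */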
3928 /* Multipliers for repeating bitmasks of width 32, 16, 8, 4, and 2. */
3930 static const unsigned HOST_WIDE_INT bitmask_imm_mul[] =
3932 0x0000000100000001ull,
3933 0x0001000100010001ull,
3934 0x0101010101010101ull,
3935 0x1111111111111111ull,
3936 0x5555555555555555ull,
3940 /* Return true if val is a valid bitmask immediate. */
3942 bool
3943 aarch64_bitmask_imm (HOST_WIDE_INT val_in, machine_mode mode)
3945 unsigned HOST_WIDE_INT val, tmp, mask, first_one, next_one;
3946 int bits;
3948 /* Check for a single sequence of one bits and return quickly if so.
3949 The special cases of all ones and all zeroes return false. */
3950 val = (unsigned HOST_WIDE_INT) val_in;
3951 tmp = val + (val & -val);
3953 if (tmp == (tmp & -tmp))
3954 return (val + 1) > 1;
3956 /* Replicate 32-bit immediates so we can treat them as 64-bit. */
3957 if (mode == SImode)
3958 val = (val << 32) | (val & 0xffffffff);
3960 /* Invert if the immediate doesn't start with a zero bit - this means we
3961 only need to search for sequences of one bits. */
3962 if (val & 1)
3963 val = ~val;
3965 /* Find the first set bit and set tmp to val with the first sequence of one
3966 bits removed. Return success if there is a single sequence of ones. */
3967 first_one = val & -val;
3968 tmp = val & (val + first_one);
3970 if (tmp == 0)
3971 return true;
3973 /* Find the next set bit and compute the difference in bit position. */
3974 next_one = tmp & -tmp;
3975 bits = clz_hwi (first_one) - clz_hwi (next_one);
3976 mask = val ^ tmp;
3978 /* Check the bit position difference is a power of 2, and that the first
3979 sequence of one bits fits within 'bits' bits. */
3980 if ((mask >> bits) != 0 || bits != (bits & -bits))
3981 return false;
3983 /* Check the sequence of one bits is repeated 64/bits times. */
3984 return val == mask * bitmask_imm_mul[__builtin_clz (bits) - 26];
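/* Worked example (illustrative): for val_in = 0x00ff00ff00ff00ff in
   DImode the quick single-run test fails, the value is inverted to
   0xff00ff00ff00ff00 because it starts with a one bit, removing the
   first run of ones gives mask = 0xff00, the distance to the next run
   is bits = 16, and mask * 0x0001000100010001 reproduces the inverted
   value, so the constant is accepted as a bitmask immediate.  */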
3987 /* Create a mask of ones covering the range from the lowest to the highest
3988 bit set in VAL_IN. Assumed precondition: VAL_IN is not zero. */
3990 unsigned HOST_WIDE_INT
3991 aarch64_and_split_imm1 (HOST_WIDE_INT val_in)
3993 int lowest_bit_set = ctz_hwi (val_in);
3994 int highest_bit_set = floor_log2 (val_in);
3995 gcc_assert (val_in != 0);
3997 return ((HOST_WIDE_INT_UC (2) << highest_bit_set) -
3998 (HOST_WIDE_INT_1U << lowest_bit_set));
4001 /* Create a constant in which all bits outside the range from the lowest
4002 to the highest bit set in VAL_IN are set to 1. */
4004 unsigned HOST_WIDE_INT
4005 aarch64_and_split_imm2 (HOST_WIDE_INT val_in)
4007 return val_in | ~aarch64_and_split_imm1 (val_in);
4010 /* Return true if VAL_IN is a valid 'and' bitmask immediate. */
4012 bool
4013 aarch64_and_bitmask_imm (unsigned HOST_WIDE_INT val_in, machine_mode mode)
4015 if (aarch64_bitmask_imm (val_in, mode))
4016 return false;
4018 if (aarch64_move_imm (val_in, mode))
4019 return false;
4021 unsigned HOST_WIDE_INT imm2 = aarch64_and_split_imm2 (val_in);
4023 return aarch64_bitmask_imm (imm2, mode);
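/* Illustrative example: val_in = 0x000fffcf (DImode) is neither a valid
   bitmask immediate nor a MOV immediate.  aarch64_and_split_imm1 gives
   0x000fffff (the contiguous run covering bits 0..19) and
   aarch64_and_split_imm2 gives 0xffffffffffffffcf (all ones except bits
   4 and 5); both are valid bitmask immediates and their intersection is
   val_in again, so the AND can be done as two AND-immediate
   instructions.  */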
4026 /* Return true if val is an immediate that can be loaded into a
4027 register in a single instruction. */
4028 bool
4029 aarch64_move_imm (HOST_WIDE_INT val, machine_mode mode)
4031 if (aarch64_movw_imm (val, mode) || aarch64_movw_imm (~val, mode))
4032 return 1;
4033 return aarch64_bitmask_imm (val, mode);
4036 static bool
4037 aarch64_cannot_force_const_mem (machine_mode mode ATTRIBUTE_UNUSED, rtx x)
4039 rtx base, offset;
4041 if (GET_CODE (x) == HIGH)
4042 return true;
4044 split_const (x, &base, &offset);
4045 if (GET_CODE (base) == SYMBOL_REF || GET_CODE (base) == LABEL_REF)
4047 if (aarch64_classify_symbol (base, offset)
4048 != SYMBOL_FORCE_TO_MEM)
4049 return true;
4050 else
4051 /* Avoid generating a 64-bit relocation in ILP32; leave it
4052 to aarch64_expand_mov_immediate to handle it properly. */
4053 return mode != ptr_mode;
4056 return aarch64_tls_referenced_p (x);
4059 /* Implement TARGET_CASE_VALUES_THRESHOLD.
4060 The expansion for a table switch is quite expensive due to the number
4061 of instructions, the table lookup and the hard-to-predict indirect jump.
4062 When optimizing for speed at -O3 and above, use the per-core tuning if it
4063 is set; otherwise use tables for more than 16 cases as a tradeoff between
4064 size and performance. When optimizing for size, use the default setting. */
4066 static unsigned int
4067 aarch64_case_values_threshold (void)
4069 /* Use the specified limit for the number of cases before using jump
4070 tables at higher optimization levels. */
4071 if (optimize > 2
4072 && selected_cpu->tune->max_case_values != 0)
4073 return selected_cpu->tune->max_case_values;
4074 else
4075 return optimize_size ? default_case_values_threshold () : 17;
4078 /* Return true if register REGNO is a valid index register.
4079 STRICT_P is true if REG_OK_STRICT is in effect. */
4081 bool
4082 aarch64_regno_ok_for_index_p (int regno, bool strict_p)
4084 if (!HARD_REGISTER_NUM_P (regno))
4086 if (!strict_p)
4087 return true;
4089 if (!reg_renumber)
4090 return false;
4092 regno = reg_renumber[regno];
4094 return GP_REGNUM_P (regno);
4097 /* Return true if register REGNO is a valid base register for mode MODE.
4098 STRICT_P is true if REG_OK_STRICT is in effect. */
4100 bool
4101 aarch64_regno_ok_for_base_p (int regno, bool strict_p)
4103 if (!HARD_REGISTER_NUM_P (regno))
4105 if (!strict_p)
4106 return true;
4108 if (!reg_renumber)
4109 return false;
4111 regno = reg_renumber[regno];
4114 /* The fake registers will be eliminated to either the stack or
4115 hard frame pointer, both of which are usually valid base registers.
4116 Reload deals with the cases where the eliminated form isn't valid. */
4117 return (GP_REGNUM_P (regno)
4118 || regno == SP_REGNUM
4119 || regno == FRAME_POINTER_REGNUM
4120 || regno == ARG_POINTER_REGNUM);
4123 /* Return true if X is a valid base register for mode MODE.
4124 STRICT_P is true if REG_OK_STRICT is in effect. */
4126 static bool
4127 aarch64_base_register_rtx_p (rtx x, bool strict_p)
4129 if (!strict_p && GET_CODE (x) == SUBREG)
4130 x = SUBREG_REG (x);
4132 return (REG_P (x) && aarch64_regno_ok_for_base_p (REGNO (x), strict_p));
4135 /* Return true if address offset is a valid index. If it is, fill in INFO
4136 appropriately. STRICT_P is true if REG_OK_STRICT is in effect. */
4138 static bool
4139 aarch64_classify_index (struct aarch64_address_info *info, rtx x,
4140 machine_mode mode, bool strict_p)
4142 enum aarch64_address_type type;
4143 rtx index;
4144 int shift;
4146 /* (reg:P) */
4147 if ((REG_P (x) || GET_CODE (x) == SUBREG)
4148 && GET_MODE (x) == Pmode)
4150 type = ADDRESS_REG_REG;
4151 index = x;
4152 shift = 0;
4154 /* (sign_extend:DI (reg:SI)) */
4155 else if ((GET_CODE (x) == SIGN_EXTEND
4156 || GET_CODE (x) == ZERO_EXTEND)
4157 && GET_MODE (x) == DImode
4158 && GET_MODE (XEXP (x, 0)) == SImode)
4160 type = (GET_CODE (x) == SIGN_EXTEND)
4161 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4162 index = XEXP (x, 0);
4163 shift = 0;
4165 /* (mult:DI (sign_extend:DI (reg:SI)) (const_int scale)) */
4166 else if (GET_CODE (x) == MULT
4167 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4168 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4169 && GET_MODE (XEXP (x, 0)) == DImode
4170 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4171 && CONST_INT_P (XEXP (x, 1)))
4173 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4174 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4175 index = XEXP (XEXP (x, 0), 0);
4176 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4178 /* (ashift:DI (sign_extend:DI (reg:SI)) (const_int shift)) */
4179 else if (GET_CODE (x) == ASHIFT
4180 && (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND
4181 || GET_CODE (XEXP (x, 0)) == ZERO_EXTEND)
4182 && GET_MODE (XEXP (x, 0)) == DImode
4183 && GET_MODE (XEXP (XEXP (x, 0), 0)) == SImode
4184 && CONST_INT_P (XEXP (x, 1)))
4186 type = (GET_CODE (XEXP (x, 0)) == SIGN_EXTEND)
4187 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4188 index = XEXP (XEXP (x, 0), 0);
4189 shift = INTVAL (XEXP (x, 1));
4191 /* (sign_extract:DI (mult:DI (reg:DI) (const_int scale)) 32+shift 0) */
4192 else if ((GET_CODE (x) == SIGN_EXTRACT
4193 || GET_CODE (x) == ZERO_EXTRACT)
4194 && GET_MODE (x) == DImode
4195 && GET_CODE (XEXP (x, 0)) == MULT
4196 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4197 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4199 type = (GET_CODE (x) == SIGN_EXTRACT)
4200 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4201 index = XEXP (XEXP (x, 0), 0);
4202 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4203 if (INTVAL (XEXP (x, 1)) != 32 + shift
4204 || INTVAL (XEXP (x, 2)) != 0)
4205 shift = -1;
4207 /* (and:DI (mult:DI (reg:DI) (const_int scale))
4208 (const_int 0xffffffff<<shift)) */
4209 else if (GET_CODE (x) == AND
4210 && GET_MODE (x) == DImode
4211 && GET_CODE (XEXP (x, 0)) == MULT
4212 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4213 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4214 && CONST_INT_P (XEXP (x, 1)))
4216 type = ADDRESS_REG_UXTW;
4217 index = XEXP (XEXP (x, 0), 0);
4218 shift = exact_log2 (INTVAL (XEXP (XEXP (x, 0), 1)));
4219 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4220 shift = -1;
4222 /* (sign_extract:DI (ashift:DI (reg:DI) (const_int shift)) 32+shift 0) */
4223 else if ((GET_CODE (x) == SIGN_EXTRACT
4224 || GET_CODE (x) == ZERO_EXTRACT)
4225 && GET_MODE (x) == DImode
4226 && GET_CODE (XEXP (x, 0)) == ASHIFT
4227 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4228 && CONST_INT_P (XEXP (XEXP (x, 0), 1)))
4230 type = (GET_CODE (x) == SIGN_EXTRACT)
4231 ? ADDRESS_REG_SXTW : ADDRESS_REG_UXTW;
4232 index = XEXP (XEXP (x, 0), 0);
4233 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4234 if (INTVAL (XEXP (x, 1)) != 32 + shift
4235 || INTVAL (XEXP (x, 2)) != 0)
4236 shift = -1;
4238 /* (and:DI (ashift:DI (reg:DI) (const_int shift))
4239 (const_int 0xffffffff<<shift)) */
4240 else if (GET_CODE (x) == AND
4241 && GET_MODE (x) == DImode
4242 && GET_CODE (XEXP (x, 0)) == ASHIFT
4243 && GET_MODE (XEXP (XEXP (x, 0), 0)) == DImode
4244 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
4245 && CONST_INT_P (XEXP (x, 1)))
4247 type = ADDRESS_REG_UXTW;
4248 index = XEXP (XEXP (x, 0), 0);
4249 shift = INTVAL (XEXP (XEXP (x, 0), 1));
4250 if (INTVAL (XEXP (x, 1)) != (HOST_WIDE_INT)0xffffffff << shift)
4251 shift = -1;
4253 /* (mult:P (reg:P) (const_int scale)) */
4254 else if (GET_CODE (x) == MULT
4255 && GET_MODE (x) == Pmode
4256 && GET_MODE (XEXP (x, 0)) == Pmode
4257 && CONST_INT_P (XEXP (x, 1)))
4259 type = ADDRESS_REG_REG;
4260 index = XEXP (x, 0);
4261 shift = exact_log2 (INTVAL (XEXP (x, 1)));
4263 /* (ashift:P (reg:P) (const_int shift)) */
4264 else if (GET_CODE (x) == ASHIFT
4265 && GET_MODE (x) == Pmode
4266 && GET_MODE (XEXP (x, 0)) == Pmode
4267 && CONST_INT_P (XEXP (x, 1)))
4269 type = ADDRESS_REG_REG;
4270 index = XEXP (x, 0);
4271 shift = INTVAL (XEXP (x, 1));
4273 else
4274 return false;
4276 if (GET_CODE (index) == SUBREG)
4277 index = SUBREG_REG (index);
4279 if ((shift == 0 ||
4280 (shift > 0 && shift <= 3
4281 && (1 << shift) == GET_MODE_SIZE (mode)))
4282 && REG_P (index)
4283 && aarch64_regno_ok_for_index_p (REGNO (index), strict_p))
4285 info->type = type;
4286 info->offset = index;
4287 info->shift = shift;
4288 return true;
4291 return false;
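/* For example (illustrative only), an index such as

     (mult:DI (sign_extend:DI (reg:SI x1)) (const_int 4))

   used with an SImode access is classified as ADDRESS_REG_SXTW with
   shift 2, i.e. an address of the form [x0, w1, sxtw 2].  */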
4294 /* Return true if MODE is one of the modes for which we
4295 support LDP/STP operations. */
4297 static bool
4298 aarch64_mode_valid_for_sched_fusion_p (machine_mode mode)
4300 return mode == SImode || mode == DImode
4301 || mode == SFmode || mode == DFmode
4302 || (aarch64_vector_mode_supported_p (mode)
4303 && GET_MODE_SIZE (mode) == 8);
4306 /* Return true if REGNO is a virtual pointer register, or an eliminable
4307 "soft" frame register. Like REGNO_PTR_FRAME_P except that we don't
4308 include stack_pointer or hard_frame_pointer. */
4309 static bool
4310 virt_or_elim_regno_p (unsigned regno)
4312 return ((regno >= FIRST_VIRTUAL_REGISTER
4313 && regno <= LAST_VIRTUAL_POINTER_REGISTER)
4314 || regno == FRAME_POINTER_REGNUM
4315 || regno == ARG_POINTER_REGNUM);
4318 /* Return true if X is a valid address for machine mode MODE. If it is,
4319 fill in INFO appropriately. STRICT_P is true if REG_OK_STRICT is in
4320 effect. OUTER_CODE is PARALLEL for a load/store pair. */
4322 static bool
4323 aarch64_classify_address (struct aarch64_address_info *info,
4324 rtx x, machine_mode mode,
4325 RTX_CODE outer_code, bool strict_p)
4327 enum rtx_code code = GET_CODE (x);
4328 rtx op0, op1;
4330 /* On BE, we use load/store pair for all large int mode load/stores.
4331 TI/TFmode may also use a load/store pair. */
4332 bool load_store_pair_p = (outer_code == PARALLEL
4333 || mode == TImode
4334 || mode == TFmode
4335 || (BYTES_BIG_ENDIAN
4336 && aarch64_vect_struct_mode_p (mode)));
4338 bool allow_reg_index_p =
4339 !load_store_pair_p
4340 && (GET_MODE_SIZE (mode) != 16 || aarch64_vector_mode_supported_p (mode))
4341 && !aarch64_vect_struct_mode_p (mode);
4343 /* On LE, for AdvSIMD, don't support anything other than POST_INC or
4344 REG addressing. */
4345 if (aarch64_vect_struct_mode_p (mode) && !BYTES_BIG_ENDIAN
4346 && (code != POST_INC && code != REG))
4347 return false;
4349 switch (code)
4351 case REG:
4352 case SUBREG:
4353 info->type = ADDRESS_REG_IMM;
4354 info->base = x;
4355 info->offset = const0_rtx;
4356 return aarch64_base_register_rtx_p (x, strict_p);
4358 case PLUS:
4359 op0 = XEXP (x, 0);
4360 op1 = XEXP (x, 1);
4362 if (! strict_p
4363 && REG_P (op0)
4364 && virt_or_elim_regno_p (REGNO (op0))
4365 && CONST_INT_P (op1))
4367 info->type = ADDRESS_REG_IMM;
4368 info->base = op0;
4369 info->offset = op1;
4371 return true;
4374 if (GET_MODE_SIZE (mode) != 0
4375 && CONST_INT_P (op1)
4376 && aarch64_base_register_rtx_p (op0, strict_p))
4378 HOST_WIDE_INT offset = INTVAL (op1);
4380 info->type = ADDRESS_REG_IMM;
4381 info->base = op0;
4382 info->offset = op1;
4384 /* TImode and TFmode values are allowed in both pairs of X
4385 registers and individual Q registers. The available
4386 address modes are:
4387 X,X: 7-bit signed scaled offset
4388 Q: 9-bit signed offset
4389 We conservatively require an offset representable in either mode.
4390 When performing the check for pairs of X registers i.e. LDP/STP
4391 pass down DImode since that is the natural size of the LDP/STP
4392 instruction memory accesses. */
4393 if (mode == TImode || mode == TFmode)
4394 return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
4395 && (offset_9bit_signed_unscaled_p (mode, offset)
4396 || offset_12bit_unsigned_scaled_p (mode, offset)));
4398 /* A 7-bit offset check because OImode will emit an ldp/stp
4399 instruction (only big endian will get here).
4400 For ldp/stp instructions, the offset is scaled for the size of a
4401 single element of the pair. */
4402 if (mode == OImode)
4403 return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
4405 /* Three 9/12-bit offset checks because CImode will emit three
4406 ldr/str instructions (only big endian will get here). */
4407 if (mode == CImode)
4408 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4409 && (offset_9bit_signed_unscaled_p (V16QImode, offset + 32)
4410 || offset_12bit_unsigned_scaled_p (V16QImode,
4411 offset + 32)));
4413 /* Two 7-bit offset checks because XImode will emit two ldp/stp
4414 instructions (only big endian will get here). */
4415 if (mode == XImode)
4416 return (aarch64_offset_7bit_signed_scaled_p (TImode, offset)
4417 && aarch64_offset_7bit_signed_scaled_p (TImode,
4418 offset + 32));
4420 if (load_store_pair_p)
4421 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4422 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4423 else
4424 return (offset_9bit_signed_unscaled_p (mode, offset)
4425 || offset_12bit_unsigned_scaled_p (mode, offset));
4428 if (allow_reg_index_p)
4430 /* Look for base + (scaled/extended) index register. */
4431 if (aarch64_base_register_rtx_p (op0, strict_p)
4432 && aarch64_classify_index (info, op1, mode, strict_p))
4434 info->base = op0;
4435 return true;
4437 if (aarch64_base_register_rtx_p (op1, strict_p)
4438 && aarch64_classify_index (info, op0, mode, strict_p))
4440 info->base = op1;
4441 return true;
4445 return false;
4447 case POST_INC:
4448 case POST_DEC:
4449 case PRE_INC:
4450 case PRE_DEC:
4451 info->type = ADDRESS_REG_WB;
4452 info->base = XEXP (x, 0);
4453 info->offset = NULL_RTX;
4454 return aarch64_base_register_rtx_p (info->base, strict_p);
4456 case POST_MODIFY:
4457 case PRE_MODIFY:
4458 info->type = ADDRESS_REG_WB;
4459 info->base = XEXP (x, 0);
4460 if (GET_CODE (XEXP (x, 1)) == PLUS
4461 && CONST_INT_P (XEXP (XEXP (x, 1), 1))
4462 && rtx_equal_p (XEXP (XEXP (x, 1), 0), info->base)
4463 && aarch64_base_register_rtx_p (info->base, strict_p))
4465 HOST_WIDE_INT offset;
4466 info->offset = XEXP (XEXP (x, 1), 1);
4467 offset = INTVAL (info->offset);
4469 /* TImode and TFmode values are allowed in both pairs of X
4470 registers and individual Q registers. The available
4471 address modes are:
4472 X,X: 7-bit signed scaled offset
4473 Q: 9-bit signed offset
4474 We conservatively require an offset representable in either mode.
4476 if (mode == TImode || mode == TFmode)
4477 return (aarch64_offset_7bit_signed_scaled_p (mode, offset)
4478 && offset_9bit_signed_unscaled_p (mode, offset));
4480 if (load_store_pair_p)
4481 return ((GET_MODE_SIZE (mode) == 4 || GET_MODE_SIZE (mode) == 8)
4482 && aarch64_offset_7bit_signed_scaled_p (mode, offset));
4483 else
4484 return offset_9bit_signed_unscaled_p (mode, offset);
4486 return false;
4488 case CONST:
4489 case SYMBOL_REF:
4490 case LABEL_REF:
4491 /* load literal: pc-relative constant pool entry. Only supported
4492 for SI mode or larger. */
4493 info->type = ADDRESS_SYMBOLIC;
4495 if (!load_store_pair_p && GET_MODE_SIZE (mode) >= 4)
4497 rtx sym, addend;
4499 split_const (x, &sym, &addend);
4500 return ((GET_CODE (sym) == LABEL_REF
4501 || (GET_CODE (sym) == SYMBOL_REF
4502 && CONSTANT_POOL_ADDRESS_P (sym)
4503 && aarch64_pcrelative_literal_loads)));
4505 return false;
4507 case LO_SUM:
4508 info->type = ADDRESS_LO_SUM;
4509 info->base = XEXP (x, 0);
4510 info->offset = XEXP (x, 1);
4511 if (allow_reg_index_p
4512 && aarch64_base_register_rtx_p (info->base, strict_p))
4514 rtx sym, offs;
4515 split_const (info->offset, &sym, &offs);
4516 if (GET_CODE (sym) == SYMBOL_REF
4517 && (aarch64_classify_symbol (sym, offs) == SYMBOL_SMALL_ABSOLUTE))
4519 /* The symbol and offset must be aligned to the access size. */
4520 unsigned int align;
4521 unsigned int ref_size;
4523 if (CONSTANT_POOL_ADDRESS_P (sym))
4524 align = GET_MODE_ALIGNMENT (get_pool_mode (sym));
4525 else if (TREE_CONSTANT_POOL_ADDRESS_P (sym))
4527 tree exp = SYMBOL_REF_DECL (sym);
4528 align = TYPE_ALIGN (TREE_TYPE (exp));
4529 align = CONSTANT_ALIGNMENT (exp, align);
4531 else if (SYMBOL_REF_DECL (sym))
4532 align = DECL_ALIGN (SYMBOL_REF_DECL (sym));
4533 else if (SYMBOL_REF_HAS_BLOCK_INFO_P (sym)
4534 && SYMBOL_REF_BLOCK (sym) != NULL)
4535 align = SYMBOL_REF_BLOCK (sym)->alignment;
4536 else
4537 align = BITS_PER_UNIT;
4539 ref_size = GET_MODE_SIZE (mode);
4540 if (ref_size == 0)
4541 ref_size = GET_MODE_SIZE (DImode);
4543 return ((INTVAL (offs) & (ref_size - 1)) == 0
4544 && ((align / BITS_PER_UNIT) & (ref_size - 1)) == 0);
4547 return false;
4549 default:
4550 return false;
4554 bool
4555 aarch64_symbolic_address_p (rtx x)
4557 rtx offset;
4559 split_const (x, &x, &offset);
4560 return GET_CODE (x) == SYMBOL_REF || GET_CODE (x) == LABEL_REF;
4563 /* Classify the base of symbolic expression X. */
4565 enum aarch64_symbol_type
4566 aarch64_classify_symbolic_expression (rtx x)
4568 rtx offset;
4570 split_const (x, &x, &offset);
4571 return aarch64_classify_symbol (x, offset);
4575 /* Return TRUE if X is a legitimate address for accessing memory in
4576 mode MODE. */
4577 static bool
4578 aarch64_legitimate_address_hook_p (machine_mode mode, rtx x, bool strict_p)
4580 struct aarch64_address_info addr;
4582 return aarch64_classify_address (&addr, x, mode, MEM, strict_p);
4585 /* Return TRUE if X is a legitimate address for accessing memory in
4586 mode MODE. OUTER_CODE will be PARALLEL if this is a load/store
4587 pair operation. */
4588 bool
4589 aarch64_legitimate_address_p (machine_mode mode, rtx x,
4590 RTX_CODE outer_code, bool strict_p)
4592 struct aarch64_address_info addr;
4594 return aarch64_classify_address (&addr, x, mode, outer_code, strict_p);
4597 /* Split an out-of-range address displacement into a base and offset.
4598 Use 4KB range for 1- and 2-byte accesses and a 16KB range otherwise
4599 to increase opportunities for sharing the base address between accesses of different sizes.
4600 For unaligned accesses and TI/TF mode use the signed 9-bit range. */
4601 static bool
4602 aarch64_legitimize_address_displacement (rtx *disp, rtx *off, machine_mode mode)
4604 HOST_WIDE_INT offset = INTVAL (*disp);
4605 HOST_WIDE_INT base = offset & ~(GET_MODE_SIZE (mode) < 4 ? 0xfff : 0x3ffc);
4607 if (mode == TImode || mode == TFmode
4608 || (offset & (GET_MODE_SIZE (mode) - 1)) != 0)
4609 base = (offset + 0x100) & ~0x1ff;
4611 *off = GEN_INT (base);
4612 *disp = GEN_INT (offset - base);
4613 return true;
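/* Worked example (illustrative): an SImode displacement of 0x4004 is
   split into *off = 0x4000 and *disp = 4, so the base adjustment can be
   shared between neighbouring accesses while the remaining offset of 4
   fits the scaled 12-bit immediate form.  */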
4616 /* Return TRUE if rtx X is immediate constant 0.0 */
4617 bool
4618 aarch64_float_const_zero_rtx_p (rtx x)
4620 if (GET_MODE (x) == VOIDmode)
4621 return false;
4623 if (REAL_VALUE_MINUS_ZERO (*CONST_DOUBLE_REAL_VALUE (x)))
4624 return !HONOR_SIGNED_ZEROS (GET_MODE (x));
4625 return real_equal (CONST_DOUBLE_REAL_VALUE (x), &dconst0);
4628 /* Return the fixed registers used for condition codes. */
4630 static bool
4631 aarch64_fixed_condition_code_regs (unsigned int *p1, unsigned int *p2)
4633 *p1 = CC_REGNUM;
4634 *p2 = INVALID_REGNUM;
4635 return true;
4638 /* Emit call insn with PAT and do aarch64-specific handling. */
4640 void
4641 aarch64_emit_call_insn (rtx pat)
4643 rtx insn = emit_call_insn (pat);
4645 rtx *fusage = &CALL_INSN_FUNCTION_USAGE (insn);
4646 clobber_reg (fusage, gen_rtx_REG (word_mode, IP0_REGNUM));
4647 clobber_reg (fusage, gen_rtx_REG (word_mode, IP1_REGNUM));
4650 machine_mode
4651 aarch64_select_cc_mode (RTX_CODE code, rtx x, rtx y)
4653 /* All floating point compares return CCFP if it is an equality
4654 comparison, and CCFPE otherwise. */
4655 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_FLOAT)
4657 switch (code)
4659 case EQ:
4660 case NE:
4661 case UNORDERED:
4662 case ORDERED:
4663 case UNLT:
4664 case UNLE:
4665 case UNGT:
4666 case UNGE:
4667 case UNEQ:
4668 case LTGT:
4669 return CCFPmode;
4671 case LT:
4672 case LE:
4673 case GT:
4674 case GE:
4675 return CCFPEmode;
4677 default:
4678 gcc_unreachable ();
4682 /* Equality comparisons of short modes against zero can be performed
4683 using the TST instruction with the appropriate bitmask. */
4684 if (y == const0_rtx && REG_P (x)
4685 && (code == EQ || code == NE)
4686 && (GET_MODE (x) == HImode || GET_MODE (x) == QImode))
4687 return CC_NZmode;
4689 /* Similarly, comparisons of zero_extends from shorter modes can
4690 be performed using an ANDS with an immediate mask. */
4691 if (y == const0_rtx && GET_CODE (x) == ZERO_EXTEND
4692 && (GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4693 && (GET_MODE (XEXP (x, 0)) == HImode || GET_MODE (XEXP (x, 0)) == QImode)
4694 && (code == EQ || code == NE))
4695 return CC_NZmode;
4697 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4698 && y == const0_rtx
4699 && (code == EQ || code == NE || code == LT || code == GE)
4700 && (GET_CODE (x) == PLUS || GET_CODE (x) == MINUS || GET_CODE (x) == AND
4701 || GET_CODE (x) == NEG
4702 || (GET_CODE (x) == ZERO_EXTRACT && CONST_INT_P (XEXP (x, 1))
4703 && CONST_INT_P (XEXP (x, 2)))))
4704 return CC_NZmode;
4706 /* A compare with a shifted operand. Because of canonicalization,
4707 the comparison will have to be swapped when we emit the assembly
4708 code. */
4709 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4710 && (REG_P (y) || GET_CODE (y) == SUBREG)
4711 && (GET_CODE (x) == ASHIFT || GET_CODE (x) == ASHIFTRT
4712 || GET_CODE (x) == LSHIFTRT
4713 || GET_CODE (x) == ZERO_EXTEND || GET_CODE (x) == SIGN_EXTEND))
4714 return CC_SWPmode;
4716 /* Similarly for a negated operand, but we can only do this for
4717 equalities. */
4718 if ((GET_MODE (x) == SImode || GET_MODE (x) == DImode)
4719 && (REG_P (y) || GET_CODE (y) == SUBREG)
4720 && (code == EQ || code == NE)
4721 && GET_CODE (x) == NEG)
4722 return CC_Zmode;
4724 /* A test for unsigned overflow. */
4725 if ((GET_MODE (x) == DImode || GET_MODE (x) == TImode)
4726 && code == NE
4727 && GET_CODE (x) == PLUS
4728 && GET_CODE (y) == ZERO_EXTEND)
4729 return CC_Cmode;
4731 /* For everything else, return CCmode. */
4732 return CCmode;
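/* For instance (illustrative only), a comparison such as

     (compare (ashift:SI (reg:SI x1) (const_int 2)) (reg:SI x0))

   selects CC_SWPmode: the instruction is output with the operands in
   the other order, e.g. "cmp w0, w1, lsl #2", so the condition has to
   be read swapped (see the CC_SWPmode case of
   aarch64_get_condition_code_1 below).  */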
4735 static int
4736 aarch64_get_condition_code_1 (enum machine_mode, enum rtx_code);
4738 int
4739 aarch64_get_condition_code (rtx x)
4741 machine_mode mode = GET_MODE (XEXP (x, 0));
4742 enum rtx_code comp_code = GET_CODE (x);
4744 if (GET_MODE_CLASS (mode) != MODE_CC)
4745 mode = SELECT_CC_MODE (comp_code, XEXP (x, 0), XEXP (x, 1));
4746 return aarch64_get_condition_code_1 (mode, comp_code);
4749 static int
4750 aarch64_get_condition_code_1 (enum machine_mode mode, enum rtx_code comp_code)
4752 switch (mode)
4754 case CCFPmode:
4755 case CCFPEmode:
4756 switch (comp_code)
4758 case GE: return AARCH64_GE;
4759 case GT: return AARCH64_GT;
4760 case LE: return AARCH64_LS;
4761 case LT: return AARCH64_MI;
4762 case NE: return AARCH64_NE;
4763 case EQ: return AARCH64_EQ;
4764 case ORDERED: return AARCH64_VC;
4765 case UNORDERED: return AARCH64_VS;
4766 case UNLT: return AARCH64_LT;
4767 case UNLE: return AARCH64_LE;
4768 case UNGT: return AARCH64_HI;
4769 case UNGE: return AARCH64_PL;
4770 default: return -1;
4772 break;
4774 case CCmode:
4775 switch (comp_code)
4777 case NE: return AARCH64_NE;
4778 case EQ: return AARCH64_EQ;
4779 case GE: return AARCH64_GE;
4780 case GT: return AARCH64_GT;
4781 case LE: return AARCH64_LE;
4782 case LT: return AARCH64_LT;
4783 case GEU: return AARCH64_CS;
4784 case GTU: return AARCH64_HI;
4785 case LEU: return AARCH64_LS;
4786 case LTU: return AARCH64_CC;
4787 default: return -1;
4789 break;
4791 case CC_SWPmode:
4792 switch (comp_code)
4794 case NE: return AARCH64_NE;
4795 case EQ: return AARCH64_EQ;
4796 case GE: return AARCH64_LE;
4797 case GT: return AARCH64_LT;
4798 case LE: return AARCH64_GE;
4799 case LT: return AARCH64_GT;
4800 case GEU: return AARCH64_LS;
4801 case GTU: return AARCH64_CC;
4802 case LEU: return AARCH64_CS;
4803 case LTU: return AARCH64_HI;
4804 default: return -1;
4806 break;
4808 case CC_NZmode:
4809 switch (comp_code)
4811 case NE: return AARCH64_NE;
4812 case EQ: return AARCH64_EQ;
4813 case GE: return AARCH64_PL;
4814 case LT: return AARCH64_MI;
4815 default: return -1;
4817 break;
4819 case CC_Zmode:
4820 switch (comp_code)
4822 case NE: return AARCH64_NE;
4823 case EQ: return AARCH64_EQ;
4824 default: return -1;
4826 break;
4828 case CC_Cmode:
4829 switch (comp_code)
4831 case NE: return AARCH64_CS;
4832 case EQ: return AARCH64_CC;
4833 default: return -1;
4835 break;
4837 default:
4838 return -1;
4841 return -1;
4844 bool
4845 aarch64_const_vec_all_same_in_range_p (rtx x,
4846 HOST_WIDE_INT minval,
4847 HOST_WIDE_INT maxval)
4849 HOST_WIDE_INT firstval;
4850 int count, i;
4852 if (GET_CODE (x) != CONST_VECTOR
4853 || GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_INT)
4854 return false;
4856 firstval = INTVAL (CONST_VECTOR_ELT (x, 0));
4857 if (firstval < minval || firstval > maxval)
4858 return false;
4860 count = CONST_VECTOR_NUNITS (x);
4861 for (i = 1; i < count; i++)
4862 if (INTVAL (CONST_VECTOR_ELT (x, i)) != firstval)
4863 return false;
4865 return true;
4868 bool
4869 aarch64_const_vec_all_same_int_p (rtx x, HOST_WIDE_INT val)
4871 return aarch64_const_vec_all_same_in_range_p (x, val, val);
4875 /* N Z C V. */
4876 #define AARCH64_CC_V 1
4877 #define AARCH64_CC_C (1 << 1)
4878 #define AARCH64_CC_Z (1 << 2)
4879 #define AARCH64_CC_N (1 << 3)
4881 /* N Z C V flags for ccmp. Indexed by AARCH64_COND_CODE. */
4882 static const int aarch64_nzcv_codes[] =
4884 0, /* EQ, Z == 1. */
4885 AARCH64_CC_Z, /* NE, Z == 0. */
4886 0, /* CS, C == 1. */
4887 AARCH64_CC_C, /* CC, C == 0. */
4888 0, /* MI, N == 1. */
4889 AARCH64_CC_N, /* PL, N == 0. */
4890 0, /* VS, V == 1. */
4891 AARCH64_CC_V, /* VC, V == 0. */
4892 0, /* HI, C == 1 && Z == 0. */
4893 AARCH64_CC_C, /* LS, !(C == 1 && Z == 0). */
4894 AARCH64_CC_V, /* GE, N == V. */
4895 0, /* LT, N != V. */
4896 AARCH64_CC_Z, /* GT, Z == 0 && N == V. */
4897 0, /* LE, !(Z == 0 && N == V). */
4898 0, /* AL, Any. */
4899 0 /* NV, Any. */
4902 static void
4903 aarch64_print_operand (FILE *f, rtx x, int code)
4905 switch (code)
4907 /* An integer or symbol address without a preceding # sign. */
4908 case 'c':
4909 switch (GET_CODE (x))
4911 case CONST_INT:
4912 fprintf (f, HOST_WIDE_INT_PRINT_DEC, INTVAL (x));
4913 break;
4915 case SYMBOL_REF:
4916 output_addr_const (f, x);
4917 break;
4919 case CONST:
4920 if (GET_CODE (XEXP (x, 0)) == PLUS
4921 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF)
4923 output_addr_const (f, x);
4924 break;
4926 /* Fall through. */
4928 default:
4929 output_operand_lossage ("Unsupported operand for code '%c'", code);
4931 break;
4933 case 'e':
4934 /* Print the sign/zero-extend size as a character 8->b, 16->h, 32->w. */
4936 int n;
4938 if (!CONST_INT_P (x)
4939 || (n = exact_log2 (INTVAL (x) & ~7)) <= 0)
4941 output_operand_lossage ("invalid operand for '%%%c'", code);
4942 return;
4945 switch (n)
4947 case 3:
4948 fputc ('b', f);
4949 break;
4950 case 4:
4951 fputc ('h', f);
4952 break;
4953 case 5:
4954 fputc ('w', f);
4955 break;
4956 default:
4957 output_operand_lossage ("invalid operand for '%%%c'", code);
4958 return;
4961 break;
4963 case 'p':
4965 int n;
4967 /* Print N such that 2^N == X. */
4968 if (!CONST_INT_P (x) || (n = exact_log2 (INTVAL (x))) < 0)
4970 output_operand_lossage ("invalid operand for '%%%c'", code);
4971 return;
4974 asm_fprintf (f, "%d", n);
4976 break;
4978 case 'P':
4979 /* Print the number of non-zero bits in X (a const_int). */
4980 if (!CONST_INT_P (x))
4982 output_operand_lossage ("invalid operand for '%%%c'", code);
4983 return;
4986 asm_fprintf (f, "%u", popcount_hwi (INTVAL (x)));
4987 break;
4989 case 'H':
4990 /* Print the higher numbered register of a pair (TImode) of regs. */
4991 if (!REG_P (x) || !GP_REGNUM_P (REGNO (x) + 1))
4993 output_operand_lossage ("invalid operand for '%%%c'", code);
4994 return;
4997 asm_fprintf (f, "%s", reg_names [REGNO (x) + 1]);
4998 break;
5000 case 'M':
5001 case 'm':
5003 int cond_code;
5004 /* Print a condition (eq, ne, etc) or its inverse. */
5006 /* CONST_TRUE_RTX means al/nv (al is the default, don't print it). */
5007 if (x == const_true_rtx)
5009 if (code == 'M')
5010 fputs ("nv", f);
5011 return;
5014 if (!COMPARISON_P (x))
5016 output_operand_lossage ("invalid operand for '%%%c'", code);
5017 return;
5020 cond_code = aarch64_get_condition_code (x);
5021 gcc_assert (cond_code >= 0);
5022 if (code == 'M')
5023 cond_code = AARCH64_INVERSE_CONDITION_CODE (cond_code);
5024 fputs (aarch64_condition_codes[cond_code], f);
5026 break;
5028 case 'b':
5029 case 'h':
5030 case 's':
5031 case 'd':
5032 case 'q':
5033 /* Print a scalar FP/SIMD register name. */
5034 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5036 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5037 return;
5039 asm_fprintf (f, "%c%d", code, REGNO (x) - V0_REGNUM);
5040 break;
5042 case 'S':
5043 case 'T':
5044 case 'U':
5045 case 'V':
5046 /* Print the first FP/SIMD register name in a list. */
5047 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5049 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5050 return;
5052 asm_fprintf (f, "v%d", REGNO (x) - V0_REGNUM + (code - 'S'));
5053 break;
5055 case 'R':
5056 /* Print a scalar FP/SIMD register name + 1. */
5057 if (!REG_P (x) || !FP_REGNUM_P (REGNO (x)))
5059 output_operand_lossage ("incompatible floating point / vector register operand for '%%%c'", code);
5060 return;
5062 asm_fprintf (f, "q%d", REGNO (x) - V0_REGNUM + 1);
5063 break;
5065 case 'X':
5066 /* Print bottom 16 bits of integer constant in hex. */
5067 if (!CONST_INT_P (x))
5069 output_operand_lossage ("invalid operand for '%%%c'", code);
5070 return;
5072 asm_fprintf (f, "0x%wx", UINTVAL (x) & 0xffff);
5073 break;
5075 case 'w':
5076 case 'x':
5077 /* Print a general register name or the zero register (32-bit or
5078 64-bit). */
5079 if (x == const0_rtx
5080 || (CONST_DOUBLE_P (x) && aarch64_float_const_zero_rtx_p (x)))
5082 asm_fprintf (f, "%czr", code);
5083 break;
5086 if (REG_P (x) && GP_REGNUM_P (REGNO (x)))
5088 asm_fprintf (f, "%c%d", code, REGNO (x) - R0_REGNUM);
5089 break;
5092 if (REG_P (x) && REGNO (x) == SP_REGNUM)
5094 asm_fprintf (f, "%ssp", code == 'w' ? "w" : "");
5095 break;
5098 /* Fall through */
5100 case 0:
5101 /* Print a normal operand; if it's a general register, then we
5102 assume DImode. */
5103 if (x == NULL)
5105 output_operand_lossage ("missing operand");
5106 return;
5109 switch (GET_CODE (x))
5111 case REG:
5112 asm_fprintf (f, "%s", reg_names [REGNO (x)]);
5113 break;
5115 case MEM:
5116 output_address (GET_MODE (x), XEXP (x, 0));
5117 break;
5119 case CONST:
5120 case LABEL_REF:
5121 case SYMBOL_REF:
5122 output_addr_const (asm_out_file, x);
5123 break;
5125 case CONST_INT:
5126 asm_fprintf (f, "%wd", INTVAL (x));
5127 break;
5129 case CONST_VECTOR:
5130 if (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_INT)
5132 gcc_assert (
5133 aarch64_const_vec_all_same_in_range_p (x,
5134 HOST_WIDE_INT_MIN,
5135 HOST_WIDE_INT_MAX));
5136 asm_fprintf (f, "%wd", INTVAL (CONST_VECTOR_ELT (x, 0)));
5138 else if (aarch64_simd_imm_zero_p (x, GET_MODE (x)))
5140 fputc ('0', f);
5142 else
5143 gcc_unreachable ();
5144 break;
5146 case CONST_DOUBLE:
5147 /* Since we define TARGET_SUPPORTS_WIDE_INT we shouldn't ever
5148 be getting CONST_DOUBLEs holding integers. */
5149 gcc_assert (GET_MODE (x) != VOIDmode);
5150 if (aarch64_float_const_zero_rtx_p (x))
5152 fputc ('0', f);
5153 break;
5155 else if (aarch64_float_const_representable_p (x))
5157 #define buf_size 20
5158 char float_buf[buf_size] = {'\0'};
5159 real_to_decimal_for_mode (float_buf,
5160 CONST_DOUBLE_REAL_VALUE (x),
5161 buf_size, buf_size,
5162 1, GET_MODE (x));
5163 asm_fprintf (asm_out_file, "%s", float_buf);
5164 break;
5165 #undef buf_size
5167 output_operand_lossage ("invalid constant");
5168 return;
5169 default:
5170 output_operand_lossage ("invalid operand");
5171 return;
5173 break;
5175 case 'A':
5176 if (GET_CODE (x) == HIGH)
5177 x = XEXP (x, 0);
5179 switch (aarch64_classify_symbolic_expression (x))
5181 case SYMBOL_SMALL_GOT_4G:
5182 asm_fprintf (asm_out_file, ":got:");
5183 break;
5185 case SYMBOL_SMALL_TLSGD:
5186 asm_fprintf (asm_out_file, ":tlsgd:");
5187 break;
5189 case SYMBOL_SMALL_TLSDESC:
5190 asm_fprintf (asm_out_file, ":tlsdesc:");
5191 break;
5193 case SYMBOL_SMALL_TLSIE:
5194 asm_fprintf (asm_out_file, ":gottprel:");
5195 break;
5197 case SYMBOL_TLSLE24:
5198 asm_fprintf (asm_out_file, ":tprel:");
5199 break;
5201 case SYMBOL_TINY_GOT:
5202 gcc_unreachable ();
5203 break;
5205 default:
5206 break;
5208 output_addr_const (asm_out_file, x);
5209 break;
5211 case 'L':
5212 switch (aarch64_classify_symbolic_expression (x))
5214 case SYMBOL_SMALL_GOT_4G:
5215 asm_fprintf (asm_out_file, ":lo12:");
5216 break;
5218 case SYMBOL_SMALL_TLSGD:
5219 asm_fprintf (asm_out_file, ":tlsgd_lo12:");
5220 break;
5222 case SYMBOL_SMALL_TLSDESC:
5223 asm_fprintf (asm_out_file, ":tlsdesc_lo12:");
5224 break;
5226 case SYMBOL_SMALL_TLSIE:
5227 asm_fprintf (asm_out_file, ":gottprel_lo12:");
5228 break;
5230 case SYMBOL_TLSLE12:
5231 asm_fprintf (asm_out_file, ":tprel_lo12:");
5232 break;
5234 case SYMBOL_TLSLE24:
5235 asm_fprintf (asm_out_file, ":tprel_lo12_nc:");
5236 break;
5238 case SYMBOL_TINY_GOT:
5239 asm_fprintf (asm_out_file, ":got:");
5240 break;
5242 case SYMBOL_TINY_TLSIE:
5243 asm_fprintf (asm_out_file, ":gottprel:");
5244 break;
5246 default:
5247 break;
5249 output_addr_const (asm_out_file, x);
5250 break;
5252 case 'G':
5254 switch (aarch64_classify_symbolic_expression (x))
5256 case SYMBOL_TLSLE24:
5257 asm_fprintf (asm_out_file, ":tprel_hi12:");
5258 break;
5259 default:
5260 break;
5262 output_addr_const (asm_out_file, x);
5263 break;
5265 case 'k':
5267 HOST_WIDE_INT cond_code;
5268 /* Print nzcv. */
5270 if (!CONST_INT_P (x))
5272 output_operand_lossage ("invalid operand for '%%%c'", code);
5273 return;
5276 cond_code = INTVAL (x);
5277 gcc_assert (cond_code >= 0 && cond_code <= AARCH64_NV);
5278 asm_fprintf (f, "%d", aarch64_nzcv_codes[cond_code]);
5280 break;
5282 default:
5283 output_operand_lossage ("invalid operand prefix '%%%c'", code);
5284 return;
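/* Usage sketch (illustrative): in machine-description templates these
   modifiers appear as e.g. "%w0" or "%x0".  For an operand holding
   (reg:SI 3 x3), 'w' prints "w3" and 'x' prints "x3", while const0_rtx
   prints as "wzr" or "xzr".  */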
5288 static void
5289 aarch64_print_operand_address (FILE *f, machine_mode mode, rtx x)
5291 struct aarch64_address_info addr;
5293 if (aarch64_classify_address (&addr, x, mode, MEM, true))
5294 switch (addr.type)
5296 case ADDRESS_REG_IMM:
5297 if (addr.offset == const0_rtx)
5298 asm_fprintf (f, "[%s]", reg_names [REGNO (addr.base)]);
5299 else
5300 asm_fprintf (f, "[%s, %wd]", reg_names [REGNO (addr.base)],
5301 INTVAL (addr.offset));
5302 return;
5304 case ADDRESS_REG_REG:
5305 if (addr.shift == 0)
5306 asm_fprintf (f, "[%s, %s]", reg_names [REGNO (addr.base)],
5307 reg_names [REGNO (addr.offset)]);
5308 else
5309 asm_fprintf (f, "[%s, %s, lsl %u]", reg_names [REGNO (addr.base)],
5310 reg_names [REGNO (addr.offset)], addr.shift);
5311 return;
5313 case ADDRESS_REG_UXTW:
5314 if (addr.shift == 0)
5315 asm_fprintf (f, "[%s, w%d, uxtw]", reg_names [REGNO (addr.base)],
5316 REGNO (addr.offset) - R0_REGNUM);
5317 else
5318 asm_fprintf (f, "[%s, w%d, uxtw %u]", reg_names [REGNO (addr.base)],
5319 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5320 return;
5322 case ADDRESS_REG_SXTW:
5323 if (addr.shift == 0)
5324 asm_fprintf (f, "[%s, w%d, sxtw]", reg_names [REGNO (addr.base)],
5325 REGNO (addr.offset) - R0_REGNUM);
5326 else
5327 asm_fprintf (f, "[%s, w%d, sxtw %u]", reg_names [REGNO (addr.base)],
5328 REGNO (addr.offset) - R0_REGNUM, addr.shift);
5329 return;
5331 case ADDRESS_REG_WB:
5332 switch (GET_CODE (x))
5334 case PRE_INC:
5335 asm_fprintf (f, "[%s, %d]!", reg_names [REGNO (addr.base)],
5336 GET_MODE_SIZE (mode));
5337 return;
5338 case POST_INC:
5339 asm_fprintf (f, "[%s], %d", reg_names [REGNO (addr.base)],
5340 GET_MODE_SIZE (mode));
5341 return;
5342 case PRE_DEC:
5343 asm_fprintf (f, "[%s, -%d]!", reg_names [REGNO (addr.base)],
5344 GET_MODE_SIZE (mode));
5345 return;
5346 case POST_DEC:
5347 asm_fprintf (f, "[%s], -%d", reg_names [REGNO (addr.base)],
5348 GET_MODE_SIZE (mode));
5349 return;
5350 case PRE_MODIFY:
5351 asm_fprintf (f, "[%s, %wd]!", reg_names [REGNO (addr.base)],
5352 INTVAL (addr.offset));
5353 return;
5354 case POST_MODIFY:
5355 asm_fprintf (f, "[%s], %wd", reg_names [REGNO (addr.base)],
5356 INTVAL (addr.offset));
5357 return;
5358 default:
5359 break;
5361 break;
5363 case ADDRESS_LO_SUM:
5364 asm_fprintf (f, "[%s, #:lo12:", reg_names [REGNO (addr.base)]);
5365 output_addr_const (f, addr.offset);
5366 asm_fprintf (f, "]");
5367 return;
5369 case ADDRESS_SYMBOLIC:
5370 break;
5373 output_addr_const (f, x);
5376 bool
5377 aarch64_label_mentioned_p (rtx x)
5379 const char *fmt;
5380 int i;
5382 if (GET_CODE (x) == LABEL_REF)
5383 return true;
5385 /* UNSPEC_TLS entries for a symbol include a LABEL_REF for the
5386 referencing instruction, but they are constant offsets, not
5387 symbols. */
5388 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_TLS)
5389 return false;
5391 fmt = GET_RTX_FORMAT (GET_CODE (x));
5392 for (i = GET_RTX_LENGTH (GET_CODE (x)) - 1; i >= 0; i--)
5394 if (fmt[i] == 'E')
5396 int j;
5398 for (j = XVECLEN (x, i) - 1; j >= 0; j--)
5399 if (aarch64_label_mentioned_p (XVECEXP (x, i, j)))
5400 return 1;
5402 else if (fmt[i] == 'e' && aarch64_label_mentioned_p (XEXP (x, i)))
5403 return 1;
5406 return 0;
5409 /* Implement REGNO_REG_CLASS. */
5411 enum reg_class
5412 aarch64_regno_regclass (unsigned regno)
5414 if (GP_REGNUM_P (regno))
5415 return GENERAL_REGS;
5417 if (regno == SP_REGNUM)
5418 return STACK_REG;
5420 if (regno == FRAME_POINTER_REGNUM
5421 || regno == ARG_POINTER_REGNUM)
5422 return POINTER_REGS;
5424 if (FP_REGNUM_P (regno))
5425 return FP_LO_REGNUM_P (regno) ? FP_LO_REGS : FP_REGS;
5427 return NO_REGS;
5430 static rtx
5431 aarch64_legitimize_address (rtx x, rtx /* orig_x */, machine_mode mode)
5433 /* Try to split X+CONST into Y=X+(CONST & ~mask), Y+(CONST&mask),
5434 where mask is selected by alignment and size of the offset.
5435 We try to pick as large a range for the offset as possible to
5436 maximize the chance of a CSE. However, for aligned addresses
5437 we limit the range to 4k so that structures with different sized
5438 elements are likely to use the same base. We need to be careful
5439 not to split a CONST for some forms of address expression, otherwise
5440 it will generate sub-optimal code. */
5442 if (GET_CODE (x) == PLUS && CONST_INT_P (XEXP (x, 1)))
5444 rtx base = XEXP (x, 0);
5445 rtx offset_rtx = XEXP (x, 1);
5446 HOST_WIDE_INT offset = INTVAL (offset_rtx);
5448 if (GET_CODE (base) == PLUS)
5450 rtx op0 = XEXP (base, 0);
5451 rtx op1 = XEXP (base, 1);
5453 /* Force any scaling into a temp for CSE. */
5454 op0 = force_reg (Pmode, op0);
5455 op1 = force_reg (Pmode, op1);
5457 /* Let the pointer register be in op0. */
5458 if (REG_POINTER (op1))
5459 std::swap (op0, op1);
5461 /* If the pointer is virtual or frame related, then we know that
5462 virtual register instantiation or register elimination is going
5463 to apply a second constant. We want the two constants folded
5464 together easily. Therefore, emit as (OP0 + CONST) + OP1. */
5465 if (virt_or_elim_regno_p (REGNO (op0)))
5467 base = expand_binop (Pmode, add_optab, op0, offset_rtx,
5468 NULL_RTX, true, OPTAB_DIRECT);
5469 return gen_rtx_PLUS (Pmode, base, op1);
5472 /* Otherwise, in order to encourage CSE (and thence loop strength
5473 reduce) scaled addresses, emit as (OP0 + OP1) + CONST. */
5474 base = expand_binop (Pmode, add_optab, op0, op1,
5475 NULL_RTX, true, OPTAB_DIRECT);
5476 x = gen_rtx_PLUS (Pmode, base, offset_rtx);
5479 /* Does it look like we'll need a 16-byte load/store-pair operation? */
5480 HOST_WIDE_INT base_offset;
5481 if (GET_MODE_SIZE (mode) > 16)
5482 base_offset = (offset + 0x400) & ~0x7f0;
5483 /* For offsets that aren't a multiple of the access size, the limit is
5484 -256...255. */
5485 else if (offset & (GET_MODE_SIZE (mode) - 1))
5487 base_offset = (offset + 0x100) & ~0x1ff;
5489 /* BLKmode typically uses LDP of X-registers. */
5490 if (mode == BLKmode)
5491 base_offset = (offset + 512) & ~0x3ff;
5493 /* Small negative offsets are supported. */
5494 else if (IN_RANGE (offset, -256, 0))
5495 base_offset = 0;
5496 else if (mode == TImode || mode == TFmode)
5497 base_offset = (offset + 0x100) & ~0x1ff;
5498 /* Use a 12-bit offset scaled by the access size. */
5499 else
5500 base_offset = offset & (~0xfff * GET_MODE_SIZE (mode));
5502 if (base_offset != 0)
5504 base = plus_constant (Pmode, base, base_offset);
5505 base = force_operand (base, NULL_RTX);
5506 return plus_constant (Pmode, base, offset - base_offset);
5510 return x;
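/* Worked example (illustrative): for a DFmode access at X + 0x8008 the
   final branch above computes base_offset = 0x8008 & -32768 = 0x8000,
   so the address is rewritten as (X + 0x8000) + 8; the X + 0x8000 part
   can be CSEd between neighbouring accesses and the remaining offset of
   8 is a legal scaled 12-bit immediate.  */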
5513 /* Return the reload icode required for a constant pool in mode. */
5514 static enum insn_code
5515 aarch64_constant_pool_reload_icode (machine_mode mode)
5517 switch (mode)
5519 case SFmode:
5520 return CODE_FOR_aarch64_reload_movcpsfdi;
5522 case DFmode:
5523 return CODE_FOR_aarch64_reload_movcpdfdi;
5525 case TFmode:
5526 return CODE_FOR_aarch64_reload_movcptfdi;
5528 case V8QImode:
5529 return CODE_FOR_aarch64_reload_movcpv8qidi;
5531 case V16QImode:
5532 return CODE_FOR_aarch64_reload_movcpv16qidi;
5534 case V4HImode:
5535 return CODE_FOR_aarch64_reload_movcpv4hidi;
5537 case V8HImode:
5538 return CODE_FOR_aarch64_reload_movcpv8hidi;
5540 case V2SImode:
5541 return CODE_FOR_aarch64_reload_movcpv2sidi;
5543 case V4SImode:
5544 return CODE_FOR_aarch64_reload_movcpv4sidi;
5546 case V2DImode:
5547 return CODE_FOR_aarch64_reload_movcpv2didi;
5549 case V2DFmode:
5550 return CODE_FOR_aarch64_reload_movcpv2dfdi;
5552 default:
5553 gcc_unreachable ();
5556 gcc_unreachable ();
5558 static reg_class_t
5559 aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
5560 reg_class_t rclass,
5561 machine_mode mode,
5562 secondary_reload_info *sri)
5565 /* If we have to disable direct literal pool loads and stores because the
5566 function is too big, then we need a scratch register. */
5567 if (MEM_P (x) && GET_CODE (x) == SYMBOL_REF && CONSTANT_POOL_ADDRESS_P (x)
5568 && (SCALAR_FLOAT_MODE_P (GET_MODE (x))
5569 || targetm.vector_mode_supported_p (GET_MODE (x)))
5570 && !aarch64_pcrelative_literal_loads)
5572 sri->icode = aarch64_constant_pool_reload_icode (mode);
5573 return NO_REGS;
5576 /* Without the TARGET_SIMD instructions we cannot move a Q register
5577 to a Q register directly. We need a scratch. */
5578 if (REG_P (x) && (mode == TFmode || mode == TImode) && mode == GET_MODE (x)
5579 && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
5580 && reg_class_subset_p (rclass, FP_REGS))
5582 if (mode == TFmode)
5583 sri->icode = CODE_FOR_aarch64_reload_movtf;
5584 else if (mode == TImode)
5585 sri->icode = CODE_FOR_aarch64_reload_movti;
5586 return NO_REGS;
5589 /* A TFmode or TImode memory access should be handled via an FP_REGS
5590 because AArch64 has richer addressing modes for LDR/STR instructions
5591 than LDP/STP instructions. */
5592 if (TARGET_FLOAT && rclass == GENERAL_REGS
5593 && GET_MODE_SIZE (mode) == 16 && MEM_P (x))
5594 return FP_REGS;
5596 if (rclass == FP_REGS && (mode == TImode || mode == TFmode) && CONSTANT_P(x))
5597 return GENERAL_REGS;
5599 return NO_REGS;
5602 static bool
5603 aarch64_can_eliminate (const int from, const int to)
5605 /* If we need a frame pointer, we must eliminate FRAME_POINTER_REGNUM into
5606 HARD_FRAME_POINTER_REGNUM and not into STACK_POINTER_REGNUM. */
5608 if (frame_pointer_needed)
5610 if (from == ARG_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5611 return true;
5612 if (from == ARG_POINTER_REGNUM && to == STACK_POINTER_REGNUM)
5613 return false;
5614 if (from == FRAME_POINTER_REGNUM && to == STACK_POINTER_REGNUM
5615 && !cfun->calls_alloca)
5616 return true;
5617 if (from == FRAME_POINTER_REGNUM && to == HARD_FRAME_POINTER_REGNUM)
5618 return true;
5620 return false;
5622 else
5624 /* If we decided that we didn't need a leaf frame pointer but then used
5625 LR in the function, then we'll want a frame pointer after all, so
5626 prevent this elimination to ensure a frame pointer is used. */
5627 if (to == STACK_POINTER_REGNUM
5628 && flag_omit_leaf_frame_pointer
5629 && df_regs_ever_live_p (LR_REGNUM))
5630 return false;
5633 return true;
5636 HOST_WIDE_INT
5637 aarch64_initial_elimination_offset (unsigned from, unsigned to)
5639 aarch64_layout_frame ();
5641 if (to == HARD_FRAME_POINTER_REGNUM)
5643 if (from == ARG_POINTER_REGNUM)
5644 return cfun->machine->frame.hard_fp_offset;
5646 if (from == FRAME_POINTER_REGNUM)
5647 return cfun->machine->frame.hard_fp_offset
5648 - cfun->machine->frame.locals_offset;
5651 if (to == STACK_POINTER_REGNUM)
5653 if (from == FRAME_POINTER_REGNUM)
5654 return cfun->machine->frame.frame_size
5655 - cfun->machine->frame.locals_offset;
5658 return cfun->machine->frame.frame_size;
5661 /* Implement RETURN_ADDR_RTX. We do not support moving back to a
5662 previous frame. */
5664 rtx
5665 aarch64_return_addr (int count, rtx frame ATTRIBUTE_UNUSED)
5667 if (count != 0)
5668 return const0_rtx;
5669 return get_hard_reg_initial_val (Pmode, LR_REGNUM);
5673 static void
5674 aarch64_asm_trampoline_template (FILE *f)
5676 if (TARGET_ILP32)
5678 asm_fprintf (f, "\tldr\tw%d, .+16\n", IP1_REGNUM - R0_REGNUM);
5679 asm_fprintf (f, "\tldr\tw%d, .+16\n", STATIC_CHAIN_REGNUM - R0_REGNUM);
5681 else
5683 asm_fprintf (f, "\tldr\t%s, .+16\n", reg_names [IP1_REGNUM]);
5684 asm_fprintf (f, "\tldr\t%s, .+20\n", reg_names [STATIC_CHAIN_REGNUM]);
5686 asm_fprintf (f, "\tbr\t%s\n", reg_names [IP1_REGNUM]);
5687 assemble_aligned_integer (4, const0_rtx);
5688 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
5689 assemble_aligned_integer (POINTER_BYTES, const0_rtx);
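/* For LP64 the template above comes out roughly as (illustrative;
   assuming IP1 is x17 and the static chain register is x18):

     ldr     x17, .+16
     ldr     x18, .+20
     br      x17
     .word   0
     .xword  0    // function address
     .xword  0    // static chain value

   where the two trailing pointer-sized slots are rewritten by
   aarch64_trampoline_init below.  */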
5692 static void
5693 aarch64_trampoline_init (rtx m_tramp, tree fndecl, rtx chain_value)
5695 rtx fnaddr, mem, a_tramp;
5696 const int tramp_code_sz = 16;
5698 /* Don't need to copy the trailing D-words, we fill those in below. */
5699 emit_block_move (m_tramp, assemble_trampoline_template (),
5700 GEN_INT (tramp_code_sz), BLOCK_OP_NORMAL);
5701 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz);
5702 fnaddr = XEXP (DECL_RTL (fndecl), 0);
5703 if (GET_MODE (fnaddr) != ptr_mode)
5704 fnaddr = convert_memory_address (ptr_mode, fnaddr);
5705 emit_move_insn (mem, fnaddr);
5707 mem = adjust_address (m_tramp, ptr_mode, tramp_code_sz + POINTER_BYTES);
5708 emit_move_insn (mem, chain_value);
5710 /* XXX We should really define a "clear_cache" pattern and use
5711 gen_clear_cache(). */
5712 a_tramp = XEXP (m_tramp, 0);
5713 emit_library_call (gen_rtx_SYMBOL_REF (Pmode, "__clear_cache"),
5714 LCT_NORMAL, VOIDmode, 2, a_tramp, ptr_mode,
5715 plus_constant (ptr_mode, a_tramp, TRAMPOLINE_SIZE),
5716 ptr_mode);
5719 static unsigned char
5720 aarch64_class_max_nregs (reg_class_t regclass, machine_mode mode)
5722 switch (regclass)
5724 case CALLER_SAVE_REGS:
5725 case POINTER_REGS:
5726 case GENERAL_REGS:
5727 case ALL_REGS:
5728 case FP_REGS:
5729 case FP_LO_REGS:
5730 return
5731 aarch64_vector_mode_p (mode)
5732 ? (GET_MODE_SIZE (mode) + UNITS_PER_VREG - 1) / UNITS_PER_VREG
5733 : (GET_MODE_SIZE (mode) + UNITS_PER_WORD - 1) / UNITS_PER_WORD;
5734 case STACK_REG:
5735 return 1;
5737 case NO_REGS:
5738 return 0;
5740 default:
5741 break;
5743 gcc_unreachable ();
5746 static reg_class_t
5747 aarch64_preferred_reload_class (rtx x, reg_class_t regclass)
5749 if (regclass == POINTER_REGS)
5750 return GENERAL_REGS;
5752 if (regclass == STACK_REG)
5754 if (REG_P(x)
5755 && reg_class_subset_p (REGNO_REG_CLASS (REGNO (x)), POINTER_REGS))
5756 return regclass;
5758 return NO_REGS;
5761 /* If it's an integer immediate that MOVI can't handle, then
5762 FP_REGS is not an option, so we return NO_REGS instead. */
5763 if (CONST_INT_P (x) && reg_class_subset_p (regclass, FP_REGS)
5764 && !aarch64_simd_imm_scalar_p (x, GET_MODE (x)))
5765 return NO_REGS;
5767 /* Register elimination can result in a request for
5768 SP+constant->FP_REGS. We cannot support such operations, which
5769 use SP as source and an FP_REG as destination, so reject them
5770 right now. */
5771 if (! reg_class_subset_p (regclass, GENERAL_REGS) && GET_CODE (x) == PLUS)
5773 rtx lhs = XEXP (x, 0);
5775 /* Look through a possible SUBREG introduced by ILP32. */
5776 if (GET_CODE (lhs) == SUBREG)
5777 lhs = SUBREG_REG (lhs);
5779 gcc_assert (REG_P (lhs));
5780 gcc_assert (reg_class_subset_p (REGNO_REG_CLASS (REGNO (lhs)),
5781 POINTER_REGS));
5782 return NO_REGS;
5785 return regclass;
5788 void
5789 aarch64_asm_output_labelref (FILE* f, const char *name)
5791 asm_fprintf (f, "%U%s", name);
5794 static void
5795 aarch64_elf_asm_constructor (rtx symbol, int priority)
5797 if (priority == DEFAULT_INIT_PRIORITY)
5798 default_ctor_section_asm_out_constructor (symbol, priority);
5799 else
5801 section *s;
5802 /* Although the priority is known to be in the range [0, 65535], so that
5803 18 bytes would be enough, the compiler might not know that. To avoid a
5804 -Wformat-truncation false positive, use a larger size. */
5805 char buf[23];
5806 snprintf (buf, sizeof (buf), ".init_array.%.5u", priority);
5807 s = get_section (buf, SECTION_WRITE, NULL);
5808 switch_to_section (s);
5809 assemble_align (POINTER_SIZE);
5810 assemble_aligned_integer (POINTER_BYTES, symbol);
5814 static void
5815 aarch64_elf_asm_destructor (rtx symbol, int priority)
5817 if (priority == DEFAULT_INIT_PRIORITY)
5818 default_dtor_section_asm_out_destructor (symbol, priority);
5819 else
5821 section *s;
5822 /* Although the priority is known to be in the range [0, 65535], so that
5823 18 bytes would be enough, the compiler might not know that. To avoid a
5824 -Wformat-truncation false positive, use a larger size. */
5825 char buf[23];
5826 snprintf (buf, sizeof (buf), ".fini_array.%.5u", priority);
5827 s = get_section (buf, SECTION_WRITE, NULL);
5828 switch_to_section (s);
5829 assemble_align (POINTER_SIZE);
5830 assemble_aligned_integer (POINTER_BYTES, symbol);
5834 const char*
5835 aarch64_output_casesi (rtx *operands)
5837 char buf[100];
5838 char label[100];
5839 rtx diff_vec = PATTERN (NEXT_INSN (as_a <rtx_insn *> (operands[2])));
5840 int index;
5841 static const char *const patterns[4][2] =
5844 "ldrb\t%w3, [%0,%w1,uxtw]",
5845 "add\t%3, %4, %w3, sxtb #2"
5848 "ldrh\t%w3, [%0,%w1,uxtw #1]",
5849 "add\t%3, %4, %w3, sxth #2"
5852 "ldr\t%w3, [%0,%w1,uxtw #2]",
5853 "add\t%3, %4, %w3, sxtw #2"
5855 /* We assume that DImode is only generated when not optimizing and
5856 that we don't really need 64-bit address offsets. That would
5857 imply an object file with 8GB of code in a single function! */
5859 "ldr\t%w3, [%0,%w1,uxtw #2]",
5860 "add\t%3, %4, %w3, sxtw #2"
5864 gcc_assert (GET_CODE (diff_vec) == ADDR_DIFF_VEC);
5866 index = exact_log2 (GET_MODE_SIZE (GET_MODE (diff_vec)));
5868 gcc_assert (index >= 0 && index <= 3);
5870 /* Need to implement table size reduction, by changing the code below. */
5871 output_asm_insn (patterns[index][0], operands);
5872 ASM_GENERATE_INTERNAL_LABEL (label, "Lrtx", CODE_LABEL_NUMBER (operands[2]));
5873 snprintf (buf, sizeof (buf),
5874 "adr\t%%4, %s", targetm.strip_name_encoding (label));
5875 output_asm_insn (buf, operands);
5876 output_asm_insn (patterns[index][1], operands);
5877 output_asm_insn ("br\t%3", operands);
5878 assemble_label (asm_out_file, label);
5879 return "";
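/* Illustrative example (editorial sketch; the register assignments are
   hypothetical): for a 4-byte (SImode) dispatch table, with operands
   0/1/3/4 allocated to x0/x1/x3/x4, the code above emits roughly

       ldr     w3, [x0, w1, uxtw #2]
       adr     x4, .Lrtx<N>
       add     x3, x4, w3, sxtw #2
       br      x3
   .Lrtx<N>:

   i.e. the index is scaled to load the table entry, and the entry is then
   scaled and added to the address of the local label emitted just after
   the branch.  */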
5883 /* Return size in bits of an arithmetic operand which is shifted/scaled and
5884 masked such that it is suitable for a UXTB, UXTH, or UXTW extend
5885 operator. */
5888 aarch64_uxt_size (int shift, HOST_WIDE_INT mask)
5890 if (shift >= 0 && shift <= 3)
5892 int size;
5893 for (size = 8; size <= 32; size *= 2)
5895 HOST_WIDE_INT bits = ((HOST_WIDE_INT)1U << size) - 1;
5896 if (mask == bits << shift)
5897 return size;
5900 return 0;
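/* Worked example (illustrative): for the combination (x << 1) & 0x1fe,
   SHIFT is 1 and MASK is 0x1fe == 0xff << 1, so the loop above matches at
   SIZE == 8 and the operand can use a UXTB extend; a mask such as 0x3fe
   matches no size and the function returns 0.  */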
5903 /* Constant pools are per-function only when PC-relative
5904 literal loads are enabled or we are in the large memory
5905 model. */
5907 static inline bool
5908 aarch64_can_use_per_function_literal_pools_p (void)
5910 return (aarch64_pcrelative_literal_loads
5911 || aarch64_cmodel == AARCH64_CMODEL_LARGE);
5914 static bool
5915 aarch64_use_blocks_for_constant_p (machine_mode, const_rtx)
5917 /* FIXME: In an ideal world this would work similarly
5918 to the logic in aarch64_select_rtx_section, but this
5919 breaks bootstrap in the GCC Go frontend. For now we work
5920 around this by returning false here. */
5921 return false;
5924 /* Select appropriate section for constants depending
5925 on where we place literal pools. */
5927 static section *
5928 aarch64_select_rtx_section (machine_mode mode,
5929 rtx x,
5930 unsigned HOST_WIDE_INT align)
5932 if (aarch64_can_use_per_function_literal_pools_p ())
5933 return function_section (current_function_decl);
5935 return default_elf_select_rtx_section (mode, x, align);
5938 /* Implement ASM_OUTPUT_POOL_EPILOGUE. */
5939 void
5940 aarch64_asm_output_pool_epilogue (FILE *f, const char *, tree,
5941 HOST_WIDE_INT offset)
5943 /* When using per-function literal pools, we must ensure that any code
5944 section is aligned to the minimal instruction length, lest we get
5945 errors from the assembler regarding "unaligned instructions". */
5946 if ((offset & 3) && aarch64_can_use_per_function_literal_pools_p ())
5947 ASM_OUTPUT_ALIGN (f, 2);
5950 /* Costs. */
5952 /* Helper function for rtx cost calculation. Strip a shift expression
5953 from X. Returns the inner operand if successful, or the original
5954 expression on failure. */
5955 static rtx
5956 aarch64_strip_shift (rtx x)
5958 rtx op = x;
5960 /* We accept both ROTATERT and ROTATE: since the RHS must be a constant
5961 we can convert both to ROR during final output. */
5962 if ((GET_CODE (op) == ASHIFT
5963 || GET_CODE (op) == ASHIFTRT
5964 || GET_CODE (op) == LSHIFTRT
5965 || GET_CODE (op) == ROTATERT
5966 || GET_CODE (op) == ROTATE)
5967 && CONST_INT_P (XEXP (op, 1)))
5968 return XEXP (op, 0);
5970 if (GET_CODE (op) == MULT
5971 && CONST_INT_P (XEXP (op, 1))
5972 && ((unsigned) exact_log2 (INTVAL (XEXP (op, 1)))) < 64)
5973 return XEXP (op, 0);
5975 return x;
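/* For example (illustrative): (ashift (reg) (const_int 3)) and the
   equivalent (mult (reg) (const_int 8)) both strip down to the inner
   register, while a shift by a non-constant amount is returned
   unchanged.  */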
5978 /* Helper function for rtx cost calculation. Strip an extend
5979 expression from X. Returns the inner operand if successful, or the
5980 original expression on failure. We deal with a number of possible
5981 canonicalization variations here. */
5982 static rtx
5983 aarch64_strip_extend (rtx x)
5985 rtx op = x;
5987 /* Zero and sign extraction of a widened value. */
5988 if ((GET_CODE (op) == ZERO_EXTRACT || GET_CODE (op) == SIGN_EXTRACT)
5989 && XEXP (op, 2) == const0_rtx
5990 && GET_CODE (XEXP (op, 0)) == MULT
5991 && aarch64_is_extend_from_extract (GET_MODE (op), XEXP (XEXP (op, 0), 1),
5992 XEXP (op, 1)))
5993 return XEXP (XEXP (op, 0), 0);
5995 /* It can also be represented (for zero-extend) as an AND with an
5996 immediate. */
5997 if (GET_CODE (op) == AND
5998 && GET_CODE (XEXP (op, 0)) == MULT
5999 && CONST_INT_P (XEXP (XEXP (op, 0), 1))
6000 && CONST_INT_P (XEXP (op, 1))
6001 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (XEXP (op, 0), 1))),
6002 INTVAL (XEXP (op, 1))) != 0)
6003 return XEXP (XEXP (op, 0), 0);
6005 /* Now handle extended register, as this may also have an optional
6006 left shift by 1..4. */
6007 if (GET_CODE (op) == ASHIFT
6008 && CONST_INT_P (XEXP (op, 1))
6009 && ((unsigned HOST_WIDE_INT) INTVAL (XEXP (op, 1))) <= 4)
6010 op = XEXP (op, 0);
6012 if (GET_CODE (op) == ZERO_EXTEND
6013 || GET_CODE (op) == SIGN_EXTEND)
6014 op = XEXP (op, 0);
6016 if (op != x)
6017 return op;
6019 return x;
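/* For example (illustrative): (zero_extend:DI (reg:SI)) and
   (ashift:DI (sign_extend:DI (reg:SI)) (const_int 2)) both strip down to
   the inner SImode register, matching the extended-register operand
   forms; anything else is returned as-is.  */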
6022 /* Return true iff CODE is a shift supported in combination
6023 with arithmetic instructions. */
6025 static bool
6026 aarch64_shift_p (enum rtx_code code)
6028 return code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT;
6031 /* Helper function for rtx cost calculation. Calculate the cost of
6032 a MULT or ASHIFT, which may be part of a compound PLUS/MINUS rtx.
6033 Return the calculated cost of the expression, recursing manually into
6034 operands where needed. */
6036 static int
6037 aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
6039 rtx op0, op1;
6040 const struct cpu_cost_table *extra_cost
6041 = aarch64_tune_params.insn_extra_cost;
6042 int cost = 0;
6043 bool compound_p = (outer == PLUS || outer == MINUS);
6044 machine_mode mode = GET_MODE (x);
6046 gcc_checking_assert (code == MULT);
6048 op0 = XEXP (x, 0);
6049 op1 = XEXP (x, 1);
6051 if (VECTOR_MODE_P (mode))
6052 mode = GET_MODE_INNER (mode);
6054 /* Integer multiply/fma. */
6055 if (GET_MODE_CLASS (mode) == MODE_INT)
6057 /* The multiply will be canonicalized as a shift, cost it as such. */
6058 if (aarch64_shift_p (GET_CODE (x))
6059 || (CONST_INT_P (op1)
6060 && exact_log2 (INTVAL (op1)) > 0))
6062 bool is_extend = GET_CODE (op0) == ZERO_EXTEND
6063 || GET_CODE (op0) == SIGN_EXTEND;
6064 if (speed)
6066 if (compound_p)
6068 if (REG_P (op1))
6069 /* ARITH + shift-by-register. */
6070 cost += extra_cost->alu.arith_shift_reg;
6071 else if (is_extend)
6072 /* ARITH + extended register. We don't have a cost field
6073 for ARITH+EXTEND+SHIFT, so use extend_arith here. */
6074 cost += extra_cost->alu.extend_arith;
6075 else
6076 /* ARITH + shift-by-immediate. */
6077 cost += extra_cost->alu.arith_shift;
6079 else
6080 /* LSL (immediate). */
6081 cost += extra_cost->alu.shift;
6084 /* Strip extends as we will have costed them in the case above. */
6085 if (is_extend)
6086 op0 = aarch64_strip_extend (op0);
6088 cost += rtx_cost (op0, VOIDmode, code, 0, speed);
6090 return cost;
6093 /* MNEG or [US]MNEGL. Extract the NEG operand and indicate that it's a
6094 compound and let the below cases handle it. After all, MNEG is a
6095 special-case alias of MSUB. */
6096 if (GET_CODE (op0) == NEG)
6098 op0 = XEXP (op0, 0);
6099 compound_p = true;
6102 /* Integer multiplies or FMAs have zero/sign extending variants. */
6103 if ((GET_CODE (op0) == ZERO_EXTEND
6104 && GET_CODE (op1) == ZERO_EXTEND)
6105 || (GET_CODE (op0) == SIGN_EXTEND
6106 && GET_CODE (op1) == SIGN_EXTEND))
6108 cost += rtx_cost (XEXP (op0, 0), VOIDmode, MULT, 0, speed);
6109 cost += rtx_cost (XEXP (op1, 0), VOIDmode, MULT, 1, speed);
6111 if (speed)
6113 if (compound_p)
6114 /* SMADDL/UMADDL/UMSUBL/SMSUBL. */
6115 cost += extra_cost->mult[0].extend_add;
6116 else
6117 /* MUL/SMULL/UMULL. */
6118 cost += extra_cost->mult[0].extend;
6121 return cost;
6124 /* This is either an integer multiply or a MADD. In both cases
6125 we want to recurse and cost the operands. */
6126 cost += rtx_cost (op0, mode, MULT, 0, speed);
6127 cost += rtx_cost (op1, mode, MULT, 1, speed);
6129 if (speed)
6131 if (compound_p)
6132 /* MADD/MSUB. */
6133 cost += extra_cost->mult[mode == DImode].add;
6134 else
6135 /* MUL. */
6136 cost += extra_cost->mult[mode == DImode].simple;
6139 return cost;
6141 else
6143 if (speed)
6145 /* Floating-point FMA/FMUL can also support negations of the
6146 operands, unless the rounding mode is upward or downward in
6147 which case FNMUL differs from FMUL with operand negation. */
6148 bool neg0 = GET_CODE (op0) == NEG;
6149 bool neg1 = GET_CODE (op1) == NEG;
6150 if (compound_p || !flag_rounding_math || (neg0 && neg1))
6152 if (neg0)
6153 op0 = XEXP (op0, 0);
6154 if (neg1)
6155 op1 = XEXP (op1, 0);
6158 if (compound_p)
6159 /* FMADD/FNMADD/FNMSUB/FMSUB. */
6160 cost += extra_cost->fp[mode == DFmode].fma;
6161 else
6162 /* FMUL/FNMUL. */
6163 cost += extra_cost->fp[mode == DFmode].mult;
6166 cost += rtx_cost (op0, mode, MULT, 0, speed);
6167 cost += rtx_cost (op1, mode, MULT, 1, speed);
6168 return cost;
6172 static int
6173 aarch64_address_cost (rtx x,
6174 machine_mode mode,
6175 addr_space_t as ATTRIBUTE_UNUSED,
6176 bool speed)
6178 enum rtx_code c = GET_CODE (x);
6179 const struct cpu_addrcost_table *addr_cost = aarch64_tune_params.addr_cost;
6180 struct aarch64_address_info info;
6181 int cost = 0;
6182 info.shift = 0;
6184 if (!aarch64_classify_address (&info, x, mode, c, false))
6186 if (GET_CODE (x) == CONST || GET_CODE (x) == SYMBOL_REF)
6188 /* This is a CONST or SYMBOL ref which will be split
6189 in a different way depending on the code model in use.
6190 Cost it through the generic infrastructure. */
6191 int cost_symbol_ref = rtx_cost (x, Pmode, MEM, 1, speed);
6192 /* Divide through by the cost of one instruction to
6193 bring it to the same units as the address costs. */
6194 cost_symbol_ref /= COSTS_N_INSNS (1);
6195 /* The cost is then the cost of preparing the address,
6196 followed by an immediate (possibly 0) offset. */
6197 return cost_symbol_ref + addr_cost->imm_offset;
6199 else
6201 /* This is most likely a jump table from a case
6202 statement. */
6203 return addr_cost->register_offset;
6207 switch (info.type)
6209 case ADDRESS_LO_SUM:
6210 case ADDRESS_SYMBOLIC:
6211 case ADDRESS_REG_IMM:
6212 cost += addr_cost->imm_offset;
6213 break;
6215 case ADDRESS_REG_WB:
6216 if (c == PRE_INC || c == PRE_DEC || c == PRE_MODIFY)
6217 cost += addr_cost->pre_modify;
6218 else if (c == POST_INC || c == POST_DEC || c == POST_MODIFY)
6219 cost += addr_cost->post_modify;
6220 else
6221 gcc_unreachable ();
6223 break;
6225 case ADDRESS_REG_REG:
6226 cost += addr_cost->register_offset;
6227 break;
6229 case ADDRESS_REG_SXTW:
6230 cost += addr_cost->register_sextend;
6231 break;
6233 case ADDRESS_REG_UXTW:
6234 cost += addr_cost->register_zextend;
6235 break;
6237 default:
6238 gcc_unreachable ();
6242 if (info.shift > 0)
6244 /* For the sake of calculating the cost of the shifted register
6245 component, we can treat same sized modes in the same way. */
6246 switch (GET_MODE_BITSIZE (mode))
6248 case 16:
6249 cost += addr_cost->addr_scale_costs.hi;
6250 break;
6252 case 32:
6253 cost += addr_cost->addr_scale_costs.si;
6254 break;
6256 case 64:
6257 cost += addr_cost->addr_scale_costs.di;
6258 break;
6260 /* We can't tell, or this is a 128-bit vector. */
6261 default:
6262 cost += addr_cost->addr_scale_costs.ti;
6263 break;
6267 return cost;
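/* Rough example (illustrative): an address of the form
   (plus (reg) (const_int 16)) classifies as ADDRESS_REG_IMM and costs
   imm_offset, whereas a scaled index such as
   (plus (reg) (ashift (reg) (const_int 2))) used for an SImode access
   costs register_offset plus addr_scale_costs.si.  */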
6270 /* Return the cost of a branch. If SPEED_P is true then the compiler is
6271 optimizing for speed. If PREDICTABLE_P is true then the branch is predicted
6272 to be taken. */
6275 aarch64_branch_cost (bool speed_p, bool predictable_p)
6277 /* When optimizing for speed, use the cost of unpredictable branches. */
6278 const struct cpu_branch_cost *branch_costs =
6279 aarch64_tune_params.branch_costs;
6281 if (!speed_p || predictable_p)
6282 return branch_costs->predictable;
6283 else
6284 return branch_costs->unpredictable;
6287 /* Return true if the RTX X in mode MODE is a zero or sign extract
6288 usable in an ADD or SUB (extended register) instruction. */
6289 static bool
6290 aarch64_rtx_arith_op_extract_p (rtx x, machine_mode mode)
6292 /* Catch add with a sign extract.
6293 This is add_<optab><mode>_multp2. */
6294 if (GET_CODE (x) == SIGN_EXTRACT
6295 || GET_CODE (x) == ZERO_EXTRACT)
6297 rtx op0 = XEXP (x, 0);
6298 rtx op1 = XEXP (x, 1);
6299 rtx op2 = XEXP (x, 2);
6301 if (GET_CODE (op0) == MULT
6302 && CONST_INT_P (op1)
6303 && op2 == const0_rtx
6304 && CONST_INT_P (XEXP (op0, 1))
6305 && aarch64_is_extend_from_extract (mode,
6306 XEXP (op0, 1),
6307 op1))
6309 return true;
6312 /* The simple case <ARITH>, XD, XN, XM, [us]xt.
6313 No shift. */
6314 else if (GET_CODE (x) == SIGN_EXTEND
6315 || GET_CODE (x) == ZERO_EXTEND)
6316 return REG_P (XEXP (x, 0));
6318 return false;
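/* For example (illustrative): (sign_extend:DI (reg:SI)) matches the simple
   extended-register case above, and the sign/zero-extract-of-mult form
   corresponds to the add_<optab><mode>_multp2 pattern mentioned earlier;
   a plain shift does not match and is costed through the shift paths
   instead.  */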
6321 static bool
6322 aarch64_frint_unspec_p (unsigned int u)
6324 switch (u)
6326 case UNSPEC_FRINTZ:
6327 case UNSPEC_FRINTP:
6328 case UNSPEC_FRINTM:
6329 case UNSPEC_FRINTA:
6330 case UNSPEC_FRINTN:
6331 case UNSPEC_FRINTX:
6332 case UNSPEC_FRINTI:
6333 return true;
6335 default:
6336 return false;
6340 /* Return true iff X is an rtx that will match an extr instruction
6341 i.e. as described in the *extr<mode>5_insn family of patterns.
6342 OP0 and OP1 will be set to the operands of the shifts involved
6343 on success and will be NULL_RTX otherwise. */
6345 static bool
6346 aarch64_extr_rtx_p (rtx x, rtx *res_op0, rtx *res_op1)
6348 rtx op0, op1;
6349 machine_mode mode = GET_MODE (x);
6351 *res_op0 = NULL_RTX;
6352 *res_op1 = NULL_RTX;
6354 if (GET_CODE (x) != IOR)
6355 return false;
6357 op0 = XEXP (x, 0);
6358 op1 = XEXP (x, 1);
6360 if ((GET_CODE (op0) == ASHIFT && GET_CODE (op1) == LSHIFTRT)
6361 || (GET_CODE (op1) == ASHIFT && GET_CODE (op0) == LSHIFTRT))
6363 /* Canonicalise locally to ashift in op0, lshiftrt in op1. */
6364 if (GET_CODE (op1) == ASHIFT)
6365 std::swap (op0, op1);
6367 if (!CONST_INT_P (XEXP (op0, 1)) || !CONST_INT_P (XEXP (op1, 1)))
6368 return false;
6370 unsigned HOST_WIDE_INT shft_amnt_0 = UINTVAL (XEXP (op0, 1));
6371 unsigned HOST_WIDE_INT shft_amnt_1 = UINTVAL (XEXP (op1, 1));
6373 if (shft_amnt_0 < GET_MODE_BITSIZE (mode)
6374 && shft_amnt_0 + shft_amnt_1 == GET_MODE_BITSIZE (mode))
6376 *res_op0 = XEXP (op0, 0);
6377 *res_op1 = XEXP (op1, 0);
6378 return true;
6382 return false;
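/* For example (illustrative), in DImode
   (ior (ashift (reg A) (const_int 16)) (lshiftrt (reg B) (const_int 48)))
   matches because 16 + 48 == 64 and corresponds roughly to
   "extr Xd, A, B, #48"; if the two shift amounts do not sum to the mode
   width the function returns false.  */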
6385 /* Calculate the cost of calculating (if_then_else (OP0) (OP1) (OP2)),
6386 storing it in *COST. Result is true if the total cost of the operation
6387 has now been calculated. */
6388 static bool
6389 aarch64_if_then_else_costs (rtx op0, rtx op1, rtx op2, int *cost, bool speed)
6391 rtx inner;
6392 rtx comparator;
6393 enum rtx_code cmpcode;
6395 if (COMPARISON_P (op0))
6397 inner = XEXP (op0, 0);
6398 comparator = XEXP (op0, 1);
6399 cmpcode = GET_CODE (op0);
6401 else
6403 inner = op0;
6404 comparator = const0_rtx;
6405 cmpcode = NE;
6408 if (GET_CODE (op1) == PC || GET_CODE (op2) == PC)
6410 /* Conditional branch. */
6411 if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6412 return true;
6413 else
6415 if (cmpcode == NE || cmpcode == EQ)
6417 if (comparator == const0_rtx)
6419 /* TBZ/TBNZ/CBZ/CBNZ. */
6420 if (GET_CODE (inner) == ZERO_EXTRACT)
6421 /* TBZ/TBNZ. */
6422 *cost += rtx_cost (XEXP (inner, 0), VOIDmode,
6423 ZERO_EXTRACT, 0, speed);
6424 else
6425 /* CBZ/CBNZ. */
6426 *cost += rtx_cost (inner, VOIDmode, cmpcode, 0, speed);
6428 return true;
6431 else if (cmpcode == LT || cmpcode == GE)
6433 /* TBZ/TBNZ. */
6434 if (comparator == const0_rtx)
6435 return true;
6439 else if (GET_MODE_CLASS (GET_MODE (inner)) == MODE_CC)
6441 /* CCMP. */
6442 if (GET_CODE (op1) == COMPARE)
6444 /* Increase cost of CCMP reg, 0, imm, CC to prefer CMP reg, 0. */
6445 if (XEXP (op1, 1) == const0_rtx)
6446 *cost += 1;
6447 if (speed)
6449 machine_mode mode = GET_MODE (XEXP (op1, 0));
6450 const struct cpu_cost_table *extra_cost
6451 = aarch64_tune_params.insn_extra_cost;
6453 if (GET_MODE_CLASS (mode) == MODE_INT)
6454 *cost += extra_cost->alu.arith;
6455 else
6456 *cost += extra_cost->fp[mode == DFmode].compare;
6458 return true;
6461 /* It's a conditional operation based on the status flags,
6462 so it must be some flavor of CSEL. */
6464 /* CSNEG, CSINV, and CSINC are handled for free as part of CSEL. */
6465 if (GET_CODE (op1) == NEG
6466 || GET_CODE (op1) == NOT
6467 || (GET_CODE (op1) == PLUS && XEXP (op1, 1) == const1_rtx))
6468 op1 = XEXP (op1, 0);
6469 else if (GET_CODE (op1) == ZERO_EXTEND && GET_CODE (op2) == ZERO_EXTEND)
6471 /* CSEL with zero-extension (*cmovdi_insn_uxtw). */
6472 op1 = XEXP (op1, 0);
6473 op2 = XEXP (op2, 0);
6476 *cost += rtx_cost (op1, VOIDmode, IF_THEN_ELSE, 1, speed);
6477 *cost += rtx_cost (op2, VOIDmode, IF_THEN_ELSE, 2, speed);
6478 return true;
6481 /* We don't know what this is; cost all operands. */
6482 return false;
6485 /* Check whether X is a bitfield operation of the form shift + extend that
6486 maps down to a UBFIZ/SBFIZ/UBFX/SBFX instruction. If so, return the
6487 operand to which the bitfield operation is applied. Otherwise return
6488 NULL_RTX. */
6490 static rtx
6491 aarch64_extend_bitfield_pattern_p (rtx x)
6493 rtx_code outer_code = GET_CODE (x);
6494 machine_mode outer_mode = GET_MODE (x);
6496 if (outer_code != ZERO_EXTEND && outer_code != SIGN_EXTEND
6497 && outer_mode != SImode && outer_mode != DImode)
6498 return NULL_RTX;
6500 rtx inner = XEXP (x, 0);
6501 rtx_code inner_code = GET_CODE (inner);
6502 machine_mode inner_mode = GET_MODE (inner);
6503 rtx op = NULL_RTX;
6505 switch (inner_code)
6507 case ASHIFT:
6508 if (CONST_INT_P (XEXP (inner, 1))
6509 && (inner_mode == QImode || inner_mode == HImode))
6510 op = XEXP (inner, 0);
6511 break;
6512 case LSHIFTRT:
6513 if (outer_code == ZERO_EXTEND && CONST_INT_P (XEXP (inner, 1))
6514 && (inner_mode == QImode || inner_mode == HImode))
6515 op = XEXP (inner, 0);
6516 break;
6517 case ASHIFTRT:
6518 if (outer_code == SIGN_EXTEND && CONST_INT_P (XEXP (inner, 1))
6519 && (inner_mode == QImode || inner_mode == HImode))
6520 op = XEXP (inner, 0);
6521 break;
6522 default:
6523 break;
6526 return op;
6529 /* Return true if the mask and a shift amount from an RTX of the form
6530 (x << SHFT_AMNT) & MASK are valid to combine into a UBFIZ instruction of
6531 mode MODE. See the *andim_ashift<mode>_bfiz pattern. */
6533 bool
6534 aarch64_mask_and_shift_for_ubfiz_p (machine_mode mode, rtx mask, rtx shft_amnt)
6536 return CONST_INT_P (mask) && CONST_INT_P (shft_amnt)
6537 && INTVAL (shft_amnt) < GET_MODE_BITSIZE (mode)
6538 && exact_log2 ((INTVAL (mask) >> INTVAL (shft_amnt)) + 1) >= 0
6539 && (INTVAL (mask) & ((1 << INTVAL (shft_amnt)) - 1)) == 0;
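/* Worked example (illustrative): in SImode with SHFT_AMNT == 3 and
   MASK == 0x7f8, the shifted-down mask is 0xff (and 0xff + 1 is a power
   of two) while the low three bits of the mask are clear, so
   (x << 3) & 0x7f8 can be emitted as a single UBFIZ; a mask of 0x7f9
   fails the low-bits check and is rejected.  */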
6542 /* Calculate the cost of calculating X, storing it in *COST. Result
6543 is true if the total cost of the operation has now been calculated. */
6544 static bool
6545 aarch64_rtx_costs (rtx x, machine_mode mode, int outer ATTRIBUTE_UNUSED,
6546 int param ATTRIBUTE_UNUSED, int *cost, bool speed)
6548 rtx op0, op1, op2;
6549 const struct cpu_cost_table *extra_cost
6550 = aarch64_tune_params.insn_extra_cost;
6551 int code = GET_CODE (x);
6553 /* By default, assume that everything has equivalent cost to the
6554 cheapest instruction. Any additional costs are applied as a delta
6555 above this default. */
6556 *cost = COSTS_N_INSNS (1);
6558 switch (code)
6560 case SET:
6561 /* The cost depends entirely on the operands to SET. */
6562 *cost = 0;
6563 op0 = SET_DEST (x);
6564 op1 = SET_SRC (x);
6566 switch (GET_CODE (op0))
6568 case MEM:
6569 if (speed)
6571 rtx address = XEXP (op0, 0);
6572 if (VECTOR_MODE_P (mode))
6573 *cost += extra_cost->ldst.storev;
6574 else if (GET_MODE_CLASS (mode) == MODE_INT)
6575 *cost += extra_cost->ldst.store;
6576 else if (mode == SFmode)
6577 *cost += extra_cost->ldst.storef;
6578 else if (mode == DFmode)
6579 *cost += extra_cost->ldst.stored;
6581 *cost +=
6582 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6583 0, speed));
6586 *cost += rtx_cost (op1, mode, SET, 1, speed);
6587 return true;
6589 case SUBREG:
6590 if (! REG_P (SUBREG_REG (op0)))
6591 *cost += rtx_cost (SUBREG_REG (op0), VOIDmode, SET, 0, speed);
6593 /* Fall through. */
6594 case REG:
6595 /* The cost is one per vector-register copied. */
6596 if (VECTOR_MODE_P (GET_MODE (op0)) && REG_P (op1))
6598 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6599 / GET_MODE_SIZE (V4SImode);
6600 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6602 /* const0_rtx is in general free, but we will use an
6603 instruction to set a register to 0. */
6604 else if (REG_P (op1) || op1 == const0_rtx)
6606 /* The cost is 1 per register copied. */
6607 int n_minus_1 = (GET_MODE_SIZE (GET_MODE (op0)) - 1)
6608 / UNITS_PER_WORD;
6609 *cost = COSTS_N_INSNS (n_minus_1 + 1);
6611 else
6612 /* Cost is just the cost of the RHS of the set. */
6613 *cost += rtx_cost (op1, mode, SET, 1, speed);
6614 return true;
6616 case ZERO_EXTRACT:
6617 case SIGN_EXTRACT:
6618 /* Bit-field insertion. Strip any redundant widening of
6619 the RHS to meet the width of the target. */
6620 if (GET_CODE (op1) == SUBREG)
6621 op1 = SUBREG_REG (op1);
6622 if ((GET_CODE (op1) == ZERO_EXTEND
6623 || GET_CODE (op1) == SIGN_EXTEND)
6624 && CONST_INT_P (XEXP (op0, 1))
6625 && (GET_MODE_BITSIZE (GET_MODE (XEXP (op1, 0)))
6626 >= INTVAL (XEXP (op0, 1))))
6627 op1 = XEXP (op1, 0);
6629 if (CONST_INT_P (op1))
6631 /* MOV immediate is assumed to always be cheap. */
6632 *cost = COSTS_N_INSNS (1);
6634 else
6636 /* BFM. */
6637 if (speed)
6638 *cost += extra_cost->alu.bfi;
6639 *cost += rtx_cost (op1, VOIDmode, (enum rtx_code) code, 1, speed);
6642 return true;
6644 default:
6645 /* We can't make sense of this; assume the default cost. */
6646 *cost = COSTS_N_INSNS (1);
6647 return false;
6649 return false;
6651 case CONST_INT:
6652 /* If an instruction can incorporate a constant within the
6653 instruction, the instruction's expression avoids calling
6654 rtx_cost() on the constant. If rtx_cost() is called on a
6655 constant, then it is usually because the constant must be
6656 moved into a register by one or more instructions.
6658 The exception is constant 0, which can be expressed
6659 as XZR/WZR and is therefore free. The exception to this is
6660 if we have (set (reg) (const0_rtx)) in which case we must cost
6661 the move. However, we can catch that when we cost the SET, so
6662 we don't need to consider that here. */
6663 if (x == const0_rtx)
6664 *cost = 0;
6665 else
6667 /* To an approximation, building any other constant is
6668 proportionally expensive to the number of instructions
6669 required to build that constant. This is true whether we
6670 are compiling for SPEED or otherwise. */
6671 *cost = COSTS_N_INSNS (aarch64_internal_mov_immediate
6672 (NULL_RTX, x, false, mode));
6674 return true;
6676 case CONST_DOUBLE:
6677 if (speed)
6679 /* mov[df,sf]_aarch64. */
6680 if (aarch64_float_const_representable_p (x))
6681 /* FMOV (scalar immediate). */
6682 *cost += extra_cost->fp[mode == DFmode].fpconst;
6683 else if (!aarch64_float_const_zero_rtx_p (x))
6685 /* This will be a load from memory. */
6686 if (mode == DFmode)
6687 *cost += extra_cost->ldst.loadd;
6688 else
6689 *cost += extra_cost->ldst.loadf;
6691 else
6692 /* Otherwise this is +0.0. We get this using MOVI d0, #0
6693 or MOV v0.s[0], wzr - neither of which is modeled by the
6694 cost tables. Just use the default cost. */
6699 return true;
6701 case MEM:
6702 if (speed)
6704 /* For loads we want the base cost of a load, plus an
6705 approximation for the additional cost of the addressing
6706 mode. */
6707 rtx address = XEXP (x, 0);
6708 if (VECTOR_MODE_P (mode))
6709 *cost += extra_cost->ldst.loadv;
6710 else if (GET_MODE_CLASS (mode) == MODE_INT)
6711 *cost += extra_cost->ldst.load;
6712 else if (mode == SFmode)
6713 *cost += extra_cost->ldst.loadf;
6714 else if (mode == DFmode)
6715 *cost += extra_cost->ldst.loadd;
6717 *cost +=
6718 COSTS_N_INSNS (aarch64_address_cost (address, mode,
6719 0, speed));
6722 return true;
6724 case NEG:
6725 op0 = XEXP (x, 0);
6727 if (VECTOR_MODE_P (mode))
6729 if (speed)
6731 /* FNEG. */
6732 *cost += extra_cost->vect.alu;
6734 return false;
6737 if (GET_MODE_CLASS (mode) == MODE_INT)
6739 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6740 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6742 /* CSETM. */
6743 *cost += rtx_cost (XEXP (op0, 0), VOIDmode, NEG, 0, speed);
6744 return true;
6747 /* Cost this as SUB wzr, X. */
6748 op0 = CONST0_RTX (mode);
6749 op1 = XEXP (x, 0);
6750 goto cost_minus;
6753 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6755 /* Support (neg(fma...)) as a single instruction only if
6756 sign of zeros is unimportant. This matches the decision
6757 making in aarch64.md. */
6758 if (GET_CODE (op0) == FMA && !HONOR_SIGNED_ZEROS (GET_MODE (op0)))
6760 /* FNMADD. */
6761 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6762 return true;
6764 if (GET_CODE (op0) == MULT)
6766 /* FNMUL. */
6767 *cost = rtx_cost (op0, mode, NEG, 0, speed);
6768 return true;
6770 if (speed)
6771 /* FNEG. */
6772 *cost += extra_cost->fp[mode == DFmode].neg;
6773 return false;
6776 return false;
6778 case CLRSB:
6779 case CLZ:
6780 if (speed)
6782 if (VECTOR_MODE_P (mode))
6783 *cost += extra_cost->vect.alu;
6784 else
6785 *cost += extra_cost->alu.clz;
6788 return false;
6790 case COMPARE:
6791 op0 = XEXP (x, 0);
6792 op1 = XEXP (x, 1);
6794 if (op1 == const0_rtx
6795 && GET_CODE (op0) == AND)
6797 x = op0;
6798 mode = GET_MODE (op0);
6799 goto cost_logic;
6802 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT)
6804 /* TODO: A write to the CC flags possibly costs extra; this
6805 needs encoding in the cost tables. */
6807 mode = GET_MODE (op0);
6808 /* ANDS. */
6809 if (GET_CODE (op0) == AND)
6811 x = op0;
6812 goto cost_logic;
6815 if (GET_CODE (op0) == PLUS)
6817 /* ADDS (and CMN alias). */
6818 x = op0;
6819 goto cost_plus;
6822 if (GET_CODE (op0) == MINUS)
6824 /* SUBS. */
6825 x = op0;
6826 goto cost_minus;
6829 if (GET_CODE (op0) == ZERO_EXTRACT && op1 == const0_rtx
6830 && GET_MODE (x) == CC_NZmode && CONST_INT_P (XEXP (op0, 1))
6831 && CONST_INT_P (XEXP (op0, 2)))
6833 /* COMPARE of ZERO_EXTRACT form of TST-immediate.
6834 Handle it here directly rather than going to cost_logic
6835 since we know the immediate generated for the TST is valid
6836 so we can avoid creating an intermediate rtx for it only
6837 for costing purposes. */
6838 if (speed)
6839 *cost += extra_cost->alu.logical;
6841 *cost += rtx_cost (XEXP (op0, 0), GET_MODE (op0),
6842 ZERO_EXTRACT, 0, speed);
6843 return true;
6846 if (GET_CODE (op1) == NEG)
6848 /* CMN. */
6849 if (speed)
6850 *cost += extra_cost->alu.arith;
6852 *cost += rtx_cost (op0, mode, COMPARE, 0, speed);
6853 *cost += rtx_cost (XEXP (op1, 0), mode, NEG, 1, speed);
6854 return true;
6857 /* CMP.
6859 Compare can freely swap the order of operands, and
6860 canonicalization puts the more complex operation first.
6861 But the integer MINUS logic expects the shift/extend
6862 operation in op1. */
6863 if (! (REG_P (op0)
6864 || (GET_CODE (op0) == SUBREG && REG_P (SUBREG_REG (op0)))))
6866 op0 = XEXP (x, 1);
6867 op1 = XEXP (x, 0);
6869 goto cost_minus;
6872 if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_FLOAT)
6874 /* FCMP. */
6875 if (speed)
6876 *cost += extra_cost->fp[mode == DFmode].compare;
6878 if (CONST_DOUBLE_P (op1) && aarch64_float_const_zero_rtx_p (op1))
6880 *cost += rtx_cost (op0, VOIDmode, COMPARE, 0, speed);
6881 /* FCMP supports constant 0.0 for no extra cost. */
6882 return true;
6884 return false;
6887 if (VECTOR_MODE_P (mode))
6889 /* Vector compare. */
6890 if (speed)
6891 *cost += extra_cost->vect.alu;
6893 if (aarch64_float_const_zero_rtx_p (op1))
6895 /* Vector cm (eq|ge|gt|lt|le) supports constant 0.0 for no extra
6896 cost. */
6897 return true;
6899 return false;
6901 return false;
6903 case MINUS:
6905 op0 = XEXP (x, 0);
6906 op1 = XEXP (x, 1);
6908 cost_minus:
6909 *cost += rtx_cost (op0, mode, MINUS, 0, speed);
6911 /* Detect valid immediates. */
6912 if ((GET_MODE_CLASS (mode) == MODE_INT
6913 || (GET_MODE_CLASS (mode) == MODE_CC
6914 && GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT))
6915 && CONST_INT_P (op1)
6916 && aarch64_uimm12_shift (INTVAL (op1)))
6918 if (speed)
6919 /* SUB(S) (immediate). */
6920 *cost += extra_cost->alu.arith;
6921 return true;
6924 /* Look for SUB (extended register). */
6925 if (aarch64_rtx_arith_op_extract_p (op1, mode))
6927 if (speed)
6928 *cost += extra_cost->alu.extend_arith;
6930 op1 = aarch64_strip_extend (op1);
6931 *cost += rtx_cost (op1, VOIDmode,
6932 (enum rtx_code) GET_CODE (op1), 0, speed);
6933 return true;
6936 rtx new_op1 = aarch64_strip_extend (op1);
6938 /* Cost this as an FMA-alike operation. */
6939 if ((GET_CODE (new_op1) == MULT
6940 || aarch64_shift_p (GET_CODE (new_op1)))
6941 && code != COMPARE)
6943 *cost += aarch64_rtx_mult_cost (new_op1, MULT,
6944 (enum rtx_code) code,
6945 speed);
6946 return true;
6949 *cost += rtx_cost (new_op1, VOIDmode, MINUS, 1, speed);
6951 if (speed)
6953 if (VECTOR_MODE_P (mode))
6955 /* Vector SUB. */
6956 *cost += extra_cost->vect.alu;
6958 else if (GET_MODE_CLASS (mode) == MODE_INT)
6960 /* SUB(S). */
6961 *cost += extra_cost->alu.arith;
6963 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
6965 /* FSUB. */
6966 *cost += extra_cost->fp[mode == DFmode].addsub;
6969 return true;
6972 case PLUS:
6974 rtx new_op0;
6976 op0 = XEXP (x, 0);
6977 op1 = XEXP (x, 1);
6979 cost_plus:
6980 if (GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMPARE
6981 || GET_RTX_CLASS (GET_CODE (op0)) == RTX_COMM_COMPARE)
6983 /* CSINC. */
6984 *cost += rtx_cost (XEXP (op0, 0), mode, PLUS, 0, speed);
6985 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
6986 return true;
6989 if (GET_MODE_CLASS (mode) == MODE_INT
6990 && CONST_INT_P (op1)
6991 && aarch64_uimm12_shift (INTVAL (op1)))
6993 *cost += rtx_cost (op0, mode, PLUS, 0, speed);
6995 if (speed)
6996 /* ADD (immediate). */
6997 *cost += extra_cost->alu.arith;
6998 return true;
7001 *cost += rtx_cost (op1, mode, PLUS, 1, speed);
7003 /* Look for ADD (extended register). */
7004 if (aarch64_rtx_arith_op_extract_p (op0, mode))
7006 if (speed)
7007 *cost += extra_cost->alu.extend_arith;
7009 op0 = aarch64_strip_extend (op0);
7010 *cost += rtx_cost (op0, VOIDmode,
7011 (enum rtx_code) GET_CODE (op0), 0, speed);
7012 return true;
7015 /* Strip any extend, leave shifts behind as we will
7016 cost them through mult_cost. */
7017 new_op0 = aarch64_strip_extend (op0);
7019 if (GET_CODE (new_op0) == MULT
7020 || aarch64_shift_p (GET_CODE (new_op0)))
7022 *cost += aarch64_rtx_mult_cost (new_op0, MULT, PLUS,
7023 speed);
7024 return true;
7027 *cost += rtx_cost (new_op0, VOIDmode, PLUS, 0, speed);
7029 if (speed)
7031 if (VECTOR_MODE_P (mode))
7033 /* Vector ADD. */
7034 *cost += extra_cost->vect.alu;
7036 else if (GET_MODE_CLASS (mode) == MODE_INT)
7038 /* ADD. */
7039 *cost += extra_cost->alu.arith;
7041 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7043 /* FADD. */
7044 *cost += extra_cost->fp[mode == DFmode].addsub;
7047 return true;
7050 case BSWAP:
7051 *cost = COSTS_N_INSNS (1);
7053 if (speed)
7055 if (VECTOR_MODE_P (mode))
7056 *cost += extra_cost->vect.alu;
7057 else
7058 *cost += extra_cost->alu.rev;
7060 return false;
7062 case IOR:
7063 if (aarch_rev16_p (x))
7065 *cost = COSTS_N_INSNS (1);
7067 if (speed)
7069 if (VECTOR_MODE_P (mode))
7070 *cost += extra_cost->vect.alu;
7071 else
7072 *cost += extra_cost->alu.rev;
7074 return true;
7077 if (aarch64_extr_rtx_p (x, &op0, &op1))
7079 *cost += rtx_cost (op0, mode, IOR, 0, speed);
7080 *cost += rtx_cost (op1, mode, IOR, 1, speed);
7081 if (speed)
7082 *cost += extra_cost->alu.shift;
7084 return true;
7086 /* Fall through. */
7087 case XOR:
7088 case AND:
7089 cost_logic:
7090 op0 = XEXP (x, 0);
7091 op1 = XEXP (x, 1);
7093 if (VECTOR_MODE_P (mode))
7095 if (speed)
7096 *cost += extra_cost->vect.alu;
7097 return true;
7100 if (code == AND
7101 && GET_CODE (op0) == MULT
7102 && CONST_INT_P (XEXP (op0, 1))
7103 && CONST_INT_P (op1)
7104 && aarch64_uxt_size (exact_log2 (INTVAL (XEXP (op0, 1))),
7105 INTVAL (op1)) != 0)
7107 /* This is a UBFM/SBFM. */
7108 *cost += rtx_cost (XEXP (op0, 0), mode, ZERO_EXTRACT, 0, speed);
7109 if (speed)
7110 *cost += extra_cost->alu.bfx;
7111 return true;
7114 if (GET_MODE_CLASS (mode) == MODE_INT)
7116 if (CONST_INT_P (op1))
7118 /* We have a mask + shift version of a UBFIZ
7119 i.e. the *andim_ashift<mode>_bfiz pattern. */
7120 if (GET_CODE (op0) == ASHIFT
7121 && aarch64_mask_and_shift_for_ubfiz_p (mode, op1,
7122 XEXP (op0, 1)))
7124 *cost += rtx_cost (XEXP (op0, 0), mode,
7125 (enum rtx_code) code, 0, speed);
7126 if (speed)
7127 *cost += extra_cost->alu.bfx;
7129 return true;
7131 else if (aarch64_bitmask_imm (INTVAL (op1), mode))
7133 /* We possibly get the immediate for free; this is not
7134 modelled. */
7135 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7136 if (speed)
7137 *cost += extra_cost->alu.logical;
7139 return true;
7142 else
7144 rtx new_op0 = op0;
7146 /* Handle ORN, EON, or BIC. */
7147 if (GET_CODE (op0) == NOT)
7148 op0 = XEXP (op0, 0);
7150 new_op0 = aarch64_strip_shift (op0);
7152 /* If we had a shift on op0 then this is a logical-shift-
7153 by-register/immediate operation. Otherwise, this is just
7154 a logical operation. */
7155 if (speed)
7157 if (new_op0 != op0)
7159 /* Shift by immediate. */
7160 if (CONST_INT_P (XEXP (op0, 1)))
7161 *cost += extra_cost->alu.log_shift;
7162 else
7163 *cost += extra_cost->alu.log_shift_reg;
7165 else
7166 *cost += extra_cost->alu.logical;
7169 /* In both cases we want to cost both operands. */
7170 *cost += rtx_cost (new_op0, mode, (enum rtx_code) code, 0, speed);
7171 *cost += rtx_cost (op1, mode, (enum rtx_code) code, 1, speed);
7173 return true;
7176 return false;
7178 case NOT:
7179 x = XEXP (x, 0);
7180 op0 = aarch64_strip_shift (x);
7182 if (VECTOR_MODE_P (mode))
7184 /* Vector NOT. */
7185 *cost += extra_cost->vect.alu;
7186 return false;
7189 /* MVN-shifted-reg. */
7190 if (op0 != x)
7192 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7194 if (speed)
7195 *cost += extra_cost->alu.log_shift;
7197 return true;
7199 /* EON can have two forms: (xor (not a) b) but also (not (xor a b)).
7200 Handle the second form here taking care that 'a' in the above can
7201 be a shift. */
7202 else if (GET_CODE (op0) == XOR)
7204 rtx newop0 = XEXP (op0, 0);
7205 rtx newop1 = XEXP (op0, 1);
7206 rtx op0_stripped = aarch64_strip_shift (newop0);
7208 *cost += rtx_cost (newop1, mode, (enum rtx_code) code, 1, speed);
7209 *cost += rtx_cost (op0_stripped, mode, XOR, 0, speed);
7211 if (speed)
7213 if (op0_stripped != newop0)
7214 *cost += extra_cost->alu.log_shift;
7215 else
7216 *cost += extra_cost->alu.logical;
7219 return true;
7221 /* MVN. */
7222 if (speed)
7223 *cost += extra_cost->alu.logical;
7225 return false;
7227 case ZERO_EXTEND:
7229 op0 = XEXP (x, 0);
7230 /* If a value is written in SI mode, then zero extended to DI
7231 mode, the operation will in general be free as a write to
7232 a 'w' register implicitly zeroes the upper bits of an 'x'
7233 register. However, if this is
7235 (set (reg) (zero_extend (reg)))
7237 we must cost the explicit register move. */
7238 if (mode == DImode
7239 && GET_MODE (op0) == SImode
7240 && outer == SET)
7242 int op_cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, 0, speed);
7244 /* If OP_COST is non-zero, then the cost of the zero extend
7245 is effectively the cost of the inner operation. Otherwise
7246 we have a MOV instruction and we take the cost from the MOV
7247 itself. This is true independently of whether we are
7248 optimizing for space or time. */
7249 if (op_cost)
7250 *cost = op_cost;
7252 return true;
7254 else if (MEM_P (op0))
7256 /* All loads can zero extend to any size for free. */
7257 *cost = rtx_cost (op0, VOIDmode, ZERO_EXTEND, param, speed);
7258 return true;
7261 op0 = aarch64_extend_bitfield_pattern_p (x);
7262 if (op0)
7264 *cost += rtx_cost (op0, mode, ZERO_EXTEND, 0, speed);
7265 if (speed)
7266 *cost += extra_cost->alu.bfx;
7267 return true;
7270 if (speed)
7272 if (VECTOR_MODE_P (mode))
7274 /* UMOV. */
7275 *cost += extra_cost->vect.alu;
7277 else
7279 /* We generate an AND instead of UXTB/UXTH. */
7280 *cost += extra_cost->alu.logical;
7283 return false;
7285 case SIGN_EXTEND:
7286 if (MEM_P (XEXP (x, 0)))
7288 /* LDRSH. */
7289 if (speed)
7291 rtx address = XEXP (XEXP (x, 0), 0);
7292 *cost += extra_cost->ldst.load_sign_extend;
7294 *cost +=
7295 COSTS_N_INSNS (aarch64_address_cost (address, mode,
7296 0, speed));
7298 return true;
7301 op0 = aarch64_extend_bitfield_pattern_p (x);
7302 if (op0)
7304 *cost += rtx_cost (op0, mode, SIGN_EXTEND, 0, speed);
7305 if (speed)
7306 *cost += extra_cost->alu.bfx;
7307 return true;
7310 if (speed)
7312 if (VECTOR_MODE_P (mode))
7313 *cost += extra_cost->vect.alu;
7314 else
7315 *cost += extra_cost->alu.extend;
7317 return false;
7319 case ASHIFT:
7320 op0 = XEXP (x, 0);
7321 op1 = XEXP (x, 1);
7323 if (CONST_INT_P (op1))
7325 if (speed)
7327 if (VECTOR_MODE_P (mode))
7329 /* Vector shift (immediate). */
7330 *cost += extra_cost->vect.alu;
7332 else
7334 /* LSL (immediate), UBFM, UBFIZ and friends. These are all
7335 aliases. */
7336 *cost += extra_cost->alu.shift;
7340 /* We can incorporate zero/sign extend for free. */
7341 if (GET_CODE (op0) == ZERO_EXTEND
7342 || GET_CODE (op0) == SIGN_EXTEND)
7343 op0 = XEXP (op0, 0);
7345 *cost += rtx_cost (op0, VOIDmode, ASHIFT, 0, speed);
7346 return true;
7348 else
7350 if (speed)
7352 if (VECTOR_MODE_P (mode))
7354 /* Vector shift (register). */
7355 *cost += extra_cost->vect.alu;
7357 else
7359 /* LSLV. */
7360 *cost += extra_cost->alu.shift_reg;
7363 return false; /* All arguments need to be in registers. */
7366 case ROTATE:
7367 case ROTATERT:
7368 case LSHIFTRT:
7369 case ASHIFTRT:
7370 op0 = XEXP (x, 0);
7371 op1 = XEXP (x, 1);
7373 if (CONST_INT_P (op1))
7375 /* ASR (immediate) and friends. */
7376 if (speed)
7378 if (VECTOR_MODE_P (mode))
7379 *cost += extra_cost->vect.alu;
7380 else
7381 *cost += extra_cost->alu.shift;
7384 *cost += rtx_cost (op0, mode, (enum rtx_code) code, 0, speed);
7385 return true;
7387 else
7390 /* ASR (register) and friends. */
7391 if (speed)
7393 if (VECTOR_MODE_P (mode))
7394 *cost += extra_cost->vect.alu;
7395 else
7396 *cost += extra_cost->alu.shift_reg;
7398 return false; /* All arguments need to be in registers. */
7401 case SYMBOL_REF:
7403 if (aarch64_cmodel == AARCH64_CMODEL_LARGE
7404 || aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC)
7406 /* LDR. */
7407 if (speed)
7408 *cost += extra_cost->ldst.load;
7410 else if (aarch64_cmodel == AARCH64_CMODEL_SMALL
7411 || aarch64_cmodel == AARCH64_CMODEL_SMALL_PIC)
7413 /* ADRP, followed by ADD. */
7414 *cost += COSTS_N_INSNS (1);
7415 if (speed)
7416 *cost += 2 * extra_cost->alu.arith;
7418 else if (aarch64_cmodel == AARCH64_CMODEL_TINY
7419 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
7421 /* ADR. */
7422 if (speed)
7423 *cost += extra_cost->alu.arith;
7426 if (flag_pic)
7428 /* One extra load instruction, after accessing the GOT. */
7429 *cost += COSTS_N_INSNS (1);
7430 if (speed)
7431 *cost += extra_cost->ldst.load;
7433 return true;
7435 case HIGH:
7436 case LO_SUM:
7437 /* ADRP/ADD (immediate). */
7438 if (speed)
7439 *cost += extra_cost->alu.arith;
7440 return true;
7442 case ZERO_EXTRACT:
7443 case SIGN_EXTRACT:
7444 /* UBFX/SBFX. */
7445 if (speed)
7447 if (VECTOR_MODE_P (mode))
7448 *cost += extra_cost->vect.alu;
7449 else
7450 *cost += extra_cost->alu.bfx;
7453 /* We can trust that the immediates used will be correct (there
7454 are no by-register forms), so we need only cost op0. */
7455 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (enum rtx_code) code, 0, speed);
7456 return true;
7458 case MULT:
7459 *cost += aarch64_rtx_mult_cost (x, MULT, 0, speed);
7460 /* aarch64_rtx_mult_cost always handles recursion to its
7461 operands. */
7462 return true;
7464 case MOD:
7465 /* We can expand signed mod by power of 2 using a NEGS, two parallel
7466 ANDs and a CSNEG. Assume here that CSNEG costs the same as
7467 an unconditional negate. This case should only ever be reached through
7468 the set_smod_pow2_cheap check in expmed.c. */
7469 if (CONST_INT_P (XEXP (x, 1))
7470 && exact_log2 (INTVAL (XEXP (x, 1))) > 0
7471 && (mode == SImode || mode == DImode))
7473 /* We expand to 4 instructions. Reset the baseline. */
7474 *cost = COSTS_N_INSNS (4);
7476 if (speed)
7477 *cost += 2 * extra_cost->alu.logical
7478 + 2 * extra_cost->alu.arith;
7480 return true;
7483 /* Fall-through. */
7484 case UMOD:
7485 if (speed)
7487 if (VECTOR_MODE_P (mode))
7488 *cost += extra_cost->vect.alu;
7489 else if (GET_MODE_CLASS (mode) == MODE_INT)
7490 *cost += (extra_cost->mult[mode == DImode].add
7491 + extra_cost->mult[mode == DImode].idiv);
7492 else if (mode == DFmode)
7493 *cost += (extra_cost->fp[1].mult
7494 + extra_cost->fp[1].div);
7495 else if (mode == SFmode)
7496 *cost += (extra_cost->fp[0].mult
7497 + extra_cost->fp[0].div);
7499 return false; /* All arguments need to be in registers. */
7501 case DIV:
7502 case UDIV:
7503 case SQRT:
7504 if (speed)
7506 if (VECTOR_MODE_P (mode))
7507 *cost += extra_cost->vect.alu;
7508 else if (GET_MODE_CLASS (mode) == MODE_INT)
7509 /* There is no integer SQRT, so only DIV and UDIV can get
7510 here. */
7511 *cost += extra_cost->mult[mode == DImode].idiv;
7512 else
7513 *cost += extra_cost->fp[mode == DFmode].div;
7515 return false; /* All arguments need to be in registers. */
7517 case IF_THEN_ELSE:
7518 return aarch64_if_then_else_costs (XEXP (x, 0), XEXP (x, 1),
7519 XEXP (x, 2), cost, speed);
7521 case EQ:
7522 case NE:
7523 case GT:
7524 case GTU:
7525 case LT:
7526 case LTU:
7527 case GE:
7528 case GEU:
7529 case LE:
7530 case LEU:
7532 return false; /* All arguments must be in registers. */
7534 case FMA:
7535 op0 = XEXP (x, 0);
7536 op1 = XEXP (x, 1);
7537 op2 = XEXP (x, 2);
7539 if (speed)
7541 if (VECTOR_MODE_P (mode))
7542 *cost += extra_cost->vect.alu;
7543 else
7544 *cost += extra_cost->fp[mode == DFmode].fma;
7547 /* FMSUB, FNMADD, and FNMSUB are free. */
7548 if (GET_CODE (op0) == NEG)
7549 op0 = XEXP (op0, 0);
7551 if (GET_CODE (op2) == NEG)
7552 op2 = XEXP (op2, 0);
7554 /* aarch64_fnma4_elt_to_64v2df has the NEG as operand 1,
7555 and the by-element operand as operand 0. */
7556 if (GET_CODE (op1) == NEG)
7557 op1 = XEXP (op1, 0);
7559 /* Catch vector-by-element operations. The by-element operand can
7560 either be (vec_duplicate (vec_select (x))) or just
7561 (vec_select (x)), depending on whether we are multiplying by
7562 a vector or a scalar.
7564 Canonicalization is not very good in these cases: FMA4 will put the
7565 by-element operand as operand 0, while FNMA4 will have it as operand 1. */
7566 if (GET_CODE (op0) == VEC_DUPLICATE)
7567 op0 = XEXP (op0, 0);
7568 else if (GET_CODE (op1) == VEC_DUPLICATE)
7569 op1 = XEXP (op1, 0);
7571 if (GET_CODE (op0) == VEC_SELECT)
7572 op0 = XEXP (op0, 0);
7573 else if (GET_CODE (op1) == VEC_SELECT)
7574 op1 = XEXP (op1, 0);
7576 /* If the remaining parameters are not registers,
7577 get the cost to put them into registers. */
7578 *cost += rtx_cost (op0, mode, FMA, 0, speed);
7579 *cost += rtx_cost (op1, mode, FMA, 1, speed);
7580 *cost += rtx_cost (op2, mode, FMA, 2, speed);
7581 return true;
7583 case FLOAT:
7584 case UNSIGNED_FLOAT:
7585 if (speed)
7586 *cost += extra_cost->fp[mode == DFmode].fromint;
7587 return false;
7589 case FLOAT_EXTEND:
7590 if (speed)
7592 if (VECTOR_MODE_P (mode))
7594 /* Vector widening conversion. */
7595 *cost += extra_cost->vect.alu;
7597 else
7598 *cost += extra_cost->fp[mode == DFmode].widen;
7600 return false;
7602 case FLOAT_TRUNCATE:
7603 if (speed)
7605 if (VECTOR_MODE_P (mode))
7607 /* Vector narrowing conversion. */
7608 *cost += extra_cost->vect.alu;
7610 else
7611 *cost += extra_cost->fp[mode == DFmode].narrow;
7613 return false;
7615 case FIX:
7616 case UNSIGNED_FIX:
7617 x = XEXP (x, 0);
7618 /* Strip the rounding part. They will all be implemented
7619 by the fcvt* family of instructions anyway. */
7620 if (GET_CODE (x) == UNSPEC)
7622 unsigned int uns_code = XINT (x, 1);
7624 if (uns_code == UNSPEC_FRINTA
7625 || uns_code == UNSPEC_FRINTM
7626 || uns_code == UNSPEC_FRINTN
7627 || uns_code == UNSPEC_FRINTP
7628 || uns_code == UNSPEC_FRINTZ)
7629 x = XVECEXP (x, 0, 0);
7632 if (speed)
7634 if (VECTOR_MODE_P (mode))
7635 *cost += extra_cost->vect.alu;
7636 else
7637 *cost += extra_cost->fp[GET_MODE (x) == DFmode].toint;
7640 /* We can combine fmul by a power of 2 followed by a fcvt into a single
7641 fixed-point fcvt. */
7642 if (GET_CODE (x) == MULT
7643 && ((VECTOR_MODE_P (mode)
7644 && aarch64_vec_fpconst_pow_of_2 (XEXP (x, 1)) > 0)
7645 || aarch64_fpconst_pow_of_2 (XEXP (x, 1)) > 0))
7647 *cost += rtx_cost (XEXP (x, 0), VOIDmode, (rtx_code) code,
7648 0, speed);
7649 return true;
7652 *cost += rtx_cost (x, VOIDmode, (enum rtx_code) code, 0, speed);
7653 return true;
7655 case ABS:
7656 if (VECTOR_MODE_P (mode))
7658 /* ABS (vector). */
7659 if (speed)
7660 *cost += extra_cost->vect.alu;
7662 else if (GET_MODE_CLASS (mode) == MODE_FLOAT)
7664 op0 = XEXP (x, 0);
7666 /* FABD, which is analogous to FADD. */
7667 if (GET_CODE (op0) == MINUS)
7669 *cost += rtx_cost (XEXP (op0, 0), mode, MINUS, 0, speed);
7670 *cost += rtx_cost (XEXP (op0, 1), mode, MINUS, 1, speed);
7671 if (speed)
7672 *cost += extra_cost->fp[mode == DFmode].addsub;
7674 return true;
7676 /* Simple FABS is analogous to FNEG. */
7677 if (speed)
7678 *cost += extra_cost->fp[mode == DFmode].neg;
7680 else
7682 /* Integer ABS will either be split into
7683 two arithmetic instructions, or will be an ABS
7684 (scalar), which we don't model. */
7685 *cost = COSTS_N_INSNS (2);
7686 if (speed)
7687 *cost += 2 * extra_cost->alu.arith;
7689 return false;
7691 case SMAX:
7692 case SMIN:
7693 if (speed)
7695 if (VECTOR_MODE_P (mode))
7696 *cost += extra_cost->vect.alu;
7697 else
7699 /* FMAXNM/FMINNM/FMAX/FMIN.
7700 TODO: This may not be accurate for all implementations, but
7701 we do not model this in the cost tables. */
7702 *cost += extra_cost->fp[mode == DFmode].addsub;
7705 return false;
7707 case UNSPEC:
7708 /* The floating point round to integer frint* instructions. */
7709 if (aarch64_frint_unspec_p (XINT (x, 1)))
7711 if (speed)
7712 *cost += extra_cost->fp[mode == DFmode].roundint;
7714 return false;
7717 if (XINT (x, 1) == UNSPEC_RBIT)
7719 if (speed)
7720 *cost += extra_cost->alu.rev;
7722 return false;
7724 break;
7726 case TRUNCATE:
7728 /* Decompose <su>muldi3_highpart. */
7729 if (/* (truncate:DI */
7730 mode == DImode
7731 /* (lshiftrt:TI */
7732 && GET_MODE (XEXP (x, 0)) == TImode
7733 && GET_CODE (XEXP (x, 0)) == LSHIFTRT
7734 /* (mult:TI */
7735 && GET_CODE (XEXP (XEXP (x, 0), 0)) == MULT
7736 /* (ANY_EXTEND:TI (reg:DI))
7737 (ANY_EXTEND:TI (reg:DI))) */
7738 && ((GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == ZERO_EXTEND
7739 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == ZERO_EXTEND)
7740 || (GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 0)) == SIGN_EXTEND
7741 && GET_CODE (XEXP (XEXP (XEXP (x, 0), 0), 1)) == SIGN_EXTEND))
7742 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0)) == DImode
7743 && GET_MODE (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0)) == DImode
7744 /* (const_int 64) */
7745 && CONST_INT_P (XEXP (XEXP (x, 0), 1))
7746 && UINTVAL (XEXP (XEXP (x, 0), 1)) == 64)
7748 /* UMULH/SMULH. */
7749 if (speed)
7750 *cost += extra_cost->mult[mode == DImode].extend;
7751 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 0), 0),
7752 mode, MULT, 0, speed);
7753 *cost += rtx_cost (XEXP (XEXP (XEXP (XEXP (x, 0), 0), 1), 0),
7754 mode, MULT, 1, speed);
7755 return true;
7758 /* Fall through. */
7759 default:
7760 break;
7763 if (dump_file
7764 && flag_aarch64_verbose_cost)
7765 fprintf (dump_file,
7766 "\nFailed to cost RTX. Assuming default cost.\n");
7768 return true;
7771 /* Wrapper around aarch64_rtx_costs, dumps the partial, or total cost
7772 calculated for X. This cost is stored in *COST. Returns true
7773 if the total cost of X was calculated. */
7774 static bool
7775 aarch64_rtx_costs_wrapper (rtx x, machine_mode mode, int outer,
7776 int param, int *cost, bool speed)
7778 bool result = aarch64_rtx_costs (x, mode, outer, param, cost, speed);
7780 if (dump_file
7781 && flag_aarch64_verbose_cost)
7783 print_rtl_single (dump_file, x);
7784 fprintf (dump_file, "\n%s cost: %d (%s)\n",
7785 speed ? "Hot" : "Cold",
7786 *cost, result ? "final" : "partial");
7789 return result;
7792 static int
7793 aarch64_register_move_cost (machine_mode mode,
7794 reg_class_t from_i, reg_class_t to_i)
7796 enum reg_class from = (enum reg_class) from_i;
7797 enum reg_class to = (enum reg_class) to_i;
7798 const struct cpu_regmove_cost *regmove_cost
7799 = aarch64_tune_params.regmove_cost;
7801 /* Caller save and pointer regs are equivalent to GENERAL_REGS. */
7802 if (to == CALLER_SAVE_REGS || to == POINTER_REGS)
7803 to = GENERAL_REGS;
7805 if (from == CALLER_SAVE_REGS || from == POINTER_REGS)
7806 from = GENERAL_REGS;
7808 /* Moving between GPR and stack cost is the same as GP2GP. */
7809 if ((from == GENERAL_REGS && to == STACK_REG)
7810 || (to == GENERAL_REGS && from == STACK_REG))
7811 return regmove_cost->GP2GP;
7813 /* To/From the stack register, we move via the gprs. */
7814 if (to == STACK_REG || from == STACK_REG)
7815 return aarch64_register_move_cost (mode, from, GENERAL_REGS)
7816 + aarch64_register_move_cost (mode, GENERAL_REGS, to);
7818 if (GET_MODE_SIZE (mode) == 16)
7820 /* 128-bit operations on general registers require 2 instructions. */
7821 if (from == GENERAL_REGS && to == GENERAL_REGS)
7822 return regmove_cost->GP2GP * 2;
7823 else if (from == GENERAL_REGS)
7824 return regmove_cost->GP2FP * 2;
7825 else if (to == GENERAL_REGS)
7826 return regmove_cost->FP2GP * 2;
7828 /* When AdvSIMD instructions are disabled it is not possible to move
7829 a 128-bit value directly between Q registers. This is handled in
7830 secondary reload. A general register is used as a scratch to move
7831 the upper DI value and the lower DI value is moved directly,
7832 hence the cost is the sum of three moves. */
7833 if (! TARGET_SIMD)
7834 return regmove_cost->GP2FP + regmove_cost->FP2GP + regmove_cost->FP2FP;
7836 return regmove_cost->FP2FP;
7839 if (from == GENERAL_REGS && to == GENERAL_REGS)
7840 return regmove_cost->GP2GP;
7841 else if (from == GENERAL_REGS)
7842 return regmove_cost->GP2FP;
7843 else if (to == GENERAL_REGS)
7844 return regmove_cost->FP2GP;
7846 return regmove_cost->FP2FP;
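/* For example (illustrative): a 16-byte (e.g. TImode) copy between a
   general register and an FP/SIMD register is costed as two GP2FP or
   FP2GP moves, and without TARGET_SIMD a 16-byte FP-to-FP copy goes
   through a general scratch register, giving GP2FP + FP2GP + FP2FP.  */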
7849 static int
7850 aarch64_memory_move_cost (machine_mode mode ATTRIBUTE_UNUSED,
7851 reg_class_t rclass ATTRIBUTE_UNUSED,
7852 bool in ATTRIBUTE_UNUSED)
7854 return aarch64_tune_params.memmov_cost;
7857 /* Return true if it is safe and beneficial to use the approximate rsqrt optabs
7858 to optimize 1.0/sqrt. */
7860 static bool
7861 use_rsqrt_p (machine_mode mode)
7863 return (!flag_trapping_math
7864 && flag_unsafe_math_optimizations
7865 && ((aarch64_tune_params.approx_modes->recip_sqrt
7866 & AARCH64_APPROX_MODE (mode))
7867 || flag_mrecip_low_precision_sqrt));
7870 /* Function to decide when to use the approximate reciprocal square root
7871 builtin. */
7873 static tree
7874 aarch64_builtin_reciprocal (tree fndecl)
7876 machine_mode mode = TYPE_MODE (TREE_TYPE (fndecl));
7878 if (!use_rsqrt_p (mode))
7879 return NULL_TREE;
7880 return aarch64_builtin_rsqrt (DECL_FUNCTION_CODE (fndecl));
7883 typedef rtx (*rsqrte_type) (rtx, rtx);
7885 /* Select reciprocal square root initial estimate insn depending on machine
7886 mode. */
7888 static rsqrte_type
7889 get_rsqrte_type (machine_mode mode)
7891 switch (mode)
7893 case DFmode: return gen_aarch64_rsqrtedf;
7894 case SFmode: return gen_aarch64_rsqrtesf;
7895 case V2DFmode: return gen_aarch64_rsqrtev2df;
7896 case V2SFmode: return gen_aarch64_rsqrtev2sf;
7897 case V4SFmode: return gen_aarch64_rsqrtev4sf;
7898 default: gcc_unreachable ();
7902 typedef rtx (*rsqrts_type) (rtx, rtx, rtx);
7904 /* Select reciprocal square root series step insn depending on machine mode. */
7906 static rsqrts_type
7907 get_rsqrts_type (machine_mode mode)
7909 switch (mode)
7911 case DFmode: return gen_aarch64_rsqrtsdf;
7912 case SFmode: return gen_aarch64_rsqrtssf;
7913 case V2DFmode: return gen_aarch64_rsqrtsv2df;
7914 case V2SFmode: return gen_aarch64_rsqrtsv2sf;
7915 case V4SFmode: return gen_aarch64_rsqrtsv4sf;
7916 default: gcc_unreachable ();
7920 /* Emit instruction sequence to compute either the approximate square root
7921 or its approximate reciprocal, depending on the flag RECP, and return
7922 whether the sequence was emitted or not. */
7924 bool
7925 aarch64_emit_approx_sqrt (rtx dst, rtx src, bool recp)
7927 machine_mode mode = GET_MODE (dst);
7929 if (GET_MODE_INNER (mode) == HFmode)
7930 return false;
7932 machine_mode mmsk = mode_for_vector
7933 (int_mode_for_mode (GET_MODE_INNER (mode)),
7934 GET_MODE_NUNITS (mode));
7935 bool use_approx_sqrt_p = (!recp
7936 && (flag_mlow_precision_sqrt
7937 || (aarch64_tune_params.approx_modes->sqrt
7938 & AARCH64_APPROX_MODE (mode))));
7939 bool use_approx_rsqrt_p = (recp
7940 && (flag_mrecip_low_precision_sqrt
7941 || (aarch64_tune_params.approx_modes->recip_sqrt
7942 & AARCH64_APPROX_MODE (mode))));
7944 if (!flag_finite_math_only
7945 || flag_trapping_math
7946 || !flag_unsafe_math_optimizations
7947 || !(use_approx_sqrt_p || use_approx_rsqrt_p)
7948 || optimize_function_for_size_p (cfun))
7949 return false;
7951 rtx xmsk = gen_reg_rtx (mmsk);
7952 if (!recp)
7953 /* When calculating the approximate square root, compare the argument with
7954 0.0 and create a mask. */
7955 emit_insn (gen_rtx_SET (xmsk, gen_rtx_NEG (mmsk, gen_rtx_EQ (mmsk, src,
7956 CONST0_RTX (mode)))));
7958 /* Estimate the approximate reciprocal square root. */
7959 rtx xdst = gen_reg_rtx (mode);
7960 emit_insn ((*get_rsqrte_type (mode)) (xdst, src));
7962 /* Iterate over the series twice for SF and thrice for DF. */
7963 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
7965 /* Optionally do one fewer iteration of the series for faster performance
7966 at the cost of some accuracy. */
7967 if ((recp && flag_mrecip_low_precision_sqrt)
7968 || (!recp && flag_mlow_precision_sqrt))
7969 iterations--;
7971 /* Iterate over the series to calculate the approximate reciprocal square
7972 root. */
7973 rtx x1 = gen_reg_rtx (mode);
7974 while (iterations--)
7976 rtx x2 = gen_reg_rtx (mode);
7977 emit_set_insn (x2, gen_rtx_MULT (mode, xdst, xdst));
7979 emit_insn ((*get_rsqrts_type (mode)) (x1, src, x2));
7981 if (iterations > 0)
7982 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, x1));
7985 if (!recp)
7987 /* Qualify the approximate reciprocal square root when the argument is
7988 0.0 by squashing the intermediary result to 0.0. */
7989 rtx xtmp = gen_reg_rtx (mmsk);
7990 emit_set_insn (xtmp, gen_rtx_AND (mmsk, gen_rtx_NOT (mmsk, xmsk),
7991 gen_rtx_SUBREG (mmsk, xdst, 0)));
7992 emit_move_insn (xdst, gen_rtx_SUBREG (mode, xtmp, 0));
7994 /* Calculate the approximate square root. */
7995 emit_set_insn (xdst, gen_rtx_MULT (mode, xdst, src));
7998 /* Finalize the approximation. */
7999 emit_set_insn (dst, gen_rtx_MULT (mode, xdst, x1));
8001 return true;
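/* Note on the math above (a rough sketch): the loop is Newton-Raphson
   refinement of the FRSQRTE estimate r for 1/sqrt(d), each FRSQRTS step
   supplying the correction factor (3 - d * r * r) / 2 so that
   r <- r * (3 - d * r * r) / 2.  For the non-reciprocal case the result
   is finally multiplied by d, using sqrt(d) == d * (1/sqrt(d)), with the
   mask computed earlier forcing the answer to 0.0 when d == 0.0.  */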
8004 typedef rtx (*recpe_type) (rtx, rtx);
8006 /* Select reciprocal initial estimate insn depending on machine mode. */
8008 static recpe_type
8009 get_recpe_type (machine_mode mode)
8011 switch (mode)
8013 case SFmode: return (gen_aarch64_frecpesf);
8014 case V2SFmode: return (gen_aarch64_frecpev2sf);
8015 case V4SFmode: return (gen_aarch64_frecpev4sf);
8016 case DFmode: return (gen_aarch64_frecpedf);
8017 case V2DFmode: return (gen_aarch64_frecpev2df);
8018 default: gcc_unreachable ();
8022 typedef rtx (*recps_type) (rtx, rtx, rtx);
8024 /* Select reciprocal series step insn depending on machine mode. */
8026 static recps_type
8027 get_recps_type (machine_mode mode)
8029 switch (mode)
8031 case SFmode: return (gen_aarch64_frecpssf);
8032 case V2SFmode: return (gen_aarch64_frecpsv2sf);
8033 case V4SFmode: return (gen_aarch64_frecpsv4sf);
8034 case DFmode: return (gen_aarch64_frecpsdf);
8035 case V2DFmode: return (gen_aarch64_frecpsv2df);
8036 default: gcc_unreachable ();
8040 /* Emit the instruction sequence to compute the approximation for the division
8041 of NUM by DEN in QUO and return whether the sequence was emitted or not. */
8043 bool
8044 aarch64_emit_approx_div (rtx quo, rtx num, rtx den)
8046 machine_mode mode = GET_MODE (quo);
8048 if (GET_MODE_INNER (mode) == HFmode)
8049 return false;
8051 bool use_approx_division_p = (flag_mlow_precision_div
8052 || (aarch64_tune_params.approx_modes->division
8053 & AARCH64_APPROX_MODE (mode)));
8055 if (!flag_finite_math_only
8056 || flag_trapping_math
8057 || !flag_unsafe_math_optimizations
8058 || optimize_function_for_size_p (cfun)
8059 || !use_approx_division_p)
8060 return false;
8062 /* Estimate the approximate reciprocal. */
8063 rtx xrcp = gen_reg_rtx (mode);
8064 emit_insn ((*get_recpe_type (mode)) (xrcp, den));
8066 /* Iterate over the series twice for SF and thrice for DF. */
8067 int iterations = (GET_MODE_INNER (mode) == DFmode) ? 3 : 2;
8069 /* Optionally do one fewer iteration of the series for faster performance,
8070 at the cost of some accuracy. */
8071 if (flag_mlow_precision_div)
8072 iterations--;
8074 /* Iterate over the series to calculate the approximate reciprocal. */
8075 rtx xtmp = gen_reg_rtx (mode);
8076 while (iterations--)
8078 emit_insn ((*get_recps_type (mode)) (xtmp, xrcp, den));
8080 if (iterations > 0)
8081 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xtmp));
8084 if (num != CONST1_RTX (mode))
8086 /* As the approximate reciprocal of DEN is already calculated, only
8087 calculate the approximate division when NUM is not 1.0. */
8088 rtx xnum = force_reg (mode, num);
8089 emit_set_insn (xrcp, gen_rtx_MULT (mode, xrcp, xnum));
8092 /* Finalize the approximation. */
8093 emit_set_insn (quo, gen_rtx_MULT (mode, xrcp, xtmp));
8094 return true;
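/* A sketch of the iteration above (illustrative): FRECPS (a, b) computes
   2 - a * b, so each pass of the loop is one Newton-Raphson step for
   1/DEN:

     t = frecps (x, den)           == 2 - x * den
     x = x * t                     refined estimate of 1/den

   The quotient is then formed as (num * x) * t, i.e. the last correction
   factor T is folded into the final multiply rather than into X itself.  */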
8097 /* Return the number of instructions that can be issued per cycle. */
8098 static int
8099 aarch64_sched_issue_rate (void)
8101 return aarch64_tune_params.issue_rate;
8104 static int
8105 aarch64_sched_first_cycle_multipass_dfa_lookahead (void)
8107 int issue_rate = aarch64_sched_issue_rate ();
8109 return issue_rate > 1 && !sched_fusion ? issue_rate : 0;
8113 /* Implement TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD as
8114 autopref_multipass_dfa_lookahead_guard from haifa-sched.c. It only
8115 has an effect if PARAM_SCHED_AUTOPREF_QUEUE_DEPTH > 0. */
8117 static int
8118 aarch64_first_cycle_multipass_dfa_lookahead_guard (rtx_insn *insn,
8119 int ready_index)
8121 return autopref_multipass_dfa_lookahead_guard (insn, ready_index);
8125 /* Vectorizer cost model target hooks. */
8127 /* Implement targetm.vectorize.builtin_vectorization_cost. */
8128 static int
8129 aarch64_builtin_vectorization_cost (enum vect_cost_for_stmt type_of_cost,
8130 tree vectype,
8131 int misalign ATTRIBUTE_UNUSED)
8133 unsigned elements;
8134 const cpu_vector_cost *costs = aarch64_tune_params.vec_costs;
8135 bool fp = false;
8137 if (vectype != NULL)
8138 fp = FLOAT_TYPE_P (vectype);
8140 switch (type_of_cost)
8142 case scalar_stmt:
8143 return fp ? costs->scalar_fp_stmt_cost : costs->scalar_int_stmt_cost;
8145 case scalar_load:
8146 return costs->scalar_load_cost;
8148 case scalar_store:
8149 return costs->scalar_store_cost;
8151 case vector_stmt:
8152 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8154 case vector_load:
8155 return costs->vec_align_load_cost;
8157 case vector_store:
8158 return costs->vec_store_cost;
8160 case vec_to_scalar:
8161 return costs->vec_to_scalar_cost;
8163 case scalar_to_vec:
8164 return costs->scalar_to_vec_cost;
8166 case unaligned_load:
8167 return costs->vec_unalign_load_cost;
8169 case unaligned_store:
8170 return costs->vec_unalign_store_cost;
8172 case cond_branch_taken:
8173 return costs->cond_taken_branch_cost;
8175 case cond_branch_not_taken:
8176 return costs->cond_not_taken_branch_cost;
8178 case vec_perm:
8179 return costs->vec_permute_cost;
8181 case vec_promote_demote:
8182 return fp ? costs->vec_fp_stmt_cost : costs->vec_int_stmt_cost;
8184 case vec_construct:
8185 elements = TYPE_VECTOR_SUBPARTS (vectype);
8186 return elements / 2 + 1;
8188 default:
8189 gcc_unreachable ();
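/* Worked example for the vec_construct case above (pure arithmetic, not a
   tuned value): a V4SF constructor has TYPE_VECTOR_SUBPARTS == 4 and so
   costs 4 / 2 + 1 == 3, while a V2DF constructor costs 2 / 2 + 1 == 2.
   Every other case simply reads the per-core vector cost table.  */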
8193 /* Implement targetm.vectorize.add_stmt_cost. */
8194 static unsigned
8195 aarch64_add_stmt_cost (void *data, int count, enum vect_cost_for_stmt kind,
8196 struct _stmt_vec_info *stmt_info, int misalign,
8197 enum vect_cost_model_location where)
8199 unsigned *cost = (unsigned *) data;
8200 unsigned retval = 0;
8202 if (flag_vect_cost_model)
8204 tree vectype = stmt_info ? stmt_vectype (stmt_info) : NULL_TREE;
8205 int stmt_cost =
8206 aarch64_builtin_vectorization_cost (kind, vectype, misalign);
8208 /* Statements in an inner loop relative to the loop being
8209 vectorized are weighted more heavily. The value here is
8210 arbitrary and could potentially be improved with analysis. */
8211 if (where == vect_body && stmt_info && stmt_in_inner_loop_p (stmt_info))
8212 count *= 50; /* FIXME */
8214 retval = (unsigned) (count * stmt_cost);
8215 cost[where] += retval;
8218 return retval;
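/* Worked example (with an illustrative vector statement cost of 1): a
   vector_stmt in the body of a loop nested inside the loop being
   vectorized, with COUNT == 1, is accumulated as 1 * 50 * 1 == 50 units
   into COST[vect_body] because of the factor of 50 applied above, whereas
   the same statement in the outermost vectorized loop would add just 1.  */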
8221 static void initialize_aarch64_code_model (struct gcc_options *);
8223 /* Parse the TO_PARSE string and put the architecture struct that it
8224 selects into RES and the architectural features into ISA_FLAGS.
8225 Return an aarch64_parse_opt_result describing the parse result.
8226 If there is an error parsing, RES and ISA_FLAGS are left unchanged. */
8228 static enum aarch64_parse_opt_result
8229 aarch64_parse_arch (const char *to_parse, const struct processor **res,
8230 unsigned long *isa_flags)
8232 char *ext;
8233 const struct processor *arch;
8234 char *str = (char *) alloca (strlen (to_parse) + 1);
8235 size_t len;
8237 strcpy (str, to_parse);
8239 ext = strchr (str, '+');
8241 if (ext != NULL)
8242 len = ext - str;
8243 else
8244 len = strlen (str);
8246 if (len == 0)
8247 return AARCH64_PARSE_MISSING_ARG;
8250 /* Loop through the list of supported ARCHes to find a match. */
8251 for (arch = all_architectures; arch->name != NULL; arch++)
8253 if (strlen (arch->name) == len && strncmp (arch->name, str, len) == 0)
8255 unsigned long isa_temp = arch->flags;
8257 if (ext != NULL)
8259 /* TO_PARSE string contains at least one extension. */
8260 enum aarch64_parse_opt_result ext_res
8261 = aarch64_parse_extension (ext, &isa_temp);
8263 if (ext_res != AARCH64_PARSE_OK)
8264 return ext_res;
8266 /* Extension parsing was successful. Confirm the result
8267 arch and ISA flags. */
8268 *res = arch;
8269 *isa_flags = isa_temp;
8270 return AARCH64_PARSE_OK;
8274 /* ARCH name not found in list. */
8275 return AARCH64_PARSE_INVALID_ARG;
8278 /* Parse the TO_PARSE string and put the result tuning in RES and the
8279 architecture flags in ISA_FLAGS. Return an aarch64_parse_opt_result
8280 describing the parse result. If there is an error parsing, RES and
8281 ISA_FLAGS are left unchanged. */
8283 static enum aarch64_parse_opt_result
8284 aarch64_parse_cpu (const char *to_parse, const struct processor **res,
8285 unsigned long *isa_flags)
8287 char *ext;
8288 const struct processor *cpu;
8289 char *str = (char *) alloca (strlen (to_parse) + 1);
8290 size_t len;
8292 strcpy (str, to_parse);
8294 ext = strchr (str, '+');
8296 if (ext != NULL)
8297 len = ext - str;
8298 else
8299 len = strlen (str);
8301 if (len == 0)
8302 return AARCH64_PARSE_MISSING_ARG;
8305 /* Loop through the list of supported CPUs to find a match. */
8306 for (cpu = all_cores; cpu->name != NULL; cpu++)
8308 if (strlen (cpu->name) == len && strncmp (cpu->name, str, len) == 0)
8310 unsigned long isa_temp = cpu->flags;
8313 if (ext != NULL)
8315 /* TO_PARSE string contains at least one extension. */
8316 enum aarch64_parse_opt_result ext_res
8317 = aarch64_parse_extension (ext, &isa_temp);
8319 if (ext_res != AARCH64_PARSE_OK)
8320 return ext_res;
8322 /* Extension parsing was successful. Confirm the result
8323 cpu and ISA flags. */
8324 *res = cpu;
8325 *isa_flags = isa_temp;
8326 return AARCH64_PARSE_OK;
8330 /* CPU name not found in list. */
8331 return AARCH64_PARSE_INVALID_ARG;
8334 /* Parse the TO_PARSE string and put the cpu it selects into RES.
8335 Return an aarch64_parse_opt_result describing the parse result.
8336 If the parsing fails the RES does not change. */
8338 static enum aarch64_parse_opt_result
8339 aarch64_parse_tune (const char *to_parse, const struct processor **res)
8341 const struct processor *cpu;
8342 char *str = (char *) alloca (strlen (to_parse) + 1);
8344 strcpy (str, to_parse);
8346 /* Loop through the list of supported CPUs to find a match. */
8347 for (cpu = all_cores; cpu->name != NULL; cpu++)
8349 if (strcmp (cpu->name, str) == 0)
8351 *res = cpu;
8352 return AARCH64_PARSE_OK;
8356 /* CPU name not found in list. */
8357 return AARCH64_PARSE_INVALID_ARG;
8360 /* Parse TOKEN, which has length LENGTH, to see if it is an option
8361 described in FLAG. If it is, return the index bit for that fusion type.
8362 If not, error (printing OPTION_NAME) and return zero. */
8364 static unsigned int
8365 aarch64_parse_one_option_token (const char *token,
8366 size_t length,
8367 const struct aarch64_flag_desc *flag,
8368 const char *option_name)
8370 for (; flag->name != NULL; flag++)
8372 if (length == strlen (flag->name)
8373 && !strncmp (flag->name, token, length))
8374 return flag->flag;
8377 error ("unknown flag passed in -moverride=%s (%s)", option_name, token);
8378 return 0;
8381 /* Parse OPTION which is a comma-separated list of flags to enable.
8382 FLAGS gives the list of flags we understand, INITIAL_STATE gives any
8383 default state we inherit from the CPU tuning structures. OPTION_NAME
8384 gives the top-level option we are parsing in the -moverride string,
8385 for use in error messages. */
8387 static unsigned int
8388 aarch64_parse_boolean_options (const char *option,
8389 const struct aarch64_flag_desc *flags,
8390 unsigned int initial_state,
8391 const char *option_name)
8393 const char separator = '.';
8394 const char* specs = option;
8395 const char* ntoken = option;
8396 unsigned int found_flags = initial_state;
8398 while ((ntoken = strchr (specs, separator)))
8400 size_t token_length = ntoken - specs;
8401 unsigned token_ops = aarch64_parse_one_option_token (specs,
8402 token_length,
8403 flags,
8404 option_name);
8405 /* If we find "none" (or, for simplicity's sake, an error) anywhere
8406 in the token stream, reset the supported operations. So:
8408 adrp+add.cmp+branch.none.adrp+add
8410 would have the result of turning on only adrp+add fusion. */
8411 if (!token_ops)
8412 found_flags = 0;
8414 found_flags |= token_ops;
8415 specs = ++ntoken;
8418 /* The string ended with a trailing separator; report an error. */
8419 if (!(*specs))
8421 error ("%s string ill-formed\n", option_name);
8422 return 0;
8425 /* We still have one more token to parse. */
8426 size_t token_length = strlen (specs);
8427 unsigned token_ops = aarch64_parse_one_option_token (specs,
8428 token_length,
8429 flags,
8430 option_name);
8431 if (!token_ops)
8432 found_flags = 0;
8434 found_flags |= token_ops;
8435 return found_flags;
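/* For example, an option string of the form

     adrp+add.cmp+branch

   ('.'-separated tokens) ORs the bits for those two fusion types into the
   state inherited from the CPU tuning structures, while a token of "none"
   anywhere in the string first clears the accumulated flags, as described
   above.  The accepted token names are purely those listed in the FLAGS
   table passed in (e.g. aarch64_fusible_pairs or aarch64_tuning_flags).  */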
8438 /* Support for overriding instruction fusion. */
8440 static void
8441 aarch64_parse_fuse_string (const char *fuse_string,
8442 struct tune_params *tune)
8444 tune->fusible_ops = aarch64_parse_boolean_options (fuse_string,
8445 aarch64_fusible_pairs,
8446 tune->fusible_ops,
8447 "fuse=");
8450 /* Support for overriding other tuning flags. */
8452 static void
8453 aarch64_parse_tune_string (const char *tune_string,
8454 struct tune_params *tune)
8456 tune->extra_tuning_flags
8457 = aarch64_parse_boolean_options (tune_string,
8458 aarch64_tuning_flags,
8459 tune->extra_tuning_flags,
8460 "tune=");
8463 /* Parse TOKEN, which has length LENGTH, to see if it is a tuning option
8464 we understand. If it is, extract the option string and hand off to
8465 the appropriate function. */
8467 void
8468 aarch64_parse_one_override_token (const char* token,
8469 size_t length,
8470 struct tune_params *tune)
8472 const struct aarch64_tuning_override_function *fn
8473 = aarch64_tuning_override_functions;
8475 const char *option_part = strchr (token, '=');
8476 if (!option_part)
8478 error ("tuning string missing in option (%s)", token);
8479 return;
8482 /* Get the length of the option name. */
8483 length = option_part - token;
8484 /* Skip the '=' to get to the option string. */
8485 option_part++;
8487 for (; fn->name != NULL; fn++)
8489 if (!strncmp (fn->name, token, length))
8491 fn->parse_override (option_part, tune);
8492 return;
8496 error ("unknown tuning option (%s)",token);
8497 return;
8500 /* Set the default TLS size and clamp it to what the code model allows. */
8502 static void
8503 initialize_aarch64_tls_size (struct gcc_options *opts)
8505 if (aarch64_tls_size == 0)
8506 aarch64_tls_size = 24;
8508 switch (opts->x_aarch64_cmodel_var)
8510 case AARCH64_CMODEL_TINY:
8511 /* Both the default and maximum TLS size allowed under tiny are 1M, which
8512 needs two instructions to address, so we clamp the size to 24 bits. */
8513 if (aarch64_tls_size > 24)
8514 aarch64_tls_size = 24;
8515 break;
8516 case AARCH64_CMODEL_SMALL:
8517 /* The maximum TLS size allowed under small is 4G. */
8518 if (aarch64_tls_size > 32)
8519 aarch64_tls_size = 32;
8520 break;
8521 case AARCH64_CMODEL_LARGE:
8522 /* The maximum TLS size allowed under large is 16E.
8523 FIXME: 16E needs a 64-bit offset; we only support 48-bit offsets for now. */
8524 if (aarch64_tls_size > 48)
8525 aarch64_tls_size = 48;
8526 break;
8527 default:
8528 gcc_unreachable ();
8531 return;
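/* For instance (illustrative): -mcmodel=small -mtls-size=48 is clamped to
   32 bits here, and when no -mtls-size is given at all the size defaults
   to 24 bits regardless of the code model.  */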
8534 /* Parse STRING looking for options in the format:
8535 string :: option:string
8536 option :: name=substring
8537 name :: {a-z}
8538 substring :: defined by option. */
8540 static void
8541 aarch64_parse_override_string (const char* input_string,
8542 struct tune_params* tune)
8544 const char separator = ':';
8545 size_t string_length = strlen (input_string) + 1;
8546 char *string_root = (char *) xmalloc (sizeof (*string_root) * string_length);
8547 char *string = string_root;
8548 strncpy (string, input_string, string_length);
8549 string[string_length - 1] = '\0';
8551 char* ntoken = string;
8553 while ((ntoken = strchr (string, separator)))
8555 size_t token_length = ntoken - string;
8556 /* Make this substring look like a string. */
8557 *ntoken = '\0';
8558 aarch64_parse_one_override_token (string, token_length, tune);
8559 string = ++ntoken;
8562 /* One last option to parse. */
8563 aarch64_parse_one_override_token (string, strlen (string), tune);
8564 free (string_root);
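/* Putting the two levels together, a command line such as

     -moverride=fuse=adrp+add.cmp+branch:tune=rename_fma_regs

   (names illustrative; the accepted ones come from aarch64_fusible_pairs
   and aarch64_tuning_flags) is split on ':' here into "fuse=..." and
   "tune=..." tokens, and aarch64_parse_one_override_token then dispatches
   on the name before the '=' to the handlers above.  */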
8568 static void
8569 aarch64_override_options_after_change_1 (struct gcc_options *opts)
8571 /* The logic here is that if we are disabling all frame pointer generation
8572 then we do not need to disable leaf frame pointer generation as a
8573 separate operation. But if we are *only* disabling leaf frame pointer
8574 generation then we set flag_omit_frame_pointer to true, but in
8575 aarch64_frame_pointer_required we return false only for leaf functions.
8577 PR 70044: We have to be careful about being called multiple times for the
8578 same function. Once we have decided to set flag_omit_frame_pointer just
8579 so that we can omit leaf frame pointers, we must then not interpret a
8580 second call as meaning that all frame pointer generation should be
8581 omitted. We do this by setting flag_omit_frame_pointer to a special,
8582 non-zero value. */
8583 if (opts->x_flag_omit_frame_pointer == 2)
8584 opts->x_flag_omit_frame_pointer = 0;
8586 if (opts->x_flag_omit_frame_pointer)
8587 opts->x_flag_omit_leaf_frame_pointer = false;
8588 else if (opts->x_flag_omit_leaf_frame_pointer)
8589 opts->x_flag_omit_frame_pointer = 2;
8591 /* If not optimizing for size, set the default
8592 alignment to what the target wants. */
8593 if (!opts->x_optimize_size)
8595 if (opts->x_align_loops <= 0)
8596 opts->x_align_loops = aarch64_tune_params.loop_align;
8597 if (opts->x_align_jumps <= 0)
8598 opts->x_align_jumps = aarch64_tune_params.jump_align;
8599 if (opts->x_align_functions <= 0)
8600 opts->x_align_functions = aarch64_tune_params.function_align;
8603 /* We default to no pc-relative literal loads. */
8605 aarch64_pcrelative_literal_loads = false;
8607 /* If -mpc-relative-literal-loads is set on the command line, this
8608 implies that the user asked for PC relative literal loads. */
8609 if (opts->x_pcrelative_literal_loads == 1)
8610 aarch64_pcrelative_literal_loads = true;
8612 /* This is PR70113. When building the Linux kernel with
8613 CONFIG_ARM64_ERRATUM_843419, support for relocations
8614 R_AARCH64_ADR_PREL_PG_HI21 and R_AARCH64_ADR_PREL_PG_HI21_NC is
8615 removed from the kernel to avoid loading objects with possibly
8616 offending sequences. Without -mpc-relative-literal-loads we would
8617 generate such relocations, preventing the kernel build from
8618 succeeding. */
8619 if (opts->x_pcrelative_literal_loads == 2
8620 && TARGET_FIX_ERR_A53_843419)
8621 aarch64_pcrelative_literal_loads = true;
8623 /* In the tiny memory model it makes no sense to disallow PC relative
8624 literal pool loads. */
8625 if (aarch64_cmodel == AARCH64_CMODEL_TINY
8626 || aarch64_cmodel == AARCH64_CMODEL_TINY_PIC)
8627 aarch64_pcrelative_literal_loads = true;
8629 /* When enabling the lower precision Newton series for the square root, also
8630 enable it for the reciprocal square root, since the latter is an
8631 intermediary step for the former. */
8632 if (flag_mlow_precision_sqrt)
8633 flag_mrecip_low_precision_sqrt = true;
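/* A summary of the frame pointer states used above (internal convention,
   not user-visible):
     flag_omit_frame_pointer == 0   keep frame pointers
     flag_omit_frame_pointer == 1   omit frame pointers everywhere
     flag_omit_frame_pointer == 2   set only so that leaf frame pointers
                                    can be omitted; treated as 0 if this
                                    function is entered again, keeping
                                    repeated calls idempotent (PR 70044).  */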
8636 /* 'Unpack' the internal tuning structs and update the options
8637 in OPTS. The caller must have set up selected_tune and selected_arch
8638 as all the other target-specific codegen decisions are
8639 derived from them. */
8641 void
8642 aarch64_override_options_internal (struct gcc_options *opts)
8644 aarch64_tune_flags = selected_tune->flags;
8645 aarch64_tune = selected_tune->sched_core;
8646 /* Make a copy of the tuning parameters attached to the core, which
8647 we may later overwrite. */
8648 aarch64_tune_params = *(selected_tune->tune);
8649 aarch64_architecture_version = selected_arch->architecture_version;
8651 if (opts->x_aarch64_override_tune_string)
8652 aarch64_parse_override_string (opts->x_aarch64_override_tune_string,
8653 &aarch64_tune_params);
8655 /* This target defaults to strict volatile bitfields. */
8656 if (opts->x_flag_strict_volatile_bitfields < 0 && abi_version_at_least (2))
8657 opts->x_flag_strict_volatile_bitfields = 1;
8659 initialize_aarch64_code_model (opts);
8660 initialize_aarch64_tls_size (opts);
8662 int queue_depth = 0;
8663 switch (aarch64_tune_params.autoprefetcher_model)
8665 case tune_params::AUTOPREFETCHER_OFF:
8666 queue_depth = -1;
8667 break;
8668 case tune_params::AUTOPREFETCHER_WEAK:
8669 queue_depth = 0;
8670 break;
8671 case tune_params::AUTOPREFETCHER_STRONG:
8672 queue_depth = max_insn_queue_index + 1;
8673 break;
8674 default:
8675 gcc_unreachable ();
8678 /* We don't mind passing in global_options_set here as we don't use
8679 the *options_set structs anyway. */
8680 maybe_set_param_value (PARAM_SCHED_AUTOPREF_QUEUE_DEPTH,
8681 queue_depth,
8682 opts->x_param_values,
8683 global_options_set.x_param_values);
8685 /* Set the L1 cache line size. */
8686 if (selected_cpu->tune->cache_line_size != 0)
8687 maybe_set_param_value (PARAM_L1_CACHE_LINE_SIZE,
8688 selected_cpu->tune->cache_line_size,
8689 opts->x_param_values,
8690 global_options_set.x_param_values);
8692 aarch64_override_options_after_change_1 (opts);
8695 /* Print a hint with a suggestion for a core or architecture name that
8696 most closely resembles what the user passed in STR. ARCH is true if
8697 the user is asking for an architecture name. ARCH is false if the user
8698 is asking for a core name. */
8700 static void
8701 aarch64_print_hint_for_core_or_arch (const char *str, bool arch)
8703 auto_vec<const char *> candidates;
8704 const struct processor *entry = arch ? all_architectures : all_cores;
8705 for (; entry->name != NULL; entry++)
8706 candidates.safe_push (entry->name);
8707 char *s;
8708 const char *hint = candidates_list_and_hint (str, s, candidates);
8709 if (hint)
8710 inform (input_location, "valid arguments are: %s;"
8711 " did you mean %qs?", s, hint);
8712 XDELETEVEC (s);
8715 /* Print a hint with a suggestion for a core name that most closely resembles
8716 what the user passed in STR. */
8718 inline static void
8719 aarch64_print_hint_for_core (const char *str)
8721 aarch64_print_hint_for_core_or_arch (str, false);
8724 /* Print a hint with a suggestion for an architecture name that most closely
8725 resembles what the user passed in STR. */
8727 inline static void
8728 aarch64_print_hint_for_arch (const char *str)
8730 aarch64_print_hint_for_core_or_arch (str, true);
8733 /* Validate a command-line -mcpu option. Parse the cpu and extensions (if any)
8734 specified in STR and throw errors if appropriate. Put the results if
8735 they are valid in RES and ISA_FLAGS. Return whether the option is
8736 valid. */
8738 static bool
8739 aarch64_validate_mcpu (const char *str, const struct processor **res,
8740 unsigned long *isa_flags)
8742 enum aarch64_parse_opt_result parse_res
8743 = aarch64_parse_cpu (str, res, isa_flags);
8745 if (parse_res == AARCH64_PARSE_OK)
8746 return true;
8748 switch (parse_res)
8750 case AARCH64_PARSE_MISSING_ARG:
8751 error ("missing cpu name in -mcpu=%qs", str);
8752 break;
8753 case AARCH64_PARSE_INVALID_ARG:
8754 error ("unknown value %qs for -mcpu", str);
8755 aarch64_print_hint_for_core (str);
8756 break;
8757 case AARCH64_PARSE_INVALID_FEATURE:
8758 error ("invalid feature modifier in -mcpu=%qs", str);
8759 break;
8760 default:
8761 gcc_unreachable ();
8764 return false;
8767 /* Validate a command-line -march option. Parse the arch and extensions
8768 (if any) specified in STR and throw errors if appropriate. Put the
8769 results, if they are valid, in RES and ISA_FLAGS. Return whether the
8770 option is valid. */
8772 static bool
8773 aarch64_validate_march (const char *str, const struct processor **res,
8774 unsigned long *isa_flags)
8776 enum aarch64_parse_opt_result parse_res
8777 = aarch64_parse_arch (str, res, isa_flags);
8779 if (parse_res == AARCH64_PARSE_OK)
8780 return true;
8782 switch (parse_res)
8784 case AARCH64_PARSE_MISSING_ARG:
8785 error ("missing arch name in -march=%qs", str);
8786 break;
8787 case AARCH64_PARSE_INVALID_ARG:
8788 error ("unknown value %qs for -march", str);
8789 aarch64_print_hint_for_arch (str);
8790 break;
8791 case AARCH64_PARSE_INVALID_FEATURE:
8792 error ("invalid feature modifier in -march=%qs", str);
8793 break;
8794 default:
8795 gcc_unreachable ();
8798 return false;
8801 /* Validate a command-line -mtune option. Parse the cpu
8802 specified in STR and throw errors if appropriate. Put the
8803 result, if it is valid, in RES. Return whether the option is
8804 valid. */
8806 static bool
8807 aarch64_validate_mtune (const char *str, const struct processor **res)
8809 enum aarch64_parse_opt_result parse_res
8810 = aarch64_parse_tune (str, res);
8812 if (parse_res == AARCH64_PARSE_OK)
8813 return true;
8815 switch (parse_res)
8817 case AARCH64_PARSE_MISSING_ARG:
8818 error ("missing cpu name in -mtune=%qs", str);
8819 break;
8820 case AARCH64_PARSE_INVALID_ARG:
8821 error ("unknown value %qs for -mtune", str);
8822 aarch64_print_hint_for_core (str);
8823 break;
8824 default:
8825 gcc_unreachable ();
8827 return false;
8830 /* Return the CPU corresponding to the enum CPU.
8831 If it doesn't specify a cpu, return the default. */
8833 static const struct processor *
8834 aarch64_get_tune_cpu (enum aarch64_processor cpu)
8836 if (cpu != aarch64_none)
8837 return &all_cores[cpu];
8839 /* The & 0x3f is to extract the bottom 6 bits that encode the
8840 default cpu as selected by the --with-cpu GCC configure option
8841 in config.gcc.
8842 ???: The whole TARGET_CPU_DEFAULT and AARCH64_CPU_DEFAULT_FLAGS
8843 flags mechanism should be reworked to make it more sane. */
8844 return &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8847 /* Return the architecture corresponding to the enum ARCH.
8848 If it doesn't specify a valid architecture, return the default. */
8850 static const struct processor *
8851 aarch64_get_arch (enum aarch64_arch arch)
8853 if (arch != aarch64_no_arch)
8854 return &all_architectures[arch];
8856 const struct processor *cpu = &all_cores[TARGET_CPU_DEFAULT & 0x3f];
8858 return &all_architectures[cpu->arch];
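/* As the two helpers above assume, the configure-time default is packed
   into TARGET_CPU_DEFAULT: the low 6 bits index all_cores (hence the
   "& 0x3f") and the remaining bits hold the default ISA flags, which
   aarch64_override_options recovers below with "TARGET_CPU_DEFAULT >> 6".
   The packed value itself comes from the --with-cpu configure machinery
   mentioned in the comment above.  */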
8861 /* Implement TARGET_OPTION_OVERRIDE. This is called once in the beginning
8862 and is used to parse the -m{cpu,tune,arch} strings and setup the initial
8863 tuning structs. In particular it must set selected_tune and
8864 aarch64_isa_flags that define the available ISA features and tuning
8865 decisions. It must also set selected_arch as this will be used to
8866 output the .arch asm tags for each function. */
8868 static void
8869 aarch64_override_options (void)
8871 unsigned long cpu_isa = 0;
8872 unsigned long arch_isa = 0;
8873 aarch64_isa_flags = 0;
8875 bool valid_cpu = true;
8876 bool valid_tune = true;
8877 bool valid_arch = true;
8879 selected_cpu = NULL;
8880 selected_arch = NULL;
8881 selected_tune = NULL;
8883 /* -mcpu=CPU is shorthand for -march=ARCH_FOR_CPU, -mtune=CPU.
8884 If either of -march or -mtune is given, they override their
8885 respective component of -mcpu. */
8886 if (aarch64_cpu_string)
8887 valid_cpu = aarch64_validate_mcpu (aarch64_cpu_string, &selected_cpu,
8888 &cpu_isa);
8890 if (aarch64_arch_string)
8891 valid_arch = aarch64_validate_march (aarch64_arch_string, &selected_arch,
8892 &arch_isa);
8894 if (aarch64_tune_string)
8895 valid_tune = aarch64_validate_mtune (aarch64_tune_string, &selected_tune);
8897 /* If the user did not specify a processor, choose the default
8898 one for them. This will be the CPU set during configuration using
8899 --with-cpu, otherwise it is "generic". */
8900 if (!selected_cpu)
8902 if (selected_arch)
8904 selected_cpu = &all_cores[selected_arch->ident];
8905 aarch64_isa_flags = arch_isa;
8906 explicit_arch = selected_arch->arch;
8908 else
8910 /* Get default configure-time CPU. */
8911 selected_cpu = aarch64_get_tune_cpu (aarch64_none);
8912 aarch64_isa_flags = TARGET_CPU_DEFAULT >> 6;
8915 if (selected_tune)
8916 explicit_tune_core = selected_tune->ident;
8918 /* If both -mcpu and -march are specified check that they are architecturally
8919 compatible, warn if they're not and prefer the -march ISA flags. */
8920 else if (selected_arch)
8922 if (selected_arch->arch != selected_cpu->arch)
8924 warning (0, "switch -mcpu=%s conflicts with -march=%s switch",
8925 all_architectures[selected_cpu->arch].name,
8926 selected_arch->name);
8928 aarch64_isa_flags = arch_isa;
8929 explicit_arch = selected_arch->arch;
8930 explicit_tune_core = selected_tune ? selected_tune->ident
8931 : selected_cpu->ident;
8933 else
8935 /* -mcpu but no -march. */
8936 aarch64_isa_flags = cpu_isa;
8937 explicit_tune_core = selected_tune ? selected_tune->ident
8938 : selected_cpu->ident;
8939 gcc_assert (selected_cpu);
8940 selected_arch = &all_architectures[selected_cpu->arch];
8941 explicit_arch = selected_arch->arch;
8944 /* Set the arch as well, as we will need it when outputting
8945 the .arch directive in assembly. */
8946 if (!selected_arch)
8948 gcc_assert (selected_cpu);
8949 selected_arch = &all_architectures[selected_cpu->arch];
8952 if (!selected_tune)
8953 selected_tune = selected_cpu;
8955 #ifndef HAVE_AS_MABI_OPTION
8956 /* The compiler may have been configured with 2.23.* binutils, which does
8957 not have support for ILP32. */
8958 if (TARGET_ILP32)
8959 error ("Assembler does not support -mabi=ilp32");
8960 #endif
8962 if (aarch64_ra_sign_scope != AARCH64_FUNCTION_NONE && TARGET_ILP32)
8963 sorry ("Return address signing is only supported for -mabi=lp64");
8965 /* Make sure we properly set up the explicit options. */
8966 if ((aarch64_cpu_string && valid_cpu)
8967 || (aarch64_tune_string && valid_tune))
8968 gcc_assert (explicit_tune_core != aarch64_none);
8970 if ((aarch64_cpu_string && valid_cpu)
8971 || (aarch64_arch_string && valid_arch))
8972 gcc_assert (explicit_arch != aarch64_no_arch);
8974 aarch64_override_options_internal (&global_options);
8976 /* Save these options as the default ones in case we push and pop them later
8977 while processing functions with potential target attributes. */
8978 target_option_default_node = target_option_current_node
8979 = build_target_option_node (&global_options);
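/* A compact restatement of the precedence implemented above:
     -mcpu=X                 arch/ISA and tuning both come from X.
     -mcpu=X -march=Y        arch/ISA come from Y (with a warning if X and
                             Y disagree architecturally), tuning from X.
     ... -mtune=Z            tuning comes from Z, overriding the above.
     nothing specified       the --with-cpu configure-time default,
                             falling back to "generic".  */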
8982 /* Implement targetm.override_options_after_change. */
8984 static void
8985 aarch64_override_options_after_change (void)
8987 aarch64_override_options_after_change_1 (&global_options);
8990 static struct machine_function *
8991 aarch64_init_machine_status (void)
8993 struct machine_function *machine;
8994 machine = ggc_cleared_alloc<machine_function> ();
8995 return machine;
8998 void
8999 aarch64_init_expanders (void)
9001 init_machine_status = aarch64_init_machine_status;
9004 /* Select the code model to use, taking -fpic/-fPIC into account. */
9005 static void
9006 initialize_aarch64_code_model (struct gcc_options *opts)
9008 if (opts->x_flag_pic)
9010 switch (opts->x_aarch64_cmodel_var)
9012 case AARCH64_CMODEL_TINY:
9013 aarch64_cmodel = AARCH64_CMODEL_TINY_PIC;
9014 break;
9015 case AARCH64_CMODEL_SMALL:
9016 #ifdef HAVE_AS_SMALL_PIC_RELOCS
9017 aarch64_cmodel = (flag_pic == 2
9018 ? AARCH64_CMODEL_SMALL_PIC
9019 : AARCH64_CMODEL_SMALL_SPIC);
9020 #else
9021 aarch64_cmodel = AARCH64_CMODEL_SMALL_PIC;
9022 #endif
9023 break;
9024 case AARCH64_CMODEL_LARGE:
9025 sorry ("code model %qs with -f%s", "large",
9026 opts->x_flag_pic > 1 ? "PIC" : "pic");
9027 break;
9028 default:
9029 gcc_unreachable ();
9032 else
9033 aarch64_cmodel = opts->x_aarch64_cmodel_var;
9036 /* Implement TARGET_OPTION_SAVE. */
9038 static void
9039 aarch64_option_save (struct cl_target_option *ptr, struct gcc_options *opts)
9041 ptr->x_aarch64_override_tune_string = opts->x_aarch64_override_tune_string;
9044 /* Implements TARGET_OPTION_RESTORE. Restore the backend codegen decisions
9045 using the information saved in PTR. */
9047 static void
9048 aarch64_option_restore (struct gcc_options *opts, struct cl_target_option *ptr)
9050 opts->x_explicit_tune_core = ptr->x_explicit_tune_core;
9051 selected_tune = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9052 opts->x_explicit_arch = ptr->x_explicit_arch;
9053 selected_arch = aarch64_get_arch (ptr->x_explicit_arch);
9054 opts->x_aarch64_override_tune_string = ptr->x_aarch64_override_tune_string;
9056 aarch64_override_options_internal (opts);
9059 /* Implement TARGET_OPTION_PRINT. */
9061 static void
9062 aarch64_option_print (FILE *file, int indent, struct cl_target_option *ptr)
9064 const struct processor *cpu
9065 = aarch64_get_tune_cpu (ptr->x_explicit_tune_core);
9066 unsigned long isa_flags = ptr->x_aarch64_isa_flags;
9067 const struct processor *arch = aarch64_get_arch (ptr->x_explicit_arch);
9068 std::string extension
9069 = aarch64_get_extension_string_for_isa_flags (isa_flags, arch->flags);
9071 fprintf (file, "%*sselected tune = %s\n", indent, "", cpu->name);
9072 fprintf (file, "%*sselected arch = %s%s\n", indent, "",
9073 arch->name, extension.c_str ());
9076 static GTY(()) tree aarch64_previous_fndecl;
9078 void
9079 aarch64_reset_previous_fndecl (void)
9081 aarch64_previous_fndecl = NULL;
9084 /* Restore or save the TREE_TARGET_GLOBALS from or to NEW_TREE.
9085 Used by aarch64_set_current_function and aarch64_pragma_target_parse to
9086 make sure optab availability predicates are recomputed when necessary. */
9088 void
9089 aarch64_save_restore_target_globals (tree new_tree)
9091 if (TREE_TARGET_GLOBALS (new_tree))
9092 restore_target_globals (TREE_TARGET_GLOBALS (new_tree));
9093 else if (new_tree == target_option_default_node)
9094 restore_target_globals (&default_target_globals);
9095 else
9096 TREE_TARGET_GLOBALS (new_tree) = save_target_globals_default_opts ();
9099 /* Implement TARGET_SET_CURRENT_FUNCTION. Unpack the codegen decisions
9100 like tuning and ISA features from the DECL_FUNCTION_SPECIFIC_TARGET
9101 of the function, if such exists. This function may be called multiple
9102 times on a single function so use aarch64_previous_fndecl to avoid
9103 setting up identical state. */
9105 static void
9106 aarch64_set_current_function (tree fndecl)
9108 if (!fndecl || fndecl == aarch64_previous_fndecl)
9109 return;
9111 tree old_tree = (aarch64_previous_fndecl
9112 ? DECL_FUNCTION_SPECIFIC_TARGET (aarch64_previous_fndecl)
9113 : NULL_TREE);
9115 tree new_tree = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9117 /* If current function has no attributes but the previous one did,
9118 use the default node. */
9119 if (!new_tree && old_tree)
9120 new_tree = target_option_default_node;
9122 /* If nothing to do, return. #pragma GCC reset or #pragma GCC pop to
9123 the default have been handled by aarch64_save_restore_target_globals from
9124 aarch64_pragma_target_parse. */
9125 if (old_tree == new_tree)
9126 return;
9128 aarch64_previous_fndecl = fndecl;
9130 /* First set the target options. */
9131 cl_target_option_restore (&global_options, TREE_TARGET_OPTION (new_tree));
9133 aarch64_save_restore_target_globals (new_tree);
9136 /* Enum describing the various ways we can handle attributes.
9137 In many cases we can reuse the generic option handling machinery. */
9139 enum aarch64_attr_opt_type
9141 aarch64_attr_mask, /* Attribute should set a bit in target_flags. */
9142 aarch64_attr_bool, /* Attribute sets or unsets a boolean variable. */
9143 aarch64_attr_enum, /* Attribute sets an enum variable. */
9144 aarch64_attr_custom /* Attribute requires a custom handling function. */
9147 /* All the information needed to handle a target attribute.
9148 NAME is the name of the attribute.
9149 ATTR_TYPE specifies the type of behavior of the attribute as described
9150 in the definition of enum aarch64_attr_opt_type.
9151 ALLOW_NEG is true if the attribute supports a "no-" form.
9152 HANDLER is the function that takes the attribute string and whether
9153 it is a pragma or attribute and handles the option. It is needed only
9154 when the ATTR_TYPE is aarch64_attr_custom.
9155 OPT_NUM is the enum specifying the option that the attribute modifies.
9156 This is needed for attributes that mirror the behavior of a command-line
9157 option, that is it has ATTR_TYPE aarch64_attr_mask, aarch64_attr_bool or
9158 aarch64_attr_enum. */
9160 struct aarch64_attribute_info
9162 const char *name;
9163 enum aarch64_attr_opt_type attr_type;
9164 bool allow_neg;
9165 bool (*handler) (const char *, const char *);
9166 enum opt_code opt_num;
9169 /* Handle the ARCH_STR argument to the arch= target attribute.
9170 PRAGMA_OR_ATTR is used in potential error messages. */
9172 static bool
9173 aarch64_handle_attr_arch (const char *str, const char *pragma_or_attr)
9175 const struct processor *tmp_arch = NULL;
9176 enum aarch64_parse_opt_result parse_res
9177 = aarch64_parse_arch (str, &tmp_arch, &aarch64_isa_flags);
9179 if (parse_res == AARCH64_PARSE_OK)
9181 gcc_assert (tmp_arch);
9182 selected_arch = tmp_arch;
9183 explicit_arch = selected_arch->arch;
9184 return true;
9187 switch (parse_res)
9189 case AARCH64_PARSE_MISSING_ARG:
9190 error ("missing architecture name in 'arch' target %s", pragma_or_attr);
9191 break;
9192 case AARCH64_PARSE_INVALID_ARG:
9193 error ("unknown value %qs for 'arch' target %s", str, pragma_or_attr);
9194 aarch64_print_hint_for_arch (str);
9195 break;
9196 case AARCH64_PARSE_INVALID_FEATURE:
9197 error ("invalid feature modifier %qs for 'arch' target %s",
9198 str, pragma_or_attr);
9199 break;
9200 default:
9201 gcc_unreachable ();
9204 return false;
9207 /* Handle the argument CPU_STR to the cpu= target attribute.
9208 PRAGMA_OR_ATTR is used in potential error messages. */
9210 static bool
9211 aarch64_handle_attr_cpu (const char *str, const char *pragma_or_attr)
9213 const struct processor *tmp_cpu = NULL;
9214 enum aarch64_parse_opt_result parse_res
9215 = aarch64_parse_cpu (str, &tmp_cpu, &aarch64_isa_flags);
9217 if (parse_res == AARCH64_PARSE_OK)
9219 gcc_assert (tmp_cpu);
9220 selected_tune = tmp_cpu;
9221 explicit_tune_core = selected_tune->ident;
9223 selected_arch = &all_architectures[tmp_cpu->arch];
9224 explicit_arch = selected_arch->arch;
9225 return true;
9228 switch (parse_res)
9230 case AARCH64_PARSE_MISSING_ARG:
9231 error ("missing cpu name in 'cpu' target %s", pragma_or_attr);
9232 break;
9233 case AARCH64_PARSE_INVALID_ARG:
9234 error ("unknown value %qs for 'cpu' target %s", str, pragma_or_attr);
9235 aarch64_print_hint_for_core (str);
9236 break;
9237 case AARCH64_PARSE_INVALID_FEATURE:
9238 error ("invalid feature modifier %qs for 'cpu' target %s",
9239 str, pragma_or_attr);
9240 break;
9241 default:
9242 gcc_unreachable ();
9245 return false;
9248 /* Handle the argument STR to the tune= target attribute.
9249 PRAGMA_OR_ATTR is used in potential error messages. */
9251 static bool
9252 aarch64_handle_attr_tune (const char *str, const char *pragma_or_attr)
9254 const struct processor *tmp_tune = NULL;
9255 enum aarch64_parse_opt_result parse_res
9256 = aarch64_parse_tune (str, &tmp_tune);
9258 if (parse_res == AARCH64_PARSE_OK)
9260 gcc_assert (tmp_tune);
9261 selected_tune = tmp_tune;
9262 explicit_tune_core = selected_tune->ident;
9263 return true;
9266 switch (parse_res)
9268 case AARCH64_PARSE_INVALID_ARG:
9269 error ("unknown value %qs for 'tune' target %s", str, pragma_or_attr);
9270 aarch64_print_hint_for_core (str);
9271 break;
9272 default:
9273 gcc_unreachable ();
9276 return false;
9279 /* Parse an architecture extensions target attribute string specified in STR.
9280 For example "+fp+nosimd". Show any errors if needed. Return TRUE
9281 if successful. Update aarch64_isa_flags to reflect the ISA features
9282 modified.
9283 PRAGMA_OR_ATTR is used in potential error messages. */
9285 static bool
9286 aarch64_handle_attr_isa_flags (char *str, const char *pragma_or_attr)
9288 enum aarch64_parse_opt_result parse_res;
9289 unsigned long isa_flags = aarch64_isa_flags;
9291 /* We allow "+nothing" in the beginning to clear out all architectural
9292 features if the user wants to handpick specific features. */
9293 if (strncmp ("+nothing", str, 8) == 0)
9295 isa_flags = 0;
9296 str += 8;
9299 parse_res = aarch64_parse_extension (str, &isa_flags);
9301 if (parse_res == AARCH64_PARSE_OK)
9303 aarch64_isa_flags = isa_flags;
9304 return true;
9307 switch (parse_res)
9309 case AARCH64_PARSE_MISSING_ARG:
9310 error ("missing feature modifier in target %s %qs",
9311 pragma_or_attr, str);
9312 break;
9314 case AARCH64_PARSE_INVALID_FEATURE:
9315 error ("invalid feature modifier in target %s %qs",
9316 pragma_or_attr, str);
9317 break;
9319 default:
9320 gcc_unreachable ();
9323 return false;
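/* For example (illustrative), target ("+nothing+fp") first clears every
   architectural feature bit and then enables just the FP extension, while
   a plain "+nosimd" only removes SIMD (and anything that requires it)
   from the currently selected set.  */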
9326 /* The target attributes that we support. On top of these we also support just
9327 ISA extensions, like __attribute__ ((target ("+crc"))), but that case is
9328 handled explicitly in aarch64_process_one_target_attr. */
9330 static const struct aarch64_attribute_info aarch64_attributes[] =
9332 { "general-regs-only", aarch64_attr_mask, false, NULL,
9333 OPT_mgeneral_regs_only },
9334 { "fix-cortex-a53-835769", aarch64_attr_bool, true, NULL,
9335 OPT_mfix_cortex_a53_835769 },
9336 { "fix-cortex-a53-843419", aarch64_attr_bool, true, NULL,
9337 OPT_mfix_cortex_a53_843419 },
9338 { "cmodel", aarch64_attr_enum, false, NULL, OPT_mcmodel_ },
9339 { "strict-align", aarch64_attr_mask, false, NULL, OPT_mstrict_align },
9340 { "omit-leaf-frame-pointer", aarch64_attr_bool, true, NULL,
9341 OPT_momit_leaf_frame_pointer },
9342 { "tls-dialect", aarch64_attr_enum, false, NULL, OPT_mtls_dialect_ },
9343 { "arch", aarch64_attr_custom, false, aarch64_handle_attr_arch,
9344 OPT_march_ },
9345 { "cpu", aarch64_attr_custom, false, aarch64_handle_attr_cpu, OPT_mcpu_ },
9346 { "tune", aarch64_attr_custom, false, aarch64_handle_attr_tune,
9347 OPT_mtune_ },
9348 { "sign-return-address", aarch64_attr_enum, false, NULL,
9349 OPT_msign_return_address_ },
9350 { NULL, aarch64_attr_custom, false, NULL, OPT____ }
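/* Illustrative uses of the table above:

     __attribute__ ((target ("arch=armv8-a+crc")))             custom handler
     __attribute__ ((target ("no-omit-leaf-frame-pointer")))   negated boolean
     __attribute__ ((target ("cmodel=small")))                 enum-valued

   Strings consisting only of ISA modifiers, such as "+crc", bypass this
   table entirely, as noted above.  */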
9353 /* Parse ARG_STR which contains the definition of one target attribute.
9354 Show appropriate errors if any or return true if the attribute is valid.
9355 PRAGMA_OR_ATTR holds the string to use in error messages about whether
9356 we're processing a target attribute or pragma. */
9358 static bool
9359 aarch64_process_one_target_attr (char *arg_str, const char* pragma_or_attr)
9361 bool invert = false;
9363 size_t len = strlen (arg_str);
9365 if (len == 0)
9367 error ("malformed target %s", pragma_or_attr);
9368 return false;
9371 char *str_to_check = (char *) alloca (len + 1);
9372 strcpy (str_to_check, arg_str);
9374 /* Skip leading whitespace. */
9375 while (*str_to_check == ' ' || *str_to_check == '\t')
9376 str_to_check++;
9378 /* We have something like __attribute__ ((target ("+fp+nosimd"))).
9379 It is easier to detect and handle it explicitly here rather than going
9380 through the machinery for the rest of the target attributes in this
9381 function. */
9382 if (*str_to_check == '+')
9383 return aarch64_handle_attr_isa_flags (str_to_check, pragma_or_attr);
9385 if (len > 3 && strncmp (str_to_check, "no-", 3) == 0)
9387 invert = true;
9388 str_to_check += 3;
9390 char *arg = strchr (str_to_check, '=');
9392 /* If we found opt=foo then terminate STR_TO_CHECK at the '='
9393 and point ARG to "foo". */
9394 if (arg)
9396 *arg = '\0';
9397 arg++;
9399 const struct aarch64_attribute_info *p_attr;
9400 bool found = false;
9401 for (p_attr = aarch64_attributes; p_attr->name; p_attr++)
9403 /* If the names don't match up, or the user has given an argument
9404 to an attribute that doesn't accept one, or didn't give an argument
9405 to an attribute that expects one, fail to match. */
9406 if (strcmp (str_to_check, p_attr->name) != 0)
9407 continue;
9409 found = true;
9410 bool attr_need_arg_p = p_attr->attr_type == aarch64_attr_custom
9411 || p_attr->attr_type == aarch64_attr_enum;
9413 if (attr_need_arg_p ^ (arg != NULL))
9415 error ("target %s %qs does not accept an argument",
9416 pragma_or_attr, str_to_check);
9417 return false;
9420 /* If the name matches but the attribute does not allow "no-" versions
9421 then we can't match. */
9422 if (invert && !p_attr->allow_neg)
9424 error ("target %s %qs does not allow a negated form",
9425 pragma_or_attr, str_to_check);
9426 return false;
9429 switch (p_attr->attr_type)
9431 /* Has a custom handler registered.
9432 For example, cpu=, arch=, tune=. */
9433 case aarch64_attr_custom:
9434 gcc_assert (p_attr->handler);
9435 if (!p_attr->handler (arg, pragma_or_attr))
9436 return false;
9437 break;
9439 /* Either set or unset a boolean option. */
9440 case aarch64_attr_bool:
9442 struct cl_decoded_option decoded;
9444 generate_option (p_attr->opt_num, NULL, !invert,
9445 CL_TARGET, &decoded);
9446 aarch64_handle_option (&global_options, &global_options_set,
9447 &decoded, input_location);
9448 break;
9450 /* Set or unset a bit in the target_flags. aarch64_handle_option
9451 should know what mask to apply given the option number. */
9452 case aarch64_attr_mask:
9454 struct cl_decoded_option decoded;
9455 /* We only need to specify the option number.
9456 aarch64_handle_option will know which mask to apply. */
9457 decoded.opt_index = p_attr->opt_num;
9458 decoded.value = !invert;
9459 aarch64_handle_option (&global_options, &global_options_set,
9460 &decoded, input_location);
9461 break;
9463 /* Use the option setting machinery to set an option to an enum. */
9464 case aarch64_attr_enum:
9466 gcc_assert (arg);
9467 bool valid;
9468 int value;
9469 valid = opt_enum_arg_to_value (p_attr->opt_num, arg,
9470 &value, CL_TARGET);
9471 if (valid)
9473 set_option (&global_options, NULL, p_attr->opt_num, value,
9474 NULL, DK_UNSPECIFIED, input_location,
9475 global_dc);
9477 else
9479 error ("target %s %s=%s is not valid",
9480 pragma_or_attr, str_to_check, arg);
9482 break;
9484 default:
9485 gcc_unreachable ();
9489 /* If we reached here we either have found an attribute and validated
9490 it or didn't match any. If we matched an attribute but its arguments
9491 were malformed we will have returned false already. */
9492 return found;
9495 /* Count how many times the character C appears in
9496 NULL-terminated string STR. */
9498 static unsigned int
9499 num_occurences_in_str (char c, char *str)
9501 unsigned int res = 0;
9502 while (*str != '\0')
9504 if (*str == c)
9505 res++;
9507 str++;
9510 return res;
9513 /* Parse the tree in ARGS that contains the target attribute information
9514 and update the global target options space. PRAGMA_OR_ATTR is a string
9515 to be used in error messages, specifying whether this is processing
9516 a target attribute or a target pragma. */
9518 bool
9519 aarch64_process_target_attr (tree args, const char* pragma_or_attr)
9521 if (TREE_CODE (args) == TREE_LIST)
9525 tree head = TREE_VALUE (args);
9526 if (head)
9528 if (!aarch64_process_target_attr (head, pragma_or_attr))
9529 return false;
9531 args = TREE_CHAIN (args);
9532 } while (args);
9534 return true;
9536 /* We expect to find a string to parse. */
9537 gcc_assert (TREE_CODE (args) == STRING_CST);
9539 size_t len = strlen (TREE_STRING_POINTER (args));
9540 char *str_to_check = (char *) alloca (len + 1);
9541 strcpy (str_to_check, TREE_STRING_POINTER (args));
9543 if (len == 0)
9545 error ("malformed target %s value", pragma_or_attr);
9546 return false;
9549 /* Used to catch empty entries between commas, e.g.
9550 attribute ((target ("attr1,,attr2"))). */
9551 unsigned int num_commas = num_occurences_in_str (',', str_to_check);
9553 /* Handle multiple target attributes separated by ','. */
9554 char *token = strtok (str_to_check, ",");
9556 unsigned int num_attrs = 0;
9557 while (token)
9559 num_attrs++;
9560 if (!aarch64_process_one_target_attr (token, pragma_or_attr))
9562 error ("target %s %qs is invalid", pragma_or_attr, token);
9563 return false;
9566 token = strtok (NULL, ",");
9569 if (num_attrs != num_commas + 1)
9571 error ("malformed target %s list %qs",
9572 pragma_or_attr, TREE_STRING_POINTER (args));
9573 return false;
9576 return true;
9579 /* Implement TARGET_OPTION_VALID_ATTRIBUTE_P. This is used to
9580 process attribute ((target ("..."))). */
9582 static bool
9583 aarch64_option_valid_attribute_p (tree fndecl, tree, tree args, int)
9585 struct cl_target_option cur_target;
9586 bool ret;
9587 tree old_optimize;
9588 tree new_target, new_optimize;
9589 tree existing_target = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
9591 /* If what we're processing is the current pragma string then the
9592 target option node is already stored in target_option_current_node
9593 by aarch64_pragma_target_parse in aarch64-c.c. Use that to avoid
9594 having to re-parse the string. This is especially useful to keep
9595 arm_neon.h compile times down since that header contains a lot
9596 of intrinsics enclosed in pragmas. */
9597 if (!existing_target && args == current_target_pragma)
9599 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = target_option_current_node;
9600 return true;
9602 tree func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9604 old_optimize = build_optimization_node (&global_options);
9605 func_optimize = DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl);
9607 /* If the function changed the optimization levels as well as setting
9608 target options, start with the optimizations specified. */
9609 if (func_optimize && func_optimize != old_optimize)
9610 cl_optimization_restore (&global_options,
9611 TREE_OPTIMIZATION (func_optimize));
9613 /* Save the current target options to restore at the end. */
9614 cl_target_option_save (&cur_target, &global_options);
9616 /* If fndecl already has some target attributes applied to it, unpack
9617 them so that we add this attribute on top of them, rather than
9618 overwriting them. */
9619 if (existing_target)
9621 struct cl_target_option *existing_options
9622 = TREE_TARGET_OPTION (existing_target);
9624 if (existing_options)
9625 cl_target_option_restore (&global_options, existing_options);
9627 else
9628 cl_target_option_restore (&global_options,
9629 TREE_TARGET_OPTION (target_option_current_node));
9632 ret = aarch64_process_target_attr (args, "attribute");
9634 /* Set up any additional state. */
9635 if (ret)
9637 aarch64_override_options_internal (&global_options);
9638 /* Initialize SIMD builtins if we haven't already.
9639 Set current_target_pragma to NULL for the duration so that
9640 the builtin initialization code doesn't try to tag the functions
9641 being built with the attributes specified by any current pragma, thus
9642 going into an infinite recursion. */
9643 if (TARGET_SIMD)
9645 tree saved_current_target_pragma = current_target_pragma;
9646 current_target_pragma = NULL;
9647 aarch64_init_simd_builtins ();
9648 current_target_pragma = saved_current_target_pragma;
9650 new_target = build_target_option_node (&global_options);
9652 else
9653 new_target = NULL;
9655 new_optimize = build_optimization_node (&global_options);
9657 if (fndecl && ret)
9659 DECL_FUNCTION_SPECIFIC_TARGET (fndecl) = new_target;
9661 if (old_optimize != new_optimize)
9662 DECL_FUNCTION_SPECIFIC_OPTIMIZATION (fndecl) = new_optimize;
9665 cl_target_option_restore (&global_options, &cur_target);
9667 if (old_optimize != new_optimize)
9668 cl_optimization_restore (&global_options,
9669 TREE_OPTIMIZATION (old_optimize));
9670 return ret;
9673 /* Helper for aarch64_can_inline_p. In the case where CALLER and CALLEE are
9674 tri-bool options (yes, no, don't care) and the default value is
9675 DEF, determine whether to reject inlining. */
9677 static bool
9678 aarch64_tribools_ok_for_inlining_p (int caller, int callee,
9679 int dont_care, int def)
9681 /* If the callee doesn't care, always allow inlining. */
9682 if (callee == dont_care)
9683 return true;
9685 /* If the caller doesn't care, always allow inlining. */
9686 if (caller == dont_care)
9687 return true;
9689 /* Otherwise, allow inlining if either the callee and caller values
9690 agree, or if the callee is using the default value. */
9691 return (callee == caller || callee == def);
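/* Example with DONT_CARE == 2 and DEF == 1, the convention used by the
   callers below: a callee value of 2 always allows inlining; caller == 0
   with callee == 1 is allowed because the callee uses the default value;
   but caller == 1 with callee == 0 is rejected, since the callee
   explicitly chose the non-default setting.  */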
9694 /* Implement TARGET_CAN_INLINE_P. Decide whether it is valid
9695 to inline CALLEE into CALLER based on target-specific info.
9696 Make sure that the caller and callee have compatible architectural
9697 features. Then go through the other possible target attributes
9698 and see if they can block inlining. Try not to reject always_inline
9699 callees unless they are incompatible architecturally. */
9701 static bool
9702 aarch64_can_inline_p (tree caller, tree callee)
9704 tree caller_tree = DECL_FUNCTION_SPECIFIC_TARGET (caller);
9705 tree callee_tree = DECL_FUNCTION_SPECIFIC_TARGET (callee);
9707 /* If callee has no option attributes, then it is ok to inline. */
9708 if (!callee_tree)
9709 return true;
9711 struct cl_target_option *caller_opts
9712 = TREE_TARGET_OPTION (caller_tree ? caller_tree
9713 : target_option_default_node);
9715 struct cl_target_option *callee_opts = TREE_TARGET_OPTION (callee_tree);
9718 /* Callee's ISA flags should be a subset of the caller's. */
9719 if ((caller_opts->x_aarch64_isa_flags & callee_opts->x_aarch64_isa_flags)
9720 != callee_opts->x_aarch64_isa_flags)
9721 return false;
9723 /* Allow non-strict aligned functions inlining into strict
9724 aligned ones. */
9725 if ((TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)
9726 != TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags))
9727 && !(!TARGET_STRICT_ALIGN_P (callee_opts->x_target_flags)
9728 && TARGET_STRICT_ALIGN_P (caller_opts->x_target_flags)))
9729 return false;
9731 bool always_inline = lookup_attribute ("always_inline",
9732 DECL_ATTRIBUTES (callee));
9734 /* If the architectural features match up and the callee is always_inline
9735 then the other attributes don't matter. */
9736 if (always_inline)
9737 return true;
9739 if (caller_opts->x_aarch64_cmodel_var
9740 != callee_opts->x_aarch64_cmodel_var)
9741 return false;
9743 if (caller_opts->x_aarch64_tls_dialect
9744 != callee_opts->x_aarch64_tls_dialect)
9745 return false;
9747 /* Honour explicit requests to workaround errata. */
9748 if (!aarch64_tribools_ok_for_inlining_p (
9749 caller_opts->x_aarch64_fix_a53_err835769,
9750 callee_opts->x_aarch64_fix_a53_err835769,
9751 2, TARGET_FIX_ERR_A53_835769_DEFAULT))
9752 return false;
9754 if (!aarch64_tribools_ok_for_inlining_p (
9755 caller_opts->x_aarch64_fix_a53_err843419,
9756 callee_opts->x_aarch64_fix_a53_err843419,
9757 2, TARGET_FIX_ERR_A53_843419))
9758 return false;
9760 /* If the user explicitly specified -momit-leaf-frame-pointer for the
9761 caller and callee and they don't match up, reject inlining. */
9762 if (!aarch64_tribools_ok_for_inlining_p (
9763 caller_opts->x_flag_omit_leaf_frame_pointer,
9764 callee_opts->x_flag_omit_leaf_frame_pointer,
9765 2, 1))
9766 return false;
9768 /* If the callee has specific tuning overrides, respect them. */
9769 if (callee_opts->x_aarch64_override_tune_string != NULL
9770 && caller_opts->x_aarch64_override_tune_string == NULL)
9771 return false;
9773 /* If the user specified tuning override strings for the
9774 caller and callee and they don't match up, reject inlining.
9775 We just do a string compare here, we don't analyze the meaning
9776 of the string, as it would be too costly for little gain. */
9777 if (callee_opts->x_aarch64_override_tune_string
9778 && caller_opts->x_aarch64_override_tune_string
9779 && (strcmp (callee_opts->x_aarch64_override_tune_string,
9780 caller_opts->x_aarch64_override_tune_string) != 0))
9781 return false;
9783 return true;
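/* For instance (illustrative flag sets): a caller built with +fp+simd+crc
   may inline a callee limited to +fp+simd, because the callee's ISA bits
   are a subset of the caller's, but not a callee that additionally
   requires +crypto.  The remaining checks are only reached when the
   callee is not always_inline.  */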
9786 /* Return true if SYMBOL_REF X binds locally. */
9788 static bool
9789 aarch64_symbol_binds_local_p (const_rtx x)
9791 return (SYMBOL_REF_DECL (x)
9792 ? targetm.binds_local_p (SYMBOL_REF_DECL (x))
9793 : SYMBOL_REF_LOCAL_P (x));
9796 /* Return true if SYMBOL_REF X is thread local */
9797 static bool
9798 aarch64_tls_symbol_p (rtx x)
9800 if (! TARGET_HAVE_TLS)
9801 return false;
9803 if (GET_CODE (x) != SYMBOL_REF)
9804 return false;
9806 return SYMBOL_REF_TLS_MODEL (x) != 0;
9809 /* Classify a TLS symbol into one of the TLS kinds. */
9810 enum aarch64_symbol_type
9811 aarch64_classify_tls_symbol (rtx x)
9813 enum tls_model tls_kind = tls_symbolic_operand_type (x);
9815 switch (tls_kind)
9817 case TLS_MODEL_GLOBAL_DYNAMIC:
9818 case TLS_MODEL_LOCAL_DYNAMIC:
9819 return TARGET_TLS_DESC ? SYMBOL_SMALL_TLSDESC : SYMBOL_SMALL_TLSGD;
9821 case TLS_MODEL_INITIAL_EXEC:
9822 switch (aarch64_cmodel)
9824 case AARCH64_CMODEL_TINY:
9825 case AARCH64_CMODEL_TINY_PIC:
9826 return SYMBOL_TINY_TLSIE;
9827 default:
9828 return SYMBOL_SMALL_TLSIE;
9831 case TLS_MODEL_LOCAL_EXEC:
9832 if (aarch64_tls_size == 12)
9833 return SYMBOL_TLSLE12;
9834 else if (aarch64_tls_size == 24)
9835 return SYMBOL_TLSLE24;
9836 else if (aarch64_tls_size == 32)
9837 return SYMBOL_TLSLE32;
9838 else if (aarch64_tls_size == 48)
9839 return SYMBOL_TLSLE48;
9840 else
9841 gcc_unreachable ();
9843 case TLS_MODEL_EMULATED:
9844 case TLS_MODEL_NONE:
9845 return SYMBOL_FORCE_TO_MEM;
9847 default:
9848 gcc_unreachable ();
9852 /* Return the method that should be used to access SYMBOL_REF or
9853 LABEL_REF X. */
9855 enum aarch64_symbol_type
9856 aarch64_classify_symbol (rtx x, rtx offset)
9858 if (GET_CODE (x) == LABEL_REF)
9860 switch (aarch64_cmodel)
9862 case AARCH64_CMODEL_LARGE:
9863 return SYMBOL_FORCE_TO_MEM;
9865 case AARCH64_CMODEL_TINY_PIC:
9866 case AARCH64_CMODEL_TINY:
9867 return SYMBOL_TINY_ABSOLUTE;
9869 case AARCH64_CMODEL_SMALL_SPIC:
9870 case AARCH64_CMODEL_SMALL_PIC:
9871 case AARCH64_CMODEL_SMALL:
9872 return SYMBOL_SMALL_ABSOLUTE;
9874 default:
9875 gcc_unreachable ();
9879 if (GET_CODE (x) == SYMBOL_REF)
9881 if (aarch64_tls_symbol_p (x))
9882 return aarch64_classify_tls_symbol (x);
9884 switch (aarch64_cmodel)
9886 case AARCH64_CMODEL_TINY:
9887 /* When we retrieve symbol + offset address, we have to make sure
9888 the offset does not cause overflow of the final address. But
9889 we have no way of knowing the address of symbol at compile time
9890 so we can't accurately say if the distance between the PC and
9891 symbol + offset is outside the addressable range of +/-1M in the
9892 TINY code model. So we rely on images not being greater than
9893 1M and cap the offset at 1M; anything beyond that will have to
9894 be loaded using an alternative mechanism. Furthermore if the
9895 symbol is a weak reference to something that isn't known to
9896 resolve to a symbol in this module, then force to memory. */
9897 if ((SYMBOL_REF_WEAK (x)
9898 && !aarch64_symbol_binds_local_p (x))
9899 || INTVAL (offset) < -1048575 || INTVAL (offset) > 1048575)
9900 return SYMBOL_FORCE_TO_MEM;
9901 return SYMBOL_TINY_ABSOLUTE;
9903 case AARCH64_CMODEL_SMALL:
9904 /* Same reasoning as the tiny code model, but the offset cap here is
9905 4G. */
9906 if ((SYMBOL_REF_WEAK (x)
9907 && !aarch64_symbol_binds_local_p (x))
9908 || !IN_RANGE (INTVAL (offset), HOST_WIDE_INT_C (-4294967263),
9909 HOST_WIDE_INT_C (4294967264)))
9910 return SYMBOL_FORCE_TO_MEM;
9911 return SYMBOL_SMALL_ABSOLUTE;
9913 case AARCH64_CMODEL_TINY_PIC:
9914 if (!aarch64_symbol_binds_local_p (x))
9915 return SYMBOL_TINY_GOT;
9916 return SYMBOL_TINY_ABSOLUTE;
9918 case AARCH64_CMODEL_SMALL_SPIC:
9919 case AARCH64_CMODEL_SMALL_PIC:
9920 if (!aarch64_symbol_binds_local_p (x))
9921 return (aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC
9922 ? SYMBOL_SMALL_GOT_28K : SYMBOL_SMALL_GOT_4G);
9923 return SYMBOL_SMALL_ABSOLUTE;
9925 case AARCH64_CMODEL_LARGE:
9926 /* This is alright even in PIC code as the constant
9927 pool reference is always PC relative and within
9928 the same translation unit. */
9929 if (CONSTANT_POOL_ADDRESS_P (x))
9930 return SYMBOL_SMALL_ABSOLUTE;
9931 else
9932 return SYMBOL_FORCE_TO_MEM;
9934 default:
9935 gcc_unreachable ();
9939 /* By default push everything into the constant pool. */
9940 return SYMBOL_FORCE_TO_MEM;
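/* Example of the offset capping above (for a symbol that binds locally):
   under the tiny model, "sym + 0x80000" (512K) stays SYMBOL_TINY_ABSOLUTE,
   while "sym + 0x200000" (2M) exceeds the +/-1M budget and is forced to
   memory; the small model applies the same rule with a 4G window.  */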
9943 bool
9944 aarch64_constant_address_p (rtx x)
9946 return (CONSTANT_P (x) && memory_address_p (DImode, x));
9949 bool
9950 aarch64_legitimate_pic_operand_p (rtx x)
9952 if (GET_CODE (x) == SYMBOL_REF
9953 || (GET_CODE (x) == CONST
9954 && GET_CODE (XEXP (x, 0)) == PLUS
9955 && GET_CODE (XEXP (XEXP (x, 0), 0)) == SYMBOL_REF))
9956 return false;
9958 return true;
9961 /* Return true if X holds either a floating-point constant representable
9962 in quarter-precision form or the floating-point constant +0.0. */
9963 static bool
9964 aarch64_valid_floating_const (machine_mode mode, rtx x)
9966 if (!CONST_DOUBLE_P (x))
9967 return false;
9969 if (aarch64_float_const_zero_rtx_p (x))
9970 return true;
9972 /* Apart from 0.0, which was accepted above, only SFmode and DFmode constants are handled. */
9973 if (!(mode == SFmode || mode == DFmode))
9974 return false;
9976 return aarch64_float_const_representable_p (x);
9979 static bool
9980 aarch64_legitimate_constant_p (machine_mode mode, rtx x)
9982 /* Do not allow vector struct mode constants. We could support
9983 0 and -1 easily, but they need support in aarch64-simd.md. */
9984 if (TARGET_SIMD && aarch64_vect_struct_mode_p (mode))
9985 return false;
9987 /* This could probably go away because
9988 we now decompose CONST_INTs according to expand_mov_immediate. */
9989 if ((GET_CODE (x) == CONST_VECTOR
9990 && aarch64_simd_valid_immediate (x, mode, false, NULL))
9991 || CONST_INT_P (x) || aarch64_valid_floating_const (mode, x))
9992 return !targetm.cannot_force_const_mem (mode, x);
9994 if (GET_CODE (x) == HIGH
9995 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
9996 return true;
9998 return aarch64_constant_address_p (x);
10002 aarch64_load_tp (rtx target)
10004 if (!target
10005 || GET_MODE (target) != Pmode
10006 || !register_operand (target, Pmode))
10007 target = gen_reg_rtx (Pmode);
10009 /* Can return in any reg. */
10010 emit_insn (gen_aarch64_load_tp_hard (target));
10011 return target;
10014 /* On AAPCS systems, this is the "struct __va_list". */
10015 static GTY(()) tree va_list_type;
10017 /* Implement TARGET_BUILD_BUILTIN_VA_LIST.
10018 Return the type to use as __builtin_va_list.
10020 AAPCS64 \S 7.1.4 requires that va_list be a typedef for a type defined as:
10022 struct __va_list
10024 void *__stack;
10025 void *__gr_top;
10026 void *__vr_top;
10027 int __gr_offs;
10028 int __vr_offs;
10029 }; */
10031 static tree
10032 aarch64_build_builtin_va_list (void)
10034 tree va_list_name;
10035 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10037 /* Create the type. */
10038 va_list_type = lang_hooks.types.make_type (RECORD_TYPE);
10039 /* Give it the required name. */
10040 va_list_name = build_decl (BUILTINS_LOCATION,
10041 TYPE_DECL,
10042 get_identifier ("__va_list"),
10043 va_list_type);
10044 DECL_ARTIFICIAL (va_list_name) = 1;
10045 TYPE_NAME (va_list_type) = va_list_name;
10046 TYPE_STUB_DECL (va_list_type) = va_list_name;
10048 /* Create the fields. */
10049 f_stack = build_decl (BUILTINS_LOCATION,
10050 FIELD_DECL, get_identifier ("__stack"),
10051 ptr_type_node);
10052 f_grtop = build_decl (BUILTINS_LOCATION,
10053 FIELD_DECL, get_identifier ("__gr_top"),
10054 ptr_type_node);
10055 f_vrtop = build_decl (BUILTINS_LOCATION,
10056 FIELD_DECL, get_identifier ("__vr_top"),
10057 ptr_type_node);
10058 f_groff = build_decl (BUILTINS_LOCATION,
10059 FIELD_DECL, get_identifier ("__gr_offs"),
10060 integer_type_node);
10061 f_vroff = build_decl (BUILTINS_LOCATION,
10062 FIELD_DECL, get_identifier ("__vr_offs"),
10063 integer_type_node);
10065 /* Tell tree-stdarg pass about our internal offset fields.
10066 NOTE: va_list_gpr/fpr_counter_field are only used for tree comparison
10067 purposes, to identify whether the code is updating the va_list internal
10068 offset fields in an irregular way. */
10069 va_list_gpr_counter_field = f_groff;
10070 va_list_fpr_counter_field = f_vroff;
10072 DECL_ARTIFICIAL (f_stack) = 1;
10073 DECL_ARTIFICIAL (f_grtop) = 1;
10074 DECL_ARTIFICIAL (f_vrtop) = 1;
10075 DECL_ARTIFICIAL (f_groff) = 1;
10076 DECL_ARTIFICIAL (f_vroff) = 1;
10078 DECL_FIELD_CONTEXT (f_stack) = va_list_type;
10079 DECL_FIELD_CONTEXT (f_grtop) = va_list_type;
10080 DECL_FIELD_CONTEXT (f_vrtop) = va_list_type;
10081 DECL_FIELD_CONTEXT (f_groff) = va_list_type;
10082 DECL_FIELD_CONTEXT (f_vroff) = va_list_type;
10084 TYPE_FIELDS (va_list_type) = f_stack;
10085 DECL_CHAIN (f_stack) = f_grtop;
10086 DECL_CHAIN (f_grtop) = f_vrtop;
10087 DECL_CHAIN (f_vrtop) = f_groff;
10088 DECL_CHAIN (f_groff) = f_vroff;
10090 /* Compute its layout. */
10091 layout_type (va_list_type);
10093 return va_list_type;
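/* Illustrative sketch (not part of the original source): on LP64 the record
   built above is laid out as

	offset	field		size
	0	__stack		8
	8	__gr_top	8
	16	__vr_top	8
	24	__gr_offs	4
	28	__vr_offs	4

   so sizeof (__builtin_va_list) == 32 with 8-byte alignment.  The two
   offsets are negative while register-saved anonymous arguments remain,
   and reach zero or above once the corresponding save area is used up.  */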
10096 /* Implement TARGET_EXPAND_BUILTIN_VA_START. */
10097 static void
10098 aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
10100 const CUMULATIVE_ARGS *cum;
10101 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10102 tree stack, grtop, vrtop, groff, vroff;
10103 tree t;
10104 int gr_save_area_size = cfun->va_list_gpr_size;
10105 int vr_save_area_size = cfun->va_list_fpr_size;
10106 int vr_offset;
10108 cum = &crtl->args.info;
10109 if (cfun->va_list_gpr_size)
10110 gr_save_area_size = MIN ((NUM_ARG_REGS - cum->aapcs_ncrn) * UNITS_PER_WORD,
10111 cfun->va_list_gpr_size);
10112 if (cfun->va_list_fpr_size)
10113 vr_save_area_size = MIN ((NUM_FP_ARG_REGS - cum->aapcs_nvrn)
10114 * UNITS_PER_VREG, cfun->va_list_fpr_size);
10116 if (!TARGET_FLOAT)
10118 gcc_assert (cum->aapcs_nvrn == 0);
10119 vr_save_area_size = 0;
10122 f_stack = TYPE_FIELDS (va_list_type_node);
10123 f_grtop = DECL_CHAIN (f_stack);
10124 f_vrtop = DECL_CHAIN (f_grtop);
10125 f_groff = DECL_CHAIN (f_vrtop);
10126 f_vroff = DECL_CHAIN (f_groff);
10128 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), valist, f_stack,
10129 NULL_TREE);
10130 grtop = build3 (COMPONENT_REF, TREE_TYPE (f_grtop), valist, f_grtop,
10131 NULL_TREE);
10132 vrtop = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop), valist, f_vrtop,
10133 NULL_TREE);
10134 groff = build3 (COMPONENT_REF, TREE_TYPE (f_groff), valist, f_groff,
10135 NULL_TREE);
10136 vroff = build3 (COMPONENT_REF, TREE_TYPE (f_vroff), valist, f_vroff,
10137 NULL_TREE);
10139 /* Emit code to initialize STACK, which points to the next varargs stack
10140 argument. CUM->AAPCS_STACK_SIZE gives the number of stack words used
10141 by named arguments. STACK is 8-byte aligned. */
10142 t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
10143 if (cum->aapcs_stack_size > 0)
10144 t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
10145 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
10146 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10148 /* Emit code to initialize GRTOP, the top of the GR save area.
10149 virtual_incoming_args_rtx should have been 16 byte aligned. */
10150 t = make_tree (TREE_TYPE (grtop), virtual_incoming_args_rtx);
10151 t = build2 (MODIFY_EXPR, TREE_TYPE (grtop), grtop, t);
10152 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10154 /* Emit code to initialize VRTOP, the top of the VR save area.
10155 This address is gr_save_area_bytes below GRTOP, rounded
10156 down to the next 16-byte boundary. */
10157 t = make_tree (TREE_TYPE (vrtop), virtual_incoming_args_rtx);
10158 vr_offset = ROUND_UP (gr_save_area_size,
10159 STACK_BOUNDARY / BITS_PER_UNIT);
10161 if (vr_offset)
10162 t = fold_build_pointer_plus_hwi (t, -vr_offset);
10163 t = build2 (MODIFY_EXPR, TREE_TYPE (vrtop), vrtop, t);
10164 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10166 /* Emit code to initialize GROFF, the offset from GRTOP of the
10167 next GPR argument. */
10168 t = build2 (MODIFY_EXPR, TREE_TYPE (groff), groff,
10169 build_int_cst (TREE_TYPE (groff), -gr_save_area_size));
10170 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
10172 /* Likewise emit code to initialize VROFF, the offset from VRTOP
10173 of the next VR argument. */
10174 t = build2 (MODIFY_EXPR, TREE_TYPE (vroff), vroff,
10175 build_int_cst (TREE_TYPE (vroff), -vr_save_area_size));
10176 expand_expr (t, const0_rtx, VOIDmode, EXPAND_NORMAL);
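/* Worked example (a sketch, assuming the tree-stdarg pass keeps the full
   save areas): for

	void f (int a, int b, int c, double d, ...);

   the named arguments use w0-w2 and d0, so cum->aapcs_ncrn == 3 and
   cum->aapcs_nvrn == 1 at the ellipsis.  That gives

	gr_save_area_size = (8 - 3) * 8  = 40	(x3..x7)
	vr_save_area_size = (8 - 1) * 16 = 112	(q1..q7)

   and the code above initialises

	__stack   = virtual incoming args	(no named stack arguments)
	__gr_top  = virtual incoming args
	__vr_top  = __gr_top - ROUND_UP (40, 16) = __gr_top - 48
	__gr_offs = -40
	__vr_offs = -112.  */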
10179 /* Implement TARGET_GIMPLIFY_VA_ARG_EXPR. */
10181 static tree
10182 aarch64_gimplify_va_arg_expr (tree valist, tree type, gimple_seq *pre_p,
10183 gimple_seq *post_p ATTRIBUTE_UNUSED)
10185 tree addr;
10186 bool indirect_p;
10187 bool is_ha; /* is HFA or HVA. */
10188 bool dw_align; /* double-word align. */
10189 machine_mode ag_mode = VOIDmode;
10190 int nregs;
10191 machine_mode mode;
10193 tree f_stack, f_grtop, f_vrtop, f_groff, f_vroff;
10194 tree stack, f_top, f_off, off, arg, roundup, on_stack;
10195 HOST_WIDE_INT size, rsize, adjust, align;
10196 tree t, u, cond1, cond2;
10198 indirect_p = pass_by_reference (NULL, TYPE_MODE (type), type, false);
10199 if (indirect_p)
10200 type = build_pointer_type (type);
10202 mode = TYPE_MODE (type);
10204 f_stack = TYPE_FIELDS (va_list_type_node);
10205 f_grtop = DECL_CHAIN (f_stack);
10206 f_vrtop = DECL_CHAIN (f_grtop);
10207 f_groff = DECL_CHAIN (f_vrtop);
10208 f_vroff = DECL_CHAIN (f_groff);
10210 stack = build3 (COMPONENT_REF, TREE_TYPE (f_stack), unshare_expr (valist),
10211 f_stack, NULL_TREE);
10212 size = int_size_in_bytes (type);
10213 align = aarch64_function_arg_alignment (mode, type) / BITS_PER_UNIT;
10215 dw_align = false;
10216 adjust = 0;
10217 if (aarch64_vfp_is_call_or_return_candidate (mode,
10218 type,
10219 &ag_mode,
10220 &nregs,
10221 &is_ha))
10223 /* TYPE passed in fp/simd registers. */
10224 if (!TARGET_FLOAT)
10225 aarch64_err_no_fpadvsimd (mode, "varargs");
10227 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_vrtop),
10228 unshare_expr (valist), f_vrtop, NULL_TREE);
10229 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_vroff),
10230 unshare_expr (valist), f_vroff, NULL_TREE);
10232 rsize = nregs * UNITS_PER_VREG;
10234 if (is_ha)
10236 if (BYTES_BIG_ENDIAN && GET_MODE_SIZE (ag_mode) < UNITS_PER_VREG)
10237 adjust = UNITS_PER_VREG - GET_MODE_SIZE (ag_mode);
10239 else if (BLOCK_REG_PADDING (mode, type, 1) == downward
10240 && size < UNITS_PER_VREG)
10242 adjust = UNITS_PER_VREG - size;
10245 else
10247 /* TYPE passed in general registers. */
10248 f_top = build3 (COMPONENT_REF, TREE_TYPE (f_grtop),
10249 unshare_expr (valist), f_grtop, NULL_TREE);
10250 f_off = build3 (COMPONENT_REF, TREE_TYPE (f_groff),
10251 unshare_expr (valist), f_groff, NULL_TREE);
10252 rsize = ROUND_UP (size, UNITS_PER_WORD);
10253 nregs = rsize / UNITS_PER_WORD;
10255 if (align > 8)
10256 dw_align = true;
10258 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10259 && size < UNITS_PER_WORD)
10261 adjust = UNITS_PER_WORD - size;
10265 /* Get a local temporary for the field value. */
10266 off = get_initialized_tmp_var (f_off, pre_p, NULL);
10268 /* Emit code to branch if off >= 0. */
10269 t = build2 (GE_EXPR, boolean_type_node, off,
10270 build_int_cst (TREE_TYPE (off), 0));
10271 cond1 = build3 (COND_EXPR, ptr_type_node, t, NULL_TREE, NULL_TREE);
10273 if (dw_align)
10275 /* Emit: offs = (offs + 15) & -16. */
10276 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10277 build_int_cst (TREE_TYPE (off), 15));
10278 t = build2 (BIT_AND_EXPR, TREE_TYPE (off), t,
10279 build_int_cst (TREE_TYPE (off), -16));
10280 roundup = build2 (MODIFY_EXPR, TREE_TYPE (off), off, t);
10282 else
10283 roundup = NULL;
10285 /* Update ap.__[g|v]r_offs */
10286 t = build2 (PLUS_EXPR, TREE_TYPE (off), off,
10287 build_int_cst (TREE_TYPE (off), rsize));
10288 t = build2 (MODIFY_EXPR, TREE_TYPE (f_off), unshare_expr (f_off), t);
10290 /* String up. */
10291 if (roundup)
10292 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10294 /* [cond2] if (ap.__[g|v]r_offs > 0) */
10295 u = build2 (GT_EXPR, boolean_type_node, unshare_expr (f_off),
10296 build_int_cst (TREE_TYPE (f_off), 0));
10297 cond2 = build3 (COND_EXPR, ptr_type_node, u, NULL_TREE, NULL_TREE);
10299 /* String up: make sure the assignment happens before the use. */
10300 t = build2 (COMPOUND_EXPR, TREE_TYPE (cond2), t, cond2);
10301 COND_EXPR_ELSE (cond1) = t;
10303 /* Prepare the trees handling the argument that is passed on the stack;
10304 the top level node will be stored in ON_STACK. */
10305 arg = get_initialized_tmp_var (stack, pre_p, NULL);
10306 if (align > 8)
10308 /* if (alignof(type) > 8) (arg = arg + 15) & -16; */
10309 t = fold_convert (intDI_type_node, arg);
10310 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10311 build_int_cst (TREE_TYPE (t), 15));
10312 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10313 build_int_cst (TREE_TYPE (t), -16));
10314 t = fold_convert (TREE_TYPE (arg), t);
10315 roundup = build2 (MODIFY_EXPR, TREE_TYPE (arg), arg, t);
10317 else
10318 roundup = NULL;
10319 /* Advance ap.__stack */
10320 t = fold_convert (intDI_type_node, arg);
10321 t = build2 (PLUS_EXPR, TREE_TYPE (t), t,
10322 build_int_cst (TREE_TYPE (t), size + 7));
10323 t = build2 (BIT_AND_EXPR, TREE_TYPE (t), t,
10324 build_int_cst (TREE_TYPE (t), -8));
10325 t = fold_convert (TREE_TYPE (arg), t);
10326 t = build2 (MODIFY_EXPR, TREE_TYPE (stack), unshare_expr (stack), t);
10327 /* String up roundup and advance. */
10328 if (roundup)
10329 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), roundup, t);
10330 /* String up with arg */
10331 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), t, arg);
10332 /* Big-endianness related address adjustment. */
10333 if (BLOCK_REG_PADDING (mode, type, 1) == downward
10334 && size < UNITS_PER_WORD)
10336 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (arg), arg,
10337 size_int (UNITS_PER_WORD - size));
10338 on_stack = build2 (COMPOUND_EXPR, TREE_TYPE (arg), on_stack, t);
10341 COND_EXPR_THEN (cond1) = unshare_expr (on_stack);
10342 COND_EXPR_THEN (cond2) = unshare_expr (on_stack);
10344 /* Adjustment to OFFSET in the case of BIG_ENDIAN. */
10345 t = off;
10346 if (adjust)
10347 t = build2 (PREINCREMENT_EXPR, TREE_TYPE (off), off,
10348 build_int_cst (TREE_TYPE (off), adjust));
10350 t = fold_convert (sizetype, t);
10351 t = build2 (POINTER_PLUS_EXPR, TREE_TYPE (f_top), f_top, t);
10353 if (is_ha)
10355 /* type ha; // treat as "struct {ftype field[n];}"
10356 ... [computing offs]
10357 for (i = 0; i < nregs; ++i, offs += 16)
10358 ha.field[i] = *((ftype *)(ap.__vr_top + offs));
10359 return ha; */
10360 int i;
10361 tree tmp_ha, field_t, field_ptr_t;
10363 /* Declare a local variable. */
10364 tmp_ha = create_tmp_var_raw (type, "ha");
10365 gimple_add_tmp_var (tmp_ha);
10367 /* Establish the base type. */
10368 switch (ag_mode)
10370 case SFmode:
10371 field_t = float_type_node;
10372 field_ptr_t = float_ptr_type_node;
10373 break;
10374 case DFmode:
10375 field_t = double_type_node;
10376 field_ptr_t = double_ptr_type_node;
10377 break;
10378 case TFmode:
10379 field_t = long_double_type_node;
10380 field_ptr_t = long_double_ptr_type_node;
10381 break;
10382 case HFmode:
10383 field_t = aarch64_fp16_type_node;
10384 field_ptr_t = aarch64_fp16_ptr_type_node;
10385 break;
10386 case V2SImode:
10387 case V4SImode:
10389 tree innertype = make_signed_type (GET_MODE_PRECISION (SImode));
10390 field_t = build_vector_type_for_mode (innertype, ag_mode);
10391 field_ptr_t = build_pointer_type (field_t);
10393 break;
10394 default:
10395 gcc_assert (0);
10398 /* *(field_ptr_t) &ha = *((field_ptr_t) vr_saved_area). */
10399 tmp_ha = build1 (ADDR_EXPR, field_ptr_t, tmp_ha);
10400 addr = t;
10401 t = fold_convert (field_ptr_t, addr);
10402 t = build2 (MODIFY_EXPR, field_t,
10403 build1 (INDIRECT_REF, field_t, tmp_ha),
10404 build1 (INDIRECT_REF, field_t, t));
10406 /* ha.field[i] = *((field_ptr_t)vr_saved_area + i) */
10407 for (i = 1; i < nregs; ++i)
10409 addr = fold_build_pointer_plus_hwi (addr, UNITS_PER_VREG);
10410 u = fold_convert (field_ptr_t, addr);
10411 u = build2 (MODIFY_EXPR, field_t,
10412 build2 (MEM_REF, field_t, tmp_ha,
10413 build_int_cst (field_ptr_t,
10414 (i *
10415 int_size_in_bytes (field_t)))),
10416 build1 (INDIRECT_REF, field_t, u));
10417 t = build2 (COMPOUND_EXPR, TREE_TYPE (t), t, u);
10420 u = fold_convert (TREE_TYPE (f_top), tmp_ha);
10421 t = build2 (COMPOUND_EXPR, TREE_TYPE (f_top), t, u);
10424 COND_EXPR_ELSE (cond2) = t;
10425 addr = fold_convert (build_pointer_type (type), cond1);
10426 addr = build_va_arg_indirect_ref (addr);
10428 if (indirect_p)
10429 addr = build_va_arg_indirect_ref (addr);
10431 return addr;
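/* Rough C-level sketch (illustrative only; little-endian, non-aggregate)
   of the trees built above for va_arg (ap, double):

	int off = ap.__vr_offs;
	if (off >= 0)
	  goto on_stack;
	ap.__vr_offs = off + 16;
	if (ap.__vr_offs > 0)
	  goto on_stack;
	addr = ap.__vr_top + off;
	goto done;
     on_stack:
	addr = ap.__stack;
	ap.__stack = (void *) (((uintptr_t) addr + sizeof (double) + 7) & -8);
     done:
	result = *(double *) addr;  */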
10434 /* Implement TARGET_SETUP_INCOMING_VARARGS. */
10436 static void
10437 aarch64_setup_incoming_varargs (cumulative_args_t cum_v, machine_mode mode,
10438 tree type, int *pretend_size ATTRIBUTE_UNUSED,
10439 int no_rtl)
10441 CUMULATIVE_ARGS *cum = get_cumulative_args (cum_v);
10442 CUMULATIVE_ARGS local_cum;
10443 int gr_saved = cfun->va_list_gpr_size;
10444 int vr_saved = cfun->va_list_fpr_size;
10446 /* The caller has advanced CUM up to, but not beyond, the last named
10447 argument. Advance a local copy of CUM past the last "real" named
10448 argument, to find out how many registers are left over. */
10449 local_cum = *cum;
10450 aarch64_function_arg_advance (pack_cumulative_args (&local_cum), mode, type, true);
10452 /* Find out how many registers we need to save.
10453 Honor tree-stdarg analysis results. */
10454 if (cfun->va_list_gpr_size)
10455 gr_saved = MIN (NUM_ARG_REGS - local_cum.aapcs_ncrn,
10456 cfun->va_list_gpr_size / UNITS_PER_WORD);
10457 if (cfun->va_list_fpr_size)
10458 vr_saved = MIN (NUM_FP_ARG_REGS - local_cum.aapcs_nvrn,
10459 cfun->va_list_fpr_size / UNITS_PER_VREG);
10461 if (!TARGET_FLOAT)
10463 gcc_assert (local_cum.aapcs_nvrn == 0);
10464 vr_saved = 0;
10467 if (!no_rtl)
10469 if (gr_saved > 0)
10471 rtx ptr, mem;
10473 /* virtual_incoming_args_rtx should have been 16-byte aligned. */
10474 ptr = plus_constant (Pmode, virtual_incoming_args_rtx,
10475 - gr_saved * UNITS_PER_WORD);
10476 mem = gen_frame_mem (BLKmode, ptr);
10477 set_mem_alias_set (mem, get_varargs_alias_set ());
10479 move_block_from_reg (local_cum.aapcs_ncrn + R0_REGNUM,
10480 mem, gr_saved);
10482 if (vr_saved > 0)
10484 /* We can't use move_block_from_reg, because it will use
10485 the wrong mode, storing D regs only. */
10486 machine_mode mode = TImode;
10487 int off, i, vr_start;
10489 /* Set OFF to the offset from virtual_incoming_args_rtx of
10490 the first vector register. The VR save area lies below
10491 the GR one, and is aligned to 16 bytes. */
10492 off = -ROUND_UP (gr_saved * UNITS_PER_WORD,
10493 STACK_BOUNDARY / BITS_PER_UNIT);
10494 off -= vr_saved * UNITS_PER_VREG;
10496 vr_start = V0_REGNUM + local_cum.aapcs_nvrn;
10497 for (i = 0; i < vr_saved; ++i)
10499 rtx ptr, mem;
10501 ptr = plus_constant (Pmode, virtual_incoming_args_rtx, off);
10502 mem = gen_frame_mem (mode, ptr);
10503 set_mem_alias_set (mem, get_varargs_alias_set ());
10504 aarch64_emit_move (mem, gen_rtx_REG (mode, vr_start + i));
10505 off += UNITS_PER_VREG;
10510 /* We don't save the size into *PRETEND_SIZE because we want to avoid
10511 any complication of having crtl->args.pretend_args_size changed. */
10512 cfun->machine->frame.saved_varargs_size
10513 = (ROUND_UP (gr_saved * UNITS_PER_WORD,
10514 STACK_BOUNDARY / BITS_PER_UNIT)
10515 + vr_saved * UNITS_PER_VREG);
10518 static void
10519 aarch64_conditional_register_usage (void)
10521 int i;
10522 if (!TARGET_FLOAT)
10524 for (i = V0_REGNUM; i <= V31_REGNUM; i++)
10526 fixed_regs[i] = 1;
10527 call_used_regs[i] = 1;
10532 /* Walk down the type tree of TYPE counting consecutive base elements.
10533 If *MODEP is VOIDmode, then set it to the first valid floating point
10534 type. If a non-floating point type is found, or if a floating point
10535 type that doesn't match a non-VOIDmode *MODEP is found, then return -1,
10536 otherwise return the count in the sub-tree. */
10537 static int
10538 aapcs_vfp_sub_candidate (const_tree type, machine_mode *modep)
10540 machine_mode mode;
10541 HOST_WIDE_INT size;
10543 switch (TREE_CODE (type))
10545 case REAL_TYPE:
10546 mode = TYPE_MODE (type);
10547 if (mode != DFmode && mode != SFmode
10548 && mode != TFmode && mode != HFmode)
10549 return -1;
10551 if (*modep == VOIDmode)
10552 *modep = mode;
10554 if (*modep == mode)
10555 return 1;
10557 break;
10559 case COMPLEX_TYPE:
10560 mode = TYPE_MODE (TREE_TYPE (type));
10561 if (mode != DFmode && mode != SFmode
10562 && mode != TFmode && mode != HFmode)
10563 return -1;
10565 if (*modep == VOIDmode)
10566 *modep = mode;
10568 if (*modep == mode)
10569 return 2;
10571 break;
10573 case VECTOR_TYPE:
10574 /* Use V2SImode and V4SImode as representatives of all 64-bit
10575 and 128-bit vector types. */
10576 size = int_size_in_bytes (type);
10577 switch (size)
10579 case 8:
10580 mode = V2SImode;
10581 break;
10582 case 16:
10583 mode = V4SImode;
10584 break;
10585 default:
10586 return -1;
10589 if (*modep == VOIDmode)
10590 *modep = mode;
10592 /* Vector modes are considered to be opaque: two vectors are
10593 equivalent for the purposes of being homogeneous aggregates
10594 if they are the same size. */
10595 if (*modep == mode)
10596 return 1;
10598 break;
10600 case ARRAY_TYPE:
10602 int count;
10603 tree index = TYPE_DOMAIN (type);
10605 /* Can't handle incomplete types nor sizes that are not
10606 fixed. */
10607 if (!COMPLETE_TYPE_P (type)
10608 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10609 return -1;
10611 count = aapcs_vfp_sub_candidate (TREE_TYPE (type), modep);
10612 if (count == -1
10613 || !index
10614 || !TYPE_MAX_VALUE (index)
10615 || !tree_fits_uhwi_p (TYPE_MAX_VALUE (index))
10616 || !TYPE_MIN_VALUE (index)
10617 || !tree_fits_uhwi_p (TYPE_MIN_VALUE (index))
10618 || count < 0)
10619 return -1;
10621 count *= (1 + tree_to_uhwi (TYPE_MAX_VALUE (index))
10622 - tree_to_uhwi (TYPE_MIN_VALUE (index)));
10624 /* There must be no padding. */
10625 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10626 return -1;
10628 return count;
10631 case RECORD_TYPE:
10633 int count = 0;
10634 int sub_count;
10635 tree field;
10637 /* Can't handle incomplete types nor sizes that are not
10638 fixed. */
10639 if (!COMPLETE_TYPE_P (type)
10640 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10641 return -1;
10643 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10645 if (TREE_CODE (field) != FIELD_DECL)
10646 continue;
10648 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10649 if (sub_count < 0)
10650 return -1;
10651 count += sub_count;
10654 /* There must be no padding. */
10655 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10656 return -1;
10658 return count;
10661 case UNION_TYPE:
10662 case QUAL_UNION_TYPE:
10664 /* These aren't very interesting except in a degenerate case. */
10665 int count = 0;
10666 int sub_count;
10667 tree field;
10669 /* Can't handle incomplete types nor sizes that are not
10670 fixed. */
10671 if (!COMPLETE_TYPE_P (type)
10672 || TREE_CODE (TYPE_SIZE (type)) != INTEGER_CST)
10673 return -1;
10675 for (field = TYPE_FIELDS (type); field; field = TREE_CHAIN (field))
10677 if (TREE_CODE (field) != FIELD_DECL)
10678 continue;
10680 sub_count = aapcs_vfp_sub_candidate (TREE_TYPE (field), modep);
10681 if (sub_count < 0)
10682 return -1;
10683 count = count > sub_count ? count : sub_count;
10686 /* There must be no padding. */
10687 if (wi::ne_p (TYPE_SIZE (type), count * GET_MODE_BITSIZE (*modep)))
10688 return -1;
10690 return count;
10693 default:
10694 break;
10697 return -1;
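/* Some illustrative classifications (not exhaustive):

	struct { float x, y, z; }		-> 3 elements of SFmode
	struct { double r, i; }			-> 2 elements of DFmode
	_Complex double				-> 2 elements of DFmode
	struct { int32x4_t a, b; }		-> 2 elements of V4SImode
	struct { float x; double y; }		-> -1 (mixed base types)
	struct { float x; int n; }		-> -1 (non-FP member)  */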
10700 /* Return TRUE if the type, as described by TYPE and MODE, is a short vector
10701 type as described in AAPCS64 \S 4.1.2.
10703 See the comment above aarch64_composite_type_p for the notes on MODE. */
10705 static bool
10706 aarch64_short_vector_p (const_tree type,
10707 machine_mode mode)
10709 HOST_WIDE_INT size = -1;
10711 if (type && TREE_CODE (type) == VECTOR_TYPE)
10712 size = int_size_in_bytes (type);
10713 else if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
10714 || GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
10715 size = GET_MODE_SIZE (mode);
10717 return (size == 8 || size == 16);
10720 /* Return TRUE if the type, as described by TYPE and MODE, is a composite
10721 type as described in AAPCS64 \S 4.3. This includes aggregate, union and
10722 array types. The C99 floating-point complex types are also considered
10723 as composite types, according to AAPCS64 \S 7.1.1. The complex integer
10724 types, which are GCC extensions and out of the scope of AAPCS64, are
10725 treated as composite types here as well.
10727 Note that MODE itself is not sufficient in determining whether a type
10728 is such a composite type or not. This is because
10729 stor-layout.c:compute_record_mode may have already changed the MODE
10730 (BLKmode) of a RECORD_TYPE TYPE to some other mode. For example, a
10731 structure with only one field may have its MODE set to the mode of the
10732 field. Also an integer mode whose size matches the size of the
10733 RECORD_TYPE type may be used to substitute the original mode
10734 (i.e. BLKmode) in certain circumstances. In other words, MODE cannot be
10735 solely relied on. */
10737 static bool
10738 aarch64_composite_type_p (const_tree type,
10739 machine_mode mode)
10741 if (aarch64_short_vector_p (type, mode))
10742 return false;
10744 if (type && (AGGREGATE_TYPE_P (type) || TREE_CODE (type) == COMPLEX_TYPE))
10745 return true;
10747 if (mode == BLKmode
10748 || GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT
10749 || GET_MODE_CLASS (mode) == MODE_COMPLEX_INT)
10750 return true;
10752 return false;
10755 /* Return TRUE if an argument, whose type is described by TYPE and MODE,
10756 shall be passed or returned in simd/fp register(s) (providing these
10757 parameter passing registers are available).
10759 Upon successful return, *COUNT returns the number of needed registers,
10760 *BASE_MODE returns the mode of the individual register and when IS_HA
10761 is not NULL, *IS_HA indicates whether or not the argument is a homogeneous
10762 floating-point aggregate or a homogeneous short-vector aggregate. */
10764 static bool
10765 aarch64_vfp_is_call_or_return_candidate (machine_mode mode,
10766 const_tree type,
10767 machine_mode *base_mode,
10768 int *count,
10769 bool *is_ha)
10771 machine_mode new_mode = VOIDmode;
10772 bool composite_p = aarch64_composite_type_p (type, mode);
10774 if (is_ha != NULL) *is_ha = false;
10776 if ((!composite_p && GET_MODE_CLASS (mode) == MODE_FLOAT)
10777 || aarch64_short_vector_p (type, mode))
10779 *count = 1;
10780 new_mode = mode;
10782 else if (GET_MODE_CLASS (mode) == MODE_COMPLEX_FLOAT)
10784 if (is_ha != NULL) *is_ha = true;
10785 *count = 2;
10786 new_mode = GET_MODE_INNER (mode);
10788 else if (type && composite_p)
10790 int ag_count = aapcs_vfp_sub_candidate (type, &new_mode);
10792 if (ag_count > 0 && ag_count <= HA_MAX_NUM_FLDS)
10794 if (is_ha != NULL) *is_ha = true;
10795 *count = ag_count;
10797 else
10798 return false;
10800 else
10801 return false;
10803 *base_mode = new_mode;
10804 return true;
10807 /* Implement TARGET_STRUCT_VALUE_RTX. */
10809 static rtx
10810 aarch64_struct_value_rtx (tree fndecl ATTRIBUTE_UNUSED,
10811 int incoming ATTRIBUTE_UNUSED)
10813 return gen_rtx_REG (Pmode, AARCH64_STRUCT_VALUE_REGNUM);
10816 /* Implements target hook vector_mode_supported_p. */
10817 static bool
10818 aarch64_vector_mode_supported_p (machine_mode mode)
10820 if (TARGET_SIMD
10821 && (mode == V4SImode || mode == V8HImode
10822 || mode == V16QImode || mode == V2DImode
10823 || mode == V2SImode || mode == V4HImode
10824 || mode == V8QImode || mode == V2SFmode
10825 || mode == V4SFmode || mode == V2DFmode
10826 || mode == V4HFmode || mode == V8HFmode
10827 || mode == V1DFmode))
10828 return true;
10830 return false;
10833 /* Return appropriate SIMD container
10834 for MODE within a vector of WIDTH bits. */
10835 static machine_mode
10836 aarch64_simd_container_mode (machine_mode mode, unsigned width)
10838 gcc_assert (width == 64 || width == 128);
10839 if (TARGET_SIMD)
10841 if (width == 128)
10842 switch (mode)
10844 case DFmode:
10845 return V2DFmode;
10846 case SFmode:
10847 return V4SFmode;
10848 case HFmode:
10849 return V8HFmode;
10850 case SImode:
10851 return V4SImode;
10852 case HImode:
10853 return V8HImode;
10854 case QImode:
10855 return V16QImode;
10856 case DImode:
10857 return V2DImode;
10858 default:
10859 break;
10861 else
10862 switch (mode)
10864 case SFmode:
10865 return V2SFmode;
10866 case HFmode:
10867 return V4HFmode;
10868 case SImode:
10869 return V2SImode;
10870 case HImode:
10871 return V4HImode;
10872 case QImode:
10873 return V8QImode;
10874 default:
10875 break;
10878 return word_mode;
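/* For example, with TARGET_SIMD enabled:

	aarch64_simd_container_mode (SFmode, 128) == V4SFmode
	aarch64_simd_container_mode (SFmode, 64)  == V2SFmode
	aarch64_simd_container_mode (DImode, 64)  == DImode (word_mode fallback)

   and with SIMD disabled every query returns word_mode.  */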
10881 /* Return 128-bit container as the preferred SIMD mode for MODE. */
10882 static machine_mode
10883 aarch64_preferred_simd_mode (machine_mode mode)
10885 return aarch64_simd_container_mode (mode, 128);
10888 /* Return the bitmask of possible vector sizes for the vectorizer
10889 to iterate over. */
10890 static unsigned int
10891 aarch64_autovectorize_vector_sizes (void)
10893 return (16 | 8);
10896 /* Implement TARGET_MANGLE_TYPE. */
10898 static const char *
10899 aarch64_mangle_type (const_tree type)
10901 /* The AArch64 ABI documents say that "__va_list" has to be
10902 mangled as if it is in the "std" namespace. */
10903 if (lang_hooks.types_compatible_p (CONST_CAST_TREE (type), va_list_type))
10904 return "St9__va_list";
10906 /* Half-precision float. */
10907 if (TREE_CODE (type) == REAL_TYPE && TYPE_PRECISION (type) == 16)
10908 return "Dh";
10910 /* Mangle AArch64-specific internal types. TYPE_NAME is non-NULL_TREE for
10911 builtin types. */
10912 if (TYPE_NAME (type) != NULL)
10913 return aarch64_mangle_builtin_type (type);
10915 /* Use the default mangling. */
10916 return NULL;
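/* Mangling examples under the Itanium C++ ABI (illustrative):

	void f (va_list);	->  _Z1fSt9__va_list
	void g (__fp16);	->  _Z1gDh  */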
10919 /* Find the first rtx_insn before insn that will generate an assembly
10920 instruction. */
10922 static rtx_insn *
10923 aarch64_prev_real_insn (rtx_insn *insn)
10925 if (!insn)
10926 return NULL;
10930 insn = prev_real_insn (insn);
10932 while (insn && recog_memoized (insn) < 0);
10934 return insn;
10937 static bool
10938 is_madd_op (enum attr_type t1)
10940 unsigned int i;
10941 /* A number of these may be AArch32 only. */
10942 enum attr_type mlatypes[] = {
10943 TYPE_MLA, TYPE_MLAS, TYPE_SMLAD, TYPE_SMLADX, TYPE_SMLAL, TYPE_SMLALD,
10944 TYPE_SMLALS, TYPE_SMLALXY, TYPE_SMLAWX, TYPE_SMLAWY, TYPE_SMLAXY,
10945 TYPE_SMMLA, TYPE_UMLAL, TYPE_UMLALS, TYPE_SMLSD, TYPE_SMLSDX, TYPE_SMLSLD
10948 for (i = 0; i < sizeof (mlatypes) / sizeof (enum attr_type); i++)
10950 if (t1 == mlatypes[i])
10951 return true;
10954 return false;
10957 /* Check if there is a register dependency between a load and the insn
10958 for which we hold recog_data. */
10960 static bool
10961 dep_between_memop_and_curr (rtx memop)
10963 rtx load_reg;
10964 int opno;
10966 gcc_assert (GET_CODE (memop) == SET);
10968 if (!REG_P (SET_DEST (memop)))
10969 return false;
10971 load_reg = SET_DEST (memop);
10972 for (opno = 1; opno < recog_data.n_operands; opno++)
10974 rtx operand = recog_data.operand[opno];
10975 if (REG_P (operand)
10976 && reg_overlap_mentioned_p (load_reg, operand))
10977 return true;
10980 return false;
10984 /* When working around the Cortex-A53 erratum 835769,
10985 given rtx_insn INSN, return true if it is a 64-bit multiply-accumulate
10986 instruction and has a preceding memory instruction such that a NOP
10987 should be inserted between them. */
10989 bool
10990 aarch64_madd_needs_nop (rtx_insn* insn)
10992 enum attr_type attr_type;
10993 rtx_insn *prev;
10994 rtx body;
10996 if (!TARGET_FIX_ERR_A53_835769)
10997 return false;
10999 if (!INSN_P (insn) || recog_memoized (insn) < 0)
11000 return false;
11002 attr_type = get_attr_type (insn);
11003 if (!is_madd_op (attr_type))
11004 return false;
11006 prev = aarch64_prev_real_insn (insn);
11007 /* aarch64_prev_real_insn can call recog_memoized on insns other than INSN.
11008 Restore recog state to INSN to avoid state corruption. */
11009 extract_constrain_insn_cached (insn);
11011 if (!prev || !contains_mem_rtx_p (PATTERN (prev)))
11012 return false;
11014 body = single_set (prev);
11016 /* If the previous insn is a memory op and there is no dependency between
11017 it and the DImode madd, emit a NOP between them. If body is NULL then we
11018 have a complex memory operation, probably a load/store pair.
11019 Be conservative for now and emit a NOP. */
11020 if (GET_MODE (recog_data.operand[0]) == DImode
11021 && (!body || !dep_between_memop_and_curr (body)))
11022 return true;
11024 return false;
11029 /* Implement FINAL_PRESCAN_INSN. */
11031 void
11032 aarch64_final_prescan_insn (rtx_insn *insn)
11034 if (aarch64_madd_needs_nop (insn))
11035 fprintf (asm_out_file, "\tnop // between mem op and mult-accumulate\n");
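/* With -mfix-cortex-a53-835769 the effect on the output is, schematically,

	ldr	x3, [x2]
	nop	// between mem op and mult-accumulate
	madd	x0, x0, x1, x4

   i.e. a NOP separates a memory operation from an immediately following
   64-bit multiply-accumulate that does not depend on the loaded value
   (register numbers here are purely illustrative).  */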
11039 /* Return the equivalent letter for size. */
11040 static char
11041 sizetochar (int size)
11043 switch (size)
11045 case 64: return 'd';
11046 case 32: return 's';
11047 case 16: return 'h';
11048 case 8 : return 'b';
11049 default: gcc_unreachable ();
11053 /* Return true iff x is a uniform vector of floating-point
11054 constants, and the constant can be represented in
11055 quarter-precision form. Note, as aarch64_float_const_representable_p
11056 rejects both +0.0 and -0.0, we will also reject +0.0 and -0.0. */
11057 static bool
11058 aarch64_vect_float_const_representable_p (rtx x)
11060 rtx elt;
11061 return (GET_MODE_CLASS (GET_MODE (x)) == MODE_VECTOR_FLOAT
11062 && const_vec_duplicate_p (x, &elt)
11063 && aarch64_float_const_representable_p (elt));
11066 /* Return true for valid and false for invalid. */
11067 bool
11068 aarch64_simd_valid_immediate (rtx op, machine_mode mode, bool inverse,
11069 struct simd_immediate_info *info)
11071 #define CHECK(STRIDE, ELSIZE, CLASS, TEST, SHIFT, NEG) \
11072 matches = 1; \
11073 for (i = 0; i < idx; i += (STRIDE)) \
11074 if (!(TEST)) \
11075 matches = 0; \
11076 if (matches) \
11078 immtype = (CLASS); \
11079 elsize = (ELSIZE); \
11080 eshift = (SHIFT); \
11081 emvn = (NEG); \
11082 break; \
11085 unsigned int i, elsize = 0, idx = 0, n_elts = CONST_VECTOR_NUNITS (op);
11086 unsigned int innersize = GET_MODE_UNIT_SIZE (mode);
11087 unsigned char bytes[16];
11088 int immtype = -1, matches;
11089 unsigned int invmask = inverse ? 0xff : 0;
11090 int eshift, emvn;
11092 if (GET_MODE_CLASS (mode) == MODE_VECTOR_FLOAT)
11094 if (! (aarch64_simd_imm_zero_p (op, mode)
11095 || aarch64_vect_float_const_representable_p (op)))
11096 return false;
11098 if (info)
11100 info->value = CONST_VECTOR_ELT (op, 0);
11101 info->element_width = GET_MODE_BITSIZE (GET_MODE (info->value));
11102 info->mvn = false;
11103 info->shift = 0;
11106 return true;
11109 /* Splat vector constant out into a byte vector. */
11110 for (i = 0; i < n_elts; i++)
11112 /* The vector is provided in gcc endian-neutral fashion. For aarch64_be,
11113 it must be laid out in the vector register in reverse order. */
11114 rtx el = CONST_VECTOR_ELT (op, BYTES_BIG_ENDIAN ? (n_elts - 1 - i) : i);
11115 unsigned HOST_WIDE_INT elpart;
11117 gcc_assert (CONST_INT_P (el));
11118 elpart = INTVAL (el);
11120 for (unsigned int byte = 0; byte < innersize; byte++)
11122 bytes[idx++] = (elpart & 0xff) ^ invmask;
11123 elpart >>= BITS_PER_UNIT;
11128 /* Sanity check. */
11129 gcc_assert (idx == GET_MODE_SIZE (mode));
11133 CHECK (4, 32, 0, bytes[i] == bytes[0] && bytes[i + 1] == 0
11134 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 0, 0);
11136 CHECK (4, 32, 1, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11137 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11139 CHECK (4, 32, 2, bytes[i] == 0 && bytes[i + 1] == 0
11140 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11142 CHECK (4, 32, 3, bytes[i] == 0 && bytes[i + 1] == 0
11143 && bytes[i + 2] == 0 && bytes[i + 3] == bytes[3], 24, 0);
11145 CHECK (2, 16, 4, bytes[i] == bytes[0] && bytes[i + 1] == 0, 0, 0);
11147 CHECK (2, 16, 5, bytes[i] == 0 && bytes[i + 1] == bytes[1], 8, 0);
11149 CHECK (4, 32, 6, bytes[i] == bytes[0] && bytes[i + 1] == 0xff
11150 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 0, 1);
11152 CHECK (4, 32, 7, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11153 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11155 CHECK (4, 32, 8, bytes[i] == 0xff && bytes[i + 1] == 0xff
11156 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11158 CHECK (4, 32, 9, bytes[i] == 0xff && bytes[i + 1] == 0xff
11159 && bytes[i + 2] == 0xff && bytes[i + 3] == bytes[3], 24, 1);
11161 CHECK (2, 16, 10, bytes[i] == bytes[0] && bytes[i + 1] == 0xff, 0, 1);
11163 CHECK (2, 16, 11, bytes[i] == 0xff && bytes[i + 1] == bytes[1], 8, 1);
11165 CHECK (4, 32, 12, bytes[i] == 0xff && bytes[i + 1] == bytes[1]
11166 && bytes[i + 2] == 0 && bytes[i + 3] == 0, 8, 0);
11168 CHECK (4, 32, 13, bytes[i] == 0 && bytes[i + 1] == bytes[1]
11169 && bytes[i + 2] == 0xff && bytes[i + 3] == 0xff, 8, 1);
11171 CHECK (4, 32, 14, bytes[i] == 0xff && bytes[i + 1] == 0xff
11172 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0, 16, 0);
11174 CHECK (4, 32, 15, bytes[i] == 0 && bytes[i + 1] == 0
11175 && bytes[i + 2] == bytes[2] && bytes[i + 3] == 0xff, 16, 1);
11177 CHECK (1, 8, 16, bytes[i] == bytes[0], 0, 0);
11179 CHECK (1, 64, 17, (bytes[i] == 0 || bytes[i] == 0xff)
11180 && bytes[i] == bytes[(i + 8) % idx], 0, 0);
11182 while (0);
11184 if (immtype == -1)
11185 return false;
11187 if (info)
11189 info->element_width = elsize;
11190 info->mvn = emvn != 0;
11191 info->shift = eshift;
11193 unsigned HOST_WIDE_INT imm = 0;
11195 if (immtype >= 12 && immtype <= 15)
11196 info->msl = true;
11198 /* Un-invert bytes of recognized vector, if necessary. */
11199 if (invmask != 0)
11200 for (i = 0; i < idx; i++)
11201 bytes[i] ^= invmask;
11203 if (immtype == 17)
11205 /* FIXME: Broken on 32-bit H_W_I hosts. */
11206 gcc_assert (sizeof (HOST_WIDE_INT) == 8);
11208 for (i = 0; i < 8; i++)
11209 imm |= (unsigned HOST_WIDE_INT) (bytes[i] ? 0xff : 0)
11210 << (i * BITS_PER_UNIT);
11213 info->value = GEN_INT (imm);
11215 else
11217 for (i = 0; i < elsize / BITS_PER_UNIT; i++)
11218 imm |= (unsigned HOST_WIDE_INT) bytes[i] << (i * BITS_PER_UNIT);
11220 /* Construct 'abcdefgh' because the assembler cannot handle
11221 generic constants. */
11222 if (info->mvn)
11223 imm = ~imm;
11224 imm = (imm >> info->shift) & 0xff;
11225 info->value = GEN_INT (imm);
11229 return true;
11230 #undef CHECK
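/* Worked example of the matching above (a sketch): a V4SImode constant whose
   elements are all 0x00ab0000 splats to the byte vector
   { 0x00, 0x00, 0xab, 0x00, ... }, which matches the CLASS 2 test with
   ELSIZE 32 and SHIFT 16, so INFO describes the single byte 0xab shifted
   left by 16 and the constant can be emitted as

	movi	v0.4s, #0xab, lsl #16

   Elements of 0xffff12ff match an MVN form (CLASS 7) instead and end up as
   an MVNI of the inverted byte 0xed shifted left by 8.  */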
11233 /* Check that immediate shift constants are within range. */
11234 bool
11235 aarch64_simd_shift_imm_p (rtx x, machine_mode mode, bool left)
11237 int bit_width = GET_MODE_UNIT_SIZE (mode) * BITS_PER_UNIT;
11238 if (left)
11239 return aarch64_const_vec_all_same_in_range_p (x, 0, bit_width - 1);
11240 else
11241 return aarch64_const_vec_all_same_in_range_p (x, 1, bit_width);
11244 /* Return true if X is a uniform vector where all elements
11245 are either the floating-point constant 0.0 or the
11246 integer constant 0. */
11247 bool
11248 aarch64_simd_imm_zero_p (rtx x, machine_mode mode)
11250 return x == CONST0_RTX (mode);
11254 /* Return the bitmask CONST_INT to select the bits required by a zero extract
11255 operation of width WIDTH at bit position POS. */
11258 aarch64_mask_from_zextract_ops (rtx width, rtx pos)
11260 gcc_assert (CONST_INT_P (width));
11261 gcc_assert (CONST_INT_P (pos));
11263 unsigned HOST_WIDE_INT mask
11264 = ((unsigned HOST_WIDE_INT) 1 << UINTVAL (width)) - 1;
11265 return GEN_INT (mask << UINTVAL (pos));
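/* For example, WIDTH == 8 and POS == 16 give

	mask = ((unsigned HOST_WIDE_INT) 1 << 8) - 1 = 0xff

   shifted left by 16, i.e. GEN_INT (0xff0000), selecting bits 16..23.  */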
11268 bool
11269 aarch64_simd_imm_scalar_p (rtx x, machine_mode mode ATTRIBUTE_UNUSED)
11271 HOST_WIDE_INT imm = INTVAL (x);
11272 int i;
11274 for (i = 0; i < 8; i++)
11276 unsigned int byte = imm & 0xff;
11277 if (byte != 0xff && byte != 0)
11278 return false;
11279 imm >>= 8;
11282 return true;
11285 bool
11286 aarch64_mov_operand_p (rtx x, machine_mode mode)
11288 if (GET_CODE (x) == HIGH
11289 && aarch64_valid_symref (XEXP (x, 0), GET_MODE (XEXP (x, 0))))
11290 return true;
11292 if (CONST_INT_P (x))
11293 return true;
11295 if (GET_CODE (x) == SYMBOL_REF && mode == DImode && CONSTANT_ADDRESS_P (x))
11296 return true;
11298 return aarch64_classify_symbolic_expression (x)
11299 == SYMBOL_TINY_ABSOLUTE;
11302 /* Return a CONST_VECTOR of MODE in which every element is the CONST_INT VAL. */
11304 aarch64_simd_gen_const_vector_dup (machine_mode mode, HOST_WIDE_INT val)
11306 int nunits = GET_MODE_NUNITS (mode);
11307 rtvec v = rtvec_alloc (nunits);
11308 int i;
11310 rtx cache = GEN_INT (val);
11312 for (i = 0; i < nunits; i++)
11313 RTVEC_ELT (v, i) = cache;
11315 return gen_rtx_CONST_VECTOR (mode, v);
11318 /* Check OP is a legal scalar immediate for the MOVI instruction. */
11320 bool
11321 aarch64_simd_scalar_immediate_valid_for_move (rtx op, machine_mode mode)
11323 machine_mode vmode;
11325 gcc_assert (!VECTOR_MODE_P (mode));
11326 vmode = aarch64_preferred_simd_mode (mode);
11327 rtx op_v = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (op));
11328 return aarch64_simd_valid_immediate (op_v, vmode, false, NULL);
11331 /* Construct and return a PARALLEL RTX vector with elements numbering the
11332 lanes of either the high (HIGH == TRUE) or low (HIGH == FALSE) half of
11333 the vector - from the perspective of the architecture. This does not
11334 line up with GCC's perspective on lane numbers, so we end up with
11335 different masks depending on our target endian-ness. The diagram
11336 below may help. We must draw the distinction when building masks
11337 which select one half of the vector. An instruction selecting
11338 architectural low-lanes for a big-endian target must be described using
11339 a mask selecting GCC high-lanes.
11341 Big-Endian Little-Endian
11343 GCC 0 1 2 3 3 2 1 0
11344 | x | x | x | x | | x | x | x | x |
11345 Architecture 3 2 1 0 3 2 1 0
11347 Low Mask: { 2, 3 } { 0, 1 }
11348 High Mask: { 0, 1 } { 2, 3 }
11352 aarch64_simd_vect_par_cnst_half (machine_mode mode, bool high)
11354 int nunits = GET_MODE_NUNITS (mode);
11355 rtvec v = rtvec_alloc (nunits / 2);
11356 int high_base = nunits / 2;
11357 int low_base = 0;
11358 int base;
11359 rtx t1;
11360 int i;
11362 if (BYTES_BIG_ENDIAN)
11363 base = high ? low_base : high_base;
11364 else
11365 base = high ? high_base : low_base;
11367 for (i = 0; i < nunits / 2; i++)
11368 RTVEC_ELT (v, i) = GEN_INT (base + i);
11370 t1 = gen_rtx_PARALLEL (mode, v);
11371 return t1;
11374 /* Check OP for validity as a PARALLEL RTX vector with elements
11375 numbering the lanes of either the high (HIGH == TRUE) or low lanes,
11376 from the perspective of the architecture. See the diagram above
11377 aarch64_simd_vect_par_cnst_half for more details. */
11379 bool
11380 aarch64_simd_check_vect_par_cnst_half (rtx op, machine_mode mode,
11381 bool high)
11383 rtx ideal = aarch64_simd_vect_par_cnst_half (mode, high);
11384 HOST_WIDE_INT count_op = XVECLEN (op, 0);
11385 HOST_WIDE_INT count_ideal = XVECLEN (ideal, 0);
11386 int i = 0;
11388 if (!VECTOR_MODE_P (mode))
11389 return false;
11391 if (count_op != count_ideal)
11392 return false;
11394 for (i = 0; i < count_ideal; i++)
11396 rtx elt_op = XVECEXP (op, 0, i);
11397 rtx elt_ideal = XVECEXP (ideal, 0, i);
11399 if (!CONST_INT_P (elt_op)
11400 || INTVAL (elt_ideal) != INTVAL (elt_op))
11401 return false;
11403 return true;
11406 /* Bounds-check lanes. Ensure OPERAND lies between LOW (inclusive) and
11407 HIGH (exclusive). */
11408 void
11409 aarch64_simd_lane_bounds (rtx operand, HOST_WIDE_INT low, HOST_WIDE_INT high,
11410 const_tree exp)
11412 HOST_WIDE_INT lane;
11413 gcc_assert (CONST_INT_P (operand));
11414 lane = INTVAL (operand);
11416 if (lane < low || lane >= high)
11418 if (exp)
11419 error ("%Klane %wd out of range %wd - %wd", exp, lane, low, high - 1);
11420 else
11421 error ("lane %wd out of range %wd - %wd", lane, low, high - 1);
11425 /* Return TRUE if OP is a valid vector addressing mode. */
11426 bool
11427 aarch64_simd_mem_operand_p (rtx op)
11429 return MEM_P (op) && (GET_CODE (XEXP (op, 0)) == POST_INC
11430 || REG_P (XEXP (op, 0)));
11433 /* Emit a register copy from operand to operand, taking care not to
11434 early-clobber source registers in the process.
11436 COUNT is the number of components into which the copy needs to be
11437 decomposed. */
11438 void
11439 aarch64_simd_emit_reg_reg_move (rtx *operands, enum machine_mode mode,
11440 unsigned int count)
11442 unsigned int i;
11443 int rdest = REGNO (operands[0]);
11444 int rsrc = REGNO (operands[1]);
11446 if (!reg_overlap_mentioned_p (operands[0], operands[1])
11447 || rdest < rsrc)
11448 for (i = 0; i < count; i++)
11449 emit_move_insn (gen_rtx_REG (mode, rdest + i),
11450 gen_rtx_REG (mode, rsrc + i));
11451 else
11452 for (i = 0; i < count; i++)
11453 emit_move_insn (gen_rtx_REG (mode, rdest + count - i - 1),
11454 gen_rtx_REG (mode, rsrc + count - i - 1));
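/* For example, copying an OImode value (two V4SImode registers) from
   { V1, V2 } to { V2, V3 } overlaps with RDEST > RSRC, so the moves are
   emitted in reverse order -- V3 <- V2 first, then V2 <- V1 -- to avoid
   clobbering V2 before it is read.  Copying { V2, V3 } down to { V1, V2 }
   is safe in forward order.  */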
11457 /* Compute and return the length of aarch64_simd_reglist<mode>, where <mode> is
11458 one of VSTRUCT modes: OI, CI, or XI. */
11460 aarch64_simd_attr_length_rglist (enum machine_mode mode)
11462 return (GET_MODE_SIZE (mode) / UNITS_PER_VREG) * 4;
11465 /* Implement target hook TARGET_VECTOR_ALIGNMENT. The AAPCS64 sets the maximum
11466 alignment of a vector to 128 bits. */
11467 static HOST_WIDE_INT
11468 aarch64_simd_vector_alignment (const_tree type)
11470 HOST_WIDE_INT align = tree_to_shwi (TYPE_SIZE (type));
11471 return MIN (align, 128);
11474 /* Implement target hook TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE. */
11475 static bool
11476 aarch64_simd_vector_alignment_reachable (const_tree type, bool is_packed)
11478 if (is_packed)
11479 return false;
11481 /* We guarantee alignment for vectors up to 128-bits. */
11482 if (tree_int_cst_compare (TYPE_SIZE (type),
11483 bitsize_int (BIGGEST_ALIGNMENT)) > 0)
11484 return false;
11486 /* Vectors whose size is <= BIGGEST_ALIGNMENT are naturally aligned. */
11487 return true;
11490 /* Return true if the vector misalignment factor is supported by the
11491 target. */
11492 static bool
11493 aarch64_builtin_support_vector_misalignment (machine_mode mode,
11494 const_tree type, int misalignment,
11495 bool is_packed)
11497 if (TARGET_SIMD && STRICT_ALIGNMENT)
11499 /* Return false if the movmisalign pattern is not supported for this mode. */
11500 if (optab_handler (movmisalign_optab, mode) == CODE_FOR_nothing)
11501 return false;
11503 if (misalignment == -1)
11505 /* Misalignment factor is unknown at compile time but we know
11506 it's word aligned. */
11507 if (aarch64_simd_vector_alignment_reachable (type, is_packed))
11509 int element_size = TREE_INT_CST_LOW (TYPE_SIZE (type));
11511 if (element_size != 64)
11512 return true;
11514 return false;
11517 return default_builtin_support_vector_misalignment (mode, type, misalignment,
11518 is_packed);
11521 /* If VALS is a vector constant that can be loaded into a register
11522 using DUP, generate instructions to do so and return an RTX to
11523 assign to the register. Otherwise return NULL_RTX. */
11524 static rtx
11525 aarch64_simd_dup_constant (rtx vals)
11527 machine_mode mode = GET_MODE (vals);
11528 machine_mode inner_mode = GET_MODE_INNER (mode);
11529 rtx x;
11531 if (!const_vec_duplicate_p (vals, &x))
11532 return NULL_RTX;
11534 /* We can load this constant by using DUP and a constant in a
11535 single ARM register. This will be cheaper than a vector
11536 load. */
11537 x = copy_to_mode_reg (inner_mode, x);
11538 return gen_rtx_VEC_DUPLICATE (mode, x);
11542 /* Generate code to load VALS, which is a PARALLEL containing only
11543 constants (for vec_init) or CONST_VECTOR, efficiently into a
11544 register. Returns an RTX to copy into the register, or NULL_RTX
11545 for a PARALLEL that can not be converted into a CONST_VECTOR. */
11546 static rtx
11547 aarch64_simd_make_constant (rtx vals)
11549 machine_mode mode = GET_MODE (vals);
11550 rtx const_dup;
11551 rtx const_vec = NULL_RTX;
11552 int n_elts = GET_MODE_NUNITS (mode);
11553 int n_const = 0;
11554 int i;
11556 if (GET_CODE (vals) == CONST_VECTOR)
11557 const_vec = vals;
11558 else if (GET_CODE (vals) == PARALLEL)
11560 /* A CONST_VECTOR must contain only CONST_INTs and
11561 CONST_DOUBLEs, but CONSTANT_P allows more (e.g. SYMBOL_REF).
11562 Only store valid constants in a CONST_VECTOR. */
11563 for (i = 0; i < n_elts; ++i)
11565 rtx x = XVECEXP (vals, 0, i);
11566 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11567 n_const++;
11569 if (n_const == n_elts)
11570 const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0));
11572 else
11573 gcc_unreachable ();
11575 if (const_vec != NULL_RTX
11576 && aarch64_simd_valid_immediate (const_vec, mode, false, NULL))
11577 /* Load using MOVI/MVNI. */
11578 return const_vec;
11579 else if ((const_dup = aarch64_simd_dup_constant (vals)) != NULL_RTX)
11580 /* Loaded using DUP. */
11581 return const_dup;
11582 else if (const_vec != NULL_RTX)
11583 /* Load from constant pool. We can not take advantage of single-cycle
11584 LD1 because we need a PC-relative addressing mode. */
11585 return const_vec;
11586 else
11587 /* A PARALLEL containing something not valid inside CONST_VECTOR.
11588 We can not construct an initializer. */
11589 return NULL_RTX;
11592 /* Expand a vector initialisation sequence, such that TARGET is
11593 initialised to contain VALS. */
11595 void
11596 aarch64_expand_vector_init (rtx target, rtx vals)
11598 machine_mode mode = GET_MODE (target);
11599 machine_mode inner_mode = GET_MODE_INNER (mode);
11600 /* The number of vector elements. */
11601 int n_elts = GET_MODE_NUNITS (mode);
11602 /* The number of vector elements which are not constant. */
11603 int n_var = 0;
11604 rtx any_const = NULL_RTX;
11605 /* The first element of vals. */
11606 rtx v0 = XVECEXP (vals, 0, 0);
11607 bool all_same = true;
11609 /* Count the number of variable elements to initialise. */
11610 for (int i = 0; i < n_elts; ++i)
11612 rtx x = XVECEXP (vals, 0, i);
11613 if (!(CONST_INT_P (x) || CONST_DOUBLE_P (x)))
11614 ++n_var;
11615 else
11616 any_const = x;
11618 all_same &= rtx_equal_p (x, v0);
11621 /* No variable elements, hand off to aarch64_simd_make_constant which knows
11622 how best to handle this. */
11623 if (n_var == 0)
11625 rtx constant = aarch64_simd_make_constant (vals);
11626 if (constant != NULL_RTX)
11628 emit_move_insn (target, constant);
11629 return;
11633 /* Splat a single non-constant element if we can. */
11634 if (all_same)
11636 rtx x = copy_to_mode_reg (inner_mode, v0);
11637 aarch64_emit_move (target, gen_rtx_VEC_DUPLICATE (mode, x));
11638 return;
11641 /* Initialise a vector which is part-variable. We want to first try
11642 to build those lanes which are constant in the most efficient way we
11643 can. */
11644 if (n_var != n_elts)
11646 rtx copy = copy_rtx (vals);
11648 /* Load constant part of vector. We really don't care what goes into the
11649 parts we will overwrite, but we're more likely to be able to load the
11650 constant efficiently if it has fewer, larger, repeating parts
11651 (see aarch64_simd_valid_immediate). */
11652 for (int i = 0; i < n_elts; i++)
11654 rtx x = XVECEXP (vals, 0, i);
11655 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11656 continue;
11657 rtx subst = any_const;
11658 for (int bit = n_elts / 2; bit > 0; bit /= 2)
11660 /* Look in the copied vector, as more elements are const. */
11661 rtx test = XVECEXP (copy, 0, i ^ bit);
11662 if (CONST_INT_P (test) || CONST_DOUBLE_P (test))
11664 subst = test;
11665 break;
11668 XVECEXP (copy, 0, i) = subst;
11670 aarch64_expand_vector_init (target, copy);
11673 /* Insert the variable lanes directly. */
11675 enum insn_code icode = optab_handler (vec_set_optab, mode);
11676 gcc_assert (icode != CODE_FOR_nothing);
11678 for (int i = 0; i < n_elts; i++)
11680 rtx x = XVECEXP (vals, 0, i);
11681 if (CONST_INT_P (x) || CONST_DOUBLE_P (x))
11682 continue;
11683 x = copy_to_mode_reg (inner_mode, x);
11684 emit_insn (GEN_FCN (icode) (target, x, GEN_INT (i)));
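/* Illustrative example: initialising a V4SImode vector to { x, 1, 2, 3 },
   where X lives in a general register, first recurses on the all-constant
   vector { 2, 1, 2, 3 } (the variable lane is filled from a nearby constant
   so that the constant part stays cheap to materialise) and then overwrites
   lane 0 via the vec_set pattern, giving roughly

	ldr	q0, .LC0	// { 2, 1, 2, 3 } from the constant pool
	ins	v0.s[0], w0

   (register and label numbers are arbitrary here).  */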
11688 static unsigned HOST_WIDE_INT
11689 aarch64_shift_truncation_mask (machine_mode mode)
11691 return
11692 (!SHIFT_COUNT_TRUNCATED
11693 || aarch64_vector_mode_supported_p (mode)
11694 || aarch64_vect_struct_mode_p (mode)) ? 0 : (GET_MODE_BITSIZE (mode) - 1);
11697 /* Select a format to encode pointers in exception handling data. */
11699 aarch64_asm_preferred_eh_data_format (int code ATTRIBUTE_UNUSED, int global)
11701 int type;
11702 switch (aarch64_cmodel)
11704 case AARCH64_CMODEL_TINY:
11705 case AARCH64_CMODEL_TINY_PIC:
11706 case AARCH64_CMODEL_SMALL:
11707 case AARCH64_CMODEL_SMALL_PIC:
11708 case AARCH64_CMODEL_SMALL_SPIC:
11709 /* text+got+data < 4Gb. 4-byte signed relocs are sufficient
11710 for everything. */
11711 type = DW_EH_PE_sdata4;
11712 break;
11713 default:
11714 /* No assumptions here. 8-byte relocs required. */
11715 type = DW_EH_PE_sdata8;
11716 break;
11718 return (global ? DW_EH_PE_indirect : 0) | DW_EH_PE_pcrel | type;
11721 /* The last .arch and .tune assembly strings that we printed. */
11722 static std::string aarch64_last_printed_arch_string;
11723 static std::string aarch64_last_printed_tune_string;
11725 /* Implement ASM_DECLARE_FUNCTION_NAME. Output the ISA features used
11726 by the function fndecl. */
11728 void
11729 aarch64_declare_function_name (FILE *stream, const char* name,
11730 tree fndecl)
11732 tree target_parts = DECL_FUNCTION_SPECIFIC_TARGET (fndecl);
11734 struct cl_target_option *targ_options;
11735 if (target_parts)
11736 targ_options = TREE_TARGET_OPTION (target_parts);
11737 else
11738 targ_options = TREE_TARGET_OPTION (target_option_current_node);
11739 gcc_assert (targ_options);
11741 const struct processor *this_arch
11742 = aarch64_get_arch (targ_options->x_explicit_arch);
11744 unsigned long isa_flags = targ_options->x_aarch64_isa_flags;
11745 std::string extension
11746 = aarch64_get_extension_string_for_isa_flags (isa_flags,
11747 this_arch->flags);
11748 /* Only update the assembler .arch string if it is distinct from the last
11749 such string we printed. */
11750 std::string to_print = this_arch->name + extension;
11751 if (to_print != aarch64_last_printed_arch_string)
11753 asm_fprintf (asm_out_file, "\t.arch %s\n", to_print.c_str ());
11754 aarch64_last_printed_arch_string = to_print;
11757 /* Print the cpu name we're tuning for in the comments; it might be
11758 useful to readers of the generated asm. Do it only when it changes
11759 from function to function and verbose assembly is requested. */
11760 const struct processor *this_tune
11761 = aarch64_get_tune_cpu (targ_options->x_explicit_tune_core);
11763 if (flag_debug_asm && aarch64_last_printed_tune_string != this_tune->name)
11765 asm_fprintf (asm_out_file, "\t" ASM_COMMENT_START ".tune %s\n",
11766 this_tune->name);
11767 aarch64_last_printed_tune_string = this_tune->name;
11770 /* Don't forget the type directive for ELF. */
11771 ASM_OUTPUT_TYPE_DIRECTIVE (stream, name, "function");
11772 ASM_OUTPUT_LABEL (stream, name);
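/* For example, with -march=armv8-a+crc -mtune=cortex-a57 -dA the first
   function of a translation unit is preceded by something like

	.arch armv8-a+crc
	// .tune cortex-a57
	.type	foo, %function
   foo:

   and the .arch/.tune lines are only repeated later if a target attribute
   or pragma changes them between functions (a sketch; exact output depends
   on the configuration).  */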
11775 /* Implements TARGET_ASM_FILE_START. Output the assembly header. */
11777 static void
11778 aarch64_start_file (void)
11780 struct cl_target_option *default_options
11781 = TREE_TARGET_OPTION (target_option_default_node);
11783 const struct processor *default_arch
11784 = aarch64_get_arch (default_options->x_explicit_arch);
11785 unsigned long default_isa_flags = default_options->x_aarch64_isa_flags;
11786 std::string extension
11787 = aarch64_get_extension_string_for_isa_flags (default_isa_flags,
11788 default_arch->flags);
11790 aarch64_last_printed_arch_string = default_arch->name + extension;
11791 aarch64_last_printed_tune_string = "";
11792 asm_fprintf (asm_out_file, "\t.arch %s\n",
11793 aarch64_last_printed_arch_string.c_str ());
11795 default_file_start ();
11798 /* Emit load exclusive. */
11800 static void
11801 aarch64_emit_load_exclusive (machine_mode mode, rtx rval,
11802 rtx mem, rtx model_rtx)
11804 rtx (*gen) (rtx, rtx, rtx);
11806 switch (mode)
11808 case QImode: gen = gen_aarch64_load_exclusiveqi; break;
11809 case HImode: gen = gen_aarch64_load_exclusivehi; break;
11810 case SImode: gen = gen_aarch64_load_exclusivesi; break;
11811 case DImode: gen = gen_aarch64_load_exclusivedi; break;
11812 default:
11813 gcc_unreachable ();
11816 emit_insn (gen (rval, mem, model_rtx));
11819 /* Emit store exclusive. */
11821 static void
11822 aarch64_emit_store_exclusive (machine_mode mode, rtx bval,
11823 rtx rval, rtx mem, rtx model_rtx)
11825 rtx (*gen) (rtx, rtx, rtx, rtx);
11827 switch (mode)
11829 case QImode: gen = gen_aarch64_store_exclusiveqi; break;
11830 case HImode: gen = gen_aarch64_store_exclusivehi; break;
11831 case SImode: gen = gen_aarch64_store_exclusivesi; break;
11832 case DImode: gen = gen_aarch64_store_exclusivedi; break;
11833 default:
11834 gcc_unreachable ();
11837 emit_insn (gen (bval, rval, mem, model_rtx));
11840 /* Emit INSN as a jump and mark it as very unlikely to be taken. */
11842 static void
11843 aarch64_emit_unlikely_jump (rtx insn)
11845 int very_unlikely = REG_BR_PROB_BASE / 100 - 1;
11847 rtx_insn *jump = emit_jump_insn (insn);
11848 add_int_reg_note (jump, REG_BR_PROB, very_unlikely);
11851 /* Expand a compare and swap pattern. */
11853 void
11854 aarch64_expand_compare_and_swap (rtx operands[])
11856 rtx bval, rval, mem, oldval, newval, is_weak, mod_s, mod_f, x;
11857 machine_mode mode, cmp_mode;
11858 typedef rtx (*gen_cas_fn) (rtx, rtx, rtx, rtx, rtx, rtx, rtx);
11859 int idx;
11860 gen_cas_fn gen;
11861 const gen_cas_fn split_cas[] =
11863 gen_aarch64_compare_and_swapqi,
11864 gen_aarch64_compare_and_swaphi,
11865 gen_aarch64_compare_and_swapsi,
11866 gen_aarch64_compare_and_swapdi
11868 const gen_cas_fn atomic_cas[] =
11870 gen_aarch64_compare_and_swapqi_lse,
11871 gen_aarch64_compare_and_swaphi_lse,
11872 gen_aarch64_compare_and_swapsi_lse,
11873 gen_aarch64_compare_and_swapdi_lse
11876 bval = operands[0];
11877 rval = operands[1];
11878 mem = operands[2];
11879 oldval = operands[3];
11880 newval = operands[4];
11881 is_weak = operands[5];
11882 mod_s = operands[6];
11883 mod_f = operands[7];
11884 mode = GET_MODE (mem);
11885 cmp_mode = mode;
11887 /* Normally the succ memory model must be stronger than fail, but in the
11888 unlikely event of fail being ACQUIRE and succ being RELEASE we need to
11889 promote succ to ACQ_REL so that we don't lose the acquire semantics. */
11891 if (is_mm_acquire (memmodel_from_int (INTVAL (mod_f)))
11892 && is_mm_release (memmodel_from_int (INTVAL (mod_s))))
11893 mod_s = GEN_INT (MEMMODEL_ACQ_REL);
11895 switch (mode)
11897 case QImode:
11898 case HImode:
11899 /* For short modes, we're going to perform the comparison in SImode,
11900 so do the zero-extension now. */
11901 cmp_mode = SImode;
11902 rval = gen_reg_rtx (SImode);
11903 oldval = convert_modes (SImode, mode, oldval, true);
11904 /* Fall through. */
11906 case SImode:
11907 case DImode:
11908 /* Force the value into a register if needed. */
11909 if (!aarch64_plus_operand (oldval, mode))
11910 oldval = force_reg (cmp_mode, oldval);
11911 break;
11913 default:
11914 gcc_unreachable ();
11917 switch (mode)
11919 case QImode: idx = 0; break;
11920 case HImode: idx = 1; break;
11921 case SImode: idx = 2; break;
11922 case DImode: idx = 3; break;
11923 default:
11924 gcc_unreachable ();
11926 if (TARGET_LSE)
11927 gen = atomic_cas[idx];
11928 else
11929 gen = split_cas[idx];
11931 emit_insn (gen (rval, mem, oldval, newval, is_weak, mod_s, mod_f));
11933 if (mode == QImode || mode == HImode)
11934 emit_move_insn (operands[1], gen_lowpart (mode, rval));
11936 x = gen_rtx_REG (CCmode, CC_REGNUM);
11937 x = gen_rtx_EQ (SImode, x, const0_rtx);
11938 emit_insn (gen_rtx_SET (bval, x));
11941 /* Test whether the target supports using an atomic load-operate instruction.
11942 CODE is the operation. Returns FALSE if the operation isn't supported
11943 by the architecture. */
11947 bool
11948 aarch64_atomic_ldop_supported_p (enum rtx_code code)
11950 if (!TARGET_LSE)
11951 return false;
11953 switch (code)
11955 case SET:
11956 case AND:
11957 case IOR:
11958 case XOR:
11959 case MINUS:
11960 case PLUS:
11961 return true;
11962 default:
11963 return false;
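/* The codes accepted above map onto the LSE atomics roughly as follows:
   SET is implemented with SWP, PLUS with LDADD, MINUS with LDADD of the
   negated value, IOR with LDSET, XOR with LDEOR, and AND with LDCLR of
   the complemented value (see aarch64_gen_atomic_ldop below).  */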
11967 /* Emit a barrier, that is appropriate for memory model MODEL, at the end of a
11968 sequence implementing an atomic operation. */
11970 static void
11971 aarch64_emit_post_barrier (enum memmodel model)
11973 const enum memmodel base_model = memmodel_base (model);
11975 if (is_mm_sync (model)
11976 && (base_model == MEMMODEL_ACQUIRE
11977 || base_model == MEMMODEL_ACQ_REL
11978 || base_model == MEMMODEL_SEQ_CST))
11980 emit_insn (gen_mem_thread_fence (GEN_INT (MEMMODEL_SEQ_CST)));
11984 /* Emit an atomic compare-and-swap operation. RVAL is the destination register
11985 for the data in memory. EXPECTED is the value expected to be in memory.
11986 DESIRED is the value to store to memory. MEM is the memory location. MODEL
11987 is the memory ordering to use. */
11989 void
11990 aarch64_gen_atomic_cas (rtx rval, rtx mem,
11991 rtx expected, rtx desired,
11992 rtx model)
11994 rtx (*gen) (rtx, rtx, rtx, rtx);
11995 machine_mode mode;
11997 mode = GET_MODE (mem);
11999 switch (mode)
12001 case QImode: gen = gen_aarch64_atomic_casqi; break;
12002 case HImode: gen = gen_aarch64_atomic_cashi; break;
12003 case SImode: gen = gen_aarch64_atomic_cassi; break;
12004 case DImode: gen = gen_aarch64_atomic_casdi; break;
12005 default:
12006 gcc_unreachable ();
12009 /* Move the expected value into the CAS destination register. */
12010 emit_insn (gen_rtx_SET (rval, expected));
12012 /* Emit the CAS. */
12013 emit_insn (gen (rval, mem, desired, model));
12015 /* Compare the expected value with the value loaded by the CAS, to establish
12016 whether the swap was made. */
12017 aarch64_gen_compare_reg (EQ, rval, expected);
12020 /* Split a compare and swap pattern. */
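/* For example, a strong SEQ_CST compare-and-swap on an SImode location
   splits into roughly:
     1: ldaxr w_rval, [x_mem]
        cmp w_rval, w_oldval
        b.ne 2f
        stlxr w_scratch, w_newval, [x_mem]
        cbnz w_scratch, 1b
     2:
   This is illustrative only; the exact exclusive-access variants depend on
   the memory model and on whether the CAS is weak.  */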
12022 void
12023 aarch64_split_compare_and_swap (rtx operands[])
12025 rtx rval, mem, oldval, newval, scratch;
12026 machine_mode mode;
12027 bool is_weak;
12028 rtx_code_label *label1, *label2;
12029 rtx x, cond;
12030 enum memmodel model;
12031 rtx model_rtx;
12033 rval = operands[0];
12034 mem = operands[1];
12035 oldval = operands[2];
12036 newval = operands[3];
12037 is_weak = (operands[4] != const0_rtx);
12038 model_rtx = operands[5];
12039 scratch = operands[7];
12040 mode = GET_MODE (mem);
12041 model = memmodel_from_int (INTVAL (model_rtx));
12043 label1 = NULL;
12044 if (!is_weak)
12046 label1 = gen_label_rtx ();
12047 emit_label (label1);
12049 label2 = gen_label_rtx ();
12051 /* The initial load can be relaxed for a __sync operation since a final
12052 barrier will be emitted to stop code hoisting. */
12053 if (is_mm_sync (model))
12054 aarch64_emit_load_exclusive (mode, rval, mem,
12055 GEN_INT (MEMMODEL_RELAXED));
12056 else
12057 aarch64_emit_load_exclusive (mode, rval, mem, model_rtx);
12059 cond = aarch64_gen_compare_reg (NE, rval, oldval);
12060 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12061 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12062 gen_rtx_LABEL_REF (Pmode, label2), pc_rtx);
12063 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12065 aarch64_emit_store_exclusive (mode, scratch, mem, newval, model_rtx);
12067 if (!is_weak)
12069 x = gen_rtx_NE (VOIDmode, scratch, const0_rtx);
12070 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12071 gen_rtx_LABEL_REF (Pmode, label1), pc_rtx);
12072 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12074 else
12076 cond = gen_rtx_REG (CCmode, CC_REGNUM);
12077 x = gen_rtx_COMPARE (CCmode, scratch, const0_rtx);
12078 emit_insn (gen_rtx_SET (cond, x));
12081 emit_label (label2);
12083 /* Emit any final barrier needed for a __sync operation. */
12084 if (is_mm_sync (model))
12085 aarch64_emit_post_barrier (model);
12088 /* Emit a BIC instruction. */
12090 static void
12091 aarch64_emit_bic (machine_mode mode, rtx dst, rtx s1, rtx s2, int shift)
12093 rtx shift_rtx = GEN_INT (shift);
12094 rtx (*gen) (rtx, rtx, rtx, rtx);
12096 switch (mode)
12098 case SImode: gen = gen_and_one_cmpl_lshrsi3; break;
12099 case DImode: gen = gen_and_one_cmpl_lshrdi3; break;
12100 default:
12101 gcc_unreachable ();
12104 emit_insn (gen (dst, s2, shift_rtx, s1));
12107 /* Emit an atomic swap. */
12109 static void
12110 aarch64_emit_atomic_swap (machine_mode mode, rtx dst, rtx value,
12111 rtx mem, rtx model)
12113 rtx (*gen) (rtx, rtx, rtx, rtx);
12115 switch (mode)
12117 case QImode: gen = gen_aarch64_atomic_swpqi; break;
12118 case HImode: gen = gen_aarch64_atomic_swphi; break;
12119 case SImode: gen = gen_aarch64_atomic_swpsi; break;
12120 case DImode: gen = gen_aarch64_atomic_swpdi; break;
12121 default:
12122 gcc_unreachable ();
12125 emit_insn (gen (dst, mem, value, model));
12128 /* Operations supported by aarch64_emit_atomic_load_op. */
12130 enum aarch64_atomic_load_op_code
12132 AARCH64_LDOP_PLUS, /* A + B */
12133 AARCH64_LDOP_XOR, /* A ^ B */
12134 AARCH64_LDOP_OR, /* A | B */
12135 AARCH64_LDOP_BIC /* A & ~B */
12138 /* Emit an atomic load-operate. */
12140 static void
12141 aarch64_emit_atomic_load_op (enum aarch64_atomic_load_op_code code,
12142 machine_mode mode, rtx dst, rtx src,
12143 rtx mem, rtx model)
12145 typedef rtx (*aarch64_atomic_load_op_fn) (rtx, rtx, rtx, rtx);
12146 const aarch64_atomic_load_op_fn plus[] =
12148 gen_aarch64_atomic_loadaddqi,
12149 gen_aarch64_atomic_loadaddhi,
12150 gen_aarch64_atomic_loadaddsi,
12151 gen_aarch64_atomic_loadadddi
12153 const aarch64_atomic_load_op_fn eor[] =
12155 gen_aarch64_atomic_loadeorqi,
12156 gen_aarch64_atomic_loadeorhi,
12157 gen_aarch64_atomic_loadeorsi,
12158 gen_aarch64_atomic_loadeordi
12160 const aarch64_atomic_load_op_fn ior[] =
12162 gen_aarch64_atomic_loadsetqi,
12163 gen_aarch64_atomic_loadsethi,
12164 gen_aarch64_atomic_loadsetsi,
12165 gen_aarch64_atomic_loadsetdi
12167 const aarch64_atomic_load_op_fn bic[] =
12169 gen_aarch64_atomic_loadclrqi,
12170 gen_aarch64_atomic_loadclrhi,
12171 gen_aarch64_atomic_loadclrsi,
12172 gen_aarch64_atomic_loadclrdi
12174 aarch64_atomic_load_op_fn gen;
12175 int idx = 0;
12177 switch (mode)
12179 case QImode: idx = 0; break;
12180 case HImode: idx = 1; break;
12181 case SImode: idx = 2; break;
12182 case DImode: idx = 3; break;
12183 default:
12184 gcc_unreachable ();
12187 switch (code)
12189 case AARCH64_LDOP_PLUS: gen = plus[idx]; break;
12190 case AARCH64_LDOP_XOR: gen = eor[idx]; break;
12191 case AARCH64_LDOP_OR: gen = ior[idx]; break;
12192 case AARCH64_LDOP_BIC: gen = bic[idx]; break;
12193 default:
12194 gcc_unreachable ();
12197 emit_insn (gen (dst, mem, src, model));
12200 /* Emit an atomic load+operate. CODE is the operation. OUT_DATA is the
12201 location to store the data read from memory. OUT_RESULT is the location to
12202 store the result of the operation. MEM is the memory location to read and
12203 modify. MODEL_RTX is the memory ordering to use. VALUE is the second
12204 operand for the operation. Either OUT_DATA or OUT_RESULT, but not both, can
12205 be NULL. */
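/* Worked example (illustrative): an LSE __atomic_fetch_and (p, v, m)
   arrives here with CODE == AND.  SRC is loaded with ~v, LDCLR then
   atomically performs *p &= ~SRC (i.e. *p &= v) and returns the old value
   into OUT_DATA, and if OUT_RESULT is also wanted it is recomputed as
   OUT_DATA & ~SRC via aarch64_emit_bic.  */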
12207 void
12208 aarch64_gen_atomic_ldop (enum rtx_code code, rtx out_data, rtx out_result,
12209 rtx mem, rtx value, rtx model_rtx)
12211 machine_mode mode = GET_MODE (mem);
12212 machine_mode wmode = (mode == DImode ? DImode : SImode);
12213 const bool short_mode = (mode < SImode);
12214 aarch64_atomic_load_op_code ldop_code;
12215 rtx src;
12216 rtx x;
12218 if (out_data)
12219 out_data = gen_lowpart (mode, out_data);
12221 if (out_result)
12222 out_result = gen_lowpart (mode, out_result);
12224 /* Make sure the value is in a register, putting it into a destination
12225 register if it needs to be manipulated. */
12226 if (!register_operand (value, mode)
12227 || code == AND || code == MINUS)
12229 src = out_result ? out_result : out_data;
12230 emit_move_insn (src, gen_lowpart (mode, value));
12232 else
12233 src = value;
12234 gcc_assert (register_operand (src, mode));
12236 /* Preprocess the data for the operation as necessary. If the operation is
12237 a SET then emit a swap instruction and finish. */
12238 switch (code)
12240 case SET:
12241 aarch64_emit_atomic_swap (mode, out_data, src, mem, model_rtx);
12242 return;
12244 case MINUS:
12245 /* Negate the value and treat it as a PLUS. */
12247 rtx neg_src;
12249 /* Resize the value if necessary. */
12250 if (short_mode)
12251 src = gen_lowpart (wmode, src);
12253 neg_src = gen_rtx_NEG (wmode, src);
12254 emit_insn (gen_rtx_SET (src, neg_src));
12256 if (short_mode)
12257 src = gen_lowpart (mode, src);
12259 /* Fall-through. */
12260 case PLUS:
12261 ldop_code = AARCH64_LDOP_PLUS;
12262 break;
12264 case IOR:
12265 ldop_code = AARCH64_LDOP_OR;
12266 break;
12268 case XOR:
12269 ldop_code = AARCH64_LDOP_XOR;
12270 break;
12272 case AND:
12274 rtx not_src;
12276 /* Resize the value if necessary. */
12277 if (short_mode)
12278 src = gen_lowpart (wmode, src);
12280 not_src = gen_rtx_NOT (wmode, src);
12281 emit_insn (gen_rtx_SET (src, not_src));
12283 if (short_mode)
12284 src = gen_lowpart (mode, src);
12286 ldop_code = AARCH64_LDOP_BIC;
12287 break;
12289 default:
12290 /* The operation can't be done with atomic instructions. */
12291 gcc_unreachable ();
12294 aarch64_emit_atomic_load_op (ldop_code, mode, out_data, src, mem, model_rtx);
12296 /* If necessary, calculate the data in memory after the update by redoing the
12297 operation from values in registers. */
12298 if (!out_result)
12299 return;
12301 if (short_mode)
12303 src = gen_lowpart (wmode, src);
12304 out_data = gen_lowpart (wmode, out_data);
12305 out_result = gen_lowpart (wmode, out_result);
12308 x = NULL_RTX;
12310 switch (code)
12312 case MINUS:
12313 case PLUS:
12314 x = gen_rtx_PLUS (wmode, out_data, src);
12315 break;
12316 case IOR:
12317 x = gen_rtx_IOR (wmode, out_data, src);
12318 break;
12319 case XOR:
12320 x = gen_rtx_XOR (wmode, out_data, src);
12321 break;
12322 case AND:
12323 aarch64_emit_bic (wmode, out_result, out_data, src, 0);
12324 return;
12325 default:
12326 gcc_unreachable ();
12329 emit_set_insn (out_result, x);
12331 return;
12334 /* Split an atomic operation. */
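/* For example, without LSE a relaxed __atomic_fetch_add on an SImode
   location is split into a loop along the lines of:
     1: ldxr w_old, [x_mem]
        add w_new, w_old, w_value
        stxr w_tmp, w_new, [x_mem]
        cbnz w_tmp, 1b
   with acquire/release forms of the exclusive accesses used for stronger
   memory models (illustrative sketch).  */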
12336 void
12337 aarch64_split_atomic_op (enum rtx_code code, rtx old_out, rtx new_out, rtx mem,
12338 rtx value, rtx model_rtx, rtx cond)
12340 machine_mode mode = GET_MODE (mem);
12341 machine_mode wmode = (mode == DImode ? DImode : SImode);
12342 const enum memmodel model = memmodel_from_int (INTVAL (model_rtx));
12343 const bool is_sync = is_mm_sync (model);
12344 rtx_code_label *label;
12345 rtx x;
12347 /* Split the atomic operation into a sequence. */
12348 label = gen_label_rtx ();
12349 emit_label (label);
12351 if (new_out)
12352 new_out = gen_lowpart (wmode, new_out);
12353 if (old_out)
12354 old_out = gen_lowpart (wmode, old_out);
12355 else
12356 old_out = new_out;
12357 value = simplify_gen_subreg (wmode, value, mode, 0);
12359 /* The initial load can be relaxed for a __sync operation since a final
12360 barrier will be emitted to stop code hoisting. */
12361 if (is_sync)
12362 aarch64_emit_load_exclusive (mode, old_out, mem,
12363 GEN_INT (MEMMODEL_RELAXED));
12364 else
12365 aarch64_emit_load_exclusive (mode, old_out, mem, model_rtx);
12367 switch (code)
12369 case SET:
12370 new_out = value;
12371 break;
12373 case NOT:
12374 x = gen_rtx_AND (wmode, old_out, value);
12375 emit_insn (gen_rtx_SET (new_out, x));
12376 x = gen_rtx_NOT (wmode, new_out);
12377 emit_insn (gen_rtx_SET (new_out, x));
12378 break;
12380 case MINUS:
12381 if (CONST_INT_P (value))
12383 value = GEN_INT (-INTVAL (value));
12384 code = PLUS;
12386 /* Fall through. */
12388 default:
12389 x = gen_rtx_fmt_ee (code, wmode, old_out, value);
12390 emit_insn (gen_rtx_SET (new_out, x));
12391 break;
12394 aarch64_emit_store_exclusive (mode, cond, mem,
12395 gen_lowpart (mode, new_out), model_rtx);
12397 x = gen_rtx_NE (VOIDmode, cond, const0_rtx);
12398 x = gen_rtx_IF_THEN_ELSE (VOIDmode, x,
12399 gen_rtx_LABEL_REF (Pmode, label), pc_rtx);
12400 aarch64_emit_unlikely_jump (gen_rtx_SET (pc_rtx, x));
12402 /* Emit any final barrier needed for a __sync operation. */
12403 if (is_sync)
12404 aarch64_emit_post_barrier (model);
12407 static void
12408 aarch64_init_libfuncs (void)
12410 /* Half-precision float operations. The compiler handles all operations
12411 with NULL libfuncs by converting to SFmode. */
12413 /* Conversions. */
12414 set_conv_libfunc (trunc_optab, HFmode, SFmode, "__gnu_f2h_ieee");
12415 set_conv_libfunc (sext_optab, SFmode, HFmode, "__gnu_h2f_ieee");
12417 /* Arithmetic. */
12418 set_optab_libfunc (add_optab, HFmode, NULL);
12419 set_optab_libfunc (sdiv_optab, HFmode, NULL);
12420 set_optab_libfunc (smul_optab, HFmode, NULL);
12421 set_optab_libfunc (neg_optab, HFmode, NULL);
12422 set_optab_libfunc (sub_optab, HFmode, NULL);
12424 /* Comparisons. */
12425 set_optab_libfunc (eq_optab, HFmode, NULL);
12426 set_optab_libfunc (ne_optab, HFmode, NULL);
12427 set_optab_libfunc (lt_optab, HFmode, NULL);
12428 set_optab_libfunc (le_optab, HFmode, NULL);
12429 set_optab_libfunc (ge_optab, HFmode, NULL);
12430 set_optab_libfunc (gt_optab, HFmode, NULL);
12431 set_optab_libfunc (unord_optab, HFmode, NULL);
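/* With the optabs cleared above, a plain HFmode operation such as __fp16
   addition is expanded by widening both operands to SFmode with
   __gnu_h2f_ieee, performing the operation in SFmode, and narrowing the
   result back with __gnu_f2h_ieee.  */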
12434 /* Target hook for c_mode_for_suffix. */
12435 static machine_mode
12436 aarch64_c_mode_for_suffix (char suffix)
12438 if (suffix == 'q')
12439 return TFmode;
12441 return VOIDmode;
12444 /* We can only represent floating point constants which will fit in
12445 "quarter-precision" values. These values are characterised by
12446 a sign bit, a 4-bit mantissa and a 3-bit exponent, and are given by:
12449 (-1)^s * (n/16) * 2^r
12451 Where:
12452 's' is the sign bit.
12453 'n' is an integer in the range 16 <= n <= 31.
12454 'r' is an integer in the range -3 <= r <= 4. */
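/* For instance, 0.5 is representable as (-1)^0 * (16/16) * 2^-1 and 17.0
   as (-1)^0 * (17/16) * 2^4; the representable magnitudes range from
   0.125 (n = 16, r = -3) up to 31.0 (n = 31, r = 4).  */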
12456 /* Return true iff X can be represented by a quarter-precision
12457 floating point immediate operand. Note, we cannot represent 0.0. */
12458 bool
12459 aarch64_float_const_representable_p (rtx x)
12461 /* This represents our current view of how many bits
12462 make up the mantissa. */
12463 int point_pos = 2 * HOST_BITS_PER_WIDE_INT - 1;
12464 int exponent;
12465 unsigned HOST_WIDE_INT mantissa, mask;
12466 REAL_VALUE_TYPE r, m;
12467 bool fail;
12469 if (!CONST_DOUBLE_P (x))
12470 return false;
12472 /* We don't support HFmode constants yet. */
12473 if (GET_MODE (x) == VOIDmode || GET_MODE (x) == HFmode)
12474 return false;
12476 r = *CONST_DOUBLE_REAL_VALUE (x);
12478 /* We cannot represent infinities, NaNs or +/-zero. We won't
12479 know if we have +zero until we analyse the mantissa, but we
12480 can reject the other invalid values. */
12481 if (REAL_VALUE_ISINF (r) || REAL_VALUE_ISNAN (r)
12482 || REAL_VALUE_MINUS_ZERO (r))
12483 return false;
12485 /* Extract exponent. */
12486 r = real_value_abs (&r);
12487 exponent = REAL_EXP (&r);
12489 /* For the mantissa, we expand into two HOST_WIDE_INTS, apart from the
12490 highest (sign) bit, with a fixed binary point at bit point_pos.
12491 The low half of W holds the low part of the mantissa, the high half the rest.
12492 WARNING: If we ever have a representation using more than 2 * H_W_I - 1
12493 bits for the mantissa, this can fail (low bits will be lost). */
12494 real_ldexp (&m, &r, point_pos - exponent);
12495 wide_int w = real_to_integer (&m, &fail, HOST_BITS_PER_WIDE_INT * 2);
12497 /* If the low part of the mantissa has bits set we cannot represent
12498 the value. */
12499 if (w.ulow () != 0)
12500 return false;
12501 /* We have rejected the lower HOST_WIDE_INT, so update our
12502 understanding of how many bits lie in the mantissa and
12503 look only at the high HOST_WIDE_INT. */
12504 mantissa = w.elt (1);
12505 point_pos -= HOST_BITS_PER_WIDE_INT;
12507 /* We can only represent values with a mantissa of the form 1.xxxx. */
12508 mask = ((unsigned HOST_WIDE_INT)1 << (point_pos - 5)) - 1;
12509 if ((mantissa & mask) != 0)
12510 return false;
12512 /* Having filtered unrepresentable values, we may now remove all
12513 but the highest 5 bits. */
12514 mantissa >>= point_pos - 5;
12516 /* We cannot represent the value 0.0, so reject it. This is handled
12517 elsewhere. */
12518 if (mantissa == 0)
12519 return false;
12521 /* Then, as bit 4 is always set, we can mask it off, leaving
12522 the mantissa in the range [0, 15]. */
12523 mantissa &= ~(1 << 4);
12524 gcc_assert (mantissa <= 15);
12526 /* GCC internally does not use IEEE754-like encoding (where normalized
12527 significands are in the range [1, 2)). GCC uses [0.5, 1) (see real.c).
12528 Our mantissa values are shifted 4 places to the left relative to
12529 normalized IEEE754 so we must modify the exponent returned by REAL_EXP
12530 by 5 places to correct for GCC's representation. */
12531 exponent = 5 - exponent;
12533 return (exponent >= 0 && exponent <= 7);
12536 char*
12537 aarch64_output_simd_mov_immediate (rtx const_vector,
12538 machine_mode mode,
12539 unsigned width)
12541 bool is_valid;
12542 static char templ[40];
12543 const char *mnemonic;
12544 const char *shift_op;
12545 unsigned int lane_count = 0;
12546 char element_char;
12548 struct simd_immediate_info info = { NULL_RTX, 0, 0, false, false };
12550 /* This will return true to show that const_vector is legal for use as
12551 an AdvSIMD MOVI (or, implicitly, MVNI) instruction immediate. It will
12552 also update INFO to show how the immediate should be generated. */
12553 is_valid = aarch64_simd_valid_immediate (const_vector, mode, false, &info);
12554 gcc_assert (is_valid);
12556 element_char = sizetochar (info.element_width);
12557 lane_count = width / info.element_width;
12559 mode = GET_MODE_INNER (mode);
12560 if (GET_MODE_CLASS (mode) == MODE_FLOAT)
12562 gcc_assert (info.shift == 0 && ! info.mvn);
12563 /* For FP zero change it to a CONST_INT 0 and use the integer SIMD
12564 move immediate path. */
12565 if (aarch64_float_const_zero_rtx_p (info.value))
12566 info.value = GEN_INT (0);
12567 else
12569 const unsigned int buf_size = 20;
12570 char float_buf[buf_size] = {'\0'};
12571 real_to_decimal_for_mode (float_buf,
12572 CONST_DOUBLE_REAL_VALUE (info.value),
12573 buf_size, buf_size, 1, mode);
12575 if (lane_count == 1)
12576 snprintf (templ, sizeof (templ), "fmov\t%%d0, %s", float_buf);
12577 else
12578 snprintf (templ, sizeof (templ), "fmov\t%%0.%d%c, %s",
12579 lane_count, element_char, float_buf);
12580 return templ;
12584 mnemonic = info.mvn ? "mvni" : "movi";
12585 shift_op = info.msl ? "msl" : "lsl";
12587 gcc_assert (CONST_INT_P (info.value));
12588 if (lane_count == 1)
12589 snprintf (templ, sizeof (templ), "%s\t%%d0, " HOST_WIDE_INT_PRINT_HEX,
12590 mnemonic, UINTVAL (info.value));
12591 else if (info.shift)
12592 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX
12593 ", %s %d", mnemonic, lane_count, element_char,
12594 UINTVAL (info.value), shift_op, info.shift);
12595 else
12596 snprintf (templ, sizeof (templ), "%s\t%%0.%d%c, " HOST_WIDE_INT_PRINT_HEX,
12597 mnemonic, lane_count, element_char, UINTVAL (info.value));
12598 return templ;
12601 char*
12602 aarch64_output_scalar_simd_mov_immediate (rtx immediate,
12603 machine_mode mode)
12605 machine_mode vmode;
12607 gcc_assert (!VECTOR_MODE_P (mode));
12608 vmode = aarch64_simd_container_mode (mode, 64);
12609 rtx v_op = aarch64_simd_gen_const_vector_dup (vmode, INTVAL (immediate));
12610 return aarch64_output_simd_mov_immediate (v_op, vmode, 64);
12613 /* Split operands into moves from op[1] + op[2] into op[0]. */
12615 void
12616 aarch64_split_combinev16qi (rtx operands[3])
12618 unsigned int dest = REGNO (operands[0]);
12619 unsigned int src1 = REGNO (operands[1]);
12620 unsigned int src2 = REGNO (operands[2]);
12621 machine_mode halfmode = GET_MODE (operands[1]);
12622 unsigned int halfregs = HARD_REGNO_NREGS (src1, halfmode);
12623 rtx destlo, desthi;
12625 gcc_assert (halfmode == V16QImode);
12627 if (src1 == dest && src2 == dest + halfregs)
12629 /* No-op move. Can't split to nothing; emit something. */
12630 emit_note (NOTE_INSN_DELETED);
12631 return;
12634 /* Preserve register attributes for variable tracking. */
12635 destlo = gen_rtx_REG_offset (operands[0], halfmode, dest, 0);
12636 desthi = gen_rtx_REG_offset (operands[0], halfmode, dest + halfregs,
12637 GET_MODE_SIZE (halfmode));
12639 /* Special case of reversed high/low parts. */
12640 if (reg_overlap_mentioned_p (operands[2], destlo)
12641 && reg_overlap_mentioned_p (operands[1], desthi))
12643 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12644 emit_insn (gen_xorv16qi3 (operands[2], operands[1], operands[2]));
12645 emit_insn (gen_xorv16qi3 (operands[1], operands[1], operands[2]));
12647 else if (!reg_overlap_mentioned_p (operands[2], destlo))
12649 /* Try to avoid unnecessary moves if part of the result
12650 is in the right place already. */
12651 if (src1 != dest)
12652 emit_move_insn (destlo, operands[1]);
12653 if (src2 != dest + halfregs)
12654 emit_move_insn (desthi, operands[2]);
12656 else
12658 if (src2 != dest + halfregs)
12659 emit_move_insn (desthi, operands[2]);
12660 if (src1 != dest)
12661 emit_move_insn (destlo, operands[1]);
12665 /* vec_perm support. */
12667 #define MAX_VECT_LEN 16
12669 struct expand_vec_perm_d
12671 rtx target, op0, op1;
12672 unsigned char perm[MAX_VECT_LEN];
12673 machine_mode vmode;
12674 unsigned char nelt;
12675 bool one_vector_p;
12676 bool testing_p;
12679 /* Generate a variable permutation. */
12681 static void
12682 aarch64_expand_vec_perm_1 (rtx target, rtx op0, rtx op1, rtx sel)
12684 machine_mode vmode = GET_MODE (target);
12685 bool one_vector_p = rtx_equal_p (op0, op1);
12687 gcc_checking_assert (vmode == V8QImode || vmode == V16QImode);
12688 gcc_checking_assert (GET_MODE (op0) == vmode);
12689 gcc_checking_assert (GET_MODE (op1) == vmode);
12690 gcc_checking_assert (GET_MODE (sel) == vmode);
12691 gcc_checking_assert (TARGET_SIMD);
12693 if (one_vector_p)
12695 if (vmode == V8QImode)
12697 /* Expand the argument to a V16QI mode by duplicating it. */
12698 rtx pair = gen_reg_rtx (V16QImode);
12699 emit_insn (gen_aarch64_combinev8qi (pair, op0, op0));
12700 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12702 else
12704 emit_insn (gen_aarch64_tbl1v16qi (target, op0, sel));
12707 else
12709 rtx pair;
12711 if (vmode == V8QImode)
12713 pair = gen_reg_rtx (V16QImode);
12714 emit_insn (gen_aarch64_combinev8qi (pair, op0, op1));
12715 emit_insn (gen_aarch64_tbl1v8qi (target, pair, sel));
12717 else
12719 pair = gen_reg_rtx (OImode);
12720 emit_insn (gen_aarch64_combinev16qi (pair, op0, op1));
12721 emit_insn (gen_aarch64_tbl2v16qi (target, pair, sel));
12726 void
12727 aarch64_expand_vec_perm (rtx target, rtx op0, rtx op1, rtx sel)
12729 machine_mode vmode = GET_MODE (target);
12730 unsigned int nelt = GET_MODE_NUNITS (vmode);
12731 bool one_vector_p = rtx_equal_p (op0, op1);
12732 rtx mask;
12734 /* The TBL instruction does not use a modulo index, so we must take care
12735 of that ourselves. */
12736 mask = aarch64_simd_gen_const_vector_dup (vmode,
12737 one_vector_p ? nelt - 1 : 2 * nelt - 1);
12738 sel = expand_simple_binop (vmode, AND, sel, mask, NULL, 0, OPTAB_LIB_WIDEN);
12740 /* For big-endian, we also need to reverse the index within the vector
12741 (but not which vector). */
12742 if (BYTES_BIG_ENDIAN)
12744 /* If one_vector_p, mask is a vector of (nelt - 1)'s already. */
12745 if (!one_vector_p)
12746 mask = aarch64_simd_gen_const_vector_dup (vmode, nelt - 1);
12747 sel = expand_simple_binop (vmode, XOR, sel, mask,
12748 NULL, 0, OPTAB_LIB_WIDEN);
12750 aarch64_expand_vec_perm_1 (target, op0, op1, sel);
12753 /* Recognize patterns suitable for the TRN instructions. */
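/* Example: on a V4SImode pair {a0 a1 a2 a3} / {b0 b1 b2 b3}, the permute
   index vector {0, 4, 2, 6} selects {a0 b0 a2 b2} and is matched as TRN1,
   while {1, 5, 3, 7} selects {a1 b1 a3 b3} and becomes TRN2.  */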
12754 static bool
12755 aarch64_evpc_trn (struct expand_vec_perm_d *d)
12757 unsigned int i, odd, mask, nelt = d->nelt;
12758 rtx out, in0, in1, x;
12759 rtx (*gen) (rtx, rtx, rtx);
12760 machine_mode vmode = d->vmode;
12762 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12763 return false;
12765 /* Note that these are little-endian tests.
12766 We correct for big-endian later. */
12767 if (d->perm[0] == 0)
12768 odd = 0;
12769 else if (d->perm[0] == 1)
12770 odd = 1;
12771 else
12772 return false;
12773 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12775 for (i = 0; i < nelt; i += 2)
12777 if (d->perm[i] != i + odd)
12778 return false;
12779 if (d->perm[i + 1] != ((i + nelt + odd) & mask))
12780 return false;
12783 /* Success! */
12784 if (d->testing_p)
12785 return true;
12787 in0 = d->op0;
12788 in1 = d->op1;
12789 if (BYTES_BIG_ENDIAN)
12791 x = in0, in0 = in1, in1 = x;
12792 odd = !odd;
12794 out = d->target;
12796 if (odd)
12798 switch (vmode)
12800 case V16QImode: gen = gen_aarch64_trn2v16qi; break;
12801 case V8QImode: gen = gen_aarch64_trn2v8qi; break;
12802 case V8HImode: gen = gen_aarch64_trn2v8hi; break;
12803 case V4HImode: gen = gen_aarch64_trn2v4hi; break;
12804 case V4SImode: gen = gen_aarch64_trn2v4si; break;
12805 case V2SImode: gen = gen_aarch64_trn2v2si; break;
12806 case V2DImode: gen = gen_aarch64_trn2v2di; break;
12807 case V4HFmode: gen = gen_aarch64_trn2v4hf; break;
12808 case V8HFmode: gen = gen_aarch64_trn2v8hf; break;
12809 case V4SFmode: gen = gen_aarch64_trn2v4sf; break;
12810 case V2SFmode: gen = gen_aarch64_trn2v2sf; break;
12811 case V2DFmode: gen = gen_aarch64_trn2v2df; break;
12812 default:
12813 return false;
12816 else
12818 switch (vmode)
12820 case V16QImode: gen = gen_aarch64_trn1v16qi; break;
12821 case V8QImode: gen = gen_aarch64_trn1v8qi; break;
12822 case V8HImode: gen = gen_aarch64_trn1v8hi; break;
12823 case V4HImode: gen = gen_aarch64_trn1v4hi; break;
12824 case V4SImode: gen = gen_aarch64_trn1v4si; break;
12825 case V2SImode: gen = gen_aarch64_trn1v2si; break;
12826 case V2DImode: gen = gen_aarch64_trn1v2di; break;
12827 case V4HFmode: gen = gen_aarch64_trn1v4hf; break;
12828 case V8HFmode: gen = gen_aarch64_trn1v8hf; break;
12829 case V4SFmode: gen = gen_aarch64_trn1v4sf; break;
12830 case V2SFmode: gen = gen_aarch64_trn1v2sf; break;
12831 case V2DFmode: gen = gen_aarch64_trn1v2df; break;
12832 default:
12833 return false;
12837 emit_insn (gen (out, in0, in1));
12838 return true;
12841 /* Recognize patterns suitable for the UZP instructions. */
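/* Example: for V4SImode inputs {a0 a1 a2 a3} / {b0 b1 b2 b3}, the index
   vector {0, 2, 4, 6} selects the even elements {a0 a2 b0 b2} and is
   matched as UZP1, while {1, 3, 5, 7} selects the odd elements and
   becomes UZP2.  */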
12842 static bool
12843 aarch64_evpc_uzp (struct expand_vec_perm_d *d)
12845 unsigned int i, odd, mask, nelt = d->nelt;
12846 rtx out, in0, in1, x;
12847 rtx (*gen) (rtx, rtx, rtx);
12848 machine_mode vmode = d->vmode;
12850 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12851 return false;
12853 /* Note that these are little-endian tests.
12854 We correct for big-endian later. */
12855 if (d->perm[0] == 0)
12856 odd = 0;
12857 else if (d->perm[0] == 1)
12858 odd = 1;
12859 else
12860 return false;
12861 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12863 for (i = 0; i < nelt; i++)
12865 unsigned elt = (i * 2 + odd) & mask;
12866 if (d->perm[i] != elt)
12867 return false;
12870 /* Success! */
12871 if (d->testing_p)
12872 return true;
12874 in0 = d->op0;
12875 in1 = d->op1;
12876 if (BYTES_BIG_ENDIAN)
12878 x = in0, in0 = in1, in1 = x;
12879 odd = !odd;
12881 out = d->target;
12883 if (odd)
12885 switch (vmode)
12887 case V16QImode: gen = gen_aarch64_uzp2v16qi; break;
12888 case V8QImode: gen = gen_aarch64_uzp2v8qi; break;
12889 case V8HImode: gen = gen_aarch64_uzp2v8hi; break;
12890 case V4HImode: gen = gen_aarch64_uzp2v4hi; break;
12891 case V4SImode: gen = gen_aarch64_uzp2v4si; break;
12892 case V2SImode: gen = gen_aarch64_uzp2v2si; break;
12893 case V2DImode: gen = gen_aarch64_uzp2v2di; break;
12894 case V4HFmode: gen = gen_aarch64_uzp2v4hf; break;
12895 case V8HFmode: gen = gen_aarch64_uzp2v8hf; break;
12896 case V4SFmode: gen = gen_aarch64_uzp2v4sf; break;
12897 case V2SFmode: gen = gen_aarch64_uzp2v2sf; break;
12898 case V2DFmode: gen = gen_aarch64_uzp2v2df; break;
12899 default:
12900 return false;
12903 else
12905 switch (vmode)
12907 case V16QImode: gen = gen_aarch64_uzp1v16qi; break;
12908 case V8QImode: gen = gen_aarch64_uzp1v8qi; break;
12909 case V8HImode: gen = gen_aarch64_uzp1v8hi; break;
12910 case V4HImode: gen = gen_aarch64_uzp1v4hi; break;
12911 case V4SImode: gen = gen_aarch64_uzp1v4si; break;
12912 case V2SImode: gen = gen_aarch64_uzp1v2si; break;
12913 case V2DImode: gen = gen_aarch64_uzp1v2di; break;
12914 case V4HFmode: gen = gen_aarch64_uzp1v4hf; break;
12915 case V8HFmode: gen = gen_aarch64_uzp1v8hf; break;
12916 case V4SFmode: gen = gen_aarch64_uzp1v4sf; break;
12917 case V2SFmode: gen = gen_aarch64_uzp1v2sf; break;
12918 case V2DFmode: gen = gen_aarch64_uzp1v2df; break;
12919 default:
12920 return false;
12924 emit_insn (gen (out, in0, in1));
12925 return true;
12928 /* Recognize patterns suitable for the ZIP instructions. */
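/* Example: for V4SImode inputs {a0 a1 a2 a3} / {b0 b1 b2 b3}, the index
   vector {0, 4, 1, 5} interleaves the low halves as {a0 b0 a1 b1} and is
   matched as ZIP1, while {2, 6, 3, 7} interleaves the high halves and
   becomes ZIP2.  */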
12929 static bool
12930 aarch64_evpc_zip (struct expand_vec_perm_d *d)
12932 unsigned int i, high, mask, nelt = d->nelt;
12933 rtx out, in0, in1, x;
12934 rtx (*gen) (rtx, rtx, rtx);
12935 machine_mode vmode = d->vmode;
12937 if (GET_MODE_UNIT_SIZE (vmode) > 8)
12938 return false;
12940 /* Note that these are little-endian tests.
12941 We correct for big-endian later. */
12942 high = nelt / 2;
12943 if (d->perm[0] == high)
12944 /* Do Nothing. */
12946 else if (d->perm[0] == 0)
12947 high = 0;
12948 else
12949 return false;
12950 mask = (d->one_vector_p ? nelt - 1 : 2 * nelt - 1);
12952 for (i = 0; i < nelt / 2; i++)
12954 unsigned elt = (i + high) & mask;
12955 if (d->perm[i * 2] != elt)
12956 return false;
12957 elt = (elt + nelt) & mask;
12958 if (d->perm[i * 2 + 1] != elt)
12959 return false;
12962 /* Success! */
12963 if (d->testing_p)
12964 return true;
12966 in0 = d->op0;
12967 in1 = d->op1;
12968 if (BYTES_BIG_ENDIAN)
12970 x = in0, in0 = in1, in1 = x;
12971 high = !high;
12973 out = d->target;
12975 if (high)
12977 switch (vmode)
12979 case V16QImode: gen = gen_aarch64_zip2v16qi; break;
12980 case V8QImode: gen = gen_aarch64_zip2v8qi; break;
12981 case V8HImode: gen = gen_aarch64_zip2v8hi; break;
12982 case V4HImode: gen = gen_aarch64_zip2v4hi; break;
12983 case V4SImode: gen = gen_aarch64_zip2v4si; break;
12984 case V2SImode: gen = gen_aarch64_zip2v2si; break;
12985 case V2DImode: gen = gen_aarch64_zip2v2di; break;
12986 case V4HFmode: gen = gen_aarch64_zip2v4hf; break;
12987 case V8HFmode: gen = gen_aarch64_zip2v8hf; break;
12988 case V4SFmode: gen = gen_aarch64_zip2v4sf; break;
12989 case V2SFmode: gen = gen_aarch64_zip2v2sf; break;
12990 case V2DFmode: gen = gen_aarch64_zip2v2df; break;
12991 default:
12992 return false;
12995 else
12997 switch (vmode)
12999 case V16QImode: gen = gen_aarch64_zip1v16qi; break;
13000 case V8QImode: gen = gen_aarch64_zip1v8qi; break;
13001 case V8HImode: gen = gen_aarch64_zip1v8hi; break;
13002 case V4HImode: gen = gen_aarch64_zip1v4hi; break;
13003 case V4SImode: gen = gen_aarch64_zip1v4si; break;
13004 case V2SImode: gen = gen_aarch64_zip1v2si; break;
13005 case V2DImode: gen = gen_aarch64_zip1v2di; break;
13006 case V4HFmode: gen = gen_aarch64_zip1v4hf; break;
13007 case V8HFmode: gen = gen_aarch64_zip1v8hf; break;
13008 case V4SFmode: gen = gen_aarch64_zip1v4sf; break;
13009 case V2SFmode: gen = gen_aarch64_zip1v2sf; break;
13010 case V2DFmode: gen = gen_aarch64_zip1v2df; break;
13011 default:
13012 return false;
13016 emit_insn (gen (out, in0, in1));
13017 return true;
13020 /* Recognize patterns for the EXT insn. */
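/* Example: for a V4SImode pair, the index vector {1, 2, 3, 4} is matched
   as an EXT of the two inputs with an element offset of 1, i.e. the
   concatenated value shifted down by one element.  */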
13022 static bool
13023 aarch64_evpc_ext (struct expand_vec_perm_d *d)
13025 unsigned int i, nelt = d->nelt;
13026 rtx (*gen) (rtx, rtx, rtx, rtx);
13027 rtx offset;
13029 unsigned int location = d->perm[0]; /* Always < nelt. */
13031 /* Check if the extracted indices are increasing by one. */
13032 for (i = 1; i < nelt; i++)
13034 unsigned int required = location + i;
13035 if (d->one_vector_p)
13037 /* We'll pass the same vector in twice, so allow indices to wrap. */
13038 required &= (nelt - 1);
13040 if (d->perm[i] != required)
13041 return false;
13044 switch (d->vmode)
13046 case V16QImode: gen = gen_aarch64_extv16qi; break;
13047 case V8QImode: gen = gen_aarch64_extv8qi; break;
13048 case V4HImode: gen = gen_aarch64_extv4hi; break;
13049 case V8HImode: gen = gen_aarch64_extv8hi; break;
13050 case V2SImode: gen = gen_aarch64_extv2si; break;
13051 case V4SImode: gen = gen_aarch64_extv4si; break;
13052 case V4HFmode: gen = gen_aarch64_extv4hf; break;
13053 case V8HFmode: gen = gen_aarch64_extv8hf; break;
13054 case V2SFmode: gen = gen_aarch64_extv2sf; break;
13055 case V4SFmode: gen = gen_aarch64_extv4sf; break;
13056 case V2DImode: gen = gen_aarch64_extv2di; break;
13057 case V2DFmode: gen = gen_aarch64_extv2df; break;
13058 default:
13059 return false;
13062 /* Success! */
13063 if (d->testing_p)
13064 return true;
13066 /* The case where (location == 0) is a no-op for both big- and little-endian,
13067 and is removed by the mid-end at optimization levels -O1 and higher. */
13069 if (BYTES_BIG_ENDIAN && (location != 0))
13071 /* After setup, we want the high elements of the first vector (stored
13072 at the LSB end of the register), and the low elements of the second
13073 vector (stored at the MSB end of the register). So swap. */
13074 std::swap (d->op0, d->op1);
13075 /* location != 0 (above), so safe to assume (nelt - location) < nelt. */
13076 location = nelt - location;
13079 offset = GEN_INT (location);
13080 emit_insn (gen (d->target, d->op0, d->op1, offset));
13081 return true;
13084 /* Recognize patterns for the REV insns. */
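/* Example: on V4SImode the single-input index vector {1, 0, 3, 2}
   (diff == 1) swaps adjacent 32-bit elements within each 64-bit chunk and
   is matched as REV64 on .4s elements.  */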
13086 static bool
13087 aarch64_evpc_rev (struct expand_vec_perm_d *d)
13089 unsigned int i, j, diff, nelt = d->nelt;
13090 rtx (*gen) (rtx, rtx);
13092 if (!d->one_vector_p)
13093 return false;
13095 diff = d->perm[0];
13096 switch (diff)
13098 case 7:
13099 switch (d->vmode)
13101 case V16QImode: gen = gen_aarch64_rev64v16qi; break;
13102 case V8QImode: gen = gen_aarch64_rev64v8qi; break;
13103 default:
13104 return false;
13106 break;
13107 case 3:
13108 switch (d->vmode)
13110 case V16QImode: gen = gen_aarch64_rev32v16qi; break;
13111 case V8QImode: gen = gen_aarch64_rev32v8qi; break;
13112 case V8HImode: gen = gen_aarch64_rev64v8hi; break;
13113 case V4HImode: gen = gen_aarch64_rev64v4hi; break;
13114 default:
13115 return false;
13117 break;
13118 case 1:
13119 switch (d->vmode)
13121 case V16QImode: gen = gen_aarch64_rev16v16qi; break;
13122 case V8QImode: gen = gen_aarch64_rev16v8qi; break;
13123 case V8HImode: gen = gen_aarch64_rev32v8hi; break;
13124 case V4HImode: gen = gen_aarch64_rev32v4hi; break;
13125 case V4SImode: gen = gen_aarch64_rev64v4si; break;
13126 case V2SImode: gen = gen_aarch64_rev64v2si; break;
13127 case V4SFmode: gen = gen_aarch64_rev64v4sf; break;
13128 case V2SFmode: gen = gen_aarch64_rev64v2sf; break;
13129 case V8HFmode: gen = gen_aarch64_rev64v8hf; break;
13130 case V4HFmode: gen = gen_aarch64_rev64v4hf; break;
13131 default:
13132 return false;
13134 break;
13135 default:
13136 return false;
13139 for (i = 0; i < nelt ; i += diff + 1)
13140 for (j = 0; j <= diff; j += 1)
13142 /* This is guaranteed to be true as the value of diff
13143 is 7, 3 or 1, and we should have enough elements in the
13144 queue to generate this. Getting a vector mask with any
13145 other value of diff implies that something has gone
13146 wrong by the time we get here. */
13147 gcc_assert (i + j < nelt);
13148 if (d->perm[i + j] != i + diff - j)
13149 return false;
13152 /* Success! */
13153 if (d->testing_p)
13154 return true;
13156 emit_insn (gen (d->target, d->op0));
13157 return true;
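/* The function below recognizes a broadcast of a single lane: for
   instance, the single-input V4SImode index vector {2, 2, 2, 2} is
   matched as a DUP of lane 2 into all four lanes.  */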
13160 static bool
13161 aarch64_evpc_dup (struct expand_vec_perm_d *d)
13163 rtx (*gen) (rtx, rtx, rtx);
13164 rtx out = d->target;
13165 rtx in0;
13166 machine_mode vmode = d->vmode;
13167 unsigned int i, elt, nelt = d->nelt;
13168 rtx lane;
13170 elt = d->perm[0];
13171 for (i = 1; i < nelt; i++)
13173 if (elt != d->perm[i])
13174 return false;
13177 /* The generic preparation in aarch64_expand_vec_perm_const_1
13178 swaps the operand order and the permute indices if it finds
13179 d->perm[0] to be in the second operand. Thus, we can always
13180 use d->op0 and need not do any extra arithmetic to get the
13181 correct lane number. */
13182 in0 = d->op0;
13183 lane = GEN_INT (elt); /* The pattern corrects for big-endian. */
13185 switch (vmode)
13187 case V16QImode: gen = gen_aarch64_dup_lanev16qi; break;
13188 case V8QImode: gen = gen_aarch64_dup_lanev8qi; break;
13189 case V8HImode: gen = gen_aarch64_dup_lanev8hi; break;
13190 case V4HImode: gen = gen_aarch64_dup_lanev4hi; break;
13191 case V4SImode: gen = gen_aarch64_dup_lanev4si; break;
13192 case V2SImode: gen = gen_aarch64_dup_lanev2si; break;
13193 case V2DImode: gen = gen_aarch64_dup_lanev2di; break;
13194 case V8HFmode: gen = gen_aarch64_dup_lanev8hf; break;
13195 case V4HFmode: gen = gen_aarch64_dup_lanev4hf; break;
13196 case V4SFmode: gen = gen_aarch64_dup_lanev4sf; break;
13197 case V2SFmode: gen = gen_aarch64_dup_lanev2sf; break;
13198 case V2DFmode: gen = gen_aarch64_dup_lanev2df; break;
13199 default:
13200 return false;
13203 emit_insn (gen (out, in0, lane));
13204 return true;
13207 static bool
13208 aarch64_evpc_tbl (struct expand_vec_perm_d *d)
13210 rtx rperm[MAX_VECT_LEN], sel;
13211 machine_mode vmode = d->vmode;
13212 unsigned int i, nelt = d->nelt;
13214 if (d->testing_p)
13215 return true;
13217 /* Generic code will try constant permutation twice: once with the
13218 original mode and again with the elements lowered to QImode.
13219 So wait and don't do the selector expansion ourselves. */
13220 if (vmode != V8QImode && vmode != V16QImode)
13221 return false;
13223 for (i = 0; i < nelt; ++i)
13225 int nunits = GET_MODE_NUNITS (vmode);
13227 /* If big-endian and two vectors, we end up with a weird mixed-endian
13228 mode on NEON. Reverse the index within each word but not the word
13229 itself. */
13230 rperm[i] = GEN_INT (BYTES_BIG_ENDIAN ? d->perm[i] ^ (nunits - 1)
13231 : d->perm[i]);
13233 sel = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
13234 sel = force_reg (vmode, sel);
13236 aarch64_expand_vec_perm_1 (d->target, d->op0, d->op1, sel);
13237 return true;
13240 static bool
13241 aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
13243 /* The pattern matching functions above are written to look for a small
13244 number to begin the sequence (0, 1, N/2). If we begin with an index
13245 from the second operand, we can swap the operands. */
13246 if (d->perm[0] >= d->nelt)
13248 unsigned i, nelt = d->nelt;
13250 gcc_assert (nelt == (nelt & -nelt));
13251 for (i = 0; i < nelt; ++i)
13252 d->perm[i] ^= nelt; /* Keep the same index, but in the other vector. */
13254 std::swap (d->op0, d->op1);
13257 if (TARGET_SIMD)
13259 if (aarch64_evpc_rev (d))
13260 return true;
13261 else if (aarch64_evpc_ext (d))
13262 return true;
13263 else if (aarch64_evpc_dup (d))
13264 return true;
13265 else if (aarch64_evpc_zip (d))
13266 return true;
13267 else if (aarch64_evpc_uzp (d))
13268 return true;
13269 else if (aarch64_evpc_trn (d))
13270 return true;
13271 return aarch64_evpc_tbl (d);
13273 return false;
13276 /* Expand a vec_perm_const pattern. */
13278 bool
13279 aarch64_expand_vec_perm_const (rtx target, rtx op0, rtx op1, rtx sel)
13281 struct expand_vec_perm_d d;
13282 int i, nelt, which;
13284 d.target = target;
13285 d.op0 = op0;
13286 d.op1 = op1;
13288 d.vmode = GET_MODE (target);
13289 gcc_assert (VECTOR_MODE_P (d.vmode));
13290 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13291 d.testing_p = false;
13293 for (i = which = 0; i < nelt; ++i)
13295 rtx e = XVECEXP (sel, 0, i);
13296 int ei = INTVAL (e) & (2 * nelt - 1);
13297 which |= (ei < nelt ? 1 : 2);
13298 d.perm[i] = ei;
13301 switch (which)
13303 default:
13304 gcc_unreachable ();
13306 case 3:
13307 d.one_vector_p = false;
13308 if (!rtx_equal_p (op0, op1))
13309 break;
13311 /* The elements of PERM do not suggest that only the first operand
13312 is used, but both operands are identical. Allow easier matching
13313 of the permutation by folding the permutation into the single
13314 input vector. */
13315 /* Fall Through. */
13316 case 2:
13317 for (i = 0; i < nelt; ++i)
13318 d.perm[i] &= nelt - 1;
13319 d.op0 = op1;
13320 d.one_vector_p = true;
13321 break;
13323 case 1:
13324 d.op1 = op0;
13325 d.one_vector_p = true;
13326 break;
13329 return aarch64_expand_vec_perm_const_1 (&d);
13332 static bool
13333 aarch64_vectorize_vec_perm_const_ok (machine_mode vmode,
13334 const unsigned char *sel)
13336 struct expand_vec_perm_d d;
13337 unsigned int i, nelt, which;
13338 bool ret;
13340 d.vmode = vmode;
13341 d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
13342 d.testing_p = true;
13343 memcpy (d.perm, sel, nelt);
13345 /* Calculate whether all elements are in one vector. */
13346 for (i = which = 0; i < nelt; ++i)
13348 unsigned char e = d.perm[i];
13349 gcc_assert (e < 2 * nelt);
13350 which |= (e < nelt ? 1 : 2);
13353 /* If all elements are from the second vector, reindex as if from the
13354 first vector. */
13355 if (which == 2)
13356 for (i = 0; i < nelt; ++i)
13357 d.perm[i] -= nelt;
13359 /* Check whether the mask can be applied to a single vector. */
13360 d.one_vector_p = (which != 3);
13362 d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
13363 d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
13364 if (!d.one_vector_p)
13365 d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
13367 start_sequence ();
13368 ret = aarch64_expand_vec_perm_const_1 (&d);
13369 end_sequence ();
13371 return ret;
13375 rtx aarch64_reverse_mask (enum machine_mode mode)
13377 /* We have to reverse each vector because we don't have
13378 a permuted load that can reverse-load according to ABI rules. */
13379 rtx mask;
13380 rtvec v = rtvec_alloc (16);
13381 int i, j;
13382 int nunits = GET_MODE_NUNITS (mode);
13383 int usize = GET_MODE_UNIT_SIZE (mode);
13385 gcc_assert (BYTES_BIG_ENDIAN);
13386 gcc_assert (AARCH64_VALID_SIMD_QREG_MODE (mode));
13388 for (i = 0; i < nunits; i++)
13389 for (j = 0; j < usize; j++)
13390 RTVEC_ELT (v, i * usize + j) = GEN_INT ((i + 1) * usize - 1 - j);
13391 mask = gen_rtx_CONST_VECTOR (V16QImode, v);
13392 return force_reg (V16QImode, mask);
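/* For example, for V4SImode aarch64_reverse_mask builds the byte-index
   mask {3,2,1,0, 7,6,5,4, 11,10,9,8, 15,14,13,12}, which reverses the
   bytes within each 32-bit element while leaving the element order
   unchanged.  */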
13395 /* Implement MODES_TIEABLE_P. In principle we should always return true.
13396 However, due to issues with register allocation it is preferable to avoid
13397 tying integer scalar and FP scalar modes. Executing integer operations
13398 in general registers is better than treating them as scalar vector
13399 operations. This reduces latency and avoids redundant int<->FP moves.
13400 So tie modes if they are either the same class, or vector modes with
13401 other vector modes, vector structs or any scalar mode.
13404 bool
13405 aarch64_modes_tieable_p (machine_mode mode1, machine_mode mode2)
13407 if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
13408 return true;
13410 /* We specifically want to allow elements of "structure" modes to
13411 be tieable to the structure. This more general condition allows
13412 other rarer situations too. */
13413 if (aarch64_vector_mode_p (mode1) && aarch64_vector_mode_p (mode2))
13414 return true;
13416 /* Also allow any scalar modes with vectors. */
13417 if (aarch64_vector_mode_supported_p (mode1)
13418 || aarch64_vector_mode_supported_p (mode2))
13419 return true;
13421 return false;
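/* Under these rules aarch64_modes_tieable_p deliberately keeps DImode and
   DFmode untied (different classes and neither is a vector mode), while
   e.g. V4SImode and V16QImode, or V4SImode and SImode, are tieable.  */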
13424 /* Return a new RTX holding the result of moving POINTER forward by
13425 AMOUNT bytes. */
13427 static rtx
13428 aarch64_move_pointer (rtx pointer, int amount)
13430 rtx next = plus_constant (Pmode, XEXP (pointer, 0), amount);
13432 return adjust_automodify_address (pointer, GET_MODE (pointer),
13433 next, amount);
13436 /* Return a new RTX holding the result of moving POINTER forward by the
13437 size of the mode it points to. */
13439 static rtx
13440 aarch64_progress_pointer (rtx pointer)
13442 HOST_WIDE_INT amount = GET_MODE_SIZE (GET_MODE (pointer));
13444 return aarch64_move_pointer (pointer, amount);
13447 /* Copy one MODE sized block from SRC to DST, then progress SRC and DST by
13448 MODE bytes. */
13450 static void
13451 aarch64_copy_one_block_and_progress_pointers (rtx *src, rtx *dst,
13452 machine_mode mode)
13454 rtx reg = gen_reg_rtx (mode);
13456 /* "Cast" the pointers to the correct mode. */
13457 *src = adjust_address (*src, mode, 0);
13458 *dst = adjust_address (*dst, mode, 0);
13459 /* Emit the memcpy. */
13460 emit_move_insn (reg, *src);
13461 emit_move_insn (*dst, reg);
13462 /* Move the pointers forward. */
13463 *src = aarch64_progress_pointer (*src);
13464 *dst = aarch64_progress_pointer (*dst);
13467 /* Expand movmem, as if from a __builtin_memcpy. Return true if
13468 we succeed, otherwise return false. */
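/* For example, a constant 15-byte copy is expanded as one 8-byte (DImode)
   load/store followed by a second 8-byte load/store whose address is
   moved back by one byte so that it overlaps the first, avoiding any
   4/2/1-byte tail moves.  */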
13470 bool
13471 aarch64_expand_movmem (rtx *operands)
13473 unsigned int n;
13474 rtx dst = operands[0];
13475 rtx src = operands[1];
13476 rtx base;
13477 bool speed_p = !optimize_function_for_size_p (cfun);
13479 /* When optimizing for size, give a better estimate of the length of a
13480 memcpy call, but use the default otherwise. */
13481 unsigned int max_instructions = (speed_p ? 15 : AARCH64_CALL_RATIO) / 2;
13483 /* We can't do anything smart if the amount to copy is not constant. */
13484 if (!CONST_INT_P (operands[2]))
13485 return false;
13487 n = UINTVAL (operands[2]);
13489 /* Try to keep the number of instructions low. For cases below 16 bytes we
13490 need to make at most two moves. For cases above 16 bytes it will be one
13491 move for each 16 byte chunk, then at most two additional moves. */
13492 if (((n / 16) + (n % 16 ? 2 : 0)) > max_instructions)
13493 return false;
13495 base = copy_to_mode_reg (Pmode, XEXP (dst, 0));
13496 dst = adjust_automodify_address (dst, VOIDmode, base, 0);
13498 base = copy_to_mode_reg (Pmode, XEXP (src, 0));
13499 src = adjust_automodify_address (src, VOIDmode, base, 0);
13501 /* Simple cases. Copy 0-3 bytes, as (if applicable) a 2-byte, then a
13502 1-byte chunk. */
13503 if (n < 4)
13505 if (n >= 2)
13507 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13508 n -= 2;
13511 if (n == 1)
13512 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13514 return true;
13517 /* Copy 4-8 bytes. First a 4-byte chunk, then (if applicable) a second
13518 4-byte chunk, partially overlapping with the previously copied chunk. */
13519 if (n < 8)
13521 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13522 n -= 4;
13523 if (n > 0)
13525 int move = n - 4;
13527 src = aarch64_move_pointer (src, move);
13528 dst = aarch64_move_pointer (dst, move);
13529 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13531 return true;
13534 /* Copy more than 8 bytes. Copy chunks of 16 bytes until we run out of
13535 them, then (if applicable) an 8-byte chunk. */
13536 while (n >= 8)
13538 if (n / 16)
13540 aarch64_copy_one_block_and_progress_pointers (&src, &dst, TImode);
13541 n -= 16;
13543 else
13545 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13546 n -= 8;
13550 /* Finish the final bytes of the copy. We can always do this in one
13551 instruction. We either copy the exact amount we need, or partially
13552 overlap with the previous chunk we copied and copy 8 bytes. */
13553 if (n == 0)
13554 return true;
13555 else if (n == 1)
13556 aarch64_copy_one_block_and_progress_pointers (&src, &dst, QImode);
13557 else if (n == 2)
13558 aarch64_copy_one_block_and_progress_pointers (&src, &dst, HImode);
13559 else if (n == 4)
13560 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13561 else
13563 if (n == 3)
13565 src = aarch64_move_pointer (src, -1);
13566 dst = aarch64_move_pointer (dst, -1);
13567 aarch64_copy_one_block_and_progress_pointers (&src, &dst, SImode);
13569 else
13571 int move = n - 8;
13573 src = aarch64_move_pointer (src, move);
13574 dst = aarch64_move_pointer (dst, move);
13575 aarch64_copy_one_block_and_progress_pointers (&src, &dst, DImode);
13579 return true;
13582 /* Split a DImode store of a CONST_INT SRC to MEM DST as two
13583 SImode stores. Handle the case when the constant has identical
13584 bottom and top halves. This is beneficial when the two stores can be
13585 merged into an STP and we avoid synthesising potentially expensive
13586 immediates twice. Return true if such a split is possible. */
13588 bool
13589 aarch64_split_dimode_const_store (rtx dst, rtx src)
13591 rtx lo = gen_lowpart (SImode, src);
13592 rtx hi = gen_highpart_mode (SImode, DImode, src);
13594 bool size_p = optimize_function_for_size_p (cfun);
13596 if (!rtx_equal_p (lo, hi))
13597 return false;
13599 unsigned int orig_cost
13600 = aarch64_internal_mov_immediate (NULL_RTX, src, false, DImode);
13601 unsigned int lo_cost
13602 = aarch64_internal_mov_immediate (NULL_RTX, lo, false, SImode);
13604 /* We want to transform:
13605 MOV x1, 49370
13606 MOVK x1, 0x140, lsl 16
13607 MOVK x1, 0xc0da, lsl 32
13608 MOVK x1, 0x140, lsl 48
13609 STR x1, [x0]
13610 into:
13611 MOV w1, 49370
13612 MOVK w1, 0x140, lsl 16
13613 STP w1, w1, [x0]
13614 So we want to perform this only when we save two instructions
13615 or more. When optimizing for size, however, accept any code size
13616 savings we can. */
13617 if (size_p && orig_cost <= lo_cost)
13618 return false;
13620 if (!size_p
13621 && (orig_cost <= lo_cost + 1))
13622 return false;
13624 rtx mem_lo = adjust_address (dst, SImode, 0);
13625 if (!aarch64_mem_pair_operand (mem_lo, SImode))
13626 return false;
13628 rtx tmp_reg = gen_reg_rtx (SImode);
13629 aarch64_expand_mov_immediate (tmp_reg, lo);
13630 rtx mem_hi = aarch64_move_pointer (mem_lo, GET_MODE_SIZE (SImode));
13631 /* Don't emit an explicit store pair as this may not always be profitable.
13632 Let the sched-fusion logic decide whether to merge them. */
13633 emit_move_insn (mem_lo, tmp_reg);
13634 emit_move_insn (mem_hi, tmp_reg);
13636 return true;
13639 /* Implement the TARGET_ASAN_SHADOW_OFFSET hook. */
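/* AddressSanitizer computes shadow addresses as (addr >> 3) + offset, so
   returning 1 << 36 places the shadow region at that fixed offset in the
   AArch64 address space (this must match the offset libasan expects for
   the target).  */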
13641 static unsigned HOST_WIDE_INT
13642 aarch64_asan_shadow_offset (void)
13644 return (HOST_WIDE_INT_1 << 36);
13647 static bool
13648 aarch64_use_by_pieces_infrastructure_p (unsigned HOST_WIDE_INT size,
13649 unsigned int align,
13650 enum by_pieces_operation op,
13651 bool speed_p)
13653 /* STORE_BY_PIECES can be used when copying a constant string, but
13654 in that case each 64-bit chunk takes 5 insns instead of 2 (LDR/STR).
13655 For now we always fail this and let the move_by_pieces code copy
13656 the string from read-only memory. */
13657 if (op == STORE_BY_PIECES)
13658 return false;
13660 return default_use_by_pieces_infrastructure_p (size, align, op, speed_p);
13663 static rtx
13664 aarch64_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
13665 int code, tree treeop0, tree treeop1)
13667 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13668 rtx op0, op1;
13669 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13670 insn_code icode;
13671 struct expand_operand ops[4];
13673 start_sequence ();
13674 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13676 op_mode = GET_MODE (op0);
13677 if (op_mode == VOIDmode)
13678 op_mode = GET_MODE (op1);
13680 switch (op_mode)
13682 case QImode:
13683 case HImode:
13684 case SImode:
13685 cmp_mode = SImode;
13686 icode = CODE_FOR_cmpsi;
13687 break;
13689 case DImode:
13690 cmp_mode = DImode;
13691 icode = CODE_FOR_cmpdi;
13692 break;
13694 case SFmode:
13695 cmp_mode = SFmode;
13696 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13697 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpesf : CODE_FOR_fcmpsf;
13698 break;
13700 case DFmode:
13701 cmp_mode = DFmode;
13702 cc_mode = aarch64_select_cc_mode ((rtx_code) code, op0, op1);
13703 icode = cc_mode == CCFPEmode ? CODE_FOR_fcmpedf : CODE_FOR_fcmpdf;
13704 break;
13706 default:
13707 end_sequence ();
13708 return NULL_RTX;
13711 op0 = prepare_operand (icode, op0, 0, op_mode, cmp_mode, unsignedp);
13712 op1 = prepare_operand (icode, op1, 1, op_mode, cmp_mode, unsignedp);
13713 if (!op0 || !op1)
13715 end_sequence ();
13716 return NULL_RTX;
13718 *prep_seq = get_insns ();
13719 end_sequence ();
13721 create_fixed_operand (&ops[0], op0);
13722 create_fixed_operand (&ops[1], op1);
13724 start_sequence ();
13725 if (!maybe_expand_insn (icode, 2, ops))
13727 end_sequence ();
13728 return NULL_RTX;
13730 *gen_seq = get_insns ();
13731 end_sequence ();
13733 return gen_rtx_fmt_ee ((rtx_code) code, cc_mode,
13734 gen_rtx_REG (cc_mode, CC_REGNUM), const0_rtx);
13737 static rtx
13738 aarch64_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
13739 int cmp_code, tree treeop0, tree treeop1, int bit_code)
13741 rtx op0, op1, target;
13742 machine_mode op_mode, cmp_mode, cc_mode = CCmode;
13743 int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
13744 insn_code icode;
13745 struct expand_operand ops[6];
13746 int aarch64_cond;
13748 push_to_sequence (*prep_seq);
13749 expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
13751 op_mode = GET_MODE (op0);
13752 if (op_mode == VOIDmode)
13753 op_mode = GET_MODE (op1);
13755 switch (op_mode)
13757 case QImode:
13758 case HImode:
13759 case SImode:
13760 cmp_mode = SImode;
13761 icode = CODE_FOR_ccmpsi;
13762 break;
13764 case DImode:
13765 cmp_mode = DImode;
13766 icode = CODE_FOR_ccmpdi;
13767 break;
13769 case SFmode:
13770 cmp_mode = SFmode;
13771 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13772 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpesf : CODE_FOR_fccmpsf;
13773 break;
13775 case DFmode:
13776 cmp_mode = DFmode;
13777 cc_mode = aarch64_select_cc_mode ((rtx_code) cmp_code, op0, op1);
13778 icode = cc_mode == CCFPEmode ? CODE_FOR_fccmpedf : CODE_FOR_fccmpdf;
13779 break;
13781 default:
13782 end_sequence ();
13783 return NULL_RTX;
13786 op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
13787 op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
13788 if (!op0 || !op1)
13790 end_sequence ();
13791 return NULL_RTX;
13793 *prep_seq = get_insns ();
13794 end_sequence ();
13796 target = gen_rtx_REG (cc_mode, CC_REGNUM);
13797 aarch64_cond = aarch64_get_condition_code_1 (cc_mode, (rtx_code) cmp_code);
13799 if (bit_code != AND)
13801 prev = gen_rtx_fmt_ee (REVERSE_CONDITION (GET_CODE (prev),
13802 GET_MODE (XEXP (prev, 0))),
13803 VOIDmode, XEXP (prev, 0), const0_rtx);
13804 aarch64_cond = AARCH64_INVERSE_CONDITION_CODE (aarch64_cond);
13807 create_fixed_operand (&ops[0], XEXP (prev, 0));
13808 create_fixed_operand (&ops[1], target);
13809 create_fixed_operand (&ops[2], op0);
13810 create_fixed_operand (&ops[3], op1);
13811 create_fixed_operand (&ops[4], prev);
13812 create_fixed_operand (&ops[5], GEN_INT (aarch64_cond));
13814 push_to_sequence (*gen_seq);
13815 if (!maybe_expand_insn (icode, 6, ops))
13817 end_sequence ();
13818 return NULL_RTX;
13821 *gen_seq = get_insns ();
13822 end_sequence ();
13824 return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
13827 #undef TARGET_GEN_CCMP_FIRST
13828 #define TARGET_GEN_CCMP_FIRST aarch64_gen_ccmp_first
13830 #undef TARGET_GEN_CCMP_NEXT
13831 #define TARGET_GEN_CCMP_NEXT aarch64_gen_ccmp_next
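/* Illustrative note (editor's sketch, not part of the original sources):
   the two hooks above let the middle end chain comparisons into AArch64
   conditional-compare (CCMP/FCCMP) sequences.  Assuming the usual
   expansion, a source fragment such as

     if (a == 3 && b == 5)
       f ();

   would typically be emitted as something like

     cmp   w0, 3          // first compare, from aarch64_gen_ccmp_first
     ccmp  w1, 5, 0, eq   // conditional compare, from aarch64_gen_ccmp_next
     beq   .Lcall_f       // hypothetical label, for illustration only

   where the CCMP performs the second compare only if the first produced
   EQ, and otherwise sets the flags to the immediate NZCV value (0 here)
   so the final branch is not taken.  */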
13833 /* Implement TARGET_SCHED_MACRO_FUSION_P. Return true if target supports
13834 instruction fusion of some sort. */
13836 static bool
13837 aarch64_macro_fusion_p (void)
13839 return aarch64_tune_params.fusible_ops != AARCH64_FUSE_NOTHING;
13843 /* Implement TARGET_SCHED_MACRO_FUSION_PAIR_P. Return true if PREV and CURR
13844 should be kept together during scheduling. */
13846 static bool
13847 aarch_macro_fusion_pair_p (rtx_insn *prev, rtx_insn *curr)
13849 rtx set_dest;
13850 rtx prev_set = single_set (prev);
13851 rtx curr_set = single_set (curr);
13852 /* prev and curr are simple SET insns, i.e. no flag setting or branching. */
13853 bool simple_sets_p = prev_set && curr_set && !any_condjump_p (curr);
13855 if (!aarch64_macro_fusion_p ())
13856 return false;
13858 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOV_MOVK))
13860 /* We are trying to match:
13861 prev (mov) == (set (reg r0) (const_int imm16))
13862 curr (movk) == (set (zero_extract (reg r0)
13863 (const_int 16)
13864 (const_int 16))
13865 (const_int imm16_1)) */
13867 set_dest = SET_DEST (curr_set);
13869 if (GET_CODE (set_dest) == ZERO_EXTRACT
13870 && CONST_INT_P (SET_SRC (curr_set))
13871 && CONST_INT_P (SET_SRC (prev_set))
13872 && CONST_INT_P (XEXP (set_dest, 2))
13873 && INTVAL (XEXP (set_dest, 2)) == 16
13874 && REG_P (XEXP (set_dest, 0))
13875 && REG_P (SET_DEST (prev_set))
13876 && REGNO (XEXP (set_dest, 0)) == REGNO (SET_DEST (prev_set)))
13878 return true;
13882 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_ADD))
13885 /* We're trying to match:
13886 prev (adrp) == (set (reg r1)
13887 (high (symbol_ref ("SYM"))))
13888 curr (add) == (set (reg r0)
13889 (lo_sum (reg r1)
13890 (symbol_ref ("SYM"))))
13891 Note that r0 need not necessarily be the same as r1, especially
13892 during pre-regalloc scheduling. */
13894 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13895 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13897 if (GET_CODE (SET_SRC (curr_set)) == LO_SUM
13898 && REG_P (XEXP (SET_SRC (curr_set), 0))
13899 && REGNO (XEXP (SET_SRC (curr_set), 0))
13900 == REGNO (SET_DEST (prev_set))
13901 && rtx_equal_p (XEXP (SET_SRC (prev_set), 0),
13902 XEXP (SET_SRC (curr_set), 1)))
13903 return true;
13907 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_MOVK_MOVK))
13910 /* We're trying to match:
13911 prev (movk) == (set (zero_extract (reg r0)
13912 (const_int 16)
13913 (const_int 32))
13914 (const_int imm16_1))
13915 curr (movk) == (set (zero_extract (reg r0)
13916 (const_int 16)
13917 (const_int 48))
13918 (const_int imm16_2)) */
13920 if (GET_CODE (SET_DEST (prev_set)) == ZERO_EXTRACT
13921 && GET_CODE (SET_DEST (curr_set)) == ZERO_EXTRACT
13922 && REG_P (XEXP (SET_DEST (prev_set), 0))
13923 && REG_P (XEXP (SET_DEST (curr_set), 0))
13924 && REGNO (XEXP (SET_DEST (prev_set), 0))
13925 == REGNO (XEXP (SET_DEST (curr_set), 0))
13926 && CONST_INT_P (XEXP (SET_DEST (prev_set), 2))
13927 && CONST_INT_P (XEXP (SET_DEST (curr_set), 2))
13928 && INTVAL (XEXP (SET_DEST (prev_set), 2)) == 32
13929 && INTVAL (XEXP (SET_DEST (curr_set), 2)) == 48
13930 && CONST_INT_P (SET_SRC (prev_set))
13931 && CONST_INT_P (SET_SRC (curr_set)))
13932 return true;
13935 if (simple_sets_p && aarch64_fusion_enabled_p (AARCH64_FUSE_ADRP_LDR))
13937 /* We're trying to match:
13938 prev (adrp) == (set (reg r0)
13939 (high (symbol_ref ("SYM"))))
13940 curr (ldr) == (set (reg r1)
13941 (mem (lo_sum (reg r0)
13942 (symbol_ref ("SYM")))))
13944 curr (ldr) == (set (reg r1)
13945 (zero_extend (mem
13946 (lo_sum (reg r0)
13947 (symbol_ref ("SYM")))))) */
13948 if (satisfies_constraint_Ush (SET_SRC (prev_set))
13949 && REG_P (SET_DEST (prev_set)) && REG_P (SET_DEST (curr_set)))
13951 rtx curr_src = SET_SRC (curr_set);
13953 if (GET_CODE (curr_src) == ZERO_EXTEND)
13954 curr_src = XEXP (curr_src, 0);
13956 if (MEM_P (curr_src) && GET_CODE (XEXP (curr_src, 0)) == LO_SUM
13957 && REG_P (XEXP (XEXP (curr_src, 0), 0))
13958 && REGNO (XEXP (XEXP (curr_src, 0), 0))
13959 == REGNO (SET_DEST (prev_set))
13960 && rtx_equal_p (XEXP (XEXP (curr_src, 0), 1),
13961 XEXP (SET_SRC (prev_set), 0)))
13962 return true;
13966 if (aarch64_fusion_enabled_p (AARCH64_FUSE_AES_AESMC)
13967 && aarch_crypto_can_dual_issue (prev, curr))
13968 return true;
13970 if (aarch64_fusion_enabled_p (AARCH64_FUSE_CMP_BRANCH)
13971 && any_condjump_p (curr))
13973 enum attr_type prev_type = get_attr_type (prev);
13975 /* FIXME: this misses some instructions which are considered simple
13976 arithmetic for ThunderX.  Simple shifts are missed here. */
13977 if (prev_type == TYPE_ALUS_SREG
13978 || prev_type == TYPE_ALUS_IMM
13979 || prev_type == TYPE_LOGICS_REG
13980 || prev_type == TYPE_LOGICS_IMM)
13981 return true;
13984 return false;
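/* Illustrative note (editor's sketch): at the assembly level the fusion
   pairs matched above correspond to adjacent instruction pairs such as:

     AARCH64_FUSE_MOV_MOVK:   mov  x0, 0x1234
                              movk x0, 0x5678, lsl 16

     AARCH64_FUSE_ADRP_ADD:   adrp x1, sym
                              add  x0, x1, :lo12:sym

     AARCH64_FUSE_ADRP_LDR:   adrp x0, sym
                              ldr  w1, [x0, :lo12:sym]

   Keeping such pairs adjacent during scheduling lets cores that implement
   macro-op fusion issue the pair as a single operation.  */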
13987 /* Return true iff the instruction fusion described by OP is enabled. */
13989 bool
13990 aarch64_fusion_enabled_p (enum aarch64_fusion_pairs op)
13992 return (aarch64_tune_params.fusible_ops & op) != 0;
13995 /* If MEM is in the form of [base+offset], extract the two parts
13996 of the address and store them in BASE and OFFSET; otherwise return false
13997 after clearing BASE and OFFSET. */
13999 bool
14000 extract_base_offset_in_addr (rtx mem, rtx *base, rtx *offset)
14002 rtx addr;
14004 gcc_assert (MEM_P (mem));
14006 addr = XEXP (mem, 0);
14008 if (REG_P (addr))
14010 *base = addr;
14011 *offset = const0_rtx;
14012 return true;
14015 if (GET_CODE (addr) == PLUS
14016 && REG_P (XEXP (addr, 0)) && CONST_INT_P (XEXP (addr, 1)))
14018 *base = XEXP (addr, 0);
14019 *offset = XEXP (addr, 1);
14020 return true;
14023 *base = NULL_RTX;
14024 *offset = NULL_RTX;
14026 return false;
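/* Editor's examples (illustrative only): for a MEM whose address is
   (plus (reg x1) (const_int 16)) the function above sets BASE to x1 and
   OFFSET to 16; for a plain (reg x1) address it sets OFFSET to 0; for a
   register-indexed address such as (plus (reg x1) (reg x2)) it clears
   both and returns false.  */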
14029 /* Types for scheduling fusion. */
14030 enum sched_fusion_type
14032 SCHED_FUSION_NONE = 0,
14033 SCHED_FUSION_LD_SIGN_EXTEND,
14034 SCHED_FUSION_LD_ZERO_EXTEND,
14035 SCHED_FUSION_LD,
14036 SCHED_FUSION_ST,
14037 SCHED_FUSION_NUM
14040 /* If INSN is a load or store whose address is in the form of [base+offset],
14041 extract the two parts and store them in BASE and OFFSET.  Return the
14042 scheduling fusion type of this INSN. */
14044 static enum sched_fusion_type
14045 fusion_load_store (rtx_insn *insn, rtx *base, rtx *offset)
14047 rtx x, dest, src;
14048 enum sched_fusion_type fusion = SCHED_FUSION_LD;
14050 gcc_assert (INSN_P (insn));
14051 x = PATTERN (insn);
14052 if (GET_CODE (x) != SET)
14053 return SCHED_FUSION_NONE;
14055 src = SET_SRC (x);
14056 dest = SET_DEST (x);
14058 machine_mode dest_mode = GET_MODE (dest);
14060 if (!aarch64_mode_valid_for_sched_fusion_p (dest_mode))
14061 return SCHED_FUSION_NONE;
14063 if (GET_CODE (src) == SIGN_EXTEND)
14065 fusion = SCHED_FUSION_LD_SIGN_EXTEND;
14066 src = XEXP (src, 0);
14067 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14068 return SCHED_FUSION_NONE;
14070 else if (GET_CODE (src) == ZERO_EXTEND)
14072 fusion = SCHED_FUSION_LD_ZERO_EXTEND;
14073 src = XEXP (src, 0);
14074 if (GET_CODE (src) != MEM || GET_MODE (src) != SImode)
14075 return SCHED_FUSION_NONE;
14078 if (GET_CODE (src) == MEM && REG_P (dest))
14079 extract_base_offset_in_addr (src, base, offset);
14080 else if (GET_CODE (dest) == MEM && (REG_P (src) || src == const0_rtx))
14082 fusion = SCHED_FUSION_ST;
14083 extract_base_offset_in_addr (dest, base, offset);
14085 else
14086 return SCHED_FUSION_NONE;
14088 if (*base == NULL_RTX || *offset == NULL_RTX)
14089 fusion = SCHED_FUSION_NONE;
14091 return fusion;
14094 /* Implement the TARGET_SCHED_FUSION_PRIORITY hook.
14096 Currently we only support fusing ldr and str instructions, so FUSION_PRI
14097 and PRI are only calculated for these instructions. For other instructions,
14098 FUSION_PRI and PRI are simply set to MAX_PRI - 1. In the future, other
14099 types of instruction fusion can be added by returning different priorities.
14101 It's important that irrelevant instructions get the largest FUSION_PRI. */
14103 static void
14104 aarch64_sched_fusion_priority (rtx_insn *insn, int max_pri,
14105 int *fusion_pri, int *pri)
14107 int tmp, off_val;
14108 rtx base, offset;
14109 enum sched_fusion_type fusion;
14111 gcc_assert (INSN_P (insn));
14113 tmp = max_pri - 1;
14114 fusion = fusion_load_store (insn, &base, &offset);
14115 if (fusion == SCHED_FUSION_NONE)
14117 *pri = tmp;
14118 *fusion_pri = tmp;
14119 return;
14122 /* Set FUSION_PRI according to fusion type and base register. */
14123 *fusion_pri = tmp - fusion * FIRST_PSEUDO_REGISTER - REGNO (base);
14125 /* Calculate PRI. */
14126 tmp /= 2;
14128 /* INSN with smaller offset goes first. */
14129 off_val = (int)(INTVAL (offset));
14130 if (off_val >= 0)
14131 tmp -= (off_val & 0xfffff);
14132 else
14133 tmp += ((- off_val) & 0xfffff);
14135 *pri = tmp;
14136 return;
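/* Editor's worked example (symbolic values, for illustration only):
   with MAX_PRI denoted M and FIRST_PSEUDO_REGISTER denoted P, two stores
     str w0, [x1, 8]     and     str w2, [x1, 12]
   both get FUSION_PRI = (M - 1) - SCHED_FUSION_ST * P - REGNO (x1),
   i.e. the same fusion priority, so the scheduler considers fusing them,
   while their PRI values depend on the offset (8 vs. 12) so that, as the
   comment above notes, the insn with the smaller offset is scheduled
   first.  */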
14139 /* Implement the TARGET_SCHED_ADJUST_PRIORITY hook.
14140 Adjust priority of sha1h instructions so they are scheduled before
14141 other SHA1 instructions. */
14143 static int
14144 aarch64_sched_adjust_priority (rtx_insn *insn, int priority)
14146 rtx x = PATTERN (insn);
14148 if (GET_CODE (x) == SET)
14150 x = SET_SRC (x);
14152 if (GET_CODE (x) == UNSPEC && XINT (x, 1) == UNSPEC_SHA1H)
14153 return priority + 10;
14156 return priority;
14159 /* Given OPERANDS of consecutive load/store, check if we can merge
14160 them into ldp/stp. LOAD is true if they are load instructions.
14161 MODE is the mode of memory operands. */
14163 bool
14164 aarch64_operands_ok_for_ldpstp (rtx *operands, bool load,
14165 enum machine_mode mode)
14167 HOST_WIDE_INT offval_1, offval_2, msize;
14168 enum reg_class rclass_1, rclass_2;
14169 rtx mem_1, mem_2, reg_1, reg_2, base_1, base_2, offset_1, offset_2;
14171 if (load)
14173 mem_1 = operands[1];
14174 mem_2 = operands[3];
14175 reg_1 = operands[0];
14176 reg_2 = operands[2];
14177 gcc_assert (REG_P (reg_1) && REG_P (reg_2));
14178 if (REGNO (reg_1) == REGNO (reg_2))
14179 return false;
14181 else
14183 mem_1 = operands[0];
14184 mem_2 = operands[2];
14185 reg_1 = operands[1];
14186 reg_2 = operands[3];
14189 /* The mems cannot be volatile. */
14190 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2))
14191 return false;
14193 /* If we have SImode and slow unaligned ldp,
14194 check that the alignment is at least 8 bytes. */
14195 if (mode == SImode
14196 && (aarch64_tune_params.extra_tuning_flags
14197 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14198 && !optimize_size
14199 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14200 return false;
14202 /* Check if the addresses are in the form of [base+offset]. */
14203 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14204 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14205 return false;
14206 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14207 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14208 return false;
14210 /* Check if the bases are the same. */
14211 if (!rtx_equal_p (base_1, base_2))
14212 return false;
14214 offval_1 = INTVAL (offset_1);
14215 offval_2 = INTVAL (offset_2);
14216 msize = GET_MODE_SIZE (mode);
14217 /* Check if the offsets are consecutive. */
14218 if (offval_1 != (offval_2 + msize) && offval_2 != (offval_1 + msize))
14219 return false;
14221 /* Check if the addresses are clobbered by load. */
14222 if (load)
14224 if (reg_mentioned_p (reg_1, mem_1))
14225 return false;
14227 /* In increasing order, the last load can clobber the address. */
14228 if (offval_1 > offval_2 && reg_mentioned_p (reg_2, mem_2))
14229 return false;
14232 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14233 rclass_1 = FP_REGS;
14234 else
14235 rclass_1 = GENERAL_REGS;
14237 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14238 rclass_2 = FP_REGS;
14239 else
14240 rclass_2 = GENERAL_REGS;
14242 /* Check if the registers are of the same class. */
14243 if (rclass_1 != rclass_2)
14244 return false;
14246 return true;
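/* Editor's examples (illustrative only): under the checks above,
     ldr w0, [x2, 4]   followed by   ldr w1, [x2, 8]
   can be merged into an ldp (same base, consecutive SImode offsets,
   distinct destination registers of the same class), whereas
     ldr w0, [x2, 4]   followed by   ldr w1, [x2, 12]
   cannot, because the offsets are not consecutive for SImode.  */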
14249 /* Given OPERANDS of consecutive load/store, check if we can merge
14250 them into ldp/stp by adjusting the offset. LOAD is true if they
14251 are load instructions. MODE is the mode of memory operands.
14253 Given below consecutive stores:
14255 str w1, [xb, 0x100]
14256 str w1, [xb, 0x104]
14257 str w1, [xb, 0x108]
14258 str w1, [xb, 0x10c]
14260 Though the offsets are out of the range supported by stp, we can
14261 still pair them after adjusting the offset, like:
14263 add scratch, xb, 0x100
14264 stp w1, w1, [scratch]
14265 stp w1, w1, [scratch, 0x8]
14267 The peephole patterns detecting this opportunity should guarantee
14268 the scratch register is available. */
14270 bool
14271 aarch64_operands_adjust_ok_for_ldpstp (rtx *operands, bool load,
14272 enum machine_mode mode)
14274 enum reg_class rclass_1, rclass_2, rclass_3, rclass_4;
14275 HOST_WIDE_INT offval_1, offval_2, offval_3, offval_4, msize;
14276 rtx mem_1, mem_2, mem_3, mem_4, reg_1, reg_2, reg_3, reg_4;
14277 rtx base_1, base_2, base_3, base_4, offset_1, offset_2, offset_3, offset_4;
14279 if (load)
14281 reg_1 = operands[0];
14282 mem_1 = operands[1];
14283 reg_2 = operands[2];
14284 mem_2 = operands[3];
14285 reg_3 = operands[4];
14286 mem_3 = operands[5];
14287 reg_4 = operands[6];
14288 mem_4 = operands[7];
14289 gcc_assert (REG_P (reg_1) && REG_P (reg_2)
14290 && REG_P (reg_3) && REG_P (reg_4));
14291 if (REGNO (reg_1) == REGNO (reg_2) || REGNO (reg_3) == REGNO (reg_4))
14292 return false;
14294 else
14296 mem_1 = operands[0];
14297 reg_1 = operands[1];
14298 mem_2 = operands[2];
14299 reg_2 = operands[3];
14300 mem_3 = operands[4];
14301 reg_3 = operands[5];
14302 mem_4 = operands[6];
14303 reg_4 = operands[7];
14305 /* Skip if the memory operand is by itself valid for ldp/stp. */
14306 if (!MEM_P (mem_1) || aarch64_mem_pair_operand (mem_1, mode))
14307 return false;
14309 /* The mems cannot be volatile. */
14310 if (MEM_VOLATILE_P (mem_1) || MEM_VOLATILE_P (mem_2)
14311 || MEM_VOLATILE_P (mem_3) || MEM_VOLATILE_P (mem_4))
14312 return false;
14314 /* Check if the addresses are in the form of [base+offset]. */
14315 extract_base_offset_in_addr (mem_1, &base_1, &offset_1);
14316 if (base_1 == NULL_RTX || offset_1 == NULL_RTX)
14317 return false;
14318 extract_base_offset_in_addr (mem_2, &base_2, &offset_2);
14319 if (base_2 == NULL_RTX || offset_2 == NULL_RTX)
14320 return false;
14321 extract_base_offset_in_addr (mem_3, &base_3, &offset_3);
14322 if (base_3 == NULL_RTX || offset_3 == NULL_RTX)
14323 return false;
14324 extract_base_offset_in_addr (mem_4, &base_4, &offset_4);
14325 if (base_4 == NULL_RTX || offset_4 == NULL_RTX)
14326 return false;
14328 /* Check if the bases are the same. */
14329 if (!rtx_equal_p (base_1, base_2)
14330 || !rtx_equal_p (base_2, base_3)
14331 || !rtx_equal_p (base_3, base_4))
14332 return false;
14334 offval_1 = INTVAL (offset_1);
14335 offval_2 = INTVAL (offset_2);
14336 offval_3 = INTVAL (offset_3);
14337 offval_4 = INTVAL (offset_4);
14338 msize = GET_MODE_SIZE (mode);
14339 /* Check if the offsets are consecutive. */
14340 if ((offval_1 != (offval_2 + msize)
14341 || offval_1 != (offval_3 + msize * 2)
14342 || offval_1 != (offval_4 + msize * 3))
14343 && (offval_4 != (offval_3 + msize)
14344 || offval_4 != (offval_2 + msize * 2)
14345 || offval_4 != (offval_1 + msize * 3)))
14346 return false;
14348 /* Check if the addresses are clobbered by load. */
14349 if (load)
14351 if (reg_mentioned_p (reg_1, mem_1)
14352 || reg_mentioned_p (reg_2, mem_2)
14353 || reg_mentioned_p (reg_3, mem_3))
14354 return false;
14356 /* In increasing order, the last load can clobber the address. */
14357 if (offval_1 > offval_2 && reg_mentioned_p (reg_4, mem_4))
14358 return false;
14361 /* If we have SImode and slow unaligned ldp,
14362 check that the alignment is at least 8 bytes. */
14363 if (mode == SImode
14364 && (aarch64_tune_params.extra_tuning_flags
14365 & AARCH64_EXTRA_TUNE_SLOW_UNALIGNED_LDPW)
14366 && !optimize_size
14367 && MEM_ALIGN (mem_1) < 8 * BITS_PER_UNIT)
14368 return false;
14370 if (REG_P (reg_1) && FP_REGNUM_P (REGNO (reg_1)))
14371 rclass_1 = FP_REGS;
14372 else
14373 rclass_1 = GENERAL_REGS;
14375 if (REG_P (reg_2) && FP_REGNUM_P (REGNO (reg_2)))
14376 rclass_2 = FP_REGS;
14377 else
14378 rclass_2 = GENERAL_REGS;
14380 if (REG_P (reg_3) && FP_REGNUM_P (REGNO (reg_3)))
14381 rclass_3 = FP_REGS;
14382 else
14383 rclass_3 = GENERAL_REGS;
14385 if (REG_P (reg_4) && FP_REGNUM_P (REGNO (reg_4)))
14386 rclass_4 = FP_REGS;
14387 else
14388 rclass_4 = GENERAL_REGS;
14390 /* Check if the registers are of the same class. */
14391 if (rclass_1 != rclass_2 || rclass_2 != rclass_3 || rclass_3 != rclass_4)
14392 return false;
14394 return true;
14397 /* Given OPERANDS of consecutive load/store, this function pairs them
14398 into ldp/stp after adjusting the offset. It depends on the fact
14399 that addresses of load/store instructions are in increasing order.
14400 MODE is the mode of memory operands. CODE is the rtl operator
14401 which should be applied to all memory operands; it is SIGN_EXTEND,
14402 ZERO_EXTEND or UNKNOWN. */
14404 bool
14405 aarch64_gen_adjusted_ldpstp (rtx *operands, bool load,
14406 enum machine_mode mode, RTX_CODE code)
14408 rtx base, offset, t1, t2;
14409 rtx mem_1, mem_2, mem_3, mem_4;
14410 HOST_WIDE_INT off_val, abs_off, adj_off, new_off, stp_off_limit, msize;
14412 if (load)
14414 mem_1 = operands[1];
14415 mem_2 = operands[3];
14416 mem_3 = operands[5];
14417 mem_4 = operands[7];
14419 else
14421 mem_1 = operands[0];
14422 mem_2 = operands[2];
14423 mem_3 = operands[4];
14424 mem_4 = operands[6];
14425 gcc_assert (code == UNKNOWN);
14428 extract_base_offset_in_addr (mem_1, &base, &offset);
14429 gcc_assert (base != NULL_RTX && offset != NULL_RTX);
14431 /* Adjust the offset so that it can fit in an ldp/stp instruction. */
14432 msize = GET_MODE_SIZE (mode);
14433 stp_off_limit = msize * 0x40;
14434 off_val = INTVAL (offset);
14435 abs_off = (off_val < 0) ? -off_val : off_val;
14436 new_off = abs_off % stp_off_limit;
14437 adj_off = abs_off - new_off;
14439 /* Further adjust to make sure all offsets are OK. */
14440 if ((new_off + msize * 2) >= stp_off_limit)
14442 adj_off += stp_off_limit;
14443 new_off -= stp_off_limit;
14446 /* Make sure the adjustment can be done with ADD/SUB instructions. */
14447 if (adj_off >= 0x1000)
14448 return false;
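/* Editor's worked example (illustrative, using the stores from the
   comment above this function): for SImode, msize = 4 and
   stp_off_limit = 0x100.  With off_val = 0x100 we get abs_off = 0x100,
   new_off = 0 and adj_off = 0x100; new_off + 2 * msize = 8 stays below
   the limit and adj_off is below 0x1000, so the base is advanced by
   0x100 and the four accesses become [scratch], [scratch, 4],
   [scratch, 8] and [scratch, 12], pairable as two stp instructions.  */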
14450 if (off_val < 0)
14452 adj_off = -adj_off;
14453 new_off = -new_off;
14456 /* Create new memory references. */
14457 mem_1 = change_address (mem_1, VOIDmode,
14458 plus_constant (DImode, operands[8], new_off));
14460 /* Check if the adjusted address is OK for ldp/stp. */
14461 if (!aarch64_mem_pair_operand (mem_1, mode))
14462 return false;
14464 msize = GET_MODE_SIZE (mode);
14465 mem_2 = change_address (mem_2, VOIDmode,
14466 plus_constant (DImode,
14467 operands[8],
14468 new_off + msize));
14469 mem_3 = change_address (mem_3, VOIDmode,
14470 plus_constant (DImode,
14471 operands[8],
14472 new_off + msize * 2));
14473 mem_4 = change_address (mem_4, VOIDmode,
14474 plus_constant (DImode,
14475 operands[8],
14476 new_off + msize * 3));
14478 if (code == ZERO_EXTEND)
14480 mem_1 = gen_rtx_ZERO_EXTEND (DImode, mem_1);
14481 mem_2 = gen_rtx_ZERO_EXTEND (DImode, mem_2);
14482 mem_3 = gen_rtx_ZERO_EXTEND (DImode, mem_3);
14483 mem_4 = gen_rtx_ZERO_EXTEND (DImode, mem_4);
14485 else if (code == SIGN_EXTEND)
14487 mem_1 = gen_rtx_SIGN_EXTEND (DImode, mem_1);
14488 mem_2 = gen_rtx_SIGN_EXTEND (DImode, mem_2);
14489 mem_3 = gen_rtx_SIGN_EXTEND (DImode, mem_3);
14490 mem_4 = gen_rtx_SIGN_EXTEND (DImode, mem_4);
14493 if (load)
14495 operands[1] = mem_1;
14496 operands[3] = mem_2;
14497 operands[5] = mem_3;
14498 operands[7] = mem_4;
14500 else
14502 operands[0] = mem_1;
14503 operands[2] = mem_2;
14504 operands[4] = mem_3;
14505 operands[6] = mem_4;
14508 /* Emit adjusting instruction. */
14509 emit_insn (gen_rtx_SET (operands[8], plus_constant (DImode, base, adj_off)));
14510 /* Emit ldp/stp instructions. */
14511 t1 = gen_rtx_SET (operands[0], operands[1]);
14512 t2 = gen_rtx_SET (operands[2], operands[3]);
14513 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14514 t1 = gen_rtx_SET (operands[4], operands[5]);
14515 t2 = gen_rtx_SET (operands[6], operands[7]);
14516 emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, t1, t2)));
14517 return true;
14520 /* Return 1 if a pseudo register should be created and used to hold
14521 the GOT address for PIC code. */
14523 bool
14524 aarch64_use_pseudo_pic_reg (void)
14526 return aarch64_cmodel == AARCH64_CMODEL_SMALL_SPIC;
14529 /* Implement TARGET_UNSPEC_MAY_TRAP_P. */
14531 static int
14532 aarch64_unspec_may_trap_p (const_rtx x, unsigned flags)
14534 switch (XINT (x, 1))
14536 case UNSPEC_GOTSMALLPIC:
14537 case UNSPEC_GOTSMALLPIC28K:
14538 case UNSPEC_GOTTINYPIC:
14539 return 0;
14540 default:
14541 break;
14544 return default_unspec_may_trap_p (x, flags);
14548 /* If X is a positive CONST_DOUBLE with a value that is a power of 2
14549 return the log2 of that value. Otherwise return -1. */
14552 aarch64_fpconst_pow_of_2 (rtx x)
14554 const REAL_VALUE_TYPE *r;
14556 if (!CONST_DOUBLE_P (x))
14557 return -1;
14559 r = CONST_DOUBLE_REAL_VALUE (x);
14561 if (REAL_VALUE_NEGATIVE (*r)
14562 || REAL_VALUE_ISNAN (*r)
14563 || REAL_VALUE_ISINF (*r)
14564 || !real_isinteger (r, DFmode))
14565 return -1;
14567 return exact_log2 (real_to_integer (r));
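/* Editor's examples (illustrative only): for the function above, a
   CONST_DOUBLE of 4.0 yields 2 and 1.0 yields 0, while 3.0 (not a power
   of 2), 0.5 (not an integer) and -2.0 (negative) all yield -1.  */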
14570 /* If X is a vector of equal CONST_DOUBLE values and that value is
14571 Y, return the aarch64_fpconst_pow_of_2 of Y. Otherwise return -1. */
14574 aarch64_vec_fpconst_pow_of_2 (rtx x)
14576 if (GET_CODE (x) != CONST_VECTOR)
14577 return -1;
14579 if (GET_MODE_CLASS (GET_MODE (x)) != MODE_VECTOR_FLOAT)
14580 return -1;
14582 int firstval = aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, 0));
14583 if (firstval <= 0)
14584 return -1;
14586 for (int i = 1; i < CONST_VECTOR_NUNITS (x); i++)
14587 if (aarch64_fpconst_pow_of_2 (CONST_VECTOR_ELT (x, i)) != firstval)
14588 return -1;
14590 return firstval;
14593 /* Implement TARGET_PROMOTED_TYPE to promote 16-bit floating point types
14594 to float.
14596 __fp16 always promotes through this hook.
14597 _Float16 may promote if TARGET_FLT_EVAL_METHOD is 16, but we do that
14598 through the generic excess precision logic rather than here. */
14600 static tree
14601 aarch64_promoted_type (const_tree t)
14603 if (SCALAR_FLOAT_TYPE_P (t)
14604 && TYPE_MAIN_VARIANT (t) == aarch64_fp16_type_node)
14605 return float_type_node;
14607 return NULL_TREE;
14610 /* Implement the TARGET_OPTAB_SUPPORTED_P hook. */
14612 static bool
14613 aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
14614 optimization_type opt_type)
14616 switch (op)
14618 case rsqrt_optab:
14619 return opt_type == OPTIMIZE_FOR_SPEED && use_rsqrt_p (mode1);
14621 default:
14622 return true;
14626 /* Implement TARGET_LIBGCC_FLOATING_POINT_MODE_SUPPORTED_P - return TRUE
14627 if MODE is HFmode, and punt to the generic implementation otherwise. */
14629 static bool
14630 aarch64_libgcc_floating_mode_supported_p (machine_mode mode)
14632 return (mode == HFmode
14633 ? true
14634 : default_libgcc_floating_mode_supported_p (mode));
14637 /* Implement TARGET_SCALAR_MODE_SUPPORTED_P - return TRUE
14638 if MODE is HFmode, and punt to the generic implementation otherwise. */
14640 static bool
14641 aarch64_scalar_mode_supported_p (machine_mode mode)
14643 return (mode == HFmode
14644 ? true
14645 : default_scalar_mode_supported_p (mode));
14648 /* Set the value of FLT_EVAL_METHOD.
14649 ISO/IEC TS 18661-3 defines two values that we'd like to make use of:
14651 0: evaluate all operations and constants, whose semantic type has at
14652 most the range and precision of type float, to the range and
14653 precision of float; evaluate all other operations and constants to
14654 the range and precision of the semantic type;
14656 N, where _FloatN is a supported interchange floating type
14657 evaluate all operations and constants, whose semantic type has at
14658 most the range and precision of _FloatN type, to the range and
14659 precision of the _FloatN type; evaluate all other operations and
14660 constants to the range and precision of the semantic type;
14662 If we have the ARMv8.2-A extensions then we support _Float16 in native
14663 precision, so we should set this to 16. Otherwise, we support the type,
14664 but want to evaluate expressions in float precision, so set this to
14665 0. */
14667 static enum flt_eval_method
14668 aarch64_excess_precision (enum excess_precision_type type)
14670 switch (type)
14672 case EXCESS_PRECISION_TYPE_FAST:
14673 case EXCESS_PRECISION_TYPE_STANDARD:
14674 /* We can calculate either in 16-bit range and precision or
14675 32-bit range and precision. Make that decision based on whether
14676 we have native support for the ARMv8.2-A 16-bit floating-point
14677 instructions or not. */
14678 return (TARGET_FP_F16INST
14679 ? FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16
14680 : FLT_EVAL_METHOD_PROMOTE_TO_FLOAT);
14681 case EXCESS_PRECISION_TYPE_IMPLICIT:
14682 return FLT_EVAL_METHOD_PROMOTE_TO_FLOAT16;
14683 default:
14684 gcc_unreachable ();
14686 return FLT_EVAL_METHOD_UNPREDICTABLE;
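/* Editor's note (illustrative sketch): with the ARMv8.2-A half-precision
   instructions available (TARGET_FP_F16INST), an expression such as

     _Float16 a, b, c;
     _Float16 d = a * b + c;

   is evaluated directly in _Float16 precision, whereas without them the
   operands are promoted and the arithmetic is carried out in float, with
   only the final result converted back to _Float16, as described in the
   comment above.  */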
14689 /* Implement TARGET_SCHED_CAN_SPECULATE_INSN. Return true if INSN can be
14690 scheduled for speculative execution. Reject the long-running division
14691 and square-root instructions. */
14693 static bool
14694 aarch64_sched_can_speculate_insn (rtx_insn *insn)
14696 switch (get_attr_type (insn))
14698 case TYPE_SDIV:
14699 case TYPE_UDIV:
14700 case TYPE_FDIVS:
14701 case TYPE_FDIVD:
14702 case TYPE_FSQRTS:
14703 case TYPE_FSQRTD:
14704 case TYPE_NEON_FP_SQRT_S:
14705 case TYPE_NEON_FP_SQRT_D:
14706 case TYPE_NEON_FP_SQRT_S_Q:
14707 case TYPE_NEON_FP_SQRT_D_Q:
14708 case TYPE_NEON_FP_DIV_S:
14709 case TYPE_NEON_FP_DIV_D:
14710 case TYPE_NEON_FP_DIV_S_Q:
14711 case TYPE_NEON_FP_DIV_D_Q:
14712 return false;
14713 default:
14714 return true;
14718 /* Target-specific selftests. */
14720 #if CHECKING_P
14722 namespace selftest {
14724 /* Selftest for the RTL loader.
14725 Verify that the RTL loader copes with a dump from
14726 print_rtx_function. This is essentially just a test that class
14727 function_reader can handle a real dump, but it also verifies
14728 that lookup_reg_by_dump_name correctly handles hard regs.
14729 The presence of hard reg names in the dump means that the test is
14730 target-specific, hence it is in this file. */
14732 static void
14733 aarch64_test_loading_full_dump ()
14735 rtl_dump_test t (SELFTEST_LOCATION, locate_file ("aarch64/times-two.rtl"));
14737 ASSERT_STREQ ("times_two", IDENTIFIER_POINTER (DECL_NAME (cfun->decl)));
14739 rtx_insn *insn_1 = get_insn_by_uid (1);
14740 ASSERT_EQ (NOTE, GET_CODE (insn_1));
14742 rtx_insn *insn_15 = get_insn_by_uid (15);
14743 ASSERT_EQ (INSN, GET_CODE (insn_15));
14744 ASSERT_EQ (USE, GET_CODE (PATTERN (insn_15)));
14746 /* Verify crtl->return_rtx. */
14747 ASSERT_EQ (REG, GET_CODE (crtl->return_rtx));
14748 ASSERT_EQ (0, REGNO (crtl->return_rtx));
14749 ASSERT_EQ (SImode, GET_MODE (crtl->return_rtx));
14752 /* Run all target-specific selftests. */
14754 static void
14755 aarch64_run_selftests (void)
14757 aarch64_test_loading_full_dump ();
14760 } // namespace selftest
14762 #endif /* #if CHECKING_P */
14764 #undef TARGET_ADDRESS_COST
14765 #define TARGET_ADDRESS_COST aarch64_address_cost
14767 /* This hook determines whether unnamed bitfields affect the alignment
14768 of the containing structure. The hook returns true if the structure
14769 should inherit the alignment requirements of an unnamed bitfield's
14770 type. */
14771 #undef TARGET_ALIGN_ANON_BITFIELD
14772 #define TARGET_ALIGN_ANON_BITFIELD hook_bool_void_true
14774 #undef TARGET_ASM_ALIGNED_DI_OP
14775 #define TARGET_ASM_ALIGNED_DI_OP "\t.xword\t"
14777 #undef TARGET_ASM_ALIGNED_HI_OP
14778 #define TARGET_ASM_ALIGNED_HI_OP "\t.hword\t"
14780 #undef TARGET_ASM_ALIGNED_SI_OP
14781 #define TARGET_ASM_ALIGNED_SI_OP "\t.word\t"
14783 #undef TARGET_ASM_CAN_OUTPUT_MI_THUNK
14784 #define TARGET_ASM_CAN_OUTPUT_MI_THUNK \
14785 hook_bool_const_tree_hwi_hwi_const_tree_true
14787 #undef TARGET_ASM_FILE_START
14788 #define TARGET_ASM_FILE_START aarch64_start_file
14790 #undef TARGET_ASM_OUTPUT_MI_THUNK
14791 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
14793 #undef TARGET_ASM_SELECT_RTX_SECTION
14794 #define TARGET_ASM_SELECT_RTX_SECTION aarch64_select_rtx_section
14796 #undef TARGET_ASM_TRAMPOLINE_TEMPLATE
14797 #define TARGET_ASM_TRAMPOLINE_TEMPLATE aarch64_asm_trampoline_template
14799 #undef TARGET_BUILD_BUILTIN_VA_LIST
14800 #define TARGET_BUILD_BUILTIN_VA_LIST aarch64_build_builtin_va_list
14802 #undef TARGET_CALLEE_COPIES
14803 #define TARGET_CALLEE_COPIES hook_bool_CUMULATIVE_ARGS_mode_tree_bool_false
14805 #undef TARGET_CAN_ELIMINATE
14806 #define TARGET_CAN_ELIMINATE aarch64_can_eliminate
14808 #undef TARGET_CAN_INLINE_P
14809 #define TARGET_CAN_INLINE_P aarch64_can_inline_p
14811 #undef TARGET_CANNOT_FORCE_CONST_MEM
14812 #define TARGET_CANNOT_FORCE_CONST_MEM aarch64_cannot_force_const_mem
14814 #undef TARGET_CASE_VALUES_THRESHOLD
14815 #define TARGET_CASE_VALUES_THRESHOLD aarch64_case_values_threshold
14817 #undef TARGET_CONDITIONAL_REGISTER_USAGE
14818 #define TARGET_CONDITIONAL_REGISTER_USAGE aarch64_conditional_register_usage
14820 /* Only the least significant bit is used for initialization guard
14821 variables. */
14822 #undef TARGET_CXX_GUARD_MASK_BIT
14823 #define TARGET_CXX_GUARD_MASK_BIT hook_bool_void_true
14825 #undef TARGET_C_MODE_FOR_SUFFIX
14826 #define TARGET_C_MODE_FOR_SUFFIX aarch64_c_mode_for_suffix
14828 #ifdef TARGET_BIG_ENDIAN_DEFAULT
14829 #undef TARGET_DEFAULT_TARGET_FLAGS
14830 #define TARGET_DEFAULT_TARGET_FLAGS (MASK_BIG_END)
14831 #endif
14833 #undef TARGET_CLASS_MAX_NREGS
14834 #define TARGET_CLASS_MAX_NREGS aarch64_class_max_nregs
14836 #undef TARGET_BUILTIN_DECL
14837 #define TARGET_BUILTIN_DECL aarch64_builtin_decl
14839 #undef TARGET_BUILTIN_RECIPROCAL
14840 #define TARGET_BUILTIN_RECIPROCAL aarch64_builtin_reciprocal
14842 #undef TARGET_C_EXCESS_PRECISION
14843 #define TARGET_C_EXCESS_PRECISION aarch64_excess_precision
14845 #undef TARGET_EXPAND_BUILTIN
14846 #define TARGET_EXPAND_BUILTIN aarch64_expand_builtin
14848 #undef TARGET_EXPAND_BUILTIN_VA_START
14849 #define TARGET_EXPAND_BUILTIN_VA_START aarch64_expand_builtin_va_start
14851 #undef TARGET_FOLD_BUILTIN
14852 #define TARGET_FOLD_BUILTIN aarch64_fold_builtin
14854 #undef TARGET_FUNCTION_ARG
14855 #define TARGET_FUNCTION_ARG aarch64_function_arg
14857 #undef TARGET_FUNCTION_ARG_ADVANCE
14858 #define TARGET_FUNCTION_ARG_ADVANCE aarch64_function_arg_advance
14860 #undef TARGET_FUNCTION_ARG_BOUNDARY
14861 #define TARGET_FUNCTION_ARG_BOUNDARY aarch64_function_arg_boundary
14863 #undef TARGET_FUNCTION_OK_FOR_SIBCALL
14864 #define TARGET_FUNCTION_OK_FOR_SIBCALL aarch64_function_ok_for_sibcall
14866 #undef TARGET_FUNCTION_VALUE
14867 #define TARGET_FUNCTION_VALUE aarch64_function_value
14869 #undef TARGET_FUNCTION_VALUE_REGNO_P
14870 #define TARGET_FUNCTION_VALUE_REGNO_P aarch64_function_value_regno_p
14872 #undef TARGET_FRAME_POINTER_REQUIRED
14873 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
14875 #undef TARGET_GIMPLE_FOLD_BUILTIN
14876 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
14878 #undef TARGET_GIMPLIFY_VA_ARG_EXPR
14879 #define TARGET_GIMPLIFY_VA_ARG_EXPR aarch64_gimplify_va_arg_expr
14881 #undef TARGET_INIT_BUILTINS
14882 #define TARGET_INIT_BUILTINS aarch64_init_builtins
14884 #undef TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS
14885 #define TARGET_IRA_CHANGE_PSEUDO_ALLOCNO_CLASS \
14886 aarch64_ira_change_pseudo_allocno_class
14888 #undef TARGET_LEGITIMATE_ADDRESS_P
14889 #define TARGET_LEGITIMATE_ADDRESS_P aarch64_legitimate_address_hook_p
14891 #undef TARGET_LEGITIMATE_CONSTANT_P
14892 #define TARGET_LEGITIMATE_CONSTANT_P aarch64_legitimate_constant_p
14894 #undef TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT
14895 #define TARGET_LEGITIMIZE_ADDRESS_DISPLACEMENT \
14896 aarch64_legitimize_address_displacement
14898 #undef TARGET_LIBGCC_CMP_RETURN_MODE
14899 #define TARGET_LIBGCC_CMP_RETURN_MODE aarch64_libgcc_cmp_return_mode
14901 #undef TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P
14902 #define TARGET_LIBGCC_FLOATING_MODE_SUPPORTED_P \
14903 aarch64_libgcc_floating_mode_supported_p
14905 #undef TARGET_MANGLE_TYPE
14906 #define TARGET_MANGLE_TYPE aarch64_mangle_type
14908 #undef TARGET_MEMORY_MOVE_COST
14909 #define TARGET_MEMORY_MOVE_COST aarch64_memory_move_cost
14911 #undef TARGET_MIN_DIVISIONS_FOR_RECIP_MUL
14912 #define TARGET_MIN_DIVISIONS_FOR_RECIP_MUL aarch64_min_divisions_for_recip_mul
14914 #undef TARGET_MUST_PASS_IN_STACK
14915 #define TARGET_MUST_PASS_IN_STACK must_pass_in_stack_var_size
14917 /* This target hook should return true if accesses to volatile bitfields
14918 should use the narrowest mode possible. It should return false if these
14919 accesses should use the bitfield container type. */
14920 #undef TARGET_NARROW_VOLATILE_BITFIELD
14921 #define TARGET_NARROW_VOLATILE_BITFIELD hook_bool_void_false
14923 #undef TARGET_OPTION_OVERRIDE
14924 #define TARGET_OPTION_OVERRIDE aarch64_override_options
14926 #undef TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE
14927 #define TARGET_OVERRIDE_OPTIONS_AFTER_CHANGE \
14928 aarch64_override_options_after_change
14930 #undef TARGET_OPTION_SAVE
14931 #define TARGET_OPTION_SAVE aarch64_option_save
14933 #undef TARGET_OPTION_RESTORE
14934 #define TARGET_OPTION_RESTORE aarch64_option_restore
14936 #undef TARGET_OPTION_PRINT
14937 #define TARGET_OPTION_PRINT aarch64_option_print
14939 #undef TARGET_OPTION_VALID_ATTRIBUTE_P
14940 #define TARGET_OPTION_VALID_ATTRIBUTE_P aarch64_option_valid_attribute_p
14942 #undef TARGET_SET_CURRENT_FUNCTION
14943 #define TARGET_SET_CURRENT_FUNCTION aarch64_set_current_function
14945 #undef TARGET_PASS_BY_REFERENCE
14946 #define TARGET_PASS_BY_REFERENCE aarch64_pass_by_reference
14948 #undef TARGET_PREFERRED_RELOAD_CLASS
14949 #define TARGET_PREFERRED_RELOAD_CLASS aarch64_preferred_reload_class
14951 #undef TARGET_SCHED_REASSOCIATION_WIDTH
14952 #define TARGET_SCHED_REASSOCIATION_WIDTH aarch64_reassociation_width
14954 #undef TARGET_PROMOTED_TYPE
14955 #define TARGET_PROMOTED_TYPE aarch64_promoted_type
14957 #undef TARGET_SECONDARY_RELOAD
14958 #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
14960 #undef TARGET_SHIFT_TRUNCATION_MASK
14961 #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
14963 #undef TARGET_SETUP_INCOMING_VARARGS
14964 #define TARGET_SETUP_INCOMING_VARARGS aarch64_setup_incoming_varargs
14966 #undef TARGET_STRUCT_VALUE_RTX
14967 #define TARGET_STRUCT_VALUE_RTX aarch64_struct_value_rtx
14969 #undef TARGET_REGISTER_MOVE_COST
14970 #define TARGET_REGISTER_MOVE_COST aarch64_register_move_cost
14972 #undef TARGET_RETURN_IN_MEMORY
14973 #define TARGET_RETURN_IN_MEMORY aarch64_return_in_memory
14975 #undef TARGET_RETURN_IN_MSB
14976 #define TARGET_RETURN_IN_MSB aarch64_return_in_msb
14978 #undef TARGET_RTX_COSTS
14979 #define TARGET_RTX_COSTS aarch64_rtx_costs_wrapper
14981 #undef TARGET_SCALAR_MODE_SUPPORTED_P
14982 #define TARGET_SCALAR_MODE_SUPPORTED_P aarch64_scalar_mode_supported_p
14984 #undef TARGET_SCHED_ISSUE_RATE
14985 #define TARGET_SCHED_ISSUE_RATE aarch64_sched_issue_rate
14987 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD
14988 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD \
14989 aarch64_sched_first_cycle_multipass_dfa_lookahead
14991 #undef TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD
14992 #define TARGET_SCHED_FIRST_CYCLE_MULTIPASS_DFA_LOOKAHEAD_GUARD \
14993 aarch64_first_cycle_multipass_dfa_lookahead_guard
14995 #undef TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS
14996 #define TARGET_SHRINK_WRAP_GET_SEPARATE_COMPONENTS \
14997 aarch64_get_separate_components
14999 #undef TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB
15000 #define TARGET_SHRINK_WRAP_COMPONENTS_FOR_BB \
15001 aarch64_components_for_bb
15003 #undef TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS
15004 #define TARGET_SHRINK_WRAP_DISQUALIFY_COMPONENTS \
15005 aarch64_disqualify_components
15007 #undef TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS
15008 #define TARGET_SHRINK_WRAP_EMIT_PROLOGUE_COMPONENTS \
15009 aarch64_emit_prologue_components
15011 #undef TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS
15012 #define TARGET_SHRINK_WRAP_EMIT_EPILOGUE_COMPONENTS \
15013 aarch64_emit_epilogue_components
15015 #undef TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS
15016 #define TARGET_SHRINK_WRAP_SET_HANDLED_COMPONENTS \
15017 aarch64_set_handled_components
15019 #undef TARGET_TRAMPOLINE_INIT
15020 #define TARGET_TRAMPOLINE_INIT aarch64_trampoline_init
15022 #undef TARGET_USE_BLOCKS_FOR_CONSTANT_P
15023 #define TARGET_USE_BLOCKS_FOR_CONSTANT_P aarch64_use_blocks_for_constant_p
15025 #undef TARGET_VECTOR_MODE_SUPPORTED_P
15026 #define TARGET_VECTOR_MODE_SUPPORTED_P aarch64_vector_mode_supported_p
15028 #undef TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT
15029 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
15030 aarch64_builtin_support_vector_misalignment
15032 #undef TARGET_ARRAY_MODE_SUPPORTED_P
15033 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p
15035 #undef TARGET_VECTORIZE_ADD_STMT_COST
15036 #define TARGET_VECTORIZE_ADD_STMT_COST aarch64_add_stmt_cost
15038 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST
15039 #define TARGET_VECTORIZE_BUILTIN_VECTORIZATION_COST \
15040 aarch64_builtin_vectorization_cost
15042 #undef TARGET_VECTORIZE_PREFERRED_SIMD_MODE
15043 #define TARGET_VECTORIZE_PREFERRED_SIMD_MODE aarch64_preferred_simd_mode
15045 #undef TARGET_VECTORIZE_BUILTINS
15046 #define TARGET_VECTORIZE_BUILTINS
15048 #undef TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION
15049 #define TARGET_VECTORIZE_BUILTIN_VECTORIZED_FUNCTION \
15050 aarch64_builtin_vectorized_function
15052 #undef TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES
15053 #define TARGET_VECTORIZE_AUTOVECTORIZE_VECTOR_SIZES \
15054 aarch64_autovectorize_vector_sizes
15056 #undef TARGET_ATOMIC_ASSIGN_EXPAND_FENV
15057 #define TARGET_ATOMIC_ASSIGN_EXPAND_FENV \
15058 aarch64_atomic_assign_expand_fenv
15060 /* Section anchor support. */
15062 #undef TARGET_MIN_ANCHOR_OFFSET
15063 #define TARGET_MIN_ANCHOR_OFFSET -256
15065 /* Limit the maximum anchor offset to 4k-1, since that's the limit for a
15066 byte offset; we can do much more for larger data types, but have no way
15067 to determine the size of the access. We assume accesses are aligned. */
15068 #undef TARGET_MAX_ANCHOR_OFFSET
15069 #define TARGET_MAX_ANCHOR_OFFSET 4095
15071 #undef TARGET_VECTOR_ALIGNMENT
15072 #define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment
15074 #undef TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE
15075 #define TARGET_VECTORIZE_VECTOR_ALIGNMENT_REACHABLE \
15076 aarch64_simd_vector_alignment_reachable
15078 /* vec_perm support. */
15080 #undef TARGET_VECTORIZE_VEC_PERM_CONST_OK
15081 #define TARGET_VECTORIZE_VEC_PERM_CONST_OK \
15082 aarch64_vectorize_vec_perm_const_ok
15084 #undef TARGET_INIT_LIBFUNCS
15085 #define TARGET_INIT_LIBFUNCS aarch64_init_libfuncs
15087 #undef TARGET_FIXED_CONDITION_CODE_REGS
15088 #define TARGET_FIXED_CONDITION_CODE_REGS aarch64_fixed_condition_code_regs
15090 #undef TARGET_FLAGS_REGNUM
15091 #define TARGET_FLAGS_REGNUM CC_REGNUM
15093 #undef TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS
15094 #define TARGET_CALL_FUSAGE_CONTAINS_NON_CALLEE_CLOBBERS true
15096 #undef TARGET_ASAN_SHADOW_OFFSET
15097 #define TARGET_ASAN_SHADOW_OFFSET aarch64_asan_shadow_offset
15099 #undef TARGET_LEGITIMIZE_ADDRESS
15100 #define TARGET_LEGITIMIZE_ADDRESS aarch64_legitimize_address
15102 #undef TARGET_USE_BY_PIECES_INFRASTRUCTURE_P
15103 #define TARGET_USE_BY_PIECES_INFRASTRUCTURE_P \
15104 aarch64_use_by_pieces_infrastructure_p
15106 #undef TARGET_SCHED_CAN_SPECULATE_INSN
15107 #define TARGET_SCHED_CAN_SPECULATE_INSN aarch64_sched_can_speculate_insn
15109 #undef TARGET_CAN_USE_DOLOOP_P
15110 #define TARGET_CAN_USE_DOLOOP_P can_use_doloop_if_innermost
15112 #undef TARGET_SCHED_ADJUST_PRIORITY
15113 #define TARGET_SCHED_ADJUST_PRIORITY aarch64_sched_adjust_priority
15115 #undef TARGET_SCHED_MACRO_FUSION_P
15116 #define TARGET_SCHED_MACRO_FUSION_P aarch64_macro_fusion_p
15118 #undef TARGET_SCHED_MACRO_FUSION_PAIR_P
15119 #define TARGET_SCHED_MACRO_FUSION_PAIR_P aarch_macro_fusion_pair_p
15121 #undef TARGET_SCHED_FUSION_PRIORITY
15122 #define TARGET_SCHED_FUSION_PRIORITY aarch64_sched_fusion_priority
15124 #undef TARGET_UNSPEC_MAY_TRAP_P
15125 #define TARGET_UNSPEC_MAY_TRAP_P aarch64_unspec_may_trap_p
15127 #undef TARGET_USE_PSEUDO_PIC_REG
15128 #define TARGET_USE_PSEUDO_PIC_REG aarch64_use_pseudo_pic_reg
15130 #undef TARGET_PRINT_OPERAND
15131 #define TARGET_PRINT_OPERAND aarch64_print_operand
15133 #undef TARGET_PRINT_OPERAND_ADDRESS
15134 #define TARGET_PRINT_OPERAND_ADDRESS aarch64_print_operand_address
15136 #undef TARGET_OPTAB_SUPPORTED_P
15137 #define TARGET_OPTAB_SUPPORTED_P aarch64_optab_supported_p
15139 #undef TARGET_OMIT_STRUCT_RETURN_REG
15140 #define TARGET_OMIT_STRUCT_RETURN_REG true
15142 /* The architecture reserves bits 0 and 1 so use bit 2 for descriptors. */
15143 #undef TARGET_CUSTOM_FUNCTION_DESCRIPTORS
15144 #define TARGET_CUSTOM_FUNCTION_DESCRIPTORS 4
15146 #if CHECKING_P
15147 #undef TARGET_RUN_TARGET_SELFTESTS
15148 #define TARGET_RUN_TARGET_SELFTESTS selftest::aarch64_run_selftests
15149 #endif /* #if CHECKING_P */
15151 struct gcc_target targetm = TARGET_INITIALIZER;
15153 #include "gt-aarch64.h"